-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathjessica_neo4j.py
103 lines (100 loc) · 3.01 KB
/
jessica_neo4j.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#######jessica_neo4j.py#######
'''
https://neo4j.com/docs/cypher-manual/current/clauses/create/#create-create-a-node-with-a-label
'''
import re
import os
import time
from neo4j import *
from pyspark import *
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
def _quote_node_label(name):
    """Return *name* as a backtick-quoted Cypher identifier.

    Node labels cannot be passed as query parameters, so they still have to
    be spliced into the statement text. Wrapping the value in backticks and
    doubling any backtick it contains keeps it a single identifier.
    """
    text = u"%s" % (name,)
    return u"`" + text.replace(u"`", u"``") + u"`"


def ingest_node_json2neo4j(bolt_url,
        bolt_username,
        bolt_password,
        input_json,
        sqlContext = None,
        delect_neo4j = True):
    """Create one Neo4j node for every distinct record of a JSON file.

    The file is read through ``sqlContext``, de-duplicated with
    ``SELECT DISTINCT *`` and collected to the driver process. Every row must
    expose ``node_type`` (used as the node label), ``node_id`` and
    ``node_content`` (stored as the ``id`` and ``content`` properties).

    Args:
        bolt_url: URL handed to ``GraphDatabase.driver``.
        bolt_username: user name for the Neo4j connection.
        bolt_password: password for the Neo4j connection.
        input_json: path of the JSON file holding the node records.
        sqlContext: Spark SQL context used to read ``input_json``. It is
            required in practice; the ``None`` default only exists in the
            signature and leads to an ``AttributeError``.
        delect_neo4j: when exactly ``True``, all nodes and relationships
            are deleted before the insert starts.

    Returns:
        None. Rows whose insert fails are reported with ``print`` and
        skipped, so one bad record does not abort the whole load.
    """
    print("connecting to the neo4j server")
    neo4j_instance = GraphDatabase.driver(bolt_url, auth=(bolt_username, bolt_password))
    try:
        print("loading node data from "+input_json)
        # Side effect: registers the temporary table 'input' in sqlContext.
        sqlContext.read.json(input_json).registerTempTable('input')
        nodes = sqlContext.sql(u"""
            SELECT DISTINCT * FROM input
            """).collect()
        print("loaded "+str(len(nodes))+' nodes from '+input_json)

        # Identity check kept on purpose: only the literal True wipes the graph.
        if delect_neo4j is True:
            with neo4j_instance.session() as neo4j_session:
                print("deleting data of neo4j")
                neo4j_session.run(u"""
                    MATCH (n) OPTIONAL MATCH (n)-[r]-() DELETE n,r;
                    """)

        print('inserting nodes to neo4j')
        for row in nodes:
            try:
                # Only the label is spliced into the text (quoted); the
                # property values travel as parameters, so quotes or
                # backslashes in the data can no longer break or alter the
                # statement.
                node_cmd = u"CREATE (n:%s { id: $id, content: $content })" % (
                    _quote_node_label(row.node_type),)
                # Values are stringified so the stored properties stay
                # strings, as they were when formatted into '...' literals.
                node_params = {
                    "id": u"%s" % (row.node_id,),
                    "content": u"%s" % (row.node_content,),
                }
                # One session per row, as before, so a failing statement is
                # reported for its own row only.
                with neo4j_instance.session() as neo4j_session:
                    neo4j_session.run(node_cmd, node_params)
            except Exception as e:
                # Best-effort load: report the failure and continue.
                print(e)
    finally:
        # Release the driver's connections even when loading fails.
        neo4j_instance.close()
    return None
def _quote_relation_name(name):
    """Return *name* as a backtick-quoted Cypher identifier.

    Labels and relationship types cannot be passed as query parameters, so
    they still have to be spliced into the statement text. Wrapping the
    value in backticks and doubling any backtick it contains keeps it a
    single identifier.
    """
    text = u"%s" % (name,)
    return u"`" + text.replace(u"`", u"``") + u"`"


def ingest_relation_json2neo4j(bolt_url,
        bolt_username,
        bolt_password,
        input_json,
        sqlContext = None,
        delect_neo4j = False):
    """Create one Neo4j relationship for every distinct triple of a JSON file.

    The file is read through ``sqlContext``, de-duplicated with
    ``SELECT DISTINCT *`` and collected to the driver process. Every row must
    expose ``subject_type`` / ``subject_id`` and ``object_type`` /
    ``object_id`` (label and ``id`` property used to match the two end
    nodes) and ``relation`` (the relationship type created from subject to
    object). Rows whose end nodes are not matched create nothing.

    Args:
        bolt_url: URL handed to ``GraphDatabase.driver``.
        bolt_username: user name for the Neo4j connection.
        bolt_password: password for the Neo4j connection.
        input_json: path of the JSON file holding the triples.
        sqlContext: Spark SQL context used to read ``input_json``. It is
            required in practice; the ``None`` default only exists in the
            signature and leads to an ``AttributeError``.
        delect_neo4j: when exactly ``True``, all nodes and relationships
            are deleted before the insert starts.

    Returns:
        None. Rows whose insert fails are reported with ``print`` and
        skipped, so one bad record does not abort the whole load.
    """
    print("connecting to the neo4j server")
    neo4j_instance = GraphDatabase.driver(bolt_url, auth=(bolt_username, bolt_password))
    try:
        print("loading relation data from "+input_json)
        # Side effect: registers the temporary table 'input' in sqlContext.
        sqlContext.read.json(input_json).registerTempTable('input')
        triples = sqlContext.sql(u"""
            SELECT DISTINCT * FROM input
            """).collect()
        print("loaded "+str(len(triples))+' tripletes from '+input_json)

        # Identity check kept on purpose: only the literal True wipes the graph.
        if delect_neo4j is True:
            with neo4j_instance.session() as neo4j_session:
                print("deleting data of neo4j")
                neo4j_session.run(u"""
                    MATCH (n) OPTIONAL MATCH (n)-[r]-() DELETE n,r;
                    """)

        print('inserting edges to neo4j')
        for row in triples:
            try:
                # Labels and the relationship type are spliced in quoted;
                # the ids travel as parameters, so quotes or backslashes in
                # the data can no longer break or alter the statement.
                neo4j_cmd = (
                    u"MATCH (a:%s),(b:%s) "
                    u"WHERE a.id = $subject_id AND b.id = $object_id "
                    u"CREATE (a)-[r:%s]->(b)"
                ) % (
                    _quote_relation_name(row.subject_type),
                    _quote_relation_name(row.object_type),
                    _quote_relation_name(row.relation),
                )
                # Ids are stringified so they compare as strings, as they
                # did when formatted into '...' literals.
                edge_params = {
                    "subject_id": u"%s" % (row.subject_id,),
                    "object_id": u"%s" % (row.object_id,),
                }
                # One session per row, as before, so a failing statement is
                # reported for its own row only.
                with neo4j_instance.session() as neo4j_session:
                    neo4j_session.run(neo4j_cmd, edge_params)
            except Exception as e:
                # Best-effort load: report the failure and continue.
                print(e)
    finally:
        # Release the driver's connections even when loading fails.
        neo4j_instance.close()
    return None
#######jessica_neo4j.py#######