This repository has been archived by the owner on Oct 25, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cypher2node.py
117 lines (91 loc) · 3.55 KB
/
cypher2node.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import csv
import json
import pprint
import unidecode
import sys
def write_csv(filename, fields, data):
    """Write `data` to `filename` as a CSV file with a header row.

    `fields` supplies the column names, which are written as the first row.
    Each record in `data` is then indexed by those names, in order, to
    produce one row. Record keys not listed in `fields` are ignored; a
    listed key that is absent from a record raises KeyError.
    """
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(fields)
        for row in data:
            csv_out.writerow([row[column] for column in fields])
def _select_label(labels):
    """Return the first label in `labels` that names a supported node type.

    Returns None when no label is 'Author', 'Article' or 'ConferencePaper'.
    """
    supported = ('Author', 'Article', 'ConferencePaper')
    return next((name for name in labels if name in supported), None)


def _unique_key(base, seen):
    """Return a key derived from `base` that is not yet in `seen`.

    `base` itself is used when it is unseen; otherwise the smallest positive
    integer suffix that makes it unique is appended. The returned key is
    added to `seen`.
    """
    key = base
    suffix = 0
    while key in seen:
        suffix += 1
        key = base + str(suffix)
    seen.add(key)
    return key


def _author_node(props):
    """Build the output row for an author node from its properties.

    Raises KeyError if the 'name' or 'id' property is missing.
    """
    return {
        'type': 'author',
        'name': props['name'],
        '_key': unidecode.unidecode(props['id']),
    }


def _publication_node(props, node_type, fields, venue_field):
    """Build the output row for a publication node from its properties.

    Every name in `fields` is copied from `props`. The 'key' property is
    split on '/'; its second part is stored under `venue_field` and its
    second and third parts form '_key'.

    Raises KeyError if a required property is missing and IndexError if
    'key' has fewer than three '/'-separated parts.
    """
    node = {'type': node_type}
    for field in fields:
        node[field] = props[field]
    key_parts = props['key'].split('/')
    node[venue_field] = key_parts[1]
    node['_key'] = '{}-{}'.format(key_parts[1], key_parts[2])
    return node


def main():
    """Split node records read from stdin into three CSV files.

    Command-line arguments, in order: author file, journal file, conference
    file. Each non-blank line of stdin must be a JSON object whose 'p'
    member carries 'labels' and 'properties'. Records labelled 'Author',
    'ConferencePaper' or 'Article' are collected and written with
    write_csv(); records of any other type, and records with missing or
    malformed properties, are reported on stderr and skipped.

    Returns 1 when too few arguments are given; otherwise returns None.
    """
    if len(sys.argv) < 4:
        print('usage: cypher2node.py <authorfile> <journalfile> <conffile>', file=sys.stderr)
        return 1

    authorfile, journalfile, conffile = sys.argv[1:4]

    # Create buckets for author nodes and publication nodes.
    authors = []
    conf_pubs = []
    journal_pubs = []
    buckets = {
        'Author': authors,
        'ConferencePaper': conf_pubs,
        'Article': journal_pubs,
    }

    # Author keys already handed out, used to keep them unique.
    author_key = set()

    # The input consists of a JSON string per line.
    for line in sys.stdin:
        # Tolerate blank lines (e.g. a trailing newline) rather than letting
        # json.loads fail on them.
        if not line.strip():
            continue

        rec = json.loads(line)['p']

        # Sweep through the labels field, looking for "Author", "Article" or
        # "ConferencePaper".
        label = _select_label(rec['labels'])
        if label is None:
            print('error: unknown node type {}'.format(rec['labels']), file=sys.stderr)
            pprint.pprint(rec, stream=sys.stderr)
            continue

        # Split paths to handle authors and pubs differently. A record that
        # lacks a required property is reported and skipped; keeping a
        # partially filled row would only fail later, when the row is
        # indexed by column name for writing.
        try:
            props = rec['properties']
            if label == 'Author':
                node = _author_node(props)
            elif label == 'ConferencePaper':
                node = _publication_node(
                    props,
                    'conference_publication',
                    ['year', 'booktitle', 'title', 'url'],
                    'venue',
                )
            else:
                node = _publication_node(
                    props,
                    'journal_publication',
                    ['ee', 'year', 'title', 'url'],
                    'journal',
                )
                # NOTE(review): 'pages' is collected here but is not among
                # the journal columns written below - confirm whether it
                # should be exported.
                node['pages'] = props.get('pages', 'unknown')
        except (KeyError, IndexError) as e:
            print('error: skipping record with missing or malformed property: {!r}'.format(e),
                  file=sys.stderr)
            pprint.pprint(rec, stream=sys.stderr)
            continue

        if label == 'Author':
            # Disambiguate author keys if they wind up equal to something
            # already seen (this can happen if the author appears twice,
            # once with diacritics and once without).
            node['_key'] = _unique_key(node['_key'], author_key)

        buckets[label].append(node)

    write_csv(authorfile, ['_key', 'type', 'name'], authors)
    write_csv(conffile, ['_key', 'type', 'venue', 'year', 'booktitle', 'title', 'url'], conf_pubs)
    write_csv(journalfile, ['_key', 'type', 'journal', 'ee', 'year', 'title', 'url'], journal_pubs)
# Script entry point: run main() only when this file is executed directly,
# and pass its return value to sys.exit() as the process exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)