This repository was archived by the owner on Aug 10, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwb2ttl.py
executable file
·278 lines (227 loc) · 10.6 KB
/
wb2ttl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#!/usr/bin/env python3
""" Convert Wikibase Items and Properties to RDF.
Tested with Wikibase DataModel 6.3.0.
Requires environment variables to be set:
wbdbhost: MySQL/MariaDB Host
wbdbuser: MySQL/MariaDB user name
wbdbpasswd: MySQL/MariaDB user password
wbdbdb: Name of database
"""
from os import environ
import uuid
from pprint import pprint
import pymysql.cursors
import json
from rdflib import Namespace, Graph, URIRef, BNode, Literal
from rdflib.namespace import DCTERMS, RDFS, RDF, DC, SKOS, OWL, XSD
import sys
from argparse import ArgumentParser
import traceback
# Command-line interface.
# The summary text is passed as `description=`: the first positional parameter
# of ArgumentParser is `prog`, so passing it positionally replaced the program
# name in the usage line instead of describing the tool.
argparser = ArgumentParser(
    description='Translate Wikibase into TTL, based on the Wikidata DataModel 6.3.0.'
)
# The local namespaces are built by appending paths such as "entity/" directly
# to this value, so it is expected to end with a slash.
argparser.add_argument('local_base', help='Base URI')
argparser.add_argument('outfile', help='Filename to write TTL to')
# Defaults to False, which never equals a property ID, so the skos:exactMatch
# mapping is disabled unless the option is given.
argparser.add_argument(
    '-e', '--exactMatch',
    help='Wikibase property that is equivalent with skos:exactMatch',
    default=False,
)
options = argparser.parse_args()
# Set the namespaces
# Here we are trying to replicate the exact same structure as in Wikidata:
# each Wikidata namespace (wd*) is paired with a local one (rh*) that uses the
# same path below the base URI given on the command line.
wikidata_base = "http://www.wikidata.org/"

# Mint local namespace
# wdp previously lacked the Wikidata base and resolved to the relative
# string "prop/"; it now follows the same pattern as its siblings.
wdp = Namespace(wikidata_base + "prop/")
rhp = Namespace(options.local_base + "prop/")
wdwdt = Namespace(wikidata_base + "prop/direct/")
rhwdt = Namespace(options.local_base + "prop/direct/")
wdwd = Namespace(wikidata_base + "entity/")
rhwd = Namespace(options.local_base + "entity/")
wdref = Namespace(wikidata_base + "reference/")
rhref = Namespace(options.local_base + "reference/")
wds = Namespace(wikidata_base + "entity/statement/")
rhs = Namespace(options.local_base + "entity/statement/")
wdps = Namespace(wikidata_base + "prop/statement/")
rhps = Namespace(options.local_base + "prop/statement/")
wdpr = Namespace(wikidata_base + "prop/reference/")
rhr = Namespace(options.local_base + "prop/reference/")
wdpq = Namespace(wikidata_base + "prop/qualifier/")
rhq = Namespace(options.local_base + "prop/qualifier/")

# Remote namespaces
# (wikibase used to be assigned twice with the same value; once is enough.)
wikibase = Namespace("http://wikiba.se/ontology#")
schema = Namespace("http://schema.org/")
prov = Namespace("http://www.w3.org/ns/prov#")
# Graph that collects every generated triple. The prefixes bound here are the
# ones registered for the namespaces schema, SKOS and wikibase.
rhizomeGraph = Graph()
for _prefix, _namespace in (
    ("schema", schema),
    ("skos", SKOS),
    ("wikibase", wikibase),
):
    rhizomeGraph.bind(_prefix, _namespace)
# Database connection; all credentials are read from the environment
# variables wbdbhost, wbdbuser, wbdbpasswd and wbdbdb. A missing variable
# raises KeyError here.
_db_settings = {
    'host': environ['wbdbhost'],
    'user': environ['wbdbuser'],
    'password': environ['wbdbpasswd'],
    'db': environ['wbdbdb'],
}
# DictCursor is requested because later code reads rows by column name.
connection = pymysql.connect(
    cursorclass=pymysql.cursors.DictCursor,
    **_db_settings
)
# Read SQL queries. The paths are relative, so the files are looked up in the
# current working directory. `with` closes each file right after reading;
# previously both handles were left open.
with open('get-json.sql', 'r') as _sql_file:  # to get entities as json
    sql_getjson = _sql_file.read()
with open('property-info.sql', 'r') as _sql_file:  # to get type info on property
    sql_propinfo = _sql_file.read()

# Fetch the property-info rows in one go. The former try/finally wrapper only
# contained `pass`, so it has been dropped.
with connection.cursor() as cursor_properties:
    cursor_properties.execute(sql_propinfo)
    propinfo_results = cursor_properties.fetchall()
# Convert property info into dictionary, so the type of a property can be easily
# checked. Example:
# propinfo['P2'] -> 'url'
propinfo = {}
for pi in propinfo_results:
    if 'type' not in pi:
        # Abort with a non-zero status: a bare exit() reported success to the
        # calling shell even though the conversion could not proceed.
        sys.exit(
            'Error: Missing data type information for property {0} in database table wb_property_info.'.format(pi['id'])
        )
    propinfo[pi['id']] = pi['type']
def _mainsnak_object(datatype, datavalue):
    """Return the RDF object for a statement's main value.

    datatype is the property's type as recorded in propinfo; datavalue is the
    'datavalue' dict of the main snak. Returns None for datatypes this script
    does not translate. Raises KeyError if an expected key is missing.
    """
    if datatype == 'wikibase-item':
        return rhwd['Q' + str(datavalue['value']['numeric-id'])]
    # TODO: revisit commonsMedia (currently emitted as a plain literal, like 'string')
    if datatype in ('string', 'commonsMedia'):
        return Literal(datavalue['value'])
    if datatype == 'url':
        return URIRef(datavalue['value'])
    if datatype == 'time':
        return Literal(datavalue['value']['time'], datatype=XSD.dateTime)
    return None


def _reference_object(datavalue):
    """Return the RDF object for a reference snak value, or None if unsupported.

    NOTE(review): this dispatches on the datavalue's own 'type' field, whereas
    main snaks dispatch on the property datatype from propinfo. Confirm that
    'url' actually occurs as a datavalue type, otherwise URL references end up
    in the 'string' branch. Time values get no explicit datatype here, unlike
    main snaks.
    """
    kind = datavalue['type']
    if kind == 'url':
        return URIRef(datavalue['value'])
    if kind == 'string':
        return Literal(datavalue['value'])
    if kind == 'time':
        return Literal(datavalue['value']['time'])
    return None


def _add_statement_details(entity, claim_prop, statement, statement_node):
    """Add value, reference and qualifier triples for one statement.

    entity is the subject URI of the entity, claim_prop the property ID of the
    statement, statement the decoded statement dict and statement_node the URI
    minted for it. Raises KeyError when the statement lacks an expected key
    (for example a main snak without 'datavalue').
    """
    datavalue = statement['mainsnak']['datavalue']

    # matching local exactMatch with SKOS
    if claim_prop == options.exactMatch:
        rhizomeGraph.add((entity, SKOS.exactMatch, URIRef(datavalue['value'])))
        # output to validate that exactMatch is being applied
        print('{0} -> {1}'.format(entity, datavalue['value']))

    # regular statements: the same object is attached to the entity (direct
    # claim) and to the statement node
    value_term = _mainsnak_object(propinfo[claim_prop], datavalue)
    if value_term is not None:
        rhizomeGraph.add((entity, rhwdt[claim_prop], value_term))
        rhizomeGraph.add((statement_node, rhps[claim_prop], value_term))

    # references: each one gets a freshly minted random node
    for reference in statement.get('references', ()):
        reference_node = rhref[str(uuid.uuid4())]
        rhizomeGraph.add((statement_node, prov['wasDerivedFrom'], reference_node))
        for snak_property in reference['snaks']:
            for snak in reference['snaks'][snak_property]:
                reference_term = _reference_object(snak['datavalue'])
                if reference_term is not None:
                    rhizomeGraph.add((reference_node, rhr[snak['property']], reference_term))

    # qualifiers: only quantity values are translated
    for qualifier in statement.get('qualifiers', ()):
        for snak in statement['qualifiers'][qualifier]:
            if snak['datavalue']['type'] == 'quantity':
                rhizomeGraph.add((
                    statement_node,
                    rhq[snak['property']],
                    Literal(snak['datavalue']['value']['amount'])
                ))


# Walk every entity row and translate it into triples. The rows are consumed
# while the cursor is still open instead of after its `with` block has ended.
with connection.cursor() as cursor_entities:
    cursor_entities.execute(sql_getjson)
    for row in cursor_entities:
        j = json.loads(row['json_text'])
        j['modified'] = row['modified']  # kept from the original; not read below
        entity_id = j['id']  # P or Q identifier
        entity = rhwd[entity_id]

        # Guarded like the description, so an entity without an English label
        # is skipped here instead of aborting the whole run.
        if 'en' in j['labels']:
            rhizomeGraph.add((entity, RDFS.label, Literal(j['labels']['en']['value'], lang='en')))
        if 'en' in j['descriptions']:
            rhizomeGraph.add((entity, schema.description, Literal(j['descriptions']['en']['value'], lang='en')))

        if entity_id[0] == 'P':
            rhizomeGraph.add((entity, RDF.type, wikibase.Property))
            rhizomeGraph.add((entity, wikibase.directClaim, rhwdt[entity_id]))
            rhizomeGraph.add((entity, wikibase.claim, rhp[entity_id]))

        # walk all claims
        # Keys are iterated (not .items()) as in the original; presumably an
        # entity without claims may be stored as an empty list -- TODO confirm.
        for claim_prop in j['claims']:
            if claim_prop not in propinfo:
                sys.exit('Property {0} not available in database table wb_property_info.'.format(claim_prop))

            # walk all statements
            # Each statement is visited exactly once. The former extra outer
            # loop over the same list repeated all of this once per statement,
            # which minted duplicate random reference nodes.
            for statement in j['claims'][claim_prop]:
                statement_node = rhs[statement['id']]  # extract unique ID for statement node
                rhizomeGraph.add((entity, rhp[claim_prop], statement_node))
                try:
                    _add_statement_details(entity, claim_prop, statement, statement_node)
                except KeyError as e:
                    # Dump everything needed to locate the offending statement,
                    # then stop with a failure status.
                    traceback.print_exc()
                    print('KeyError', e, file=sys.stderr)
                    print('ID: {id}, CLAIM: {claim_prop}'.format(id=entity_id, claim_prop=claim_prop), file=sys.stderr)
                    pprint(statement, stream=sys.stderr)
                    sys.exit(1)
# The database is no longer needed once all rows have been translated.
connection.close()

# Export final results: write the collected graph as Turtle to the file named
# on the command line.
rhizomeGraph.serialize(
    format='turtle',
    destination=options.outfile,
)