#!/usr/bin/env python
"""
Add StackScore usage data as annotations.
Timing:
10s to read 2.3M stackscores into dict
150min to parse 2M records from Cornell LD4L RDF and write annotations
=> expect 600min = 10h to write annotations for 8M records
"""
import codecs
import glob
import gzip
import logging
import optparse
import os.path
import re
import time

from rdflib import Graph, URIRef, Literal
from rdflib.namespace import Namespace, NamespaceManager, RDF
from rdflib.plugins.parsers.ntriples import NTriplesParser, ParseError

def split_multiext(filename, max_ext=2):
    """Wrapper around os.path.splitext to remove potentially multiple extensions."""
    all_ext = ''
    n = 0
    while (n < max_ext):
        n += 1
        (filename, ext) = os.path.splitext(filename)
        if (ext):
            all_ext = ext + all_ext
        else:
            break
    return(filename, all_ext)
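
# For example, split_multiext('bib-1.nt.gz') returns ('bib-1', '.nt.gz');
# the filename is hypothetical, but this is how process_file() below derives
# its "-ss-anno.nt.gz" output names from input names.
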
def read_stackscores(filename):
    """Read StackScores from filename.

    Each line is simply bibid and stackscore, whitespace separated;
    lines whose first non-blank character is # are comments.
    """
    logging.info("Reading StackScores from %s..." % (filename))
    scores = {}
    # Open in text mode ('rt') so lines are str, not bytes
    with gzip.open(filename, 'rt') as fh:
        for line in fh:
            if (re.match(r'''\s*#''', line)):
                continue
            (bibid, score) = line.split()
            scores[int(bibid)] = int(score)
    logging.info("Read %d StackScores" % (len(scores)))
    return scores
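
# Example stackscores file content (values invented for illustration):
#   # bibid stackscore
#   12345 7
#   67890 3
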
def bind_namespace(ns_mgr, prefix, namespace):
"""Bind prefix to namespace in the NamespaceManager ns_mgr."""
ns = Namespace(namespace)
ns_mgr.bind(prefix, ns, override=False)
return ns
cornell_prefix = 'http://draft.ld4l.org/cornell'
namespace_manager = NamespaceManager(Graph())
cnt = bind_namespace(namespace_manager, 'cnt', 'http://www.w3.org/2011/content#')
oa = bind_namespace(namespace_manager, 'oa', 'http://www.w3.org/ns/oa#')
ld4l = bind_namespace(namespace_manager, 'ld4l', 'http://bib.ld4l.org/ontology/')
rdf = bind_namespace(namespace_manager, 'rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#')
# Build the URIRefs we'll need ahead of time, for speed...
oa_hasTarget = oa['hasTarget']
oa_Annotation = oa['Annotation']
oa_hasBody = oa['hasBody']
oa_motivatedBy = oa['motivatedBy']
ld4l_Instance = ld4l['Instance']
ld4l_identifiedBy = ld4l['identifiedBy']
ld4l_LocalIlsIdentifier = ld4l['LocalIlsIdentifier']
ld4l_hasAnnotation = ld4l['hasAnnotation']
ld4l_stackViewScoring = ld4l['stackViewScoring']
cnt_ContentAsText = cnt['ContentAsText']
cnt_chars = cnt['chars']
rdf_type = rdf['type']
rdf_value = rdf['value']
def add_score(g, instance, score):
"""Add to graph g the StackScore score as annotation on instance."""
istr = str(instance) # base ss-anno and ss-body URIs in instance URI
annotation = URIRef('%s-ss-anno' % (istr))
body = URIRef('%s-ss-body' % (istr))
score = Literal(str(score))
g.add( (instance, ld4l_hasAnnotation, annotation) )
g.add( (annotation, oa_hasTarget, instance) )
g.add( (annotation, RDF.type, oa_Annotation) )
g.add( (annotation, oa_hasBody, body) )
g.add( (annotation, oa_motivatedBy, ld4l_stackViewScoring) )
g.add( (body, RDF.type, cnt_ContentAsText) )
g.add( (body, cnt_chars, score) )
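
# For an instance URI X, add_score() produces this pattern (Turtle-style sketch):
#   <X> ld4l:hasAnnotation <X-ss-anno> .
#   <X-ss-anno> a oa:Annotation ;
#     oa:hasTarget <X> ;
#     oa:hasBody <X-ss-body> ;
#     oa:motivatedBy ld4l:stackViewScoring .
#   <X-ss-body> a cnt:ContentAsText ;
#     cnt:chars "<score>" .
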
class StoreSink(object):
    """Trivial triple sink that stores one triple."""

    def __init__(self):
        """Initialize with empty store."""
        self.triple()

    def triple(self, s=None, p=None, o=None):
        """Store new triple."""
        self.s = s
        self.p = p
        self.o = o

    def last(self):
        """Return the last triple, raising ValueError if there is none or it has already been read."""
        if (self.s is not None):
            s = self.s
            self.s = None
            return(s, self.p, self.o)
        else:
            raise ValueError()

class NTriplesStreamer(NTriplesParser):
    """Modification of NTriplesParser to provide an iterator over triples."""

    def __init__(self):
        """Initialize with an empty sink that we will use to yield the last triple."""
        self.sink = StoreSink()

    def open(self, filename):
        """Open filename, handling plain or gzipped files based on the extension."""
        if (filename.endswith('.gz')):
            self.file = gzip.open(filename, 'rb')
        else:
            self.file = open(filename, 'rb')
        # N-Triples 1.1 files can and should be UTF-8 encoded, so wrap
        # the binary stream in a decoder
        self.file = codecs.getreader('utf-8')(self.file)

    def parse_generator(self, filename):
        """Parse filename as an N-Triples file, yielding triples.

        Modified version of rdflib.plugins.parsers.ntriples.NTriplesParser.parse(...)
        that works as a generator.
        """
        self.open(filename)
        self.buffer = ''
        self.bad_lines = 0
        while True:
            self.line = self.readline()
            if self.line is None:
                break
            try:
                self.parseline()
                yield(self.sink.last())
            except ParseError:
                # Count bad lines rather than aborting on the first one
                self.bad_lines += 1
            except ValueError:
                # no new triple stored by the sink, just keep going
                pass
        if (self.bad_lines):
            logging.warning("Ignored %d bad lines" % (self.bad_lines))
def process_file(bib_file, namespace_manager):
    """Process one file producing one annotation file.

    In each file look for Cornell ld4l:Instances to annotate with StackScores
    based on extracting the bibid from ld4l:LocalIlsIdentifier triples. Data
    pattern is:

        instance? rdf:type ld4l:Instance .
        instance? ld4l:identifiedBy ils_id? .
        ils_id? rdf:type ld4l:LocalIlsIdentifier .
        ils_id? rdf:value literal_value? .

    For each input file, create an output file in the local directory, with a
    name similar to the input file's, containing the annotations.
    """
    # Start a new graph for this file
g = Graph()
g.namespace_manager = namespace_manager
nts = NTriplesStreamer()
    # Work out output file name
    ss_anno_file = split_multiext(os.path.basename(bib_file))[0] + "-ss-anno.nt.gz"
    # Text mode: g.serialize() returns a string in recent rdflib
    ss_anno_fh = gzip.open(ss_anno_file, 'wt')
    logging.info("Parsing %s, writing %s" % (bib_file, ss_anno_file))
    # Read the file, pulling out the four types of triple we need and
    # stashing the results in in-memory data structures:
    instances = set()  # URIs of ld4l:Instance resources
    id_by = {}         # instance URI -> list of identifier URIs
    ils_ids = set()    # URIs of ld4l:LocalIlsIdentifier resources
    rdf_val = {}       # identifier URI -> list of rdf:value literals
n = 0
for (s,p,o) in nts.parse_generator(bib_file):
n += 1
        try:
            if (p == rdf_type):
                if (o == ld4l_Instance):
                    # instance? rdf:type ld4l:Instance .
                    instances.add(str(s))
                elif (o == ld4l_LocalIlsIdentifier):
                    # ils_id? rdf:type ld4l:LocalIlsIdentifier .
                    ils_ids.add(str(s))
elif (p == ld4l_identifiedBy):
# instance? ld4l:identifiedBy ils_id? .
ss = str(s)
if (ss not in id_by):
id_by[ss] = []
id_by[ss].append(str(o))
elif (p == rdf_value):
# ils_id? rdf:value literal_value? . --- ASSUMING UNIQUE BY ILS_ID
ss = str(s)
if (ss not in rdf_val):
rdf_val[ss] = []
rdf_val[ss].append(str(o))
        except Exception as e:
            logging.warning("%s - skipping triple (%r,%r,%r)", str(e), s, p, o)
logging.info("-- read %d triples, extracted %d instances, %d ils_ids" % (n, len(instances), len(ils_ids)))
# Go through all instances seeing whether we find a bibid
n = 0
for instance in instances:
try:
            by_id = None
            if (instance not in id_by):
                continue # an instance with no ld4l:identifiedBy
for by in id_by[instance]:
if by in ils_ids:
by_id = by
break
if (by_id is None or by_id not in rdf_val):
continue # Non-ILS id, skip silently
bibids = rdf_val[by_id]
            if (len(bibids) != 1):
                raise Exception("Expected one bibid for ILS id %s, got %d" % (by_id, len(bibids)))
            # Score defaults to 1 if no value is stored for this bibid
            add_score(g, URIRef(instance), scores.get(int(bibids[0]), 1))
n += 1
        except Exception as e:
            logging.warning("%r - skipping instance %r", e, instance)
    # Write out the annotation graph
    try:
        ss_anno_fh.write(g.serialize(format='nt'))
        ss_anno_fh.close()
        logging.info("-- wrote %d scores" % (n))
    except Exception as e:
        logging.warning("Writing %s failed: %s", ss_anno_file, str(e))
return(n)
p = optparse.OptionParser(description='Stackscore RDF generation for LD4L',
                          usage="%prog [options] [input-files.nt.gz ...]")
p.add_option('--stackscores', action='store', default='stackscores.dat.gz',
help="Input file of stackscores, format is 'bibid stackscore', "
"one per line. Bibids without an entry will get an annotation "
"of stackscore 1.")
p.add_option('--logfile', action='store', default=None,
help="Write logging output to file instead of STDOUT")
(opts, bib_files) = p.parse_args()
extra = {'filename': opts.logfile } if opts.logfile else {}
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%dT%H:%M:%S', level=logging.INFO, **extra)
scores = read_stackscores(opts.stackscores)
# Iterate over bib_files, treating each one separately because we know
# that they contain complete LD4L models for a number of MARC records.
start_time = time.time()
records = 0
for bib_glob in bib_files:
for bib_file in glob.glob(bib_glob):
records += process_file(bib_file, namespace_manager)
elapsed = (time.time() - start_time)
logging.info("-- %.1fs elapsed, %d records, overall rate %.2f records/s" % (elapsed, records, records / elapsed))
logging.info("Done")