Implement librarylink.util.rdf_from_site() and librarylink.util.jsonize_site()
uogbuji committed Apr 18, 2018
1 parent a739f67 commit 2abe76b
Showing 4 changed files with 75 additions and 4 deletions.
lib/util.py: 69 additions & 1 deletion
@@ -49,7 +49,8 @@ def sink(accumulator):
lastmod = next(select_name(e, 'lastmod'))
s = liblink_site()
s.sitemap = loc.xml_value
-s.base_url, _, tail = s.sitemap.partition('harvest/sitemap.xml')
+s.url, _, tail = s.sitemap.partition('harvest/sitemap.xml')
+s.base_url = s.url #Legacy property name
#Early warning for funky URLs breaking stuff downstream
assert not tail
protocol, s.host, path, query, fragment = iri.split_uri_ref(s.sitemap)
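
A note on the partition() idiom above: str.partition splits around the first occurrence of the separator, so tail is non-empty only if something follows 'harvest/sitemap.xml' in the URL, which is exactly what the assert guards against. A quick self-contained check, using a representative Library.Link site URL purely for illustration:

url = 'http://link.denverlibrary.org/harvest/sitemap.xml'
base, _, tail = url.partition('harvest/sitemap.xml')
assert base == 'http://link.denverlibrary.org/'
assert tail == '' #Anything trailing the sitemap path would land here
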
@@ -85,6 +86,73 @@ def prep_site_model(site):
return model, sitetext


+def rdf_from_site(site, rules=None):
+    '''
+    >>> from librarylink.util import rdf_from_site
+    >>> g = rdf_from_site('http://link.denverlibrary.org')
+    >>> s = g.serialize(format='json-ld', indent=2)
+    >>> with open('denverlibrary.ld.json', 'wb') as fp: fp.write(s)
+    >>> rules = {'ignore-predicates': ['http://bibfra.me/', 'http://library.link/'], 'rename-predicates': {'http://library.link/vocab/branchOf': 'http://schema.org/branch'}}
+    >>> g = rdf_from_site('http://link.denverlibrary.org', rules=rules)
+    >>> s = g.serialize(format='json-ld', indent=2)
+    >>> with open('denverlibrary.ld.json', 'wb') as fp: fp.write(s)
+    '''
+    from rdflib import ConjunctiveGraph, URIRef, Literal, RDF, RDFS
+    from versa.writer.rdf import mock_bnode, prep, RDF_TYPE
+    #Also requires: pip install rdflib-jsonld
+    rules = rules or {}
+    ignore_pred = rules.get('ignore-predicates', set())
+    rename_pred = rules.get('rename-predicates', {})
+    model, sitetext = prep_site_model(site)
+    g = ConjunctiveGraph()
+    #Hoover up all statements, renaming or dropping predicates per the rules
+    for o, r, t, a in model.match():
+        for oldp, newp in rename_pred.items():
+            if r == oldp: r = newp
+        for igp in ignore_pred:
+            if r.startswith(igp):
+                break
+        else:
+            g.add(prep(o, r, t))
+    return g
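
Both new functions apply the ignore rules with Python's for/else: the else suite runs only when the loop over prefixes finishes without hitting break. A minimal, self-contained sketch of the same idiom (the sample statements are made up for illustration):

ignore_pred = {'http://bibfra.me/', 'http://library.link/'}
statements = [
    ('http://example.org/a', 'http://schema.org/name', 'Denver'),
    ('http://example.org/a', 'http://bibfra.me/vocab/lite/label', 'dropped'),
]
kept = []
for o, r, t in statements:
    for igp in ignore_pred:
        if r.startswith(igp):
            break #Predicate matches an ignored prefix; skip this statement
    else:
        kept.append((o, r, t)) #No break occurred: predicate survived the filter
assert kept == [('http://example.org/a', 'http://schema.org/name', 'Denver')]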


+def jsonize_site(site, rules=None):
+    '''
+    >>> import json
+    >>> from librarylink.util import jsonize_site
+    >>> obj = jsonize_site('http://link.denverlibrary.org')
+    >>> with open('denverlibrary.ld.json', 'w') as fp: json.dump(obj, fp, indent=2)
+    >>> rules = {'ignore-predicates': ['http://bibfra.me/', 'http://library.link/'], 'rename-predicates': {'http://library.link/vocab/branchOf': 'http://schema.org/branch'}}
+    >>> obj = jsonize_site('http://link.denverlibrary.org', rules=rules)
+    >>> with open('denverlibrary.ld.json', 'w') as fp: json.dump(obj, fp, indent=2)
+    '''
+    from versa.util import uniquify
+    from versa.writer import jsonld
+    rules = rules or {}
+    ignore_pred = rules.get('ignore-predicates', set())
+    rename_pred = rules.get('rename-predicates', {})
+    ignore_oftypes = rules.get('ignore-oftypes', {})
+    context = rules.get('context', {})
+    pre_model, _ = prep_site_model(site)
+    if not pre_model:
+        return None
+    uniquify(pre_model)
+    post_model = memory.connection()
+    for o, r, t, a in pre_model.match():
+        #print(o, r, t)
+        for oldp, newp in rename_pred.items():
+            if r == oldp: r = newp
+        for igp in ignore_pred:
+            if r.startswith(igp):
+                break
+        else:
+            post_model.add(o, r, t, a)
+    obj = jsonld.bind(post_model, context=context, ignore_oftypes=ignore_oftypes)
+    return obj
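
Beyond the two rule keys shown in the doctests, jsonize_site() also reads 'ignore-oftypes' and 'context'. A hedged usage sketch; the values given for those two keys are assumptions about their shape, not values taken from this commit:

import json
from librarylink.util import jsonize_site

rules = {
    'ignore-predicates': ['http://bibfra.me/', 'http://library.link/'],
    'rename-predicates': {
        'http://library.link/vocab/branchOf': 'http://schema.org/branch',
    },
    #Assumed shape: types whose resources should be omitted from the output
    'ignore-oftypes': [],
    #Assumed shape: a JSON-LD context passed through to jsonld.bind()
    'context': {'@vocab': 'http://schema.org/'},
}
obj = jsonize_site('http://link.denverlibrary.org', rules=rules)
if obj is not None: #jsonize_site returns None if no site model could be built
    with open('denverlibrary.ld.json', 'w') as fp:
        json.dump(obj, fp, indent=2)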


def get_orgname(site, reuse=None):
'''
Given a site URL return the org's name
requirements.txt: 1 addition & 0 deletions
@@ -11,3 +11,4 @@ amara3-xml

#Optional bits
#CacheControl[filecache]
+#rdflib-jsonld
test/resource/csvdedup.py: 3 additions & 1 deletion
@@ -3,6 +3,8 @@
import argparse
import csv

+HASH_WIDTH = 2

def merge_rows(row1, row2):
assert row1[0] == row2[0]
assert len(row1) == len(row2)
@@ -32,7 +34,7 @@ def run(source, dest):
resource_rows[row[0]] = row

for resid, inrow in resource_rows.items():
-resstem = resid[:3]
+resstem = resid[:HASH_WIDTH]
resstem_fpath = os.path.join(dest, resstem + '.csv')
resstem_fpath_exists = os.path.exists(resstem_fpath)
#Read then write back out
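
HASH_WIDTH sets how many leading characters of the resource ID pick the shard CSV a row lands in; a width of 2 over hex-style IDs yields up to 256 bucket files. A minimal sketch of the same bucketing (the path and resource ID are hypothetical):

import os

HASH_WIDTH = 2

def shard_path(dest, resid):
    #Mirror resstem = resid[:HASH_WIDTH] above: bucket rows by ID prefix
    return os.path.join(dest, resid[:HASH_WIDTH] + '.csv')

print(shard_path('out', '3f9a0cdeadbeef')) #-> out/3f.csv
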
test/resource/csvexport.py: 2 additions & 2 deletions
@@ -31,7 +31,7 @@
from librarylink.crawler.framework import crawltask, base_sink, links_from_html, LIBRARY_LINK_HEADER

SCHEMAORG = 'http://schema.org/'

+HASH_WIDTH = 2

class csvexport_sink(base_sink):
@staticmethod
@@ -107,7 +107,7 @@ async def send(self, data):
model = memory.connection()
rdfalite.toversa(body, model, respurl)
#Lock the file for this resource stem before reading/writing it
-resstem = resid[:3]
+resstem = resid[:HASH_WIDTH]
csvexport_sink.locks.setdefault(resstem, Lock())
#csvexport_sink.logger.debug('Awaiting lock on {}; TASK [{}].'.format(resstem, task_id))
print('Awaiting lock on {}; TASK [{}].'.format(resstem, task_id), file=sys.stderr)
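
The setdefault(resstem, Lock()) line above gives each shard stem its own lock, so concurrent tasks serialize per bucket file instead of contending on one global lock. A minimal asyncio sketch of the same pattern; the function name and file layout are illustrative, not the crawler's actual API:

import asyncio, os

HASH_WIDTH = 2
locks = {} #One asyncio.Lock per shard stem, created on first use

async def write_shard(resid, line, dest='out'):
    resstem = resid[:HASH_WIDTH]
    lock = locks.setdefault(resstem, asyncio.Lock())
    async with lock: #Only one task may touch this stem's CSV at a time
        os.makedirs(dest, exist_ok=True)
        with open(os.path.join(dest, resstem + '.csv'), 'a') as fp:
            fp.write(line + '\n')

asyncio.run(write_shard('3f9a0cdeadbeef', 'id,title,example'))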
