diff --git a/lib/util.py b/lib/util.py
index 86def97..519141e 100644
--- a/lib/util.py
+++ b/lib/util.py
@@ -49,7 +49,8 @@ def sink(accumulator):
         lastmod = next(select_name(e, 'lastmod'))
         s = liblink_site()
         s.sitemap = loc.xml_value
-        s.base_url, _, tail = s.sitemap.partition('harvest/sitemap.xml')
+        s.url, _, tail = s.sitemap.partition('harvest/sitemap.xml')
+        s.base_url = s.url #Legacy property name
         #Early warning for funky URLs breaking stuff downstream
         assert not tail
         protocol, s.host, path, query, fragment = iri.split_uri_ref(s.sitemap)
@@ -85,6 +86,73 @@ def prep_site_model(site):
     return model, sitetext
 
 
+def rdf_from_site(site, rules=None):
+    '''
+    >>> from librarylink.util import rdf_from_site
+    >>> g = rdf_from_site('http://link.denverlibrary.org')
+    >>> s = g.serialize(format='json-ld', indent=2)
+    >>> with open('denverlibrary.ld.json', 'wb') as fp: fp.write(s)
+
+    >>> rules = {'ignore-predicates': ['http://bibfra.me/', 'http://library.link/'], 'rename-predicates': {'http://library.link/vocab/branchOf': 'http://schema.org/branch'}}
+    >>> g = rdf_from_site('http://link.denverlibrary.org', rules=rules)
+    >>> s = g.serialize(format='json-ld', indent=2)
+    >>> with open('denverlibrary.ld.json', 'wb') as fp: fp.write(s)
+    '''
+    from rdflib import ConjunctiveGraph, URIRef, Literal, RDF, RDFS
+    from versa.writer.rdf import mock_bnode, prep, RDF_TYPE
+    #Also requires: pip install rdflib-jsonld
+    rules = rules or {}
+    ignore_pred = rules.get('ignore-predicates', set())
+    rename_pred = rules.get('rename-predicates', {})
+    model, sitetext = prep_site_model(site)
+    g = ConjunctiveGraph()
+    #Hoover up everything with a type
+    for o, r, t, a in model.match():
+        for oldp, newp in rename_pred.items():
+            if r == oldp: r = newp
+        for igp in ignore_pred:
+            if r.startswith(igp):
+                break
+        else:
+            g.add(prep(o, r, t))
+    return g
+
+
+def jsonize_site(site, rules=None):
+    '''
+    >>> from librarylink.util import jsonize_site
+    >>> obj = jsonize_site('http://link.denverlibrary.org')
+    >>> with open('denverlibrary.ld.json', 'w') as fp: json.dump(obj, fp, indent=2)
+
+    >>> rules = {'ignore-predicates': ['http://bibfra.me/', 'http://library.link/'], 'rename-predicates': {'http://library.link/vocab/branchOf': 'http://schema.org/branch'}}
+    >>> obj = jsonize_site('http://link.denverlibrary.org', rules=rules)
+    >>> with open('denverlibrary.ld.json', 'w') as fp: json.dump(obj, fp, indent=2)
+    '''
+    from versa.util import uniquify
+    from versa.writer import jsonld
+    rules = rules or {}
+    ignore_pred = rules.get('ignore-predicates', set())
+    rename_pred = rules.get('rename-predicates', {})
+    ignore_oftypes = rules.get('ignore-oftypes', {})
+    context = rules.get('context', {})
+    pre_model, _ = prep_site_model(site)
+    if not pre_model:
+        return None
+    uniquify(pre_model)
+    post_model = memory.connection()
+    for o, r, t, a in pre_model.match():
+        #print(o, r, t)
+        for oldp, newp in rename_pred.items():
+            if r == oldp: r = newp
+        for igp in ignore_pred:
+            if r.startswith(igp):
+                break
+        else:
+            post_model.add(o, r, t, a)
+    obj = jsonld.bind(post_model, context=context, ignore_oftypes=ignore_oftypes)
+    return obj
+
+
 def get_orgname(site, reuse=None):
     '''
     Given a site URL return the org's name
diff --git a/requirements.txt b/requirements.txt
index efe5f81..44cc734 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ amara3-xml
 
 #Optional bits
 #CacheControl[filecache]
+#rdflib-jsonld
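Both new functions share the same rules idiom: rename a predicate on exact match first, then drop any link whose (possibly renamed) predicate starts with an ignore prefix, via Python's for/else. Because the rename runs before the ignore check, a renamed predicate can escape an ignore prefix, which is how the docstring example keeps branchOf while ignoring everything else under http://library.link/. A minimal standalone sketch of just that filtering step; apply_rules and the link tuples are illustrative stand-ins, not part of the patch:

    def apply_rules(links, rules=None):
        #Sketch only: mirrors the rename-then-ignore loop in the patch
        rules = rules or {}
        ignore_pred = rules.get('ignore-predicates', [])
        rename_pred = rules.get('rename-predicates', {})
        kept = []
        for origin, rel, target, attrs in links:
            rel = rename_pred.get(rel, rel) #Rename first, as in the patch
            for igp in ignore_pred:
                if rel.startswith(igp):
                    break #Predicate matches an ignore prefix: drop the link
            else:
                kept.append((origin, rel, target, attrs))
        return kept

    rules = {
        'ignore-predicates': ['http://bibfra.me/', 'http://library.link/'],
        'rename-predicates': {'http://library.link/vocab/branchOf': 'http://schema.org/branch'},
    }
    links = [
        ('http://example.org/b1', 'http://library.link/vocab/branchOf', 'http://example.org/main', {}),
        ('http://example.org/b1', 'http://bibfra.me/vocab/lite/name', 'Branch One', {}),
    ]
    #branchOf is renamed out of the ignored namespace and survives; name is dropped
    assert apply_rules(links, rules) == [
        ('http://example.org/b1', 'http://schema.org/branch', 'http://example.org/main', {}),
    ]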
diff --git a/test/resource/csvdedup.py b/test/resource/csvdedup.py
index 9281225..11d68a1 100644
--- a/test/resource/csvdedup.py
+++ b/test/resource/csvdedup.py
@@ -3,6 +3,8 @@
 import argparse
 import csv
 
+HASH_WIDTH = 2
+
 def merge_rows(row1, row2):
     assert row1[0] == row2[0]
     assert len(row1) == len(row2)
@@ -32,7 +34,7 @@ def run(source, dest):
             resource_rows[row[0]] = row
 
     for resid, inrow in resource_rows.items():
-        resstem = resid[:3]
+        resstem = resid[:HASH_WIDTH]
         resstem_fpath = os.path.join(dest, resstem + '.csv')
         resstem_fpath_exists = os.path.exists(resstem_fpath)
         #Read then write back out
diff --git a/test/resource/csvexport.py b/test/resource/csvexport.py
index 18f9166..0466bf1 100755
--- a/test/resource/csvexport.py
+++ b/test/resource/csvexport.py
@@ -31,7 +31,7 @@
 from librarylink.crawler.framework import crawltask, base_sink, links_from_html, LIBRARY_LINK_HEADER
 
 SCHEMAORG = 'http://schema.org/'
-
+HASH_WIDTH = 2
 
 class csvexport_sink(base_sink):
     @staticmethod
@@ -107,7 +107,7 @@ async def send(self, data):
         model = memory.connection()
         rdfalite.toversa(body, model, respurl)
         #Lock the file for
-        resstem = resid[:3]
+        resstem = resid[:HASH_WIDTH]
         csvexport_sink.locks.setdefault(resstem, Lock())
         #csvexport_sink.logger.debug('Awaiting lock on {}; TASK [{}].'.format(resstem, task_id))
         print('Awaiting lock on {}; TASK [{}].'.format(resstem, task_id), file=sys.stderr)
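Both scripts now derive resstem from the first HASH_WIDTH characters of the resource ID, so the two HASH_WIDTH constants presumably must agree for csvdedup.py to find the shard files csvexport.py wrote. A quick sketch of the sharding effect; shard_path is illustrative, and the hex-ID alphabet is an assumption the scripts do not state:

    import os

    HASH_WIDTH = 2 #Assumed to match the constant in csvdedup.py and csvexport.py

    def shard_path(dest, resid):
        #Mirror of the resstem logic: one CSV per HASH_WIDTH-character ID prefix
        return os.path.join(dest, resid[:HASH_WIDTH] + '.csv')

    #If resource IDs are hex strings, 2 characters cap the shard count at
    #16**2 == 256 files, down from 16**3 == 4096 with the old resid[:3]
    print(shard_path('out', '3f9c2a17')) #-> out/3f.csv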