-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetcher.py
30 lines (26 loc) · 1.05 KB
/
fetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import json
import urllib.request
import codecs
from retriever import Retriever
import aiohttp
import sys
def stream_pages(file):
    """Yield ``(endpoint, page)`` pairs for every result page of every endpoint.

    *file* is the path of a text file holding one record per line: an
    endpoint identifier and a page count, separated by a single tab.  For
    each record the URLs of pages ``1..pagecount`` of that endpoint's
    ``owl:sameAs`` listing on ldf.lodlaundromat.org are built and handed to
    ``Retriever`` together with ``[endpoint]``.  Every
    ``(endpoint, url, page)`` item ``Retriever`` produces is re-emitted as
    ``(endpoint, page)``; the URL is dropped.

    Blank lines are skipped.  A non-blank line that does not consist of
    exactly two tab-separated fields, or whose page count is not an
    integer, raises ``ValueError``.
    """
    with open(file) as endpoints:
        for line in endpoints:
            if not line.strip():
                # A blank line carries no record; unpacking it below would
                # raise ValueError and abort the whole run.
                continue
            endpoint, pagecount = line.split('\t')
            url = "http://ldf.lodlaundromat.org/"+endpoint+"?predicate=http%3A%2F%2Fwww.w3.org%2F2002%2F07%2Fowl%23sameAs&page="
            # int() tolerates the trailing newline left on the last field.
            pages = [url + str(page) for page in range(1, 1 + int(pagecount))]
            # Distinct names here so the outer `endpoint`/`url` are not
            # silently rebound by the retrieval loop.
            # NOTE(review): the first item is whatever Retriever reports as
            # the endpoint (presumably the [endpoint] list passed in) -
            # confirm against Retriever.
            for retrieved_endpoint, _page_url, page in Retriever(pages, [endpoint]):
                yield retrieved_endpoint, page
def stream_quads(pages):
    """Turn retrieved pages into ``owl:sameAs`` quads labelled with their endpoint.

    *pages* is an iterable of ``(endpoint, text)`` pairs.  ``endpoint[0]`` is
    used as the endpoint identifier (presumably ``endpoint`` is a
    one-element list - confirm against the producer) and ``text`` is a
    newline-separated block of statements.

    Each statement made of exactly three space-separated terms whose
    predicate is ``<http://www.w3.org/2002/07/owl#sameAs>`` is yielded as
    one line of the form ``s p o <http://lodlaundromat.org/resource/ID>.``
    followed by a newline.  All other lines are ignored.

    The statement terminator is handled explicitly: trailing whitespace
    (including the ``\\r`` of CRLF line endings) is dropped, and a final
    ``.`` is removed whether it is attached to the object (``o.``) or
    separated from it (``o .``).  A statement without a terminator is kept
    intact rather than losing its last character.
    """
    same_as = "<http://www.w3.org/2002/07/owl#sameAs>"
    for page in pages:
        graph = " <http://lodlaundromat.org/resource/" + page[0][0] + ">."
        for statement in page[1].split('\n'):
            # Normalise the tail first so tokenising sees only the three
            # terms, never a stray "." or "\r".
            body = statement.rstrip()
            if body.endswith('.'):
                body = body[:-1].rstrip()
            triple = body.split(' ')
            if len(triple) == 3 and triple[1] == same_as:
                yield body + graph + "\n"
if __name__ == '__main__':
    # Read the endpoint list, fetch every owl:sameAs page and write the
    # resulting quads to identity.nq, one per line.
    pages = stream_pages("endpoints.txt")
    # N-Quads documents are UTF-8 by specification; relying on the platform
    # default encoding can raise UnicodeEncodeError on non-ASCII IRIs.
    with open("identity.nq", "w", encoding="utf-8") as f:
        # Each quad already ends with "\n"; quads are consumed lazily.
        f.writelines(stream_quads(pages))