forked from karpathy/nipspreview
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathscrape_cvpr13.py
75 lines (56 loc) · 2 KB
/
scrape_cvpr13.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Scrape the CVPR PDF files for author names and titles
# and create a database of all papers. This is necessary because
# extracting the authors and titles from PDFs directly is tricky.
import pyPdf
from pyPdf import PdfFileReader
import os
import cPickle as pickle
import random
# Relative path of the directory that is listed for paper PDFs.
papers_dir = 'papers/'
class Paper(object):
    """Plain record holding the metadata of one paper.

    Declared as a new-style class (explicit ``object`` base) so it behaves
    the same under Python 2 and Python 3.

    Attributes:
        paper: the id of the paper (the value passed as ``id_``).
        title: the title of the paper.
        authors: the author list of the paper.
        filename: the file name passed in by the caller.
        keywords: the keywords passed in by the caller.
    """

    def __init__(self, id_=None, title=None, authors=None, filename=None, keywords=None):
        """Store the given values unchanged; every field defaults to None."""
        self.paper = id_  # the id of the paper
        self.title = title  # the title of the paper
        self.authors = authors  # the author list of the paper
        self.filename = filename
        self.keywords = keywords

    def __repr__(self):
        """Return a constructor-like representation for debugging."""
        return "Paper(id_=%r, title=%r, authors=%r, filename=%r, keywords=%r)" % (
            self.paper, self.title, self.authors, self.filename, self.keywords)
def get_text(page):
    """Return the text shown on a pyPdf page as a single unicode string.

    Walks the operations of the page's content stream and collects the
    string operands of the text-showing operators "Tj" and "TJ".  A space
    is emitted for every negative ``NumberObject`` inside a "TJ" array and
    for every "Tf" operator.  Afterwards the curly double quotes
    (U+201C / U+201D) are replaced by spaces and the "fi" ligature
    (U+FB01) by the two letters "fi".

    Args:
        page: a pyPdf page object; it is indexed with '/Contents' and its
            ``pdf`` attribute is passed to ``ContentStream``.

    Returns:
        The collected text as a unicode string.

    Raises:
        KeyError: if the page has no '/Contents' entry.
    """
    content = page['/Contents'].getObject()
    content = pyPdf.pdf.ContentStream(content, page.pdf)

    pieces = []
    for operands, operator in content.operations:
        if operator in ("TJ", "Tj"):
            if not operands:
                continue
            shown = operands[0]
            # "TJ" carries an array of strings and numbers, "Tj" a single
            # string.  Wrap the single string so it is handled as a whole;
            # iterating over it would yield plain characters that fail the
            # TextStringObject check below and the text would be lost.
            if not isinstance(shown, list):
                shown = [shown]
            for item in shown:
                if isinstance(item, pyPdf.generic.TextStringObject):
                    pieces.append(item)
                elif isinstance(item, pyPdf.generic.NumberObject) and item < 0:
                    pieces.append(u" ")
        elif operator == "Tf":
            pieces.append(u" ")

    text = u"".join(pieces)
    # Replace typographic double quotes with spaces.
    for quote in (u"\u201d", u"\u201c"):
        text = text.replace(quote, u" ")
    # Expand the "fi" ligature into its two letters.
    text = text.replace(u"\ufb01", u"fi")
    return text
# Collect the PDFs to index.  Match on the full ".pdf" extension so that a
# name that merely ends in the letters "pdf" is not handed to the PDF reader.
files = os.listdir(papers_dir)
files = [x for x in files if x.endswith('.pdf')]

# Unused by this script; kept so the module still defines the same names.
titles = []
authors = []
keywords = []
all_papers = []


def _meta_field(info, key):
    """Return metadata entry *key* ASCII-encoded, or '' when it is absent."""
    return info.get(key, u'').encode('ascii', 'ignore')


# Ids already handed out, so two papers can never share a dictionary key.
used_ids = set()
for f in files:
    # Read the metadata while the file is still open, then close it
    # instead of leaking one handle per paper.
    with open(os.path.join(papers_dir, f), 'rb') as pdf_file:
        reader = PdfFileReader(pdf_file)
        # documentInfo is None when the PDF has no /Info dictionary.
        info = dict(reader.documentInfo or {})
        title = _meta_field(info, '/Title')
        author = _meta_field(info, '/Author')
        keyword = _meta_field(info, '/Keywords')

    # Draw a random id, retrying on the (unlikely) collision that would
    # otherwise silently overwrite an earlier paper in outdict.
    paper_id = random.randint(0, 999999999)
    while paper_id in used_ids:
        paper_id = random.randint(0, 999999999)
    used_ids.add(paper_id)

    all_papers.append(Paper(id_=paper_id, title=title, authors=author,
                            filename=f, keywords=keyword))

outdict = {}
for p in all_papers:
    outdict[p.paper] = (p.paper, p.title, p.authors, p.filename, p.keywords)

# dump a dictionary indexed by paper id that points to an
# (id, title, authors, filename, keywords) tuple
with open("papers.p", "wb") as out_file:
    pickle.dump(outdict, out_file)