-
Notifications
You must be signed in to change notification settings - Fork 1
/
arxivposts.py
112 lines (94 loc) · 3.53 KB
/
arxivposts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""
This script writes a html blog post about a specific paper, written for isimba.dk
"""
import argparse
import re
from requests_html import HTMLSession
session = HTMLSession()
parser = argparse.ArgumentParser()
parser.add_argument("arxivid")
def main(arxivid):
    """Fetch the arXiv abstract page for *arxivid* and write an HTML blog
    post snippet to ``./blogposts/<arxivid>.txt`` (also printed to stdout).

    Parameters
    ----------
    arxivid : str
        arXiv identifier, e.g. "2101.01234".
    """
    arxivlink = "https://arxiv.org/abs/" + arxivid
    r = session.get(arxivlink)
    # arxiv.org pages start with an XML encoding declaration, which
    # requests_html cannot parse. Strip it before using the parsed tree.
    r.html.html = re.sub(r"^<\?xml.*?\?>", "", r.html.html)
    # CSS selectors below were copied from "Inspect element" on the
    # arXiv abstract page; they may break if arXiv changes its markup.
    (titles,) = r.html.find("#abs > h1")
    title = titles.text.replace("Title:", "")
    # Index the metadata table rows ("Comments:", "DOI:", ...) by label.
    datadict = {}
    for tr in r.html.find("#abs > div.metatable > table > tr"):
        key, data = tr.find("td")
        datadict[key.text] = data
    try:
        pub = datadict["Comments:"].text
    except KeyError:
        pub = ""
    try:
        # Prefer the published-version DOI link when one exists.
        doi = "https://dx.doi.org/" + datadict["DOI:"].text
    except KeyError:
        # Not published yet: link to the arXiv page instead.
        doi = arxivlink
    (abstracts,) = r.html.find("#abs > blockquote")
    abstract = abstracts.text.replace("Abstract: ", "")

    # Convert in-line TeX math for MathJax: each $...$ pair becomes
    # \( ... \). The very first opening delimiter also carries the
    # [mathjax] WordPress shortcode so the MathJax script is loaded.
    # Adapted from:
    # https://stackoverflow.com/questions/46705546/python-replace-every-nth-occurrence-of-string
    def nth_repl_all(s, sub, repl, nth):
        """Replace every *nth* occurrence of *sub* in *s* with *repl*."""
        find = s.find(sub)
        i = 1
        while find != -1:
            if i == nth:
                s = s[:find] + repl + s[find + len(sub):]
                i = 0
                # Resume scanning just past the inserted replacement.
                find = s.find(sub, find + len(repl))
            else:
                # Resume scanning just past the (kept) match; the original
                # "+ 1" here skipped a character and missed adjacent "$$".
                find = s.find(sub, find + len(sub))
            i += 1
        return s

    abstract = nth_repl_all(abstract, "$", r"\)", 2)
    abstract = abstract.replace("$", r"\([mathjax]", 1)
    abstract = abstract.replace("$", r"\(")

    tpl = """<p style="text-align: justify;">{authors}</p>
<p style="text-align: left;">
<strong><a class="link" href="{doi}" target="_blank" rel="ContributionToJournal noopener">{title}</a>
</strong> <a class="link" href="{arxiv}" target="_blank" rel="ContributionToJournal noopener">See arXiv version</a>
<em>{pub}</em>
</p>
<p style="text-align: justify;"><!--more--></p>
<p></p>
<h3 style="text-align: justify;"><strong>Abstract</strong></h3>
<p style="text-align: justify;">{abstract}</p>
"""
    # Typeset authors as links back to their arXiv author pages.
    (authorss,) = r.html.find("#abs > div.authors")
    authortpl = '<a href="{href}">{name}</a>'
    authorlist = []
    for author in authorss.find("a"):
        href = "https://arxiv.org" + author.attrs["href"]
        authorlist.append(
            authortpl.format(
                href="".join(href.split()), name=author.text.replace(",", "")
            )
        )
    blogtext = tpl.format(
        authors=", ".join(authorlist),
        doi=doi,
        title=title,
        pub=pub,
        arxiv=arxivlink,
        abstract=abstract,
    )
    print(blogtext)
    # Context manager guarantees the file is closed even on write errors.
    with open(f"./blogposts/{arxivid}.txt", "w") as bp:
        bp.write(blogtext)
if __name__ == "__main__":
main(**vars(parser.parse_args()))