-
Notifications
You must be signed in to change notification settings - Fork 3
/
main.py
107 lines (86 loc) · 4.53 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# encoding: utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
from bs4 import BeautifulSoup as bs
import urllib.request
import json
from transformers import pipeline as pipe
from github_issue import make_github_issue
from config import NEW_SUB_URL, KEYWORD_LIST, PWC_URL
class Model:
    """Small wrapper that owns a ``transformers`` summarization pipeline."""

    def __init__(self):
        # The checkpoint name is fixed here; the constructor takes no arguments.
        summarizer = pipe("summarization", model="facebook/bart-base")
        self.summarizer = summarizer

    def summarize(self, text: str):
        """Summarize *text* with the wrapped pipeline.

        The text is forwarded to ``self.summarizer`` together with
        ``max_length=100`` and the pipeline's result is returned unchanged.
        """
        result = self.summarizer(text, max_length=100)
        return result
def _parse_entry(dt, dd):
    """Extract one listing entry from its ``<dt>``/``<dd>`` pair.

    Returns ``(paper_number, paper, abstract)`` where *paper* is a dict with
    the keys ``main_page``, ``pdf``, ``title``, ``authors`` and ``subjects``,
    and *abstract* is the abstract text with newlines replaced by spaces, or
    ``None`` when the entry has no ``<p class="mathjax">`` element.
    """
    arxiv_base = "https://arxiv.org/abs/"
    # The identifier is the third space-separated token of the <dt> text,
    # after dropping everything up to its last ':'.
    paper_number = dt.text.strip().split(" ")[2].split(":")[-1]
    paper = {
        'main_page': arxiv_base + paper_number,
        'pdf': arxiv_base.replace('abs', 'pdf') + paper_number,
        'title': dd.find("div", {"class": "list-title mathjax"}).text.replace("Title: ", "").strip(),
        'authors': dd.find("div", {"class": "list-authors"}).text.replace("Authors:\n", "").replace(
            "\n", "").strip(),
        'subjects': dd.find("div", {"class": "list-subjects"}).text.replace("Subjects: ", "").strip(),
    }
    abstract_tag = dd.find("p", {"class": "mathjax"})
    abstract = None if abstract_tag is None else abstract_tag.text.replace("\n", " ").strip()
    return paper_number, paper, abstract


def _fetch_repo_url(paper_number):
    """Look up a paper's official code repository via ``PWC_URL``.

    Returns the string stored under ``official.url`` in the JSON response, or
    ``None`` when the request fails, the status is not 200, the body is not
    valid JSON, or no repository is listed. The lookup is best-effort:
    failures are printed and never abort the run.
    """
    lookup_url = PWC_URL + paper_number
    request = urllib.request.Request(lookup_url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(request) as response:
            if response.getcode() != 200:
                return None
            payload = json.load(response)
    except urllib.error.HTTPError:
        print(f'URL not found: {lookup_url}')
        return None
    except (urllib.error.URLError, ValueError) as err:
        # URLError: network-level failure; ValueError: body was not valid JSON.
        print(f'Could not query {lookup_url}: {err}')
        return None
    try:
        repo_url = payload.get("official", {}).get("url", "")
    except AttributeError:
        # Raised when the payload or its "official" entry is not a mapping.
        print(f'Response does not contain the requested attribute. Response: {payload}')
        return None
    return repo_url or None


def _render_paper(paper):
    """Render one paper dict as an HTML fragment for the issue body.

    Scraped text is HTML-escaped before interpolation. The "Repo" line is
    emitted only when the paper has a ``repo_url`` entry.
    """
    import html  # local import: the file's import header is left untouched

    main_page = html.escape(paper['main_page'])
    fields = [
        f"<strong>Authors:</strong> {html.escape(paper['authors'], quote=False)}",
        f"<strong>Arxiv:</strong> <a href='{main_page}'>{main_page}</a>",
        f"<strong>TLDR:</strong> {html.escape(paper['tldr'], quote=False)}",
    ]
    repo_url = paper.get('repo_url')
    if repo_url:
        repo = html.escape(repo_url)
        fields.append(f"<strong>Repo:</strong> <a href='{repo}'>{repo}</a>")
    title = html.escape(paper['title'], quote=False)
    return f"<h3>{title}</h3>" + "<br>".join(fields)


def main():
    """Build the keyword report for the new-submissions page and file it.

    Downloads ``NEW_SUB_URL``, parses every ``<dt>``/``<dd>`` entry of the
    first ``<dl>`` inside ``<div id="content">``, and keeps the papers whose
    abstract contains (case-insensitively) at least one entry of
    ``KEYWORD_LIST``. Matching papers are summarized, enriched with a
    repository URL when one is found, and grouped per keyword into an HTML
    report. If the report is non-empty and the ``GITHUB`` environment
    variable (``"username,token"``) is set, the report is filed through
    ``make_github_issue``.

    Raises:
        ValueError: if the listing has a different number of ``<dt>`` and
            ``<dd>`` elements.
    """
    model = Model()

    with urllib.request.urlopen(NEW_SUB_URL) as page:
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # one is installed; left as-is to avoid changing how the page parses.
        soup = bs(page)

    content = soup.body.find("div", {'id': 'content'})
    issue_title = content.find("h3").text
    dt_list = content.dl.find_all("dt")
    dd_list = content.dl.find_all("dd")
    if len(dt_list) != len(dd_list):
        # Explicit check instead of assert, which is stripped under -O.
        raise ValueError(
            f'Listing is malformed: {len(dt_list)} <dt> entries but {len(dd_list)} <dd> entries')

    keyword_list = sorted(KEYWORD_LIST)
    keyword_dict = {key: [] for key in keyword_list}

    for dt, dd in zip(dt_list, dd_list):
        paper_number, paper, abstract = _parse_entry(dt, dd)
        if abstract is None:
            # Without an abstract there is nothing to match or summarize.
            print(f'No abstract found for {paper_number}; skipping.')
            continue

        abstract_lower = abstract.lower()
        matched = [keyword for keyword in keyword_list if keyword.lower() in abstract_lower]
        if not matched:
            # Papers matching no keyword never reach the report, so skip the
            # expensive summarization and the repository lookup for them.
            continue

        paper['tldr'] = model.summarize(abstract)[0]["summary_text"]
        repo_url = _fetch_repo_url(paper_number)
        if repo_url:
            paper['repo_url'] = repo_url
        for keyword in matched:
            keyword_dict[keyword].append(paper)

    sections = []
    for keyword in keyword_list:
        papers = keyword_dict[keyword]
        if not papers:
            continue
        entries = "".join(_render_paper(paper) + '<br>' for paper in papers)
        sections.append(f"<h2>Keyword: {keyword}</h2><details>{entries}</details>")
    full_report = "".join(sections)

    if not full_report:
        # Only make an issue if at least one keyword has a new paper.
        return

    # Authentication for user filing issue (must have read/write access to
    # the repository the issue is added to).
    if 'GITHUB' not in os.environ:
        print('GITHUB environment variable is not set; no issue was created.')
        return
    username, token = os.environ['GITHUB'].split(',')
    make_github_issue(title=issue_title, body=full_report, assignee=username, TOKEN=token,
                      labels=keyword_list)
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()