-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathpaper.py
358 lines (309 loc) · 13.8 KB
/
paper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
import asyncio
import csv
import sqlite3
from collections import defaultdict
from dataclasses import dataclass, fields
from datetime import datetime, timedelta, UTC
from pathlib import Path
from rich.console import Console
from typing_extensions import Iterable
from async_translator import async_translate
from categories import parse_categories
@dataclass
class Paper:
first_submitted_date: datetime
title: str
categories: list
url: str
authors: str
abstract: str
comments: str
title_translated: str | None = None
abstract_translated: str | None = None
first_announced_date: datetime | None = None
@classmethod
def from_row(cls, row: sqlite3.Row):
return cls(
first_submitted_date=datetime.strptime(row["first_submitted_date"], "%Y-%m-%d"),
title=row["title"],
categories=row["categories"].split(","),
url=row["url"],
authors=row["authors"],
abstract=row["abstract"],
comments=row["comments"],
title_translated=row["title_translated"],
abstract_translated=row["abstract_translated"],
first_announced_date=datetime.strptime(row["first_announced_date"], "%Y-%m-%d"),
)
@property
def papers_cool_url(self):
return self.url.replace("https://arxiv.org/abs", "https://papers.cool/arxiv")
@property
def pdf_url(self):
return self.url.replace("https://arxiv.org/abs", "https://arxiv.org/pdf")
def to_markdown(self):
categories = ",".join(parse_categories(self.categories))
abstract = (
f"- **摘要**: {self.abstract_translated}"
if self.abstract_translated
else f"- **Abstract**: {self.abstract}"
)
return f"""### {self.title}
[[arxiv]({self.url})] [[cool]({self.papers_cool_url})] [[pdf]({self.pdf_url})]
> **Authors**: {self.authors}
> **First submission**: {self.first_submitted_date.strftime("%Y-%m-%d")}
> **First announcement**: {self.first_announced_date.strftime("%Y-%m-%d")}
> **comment**: {self.comments}
- **标题**: {self.title_translated}
- **领域**: {categories}
{abstract}
"""
async def translate(self, langto="zh-CN"):
self.title_translated = await async_translate(self.title, langto=langto)
self.abstract_translated = await async_translate(self.abstract, langto=langto)
@dataclass
class PaperRecord:
paper: Paper
comment: str
def to_markdown(self):
if self.comment != "-":
return f"""- [{self.paper.title}]({self.paper.url})
- **标题**: {self.paper.title_translated}
- **Filtered Reason**: {self.comment}
"""
else:
return self.paper.to_markdown()
class PaperDatabase:
def __init__(self, db_path="papers.db"):
self.conn = sqlite3.connect(db_path)
self.conn.row_factory = self._row_factory
self._create_table()
@staticmethod
def _row_factory(cursor, row):
row = sqlite3.Row(cursor, row)
# all fields in Paper, plus `update_time`
if len(row.keys()) == len(fields(Paper)) + 1:
return Paper.from_row(row)
else:
return row
def _create_table(self):
with self.conn:
self.conn.execute(
"""
CREATE TABLE IF NOT EXISTS papers (
url TEXT PRIMARY KEY,
authors TEXT NOT NULL,
title_translated TEXT,
first_submitted_date DATE NOT NULL,
first_announced_date DATE NOT NULL,
update_time DATETIME NOT NULL,
categories TEXT NOT NULL,
title TEXT NOT NULL,
comments TEXT NOT NULL,
abstract TEXT NOT NULL,
abstract_translated TEXT
)
"""
)
def add_papers(self, papers: Iterable[Paper]):
assert all([paper.first_announced_date is not None for paper in papers])
with self.conn:
data_to_insert = [
(
paper.url,
paper.authors,
paper.abstract,
paper.title,
",".join(paper.categories),
paper.first_submitted_date.strftime("%Y-%m-%d"),
paper.first_announced_date.strftime("%Y-%m-%d"),
paper.title_translated,
paper.abstract_translated,
paper.comments,
datetime.now(UTC).replace(tzinfo=None)
)
for paper in papers
]
self.conn.executemany(
"""
INSERT OR REPLACE INTO papers
(url, authors, abstract, title, categories, first_submitted_date, first_announced_date, title_translated, abstract_translated, comments, update_time)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
data_to_insert,
)
def count_new_papers(self, papers: Iterable[Paper]) -> int:
cnt = 0
for paper in papers:
with self.conn:
cursor = self.conn.execute(
"""
SELECT * FROM papers WHERE url = ?
""",
(paper.url,),
)
if cursor.fetchone():
break
else:
cnt += 1
return cnt
def fetch_papers_on_date(self, date: datetime) -> list[Paper]:
with self.conn:
cursor = self.conn.execute(
"""
SELECT * FROM papers WHERE first_announced_date = ?
""",
(date.strftime("%Y-%m-%d"),),
)
return cursor.fetchall()
def fetch_all(self) -> list[Paper]:
with self.conn:
cursor = self.conn.execute(
"""
SELECT * FROM papers ORDER BY url DESC
"""
)
return cursor.fetchall()
def newest_update_time(self) -> datetime:
"""
最新更新时间是“上一次爬取最新论文的时间”
由于数据库可能补充爬取过去的论文,所以先选最新论文,再从其中选最新的爬取时间
"""
with self.conn:
cursor = self.conn.execute(
"""
SELECT MAX(update_time) as max_updated_time
FROM papers
WHERE first_announced_date = (SELECT MAX(first_announced_date) FROM papers)
"""
)
time = cursor.fetchone()["max_updated_time"].split(".")[0]
return datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
async def translate_missing(self, langto="zh-CN"):
with self.conn:
cursor = self.conn.execute(
"SELECT url, title, abstract FROM papers WHERE title_translated IS NULL OR abstract_translated IS NULL"
)
papers = cursor.fetchall()
async def worker(url, title, abstract):
title_translated = await async_translate(title, langto=langto) if title else None
abstract_translated = await async_translate(abstract, langto=langto) if abstract else None
with self.conn:
self.conn.execute(
"UPDATE papers SET title_translated = ?, abstract_translated = ? WHERE url = ?",
(title_translated, abstract_translated, url),
)
await asyncio.gather(*[worker(url, title, abstract) for url, title, abstract in papers])
class PaperExporter:
def __init__(
self,
date_from: str,
date_until: str,
categories_blacklist: list[str] = [],
categories_whitelist: list[str] = ["cs.CV", "cs.AI", "cs.LG", "cs.CL", "cs.IR", "cs.MA"],
database_path="papers.db",
):
self.db = PaperDatabase(database_path)
self.date_from = datetime.strptime(date_from, "%Y-%m-%d")
self.date_until = datetime.strptime(date_until, "%Y-%m-%d")
self.date_range_days = (self.date_until - self.date_from).days + 1
self.categories_blacklist = set(categories_blacklist)
self.categories_whitelist = set(categories_whitelist)
self.console = Console()
def filter_papers(self, papers: list[Paper]) -> tuple[list[PaperRecord], list[PaperRecord]]:
filtered_paper_records = []
chosen_paper_records = []
for paper in papers:
categories = set(paper.categories)
if not (self.categories_whitelist & categories):
categories_str = ",".join(categories)
filtered_paper_records.append(PaperRecord(paper, f"none of {categories_str} in whitelist"))
elif black := self.categories_blacklist & categories:
black_str = ",".join(black)
filtered_paper_records.append(PaperRecord(paper, f"cat:{black_str} in blacklist"))
else:
chosen_paper_records.append(PaperRecord(paper, "-"))
return chosen_paper_records, filtered_paper_records
def to_markdown(self, output_dir="./output_llms", filename_format="%Y-%m-%d", metadata=None):
output_dir = Path(output_dir)
output_dir.mkdir(exist_ok=True, parents=True)
if metadata:
repo_url = metadata["repo_url"]
categories = ",".join(metadata["category_whitelist"])
optional_keywords = ", ".join(metadata["optional_keywords"])
preface_str = f"""
> 本文由 [{repo_url}]({repo_url}) 自动生成
>
> 领域白名单:{categories}
> 关键词: {optional_keywords}
""".lstrip()
else:
preface_str = ""
for i in range(self.date_range_days):
current = self.date_from + timedelta(days=i)
current_filename = current.strftime(filename_format)
with open(output_dir / f"{current_filename}.md", "w", encoding="utf-8") as file:
papers = self.db.fetch_papers_on_date(current)
chosen_records, filtered_records = self.filter_papers(papers)
papers_str = f"# 论文全览:{current_filename}\n\n共有{len(chosen_records)}篇相关领域论文, 另有{len(filtered_records)}篇其他\n\n"
chosen_dict = defaultdict(list)
for record in chosen_records:
chosen_dict[record.paper.categories[0]].append(record)
for category in sorted(chosen_dict.keys()):
category_en = parse_categories([category], lang="en")[0]
category_zh = parse_categories([category], lang="zh-CN")[0]
papers_str += f"## {category_zh}({category}:{category_en})\n\n"
for record in chosen_dict[category]:
papers_str += record.to_markdown()
papers_str += "## 其他论文\n\n"
for record in filtered_records:
papers_str += record.to_markdown()
file.write(preface_str + papers_str)
self.console.log(
f"[bold green]Output {current_filename}.md completed. {len(chosen_records)} papers chosen, {len(filtered_records)} papers filtered"
)
def to_csv(self, output_dir="./output_llms", filename_format="%Y-%m-%d", header=True, csv_config={}):
output_dir = Path(output_dir)
output_dir.mkdir(exist_ok=True, parents=True)
csv_table = {
"Title": lambda record: record.paper.title,
"Interest": lambda record: ("chosen" if record.comment == "-" else "filtered"),
"Title Translated": lambda record: (
record.paper.title_translated if record.paper.title_translated else "-"
),
"Categories": lambda record: ",".join(record.paper.categories),
"Authors": lambda record: record.paper.authors,
"URL": lambda record: record.paper.url,
"PapersCool": lambda record: record.paper.url.replace("https://arxiv.org/abs", "https://papers.cool/arxiv"),
"First Submitted Date": lambda record: record.paper.first_submitted_date.strftime("%Y-%m-%d"),
"First Announced Date": lambda record: record.paper.first_announced_date.strftime("%Y-%m-%d"),
"Abstract": lambda record: record.paper.abstract,
"Abstract Translated": lambda record: (
record.paper.abstract_translated if record.paper.abstract_translated else "-"
),
"Comments": lambda record: record.paper.comments,
"Note": lambda record: record.comment,
}
headers = list(csv_table.keys())
for i in range(self.date_range_days):
current = self.date_from + timedelta(days=i)
current_filename = current.strftime(filename_format)
with open(output_dir / f"{current_filename}.csv", "w", encoding="utf-8") as file:
if "lineterminator" not in csv_config:
csv_config["lineterminator"] = "\n"
writer = csv.writer(file, **csv_config)
if header:
writer.writerow(headers)
papers = self.db.fetch_papers_on_date(current)
chosen_records, filtered_records = self.filter_papers(papers)
for record in chosen_records + filtered_records:
writer.writerow([fn(record) for fn in csv_table.values()])
self.console.log(
f"[bold green]Output {current_filename}.csv completed. {len(chosen_records)} papers chosen, {len(filtered_records)} papers filtered"
)
if __name__ == "__main__":
from datetime import date, timedelta
today = date.today()
exporter = PaperExporter(today.strftime("%Y-%m-%d"), today.strftime("%Y-%m-%d"))
exporter.to_markdown()
exporter.to_csv(csv_config=dict(delimiter="\t", header=False))