-
Notifications
You must be signed in to change notification settings - Fork 50
/
nsfc_downloader.py
96 lines (74 loc) · 3.61 KB
/
nsfc_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import re
import json
import argparse
import img2pdf
import requests
def arg_parser():
    """Parse the downloader's command-line options from ``sys.argv``.

    Returns:
        argparse.Namespace: carries ``ratify`` (required), ``tmp_path``
        (defaults to ``./tmp``) and ``out_path`` (defaults to ``./output``).
    """
    cli = argparse.ArgumentParser(
        description='A tool to Download PDF format conclusion from http://output.nsfc.gov.cn/',
    )

    # (flags, keyword settings) for every supported option, in help order.
    option_specs = (
        (
            ('--ratify', '-r'),
            {'help': 'The ratify id of the project you want to download', 'required': True},
        ),
        (
            ('--tmp_path', '-t'),
            {'default': './tmp', 'help': 'The path you want to save tmp file'},
        ),
        (
            ('--out_path', '-o'),
            {'default': './output', 'help': 'The path you want to save output PDF file'},
        ),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
# Single-pass translation table for clean_filename:
#   ':', '/' and NUL become '_';
#   newline, backslash, '*', '>', '<', '?', '"', '|' and tab are dropped.
_FILENAME_TRANSLATION = str.maketrans(':/\x00', '___', '\n\\*><?"|\t')


def clean_filename(string: str) -> str:
    """Return *string* with characters unsuitable for a file name neutralised.

    ``:``, ``/`` and NUL are replaced by ``_``; newline, backslash, ``*``,
    ``>``, ``<``, ``?``, ``"``, ``|`` and tab are removed; finally leading and
    trailing whitespace is stripped.

    Args:
        string: The proposed file name (not a full path - ``/`` is replaced).

    Returns:
        The sanitised file name.
    """
    # A translation table avoids the regex whose non-raw pattern contained the
    # invalid escape sequence ``\*`` (a SyntaxWarning on recent Python versions).
    return string.translate(_FILENAME_TRANSLATION).strip()
class NsfcDownloader:
    """Download a project's conclusion report pages and merge them into a PDF.

    Project metadata and the individual page images are cached under
    ``tmp_path`` so that an interrupted run can resume; the merged PDF is
    written to ``out_path``.
    """

    def __init__(self, tmp_path, out_path):
        """Remember the working directories.

        Args:
            tmp_path: Directory for the cached project JSON and page PNG files.
            out_path: Directory the final PDF is written to.
        """
        self.tmp_path = tmp_path
        self.out_path = out_path

    def _load_project_info(self, ratify):
        """Return the project-info JSON for *ratify*, from cache or the server."""
        project_info_file = os.path.join(self.tmp_path, '{}.json'.format(ratify))
        if os.path.exists(project_info_file):
            with open(project_info_file, 'r', encoding='utf-8') as info_f:
                return json.load(info_f)

        r = requests.get(
            'http://output.nsfc.gov.cn/baseQuery/data/conclusionProjectInfo/{}'.format(ratify),
            timeout=10,  # same limit as the page requests; never hang forever
        )
        r.raise_for_status()
        rj = r.json()  # project_info

        # Only cache successful answers: a cached failure would otherwise make
        # every later run fail without asking the server again.
        if rj.get('code') == 200:
            print('保存项目信息至 {}'.format(project_info_file))
            with open(project_info_file, 'w', encoding='utf-8') as info_f:
                json.dump(rj, info_f, ensure_ascii=False, sort_keys=True)
        return rj

    def _fetch_pages(self, ratify):
        """Collect all page images of *ratify*, downloading the missing ones.

        Pages are numbered from 1; the first page the server answers with
        HTTP 404 marks the end of the report.

        Returns:
            A ``(files, contents)`` tuple: the cached PNG paths and their bytes,
            both in page order.
        """
        ratify_prefix = ratify[:2]
        img_files_list = []
        img_bytes_list = []
        page = 1
        while True:
            tmp_file = os.path.join(self.tmp_path, '{}_{}.png'.format(ratify, page))
            if os.path.exists(tmp_file):
                with open(tmp_file, 'rb') as tmp_f:
                    content = tmp_f.read()
            else:
                req_url = "http://output.nsfc.gov.cn/report/{}/{}_{}.png".format(ratify_prefix, ratify, page)
                print('正在请求页面 {}'.format(req_url))
                r = requests.get(req_url, timeout=10)
                if r.status_code == 404:
                    break
                # Any other error status is not an image: fail loudly instead
                # of caching the error body as a PNG and requesting forever.
                r.raise_for_status()
                content = r.content
                with open(tmp_file, 'wb') as tmp_f:
                    tmp_f.write(content)

            img_files_list.append(tmp_file)
            img_bytes_list.append(content)
            page += 1
        return img_files_list, img_bytes_list

    def download(self, ratify):
        """Download the conclusion report of project *ratify* as a single PDF.

        Nothing is downloaded when the target PDF already exists. The cached
        page images are removed once the PDF has been written.

        Args:
            ratify: The ratify id of the project (a string; its first two
                characters select the report directory on the server).

        Raises:
            SystemExit: If the project info does not report ``code == 200``.
            requests.HTTPError: If the server answers with an error status
                (other than the 404 that ends the page sequence).
        """
        # The directories may not exist yet on a first run.
        os.makedirs(self.tmp_path, exist_ok=True)
        os.makedirs(self.out_path, exist_ok=True)

        print('开始获取项目信息,项目编号: {}'.format(ratify))
        rj = self._load_project_info(ratify)
        if rj.get('code') != 200:
            # SystemExit is what the site-provided exit() raises, but it does
            # not depend on the ``site`` module being loaded.
            raise SystemExit(
                '项目可能不存在,请重新检查网页 http://output.nsfc.gov.cn/conclusionProject/{} 显示'.format(ratify)
            )

        project_name = rj['data'].get('projectName')
        out_pdf_file = os.path.join(self.out_path, clean_filename('{} {}.pdf'.format(ratify, project_name)))

        if os.path.exists(out_pdf_file):
            print('PDF已存在 ,请打开 {}'.format(out_pdf_file))
            return

        print('开始下载 {} {}'.format(ratify, project_name))
        img_files_list, img_bytes_list = self._fetch_pages(ratify)
        print('下载完成 {} {}'.format(ratify, project_name))

        if img_bytes_list:
            print('正在组合PDF {}'.format(out_pdf_file))
            pdf = img2pdf.convert(img_bytes_list)
            with open(out_pdf_file, "wb") as file_:
                file_.write(pdf)

            print('移除临时文件')
            for f in img_files_list:
                os.remove(f)
if __name__ == '__main__':
    # Script entry point: read the CLI options, then fetch the requested
    # project's report using the chosen temp/output directories.
    cli_args = arg_parser()
    NsfcDownloader(cli_args.tmp_path, cli_args.out_path).download(cli_args.ratify)