# cc-spider.py
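# A crawler that collects a 西西河 (talkcc.org) user's posts into a single Word
# document, embedding inline images as it goes.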
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches
from datetime import datetime
import os
import time
# Configuration
DEBUG = True
USER = '葡萄'  # username whose posts are collected
SLEEP_TIME = 5  # delay between article fetches, in seconds
start_page = 874  # first listing page to crawl
end_page = 1007  # last listing page to crawl
FIXED_URL = 'https://talkcc.org/'  # site base URL
FILE_NAME = f'./西西河-{USER}-专辑.docx'  # output Word document
LOG_FILE = './cc-spider.log'  # log file
# Initialize the Word document
if os.path.exists(FILE_NAME):
    # The file exists: open it and append to it
    doc = Document(FILE_NAME)
    doc.add_paragraph('')
    doc.save(FILE_NAME)
else:
    # The file does not exist: create a new one
    doc = Document()
    doc.add_heading(f"{USER}文章集合", 0)
    doc.save(FILE_NAME)
def log_message(message):
    current_time = datetime.now()
    formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S ")
    if DEBUG:
        print(message)  # echo to the terminal
    with open(LOG_FILE, 'a', encoding='utf-8') as file:
        # Append the message to the log file (utf-8 so Chinese text logs cleanly)
        file.write(formatted_time + message + '\n')
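# NOTE (assumption): the hard-coded 'cc' cookie below looks like a saved login
# session for talkcc.org; if requests start failing or return logged-out pages,
# replace it with a fresh value copied from your own browser session.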
def http_req(base_url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    # Set the session cookie; the stored value is URL-encoded, so decode it first
cc_value = 'DcuxFcMgDEXRhVRISF%2BCaTgYmU1SZA%2BXLtNmm3iP0LzqPoSo9uf1fq7P73v3YswnMkaKzTFCQ5srN6CMlrN1CXGDSw0ycaftd%2FdmhFIbFSGmwsR9yEpkRRxTXAXuinPVwzwRNlfnPw%3D%3D'
    cookies = {
        'cc': requests.utils.unquote(cc_value)
    }
    # Send the request; the caller checks the status code and parses the body
    response = requests.get(base_url, headers=headers, cookies=cookies)
    return response
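# Parse one listing page of the user's post tree. Each post appears as an
# <a href="/article/..."> link with a <small> timestamp nearby; the two lists
# are assumed to line up one-to-one, since they are later paired by index.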
def get_article_links(page_rsp):
    soup = BeautifulSoup(page_rsp.text, 'html.parser')
    div_tree = soup.find('div', class_='s_Tree')  # format: <div class="s_Tree">
    all_links = div_tree.find_all('a', href=True)  # format: <a href="/article/4487276">xxxx</a>
    all_dates = div_tree.find_all('small')  # format: <small>2020-03-08 12:43:34</small>
    articles_links = [s for s in all_links if 'article' in str(s)]
    return all_dates[::-1], articles_links[::-1]  # reverse both lists so articles come out in chronological order
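# Convert one article page into Word content. The article body lives in
# <div class="s_Sec">; the second <b> tag is taken as the title (the first
# appears to be page chrome), and each <p> is written out as plain text,
# a "text (href)" line for links, or an embedded image.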
def handle_article(url, date, article_response):
    article_soup = BeautifulSoup(article_response.text, 'html.parser')
    # Extract the article content; adjust this to match the actual page structure
    sections = article_soup.find(lambda tag: tag.name == 'div' and tag.get('class') == ['s_Sec'])  # exact match on <div class="s_Sec">
    if sections is None:
        log_message(f'ERROR: failed to read article {url}')
        return
    title = sections.find_all('b')
    p_tags = sections.find_all('p')
    # Add the article title and metadata to the Word document
    valid_idx = 1
    doc.add_heading(title[valid_idx].get_text(), level=3)
    doc.add_paragraph(f'原文:{url}')
    doc.add_paragraph(date)
    for p_tag in p_tags:
        if not p_tag.findChildren():  # no child tags, only a string
            doc.add_paragraph(p_tag.get_text())
        elif p_tag.find('a'):  # link
            h_ref = p_tag.a['href']
            doc.add_paragraph(f'{p_tag.a.get_text()} ({h_ref})')
        elif p_tag.find('img'):  # picture
            img_element = p_tag.find('img')
            src_value = img_element['src']  # e.g. <img ... src="/picture/0,2405/2256_06203932.png"/>
            img_path = download_image(FIXED_URL + src_value, './' + src_value)
            if img_path is None:
                doc.add_paragraph('missing img src from ' + FIXED_URL + src_value)
            else:
                doc.add_picture(img_path, width=Inches(5.5))
        else:
            doc.add_paragraph(p_tag.get_text())
            log_message(f"INFO: handle special p_tag = {p_tag}")
    # Optionally add some separators or formatting between articles
    doc.add_paragraph('')
    doc.add_paragraph('')
    #doc.add_page_break()
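# Download an image to a local path that mirrors the site's /picture/... URL
# layout, creating directories as needed. Returns the local path, or None on
# failure so the caller can fall back to a placeholder paragraph.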
def download_image(url, file_path):
    # Normalize the path separators for Windows and Linux
    file_path = os.path.join(*file_path.split('/'))
    # Create the target directory if it does not exist
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    response = requests.get(url)
    if response.status_code == 200:
        with open(file_path, 'wb') as file:
            file.write(response.content)
        log_message(f"  Image downloaded and saved as {file_path}")
        return file_path
    else:
        log_message(f"ERROR: Failed to download image from {url}")
        return None
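# Fetch a single article page and, on success, append it to the document.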
def print_article(date, url):
    article_response = http_req(url)
    if article_response.status_code == 200:
        handle_article(url, date, article_response)
    else:
        log_message(f'ERROR: failed to get article from {url}')
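# Main crawl loop: walk the listing pages from start_page to end_page, process
# every article link on each page, and save the document after each page so an
# interrupted run loses at most one page of work.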
def main():
    HOME_PATH = f'{FIXED_URL}user/{USER}/所有帖/'  # the user's "all posts" listing
    log_message('-----------------------------------------------------------------------------------')
    log_message(f'Ready to collect {USER}\'s articles from {HOME_PATH}, page {start_page}~{end_page}')
    start_time = time.time()  # record the start time
    total_articles = 0  # counter for total articles
    for page in range(start_page, end_page + 1):
        log_message(f'Start to process Page-{page}')
        page_url = f'{HOME_PATH}{page}'
        page_rsp = http_req(page_url)
        if page_rsp.status_code != 200:
            log_message(f'ERROR: failed to get Page-{page}, status_code: {page_rsp.status_code}')
            continue
        # Page fetched successfully
        dates, articles_links = get_article_links(page_rsp)
        # Process each article link
        for i, link in enumerate(articles_links):
            article_url = link['href']
            if article_url.startswith('/article/'):
                log_message(f'  Start to process {article_url}')
                url = f'{FIXED_URL}{article_url}'
                print_article(dates[i], url)
                total_articles += 1
                time.sleep(SLEEP_TIME)
        doc.save(FILE_NAME)  # save after every page
    # Final save
    doc.save(FILE_NAME)
    end_time = time.time()
    elapsed_time = end_time - start_time
    hours = int(elapsed_time // 3600)
    minutes = int((elapsed_time % 3600) // 60)
    seconds = int(elapsed_time % 60)
    log_message(f'Done! Total articles downloaded: {total_articles}')
    log_message(f'Elapsed time: {hours:02d}:{minutes:02d}:{seconds:02d}')
if __name__ == "__main__":
    if 1:
        main()
    else:  # test code: process a single article
        print_article('', 'https://talkcc.org/article/4108278')