# -*- coding: utf-8 -*-
import urllib.request
import os
import shutil
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
national = "国内"
international = "国际"
def get_html_soup(url):  # fetch the page and return it as decoded, parsed HTML
    html = None
    try:
        response = urllib.request.urlopen(url, timeout=10)
        html = response.read().decode(encoding="utf-8", errors="ignore")
    except Exception as e:
        print(e, "please check your network connection")
        return None
    soup = BeautifulSoup(html, "lxml")
    return soup
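# Note: get_html_soup returns None on any network error, so every caller
# below checks for None before using the result.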
def page_url(url, page_num):  # build the URL for a given page of a paginated article
    if page_num == 1:
        return url
    index = url.rfind(".")
    return url[0:index] + "_" + str(page_num) + url[index:]
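# Illustrative example (hypothetical URL): with url = "http://example.com/c/index.htm",
# page_url(url, 1) returns the URL unchanged, while page_url(url, 3) returns
# "http://example.com/c/index_3.htm"; the page number is spliced in just
# before the final ".".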
def get_title_link(url, pattern):  # collect headline text and article links
    soup = get_html_soup(url)
    news_link = {}
    if soup is None:
        return news_link
    scroll_list = soup.find("div", attrs=pattern)
    if scroll_list is None:
        return news_link
    for link in scroll_list.find_all("a"):
        href = link.get("href", "")
        if len(link.get_text().strip()) > 0 and "http" in href:
            news_link[link.get_text()] = href
    return news_link
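# Usage sketch (URL and pattern taken from the configuration below):
# get_title_link("http://www.news.cn/world/", {"class": "partR domPC"})
# yields a dict mapping each headline string to its article URL.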
def get_news_body(url):  # scrape the article body text
    first = True
    content_text = []
    page_num = 1
    article_div = ""
    # Loop because some articles are split across several pages; the literal
    # "下一页" ("next page") appearing as a link marks that another page follows.
    while first or article_div.find("下一页</a>") != -1:
        soup = get_html_soup(page_url(url, page_num))
        if soup is None:
            return None
        # article_div = str(soup.find("div", attrs={"class": "article"}))
        article_div = str(soup.find("div", attrs={"class": "main"}))
        soup = BeautifulSoup(article_div, "lxml")
        for content in soup.find_all("p"):
            if len(content.get_text().strip()) > 0:
                content_text.append(" " + content.get_text().strip())
        page_num += 1
        first = False
    # If the expected <div> is missing, str(None) parses into a lone "None"
    # paragraph; treat that as a failed scrape.
    for x in content_text:
        if x == " None":
            return None
    return content_text
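# Illustrative example (hypothetical URL): get_news_body("http://www.news.cn/world/some-article.htm")
# would return a list of paragraph strings, or None if the page does not
# contain the expected <div class="main">.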
def clean_chinese_character(text):
    '''Replace special (mostly full-width Chinese) punctuation with '_',
    since Windows cannot use some of these characters in file paths.'''
    chars = ["/", "\"", "'", "·", "。", "?", "!", ",", "、", ";", ":", "‘", "’", "“", "”", "(", ")", "…", "–", ".", "《", "》"]
    new_text = ""
    for ch in text:
        if ch not in chars:
            new_text += ch
        else:
            new_text += "_"
    return new_text
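# Example: clean_chinese_character("中美关系《解读》") returns "中美关系_解读_",
# since "《" and "》" are in the replacement list.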
def create_docx(news_type, title, content):
    '''Write the news content to a Word file using the python-docx library.'''
    document = Document()
    paragraph = document.add_paragraph(title)
    paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    # Bold is a run-level property; setting it on the paragraph has no effect.
    paragraph.runs[0].bold = True
    for x in content:
        paragraph = document.add_paragraph(x)
        style = paragraph.style
        font = style.font
        font.size = Pt(15)
        font.name = "consolas"
    name = news_type + "-" + clean_chinese_character(title) + ".docx"
    document.save(os.path.join(news_type, name))
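# Example: create_docx("国际", "《标题》", paras) saves the file as
# 国际/国际-_标题_.docx; the target directory must already exist (it is
# created in the setup section below).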
########################################################################
national_news = "http://www.news.cn/politics/"
national_news_pattern = {"id": "hideData0"}
international_news = "http://www.news.cn/world/"
international_news_pattern = {"class": "partR domPC"}
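# The attrs patterns above select the headline-list <div> on each index page;
# they depend on news.cn's markup at the time of writing and may need
# updating if the site layout changes.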
# remove the old output directories
print("deleting old dir")
if os.path.exists(international):
    shutil.rmtree(international)
if os.path.exists(national):
    shutil.rmtree(national)
# create fresh output directories
print("creating dir: ", international)
os.mkdir(international)
print("creating dir: ", national)
os.mkdir(national)
# fetch the news titles and links
international_news_list = get_title_link(international_news, international_news_pattern)
print("\ngetting international news content")
# fetch each article body and write it to a file
for x in international_news_list:
    paras = get_news_body(international_news_list[x])
    if paras is not None and len(paras) > 0:
        print("writing:", clean_chinese_character(x), international_news_list[x])
        create_docx(international, x, paras)
national_news_list = get_title_link(national_news, national_news_pattern)
print("\ngetting national news content")
for x in national_news_list:
    paras = get_news_body(national_news_list[x])
    if paras is not None and len(paras) > 0:
        print("writing:", clean_chinese_character(x), national_news_list[x])
        create_docx(national, x, paras)
print("All done, have a nice day")