blog.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'a test module'
__author__ = 'Dou Ba'
import os
import shutil
from urllib import request
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
def get_html_soup(url):  # fetch a page and return it as a decoded BeautifulSoup object
    html = None
    try:
        response = request.urlopen(url, timeout=10)
        # the site serves GB2312-encoded pages, so decode with that codec and drop bad bytes
        html = response.read().decode(encoding="gb2312", errors="ignore")
    except Exception as e:
        print(e, "please check your network situation")
        return None
    soup = BeautifulSoup(html, "lxml")
    return soup
def page_url(url, page_num):  # build the URL for a given page of a multi-page article
    if page_num == 1:
        return url
    index = url.rfind(".")
    return url[0:index] + "_" + str(page_num) + url[index:]
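# For illustration (hypothetical URL): page_url("http://news.mydrivers.com/1/123/123456.htm", 2)
# would return "http://news.mydrivers.com/1/123/123456_2.htm", i.e. a "_<page>" suffix before the extension.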
def get_title_link(url, pattern):  # collect news titles and their article links from the index page
    soup = get_html_soup(url)
    news_link = {}
    if soup is None:
        return news_link
    # narrow down to the headline list: the <h2> entries inside the <div> matching the pattern
    scroll_list1 = BeautifulSoup(str(soup.find("div", attrs=pattern)), "lxml")
    scroll_list = BeautifulSoup(str(scroll_list1.find_all("h2")), "lxml")
    for link in scroll_list.find_all("a"):
        title = link.get_text().strip()
        if len(title) > 0 and link.get("href").find("http") != -1:
            news_link[title] = link.get("href")
    return news_link
def get_news_body(url):  # grab the body paragraphs of a news article
    first = True
    content_text = []
    page_num = 1
    article_div = ""
    # loop to handle articles that are split across several pages
    # ("下一页" is the site's "next page" link text)
    while first or article_div.find("下一页</a>") != -1:
        soup = get_html_soup(page_url(url, page_num))
        if soup is None:
            return None
        #article_div = str(soup.find("div", attrs = {"class": "article"}))
        article_div = str(soup.find("div", attrs={"class": "news_info"}))
        soup = BeautifulSoup(article_div, "lxml")
        for content in soup.find_all("p"):
            if len(content.get_text().strip()) > 0:
                content_text.append(" " + content.get_text().strip())
        page_num += 1
        first = False
    # discard the article if a literal "None" slipped into the extracted text
    for x in content_text:
        if x == " None":
            return None
    return content_text
def clean_chinese_character(text):
    '''Replace Chinese punctuation and other special characters with '_',
    since Windows cannot use some of them in file names when saving.'''
    chars = ["/", "\"", "'", "·", "。", "?", "!", ",", "、", ";", ":", "‘", "’", "“", "”", "(", ")", "…", "–", ".", "《", "》"]
    new_text = ""
    for ch in text:
        if ch not in chars:
            new_text += ch
        else:
            new_text += "_"
    return new_text
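# For illustration, with a hypothetical title: clean_chinese_character("AMD:新一代处理器发布!")
# would return "AMD_新一代处理器发布_", which is safe to use as part of a Windows file name.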
def create_docx(news_type, title, content):
    '''Use the python-docx library to write the news content into a Word file.'''
    document = Document()
    paragraph = document.add_paragraph()
    paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    # bold is a run-level property in python-docx, so put the title into its own run
    paragraph.add_run(title).bold = True
    for x in content:
        paragraph = document.add_paragraph(x)
    # the body paragraphs share the default style, so adjusting it once is enough
    style = paragraph.style
    font = style.font
    font.size = Pt(15)
    font.name = "consolas"
    name = news_type + "-" + clean_chinese_character(title) + ".docx"
    document.save(news_type + "/" + name)
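# Note: create_docx assumes the output directory named by news_type already exists;
# githrml() below creates it (e.g. "mydrivers/") before any documents are written.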
########################################################################
def githrml():
    national = "mydrivers"
    blog_news = "http://blog.mydrivers.com/"
    blog_news_pattern = {"id": "main"}
    # delete the old output directory
    print("deleting old dir")
    if os.path.exists(national):
        shutil.rmtree(national)
    # create a fresh output directory
    print("creating dir: ", national)
    os.mkdir(national)
    blog_news_list = get_title_link(blog_news, blog_news_pattern)
    print("\ngetting news content")
    for x in blog_news_list:
        paras = get_news_body(blog_news_list[x])
        #paras = get_news_body(x)
        if paras is not None and len(paras) > 0:
            print("writing:", clean_chinese_character(x), blog_news_list[x])
            create_docx(national, x, paras)
    print("All done, have a nice day")
if __name__ == "__main__":
    githrml()