forked from icodeu/BeautifulSoup
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
76 lines (66 loc) · 2.93 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#coding=utf8
import requests
from bs4 import BeautifulSoup
import sqlite3
import datetime
# Total number of listing pages to scrape (pages 1..ALL_PAGE_NUMBER inclusive)
ALL_PAGE_NUMBER = 21
# Persist one course record to the local SQLite database
def saveToSqlite(lesson_info, db_path="lesson.db"):
    """Insert one course record into the ``lesson_info`` table.

    The table must already exist and accept six positional values; this
    function does not create it.

    Args:
        lesson_info: mapping with the keys ``name``, ``link``, ``des``,
            ``number``, ``time`` and ``degree``.
        db_path: path of the SQLite database file. Defaults to
            ``"lesson.db"`` in the current working directory.

    Raises:
        KeyError: if ``lesson_info`` lacks one of the required keys.
        sqlite3.Error: if the insert fails (for example, the table is missing).
    """
    # Column order matches the positional order of the insert statement.
    row = (
        lesson_info['name'],
        lesson_info['link'],
        lesson_info['des'],
        lesson_info['number'],
        lesson_info['time'],
        lesson_info['degree'],
    )
    con = sqlite3.connect(db_path)
    try:
        # Bound parameters instead of string interpolation: scraped text may
        # contain quotes, which would otherwise break (or inject into) the SQL.
        con.execute("insert into lesson_info values (?, ?, ?, ?, ?, ?)", row)
        con.commit()
    finally:
        # Always release the connection, even when the insert fails.
        con.close()
# Main scraping routine and its private helpers
def _fetch_page(url, max_attempts=3, timeout=10):
    """Download ``url``; return the response, or None if every attempt fails.

    Only ``requests`` errors (timeouts, connection problems, ...) trigger a
    retry, so KeyboardInterrupt and programming errors still propagate.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            # An explicit timeout keeps a stalled connection from hanging forever.
            return requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            if attempt < max_attempts:
                print("重新抓取 " + " " + url)
    return None


def _parse_lesson(item):
    """Build a lesson dict from one ``<li>`` element, or return None.

    None is returned when the element lacks any of the expected sub-elements
    or has an empty title, i.e. it is not treated as a course entry.
    """
    try:
        card = item.contents[1]
        anchor = card.find("a")
        name = anchor.text
        if not name:
            return None
        # NOTE(review): "zhongji" looks like the CSS class of a single
        # difficulty level; entries without such a <dd> are skipped -- confirm
        # that this is intended.
        return {
            "name": name,
            "link": anchor.get("href"),
            "des": card.find("p").text,
            "number": card.find("em", {"class": "learn-number"}).text,
            "time": card.find("dd", {"class": "mar-b8"}).contents[1].text,
            "degree": card.find("dd", {"class": "zhongji"}).contents[1].text,
        }
    except (AttributeError, IndexError, TypeError):
        # Missing child / missing tag / child is a plain string: not a course.
        return None


def startGrab():
    """Scrape listing pages 1..ALL_PAGE_NUMBER and store every course found.

    Each page is fetched with a bounded number of retries; a page that cannot
    be downloaded is reported and skipped. Every ``<li>`` that parses as a
    course is passed to ``saveToSqlite``. Database errors are reported and
    the run continues with the next entry.
    """
    # Base URL shared by all listing pages; the page number is appended.
    base_url = 'http://www.jikexueyuan.com/course/?pageNum='
    for page_number in range(1, ALL_PAGE_NUMBER + 1):
        url = base_url + str(page_number)
        print(">>>>>>>>>>>将要抓取" + " " + url)
        page = _fetch_page(url)
        if page is None:
            print("giving up on " + url)
            continue
        # Parse the page and walk every list item; non-course items are ignored.
        soup = BeautifulSoup(page.content)
        for item in soup.find_all("li"):
            lesson_info = _parse_lesson(item)
            if lesson_info is None:
                continue
            try:
                saveToSqlite(lesson_info)
            except sqlite3.Error as exc:
                # Keep scraping, but do not hide storage failures.
                print("failed to save lesson: " + str(exc))
if __name__ == '__main__':
    # Run the scraper and report the wall-clock duration in whole seconds.
    starttime = datetime.datetime.now()
    startGrab()
    endtime = datetime.datetime.now()
    # timedelta.seconds is only the seconds component (it wraps every 24h);
    # total_seconds() covers the whole duration.
    elapsed_seconds = int((endtime - starttime).total_seconds())
    print(" ".join(["执行时间: ", str(elapsed_seconds), "s"]))