股票爬虫实例.py
import requests
from bs4 import BeautifulSoup
import re
import traceback
import time
import numpy as np

# Rotate the request headers (User-Agent) to reduce the risk of the IP being blocked
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}
]
def getHTMLText(url):  # url: the page to fetch
    try:
        # Pick a random User-Agent for each request
        r = requests.get(url, timeout=30, headers=hds[np.random.randint(0, len(hds))])
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return ""
def getStockList(lst, stockURL):  # lst: list that collects stock codes; stockURL: page listing all stocks
    html = getHTMLText(stockURL)  # fetch the page that lists the stock codes
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')  # the codes are embedded in the <a> tag hrefs
    for i in a:
        try:
            href = i.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])  # extract codes like sh600000 / sz000001
        except (KeyError, IndexError):  # tag has no href, or no code in it
            continue
def getStockInfo(lst, stockURL, fpath):  # lst: stock codes; stockURL: detail-page prefix; fpath: output file path
    count = 0
    seen = 0
    for s in lst:
        # `seen` counts how many codes have been processed; the first 90 stocks are
        # problematic and are skipped, so only codes 90-99 are fetched
        seen = seen + 1
        if seen < 90:
            continue
        if seen >= 100:
            break
        # time.sleep(np.random.rand() * 2)
        url = stockURL + s + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            soup = BeautifulSoup(html, 'html.parser')
            StockDict = {}
            stockInfo = soup.find('div', attrs={'class': 'bets-content'})
            try:
                name = soup.find_all(attrs={'class': 'bets-name'})[0].text.split()[0]
            except IndexError:
                continue
            StockDict.update({'股票信息': name})  # store the stock name first
            keyList = stockInfo.find_all('dt')    # field names for this stock
            valueList = stockInfo.find_all('dd')  # field values for this stock
            for i in range(len(keyList)):
                key = keyList[i].text
                value = valueList[i].text
                StockDict[key] = value
            with open(fpath, 'a', encoding='utf-8') as f:  # append the record to the output file
                f.write(str(StockDict) + '\n')
                count = count + 1
                print("\r当前进度:{:.2f}%".format(count * 100 / len(lst)), end="")
        except Exception:
            traceback.print_exc()
            count = count + 1
            print("\r当前进度:{:.2f}%".format(count * 100 / len(lst)), end="")
            continue
if __name__ == '__main__':
    Stock_list_url = 'http://quote.eastmoney.com/stocklist.html'  # page that lists all stock codes
    Stock_info_url = 'https://gupiao.baidu.com/stock/'  # prefix for per-stock detail pages
    file_save = 'D://python项目//Python_//爬虫项目实例//StockInfo.txt'  # output file for the scraped info
    slist = []  # collected stock codes
    getStockList(slist, Stock_list_url)
    getStockInfo(slist, Stock_info_url, file_save)