-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwb.py
50 lines (44 loc) · 2.48 KB
/
wb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from weibo.items import WeiboItem
from copy import deepcopy
import scrapy
import json
class WbSpider(scrapy.Spider):
name = "wb"
allowed_domains = ["m.weibo.cn"]
Q = input('请输入搜索关键词:')
page = 1
start_urls = [f"https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D61%26q%3D{Q}" #ah, so it has parsed the url format of weibo, to change it according to our searching/iterating request
f"%26t%3D%26page_type=searchall&page={page}"]
#here it mainly handles long-short text problem
def parse(self, response):
result = json.loads(response.text) #json.loads transform raw json data as a string to python dictionary
result = result['data']['cards'] #extracts the list of cards from the JSON structure, each card will be a post or activity in weibo
item = WeiboItem()
for detail in result:
item['_id'] = detail['mblog']['user']['id']
item['_name'] = detail['mblog']['user']['screen_name']
print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++', detail['mblog']['isLongText'])
if detail['mblog']['isLongText'] is True: #if the mblog is long-text, it means the post is truncated,and the full-text need to be fetched from another URL
blog_id = detail['mblog']['id']
blog_id = ''.join(blog_id)
full_text_url = 'https://m.weibo.cn/statuses/extend?id=' + blog_id
yield scrapy.Request(full_text_url, callback=self.long_text_parse, meta={'item': deepcopy(item)},
priority=1)
else:
_text = detail['mblog']['text']
_text = _text.replace('\n', '')
_text = ''.join([x.strip() for x in _text])
item['_text'] = _text
yield item
self.page += 1
next_page = f"https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D61%26q%3D{self.Q}" \
f"%26t%3D%26page_type=searchall&page={self.page}"
yield scrapy.Request(next_page, callback=self.parse, meta={'item': item})
def long_text_parse(self, response):
result = json.loads(response.text)
item = response.meta['item']
_text = result['data']['longTextContent']
_text = _text.replace('\n', '')
_text = ''.join([x.strip() for x in _text])
item['_text'] = _text
yield item