-
Notifications
You must be signed in to change notification settings - Fork 1
/
first_spider.py
106 lines (82 loc) · 3.98 KB
/
first_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import scrapy
import re
import pickle
import json
import sys
from scrapy.http import HtmlResponse
class firstSpider(scrapy.Spider):
    """Collect playlist names, URLs and IDs from a YouTube channel's playlists page.

    The first request is sent to ``localHost`` (a ``render.html`` endpoint on
    port 8050 -- presumably a Splash rendering service; confirm against the
    deployment) wrapping ``youtubeUrl``.  Every following page is fetched
    from YouTube's ``browse_ajax`` endpoint using the continuation token
    extracted from the previous response.  When no further token is found,
    the collected items are appended to ``lol.json`` and the crawl stops.
    """

    # Number of ``browse_ajax`` ("scroll") responses handled so far.
    pageScollCount = 0
    # Number of items that have been written to the output file.
    countOfDataPushed = 0
    # Number of (name, url) pairs inspected, including skipped ones.
    loopCount = 0
    # Class-level default kept for backward compatibility; every instance
    # gets its own list in ``__init__`` so results are not shared.
    allData = []
    name = "scrapePlaylist"
    localHost = "http://localhost:8050/render.html?url="
    youtubeUrl = "https://www.youtube.com/user/physicsgalaxy74/playlists"
    start_urls = [localHost + youtubeUrl]

    def __init__(self, *args, **kwargs):
        """Initialise the spider with a per-instance result list."""
        super().__init__(*args, **kwargs)
        # Shadow the class-level list so appends do not leak across instances.
        self.allData = []

    def start_requests(self):
        """Yield one request per entry in ``start_urls``, handled by ``parse``."""
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse)

    def parse(self, response):
        """Dispatch a response to the matching handler and follow the next page.

        Responses whose URL contains ``browse_ajax`` are scroll (AJAX) pages;
        anything else is treated as the initial rendered page.  A follow-up
        request is yielded only when the handler returned a URL; a ``None``
        result means pagination is finished.
        """
        print("loop count in the beginning is " + str(self.loopCount))
        if 'browse_ajax' not in response.url:
            nextUrl = self.handleFirstResponse(response)
        else:
            nextUrl = self.handleScrollResponse(response)
        # The handlers return None on the last page; building a Request from
        # None would raise, so stop here instead.
        if nextUrl:
            yield scrapy.Request(nextUrl, callback=self.parse)

    def handleFirstResponse(self, response):
        """Harvest playlists from the initial page and build the first scroll URL.

        Returns:
            The ``browse_ajax`` URL for the next page, or ``None`` when the
            page carries no continuation token (in which case the collected
            data is written out immediately).
        """
        HTMLbodyForSearch = response.body.decode("utf-8")
        pattern = r"continuation\":(.+?\")"
        tokenMatch = re.search(pattern, HTMLbodyForSearch, re.MULTILINE)

        playlistnames = response.css(
            'a.yt-simple-endpoint.style-scope.yt-formatted-string::text').extract()
        playlistUrls = response.css(
            'a.yt-simple-endpoint.style-scope.yt-formatted-string::attr(href)').extract()
        self.addNameUrlToData(playlistnames, playlistUrls)

        if tokenMatch is None:
            # Nothing more to scroll: persist what was found on this page.
            print("no continuation token found on first page")
            self.writeDataToFile()
            return None

        # The capture group includes the surrounding quotes; strip them.
        continuationToken = tokenMatch.group(1).replace('"', '')
        urlForscroll = "https://www.youtube.com/browse_ajax?ctoken=" + continuationToken
        print("final scroll url \n" + urlForscroll)
        return urlForscroll

    def handleScrollResponse(self, response):
        """Harvest playlists from a ``browse_ajax`` JSON response.

        The JSON payload is read for two keys: ``content_html`` (an HTML
        fragment with the playlist links) and ``load_more_widget_html`` (an
        HTML fragment searched for the next continuation token).

        Returns:
            The ``browse_ajax`` URL for the next page, or ``None`` when no
            further token is present (the collected data is then written
            to disk).
        """
        print("inside of scroll response")
        self.pageScollCount = self.pageScollCount + 1
        print("scroll tiems " + str(self.pageScollCount))

        jsonResponse = json.loads(response.text)
        # Missing or null fields are treated as empty fragments so the final
        # page still reaches writeDataToFile() instead of raising.
        loadMoreDatafromHtml = jsonResponse.get('load_more_widget_html') or ''
        contentHTML = jsonResponse.get('content_html') or ''

        # Wrap the fragment in a response object to reuse CSS selectors;
        # the URL is a placeholder and is never requested.
        scrollHtmlBody = HtmlResponse(url="testing", body=contentHTML, encoding='utf-8')
        linkSelector = 'a.yt-uix-sessionlink.yt-uix-tile-link.spf-link.yt-ui-ellipsis.yt-ui-ellipsis-2'
        playlistNames = scrollHtmlBody.css(linkSelector + '::attr(title)').extract()
        playlistUrls = scrollHtmlBody.css(linkSelector + '::attr(href)').extract()
        self.addNameUrlToData(playlistNames, playlistUrls)

        pattern = r";continuation=(.+?)\""
        tokenMatch = re.search(pattern, loadMoreDatafromHtml, re.MULTILINE)
        if tokenMatch:
            return "https://www.youtube.com/browse_ajax?ctoken=" + tokenMatch.group(1)

        # No token means this was the last page: flush the results.
        print("no continuation token found; finished scrolling")
        self.writeDataToFile()
        return None

    def writeDataToFile(self):
        """Append all collected items to ``lol.json`` as one JSON array line.

        The list is serialised exactly once (the file contains a JSON array,
        not a JSON string holding escaped JSON).  A trailing newline keeps
        successive appended dumps separable.
        """
        print("size of item gathered " + str(len(self.allData)))
        with open("lol.json", 'a', encoding='utf-8') as outfile:
            json.dump(self.allData, outfile)
            outfile.write("\n")
        self.countOfDataPushed = self.countOfDataPushed + len(self.allData)
        print("count of data pushed " + str(self.countOfDataPushed))

    def addNameUrlToData(self, playlistnames, playlistUrls):
        """Pair names with URLs and append one item per playlist to ``allData``.

        Entries whose name contains ``'full playlist'`` are skipped.  Each
        stored item has the keys ``playlist``, ``url`` and ``playlistID``.
        Pairing uses ``zip``, so surplus entries in the longer list are
        ignored.
        """
        print("length of playlist is " + str(len(playlistnames)))
        for playlistname, url in zip(playlistnames, playlistUrls):
            self.loopCount = self.loopCount + 1
            if 'full playlist' in playlistname:
                continue
            self.allData.append({
                'playlist': playlistname,
                'url': url,
                'playlistID': self.extractPlaylistIdFromUrl(url),
            })
        print("total loop count " + str(self.loopCount))

    def extractPlaylistIdFromUrl(self, url):
        """Return the value of the ``list=`` parameter in *url*.

        The match stops at the next ``&`` or ``#`` so trailing query
        parameters or fragments are not included in the ID.

        Returns:
            The playlist ID string, or ``None`` when *url* has no non-empty
            ``list=`` parameter.
        """
        idMatch = re.search(r"list=([^&#]+)", url)
        return idMatch.group(1) if idMatch else None