-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlambda_handler.py
197 lines (159 loc) · 7.45 KB
/
lambda_handler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
managedkaos/azlyrics.py
https://gist.github.com/managedkaos/e3262b80154129cc9a976ee6ee943da3
"""
# Requests is a library that allows you to programmatically send out http requests
# from botocore.vendored import requests
import csv
import requests
from requests.exceptions import ConnectionError
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# requests inside botocore really shouldn't be used. but curious if performance is better
# from botocore.vendored import requests
# from botocore.vendored.requests.exceptions import ConnectionError
# from botocore.vendored.requests.adapters import HTTPAdapter
# from botocore.vendored.requests.packages.urllib3.util.retry import Retry
# os is a library for doing operating system things, like looking through file directories
import os
import s3fs
import time
import logging
import random
# BeautifulSoup is a library that lets developers parse the contents of a webpage
from bs4 import BeautifulSoup
# import pandas as pd
# Module-level logger. The logger itself passes DEBUG and above, but the
# attached stream handler only emits INFO and above.
logger = logging.getLogger('rap_webscraper.{}'.format(__name__))
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Attach the formatter to the handler; without this call the handler falls
# back to the default bare-message layout and `formatter` is dead code.
ch.setFormatter(formatter)
logger.addHandler(ch)
# Base delay in seconds applied before each outbound request, and the
# (low, high) bounds of the uniform random jitter added to that delay.
SLEEP_TIME = 5.212
NOISE = (-4.438, 4.32)
# Alternative jitter window kept for reference: NOISE = (0, 5)

# Request headers: present a desktop Chrome-on-macOS User-Agent.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/72.0.3626.121 Safari/537.36"
    ),
}

# TODO: create from scrape_config.csv
# S3 bucket name and the key prefix under which lyric text files are stored.
BUCKETNAME = "bars-api"
BARSDIR = "just-lyrics"
def _polite_sleep():
    """Sleep for SLEEP_TIME seconds plus uniform random jitter from NOISE.

    The jitter keeps the request cadence from looking like a robot so the
    scraper is less likely to be banned.
    """
    delay = SLEEP_TIME + random.uniform(NOISE[0], NOISE[1])
    logger.info('Sleeping: %s', delay)
    time.sleep(delay)


def _write_lyrics(fs, filename, page):
    """Write the lyric text, then any album info, from *page* to *filename*.

    The file is opened exactly once and closed by the context manager, so no
    handle is leaked and nothing is written to an already-closed file.

    Args:
        fs: filesystem object providing ``open(path, mode)``.
        filename: destination path (str).
        page: parsed BeautifulSoup document of the lyric page.
    """
    try:
        out = fs.open(filename, "w")
    except IOError:
        logger.warning('IOError could not write filename: %s', filename)
        return

    with out:
        # The divs without a class attribute contain the lyrics.
        for lyric in page.find_all("div", {"class": None}):
            try:
                out.write(lyric.text)
            except UnicodeError:
                logger.warning('UnicodeError, Skipping: %s', filename)
        # The song panel div has the album name and the year.
        for song_panel_div in page.find_all("div", {"class": "panel songlist-panel noprint"}):
            try:
                out.write('ALBUM INFO')
                out.write(song_panel_div.text)
            except UnicodeError:
                logger.warning('UnicodeError, Skipping')


def _scrape_song(fs, sess, artist_folder_name, song_link):
    """Fetch one linked lyric page and store it, unless it is already stored.

    Args:
        fs: filesystem object used for the existence check and the write.
        sess: ``requests`` session used for the HTTP GET.
        artist_folder_name: folder name placed under BUCKETNAME/BARSDIR.
        song_link: an ``<a href=...>`` tag taken from the artist page.
    """
    if not song_link.text:
        return

    lyric_url = song_link['href']
    if ".." in lyric_url:
        # Relative link such as "../lyrics/...": drop the leading ".." and
        # prefix the site root.
        lyric_url = "https://www.azlyrics.com" + lyric_url[2:]
    logger.info("Looking @ Lyrics for %s", lyric_url)

    filename = song_link.text.replace(' ', '_').replace("'", '').replace('/', '')
    filename += ".txt"
    if filename.lower() == "submit.txt":
        # The page's "Submit" anchor is not a song.
        return
    filename = str(os.path.join(BUCKETNAME, BARSDIR, artist_folder_name, filename))
    logger.info("Filename: %s", filename)

    if fs.exists(filename):
        logger.info('File %s already exists, skipping web request', filename)
        return

    logger.debug('Requesting: %s', lyric_url)
    _polite_sleep()
    try:
        response = sess.get(lyric_url, headers=headers)
    except requests.exceptions.RequestException as exc:
        # Covers connection errors, exhausted retries and malformed URLs
        # scraped from anchors that are not real song links.
        logger.info('%s', exc)
        return
    new_soup = BeautifulSoup(response.text, "lxml")

    logger.debug('Will Write to: %s', filename)
    # https://stackoverflow.com/questions/12517451/automatically-creating-directories-with-file-output
    parent_dir = str(os.path.dirname(filename))
    if not fs.exists(parent_dir):
        logger.info('%s Does not exist.', parent_dir)
        try:
            fs.mkdir(parent_dir)
        except FileExistsError:
            # Guard against a race: something created it after the check.
            pass

    _write_lyrics(fs, filename, new_soup)


def handler(event, context):
    """Lambda-style entry point: scrape lyric pages for the configured artists.

    Reads (artist, url) rows from ``lyrics_sources/scrape_config.csv`` under
    BUCKETNAME, requests each artist page, follows every anchor on it and
    stores the text of each lyric page under
    ``BUCKETNAME/BARSDIR/<artist>/<song>.txt``. Songs whose file already
    exists are skipped without a web request.

    Args:
        event: invocation payload; not read.
        context: runtime context; not read.

    Returns:
        None.
    """
    today_day = time.localtime().tm_mday
    fs = s3fs.S3FileSystem()

    sess = requests.Session()
    retries = Retry(total=2, backoff_factor=2.5, status_forcelist=[500, 501, 502, 503, 504])
    sess.mount('http://', HTTPAdapter(max_retries=retries))
    sess.mount('https://', HTTPAdapter(max_retries=retries))

    lyric_links = os.path.join(BUCKETNAME, 'lyrics_sources', 'scrape_config.csv')
    with fs.open(lyric_links, 'r') as f:
        artists_and_urls = [(row['artist'], row['url']) for row in csv.DictReader(f)]

    # Start at an offset given by the day of the month so that different
    # artists come first on different days.
    # NOTE(review): tm_mday is 1..31, so index 0 is never visited and a config
    # with fewer rows than the current day of the month scrapes nothing --
    # confirm whether a wrap-around rotation was intended.
    for artist_folder_name, url in artists_and_urls[today_day:]:
        # Sleep before the request. Checking whether the artist directory
        # exists and skipping it would miss songs left over from a scrape
        # session that ended midway. If the url list gets too long, the
        # initial artist-page requests alone may get the scraper blocked.
        logger.info('Requesting songs from: %s', url)
        _polite_sleep()
        try:
            r = sess.get(url, headers=headers)
        except requests.exceptions.RequestException as exc:
            logger.info('Errored on:%s \n%s', url, exc)
            continue

        # Parse the artist page with the lxml parser and visit every link.
        soup = BeautifulSoup(r.text, "lxml")
        for song_link in soup.find_all("a", href=True):
            _scrape_song(fs, sess, artist_folder_name, song_link)