-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_extraction.py
196 lines (170 loc) · 6.99 KB
/
data_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import lyricsgenius
from config_parser import config_args
from song_info import SongInfo
from songs_db import SongsInfoDB
import datetime
from os.path import join
from os import listdir, walk, makedirs
from requests.exceptions import Timeout, ConnectionError
import chosen_artists
def get_genre(genre, db_pickle_path=None, page=1):
    """
    Collect lyrics, metadata and annotations for every song carrying a Genius genre tag.

    Walks the tag's result pages starting at ``page``. Each hit is looked up by title and
    first listed artist, its annotations are fetched by song id, and the result is added to
    a ``SongsInfoDB``. The DB is pickled every ``save_songs_db_every`` songs, after every
    connection error, and once more when the crawl ends.
    Original note: takes ~0.5 min per song with the client settings below.

    :param genre: Genius tag to crawl; also used as the prefix of the DB name.
    :param db_pickle_path: optional pickle path forwarded to ``SongsInfoDB``
        (presumably an earlier checkpoint to resume from - confirm against SongsInfoDB).
    :param page: tag results page to start from; crawling continues while ``next_page`` is truthy.
    :return: None. Results are persisted through ``SongsInfoDB.save_to_pickle``.
    """
    # per song: full title, url, id, annotation_count, lyrics, (verses, annotations),
    # metadata (genre, artists, album...)
    extraction_cfg = config_args['data_extraction']
    token = extraction_cfg['token']
    save_every = extraction_cfg['save_songs_db_every']

    genius = lyricsgenius.Genius(token)
    # client-level settings, in order to cope with timeouts
    genius.timeout = 10
    genius.sleep_time = 3
    genius.retries = 5
    retries_num = 2  # extra attempts per song on top of the client's own retries

    name_by_date = genre + '_' + datetime.datetime.today().strftime('%d%m%y_%H%M')
    songs_info_db = SongsInfoDB(name=name_by_date, genre=genre, pickle_path=db_pickle_path)

    while page:  # original note: 20 songs at each page
        # returns urls of songs
        res = genius.tag(genre, page=page)
        for hit in res['hits']:
            retries = 0
            while retries < retries_num:
                try:
                    # lyrics + metadata (by title and main artist)
                    song = genius.search_song(hit['title'], hit['artists'][0])
                    if song is None:
                        # nothing found for this hit: retrying will not help
                        break
                    # annotations (by song id)
                    annotation = genius.song_annotations(song.id)
                    song_info = SongInfo(genre, song, annotation)
                    songs_info_db.add_song(song_info)
                    # save every # songs to pickle
                    if songs_info_db.get_len() % save_every == 0:
                        songs_info_db.save_to_pickle()
                        print('# songs in db:', str(songs_info_db.get_len()))
                        print('current page:', str(page))
                    break  # break to next hit
                except ConnectionError:
                    # requests' ConnectionError (imported at file level): checkpoint, then retry.
                    # Listed first so connect-timeouts keep taking this branch.
                    retries += 1
                    songs_info_db.save_to_pickle()
                    print('# songs in db:', str(songs_info_db.get_len()))
                    print('ConnectionError: current page:', str(page))
                except (Timeout, TimeoutError):
                    # requests raises its own Timeout, which is NOT a subclass of the
                    # builtin TimeoutError; catch both so read timeouts are retried.
                    retries += 1
        page = res['next_page']

    # final save
    songs_info_db.save_to_pickle()
    print('# songs in db:', str(songs_info_db.get_len()))
    print('Done: extracting', genre, 'songs :)')
    print('Done')
def all_genres_extraction(config_args):
    """
    Run ``get_genre`` for each configured genre, leaving out the last two entries of the list.

    main Genius tags - ['country', 'pop', 'r&b', 'rap', 'rock']
    secondary tags (hundreds...) - https://genius.com/Genius-tags-music-genres-international-annotated

    :param config_args: configuration mapping; the genres are read from
        ``config_args['data_extraction']['genres']``.
    :return: None
    """
    configured_genres = config_args['data_extraction']['genres']
    # original note on this slice: "without final"
    for tag in configured_genres[:-2]:
        get_genre(tag)
def genre_from_last_point(last_file, genre, page):
    """
    Continue a genre extraction, handing ``get_genre`` a pickle path and a starting page.

    The pickle path is built as ``<pickles_parent_dir>/<genre>/<last_file>``, where
    ``pickles_parent_dir`` comes from the ``data_extraction`` section of ``config_args``.

    :param last_file: file name of the pickle inside the genre's directory.
    :param genre: genre tag; used both as the directory name and as the tag to crawl.
    :param page: results page passed on to ``get_genre``.
    :return: None
    """
    pickles_root = config_args['data_extraction']['pickles_parent_dir']
    checkpoint_path = join(pickles_root, genre, last_file)
    get_genre(genre, db_pickle_path=checkpoint_path, page=page)
def get_songs_by_artists(chosen_artists, db_pickle_path=None, page=1):
    """
    Collect songs and annotations per artist, writing one ``SongsInfoDB`` pickle per artist.

    The genre is 'artists'; no specific genre is recorded for each song. Artists that
    are not found, or that come back with fewer than 3 songs, are skipped.

    :param chosen_artists: iterable of artist names to search for.
    :param db_pickle_path: optional pickle path forwarded to every ``SongsInfoDB`` created here.
    :param page: unused; kept so existing callers keep working.
    :return: None. Results are persisted through ``SongsInfoDB.save_to_pickle``.
    """
    extraction_cfg = config_args['data_extraction']
    token = extraction_cfg['token']
    max_songs_per_artist = extraction_cfg['max_songs_per_artist']
    genre = 'artists'

    genius = lyricsgenius.Genius(token)
    # client-level settings, in order to cope with timeouts
    genius.timeout = 12
    genius.sleep_time = 3
    genius.retries = 3
    retries_num = 2  # extra attempts per song on top of the client's own retries

    # Pre-bind what the final save reads, so an empty artist list no longer
    # raises NameError after the loop.
    songs_info_db = None
    ch_artist = None

    for ch_artist in chosen_artists:
        name_by_date = ch_artist + '_' + datetime.datetime.today().strftime('%d%m%y_%H%M')
        songs_info_db = SongsInfoDB(name=name_by_date, genre=genre, pickle_path=db_pickle_path)
        artist = genius.search_artist(ch_artist, max_songs=max_songs_per_artist)
        if artist is None:
            print('Warning: The artist:', ch_artist, 'was not found.')
            continue
        if len(artist.songs) < 3:  # not enough songs -> not worth keeping
            continue

        for song in artist.songs:
            retries = 0
            while retries < retries_num:
                try:
                    annotation = genius.song_annotations(song.id)
                    song_info = SongInfo(genre, song, annotation)
                    songs_info_db.add_song(song_info)
                    break  # break to next song
                except ConnectionError:
                    # requests' ConnectionError (imported at file level): checkpoint, then retry.
                    # Listed first so connect-timeouts keep taking this branch.
                    retries += 1
                    songs_info_db.save_to_pickle(pi_name=ch_artist)
                    print('# songs in db:', str(songs_info_db.get_len()))
                except (Timeout, TimeoutError):
                    # requests raises its own Timeout, which is NOT a subclass of the
                    # builtin TimeoutError; catch both so read timeouts are retried.
                    retries += 1

        # save when this artist's songs are finished
        songs_info_db.save_to_pickle(pi_name=ch_artist)
        print('# songs in db:', str(songs_info_db.get_len()))

    # final save (re-saves the DB of the last artist iterated, as before)
    if songs_info_db is not None:
        songs_info_db.save_to_pickle(pi_name=ch_artist)
        print('# songs in db:', str(songs_info_db.get_len()))
    print('Done: extracting', genre, 'songs :)')
    print('Done')
if __name__ == '__main__':
    # Script entry point: currently runs the per-artist extraction only.
    # The commented snippets below are the alternative run modes.
    #
    # # one genre extraction
    # genre = 'country'
    # get_genre(genre)
    #
    # # genre from last checkpoint
    # last_file = 'rap_050222_1022.pickle'
    # page = 43
    # genre_from_last_point(last_file, genre, page)
    #
    # # all genres extraction
    # all_genres_extraction(config_args)
    #
    # by artists
    # Use a separate name so the imported `chosen_artists` module is not rebound to its own attribute.
    artists_to_fetch = chosen_artists.chosen_artists
    get_songs_by_artists(artists_to_fetch, db_pickle_path=None, page=1)
# default search by title: A-Z/ pageviews / release date
# # # artist search
# artist = genius.search_artist("Imagine Dragons", max_songs=1, sort="popularity", include_features=True)
# print(artist.songs)
# #
# # # song by artist and song's title
# # has song_id inside
# song = genius.search_song("enemy", artist.name)
# print(song.lyrics)
# #
# # # annotations
# # a = genius.song_annotations(5992642) #song id