-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_lyrics.py
147 lines (107 loc) · 4.11 KB
/
get_lyrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import requests
import re
from requests_html import HTMLSession
from unicodedata import normalize
def requestsScrape(original_artist, original_music):
    """Get the lyrics from cifraclub according to the artist and music names.

    The lookup goes through ``search_in_google`` to find the page URL, then
    ``get_music_title`` to read the title the page itself reports, and only
    downloads the lyrics when that title matches the requested names.

    Args:
        original_artist: Artist name as requested by the caller.
        original_music: Song name as requested by the caller.

    Returns:
        * ``None`` when no cifraclub link was found or the page title could
          not be read.
        * A dict with the keys ``"Music"``, ``"Artist"`` and ``"Url"`` when
          the page was found but its title differs (case-insensitively) from
          the requested names, so the caller can decide what to do with it.
        * An empty list when the lyrics page returned no lyric elements.
        * Otherwise the value returned by ``filter_letra``.
    """
    url = search_in_google(original_artist, original_music)
    if not url:
        return None

    title = get_music_title(url)
    if not title:
        return None

    music, artist = title
    same_artist = original_artist.lower() == artist.lower()
    same_music = original_music.lower() == music.lower()
    if not (same_artist and same_music):
        # The search landed on a different song/artist: report what was found
        # instead of returning lyrics the caller did not ask for.
        return {"Music": music, "Artist": artist, "Url": url}

    letra = pure_letra(f"{url}/letra")
    if not letra:
        # pure_letra returns [] on request errors, non-200 responses or when
        # no lyric nodes match; filter_letra indexes the first element, so
        # passing the empty list on would raise IndexError.
        return []

    return filter_letra(letra)
def pure_letra(url):
    """Go to the url and try to find the lyrics.

    Args:
        url: Address of the lyrics page to download.

    Returns:
        The list of elements matched by the lyrics XPath queries. An empty
        list is returned when the request fails, when the response status is
        not 200, or when neither query matches (i.e. it was not the expected
        kind of page).
    """
    try:
        # The context manager closes the session (and its pooled connections)
        # once the response has been downloaded.
        with HTMLSession() as session:
            response = session.get(url)
    except requests.exceptions.RequestException as e:
        print(e)
        return []

    if response.status_code != 200:
        return []

    # Primary layout: paragraphs directly under the "letra" container.
    lyrics = response.html.xpath('//div[@class="letra"]/p')
    if not lyrics:
        # Fallback layout: paragraphs nested one level deeper in "letra-l".
        lyrics = response.html.xpath(
            '//div[@class="letra"]/div[@class="letra-l"]/p')
    return lyrics
# Filters the lyrics into a format that is easier to use later
def filter_letra(link):
    """Filter the lyrics so they are easier to manipulate.

    Each strophe is split into verses on newlines. A verse containing a
    repetition marker such as ``(x3)`` has the marker removed and causes its
    strophe to appear that many times in the result.

    Args:
        link: Sequence of strophes. Each item is either a string or an object
            exposing the strophe as a ``text`` attribute.

    Returns:
        A list of strophes, each strophe being a list of verse strings.
        An empty input yields an empty list.

    Notes:
        * The count is taken from the first marker in a verse; every marker in
          that verse is removed.
        * A verse that becomes exactly empty once its marker is removed is
          dropped; other text (including trailing spaces) is kept untouched.
        * Repeated strophes are independent copies, so mutating one of them
          does not change the others.
        * NOTE(review): the original source lost its indentation; the
          "drop the emptied verse" step is assumed to apply only to verses
          that carried a marker -- confirm against the upstream file.
    """
    if not link:
        return []

    # "(xN)" where N is any non-negative integer (multi-digit supported).
    marker = re.compile(r"\(x(\d+)\)")

    result = []
    for item in link:
        raw = item.text if hasattr(item, "text") else item

        verses = []
        extra_copies = 0
        for verse in raw.split("\n"):
            found = marker.search(verse)
            if found is None:
                verses.append(verse)
                continue

            # "(xN)" means the strophe is sung N times, i.e. N - 1 extra
            # copies; "(x0)" and "(x1)" add none.
            extra_copies += max(int(found.group(1)) - 1, 0)
            cleaned = marker.sub("", verse)
            if cleaned != "":
                verses.append(cleaned)

        result.append(verses)
        # Build a fresh list per repetition instead of re-inserting the same
        # object, and do it outside the verse loop so nothing is mutated
        # while being iterated.
        result.extend(list(verses) for _ in range(extra_copies))

    return result
# Runs a Google search and returns the cifraclub link
def search_in_google(artist, music):
    """Make a google search and return the cifraclub link.

    Args:
        artist: Artist name to search for.
        music: Song name to search for.

    Returns:
        The first result link when it contains ``"https://www.cifraclub"``;
        ``None`` when the request fails, when no result link is found, or
        when the first result points somewhere else.
    """
    query = f"{music} {artist} cifra club"
    try:
        # Passing the query through ``params`` lets requests URL-encode it,
        # so names containing "&", "#", "+" etc. do not corrupt the URL.
        # The context manager closes the session after the download.
        with HTMLSession() as session:
            response = session.get(
                "https://www.google.com/search", params={"q": query})
    except requests.exceptions.RequestException as e:
        print(e)
        return None

    # NOTE(review): "yuRUbf" is presumably the class Google used for result
    # link containers when this was written -- verify it is still current.
    links = response.html.xpath('//div[@class="yuRUbf"]/a/@href')
    if not links or "https://www.cifraclub" not in links[0]:
        print(f"Failed to find {music} lyrics!")
        return None
    return links[0]
def get_music_title(url):
    """Read the song and artist names shown on the page at ``url``.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(music, artist)`` tuple with the text of the first
        ``<h1 class="t1">`` and ``<h2 class="t3">`` elements, or ``None``
        when the request fails or either element is missing.
    """
    try:
        # The context manager closes the session once the page is downloaded.
        with HTMLSession() as session:
            response = session.get(url)
    except requests.exceptions.RequestException as e:
        print(e)
        return None

    music_el = response.html.xpath('//h1[@class="t1"]')
    artist_el = response.html.xpath('//h2[@class="t3"]')
    if not music_el or not artist_el:
        return None

    return music_el[0].text, artist_el[0].text
# Splits the lyrics into verses and strophes
def divide_by_text(text):
    """Divide the lyrics in verses and strophes.

    Carriage returns and trailing newlines are removed, parenthesised spans
    that end a line are stripped, the text is split into strophes on blank
    lines, and each strophe is handed to ``filter_letra``.

    Args:
        text: Raw lyrics as a single string.

    Returns:
        A list of strophes (lists of verse strings). Strophes that are empty
        or that contain an empty verse are discarded. Empty or newline-only
        input yields an empty list.
    """
    # rstrip replaces the original "while text[-1] == '\n'" loop, which
    # raised IndexError on empty or newline-only input.
    text = text.replace("\r", "").rstrip("\n")

    # A "(" ... ")" span of at least two characters whose last character is
    # not whitespace, immediately followed by a newline.
    # NOTE(review): this also matches "(xN)" repetition markers that are
    # followed by a newline, so those never reach filter_letra -- confirm
    # that is intended.
    annotation = re.compile(r"\(.+\S\)" + "\n")
    text = annotation.sub("", text)

    # Split into strophes on blank lines, then clean each strophe again.
    strophes = [annotation.sub("", strophe) for strophe in text.split("\n\n")]

    letra = filter_letra(strophes)
    return [strophe for strophe in letra if strophe and "" not in strophe]
# https://wiki.python.org.br/RemovedorDeAcentos
def stress_remove(txt, codif='utf-8'):
    """Return ``txt`` with accents removed, keeping only ASCII characters.

    The text is decomposed with NFKD normalisation and then encoded to ASCII
    with errors ignored, which drops the combining marks and any other
    character that has no ASCII form.

    Args:
        txt: Text to clean.
        codif: Not used by the function body; kept so existing callers that
            pass it keep working.

    Returns:
        The ASCII-only version of ``txt``.
    """
    decomposed = normalize('NFKD', txt)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('ASCII')