forked from cbrincoveanu/MusicCogGroup1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser.py
48 lines (44 loc) · 1.88 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import glob
import csv
def _iter_songs(lines):
    """Yield ``(artist, title, lyrics)`` for every song record in *lines*.

    A record is two consecutive header lines (artist, then title) whose
    first three characters are dropped, followed by lyric lines up to the
    next line starting with ``###``. Anything before the first ``###``
    line is ignored. Lyric lines are stripped and blank ones are dropped.

    A header that is the very last line (no title line after it) cannot
    form a record; it is reported and skipped instead of raising
    ``IndexError``.
    """
    total = len(lines)
    i = 0
    # Skip any preamble that precedes the first song header.
    while i < total and not lines[i].startswith("###"):
        i += 1
    while i < total:
        if i + 1 >= total:
            # Artist header without a following title line: truncated file.
            print("### Skipping incomplete record: " + lines[i].strip())
            return
        artist = lines[i][3:].strip()
        title = lines[i + 1][3:].strip()
        i += 2
        lyrics = []
        while i < total and not lines[i].startswith("###"):
            line = lines[i].strip()
            if line:
                lyrics.append(line)
            i += 1
        yield artist, title, lyrics


def main():
    """Parse every ``data/*.txt`` lyrics file and write features to ``data.csv``.

    The genre is taken from the file name (``data/<genre>.txt``). One CSV
    row is written per song: genre, artist, title and the ratio of unique
    lyric lines. Progress is printed to stdout.
    """
    with open('data.csv', 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=',', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(["Genre", "Artist", "Title", "UniqueLinesRatio"])
        # Iterate through the per-genre input files.
        for filename in glob.iglob('data/*.txt'):
            print("### Parsing file " + filename)
            # Drop the 5-character directory prefix and the ".txt" suffix.
            genre = filename[5:len(filename)-4]
            print("### Genre: "+ genre)
            number_of_songs = 0
            # Read inside a context manager so the handle is always closed.
            with open(filename) as song_file:
                lines = song_file.readlines()
            for artist, title, lyrics in _iter_songs(lines):
                number_of_songs += 1
                print(artist + " - " + title)
                # Extract features from the song. A song without any lyric
                # lines gets ratio 0.0 rather than dividing by zero.
                if lyrics:
                    unique_lines_ratio = calc_unique_lines_ratio(lyrics)
                else:
                    unique_lines_ratio = 0.0
                #TODO: average character count per word
                #TODO: average word count per line
                #TODO: total word count
                #TODO: bag of words extraction
                #TODO: unique word ratio per song
                #TODO: unique word ratio per line
                csv_writer.writerow([genre, artist, title, unique_lines_ratio])
            print("### "+ str(number_of_songs) + " songs found for "+ genre +"\n")
def calc_unique_lines_ratio(lyrics):
    """Return the fraction of distinct lines among *lyrics*.

    *lyrics* is a list of lyric-line strings. The result is the number of
    distinct lines divided by the total number of lines, so 1.0 means no
    line repeats. An empty list yields 0.0 instead of raising
    ``ZeroDivisionError``.
    """
    if not lyrics:
        return 0.0
    return len(set(lyrics)) / len(lyrics)
# Run the parser only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()