nielsen text mining #62

Open · wants to merge 9 commits into master
48 changes: 48 additions & 0 deletions .gitignore
@@ -0,0 +1,48 @@
api_client_codes.py

hamilton_0.html
hamilton_1.html
hamilton_2.html
hamilton_3.html
hamilton_4.html
hamilton_5.html
hamilton_6.html
hamilton_7.html
hamilton_8.html
hamilton_9.html
hamilton_10.html
hamilton_11.html
hamilton_12.html
hamilton_13.html
hamilton_14.html
hamilton_15.html
hamilton_16.html
hamilton_17.html
hamilton_18.html
hamilton_19.html
hamilton_20.html
hamilton_21.html
hamilton_22.html
hamilton_23.html
hamilton_24.html
hamilton_25.html
hamilton_26.html
hamilton_27.html
hamilton_28.html
hamilton_29.html
hamilton_30.html
hamilton_31.html
hamilton_32.html
hamilton_33.html
hamilton_34.html
hamilton_35.html
hamilton_36.html
hamilton_37.html
hamilton_38.html
hamilton_39.html
hamilton_40.html
hamilton_41.html
hamilton_42.html
hamilton_43.html
hamilton_44.html
hamilton_45.html
70 changes: 70 additions & 0 deletions import_hamilton.py
@@ -0,0 +1,70 @@
"""this is the file i'll use to import/download all of the hamilton lyrics
into text files"""

# packages are useful
from bs4 import BeautifulSoup
import requests
from lxml import html


def find_links(soup):
    """Take a parsed BeautifulSoup document and return a list of the
    absolute URLs of all links within it."""
    list_o_links = []
    # iterate through every <a> tag; skip anchors without an href attribute
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            list_o_links.append('http://www.themusicallyrics.com/' + href)
    return list_o_links
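
# A minimal doctest-style sketch of find_links on made-up markup
# (illustrative, not taken from the real site):
# >>> demo_soup = BeautifulSoup('<a href="h/song.html">Song</a>', 'lxml')
# >>> find_links(demo_soup)
# ['http://www.themusicallyrics.com/h/song.html']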


def cull_links(beginning_url, list_urls):
    """Take a URL fragment and a list of URLs, and return only the URLs
    that contain that fragment."""
    new_list_urls = []
    # keep each URL that contains the desired fragment
    for x in list_urls:
        if beginning_url in x:
            new_list_urls.append(x)
    return new_list_urls
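
# A quick sketch of cull_links with made-up URLs:
# >>> cull_links('/lyrics/', ['/lyrics/a.html', '/about.html'])
# ['/lyrics/a.html']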


def file_names(list_of_links, base_name):
    """Build one numbered HTML filename per link, e.g. base_name_0.html."""
    n = len(list_of_links)
    names_list = []
    for i in range(n):
        names_list.append(base_name + '_' + str(i) + '.html')
    return names_list
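
# Sketch of file_names output (placeholder links; only the count matters):
# >>> file_names(['u0', 'u1'], 'demo')
# ['demo_0.html', 'demo_1.html']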


def save_files(list_of_links, list_of_names):
    """Fetch each link and write the raw response bytes to its matching file."""
    for i in range(len(list_of_links)):
        song = requests.get(list_of_links[i])
        # write in binary mode, since response.content is bytes
        with open(list_of_names[i], 'wb') as text_file:
            text_file.write(song.content)
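
# The same loop written with zip, pairing each link with its filename
# (an equivalent sketch, not part of the original file):
# for link, name in zip(list_of_links, list_of_names):
#     with open(name, 'wb') as f:
#         f.write(requests.get(link).content)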

# fetch the HTML of the index page that links to every song
url_source = requests.get('http://www.themusicallyrics.com/h/351-hamilton-the-musical-lyrics.html')
# save the raw page so it can be inspected later
with open('url_page.txt', 'wb') as text_file:
    text_file.write(url_source.content)

# parse the page with BeautifulSoup so the links can be extracted
soup = BeautifulSoup(url_source.content, 'lxml')

# collect every link on the page, then keep only the song-lyric links
some_urls = find_links(soup)
useful_url = '/351-hamilton-the-musical-lyrics/'

list_of_links = cull_links(useful_url, some_urls)
base_name = 'hamilton'

# build one local filename per song
names_list = file_names(list_of_links, base_name)

# download every song page into its own file
save_files(list_of_links, names_list)
72 changes: 72 additions & 0 deletions import_hamilton2.py
@@ -0,0 +1,72 @@
"""this is the file i'll use to import/download all of the hamilton lyrics
into text files"""

# packages are useful
from bs4 import BeautifulSoup
import requests
from lxml import html


def find_links(soup):
    """Take a parsed BeautifulSoup document and return a list of the raw
    href values of all links within it."""
    list_o_links = []
    # iterate through every <a> tag and collect its href (which may be None)
    for link in soup.find_all('a'):
        list_o_links.append(link.get('href'))
    return list_o_links


def cull_links(beginning_url, list_urls):
    """Take a URL fragment and a list of URLs, and return only the URLs
    that contain that fragment, skipping None entries."""
    new_list_urls = []
    for x in list_urls:
        # skip None hrefs, then test whether the URL contains the fragment
        if x and beginning_url in x:
            new_list_urls.append(x)
    return new_list_urls


def file_names(list_of_links, base_name):
    n = len(list_of_links)
    names_list = []
    for i in range(n):
        names_list.append(base_name + '_' + str(i) + '.html')
    return names_list


def save_files(list_of_links, list_of_names):
    """Fetch each link and write the raw response bytes to its matching file."""
    for i in range(len(list_of_links)):
        song = requests.get(list_of_links[i])
        # binary mode, since response.content is bytes
        with open(list_of_names[i], 'wb') as text_file:
            text_file.write(song.content)

# requests does not support file:// URLs, so this GET fails as written;
# the booklet would need to be read from local disk instead (sketch below)
url_source = requests.get('file:///home/lnielsen/Downloads/Hamilton%20(Original%20Broadway%20Cast%20Recording)%20-%20Act%20I%20Booklet%20-%20FINAL.pdf')
# save the raw response so it can be inspected later
with open('url_page.txt', 'wb') as text_file:
    text_file.write(url_source.content)

# parse the content with BeautifulSoup; note the target is a PDF, not HTML,
# so lxml will not find meaningful <a> tags in the raw bytes
soup = BeautifulSoup(url_source.content, 'lxml')
print(soup)
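
# A minimal sketch of reading the booklet without requests, assuming it has
# first been converted to HTML at this hypothetical path:
# with open('/home/lnielsen/Downloads/booklet.html', 'rb') as f:
#     soup = BeautifulSoup(f.read(), 'lxml')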

# collect every link in the document, then keep only the Genius song links
some_urls = find_links(soup)
useful_url = 'http://genius.com/Lin-manuel-miranda-'

list_of_links = cull_links(useful_url, some_urls)
base_name = 'hamilton'

# build one local filename per song
names_list = file_names(list_of_links, base_name)

# download every song page into its own file
save_files(list_of_links, names_list)
96 changes: 96 additions & 0 deletions parsing_text_files.py
@@ -0,0 +1,96 @@
"""this contains the code and functions to parse the lyrics from the text files
"""

# let's import some useful things
from bs4 import BeautifulSoup

# here is the functions i will use


def make_a_soup(filename):
    """Open the file and return its entire contents as one string.
    (Despite the name, this returns raw text, not a BeautifulSoup object.)"""
    with open(filename) as current_song:
        important = current_song.read()
    return important
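
# Sketch of make_a_soup, assuming hamilton_0.html was downloaded earlier:
# >>> raw_html = make_a_soup('hamilton_0.html')
# >>> raw_html[:10]  # first few characters of the raw page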


def find_lyrics(soup):
    """Find the section of the HTML string that contains the lyrics,
    split it on the <br /> tags, and rejoin it into one string."""
    start = soup.find('</em>') + 5
    # find() on the slice returns an offset relative to start, so add it
    # back to get an absolute index into the full string
    end = start + soup[start:].find('</p>')
    new_soup = soup[start:end]
    list_lyrics = new_soup.split('<br />')
    string_lyrics = ' '.join(list_lyrics)
    return string_lyrics
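
# A tiny doctest-style sketch of find_lyrics (made-up markup in the same
# shape the site uses: lyrics between an </em> and the closing </p>):
# >>> find_lyrics('<em>t</em>BURR: How does a<br />bastard, orphan</p>')
# 'BURR: How does a bastard, orphan'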


def tuples_of_lyrics(string_o_lyrics):
    """Take in a string of lyrics, split it on double spaces (which mark
    every change of the character speaking/singing), then split each chunk
    into a ('CHARACTER', 'Lyrics') tuple. Returns a list of tuples."""
    list_of_strings_of_lyrics = string_o_lyrics.split('  ')
    list_of_tuples = []
    for x in list_of_strings_of_lyrics:
        # skip empty strings left over from runs of blank lines
        if x:
            avocado = tuple(x.split(': '))
            list_of_tuples.append(avocado)
    return list_of_tuples
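
# Sketch of tuples_of_lyrics on a made-up chunk (note the double space
# separating the two speakers):
# >>> tuples_of_lyrics('BURR: Pardon me  HAMILTON: Aaron Burr, sir')
# [('BURR', 'Pardon me'), ('HAMILTON', 'Aaron Burr, sir')]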


def names(number_of_songs, base_name):
    """Build one numbered .txt name per song, e.g. base_name_0.txt."""
    n = number_of_songs
    names_list = []
    for i in range(n):
        names_list.append(base_name + '_' + str(i) + '.txt')
    return names_list


def filenames(number_of_songs, base_name):
    """Build one numbered .html filename per song, e.g. base_name_0.html."""
    n = number_of_songs
    names_list = []
    for i in range(n):
        names_list.append(base_name + '_' + str(i) + '.html')
    return names_list


def assign_names(number_of_songs, base_name_name, base_name_file):
    """Pair each output name with the lyrics parsed from its HTML file.
    Returns a dict mapping each name to its list of lyric tuples."""
    names_list = names(number_of_songs, base_name_name)
    filenames_list = filenames(number_of_songs, base_name_file)
    lyrics_by_name = {}
    for i in range(len(names_list)):
        lyrics_by_name[names_list[i]] = tuples_of_lyrics(
            find_lyrics(make_a_soup(filenames_list[i])))
    return lyrics_by_name

names_list = names(46, 'hamilton_lyrics')
filenames_list = filenames(46, 'hamilton')
lyric = []
for i in range(46):
    lyric.append(tuples_of_lyrics(find_lyrics(make_a_soup(filenames_list[i]))))

# spot-check one parsed song
n = 9
print(lyric[n])

lyrics_by_name = assign_names(46, 'hamilton_lyrics', 'hamilton')

Binary file added writeup_and_reflection.pdf
Binary file not shown.