nielsen text mining #62

Open · wants to merge 9 commits into master
48 changes: 48 additions & 0 deletions .gitignore
@@ -0,0 +1,48 @@
api_client_codes.py

hamilton_0.html
hamilton_1.html
hamilton_2.html
hamilton_3.html
hamilton_4.html
hamilton_5.html
hamilton_6.html
hamilton_7.html
hamilton_8.html
hamilton_9.html
hamilton_10.html
hamilton_11.html
hamilton_12.html
hamilton_13.html
hamilton_14.html
hamilton_15.html
hamilton_16.html
hamilton_17.html
hamilton_18.html
hamilton_19.html
hamilton_20.html
hamilton_21.html
hamilton_22.html
hamilton_23.html
hamilton_24.html
hamilton_25.html
hamilton_26.html
hamilton_27.html
hamilton_28.html
hamilton_29.html
hamilton_30.html
hamilton_31.html
hamilton_32.html
hamilton_33.html
hamilton_34.html
hamilton_35.html
hamilton_36.html
hamilton_37.html
hamilton_38.html
hamilton_39.html
hamilton_40.html
hamilton_41.html
hamilton_42.html
hamilton_43.html
hamilton_44.html
hamilton_45.html
70 changes: 70 additions & 0 deletions import_hamilton.py
@@ -0,0 +1,70 @@
"""this is the file i'll use to import/download all of the hamilton lyrics
into text files"""

# packages are useful
from bs4 import BeautifulSoup
import requests
from lxml import html


def find_links(soup):
    """Take a parsed BeautifulSoup document and return a list of the
    absolute URLs of all links within it."""
    list_o_links = []
    # iterate through every <a> tag; skip anchors without an href attribute
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            list_o_links.append('http://www.themusicallyrics.com/' + href)
    return list_o_links
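
# A minimal doctest-style sketch of find_links on made-up markup
# (illustrative, not taken from the real site):
# >>> demo_soup = BeautifulSoup('<a href="h/song.html">Song</a>', 'lxml')
# >>> find_links(demo_soup)
# ['http://www.themusicallyrics.com/h/song.html']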


def cull_links(beginning_url, list_urls):
    """Take a URL fragment and a list of URLs, and return only the URLs
    that contain that fragment."""
    new_list_urls = []
    # keep each URL that contains the desired fragment
    for x in list_urls:
        if beginning_url in x:
            new_list_urls.append(x)
    return new_list_urls
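
# A quick sketch of cull_links with made-up URLs:
# >>> cull_links('/lyrics/', ['/lyrics/a.html', '/about.html'])
# ['/lyrics/a.html']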


def file_names(list_of_links, base_name):
    """Build one numbered HTML filename per link, e.g. base_name_0.html."""
    n = len(list_of_links)
    names_list = []
    for i in range(n):
        names_list.append(base_name + '_' + str(i) + '.html')
    return names_list
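
# Sketch of file_names output (placeholder links; only the count matters):
# >>> file_names(['u0', 'u1'], 'demo')
# ['demo_0.html', 'demo_1.html']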


def save_files(list_of_links, list_of_names):
    """Fetch each link and write the raw response bytes to its matching file."""
    for i in range(len(list_of_links)):
        song = requests.get(list_of_links[i])
        # write in binary mode, since response.content is bytes
        with open(list_of_names[i], 'wb') as text_file:
            text_file.write(song.content)
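
# The same loop written with zip, pairing each link with its filename
# (an equivalent sketch, not part of the original file):
# for link, name in zip(list_of_links, list_of_names):
#     with open(name, 'wb') as f:
#         f.write(requests.get(link).content)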

# fetch the HTML of the index page that links to every song
url_source = requests.get('http://www.themusicallyrics.com/h/351-hamilton-the-musical-lyrics.html')
# save the raw page so it can be inspected later
with open('url_page.txt', 'wb') as text_file:
    text_file.write(url_source.content)

# parse the page with BeautifulSoup so the links can be extracted
soup = BeautifulSoup(url_source.content, 'lxml')

# collect every link on the page, then keep only the song-lyric links
some_urls = find_links(soup)
useful_url = '/351-hamilton-the-musical-lyrics/'

list_of_links = cull_links(useful_url, some_urls)
base_name = 'hamilton'

# build one local filename per song
names_list = file_names(list_of_links, base_name)

# download every song page into its own file
save_files(list_of_links, names_list)
72 changes: 72 additions & 0 deletions import_hamilton2.py
@@ -0,0 +1,72 @@
"""this is the file i'll use to import/download all of the hamilton lyrics
into text files"""

# packages are useful
from bs4 import BeautifulSoup
import requests
from lxml import html


def find_links(soup):
    """Take a parsed BeautifulSoup document and return a list of the raw
    href values of all links within it."""
    list_o_links = []
    # iterate through every <a> tag and collect its href (which may be None)
    for link in soup.find_all('a'):
        list_o_links.append(link.get('href'))
    return list_o_links


def cull_links(beginning_url, list_urls):
    """Take a URL fragment and a list of URLs, and return only the URLs
    that contain that fragment, skipping None entries."""
    new_list_urls = []
    for x in list_urls:
        # skip None hrefs, then test whether the URL contains the fragment
        if x and beginning_url in x:
            new_list_urls.append(x)
    return new_list_urls


def file_names(list_of_links, base_name):
    n = len(list_of_links)
    names_list = []
    for i in range(n):
        names_list.append(base_name + '_' + str(i) + '.html')
    return names_list


def save_files(list_of_links, list_of_names):
    """Fetch each link and write the raw response bytes to its matching file."""
    for i in range(len(list_of_links)):
        song = requests.get(list_of_links[i])
        # binary mode, since response.content is bytes
        with open(list_of_names[i], 'wb') as text_file:
            text_file.write(song.content)

# requests does not support file:// URLs, so this GET fails as written;
# the booklet would need to be read from local disk instead (sketch below)
url_source = requests.get('file:///home/lnielsen/Downloads/Hamilton%20(Original%20Broadway%20Cast%20Recording)%20-%20Act%20I%20Booklet%20-%20FINAL.pdf')
# save the raw response so it can be inspected later
with open('url_page.txt', 'wb') as text_file:
    text_file.write(url_source.content)

# parse the content with BeautifulSoup; note the target is a PDF, not HTML,
# so lxml will not find meaningful <a> tags in the raw bytes
soup = BeautifulSoup(url_source.content, 'lxml')
print(soup)
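
# A minimal sketch of reading the booklet without requests, assuming it has
# first been converted to HTML at this hypothetical path:
# with open('/home/lnielsen/Downloads/booklet.html', 'rb') as f:
#     soup = BeautifulSoup(f.read(), 'lxml')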

# collect every link in the document, then keep only the Genius song links
some_urls = find_links(soup)
useful_url = 'http://genius.com/Lin-manuel-miranda-'

list_of_links = cull_links(useful_url, some_urls)
base_name = 'hamilton'

# build one local filename per song
names_list = file_names(list_of_links, base_name)

# download every song page into its own file
save_files(list_of_links, names_list)
96 changes: 96 additions & 0 deletions parsing_text_files.py
@@ -0,0 +1,96 @@
"""this contains the code and functions to parse the lyrics from the text files
"""

# let's import some useful things
from bs4 import BeautifulSoup

# here is the functions i will use


def make_a_soup(filename):
    """Open the file and return its entire contents as one string.
    (Despite the name, this returns raw text, not a BeautifulSoup object.)"""
    with open(filename) as current_song:
        important = current_song.read()
    return important
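
# Sketch of make_a_soup, assuming hamilton_0.html was downloaded earlier:
# >>> raw_html = make_a_soup('hamilton_0.html')
# >>> raw_html[:10]  # first few characters of the raw page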


def find_lyrics(soup):
    """Find the section of the HTML string that contains the lyrics,
    split it on the <br /> tags, and rejoin it into one string."""
    start = soup.find('</em>') + 5
    # find() on the slice returns an offset relative to start, so add it
    # back to get an absolute index into the full string
    end = start + soup[start:].find('</p>')
    new_soup = soup[start:end]
    list_lyrics = new_soup.split('<br />')
    string_lyrics = ' '.join(list_lyrics)
    return string_lyrics
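
# A tiny doctest-style sketch of find_lyrics (made-up markup in the same
# shape the site uses: lyrics between an </em> and the closing </p>):
# >>> find_lyrics('<em>t</em>BURR: How does a<br />bastard, orphan</p>')
# 'BURR: How does a bastard, orphan'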


def tuples_of_lyrics(string_o_lyrics):
    """Take in a string of lyrics, split it on double spaces (which mark
    every change of the character speaking/singing), then split each chunk
    into a ('CHARACTER', 'Lyrics') tuple. Returns a list of tuples."""
    list_of_strings_of_lyrics = string_o_lyrics.split('  ')
    list_of_tuples = []
    for x in list_of_strings_of_lyrics:
        # skip empty strings left over from runs of blank lines
        if x:
            avocado = tuple(x.split(': '))
            list_of_tuples.append(avocado)
    return list_of_tuples
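
# Sketch of tuples_of_lyrics on a made-up chunk (note the double space
# separating the two speakers):
# >>> tuples_of_lyrics('BURR: Pardon me  HAMILTON: Aaron Burr, sir')
# [('BURR', 'Pardon me'), ('HAMILTON', 'Aaron Burr, sir')]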


def names(number_of_songs, base_name):
    """Build one numbered .txt name per song, e.g. base_name_0.txt."""
    n = number_of_songs
    names_list = []
    for i in range(n):
        names_list.append(base_name + '_' + str(i) + '.txt')
    return names_list


def filenames(number_of_songs, base_name):
    """Build one numbered .html filename per song, e.g. base_name_0.html."""
    n = number_of_songs
    names_list = []
    for i in range(n):
        names_list.append(base_name + '_' + str(i) + '.html')
    return names_list


def assign_names(number_of_songs, base_name_name, base_name_file):
    """Pair each output name with the lyrics parsed from its HTML file.
    Returns a dict mapping each name to its list of lyric tuples."""
    names_list = names(number_of_songs, base_name_name)
    filenames_list = filenames(number_of_songs, base_name_file)
    lyrics_by_name = {}
    for i in range(len(names_list)):
        lyrics_by_name[names_list[i]] = tuples_of_lyrics(
            find_lyrics(make_a_soup(filenames_list[i])))
    return lyrics_by_name

names_list = names(46, 'hamilton_lyrics')
filenames_list = filenames(46, 'hamilton')
lyric = []
for i in range(46):
    lyric.append(tuples_of_lyrics(find_lyrics(make_a_soup(filenames_list[i]))))

# spot-check one parsed song
n = 9
print(lyric[n])

lyrics_by_name = assign_names(46, 'hamilton_lyrics', 'hamilton')

Binary file added writeup_and_reflection.pdf
Binary file not shown.