def get_bio(hrefs):
    '''
    Download every speaker page and collect its bio entries
    :param hrefs (list): list of hrefs (speaker page URLs)
    :return (DataFrame): one row per href, one column per <li> found inside
        the page's 'bio' <ul>, in page order
    '''
    bio_rows = []
    for html in get_htmls(hrefs):
        bio_list = BeautifulSoup(html, 'html.parser').find('ul', 'bio')
        bio_rows.append(parse_bio([li.text for li in bio_list.find_all('li')]))
    return pd.DataFrame(bio_rows)


def parse_bio(row):
    '''
    Return the bio entries of one speaker as a list, unchanged
    :param row (list): raw bio strings such as 'native language: mandarin (cmn)'
    :return (list): the same strings, in the same order

    NOTE: this function used to try stripping the "label:" prefix with a
    regex, but a misplaced parenthesis made it call .group() on a tuple and a
    bare except swallowed the resulting AttributeError, so every entry always
    came back untouched. create_dataframe() splits the raw "label: value"
    strings itself and relies on that, so the pass-through is kept and made
    explicit instead of being reached through a swallowed exception.
    '''
    return list(row)


# Target column names, in the order the bio entries are read from the bio
# rows by position in _add_bio_columns().
_BIO_COLUMNS = (
    'birth_place',
    'native_language',
    'other_languages',
    'age_sex',
    'age_of_english_onset',
    'english_learning_method',
    'english_residence',
    'length_of_english_residence',
)


def _add_bio_columns(df, bio_rows):
    '''
    Attach the bio entries to the speaker table and split them into values
    :param df (DataFrame): speaker table, modified in place
    :param bio_rows (DataFrame): raw bio strings, one row per speaker, with
        at least len(_BIO_COLUMNS) columns
    :return (DataFrame): df with the bio columns plus a trailing 'age' column
    '''
    for position, column in enumerate(_BIO_COLUMNS):
        df[column] = bio_rows.iloc[:, position]

    # 'age' must be derived from the raw "age, sex: 26, female" text, so keep
    # a handle on it before 'age_sex' is overwritten below.
    raw_age_sex = df['age_sex']

    # x[:-6] drops the last six characters (" (map)" on the archive pages)
    # before the last two words are kept.
    df['birth_place'] = df['birth_place'].apply(lambda x: x[:-6].split(' ')[-2:])
    df['native_language'] = df['native_language'].apply(lambda x: x.split(' ')[2])
    df['other_languages'] = df['other_languages'].apply(lambda x: x.split(' ')[2:])
    df['age_sex'] = raw_age_sex.apply(lambda x: x.split(' ')[2:])
    df['age_of_english_onset'] = df['age_of_english_onset'].apply(
        lambda x: float(x.split(' ')[-1]))
    df['english_learning_method'] = df['english_learning_method'].apply(
        lambda x: x.split(' ')[-1])
    df['english_residence'] = df['english_residence'].apply(lambda x: x.split(' ')[2:])
    df['length_of_english_residence'] = df['length_of_english_residence'].apply(
        lambda x: float(x.split(' ')[-2]))
    # Added last so it stays the final column, as before.
    df['age'] = raw_age_sex.apply(lambda x: x.replace('sex:', '').split(',')[1])
    return df


def create_dataframe(languages):
    '''
    Scrape the search results and speaker pages for the given languages
    :param languages (list): languages from which you want to get html
    :return df (DataFrame): DataFrame that contains all audio metadata from searched languages
    '''
    persons = []
    for html in get_htmls(build_search_urls(languages)):
        for p in BeautifulSoup(html, 'html.parser').find_all('p'):
            # Only paragraphs that contain a link are handed to parse_p().
            if p.a:
                persons.append(parse_p(p))

    df = pd.DataFrame(persons, columns=['href', 'language_num', 'sex'])
    bio_rows = get_bio(df['href'])

    if DEBUG:
        print('loading finished')

    return _add_bio_columns(df, bio_rows)


def _merge_with_existing(destination_file, new_df):
    '''
    Combine freshly scraped rows with the rows already saved on disk
    :param destination_file (str): path of the CSV written by earlier runs
    :param new_df (DataFrame): freshly scraped rows
    :return (DataFrame): existing rows followed by new ones, with duplicate
        'language_num' values dropped (the first occurrence is kept)
    '''
    try:
        existing = pd.read_csv(destination_file)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First run (or an empty file): nothing to merge with.
        merged = new_df
    else:
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        merged = pd.concat([existing, new_df], ignore_index=True)
    return merged.drop_duplicates(subset='language_num')


if __name__ == '__main__':
    # console command example:
    #   python fromwebsite.py bio_metadata.csv mandarin english arabic
    if len(sys.argv) < 2:
        sys.exit('usage: python fromwebsite.py <destination.csv> [language ...]')

    # Set destination file
    destination_file = sys.argv[1]

    # If no language arguments, use 'mandarin' as default (a slice never
    # raises, so an empty list has to be tested for explicitly)
    languages = sys.argv[2:] or ['mandarin']

    df = _merge_with_existing(destination_file, create_dataframe(languages=languages))
    df.to_csv(destination_file, index=False)
class GetAudio:
    '''
    Downloads the mp3 recordings listed in a metadata file and stores them
    as wav files, grouped into one sub-folder per 'category' value.
    '''

    def __init__(self, csv_filepath, destination_folder= 'audio/', wait= 1.5, debug=False ):
        '''
        Initializes GetAudio class object
        :param csv_filepath (str): Tab-separated metadata file; get_audio()
            reads its 'language_num' and 'category' columns
        :param destination_folder (str): Folder where audio files will be saved
            (resolved relative to the parent of the working directory)
        :param wait (float): Length (in seconds) between web requests
        :param debug (bool): Outputs status indicators to console when True
        '''
        self.csv_filepath = csv_filepath
        self.audio_df = pd.read_csv(csv_filepath, sep='\t')
        self.url = 'https://accent.gmu.edu/soundtracks/{}.mp3'
        self.destination_folder = destination_folder
        self.wait = wait
        # Previously hard-coded to False, which made the argument a no-op.
        self.debug = debug

    def _output_root(self):
        '''
        Folder the wav files are actually written under
        :return (str): '../<destination_folder>' (an absolute
            destination_folder is used as given)
        '''
        return os.path.join('..', self.destination_folder)

    def _target_path(self, lang_num, category):
        '''
        Location of the wav file for one recording
        :param lang_num (str): value from the 'language_num' column
        :param category (str): value from the 'category' column
        :return (str): <output root>/<category>/<lang_num>.wav
        '''
        return os.path.join(self._output_root(), category, '{}.wav'.format(lang_num))

    def check_path(self):
        '''
        Checks if the output folder exists. If not, it is created.
        The same path is tested and created, so calling this twice is safe.
        '''
        root = self._output_root()
        if not os.path.exists(root):
            if self.debug:
                print('{} does not exist, creating'.format(root))
            os.makedirs(root, exist_ok=True)

    def get_audio(self):
        '''
        Retrieves all audio files from 'language_num' column of self.audio_df
        If audio file already exists, move on to the next
        :return (int): Number of audio files downloaded
        '''
        import time  # local import: only needed for the delay between requests

        self.check_path()

        counter = 0

        for lang_num, category in zip(self.audio_df['language_num'], self.audio_df['category']):
            target = self._target_path(lang_num, category)
            # Test the path that is really written, otherwise every run
            # downloads everything again.
            if os.path.exists(target):
                continue

            if self.debug:
                print('downloading {}'.format(lang_num))

            # Honour the documented delay between consecutive web requests.
            if counter and self.wait:
                time.sleep(self.wait)

            try:
                filename, _headers = urllib.request.urlretrieve(self.url.format(lang_num))
                sound = AudioSegment.from_mp3(filename)
                os.makedirs(os.path.dirname(target), exist_ok=True)
                sound.export(target, format="wav")
            finally:
                # urlretrieve() leaves its download in a temp file; remove it.
                urllib.request.urlcleanup()
            counter += 1

        return counter


if __name__ == '__main__':
    # Example console command:
    #   python getaudio.py audio_metadata.csv
    csv_file = sys.argv[1]
    ga = GetAudio(csv_filepath=csv_file)
    ga.get_audio()