From 5c42e30c6b5789ba53d9209d43e88f61cf72ab02 Mon Sep 17 00:00:00 2001
From: Jasper Landa
Date: Sat, 20 Aug 2022 16:05:36 +0200
Subject: [PATCH 1/3] file open improved. os.path used instead of /. Fixed BS link open warning

---
 downloader.py | 192 ++++++++++++++++++++++++++------------------------
 1 file changed, 98 insertions(+), 94 deletions(-)

diff --git a/downloader.py b/downloader.py
index ef233c5..ac97dc1 100644
--- a/downloader.py
+++ b/downloader.py
@@ -1,106 +1,110 @@
 import os
-import urllib.parse as urlparse
-import urllib.request as urllib2, json
+import urllib.request as urllib2
 from bs4 import BeautifulSoup
 
 BASE_URL = 'https://downloads.khinsider.com'
 
-def validate_url (url):
-    if '//downloads.khinsider.com/game-soundtracks/album/' not in url:
-        return False
-    return True
-
-def fetch_from_url (url):
-    valid = validate_url(url)
-    if not valid:
-        print('[error] Invalid url: ' + url)
-        return
-    print('[info] Url found: ' + url)
-
-    base_dir = 'downloads'
-    url_parts = url.split('/')
-    dir_name = base_dir + '/' + url_parts[len(url_parts) - 1]
-
-    # Create directories
-    if not os.path.exists(base_dir):
-        print('[info] creating directory: ' + base_dir)
-        os.makedirs(base_dir)
-    if not os.path.exists(dir_name):
-        print ('[info] creating directory: ' + dir_name)
-        os.makedirs(dir_name)
-
-    print('[info] crawling for links...')
-
-    soup = BeautifulSoup(urllib2.urlopen(url))
-
-    song_list = soup.find(id="songlist")
-    anchors = song_list.find_all('a')
-
-    # href (string) -> song name (string)
-    songMap = {}
-
-    # Acquire links
-    for anchor in anchors:
-        href = anchor.get('href')
-        if href and 'mp3' in href:
-            href = BASE_URL + href
-            if href not in songMap:
-                songMap[href] = anchor.string
-    if not songMap:
-        print('[error] No links found for the url. Double check that the url is correct and try again.')
-        print('[error] url: ' + url)
-        return
-
-    print('[info] ' + str(len(songMap)) + ' links acquired')
-
-    # Map so we don't download duplicate links on the page
-    downloaded_mp3s = {}
-
-    # http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python
-    # Iterate through links, grab the mp3s, and download them
-    for href, song_name in songMap.items():
-        link_soup = BeautifulSoup(urllib2.urlopen(href))
-        audio = link_soup.find('audio')
-        mp3_url = audio.get('src')
-        if mp3_url not in downloaded_mp3s:
-            downloaded_mp3s[mp3_url] = True
-            parts = mp3_url.split('/')
-            file_name = song_name + '.mp3'
-
-            mp3file = urllib2.urlopen(mp3_url)
-
-            # get file size
-            meta = mp3file.info()
-            file_size = float(meta.get("Content-Length")) / 1000000
-
-            file_on_disk_path = dir_name + '/' + file_name
-            # check if file already exists
-            file_already_downloaded = False
-            if os.path.exists(file_on_disk_path):
-                stat = os.stat(file_on_disk_path)
-                file_already_downloaded = round(float(stat.st_size) / 1000000, 2) == round(file_size, 2)
-
-            # It exists but isn't already the same size
-            if not file_already_downloaded:
-                print('[downloading] ' + file_name + ' [%.2f' % file_size + 'MB]')
-
-                with open(file_on_disk_path,'wb') as output:
-                    output.write(mp3file.read())
-                print('[done] "' + file_name + '"')
-            else:
-                print('[skipping] "' + file_name + '"" already downloaded.')
+
+def validate_url(url):
+    if '//downloads.khinsider.com/game-soundtracks/album/' not in url:
+        return False
+    return True
+
+
+def fetch_from_url(url):
+    valid = validate_url(url)
+    if not valid:
+        print('[error] Invalid url: ' + url)
+        return
+    print('[info] Url found: ' + url)
+
+    base_dir = os.path.join(os.getcwd(), 'downloads')
+    url_parts = url.split('/')
+    dir_name = os.path.join(os.getcwd(), base_dir, url_parts[-1])
+
+    # Create directories
+    if not os.path.exists(base_dir):
+        print('[info] creating directory: ' + base_dir)
+        os.makedirs(base_dir)
+    if not os.path.exists(dir_name):
+        print('[info] creating directory: ' + dir_name)
+        os.makedirs(dir_name)
+
+    print('[info] crawling for links...')
+
+    soup = BeautifulSoup(urllib2.urlopen(url))
+
+    song_list = soup.find(id="songlist")
+    anchors = song_list.find_all('a')
+
+    # href (string) -> song name (string)
+    song_map = {}
+
+    # Acquire links
+    for anchor in anchors:
+        href = anchor.get('href')
+        if href and 'mp3' in href:
+            href = BASE_URL + href
+            if href not in song_map:
+                song_map[href] = anchor.string
+    if not song_map:
+        print('[error] No links found for the url. Double check that the url is correct and try again.')
+        print('[error] url: ' + url)
+        return
+
+    print('[info] ' + str(len(song_map)) + ' links acquired')
+
+    # Map so we don't download duplicate links on the page
+    downloaded_mp3s = {}
+
+    # http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python
+    # Iterate through links, grab the mp3s, and download them
+    for href, song_name in song_map.items():
+        link_soup = BeautifulSoup(urllib2.urlopen(href), features="html.parser")
+        audio = link_soup.find('audio')
+        mp3_url = audio.get('src')
+        if mp3_url not in downloaded_mp3s:
+            downloaded_mp3s[mp3_url] = True
+            parts = mp3_url.split('/')
+            file_name = song_name + '.mp3'
+
+            mp3file = urllib2.urlopen(mp3_url)
+
+            # get file size
+            meta = mp3file.info()
+            file_size = float(meta.get("Content-Length")) / 1000000
+
+            file_on_disk_path = dir_name + '/' + file_name
+            # check if file already exists
+            file_already_downloaded = False
+            if os.path.exists(file_on_disk_path):
+                stat = os.stat(file_on_disk_path)
+                file_already_downloaded = round(float(stat.st_size) / 1000000, 2) == round(file_size, 2)
+
+            # It exists but isn't already the same size
+            if not file_already_downloaded:
+                print('[downloading] ' + file_name + ' [%.2f' % file_size + 'MB]')
+
+                with open(file_on_disk_path, 'wb') as output:
+                    output.write(mp3file.read())
+                print('[done] "' + file_name + '"')
+            else:
+                print('[skipping] "' + file_name + '"" already downloaded.')
+
 
 input_file_name = 'inputs.txt'
 if os.path.exists(input_file_name):
-    print('[info] Input file found. Parsing for links...')
-    file = open(input_file_name, 'r')
-    for line in file:
-        fetch_from_url(line)
+    print('[info] Input file found. Parsing for links...')
+    with open(input_file_name, 'r') as f:
+        lines = (x.strip() for x in f.readlines())
+        for line in lines:
+            fetch_from_url(line)
 else:
-    print('Please input link in quotes to album on khinsider.')
-    print('Example input (including quotes): \'http://downloads.khinsider.com/game-soundtracks/album/disgaea-4-a-promise-unforgotten-soundtrack\'')
-    url = input('Url: ')
-    fetch_from_url(url)
+    print('Please input link in quotes to album on khinsider.')
+    print(
+        'Example input (including quotes): \'http://downloads.khinsider.com/game-soundtracks/album/disgaea-4-a-promise-unforgotten-soundtrack\'')
+    url = input('Url: ')
+    fetch_from_url(url)
 
 # For testing
 # url = 'http://downloads.khinsider.com/game-soundtracks/album/disgaea-4-a-promise-unforgotten-soundtrack'

From 60ce59a4d4aff7bbf5751ea7cd85776d0a2ba426 Mon Sep 17 00:00:00 2001
From: Jasper Landa
Date: Sat, 20 Aug 2022 16:19:12 +0200
Subject: [PATCH 2/3] more improvements (url validation and f-string usage)

---
 downloader.py | 35 ++++++++++++-----------------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/downloader.py b/downloader.py
index ac97dc1..0ac81f8 100644
--- a/downloader.py
+++ b/downloader.py
@@ -5,40 +5,30 @@
 BASE_URL = 'https://downloads.khinsider.com'
 
 
-def validate_url(url):
-    if '//downloads.khinsider.com/game-soundtracks/album/' not in url:
-        return False
-    return True
-
-
 def fetch_from_url(url):
-    valid = validate_url(url)
-    if not valid:
-        print('[error] Invalid url: ' + url)
+    if not url.startswith(f'{BASE_URL}/game-soundtracks/album/'):
+        print(f'[error] Invalid url: {url}')
         return
-    print('[info] Url found: ' + url)
+    print(f'[info] Url found: {url}')
 
     base_dir = os.path.join(os.getcwd(), 'downloads')
     url_parts = url.split('/')
-    dir_name = os.path.join(os.getcwd(), base_dir, url_parts[-1])
+    dir_name = os.path.join(base_dir, url_parts[-1])
 
     # Create directories
-    if not os.path.exists(base_dir):
-        print('[info] creating directory: ' + base_dir)
-        os.makedirs(base_dir)
     if not os.path.exists(dir_name):
         print('[info] creating directory: ' + dir_name)
         os.makedirs(dir_name)
 
     print('[info] crawling for links...')
 
-    soup = BeautifulSoup(urllib2.urlopen(url))
+    soup = BeautifulSoup(urllib2.urlopen(url), features="html.parser")
 
     song_list = soup.find(id="songlist")
     anchors = song_list.find_all('a')
 
     # href (string) -> song name (string)
-    song_map = {}
+    song_map = dict()
 
     # Acquire links
     for anchor in anchors:
@@ -49,10 +39,10 @@ def fetch_from_url(url):
             song_map[href] = anchor.string
     if not song_map:
         print('[error] No links found for the url. Double check that the url is correct and try again.')
-        print('[error] url: ' + url)
+        print(f'[error] url: {url}')
         return
 
-    print('[info] ' + str(len(song_map)) + ' links acquired')
+    print(f'[info] {len(song_map)} links acquired')
 
     # Map so we don't download duplicate links on the page
     downloaded_mp3s = {}
@@ -83,13 +73,13 @@ def fetch_from_url(url):
 
             # It exists but isn't already the same size
             if not file_already_downloaded:
-                print('[downloading] ' + file_name + ' [%.2f' % file_size + 'MB]')
+                print(f'[downloading] {file_name} [{file_size:.2f} MB]')
 
                 with open(file_on_disk_path, 'wb') as output:
                     output.write(mp3file.read())
-                print('[done] "' + file_name + '"')
+                print(f'[done] "{file_name}"')
             else:
-                print('[skipping] "' + file_name + '"" already downloaded.')
+                print(f'[skipping] "{file_name}" (already downloaded).')
 
 
 input_file_name = 'inputs.txt'
@@ -101,8 +91,7 @@ def fetch_from_url(url):
         fetch_from_url(line)
 else:
     print('Please input link in quotes to album on khinsider.')
-    print(
-        'Example input (including quotes): \'http://downloads.khinsider.com/game-soundtracks/album/disgaea-4-a-promise-unforgotten-soundtrack\'')
+    print('Example input (including quotes): \'http://downloads.khinsider.com/game-soundtracks/album/disgaea-4-a-promise-unforgotten-soundtrack\'')
     url = input('Url: ')
     fetch_from_url(url)
 

From 05cfdd6f2019b50ff455527705f6e9c60e035f5b Mon Sep 17 00:00:00 2001
From: Jasper Landa
Date: Sat, 20 Aug 2022 16:22:45 +0200
Subject: [PATCH 3/3] more f-strings. Deleted first dir check, since second will create intermediate dirs

---
 downloader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/downloader.py b/downloader.py
index 0ac81f8..0848c24 100644
--- a/downloader.py
+++ b/downloader.py
@@ -17,7 +17,7 @@ def fetch_from_url(url):
 
     # Create directories
     if not os.path.exists(dir_name):
-        print('[info] creating directory: ' + dir_name)
+        print(f'[info] creating directory: {dir_name}')
         os.makedirs(dir_name)
 
     print('[info] crawling for links...')
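
Notes on the two behaviours the commit subjects lean on (minimal illustrative sketches, separate from the patches; the paths and markup below are made up for demonstration):

The "BS link open warning" fixed in PATCH 1/3 is the warning bs4 emits when a soup is constructed without naming a parser (GuessedAtParserWarning in recent versions). Passing features="html.parser", as the diffs do, pins the parser:

    from bs4 import BeautifulSoup

    html = '<html><body><a href="/song.mp3">Song</a></body></html>'
    # Naming the parser explicitly suppresses bs4's "no parser was
    # explicitly specified" warning and keeps parsing consistent across
    # machines with different optional parsers (lxml, html5lib) installed.
    soup = BeautifulSoup(html, features="html.parser")
    print(soup.find('a').get('href'))  # -> /song.mp3

PATCH 3/3's subject notes that the first directory check is redundant because os.makedirs() creates every missing intermediate directory in one call:

    import os
    import tempfile

    root = tempfile.mkdtemp()  # throwaway base directory
    album_dir = os.path.join(root, 'downloads', 'some-album')
    # makedirs() creates 'downloads' and 'some-album' together, so a
    # separate exists-check and makedirs for the parent is unnecessary.
    os.makedirs(album_dir)
    print(os.path.isdir(album_dir))  # -> True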