From 5c42e30c6b5789ba53d9209d43e88f61cf72ab02 Mon Sep 17 00:00:00 2001
From: Jasper Landa
Date: Sat, 20 Aug 2022 16:05:36 +0200
Subject: [PATCH 1/3] file open improved. os.path used instead of /. Fixed BS link open warning

---
 downloader.py | 192 ++++++++++++++++++++++++++------------------------
 1 file changed, 98 insertions(+), 94 deletions(-)

diff --git a/downloader.py b/downloader.py
index ef233c5..ac97dc1 100644
--- a/downloader.py
+++ b/downloader.py
@@ -1,106 +1,110 @@
 import os
-import urllib.parse as urlparse
-import urllib.request as urllib2, json
+import urllib.request as urllib2
 from bs4 import BeautifulSoup
 
 BASE_URL = 'https://downloads.khinsider.com'
 
-def validate_url (url):
-    if '//downloads.khinsider.com/game-soundtracks/album/' not in url:
-        return False
-    return True
-
-def fetch_from_url (url):
-    valid = validate_url(url)
-    if not valid:
-        print('[error] Invalid url: ' + url)
-        return
-    print('[info] Url found: ' + url)
-
-    base_dir = 'downloads'
-    url_parts = url.split('/')
-    dir_name = base_dir + '/' + url_parts[len(url_parts) - 1]
-
-    # Create directories
-    if not os.path.exists(base_dir):
-        print('[info] creating directory: ' + base_dir)
-        os.makedirs(base_dir)
-    if not os.path.exists(dir_name):
-        print ('[info] creating directory: ' + dir_name)
-        os.makedirs(dir_name)
-
-    print('[info] crawling for links...')
-
-    soup = BeautifulSoup(urllib2.urlopen(url))
-
-    song_list = soup.find(id="songlist")
-    anchors = song_list.find_all('a')
-
-    # href (string) -> song name (string)
-    songMap = {}
-
-    # Acquire links
-    for anchor in anchors:
-        href = anchor.get('href')
-        if href and 'mp3' in href:
-            href = BASE_URL + href
-            if href not in songMap:
-                songMap[href] = anchor.string
-    if not songMap:
-        print('[error] No links found for the url. Double check that the url is correct and try again.')
-        print('[error] url: ' + url)
-        return
-
-    print('[info] ' + str(len(songMap)) + ' links acquired')
-
-    # Map so we don't download duplicate links on the page
-    downloaded_mp3s = {}
-
-    # http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python
-    # Iterate through links, grab the mp3s, and download them
-    for href, song_name in songMap.items():
-        link_soup = BeautifulSoup(urllib2.urlopen(href))
-        audio = link_soup.find('audio')
-        mp3_url = audio.get('src')
-        if mp3_url not in downloaded_mp3s:
-            downloaded_mp3s[mp3_url] = True
-            parts = mp3_url.split('/')
-            file_name = song_name + '.mp3'
-
-            mp3file = urllib2.urlopen(mp3_url)
-
-            # get file size
-            meta = mp3file.info()
-            file_size = float(meta.get("Content-Length")) / 1000000
-
-            file_on_disk_path = dir_name + '/' + file_name
-            # check if file already exists
-            file_already_downloaded = False
-            if os.path.exists(file_on_disk_path):
-                stat = os.stat(file_on_disk_path)
-                file_already_downloaded = round(float(stat.st_size) / 1000000, 2) == round(file_size, 2)
-
-            # It exists but isn't already the same size
-            if not file_already_downloaded:
-                print('[downloading] ' + file_name + ' [%.2f' % file_size + 'MB]')
-
-                with open(file_on_disk_path,'wb') as output:
-                    output.write(mp3file.read())
-                print('[done] "' + file_name + '"')
-            else:
-                print('[skipping] "' + file_name + '"" already downloaded.')
+
+def validate_url(url):
+    if '//downloads.khinsider.com/game-soundtracks/album/' not in url:
+        return False
+    return True
+
+
+def fetch_from_url(url):
+    valid = validate_url(url)
+    if not valid:
+        print('[error] Invalid url: ' + url)
+        return
+    print('[info] Url found: ' + url)
+
+    base_dir = os.path.join(os.getcwd(), 'downloads')
+    url_parts = url.split('/')
+    dir_name = os.path.join(os.getcwd(), base_dir, url_parts[-1])
+
+    # Create directories
+    if not os.path.exists(base_dir):
+        print('[info] creating directory: ' + base_dir)
+        os.makedirs(base_dir)
+    if not os.path.exists(dir_name):
+        print('[info] creating directory: ' + dir_name)
+        os.makedirs(dir_name)
+
+    print('[info] crawling for links...')
+
+    soup = BeautifulSoup(urllib2.urlopen(url))
+
+    song_list = soup.find(id="songlist")
+    anchors = song_list.find_all('a')
+
+    # href (string) -> song name (string)
+    song_map = {}
+
+    # Acquire links
+    for anchor in anchors:
+        href = anchor.get('href')
+        if href and 'mp3' in href:
+            href = BASE_URL + href
+            if href not in song_map:
+                song_map[href] = anchor.string
+    if not song_map:
+        print('[error] No links found for the url. Double check that the url is correct and try again.')
+        print('[error] url: ' + url)
+        return
+
+    print('[info] ' + str(len(song_map)) + ' links acquired')
+
+    # Map so we don't download duplicate links on the page
+    downloaded_mp3s = {}
+
+    # http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python
+    # Iterate through links, grab the mp3s, and download them
+    for href, song_name in song_map.items():
+        link_soup = BeautifulSoup(urllib2.urlopen(href), features="html.parser")
+        audio = link_soup.find('audio')
+        mp3_url = audio.get('src')
+        if mp3_url not in downloaded_mp3s:
+            downloaded_mp3s[mp3_url] = True
+            parts = mp3_url.split('/')
+            file_name = song_name + '.mp3'
+
+            mp3file = urllib2.urlopen(mp3_url)
+
+            # get file size
+            meta = mp3file.info()
+            file_size = float(meta.get("Content-Length")) / 1000000
+
+            file_on_disk_path = dir_name + '/' + file_name
+            # check if file already exists
+            file_already_downloaded = False
+            if os.path.exists(file_on_disk_path):
+                stat = os.stat(file_on_disk_path)
+                file_already_downloaded = round(float(stat.st_size) / 1000000, 2) == round(file_size, 2)
+
+            # It exists but isn't already the same size
+            if not file_already_downloaded:
+                print('[downloading] ' + file_name + ' [%.2f' % file_size + 'MB]')
+
+                with open(file_on_disk_path, 'wb') as output:
+                    output.write(mp3file.read())
+                print('[done] "' + file_name + '"')
+            else:
+                print('[skipping] "' + file_name + '"" already downloaded.')
+
 
 input_file_name = 'inputs.txt'
 if os.path.exists(input_file_name):
-    print('[info] Input file found. Parsing for links...')
-    file = open(input_file_name, 'r')
-    for line in file:
-        fetch_from_url(line)
+    print('[info] Input file found. Parsing for links...')
+    with open(input_file_name, 'r') as f:
+        lines = (x.strip() for x in f.readlines())
+        for line in lines:
+            fetch_from_url(line)
 else:
-    print('Please input link in quotes to album on khinsider.')
-    print('Example input (including quotes): \'http://downloads.khinsider.com/game-soundtracks/album/disgaea-4-a-promise-unforgotten-soundtrack\'')
-    url = input('Url: ')
-    fetch_from_url(url)
+    print('Please input link in quotes to album on khinsider.')
+    print(
+        'Example input (including quotes): \'http://downloads.khinsider.com/game-soundtracks/album/disgaea-4-a-promise-unforgotten-soundtrack\'')
+    url = input('Url: ')
+    fetch_from_url(url)
 
 # For testing
 # url = 'http://downloads.khinsider.com/game-soundtracks/album/disgaea-4-a-promise-unforgotten-soundtrack'

From 60ce59a4d4aff7bbf5751ea7cd85776d0a2ba426 Mon Sep 17 00:00:00 2001
From: Jasper Landa
Date: Sat, 20 Aug 2022 16:19:12 +0200
Subject: [PATCH 2/3] more improvements (url validation and f-string usage)

---
 downloader.py | 35 ++++++++++++-----------------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/downloader.py b/downloader.py
index ac97dc1..0ac81f8 100644
--- a/downloader.py
+++ b/downloader.py
@@ -5,40 +5,30 @@
 BASE_URL = 'https://downloads.khinsider.com'
 
 
-def validate_url(url):
-    if '//downloads.khinsider.com/game-soundtracks/album/' not in url:
-        return False
-    return True
-
-
 def fetch_from_url(url):
-    valid = validate_url(url)
-    if not valid:
-        print('[error] Invalid url: ' + url)
+    if not url.startswith(f'{BASE_URL}/game-soundtracks/album/'):
+        print(f'[error] Invalid url: {url}')
         return
-    print('[info] Url found: ' + url)
+    print(f'[info] Url found: {url}')
 
     base_dir = os.path.join(os.getcwd(), 'downloads')
     url_parts = url.split('/')
-    dir_name = os.path.join(os.getcwd(), base_dir, url_parts[-1])
+    dir_name = os.path.join(base_dir, url_parts[-1])
 
     # Create directories
-    if not os.path.exists(base_dir):
-        print('[info] creating directory: ' + base_dir)
-        os.makedirs(base_dir)
     if not os.path.exists(dir_name):
         print('[info] creating directory: ' + dir_name)
         os.makedirs(dir_name)
 
     print('[info] crawling for links...')
 
-    soup = BeautifulSoup(urllib2.urlopen(url))
+    soup = BeautifulSoup(urllib2.urlopen(url), features="html.parser")
 
     song_list = soup.find(id="songlist")
     anchors = song_list.find_all('a')
 
     # href (string) -> song name (string)
-    song_map = {}
+    song_map = dict()
 
     # Acquire links
     for anchor in anchors:
@@ -49,10 +39,10 @@ def fetch_from_url(url):
             song_map[href] = anchor.string
     if not song_map:
         print('[error] No links found for the url. Double check that the url is correct and try again.')
-        print('[error] url: ' + url)
+        print(f'[error] url: {url}')
         return
 
-    print('[info] ' + str(len(song_map)) + ' links acquired')
+    print(f'[info] {len(song_map)} links acquired')
 
     # Map so we don't download duplicate links on the page
     downloaded_mp3s = {}
@@ -83,13 +73,13 @@ def fetch_from_url(url):
 
             # It exists but isn't already the same size
             if not file_already_downloaded:
-                print('[downloading] ' + file_name + ' [%.2f' % file_size + 'MB]')
+                print(f'[downloading] {file_name} [{file_size:.2f} MB]')
 
                 with open(file_on_disk_path, 'wb') as output:
                     output.write(mp3file.read())
-                print('[done] "' + file_name + '"')
+                print(f'[done] "{file_name}"')
             else:
-                print('[skipping] "' + file_name + '"" already downloaded.')
+                print(f'[skipping] "{file_name}" (already downloaded).')
 
 
 input_file_name = 'inputs.txt'
@@ -101,8 +91,7 @@ def fetch_from_url(url):
         fetch_from_url(line)
 else:
     print('Please input link in quotes to album on khinsider.')
-    print(
-        'Example input (including quotes): \'http://downloads.khinsider.com/game-soundtracks/album/disgaea-4-a-promise-unforgotten-soundtrack\'')
+    print('Example input (including quotes): \'http://downloads.khinsider.com/game-soundtracks/album/disgaea-4-a-promise-unforgotten-soundtrack\'')
     url = input('Url: ')
     fetch_from_url(url)
 

From 05cfdd6f2019b50ff455527705f6e9c60e035f5b Mon Sep 17 00:00:00 2001
From: Jasper Landa
Date: Sat, 20 Aug 2022 16:22:45 +0200
Subject: [PATCH 3/3] more f-strings. Deleted first dir check, since second will create intermediate dirs

---
 downloader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/downloader.py b/downloader.py
index 0ac81f8..0848c24 100644
--- a/downloader.py
+++ b/downloader.py
@@ -17,7 +17,7 @@ def fetch_from_url(url):
 
     # Create directories
     if not os.path.exists(dir_name):
-        print('[info] creating directory: ' + dir_name)
+        print(f'[info] creating directory: {dir_name}')
         os.makedirs(dir_name)
 
     print('[info] crawling for links...')
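
Notes on the two behaviours the commit subjects lean on (minimal illustrative sketches, separate from the patches; the paths and markup below are made up for demonstration):

The "BS link open warning" fixed in PATCH 1/3 is the warning bs4 emits when a soup is constructed without naming a parser (GuessedAtParserWarning in recent versions). Passing features="html.parser", as the diffs do, pins the parser:

    from bs4 import BeautifulSoup

    html = '<html><body><a href="/song.mp3">Song</a></body></html>'
    # Naming the parser explicitly suppresses bs4's "no parser was
    # explicitly specified" warning and keeps parsing consistent across
    # machines with different optional parsers (lxml, html5lib) installed.
    soup = BeautifulSoup(html, features="html.parser")
    print(soup.find('a').get('href'))  # -> /song.mp3

PATCH 3/3's subject notes that the first directory check is redundant because os.makedirs() creates every missing intermediate directory in one call:

    import os
    import tempfile

    root = tempfile.mkdtemp()  # throwaway base directory
    album_dir = os.path.join(root, 'downloads', 'some-album')
    # makedirs() creates 'downloads' and 'some-album' together, so a
    # separate exists-check and makedirs for the parent is unnecessary.
    os.makedirs(album_dir)
    print(os.path.isdir(album_dir))  # -> True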