From 2241364aa839e72a226220b64ded486f23322e08 Mon Sep 17 00:00:00 2001
From: Greeley
Date: Sun, 31 Mar 2024 06:22:55 -0400
Subject: [PATCH] Merge missytake (#5)

* #41 Clean up alignment of progress percentage.

* Use a temporary file to prevent cache file corruption when interrupted.

* Add timeouts/retries

Add timeouts so downloads no longer hang for minutes before failing.
Add retries to all downloads (the only way I was able to download my
entire library).

* Suppress "No url found" warning for 0 byte entries

This commit suppresses the "No url found" warning for entries which
contain a "human_size" field set to "0 bytes".

When downloading a product, occasionally an entry in the
"download_struct" won't have a viable URL to download. In every case I
observed, this was harmless; the entry was extra information, like a
link to a web page where you could download the soundtrack for free
("World Of Goo"), or a link to a help page on which APK to choose
("Costume Quest"), or some sort of... Javascript soundtrack player?
("Dustforce").

Also, in every case I observed, the entry in the "download_struct"
contained a "human_size" field set to the exact string "0 bytes". This
seems like a hint in the JSON blob saying: yeah, this entry doesn't
have anything for you to download, you can ignore it.

It's a little worrying to the casual user of the tool when it prints
these error messages, but they appear to be harmless. So let's suppress
these error messages when "human_size" is "0 bytes".

* Content-types argument. Makes it possible to download bittorrent files.

* Fixed changes that got wiped - don't cancel a rebase.

* multi-processing.

- lambda to be pedantic about the type that map expects.
- added exceptions package with InvalidCookieException.py
- added multiprocess package with exorcise_daemons.py
- exorcise_daemons#ExorcistPool creates multiprocessing pools without
  creating daemons
- mapped self._process_order_id to a multiprocess pool of purchase_keys.

* multiprocessing

- Downloading: only the basename is printed now, to avoid so much
  console spam.
- join the pool to finalize it properly.

* removed an unnecessary import from the base __init__.py that I pushed
  accidentally.

* Process-Safe File Writing

- created CacheData class in cache.py
- read cache data in for every write.

* _get_trove_products

- updated trove_base_url

* Caching Update (#4)

- caching with json is pretty crazy, so I've switched it to a csv.
- added _strtobool
- added _strtonone, which will be useful when converting the old json
  cache
- cache object inherits from list.
- file operations moved to file_ops.py
- readability changes... sorry
- changes for consistency across trove and non-trove cache

* Fixed changes that got wiped - don't cancel a rebase.

* Rebased on missytake/main

- multi-processing
- file safety
- no tmp cache anymore - cache is a visible file.
- intellij .idea to .gitignore

* Rebased on missytake/main

- multi-processing
- file safety
- no tmp cache anymore - cache is a visible file.
- intellij .idea to .gitignore

* changed uploaded_at to string again

---------

Co-authored-by: Thomas Aglassinger
Co-authored-by: yakovlevtx <87836552+yakovlevtx@users.noreply.github.com>
Co-authored-by: Robert Xu
Co-authored-by: Larry Hastings
Co-authored-by: Valtteri-Jokinen <48318078+Valtteri-Jokinen@users.noreply.github.com>
Co-authored-by: missytake
---
 .gitignore                                  |   3 +
 humblebundle_downloader/cli.py              |   8 +
 humblebundle_downloader/data/cache.py       |   8 -
 humblebundle_downloader/download_library.py | 350 +++++++++-----------
 humblebundle_downloader/iops/file_ops.py    |   3 +-
 requirements.txt                            |   3 +
 6 files changed, 165 insertions(+), 210 deletions(-)
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
index 15c9ea8..a330d94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,6 @@
 __pycache__
 *cookies.txt
 .tox/
+build/
+dist/
+.idea
diff --git a/humblebundle_downloader/cli.py b/humblebundle_downloader/cli.py
index 1aa270c..03f51d9 100644
--- a/humblebundle_downloader/cli.py
+++ b/humblebundle_downloader/cli.py
@@ -50,6 +50,13 @@ def parse_args(args):
         help=("Only get content in a platform. Values can be seen in your "
               "humble bundle's library dropdown. Ex: -p ebook video"),
     )
+    parser.add_argument(
+        '--content-types',
+        type=str, nargs='*',
+        help="Whether to download the direct content (web) and/or the bittorrent file. Default is direct content only. "
+             "Available values: 'web', 'bittorrent'. "
+             "Ex: --content-types web bittorrent",
+    )
     parser.add_argument(
         '--progress',
         action='store_true',
@@ -92,6 +99,7 @@ def cli():
         purchase_keys=cli_args.keys,
         trove=cli_args.trove,
         update=cli_args.update,
+        content_types=cli_args.content_types
     ).start()
diff --git a/humblebundle_downloader/data/cache.py b/humblebundle_downloader/data/cache.py
index 0488531..e73f87a 100644
--- a/humblebundle_downloader/data/cache.py
+++ b/humblebundle_downloader/data/cache.py
@@ -32,14 +32,6 @@ def _strtonone(val):
     else:
         raise ValueError("value was none 'None' %r" % (val,))
 
-class CacheDataJson:
-    key: str
-    value: dict
-
-    def __init__(self, key: str, value: dict):
-        self.key = key
-        self.value = value
-
 
 class CsvCacheData:
diff --git a/humblebundle_downloader/download_library.py b/humblebundle_downloader/download_library.py
index a7a1199..726020a 100644
--- a/humblebundle_downloader/download_library.py
+++ b/humblebundle_downloader/download_library.py
@@ -3,7 +3,6 @@
 import sys
 import json
 import time
-from typing import Any
 
 import parsel
 import logging
@@ -12,10 +11,13 @@
 import http.cookiejar
 
 from multiprocess.exorcise_daemons import ExorcistPool
 from exceptions.InvalidCookieException import InvalidCookieException
-from cache import CacheData
+from data.cache import CsvCacheData, Cache
+from iops import file_ops
 
-logger = logging.getLogger(__name__)
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 
+logger = logging.getLogger(__name__)
 
 
 def _clean_name(dirty_str):
@@ -28,27 +30,61 @@ def _clean_name(dirty_str):
     return "".join(clean).strip().rstrip('.')
 
 
+DEFAULT_TIMEOUT = 5  # seconds
+
+
+class TimeoutHTTPAdapter(HTTPAdapter):
+    timeout = DEFAULT_TIMEOUT
+
+    def __init__(self, *args, **kwargs):
+        self.timeout = DEFAULT_TIMEOUT
+        if "timeout" in kwargs:
+            self.timeout = kwargs["timeout"]
+            del kwargs["timeout"]
+        super().__init__(*args, **kwargs)
+
+    def send(self, request, **kwargs):
+        timeout = kwargs.get("timeout")
+        if timeout is None:
+            kwargs["timeout"] = self.timeout
+        return super().send(request, **kwargs)
+
+
 class DownloadLibrary:
 
     def __init__(self, library_path, cookie_path=None, cookie_auth=None,
                  progress_bar=False, ext_include=None, ext_exclude=None,
                  platform_include=None, purchase_keys=None, trove=False,
-                 update=False):
-        self.library_path = library_path
+                 update=False, content_types=None):
+
+        self.cache_data = {}  # to remove.
+        file_ops.set_library_path(library_path)
+
         self.progress_bar = progress_bar
-        self.ext_include = [] if ext_include is None else list(map(str.lower, ext_include))  # noqa: E501
-        self.ext_exclude = [] if ext_exclude is None else list(map(str.lower, ext_exclude))  # noqa: E501
+        self.ext_include = [] if ext_include is None else list(map(lambda s: str(s).lower(), ext_include))  # noqa: E501
+        self.ext_exclude = [] if ext_exclude is None else list(map(lambda s: str(s).lower(), ext_exclude))  # noqa: E501
+
+        self.cache_data_csv: Cache = file_ops.load_cache_csv()
+
+        # todo: investigate how platform_include works
         if platform_include is None or 'all' in platform_include:
             # if 'all', then do not need to use this check
-            platform_include = []
+            platform_include = []  # why not make the d
         self.platform_include = list(map(lambda s: str(s).lower(), platform_include))
         self.purchase_keys = purchase_keys
         self.trove = trove
         self.update = update
+        self.content_types = ['web'] if content_types is None else list(map(str.lower, content_types))  # noqa: E501
+
+        retries = Retry(total=3, backoff_factor=1,
+                        status_forcelist=[429, 500, 502, 503, 504])
+        timeout_adapter = TimeoutHTTPAdapter(max_retries=retries)
         self.session = requests.Session()
+        self.session.mount('http://', timeout_adapter)
+        self.session.mount('https://', timeout_adapter)
         if cookie_path:
             try:
                 cookie_jar = http.cookiejar.MozillaCookieJar(cookie_path)
@@ -64,9 +100,7 @@ def __init__(self, library_path, cookie_path=None, cookie_auth=None,
             )
 
     def start(self):
-
-        self.cache_file = os.path.join(self.library_path, '.cache.json')
-        self.cache_data = self._load_cache_data(self.cache_file)
+        # todo: convert old cache.
         self.purchase_keys = self.purchase_keys if self.purchase_keys else self._get_purchase_keys()  # noqa: E501
 
         if self.trove is True:
@@ -76,16 +110,16 @@ def start(self):
                 self._process_trove_product(title, product)
         else:
             manager = multiprocessing.Manager()
-            queue = manager.Queue()
-            with ExorcistPool(int(multiprocessing.cpu_count()/2)) as pool:
+            queue = manager.JoinableQueue()
+            with ExorcistPool(multiprocessing.cpu_count()) as pool:
 
-                pool.apply_async(self._update_cache_file, (queue,))
+                pool.apply_async(file_ops.update_csv_cache, (queue,))
                 jobs = list()
                 job_dict = dict()
                 for purchase_key in self.purchase_keys:
                     job = pool.apply_async(self._process_order_id,
-                                           (purchase_key, queue)
-                                           )
+                                           (purchase_key, queue)
+                                           )
                     jobs.append(job)
                     job_dict[purchase_key] = job
@@ -99,11 +133,13 @@ def start(self):
                 for job in jobs:
                     job.get()
 
-                queue.put(CacheData("kill", "kill"))
+                queue.put(CsvCacheData("kill", "kill"))
+
+                queue.join()
+
                 pool.close()
                 pool.join()
 
-
     def _get_trove_download_url(self, machine_name, web_name):
         try:
             sign_r = self.session.post(
@@ -130,7 +166,7 @@ def _process_trove_product(self, title, product):
         for platform, download in product['downloads'].items():
             # Sometimes the name has a dir in it
             # Example is "Broken Sword 5 - the Serpent's Curse"
-            # Only the windows file has a dir like
+            # Only the Windows file has a dir like
             # "revolutionsoftware/BS5_v2.2.1-win32.zip"
             if self._should_download_platform(platform) is False:  # noqa: E501
                 logger.info(f"Skipping {platform} for {title}")
@@ -142,28 +178,29 @@ def _process_trove_product(self, title, product):
                 logger.info("Skipping the file {web_name}".format(web_name=web_name))
                 continue
 
-            cache_file_key = 'trove:{name}'.format(name=web_name)
             file_info = {
                 'uploaded_at': (download.get('uploaded_at')
                                 or download.get('timestamp')
                                 or product.get('date_added', '0')),
                 'md5': download.get('md5', 'UNKNOWN_MD5'),
             }
-            cache_file_info = self.cache_data.get(cache_file_key, {})
-            if cache_file_info != {} and self.update is not True:
+            cache_file_info: CsvCacheData = self.cache_data_csv.get_cache_item("trove", web_name, trove=True,)
+
+            # cache_file_info: CsvCacheData = CsvCacheData()
+            # = self.cache_data.get(cache_file_key, {})
+
+            if cache_file_info in self.cache_data_csv and self.update is not True:
                 # Do not care about checking for updates at this time
                 continue
 
-            if (file_info['uploaded_at'] != cache_file_info.get('uploaded_at')
-                    and file_info['md5'] != cache_file_info.get('md5')):
-                product_folder = os.path.join(
-                    self.library_path, 'Humble Trove', title
-                )
-                # Create directory to save the files to
-                try: os.makedirs(product_folder)  # noqa: E701
-                except OSError: pass  # noqa: E701
-                local_filename = os.path.join(
+            if file_info['uploaded_at'] != cache_file_info['remote_modified_date'] \
+                    and file_info['md5'] != cache_file_info['md5']:
+                cache_file_info.set_remote_modified_date(file_info['uploaded_at'])
+                cache_file_info.set_md5(file_info['md5'])
+                product_folder = file_ops.create_product_folder("Humble Trove", title)
+
+                local_filepath = os.path.join(
                     str(product_folder),
                     web_name,
                 )
@@ -181,21 +218,15 @@ def _process_trove_product(self, title, product):
                     logger.error(f"Failed to get trove product {web_name}")
                     continue
 
-                if 'uploaded_at' in cache_file_info:
+                if 'remote_modified_date' in cache_file_info:
                     uploaded_at = time.strftime(
                         '%Y-%m-%d',
-                        time.localtime(int(cache_file_info['uploaded_at']))
+                        time.localtime(int(cache_file_info['remote_modified_date']))
                     )
                 else:
                     uploaded_at = None
 
-                self._process_download(
-                    product_r,
-                    cache_file_key,
-                    file_info,
-                    local_filename,
-                    rename_str=uploaded_at,
-                )
+                self._process_download(product_r, cache_file_info, local_filepath, rename_date_str=uploaded_at)
 
     def _get_trove_products(self):
         trove_products = []
@@ -221,8 +252,8 @@ def _get_trove_products(self):
 
         return trove_products
 
-    def _process_order_id(self, order_id, multiprocess_queue: multiprocessing.Queue):
-        order_url = 'https://www.humblebundle.com/api/v1/order/{order_id}?all_tpkds=true'.format(order_id=order_id)  # noqa: E501
+    def _process_order_id(self, order_id, multiprocess_queue: multiprocessing.JoinableQueue):
+        order_url = 'https://www.humblebundle.com/api/v1/order/{order_id}?all_tpkds=true'.format(order_id=order_id)
         try:
             order_r = self.session.get(
                 order_url,
                 headers={
                     'content-encoding': 'gzip',
                 },
             )
-        except Exception:
+        except Exception as e:
             logger.error("Failed to get order key {order_id}"
                          .format(order_id=order_id))
             return
@@ -243,132 +274,82 @@ def _process_order_id(self, order_id, multiprocess_queue: multiprocessing.Queue):
 
         for product in order['subproducts']:
             self._process_product(order_id, bundle_title, product, multiprocess_queue)
 
-    def _rename_old_file(self, local_filename, append_str):
-        # Check if older file exists, if so rename
-        if os.path.isfile(local_filename) is True:
-            filename_parts = local_filename.rsplit('.', 1)
-            new_name = "{name}_{append_str}.{ext}"\
-                .format(name=filename_parts[0],
-                        append_str=append_str,
-                        ext=filename_parts[1])
-            os.rename(local_filename, new_name)
-            logger.info("Renamed older file to {new_name}".format(new_name=new_name))
-
     def _process_product(self, order_id, bundle_title, product, multiprocess_queue: multiprocessing.Queue):
         product_title = _clean_name(product['human_name'])
 
         # Get all types of download for a product
         for download_type in product['downloads']:
             if self._should_download_platform(download_type['platform']) is False:  # noqa: E501
-                logger.info("Skipping {platform} for {product_title}".format(platform=download_type['platform'],product_title=product_title))
+                logger.info("Skipping {platform} for {product_title}"
+                            .format(platform=download_type['platform'], product_title=product_title)
+                            )
                 continue
 
-            product_folder = os.path.join(
-                self.library_path, bundle_title, product_title
-            )
-            # Create directory to save the files to
-            try: os.makedirs(product_folder)  # noqa: E701
-            except OSError: pass  # noqa: E701
+            product_folder = file_ops.create_product_folder(bundle_title, product_title)
 
-            # Download each file type of a product
+            # Download each filetype of a product
            for file_type in download_type['download_struct']:
-                try:
-                    url = file_type['url']['web']
-                except KeyError:
-                    logger.info("No url found: {bundle_title}/{product_title}".format(bundle_title=bundle_title,product_title=product_title))
-                    continue
-
-                url_filename = url.split('?')[0].split('/')[-1]
-                cache_file_key = order_id + ':' + url_filename
-                ext = url_filename.split('.')[-1]
-                if self._should_download_file_type(ext) is False:
-                    logger.info("Skipping the file {url_filename}".format(url_filename=url_filename))
-                    continue
-
-                local_filename = os.path.join(product_folder, url_filename)
-                cache_file_info = self.cache_data.get(cache_file_key, {})
-
-                if cache_file_info != {} and self.update is not True:
-                    # Do not care about checking for updates at this time
-                    continue
-
-                try:
-                    product_r = self.session.get(url, stream=True)
-                except Exception:
-                    logger.error("Failed to download {url}".format(url=url))
-                    continue
-
-                # Check to see if the file still exists
-                if product_r.status_code != 200:
-                    logger.debug(
-                        "File missing for {bundle_title}/{product_title}: {url}"
-                        .format(bundle_title=bundle_title,
-                                product_title=product_title,
-                                url=url))
-                    continue
-
-                logger.debug("Item request: {product_r}, Url: {url}"
-                             .format(product_r=product_r, url=url))
-                file_info = {
-                    'url_last_modified': product_r.headers['Last-Modified'],
-                }
-                if file_info['url_last_modified'] != cache_file_info.get('url_last_modified'):  # noqa: E501
-                    if 'url_last_modified' in cache_file_info:
-                        last_modified = datetime.datetime.strptime(
-                            cache_file_info['url_last_modified'],
-                            '%a, %d %b %Y %H:%M:%S %Z'
-                        ).strftime('%Y-%m-%d')
-                    else:
-                        last_modified = None
-                    self._process_download(
-                        product_r,
-                        cache_file_key,
-                        file_info,
-                        local_filename,
-                        rename_str=last_modified,
-                        multiprocess_queue=multiprocess_queue
-                    )
-
-    def _update_cache_data(self, cache_file_key, file_info):
-        self.cache_data[cache_file_key] = file_info
-        # Update cache file with newest data so if the script
-        # quits it can keep track of the progress
-        # Note: Only safe because of single thread,
-        # need to change if refactor to multi threading
-        with open(self.cache_file, 'w') as outfile:
-            json.dump(
-                self.cache_data, outfile,
-                sort_keys=True, indent=4,
-            )
-
-    def _update_cache_file(self, multiprocess_queue: multiprocessing.Queue):
-        """
-        Process safe cache update.
-        Can't use class member cache_data for any sort of process safety
-        :param multiprocess_queue: the queue containing cache data
-        """
-        cache: dict
-        with open(self.cache_file, "r") as infile:
-            cache = json.load(infile)
-
-        with (open(self.cache_file, 'w') as outfile):
-            while 1:
-                cache_data:CacheData = multiprocess_queue.get(True, 2)
-                cache.update({cache_data.key: str(cache_data.value)})
-                if "kill" == cache_data.key:
-                    break
-                json.dump(
-                    cache, outfile,
-                    sort_keys=True, indent=4,
-                )
-                outfile.flush()
-
-    def _process_download(self, open_r, cache_file_key, file_info,
-                          local_filename, rename_str=None, multiprocess_queue=None):
+                for content_type in self.content_types:
+                    try:
+                        url = file_type['url'][content_type]
+                    except KeyError:
+                        if file_type.get("human_size") != "0 bytes":
+                            logger.info("No url found: {bundle_title}/{product_title}"
+                                        .format(bundle_title=bundle_title,
+                                                product_title=product_title))
+                        continue
+
+                    url_filename = url.split('?')[0].split('/')[-1]
+
+                    ext = url_filename.split('.')[-1]
+                    if self._should_download_file_type(ext) is False:
+                        logger.info("Skipping the file {url_filename}".format(url_filename=url_filename))
+                        continue
+
+                    local_filename = os.path.join(product_folder, url_filename)
+                    cache_file_info: CsvCacheData = self.cache_data_csv.get_cache_item(order_id, url_filename)
+
+                    if cache_file_info in self.cache_data_csv and self.update is False:
+                        # We have the file, and don't want to update.
+                        continue
+                    cache_file_info.set_md5(file_type['md5'])
+
+                    try:
+                        product_r = self.session.get(url, stream=True)
+                    except Exception:
+                        logger.error("Failed to download {url}".format(url=url))
+                        continue
+
+                    # Check to see if the file still exists
+                    if product_r.status_code != 200:
+                        logger.debug(f"File missing for {bundle_title}/{product_title}: {url}")
+                        continue
+
+                    logger.debug(f"Item request: {product_r}, Url: {url}")
+
+                    if product_r.headers['Last-Modified'] != cache_file_info['remote_modified_date']:  # noqa: E501
+                        if 'remote_modified_date' in cache_file_info:
+                            last_modified = datetime.datetime.strptime(
+                                cache_file_info['remote_modified_date'],
+                                '%a, %d %b %Y %H:%M:%S %Z'
+                            ).strftime('%Y-%m-%d')
+                        else:
+                            last_modified = None
+                        cache_file_info.set_remote_modified_date(product_r.headers['Last-Modified'])
+                        self._process_download(
+                            product_r,
+                            cache_file_info,
+                            local_filename,
+                            rename_date_str=last_modified,
+                            multiprocess_queue=multiprocess_queue
+                        )
+
+    def _process_download(self, open_r, cache_data: CsvCacheData, local_filename, rename_date_str=None,
+                          multiprocess_queue=None):
         try:
-            if rename_str:
-                self._rename_old_file(local_filename, rename_str)
+            if rename_date_str:
+                file_ops.rename_old_file(local_filename, rename_date_str)
 
-            self._download_file(open_r, local_filename)
+            file_ops.download_file(open_r, local_filename, self.progress_bar)
 
         except (Exception, KeyboardInterrupt) as e:
             if self.progress_bar:
@@ -378,60 +359,27 @@ def _process_download(self, open_r, cache_file_key, file_info,
                        .format(local_filename=os.path.basename(local_filename)))
 
             # Clean up broken downloaded file
-            try: os.remove(local_filename)  # noqa: E701
-            except OSError: pass  # noqa: E701
+            try:
+                os.remove(local_filename)  # noqa: E701
+            except OSError:
+                pass  # noqa: E701
 
             if type(e).__name__ == 'KeyboardInterrupt':
                 sys.exit()
         else:
+            cache_data.set_local_modified_date(
+                datetime.datetime.now().strftime("%d %b %Y %H:%M:%S %Z")
+            )
             if self.progress_bar:
                 # Do not overwrite the progress bar on next print
                 print()
 
-            if multiprocess_queue:
-                multiprocess_queue.put(CacheData(cache_file_key, file_info))
-            else:
-                self._update_cache_data(cache_file_key, file_info)
+            multiprocess_queue.put(cache_data)
 
         finally:
-            # Since its a stream connection, make sure to close it
+            # Since it's a stream connection, make sure to close it
            open_r.connection.close()
 
-
-    def _download_file(self, product_r, local_filename):
-        # logger.info()
-
-        with open(local_filename, 'wb') as outfile:
-            total_length = product_r.headers.get('content-length')
-            if total_length is None:  # no content length header
-                outfile.write(product_r.content)
-            else:
-                dl = 0
-                total_length = int(total_length)
-                for data in product_r.iter_content(chunk_size=4096):
-                    dl += len(data)
-                    outfile.write(data)
-                    pb_width = 50
-                    done = int(pb_width * dl / total_length)
-                    if self.progress_bar:
-                        print("\n Downloading: {local_filename} \t{percent}% [{filler}{space}]"  # this is nice.
-                              .format(percent=int(done * (100 / pb_width)),
-                                      filler='=' * done,
-                                      space=' ' * (pb_width - done),
-                                      local_filename=os.path.basename(local_filename)
-                                      ), end='\r')
-                if dl != total_length:
-                    raise ValueError("Download did not complete")
-
-    def _load_cache_data(self, cache_file):
-        try:
-            with open(cache_file, 'r') as f:
-                cache_data = json.load(f)
-        except FileNotFoundError:
-            cache_data = {}
-
-        return cache_data
-
     def _get_purchase_keys(self):
         try:
             library_r = self.session.get('https://www.humblebundle.com/home/library')  # noqa: E501
diff --git a/humblebundle_downloader/iops/file_ops.py b/humblebundle_downloader/iops/file_ops.py
index 897444a..d0310fc 100644
--- a/humblebundle_downloader/iops/file_ops.py
+++ b/humblebundle_downloader/iops/file_ops.py
@@ -6,6 +6,7 @@
 logger = logging.getLogger(__name__)
 _HUMBLE_ENV_VAR = "HUMBLE_LIBRARY_PATH"
 
+
 def rename_old_file(local_filepath, append_str):
     # Check if older file exists, if so rename
     if os.path.isfile(local_filepath) is True:
@@ -55,7 +56,7 @@ def update_csv_cache(queue: multiprocessing.JoinableQueue):
             cache_data: CsvCacheData = queue.get(True, 15.0)
         except:
             pass
-        if "kill" == cache_data.key:
+        if "kill" == cache_data['order_id']:
             queue.task_done()
             break
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..985ed01
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+parsel~=1.9.0
+requests~=2.31.0
+urllib3~=2.2.1
\ No newline at end of file
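
Reviewer note: the snippet below is a minimal, self-contained sketch of the timeout/retry behaviour this patch wires into DownloadLibrary's requests session (a TimeoutHTTPAdapter combined with urllib3's Retry). It is illustrative only; make_session() and the example URL are assumptions for the sake of the demo, not code from the repository.

# Sketch only: mirrors the adapter and retry policy added by this patch.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

DEFAULT_TIMEOUT = 5  # seconds


class TimeoutHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that applies a default timeout to every request."""

    def __init__(self, *args, timeout=DEFAULT_TIMEOUT, **kwargs):
        self.timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        # Only fill in the timeout if the caller did not pass one explicitly.
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)


def make_session():
    # Retry transient failures (HTTP 429/5xx) up to 3 times with exponential backoff.
    retries = Retry(total=3, backoff_factor=1,
                    status_forcelist=[429, 500, 502, 503, 504])
    adapter = TimeoutHTTPAdapter(max_retries=retries)
    session = requests.Session()
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


if __name__ == '__main__':
    session = make_session()
    response = session.get('https://www.humblebundle.com/home/library')
    print(response.status_code)

Mounting the adapter on both 'http://' and 'https://' means every request made through the session inherits the default timeout and the retry policy unless an individual call overrides them, which is why long-stalled downloads now fail fast and are retried instead of hanging.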
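Reviewer note: the patch's process-safe cache writing works by having every worker process put cache rows on a manager JoinableQueue while a single writer drains it, with a "kill" sentinel to stop the writer (see file_ops.update_csv_cache and DownloadLibrary.start). The sketch below is a simplified, hypothetical stand-alone version of that pattern; cache_writer, record_download, and the CSV row format are illustrative assumptions, not the repository's actual classes.

# Sketch only: one writer process owns the cache file, workers only enqueue rows.
import csv
import multiprocessing


def cache_writer(queue, cache_path):
    # Single writer process owns the file; workers never write to it directly.
    with open(cache_path, 'a', newline='') as outfile:
        writer = csv.writer(outfile)
        while True:
            row = queue.get()
            if row[0] == "kill":  # sentinel row: stop writing
                queue.task_done()
                break
            writer.writerow(row)
            outfile.flush()
            queue.task_done()


def record_download(order_id, queue):
    # Pretend a file was just downloaded and queue its cache row.
    queue.put((order_id, 'example.pdf', 'UNKNOWN_MD5'))


if __name__ == '__main__':
    manager = multiprocessing.Manager()
    queue = manager.JoinableQueue()
    with multiprocessing.Pool(2) as pool:
        writer_job = pool.apply_async(cache_writer, (queue, 'cache.csv'))
        jobs = [pool.apply_async(record_download, (key, queue))
                for key in ('order-a', 'order-b')]
        for job in jobs:
            job.get()          # wait until every worker has queued its rows
        queue.put(("kill",))   # then tell the writer to finish
        queue.join()           # block until the writer has drained the queue
        writer_job.get()
        pool.close()
        pool.join()

Because only the writer touches the file, interrupted or concurrent downloads cannot corrupt the cache, and the queue.join() before pool.close() mirrors the shutdown order the patch uses in DownloadLibrary.start.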