From 8c79392956fd4b13e1c770e0fcc630c5da1ebcd7 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 00:01:33 -0500 Subject: [PATCH 01/30] More efficient contributors check; fix multiple same artist tag; make genre varr clearer --- Contents/Code/__init__.py | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 7577087..7bf0acd 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -759,8 +759,8 @@ def update(self, metadata, media, lang, force=False): series = '' series2 = '' series_def = '' - genre1 = None - genre2 = None + genre_parent = None + genre_child = None volume = '' volume2 = '' volume_def = '' @@ -799,13 +799,13 @@ def update(self, metadata, media, lang, force=False): series = self.getStringContentFromXPath( r, '//div[contains (@class, "adbl-series-link")]//a[1]' ) - genre1 = self.getStringContentFromXPath( + genre_parent = self.getStringContentFromXPath( r, ( '//div[contains(@class,"adbl-pd-breadcrumb")]' '/div[2]/a/span/text()' ) ) - genre2 = self.getStringContentFromXPath( + genre_child = self.getStringContentFromXPath( r, ( '//div[contains(@class,"adbl-pd-breadcrumb")]' '/div[3]/a/span/text()' @@ -858,11 +858,11 @@ def update(self, metadata, media, lang, force=False): studio = json_data['publisher'] synopsis = json_data['description'] if 'itemListElement' in json_data: - genre1 = ( + genre_parent = ( json_data['itemListElement'][1]['item']['name'] ) try: - genre2 = ( + genre_child = ( json_data['itemListElement'][2]['item']['name'] ) except: @@ -966,7 +966,7 @@ def update(self, metadata, media, lang, force=False): self.Log('studio: %s', studio) self.Log('thumb: %s', thumb) self.Log('rating: %s', rating) - self.Log('genres: %s, %s', genre1, genre2) + self.Log('genres: %s, %s', genre_parent, genre_child) self.Log('synopsis: %s', synopsis) self.Log('Series: %s', series) self.Log('Volume: %s', volume) @@ -981,18 
+981,16 @@ def update(self, metadata, media, lang, force=False): # Add the genres metadata.genres.clear() - metadata.genres.add(genre1) - metadata.genres.add(genre2) + metadata.genres.add(genre_parent) + metadata.genres.add(genre_child) # Add Narrators to Styles narrators_list = narrator.split(",") contributors_list = ['full cast'] metadata.styles.clear() - for narrators in narrators_list: - if not [ - item for item in contributors_list if item in narrators.lower() - ]: - metadata.styles.add(narrators.strip()) + for narrator in narrators_list: + if narrator.lower() not in contributors_list: + metadata.styles.add(narrator.strip()) # Add Authors to Moods author_list = author.split(",") @@ -1004,13 +1002,9 @@ def update(self, metadata, media, lang, force=False): 'full cast', ] metadata.moods.clear() - for authors in author_list: - metadata.moods.add(authors.strip()) - for contributors in contributers_list: - if not [ - item for item in contributers_list if item in authors.lower() - ]: - metadata.moods.add(authors) + for author in author_list: + if author.lower() not in contributers_list: + metadata.moods.add(author.strip()) # Clean series x = re.match("(.*)(: A .* Series)", series_def) From fb93666963f493788c31be22b44018e6d9ae3171 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 00:17:24 -0500 Subject: [PATCH 02/30] Codefactor improvements --- Contents/Code/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 7bf0acd..7ac8128 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -627,9 +627,9 @@ def search(self, results, media, lang, manual): itemId = None # New Search results contain question marks after the ID - for itemId in itemId.split('?'): + for q_itemId in itemId.split('?'): # IDs No longer start with just 'B0' - if re.match(r'^[0-9A-Z]{10,10}', itemId): + if re.match(r'^[0-9A-Z]{10,10}', q_itemId): break if len(itemId) == 0: @@ 
-1103,22 +1103,22 @@ def writeInfo(self, header, url, metadata): if len(metadata.collections) > 0: self.Log('|\\') - for i in range(len(metadata.collections)): + for i, item in enumerate(metadata.collections): self.Log('| * Collection: %s', metadata.collections[i]) if len(metadata.genres) > 0: self.Log('|\\') - for i in range(len(metadata.genres)): + for i, item in enumerate(metadata.genres): self.Log('| * Genre: %s', metadata.genres[i]) if len(metadata.moods) > 0: self.Log('|\\') - for i in range(len(metadata.moods)): + for i, item in enumerate(metadata.moods): self.Log('| * Moods: %s', metadata.moods[i]) if len(metadata.styles) > 0: self.Log('|\\') - for i in range(len(metadata.styles)): + for i, item in enumerate(metadata.styles): self.Log('| * Styles: %s', metadata.styles[i]) if len(metadata.posters) > 0: From 25521c5a5d38fa6221b734b3f2652262c4caee71 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 00:26:21 -0500 Subject: [PATCH 03/30] More minor CodeFactor cleanup --- Contents/Code/__init__.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 7ac8128..5b12e3c 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -291,10 +291,6 @@ def search(self, results, media, lang, manual=False): '------------------------------------------------' '------------------------------------------------' ) - return - - def update(self, metadata, media, lang, force=False): - return def hasProxy(self): return Prefs['imageproxyurl'] is not None @@ -594,19 +590,19 @@ def search(self, results, media, lang, manual): if len(found) == 0: self.Log('No results found for query "%s"', normalizedName) return - else: + + self.Log( + 'Found %s result(s) for query "%s"', len(found), normalizedName + ) + i = 1 + for f in found: self.Log( - 'Found %s result(s) for query "%s"', len(found), normalizedName + ' %s. 
(title) %s (author) %s (url)[%s]' + ' (date)(%s) (thumb){%s}', + i, f['title'], f['author'], + f['url'], str(f['date']), f['thumb'] ) - i = 1 - for f in found: - self.Log( - ' %s. (title) %s (author) %s (url)[%s]' - ' (date)(%s) (thumb){%s}', - i, f['title'], f['author'], - f['url'], str(f['date']), f['thumb'] - ) - i += 1 + i += 1 self.Log( '-----------------------------------' @@ -620,9 +616,10 @@ def search(self, results, media, lang, manual): self.Log('URL For Breakdown: %s', url) # Get the id - for itemId in url.split('/'): + for item in url.split('/'): # IDs No longer start with just 'B0' - if re.match(r'^[0-9A-Z]{10,10}', itemId): + if re.match(r'^[0-9A-Z]{10,10}', item): + itemId = item break itemId = None @@ -1145,5 +1142,4 @@ def safe_unicode(s, encoding='utf-8'): return s else: return s.decode(encoding) - else: - return str(s).decode(encoding) + return str(s).decode(encoding) From 87795c9403ee1f43c5bea0441748a7377eb3b2fd Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 01:49:05 -0500 Subject: [PATCH 04/30] Simplify replace and logs --- Contents/Code/__init__.py | 85 ++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 36 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 5b12e3c..2ccdaaf 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -935,42 +935,55 @@ def update(self, metadata, media, lang, force=False): volume_def = w.group(2) # cleanup synopsis - synopsis = synopsis.replace("", "") - synopsis = synopsis.replace("", "") - synopsis = synopsis.replace("", "") - synopsis = synopsis.replace("", "") - synopsis = synopsis.replace("", "") - synopsis = synopsis.replace("", "") - synopsis = synopsis.replace("", "") - synopsis = synopsis.replace("", "") - synopsis = synopsis.replace("", "") - synopsis = synopsis.replace("", "") - synopsis = synopsis.replace("
    ", "") - synopsis = synopsis.replace("
", "\n") - synopsis = synopsis.replace("
    ", "") - synopsis = synopsis.replace("
", "\n") - synopsis = synopsis.replace("
  • ", " • ") - synopsis = synopsis.replace("
  • ", "\n") - synopsis = synopsis.replace("
    ", "") - synopsis = synopsis.replace("

    ", "") - synopsis = synopsis.replace("

    ", "\n") - - self.Log('date: %s', date) - self.Log('title: %s', title) - self.Log('author: %s', author) - self.Log('series: %s', series) - self.Log('narrator: %s', narrator) - self.Log('studio: %s', studio) - self.Log('thumb: %s', thumb) - self.Log('rating: %s', rating) - self.Log('genres: %s, %s', genre_parent, genre_child) - self.Log('synopsis: %s', synopsis) - self.Log('Series: %s', series) - self.Log('Volume: %s', volume) - self.Log('Series2: %s', series2) - self.Log('Volume2: %s', volume2) - self.Log('Series_def: %s', series_def) - self.Log('Volume_def: %s', volume_def) + synopsis = ( + synopsis.replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("
      ", "") + .replace("
    ", "\n") + .replace("
      ", "") + .replace("
    ", "\n") + .replace("
  • ", " • ") + .replace("
  • ", "\n") + .replace("
    ", "") + .replace("

    ", "") + .replace("

    ", "\n") + ) + + # Setup logging of all data in the array + type_arr = [ + {'date': date}, + {'title': title}, + {'author': author}, + {'series': series}, + {'narrator': narrator}, + {'studio': studio}, + {'thumb': thumb}, + {'rating': rating}, + {'genres': genre_parent + ', ' + genre_child}, + {'synopsis': synopsis}, + {'volume': volume}, + {'series2': series2}, + {'volume2': volume2}, + {'series def': series_def}, + {'volume def': volume_def}, + ] + # Loop through and log values that exist + for log_type in type_arr: + for key, val in log_type.items(): + if val: + self.Log("{key:<15}{val}".format( + key=key, + val=val + ) + ) # Set the date and year if found. if date is not None: From 714ca3ae95fccaa80f29bf6560aa54e5aae96adf Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 02:19:50 -0500 Subject: [PATCH 05/30] "How did this run before?" --- Contents/Code/__init__.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 2ccdaaf..d909d59 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -265,7 +265,6 @@ def doSearch(self, url, ctx): return found def search(self, results, media, lang, manual=False): - # Author data is pulling from last.fm automatically. # This will probably never be built out unless a good # author source is identified. 
@@ -611,6 +610,9 @@ def search(self, results, media, lang, manual): # Walk the found items and gather extended information info = [] i = 1 + itemId_full = None + itemId = None + valid_itemId = None for f in found: url = f['url'] self.Log('URL For Breakdown: %s', url) @@ -619,21 +621,21 @@ def search(self, results, media, lang, manual): for item in url.split('/'): # IDs No longer start with just 'B0' if re.match(r'^[0-9A-Z]{10,10}', item): - itemId = item + itemId_full = item break - itemId = None # New Search results contain question marks after the ID - for q_itemId in itemId.split('?'): + for itemId in itemId_full.split('?'): # IDs No longer start with just 'B0' - if re.match(r'^[0-9A-Z]{10,10}', q_itemId): + if re.match(r'^[0-9A-Z]{10,10}', itemId): + valid_itemId = itemId break - if len(itemId) == 0: + if len(valid_itemId) == 0: Log('No Match: %s', url) continue - self.Log('* ID is %s', itemId) + self.Log('* ID is %s', valid_itemId) title = f['title'] thumb = f['thumb'] @@ -674,7 +676,7 @@ def search(self, results, media, lang, manual): if score >= LCL_IGNORE_SCORE: info.append( { - 'id': itemId, + 'id': valid_itemId, 'title': title, 'year': year, 'date': date, From 5f925bb8114463232ed3105a5c25c4c962930799 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 13:30:07 -0500 Subject: [PATCH 06/30] Fix reversed logic of checking contributor list --- Contents/Code/__init__.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index d909d59..a7273f5 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -998,15 +998,22 @@ def update(self, metadata, media, lang, force=False): # Add Narrators to Styles narrators_list = narrator.split(",") - contributors_list = ['full cast'] + narr_contributors_list = [ + 'full cast' + ] metadata.styles.clear() + # Loop through narrators to check if it has contributor wording for narrator in narrators_list: - if 
narrator.lower() not in contributors_list: + if not [ + contrib for contrib in narr_contributors_list if ( + contrib in narrator.lower() + ) + ]: metadata.styles.add(narrator.strip()) # Add Authors to Moods author_list = author.split(",") - contributers_list = [ + author_contributers_list = [ 'contributor', 'translator', 'foreword', @@ -1014,8 +1021,13 @@ def update(self, metadata, media, lang, force=False): 'full cast', ] metadata.moods.clear() + # Loop through authors to check if it has contributor wording for author in author_list: - if author.lower() not in contributers_list: + if not [ + contrib for contrib in author_contributers_list if ( + contrib in author.lower() + ) + ]: metadata.moods.add(author.strip()) # Clean series From 532b26b103826f5917eaa024bbb8bb201ee2613e Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 15:39:11 -0500 Subject: [PATCH 07/30] Greatly simplifiy logging method --- Contents/Code/__init__.py | 188 ++++++++++++++------------------------ 1 file changed, 67 insertions(+), 121 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index a7273f5..f899a24 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -270,11 +270,7 @@ def search(self, results, media, lang, manual=False): # author source is identified. 
# Log some stuff - self.Log( - '------------------------------------------------' - 'ARTIST SEARCH' - '------------------------------------------------' - ) + self.log_separator('ARTIST SEARCH') self.Log( '* Album: %s', media.album ) @@ -286,10 +282,7 @@ def search(self, results, media, lang, manual=False): 'Not Ready For Artist Search Yet' '****************************************' ) - self.Log( - '------------------------------------------------' - '------------------------------------------------' - ) + self.log_separator() def hasProxy(self): return Prefs['imageproxyurl'] is not None @@ -364,11 +357,7 @@ def findDateInTitle(self, title): def doSearch(self, url, ctx): html = HTML.ElementFromURL(url, sleep=REQUEST_DELAY) found = [] - self.Log( - '-----------------------------------------' - 'just before new xpath line' - '-----------------------------------------' - ) + self.log_separator('just before new xpath line') for r in html.xpath('//ul//li[contains(@class,"productListItem")]'): datetext = self.getStringContentFromXPath( r, ( @@ -403,11 +392,7 @@ def doSearch(self, url, ctx): '[contains (@class,"narratorLabel")]/span//a[1]' ).format(ctx['NAR_BY']) ) - self.Log( - '-----------------------------------------------' - 'XPATH SEARCH HIT' - '-----------------------------------------------' - ) + self.log_separator('XPATH SEARCH HIT') found.append( { @@ -419,12 +404,7 @@ def doSearch(self, url, ctx): 'narrator': narrator } ) - - self.Log( - '-----------------------------------------' - 'just after new xpath line' - '-----------------------------------------' - ) + self.log_separator('just after new xpath line') for r in html.xpath('//div[contains (@class, "adbl-search-result")]'): date = self.getDateFromString( @@ -457,11 +437,7 @@ def doSearch(self, url, ctx): ctx['NAR_BY'] ) ) - self.Log( - '-----------------------------------------------' - 'XPATH SEARCH HIT' - '-----------------------------------------------' - ) + self.log_separator('XPATH SEARCH HIT') 
found.append( { @@ -479,21 +455,13 @@ def doSearch(self, url, ctx): def search(self, results, media, lang, manual): ctx = SetupUrls(Prefs['sitetype'], Prefs['site'], lang) LCL_IGNORE_SCORE = IGNORE_SCORE - - self.Log( - '-----------------------------------------------' - 'ALBUM SEARCH' - '-----------------------------------------------' - ) + self.log_separator('ALBUM SEARCH') self.Log('* ID: %s', media.parent_metadata.id) self.Log('* Title: %s', media.title) self.Log('* Name: %s', media.name) self.Log('* Album: %s', media.album) self.Log('* Artist: %s', media.artist) - self.Log( - '-------------------------------------------------' - '-------------------------------------------------' - ) + self.log_separator() # Handle a couple of edge cases where # album search will give bad results. @@ -533,18 +501,12 @@ def search(self, results, media, lang, manual): Log('Album search: ' + media.title) # Log some stuff for troubleshooting detail - self.Log( - '-----------------------------------' - '------------------------------------' - ) + self.log_separator() self.Log('* ID: %s', media.parent_metadata.id) self.Log('* Title: %s', media.title) self.Log('* Name: %s', media.name) self.Log('* Album: %s', media.album) - self.Log( - '-----------------------------------' - '------------------------------------' - ) + self.log_separator() # Normalize the name normalizedName = String.StripDiacritics(media.album) @@ -603,10 +565,7 @@ def search(self, results, media, lang, manual): ) i += 1 - self.Log( - '-----------------------------------' - '------------------------------------' - ) + self.log_separator() # Walk the found items and gather extended information info = [] i = 1 @@ -692,20 +651,14 @@ def search(self, results, media, lang, manual): ) if i != len(found): - self.Log( - '-----------------------------------' - '------------------------------------' - ) + self.log_separator() i += 1 info = sorted(info, key=lambda inf: inf['score'], reverse=True) # Output the final results. 
- self.Log( - '***********************************' - '************************************' - ) + self.log_separator() self.Log('Final result:') i = 1 for r in info: @@ -810,11 +763,7 @@ def update(self, metadata, media, lang, force=False): '/div[3]/a/span/text()' ) ) - self.Log( - '-----------------------------------------------' - 'XPATH SEARCH HIT' - '-----------------------------------------------' - ) + self.log_separator('XPATH SEARCH HIT') if date is None: for r in html.xpath( @@ -977,7 +926,7 @@ def update(self, metadata, media, lang, force=False): {'series def': series_def}, {'volume def': volume_def}, ] - # Loop through and log values that exist + # Loop through dicts in array for log_type in type_arr: for key, val in log_type.items(): if val: @@ -1099,66 +1048,63 @@ def worker(self, queue, stoprequest): def addTask(self, queue, func, *args, **kargs): queue.put((func, args, kargs)) - # Writes metadata information to log. - def writeInfo(self, header, url, metadata): - self.Log(header) - self.Log( - '-----------------------------------' - '------------------------------------' - ) - self.Log( - '* ID: %s', metadata.id - ) - self.Log( - '* URL: %s', url - ) - self.Log( - '* Title: %s', metadata.title - ) - self.Log( - '* Release date: %s', str(metadata.originally_available_at) - ) - self.Log( - '* Studio: %s', metadata.studio - ) - self.Log( - '* Summary: %s', metadata.summary - ) - - if len(metadata.collections) > 0: - self.Log('|\\') - for i, item in enumerate(metadata.collections): - self.Log('| * Collection: %s', metadata.collections[i]) + # Prints a bunch of divider chars like --- + def log_separator(self, msg=None): + divider = "-" * 35 + output = divider + divider + # Override output with message if passed + if msg: + output = divider + msg + divider - if len(metadata.genres) > 0: - self.Log('|\\') - for i, item in enumerate(metadata.genres): - self.Log('| * Genre: %s', metadata.genres[i]) + return self.Log(output) - if len(metadata.moods) > 0: - 
self.Log('|\\') - for i, item in enumerate(metadata.moods): - self.Log('| * Moods: %s', metadata.moods[i]) - - if len(metadata.styles) > 0: - self.Log('|\\') - for i, item in enumerate(metadata.styles): - self.Log('| * Styles: %s', metadata.styles[i]) + # Writes metadata information to log. + def writeInfo(self, header, url, metadata): + self.log_separator(header) - if len(metadata.posters) > 0: - self.Log('|\\') - for poster in metadata.posters.keys(): - self.Log('| * Poster URL: %s', poster) + # Log basic metadata + type_arr = [ + {'ID': metadata.id}, + {'URL': url}, + {'Title': metadata.title}, + {'Release date': str(metadata.originally_available_at)}, + {'Studio': metadata.studio}, + {'Summary': metadata.summary}, + ] + # Loop through dicts in array + for log_type in type_arr: + # Loop through each key/value + for key, val in log_type.items(): + if val: + self.Log("{key:<15}{val}".format( + key=key, + val=val + ) + ) - if len(metadata.art) > 0: - self.Log('|\\') - for art in metadata.art.keys(): - self.Log('| * Fan art URL: %s', art) + # Log basic metadata stored in arrays + multi_arr = [ + {'Collection', metadata.collections}, + {'Genre', metadata.genres}, + {'Moods', metadata.moods}, + {'Styles', metadata.styles}, + {'Poster URL', metadata.posters}, + {'Fan art URL', metadata.art}, + ] + # Loop through dicts in array + for log_type in multi_arr: + # Loop through each key/value + for key, val in log_type.items(): + if val: + # Loop through dict's array + for item in val: + self.Log("{key:<15}{val}".format( + key=key, + val=item + ) + ) - self.Log( - '***********************************' - '************************************' - ) + self.log_separator() def safe_unicode(s, encoding='utf-8'): From d4ddc103a8f92cbdd3809998c97449be46a28b5b Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 15:58:49 -0500 Subject: [PATCH 08/30] Remove dupe log section; further DRY --- Contents/Code/__init__.py | 63 ++++++++++++++++++++------------------- 1 file 
changed, 33 insertions(+), 30 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index f899a24..3b0d171 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -456,11 +456,15 @@ def search(self, results, media, lang, manual): ctx = SetupUrls(Prefs['sitetype'], Prefs['site'], lang) LCL_IGNORE_SCORE = IGNORE_SCORE self.log_separator('ALBUM SEARCH') - self.Log('* ID: %s', media.parent_metadata.id) - self.Log('* Title: %s', media.title) - self.Log('* Name: %s', media.name) - self.Log('* Album: %s', media.album) - self.Log('* Artist: %s', media.artist) + # Log basic metadata + data_to_log = [ + {'ID': media.parent_metadata.id}, + {'Title': media.title}, + {'Name': media.name}, + {'Album': media.album}, + {'Artist': media.artist}, + ] + self.log_metadata(data_to_log) self.log_separator() # Handle a couple of edge cases where @@ -500,14 +504,6 @@ def search(self, results, media, lang, manual): else: Log('Album search: ' + media.title) - # Log some stuff for troubleshooting detail - self.log_separator() - self.Log('* ID: %s', media.parent_metadata.id) - self.Log('* Title: %s', media.title) - self.Log('* Name: %s', media.name) - self.Log('* Album: %s', media.album) - self.log_separator() - # Normalize the name normalizedName = String.StripDiacritics(media.album) if len(normalizedName) == 0: @@ -625,12 +621,16 @@ def search(self, results, media, lang, manual): scorebase3, scorebase4 ) - self.Log('* Title is %s', title) - self.Log('* Author is %s', author) - self.Log('* Narrator is %s', narrator) - self.Log('* Date is %s', str(date)) - self.Log('* Score is %s', str(score)) - self.Log('* Thumb is %s', thumb) + # Log basic metadata + data_to_log = [ + {'Title is': title}, + {'Author is': author}, + {'Narrator is': narrator}, + {'Date is ': str(date)}, + {'Score is': str(score)}, + {'Thumb is': thumb}, + ] + self.log_metadata(data_to_log) if score >= LCL_IGNORE_SCORE: info.append( @@ -1058,12 +1058,24 @@ def log_separator(self, 
msg=None): return self.Log(output) + def log_metadata(self, dict_arr): + # Loop through dicts in array + for log_type in dict_arr: + # Loop through each key/value + for key, val in log_type.items(): + if val: + self.Log("{key:<15}{val}".format( + key=key, + val=val + ) + ) + # Writes metadata information to log. def writeInfo(self, header, url, metadata): self.log_separator(header) # Log basic metadata - type_arr = [ + data_to_log = [ {'ID': metadata.id}, {'URL': url}, {'Title': metadata.title}, @@ -1071,16 +1083,7 @@ def writeInfo(self, header, url, metadata): {'Studio': metadata.studio}, {'Summary': metadata.summary}, ] - # Loop through dicts in array - for log_type in type_arr: - # Loop through each key/value - for key, val in log_type.items(): - if val: - self.Log("{key:<15}{val}".format( - key=key, - val=val - ) - ) + self.log_metadata(data_to_log) # Log basic metadata stored in arrays multi_arr = [ From 7024dc0f3fda0c39c6eeaf3c11f7be9de73dac66 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 17:28:27 -0500 Subject: [PATCH 09/30] Break up complex functions; Unified logging module --- Contents/Code/__init__.py | 820 +++++++++++++++++++------------------- Contents/Code/logging.py | 57 +++ 2 files changed, 470 insertions(+), 407 deletions(-) create mode 100644 Contents/Code/logging.py diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 3b0d171..46e1436 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -4,6 +4,8 @@ import re import types +from logging import Log + import Queue @@ -71,13 +73,13 @@ def json_decode(output): def SetupUrls(sitetype, base, lang='en'): - Log('Library/Search language is : %s', lang) + Log.debug('Library/Search language is : %s', lang) ctx = dict() if sitetype: - Log('Manual Site Selection Enabled : %s', base) - Log('Language being ignored due to manual site selection') + Log.debug('Manual Site Selection Enabled : %s', base) + Log.debug('Language being ignored due to manual site 
selection') if base in sites_langs: - Log('Pulling language from sites array') + Log.debug('Pulling language from sites array') lang = sites_langs[base]['lang'] if lang in intl_sites: base = intl_sites[lang]['url'] @@ -97,35 +99,35 @@ def SetupUrls(sitetype, base, lang='en'): ctx['REL_DATE_INFO'] = ctx['REL_DATE'] ctx['NAR_BY'] = 'Narrated By' ctx['NAR_BY_INFO'] = 'Narrated by' - Log( + Log.debug( 'Sites language is : %s', lang ) - Log( + Log.debug( '/************************************' 'LANG DEBUGGING' '************************************/' ) - Log( + Log.debug( '/* REL_DATE = %s', ctx['REL_DATE'] ) - Log( + Log.debug( '/* REL_DATE_INFO = %s', ctx['REL_DATE_INFO'] ) - Log( + Log.debug( '/* NAR_BY = %s', ctx['NAR_BY'] ) - Log( + Log.debug( '/* NAR_BY_INFO = %s', ctx['NAR_BY_INFO'] ) - Log( + Log.debug( '/****************************************' '****************************************/' ) else: - Log( + Log.debug( 'Audible site will be chosen by library language' ) - Log( + Log.debug( 'Library Language is %s', lang ) if base is None: @@ -209,10 +211,6 @@ class AudiobookArtist(Agent.Artist): prev_search_provider = 0 - def Log(self, message, *args): - if Prefs['debug']: - Log(message, *args) - def getDateFromString(self, string): try: return Datetime.ParseDate(string).date() @@ -270,19 +268,19 @@ def search(self, results, media, lang, manual=False): # author source is identified. 
# Log some stuff - self.log_separator('ARTIST SEARCH') - self.Log( + Log.separator(msg='ARTIST SEARCH', log_level='debug') + Log.debug( '* Album: %s', media.album ) - self.Log( + Log.debug( '* Artist: %s', media.artist ) - self.Log( + Log.debug( '****************************************' 'Not Ready For Artist Search Yet' '****************************************' ) - self.log_separator() + Log.separator(log_level='debug') def hasProxy(self): return Prefs['imageproxyurl'] is not None @@ -297,7 +295,7 @@ def worker(self, queue, stoprequest): try: func(*args, **kargs) except Exception as e: - self.Log(e) + Log.info(e) queue.task_done() except Queue.Empty: continue @@ -319,10 +317,6 @@ class AudiobookAlbum(Agent.Album): prev_search_provider = 0 - def Log(self, message, *args): - if Prefs['debug']: - Log(message, *args) - def getDateFromString(self, string): try: return Datetime.ParseDate(string).date() @@ -354,11 +348,10 @@ def findDateInTitle(self, title): return Datetime.ParseDate(result.group(0)).date() return None - def doSearch(self, url, ctx): - html = HTML.ElementFromURL(url, sleep=REQUEST_DELAY) - found = [] - self.log_separator('just before new xpath line') - for r in html.xpath('//ul//li[contains(@class,"productListItem")]'): + def before_xpath(self): + for r in self.html.xpath( + '//ul//li[contains(@class,"productListItem")]' + ): datetext = self.getStringContentFromXPath( r, ( u'div/div/div/div/div/div/span/ul/li' @@ -392,9 +385,9 @@ def doSearch(self, url, ctx): '[contains (@class,"narratorLabel")]/span//a[1]' ).format(ctx['NAR_BY']) ) - self.log_separator('XPATH SEARCH HIT') + Log.separator(msg='XPATH SEARCH HIT', log_level="debug") - found.append( + self.found.append( { 'url': murl, 'title': title, @@ -404,9 +397,11 @@ def doSearch(self, url, ctx): 'narrator': narrator } ) - self.log_separator('just after new xpath line') - for r in html.xpath('//div[contains (@class, "adbl-search-result")]'): + def after_xpath(self): + for r in self.html.xpath( + 
'//div[contains (@class, "adbl-search-result")]' + ): date = self.getDateFromString( self.getStringContentFromXPath( r, ( @@ -434,12 +429,12 @@ def doSearch(self, url, ctx): ) narrator = self.getStringContentFromXPath( r, u'div/div/ul/li[contains (., "{0}")]//a[1]'.format( - ctx['NAR_BY'] + self.ctx['NAR_BY'] ) ) - self.log_separator('XPATH SEARCH HIT') + Log.separator(msg='XPATH SEARCH HIT', log_level="debug") - found.append( + self.found.append( { 'url': murl, 'title': title, @@ -450,127 +445,107 @@ def doSearch(self, url, ctx): } ) - return found + def doSearch(self, url, ctx): + self.html = HTML.ElementFromURL(url, sleep=REQUEST_DELAY) + self.found = [] + self.ctx = ctx - def search(self, results, media, lang, manual): - ctx = SetupUrls(Prefs['sitetype'], Prefs['site'], lang) - LCL_IGNORE_SCORE = IGNORE_SCORE - self.log_separator('ALBUM SEARCH') + Log.separator(msg='just before new xpath line', log_level="debug") + self.before_xpath() + + Log.separator(msg='just after new xpath line', log_level="debug") + self.after_xpath() + + return self.found + + def pre_search(self): + Log.separator(msg='ALBUM SEARCH', log_level="info") # Log basic metadata data_to_log = [ - {'ID': media.parent_metadata.id}, - {'Title': media.title}, - {'Name': media.name}, - {'Album': media.album}, - {'Artist': media.artist}, + {'ID': self.media.parent_metadata.id}, + {'Title': self.media.title}, + {'Name': self.media.name}, + {'Album': self.media.album}, + {'Artist': self.media.artist}, ] - self.log_metadata(data_to_log) - self.log_separator() + Log.metadata(data_to_log) + Log.separator(log_level="info") # Handle a couple of edge cases where # album search will give bad results. - if media.album is None and not manual: - self.Log('Album Title is NULL on an automatic search. Returning') + if self.media.album is None and not self.manual: + Log.info('Album Title is NULL on an automatic search. 
Returning') return - if media.album == '[Unknown Album]' and not manual: - self.Log( + if self.media.album == '[Unknown Album]' and not self.manual: + Log.info( 'Album Title is [Unknown Album]' ' on an automatic search. Returning' ) return - if manual: - Log( + if self.manual: + Log.info( 'You clicked \'fix match\'. ' 'This may have returned no useful results because ' 'it\'s searching using the title of the first track.' ) - Log( + Log.info( 'There\'s not currently a way around this initial failure. ' 'But clicking \'Search Options\' and ' 'entering the title works just fine.' ) - Log( + Log.info( 'This message will appear during the initial ' 'search and the actual manual search.' ) # If this is a custom search, # use the user-entered name instead of the scanner hint. - if media.name: - Log( - 'Custom album search for: ' + media.name + if self.media.name: + Log.info( + 'Custom album search for: ' + self.media.name ) - media.album = media.name + self.media.album = self.media.name else: - Log('Album search: ' + media.title) + Log.info('Album search: ' + self.media.title) + def format_title(self): # Normalize the name - normalizedName = String.StripDiacritics(media.album) - if len(normalizedName) == 0: - normalizedName = media.album - Log( - 'normalizedName = %s', normalizedName + self.normalizedName = String.StripDiacritics( + self.media.album + ) + if len(self.normalizedName) == 0: + self.normalizedName = self.media.album + Log.debug( + 'normalizedName = %s', self.normalizedName ) # Chop off "unabridged" - normalizedName = re.sub(r"[\(\[].*?[\)\]]", "", normalizedName) - Log( - 'chopping bracketed text = %s', normalizedName + self.normalizedName = re.sub( + r"[\(\[].*?[\)\]]", "", self.normalizedName ) - normalizedName = normalizedName.strip() - Log( - 'normalizedName stripped = %s', normalizedName + Log.debug( + 'chopping bracketed text = %s', self.normalizedName ) - - self.Log( - '***** SEARCHING FOR "%s" - AUDIBLE v.%s *****', - normalizedName, VERSION_NO + 
self.normalizedName = self.normalizedName.strip() + Log.debug( + 'normalizedName stripped = %s', self.normalizedName ) - # Make the URL - if media.artist is not None: - searchUrl = ctx['AUD_SEARCH_URL'].format( - ( - String.Quote((normalizedName).encode('utf-8'), usePlus=True) - ), - ( - String.Quote((media.artist).encode('utf-8'), usePlus=True) - ) - ) - else: - searchUrl = ctx['AUD_KEYWORD_SEARCH_URL'] % ( - String.Quote((normalizedName).encode('utf-8'), usePlus=True) - ) - found = self.doSearch(searchUrl, ctx) - - # Write search result status to log - if len(found) == 0: - self.Log('No results found for query "%s"', normalizedName) - return - - self.Log( - 'Found %s result(s) for query "%s"', len(found), normalizedName + Log.info( + '***** SEARCHING FOR "%s" - AUDIBLE v.%s *****', + self.normalizedName, VERSION_NO ) - i = 1 - for f in found: - self.Log( - ' %s. (title) %s (author) %s (url)[%s]' - ' (date)(%s) (thumb){%s}', - i, f['title'], f['author'], - f['url'], str(f['date']), f['thumb'] - ) - i += 1 - self.log_separator() + def run_search(self): # Walk the found items and gather extended information info = [] i = 1 itemId_full = None itemId = None valid_itemId = None - for f in found: + for f in self.found: url = f['url'] - self.Log('URL For Breakdown: %s', url) + Log.debug('URL For Breakdown: %s', url) # Get the id for item in url.split('/'): @@ -587,10 +562,10 @@ def search(self, results, media, lang, manual): break if len(valid_itemId) == 0: - Log('No Match: %s', url) + Log.info('No Match: %s', url) continue - self.Log('* ID is %s', valid_itemId) + Log.debug('* ID is %s', valid_itemId) title = f['title'] thumb = f['thumb'] @@ -603,20 +578,20 @@ def search(self, results, media, lang, manual): year = date.year # Score the album name - scorebase1 = media.album + scorebase1 = self.media.album scorebase2 = title.encode('utf-8') - # self.Log('scorebase1: %s', scorebase1) - # self.Log('scorebase2: %s', scorebase2) + # Log.debug('scorebase1: %s', scorebase1) + # 
Log.debug('scorebase2: %s', scorebase2) score = INITIAL_SCORE - Util.LevenshteinDistance( scorebase1, scorebase2 ) - if media.artist: - scorebase3 = media.artist + if self.media.artist: + scorebase3 = self.media.artist scorebase4 = author - # self.Log('scorebase3: %s', scorebase3) - # self.Log('scorebase4: %s', scorebase4) + # Log.debug('scorebase3: %s', scorebase3) + # Log.debug('scorebase4: %s', scorebase4) score = INITIAL_SCORE - Util.LevenshteinDistance( scorebase3, scorebase4 ) @@ -630,9 +605,9 @@ def search(self, results, media, lang, manual): {'Score is': str(score)}, {'Thumb is': thumb}, ] - self.log_metadata(data_to_log) + Log.metadata(data_to_log, log_level="info") - if score >= LCL_IGNORE_SCORE: + if score >= self.LCL_IGNORE_SCORE: info.append( { 'id': valid_itemId, @@ -645,27 +620,83 @@ def search(self, results, media, lang, manual): } ) else: - self.Log( + Log.info( '# Score is below ignore boundary (%s)... Skipping!', - LCL_IGNORE_SCORE + self.LCL_IGNORE_SCORE ) - if i != len(found): - self.log_separator() + if i != len(self.found): + Log.separator() i += 1 info = sorted(info, key=lambda inf: inf['score'], reverse=True) + return info + + def search(self, results, media, lang, manual): + self.ctx = SetupUrls(Prefs['sitetype'], Prefs['site'], lang) + self.LCL_IGNORE_SCORE = IGNORE_SCORE + self.results = results + self.media = media + self.lang = lang + self.manual = manual + + self.pre_search() + + self.format_title() + + # Make the URL + if self.media.artist is not None: + searchUrl = self.ctx['AUD_SEARCH_URL'].format( + ( + String.Quote((self.normalizedName).encode('utf-8'), usePlus=True) + ), + ( + String.Quote((self.media.artist).encode('utf-8'), usePlus=True) + ) + ) + else: + searchUrl = self.ctx['AUD_KEYWORD_SEARCH_URL'] % ( + String.Quote((self.normalizedName).encode('utf-8'), usePlus=True) + ) + self.result = self.doSearch(searchUrl, self.ctx) + + # Write search result status to log + if len(self.result) == 0: + Log.info( + 'No results found 
for query "%s"', + self.normalizedName + ) + return + + Log.debug( + 'Found %s result(s) for query "%s"', + len(self.result), + self.normalizedName + ) + i = 1 + for f in self.result: + Log.debug( + ' %s. (title) %s (author) %s (url)[%s]' + ' (date)(%s) (thumb){%s}', + i, f['title'], f['author'], + f['url'], str(f['date']), f['thumb'] + ) + i += 1 + + Log.separator(log_level="info") + + info = self.run_search() # Output the final results. - self.log_separator() - self.Log('Final result:') + Log.separator(log_level="debug") + Log.debug('Final result:') i = 1 for r in info: description = '\"%s\" by %s [%s]' % ( r['title'], r['artist'], r['year'] ) - self.Log( + Log.debug( ' [%s] %s. %s (%s) %s {%s} [%s]', r['score'], i, r['title'], r['year'], r['artist'], r['id'], r['thumb'] @@ -684,273 +715,196 @@ def search(self, results, media, lang, manual): # and this one has a score that is >= GOOD SCORE, # then ignore the rest of the results if not manual and len(info) > 1 and r['score'] >= GOOD_SCORE: - self.Log( + Log.info( ' *** The score for these results are great, ' 'so we will use them, and ignore the rest. 
***' ) break i += 1 - def update(self, metadata, media, lang, force=False): - self.Log( - '***** UPDATING "%s" ID: %s - AUDIBLE v.%s *****', - media.title, metadata.id, VERSION_NO - ) - ctx = SetupUrls(Prefs['sitetype'], Prefs['site'], lang) - - # Make url - url = ctx['AUD_BOOK_INFO'] % metadata.id + def use_copyright_date(self): + cstring = None - try: - html = HTML.ElementFromURL(url, sleep=REQUEST_DELAY) - except NetworkError: - pass + for r in self.html.xpath(u'//span[contains(text(), "\xA9")]'): + cstring = self.getStringContentFromXPath( + r, u'normalize-space(//span[contains(text(), "\xA9")])' + ) + # only contains Audible copyright + if cstring.startswith(u"\xA9 "): + cstring = "" + date = date[:4] + + if cstring: + if "Public Domain" in cstring: + date = re.match(".*\(P\)(\d{4})", cstring).group(1) + else: + if cstring.startswith(u'\xA9'): + cstring = cstring[1:] + if "(P)" in cstring: + cstring = re.match("(.*)\(P\).*", cstring).group(1) + if ";" in cstring: + date = str( + min( + [int(i) for i in cstring.split() if i.isdigit()] + ) + ) + else: + date = re.match(".?(\d{4}).*", cstring).group(1) - date = None - rating = None - series = '' - series2 = '' - series_def = '' - genre_parent = None - genre_child = None - volume = '' - volume2 = '' - volume_def = '' - - for r in html.xpath('//div[contains (@id, "adbl_page_content")]'): - date = self.getDateFromString( + def update_scrape(self): + for r in self.html.xpath('//div[contains (@id, "adbl_page_content")]'): + self.date = self.getDateFromString( self.getStringContentFromXPath( r, u'//li[contains (., "{0}")]/span[2]//text()'.format( - ctx['REL_DATE_INFO'] + self.ctx['REL_DATE_INFO'] ) ) ) - title = self.getStringContentFromXPath( + self.title = self.getStringContentFromXPath( r, '//h1[contains (@class, "adbl-prod-h1-title")]/text()' ) - murl = self.getAnchorUrlFromXPath( + self.murl = self.getAnchorUrlFromXPath( r, 'div/div/div/div/a[1]' ) - thumb = self.getImageUrlFromXPath( + self.thumb = 
self.getImageUrlFromXPath( r, 'div/div/div/div/div/img' ) - author = self.getStringContentFromXPath( + self.author = self.getStringContentFromXPath( r, '//li//a[contains (@class,"author-profile-link")][1]' ) - narrator = self.getStringContentFromXPath( + self.narrator = self.getStringContentFromXPath( r, '//li[contains (., "{0}")]//span[2]'.format( - ctx['NAR_BY_INFO'] + self.ctx['NAR_BY_INFO'] ) ).strip().decode('utf-8') - studio = self.getStringContentFromXPath( + self.studio = self.getStringContentFromXPath( r, '//li//a[contains (@id,"PublisherSearchLink")][1]' ) - synopsis = self.getStringContentFromXPath( + self.synopsis = self.getStringContentFromXPath( r, '//div[contains (@class, "disc-summary")]/div[*]' ).strip() - series = self.getStringContentFromXPath( + self.series = self.getStringContentFromXPath( r, '//div[contains (@class, "adbl-series-link")]//a[1]' ) - genre_parent = self.getStringContentFromXPath( + self.genre_parent = self.getStringContentFromXPath( r, ( '//div[contains(@class,"adbl-pd-breadcrumb")]' '/div[2]/a/span/text()' ) ) - genre_child = self.getStringContentFromXPath( + self.genre_child = self.getStringContentFromXPath( r, ( '//div[contains(@class,"adbl-pd-breadcrumb")]' '/div[3]/a/span/text()' ) ) - self.log_separator('XPATH SEARCH HIT') - - if date is None: - for r in html.xpath( - '//script[contains (@type, "application/ld+json")]' - ): - page_content = r.text_content() - page_content = page_content.replace('\n', '') - # Remove any backslashes that aren't - # escaping a character JSON needs escaped - remove_inv_json_esc = re.compile( - r'([^\\])(\\(?![bfnrt\'\"\\/]|u[A-Fa-f0-9]{4}))' - ) - page_content = remove_inv_json_esc.sub(r'\1\\\2', page_content) - self.Log(page_content) - json_data = json_decode(page_content) - for json_data in json_data: - if 'datePublished' in json_data: - date = json_data['datePublished'] - title = json_data['name'] - thumb = json_data['image'] - # Set rating when available - if 'aggregateRating' in json_data: 
- rating = ( - json_data['aggregateRating']['ratingValue'] - ) - author = '' - counter = 0 - for c in json_data['author']: - counter += 1 - if counter > 1: - author += ', ' - author += c['name'] - narrator = '' - counter = 0 - for c in json_data['readBy']: - counter += 1 - if counter > 1: - narrator += ',' - narrator += c['name'] - studio = json_data['publisher'] - synopsis = json_data['description'] - if 'itemListElement' in json_data: - genre_parent = ( - json_data['itemListElement'][1]['item']['name'] + Log.separator(msg='XPATH SEARCH HIT') + + def date_missing(self): + for r in self.html.xpath( + '//script[contains (@type, "application/ld+json")]' + ): + page_content = r.text_content() + page_content = page_content.replace('\n', '') + # Remove any backslashes that aren't + # escaping a character JSON needs escaped + remove_inv_json_esc = re.compile( + r'([^\\])(\\(?![bfnrt\'\"\\/]|u[A-Fa-f0-9]{4}))' + ) + page_content = remove_inv_json_esc.sub(r'\1\\\2', page_content) + Log.debug(page_content) + json_data = json_decode(page_content) + for json_data in json_data: + if 'datePublished' in json_data: + self.date = json_data['datePublished'] + self.title = json_data['name'] + self.thumb = json_data['image'] + # Set rating when available + if 'aggregateRating' in json_data: + self.rating = ( + json_data['aggregateRating']['ratingValue'] ) - try: - genre_child = ( - json_data['itemListElement'][2]['item']['name'] - ) - except: - continue - - # prefer copyright year over datePublished - if Prefs['copyyear']: - cstring = None - - for r in html.xpath(u'//span[contains(text(), "\xA9")]'): - cstring = self.getStringContentFromXPath( - r, u'normalize-space(//span[contains(text(), "\xA9")])' + self.author = '' + counter = 0 + for c in json_data['author']: + counter += 1 + if counter > 1: + self.author += ', ' + self.author += c['name'] + self.narrator = '' + counter = 0 + for c in json_data['readBy']: + counter += 1 + if counter > 1: + self.narrator += ',' + self.narrator += 
c['name'] + self.studio = json_data['publisher'] + self.synopsis = json_data['description'] + if 'itemListElement' in json_data: + self.genre_parent = ( + json_data['itemListElement'][1]['item']['name'] ) - # only contains Audible copyright - if cstring.startswith(u"\xA9 "): - cstring = "" - date = date[:4] - - if cstring: - if "Public Domain" in cstring: - date = re.match(".*\(P\)(\d{4})", cstring).group(1) - else: - if cstring.startswith(u'\xA9'): - cstring = cstring[1:] - if "(P)" in cstring: - cstring = re.match("(.*)\(P\).*", cstring).group(1) - if ";" in cstring: - date = str( - min( - [int(i) for i in cstring.split() if i.isdigit()] - ) - ) - else: - date = re.match(".?(\d{4}).*", cstring).group(1) - - date = self.getDateFromString(date) - - for r in html.xpath('//span[contains(@class, "seriesLabel")]'): - series = self.getStringContentFromXPath( - r, '//li[contains(@class, "seriesLabel")]//a[1]' - ) - series2 = self.getStringContentFromXPath( - r, '//li[contains(@class, "seriesLabel")]//a[2]' - ) - - series_def = series2 if series2 else series - - volume = self.getStringContentFromXPath( - r, '//li[contains(@class, "seriesLabel")]/text()[2]' - ).strip() - if volume == ",": - volume = "" - volume2 = self.getStringContentFromXPath( - r, '//li[contains(@class, "seriesLabel")]/text()[3]' - ).strip() - if volume2 == ",": - volume2 = "" - - volume_def = volume2 if volume2 else volume + try: + self.genre_child = ( + json_data['itemListElement'][2]['item']['name'] + ) + except: + continue - # fix series when audible 'forgets' the series link… - if not series_def: - for r in html.xpath('//div[contains(@class, "adbl-main")]'): - subtitle = self.getStringContentFromXPath( - r, 'normalize-space(//li[contains' - '(@class, "authorLabel")]' - '//preceding::li[1]//span//text())' - ).strip() + def handle_series(self): + for r in self.html.xpath('//span[contains(@class, "seriesLabel")]'): + self.series = self.getStringContentFromXPath( + r, '//li[contains(@class, 
"seriesLabel")]//a[1]' + ) + self.series2 = self.getStringContentFromXPath( + r, '//li[contains(@class, "seriesLabel")]//a[2]' + ) - w = re.match("(.*)(, Book \d+)", subtitle) - if not series_def and w: - series_def = w.group(1) - volume_def = w.group(2) + self.series_def = self.series2 if self.series2 else self.series - # cleanup synopsis - synopsis = ( - synopsis.replace("", "") - .replace("", "") - .replace("", "") - .replace("", "") - .replace("", "") - .replace("", "") - .replace("", "") - .replace("", "") - .replace("", "") - .replace("", "") - .replace("
      ", "") - .replace("
    ", "\n") - .replace("
      ", "") - .replace("
    ", "\n") - .replace("
  • ", " • ") - .replace("
  • ", "\n") - .replace("
    ", "") - .replace("

    ", "") - .replace("

    ", "\n") - ) + self.volume = self.getStringContentFromXPath( + r, '//li[contains(@class, "seriesLabel")]/text()[2]' + ).strip() + if self.volume == ",": + self.volume = "" + self.volume2 = self.getStringContentFromXPath( + r, '//li[contains(@class, "seriesLabel")]/text()[3]' + ).strip() + if self.volume2 == ",": + self.volume2 = "" + + self.volume_def = self.volume2 if self.volume2 else self.volume + + # fix series when audible 'forgets' the series link… + if not self.series_def: + for r in self.html.xpath('//div[contains(@class, "adbl-main")]'): + self.subtitle = self.getStringContentFromXPath( + r, 'normalize-space(//li[contains' + '(@class, "authorLabel")]' + '//preceding::li[1]//span//text())' + ).strip() - # Setup logging of all data in the array - type_arr = [ - {'date': date}, - {'title': title}, - {'author': author}, - {'series': series}, - {'narrator': narrator}, - {'studio': studio}, - {'thumb': thumb}, - {'rating': rating}, - {'genres': genre_parent + ', ' + genre_child}, - {'synopsis': synopsis}, - {'volume': volume}, - {'series2': series2}, - {'volume2': volume2}, - {'series def': series_def}, - {'volume def': volume_def}, - ] - # Loop through dicts in array - for log_type in type_arr: - for key, val in log_type.items(): - if val: - self.Log("{key:<15}{val}".format( - key=key, - val=val - ) - ) + w = re.match("(.*)(, Book \d+)", self.subtitle) + if not self.series_def and w: + self.series_def = w.group(1) + self.volume_def = w.group(2) + def compile_metadata(self): # Set the date and year if found. 
- if date is not None: - metadata.originally_available_at = date + if self.date is not None: + self.metadata.originally_available_at = self.date # Add the genres - metadata.genres.clear() - metadata.genres.add(genre_parent) - metadata.genres.add(genre_child) + self.metadata.genres.clear() + self.metadata.genres.add(self.genre_parent) + self.metadata.genres.add(self.genre_child) # Add Narrators to Styles - narrators_list = narrator.split(",") + narrators_list = self.narrator.split(",") narr_contributors_list = [ 'full cast' ] - metadata.styles.clear() + self.metadata.styles.clear() # Loop through narrators to check if it has contributor wording for narrator in narrators_list: if not [ @@ -958,10 +912,10 @@ def update(self, metadata, media, lang, force=False): contrib in narrator.lower() ) ]: - metadata.styles.add(narrator.strip()) + self.metadata.styles.add(narrator.strip()) # Add Authors to Moods - author_list = author.split(",") + author_list = self.author.split(",") author_contributers_list = [ 'contributor', 'translator', @@ -969,7 +923,7 @@ def update(self, metadata, media, lang, force=False): 'translated', 'full cast', ] - metadata.moods.clear() + self.metadata.moods.clear() # Loop through authors to check if it has contributor wording for author in author_list: if not [ @@ -977,10 +931,10 @@ def update(self, metadata, media, lang, force=False): contrib in author.lower() ) ]: - metadata.moods.add(author.strip()) + self.metadata.moods.add(author.strip()) # Clean series - x = re.match("(.*)(: A .* Series)", series_def) + x = re.match("(.*)(: A .* Series)", self.series_def) if x: series_def = x.group(1) @@ -992,40 +946,125 @@ def update(self, metadata, media, lang, force=False): seriesshort = series_def[:-len(checkseries)] y = re.match( - "(.*)((: .* " + volume_def[2:] + ": A .* Series)|" - "(((:|,|-) )((" + seriesshort + volume_def + ")|" - "((?", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("", 
"") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("
      ", "") + .replace("
    ", "\n") + .replace("
      ", "") + .replace("
    ", "\n") + .replace("
  • ", " • ") + .replace("
  • ", "\n") + .replace("
    ", "") + .replace("

    ", "") + .replace("

    ", "\n") + ) + + # Setup logging of all data in the array + data_to_log = [ + {'date': self.date}, + {'title': self.title}, + {'author': self.author}, + {'series': self.series}, + {'narrator': self.narrator}, + {'studio': self.studio}, + {'thumb': self.thumb}, + {'rating': self.rating}, + {'genres': self.genre_parent + ', ' + self.genre_child}, + {'synopsis': self.synopsis}, + {'volume': self.volume}, + {'series2': self.series2}, + {'volume2': self.volume2}, + {'series def': self.series_def}, + {'volume def': self.volume_def}, + ] + Log.metadata(data_to_log, log_level="debug") + + self.compile_metadata() def hasProxy(self): return Prefs['imageproxyurl'] is not None @@ -1040,7 +1079,7 @@ def worker(self, queue, stoprequest): try: func(*args, **kargs) except Exception as e: - self.Log(e) + Log.info(e) queue.task_done() except Queue.Empty: continue @@ -1048,31 +1087,9 @@ def worker(self, queue, stoprequest): def addTask(self, queue, func, *args, **kargs): queue.put((func, args, kargs)) - # Prints a bunch of divider chars like --- - def log_separator(self, msg=None): - divider = "-" * 35 - output = divider + divider - # Override output with message if passed - if msg: - output = divider + msg + divider - - return self.Log(output) - - def log_metadata(self, dict_arr): - # Loop through dicts in array - for log_type in dict_arr: - # Loop through each key/value - for key, val in log_type.items(): - if val: - self.Log("{key:<15}{val}".format( - key=key, - val=val - ) - ) - # Writes metadata information to log. 
def writeInfo(self, header, url, metadata): - self.log_separator(header) + Log.separator(msg=header, log_level="info") # Log basic metadata data_to_log = [ @@ -1083,7 +1100,7 @@ def writeInfo(self, header, url, metadata): {'Studio': metadata.studio}, {'Summary': metadata.summary}, ] - self.log_metadata(data_to_log) + Log.metadata(data_to_log, log_level="info") # Log basic metadata stored in arrays multi_arr = [ @@ -1094,20 +1111,9 @@ def writeInfo(self, header, url, metadata): {'Poster URL', metadata.posters}, {'Fan art URL', metadata.art}, ] - # Loop through dicts in array - for log_type in multi_arr: - # Loop through each key/value - for key, val in log_type.items(): - if val: - # Loop through dict's array - for item in val: - self.Log("{key:<15}{val}".format( - key=key, - val=item - ) - ) + Log.metadata_arrs(multi_arr, log_level="info") - self.log_separator() + Log.separator(log_level="info") def safe_unicode(s, encoding='utf-8'): diff --git a/Contents/Code/logging.py b/Contents/Code/logging.py new file mode 100644 index 0000000..c3d6f94 --- /dev/null +++ b/Contents/Code/logging.py @@ -0,0 +1,57 @@ +class Logging: + # Only prints message with debug mode + def debug(self, message, *args): + if Prefs['debug']: + return Log(message, *args) + + # Prints any message you give + def info(self, message, *args): + return Log(message, *args) + + # For the below logging: + # Default level is info + # Set debug by calling ('sometext', 'debug') + + # Prints a bunch of divider chars like --- + def separator(self, msg=None, log_level="info"): + divider = "-" * 35 + output = divider + divider + # Override output with message if passed + if msg: + output = divider + msg + divider + + if log_level.lower() == "debug": + return self.debug(output) + return self.info(output) + + # Loops through array of dictionaries and logs them + def metadata(self, dict_arr, log_level="info"): + # Loop through dicts in array + for log_type in dict_arr: + # Loop through each key/value + for key, val 
in log_type.items(): + if val: + output = "{key:<15}{val}".format( + key=key, + val=val + ) + if log_level.lower() == "debug": + return self.debug(output) + return self.info(output) + + def metadata_arrs(self, dict_arr, log_level="info"): + # Loop through dicts in array + for log_type in dict_arr: + # Loop through each key/value + for key, val in log_type.items(): + if val: + # Loop through dict's array + for item in val: + output = ("{key:<15}{val}".format( + key=key, + val=item + ) + ) + if log_level.lower() == "debug": + return self.debug(output) + return self.info(output) From 1ab09c2db203cd3680fdfc2ac7352967e999f6d3 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 17:30:19 -0500 Subject: [PATCH 10/30] Remove unecessary else --- Contents/Code/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 46e1436..d11b182 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -1122,6 +1122,5 @@ def safe_unicode(s, encoding='utf-8'): if isinstance(s, basestring): if isinstance(s, types.UnicodeType): return s - else: - return s.decode(encoding) + return s.decode(encoding) return str(s).decode(encoding) From c313eae0fa2825e4743112d7f3eda79c1afa9b1b Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 17:41:45 -0500 Subject: [PATCH 11/30] Add exception types --- Contents/Code/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index d11b182..85e764e 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -12,7 +12,7 @@ def json_decode(output): try: return json.loads(output, encoding="utf-8") - except: + except AttributeError: return None @@ -214,7 +214,7 @@ class AudiobookArtist(Agent.Artist): def getDateFromString(self, string): try: return Datetime.ParseDate(string).date() - except: + except AttributeError: return None def 
getStringContentFromXPath(self, source, query): @@ -320,7 +320,7 @@ class AudiobookAlbum(Agent.Album): def getDateFromString(self, string): try: return Datetime.ParseDate(string).date() - except: + except AttributeError: return None def getStringContentFromXPath(self, source, query): @@ -848,7 +848,7 @@ def date_missing(self): self.genre_child = ( json_data['itemListElement'][2]['item']['name'] ) - except: + except AttributeError: continue def handle_series(self): From a71dc941d30f8d93038b2df0e550bac3cd1aa5de Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 17:52:33 -0500 Subject: [PATCH 12/30] Use join instead of concat --- Contents/Code/__init__.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 85e764e..4e7f0f1 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -824,20 +824,15 @@ def date_missing(self): self.rating = ( json_data['aggregateRating']['ratingValue'] ) - self.author = '' - counter = 0 + author_array = [] for c in json_data['author']: - counter += 1 - if counter > 1: - self.author += ', ' - self.author += c['name'] - self.narrator = '' - counter = 0 + author_array.append(c['name']) + self.author = ",".join(author_array) + + narrator_array = [] for c in json_data['readBy']: - counter += 1 - if counter > 1: - self.narrator += ',' - self.narrator += c['name'] + narrator_array.append(c['name']) + self.narrator = ",".join(narrator_array) self.studio = json_data['publisher'] self.synopsis = json_data['description'] if 'itemListElement' in json_data: From 8dea1aaea489c38580ee3d2fa47355f2ed3c63a9 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 17:56:13 -0500 Subject: [PATCH 13/30] Fix conflicting logkit name --- Contents/Code/__init__.py | 124 +++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 
4e7f0f1..1d6ed87 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -4,7 +4,7 @@ import re import types -from logging import Log +from logging import Logging import Queue @@ -73,13 +73,13 @@ def json_decode(output): def SetupUrls(sitetype, base, lang='en'): - Log.debug('Library/Search language is : %s', lang) + Logging.debug('Library/Search language is : %s', lang) ctx = dict() if sitetype: - Log.debug('Manual Site Selection Enabled : %s', base) - Log.debug('Language being ignored due to manual site selection') + Logging.debug('Manual Site Selection Enabled : %s', base) + Logging.debug('Language being ignored due to manual site selection') if base in sites_langs: - Log.debug('Pulling language from sites array') + Logging.debug('Pulling language from sites array') lang = sites_langs[base]['lang'] if lang in intl_sites: base = intl_sites[lang]['url'] @@ -99,35 +99,35 @@ def SetupUrls(sitetype, base, lang='en'): ctx['REL_DATE_INFO'] = ctx['REL_DATE'] ctx['NAR_BY'] = 'Narrated By' ctx['NAR_BY_INFO'] = 'Narrated by' - Log.debug( + Logging.debug( 'Sites language is : %s', lang ) - Log.debug( + Logging.debug( '/************************************' 'LANG DEBUGGING' '************************************/' ) - Log.debug( + Logging.debug( '/* REL_DATE = %s', ctx['REL_DATE'] ) - Log.debug( + Logging.debug( '/* REL_DATE_INFO = %s', ctx['REL_DATE_INFO'] ) - Log.debug( + Logging.debug( '/* NAR_BY = %s', ctx['NAR_BY'] ) - Log.debug( + Logging.debug( '/* NAR_BY_INFO = %s', ctx['NAR_BY_INFO'] ) - Log.debug( + Logging.debug( '/****************************************' '****************************************/' ) else: - Log.debug( + Logging.debug( 'Audible site will be chosen by library language' ) - Log.debug( + Logging.debug( 'Library Language is %s', lang ) if base is None: @@ -268,19 +268,19 @@ def search(self, results, media, lang, manual=False): # author source is identified. 
# Log some stuff - Log.separator(msg='ARTIST SEARCH', log_level='debug') - Log.debug( + Logging.separator(msg='ARTIST SEARCH', log_level='debug') + Logging.debug( '* Album: %s', media.album ) - Log.debug( + Logging.debug( '* Artist: %s', media.artist ) - Log.debug( + Logging.debug( '****************************************' 'Not Ready For Artist Search Yet' '****************************************' ) - Log.separator(log_level='debug') + Logging.separator(log_level='debug') def hasProxy(self): return Prefs['imageproxyurl'] is not None @@ -295,7 +295,7 @@ def worker(self, queue, stoprequest): try: func(*args, **kargs) except Exception as e: - Log.info(e) + Logging.info(e) queue.task_done() except Queue.Empty: continue @@ -385,7 +385,7 @@ def before_xpath(self): '[contains (@class,"narratorLabel")]/span//a[1]' ).format(ctx['NAR_BY']) ) - Log.separator(msg='XPATH SEARCH HIT', log_level="debug") + Logging.separator(msg='XPATH SEARCH HIT', log_level="debug") self.found.append( { @@ -432,7 +432,7 @@ def after_xpath(self): self.ctx['NAR_BY'] ) ) - Log.separator(msg='XPATH SEARCH HIT', log_level="debug") + Logging.separator(msg='XPATH SEARCH HIT', log_level="debug") self.found.append( { @@ -450,16 +450,16 @@ def doSearch(self, url, ctx): self.found = [] self.ctx = ctx - Log.separator(msg='just before new xpath line', log_level="debug") + Logging.separator(msg='just before new xpath line', log_level="debug") self.before_xpath() - Log.separator(msg='just after new xpath line', log_level="debug") + Logging.separator(msg='just after new xpath line', log_level="debug") self.after_xpath() return self.found def pre_search(self): - Log.separator(msg='ALBUM SEARCH', log_level="info") + Logging.separator(msg='ALBUM SEARCH', log_level="info") # Log basic metadata data_to_log = [ {'ID': self.media.parent_metadata.id}, @@ -468,45 +468,45 @@ def pre_search(self): {'Album': self.media.album}, {'Artist': self.media.artist}, ] - Log.metadata(data_to_log) - Log.separator(log_level="info") + 
Logging.metadata(data_to_log) + Logging.separator(log_level="info") # Handle a couple of edge cases where # album search will give bad results. if self.media.album is None and not self.manual: - Log.info('Album Title is NULL on an automatic search. Returning') + Logging.info('Album Title is NULL on an automatic search. Returning') return if self.media.album == '[Unknown Album]' and not self.manual: - Log.info( + Logging.info( 'Album Title is [Unknown Album]' ' on an automatic search. Returning' ) return if self.manual: - Log.info( + Logging.info( 'You clicked \'fix match\'. ' 'This may have returned no useful results because ' 'it\'s searching using the title of the first track.' ) - Log.info( + Logging.info( 'There\'s not currently a way around this initial failure. ' 'But clicking \'Search Options\' and ' 'entering the title works just fine.' ) - Log.info( + Logging.info( 'This message will appear during the initial ' 'search and the actual manual search.' ) # If this is a custom search, # use the user-entered name instead of the scanner hint. 
if self.media.name: - Log.info( + Logging.info( 'Custom album search for: ' + self.media.name ) self.media.album = self.media.name else: - Log.info('Album search: ' + self.media.title) + Logging.info('Album search: ' + self.media.title) def format_title(self): # Normalize the name @@ -515,7 +515,7 @@ def format_title(self): ) if len(self.normalizedName) == 0: self.normalizedName = self.media.album - Log.debug( + Logging.debug( 'normalizedName = %s', self.normalizedName ) @@ -523,15 +523,15 @@ def format_title(self): self.normalizedName = re.sub( r"[\(\[].*?[\)\]]", "", self.normalizedName ) - Log.debug( + Logging.debug( 'chopping bracketed text = %s', self.normalizedName ) self.normalizedName = self.normalizedName.strip() - Log.debug( + Logging.debug( 'normalizedName stripped = %s', self.normalizedName ) - Log.info( + Logging.info( '***** SEARCHING FOR "%s" - AUDIBLE v.%s *****', self.normalizedName, VERSION_NO ) @@ -545,7 +545,7 @@ def run_search(self): valid_itemId = None for f in self.found: url = f['url'] - Log.debug('URL For Breakdown: %s', url) + Logging.debug('URL For Breakdown: %s', url) # Get the id for item in url.split('/'): @@ -562,10 +562,10 @@ def run_search(self): break if len(valid_itemId) == 0: - Log.info('No Match: %s', url) + Logging.info('No Match: %s', url) continue - Log.debug('* ID is %s', valid_itemId) + Logging.debug('* ID is %s', valid_itemId) title = f['title'] thumb = f['thumb'] @@ -580,8 +580,8 @@ def run_search(self): # Score the album name scorebase1 = self.media.album scorebase2 = title.encode('utf-8') - # Log.debug('scorebase1: %s', scorebase1) - # Log.debug('scorebase2: %s', scorebase2) + # Logging.debug('scorebase1: %s', scorebase1) + # Logging.debug('scorebase2: %s', scorebase2) score = INITIAL_SCORE - Util.LevenshteinDistance( scorebase1, scorebase2 @@ -590,8 +590,8 @@ def run_search(self): if self.media.artist: scorebase3 = self.media.artist scorebase4 = author - # Log.debug('scorebase3: %s', scorebase3) - # 
Log.debug('scorebase4: %s', scorebase4) + # Logging.debug('scorebase3: %s', scorebase3) + # Logging.debug('scorebase4: %s', scorebase4) score = INITIAL_SCORE - Util.LevenshteinDistance( scorebase3, scorebase4 ) @@ -605,7 +605,7 @@ def run_search(self): {'Score is': str(score)}, {'Thumb is': thumb}, ] - Log.metadata(data_to_log, log_level="info") + Logging.metadata(data_to_log, log_level="info") if score >= self.LCL_IGNORE_SCORE: info.append( @@ -620,13 +620,13 @@ def run_search(self): } ) else: - Log.info( + Logging.info( '# Score is below ignore boundary (%s)... Skipping!', self.LCL_IGNORE_SCORE ) if i != len(self.found): - Log.separator() + Logging.separator() i += 1 @@ -663,20 +663,20 @@ def search(self, results, media, lang, manual): # Write search result status to log if len(self.result) == 0: - Log.info( + Logging.info( 'No results found for query "%s"', self.normalizedName ) return - Log.debug( + Logging.debug( 'Found %s result(s) for query "%s"', len(self.result), self.normalizedName ) i = 1 for f in self.result: - Log.debug( + Logging.debug( ' %s. (title) %s (author) %s (url)[%s]' ' (date)(%s) (thumb){%s}', i, f['title'], f['author'], @@ -684,19 +684,19 @@ def search(self, results, media, lang, manual): ) i += 1 - Log.separator(log_level="info") + Logging.separator(log_level="info") info = self.run_search() # Output the final results. - Log.separator(log_level="debug") - Log.debug('Final result:') + Logging.separator(log_level="debug") + Logging.debug('Final result:') i = 1 for r in info: description = '\"%s\" by %s [%s]' % ( r['title'], r['artist'], r['year'] ) - Log.debug( + Logging.debug( ' [%s] %s. 
%s (%s) %s {%s} [%s]', r['score'], i, r['title'], r['year'], r['artist'], r['id'], r['thumb'] @@ -715,7 +715,7 @@ def search(self, results, media, lang, manual): # and this one has a score that is >= GOOD SCORE, # then ignore the rest of the results if not manual and len(info) > 1 and r['score'] >= GOOD_SCORE: - Log.info( + Logging.info( ' *** The score for these results are great, ' 'so we will use them, and ignore the rest. ***' ) @@ -798,7 +798,7 @@ def update_scrape(self): '/div[3]/a/span/text()' ) ) - Log.separator(msg='XPATH SEARCH HIT') + Logging.separator(msg='XPATH SEARCH HIT') def date_missing(self): for r in self.html.xpath( @@ -812,7 +812,7 @@ def date_missing(self): r'([^\\])(\\(?![bfnrt\'\"\\/]|u[A-Fa-f0-9]{4}))' ) page_content = remove_inv_json_esc.sub(r'\1\\\2', page_content) - Log.debug(page_content) + Logging.debug(page_content) json_data = json_decode(page_content) for json_data in json_data: if 'datePublished' in json_data: @@ -977,7 +977,7 @@ def compile_metadata(self): self.writeInfo('New data', self.url, self.metadata) def update(self, metadata, media, lang, force=False): - Log.debug( + Logging.debug( '***** UPDATING "%s" ID: %s - AUDIBLE v.%s *****', media.title, self.metadata.id, VERSION_NO ) @@ -1057,7 +1057,7 @@ def update(self, metadata, media, lang, force=False): {'series def': self.series_def}, {'volume def': self.volume_def}, ] - Log.metadata(data_to_log, log_level="debug") + Logging.metadata(data_to_log, log_level="debug") self.compile_metadata() @@ -1074,7 +1074,7 @@ def worker(self, queue, stoprequest): try: func(*args, **kargs) except Exception as e: - Log.info(e) + Logging.info(e) queue.task_done() except Queue.Empty: continue @@ -1084,7 +1084,7 @@ def addTask(self, queue, func, *args, **kargs): # Writes metadata information to log. 
def writeInfo(self, header, url, metadata): - Log.separator(msg=header, log_level="info") + Logging.separator(msg=header, log_level="info") # Log basic metadata data_to_log = [ From c0275ca083b39ca2b531c5a4dd082d3199ea5ecb Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 17:59:40 -0500 Subject: [PATCH 14/30] I try to do smart things sometimes --- Contents/Code/__init__.py | 127 +++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 63 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 1d6ed87..44432b0 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -3,9 +3,7 @@ import json import re import types - from logging import Logging - import Queue @@ -32,6 +30,9 @@ def json_decode(output): THREAD_MAX = 20 +# Setup logger +log = Logging() + intl_sites = { 'en': { 'url': 'www.audible.com', @@ -73,13 +74,13 @@ def json_decode(output): def SetupUrls(sitetype, base, lang='en'): - Logging.debug('Library/Search language is : %s', lang) + log.debug('Library/Search language is : %s', lang) ctx = dict() if sitetype: - Logging.debug('Manual Site Selection Enabled : %s', base) - Logging.debug('Language being ignored due to manual site selection') + log.debug('Manual Site Selection Enabled : %s', base) + log.debug('Language being ignored due to manual site selection') if base in sites_langs: - Logging.debug('Pulling language from sites array') + log.debug('Pulling language from sites array') lang = sites_langs[base]['lang'] if lang in intl_sites: base = intl_sites[lang]['url'] @@ -99,35 +100,35 @@ def SetupUrls(sitetype, base, lang='en'): ctx['REL_DATE_INFO'] = ctx['REL_DATE'] ctx['NAR_BY'] = 'Narrated By' ctx['NAR_BY_INFO'] = 'Narrated by' - Logging.debug( + log.debug( 'Sites language is : %s', lang ) - Logging.debug( + log.debug( '/************************************' 'LANG DEBUGGING' '************************************/' ) - Logging.debug( + log.debug( '/* REL_DATE = %s', ctx['REL_DATE'] 
) - Logging.debug( + log.debug( '/* REL_DATE_INFO = %s', ctx['REL_DATE_INFO'] ) - Logging.debug( + log.debug( '/* NAR_BY = %s', ctx['NAR_BY'] ) - Logging.debug( + log.debug( '/* NAR_BY_INFO = %s', ctx['NAR_BY_INFO'] ) - Logging.debug( + log.debug( '/****************************************' '****************************************/' ) else: - Logging.debug( + log.debug( 'Audible site will be chosen by library language' ) - Logging.debug( + log.debug( 'Library Language is %s', lang ) if base is None: @@ -268,19 +269,19 @@ def search(self, results, media, lang, manual=False): # author source is identified. # Log some stuff - Logging.separator(msg='ARTIST SEARCH', log_level='debug') - Logging.debug( + log.separator(msg='ARTIST SEARCH', log_level='debug') + log.debug( '* Album: %s', media.album ) - Logging.debug( + log.debug( '* Artist: %s', media.artist ) - Logging.debug( + log.debug( '****************************************' 'Not Ready For Artist Search Yet' '****************************************' ) - Logging.separator(log_level='debug') + log.separator(log_level='debug') def hasProxy(self): return Prefs['imageproxyurl'] is not None @@ -295,7 +296,7 @@ def worker(self, queue, stoprequest): try: func(*args, **kargs) except Exception as e: - Logging.info(e) + log.info(e) queue.task_done() except Queue.Empty: continue @@ -385,7 +386,7 @@ def before_xpath(self): '[contains (@class,"narratorLabel")]/span//a[1]' ).format(ctx['NAR_BY']) ) - Logging.separator(msg='XPATH SEARCH HIT', log_level="debug") + log.separator(msg='XPATH SEARCH HIT', log_level="debug") self.found.append( { @@ -432,7 +433,7 @@ def after_xpath(self): self.ctx['NAR_BY'] ) ) - Logging.separator(msg='XPATH SEARCH HIT', log_level="debug") + log.separator(msg='XPATH SEARCH HIT', log_level="debug") self.found.append( { @@ -450,16 +451,16 @@ def doSearch(self, url, ctx): self.found = [] self.ctx = ctx - Logging.separator(msg='just before new xpath line', log_level="debug") + log.separator(msg='just before 
new xpath line', log_level="debug") self.before_xpath() - Logging.separator(msg='just after new xpath line', log_level="debug") + log.separator(msg='just after new xpath line', log_level="debug") self.after_xpath() return self.found def pre_search(self): - Logging.separator(msg='ALBUM SEARCH', log_level="info") + log.separator(msg='ALBUM SEARCH', log_level="info") # Log basic metadata data_to_log = [ {'ID': self.media.parent_metadata.id}, @@ -468,45 +469,45 @@ def pre_search(self): {'Album': self.media.album}, {'Artist': self.media.artist}, ] - Logging.metadata(data_to_log) - Logging.separator(log_level="info") + log.metadata(data_to_log) + log.separator(log_level="info") # Handle a couple of edge cases where # album search will give bad results. if self.media.album is None and not self.manual: - Logging.info('Album Title is NULL on an automatic search. Returning') + log.info('Album Title is NULL on an automatic search. Returning') return if self.media.album == '[Unknown Album]' and not self.manual: - Logging.info( + log.info( 'Album Title is [Unknown Album]' ' on an automatic search. Returning' ) return if self.manual: - Logging.info( + log.info( 'You clicked \'fix match\'. ' 'This may have returned no useful results because ' 'it\'s searching using the title of the first track.' ) - Logging.info( + log.info( 'There\'s not currently a way around this initial failure. ' 'But clicking \'Search Options\' and ' 'entering the title works just fine.' ) - Logging.info( + log.info( 'This message will appear during the initial ' 'search and the actual manual search.' ) # If this is a custom search, # use the user-entered name instead of the scanner hint. 
if self.media.name: - Logging.info( + log.info( 'Custom album search for: ' + self.media.name ) self.media.album = self.media.name else: - Logging.info('Album search: ' + self.media.title) + log.info('Album search: ' + self.media.title) def format_title(self): # Normalize the name @@ -515,7 +516,7 @@ def format_title(self): ) if len(self.normalizedName) == 0: self.normalizedName = self.media.album - Logging.debug( + log.debug( 'normalizedName = %s', self.normalizedName ) @@ -523,15 +524,15 @@ def format_title(self): self.normalizedName = re.sub( r"[\(\[].*?[\)\]]", "", self.normalizedName ) - Logging.debug( + log.debug( 'chopping bracketed text = %s', self.normalizedName ) self.normalizedName = self.normalizedName.strip() - Logging.debug( + log.debug( 'normalizedName stripped = %s', self.normalizedName ) - Logging.info( + log.info( '***** SEARCHING FOR "%s" - AUDIBLE v.%s *****', self.normalizedName, VERSION_NO ) @@ -545,7 +546,7 @@ def run_search(self): valid_itemId = None for f in self.found: url = f['url'] - Logging.debug('URL For Breakdown: %s', url) + log.debug('URL For Breakdown: %s', url) # Get the id for item in url.split('/'): @@ -562,10 +563,10 @@ def run_search(self): break if len(valid_itemId) == 0: - Logging.info('No Match: %s', url) + log.info('No Match: %s', url) continue - Logging.debug('* ID is %s', valid_itemId) + log.debug('* ID is %s', valid_itemId) title = f['title'] thumb = f['thumb'] @@ -580,8 +581,8 @@ def run_search(self): # Score the album name scorebase1 = self.media.album scorebase2 = title.encode('utf-8') - # Logging.debug('scorebase1: %s', scorebase1) - # Logging.debug('scorebase2: %s', scorebase2) + # log.debug('scorebase1: %s', scorebase1) + # log.debug('scorebase2: %s', scorebase2) score = INITIAL_SCORE - Util.LevenshteinDistance( scorebase1, scorebase2 @@ -590,8 +591,8 @@ def run_search(self): if self.media.artist: scorebase3 = self.media.artist scorebase4 = author - # Logging.debug('scorebase3: %s', scorebase3) - # 
Logging.debug('scorebase4: %s', scorebase4) + # log.debug('scorebase3: %s', scorebase3) + # log.debug('scorebase4: %s', scorebase4) score = INITIAL_SCORE - Util.LevenshteinDistance( scorebase3, scorebase4 ) @@ -605,7 +606,7 @@ def run_search(self): {'Score is': str(score)}, {'Thumb is': thumb}, ] - Logging.metadata(data_to_log, log_level="info") + log.metadata(data_to_log, log_level="info") if score >= self.LCL_IGNORE_SCORE: info.append( @@ -620,13 +621,13 @@ def run_search(self): } ) else: - Logging.info( + log.info( '# Score is below ignore boundary (%s)... Skipping!', self.LCL_IGNORE_SCORE ) if i != len(self.found): - Logging.separator() + log.separator() i += 1 @@ -663,20 +664,20 @@ def search(self, results, media, lang, manual): # Write search result status to log if len(self.result) == 0: - Logging.info( + log.info( 'No results found for query "%s"', self.normalizedName ) return - Logging.debug( + log.debug( 'Found %s result(s) for query "%s"', len(self.result), self.normalizedName ) i = 1 for f in self.result: - Logging.debug( + log.debug( ' %s. (title) %s (author) %s (url)[%s]' ' (date)(%s) (thumb){%s}', i, f['title'], f['author'], @@ -684,19 +685,19 @@ def search(self, results, media, lang, manual): ) i += 1 - Logging.separator(log_level="info") + log.separator(log_level="info") info = self.run_search() # Output the final results. - Logging.separator(log_level="debug") - Logging.debug('Final result:') + log.separator(log_level="debug") + log.debug('Final result:') i = 1 for r in info: description = '\"%s\" by %s [%s]' % ( r['title'], r['artist'], r['year'] ) - Logging.debug( + log.debug( ' [%s] %s. 
%s (%s) %s {%s} [%s]', r['score'], i, r['title'], r['year'], r['artist'], r['id'], r['thumb'] @@ -715,7 +716,7 @@ def search(self, results, media, lang, manual): # and this one has a score that is >= GOOD SCORE, # then ignore the rest of the results if not manual and len(info) > 1 and r['score'] >= GOOD_SCORE: - Logging.info( + log.info( ' *** The score for these results are great, ' 'so we will use them, and ignore the rest. ***' ) @@ -798,7 +799,7 @@ def update_scrape(self): '/div[3]/a/span/text()' ) ) - Logging.separator(msg='XPATH SEARCH HIT') + log.separator(msg='XPATH SEARCH HIT') def date_missing(self): for r in self.html.xpath( @@ -812,7 +813,7 @@ def date_missing(self): r'([^\\])(\\(?![bfnrt\'\"\\/]|u[A-Fa-f0-9]{4}))' ) page_content = remove_inv_json_esc.sub(r'\1\\\2', page_content) - Logging.debug(page_content) + log.debug(page_content) json_data = json_decode(page_content) for json_data in json_data: if 'datePublished' in json_data: @@ -977,7 +978,7 @@ def compile_metadata(self): self.writeInfo('New data', self.url, self.metadata) def update(self, metadata, media, lang, force=False): - Logging.debug( + log.debug( '***** UPDATING "%s" ID: %s - AUDIBLE v.%s *****', media.title, self.metadata.id, VERSION_NO ) @@ -1057,7 +1058,7 @@ def update(self, metadata, media, lang, force=False): {'series def': self.series_def}, {'volume def': self.volume_def}, ] - Logging.metadata(data_to_log, log_level="debug") + log.metadata(data_to_log, log_level="debug") self.compile_metadata() @@ -1074,7 +1075,7 @@ def worker(self, queue, stoprequest): try: func(*args, **kargs) except Exception as e: - Logging.info(e) + log.info(e) queue.task_done() except Queue.Empty: continue @@ -1084,7 +1085,7 @@ def addTask(self, queue, func, *args, **kargs): # Writes metadata information to log. 
def writeInfo(self, header, url, metadata): - Logging.separator(msg=header, log_level="info") + log.separator(msg=header, log_level="info") # Log basic metadata data_to_log = [ From 701a26248d91aa0fa5ac13ced0aae3b8f2bd8013 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 18:35:53 -0500 Subject: [PATCH 15/30] Cleanup remaining errors from overhaul --- Contents/Code/__init__.py | 60 ++++++++++++++++++++------------------- Contents/Code/logging.py | 10 ++++--- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 44432b0..f849c5f 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -75,7 +75,7 @@ def json_decode(output): def SetupUrls(sitetype, base, lang='en'): log.debug('Library/Search language is : %s', lang) - ctx = dict() + ctx = {} if sitetype: log.debug('Manual Site Selection Enabled : %s', base) log.debug('Language being ignored due to manual site selection') @@ -384,7 +384,7 @@ def before_xpath(self): r, ( u'div/div/div/div/div/div/span/ul/li' '[contains (@class,"narratorLabel")]/span//a[1]' - ).format(ctx['NAR_BY']) + ).format(self.ctx['NAR_BY']) ) log.separator(msg='XPATH SEARCH HIT', log_level="debug") @@ -409,7 +409,7 @@ def after_xpath(self): u'div/div/ul/li[contains (., "{0}")]' '/span[2]//text()' ).format( - ctx['REL_DATE'] + self.ctx['REL_DATE'] ) ) ) @@ -566,8 +566,6 @@ def run_search(self): log.info('No Match: %s', url) continue - log.debug('* ID is %s', valid_itemId) - title = f['title'] thumb = f['thumb'] date = f['date'] @@ -599,6 +597,7 @@ def run_search(self): # Log basic metadata data_to_log = [ + {'ID is': valid_itemId}, {'Title is': title}, {'Author is': author}, {'Narrator is': narrator}, @@ -932,14 +931,14 @@ def compile_metadata(self): # Clean series x = re.match("(.*)(: A .* Series)", self.series_def) if x: - series_def = x.group(1) + self.series_def = x.group(1) # Clean title - seriesshort = series_def + seriesshort = 
self.series_def checkseries = " Series" # Handle edge cases in titles - if series_def.endswith(checkseries): - seriesshort = series_def[:-len(checkseries)] + if self.series_def.endswith(checkseries): + seriesshort = self.series_def[:-len(checkseries)] y = re.match( "(.*)((: .* " + self.volume_def[2:] + ": A .* Series)|" @@ -975,9 +974,12 @@ def compile_metadata(self): self.metadata.collections.add(self.series) if self.series2: self.metadata.collections.add(self.series2) - self.writeInfo('New data', self.url, self.metadata) + self.writeInfo() def update(self, metadata, media, lang, force=False): + self.metadata = metadata + self.media = media + self.lang = lang log.debug( '***** UPDATING "%s" ID: %s - AUDIBLE v.%s *****', media.title, self.metadata.id, VERSION_NO @@ -989,8 +991,8 @@ def update(self, metadata, media, lang, force=False): self.url = self.ctx['AUD_BOOK_INFO'] % self.metadata.id try: - self.html = HTML.ElementFromURL(url, sleep=REQUEST_DELAY) - except NetworkError: + self.html = HTML.ElementFromURL(self.url, sleep=REQUEST_DELAY) + except Exception: pass self.date = None @@ -1084,32 +1086,32 @@ def addTask(self, queue, func, *args, **kargs): queue.put((func, args, kargs)) # Writes metadata information to log. 
- def writeInfo(self, header, url, metadata): - log.separator(msg=header, log_level="info") + def writeInfo(self): + log.separator(msg='New data', log_level="info") # Log basic metadata data_to_log = [ - {'ID': metadata.id}, - {'URL': url}, - {'Title': metadata.title}, - {'Release date': str(metadata.originally_available_at)}, - {'Studio': metadata.studio}, - {'Summary': metadata.summary}, + {'ID': self.metadata.id}, + {'URL': self.url}, + {'Title': self.metadata.title}, + {'Release date': str(self.metadata.originally_available_at)}, + {'Studio': self.metadata.studio}, + {'Summary': self.metadata.summary}, ] - Log.metadata(data_to_log, log_level="info") + log.metadata(data_to_log, log_level="info") # Log basic metadata stored in arrays multi_arr = [ - {'Collection', metadata.collections}, - {'Genre', metadata.genres}, - {'Moods', metadata.moods}, - {'Styles', metadata.styles}, - {'Poster URL', metadata.posters}, - {'Fan art URL', metadata.art}, + {'Collection': self.metadata.collections}, + {'Genre': self.metadata.genres}, + {'Moods': self.metadata.moods}, + {'Styles': self.metadata.styles}, + {'Poster URL': self.metadata.posters}, + {'Fan art URL': self.metadata.art}, ] - Log.metadata_arrs(multi_arr, log_level="info") + log.metadata_arrs(multi_arr, log_level="info") - Log.separator(log_level="info") + log.separator(log_level="info") def safe_unicode(s, encoding='utf-8'): diff --git a/Contents/Code/logging.py b/Contents/Code/logging.py index c3d6f94..1249c6a 100644 --- a/Contents/Code/logging.py +++ b/Contents/Code/logging.py @@ -36,8 +36,9 @@ def metadata(self, dict_arr, log_level="info"): val=val ) if log_level.lower() == "debug": - return self.debug(output) - return self.info(output) + self.debug(output) + else: + self.info(output) def metadata_arrs(self, dict_arr, log_level="info"): # Loop through dicts in array @@ -53,5 +54,6 @@ def metadata_arrs(self, dict_arr, log_level="info"): ) ) if log_level.lower() == "debug": - return self.debug(output) - return 
self.info(output) + self.debug(output) + else: + self.info(output) From 844d2d37c6df31f05bb192f99ffab27eacdf0ccb Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 21:27:03 -0500 Subject: [PATCH 16/30] Log error rather than pass it --- Contents/Code/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index f849c5f..6021751 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -992,8 +992,8 @@ def update(self, metadata, media, lang, force=False): try: self.html = HTML.ElementFromURL(self.url, sleep=REQUEST_DELAY) - except Exception: - pass + except Exception as e: + log.info(e) self.date = None self.rating = None From 13b17b42682b132080b1cd0ede1840ac53669cd8 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 22:16:16 -0500 Subject: [PATCH 17/30] Break urls into own import; remove unused function; further simplify --- Contents/Code/__init__.py | 220 +++++--------------------------------- Contents/Code/urls.py | 195 +++++++++++++++++++++++++++++++++ 2 files changed, 222 insertions(+), 193 deletions(-) create mode 100644 Contents/Code/urls.py diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 6021751..b1b3b25 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -1,20 +1,13 @@ # Audiobooks (Audible) # coding: utf-8 import json +import Queue import re import types +# Import internal tools from logging import Logging -import Queue - - -def json_decode(output): - try: - return json.loads(output, encoding="utf-8") - except AttributeError: - return None - +from urls import SiteUrl -# URLs VERSION_NO = '1.2021.08.24.1' # Delay used when requesting HTML, @@ -33,165 +26,6 @@ def json_decode(output): # Setup logger log = Logging() -intl_sites = { - 'en': { - 'url': 'www.audible.com', - 'urltitle': u'title=', - 'rel_date': u'Release date', - 'nar_by': u'Narrated By', - 'nar_by2': u'Narrated by' - }, - 'fr': { 
- 'url': 'www.audible.fr', - 'urltitle': u'title=', - 'rel_date': u'Date de publication', - 'nar_by': u'Narrateur(s)', - 'nar_by2': u'Lu par' - }, - 'de': { - 'url': 'www.audible.de', - 'urltitle': u'title=', - 'rel_date': u'Erscheinungsdatum', - 'nar_by': u'Gesprochen von', - 'rel_date2': u'Veröffentlicht' - }, - 'it': { - 'url': 'www.audible.it', - 'urltitle': u'title=', - 'rel_date': u'Data di Pubblicazione', - 'nar_by': u'Narratore' - }, -} - -sites_langs = { - 'www.audible.com': {'lang': 'en'}, - 'www.audible.co.uk': {'lang': 'en'}, - 'www.audible.com.au': {'lang': 'en'}, - 'www.audible.fr': {'lang': 'fr'}, - 'www.audible.de': {'lang': 'de'}, - 'www.audible.it': {'lang': 'it'}, -} - - -def SetupUrls(sitetype, base, lang='en'): - log.debug('Library/Search language is : %s', lang) - ctx = {} - if sitetype: - log.debug('Manual Site Selection Enabled : %s', base) - log.debug('Language being ignored due to manual site selection') - if base in sites_langs: - log.debug('Pulling language from sites array') - lang = sites_langs[base]['lang'] - if lang in intl_sites: - base = intl_sites[lang]['url'] - urlsearchtitle = intl_sites[lang]['urltitle'] - ctx['REL_DATE'] = intl_sites[lang]['rel_date'] - ctx['NAR_BY'] = intl_sites[lang]['nar_by'] - if 'rel_date2' in intl_sites[lang]: - ctx['REL_DATE_INFO'] = intl_sites[lang]['rel_date2'] - else: - ctx['REL_DATE_INFO'] = ctx['REL_DATE'] - if 'nar_by2' in intl_sites[lang]: - ctx['NAR_BY_INFO'] = intl_sites[lang]['nar_by2'] - else: - ctx['NAR_BY_INFO'] = ctx['NAR_BY'] - else: - ctx['REL_DATE'] = 'Release date' - ctx['REL_DATE_INFO'] = ctx['REL_DATE'] - ctx['NAR_BY'] = 'Narrated By' - ctx['NAR_BY_INFO'] = 'Narrated by' - log.debug( - 'Sites language is : %s', lang - ) - log.debug( - '/************************************' - 'LANG DEBUGGING' - '************************************/' - ) - log.debug( - '/* REL_DATE = %s', ctx['REL_DATE'] - ) - log.debug( - '/* REL_DATE_INFO = %s', ctx['REL_DATE_INFO'] - ) - log.debug( - '/* NAR_BY = 
%s', ctx['NAR_BY'] - ) - log.debug( - '/* NAR_BY_INFO = %s', ctx['NAR_BY_INFO'] - ) - log.debug( - '/****************************************' - '****************************************/' - ) - else: - log.debug( - 'Audible site will be chosen by library language' - ) - log.debug( - 'Library Language is %s', lang - ) - if base is None: - base = 'www.audible.com' - if lang in intl_sites: - base = intl_sites[lang]['url'] - urlsearchtitle = intl_sites[lang]['urltitle'] - ctx['REL_DATE'] = intl_sites[lang]['rel_date'] - ctx['NAR_BY'] = intl_sites[lang]['nar_by'] - if 'rel_date2' in intl_sites[lang]: - ctx['REL_DATE_INFO'] = intl_sites[lang]['rel_date2'] - else: - ctx['REL_DATE_INFO'] = ctx['REL_DATE'] - if 'nar_by2' in intl_sites[lang]: - ctx['NAR_BY_INFO'] = intl_sites[lang]['nar_by2'] - else: - ctx['NAR_BY_INFO'] = ctx['NAR_BY'] - else: - ctx['REL_DATE'] = 'Release date' - ctx['REL_DATE_INFO'] = ctx['REL_DATE'] - ctx['NAR_BY'] = 'Narrated By' - ctx['NAR_BY_INFO'] = 'Narrated by' - - AUD_BASE_URL = 'https://' + str(base) + '/' - AUD_TITLE_URL = urlsearchtitle - - AUD_BOOK_INFO_ARR = [ - AUD_BASE_URL, - 'pd/%s?ipRedirectOverride=true', - ] - ctx['AUD_BOOK_INFO'] = ''.join(AUD_BOOK_INFO_ARR) - - AUD_ARTIST_SEARCH_URL_ARR = [ - AUD_BASE_URL, - 'search?searchAuthor=%s&ipRedirectOverride=true', - ] - ctx['AUD_ARTIST_SEARCH_URL'] = ''.join(AUD_ARTIST_SEARCH_URL_ARR) - - AUD_ALBUM_SEARCH_URL_ARR = [ - AUD_BASE_URL, - 'search?', - AUD_TITLE_URL, - '%s&x=41&ipRedirectOverride=true', - ] - ctx['AUD_ALBUM_SEARCH_URL'] = ''.join(AUD_ALBUM_SEARCH_URL_ARR) - - AUD_KEYWORD_SEARCH_URL_ARR = [ - AUD_BASE_URL, - ('search?filterby=field-keywords&advsearchKeywords=%s' - '&x=41&ipRedirectOverride=true'), - ] - ctx['AUD_KEYWORD_SEARCH_URL'] = ''.join(AUD_KEYWORD_SEARCH_URL_ARR) - - AUD_SEARCH_URL_ARR = [ - AUD_BASE_URL, - 'search?', - AUD_TITLE_URL, - '{0}&searchAuthor={1}&x=41&ipRedirectOverride=true', - ] - ctx['AUD_SEARCH_URL'] = ''.join(AUD_SEARCH_URL_ARR) - - return ctx - def 
Start(): # HTTP.ClearCache() @@ -579,8 +413,6 @@ def run_search(self): # Score the album name scorebase1 = self.media.album scorebase2 = title.encode('utf-8') - # log.debug('scorebase1: %s', scorebase1) - # log.debug('scorebase2: %s', scorebase2) score = INITIAL_SCORE - Util.LevenshteinDistance( scorebase1, scorebase2 @@ -589,8 +421,6 @@ def run_search(self): if self.media.artist: scorebase3 = self.media.artist scorebase4 = author - # log.debug('scorebase3: %s', scorebase3) - # log.debug('scorebase4: %s', scorebase4) score = INITIAL_SCORE - Util.LevenshteinDistance( scorebase3, scorebase4 ) @@ -634,7 +464,8 @@ def run_search(self): return info def search(self, results, media, lang, manual): - self.ctx = SetupUrls(Prefs['sitetype'], Prefs['site'], lang) + url_info = SiteUrl(Prefs['sitetype'], Prefs['site'], lang) + self.ctx = url_info.SetupUrls() self.LCL_IGNORE_SCORE = IGNORE_SCORE self.results = results self.media = media @@ -884,16 +715,7 @@ def handle_series(self): self.series_def = w.group(1) self.volume_def = w.group(2) - def compile_metadata(self): - # Set the date and year if found. - if self.date is not None: - self.metadata.originally_available_at = self.date - - # Add the genres - self.metadata.genres.clear() - self.metadata.genres.add(self.genre_parent) - self.metadata.genres.add(self.genre_child) - + def parse_author_narrator(self): # Add Narrators to Styles narrators_list = self.narrator.split(",") narr_contributors_list = [ @@ -928,6 +750,7 @@ def compile_metadata(self): ]: self.metadata.moods.add(author.strip()) + def parse_series(self): # Clean series x = re.match("(.*)(: A .* Series)", self.series_def) if x: @@ -955,6 +778,20 @@ def compile_metadata(self): if y: self.title = y.group(1) + def compile_metadata(self): + # Set the date and year if found. 
+ if self.date is not None: + self.metadata.originally_available_at = self.date + + # Add the genres + self.metadata.genres.clear() + self.metadata.genres.add(self.genre_parent) + self.metadata.genres.add(self.genre_child) + + self.parse_author_narrator() + + self.parse_series() + # Other metadata self.metadata.title = self.title self.metadata.title_sort = ' - '.join( @@ -977,6 +814,8 @@ def compile_metadata(self): self.writeInfo() def update(self, metadata, media, lang, force=False): + url_info = SiteUrl(Prefs['sitetype'], Prefs['site'], lang) + self.ctx = url_info.SetupUrls() self.metadata = metadata self.media = media self.lang = lang @@ -984,8 +823,6 @@ def update(self, metadata, media, lang, force=False): '***** UPDATING "%s" ID: %s - AUDIBLE v.%s *****', media.title, self.metadata.id, VERSION_NO ) - self.ctx = SetupUrls(Prefs['sitetype'], Prefs['site'], lang) - self.metadata = metadata # Make url self.url = self.ctx['AUD_BOOK_INFO'] % self.metadata.id @@ -1114,11 +951,8 @@ def writeInfo(self): log.separator(log_level="info") -def safe_unicode(s, encoding='utf-8'): - if s is None: +def json_decode(output): + try: + return json.loads(output, encoding="utf-8") + except AttributeError: return None - if isinstance(s, basestring): - if isinstance(s, types.UnicodeType): - return s - return s.decode(encoding) - return str(s).decode(encoding) diff --git a/Contents/Code/urls.py b/Contents/Code/urls.py new file mode 100644 index 0000000..bc041b3 --- /dev/null +++ b/Contents/Code/urls.py @@ -0,0 +1,195 @@ +from logging import Logging + + +class SiteUrl: + intl_sites = { + 'en': { + 'url': 'www.audible.com', + 'urltitle': u'title=', + 'rel_date': u'Release date', + 'nar_by': u'Narrated By', + 'nar_by2': u'Narrated by' + }, + 'fr': { + 'url': 'www.audible.fr', + 'urltitle': u'title=', + 'rel_date': u'Date de publication', + 'nar_by': u'Narrateur(s)', + 'nar_by2': u'Lu par' + }, + 'de': { + 'url': 'www.audible.de', + 'urltitle': u'title=', + 'rel_date': 
u'Erscheinungsdatum', + 'nar_by': u'Gesprochen von', + 'rel_date2': u'Veröffentlicht' + }, + 'it': { + 'url': 'www.audible.it', + 'urltitle': u'title=', + 'rel_date': u'Data di Pubblicazione', + 'nar_by': u'Narratore' + }, + } + + sites_langs = { + 'www.audible.com': {'lang': 'en'}, + 'www.audible.co.uk': {'lang': 'en'}, + 'www.audible.com.au': {'lang': 'en'}, + 'www.audible.fr': {'lang': 'fr'}, + 'www.audible.de': {'lang': 'de'}, + 'www.audible.it': {'lang': 'it'}, + } + + def __init__(self, sitetype, base, lang='en'): + self.sitetype = sitetype + self.base = base + self.lang = lang + + def set_context_urls(self): + AUD_BASE_URL = 'https://' + str(self.base) + '/' + AUD_TITLE_URL = self.urlsearchtitle + + AUD_BOOK_INFO_ARR = [ + AUD_BASE_URL, + 'pd/%s?ipRedirectOverride=true', + ] + self.context['AUD_BOOK_INFO'] = ''.join( + AUD_BOOK_INFO_ARR + ) + + AUD_ARTIST_SEARCH_URL_ARR = [ + AUD_BASE_URL, + 'search?searchAuthor=%s&ipRedirectOverride=true', + ] + self.context['AUD_ARTIST_SEARCH_URL'] = ''.join( + AUD_ARTIST_SEARCH_URL_ARR + ) + + AUD_ALBUM_SEARCH_URL_ARR = [ + AUD_BASE_URL, + 'search?', + AUD_TITLE_URL, + '%s&x=41&ipRedirectOverride=true', + ] + self.context['AUD_ALBUM_SEARCH_URL'] = ''.join( + AUD_ALBUM_SEARCH_URL_ARR + ) + + AUD_KEYWORD_SEARCH_URL_ARR = [ + AUD_BASE_URL, + ('search?filterby=field-keywords&advsearchKeywords=%s' + '&x=41&ipRedirectOverride=true'), + ] + self.context['AUD_KEYWORD_SEARCH_URL'] = ''.join( + AUD_KEYWORD_SEARCH_URL_ARR + ) + + AUD_SEARCH_URL_ARR = [ + AUD_BASE_URL, + 'search?', + AUD_TITLE_URL, + '{0}&searchAuthor={1}&x=41&ipRedirectOverride=true', + ] + self.context['AUD_SEARCH_URL'] = ''.join(AUD_SEARCH_URL_ARR) + + def base_is_manual(self): + if self.base in self.sites_langs: + log.debug('Pulling language from sites array') + self.lang = self.sites_langs[self.base]['lang'] + if self.lang in self.intl_sites: + self.base = self.intl_sites[self.lang]['url'] + self.urlsearchtitle = ( + self.intl_sites[self.lang]['urltitle'] + ) + 
self.context['REL_DATE'] = ( + self.intl_sites[self.lang]['rel_date'] + ) + self.context['NAR_BY'] = ( + self.intl_sites[self.lang]['nar_by'] + ) + if 'rel_date2' in self.intl_sites[self.lang]: + self.context['REL_DATE_INFO'] = ( + self.intl_sites[self.lang]['rel_date2'] + ) + else: + self.context['REL_DATE_INFO'] = ( + self.context['REL_DATE'] + ) + if 'nar_by2' in self.intl_sites[self.lang]: + self.context['NAR_BY_INFO'] = ( + self.intl_sites[self.lang]['nar_by2'] + ) + else: + self.context['NAR_BY_INFO'] = self.context['NAR_BY'] + else: + self.context['REL_DATE'] = 'Release date' + self.context['REL_DATE_INFO'] = self.context['REL_DATE'] + self.context['NAR_BY'] = 'Narrated By' + self.context['NAR_BY_INFO'] = 'Narrated by' + + # Log translations of certain terms + log.separator(msg='LANG DEBUGGING', log_level="debug") + data_to_log = [ + {'Sites language is': self.lang}, + {'REL_DATE': self.context['REL_DATE']}, + {'REL_DATE_INFO': self.context['REL_DATE_INFO']}, + {'NAR_BY date': self.context['NAR_BY']}, + {'NAR_BY_INFO': self.context['NAR_BY_INFO']}, + ] + log.metadata(data_to_log, log_level="debug") + log.separator(log_level="debug") + + def base_is_auto(self): + log.debug( + 'Audible site will be chosen by library language' + ) + log.debug( + 'Library Language is %s', self.lang + ) + if self.base is None: + self.base = 'www.audible.com' + if self.lang in self.intl_sites: + self.base = self.intl_sites[self.lang]['url'] + self.urlsearchtitle = self.intl_sites[self.lang]['urltitle'] + self.context['REL_DATE'] = ( + self.intl_sites[self.lang]['rel_date'] + ) + self.context['NAR_BY'] = self.intl_sites[self.lang]['nar_by'] + if 'rel_date2' in self.intl_sites[self.lang]: + self.context['REL_DATE_INFO'] = ( + self.intl_sites[self.lang]['rel_date2'] + ) + else: + self.context['REL_DATE_INFO'] = ( + self.context['REL_DATE'] + ) + if 'nar_by2' in self.intl_sites[self.lang]: + self.context['NAR_BY_INFO'] = ( + self.intl_sites[self.lang]['nar_by2'] + ) + else: + 
self.context['NAR_BY_INFO'] = self.context['NAR_BY'] + else: + self.context['REL_DATE'] = 'Release date' + self.context['REL_DATE_INFO'] = self.context['REL_DATE'] + self.context['NAR_BY'] = 'Narrated By' + self.context['NAR_BY_INFO'] = 'Narrated by' + + def SetupUrls(self): + log.debug('Library/Search language is : %s', self.lang) + self.context = {} + if self.sitetype: + log.debug('Manual Site Selection Enabled : %s', self.base) + log.debug('Language being ignored due to manual site selection') + self.base_is_manual() + else: + self.base_is_auto() + + self.set_context_urls() + + return self.context + + +# Setup logger +log = Logging() From 42442a35e3d64c302e2df3fa87f54acaed14d064 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 22:28:04 -0500 Subject: [PATCH 18/30] Don't bother logging collection of fan art; fix logging poster url --- Contents/Code/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index b1b3b25..8de136d 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -939,12 +939,12 @@ def writeInfo(self): # Log basic metadata stored in arrays multi_arr = [ - {'Collection': self.metadata.collections}, + # {'Collection': self.metadata.collections}, {'Genre': self.metadata.genres}, - {'Moods': self.metadata.moods}, - {'Styles': self.metadata.styles}, - {'Poster URL': self.metadata.posters}, - {'Fan art URL': self.metadata.art}, + {'Moods(Authors)': self.metadata.moods}, + {'Styles(Narrators)': self.metadata.styles}, + {'Poster URL': self.metadata.posters.keys()}, + # {'Fan art URL': self.metadata.art}, ] log.metadata_arrs(multi_arr, log_level="info") From 9e9aeb0033b1a776d46c10a6a3037049d7d81105 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 22:53:19 -0500 Subject: [PATCH 19/30] Remove redundant search result logging; give logs more space --- Contents/Code/__init__.py | 35 +++++++++++++++-------------------- 
Contents/Code/logging.py | 4 ++-- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 8de136d..b8b9672 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -319,6 +319,7 @@ def pre_search(self): return if self.manual: + log.separator(msg="NOTE", log_level="info") log.info( 'You clicked \'fix match\'. ' 'This may have returned no useful results because ' @@ -366,9 +367,13 @@ def format_title(self): 'normalizedName stripped = %s', self.normalizedName ) - log.info( - '***** SEARCHING FOR "%s" - AUDIBLE v.%s *****', - self.normalizedName, VERSION_NO + log.separator( + msg=( + "SEARCHING FOR " + '"' + self.normalizedName + '"' + ) + ( + " - " + "AUDIBLE v" + VERSION_NO + ), + log_level="info" ) def run_search(self): @@ -378,7 +383,9 @@ def run_search(self): itemId_full = None itemId = None valid_itemId = None - for f in self.found: + + log.separator(msg="Search results", log_level="info") + for i, f in enumerate(self.found): url = f['url'] log.debug('URL For Breakdown: %s', url) @@ -455,10 +462,9 @@ def run_search(self): self.LCL_IGNORE_SCORE ) - if i != len(self.found): - log.separator() - - i += 1 + # Print separators for easy reading + if i <= len(self.found): + log.separator(log_level="info") info = sorted(info, key=lambda inf: inf['score'], reverse=True) return info @@ -505,17 +511,6 @@ def search(self, results, media, lang, manual): len(self.result), self.normalizedName ) - i = 1 - for f in self.result: - log.debug( - ' %s. 
(title) %s (author) %s (url)[%s]' - ' (date)(%s) (thumb){%s}', - i, f['title'], f['author'], - f['url'], str(f['date']), f['thumb'] - ) - i += 1 - - log.separator(log_level="info") info = self.run_search() @@ -629,7 +624,7 @@ def update_scrape(self): '/div[3]/a/span/text()' ) ) - log.separator(msg='XPATH SEARCH HIT') + log.separator(msg='XPATH SEARCH HIT', log_level="debug") def date_missing(self): for r in self.html.xpath( diff --git a/Contents/Code/logging.py b/Contents/Code/logging.py index 1249c6a..d876a50 100644 --- a/Contents/Code/logging.py +++ b/Contents/Code/logging.py @@ -31,7 +31,7 @@ def metadata(self, dict_arr, log_level="info"): # Loop through each key/value for key, val in log_type.items(): if val: - output = "{key:<15}{val}".format( + output = "{key:<20}{val}".format( key=key, val=val ) @@ -48,7 +48,7 @@ def metadata_arrs(self, dict_arr, log_level="info"): if val: # Loop through dict's array for item in val: - output = ("{key:<15}{val}".format( + output = ("{key:<20}{val}".format( key=key, val=item ) From c80c322a250764adc5a33f013d9b47d1aa92dc3e Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 23:28:50 -0500 Subject: [PATCH 20/30] Show version on plugin start; Fix poster url; Show result numbers in logs --- Contents/Code/__init__.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index b8b9672..31e5da7 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -8,7 +8,7 @@ from logging import Logging from urls import SiteUrl -VERSION_NO = '1.2021.08.24.1' +VERSION_NO = '2021.08.25.1' # Delay used when requesting HTML, # may be good to have to prevent being banned from the site @@ -36,6 +36,12 @@ def Start(): 'Media Center PC 6.0' ) HTTP.Headers['Accept-Encoding'] = 'gzip' + log.separator( + msg=( + " - " + "Audible Audiobooks Agent v" + VERSION_NO + ), + log_level="info" + ) class AudiobookArtist(Agent.Artist): @@ -370,9 
+376,7 @@ def format_title(self): log.separator( msg=( "SEARCHING FOR " + '"' + self.normalizedName + '"' - ) + ( - " - " + "AUDIBLE v" + VERSION_NO - ), + ), log_level="info" ) @@ -432,6 +436,7 @@ def run_search(self): scorebase3, scorebase4 ) + log.info(msg=("Result #" + i), log_level="info") # Log basic metadata data_to_log = [ {'ID is': valid_itemId}, @@ -814,9 +819,13 @@ def update(self, metadata, media, lang, force=False): self.metadata = metadata self.media = media self.lang = lang - log.debug( - '***** UPDATING "%s" ID: %s - AUDIBLE v.%s *****', - media.title, self.metadata.id, VERSION_NO + log.separator( + msg=( + "UPDATING" + ' "' + self.media.title + '" ' + ( + "ID: " + self.metadata.id + ) + ), + log_level="info" ) # Make url @@ -929,6 +938,7 @@ def writeInfo(self): {'Release date': str(self.metadata.originally_available_at)}, {'Studio': self.metadata.studio}, {'Summary': self.metadata.summary}, + {'Poster URL': self.thumb}, ] log.metadata(data_to_log, log_level="info") @@ -938,7 +948,6 @@ def writeInfo(self): {'Genre': self.metadata.genres}, {'Moods(Authors)': self.metadata.moods}, {'Styles(Narrators)': self.metadata.styles}, - {'Poster URL': self.metadata.posters.keys()}, # {'Fan art URL': self.metadata.art}, ] log.metadata_arrs(multi_arr, log_level="info") From 062f89e7c543f53e14bc1882ab65f731ce123724 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Wed, 25 Aug 2021 23:36:12 -0500 Subject: [PATCH 21/30] Fix leftover and result num --- Contents/Code/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 31e5da7..792f07f 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -38,7 +38,7 @@ def Start(): HTTP.Headers['Accept-Encoding'] = 'gzip' log.separator( msg=( - " - " + "Audible Audiobooks Agent v" + VERSION_NO + "Audible Audiobooks Agent v" + VERSION_NO ), log_level="info" ) @@ -436,7 +436,7 @@ def run_search(self): scorebase3, scorebase4 ) - 
log.info(msg=("Result #" + i), log_level="info") + log.info("Result #" + str(i + 1)) # Log basic metadata data_to_log = [ {'ID is': valid_itemId}, From 170730a73382f80229b53a83c7a2e4ac02cd84f5 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Thu, 26 Aug 2021 00:10:19 -0500 Subject: [PATCH 22/30] Add prefs to not download cover and not overwrite existing genres --- Contents/Code/__init__.py | 14 +++++++++----- Contents/DefaultPrefs.json | 10 ++++++++++ 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 792f07f..7138936 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -784,9 +784,10 @@ def compile_metadata(self): self.metadata.originally_available_at = self.date # Add the genres - self.metadata.genres.clear() - self.metadata.genres.add(self.genre_parent) - self.metadata.genres.add(self.genre_child) + if not Prefs['no_overwrite_genre']: + self.metadata.genres.clear() + self.metadata.genres.add(self.genre_parent) + self.metadata.genres.add(self.genre_child) self.parse_author_narrator() @@ -799,8 +800,11 @@ def compile_metadata(self): ) self.metadata.studio = self.studio self.metadata.summary = self.synopsis - self.metadata.posters[1] = Proxy.Media(HTTP.Request(self.thumb)) - self.metadata.posters.validate_keys(self.thumb) + + if not Prefs['disable_cover']: + self.metadata.posters[1] = Proxy.Media(HTTP.Request(self.thumb)) + self.metadata.posters.validate_keys(self.thumb) + # Use rating only when available if self.rating: self.metadata.rating = float(self.rating) * 2 diff --git a/Contents/DefaultPrefs.json b/Contents/DefaultPrefs.json index c7ee786..745d2cb 100644 --- a/Contents/DefaultPrefs.json +++ b/Contents/DefaultPrefs.json @@ -8,6 +8,16 @@ "type" : "enum", "values" : ["www.audible.com","www.audible.co.uk","www.audible.com.au","www.audible.de","www.audible.fr","www.audible.it"], "default" : "www.audible.com" +},{ + "id": "disable_cover", + "label": "Disables the agent 
fetching artwork from Audible", + "type": "bool", + "default": "false" +},{ + "id": "no_overwrite_genre", + "label": "Leaves existing genres in place", + "type": "bool", + "default": "false" },{ "id": "copyyear", "label": "Uses copyright year instead of datePublished", From cb1faaa04bc14496c457279262c5544f1067f7e6 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Thu, 26 Aug 2021 00:12:50 -0500 Subject: [PATCH 23/30] not plural --- Contents/DefaultPrefs.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Contents/DefaultPrefs.json b/Contents/DefaultPrefs.json index 745d2cb..ab64ff8 100644 --- a/Contents/DefaultPrefs.json +++ b/Contents/DefaultPrefs.json @@ -10,17 +10,17 @@ "default" : "www.audible.com" },{ "id": "disable_cover", - "label": "Disables the agent fetching artwork from Audible", + "label": "Disable the fetching artwork from Audible", "type": "bool", "default": "false" },{ "id": "no_overwrite_genre", - "label": "Leaves existing genres in place", + "label": "Leave existing genres in place", "type": "bool", "default": "false" },{ "id": "copyyear", - "label": "Uses copyright year instead of datePublished", + "label": "Use copyright year instead of datePublished", "type": "bool", "default": "false" },{ From 72b5aeddf006c72b4b474945c7c5f9cde30a4175 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Thu, 26 Aug 2021 00:22:29 -0500 Subject: [PATCH 24/30] Add audible.ca --- Contents/Code/urls.py | 1 + Contents/DefaultPrefs.json | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Contents/Code/urls.py b/Contents/Code/urls.py index bc041b3..bba9006 100644 --- a/Contents/Code/urls.py +++ b/Contents/Code/urls.py @@ -34,6 +34,7 @@ class SiteUrl: sites_langs = { 'www.audible.com': {'lang': 'en'}, + 'www.audible.ca': {'lang': 'en'}, 'www.audible.co.uk': {'lang': 'en'}, 'www.audible.com.au': {'lang': 'en'}, 'www.audible.fr': {'lang': 'fr'}, diff --git a/Contents/DefaultPrefs.json b/Contents/DefaultPrefs.json index 
ab64ff8..e175e51 100644 --- a/Contents/DefaultPrefs.json +++ b/Contents/DefaultPrefs.json @@ -6,7 +6,15 @@ "id" : "site", "label" : "Select Audible site to use: ", "type" : "enum", - "values" : ["www.audible.com","www.audible.co.uk","www.audible.com.au","www.audible.de","www.audible.fr","www.audible.it"], + "values" : [ + "www.audible.com", + "www.audible.ca", + "www.audible.co.uk", + "www.audible.com.au", + "www.audible.de", + "www.audible.fr", + "www.audible.it" + ], "default" : "www.audible.com" },{ "id": "disable_cover", From be2073664d66f36cd085180a9b26b7e2efd8bc71 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Thu, 26 Aug 2021 00:56:00 -0500 Subject: [PATCH 25/30] Don't need to add quotes --- Contents/Code/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 7138936..78a2468 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -825,7 +825,7 @@ def update(self, metadata, media, lang, force=False): self.lang = lang log.separator( msg=( - "UPDATING" + ' "' + self.media.title + '" ' + ( + "UPDATING" + self.media.title + ( "ID: " + self.metadata.id ) ), From a89e550141d45d905fa8c3f11d09464f5b2fa178 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Thu, 26 Aug 2021 14:03:42 -0500 Subject: [PATCH 26/30] Change cover prefs, add validateprefs --- Contents/Code/__init__.py | 14 ++++++++++---- Contents/DefaultPrefs.json | 13 +++++++++---- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 78a2468..bc33113 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -8,7 +8,7 @@ from logging import Logging from urls import SiteUrl -VERSION_NO = '2021.08.25.1' +VERSION_NO = '2021.08.25.2' # Delay used when requesting HTML, # may be good to have to prevent being banned from the site @@ -27,6 +27,10 @@ log = Logging() +def ValidatePrefs(): + log.debug('ValidatePrefs function call') + + def 
Start(): # HTTP.ClearCache() HTTP.CacheTime = CACHE_1WEEK @@ -347,8 +351,6 @@ def pre_search(self): 'Custom album search for: ' + self.media.name ) self.media.album = self.media.name - else: - log.info('Album search: ' + self.media.title) def format_title(self): # Normalize the name @@ -801,9 +803,13 @@ def compile_metadata(self): self.metadata.studio = self.studio self.metadata.summary = self.synopsis - if not Prefs['disable_cover']: + if Prefs['cover_options'] == "Use Audible cover": self.metadata.posters[1] = Proxy.Media(HTTP.Request(self.thumb)) self.metadata.posters.validate_keys(self.thumb) + elif Prefs['cover_options'] == "Download cover but don't overwrite existing": + self.metadata.posters[self.thumb] = Proxy.Media( + HTTP.Request(self.thumb), sort_order=1 + ) # Use rating only when available if self.rating: diff --git a/Contents/DefaultPrefs.json b/Contents/DefaultPrefs.json index e175e51..4a15b75 100644 --- a/Contents/DefaultPrefs.json +++ b/Contents/DefaultPrefs.json @@ -17,10 +17,15 @@ ], "default" : "www.audible.com" },{ - "id": "disable_cover", - "label": "Disable the fetching artwork from Audible", - "type": "bool", - "default": "false" + "id": "cover_options", + "label": "How to handle artwork from Audible: ", + "type": "enum", + "values": [ + "Use Audible cover", + "Download cover but don't overwrite existing", + "Don't download cover" + ], + "default": "Use Audible cover" },{ "id": "no_overwrite_genre", "label": "Leave existing genres in place", From fe64dafb7a3b0c16554de1afae4560319f935a32 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Fri, 27 Aug 2021 15:53:38 -0500 Subject: [PATCH 27/30] Use helper functions to hold values and execute basic, non-agent specific tasks --- Contents/Code/__init__.py | 1033 +++++++++++++++------------------ Contents/Code/search_tools.py | 119 ++++ Contents/Code/update_tools.py | 85 +++ 3 files changed, 681 insertions(+), 556 deletions(-) create mode 100644 Contents/Code/search_tools.py create mode 100644 
Contents/Code/update_tools.py diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index bc33113..246ddff 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -6,9 +6,11 @@ import types # Import internal tools from logging import Logging +from search_tools import SearchTool +from update_tools import UpdateTool from urls import SiteUrl -VERSION_NO = '2021.08.25.2' +VERSION_NO = '2021.08.27.1' # Delay used when requesting HTML, # may be good to have to prevent being banned from the site @@ -87,10 +89,8 @@ def findDateInTitle(self, title): return Datetime.ParseDate(result.group(0)).date() return None - def doSearch(self, url, ctx): - + def doSearch(self, ctx, url): html = HTML.ElementFromURL(url, sleep=REQUEST_DELAY) - found = [] for r in html.xpath('//div[a/img[@class="yborder"]]'): @@ -162,41 +162,231 @@ class AudiobookAlbum(Agent.Album): prev_search_provider = 0 - def getDateFromString(self, string): + def search(self, results, media, lang, manual): + url_info = SiteUrl(Prefs['sitetype'], Prefs['site'], lang) + ctx = url_info.SetupUrls() + + search_helper = SearchTool(lang, manual, media, results) + """ + The process needs to be as follows: + 1. Search class is instantiated. + 2. Search class is run linearly. This can be done 1 of 2 ways: + - Call helper in the class to run it itself. + - Call each function here, using the same object. + 3. Search class returns necessary data for update function. + + What does the output of these actions need to be, and in what form? + What do I need to run here vs there? 
+ """ + + """ + Functions I know I can call with object: + pre_search_logging + strip_title + """ + search_helper.pre_search_logging() + + """ + Functions I can't call with object + create_search_url + doSearch + before_xpath + after_xpath + run_search + """ + + # Run helper before passing to SearchTool + normalizedName = self.normalize_name(media.album) + # Strip title of things like unabridged and spaces + search_helper.strip_title(normalizedName) + # Generate search url + searchUrl = self.create_search_url(ctx, media, search_helper.normalizedName) + # Run actual search, and set the variable to it's return + result = self.doSearch(ctx, searchUrl) + + # Write search result status to log + if not result: + log.info( + 'No results found for query "%s"', + normalizedName + ) + return + log.debug( + 'Found %s result(s) for query "%s"', + len(result), + normalizedName + ) + + info = self.run_search(search_helper, media, result) + + # Output the final results. + log.separator(log_level="debug") + log.debug('Final result:') + for i, r in enumerate(info): + description = '\"%s\" by %s [%s]' % ( + r['title'], r['artist'], r['year'] + ) + log.debug( + ' [%s] %s. %s (%s) %s {%s} [%s]', + r['score'], (i + 1), r['title'], r['year'], + r['artist'], r['id'], r['thumb'] + ) + results.Append( + MetadataSearchResult( + id=r['id'], + name=description, + score=r['score'], + thumb=r['thumb'], + lang=lang + ) + ) + + """ + If there are more than one result, + and this one has a score that is >= GOOD SCORE, + then ignore the rest of the results + """ + if not manual and len(info) > 1 and r['score'] >= GOOD_SCORE: + log.info( + ' *** The score for these results are great, ' + 'so we will use them, and ignore the rest. 
***' + ) + break + + def update(self, metadata, media, lang, force=False): + url_info = SiteUrl(Prefs['sitetype'], Prefs['site'], lang) + ctx = url_info.SetupUrls() + + log.separator( + msg=( + "UPDATING" + media.title + ( + "ID: " + metadata.id + ) + ), + log_level="info" + ) + + # Make url + url = ctx['AUD_BOOK_INFO'] % metadata.id + try: - return Datetime.ParseDate(string).date() - except AttributeError: - return None + html = HTML.ElementFromURL(url, sleep=REQUEST_DELAY) + except Exception as e: + log.info(e) - def getStringContentFromXPath(self, source, query): - return source.xpath('string(' + query + ')') + update_helper = UpdateTool(force, lang, media, metadata, url) + self.scrape_book_metadata(ctx, update_helper, html) - def getAnchorUrlFromXPath(self, source, query): - anchor = source.xpath(query) + if not update_helper.date: + self.date_missing(update_helper, html) - if len(anchor) == 0: - return None + # prefer copyright year over datePublished + if Prefs['copyyear']: + self.use_copyright_date(update_helper, html) - return anchor[0].get('href') + update_helper.date = self.getDateFromString(update_helper.date) - def getImageUrlFromXPath(self, source, query): - img = source.xpath(query) + self.handle_series(update_helper, html) - if len(img) == 0: - return None + # cleanup synopsis + update_helper.synopsis = ( + update_helper.synopsis.replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + .replace("
      ", "") + .replace("
    ", "\n") + .replace("
      ", "") + .replace("
    ", "\n") + .replace("
  • ", " • ") + .replace("
  • ", "\n") + .replace("
    ", "") + .replace("

    ", "") + .replace("

    ", "\n") + ) - return img[0].get('src') + # Setup logging of all data in the array + data_to_log = [ + {'date': update_helper.date}, + {'title': update_helper.title}, + {'author': update_helper.author}, + {'narrator': update_helper.narrator}, + {'series': update_helper.series}, + {'genres': update_helper.genre_parent + ', ' + update_helper.genre_child}, + {'studio': update_helper.studio}, + {'thumb': update_helper.thumb}, + {'rating': update_helper.rating}, + {'synopsis': update_helper.synopsis}, + {'volume': update_helper.volume}, + {'series2': update_helper.series2}, + {'volume2': update_helper.volume2}, + {'series def': update_helper.series_def}, + {'volume def': update_helper.volume_def}, + ] + log.metadata(data_to_log, log_level="debug") - def findDateInTitle(self, title): - result = re.search(r'(\d+-\d+-\d+)', title) - if result is not None: - return Datetime.ParseDate(result.group(0)).date() - return None + self.compile_metadata(update_helper) + + """ + Search functions that require PMS imports, + thus we cannot 'outsource' them to SearchTool + Sorted by position in the search process + """ + + def normalize_name(self, input): + # Normalize the name + normalizedName = String.StripDiacritics( + input + ) + return normalizedName + + def create_search_url(self, ctx, media, normalizedName): + # Make the URL + if media.artist: + searchUrl = ctx['AUD_SEARCH_URL'].format( + ( + String.Quote((normalizedName).encode('utf-8'), usePlus=True) + ), + ( + String.Quote((media.artist).encode('utf-8'), usePlus=True) + ) + ) + else: + searchUrl = ctx['AUD_KEYWORD_SEARCH_URL'] % ( + String.Quote((normalizedName).encode('utf-8'), usePlus=True) + ) + return searchUrl + + def doSearch(self, ctx, url): + html = HTML.ElementFromURL(url, sleep=REQUEST_DELAY) + found = [] - def before_xpath(self): - for r in self.html.xpath( + log.separator(msg='just before new xpath line', log_level="debug") + # Set append to the returned array from this function + found = self.before_xpath(ctx, 
found, html) + + log.separator(msg='just after new xpath line', log_level="debug") + # Set append to the returned array from this function + found = self.after_xpath(ctx, found, html) + + return found + + def before_xpath(self, ctx, found, html): + for r in html.xpath( '//ul//li[contains(@class,"productListItem")]' ): + author = self.getStringContentFromXPath( + r, ( + 'div/div/div/div/div/div/span/ul' + '/li[contains (@class,"authorLabel")]/span/a[1]' + ) + ) datetext = self.getStringContentFromXPath( r, ( u'div/div/div/div/div/div/span/ul/li' @@ -205,212 +395,95 @@ def before_xpath(self): ) datetext = re.sub(r'[^0-9\-]', '', datetext) date = self.getDateFromString(datetext) + narrator = self.getStringContentFromXPath( + r, ( + u'div/div/div/div/div/div/span/ul/li' + '[contains (@class,"narratorLabel")]/span//a[1]' + ).format(ctx['NAR_BY']) + ) + murl = self.getAnchorUrlFromXPath( + r, 'div/div/div/div/div/div/span/ul/li/h3//a[1]' + ) title = self.getStringContentFromXPath( r, ( 'div/div/div/div/div/div/span/ul//a' '[contains (@class,"bc-link")][1]' ) ) - murl = self.getAnchorUrlFromXPath( - r, 'div/div/div/div/div/div/span/ul/li/h3//a[1]' - ) thumb = self.getImageUrlFromXPath( r, 'div/div/div/div/div/div/div' '[contains(@class,"responsive-product-square")]/div/a/img' ) - author = self.getStringContentFromXPath( - r, ( - 'div/div/div/div/div/div/span/ul' - '/li[contains (@class,"authorLabel")]/span/a[1]' - ) - ) - narrator = self.getStringContentFromXPath( - r, ( - u'div/div/div/div/div/div/span/ul/li' - '[contains (@class,"narratorLabel")]/span//a[1]' - ).format(self.ctx['NAR_BY']) - ) log.separator(msg='XPATH SEARCH HIT', log_level="debug") - self.found.append( + found.append( { - 'url': murl, - 'title': title, + 'author': author, 'date': date, + 'narrator': narrator, 'thumb': thumb, - 'author': author, - 'narrator': narrator + 'title': title, + 'url': murl, } ) + return found - def after_xpath(self): - for r in self.html.xpath( + def after_xpath(self, ctx, 
found, html): + for r in html.xpath( '//div[contains (@class, "adbl-search-result")]' ): + author = self.getStringContentFromXPath( + r, ( + 'div/div/ul/li/' + '/a[contains (@class,"author-profile-link")][1]' + ) + ) date = self.getDateFromString( self.getStringContentFromXPath( r, ( u'div/div/ul/li[contains (., "{0}")]' '/span[2]//text()' ).format( - self.ctx['REL_DATE'] + ctx['REL_DATE'] ) ) ) - title = self.getStringContentFromXPath( - r, 'div/div/div/div/a[1]' - ) murl = self.getAnchorUrlFromXPath( r, 'div/div/div/div/a[1]' ) - thumb = self.getImageUrlFromXPath( - r, 'div[contains (@class,"adbl-prod-image-sample-cont")]/a/img' - ) - author = self.getStringContentFromXPath( - r, ( - 'div/div/ul/li/' - '/a[contains (@class,"author-profile-link")][1]' - ) - ) narrator = self.getStringContentFromXPath( r, u'div/div/ul/li[contains (., "{0}")]//a[1]'.format( - self.ctx['NAR_BY'] + ctx['NAR_BY'] ) ) + thumb = self.getImageUrlFromXPath( + r, 'div[contains (@class,"adbl-prod-image-sample-cont")]/a/img' + ) + title = self.getStringContentFromXPath( + r, 'div/div/div/div/a[1]' + ) log.separator(msg='XPATH SEARCH HIT', log_level="debug") - self.found.append( + found.append( { - 'url': murl, - 'title': title, + 'author': author, 'date': date, + 'narrator': narrator, 'thumb': thumb, - 'author': author, - 'narrator': narrator + 'title': title, + 'url': murl, } ) + return found - def doSearch(self, url, ctx): - self.html = HTML.ElementFromURL(url, sleep=REQUEST_DELAY) - self.found = [] - self.ctx = ctx - - log.separator(msg='just before new xpath line', log_level="debug") - self.before_xpath() - - log.separator(msg='just after new xpath line', log_level="debug") - self.after_xpath() - - return self.found - - def pre_search(self): - log.separator(msg='ALBUM SEARCH', log_level="info") - # Log basic metadata - data_to_log = [ - {'ID': self.media.parent_metadata.id}, - {'Title': self.media.title}, - {'Name': self.media.name}, - {'Album': self.media.album}, - {'Artist': 
self.media.artist}, - ] - log.metadata(data_to_log) - log.separator(log_level="info") - - # Handle a couple of edge cases where - # album search will give bad results. - if self.media.album is None and not self.manual: - log.info('Album Title is NULL on an automatic search. Returning') - return - if self.media.album == '[Unknown Album]' and not self.manual: - log.info( - 'Album Title is [Unknown Album]' - ' on an automatic search. Returning' - ) - return - - if self.manual: - log.separator(msg="NOTE", log_level="info") - log.info( - 'You clicked \'fix match\'. ' - 'This may have returned no useful results because ' - 'it\'s searching using the title of the first track.' - ) - log.info( - 'There\'s not currently a way around this initial failure. ' - 'But clicking \'Search Options\' and ' - 'entering the title works just fine.' - ) - log.info( - 'This message will appear during the initial ' - 'search and the actual manual search.' - ) - # If this is a custom search, - # use the user-entered name instead of the scanner hint. 
- if self.media.name: - log.info( - 'Custom album search for: ' + self.media.name - ) - self.media.album = self.media.name - - def format_title(self): - # Normalize the name - self.normalizedName = String.StripDiacritics( - self.media.album - ) - if len(self.normalizedName) == 0: - self.normalizedName = self.media.album - log.debug( - 'normalizedName = %s', self.normalizedName - ) - - # Chop off "unabridged" - self.normalizedName = re.sub( - r"[\(\[].*?[\)\]]", "", self.normalizedName - ) - log.debug( - 'chopping bracketed text = %s', self.normalizedName - ) - self.normalizedName = self.normalizedName.strip() - log.debug( - 'normalizedName stripped = %s', self.normalizedName - ) - - log.separator( - msg=( - "SEARCHING FOR " + '"' + self.normalizedName + '"' - ), - log_level="info" - ) - - def run_search(self): + def run_search(self, helper, media, result): # Walk the found items and gather extended information info = [] - i = 1 - itemId_full = None - itemId = None - valid_itemId = None log.separator(msg="Search results", log_level="info") - for i, f in enumerate(self.found): - url = f['url'] - log.debug('URL For Breakdown: %s', url) - - # Get the id - for item in url.split('/'): - # IDs No longer start with just 'B0' - if re.match(r'^[0-9A-Z]{10,10}', item): - itemId_full = item - break - - # New Search results contain question marks after the ID - for itemId in itemId_full.split('?'): - # IDs No longer start with just 'B0' - if re.match(r'^[0-9A-Z]{10,10}', itemId): - valid_itemId = itemId - break - - if len(valid_itemId) == 0: - log.info('No Match: %s', url) + for i, f in enumerate(result): + valid_itemId = helper.get_id_from_url(item=f) + if not valid_itemId: continue title = f['title'] @@ -424,15 +497,15 @@ def run_search(self): year = date.year # Score the album name - scorebase1 = self.media.album + scorebase1 = media.album scorebase2 = title.encode('utf-8') score = INITIAL_SCORE - Util.LevenshteinDistance( scorebase1, scorebase2 ) - if self.media.artist: - 
scorebase3 = self.media.artist + if media.artist: + scorebase3 = media.artist scorebase4 = author score = INITIAL_SCORE - Util.LevenshteinDistance( scorebase3, scorebase4 @@ -451,7 +524,7 @@ def run_search(self): ] log.metadata(data_to_log, log_level="info") - if score >= self.LCL_IGNORE_SCORE: + if score >= IGNORE_SCORE: info.append( { 'id': valid_itemId, @@ -466,175 +539,102 @@ def run_search(self): else: log.info( '# Score is below ignore boundary (%s)... Skipping!', - self.LCL_IGNORE_SCORE + IGNORE_SCORE ) # Print separators for easy reading - if i <= len(self.found): + if i <= len(result): log.separator(log_level="info") info = sorted(info, key=lambda inf: inf['score'], reverse=True) return info - def search(self, results, media, lang, manual): - url_info = SiteUrl(Prefs['sitetype'], Prefs['site'], lang) - self.ctx = url_info.SetupUrls() - self.LCL_IGNORE_SCORE = IGNORE_SCORE - self.results = results - self.media = media - self.lang = lang - self.manual = manual - - self.pre_search() - - self.format_title() - - # Make the URL - if self.media.artist is not None: - searchUrl = self.ctx['AUD_SEARCH_URL'].format( - ( - String.Quote((self.normalizedName).encode('utf-8'), usePlus=True) - ), - ( - String.Quote((self.media.artist).encode('utf-8'), usePlus=True) - ) - ) - else: - searchUrl = self.ctx['AUD_KEYWORD_SEARCH_URL'] % ( - String.Quote((self.normalizedName).encode('utf-8'), usePlus=True) - ) - self.result = self.doSearch(searchUrl, self.ctx) - - # Write search result status to log - if len(self.result) == 0: - log.info( - 'No results found for query "%s"', - self.normalizedName - ) - return - - log.debug( - 'Found %s result(s) for query "%s"', - len(self.result), - self.normalizedName - ) - - info = self.run_search() - - # Output the final results. - log.separator(log_level="debug") - log.debug('Final result:') - i = 1 - for r in info: - description = '\"%s\" by %s [%s]' % ( - r['title'], r['artist'], r['year'] - ) - log.debug( - ' [%s] %s. 
%s (%s) %s {%s} [%s]', - r['score'], i, r['title'], r['year'], - r['artist'], r['id'], r['thumb'] - ) - results.Append( - MetadataSearchResult( - id=r['id'], - name=description, - score=r['score'], - thumb=r['thumb'], - lang=lang - ) - ) - - # If there are more than one result, - # and this one has a score that is >= GOOD SCORE, - # then ignore the rest of the results - if not manual and len(info) > 1 and r['score'] >= GOOD_SCORE: - log.info( - ' *** The score for these results are great, ' - 'so we will use them, and ignore the rest. ***' - ) - break - i += 1 - - def use_copyright_date(self): - cstring = None + """ + Update functions that require PMS imports, + thus we cannot 'outsource' them to UpdateTool + Sorted by position in the update process + """ - for r in self.html.xpath(u'//span[contains(text(), "\xA9")]'): - cstring = self.getStringContentFromXPath( - r, u'normalize-space(//span[contains(text(), "\xA9")])' + def scrape_book_metadata(self, ctx, helper, html): + # result = None + for r in html.xpath('//div[contains (@id, "adbl_page_content")]'): + author = self.getStringContentFromXPath( + r, '//li//a[contains (@class,"author-profile-link")][1]' ) - # only contains Audible copyright - if cstring.startswith(u"\xA9 "): - cstring = "" - date = date[:4] - - if cstring: - if "Public Domain" in cstring: - date = re.match(".*\(P\)(\d{4})", cstring).group(1) - else: - if cstring.startswith(u'\xA9'): - cstring = cstring[1:] - if "(P)" in cstring: - cstring = re.match("(.*)\(P\).*", cstring).group(1) - if ";" in cstring: - date = str( - min( - [int(i) for i in cstring.split() if i.isdigit()] - ) - ) - else: - date = re.match(".?(\d{4}).*", cstring).group(1) - - def update_scrape(self): - for r in self.html.xpath('//div[contains (@id, "adbl_page_content")]'): - self.date = self.getDateFromString( + date = self.getDateFromString( self.getStringContentFromXPath( r, u'//li[contains (., "{0}")]/span[2]//text()'.format( - self.ctx['REL_DATE_INFO'] + ctx['REL_DATE_INFO'] 
) ) ) - self.title = self.getStringContentFromXPath( - r, '//h1[contains (@class, "adbl-prod-h1-title")]/text()' - ) - self.murl = self.getAnchorUrlFromXPath( - r, 'div/div/div/div/a[1]' + genre_child = self.getStringContentFromXPath( + r, ( + '//div[contains(@class,"adbl-pd-breadcrumb")]' + '/div[3]/a/span/text()' + ) ) - self.thumb = self.getImageUrlFromXPath( - r, 'div/div/div/div/div/img' + genre_parent = self.getStringContentFromXPath( + r, ( + '//div[contains(@class,"adbl-pd-breadcrumb")]' + '/div[2]/a/span/text()' + ) ) - self.author = self.getStringContentFromXPath( - r, '//li//a[contains (@class,"author-profile-link")][1]' + murl = self.getAnchorUrlFromXPath( + r, 'div/div/div/div/a[1]' ) - self.narrator = self.getStringContentFromXPath( + narrator = self.getStringContentFromXPath( r, '//li[contains (., "{0}")]//span[2]'.format( - self.ctx['NAR_BY_INFO'] + ctx['NAR_BY_INFO'] ) ).strip().decode('utf-8') - self.studio = self.getStringContentFromXPath( + series = self.getStringContentFromXPath( + r, '//div[contains (@class, "adbl-series-link")]//a[1]' + ) + studio = self.getStringContentFromXPath( r, '//li//a[contains (@id,"PublisherSearchLink")][1]' ) - self.synopsis = self.getStringContentFromXPath( + synopsis = self.getStringContentFromXPath( r, '//div[contains (@class, "disc-summary")]/div[*]' ).strip() - self.series = self.getStringContentFromXPath( - r, '//div[contains (@class, "adbl-series-link")]//a[1]' - ) - self.genre_parent = self.getStringContentFromXPath( - r, ( - '//div[contains(@class,"adbl-pd-breadcrumb")]' - '/div[2]/a/span/text()' - ) + thumb = self.getImageUrlFromXPath( + r, 'div/div/div/div/div/img' ) - self.genre_child = self.getStringContentFromXPath( - r, ( - '//div[contains(@class,"adbl-pd-breadcrumb")]' - '/div[3]/a/span/text()' - ) + title = self.getStringContentFromXPath( + r, '//h1[contains (@class, "adbl-prod-h1-title")]/text()' ) log.separator(msg='XPATH SEARCH HIT', log_level="debug") - def date_missing(self): - for r in 
self.html.xpath( + # Set values in helper object + helper.author = author + helper.date = date + helper.genre_child = genre_child + helper.genre_parent = genre_parent + # helper.url = murl + helper.narrator = narrator + helper.series = series + helper.studio = studio + helper.synopsis = synopsis + helper.thumb = thumb + helper.title = title + + # result = { + # 'author': author, + # 'date': date, + # 'genre_child': genre_child, + # 'genre_parent': genre_parent, + # 'narrator': narrator, + # 'series': series, + # 'studio': studio, + # 'synopsis': synopsis, + # 'thumb': thumb, + # 'title': title, + # 'url': murl, + # } + # return result + + def date_missing(self, helper, html): + for r in html.xpath( '//script[contains (@type, "application/ld+json")]' ): page_content = r.text_content() @@ -646,84 +646,129 @@ def date_missing(self): ) page_content = remove_inv_json_esc.sub(r'\1\\\2', page_content) log.debug(page_content) - json_data = json_decode(page_content) - for json_data in json_data: - if 'datePublished' in json_data: - self.date = json_data['datePublished'] - self.title = json_data['name'] - self.thumb = json_data['image'] - # Set rating when available - if 'aggregateRating' in json_data: - self.rating = ( - json_data['aggregateRating']['ratingValue'] + json_data = self.json_decode(page_content) + + helper.re_parse_with_date_published(json_data) + # return book_data + + def use_copyright_date(self, helper, html): + cstring = None + + for r in html.xpath(u'//span[contains(text(), "\xA9")]'): + cstring = self.getStringContentFromXPath( + r, u'normalize-space(//span[contains(text(), "\xA9")])' + ) + # only contains Audible copyright + if cstring.startswith(u"\xA9 "): + cstring = "" + helper.date = helper.date[:4] + + if cstring: + if "Public Domain" in cstring: + helper.date = re.match(".*\(P\)(\d{4})", cstring).group(1) + else: + if cstring.startswith(u'\xA9'): + cstring = cstring[1:] + if "(P)" in cstring: + cstring = re.match("(.*)\(P\).*", cstring).group(1) + 
if ";" in cstring: + helper.date = str( + min( + [int(i) for i in cstring.split() if i.isdigit()] ) - author_array = [] - for c in json_data['author']: - author_array.append(c['name']) - self.author = ",".join(author_array) - - narrator_array = [] - for c in json_data['readBy']: - narrator_array.append(c['name']) - self.narrator = ",".join(narrator_array) - self.studio = json_data['publisher'] - self.synopsis = json_data['description'] - if 'itemListElement' in json_data: - self.genre_parent = ( - json_data['itemListElement'][1]['item']['name'] ) - try: - self.genre_child = ( - json_data['itemListElement'][2]['item']['name'] - ) - except AttributeError: - continue + else: + helper.date = re.match(".?(\d{4}).*", cstring).group(1) + # return date - def handle_series(self): - for r in self.html.xpath('//span[contains(@class, "seriesLabel")]'): - self.series = self.getStringContentFromXPath( + def handle_series(self, helper, html): + for r in html.xpath('//span[contains(@class, "seriesLabel")]'): + helper.series = self.getStringContentFromXPath( r, '//li[contains(@class, "seriesLabel")]//a[1]' ) - self.series2 = self.getStringContentFromXPath( + helper.series2 = self.getStringContentFromXPath( r, '//li[contains(@class, "seriesLabel")]//a[2]' ) - self.series_def = self.series2 if self.series2 else self.series + helper.series_def = helper.series2 if helper.series2 else helper.series - self.volume = self.getStringContentFromXPath( + helper.volume = self.getStringContentFromXPath( r, '//li[contains(@class, "seriesLabel")]/text()[2]' ).strip() - if self.volume == ",": - self.volume = "" - self.volume2 = self.getStringContentFromXPath( + if helper.volume == ",": + helper.helper.volume = "" + helper.volume2 = self.getStringContentFromXPath( r, '//li[contains(@class, "seriesLabel")]/text()[3]' ).strip() - if self.volume2 == ",": - self.volume2 = "" + if helper.volume2 == ",": + helper.volume2 = "" - self.volume_def = self.volume2 if self.volume2 else self.volume + 
helper.volume_def = helper.helper.volume2 if helper.volume2 else helper.volume # fix series when audible 'forgets' the series link… - if not self.series_def: - for r in self.html.xpath('//div[contains(@class, "adbl-main")]'): - self.subtitle = self.getStringContentFromXPath( + if not helper.series_def: + for r in html.xpath('//div[contains(@class, "adbl-main")]'): + subtitle = self.getStringContentFromXPath( r, 'normalize-space(//li[contains' '(@class, "authorLabel")]' '//preceding::li[1]//span//text())' ).strip() - w = re.match("(.*)(, Book \d+)", self.subtitle) - if not self.series_def and w: - self.series_def = w.group(1) - self.volume_def = w.group(2) + w = re.match("(.*)(, Book \d+)", subtitle) + if not helper.series_def and w: + helper.series_def = w.group(1) + helper.volume_def = w.group(2) + + def compile_metadata(self, helper): + # Set the date and year if found. + if helper.date is not None: + helper.metadata.originally_available_at = helper.date + + # Add the genres + if not Prefs['no_overwrite_genre']: + helper.metadata.genres.clear() + helper.metadata.genres.add(helper.genre_parent) + helper.metadata.genres.add(helper.genre_child) + + self.parse_author_narrator(helper) + + self.parse_series(helper) + + # Other metadata + helper.metadata.title = helper.title + helper.metadata.title_sort = ' - '.join( + filter(None, [(helper.series_def + helper.volume_def), helper.title]) + ) + helper.metadata.studio = helper.studio + helper.metadata.summary = helper.synopsis + + if Prefs['cover_options'] == "Use Audible cover": + helper.metadata.posters[1] = Proxy.Media(HTTP.Request(helper.thumb)) + helper.metadata.posters.validate_keys(helper.thumb) + elif Prefs['cover_options'] == "Download cover but don't overwrite existing": + helper.metadata.posters[helper.thumb] = Proxy.Media( + HTTP.Request(helper.thumb), sort_order=1 + ) + + # Use rating only when available + if helper.rating: + helper.metadata.rating = float(helper.rating) * 2 - def parse_author_narrator(self): 
+ # Collections if/when Plex supports them + # https://github.com/seanap/Audiobooks.bundle/issues/1#issuecomment-713191070 + helper.metadata.collections.clear() + helper.metadata.collections.add(helper.series) + if helper.series2: + helper.metadata.collections.add(helper.series2) + helper.writeInfo() + + def parse_author_narrator(self, helper): # Add Narrators to Styles - narrators_list = self.narrator.split(",") + narrators_list = helper.narrator.split(",") narr_contributors_list = [ 'full cast' ] - self.metadata.styles.clear() + helper.metadata.styles.clear() # Loop through narrators to check if it has contributor wording for narrator in narrators_list: if not [ @@ -731,10 +776,10 @@ def parse_author_narrator(self): contrib in narrator.lower() ) ]: - self.metadata.styles.add(narrator.strip()) + helper.metadata.styles.add(narrator.strip()) # Add Authors to Moods - author_list = self.author.split(",") + author_list = helper.author.split(",") author_contributers_list = [ 'contributor', 'translator', @@ -742,7 +787,7 @@ def parse_author_narrator(self): 'translated', 'full cast', ] - self.metadata.moods.clear() + helper.metadata.moods.clear() # Loop through authors to check if it has contributor wording for author in author_list: if not [ @@ -750,177 +795,88 @@ def parse_author_narrator(self): contrib in author.lower() ) ]: - self.metadata.moods.add(author.strip()) + helper.metadata.moods.add(author.strip()) - def parse_series(self): + def parse_series(self, helper): # Clean series - x = re.match("(.*)(: A .* Series)", self.series_def) + x = re.match("(.*)(: A .* Series)", helper.series_def) if x: - self.series_def = x.group(1) + helper.series_def = x.group(1) # Clean title - seriesshort = self.series_def + seriesshort = helper.series_def checkseries = " Series" # Handle edge cases in titles - if self.series_def.endswith(checkseries): - seriesshort = self.series_def[:-len(checkseries)] + if helper.series_def.endswith(checkseries): + seriesshort = 
helper.series_def[:-len(checkseries)] y = re.match( - "(.*)((: .* " + self.volume_def[2:] + ": A .* Series)|" - "(((:|,|-) )((" + seriesshort + self.volume_def + ")|" - "((?", "") - .replace("", "") - .replace("", "") - .replace("", "") - .replace("", "") - .replace("", "") - .replace("", "") - .replace("", "") - .replace("", "") - .replace("", "") - .replace("
      ", "") - .replace("
    ", "\n") - .replace("
      ", "") - .replace("
    ", "\n") - .replace("
  • ", " • ") - .replace("
  • ", "\n") - .replace("
    ", "") - .replace("

    ", "") - .replace("

    ", "\n") - ) + def getImageUrlFromXPath(self, source, query): + img = source.xpath(query) - # Setup logging of all data in the array - data_to_log = [ - {'date': self.date}, - {'title': self.title}, - {'author': self.author}, - {'series': self.series}, - {'narrator': self.narrator}, - {'studio': self.studio}, - {'thumb': self.thumb}, - {'rating': self.rating}, - {'genres': self.genre_parent + ', ' + self.genre_child}, - {'synopsis': self.synopsis}, - {'volume': self.volume}, - {'series2': self.series2}, - {'volume2': self.volume2}, - {'series def': self.series_def}, - {'volume def': self.volume_def}, - ] - log.metadata(data_to_log, log_level="debug") + if len(img) == 0: + return None - self.compile_metadata() + return img[0].get('src') def hasProxy(self): return Prefs['imageproxyurl'] is not None + def json_decode(self, output): + try: + return json.loads(output, encoding="utf-8") + except AttributeError: + return None + def makeProxyUrl(self, url, referer): return Prefs['imageproxyurl'] + ('?url=%s&referer=%s' % (url, referer)) + """ + Queueing functions + """ + def worker(self, queue, stoprequest): while not stoprequest.isSet(): try: @@ -935,38 +891,3 @@ def worker(self, queue, stoprequest): def addTask(self, queue, func, *args, **kargs): queue.put((func, args, kargs)) - - # Writes metadata information to log. 
- def writeInfo(self): - log.separator(msg='New data', log_level="info") - - # Log basic metadata - data_to_log = [ - {'ID': self.metadata.id}, - {'URL': self.url}, - {'Title': self.metadata.title}, - {'Release date': str(self.metadata.originally_available_at)}, - {'Studio': self.metadata.studio}, - {'Summary': self.metadata.summary}, - {'Poster URL': self.thumb}, - ] - log.metadata(data_to_log, log_level="info") - - # Log basic metadata stored in arrays - multi_arr = [ - # {'Collection': self.metadata.collections}, - {'Genre': self.metadata.genres}, - {'Moods(Authors)': self.metadata.moods}, - {'Styles(Narrators)': self.metadata.styles}, - # {'Fan art URL': self.metadata.art}, - ] - log.metadata_arrs(multi_arr, log_level="info") - - log.separator(log_level="info") - - -def json_decode(output): - try: - return json.loads(output, encoding="utf-8") - except AttributeError: - return None diff --git a/Contents/Code/search_tools.py b/Contents/Code/search_tools.py new file mode 100644 index 0000000..370f6ce --- /dev/null +++ b/Contents/Code/search_tools.py @@ -0,0 +1,119 @@ +import re +# Import internal tools +from logging import Logging + +# Setup logger +log = Logging() + + +class SearchTool: + def __init__(self, lang, manual, media, results): + self.lang = lang + self.manual = manual + self.media = media + self.results = results + + def get_id_from_url(self, item): + itemId_full = None + itemId = None + valid_itemId = None + url = item['url'] + log.debug('URL For Breakdown: %s', url) + + #TODO these can probably be combined into one + # Get the id + for item in url.split('/'): + # IDs No longer start with just 'B0' + if re.match(r'^[0-9A-Z]{10,10}', item): + itemId_full = item + break + + # New Search results contain question marks after the ID + for itemId in itemId_full.split('?'): + # IDs No longer start with just 'B0' + if re.match(r'^[0-9A-Z]{10,10}', itemId): + valid_itemId = itemId + break + + if not valid_itemId: + log.info('No Match: %s', url) + return None + 
+ return valid_itemId + + def pre_search_logging(self): + log.separator(msg='ALBUM SEARCH', log_level="info") + # Log basic metadata + data_to_log = [ + {'ID': self.media.parent_metadata.id}, + {'Title': self.media.title}, + {'Name': self.media.name}, + {'Album': self.media.album}, + {'Artist': self.media.artist}, + ] + log.metadata(data_to_log) + log.separator(log_level="info") + + # Handle a couple of edge cases where + # album search will give bad results. + if self.media.album is None and not self.manual: + log.info('Album Title is NULL on an automatic search. Returning') + return + if self.media.album == '[Unknown Album]' and not self.manual: + log.info( + 'Album Title is [Unknown Album]' + ' on an automatic search. Returning' + ) + return + + if self.manual: + log.separator(msg="NOTE", log_level="info") + log.info( + 'You clicked \'fix match\'. ' + 'This may have returned no useful results because ' + 'it\'s searching using the title of the first track.' + ) + log.info( + 'There\'s not currently a way around this initial failure. ' + 'But clicking \'Search Options\' and ' + 'entering the title works just fine.' + ) + log.info( + 'This message will appear during the initial ' + 'search and the actual manual search.' + ) + # If this is a custom search, + # use the user-entered name instead of the scanner hint. 
+ if self.media.name: + log.info( + 'Custom album search for: ' + self.media.name + ) + self.media.album = self.media.name + + def strip_title(self, normalizedName): + if len(normalizedName) == 0: + normalizedName = self.media.album + log.debug( + 'normalizedName = %s', normalizedName + ) + + # Chop off "unabridged" + normalizedName = re.sub( + r"[\(\[].*?[\)\]]", "", normalizedName + ) + log.debug( + 'chopping bracketed text = %s', normalizedName + ) + normalizedName = normalizedName.strip() + log.debug( + 'normalizedName stripped = %s', normalizedName + ) + + log.separator( + msg=( + "SEARCHING FOR " + '"' + normalizedName + '"' + ), + log_level="info" + ) + # Give access of this variable to the class + self.normalizedName = normalizedName diff --git a/Contents/Code/update_tools.py b/Contents/Code/update_tools.py new file mode 100644 index 0000000..569849f --- /dev/null +++ b/Contents/Code/update_tools.py @@ -0,0 +1,85 @@ +# Import internal tools +from logging import Logging + +# Setup logger +log = Logging() + + +class UpdateTool: + def __init__(self, force, lang, media, metadata, url): + self.date = None + self.force = force + self.genre_child = None + self.genre_parent = None + self.lang = lang + self.media = media + self.metadata = metadata + self.rating = None + self.series = '' + self.series2 = '' + self.series_def = '' + self.url = url + self.volume = '' + self.volume2 = '' + self.volume_def = '' + + def re_parse_with_date_published(self, json_data): + for data in json_data: + if 'datePublished' in data: + self.date = data['datePublished'] + self.title = data['name'] + self.thumb = data['image'] + # Set rating when available + if 'aggregateRating' in data: + self.rating = ( + data['aggregateRating']['ratingValue'] + ) + author_array = [] + for c in data['author']: + author_array.append(c['name']) + self.author = ",".join(author_array) + + narrator_array = [] + for c in data['readBy']: + narrator_array.append(c['name']) + self.narrator = 
",".join(narrator_array) + self.studio = data['publisher'] + self.synopsis = data['description'] + if 'itemListElement' in data: + self.genre_parent = ( + data['itemListElement'][1]['item']['name'] + ) + try: + self.genre_child = ( + data['itemListElement'][2]['item']['name'] + ) + except AttributeError: + continue + + # Writes metadata information to log. + def writeInfo(self): + log.separator(msg='New data', log_level="info") + + # Log basic metadata + data_to_log = [ + {'ID': self.metadata.id}, + {'URL': self.url}, + {'Title': self.metadata.title}, + {'Release date': str(self.metadata.originally_available_at)}, + {'Studio': self.metadata.studio}, + {'Summary': self.metadata.summary}, + {'Poster URL': self.thumb}, + ] + log.metadata(data_to_log, log_level="info") + + # Log basic metadata stored in arrays + multi_arr = [ + # {'Collection': self.metadata.collections}, + {'Genre': self.metadata.genres}, + {'Moods(Authors)': self.metadata.moods}, + {'Styles(Narrators)': self.metadata.styles}, + # {'Fan art URL': self.metadata.art}, + ] + log.metadata_arrs(multi_arr, log_level="info") + + log.separator(log_level="info") \ No newline at end of file From b9f0e53079d3d7c3dd75951788f3f849525686af Mon Sep 17 00:00:00 2001 From: djdembeck Date: Fri, 27 Aug 2021 15:58:32 -0500 Subject: [PATCH 28/30] remove dev comments --- Contents/Code/__init__.py | 49 ++++----------------------------------- 1 file changed, 4 insertions(+), 45 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 246ddff..9031a1b 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -166,34 +166,10 @@ def search(self, results, media, lang, manual): url_info = SiteUrl(Prefs['sitetype'], Prefs['site'], lang) ctx = url_info.SetupUrls() + # Instantiate search helper search_helper = SearchTool(lang, manual, media, results) - """ - The process needs to be as follows: - 1. Search class is instantiated. - 2. Search class is run linearly. 
This can be done 1 of 2 ways: - - Call helper in the class to run it itself. - - Call each function here, using the same object. - 3. Search class returns necessary data for update function. - - What does the output of these actions need to be, and in what form? - What do I need to run here vs there? - """ - - """ - Functions I know I can call with object: - pre_search_logging - strip_title - """ - search_helper.pre_search_logging() - """ - Functions I can't call with object - create_search_url - doSearch - before_xpath - after_xpath - run_search - """ + search_helper.pre_search_logging() # Run helper before passing to SearchTool normalizedName = self.normalize_name(media.album) @@ -273,8 +249,9 @@ def update(self, metadata, media, lang, force=False): html = HTML.ElementFromURL(url, sleep=REQUEST_DELAY) except Exception as e: log.info(e) - + # Instantiate update helper update_helper = UpdateTool(force, lang, media, metadata, url) + self.scrape_book_metadata(ctx, update_helper, html) if not update_helper.date: @@ -556,7 +533,6 @@ def run_search(self, helper, media, result): """ def scrape_book_metadata(self, ctx, helper, html): - # result = None for r in html.xpath('//div[contains (@id, "adbl_page_content")]'): author = self.getStringContentFromXPath( r, '//li//a[contains (@class,"author-profile-link")][1]' @@ -618,21 +594,6 @@ def scrape_book_metadata(self, ctx, helper, html): helper.thumb = thumb helper.title = title - # result = { - # 'author': author, - # 'date': date, - # 'genre_child': genre_child, - # 'genre_parent': genre_parent, - # 'narrator': narrator, - # 'series': series, - # 'studio': studio, - # 'synopsis': synopsis, - # 'thumb': thumb, - # 'title': title, - # 'url': murl, - # } - # return result - def date_missing(self, helper, html): for r in html.xpath( '//script[contains (@type, "application/ld+json")]' @@ -649,7 +610,6 @@ def date_missing(self, helper, html): json_data = self.json_decode(page_content) 
helper.re_parse_with_date_published(json_data) - # return book_data def use_copyright_date(self, helper, html): cstring = None @@ -679,7 +639,6 @@ def use_copyright_date(self, helper, html): ) else: helper.date = re.match(".?(\d{4}).*", cstring).group(1) - # return date def handle_series(self, helper, html): for r in html.xpath('//span[contains(@class, "seriesLabel")]'): From 3082453a6b0a1965108b1e25b4e658b1ea43f254 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Fri, 27 Aug 2021 16:07:51 -0500 Subject: [PATCH 29/30] better naming of vars --- Contents/Code/__init__.py | 4 ++-- Contents/Code/search_tools.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py index 9031a1b..993324b 100644 --- a/Contents/Code/__init__.py +++ b/Contents/Code/__init__.py @@ -316,10 +316,10 @@ def update(self, metadata, media, lang, force=False): Sorted by position in the search process """ - def normalize_name(self, input): + def normalize_name(self, input_name): # Normalize the name normalizedName = String.StripDiacritics( - input + input_name ) return normalizedName diff --git a/Contents/Code/search_tools.py b/Contents/Code/search_tools.py index 370f6ce..9635d47 100644 --- a/Contents/Code/search_tools.py +++ b/Contents/Code/search_tools.py @@ -22,10 +22,10 @@ def get_id_from_url(self, item): #TODO these can probably be combined into one # Get the id - for item in url.split('/'): + for partial in url.split('/'): # IDs No longer start with just 'B0' - if re.match(r'^[0-9A-Z]{10,10}', item): - itemId_full = item + if re.match(r'^[0-9A-Z]{10,10}', partial): + itemId_full = partial break # New Search results contain question marks after the ID From 3a52e14d3b28d981813a9c5598fcc1c3934a0296 Mon Sep 17 00:00:00 2001 From: djdembeck Date: Fri, 27 Aug 2021 16:50:26 -0500 Subject: [PATCH 30/30] Greatly simplify ASIN extraction --- Contents/Code/search_tools.py | 28 ++++++---------------------- 1 file changed, 6 
insertions(+), 22 deletions(-) diff --git a/Contents/Code/search_tools.py b/Contents/Code/search_tools.py index 9635d47..168fe28 100644 --- a/Contents/Code/search_tools.py +++ b/Contents/Code/search_tools.py @@ -14,32 +14,16 @@ def __init__(self, lang, manual, media, results): self.results = results def get_id_from_url(self, item): - itemId_full = None - itemId = None - valid_itemId = None url = item['url'] log.debug('URL For Breakdown: %s', url) - #TODO these can probably be combined into one - # Get the id - for partial in url.split('/'): - # IDs No longer start with just 'B0' - if re.match(r'^[0-9A-Z]{10,10}', partial): - itemId_full = partial - break + # Find ASIN before ? in URL + asin = re.search(r'[0-9A-Z]{9}.+?(?=\?)', url).group(0) + if asin: + return asin - # New Search results contain question marks after the ID - for itemId in itemId_full.split('?'): - # IDs No longer start with just 'B0' - if re.match(r'^[0-9A-Z]{10,10}', itemId): - valid_itemId = itemId - break - - if not valid_itemId: - log.info('No Match: %s', url) - return None - - return valid_itemId + log.info('No Match: %s', url) + return None def pre_search_logging(self): log.separator(msg='ALBUM SEARCH', log_level="info")