Improve score handling; Reduce limits; Remove unused xpath function

seanap · Aug 30, 2021 · ca10e17 · ca10e17
1 parent 93da952
commit ca10e17
Show file tree

Hide file tree

Showing 2 changed files with 135 additions and 121 deletions.
diff --git a/Contents/Code/__init__.py b/Contents/Code/__init__.py
@@ -10,11 +10,11 @@
 from update_tools import UpdateTool
 from urls import SiteUrl
 
-VERSION_NO = '2021.08.28.2'
+VERSION_NO = '2021.08.29.1'
 
 # Delay used when requesting HTML,
 # may be good to have to prevent being banned from the site
-REQUEST_DELAY = 10
+REQUEST_DELAY = 1
 
 # Starting value for score before deductions are taken.
 INITIAL_SCORE = 100
@@ -23,7 +23,7 @@
 # Any score lower than this will be ignored.
 IGNORE_SCORE = 45
 
-THREAD_MAX = 20
+#THREAD_MAX = 20
 
 # Setup logger
 log = Logging()
@@ -251,6 +251,7 @@ def update(self, metadata, media, lang, force=False):
             html = HTML.ElementFromURL(url, sleep=REQUEST_DELAY)
         except Exception as e:
             log.info(e)
+
         # Instantiate update helper
         update_helper = UpdateTool(force, lang, media, metadata, url)
 
@@ -356,10 +357,6 @@ def doSearch(self, ctx, url):
         # Set append to the returned array from this function
         found = self.before_xpath(ctx, found, html)
 
-        log.separator(msg='just after new xpath line', log_level="debug")
-        # Set append to the returned array from this function
-        found = self.after_xpath(ctx, found, html)
-
         return found
 
     def before_xpath(self, ctx, found, html):
@@ -380,6 +377,12 @@ def before_xpath(self, ctx, found, html):
             )
             datetext = re.sub(r'[^0-9\-]', '', datetext)
             date = self.getDateFromString(datetext)
+            language = self.getStringContentFromXPath(
+                r, (
+                    u'div/div/div/div/div/div/span/ul/li'
+                    '[contains (@class,"languageLabel")]/span'
+                    )
+            ).split()[1]
             narrator = self.getStringContentFromXPath(
                 r, (
                     u'div/div/div/div/div/div/span/ul/li'
@@ -405,54 +408,7 @@ def before_xpath(self, ctx, found, html):
                 {
                     'author': author,
                     'date': date,
-                    'narrator': narrator,
-                    'thumb': thumb,
-                    'title': title,
-                    'url': murl,
-                }
-            )
-        return found
-
-    def after_xpath(self, ctx, found, html):
-        for r in html.xpath(
-            '//div[contains (@class, "adbl-search-result")]'
-        ):
-            author = self.getStringContentFromXPath(
-                r, (
-                        'div/div/ul/li/'
-                        '/a[contains (@class,"author-profile-link")][1]'
-                    )
-            )
-            date = self.getDateFromString(
-                self.getStringContentFromXPath(
-                    r, (
-                        u'div/div/ul/li[contains (., "{0}")]'
-                        '/span[2]//text()'
-                        ).format(
-                        ctx['REL_DATE']
-                    )
-                )
-            )
-            murl = self.getAnchorUrlFromXPath(
-                r, 'div/div/div/div/a[1]'
-            )
-            narrator = self.getStringContentFromXPath(
-                r, u'div/div/ul/li[contains (., "{0}")]//a[1]'.format(
-                    ctx['NAR_BY']
-                )
-            )
-            thumb = self.getImageUrlFromXPath(
-                r, 'div[contains (@class,"adbl-prod-image-sample-cont")]/a/img'
-            )
-            title = self.getStringContentFromXPath(
-                r, 'div/div/div/div/a[1]'
-            )
-            log.separator(msg='XPATH SEARCH HIT', log_level="debug")
-
-            found.append(
-                {
-                    'author': author,
-                    'date': date,
+                    'language': language,
                     'narrator': narrator,
                     'thumb': thumb,
                     'title': title,
@@ -471,73 +427,16 @@ def run_search(self, helper, result):
             if not valid_itemId:
                 continue
 
-            title = f['title']
-            thumb = f['thumb']
             date = f['date']
             year = ''
-            author = f['author']
-            narrator = f['narrator']
-
             if date is not None:
                 year = date.year
 
                 # Make sure this isn't a pre-order listing
                 if helper.check_if_preorder(date):
                     continue
 
-            # Score the album name
-            scorebase1 = helper.media.album
-            scorebase2 = title.encode('utf-8')
-            album_score = INITIAL_SCORE - Util.LevenshteinDistance(
-                scorebase1, scorebase2
-            )
-            log.debug("Score from album: " + str(album_score))
-
-            # Score the author name
-            if helper.media.artist:
-                scorebase3 = helper.media.artist
-                scorebase4 = author
-                author_score = INITIAL_SCORE - Util.LevenshteinDistance(
-                    scorebase3, scorebase4
-                )
-                log.debug("Score from author: " + str(author_score))
-                # Find the difference in score between name and author
-                score = (
-                    album_score + author_score
-                ) - INITIAL_SCORE
-            else:
-                score = album_score
-
-            log.info("Result #" + str(i + 1))
-            # Log basic metadata
-            data_to_log = [
-                {'ID is': valid_itemId},
-                {'Title is': title},
-                {'Author is': author},
-                {'Narrator is': narrator},
-                {'Date is ': str(date)},
-                {'Score is': str(score)},
-                {'Thumb is': thumb},
-            ]
-            log.metadata(data_to_log, log_level="info")
-
-            if score >= IGNORE_SCORE:
-                info.append(
-                    {
-                        'id': valid_itemId,
-                        'title': title,
-                        'year': year,
-                        'date': date,
-                        'score': score,
-                        'thumb': thumb,
-                        'artist': author
-                    }
-                )
-            else:
-                log.info(
-                    '# Score is below ignore boundary (%s)... Skipping!',
-                    IGNORE_SCORE
-                )
+            self.score_result(f, helper, i, info, valid_itemId, year)
 
             # Print separators for easy reading
             if i <= len(result):
@@ -546,6 +445,110 @@ def run_search(self, helper, result):
         info = sorted(info, key=lambda inf: inf['score'], reverse=True)
         return info
 
+    def score_result(self, f, helper, i, info, valid_itemId, year):
+        author = f['author']
+        date = f['date']
+        language = f['language']
+        narrator = f['narrator']
+        thumb = f['thumb']
+        title = f['title']
+
+        # Array to hold score points for processing
+        all_scores = []
+
+        # Album name score
+        all_scores.append(
+            self.score_album(helper, title)
+        )
+        # Author name score
+        all_scores.append(
+            self.score_author(author, helper)
+        )
+        # Library language score
+        all_scores.append(
+            self.score_language(helper, language)
+        )
+
+        # Because builtin sum() isn't available
+        sum=lambda numberlist:reduce(lambda x,y:x+y,numberlist,0)
+        # Subtract difference from initial score
+        score = INITIAL_SCORE - sum(all_scores)
+
+        log.info("Result #" + str(i + 1))
+        # Log basic metadata
+        data_to_log = [
+            {'ID is': valid_itemId},
+            {'Title is': title},
+            {'Author is': author},
+            {'Narrator is': narrator},
+            {'Date is ': str(date)},
+            {'Score is': str(score)},
+            {'Thumb is': thumb},
+        ]
+        log.metadata(data_to_log, log_level="info")
+
+        if score >= IGNORE_SCORE:
+            info.append(
+                {
+                    'id': valid_itemId,
+                    'title': title,
+                    'year': year,
+                    'date': date,
+                    'score': score,
+                    'thumb': thumb,
+                    'artist': author
+                }
+            )
+        else:
+            log.info(
+                '# Score is below ignore boundary (%s)... Skipping!',
+                IGNORE_SCORE
+            )
+
+    def score_album(self, helper, title):
+        """
+            Compare the input album similarity to the search result album.
+            Score is calculated with LevenshteinDistance
+        """
+        scorebase1 = helper.media.album
+        scorebase2 = title.encode('utf-8')
+        album_score = Util.LevenshteinDistance(
+            scorebase1, scorebase2
+        )
+        log.debug("Score from album: " + str(album_score))
+        return album_score
+
+    def score_author(self, author, helper):
+        """
+            Compare the input author similarity to the search result author.
+            Score is calculated with LevenshteinDistance
+        """
+        if helper.media.artist:
+            scorebase3 = helper.media.artist
+            scorebase4 = author
+            author_score = Util.LevenshteinDistance(
+                scorebase3, scorebase4
+            )
+            log.debug("Score from author: " + str(author_score))
+            return author_score
+
+    def score_language(self, helper, language):
+        """
+            Compare the library language to search results
+            and knock off 2 points if they don't match.
+        """
+        lang_dict = {
+            Locale.Language.English: 'English',
+            'de': 'German',
+            'fr': 'French',
+            'it': 'Italian'
+        }
+
+        if language != lang_dict[helper.lang]:
+            log.debug("Book is not library language, deduct 2 points")
+            return 2
+        return 0
+
     """
         Update functions that require PMS imports,
         thus we cannot 'outsource' them to UpdateTool
@@ -669,7 +672,9 @@ def handle_series(self, helper, html):
                 r, '//li[contains(@class, "seriesLabel")]//a[2]'
             )
 
-            helper.series_def = helper.series2 if helper.series2 else helper.series
+            helper.series_def = (
+                helper.series2 if helper.series2 else helper.series
+            )
 
             helper.volume = self.getStringContentFromXPath(
                 r, '//li[contains(@class, "seriesLabel")]/text()[2]'
@@ -682,7 +687,9 @@ def handle_series(self, helper, html):
             if helper.volume2 == ",":
                 helper.volume2 = ""
 
-            helper.volume_def = helper.helper.volume2 if helper.volume2 else helper.volume
+            helper.volume_def = (
+                helper.helper.volume2 if helper.volume2 else helper.volume
+            )
 
         # fix series when audible 'forgets' the series link…
         if not helper.series_def:
@@ -718,15 +725,23 @@ def compile_metadata(self, helper):
         # Other metadata
         helper.metadata.title = helper.title
         helper.metadata.title_sort = ' - '.join(
-            filter(None, [(helper.series_def + helper.volume_def), helper.title])
+            filter(
+                None, [(helper.series_def + helper.volume_def), helper.title]
+            )
         )
         helper.metadata.studio = helper.studio
         helper.metadata.summary = helper.synopsis
 
-        if Prefs['cover_options'] == "Use Audible cover":
-            helper.metadata.posters[1] = Proxy.Media(HTTP.Request(helper.thumb))
+        if Prefs['cover_options'] == (
+            "Use Audible cover"
+        ):
+            helper.metadata.posters[1] = Proxy.Media(
+                HTTP.Request(helper.thumb)
+            )
             helper.metadata.posters.validate_keys(helper.thumb)
-        elif Prefs['cover_options'] == "Download cover but don't overwrite existing":
+        elif Prefs['cover_options'] == (
+            "Download cover but don't overwrite existing"
+        ):
             helper.metadata.posters[helper.thumb] = Proxy.Media(
                 HTTP.Request(helper.thumb), sort_order=1
             )

diff --git a/Contents/Code/search_tools.py b/Contents/Code/search_tools.py
@@ -124,6 +124,5 @@ def validate_author_name(self):
                 log.info(
                     "Artist name seems to be bad, "
                     "not using it in search."
-
-                    )
+                )
                 break