From 574ee5d6006eb236f235d9ce08c6cafbf780f14a Mon Sep 17 00:00:00 2001
From: "Samuel J. Woodward" <sam@woodward.fyi>
Date: Mon, 22 Apr 2024 10:00:01 -0400
Subject: [PATCH 01/10] Commiting .gitignore changes from vim. Will be restored
 to master version later

---
 .gitignore | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/.gitignore b/.gitignore
index 8f8bc7e4..af080c95 100755
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,14 @@ mp3
 *.mp3
 .DS_Store
 *.cnf
+
+# To be restored later
+__pycache__/
+.ruff_cache/
+[._]*.s[a-v][a-z]
+!*.svg  # comment out if you don't need vector files
+[._]*.sw[a-p]
+[._]s[a-rt-v][a-z]
+[._]ss[a-gi-z]
+[._]sw[a-p]
+[._]*.un~

From d77a8659a365a57605ad3fccf9eed68cab4d7705 Mon Sep 17 00:00:00 2001
From: "Samuel J. Woodward" <sam@woodward.fyi>
Date: Mon, 22 Apr 2024 10:01:16 -0400
Subject: [PATCH 02/10] Adding placeholder definitions of Dejavu.songs,
 Dejavu.songshashes_set to take advantage of Python's special, optimized dict
 build for __init__

---
 dejavu/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index fac72bc5..b1559876 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -33,6 +33,8 @@ def __init__(self, config):
         self.limit = self.config.get("fingerprint_limit", None)
         if self.limit == -1:  # for JSON compatibility
             self.limit = None
+        self.songs = None
+        self.songhashes_set = set()  # to know which ones we've computed before
         self.__load_fingerprinted_audio_hashes()
 
     def __load_fingerprinted_audio_hashes(self) -> None:

From ba77e00cf3261c6f3588a0eaef332b1df0e58db0 Mon Sep 17 00:00:00 2001
From: "Samuel J. Woodward" <sam@woodward.fyi>
Date: Mon, 22 Apr 2024 10:30:15 -0400
Subject: [PATCH 03/10] fingerprint_directory now uses a
 concurrent.futures.ProcessPoolExecutor to compute hashes. Since a
 ProcessPoolExecutor can pass args and kwargs, Dejavu._fingerprint_worker not
 takes the file_name and limit args directly.

---
 dejavu/__init__.py | 98 ++++++++++++++++++----------------------------
 1 file changed, 39 insertions(+), 59 deletions(-)

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index b1559876..5f34fb36 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -1,7 +1,7 @@
-import multiprocessing
 import os
 import sys
 import traceback
+from concurrent.futures import ProcessPoolExecutor, as_completed
 from itertools import groupby
 from time import time
 from typing import Dict, List, Tuple
@@ -73,52 +73,37 @@ def fingerprint_directory(self, path: str, extensions: str, nprocesses: int = No
         :param extensions: list of file extensions to consider.
         :param nprocesses: amount of processes to fingerprint the files within the directory.
         """
-        # Try to use the maximum amount of processes if not given.
-        try:
-            nprocesses = nprocesses or multiprocessing.cpu_count()
-        except NotImplementedError:
-            nprocesses = 1
-        else:
-            nprocesses = 1 if nprocesses <= 0 else nprocesses
-
-        pool = multiprocessing.Pool(nprocesses)
-
-        filenames_to_fingerprint = []
-        for filename, _ in decoder.find_files(path, extensions):
-            # don't refingerprint already fingerprinted files
-            if decoder.unique_hash(filename) in self.songhashes_set:
-                print(f"{filename} already fingerprinted, continuing...")
-                continue
-
-            filenames_to_fingerprint.append(filename)
-
-        # Prepare _fingerprint_worker input
-        worker_input = list(zip(filenames_to_fingerprint, [self.limit] * len(filenames_to_fingerprint)))
-
-        # Send off our tasks
-        iterator = pool.imap_unordered(Dejavu._fingerprint_worker, worker_input)
-
-        # Loop till we have all of them
-        while True:
-            try:
-                song_name, hashes, file_hash = next(iterator)
-            except multiprocessing.TimeoutError:
-                continue
-            except StopIteration:
-                break
-            except Exception:
-                print("Failed fingerprinting")
-                # Print traceback because we can't reraise it here
-                traceback.print_exc(file=sys.stdout)
-            else:
-                sid = self.db.insert_song(song_name, file_hash, len(hashes))
-
-                self.db.insert_hashes(sid, hashes)
-                self.db.set_song_fingerprinted(sid)
-                self.__load_fingerprinted_audio_hashes()
-
-        pool.close()
-        pool.join()
+        nprocesses = int(nprocesses) if nprocesses is not None else None
+
+        with ProcessPoolExecutor(max_workers=nprocesses) as executor:
+            futures = []
+            for filename, _ in decoder.find_files_g(path, extensions):
+                # don't refingerprint already fingerprinted files
+                if decoder.unique_hash(filename) in self.songhashes_set:
+                    print(f"{filename} already fingerprinted, continuing...")
+                else:
+                    futures.append(
+                        executor.submit(
+                            self._fingerprint_worker,
+                            filename,
+                            self.limit,
+                        )
+                    )
+            for future in as_completed(futures):
+                try:
+                    song_name, hashes, file_hash = future.result()
+                except StopIteration:
+                    break
+                except Exception:
+                    print("Failed fingerprinting")
+                    # Print traceback because we can't reraise it here
+                    traceback.print_exc(file=sys.stdout)
+                else:
+                    sid = self.db.insert_song(song_name, file_hash, len(hashes))
+                    self.db.insert_hashes(sid, hashes)
+                    self.db.set_song_fingerprinted(sid)
+        # Wait until all songs are processed to reload hashes
+        self.__load_fingerprinted_audio_hashes()
 
     def fingerprint_file(self, file_path: str, song_name: str = None) -> None:
         """
@@ -228,18 +213,13 @@ def recognize(self, recognizer, *options, **kwoptions) -> Dict[str, any]:
         return r.recognize(*options, **kwoptions)
 
     @staticmethod
-    def _fingerprint_worker(arguments):
-        # Pool.imap sends arguments as tuples so we have to unpack
-        # them ourself.
-        try:
-            file_name, limit = arguments
-        except ValueError:
-            pass
-
-        song_name, extension = os.path.splitext(os.path.basename(file_name))
-
-        fingerprints, file_hash = Dejavu.get_file_fingerprints(file_name, limit, print_output=True)
-
+    def _fingerprint_worker(file_name, limit):
+        song_name = os.path.splitext(os.path.basename(file_name))[0]
+        # Suppressing print_output because MP will step all over itself
+        # while printing to stdout
+        fingerprints, file_hash = Dejavu.get_file_fingerprints(
+            file_name, limit, print_output=False
+        )
         return song_name, fingerprints, file_hash
 
     @staticmethod

From 6ce7dbf0581373a8402dc810a6cc60b3c019a32c Mon Sep 17 00:00:00 2001
From: "Samuel J. Woodward" <sam@woodward.fyi>
Date: Mon, 22 Apr 2024 10:34:15 -0400
Subject: [PATCH 04/10] Minor tweak for cleanup

---
 dejavu/logic/decoder.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/dejavu/logic/decoder.py b/dejavu/logic/decoder.py
index ccafa262..02a40f7b 100755
--- a/dejavu/logic/decoder.py
+++ b/dejavu/logic/decoder.py
@@ -23,10 +23,7 @@ def unique_hash(file_path: str, block_size: int = 2**20) -> str:
     """
     s = sha1()
     with open(file_path, "rb") as f:
-        while True:
-            buf = f.read(block_size)
-            if not buf:
-                break
+        while buf := f.read(block_size)
             s.update(buf)
     return s.hexdigest().upper()
 

From 3eab4f1008a2abdd24240a9e0528e501992d42c4 Mon Sep 17 00:00:00 2001
From: "Samuel J. Woodward" <sam@woodward.fyi>
Date: Mon, 22 Apr 2024 10:45:53 -0400
Subject: [PATCH 05/10] Adding find_files_g which is an iterator that yields
 the fpath, extension tuple. Also adds the ':' to unique_hash that I forgot
 but am too lazy to fix in the commit

---
 dejavu/logic/decoder.py | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/dejavu/logic/decoder.py b/dejavu/logic/decoder.py
index 02a40f7b..81edb2af 100755
--- a/dejavu/logic/decoder.py
+++ b/dejavu/logic/decoder.py
@@ -1,7 +1,7 @@
 import fnmatch
 import os
 from hashlib import sha1
-from typing import List, Tuple
+from typing import Generator, List, Tuple
 
 import numpy as np
 from pydub import AudioSegment
@@ -23,7 +23,7 @@ def unique_hash(file_path: str, block_size: int = 2**20) -> str:
     """
     s = sha1()
     with open(file_path, "rb") as f:
-        while buf := f.read(block_size)
+        while buf := f.read(block_size):
             s.update(buf)
     return s.hexdigest().upper()
 
@@ -48,6 +48,30 @@ def find_files(path: str, extensions: List[str]) -> List[Tuple[str, str]]:
     return results
 
 
+def find_files_g(path: str, extensions: List[str]) -> Generator[Tuple[str, str]]:
+    """
+    Get all files that meet the specified extensions.
+
+    :param path: path to a directory with audio files.
+    :param extensions: file extensions to look for.
+    :yields: a tuple with file name and its extension.
+    """
+    # Allow both with ".mp3" and without "mp3" to be used for extensions
+    norm_extensions = set()
+    for extension in extensions:
+        extension = extension.lower()
+        norm_extensions.add(extension)
+        if extension.startswith('.'):
+            norm_extensions.add(extension.lstrip('.'))
+        else:
+            norm_extensions.add(f'.{extension}')
+    for root, dirs, files in os.walk(path):
+        for f in files:
+            ext = os.path.splitext(f)[1].lower()
+            if ext in norm_extensions:
+                yield os.path.join(root, f), ext
+
+
 def read(file_name: str, limit: int = None) -> Tuple[List[List[int]], int, str]:
     """
     Reads any file supported by pydub (ffmpeg) and returns the data contained

From 21f68edcb03615bd5c8792fd9da61e207099addd Mon Sep 17 00:00:00 2001
From: "Samuel J. Woodward" <sam@woodward.fyi>
Date: Mon, 22 Apr 2024 10:51:59 -0400
Subject: [PATCH 06/10] Making channels in read list comps

---
 dejavu/logic/decoder.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/dejavu/logic/decoder.py b/dejavu/logic/decoder.py
index 81edb2af..dc8b516b 100755
--- a/dejavu/logic/decoder.py
+++ b/dejavu/logic/decoder.py
@@ -95,9 +95,9 @@ def read(file_name: str, limit: int = None) -> Tuple[List[List[int]], int, str]:
 
         data = np.fromstring(audiofile.raw_data, np.int16)
 
-        channels = []
-        for chn in range(audiofile.channels):
-            channels.append(data[chn::audiofile.channels])
+        channels = [
+            data[chn::audiofile.channels] for chn in range(audiofile.channels)
+        ]
 
         audiofile.frame_rate
     except audioop.error:
@@ -109,9 +109,7 @@ def read(file_name: str, limit: int = None) -> Tuple[List[List[int]], int, str]:
         audiofile = audiofile.T
         audiofile = audiofile.astype(np.int16)
 
-        channels = []
-        for chn in audiofile:
-            channels.append(chn)
+        channels = [chn for chn in audiofile]
 
     return channels, audiofile.frame_rate, unique_hash(file_name)
 

From 7ab1ce5e76fa3dcd463503b0eafa13f31fdb90a8 Mon Sep 17 00:00:00 2001
From: "Samuel J. Woodward" <sam@woodward.fyi>
Date: Mon, 22 Apr 2024 10:54:16 -0400
Subject: [PATCH 07/10] Adding song hashes directly to songhashes_set to
 prevent an unnecessary lookup

---
 dejavu/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index 5f34fb36..757a9b05 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -46,8 +46,7 @@ def __load_fingerprinted_audio_hashes(self) -> None:
         self.songs = self.db.get_songs()
         self.songhashes_set = set()  # to know which ones we've computed before
         for song in self.songs:
-            song_hash = song[FIELD_FILE_SHA1]
-            self.songhashes_set.add(song_hash)
+            self.songhashes_set.add(song[FIELD_FILE_SHA1])
 
     def get_fingerprinted_songs(self) -> List[Dict[str, any]]:
         """

From 19201bf7e9475bc20c37b9f820225bf0bb575aff Mon Sep 17 00:00:00 2001
From: "Samuel J. Woodward" <sam@woodward.fyi>
Date: Mon, 22 Apr 2024 11:02:57 -0400
Subject: [PATCH 08/10] Changing counts and song_matches in align_matches to
 use generator expressions for lazy evaluation

---
 dejavu/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index 757a9b05..1be47f0f 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -173,9 +173,9 @@ def align_matches(self, matches: List[Tuple[int, int]], dedup_hashes: Dict[str,
         """
         # count offset occurrences per song and keep only the maximum ones.
         sorted_matches = sorted(matches, key=lambda m: (m[0], m[1]))
-        counts = [(*key, len(list(group))) for key, group in groupby(sorted_matches, key=lambda m: (m[0], m[1]))]
+        counts = ((*key, len(list(group))) for key, group in groupby(sorted_matches, key=lambda m: (m[0], m[1])))
         songs_matches = sorted(
-            [max(list(group), key=lambda g: g[2]) for key, group in groupby(counts, key=lambda count: count[0])],
+            (max(group, key=lambda g: g[2]) for key, group in groupby(counts, key=lambda count: count[0])),
             key=lambda count: count[2], reverse=True
         )
 

From 603497805f6e9d4f0c79e2cf293771205f27c914 Mon Sep 17 00:00:00 2001
From: "Samuel J. Woodward" <sam@woodward.fyi>
Date: Mon, 22 Apr 2024 11:09:24 -0400
Subject: [PATCH 09/10] Fixing type hint for find_files_g to correctly capture
 send_type and return_type

---
 dejavu/logic/decoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dejavu/logic/decoder.py b/dejavu/logic/decoder.py
index dc8b516b..cab09bb6 100755
--- a/dejavu/logic/decoder.py
+++ b/dejavu/logic/decoder.py
@@ -48,7 +48,7 @@ def find_files(path: str, extensions: List[str]) -> List[Tuple[str, str]]:
     return results
 
 
-def find_files_g(path: str, extensions: List[str]) -> Generator[Tuple[str, str]]:
+def find_files_g(path: str, extensions: List[str]) -> Generator[Tuple[str, str], None, None]:
     """
     Get all files that meet the specified extensions.
 

From 23056293ea92b18b7a47db4a73e2a72bb416750f Mon Sep 17 00:00:00 2001
From: "Samuel J. Woodward" <sam@woodward.fyi>
Date: Mon, 22 Apr 2024 14:42:15 -0400
Subject: [PATCH 10/10] Restoring .gitignore to master

---
 .gitignore | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index af080c95..8f8bc7e4 100755
--- a/.gitignore
+++ b/.gitignore
@@ -5,14 +5,3 @@ mp3
 *.mp3
 .DS_Store
 *.cnf
-
-# To be restored later
-__pycache__/
-.ruff_cache/
-[._]*.s[a-v][a-z]
-!*.svg  # comment out if you don't need vector files
-[._]*.sw[a-p]
-[._]s[a-rt-v][a-z]
-[._]ss[a-gi-z]
-[._]sw[a-p]
-[._]*.un~