Merge pull request #4199 from jcassette/duplicate
Allow configuring which fields are used to find duplicates
sampsyo authored Aug 21, 2022
2 parents 7467bc3 + 2ebc28d commit e584b04
Showing 7 changed files with 122 additions and 28 deletions.
2 changes: 1 addition & 1 deletion beets/autotag/__init__.py
@@ -27,7 +27,7 @@
     TrackMatch,
     Distance,
 )
-from .match import tag_item, tag_album, Proposal  # noqa
+from .match import tag_item, tag_album, current_metadata, Proposal  # noqa
 from .match import Recommendation  # noqa
 
 # Global logger.
3 changes: 3 additions & 0 deletions beets/config_default.yaml
@@ -27,6 +27,9 @@ import:
     group_albums: no
     pretend: false
     search_ids: []
+    duplicate_keys:
+        album: albumartist album
+        item: artist title
     duplicate_action: ask
     bell: no
     set_fields: {}
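
These defaults ensure the option always has a value. For context, a small sketch of how the importer reads the lists back (assuming a working beets install; as_str_seq() is confuse's accessor for space-separated values):

    from beets import config

    # Split the space-separated field lists into Python lists.
    album_keys = config['import']['duplicate_keys']['album'].as_str_seq()
    item_keys = config['import']['duplicate_keys']['item'].as_str_seq()
    print(album_keys)  # ['albumartist', 'album'] with the defaults above
    print(item_keys)   # ['artist', 'title']
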
21 changes: 20 additions & 1 deletion beets/dbcore/db.py
@@ -27,7 +27,7 @@
 from beets.util import functemplate
 from beets.util import py3_path
 from beets.dbcore import types
-from .query import MatchQuery, NullSort, TrueQuery
+from .query import MatchQuery, NullSort, TrueQuery, AndQuery
 from collections.abc import Mapping
@@ -641,6 +641,25 @@ def set_parse(self, key, string):
         """
         self[key] = self._parse(key, string)
 
+    # Convenient queries.
+
+    @classmethod
+    def field_query(cls, field, pattern, query_cls=MatchQuery):
+        """Get a `FieldQuery` for this model."""
+        return query_cls(field, pattern, field in cls._fields)
+
+    @classmethod
+    def all_fields_query(cls, pats, query_cls=MatchQuery):
+        """Get a query that matches many fields with different patterns.
+
+        `pats` should be a mapping from field names to patterns. The
+        resulting query is a conjunction ("and") of per-field queries
+        for all of these field/pattern pairs.
+        """
+        subqueries = [cls.field_query(k, v, query_cls)
+                      for k, v in pats.items()]
+        return AndQuery(subqueries)
 
 
 # Database controller and supporting interfaces.
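
A hedged sketch of using the new classmethod (the field values are invented; lib stands for an open beets Library):

    from beets import library

    # Build an AndQuery of per-field MatchQuery objects. field_query()
    # passes `field in cls._fields` as MatchQuery's third argument, so
    # fixed database columns are matched quickly in SQL while flexible
    # attributes fall back to slow matching in Python.
    query = library.Item.all_fields_query({
        'artist': 'Some Artist',
        'title': 'Some Title',
    })
    matches = lib.items(query)
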
71 changes: 45 additions & 26 deletions beets/importer.py
@@ -521,17 +521,18 @@ def skip(self):
 
     # Convenient data.
 
-    def chosen_ident(self):
-        """Returns identifying metadata about the current choice. For
-        albums, this is an (artist, album) pair. For items, this is
-        (artist, title). May only be called when the choice flag is ASIS
-        or RETAG (in which case the data comes from the files' current
-        metadata) or APPLY (data comes from the choice).
+    def chosen_info(self):
+        """Return a dictionary of metadata about the current choice.
+        May only be called when the choice flag is ASIS or RETAG
+        (in which case the data comes from the files' current metadata)
+        or APPLY (in which case the data comes from the choice).
         """
         if self.choice_flag in (action.ASIS, action.RETAG):
-            return (self.cur_artist, self.cur_album)
+            likelies, consensus = autotag.current_metadata(self.items)
+            return likelies
         elif self.choice_flag is action.APPLY:
-            return (self.match.info.artist, self.match.info.album)
+            return self.match.info.copy()
         assert False
 
     def imported_items(self):
         """Return a list of Items that should be added to the library.
@@ -667,26 +668,34 @@ def find_duplicates(self, lib):
         """Return a list of albums from `lib` with the same artist and
         album name as the task.
         """
-        artist, album = self.chosen_ident()
+        info = self.chosen_info()
+        info['albumartist'] = info['artist']
 
-        if artist is None:
+        if info['artist'] is None:
             # As-is import with no artist. Skip check.
             return []
 
-        duplicates = []
+        # Construct a query to find duplicates with this metadata. We
+        # use a temporary Album object to generate any computed fields.
+        tmp_album = library.Album(lib, **info)
+        keys = config['import']['duplicate_keys']['album'].as_str_seq()
+        dup_query = library.Album.all_fields_query({
+            key: tmp_album.get(key)
+            for key in keys
+        })
+
         # Don't count albums with the same files as duplicates.
         task_paths = {i.path for i in self.items if i}
-        duplicate_query = dbcore.AndQuery((
-            dbcore.MatchQuery('albumartist', artist),
-            dbcore.MatchQuery('album', album),
-        ))
 
-        for album in lib.albums(duplicate_query):
+        duplicates = []
+        for album in lib.albums(dup_query):
             # Check whether the album paths are all present in the task
             # i.e. album is being completely re-imported by the task,
             # in which case it is not a duplicate (will be replaced).
             album_paths = {i.path for i in album.items()}
             if not (album_paths <= task_paths):
                 duplicates.append(album)
 
         return duplicates
 
     def align_album_level_fields(self):
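
The subset comparison near the end is what keeps full re-imports from being flagged: if every file of a matching library album is part of the import task, the album is about to be replaced, not duplicated. A tiny standalone illustration (paths invented):

    task_paths = {b'/music/a/01.mp3', b'/music/a/02.mp3'}

    # Same files: a complete re-import, so not reported as a duplicate.
    {b'/music/a/01.mp3', b'/music/a/02.mp3'} <= task_paths  # True

    # An album with any file outside the task is a real duplicate.
    {b'/music/a/01.mp3', b'/music/b/03.mp3'} <= task_paths  # False
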
@@ -892,12 +901,17 @@ def __init__(self, toppath, item):
         self.is_album = False
         self.paths = [item.path]
 
-    def chosen_ident(self):
-        assert self.choice_flag in (action.ASIS, action.APPLY, action.RETAG)
+    def chosen_info(self):
+        """Return a dictionary of metadata about the current choice.
+        May only be called when the choice flag is ASIS or RETAG
+        (in which case the data comes from the files' current metadata)
+        or APPLY (in which case the data comes from the choice).
+        """
+        assert self.choice_flag in (action.ASIS, action.RETAG, action.APPLY)
         if self.choice_flag in (action.ASIS, action.RETAG):
-            return (self.item.artist, self.item.title)
+            return dict(self.item)
         elif self.choice_flag is action.APPLY:
-            return (self.match.info.artist, self.match.info.title)
+            return self.match.info.copy()
 
     def imported_items(self):
         return [self.item]
@@ -918,14 +932,19 @@ def find_duplicates(self, lib):
         """Return a list of items from `lib` that have the same artist
         and title as the task.
         """
-        artist, title = self.chosen_ident()
+        info = self.chosen_info()
+
+        # Query for existing items using the same metadata. We use a
+        # temporary `Item` object to generate any computed fields.
+        tmp_item = library.Item(lib, **info)
+        keys = config['import']['duplicate_keys']['item'].as_str_seq()
+        dup_query = library.Album.all_fields_query({
+            key: tmp_item.get(key)
+            for key in keys
+        })
 
         found_items = []
-        query = dbcore.AndQuery((
-            dbcore.MatchQuery('artist', artist),
-            dbcore.MatchQuery('title', title),
-        ))
-        for other_item in lib.items(query):
+        for other_item in lib.items(dup_query):
             # Existing items not considered duplicates.
             if other_item.path != self.item.path:
                 found_items.append(other_item)
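
The singleton variant mirrors the album logic. One detail worth noting: dict(self.item) works because a beets Item behaves as a mapping over both its fixed fields and its flexible attributes, so any configured duplicate key can be looked up on the temporary Item. A rough illustration (values invented):

    info = dict(item)   # fixed and flexible fields alike
    info['artist']      # 'Some Artist'
    info.get('flex')    # flexible attributes are included too
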
4 changes: 4 additions & 0 deletions docs/changelog.rst
@@ -25,6 +25,10 @@ New features:
 * :doc:`/plugins/kodiupdate`: Now supports multiple kodi instances
   :bug:`4101`
 * Add the item fields ``bitrate_mode``, ``encoder_info`` and ``encoder_settings``.
+* Add query prefixes ``=`` and ``~``.
+* A new configuration option, :ref:`duplicate_keys`, lets you change which
+  fields the beets importer uses to identify duplicates.
+  :bug:`1133` :bug:`4199`
 * Add :ref:`exact match <exact-match>` queries, using the prefixes ``=`` and
   ``=~``.
   :bug:`4251`
16 changes: 16 additions & 0 deletions docs/reference/config.rst
@@ -689,6 +689,22 @@ with the ``-a`` flag to the :ref:`import-cmd` command.)
 
 Default: ``yes``.
 
+.. _duplicate_keys:
+
+duplicate_keys
+~~~~~~~~~~~~~~
+
+The fields used to find duplicates when importing.
+There are two sub-values here: ``album`` and ``item``.
+Each one is a list of field names; if an existing object (album or item) in
+the library matches the new object on all of these fields, the importer will
+consider it a duplicate.
+
+Default::
+
+    album: albumartist album
+    item: artist title
+
 .. _duplicate_action:
 
 duplicate_action
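
Every listed field must match before an existing object counts as a duplicate, so adding fields makes the check stricter and removing fields makes it looser. As a hypothetical example, also requiring the year to match for albums could be done like this (shown via the Python config API; the config.yaml equivalent is the same space-separated list under import.duplicate_keys.album):

    from beets import config

    config['import']['duplicate_keys']['album'] = 'albumartist album year'
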
33 changes: 33 additions & 0 deletions test/test_importer.py
@@ -1234,6 +1234,7 @@ def test_album_info(*args, **kwargs):
             tracks=[track_info],
             album_id='albumid',
             artist_id='artistid',
+            flex='flex',
         )
         return iter([album_info])
 
@@ -1251,6 +1252,7 @@ def setUp(self):
         # Create import session
         self.importer = self.create_importer()
         config['import']['autotag'] = True
+        config['import']['duplicate_keys']['album'] = 'albumartist album'
 
     def tearDown(self):
         self.teardown_beets()
@@ -1320,6 +1322,24 @@ def test_merge_duplicate_album(self):
     def test_twice_in_import_dir(self):
         self.skipTest('write me')
 
+    def test_keep_when_extra_key_is_different(self):
+        config['import']['duplicate_keys']['album'] = 'albumartist album flex'
+
+        item = self.lib.items().get()
+        import_file = MediaFile(os.path.join(
+            self.importer.paths[0], b'album 0', b'track 0.mp3'))
+        import_file.artist = item['artist']
+        import_file.albumartist = item['artist']
+        import_file.album = item['album']
+        import_file.title = item['title']
+        import_file.flex = 'different'
+
+        self.importer.default_resolution = self.importer.Resolution.SKIP
+        self.importer.run()
+
+        self.assertEqual(len(self.lib.albums()), 2)
+        self.assertEqual(len(self.lib.items()), 2)
+
     def add_album_fixture(self, **kwargs):
         # TODO move this into upstream
         album = super().add_album_fixture()
@@ -1349,6 +1369,7 @@ def setUp(self):
         self.importer = self.create_importer()
         config['import']['autotag'] = True
         config['import']['singletons'] = True
+        config['import']['duplicate_keys']['item'] = 'artist title'
 
     def tearDown(self):
         self.teardown_beets()
@@ -1385,6 +1406,18 @@ def test_skip_duplicate(self):
         item = self.lib.items().get()
         self.assertEqual(item.mb_trackid, 'old trackid')
 
+    def test_keep_when_extra_key_is_different(self):
+        config['import']['duplicate_keys']['item'] = 'artist title flex'
+        item = self.lib.items().get()
+        item.flex = 'different'
+        item.store()
+        self.assertEqual(len(self.lib.items()), 1)
+
+        self.importer.default_resolution = self.importer.Resolution.SKIP
+        self.importer.run()
+
+        self.assertEqual(len(self.lib.items()), 2)
+
     def test_twice_in_import_dir(self):
         self.skipTest('write me')
