Merge pull request #4199 from jcassette/duplicate
Allow configuring which fields are used to find duplicates
sampsyo authored Aug 21, 2022
2 parents 7467bc3 + 2ebc28d commit e584b04
Showing 7 changed files with 122 additions and 28 deletions.
2 changes: 1 addition & 1 deletion beets/autotag/__init__.py
@@ -27,7 +27,7 @@
     TrackMatch,
     Distance,
 )
-from .match import tag_item, tag_album, Proposal  # noqa
+from .match import tag_item, tag_album, current_metadata, Proposal  # noqa
 from .match import Recommendation  # noqa
 
 # Global logger.
3 changes: 3 additions & 0 deletions beets/config_default.yaml
@@ -27,6 +27,9 @@ import:
     group_albums: no
     pretend: false
     search_ids: []
+    duplicate_keys:
+        album: albumartist album
+        item: artist title
     duplicate_action: ask
     bell: no
     set_fields: {}
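
These defaults ensure the option always has a value. For context, a small sketch of how the importer reads the lists back (assuming a working beets install; as_str_seq() is confuse's accessor for space-separated values):

    from beets import config

    # Split the space-separated field lists into Python lists.
    album_keys = config['import']['duplicate_keys']['album'].as_str_seq()
    item_keys = config['import']['duplicate_keys']['item'].as_str_seq()
    print(album_keys)  # ['albumartist', 'album'] with the defaults above
    print(item_keys)   # ['artist', 'title']
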
21 changes: 20 additions & 1 deletion beets/dbcore/db.py
@@ -27,7 +27,7 @@
 from beets.util import functemplate
 from beets.util import py3_path
 from beets.dbcore import types
-from .query import MatchQuery, NullSort, TrueQuery
+from .query import MatchQuery, NullSort, TrueQuery, AndQuery
 from collections.abc import Mapping
@@ -641,6 +641,25 @@ def set_parse(self, key, string):
         """
         self[key] = self._parse(key, string)
 
+    # Convenient queries.
+
+    @classmethod
+    def field_query(cls, field, pattern, query_cls=MatchQuery):
+        """Get a `FieldQuery` for this model."""
+        return query_cls(field, pattern, field in cls._fields)
+
+    @classmethod
+    def all_fields_query(cls, pats, query_cls=MatchQuery):
+        """Get a query that matches many fields with different patterns.
+
+        `pats` should be a mapping from field names to patterns. The
+        resulting query is a conjunction ("and") of per-field queries
+        for all of these field/pattern pairs.
+        """
+        subqueries = [cls.field_query(k, v, query_cls)
+                      for k, v in pats.items()]
+        return AndQuery(subqueries)
 
 
 # Database controller and supporting interfaces.
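
A hedged sketch of using the new classmethod (the field values are invented; lib stands for an open beets Library):

    from beets import library

    # Build an AndQuery of per-field MatchQuery objects. field_query()
    # passes `field in cls._fields` as MatchQuery's third argument, so
    # fixed database columns are matched quickly in SQL while flexible
    # attributes fall back to slow matching in Python.
    query = library.Item.all_fields_query({
        'artist': 'Some Artist',
        'title': 'Some Title',
    })
    matches = lib.items(query)
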
71 changes: 45 additions & 26 deletions beets/importer.py
@@ -521,17 +521,18 @@ def skip(self):
 
     # Convenient data.
 
-    def chosen_ident(self):
-        """Returns identifying metadata about the current choice. For
-        albums, this is an (artist, album) pair. For items, this is
-        (artist, title). May only be called when the choice flag is ASIS
-        or RETAG (in which case the data comes from the files' current
-        metadata) or APPLY (data comes from the choice).
+    def chosen_info(self):
+        """Return a dictionary of metadata about the current choice.
+        May only be called when the choice flag is ASIS or RETAG
+        (in which case the data comes from the files' current metadata)
+        or APPLY (in which case the data comes from the choice).
         """
         if self.choice_flag in (action.ASIS, action.RETAG):
-            return (self.cur_artist, self.cur_album)
+            likelies, consensus = autotag.current_metadata(self.items)
+            return likelies
         elif self.choice_flag is action.APPLY:
-            return (self.match.info.artist, self.match.info.album)
+            return self.match.info.copy()
         assert False
 
     def imported_items(self):
         """Return a list of Items that should be added to the library.
@@ -667,26 +668,34 @@ def find_duplicates(self, lib):
         """Return a list of albums from `lib` with the same artist and
         album name as the task.
         """
-        artist, album = self.chosen_ident()
+        info = self.chosen_info()
+        info['albumartist'] = info['artist']
 
-        if artist is None:
+        if info['artist'] is None:
             # As-is import with no artist. Skip check.
             return []
 
-        duplicates = []
+        # Construct a query to find duplicates with this metadata. We
+        # use a temporary Album object to generate any computed fields.
+        tmp_album = library.Album(lib, **info)
+        keys = config['import']['duplicate_keys']['album'].as_str_seq()
+        dup_query = library.Album.all_fields_query({
+            key: tmp_album.get(key)
+            for key in keys
+        })
+
         # Don't count albums with the same files as duplicates.
         task_paths = {i.path for i in self.items if i}
-        duplicate_query = dbcore.AndQuery((
-            dbcore.MatchQuery('albumartist', artist),
-            dbcore.MatchQuery('album', album),
-        ))
 
-        for album in lib.albums(duplicate_query):
+        duplicates = []
+        for album in lib.albums(dup_query):
             # Check whether the album paths are all present in the task
             # i.e. album is being completely re-imported by the task,
             # in which case it is not a duplicate (will be replaced).
             album_paths = {i.path for i in album.items()}
             if not (album_paths <= task_paths):
                 duplicates.append(album)
 
         return duplicates
 
     def align_album_level_fields(self):
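
The subset comparison near the end is what keeps full re-imports from being flagged: if every file of a matching library album is part of the import task, the album is about to be replaced, not duplicated. A tiny standalone illustration (paths invented):

    task_paths = {b'/music/a/01.mp3', b'/music/a/02.mp3'}

    # Same files: a complete re-import, so not reported as a duplicate.
    {b'/music/a/01.mp3', b'/music/a/02.mp3'} <= task_paths  # True

    # An album with any file outside the task is a real duplicate.
    {b'/music/a/01.mp3', b'/music/b/03.mp3'} <= task_paths  # False
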
@@ -892,12 +901,17 @@ def __init__(self, toppath, item):
         self.is_album = False
         self.paths = [item.path]
 
-    def chosen_ident(self):
-        assert self.choice_flag in (action.ASIS, action.APPLY, action.RETAG)
+    def chosen_info(self):
+        """Return a dictionary of metadata about the current choice.
+        May only be called when the choice flag is ASIS or RETAG
+        (in which case the data comes from the files' current metadata)
+        or APPLY (in which case the data comes from the choice).
+        """
+        assert self.choice_flag in (action.ASIS, action.RETAG, action.APPLY)
         if self.choice_flag in (action.ASIS, action.RETAG):
-            return (self.item.artist, self.item.title)
+            return dict(self.item)
         elif self.choice_flag is action.APPLY:
-            return (self.match.info.artist, self.match.info.title)
+            return self.match.info.copy()
 
     def imported_items(self):
         return [self.item]
@@ -918,14 +932,19 @@ def find_duplicates(self, lib):
         """Return a list of items from `lib` that have the same artist
         and title as the task.
         """
-        artist, title = self.chosen_ident()
+        info = self.chosen_info()
+
+        # Query for existing items using the same metadata. We use a
+        # temporary `Item` object to generate any computed fields.
+        tmp_item = library.Item(lib, **info)
+        keys = config['import']['duplicate_keys']['item'].as_str_seq()
+        dup_query = library.Album.all_fields_query({
+            key: tmp_item.get(key)
+            for key in keys
+        })
 
         found_items = []
-        query = dbcore.AndQuery((
-            dbcore.MatchQuery('artist', artist),
-            dbcore.MatchQuery('title', title),
-        ))
-        for other_item in lib.items(query):
+        for other_item in lib.items(dup_query):
             # Existing items not considered duplicates.
             if other_item.path != self.item.path:
                 found_items.append(other_item)
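
The singleton variant mirrors the album logic. One detail worth noting: dict(self.item) works because a beets Item behaves as a mapping over both its fixed fields and its flexible attributes, so any configured duplicate key can be looked up on the temporary Item. A rough illustration (values invented):

    info = dict(item)   # fixed and flexible fields alike
    info['artist']      # 'Some Artist'
    info.get('flex')    # flexible attributes are included too
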
4 changes: 4 additions & 0 deletions docs/changelog.rst
@@ -25,6 +25,10 @@ New features:
 * :doc:`/plugins/kodiupdate`: Now supports multiple kodi instances
   :bug:`4101`
 * Add the item fields ``bitrate_mode``, ``encoder_info`` and ``encoder_settings``.
+* Add query prefixes ``=`` and ``~``.
+* A new configuration option, :ref:`duplicate_keys`, lets you change which
+  fields the beets importer uses to identify duplicates.
+  :bug:`1133` :bug:`4199`
 * Add :ref:`exact match <exact-match>` queries, using the prefixes ``=`` and
   ``=~``.
   :bug:`4251`
16 changes: 16 additions & 0 deletions docs/reference/config.rst
@@ -689,6 +689,22 @@ with the ``-a`` flag to the :ref:`import-cmd` command.)
 
 Default: ``yes``.
 
+.. _duplicate_keys:
+
+duplicate_keys
+~~~~~~~~~~~~~~
+
+The fields used to find duplicates when importing.
+There are two sub-values here: ``album`` and ``item``.
+Each one is a list of field names; if an existing object (album or item) in
+the library matches the new object on all of these fields, the importer will
+consider it a duplicate.
+
+Default::
+
+    album: albumartist album
+    item: artist title
+
 .. _duplicate_action:
 
 duplicate_action
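
Every listed field must match before an existing object counts as a duplicate, so adding fields makes the check stricter and removing fields makes it looser. As a hypothetical example, also requiring the year to match for albums could be done like this (shown via the Python config API; the config.yaml equivalent is the same space-separated list under import.duplicate_keys.album):

    from beets import config

    config['import']['duplicate_keys']['album'] = 'albumartist album year'
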
33 changes: 33 additions & 0 deletions test/test_importer.py
@@ -1234,6 +1234,7 @@ def test_album_info(*args, **kwargs):
             tracks=[track_info],
             album_id='albumid',
             artist_id='artistid',
+            flex='flex',
         )
         return iter([album_info])
 
@@ -1251,6 +1252,7 @@ def setUp(self):
         # Create import session
         self.importer = self.create_importer()
         config['import']['autotag'] = True
+        config['import']['duplicate_keys']['album'] = 'albumartist album'
 
     def tearDown(self):
         self.teardown_beets()
@@ -1320,6 +1322,24 @@ def test_merge_duplicate_album(self):
     def test_twice_in_import_dir(self):
         self.skipTest('write me')
 
+    def test_keep_when_extra_key_is_different(self):
+        config['import']['duplicate_keys']['album'] = 'albumartist album flex'
+
+        item = self.lib.items().get()
+        import_file = MediaFile(os.path.join(
+            self.importer.paths[0], b'album 0', b'track 0.mp3'))
+        import_file.artist = item['artist']
+        import_file.albumartist = item['artist']
+        import_file.album = item['album']
+        import_file.title = item['title']
+        import_file.flex = 'different'
+
+        self.importer.default_resolution = self.importer.Resolution.SKIP
+        self.importer.run()
+
+        self.assertEqual(len(self.lib.albums()), 2)
+        self.assertEqual(len(self.lib.items()), 2)
+
     def add_album_fixture(self, **kwargs):
         # TODO move this into upstream
         album = super().add_album_fixture()
@@ -1349,6 +1369,7 @@ def setUp(self):
         self.importer = self.create_importer()
         config['import']['autotag'] = True
         config['import']['singletons'] = True
+        config['import']['duplicate_keys']['item'] = 'artist title'
 
     def tearDown(self):
         self.teardown_beets()
@@ -1385,6 +1406,18 @@ def test_skip_duplicate(self):
         item = self.lib.items().get()
         self.assertEqual(item.mb_trackid, 'old trackid')
 
+    def test_keep_when_extra_key_is_different(self):
+        config['import']['duplicate_keys']['item'] = 'artist title flex'
+        item = self.lib.items().get()
+        item.flex = 'different'
+        item.store()
+        self.assertEqual(len(self.lib.items()), 1)
+
+        self.importer.default_resolution = self.importer.Resolution.SKIP
+        self.importer.run()
+
+        self.assertEqual(len(self.lib.items()), 2)
+
     def test_twice_in_import_dir(self):
         self.skipTest('write me')
