diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index 4febf9e..1f559c5 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -16,6 +16,7 @@ import project_settings # noqa from dspace_rest_client.models import Item from utils import init_logging, update_settings # noqa +logging.getLogger("dspace.client").setLevel(logging.WARNING) _logger = logging.getLogger() @@ -56,13 +57,13 @@ def parse(self) -> bool: if len(self._d) < 1: return False - formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%d. %m. %Y', '%Y', + formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%Y', '%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y'] for fmt in formats: try: datetime_obj = datetime.strptime(self._d, fmt) # Normalize date to 'YYYY-MM-DD' - if fmt in ['%Y-%m', '%Y/%m', '%Y.%m', '%m-%Y', '%m/%Y', '%m.%Y']: + if fmt in ['%Y-%m', '%Y/%m', '%Y.%m']: self._d = datetime_obj.strftime('%Y-%m-01') elif fmt == '%Y': self._d = datetime_obj.strftime('%Y-01-01') @@ -160,6 +161,8 @@ def add_new_metadata(self, item) -> bool: if date_meta is None: continue id_str = f"Item [{uuid}]: [{from_mtd}]" + if len(date_meta) != 1: + _logger.warning(f"{id_str}: more than one value {date_meta}") # If there is more than one value, get only the first one date_val = date(date_meta[0]["value"]) @@ -222,6 +225,35 @@ def update(self, item: dict) -> bool: return self.add_new_metadata(item) +class additional_stats: + + def __init__(self): + self._titles = defaultdict(int) + self._doubles = defaultdict(list) + + def update(self, item: dict): + uuid = item['uuid'] + dc_titles = item['metadata'].get('dc.title', []) + if len(dc_titles) > 0: + self._titles[dc_titles[0]['value']] += 1 + key = 'dc.date.issued' + if len(item['metadata'].get(key, [])) > 1: + self._doubles[key].append(uuid) + + def print_info(self, show_limit=100): + duplicates = {k: v for k, v in self._titles.items() if v > 1} + _logger.info( + f"Duplicates {len(duplicates)} ({sum(duplicates.values())}) (showing first {show_limit}):") + for i, (k, v) in enumerate(duplicates.items()): + if i >= show_limit: + break + _logger.info(f"Title [{k}] : {v}") + if len(self._doubles) > 0: + _logger.info("Multiple values when expecting at most 1:") + for k, v in self._doubles.items(): + _logger.info(f"{k}: {v}") + + if __name__ == '__main__': parser = argparse.ArgumentParser(description="Add metadata for DSpace items") parser.add_argument("--to_mtd_field", @@ -234,6 +266,7 @@ def update(self, item: dict) -> bool: parser.add_argument("--password", type=str, default=env["backend"]["password"]) parser.add_argument("--dry-run", action='store_true', default=False) args = parser.parse_args() + _logger.info(f"Arguments: {args}") start = time.time() @@ -242,14 +275,17 @@ def update(self, item: dict) -> bool: upd = updater(dspace_be, args.from_mtd_field, args.to_mtd_field, dry_run=args.dry_run) - titles = defaultdict(int) + stats = additional_stats() + # Process items + len_all_items = 0 + len_used_items = 0 for items in dspace_be.iter_items(): + len_all_items += len(items) items = [item for item in items if not item['withdrawn'] and item['inArchive']] + len_used_items += len(items) for item in items: - dc_titles = item['metadata'].get('dc.title', []) - if len(dc_titles) > 0: - titles[dc_titles[0]['value']] += 1 + stats.update(item) upd.update(item) _logger.info(40 * "=") @@ -268,16 +304,22 @@ def update(self, item: dict) -> bool: _logger.info(f"Date invalid [{len(msgs)}]:\n\t{msgs}") _logger.info(40 * "=") - show_limit = 100 - duplicates = {k: v for k, v in titles.items() if v > 1} - _logger.info("Duplicates {len(duplicates)} (showing first {show_limit}:") - for i, (k, v) in enumerate(duplicates.items()): - if i >= show_limit: - break - _logger.info(f"Title [{k}] : {v}") + stats.print_info() _logger.info(40 * "=") - _logger.info("Statistics:") + _logger.info("Update statistics:") for k, v in upd.info.items(): _logger.info(f"{k:25s}: {len(v):6d}") - _logger.info(f"Total time: {time.time() - start:.2f} s") + took = time.time() - start + + _logger.info(40 * "=") + _logger.info("Counts:") + _logger.info(f"Total items: {len_all_items}") + _logger.info(f"Used items: {len_used_items}") + # sets are not counted + _logger.info( + f"Sum of updates: {sum(len(x) for x in upd.info.values() if isinstance(x, list))}") + + _logger.info(40 * "=") + _logger.info( + f"Total time: {took:.2f} s [{time.strftime('%H:%M:%S', time.gmtime(took))}]")