Skip to content

Commit

Permalink
added validation control and correct date conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
Paurikova2 committed Oct 18, 2024
1 parent b9ec256 commit d3366b8
Showing 1 changed file with 98 additions and 33 deletions.
131 changes: 98 additions & 33 deletions tools/add_metadata/add_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import os
import sys
from datetime import datetime

_this_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.join(_this_dir, "../../src"))
Expand Down Expand Up @@ -34,41 +35,97 @@ def parse_arguments():
def fetch_items(dspace_be):
"""Fetch items from DSpace backend, filtering out withdrawn or non-archived items."""
all_items = dspace_be.fetch_items()
_logger.info(f"Number of fetched items: {len(all_items)}")
return [
item for item in all_items
if not item['withdrawn'] and item['inArchive'] and args.to_mtd_field not in item['metadata']
if not item['withdrawn'] and item['inArchive']
]


def create_missing_metadata(dspace_be, items, from_mtd_fields, to_mtd_field):
def is_valid_date(date: str):
"""Check if the given string is a valid date."""
try:
datetime.strptime(date, '%Y-%m-%d')
return True
except ValueError as e:
_logger.warning(f"[{date}] is not valid date. Error: {e}")
return False


def convert_to_date(value: str):
"""Convert the value to a date format. Normalize date to 'YYYY-MM-DD' format, filling missing parts with '01'."""
formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%Y',
'%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y']
found = False
for fmt in formats:
try:
datetime_obj = datetime.strptime(value, fmt)
# Normalize date to 'YYYY-MM-DD'
if fmt in ['%Y-%m', '%Y/%m', '%Y.%m']:
return datetime_obj.strftime('%Y-%m-01')
elif fmt == '%Y':
return datetime_obj.strftime('%Y-01-01')
return datetime_obj.strftime('%Y-%m-%d')
except ValueError:
continue
_logger.error(f"Error converting [{value}] to date.")
return None


def process_metadata(dspace_be, items, from_mtd_fields, to_mtd_field):
"""Create missing metadata for items based on provided fields."""
created, not_created, error_items = [], [], []
created, updated, not_created, error_items, ok_items = [], [], [], [], []

for item in items:
mtd = item["metadata"]
found = False

for from_mtd in from_mtd_fields:
if from_mtd in mtd and mtd[from_mtd]:
found = True
val = mtd[from_mtd][0]["value"]
_logger.info(
f"Metadata [{to_mtd_field}] replaced by [{from_mtd}] for item [{item['uuid']}]")

# Add the new metadata
if dspace_be.client.add_metadata(Item(item), to_mtd_field, val):
created.append(item["uuid"])
else:
_logger.warning(
f"Error creating metadata [{to_mtd_field}] for item [{item['uuid']}]")
error_items.append(item["uuid"])

break # Stop searching once we find a valid field

if not found:
not_created.append(item["id"])

return created, not_created, error_items
uuid = item['uuid']
item_mtd = item["metadata"]

if to_mtd_field in item_mtd and item_mtd[to_mtd_field]:
val = item_mtd[to_mtd_field][0]["value"]
if is_valid_date(val):
ok_items.append(uuid)
continue
_logger.info(f"Item [{uuid}] has an invalid date in [{to_mtd_field}]: {val}")
new_mtd = convert_to_date(val)
if new_mtd is None:
_logger.warning(f"Cannot convert [{to_mtd_field}] "
f"to valid date for item [{uuid}]: {val}")
error_items.append(uuid)
continue
item_mtd[to_mtd_field][0]["value"] = new_mtd
item["metadata"] = item_mtd
if dspace_be.client.update_item(Item(item)):
updated.append(uuid)
else:
_logger.error(
f"Error updating [{to_mtd_field}] for item [{uuid}]")
error_items.append(uuid)
else:
found = False
for from_mtd in from_mtd_fields:
if from_mtd in item_mtd and item_mtd[from_mtd]:
val = item_mtd[from_mtd][0]["value"]
if not is_valid_date(val):
val = convert_to_date(val)
if val is None:
_logger.warning(f"Cannot convert [{from_mtd}] "
f"to valid date for item [{uuid}]: {val}")
continue
found = True
_logger.info(
f"Metadata [{to_mtd_field}] created from [{from_mtd}] for item [{uuid}]")
if dspace_be.client.add_metadata(Item(item), to_mtd_field, val):
created.append(uuid)
else:
_logger.warning(
f"Error creating metadata [{to_mtd_field}] for item [{uuid}]")
error_items.append(uuid)
break

if not found:
not_created.append(uuid)

return created, updated, not_created, error_items, ok_items


if __name__ == '__main__':
Expand All @@ -85,13 +142,21 @@ def create_missing_metadata(dspace_be, items, from_mtd_fields, to_mtd_field):
# Fetch and filter items
items_to_update = fetch_items(dspace_be)

# Create missing metadata
created, not_created, error_items = create_missing_metadata(
# Process items
created, updated, not_created, error_items, ok_items = process_metadata(
dspace_be, items_to_update, args.from_mtd_field, args.to_mtd_field
)

# Log results
_logger.info(f"Metadata [{args.to_mtd_field}] added to items: {created}")
_logger.warning(f"Metadata [{args.to_mtd_field}] not added to items: {not_created}")
_logger.warning(
f"Error adding metadata [{args.to_mtd_field}] to items: {error_items}")
_logger.info(f"Items with correct [{args.to_mtd_field}]: {ok_items}")
_logger.info(f"Items with created [{args.to_mtd_field}]: {created}")
_logger.warning(f"Items where [{args.to_mtd_field}] was not created: {not_created}")
_logger.warning(f"Items with errors during processing: {error_items}")

_logger.info(f"Number of items to update: {len(items_to_update)}")
_logger.info(f"Number of items with correct [{args.to_mtd_field}]: {len(ok_items)}")
_logger.info(f"Number of items with updated [{args.to_mtd_field}]: {len(updated)}")
_logger.info(f"Number of items with created [{args.to_mtd_field}]: {len(created)}")
_logger.info(
f"Number of items where [{args.to_mtd_field}] was not created: {len(not_created)}")
_logger.info(f"Number of items with errors during processing: {len(error_items)}")

0 comments on commit d3366b8

Please sign in to comment.