diff --git a/libs/dspace-rest-python b/libs/dspace-rest-python
index ed12a06..4bd99fe 160000
--- a/libs/dspace-rest-python
+++ b/libs/dspace-rest-python
@@ -1 +1 @@
-Subproject commit ed12a060eb5233f5520d0b21b925a6455f20e74b
+Subproject commit 4bd99fefe4aa105e17ff1e9cd6a1764de67f3683
diff --git a/requirements.txt b/requirements.txt
index d0c3103..f46b5b9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ pre-commit
 tqdm
 requests-toolbelt
 six
+pysolr~=3.9.0
diff --git a/src/dspace/_rest.py b/src/dspace/_rest.py
index a67cb32..5d88726 100644
--- a/src/dspace/_rest.py
+++ b/src/dspace/_rest.py
@@ -243,6 +243,28 @@ def put_bitstreamregistry(self, data: dict):
 
     # =======
 
+    def fetch_licenses(self):
+        """Return all licenses from `core/clarinlicenses`, fetched 100 per page."""
+        url = 'core/clarinlicenses'
+        key = "clarinlicenses"
+        _logger.debug(f"Fetch [] using [{url}]")
+        page = 0
+        licenses = []
+        while True:
+            r = self._fetch(url, self.get, "_embedded",
+                            params={"page": page, "size": 100})
+            if r is None:
+                break
+            licenses_data = r.get(key, [])
+            if not licenses_data:
+                # Stop instead of requesting further pages: a response that keeps
+                # arriving without usable data would otherwise loop forever.
+                _logger.warning(f"Key [{key}] does not exist in response: {r}")
+                break
+            licenses.extend(licenses_data)
+            page += 1
+        return licenses
+
     def put_license_label(self, data: dict):
         url = 'core/clarinlicenselabels'
         _logger.debug(f"Importing [{data}] using [{url}]")
diff --git a/src/pump/_item.py b/src/pump/_item.py
index 1f6e180..7240ddf 100644
--- a/src/pump/_item.py
+++ b/src/pump/_item.py
@@ -279,7 +279,7 @@ def _item_import_to(self, dspace, handles, metadatas, epersons, collections):
             }
 
             i_meta = metadatas.filter_res_d(metadatas.value(
-                items.TYPE, i_id, None, True, self.ignored_fields))
+                items.TYPE, i_id, None, True), self.ignored_fields)
             if i_meta:
                 data['metadata'] = i_meta
 
diff --git a/src/pump/_license.py b/src/pump/_license.py
index 87849d4..6cc3bec 100644
--- a/src/pump/_license.py
+++ b/src/pump/_license.py
@@ -1,6 +1,6 @@
 import os
 import logging
-from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import
+from pump._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import
 
 _logger = logging.getLogger("pump.license")
 
@@ -68,7 +68,7 @@ def imported_labels(self):
     def imported_licenses(self):
         return self._imported['licenses']
 
-    def import_to(self, env, dspace, epersons):
+    def import_to(self, env, dspace, epersons=None):
         self._import_license_labels(env, dspace)
         self._import_license_defs(env, dspace, epersons)
 
@@ -143,7 +143,9 @@ def _import_license_defs(self, env, dspace, epersons):
             if lic_id in self._license2label:
                 data['extendedClarinLicenseLabels'] = self._license2label[lic_id]
 
-            params = {'eperson': epersons.uuid(lic['eperson_id'])}
+            params = {}
+            if epersons is not None:
+                params = {'eperson': epersons.uuid(lic['eperson_id'])}
             try:
                 resp = dspace.put_license(params, data)
                 self._imported["licenses"] += 1
diff --git a/tools/license/README.md b/tools/license/README.md
new file mode 100644
index 0000000..81094d0
--- /dev/null
+++ b/tools/license/README.md
@@ -0,0 +1,15 @@
+# fetch_licenses.py
+
+This script retrieves all licenses, labels, and mappings from DSpace that meet the defined conditions and writes them as JSON files (`licenses.json`, `labels.json`, `mapping.json`) into the output directory.
+
+```
+python fetch_licenses.py --no_definition dev-5.pc:85 --output data
+```
+
+# import_licenses.py
+
+This script imports licenses, labels, and mappings from the JSON files in the input directory.
+
+```
+python import_licenses.py --input data
+```
\ No newline at end of file
diff --git a/tools/license/fetch_licenses.py b/tools/license/fetch_licenses.py
new file mode 100644
index 0000000..7a4be7a
--- /dev/null
+++ b/tools/license/fetch_licenses.py
@@ -0,0 +1,141 @@
+###
+# This script retrieves all licenses, labels, and mappings from DSpace that meet the defined conditions and writes them to JSON files.
+###
+
+import argparse
+import logging
+import os
+import json
+import sys
+
+_this_dir = os.path.dirname(os.path.abspath(__file__))
+path_to_dspace_lib = os.path.join(_this_dir, "../../libs/dspace-rest-python")
+# Make the bundled dspace-rest-python checkout importable (dspace_rest_client).
+sys.path.insert(0, path_to_dspace_lib)
+sys.path.insert(0, os.path.join(_this_dir, "../../src"))
+
+import dspace  # noqa
+import settings  # noqa
+import project_settings  # noqa
+from dspace_rest_client.models import License  # noqa
+from utils import init_logging, update_settings  # noqa
+
+_logger = logging.getLogger()
+
+# env settings, update with project_settings
+env = update_settings(settings.env, project_settings.settings)
+init_logging(_logger, env["log_file"])
+
+
+class LicenseProcessor:
+    """Class to handle DSpace license retrieval, filtering, and output."""
+
+    def __init__(self, dspace_backend, no_definition):
+        """
+        Initialize LicenseProcessor with the DSpace backend and settings.
+
+        :param dspace_backend: The DSpace backend instance for fetching data.
+        :param no_definition: List of strings that cannot be part of the license definition.
+        """
+        self._dspace_be = dspace_backend
+        self._no_definition = set(no_definition)
+
+    def fetch_licenses(self):
+        """Fetch licenses from DSpace backend."""
+        all_licenses = self._dspace_be.fetch_licenses()
+        _logger.info(f"Number of fetched licenses: {len(all_licenses)}")
+        return all_licenses
+
+    def filter_licenses(self, all_licenses: list):
+        """
+        Keep licenses whose definition contains none of the excluded strings.
+
+        Licenses without a string `definition` are dropped.
+        """
+        key = "definition"
+        return [
+            License(lic)
+            for lic in all_licenses
+            if isinstance(lic.get(key), str)
+            and not any(arg in lic[key] for arg in self._no_definition)
+        ]
+
+    def collect_license_labels(self, filtered_licenses: list):
+        """Collect the primary and extended labels of the licenses, each label id once."""
+        added_ids = set()
+        filtered_license_labels = []
+
+        for lic in filtered_licenses:
+            # Primary label first, then the extended ones.
+            for label in [lic.licenseLabel, *(lic.extendedLicenseLabel or [])]:
+                if label and label.id not in added_ids:
+                    added_ids.add(label.id)
+                    filtered_license_labels.append(label)
+
+        return filtered_license_labels
+
+    def create_license_mapping(self, filtered_licenses: list):
+        """Create one license_id/label_id mapping per extended label of each license."""
+        return [
+            {'license_id': lic.id, 'label_id': ext.id}
+            for lic in filtered_licenses
+            for ext in lic.extendedLicenseLabel or []
+        ]
+
+
+def write_data_to_file(data: list, output_path: str):
+    """Write the filtered data to a JSON file, creating its directory if needed."""
+    out_dir = os.path.dirname(output_path)
+    # dirname is '' for a bare file name and os.makedirs('') raises.
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(data, f, indent=2, sort_keys=True)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="Get DSpace licenses that meet condition.")
+    parser.add_argument("--no_definition", type=str, nargs='+', required=True,
+                        help="String that cannot be part of the license definition")
+    parser.add_argument('--output', type=str,
+                        default=os.path.join(_this_dir, "data"),
+                        help='Output directory for the JSON file')
+    args = parser.parse_args()
+
+    # Initialize DSpace backend
+    dspace_be = dspace.rest(
+        env["backend"]["endpoint"],
+        env["backend"]["user"],
+        env["backend"]["password"],
+        env["backend"]["authentication"]
+    )
+
+    # Create LicenseProcessor instance and process the licenses
+    processor = LicenseProcessor(dspace_be, args.no_definition)
+
+    # Fetch and filter licenses
+    all_licenses = processor.fetch_licenses()
+    filtered_licenses = processor.filter_licenses(all_licenses)
+
+    # Collect unique license labels and extended mappings
+    filtered_license_labels = processor.collect_license_labels(filtered_licenses)
+    filtered_ext_mapping = processor.create_license_mapping(filtered_licenses)
+
+    # Log filtered results
+    _logger.info(f"Filtered licenses: {filtered_licenses}")
+    _logger.info(f"Filtered license labels: {filtered_license_labels}")
+    _logger.info(f"Filtered license extended mapping: {filtered_ext_mapping}")
+
+    _logger.info(f"Number of filtered licenses: {len(filtered_licenses)}")
+    _logger.info(f"Number of filtered license labels: {len(filtered_license_labels)}")
+    _logger.info(
+        f"Number of filtered license extended mapping: {len(filtered_ext_mapping)}")
+
+    # Write the filtered data to the specified output file
+    write_data_to_file([lic.to_dict() for lic in filtered_licenses],
+                       os.path.join(args.output, 'licenses.json'))
+    write_data_to_file([label.to_dict() for label in filtered_license_labels],
+                       os.path.join(args.output, 'labels.json'))
+    write_data_to_file(
+        filtered_ext_mapping, os.path.join(args.output, 'mapping.json'))
diff --git a/tools/license/import_licenses.py b/tools/license/import_licenses.py
new file mode 100644
index 0000000..02f903b
--- /dev/null
+++ b/tools/license/import_licenses.py
@@ -0,0 +1,57 @@
+###
+# This script imports licenses, labels and mappings.
+###
+import argparse
+import logging
+import os
+import sys
+
+_this_dir = os.path.dirname(os.path.abspath(__file__))
+path_to_dspace_lib = os.path.join(_this_dir, "../../libs/dspace-rest-python")
+# Make the bundled dspace-rest-python checkout importable (dspace_rest_client).
+sys.path.insert(0, path_to_dspace_lib)
+sys.path.insert(0, os.path.join(_this_dir, "../../src"))
+sys.path.insert(0, os.path.join(_this_dir, "../../src/pump"))
+
+import dspace  # noqa
+import pump  # noqa
+import settings  # noqa
+import project_settings  # noqa
+from dspace_rest_client.models import License  # noqa
+from utils import init_logging, update_settings  # noqa
+
+from _license import licenses  # noqa
+
+_logger = logging.getLogger()
+
+# env settings, update with project_settings
+env = update_settings(settings.env, project_settings.settings)
+init_logging(_logger, env["log_file"])
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Import licenses to DSpace.")
+    parser.add_argument('--input', type=str,
+                        default=os.path.join(_this_dir, "data"),
+                        help='Input directory for the JSON file')
+    args = parser.parse_args()
+
+    # Initialize DSpace backend
+    dspace_be = dspace.rest(
+        env["backend"]["endpoint"],
+        env["backend"]["user"],
+        env["backend"]["password"],
+        env["backend"]["authentication"]
+    )
+
+    _logger.info("Loading license import")
+    licenses_imp = licenses(
+        os.path.join(args.input, 'labels.json'),
+        os.path.join(args.input, 'licenses.json'),
+        os.path.join(args.input, 'mapping.json'),
+    )
+
+    # No epersons are passed, so licenses are imported without an eperson param.
+    _logger.info("Start license import")
+    licenses_imp.import_to(env, dspace_be)
+    _logger.info("End license import")