Skip to content

Commit

Permalink
Merge branch 'main' of github.com:dataquest-dev/dspace-python-api
Browse files Browse the repository at this point in the history
  • Loading branch information
jm committed Jan 15, 2025
2 parents edffe57 + db71145 commit 71f4f25
Show file tree
Hide file tree
Showing 8 changed files with 232 additions and 5 deletions.
2 changes: 1 addition & 1 deletion libs/dspace-rest-python
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ pre-commit
tqdm
requests-toolbelt
six
pysolr~=3.9.0
19 changes: 19 additions & 0 deletions src/dspace/_rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,25 @@ def put_bitstreamregistry(self, data: dict):

# =======

def fetch_licenses(self):
    """
    Fetch all licenses from the `core/clarinlicenses` endpoint.

    Pages of 100 items are requested one after another, starting at page 0,
    and the entries found under the `clarinlicenses` key of each page are
    accumulated. Paging stops when `_fetch` returns None or when a page
    carries no license data, so the loop always terminates.

    :return: List of license entries collected from all pages (possibly empty).
    """
    url = 'core/clarinlicenses'
    _logger.debug(f"Fetch [] using [{url}]")
    key = "clarinlicenses"
    licenses = []
    page = 0
    while True:
        r = self._fetch(url, self.get, "_embedded",
                        params={"page": page, "size": 100})
        if r is None:
            break
        licenses_data = r.get(key, [])
        if not licenses_data:
            # Without this break the loop relies solely on `_fetch` returning
            # None at some point; an empty page means there is nothing more
            # to collect, so stop instead of requesting further pages.
            _logger.warning(f"Key [{key}] does not exist in response: {r}")
            break
        licenses.extend(licenses_data)
        page += 1
    return licenses

def put_license_label(self, data: dict):
url = 'core/clarinlicenselabels'
_logger.debug(f"Importing [{data}] using [{url}]")
Expand Down
2 changes: 1 addition & 1 deletion src/pump/_item.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ def _item_import_to(self, dspace, handles, metadatas, epersons, collections):
}

i_meta = metadatas.filter_res_d(metadatas.value(
items.TYPE, i_id, None, True, self.ignored_fields))
items.TYPE, i_id, None, True), self.ignored_fields)
if i_meta:
data['metadata'] = i_meta

Expand Down
8 changes: 5 additions & 3 deletions src/pump/_license.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import logging
from ._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import
from pump._utils import read_json, time_method, serialize, deserialize, progress_bar, log_before_import, log_after_import

_logger = logging.getLogger("pump.license")

Expand Down Expand Up @@ -68,7 +68,7 @@ def imported_labels(self):
def imported_licenses(self):
    """Return the value stored under the 'licenses' key of `self._imported`."""
    imported = self._imported
    return imported['licenses']

def import_to(self, env, dspace, epersons=None):
    """
    Import license labels first, then license definitions.

    :param env: Environment settings, passed through to both import steps.
    :param dspace: DSpace backend, passed through to both import steps.
    :param epersons: Optional epersons object forwarded to the license
        definition import; defaults to None so callers may omit it.
    """
    self._import_license_labels(env, dspace)
    self._import_license_defs(env, dspace, epersons)

Expand Down Expand Up @@ -143,7 +143,9 @@ def _import_license_defs(self, env, dspace, epersons):
if lic_id in self._license2label:
data['extendedClarinLicenseLabels'] = self._license2label[lic_id]

params = {'eperson': epersons.uuid(lic['eperson_id'])}
params = {}
if epersons:
params = {'eperson': epersons.uuid(lic['eperson_id'])}
try:
resp = dspace.put_license(params, data)
self._imported["licenses"] += 1
Expand Down
15 changes: 15 additions & 0 deletions tools/license/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# fetch_licenses.py

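This script retrieves licenses from DSpace, keeps those whose definition contains none of the strings given in `--no_definition`, and writes the licenses, their labels, and the license-to-extended-label mappings as JSON files (`licenses.json`, `labels.json`, `mapping.json`) into the `--output` directory.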

```
python fetch_licenses.py --no_definition dev-5.pc:85 --output data
```

# import_licenses.py

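This script imports licenses, labels, and mappings into DSpace from the `licenses.json`, `labels.json`, and `mapping.json` files in the `--input` directory.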

```
python import_licenses.py --input data
```
138 changes: 138 additions & 0 deletions tools/license/fetch_licenses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
###
# This script retrieves licenses from DSpace, keeps those that meet the defined conditions, and writes the licenses, labels, and mappings as JSON files.
###

import argparse
import logging
import os
import json
import sys

# Absolute directory of this script; the paths below are built relative to it.
_this_dir = os.path.dirname(os.path.abspath(__file__))
# NOTE(review): this path is computed but not added to sys.path in the visible
# code -- confirm that `dspace_rest_client` is made importable elsewhere.
path_to_dspace_lib = os.path.join(_this_dir, "../../libs/dspace-rest-python")
# Put the project's `src` directory first on sys.path (presumably so that the
# project modules imported right after this resolve from there).
sys.path.insert(0, os.path.join(_this_dir, "../../src"))

import dspace # noqa
import settings # noqa
import project_settings # noqa
from dspace_rest_client.models import License # noqa
from utils import init_logging, update_settings # noqa

# getLogger() without a name returns the root logger.
_logger = logging.getLogger()

# env settings, update with project_settings
# (presumably project_settings.settings overrides the defaults in settings.env
# -- TODO confirm against update_settings)
env = update_settings(settings.env, project_settings.settings)
# Logging is initialized at import time using the "log_file" entry of env.
init_logging(_logger, env["log_file"])


class LicenseProcessor:
    """Class to handle DSpace license retrieval, filtering, and output."""

    def __init__(self, dspace_backend, no_definition):
        """
        Initialize LicenseProcessor with the DSpace backend and settings.
        :param dspace_backend: The DSpace backend instance for fetching data.
        :param no_definition: List of strings that cannot be part of the license definition.
            A single string is accepted as well and treated as a one-element list.
        """
        self._dspace_be = dspace_backend
        # set() over a bare string would split it into single characters,
        # which would then match almost every definition.
        if isinstance(no_definition, str):
            no_definition = [no_definition]
        self._no_definition = set(no_definition)

    def fetch_licenses(self):
        """
        Fetch licenses from DSpace backend.
        :return: Whatever the backend's `fetch_licenses()` returns (its length is logged).
        """
        all_licenses = self._dspace_be.fetch_licenses()
        _logger.info(f"Number of fetched licenses: {len(all_licenses)}")
        return all_licenses

    def filter_licenses(self, all_licenses: list):
        """
        Filter licenses based on the no_definition criteria.

        A license is kept only when it has a non-None "definition" value that
        contains none of the forbidden strings.
        :param all_licenses: Raw license entries as fetched from the backend.
        :return: List of `License` objects built from the entries that passed.
        """
        key = "definition"
        filtered = []
        for lic in all_licenses:
            if key not in lic:
                continue
            definition = lic[key]
            # A present-but-None definition cannot be searched; treat it the
            # same way as a missing definition instead of raising TypeError.
            if definition is None:
                continue
            if any(forbidden in definition for forbidden in self._no_definition):
                continue
            filtered.append(License(lic))
        return filtered

    def collect_license_labels(self, filtered_licenses: list):
        """
        Collect unique license labels and extended license labels.

        Labels are de-duplicated by their `id`; the first occurrence wins and
        the encounter order is preserved.
        :param filtered_licenses: Objects exposing `licenseLabel` and `extendedLicenseLabel`.
        :return: List of unique label objects.
        """
        added_ids = set()
        filtered_license_labels = []

        for lic in filtered_licenses:
            # Add the primary license label
            label = lic.licenseLabel
            if label and label.id not in added_ids:
                added_ids.add(label.id)
                filtered_license_labels.append(label)

            # Add extended license labels
            for ext in lic.extendedLicenseLabel or []:
                if ext and ext.id not in added_ids:
                    added_ids.add(ext.id)
                    filtered_license_labels.append(ext)

        return filtered_license_labels

    def create_license_mapping(self, filtered_licenses: list):
        """
        Create extended license mappings.

        :param filtered_licenses: Objects exposing `id` and `extendedLicenseLabel`.
        :return: List of `{'license_id': ..., 'label_id': ...}` dicts, one per
            extended label of each license.
        """
        return [
            {'license_id': lic.id, 'label_id': ext.id}
            for lic in filtered_licenses
            for ext in lic.extendedLicenseLabel or []
            # Skip empty entries, consistent with collect_license_labels.
            if ext
        ]


def write_data_to_file(data: list, output_path: str):
    """
    Write the filtered data to a JSON file.

    :param data: JSON-serializable data to dump (indented, keys sorted).
    :param output_path: Path of the file to write; missing parent
        directories are created. A bare file name writes into the
        current working directory.
    """
    output_dir = os.path.dirname(output_path)
    # dirname() is '' for a bare file name and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, sort_keys=True)


if __name__ == '__main__':
    # Command line: forbidden definition substrings and the output directory.
    default_output = os.path.join(_this_dir, "data")
    cli = argparse.ArgumentParser(
        description="Get DSpace licenses that meet condition.")
    cli.add_argument(
        "--no_definition",
        type=str,
        nargs='+',
        required=True,
        help="String that cannot be part of the license definition",
    )
    cli.add_argument(
        '--output',
        type=str,
        default=default_output,
        help='Output directory for the JSON file',
    )
    args = cli.parse_args()

    # Connect to the DSpace backend described by the env settings.
    backend_cfg = env["backend"]
    dspace_be = dspace.rest(
        backend_cfg["endpoint"],
        backend_cfg["user"],
        backend_cfg["password"],
        backend_cfg["authentication"],
    )

    # Fetch everything, then keep only the licenses that pass the filter.
    processor = LicenseProcessor(dspace_be, args.no_definition)
    all_licenses = processor.fetch_licenses()
    filtered_licenses = processor.filter_licenses(all_licenses)

    # Derive the unique labels and the license -> extended label mapping.
    filtered_license_labels = processor.collect_license_labels(filtered_licenses)
    filtered_ext_mapping = processor.create_license_mapping(filtered_licenses)

    # Report what was selected, first the content and then the counts.
    _logger.info(f"Filtered licenses: {filtered_licenses}")
    _logger.info(f"Filtered license labels: {filtered_license_labels}")
    _logger.info(f"Filtered license extended mapping: {filtered_ext_mapping}")

    _logger.info(f"Number of filtered licenses: {len(filtered_licenses)}")
    _logger.info(f"Number of filtered license labels: {len(filtered_license_labels)}")
    _logger.info(
        f"Number of filtered license extended mapping: {len(filtered_ext_mapping)}")

    # Serialize each result set into its own JSON file in the output directory.
    licenses_payload = [lic.to_dict() for lic in filtered_licenses]
    write_data_to_file(licenses_payload, os.path.join(args.output, 'licenses.json'))

    labels_payload = [label.to_dict() for label in filtered_license_labels]
    write_data_to_file(labels_payload, os.path.join(args.output, 'labels.json'))

    write_data_to_file(
        filtered_ext_mapping, os.path.join(args.output, 'mapping.json'))
52 changes: 52 additions & 0 deletions tools/license/import_licenses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
###
# This script imports licenses, labels, and mappings.
###
import argparse
import logging
import os
import sys

# Absolute directory of this script; the paths below are built relative to it.
_this_dir = os.path.dirname(os.path.abspath(__file__))
# NOTE(review): this path is computed but not added to sys.path in the visible
# code -- confirm that `dspace_rest_client` is made importable elsewhere.
path_to_dspace_lib = os.path.join(_this_dir, "../../libs/dspace-rest-python")
# Put `src` and `src/pump` at the front of sys.path; `src/pump` is inserted
# last and therefore ends up first (presumably so that `_license` can be
# imported as a top-level module).
sys.path.insert(0, os.path.join(_this_dir, "../../src"))
sys.path.insert(0, os.path.join(_this_dir, "../../src/pump"))

import dspace # noqa
import pump # noqa
import settings # noqa
import project_settings # noqa
from dspace_rest_client.models import License # noqa
from utils import init_logging, update_settings # noqa

from _license import licenses

# getLogger() without a name returns the root logger.
_logger = logging.getLogger()

# env settings, update with project_settings
# (presumably project_settings.settings overrides the defaults in settings.env
# -- TODO confirm against update_settings)
env = update_settings(settings.env, project_settings.settings)
# Logging is initialized at import time using the "log_file" entry of env.
init_logging(_logger, env["log_file"])


if __name__ == '__main__':
    # Command line: directory holding the JSON files to import.
    default_input = os.path.join(_this_dir, "data")
    cli = argparse.ArgumentParser(description="Import licenses to DSpace.")
    cli.add_argument(
        '--input',
        type=str,
        default=default_input,
        help='Input directory for the JSON file',
    )
    args = cli.parse_args()

    # Connect to the DSpace backend described by the env settings.
    backend_cfg = env["backend"]
    dspace_be = dspace.rest(
        backend_cfg["endpoint"],
        backend_cfg["user"],
        backend_cfg["password"],
        backend_cfg["authentication"],
    )

    # Build the importer from the three JSON files (labels, licenses, mapping).
    _logger.info("Loading license import")
    labels_path = os.path.join(args.input, 'labels.json')
    licenses_path = os.path.join(args.input, 'licenses.json')
    mapping_path = os.path.join(args.input, 'mapping.json')
    licenses_imp = licenses(labels_path, licenses_path, mapping_path)

    # Run the import; no epersons object is passed.
    _logger.info("Start license import")
    licenses_imp.import_to(env, dspace_be)
    _logger.info("End license import")

0 comments on commit 71f4f25

Please sign in to comment.