From 8e405050aac0c7f9efcc8b75134e916889c26f48 Mon Sep 17 00:00:00 2001 From: jm Date: Mon, 12 Feb 2024 18:36:32 +0100 Subject: [PATCH] refactored, only 20 bitstream item are being created --- requirements.txt | 3 +- src/create_bitstreams/__init__.py | 0 src/create_bitstreams/create_bitstreams.py | 208 ------------------ {src => tools}/create_bitstreams/README.md | 0 tools/create_bitstreams/create_bitstreams.py | 188 ++++++++++++++++ tools/create_bitstreams/requirements.txt | 1 + {src => tools}/create_bitstreams/template.png | Bin tools/data_checker/__init__.py | 0 tools/localization/__init__.py | 0 9 files changed, 190 insertions(+), 210 deletions(-) delete mode 100644 src/create_bitstreams/__init__.py delete mode 100644 src/create_bitstreams/create_bitstreams.py rename {src => tools}/create_bitstreams/README.md (100%) create mode 100644 tools/create_bitstreams/create_bitstreams.py create mode 100644 tools/create_bitstreams/requirements.txt rename {src => tools}/create_bitstreams/template.png (100%) delete mode 100644 tools/data_checker/__init__.py delete mode 100644 tools/localization/__init__.py diff --git a/requirements.txt b/requirements.txt index d3db195..e68a798 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,5 +3,4 @@ requests lxml psycopg2 pre-commit -tqdm -requests-toolbelt \ No newline at end of file +tqdm \ No newline at end of file diff --git a/src/create_bitstreams/__init__.py b/src/create_bitstreams/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/create_bitstreams/create_bitstreams.py b/src/create_bitstreams/create_bitstreams.py deleted file mode 100644 index 7fdd1cc..0000000 --- a/src/create_bitstreams/create_bitstreams.py +++ /dev/null @@ -1,208 +0,0 @@ -import logging -import os -import zipfile - -import src.dspace # noqa -import src.settings # noqa -import src.project_settings # noqa - -from src.dspace.impl.models import Item -from src.utils import update_settings - -env = update_settings(src.settings.env, src.project_settings.settings) - -MULTIPART_CONTENT_TYPE = 'multipart/form-data' -COPIES_COUNT = 100 - -TEMPLATE_FILE_PATH = 'template.png' -ZIP_FILE_PATH = 'zipfile.zip' -BIG_FILE_PATH = 'bigfile.txt' - -COMMUNITY_2_CREATE = { - "type": { - "value": "community" - }, - "metadata": { - "dc.title": [ - { - "language": None, - "value": "Test Item Community" - } - ], - } -} - -COLLECTION_2_CREATE = { - "type": { - "value": "collection" - }, - "metadata": { - "dc.title": [ - { - "language": None, - "value": "Test Item Collection" - } - ] - }, -} - -ITEM_2_CREATE = { - "type": { - "value": "item" - }, - "metadata": { - "dc.title": [ - { - "language": None, - "value": "Test Item" - } - ] - }, - "inArchive": True, - "discoverable": True, - "withdrawn": False, -} - -def remove_file(path): - """ - Remove file from path. - @param path: path to the file - """ - try: - os.remove(path) - except OSError as e: - logging.warning(f"Error: {e.filename} - {e.strerror}.") - - -def fetch_original_bundle(dspace_client, item): - """ - Fetch original bundle from item. - @param dspace_client: dspace client - @param item: item where the bundle will be fetched - @return: original bundle or None if bundle was not found - """ - item_bundles = dspace_client.client.get_bundles(item) - for bundle in item_bundles: - if bundle.name == 'ORIGINAL': - return bundle - return None - - -def create_bistreams(dspace_client, item, is_big_file=False, is_zip_file=False, is_hundred_files=False): - """ - Create bitstreams for item. - @param dspace_client: dsapce client - @param item: item where the bitstreams will be created - @param is_big_file: if create an Item with big file - @param is_zip_file: if create an Item with zip file - @param is_hundred_files: if create an Item with 100 files - """ - # Fetch a bundle of existing Item or create a new one - # It is a bundle where the files will be uploaded - original_bundle = fetch_original_bundle(dspace_be, item) - if original_bundle is None: - dspace_client.client.create_bundle(item) - if not original_bundle: - logging.warning(f'The bundle was neither found nor created.') - - if is_hundred_files: - for i in range(COPIES_COUNT): - # create bitstream - logging.info(f'Creating bitstream with file: template_{i}') - dspace_client.client.create_bitstream(original_bundle, TEMPLATE_FILE_PATH, TEMPLATE_FILE_PATH, - MULTIPART_CONTENT_TYPE) - return - - if is_zip_file: - # generate zip file - zipfile.ZipFile(ZIP_FILE_PATH, mode='w').write(TEMPLATE_FILE_PATH) - - # create bitstream - logging.info(f'Creating bitstream with file: {ZIP_FILE_PATH}') - dspace_client.client.create_bitstream(original_bundle, ZIP_FILE_PATH, ZIP_FILE_PATH, MULTIPART_CONTENT_TYPE) - remove_file(ZIP_FILE_PATH) - return - - if is_big_file: - # generate big file - with open(BIG_FILE_PATH, 'wb') as f: - # 3GB - f.seek(3 * 1024 * 1024 * 1024) - f.write(b'\0') - - # create bitstream - logging.info(f'Creating bitstream with file: {BIG_FILE_PATH}') - dspace_client.client.create_bitstream(original_bundle, BIG_FILE_PATH, BIG_FILE_PATH, - MULTIPART_CONTENT_TYPE) - remove_file(BIG_FILE_PATH) - return - - -def create_item_with_title(dspace_client, parent, title): - """ - Create item with specific title. - @param dspace_client: dspace client - @param parent: collection where the item will be created - @param title: title of the item - @return: created item or None if item was not created - """ - item2create = ITEM_2_CREATE - item2create['metadata']['dc.title'][0]['value'] = title - return dspace_client.client.create_item(parent.uuid, Item(item2create)) - - -def pop_item(items: list): - """ - Pop item from list. - @param items: list of item fetched from the server - @return: item or None - """ - if items is None: - return None - - return items.pop() - - -if __name__ == '__main__': - dspace_be = src.dspace.rest( - env["backend"]["endpoint"], - env["backend"]["user"], - env["backend"]["password"], - env["backend"]["authentication"] - ) - - # Fetch all items from the server - all_items = dspace_be.client.get_items() - - # 3 Items are updated - if they don't exist create a community and collection where a new item will be created - if len(all_items) < 3: - # Create community - community = dspace_be.client.create_community(None, COMMUNITY_2_CREATE) - if not community: - logging.warning(f'Community was not created.') - - # Create collection - collection = dspace_be.client.create_collection(community.uuid, COLLECTION_2_CREATE) - if not collection: - logging.warning(f'Collection was not created.') - - # Update item with files or create a new one - - # Item with 100 bitstreams - item_hundred_files = pop_item(all_items) - if item_hundred_files is None: - item_hundred_files = create_item_with_title(dspace_be, collection, 'Hundred Files') - create_bistreams(dspace_be, item_hundred_files, is_hundred_files=True) - - # Item with zip bitstream - item_zip_files = pop_item(all_items) - if item_zip_files is None: - item_zip_files = create_item_with_title(dspace_be, collection, 'Zip File') - create_bistreams(dspace_be, item_zip_files, is_zip_file=True) - - # Item with big bitstream - item_big_file = pop_item(all_items) - if item_big_file is None: - item_big_file = create_item_with_title(dspace_be, collection, 'Big File') - create_bistreams(dspace_be, item_big_file, is_big_file=True) - diff --git a/src/create_bitstreams/README.md b/tools/create_bitstreams/README.md similarity index 100% rename from src/create_bitstreams/README.md rename to tools/create_bitstreams/README.md diff --git a/tools/create_bitstreams/create_bitstreams.py b/tools/create_bitstreams/create_bitstreams.py new file mode 100644 index 0000000..9221f6b --- /dev/null +++ b/tools/create_bitstreams/create_bitstreams.py @@ -0,0 +1,188 @@ +import logging +import os +import zipfile +import sys +import tqdm +from urllib.parse import urljoin, urlparse + +_this_dir = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, os.path.join(_this_dir, "../../src")) + + +import dspace # noqa +import settings # noqa +import project_settings # noqa +from dspace.impl.models import Item # noqa +from utils import init_logging, update_settings # noqa + +_logger = logging.getLogger() + +# env settings, update with project_settings +env = update_settings(settings.env, project_settings.settings) +init_logging(_logger, env["log_file"]) + +MULTIPART_CONTENT_TYPE = 'multipart/form-data' +COPIES_COUNT = 20 + +TEMPLATE_FILE_PATH = 'template.png' +ZIP_FILE_PATH = 'zipfile.zip' +BIG_FILE_PATH = 'bigfile.txt' + +COMMUNITY_2_CREATE = { + "type": { + "value": "community" + }, + "metadata": { + "dc.title": [ + { + "language": None, + "value": "Test Item Community" + } + ], + } +} + +COLLECTION_2_CREATE = { + "type": { + "value": "collection" + }, + "metadata": { + "dc.title": [ + { + "language": None, + "value": "Test Item Collection" + } + ] + }, +} + +ITEM_2_CREATE = { + "type": { + "value": "item" + }, + "metadata": { + "dc.title": [ + { + "language": None, + "value": "Test Item" + } + ] + }, + "inArchive": True, + "discoverable": True, + "withdrawn": False, +} + + +def remove_file(path): + try: + os.remove(path) + except OSError as e: + _logger.warning(f"Error: {e.filename} - {e.strerror}.") + + +def get_bundle(dspace_client, item): + """ + Fetch a bundle of existing Item or create a new one + """ + original_bundle = None + item_bundles = dspace_client.client.get_bundles(item) + for bundle in item_bundles: + if bundle.name == 'ORIGINAL': + return bundle + if original_bundle is None: + original_bundle = dspace_client.client.create_bundle(item) + if not original_bundle: + _logger.warning('The bundle was neither found nor created.') + return None + + return original_bundle + + +def create_item_with_title(dspace_client, parent, title): + """ + Create item with specific title. + @param dspace_client: dspace client + @param parent: collection where the item will be created + @param title: title of the item + @return: created item or None if item was not created + """ + item2create = ITEM_2_CREATE + item2create['metadata']['dc.title'][0]['value'] = title + return dspace_client.client.create_item(parent.uuid, Item(item2create)) + + +if __name__ == '__main__': + dspace_be = dspace.rest( + env["backend"]["endpoint"], + env["backend"]["user"], + env["backend"]["password"], + env["backend"]["authentication"] + ) + + def _link(uuid): + parsed_url = urlparse(env["backend"]["endpoint"]) + return urljoin(f"{parsed_url.scheme}://{parsed_url.netloc}", f'items/{uuid}') + + # Fetch all items from the server + all_items = dspace_be.client.get_items() + _logger.info(f"Found {len(all_items)} items.") + + # 3 Items are updated - if they don't exist create a community and collection where a new item will be created + if len(all_items) < 3: + # Create community + community = dspace_be.client.create_community(None, COMMUNITY_2_CREATE) + if not community: + _logger.warning('Community was not created.') + + # Create collection + collection = dspace_be.client.create_collection( + community.uuid, COLLECTION_2_CREATE) + if not collection: + _logger.warning('Collection was not created.') + + item_hundred_files = create_item_with_title( + dspace_be, collection, 'Hundred Files') + item_zip_files = create_item_with_title(dspace_be, collection, 'Zip File') + item_big_file = create_item_with_title(dspace_be, collection, 'Big File') + + else: + import random + for item in all_items: + _logger.info(f"Item: {_link(item.uuid)}") + + random.shuffle(all_items) + item_hundred_files, item_zip_files, item_big_file = all_items[:3] + + _logger.info( + f"Using items:\n{_link(item_hundred_files.uuid)}\n{_link(item_zip_files.uuid)}\n{_link(item_big_file.uuid)}") + + # Item with 100 bitstreams + b = get_bundle(dspace_be, item_hundred_files) + _logger.info( + f"Adding many files to [{item_hundred_files.handle}] [{item_hundred_files.uuid}]") + for i in tqdm.tqdm(range(COPIES_COUNT)): + dspace_be.client.create_bitstream( + b, TEMPLATE_FILE_PATH, TEMPLATE_FILE_PATH, MULTIPART_CONTENT_TYPE) + _logger.info(f"Created [{item_hundred_files.handle}] with many files") + + # Item with zip bitstream + b = get_bundle(dspace_be, item_zip_files) + zipfile.ZipFile(ZIP_FILE_PATH, mode='w').write(TEMPLATE_FILE_PATH) + _logger.debug(f'Creating bitstream with file: {ZIP_FILE_PATH}') + dspace_be.client.create_bitstream( + b, ZIP_FILE_PATH, ZIP_FILE_PATH, MULTIPART_CONTENT_TYPE) + _logger.info( + f"Created [{item_zip_files.handle}] [{item_zip_files.uuid}] with ZIP file") + remove_file(ZIP_FILE_PATH) + + # Item with big bitstream + big_size = 3 * 1024 * 1024 * 1024 + _logger.debug(f'Creating [{big_size// (1024 * 1024)} GB] file: {BIG_FILE_PATH}') + with open(BIG_FILE_PATH, 'wb') as f: + f.seek(big_size) + f.write(b'\0') + dspace_be.client.create_bitstream( + item_big_file, BIG_FILE_PATH, BIG_FILE_PATH, MULTIPART_CONTENT_TYPE) + _logger.info(f"Created [{item_big_file.handle}] [{item_big_file.uuid}] with BIG file") + remove_file(BIG_FILE_PATH) diff --git a/tools/create_bitstreams/requirements.txt b/tools/create_bitstreams/requirements.txt new file mode 100644 index 0000000..1ecd96b --- /dev/null +++ b/tools/create_bitstreams/requirements.txt @@ -0,0 +1 @@ +requests-toolbelt \ No newline at end of file diff --git a/src/create_bitstreams/template.png b/tools/create_bitstreams/template.png similarity index 100% rename from src/create_bitstreams/template.png rename to tools/create_bitstreams/template.png diff --git a/tools/data_checker/__init__.py b/tools/data_checker/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tools/localization/__init__.py b/tools/localization/__init__.py deleted file mode 100644 index e69de29..0000000