From 5f7e4ee0393ae67dbdd8cba39fee87ad10985e3d Mon Sep 17 00:00:00 2001
From: Kevin Stadler
Date: Sun, 17 Nov 2024 16:22:01 +0100
Subject: [PATCH] feat: add baserow import/export scripts

---
 scripts/2_from_baserow.py     |  21 +++++
 scripts/3_to_typesense.py     | 172 ++++++++++++++++++++++++++++++++++
 scripts/typesense-schema.json | 141 ++++++++++++++++++++++++++++
 3 files changed, 334 insertions(+)
 create mode 100755 scripts/2_from_baserow.py
 create mode 100755 scripts/3_to_typesense.py
 create mode 100644 scripts/typesense-schema.json

diff --git a/scripts/2_from_baserow.py b/scripts/2_from_baserow.py
new file mode 100755
index 0000000..1832d1d
--- /dev/null
+++ b/scripts/2_from_baserow.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+# type: ignore
+from acdh_baserow_pyutils import BaseRowClient
+import os
+from dotenv import load_dotenv
+
+load_dotenv("../.env.local")
+
+BASEROW_BASE_URL = os.environ.get("BASEROW_BASE_URL")
+BASEROW_USER = os.environ.get("BASEROW_USER")
+BASEROW_PW = os.environ.get("BASEROW_PW")
+BASEROW_TOKEN = os.environ.get("BASEROW_TOKEN")
+BASEROW_DATABASE_ID = os.environ.get("BASEROW_DATABASE_ID")
+# initialize the client
+br_client = BaseRowClient(
+    BASEROW_USER, BASEROW_PW, BASEROW_TOKEN, BASEROW_BASE_URL, BASEROW_DATABASE_ID
+)
+
+folder_name = "from_baserow"
+os.makedirs(folder_name, exist_ok=True)  # fix: ensure dump target exists (was commented out)
+br_client.dump_tables_as_json(BASEROW_DATABASE_ID, folder_name=folder_name, indent="\t")
diff --git a/scripts/3_to_typesense.py b/scripts/3_to_typesense.py
new file mode 100755
index 0000000..183cc38
--- /dev/null
+++ b/scripts/3_to_typesense.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+# type: ignore
+import logging
+from dotenv import load_dotenv
+import json
+import argparse
+import os
+import typesense
+from typesense.exceptions import ObjectNotFound
+
+parser = argparse.ArgumentParser(
+    prog="baserow", description="import / export data from / to baserow"
+)
+
+parser.add_argument(
+    "-env",
+    default="../.env.local",
+    help=".env file to be used for getting typesense server details",
+)
+parser.add_argument(
+    "-v",
+    "--verbose",
+    action="count",
+    default=0,  # fix: was 3 (forced DEBUG), contradicting the help text and the --verbose gate below
+    help="Increase the verbosity of the logging output: default is WARNING, use -v for INFO, -vv for DEBUG",
+)
+
+args = parser.parse_args()
+
+logging.basicConfig(
+    level=max(10, 30 - 10 * args.verbose),
+    # format="%(levelname)-8s %(message)s",
+)
+
+
+logging.debug(f"Loading typesense access data from {args.env}")
+os.chdir(os.path.dirname(__file__))
+load_dotenv(args.env)
+
+
+def fatal(msg):
+    logging.fatal(msg)
+    exit(1)
+
+
+logging.info(
+    "step 1: accumulate relational data into a typesense-ready nested structure"
+)
+
+
+def load_json(dirname, filename):
+    return json.load(open(f"{dirname}/{filename}.json", encoding="utf-8"))  # fix: filename was ignored
+
+
+# get the data as it was originally exported to baserow
+publications = load_json("to_baserow", "Publikation")
+translations = load_json("to_baserow", "Übersetzung")
+works = load_json("to_baserow", "BernhardWerk")
+translators = load_json("to_baserow", "Übersetzer")
+
+# TODO get and apply the manual edits
+# publications = json.load(open("from_baserow/Publikation.json"))
+# translations = json.load(open("from_baserow/Übersetzung.json"))
+# works = json.load(open("from_baserow/BernhardWerk.json.json"))
+# translators = json.load(open("from_baserow/Übersetzer.json"))
+
+# create nested structures
+
+
+def del_empty_strings(o, field_names):
+    for f in field_names:
+        if not o[f]:
+            del o[f]
+
+
+for i, t in enumerate(translators):
+    # add 1-indexed translator ids to allow links to translator pages
+    t["id"] = i + 1
+
+for i, w in enumerate(works):
+    # add 1-indexed bernhard work ids to allow links to work ids
+    w["id"] = i + 1
+    # TODO to be removed/replaced by work id
+    w["yeartitle"] = str(w["year"]) + w["title"]
+
+for t in translations:
+    t["work"] = works[t["work"] - 1]
+    t["translators"] = [translators[t_id - 1] for t_id in t["translators"]]
+    del_empty_strings(t, ["work_display_title"])
+
+for i, pub in enumerate(publications):
+    pub["id"] = str(i + 1)
+    pub["contains"] = [translations[t_id - 1] for t_id in pub["contains"]]
+
+    pub["images"] = [{"id": img} for img in pub["images"].split()]  # fix: split(" ") on "" gave [""]
+    pub["has_image"] = len(pub["images"]) > 0
+    if not pub["year_display"]:
+        pub["year_display"] = str(pub["year"])
+
+    for pid in pub["parents"]:
+        if "later" in publications[pid - 1]:
+            publications[pid - 1]["later"].append(i + 1)
+        else:
+            publications[pid - 1]["later"] = [i + 1]
+
+    del_empty_strings(pub, ["publication_details"])
+
+    # trim data a little
+    del pub["exemplar_suhrkamp_berlin"]
+    del pub["exemplar_oeaw"]
+    del pub["original_publication"]
+    del pub["zusatzinfos"]
+
+logging.info("step 2: insert nested documents into typesense")
+
+if "TYPESENSE_ADMIN_API_KEY" not in os.environ:
+    fatal("Couldn't find typesense database information in environment files")
+
+logging.info(f"connecting to {os.environ.get('NEXT_PUBLIC_TYPESENSE_HOST')}")
+
+client = typesense.Client(
+    {
+        "api_key": os.environ.get("TYPESENSE_ADMIN_API_KEY"),
+        "nodes": [
+            {
+                "host": os.environ.get("NEXT_PUBLIC_TYPESENSE_HOST"),
+                "port": os.environ.get("NEXT_PUBLIC_TYPESENSE_PORT"),
+                "protocol": os.environ.get("NEXT_PUBLIC_TYPESENSE_PROTOCOL"),
+            }
+        ],
+        "connection_timeout_seconds": 5,
+    }
+)
+
+collection_name = os.environ.get("NEXT_PUBLIC_TYPESENSE_COLLECTION_NAME")
+
+try:
+    r = client.collections[collection_name].retrieve()
+except ObjectNotFound:
+    logging.info(f"collection '{collection_name}' does not exist yet, creating")
+    schema = json.load(open("typesense-schema.json", encoding="utf-8"))
+    schema["name"] = collection_name  # fix: typesense reads "name", not "collection_name"
+    create = client.collections.create(schema)
+    r = client.collections[collection_name].retrieve()
+
+
+if r["num_documents"] > 0:
+    logging.info(f'Clearing {r["num_documents"]} existing documents')
+    r = client.collections[collection_name].documents.delete({"filter_by": 'id :!= ""'})
+    logging.info(
+        f'Cleared {r["num_deleted"]} documents from collection {collection_name}'
+    )
+
+logging.info(f"importing {len(publications)} documents")
+r = client.collections[collection_name].documents.import_(publications)
+
+nfails = list(map(lambda d: d["success"], r)).count(False)
+if nfails == len(publications):
+    if args.verbose > 0:
+        print(r)
+    logging.error(
+        f"Failed to insert any of the documents. Either the documents don't comply with the schema of the collection, or maybe you are using an api key that only has read access to the collection? (run the script again with --verbose to see all {nfails} errors)"
+    )
+    exit(1)
+elif nfails > 0:
+    logging.error(f"{nfails} documents could not be inserted.")
+    for doc in filter(lambda d: not d["success"], r):
+        logging.error(doc)
+    exit(1)
+
+logging.debug(publications[0])  # fix: was a stray debug print()
+logging.info("Success!")
diff --git a/scripts/typesense-schema.json b/scripts/typesense-schema.json
new file mode 100644
index 0000000..43fc547
--- /dev/null
+++ b/scripts/typesense-schema.json
@@ -0,0 +1,141 @@
+{
+  "name": "SET IN .env FILE NOT HERE",
+  "fields": [
+    {
+      "name": "erstpublikation",
+      "type": "bool",
+      "facet": false,
+      "optional": false,
+      "index": true,
+      "sort": false,
+      "infix": false,
+      "locale": "",
+      "stem": false
+    },
+    {
+      "name": "has_image",
+      "type": "bool",
+      "facet": false,
+      "optional": false,
+      "index": true,
+      "sort": false,
+      "infix": false,
+      "locale": "",
+      "stem": false
+    },
+    {
+      "name": "year",
+      "type": "int32",
+      "facet": false,
+      "optional": false,
+      "index": true,
+      "sort": true,
+      "infix": false,
+      "locale": "",
+      "stem": false
+    },
+    {
+      "name": "signatur",
+      "type": "string",
+      "facet": false,
+      "optional": false,
+      "index": true,
+      "sort": false,
+      "infix": false,
+      "locale": "",
+      "stem": false
+    },
+    {
+      "name": "year_display",
+      "type": "string",
+      "facet": false,
+      "optional": false,
+      "index": true,
+      "sort": false,
+      "infix": false,
+      "locale": "",
+      "stem": false
+    },
+    {
+      "name": "contains.work.category",
+      "type": "string[]",
+      "facet": true,
+      "optional": true,
+      "index": true,
+      "sort": false,
+      "infix": false,
+      "locale": "",
+      "stem": false
+    },
+    {
+      "name": "contains.work.title",
+      "type": "string[]",
+      "facet": true,
+      "optional": false,
+      "index": true,
+      "sort": false,
+      "infix": false,
+      "locale": "",
+      "stem": false
+    },
+    {
+      "name": "contains.work.yeartitle",
+      "type": "string[]",
+      "facet": true,
+      "optional": false,
+      "index": true,
+      "sort": false,
+      "infix": false,
+      "locale": "",
+      "stem": false
+    },
+    {
+      "name": "language",
+      "type": "string",
+      "facet": true,
+      "optional": false,
+      "index": true,
+      "sort": true,
+      "infix": false,
+      "locale": "",
+      "stem": false
+    },
+    {
+      "name": "contains.title",
+      "type": "string[]",
+      "facet": true,
+      "optional": false,
+      "index": true,
+      "sort": false,
+      "infix": false,
+      "locale": "",
+      "stem": false
+    },
+    {
+      "name": "contains.translators.name",
+      "type": "string[]",
+      "facet": true,
+      "optional": true,
+      "index": true,
+      "sort": false,
+      "infix": false,
+      "locale": "",
+      "stem": false
+    },
+    {
+      "name": "title",
+      "type": "string",
+      "facet": true,
+      "optional": false,
+      "index": true,
+      "sort": true,
+      "infix": false,
+      "locale": "",
+      "stem": false
+    }
+  ],
+  "default_sorting_field": "year",
+  "enable_nested_fields": true,
+  "symbols_to_index": [],
+  "token_separators": []
+}