Skip to content

Commit

Permalink
feat: add baserow import/export scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
kevinstadler committed Nov 17, 2024
1 parent a302676 commit 5f7e4ee
Show file tree
Hide file tree
Showing 3 changed files with 334 additions and 0 deletions.
21 changes: 21 additions & 0 deletions scripts/2_from_baserow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env python3
# type: ignore
# Dump all tables of the configured baserow database as JSON files into
# ./from_baserow/ (one file per table), using credentials from ../.env.local.
from acdh_baserow_pyutils import BaseRowClient
import os
from dotenv import load_dotenv

load_dotenv("../.env.local")

# connection / credential details -- all read from the env file above
BASEROW_BASE_URL = os.environ.get("BASEROW_BASE_URL")
BASEROW_USER = os.environ.get("BASEROW_USER")
BASEROW_PW = os.environ.get("BASEROW_PW")
BASEROW_TOKEN = os.environ.get("BASEROW_TOKEN")
BASEROW_DATABASE_ID = os.environ.get("BASEROW_DATABASE_ID")
# initialize the client
br_client = BaseRowClient(
    BASEROW_USER, BASEROW_PW, BASEROW_TOKEN, BASEROW_BASE_URL, BASEROW_DATABASE_ID
)

folder_name = "from_baserow"
# os.mkdir(folder_name)
# NOTE(review): the mkdir above is commented out -- presumably the dump call
# creates the folder itself or it is expected to already exist; confirm.
br_client.dump_tables_as_json(BASEROW_DATABASE_ID, folder_name=folder_name, indent="\t")
172 changes: 172 additions & 0 deletions scripts/3_to_typesense.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
#!/usr/bin/env python3
# type: ignore
import logging
from dotenv import load_dotenv
import json
import argparse
import os
import typesense
from typesense.exceptions import ObjectNotFound

# --- CLI definition: where to find the .env file and how verbose to be ----
parser = argparse.ArgumentParser(
    prog="baserow", description="import / export data from / to baserow"
)
parser.add_argument(
    "-env",
    default="../.env.local",
    help=".env file to be used for getting typesense server details",
)
parser.add_argument(
    "-v",
    "--verbose",
    action="count",
    default=3,
    help="Increase the verbosity of the logging output: default is WARNING, use -v for INFO, -vv for DEBUG",
)
args = parser.parse_args()

# Map the -v count onto stdlib logging levels, clamping at DEBUG (10).
logging.basicConfig(level=max(10, 30 - 10 * args.verbose))

logging.debug(f"Loading typesense access data from {args.env}")
# Resolve all relative paths from the script's own directory before
# reading the env file, so the script works from any cwd.
os.chdir(os.path.dirname(__file__))
load_dotenv(args.env)


def fatal(msg):
    """Log *msg* at FATAL level and abort the script with exit status 1."""
    logging.fatal(msg)
    # Raise SystemExit directly: the `exit()` builtin only exists when the
    # `site` module is loaded; SystemExit is always available.
    raise SystemExit(1)


# --- step 1: denormalize the relational baserow dump ----------------------
logging.info(
    "step 1: accumulate relational data into a typesense-ready nested structure"
)


def load_json(dirname, filename):
    """Return the parsed contents of ``{dirname}/{filename}.json``.

    Fixes: the original ignored the *filename* argument and opened a
    hard-coded file name, so every table would have been read from the
    same file. Also closes the file handle and pins UTF-8 (the table
    names/content contain umlauts).
    """
    with open(f"{dirname}/{filename}.json", encoding="utf-8") as f:
        return json.load(f)


# get the data as it was originally exported to baserow -- presumably written
# by an earlier pipeline step; one JSON file per baserow table
publications = load_json("to_baserow", "Publikation")
translations = load_json("to_baserow", "Übersetzung")
works = load_json("to_baserow", "BernhardWerk")
translators = load_json("to_baserow", "Übersetzer")

# TODO get and apply the manual edits (the dumps produced by 2_from_baserow.py)
# publications = json.load(open("from_baserow/Publikation.json"))
# translations = json.load(open("from_baserow/Übersetzung.json"))
# works = json.load(open("from_baserow/BernhardWerk.json.json"))
# translators = json.load(open("from_baserow/Übersetzer.json"))

# create nested structures


def del_empty_strings(o, field_names):
    """Remove each named key from dict *o* whose value is falsy (e.g. "")."""
    for field in field_names:
        if o[field]:
            continue
        del o[field]


for i, t in enumerate(translators):
    # add 1-indexed translator ids to allow links to translator pages
    t["id"] = i + 1

for i, w in enumerate(works):
    # add 1-indexed bernhard work ids to allow links to work ids
    w["id"] = i + 1
    # TODO to be removed/replaced by work id
    w["yeartitle"] = str(w["year"]) + w["title"]

# resolve the 1-based foreign keys into nested objects (shared references,
# not copies -- the same work/translator dict is embedded in every user)
for t in translations:
    t["work"] = works[t["work"] - 1]
    t["translators"] = [translators[t_id - 1] for t_id in t["translators"]]
    del_empty_strings(t, ["work_display_title"])

for i, pub in enumerate(publications):
    pub["id"] = str(i + 1)
    pub["contains"] = [translations[t_id - 1] for t_id in pub["contains"]]

    # split() (not split(" ")) so an empty image string yields an empty list
    # rather than [""] -- otherwise has_image below is wrongly True and a
    # bogus {"id": ""} image entry is emitted
    pub["images"] = [{"id": img} for img in pub["images"].split()]
    pub["has_image"] = len(pub["images"]) > 0
    if not pub["year_display"]:
        pub["year_display"] = str(pub["year"])

    # record the reverse ("later edition") link on each parent publication
    for pid in pub["parents"]:
        publications[pid - 1].setdefault("later", []).append(i + 1)

    del_empty_strings(pub, ["publication_details"])

    # trim data a little
    del pub["exemplar_suhrkamp_berlin"]
    del pub["exemplar_oeaw"]
    del pub["original_publication"]
    del pub["zusatzinfos"]

logging.info("step 2: insert nested documents into typesense")

if "TYPESENSE_ADMIN_API_KEY" not in os.environ:
    fatal("Couldn't find typesense database information in environment files")

logging.info(f"connecting to {os.environ.get('NEXT_PUBLIC_TYPESENSE_HOST')}")

client = typesense.Client(
    {
        "api_key": os.environ.get("TYPESENSE_ADMIN_API_KEY"),
        "nodes": [
            {
                "host": os.environ.get("NEXT_PUBLIC_TYPESENSE_HOST"),
                "port": os.environ.get("NEXT_PUBLIC_TYPESENSE_PORT"),
                "protocol": os.environ.get("NEXT_PUBLIC_TYPESENSE_PROTOCOL"),
            }
        ],
        "connection_timeout_seconds": 5,
    }
)

collection_name = os.environ.get("NEXT_PUBLIC_TYPESENSE_COLLECTION_NAME")

# make sure the collection exists, creating it from the bundled schema if not
try:
    r = client.collections[collection_name].retrieve()
except ObjectNotFound:
    logging.info(f"collection '{collection_name}' does not exist yet, creating")
    schema = json.load(open("typesense-schema.json"))
    # the typesense create-collection API expects the collection name under
    # the "name" key (which is where typesense-schema.json carries its
    # placeholder); the original wrote to "collection_name", so the
    # collection would have been created under the placeholder name
    schema["name"] = collection_name
    client.collections.create(schema)
    r = client.collections[collection_name].retrieve()

# start from a clean slate so rows deleted in baserow don't linger
if r["num_documents"] > 0:
    logging.info(f'Clearing {r["num_documents"]} existing documents')
    r = client.collections[collection_name].documents.delete({"filter_by": 'id :!= ""'})
    logging.info(
        f'Cleared {r["num_deleted"]} documents from collection {collection_name}'
    )

logging.info(f"importing {len(publications)} documents")
r = client.collections[collection_name].documents.import_(publications)

# import_ returns one result dict per document; count the failures
nfails = [d["success"] for d in r].count(False)
if nfails == len(publications):
    if args.verbose > 0:
        print(r)
    logging.error(
        f"Failed to insert any of the documents. Either the documents don't comply with the schema of the collection, or maybe you are using an api key that only has read access to the collection? (run the script again with --verbose to see all {nfails} errors)"
    )
    exit(1)
elif nfails > 0:
    logging.error(f"{nfails} documents could not be inserted.")
    for doc in filter(lambda d: not d["success"], r):
        logging.error(doc)
    exit(1)

print(publications[0])
logging.info("Success!")
141 changes: 141 additions & 0 deletions scripts/typesense-schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
{
"name": "SET IN .env FILE NOT HERE",
"fields": [
{
"name": "erstpublikation",
"type": "bool",
"facet": false,
"optional": false,
"index": true,
"sort": false,
"infix": false,
"locale": "",
"stem": false
},
{
"name": "has_image",
"type": "bool",
"facet": false,
"optional": false,
"index": true,
"sort": false,
"infix": false,
"locale": "",
"stem": false
},
{
"name": "year",
"type": "int32",
"facet": false,
"optional": false,
"index": true,
"sort": true,
"infix": false,
"locale": "",
"stem": false
},
{
"name": "signatur",
"type": "string",
"facet": false,
"optional": false,
"index": true,
"sort": false,
"infix": false,
"locale": "",
"stem": false
},
{
"name": "year_display",
"type": "string",
"facet": false,
"optional": false,
"index": true,
"sort": false,
"infix": false,
"locale": "",
"stem": false
},
{
"name": "contains.work.category",
"type": "string[]",
"facet": true,
"optional": true,
"index": true,
"sort": false,
"infix": false,
"locale": "",
"stem": false
},
{
"name": "contains.work.title",
"type": "string[]",
"facet": true,
"optional": false,
"index": true,
"sort": false,
"infix": false,
"locale": "",
"stem": false
},
{
"name": "contains.work.yeartitle",
"type": "string[]",
"facet": true,
"optional": false,
"index": true,
"sort": false,
"infix": false,
"locale": "",
"stem": false
},
{
"name": "language",
"type": "string",
"facet": true,
"optional": false,
"index": true,
"sort": true,
"infix": false,
"locale": "",
"stem": false
},
{
"name": "contains.title",
"type": "string[]",
"facet": true,
"optional": false,
"index": true,
"sort": false,
"infix": false,
"locale": "",
"stem": false
},
{
"name": "contains.translators.name",
"type": "string[]",
"facet": true,
"optional": true,
"index": true,
"sort": false,
"infix": false,
"locale": "",
"stem": false
},
{
"name": "title",
"type": "string",
"facet": true,
"optional": false,
"index": true,
"sort": true,
"infix": false,
"locale": "",
"stem": false
}
],
"default_sorting_field": "year",
"enable_nested_fields": true,
"symbols_to_index": [],
"token_separators": []
}

0 comments on commit 5f7e4ee

Please sign in to comment.