data: add script for exporting final data to json
kevinstadler committed Feb 3, 2025
1 parent 5b568af commit 0edaf24
Showing 11 changed files with 61,388 additions and 38 deletions.
14 changes: 11 additions & 3 deletions lib/model.ts
@@ -10,7 +10,11 @@ export interface Publication {
id: number;
signatur: string;
title: string;

// if unset, short_title is the same as title
short_title: string;

// language tag according to https://www.rfc-editor.org/rfc/rfc5646.html -- see scripts/3_merge_data.py or messages/*.json for the list of codes used
language: string;
contains: Array<Translation>;

@@ -36,7 +40,7 @@ export interface Publication {

export interface Translation {
id: number;
title: string; // translated title,
title: string; // title of the translation
work: BernhardWork;

// the original work title of a translation might deviate from the canonical title of the original work, e.g. adding '(Auswahl)' etc.
@@ -46,8 +50,12 @@

export interface BernhardWork {
id: number;
title: string; // german/french original
short_title: string; // abbreviated title, commonly used for letters

// canonical title of the german/french original
title: string;

// abbreviated title, commonly used for letters. if unset, short_title is the same as title
short_title: string;
year?: number;
category?: Category;
}
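
To make the shapes above concrete, here is one invented record in the form the exported documents take (all values are hypothetical, not taken from the dataset):

```python
publication = {
    "id": 1,
    "signatur": "ba-001",            # hypothetical shelf mark
    "title": "Correction",
    "short_title": "Correction",     # falls back to title when unset
    "language": "en",                # RFC 5646 tag
    "contains": [
        {
            "id": 1,
            "title": "Correction",
            "work": {
                "id": 1,
                "title": "Korrektur",        # canonical German original
                "short_title": "Korrektur",
                "year": 1975,
            },
        }
    ],
}
```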
126 changes: 91 additions & 35 deletions scripts/3_to_typesense.py → scripts/3_merge_changes.py
@@ -1,10 +1,12 @@
#!/usr/bin/env python3
# type: ignore
import logging
from dotenv import load_dotenv
import json
import argparse
import json
import logging
import os
import sys

from dotenv import load_dotenv
import typesense
from typesense.exceptions import ObjectNotFound

@@ -25,6 +27,19 @@
help="Increase the verbosity of the logging output: default is WARNING, use -v for INFO, -vv for DEBUG",
)

output = parser.add_mutually_exclusive_group(required=True)

output.add_argument(
"--typesense", "-t", action="store_true", help="write the merged data to typesense"
)

output.add_argument(
"--json",
"-j",
action="store_true",
help="write the merged data to json files in the data-final/ directory",
)
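
For readers unfamiliar with `add_mutually_exclusive_group(required=True)`: exactly one of the two flags must be given, and each becomes a boolean on the parsed namespace. A standalone sketch (flag names mirror the script, the `parse_args` calls are illustrative):

```python
import argparse

parser = argparse.ArgumentParser()
output = parser.add_mutually_exclusive_group(required=True)
output.add_argument("--typesense", "-t", action="store_true")
output.add_argument("--json", "-j", action="store_true")

print(parser.parse_args(["--json"]))  # Namespace(typesense=False, json=True)
# parser.parse_args([])               # error: one of --typesense/-t, --json/-j is required
# parser.parse_args(["-t", "-j"])     # error: --json/-j not allowed with --typesense/-t
```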

args = parser.parse_args()

logging.basicConfig(
@@ -33,19 +48,12 @@
)


logging.debug(f"Loading typesense access data from {args.env}")
os.chdir(os.path.dirname(__file__))
load_dotenv(args.env)


def fatal(msg):
logging.fatal(msg)
exit(1)


logging.info(
"step 1: accumulate relational data into a typesense-ready nested structure"
)
logging.info("accumulating relational data into a typesense-ready nested structure")


def load_json(dirname, filename):
@@ -102,15 +110,19 @@ def merge_changes(orig, changed, field_names):
merge_changes(works, work_changes, ["title", "short_title", "year", "category", "gnd"])
merge_changes(translators, translator_changes, ["name", "gnd"])

# create nested structures


def del_empty_strings(o, field_names):
for f in field_names:
if not o[f]:
del o[f]


def null_empty_strings(o, field_names):
for f in field_names:
if not o[f]:
o[f] = None
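
The two helpers differ only in whether an empty field vanishes from the record or survives as an explicit null in the json output; a quick illustration on a hypothetical record:

```python
record = {"isbn": "", "gnd": "", "title": "Korrektur"}

del_empty_strings(record, ["isbn"])   # empty value: key is removed entirely
null_empty_strings(record, ["gnd"])   # empty value: key stays, becomes None (null in json)

print(record)  # {'gnd': None, 'title': 'Korrektur'}
```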


for i, t in enumerate(translators):
# add 1-indexed translator ids to allow links to translator pages
t["id"] = i + 1
@@ -131,20 +143,26 @@ def del_empty_strings(o, field_names):
for i, w in enumerate(works):
# add 1-indexed bernhard work ids to allow links to work ids
w["id"] = i + 1
w["category"] = categories[w["category"]] if w["category"] else "fragments"

if not w["short_title"]:
w["short_title"] = w["title"]
w["short_title"] = w["title"] if args.typesense else None
null_empty_strings(w, ["gnd"])

w["category"] = categories[w["category"]] if w["category"] else "fragments"

for t in translations:
t["work"] = works[t["work"] - 1]
t["translators"] = [translators[t_id - 1] for t_id in t["translators"]]
for i, t in enumerate(translations):
t["id"] = i + 1
if "MISSING" in t["title"]:
t["title"] = "???"
# work around https://typesense.org/docs/guide/tips-for-searching-common-types-of-data.html#searching-for-null-or-empty-values
# for the /translators page
t["has_translators"] = len(t["translators"]) != 0
del_empty_strings(t, ["work_display_title"])
null_empty_strings(t, ["work_display_title"])

if args.typesense:
# create nested structures
t["work"] = works[t["work"] - 1]
t["translators"] = [translators[t_id - 1] for t_id in t["translators"]]
# work around https://typesense.org/docs/guide/tips-for-searching-common-types-of-data.html#searching-for-null-or-empty-values
# for the /translators page
t["has_translators"] = len(t["translators"]) != 0

languages = {
"albanian": "sq",
@@ -192,23 +210,26 @@ def del_empty_strings(o, field_names):
}
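
The table maps the spelled-out language names used in the source data to RFC 5646 subtags (only the first entry is visible in this hunk). Anything not in the table fails loudly in the publication loop below:

```python
assert languages["albanian"] == "sq"
# languages["unknownese"]  # -> KeyError: a new source language must be added to the table first
```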

for i, pub in enumerate(publications):
pub["id"] = str(i + 1)
if "short_title" not in pub:
pub["short_title"] = pub["title"]

pub["contains"] = [
translations[t_id - 1]
for t_id in pub["contains"]
if "MISSING" not in translations[t_id - 1]["work"]["title"]
]
pub["id"] = i + 1
pub["language"] = languages[pub["language"]]

if "short_title" not in pub:
pub["short_title"] = pub["title"] if args.typesense else None

if args.typesense:
# create nested structures
pub["contains"] = [
translations[t_id - 1]
for t_id in pub["contains"]
if "MISSING" not in translations[t_id - 1]["work"]["title"]
]
pub["has_image"] = len(pub["images"]) > 0
if not pub["year_display"]:
pub["year_display"] = str(pub["year"])

pub["images"] = (
[{"id": img} for img in pub["images"].split(" ")] if len(pub["images"]) else []
)
pub["has_image"] = len(pub["images"]) > 0
if not pub["year_display"]:
pub["year_display"] = str(pub["year"])

for pid in pub["parents"]:
if "later" in publications[pid - 1]:
@@ -217,14 +238,49 @@ def del_empty_strings(o, field_names):
publications[pid - 1]["later"] = [i + 1]

del_empty_strings(pub, ["isbn", "parents", "publication_details"])
null_empty_strings(pub, ["year_display"])

# trim data a little
del pub["exemplar_suhrkamp_berlin"]
del pub["exemplar_oeaw"]
del pub["original_publication"]
del pub["zusatzinfos"]

logging.info("step 2: insert nested documents into typesense")
if args.json:
logging.info("removing orphans before json writeout")
for t in translations:
if all([t["id"] not in p["contains"] for p in publications]):
logging.info(f"deleting orphaned translation #{t['id']}")
for w in works:
if all([t["id"] != t["work"] for t in translations]):
logging.info(f"deleting orphaned work #{t['id']}")
for i in reversed(range(len(translators))):
if all([translators[i]["id"] not in t["translators"] for t in translations]):
logging.info(f"deleting orphaned translator #{translators[i]['id']}")
del translators[i]

logging.info("writing json to data-final/")

def dump_relational(name, data):
with open(f"data-final/{name}.json", "w") as file:
file.write(json.dumps(data, indent=4))

dump_relational("publications", publications)
dump_relational("translations", translations)
dump_relational("works", works)
dump_relational("translators", translators)
sys.exit(0)
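
Since the json branch keeps the data relational, a consumer of `data-final/` has to re-join records via the 1-based ids; a minimal sketch of what that could look like (assuming ids still equal list position + 1, which holds for publications, translations and works):

```python
import json

def load(name):
    with open(f"data-final/{name}.json") as f:
        return json.load(f)

publications = load("publications")
translations = load("translations")
works = load("works")

pub = publications[0]
contained = [translations[t_id - 1] for t_id in pub["contains"]]
originals = [works[t["work"] - 1] for t in contained]
```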


logging.info(f"loading typesense access data from {args.env}")
os.chdir(os.path.dirname(__file__))
load_dotenv(args.env)

# for typesense
for pub in publications:
pub["id"] = str(pub["id"])

logging.info("inserting nested documents into typesense")

if "TYPESENSE_ADMIN_API_KEY" not in os.environ:
fatal("Couldn't find typesense database information in environment files")
