# Patch summary (reviewer notes placed ahead of the first "diff --git" header,
# outside every hunk).
#
# * app/records/schemas/details.py: Record gains an optional `former_ref`
#   field that defaults to None.
# * app/sources/rosetta/api.py: parse_results() copies
#   parsed_data.former_identifier() onto record.former_ref.
# * app/sources/rosetta/lib/response_parser.py:
#     - strip_outside_tags() is removed; its "span.scopecontent" query is
#       inlined into strip_scope_and_content().
#     - strip_wrapper_and_split_span() is added: it joins the non-None .text
#       of each span.emph found under span.wrapper.
#     - description() now returns the first truthy result of
#       strip_scope_and_content(), strip_wrapper_and_split_span(), or the
#       raw description["value"].
#     - former_identifier() is added: it returns the "value" of the first
#       identifier entry whose "type" is "former reference (Department)",
#       or "" when no truthy match exists.
#
# NOTE(review): this patch had been flattened onto two lines. Line breaks and
# blank context lines were restored so that each hunk matches the counts in
# its @@ header; leading indentation was inferred from Python nesting and
# should be confirmed against the target files before applying.
# NOTE(review): the join separator in strip_wrapper_and_split_span() appeared
# as a raw line break inside the string literal; it is written here as the
# "\n" escape -- confirm against the upstream commit.
diff --git a/app/records/schemas/details.py b/app/records/schemas/details.py
index 5a781ab..0d99a52 100644
--- a/app/records/schemas/details.py
+++ b/app/records/schemas/details.py
@@ -18,6 +18,7 @@ def toJSON(self):
 class Record(Details):
     type: str = "record"
     ref: str | None = None
+    former_ref: str | None = None
     title: str = ""
     description: str = ""
     date: str = ""
diff --git a/app/sources/rosetta/api.py b/app/sources/rosetta/api.py
index dfb71ed..b31ed7a 100644
--- a/app/sources/rosetta/api.py
+++ b/app/sources/rosetta/api.py
@@ -88,6 +88,7 @@ def parse_results(self, raw_results, source_url):
         if parsed_data.type() == "record":
             record = Record(parsed_data.id())
             record.ref = parsed_data.identifier()
+            record.former_ref = parsed_data.former_identifier()
             record.title = parsed_data.title()
             record.description = parsed_data.description()
             record.date = parsed_data.date_range()
diff --git a/app/sources/rosetta/lib/response_parser.py b/app/sources/rosetta/lib/response_parser.py
index 48dcbdc..3f8010f 100644
--- a/app/sources/rosetta/lib/response_parser.py
+++ b/app/sources/rosetta/lib/response_parser.py
@@ -11,12 +11,15 @@ class RosettaSourceParser:
     def __init__(self, rosetta_data_source):
         self.source = rosetta_data_source
 
-    def strip_outside_tags(self, markup, query):
+    def strip_scope_and_content(self, markup):
         document = PyQuery(markup)
-        return str(document(query).contents().eq(0))
+        return str(document("span.scopecontent").contents().eq(0))
 
-    def strip_scope_and_content(self, markup):
-        return self.strip_outside_tags(markup, "span.scopecontent")
+    def strip_wrapper_and_split_span(self, markup):
+        document = PyQuery(markup)
+        spans = document("span.wrapper").find("span.emph")
+        contents = [span.text for span in spans if span.text is not None]
+        return "\n".join(contents)
 
     def type(self) -> str:
         if "@datatype" in self.source and "base" in self.source["@datatype"]:
@@ -275,9 +278,14 @@ def description(self) -> str:
                 None,
             ):
                 if "value" in description:
-                    return self.strip_scope_and_content(
-                        description["value"]
-                    )  # TODO: Breaks on C17371160
+                    # TODO: Breaks on C17371160
+                    return (
+                        self.strip_scope_and_content(description["value"])
+                        or self.strip_wrapper_and_split_span(
+                            description["value"]
+                        )
+                        or description["value"]
+                    )
                 elif (
                     "ephemera" in description
                     and "value" in description["ephemera"]
@@ -406,6 +414,21 @@ def identifier(self) -> str:
                 )
         return ""
 
+    def former_identifier(self) -> str:
+        if "identifier" in self.source:
+            if identifier := next(
+                (
+                    item["value"]
+                    for item in self.source["identifier"]
+                    if "type" in item
+                    and item["type"] == "former reference (Department)"
+                    and "value" in item
+                ),
+                None,
+            ):
+                return identifier
+        return ""
+
     def reference_number(self) -> str:
         if "identifier" in self.source:
             if reference_number := next(