diff --git a/app/__init__.py b/app/__init__.py index d2d88de..80c50da 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -15,10 +15,10 @@ def healthcheck(): from .records import routes as record_routes app.include_router( - record_routes.router, prefix=f"{base_uri}/records", tags=["records"] + record_routes.router, prefix=f"{base_uri}/records", tags=["Records"] ) app.include_router( - article_routes.router, prefix=f"{base_uri}/articles", tags=["articles"] + article_routes.router, prefix=f"{base_uri}/articles", tags=["Articles"] ) return app diff --git a/app/articles/routes.py b/app/articles/routes.py index 313faf8..26a29b3 100644 --- a/app/articles/routes.py +++ b/app/articles/routes.py @@ -40,7 +40,7 @@ async def index( website_api.add_parameter("order", "-first_published_at") elif order == "date:asc": website_api.add_parameter("order", "first_published_at") - results = website_api.get_results(page) + results = website_api.get_result(page) return results diff --git a/app/articles/schemas.py b/app/articles/schemas.py index adc36cf..3620c49 100644 --- a/app/articles/schemas.py +++ b/app/articles/schemas.py @@ -1,8 +1,8 @@ -from app.schemas import APIResponse, APIResult +from app.schemas import APISearchResponse, APISearchResult from pydantic import ConfigDict -class Article(APIResult): +class Article(APISearchResult): url: str = "" type: str = "" first_published: str = "" @@ -15,7 +15,7 @@ def toJSON(self): return self.__dict__ -class ArticleSearchResults(APIResponse): +class ArticleSearchResults(APISearchResponse): model_config = ConfigDict(arbitrary_types_allowed=True) results: list[Article] = [] diff --git a/app/sources/api.py b/app/lib/api.py similarity index 83% rename from app/sources/api.py rename to app/lib/api.py index 6678332..2f1ac0e 100644 --- a/app/sources/api.py +++ b/app/lib/api.py @@ -6,11 +6,13 @@ class BaseAPI(ABC): @abstractmethod - def get_results(self): + def get_result(self): pass class GetAPI(BaseAPI): + api_base_url: str + api_path: str = "/" results_per_page: int = 20 params: dict = {} @@ -24,6 +26,9 @@ def build_query_string(self) -> str: else "" ) + def build_url(self) -> str: + return f"{self.api_base_url}{self.api_path}{self.build_query_string()}" + def execute(self, url: str) -> dict: r = requests.get(url) if r.status_code == 404: diff --git a/app/records/routes.py b/app/records/routes.py index 2b3f1d7..16afb2c 100644 --- a/app/records/routes.py +++ b/app/records/routes.py @@ -1,8 +1,14 @@ from app.records import router from app.schemas import Filter -from app.sources.rosetta import RosettaRecords +from app.sources.rosetta import RosettaRecordDetails, RosettaRecordsSearch -from .schemas import RecordSearchResults +from .schemas import ( + ExternalRecord, + Record, + RecordArchive, + RecordCreator, + RecordSearchResults, +) @router.get("/") @@ -12,12 +18,12 @@ async def index( groups: str | None = None, highlight: bool | None = False, ) -> RecordSearchResults: - rosetta_api = RosettaRecords() + rosetta_api = RosettaRecordsSearch() rosetta_api.add_query(q) if groups: # group:(tna,digitised,nonTna,creator,archive) rosetta_api.add_parameter("filter", f"group:({groups})") - results = rosetta_api.get_results(page, highlight) + results = rosetta_api.get_result(page, highlight) return results @@ -46,3 +52,57 @@ async def filters() -> list[Filter]: filters.append(level_filter) return filters + + +@router.get("/{item_id}/") +async def item( + item_id: str, +): # ) -> Record | ExternalRecord | RecordCreator | RecordArchive: + rosetta_api = RosettaRecordDetails() + result = rosetta_api.get_result(item_id) + return result + + +@router.get("/external/") +async def external( + q: str = "", + page: int | None = 1, + highlight: bool | None = False, +) -> RecordSearchResults: + return index(q, page, "nonTna", highlight) + + +@router.get("/external/filters/") +async def external_filters() -> list[Filter]: + filters = [] + return filters + + +@router.get("/creators/") +async def creators( + q: str = "", + page: int | None = 1, + highlight: bool | None = False, +) -> RecordSearchResults: + return index(q, page, "creator", highlight) + + +@router.get("/creators/filters/") +async def creators_filters() -> list[Filter]: + filters = [] + return filters + + +@router.get("/archives/") +async def archives( + q: str = "", + page: int | None = 1, + highlight: bool | None = False, +) -> RecordSearchResults: + return index(q, page, "archive", highlight) + + +@router.get("/archives/filters/") +async def archives_filters() -> list[Filter]: + filters = [] + return filters diff --git a/app/records/schemas/__init__.py b/app/records/schemas/__init__.py new file mode 100644 index 0000000..ebe00b3 --- /dev/null +++ b/app/records/schemas/__init__.py @@ -0,0 +1,9 @@ +from .details import ( + Details, + ExternalRecord, + Record, + RecordArchive, + RecordCreator, + RecordCreatorPerson, +) +from .search import RecordSearchResult, RecordSearchResults diff --git a/app/records/schemas/details.py b/app/records/schemas/details.py new file mode 100644 index 0000000..0d0321e --- /dev/null +++ b/app/records/schemas/details.py @@ -0,0 +1,72 @@ +from pydantic import BaseModel + + +class Details(BaseModel): + type: str + id: str = "" + # dump: dict = {} # TEMP + + def __init__(self, id: str): + super().__init__() + self.id = id + + def toJSON(self): + return self.__dict__ + + +class Record(Details): + type: str = "record" + ref: str | None = None + title: str = "" + date: str = "" + is_digitised: bool | None = None + + def __init__(self, id: str): + super().__init__(id) + + +class ExternalRecord(Details): + type: str = "external_record" + ref: str | None = None + title: str = "" + covering_date: str | None = None + held_by: str | None = None + + def __init__(self, id: str): + super().__init__(id) + + +class RecordCreator(Details): + type: str = "creator" + name: str = "" + date: str = "" + places: list[str] = [] + identifier: str = "" + history: str = "" + + def __init__(self, id: str): + super().__init__(id) + + +class RecordCreatorPerson(RecordCreator): + type: str = "person" + name_parts: dict = {} + date: str = "" + gender: str = "" + functions: str = "" + biography: str = "" + + def __init__(self, id: str): + super().__init__(id) + + +class RecordArchive(Details): + type: str = "archive" + name: str = "" + archon: str = "" + places: list[str] = [] + agents: dict = {} + contact_info: dict = {} + + def __init__(self, id: str): + super().__init__(id) diff --git a/app/records/schemas.py b/app/records/schemas/search.py similarity index 61% rename from app/records/schemas.py rename to app/records/schemas/search.py index 64bd607..2929747 100644 --- a/app/records/schemas.py +++ b/app/records/schemas/search.py @@ -1,10 +1,8 @@ -import math - -from app.schemas import APIResponse, APIResult +from app.schemas import APISearchResponse, APISearchResult from pydantic import ConfigDict -class Record(APIResult): +class RecordSearchResult(APISearchResult): ref: str | None = None covering_date: str | None = None held_by: str | None = None @@ -16,7 +14,7 @@ def toJSON(self): return self.__dict__ -class RecordSearchResults(APIResponse): +class RecordSearchResults(APISearchResponse): model_config = ConfigDict(arbitrary_types_allowed=True) - results: list[Record] = [] + results: list[RecordSearchResult] = [] diff --git a/app/schemas/__init__.py b/app/schemas/__init__.py index 0797a0c..1313d47 100644 --- a/app/schemas/__init__.py +++ b/app/schemas/__init__.py @@ -1,3 +1,2 @@ -from .api_response import APIResponse -from .api_result import APIResult -from .filter import Filter +from .common_filters_api import Filter +from .common_search_api import APISearchResponse, APISearchResult diff --git a/app/schemas/api_result.py b/app/schemas/api_result.py deleted file mode 100644 index 1c37c01..0000000 --- a/app/schemas/api_result.py +++ /dev/null @@ -1,10 +0,0 @@ -from pydantic import BaseModel - - -class APIResult(BaseModel): - title: str | None = None - id: int | str = "" - description: str | None = None - - def toJSON(self): - return self.__dict__ diff --git a/app/schemas/filter.py b/app/schemas/common_filters_api.py similarity index 100% rename from app/schemas/filter.py rename to app/schemas/common_filters_api.py diff --git a/app/schemas/api_response.py b/app/schemas/common_search_api.py similarity index 84% rename from app/schemas/api_response.py rename to app/schemas/common_search_api.py index cf645ba..d0b2414 100644 --- a/app/schemas/api_response.py +++ b/app/schemas/common_search_api.py @@ -3,10 +3,17 @@ from fastapi import HTTPException from pydantic import BaseModel, ConfigDict -from .api_result import APIResult +class APISearchResult(BaseModel): + title: str | None = None + id: int | str = "" + description: str | None = None -class APIResponse(BaseModel): + def toJSON(self): + return self.__dict__ + + +class APISearchResponse(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) count: int = 0 @@ -15,7 +22,7 @@ class APIResponse(BaseModel): result_range_min: int = 0 result_range_max: int = 0 results_per_page: int = 0 - results: list[APIResult] = [] + results: list[APISearchResult] = [] def get_pages(self): return ( diff --git a/app/sources/rosetta.py b/app/sources/rosetta.py deleted file mode 100644 index cef034e..0000000 --- a/app/sources/rosetta.py +++ /dev/null @@ -1,58 +0,0 @@ -import math - -from app.records.schemas import Record, RecordSearchResults -from config import Config - -from .api import GetAPI - - -class RosettaRecords(GetAPI): - def __init__(self): - self.api_url = Config().ROSETTA_API_URL - - def add_query(self, query_string: str) -> None: - self.add_parameter("q", query_string) - - def get_results( - self, page: int | None = 1, highlight: bool | None = False - ) -> dict: - offset = (page - 1) * self.results_per_page - self.add_parameter("size", self.results_per_page) - self.add_parameter("from", offset) - url = f"{self.api_url}/search{self.build_query_string()}" - raw_results = self.execute(url) - response = RecordSearchResults() - print(url) - for r in raw_results["metadata"]: - record = Record() - record.id = r["id"] - details = r["detail"]["@template"]["details"] - record.ref = ( - details["referenceNumber"] - if "referenceNumber" in details - else None - ) - record.title = ( - details["summaryTitle"] if "summaryTitle" in details else None - ) - record.description = ( - details["description"][0] if "description" in details else None - ) - record.covering_date = ( - details["dateCovering"] if "dateCovering" in details else None - ) - record.held_by = details["heldBy"] if "heldBy" in details else None - # if highlight and "highLight" in r: - # if "@template.details.summaryTitle" in r["highLight"]: - # record.title = r["highLight"]["@template.details.summaryTitle"][0] - # if "@template.details.description" in r["highLight"]: - # record.title = r["highLight"]["@template.details.description"][0] - response.results.append(record) - response.count = ( - raw_results["stats"]["total"] - if raw_results["stats"]["total"] <= 10000 - else 10000 - ) - response.results_per_page = self.results_per_page - response.page = page - return response.toJSON() if response.page_in_range() else {} diff --git a/app/sources/rosetta/__init__.py b/app/sources/rosetta/__init__.py new file mode 100644 index 0000000..cc5378f --- /dev/null +++ b/app/sources/rosetta/__init__.py @@ -0,0 +1 @@ +from .api import RosettaRecordDetails, RosettaRecordsSearch diff --git a/app/sources/rosetta/api.py b/app/sources/rosetta/api.py new file mode 100644 index 0000000..5fa4193 --- /dev/null +++ b/app/sources/rosetta/api.py @@ -0,0 +1,154 @@ +from app.lib.api import GetAPI +from app.records.schemas import ( + ExternalRecord, + Record, + RecordArchive, + RecordCreator, + RecordCreatorPerson, + RecordSearchResult, + RecordSearchResults, +) +from config import Config + +from .lib import RosettaResponseParser + + +class RosettaRecords(GetAPI): + def __init__(self): + self.api_base_url = Config().ROSETTA_API_URL + + +class RosettaRecordsSearch(RosettaRecords): + def __init__(self): + super().__init__() + self.api_path = "/search" + + def add_query(self, query_string: str) -> None: + self.add_parameter("q", query_string) + + def get_result( + self, page: int | None = 1, highlight: bool | None = False + ) -> dict: + offset = (page - 1) * self.results_per_page + self.add_parameter("size", self.results_per_page) + self.add_parameter("from", offset) + url = self.build_url() + print(url) + raw_results = self.execute(url) + return self.parse_results(raw_results, page) + + def parse_results(self, raw_results, page): + response = RecordSearchResults() + for r in raw_results["metadata"]: + record = RecordSearchResult() + record.id = r["id"] + details = r["detail"]["@template"]["details"] + record.ref = ( + details["referenceNumber"] + if "referenceNumber" in details + else None + ) + record.title = ( + details["summaryTitle"] if "summaryTitle" in details else None + ) + record.description = ( + details["description"][0] if "description" in details else None + ) + record.covering_date = ( + details["dateCovering"] if "dateCovering" in details else None + ) + record.held_by = details["heldBy"] if "heldBy" in details else None + # if highlight and "highLight" in r: + # if "@template.details.summaryTitle" in r["highLight"]: + # record.title = r["highLight"]["@template.details.summaryTitle"][0] + # if "@template.details.description" in r["highLight"]: + # record.title = r["highLight"]["@template.details.description"][0] + response.results.append(record) + response.count = ( + raw_results["stats"]["total"] + if raw_results["stats"]["total"] <= 10000 + else 10000 + ) + response.results_per_page = self.results_per_page + response.page = page + return response.toJSON() if response.page_in_range() else {} + + +class RosettaRecordDetails(RosettaRecords): + def __init__(self): + super().__init__() + self.api_path = "/fetch" + + def get_result(self, id: str) -> dict: + self.add_parameter("id", id) + self.add_parameter("includeSource", True) + url = self.build_url() + print(url) + raw_results = self.execute(url) + return self.parse_results(raw_results) + + def parse_results(self, raw_results): + parsed_data = RosettaResponseParser(raw_results) + # dump = { + # "actual_type": parsed_data.actual_type(), + # "type": parsed_data.type(), + # "title": parsed_data.title(), + # "name": parsed_data.name(), + # "names": parsed_data.names(), + # "date": parsed_data.date(), + # "lifespan": parsed_data.lifespan(), + # "date_range": parsed_data.date_range(), + # "places": parsed_data.places(), + # "gender": parsed_data.gender(), + # "contact_info": parsed_data.contact_info(), + # "description": parsed_data.description(), + # "functions": parsed_data.functions(), + # "history": parsed_data.history(), + # "biography": parsed_data.biography(), + # "identifier": parsed_data.identifier(), + # "reference_number": parsed_data.reference_number(), + # # 'agents': parsed_data.agents() + # } + if parsed_data.type() == "record": + # TODO: ExternalRecord + record = Record(parsed_data.id()) + record.ref = "" + record.title = parsed_data.title() + record.date = parsed_data.date_range() + record.is_digitised = parsed_data.is_digitised() + # record.dump = dump + return record.toJSON() + if ( + parsed_data.type() == "archive" + or parsed_data.type() == "repository" + ): + # return raw_results + record = RecordArchive(parsed_data.id()) + record.name = parsed_data.title() + record.archon = parsed_data.reference_number() + record.places = parsed_data.places() + record.contact_info = parsed_data.contact_info() + record.agents = parsed_data.agents() + # record.dump = dump + return record.toJSON() + if parsed_data.type() == "agent": + if parsed_data.actual_type() == "person": + record = RecordCreatorPerson(parsed_data.id()) + record.name = parsed_data.name() + record.name_parts = parsed_data.names() + record.date = parsed_data.lifespan() + record.gender = parsed_data.gender() + record.identifier = parsed_data.identifier() + record.functions = parsed_data.functions() + record.history = parsed_data.functions() + record.biography = parsed_data.biography() + return record.toJSON() + record = RecordCreator(parsed_data.id()) + record.name = parsed_data.title() + record.date = parsed_data.date() + record.places = parsed_data.places() + record.identifier = parsed_data.identifier() + record.history = parsed_data.functions() + # record.dump = dump + return record.toJSON() + return {} diff --git a/app/sources/rosetta/lib/__init__.py b/app/sources/rosetta/lib/__init__.py new file mode 100644 index 0000000..211d6b8 --- /dev/null +++ b/app/sources/rosetta/lib/__init__.py @@ -0,0 +1 @@ +from .response_parser import RosettaResponseParser diff --git a/app/sources/rosetta/lib/response_parser.py b/app/sources/rosetta/lib/response_parser.py new file mode 100644 index 0000000..35f0893 --- /dev/null +++ b/app/sources/rosetta/lib/response_parser.py @@ -0,0 +1,417 @@ +from pyquery import PyQuery + + +class RosettaResponseParser: + def __init__(self, rosetta_data): + self.data = rosetta_data + self.source = self.data["metadata"][0]["_source"] + + def strip_outside_tags(self, markup, query): + document = PyQuery(markup) + return str(document(query).contents().eq(0)) + + def strip_scope_and_content(self, markup): + return self.strip_outside_tags(markup, "span.scopecontent") + + def type(self) -> str: + if "@datatype" in self.source and "base" in self.source["@datatype"]: + return self.source["@datatype"]["base"] + return "" + + def actual_type(self) -> str: + if "@datatype" in self.source and "actual" in self.source["@datatype"]: + return self.source["@datatype"]["actual"] + return "" + + def id(self) -> str: + if "@admin" in self.source: + if "id" in self.source["@admin"]: + return self.source["@admin"]["id"] + return "UNKNOWN" + + def uuid(self) -> str: + if "@admin" in self.source: + if "uuid" in self.source["@admin"]: + return self.source["@admin"]["uuid"] + return "" + + def is_digitised(self) -> bool: + if "digitised" in self.source: + return self.source["digitised"] + return False + + def title(self) -> str: + if "title" in self.source: + if primary_title := next( + ( + item["value"] + for item in self.source["title"] + if "primary" in item and item["primary"] and "value" in item + ), + None, + ): + return primary_title + if name := self.name(): + return name + if description := self.description(): + return description + return "" + + def name(self) -> str: + names = self.names() + if "name" in names: + return names["name"] + return "" + + def names(self) -> dict: + names = {} + if "name" in self.source: + if name_data := next( + ( + item + for item in self.source["name"] + if "primary" in item and item["primary"] + ), + None, + ): + full_name = [] + if "title_prefix" in name_data: + names["prefix"] = name_data["title_prefix"] + full_name.append(names["prefix"]) + if "first" in name_data: + names["forenames"] = name_data["first"] + full_name.append(" ".join(name_data["first"])) + if "last" in name_data: + names["surname"] = name_data["last"] + full_name.append(names["surname"]) + if "title" in name_data: + names["title"] = name_data["title"] + if full_name: + names["name"] = " ".join(full_name) + if aka := next( + ( + item["value"] + for item in self.source["name"] + if "type" in item and item["type"] == "also known as" + ), + None, + ): + names["Alternative name(s)"] = aka + return names + + def date(self) -> str: + return self.lifespan() or self.date_range() or "" + + def lifespan(self) -> str: + if "birth" in self.source or "death" in self.source: + date_from = ( + self.source["birth"]["date"]["value"] + if "birth" in self.source + and "date" in self.source["birth"] + and "value" in self.source["birth"]["date"] + else "" + ) + date_to = ( + self.source["death"]["date"]["value"] + if "death" in self.source + and "date" in self.source["death"] + and "value" in self.source["death"]["date"] + else "" + ) + return f"{date_from}–{date_to}" + return "" + + def date_range(self) -> str: + date_from = ( + next( + ( + item["value"] + for item in self.source["start"]["date"] + if "primary" in item and item["primary"] and "value" in item + ), + None, + ) + if "start" in self.source and "date" in self.source["start"] + else "" + ) + date_to = ( + next( + ( + item["value"] + for item in self.source["end"]["date"] + if "primary" in item and item["primary"] and "value" in item + ), + None, + ) + if "end" in self.source and "date" in self.source["end"] + else "" + ) + if date_from or date_to: + return f"{date_from}–{date_to}" + return "" + + def places(self) -> list[str]: + places = [] + if "place" in self.source: + for place in self.source["place"]: + place_address = [] + if "name" in place: + place_address = ", ".join( + [ + place_name["value"] + for place_name in place["name"] + if "value" in place_name + ] + ) + else: + if "town" in place and "name" in place["town"]: + towns = [ + town["value"] + for town in place["town"]["name"] + if "value" in town + ] + place_address.append(", ".join(towns)) + if "region" in place and "name" in place["region"]: + regions = [ + region["value"] + for region in place["region"]["name"] + if "value" in region + ] + place_address.append(", ".join(regions)) + if "county" in place and "name" in place["county"]: + counties = [ + county["value"] + for county in place["county"]["name"] + if "value" in county + ] + place_address.append(", ".join(counties)) + if "country" in place and "name" in place["country"]: + countries = [ + country["value"] + for country in place["country"]["name"] + if "value" in country + ] + place_address.append(", ".join(countries)) + places.append(place_address) + return places + + def gender(self) -> str: + if "gender" in self.source: + return ( + "Male" + if self.source["gender"] == "M" + else "Female" + if self.source["gender"] == "F" + else self.source["gender"] + ) + return "" + + def contact_info(self) -> dict: + if "description" in self.source: + if ephemera := next( + ( + item["ephemera"]["value"] + for item in self.source["description"] + if "ephemera" in item + and "primary" in item + and item["primary"] + ), + None, + ): + document = PyQuery(ephemera) + return { + "address_line_1": document("addressline1").text(), + "town": document("addresstown").text(), + "postcode": document("postcode").text(), + "country": document("addresscountry").text(), + "map_url": document("mapURL").text(), + "url": document("url").text(), + "phone": document("telephone").text(), + "fax": document("fax").text(), + "email": document("email").text(), + } + return {} + + def description(self) -> str: + if "description" in self.source: + if description := next( + ( + item + for item in self.source["description"] + if "primary" in item and item["primary"] + ), + None, + ): + if "value" in description: + return self.strip_scope_and_content(description["value"]) + elif ( + "ephemera" in description + and "value" in description["ephemera"] + ): + document = PyQuery(description["ephemera"]["value"]) + for tag in ("foa", "function", "address"): + if doc_value := document(tag).text(): + return doc_value + return description + return "" + + def functions(self) -> str: + if "description" in self.source: + functions = next( + ( + item + for item in self.source["description"] + if "type" in item + and item["type"] == "functions, occupations and activities" + ), + None, + ) + if functions and "value" in functions: + document = PyQuery(functions["value"]) + for tag in ("foa", "function", "address"): + if doc_value := document(tag).text(): + return doc_value + return functions["value"] + return "" + + def epithet(self) -> str: + if "description" in self.source: + epithet = next( + ( + item + for item in self.source["description"] + if "type" in item and item["type"] == "epithet" + ), + None, + ) + if epithet and "value" in epithet: + return epithet["value"] + return "" + + def history(self) -> str: + if "description" in self.source: + history = next( + ( + item + for item in self.source["description"] + if "type" in item and item["type"] == "history" + ), + None, + ) + if history and "value" in history: + document = PyQuery(history["value"]) + for tag in ("foa", "function"): + if doc_value := document(tag).text(): + return doc_value + return history["value"] + return "" + + def biography(self) -> str: + if "description" in self.source: + biography = next( + ( + item + for item in self.source["description"] + if "type" in item and item["type"] == "biography" + ), + None, + ) + if biography and "value" in biography and "url" in biography: + url = biography["url"] + text = biography["value"] + url = f'{text}' + return url + return "" + + def identifier(self) -> str: + if "identifier" in self.source: + primary_identifier = next( + ( + item["value"] + for item in self.source["identifier"] + if "type" in item + and item["type"] == "name authority reference" + and "value" in item + ), + None, + ) + former_identifier = next( + ( + item["value"] + for item in self.source["identifier"] + if "type" in item + and item["type"] == "former name authority reference" + and "value" in item + ), + None, + ) + return ( + f"{primary_identifier} (Former ISAAR ref: {former_identifier})" + if former_identifier + else primary_identifier + ) + return "" + + def reference_number(self) -> str: + if "identifier" in self.source: + if reference_number := next( + ( + item["value"] + for item in self.source["identifier"] + if "type" in item and item["type"] == "reference number" + ), + None, + ): + return reference_number + return "" + + def agents(self) -> dict: + agents = { + "businesses": [], + "diaries": [], + "families": [], + "organisations": [], + "persons": [], + } + if "agent" in self.source: + for agent in self.source["agent"]: + if archon_number := next( + ( + item["value"] + for item in agent["identifier"] + if "type" in item and item["type"] == "Archon number" + ), + None, + ): + id = ( + agent["@admin"]["id"] + if "@admin" in agent and "id" in agent["@admin"] + else "" + ) + name = ( + agent["name"]["value"] + if "name" in agent and "value" in agent["name"] + else "" + ) + if id and name: + places = ( + [ + item["value"] + for item in agent["place"]["name"] + if "value" in item + ] + if "place" in agent and "name" in agent["place"] + else [] + ) + agent_data = {"id": id, "name": name, "places": places} + if archon_number == "B": + agents["businesses"].append(agent_data) + if archon_number == "D": + agents["diaries"].append(agent_data) + if archon_number == "F": + agents["families"].append(agent_data) + if archon_number == "O": + agents["organisations"].append(agent_data) + if archon_number == "P": + agents["persons"].append(agent_data) + return agents diff --git a/app/sources/website.py b/app/sources/website.py index ffb3da5..f53c377 100644 --- a/app/sources/website.py +++ b/app/sources/website.py @@ -1,16 +1,15 @@ import math from app.articles.schemas import Article, ArticleSearchResults +from app.lib.api import GetAPI from config import Config -from .api import GetAPI - class WagtailAPI(GetAPI): def __init__(self): self.api_url = Config().WAGTAIL_API_URL - def get_results(self) -> dict: + def get_result(self) -> dict: url = f"{self.api_url}/pages/{self.build_query_string()}" return self.execute(url) @@ -19,7 +18,7 @@ class WebsiteArticles(WagtailAPI): def add_query(self, query_string: str) -> None: self.add_parameter("search", query_string) - def get_results(self, page: int | None = 1) -> dict: + def get_result(self, page: int | None = 1) -> dict: offset = (page - 1) * self.results_per_page self.add_parameter("offset", offset) self.add_parameter("limit", self.results_per_page) @@ -56,7 +55,7 @@ def get_time_periods(): # api.results_per_page = 100 # TODO: Make higher api.params = {} # TODO: Why isn't this blank by default? api.add_parameter("child_of", 54) # TODO: Make variable - results = api.get_results() + results = api.get_result() time_periods = [ {"name": time_period["title"], "value": time_period["id"]} for time_period in results["items"] @@ -69,7 +68,7 @@ def get_topics(): # api.results_per_page = 100 # TODO: Make higher api.params = {} # TODO: Why isn't this blank by default? api.add_parameter("child_of", 53) # TODO: Make variable - results = api.get_results() + results = api.get_result() topics = [ {"name": topic["title"], "value": topic["id"]} for topic in results["items"] diff --git a/poetry.lock b/poetry.lock index ea97633..272e99e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -210,6 +210,17 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "cssselect" +version = "1.2.0" +description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0" +optional = false +python-versions = ">=3.7" +files = [ + {file = "cssselect-1.2.0-py2.py3-none-any.whl", hash = "sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e"}, + {file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"}, +] + [[package]] name = "fastapi" version = "0.108.0" @@ -270,6 +281,99 @@ files = [ [package.extras] colors = ["colorama (>=0.4.6)"] +[[package]] +name = "lxml" +version = "5.1.0" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +optional = false +python-versions = ">=3.6" +files = [ + {file = "lxml-5.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:704f5572ff473a5f897745abebc6df40f22d4133c1e0a1f124e4f2bd3330ff7e"}, + {file = "lxml-5.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9d3c0f8567ffe7502d969c2c1b809892dc793b5d0665f602aad19895f8d508da"}, + {file = "lxml-5.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5fcfbebdb0c5d8d18b84118842f31965d59ee3e66996ac842e21f957eb76138c"}, + {file = "lxml-5.1.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2f37c6d7106a9d6f0708d4e164b707037b7380fcd0b04c5bd9cae1fb46a856fb"}, + {file = "lxml-5.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2befa20a13f1a75c751f47e00929fb3433d67eb9923c2c0b364de449121f447c"}, + {file = "lxml-5.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22b7ee4c35f374e2c20337a95502057964d7e35b996b1c667b5c65c567d2252a"}, + {file = "lxml-5.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf8443781533b8d37b295016a4b53c1494fa9a03573c09ca5104550c138d5c05"}, + {file = "lxml-5.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:82bddf0e72cb2af3cbba7cec1d2fd11fda0de6be8f4492223d4a268713ef2147"}, + {file = "lxml-5.1.0-cp310-cp310-win32.whl", hash = "sha256:b66aa6357b265670bb574f050ffceefb98549c721cf28351b748be1ef9577d93"}, + {file = "lxml-5.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:4946e7f59b7b6a9e27bef34422f645e9a368cb2be11bf1ef3cafc39a1f6ba68d"}, + {file = "lxml-5.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:14deca1460b4b0f6b01f1ddc9557704e8b365f55c63070463f6c18619ebf964f"}, + {file = "lxml-5.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ed8c3d2cd329bf779b7ed38db176738f3f8be637bb395ce9629fc76f78afe3d4"}, + {file = "lxml-5.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:436a943c2900bb98123b06437cdd30580a61340fbdb7b28aaf345a459c19046a"}, + {file = "lxml-5.1.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:acb6b2f96f60f70e7f34efe0c3ea34ca63f19ca63ce90019c6cbca6b676e81fa"}, + {file = "lxml-5.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af8920ce4a55ff41167ddbc20077f5698c2e710ad3353d32a07d3264f3a2021e"}, + {file = "lxml-5.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7cfced4a069003d8913408e10ca8ed092c49a7f6cefee9bb74b6b3e860683b45"}, + {file = "lxml-5.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9e5ac3437746189a9b4121db2a7b86056ac8786b12e88838696899328fc44bb2"}, + {file = "lxml-5.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f4c9bda132ad108b387c33fabfea47866af87f4ea6ffb79418004f0521e63204"}, + {file = "lxml-5.1.0-cp311-cp311-win32.whl", hash = "sha256:bc64d1b1dab08f679fb89c368f4c05693f58a9faf744c4d390d7ed1d8223869b"}, + {file = "lxml-5.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:a5ab722ae5a873d8dcee1f5f45ddd93c34210aed44ff2dc643b5025981908cda"}, + {file = "lxml-5.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9aa543980ab1fbf1720969af1d99095a548ea42e00361e727c58a40832439114"}, + {file = "lxml-5.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6f11b77ec0979f7e4dc5ae081325a2946f1fe424148d3945f943ceaede98adb8"}, + {file = "lxml-5.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a36c506e5f8aeb40680491d39ed94670487ce6614b9d27cabe45d94cd5d63e1e"}, + {file = "lxml-5.1.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f643ffd2669ffd4b5a3e9b41c909b72b2a1d5e4915da90a77e119b8d48ce867a"}, + {file = "lxml-5.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16dd953fb719f0ffc5bc067428fc9e88f599e15723a85618c45847c96f11f431"}, + {file = "lxml-5.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:16018f7099245157564d7148165132c70adb272fb5a17c048ba70d9cc542a1a1"}, + {file = "lxml-5.1.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:82cd34f1081ae4ea2ede3d52f71b7be313756e99b4b5f829f89b12da552d3aa3"}, + {file = "lxml-5.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:19a1bc898ae9f06bccb7c3e1dfd73897ecbbd2c96afe9095a6026016e5ca97b8"}, + {file = "lxml-5.1.0-cp312-cp312-win32.whl", hash = "sha256:13521a321a25c641b9ea127ef478b580b5ec82aa2e9fc076c86169d161798b01"}, + {file = "lxml-5.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:1ad17c20e3666c035db502c78b86e58ff6b5991906e55bdbef94977700c72623"}, + {file = "lxml-5.1.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:24ef5a4631c0b6cceaf2dbca21687e29725b7c4e171f33a8f8ce23c12558ded1"}, + {file = "lxml-5.1.0-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8d2900b7f5318bc7ad8631d3d40190b95ef2aa8cc59473b73b294e4a55e9f30f"}, + {file = "lxml-5.1.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:601f4a75797d7a770daed8b42b97cd1bb1ba18bd51a9382077a6a247a12aa38d"}, + {file = "lxml-5.1.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4b68c961b5cc402cbd99cca5eb2547e46ce77260eb705f4d117fd9c3f932b95"}, + {file = "lxml-5.1.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:afd825e30f8d1f521713a5669b63657bcfe5980a916c95855060048b88e1adb7"}, + {file = "lxml-5.1.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:262bc5f512a66b527d026518507e78c2f9c2bd9eb5c8aeeb9f0eb43fcb69dc67"}, + {file = "lxml-5.1.0-cp36-cp36m-win32.whl", hash = "sha256:e856c1c7255c739434489ec9c8aa9cdf5179785d10ff20add308b5d673bed5cd"}, + {file = "lxml-5.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:c7257171bb8d4432fe9d6fdde4d55fdbe663a63636a17f7f9aaba9bcb3153ad7"}, + {file = "lxml-5.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b9e240ae0ba96477682aa87899d94ddec1cc7926f9df29b1dd57b39e797d5ab5"}, + {file = "lxml-5.1.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a96f02ba1bcd330807fc060ed91d1f7a20853da6dd449e5da4b09bfcc08fdcf5"}, + {file = "lxml-5.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e3898ae2b58eeafedfe99e542a17859017d72d7f6a63de0f04f99c2cb125936"}, + {file = "lxml-5.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61c5a7edbd7c695e54fca029ceb351fc45cd8860119a0f83e48be44e1c464862"}, + {file = "lxml-5.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3aeca824b38ca78d9ee2ab82bd9883083d0492d9d17df065ba3b94e88e4d7ee6"}, + {file = "lxml-5.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8f52fe6859b9db71ee609b0c0a70fea5f1e71c3462ecf144ca800d3f434f0764"}, + {file = "lxml-5.1.0-cp37-cp37m-win32.whl", hash = "sha256:d42e3a3fc18acc88b838efded0e6ec3edf3e328a58c68fbd36a7263a874906c8"}, + {file = "lxml-5.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:eac68f96539b32fce2c9b47eb7c25bb2582bdaf1bbb360d25f564ee9e04c542b"}, + {file = "lxml-5.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ae15347a88cf8af0949a9872b57a320d2605ae069bcdf047677318bc0bba45b1"}, + {file = "lxml-5.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c26aab6ea9c54d3bed716b8851c8bfc40cb249b8e9880e250d1eddde9f709bf5"}, + {file = "lxml-5.1.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:342e95bddec3a698ac24378d61996b3ee5ba9acfeb253986002ac53c9a5f6f84"}, + {file = "lxml-5.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:725e171e0b99a66ec8605ac77fa12239dbe061482ac854d25720e2294652eeaa"}, + {file = "lxml-5.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d184e0d5c918cff04cdde9dbdf9600e960161d773666958c9d7b565ccc60c45"}, + {file = "lxml-5.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:98f3f020a2b736566c707c8e034945c02aa94e124c24f77ca097c446f81b01f1"}, + {file = "lxml-5.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6d48fc57e7c1e3df57be5ae8614bab6d4e7b60f65c5457915c26892c41afc59e"}, + {file = "lxml-5.1.0-cp38-cp38-win32.whl", hash = "sha256:7ec465e6549ed97e9f1e5ed51c657c9ede767bc1c11552f7f4d022c4df4a977a"}, + {file = "lxml-5.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:b21b4031b53d25b0858d4e124f2f9131ffc1530431c6d1321805c90da78388d1"}, + {file = "lxml-5.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:52427a7eadc98f9e62cb1368a5079ae826f94f05755d2d567d93ee1bc3ceb354"}, + {file = "lxml-5.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6a2a2c724d97c1eb8cf966b16ca2915566a4904b9aad2ed9a09c748ffe14f969"}, + {file = "lxml-5.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:843b9c835580d52828d8f69ea4302537337a21e6b4f1ec711a52241ba4a824f3"}, + {file = "lxml-5.1.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b99f564659cfa704a2dd82d0684207b1aadf7d02d33e54845f9fc78e06b7581"}, + {file = "lxml-5.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f8b0c78e7aac24979ef09b7f50da871c2de2def043d468c4b41f512d831e912"}, + {file = "lxml-5.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9bcf86dfc8ff3e992fed847c077bd875d9e0ba2fa25d859c3a0f0f76f07f0c8d"}, + {file = "lxml-5.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:49a9b4af45e8b925e1cd6f3b15bbba2c81e7dba6dce170c677c9cda547411e14"}, + {file = "lxml-5.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:280f3edf15c2a967d923bcfb1f8f15337ad36f93525828b40a0f9d6c2ad24890"}, + {file = "lxml-5.1.0-cp39-cp39-win32.whl", hash = "sha256:ed7326563024b6e91fef6b6c7a1a2ff0a71b97793ac33dbbcf38f6005e51ff6e"}, + {file = "lxml-5.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:8d7b4beebb178e9183138f552238f7e6613162a42164233e2bda00cb3afac58f"}, + {file = "lxml-5.1.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9bd0ae7cc2b85320abd5e0abad5ccee5564ed5f0cc90245d2f9a8ef330a8deae"}, + {file = "lxml-5.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8c1d679df4361408b628f42b26a5d62bd3e9ba7f0c0e7969f925021554755aa"}, + {file = "lxml-5.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:2ad3a8ce9e8a767131061a22cd28fdffa3cd2dc193f399ff7b81777f3520e372"}, + {file = "lxml-5.1.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:304128394c9c22b6569eba2a6d98392b56fbdfbad58f83ea702530be80d0f9df"}, + {file = "lxml-5.1.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d74fcaf87132ffc0447b3c685a9f862ffb5b43e70ea6beec2fb8057d5d2a1fea"}, + {file = "lxml-5.1.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:8cf5877f7ed384dabfdcc37922c3191bf27e55b498fecece9fd5c2c7aaa34c33"}, + {file = "lxml-5.1.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:877efb968c3d7eb2dad540b6cabf2f1d3c0fbf4b2d309a3c141f79c7e0061324"}, + {file = "lxml-5.1.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f14a4fb1c1c402a22e6a341a24c1341b4a3def81b41cd354386dcb795f83897"}, + {file = "lxml-5.1.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:25663d6e99659544ee8fe1b89b1a8c0aaa5e34b103fab124b17fa958c4a324a6"}, + {file = "lxml-5.1.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:8b9f19df998761babaa7f09e6bc169294eefafd6149aaa272081cbddc7ba4ca3"}, + {file = "lxml-5.1.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e53d7e6a98b64fe54775d23a7c669763451340c3d44ad5e3a3b48a1efbdc96f"}, + {file = "lxml-5.1.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c3cd1fc1dc7c376c54440aeaaa0dcc803d2126732ff5c6b68ccd619f2e64be4f"}, + {file = "lxml-5.1.0.tar.gz", hash = "sha256:3eea6ed6e6c918e468e693c41ef07f3c3acc310b70ddd9cc72d9ef84bc9564ca"}, +] + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] +source = ["Cython (>=3.0.7)"] + [[package]] name = "mccabe" version = "0.7.0" @@ -487,6 +591,24 @@ files = [ {file = "pyflakes-3.1.0.tar.gz", hash = "sha256:a0aae034c444db0071aa077972ba4768d40c830d9539fd45bf4cd3f8f6992efc"}, ] +[[package]] +name = "pyquery" +version = "2.0.0" +description = "A jquery-like library for python" +optional = false +python-versions = "*" +files = [ + {file = "pyquery-2.0.0-py3-none-any.whl", hash = "sha256:8dfc9b4b7c5f877d619bbae74b1898d5743f6ca248cfd5d72b504dd614da312f"}, + {file = "pyquery-2.0.0.tar.gz", hash = "sha256:963e8d4e90262ff6d8dec072ea97285dc374a2f69cad7776f4082abcf6a1d8ae"}, +] + +[package.dependencies] +cssselect = ">=1.2.0" +lxml = ">=2.1" + +[package.extras] +test = ["pytest", "pytest-cov", "requests", "webob", "webtest"] + [[package]] name = "requests" version = "2.31.0" @@ -566,4 +688,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "1249153cef63613f45cd3ae579d320421f38024c4353878ae01dbb8f05b088c5" +content-hash = "09eca8816abb57a7e5f16e71bff23fc6fb9830884fe7ab868ffad03e621eb959" diff --git a/pyproject.toml b/pyproject.toml index 7500aa7..2070802 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ flake8 = "^6.1.0" isort = "^5.12.0" requests = "^2.31.0" fastapi = "^0.108.0" +pyquery = "^2.0.0" [build-system] requires = ["poetry-core"]