From ab60932edc2eb97f06fecb8133c00fe5cd27e525 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Thu, 21 Sep 2023 15:29:59 +0000 Subject: [PATCH 01/30] add ngram utility functions --- ebl/common/query/util.py | 51 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/ebl/common/query/util.py b/ebl/common/query/util.py index ee71de18c..9c08d2023 100644 --- a/ebl/common/query/util.py +++ b/ebl/common/query/util.py @@ -1,4 +1,4 @@ -from typing import Union, Dict +from typing import Union, Dict, Sequence def flatten_field(input_: Union[str, Dict], depth=1) -> Dict: @@ -15,9 +15,9 @@ def drop_duplicates(input_: Union[str, Dict]) -> Dict: return {"$setUnion": [input_, []]} -def ngrams(input_: Union[str, Dict], n) -> Dict: - if n <= 1: - raise ValueError("ngram size must be 2 or more") +def ngrams(input_: Union[str, Dict], n: int) -> Dict: + if n <= 0: + raise ValueError("ngram size must be 1 or more") return { "$zip": { "inputs": [ @@ -39,3 +39,46 @@ def ngrams(input_: Union[str, Dict], n) -> Dict: def filter_array(input_, as_, cond) -> Dict: return {"$filter": {"input": input_, "as": as_, "cond": cond}} + + +def add_ngram_field( + input_: Union[str, Dict], + N: Sequence[int], + output_: str = "ngrams", +): + signs_to_exclude = ["X", ""] + + exclude_empty = { + "$eq": [ + { + "$size": { + "$setIntersection": [ + "$$this", + signs_to_exclude, + ] + } + }, + 0, + ] + } + return { + "$addFields": { + output_: drop_duplicates( + filter_array( + {"$concatArrays": [ngrams(input_, n) for n in N if n > 0]}, + "this", + exclude_empty, + ) + ) + } + } + + +def replace_all(input_: Union[str, Dict], old: str, new: str): + return { + "$replaceAll": { + "input": input_, + "find": old, + "replacement": new, + } + } From 8fdf2157b8d5f31368f58a03d68fdf0ce42b62a6 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Thu, 21 Sep 2023 15:30:13 +0000 Subject: [PATCH 02/30] add ngram global --- ebl/common/infrastructure/ngrams.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 ebl/common/infrastructure/ngrams.py diff --git a/ebl/common/infrastructure/ngrams.py b/ebl/common/infrastructure/ngrams.py new file mode 100644 index 000000000..71a624575 --- /dev/null +++ b/ebl/common/infrastructure/ngrams.py @@ -0,0 +1 @@ +NGRAM_N_VALUES = [1, 2, 3] From c7a7dad634900c4b219af12ca67678719580ecdf Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Thu, 21 Sep 2023 15:30:58 +0000 Subject: [PATCH 03/30] add update_ngrams method --- .../mongo_fragment_repository.py | 36 +++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py index 33494a7eb..0d92141e4 100644 --- a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py +++ b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py @@ -6,8 +6,10 @@ from ebl.bibliography.infrastructure.bibliography import join_reference_documents from ebl.common.domain.scopes import Scope +from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES from ebl.common.query.query_result import QueryResult from ebl.common.query.query_schemas import QueryResultSchema +from ebl.common.query.util import add_ngram_field, replace_all from ebl.errors import NotFoundError from ebl.fragmentarium.application.fragment_info_schema import FragmentInfoSchema from ebl.fragmentarium.application.fragment_repository import FragmentRepository @@ -103,13 +105,16 @@ def count_lines(self): return 0 def create(self, fragment, sort_key=None): - return self._fragments.insert_one( + id_ = self._fragments.insert_one( { "_id": str(fragment.number), **FragmentSchema(exclude=["joins"]).dump(fragment), **({} if sort_key is None else {"_sortKey": sort_key}), } ) + self.update_ngrams(fragment.number) + + return id_ def create_many(self, fragments: Sequence[Fragment]) -> Sequence[str]: schema = FragmentSchema(exclude=["joins"]) @@ -295,6 +300,9 @@ def update_field(self, field, fragment): {"$set": query if query else {field: None}}, ) + if field == "transliteration": + self.update_ngrams(fragment.number) + def query_next_and_previous_folio(self, folio_name, folio_number, number): sort_ascending = {"$sort": {"key": 1}} sort_descending = {"$sort": {"key": -1}} @@ -342,15 +350,6 @@ def get_numbers(pipeline): else: return result - def query_museum_numbers(self, prefix: str, number_regex: str) -> Sequence[dict]: - return self._fragments.find_many( - { - "museumNumber.prefix": prefix, - "museumNumber.number": {"$regex": number_regex}, - }, - projection={"museumNumber": True}, - ) - def query_by_sort_key(self, key: int) -> MuseumNumber: if key < 0: last_fragment = next( @@ -423,3 +422,20 @@ def list_all_fragments( return list( self._fragments.get_all_values("_id", match_user_scopes(user_scopes)) ) + + def update_ngrams(self, number: MuseumNumber): + tmp_signs = "tmp_signs" + pipeline = [ + { + "$addFields": { + tmp_signs: {"$split": [replace_all("$signs", "\n", " # "), " "]} + } + }, + add_ngram_field(f"${tmp_signs}", NGRAM_N_VALUES, "ngrams"), + {"$unset": tmp_signs}, + ] + + self._fragments.update_one( + museum_number_is(number), + pipeline, + ) From 93a1e4ea7d62bd457c599fc74eab8a1d8e589aeb Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Thu, 21 Sep 2023 15:31:10 +0000 Subject: [PATCH 04/30] add helper functions for ngram tests --- ebl/tests/common/ngram_test_support.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 ebl/tests/common/ngram_test_support.py diff --git a/ebl/tests/common/ngram_test_support.py b/ebl/tests/common/ngram_test_support.py new file mode 100644 index 000000000..9c670f018 --- /dev/null +++ b/ebl/tests/common/ngram_test_support.py @@ -0,0 +1,21 @@ +from typing import Sequence, Set, Tuple, TypeVar, Optional + +T = TypeVar("T") + + +def _ngrams(sequence: Sequence[T], n: int) -> Set[Tuple[T]]: + return set(zip(*(sequence[i:] for i in range(n)))) + + +def ngrams_from_signs(signs: str, N: Sequence[int]) -> Set[Tuple[str]]: + split_signs = signs.replace("\n", " # ").split() + all_ngrams = set.union(*(_ngrams(split_signs, n) for n in N)) + return {ngram for ngram in all_ngrams if "X" not in ngram} + + +def chapter_ngrams_from_signs( + chapter_signs: Sequence[Optional[str]], N: Sequence[int] +) -> Set[Tuple[str]]: + return set.union( + *(ngrams_from_signs(signs, N) for signs in chapter_signs if signs is not None) + ) From 0c06bd9c1b8657433e4a56366d27b698870037f1 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Thu, 21 Sep 2023 15:31:20 +0000 Subject: [PATCH 05/30] add tests for fragment ngrams --- .../fragmentarium/test_fragment_repository.py | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/ebl/tests/fragmentarium/test_fragment_repository.py b/ebl/tests/fragmentarium/test_fragment_repository.py index e4817c8af..b196c4712 100644 --- a/ebl/tests/fragmentarium/test_fragment_repository.py +++ b/ebl/tests/fragmentarium/test_fragment_repository.py @@ -4,6 +4,7 @@ import random from ebl.common.domain.period import Period, PeriodModifier from ebl.common.domain.scopes import Scope +from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES from ebl.common.query.query_result import QueryItem, QueryResult from ebl.dictionary.domain.word import WordId @@ -26,6 +27,7 @@ from ebl.fragmentarium.domain.transliteration_update import TransliterationUpdate from ebl.common.query.query_result import LemmaQueryType from ebl.lemmatization.domain.lemmatization import Lemmatization, LemmatizationToken +from ebl.tests.common.ngram_test_support import ngrams_from_signs from ebl.tests.factories.bibliography import ReferenceFactory from ebl.tests.factories.fragment import ( FragmentFactory, @@ -50,6 +52,7 @@ from ebl.common.query.query_schemas import QueryResultSchema from ebl.tests.fragmentarium.test_fragments_search_route import query_item_of from ebl.transliteration.application.sign_repository import SignRepository +from ebl.transliteration.infrastructure.queries import museum_number_is COLLECTION = "fragments" @@ -110,7 +113,7 @@ def test_create(database, fragment_repository): assert fragment_id == str(fragment.number) assert database[COLLECTION].find_one( - {"_id": fragment_id}, projection={"_id": False} + {"_id": fragment_id}, projection={"_id": False, "ngrams": False} ) == FragmentSchema(exclude=["joins"]).dump(fragment) @@ -1003,3 +1006,31 @@ def test_query_genres(fragment_repository, query, expected): ) assert fragment_repository.query({"genre": query}) == expected_result + + +def test_create_fragment_extracts_ngrams(fragment_repository): + fragment = TransliteratedFragmentFactory.build() + + fragment_repository.create(fragment) + data = fragment_repository._fragments.find_one(museum_number_is(fragment.number)) + + assert set(map(tuple, data["ngrams"])) == ngrams_from_signs( + fragment.signs, NGRAM_N_VALUES + ) + + +def test_update_transliteration_updates_ngrams(fragment_repository, user): + fragment = TransliteratedFragmentFactory.build(signs="") + fragment_repository.create(fragment) + + updated_fragment = fragment.update_transliteration( + TransliterationUpdate(parse_atf_lark("1. X MU TA MA UD MI KU")), user + ) + + fragment_repository.update_field("transliteration", updated_fragment) + + data = fragment_repository._fragments.find_one(museum_number_is(fragment.number)) + + assert set(map(tuple, data["ngrams"])) == ngrams_from_signs( + updated_fragment.signs, NGRAM_N_VALUES + ) From 7370db210cb1fa95b5d2eca5b94607bbd1ff8ace Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Fri, 22 Sep 2023 15:40:42 +0000 Subject: [PATCH 06/30] simplify function --- ebl/common/query/util.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/ebl/common/query/util.py b/ebl/common/query/util.py index 9c08d2023..b3590d9d2 100644 --- a/ebl/common/query/util.py +++ b/ebl/common/query/util.py @@ -41,10 +41,9 @@ def filter_array(input_, as_, cond) -> Dict: return {"$filter": {"input": input_, "as": as_, "cond": cond}} -def add_ngram_field( +def extract_ngrams( input_: Union[str, Dict], N: Sequence[int], - output_: str = "ngrams", ): signs_to_exclude = ["X", ""] @@ -61,17 +60,13 @@ def add_ngram_field( 0, ] } - return { - "$addFields": { - output_: drop_duplicates( - filter_array( - {"$concatArrays": [ngrams(input_, n) for n in N if n > 0]}, - "this", - exclude_empty, - ) - ) - } - } + return drop_duplicates( + filter_array( + {"$concatArrays": [ngrams(input_, n) for n in N if n > 0]}, + "this", + exclude_empty, + ) + ) def replace_all(input_: Union[str, Dict], old: str, new: str): From 22a43edc79c1ec55de14e084a96f8e931c8751b7 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Fri, 22 Sep 2023 15:41:05 +0000 Subject: [PATCH 07/30] refactoring --- .../infrastructure/mongo_fragment_repository.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py index 0d92141e4..9f1b06479 100644 --- a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py +++ b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py @@ -9,7 +9,7 @@ from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES from ebl.common.query.query_result import QueryResult from ebl.common.query.query_schemas import QueryResultSchema -from ebl.common.query.util import add_ngram_field, replace_all +from ebl.common.query.util import extract_ngrams, replace_all from ebl.errors import NotFoundError from ebl.fragmentarium.application.fragment_info_schema import FragmentInfoSchema from ebl.fragmentarium.application.fragment_repository import FragmentRepository @@ -424,15 +424,15 @@ def list_all_fragments( ) def update_ngrams(self, number: MuseumNumber): - tmp_signs = "tmp_signs" pipeline = [ { - "$addFields": { - tmp_signs: {"$split": [replace_all("$signs", "\n", " # "), " "]} + "$set": { + "ngrams": extract_ngrams( + {"$split": [replace_all("$signs", "\n", " # "), " "]}, + NGRAM_N_VALUES, + ) } }, - add_ngram_field(f"${tmp_signs}", NGRAM_N_VALUES, "ngrams"), - {"$unset": tmp_signs}, ] self._fragments.update_one( From 4d7f7c90106fe865869c8588fede3c9eead53386 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Fri, 22 Sep 2023 15:59:33 +0000 Subject: [PATCH 08/30] add _update_ngrams --- .../infrastructure/mongo_text_repository.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/ebl/corpus/infrastructure/mongo_text_repository.py b/ebl/corpus/infrastructure/mongo_text_repository.py index 1c61fccef..459c24691 100644 --- a/ebl/corpus/infrastructure/mongo_text_repository.py +++ b/ebl/corpus/infrastructure/mongo_text_repository.py @@ -6,8 +6,15 @@ from ebl.bibliography.infrastructure.bibliography import join_reference_documents +from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES from ebl.common.query.query_result import CorpusQueryResult from ebl.common.query.query_schemas import CorpusQueryResultSchema +from ebl.common.query.util import ( + drop_duplicates, + extract_ngrams, + flatten_field, + replace_all, +) from ebl.corpus.application.text_repository import TextRepository from ebl.corpus.application.display_schemas import ChapterDisplaySchema from ebl.corpus.application.schemas import ( @@ -109,6 +116,7 @@ def create(self, text: Text) -> None: def create_chapter(self, chapter: Chapter) -> None: self._chapters.insert_one(ChapterSchema().dump(chapter)) + self._update_ngrams(chapter.id_) def find(self, id_: TextId) -> Text: try: @@ -243,6 +251,7 @@ def update(self, id_: ChapterId, chapter: Chapter) -> None: ).dump(chapter) }, ) + self._update_ngrams(id_) def query_by_transliteration( self, query: TransliterationQuery, pagination_index: int @@ -449,3 +458,22 @@ def query_corpus_by_manuscript( ] ) return ManuscriptAttestationSchema().load(cursor, many=True) + + def _update_ngrams(self, id_: ChapterId) -> None: + map_extract_ngrams = { + "$map": { + "input": "$signs", + "in": extract_ngrams( + {"$split": [replace_all("$$this", "\n", " # "), " "]}, + NGRAM_N_VALUES, + ), + } + } + pipeline = [ + {"$set": {"ngrams": drop_duplicates(flatten_field(map_extract_ngrams))}}, + ] + + self._chapters.update_one( + chapter_id_query(id_), + pipeline, + ) From be0eb358f1b3b50e545855d0944e819a87adf742 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Fri, 22 Sep 2023 16:39:25 +0000 Subject: [PATCH 09/30] allow ngram field --- ebl/corpus/application/schemas.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ebl/corpus/application/schemas.py b/ebl/corpus/application/schemas.py index 2ef36b71e..53bc63653 100644 --- a/ebl/corpus/application/schemas.py +++ b/ebl/corpus/application/schemas.py @@ -1,5 +1,6 @@ from ebl.corpus.domain.provenance import Provenance from marshmallow import ( + EXCLUDE, Schema, ValidationError, fields, @@ -260,6 +261,9 @@ class DictionaryLinePaginationSchema(Schema): class ChapterSchema(Schema): + class Meta: + unknown = EXCLUDE + text_id = fields.Nested(TextIdSchema, required=True, data_key="textId") classification = ValueEnumField(Classification, required=True) stage = ValueEnumField(Stage, required=True) From bdeeecd9589c69c8f073f7a4659603813c272f4a Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Fri, 22 Sep 2023 16:39:41 +0000 Subject: [PATCH 10/30] add chapter ngram tests --- .../corpus/test_mongo_text_repository.py | 52 ++++++++++++++++++- .../fragmentarium/test_fragment_repository.py | 2 +- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/ebl/tests/corpus/test_mongo_text_repository.py b/ebl/tests/corpus/test_mongo_text_repository.py index 578c3ea3c..c490bf8c2 100644 --- a/ebl/tests/corpus/test_mongo_text_repository.py +++ b/ebl/tests/corpus/test_mongo_text_repository.py @@ -1,6 +1,7 @@ from typing import Sequence import attr import pytest +from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES from ebl.corpus.application.text_repository import TextRepository from ebl.corpus.application.schemas import ChapterSchema, TextSchema @@ -13,6 +14,7 @@ from ebl.fragmentarium.application.joins_schema import JoinSchema from ebl.fragmentarium.domain.fragment import Fragment from ebl.fragmentarium.domain.joins import Join, Joins +from ebl.tests.common.ngram_test_support import chapter_ngrams_from_signs from ebl.tests.factories.corpus import ( ChapterFactory, LineFactory, @@ -167,7 +169,7 @@ def test_creating_chapter(database, text_repository) -> None: "stage": CHAPTER.stage.value, "name": CHAPTER.name, }, - projection={"_id": False}, + projection={"_id": False, "ngrams": False}, ) assert inserted_chapter == ChapterSchema().dump(CHAPTER) @@ -466,3 +468,51 @@ def test_query_corpus_by_manuscript(database, text_repository) -> None: assert text_repository.query_corpus_by_manuscript( [CHAPTER.manuscripts[0].museum_number] ) == [expected_manuscript_attestation] + + +def test_create_chapter_stores_ngrams(database, text_repository): + text_repository.create_chapter(CHAPTER) + + data = database[CHAPTERS_COLLECTION].find_one( + { + "textId.category": CHAPTER.text_id.category, + "textId.index": CHAPTER.text_id.index, + "stage": CHAPTER.stage.value, + "name": CHAPTER.name, + }, + projection={"_id": False, "ngrams": True}, + ) + + assert set(map(tuple, data["ngrams"])) == chapter_ngrams_from_signs( + CHAPTER.signs, NGRAM_N_VALUES + ) + + +def test_update_chapter_stores_ngrams(database, text_repository): + text_repository.create_chapter(CHAPTER) + + updated_chapter = attr.evolve( + CHAPTER, + signs=( + "X ABZ411 ABZ11 ABZ41", + "X X X TI BA", + ), + ) + + when_chapter_in_collection(database) + + text_repository.update(CHAPTER.id_, updated_chapter) + + data = database[CHAPTERS_COLLECTION].find_one( + { + "textId.category": CHAPTER.text_id.category, + "textId.index": CHAPTER.text_id.index, + "stage": CHAPTER.stage.value, + "name": CHAPTER.name, + }, + projection={"_id": False, "ngrams": True}, + ) + + assert set(map(tuple, data["ngrams"])) == chapter_ngrams_from_signs( + updated_chapter.signs, NGRAM_N_VALUES + ) diff --git a/ebl/tests/fragmentarium/test_fragment_repository.py b/ebl/tests/fragmentarium/test_fragment_repository.py index b196c4712..6613d8154 100644 --- a/ebl/tests/fragmentarium/test_fragment_repository.py +++ b/ebl/tests/fragmentarium/test_fragment_repository.py @@ -1008,7 +1008,7 @@ def test_query_genres(fragment_repository, query, expected): assert fragment_repository.query({"genre": query}) == expected_result -def test_create_fragment_extracts_ngrams(fragment_repository): +def test_create_fragment_stores_ngrams(fragment_repository): fragment = TransliteratedFragmentFactory.build() fragment_repository.create(fragment) From 532555f0af2be59c11dccc187105c343ecbc4128 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 25 Sep 2023 10:26:07 +0000 Subject: [PATCH 11/30] update importer to compute ngrams --- ebl/io/fragments/importer.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/ebl/io/fragments/importer.py b/ebl/io/fragments/importer.py index bdc49051a..6840865e4 100644 --- a/ebl/io/fragments/importer.py +++ b/ebl/io/fragments/importer.py @@ -6,6 +6,8 @@ from marshmallow import ValidationError from pymongo import MongoClient import pymongo +from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES +from ebl.common.query.util import extract_ngrams, replace_all from ebl.fragmentarium.application.fragment_schema import FragmentSchema from ebl.fragmentarium.infrastructure.fragment_search_aggregations import ( sort_by_museum_number, @@ -114,6 +116,24 @@ def create_sort_index(fragments_collection: MongoCollection) -> None: fragments_collection.create_index([("_sortKey", pymongo.ASCENDING)]) +def _create_ngrams(fragments_collection: MongoCollection, fragments: dict) -> None: + print(f"Extracting n-grams from {len(fragments)} fragment(s)...") + numbers = [fragment["_id"] for fragment in fragments.values()] + fragments_collection.update_many( + {"_id": {"$in": numbers}}, + [ + { + "$set": { + "ngrams": extract_ngrams( + {"$split": [replace_all("$signs", "\n", " # "), " "]}, + NGRAM_N_VALUES, + ) + } + } + ], + ) + + def write_to_db( fragments: Sequence[dict], fragments_collection: MongoCollection ) -> List: @@ -129,7 +149,7 @@ def write_to_tsv( writer = csv.writer(csvfile, delimiter="\t") # pyre-ignore[6] if column_names: writer.writerow(column_names) - writer.writerows(FAILS) + writer.writerows(data) if __name__ == "__main__": @@ -265,7 +285,7 @@ def _reindex_database(collection, db): if FAILS: print( - f"Skipping {fail_count} document(s), see {os.path.abspath(error_file)} for details" + f"Skipping {fail_count} document(s), see {os.path.abspath(error_file)} for details." ) write_to_tsv(error_file, FAILS, ["file", "error"]) @@ -296,27 +316,28 @@ def _reindex_database(collection, db): if input(prompt) != passphrase: sys.exit("Aborting.") - fragments = { + valid_fragments = { filename: data for filename, data in fragments.items() if filename not in set(file for file, _ in FAILS) } result = write_to_db( - list(fragments.values()), + list(valid_fragments.values()), COLLECTION, ) print("Result:") print(result) + _create_ngrams(COLLECTION, valid_fragments) _reindex_database(COLLECTION, args.database) write_to_tsv( summary_file, - [[data["_id"], filename] for filename, data in fragments.items()], + [[data["_id"], filename] for filename, data in valid_fragments.items()], ["id", "file"], ) print( - f"Done! See {os.path.abspath(summary_file)} for a summary of added documents", + f"Done! See {os.path.abspath(summary_file)} for a summary of added documents.", ) From e6cba9477594e1c361613c01c98c8bb7c81d1303 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 25 Sep 2023 10:27:49 +0000 Subject: [PATCH 12/30] refactoring --- .../mongo_fragment_repository.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py index 9f1b06479..0cdc12932 100644 --- a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py +++ b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py @@ -424,18 +424,16 @@ def list_all_fragments( ) def update_ngrams(self, number: MuseumNumber): - pipeline = [ - { - "$set": { - "ngrams": extract_ngrams( - {"$split": [replace_all("$signs", "\n", " # "), " "]}, - NGRAM_N_VALUES, - ) - } - }, - ] - self._fragments.update_one( museum_number_is(number), - pipeline, + [ + { + "$set": { + "ngrams": extract_ngrams( + {"$split": [replace_all("$signs", "\n", " # "), " "]}, + NGRAM_N_VALUES, + ) + } + }, + ], ) From d4e77337bcd759cb6d7e6f7917836d0b3a4dff07 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 25 Sep 2023 10:46:39 +0000 Subject: [PATCH 13/30] add NgramMatchResource --- ebl/common/application/ngram_matcher.py | 46 +++++++++++++++++++ ebl/fragmentarium/web/bootstrap.py | 3 ++ .../web/fragment_ngram_matcher.py | 22 +++++++++ 3 files changed, 71 insertions(+) create mode 100644 ebl/common/application/ngram_matcher.py create mode 100644 ebl/fragmentarium/web/fragment_ngram_matcher.py diff --git a/ebl/common/application/ngram_matcher.py b/ebl/common/application/ngram_matcher.py new file mode 100644 index 000000000..a7502f22d --- /dev/null +++ b/ebl/common/application/ngram_matcher.py @@ -0,0 +1,46 @@ +from typing import Set, Tuple, Optional, Sequence, List + + +def aggregate_ngram_overlaps( + ngrams: Set[Tuple[str]], limit: Optional[int] = None +) -> Sequence[dict]: + ngram_list = list(ngrams) + pipeline: List[dict] = [ + {"$match": {"textId.category": {"$ne": 99}}}, + { + "$project": { + "_id": 0, + "textId": 1, + "name": 1, + "stage": 1, + "overlap": { + "$let": { + "vars": { + "intersection": { + "$size": {"$setIntersection": ["$ngrams", ngram_list]} + }, + "minLength": { + "$min": [ + {"$size": "$ngrams"}, + {"$size": [ngram_list]}, + ] + }, + }, + "in": { + "$cond": [ + {"$eq": ["$$minLength", 0]}, + 0.0, + {"$divide": ["$$intersection", "$$minLength"]}, + ] + }, + } + }, + } + }, + {"$sort": {"overlap": -1}}, + ] + + if limit: + pipeline.append({"$limit": limit}) + + return pipeline diff --git a/ebl/fragmentarium/web/bootstrap.py b/ebl/fragmentarium/web/bootstrap.py index 442cba654..43c66473f 100644 --- a/ebl/fragmentarium/web/bootstrap.py +++ b/ebl/fragmentarium/web/bootstrap.py @@ -10,6 +10,7 @@ from ebl.fragmentarium.web.folio_pager import FolioPagerResource from ebl.fragmentarium.web.folios import FoliosResource from ebl.fragmentarium.web.fragment_genre import FragmentGenreResource +from ebl.fragmentarium.web.fragment_ngram_matcher import NgramMatchResource from ebl.fragmentarium.web.fragment_script import FragmentScriptResource from ebl.fragmentarium.web.fragment_date import ( FragmentDateResource, @@ -85,6 +86,7 @@ def create_fragmentarium_routes(api: falcon.App, context: Context): fragment_query = FragmentsQueryResource( context.fragment_repository, context.get_transliteration_query_factory() ) + ngram_matcher = NgramMatchResource(context.fragment_repository) genres = GenresResource() periods = PeriodsResource() lemmatization = LemmatizationResource(updater) @@ -124,6 +126,7 @@ def create_fragmentarium_routes(api: falcon.App, context: Context): ("/fragments/{number}/annotations", annotations), ("/fragments/{number}/photo", photo), ("/fragments/{number}/corpus", chapters), + ("/fragments/{number}/ngrams", ngram_matcher), ("/genres", genres), ("/periods", periods), ("/statistics", statistics), diff --git a/ebl/fragmentarium/web/fragment_ngram_matcher.py b/ebl/fragmentarium/web/fragment_ngram_matcher.py new file mode 100644 index 000000000..f60d2a82c --- /dev/null +++ b/ebl/fragmentarium/web/fragment_ngram_matcher.py @@ -0,0 +1,22 @@ +from ebl.common.application.ngram_matcher import aggregate_ngram_overlaps +from ebl.corpus.application.text_repository import TextRepository +from ebl.fragmentarium.application.fragment_repository import FragmentRepository +from falcon import Request, Response + +from ebl.transliteration.domain.museum_number import MuseumNumber + + +class NgramMatchResource: + def __init__( + self, + fragment_repository: FragmentRepository, + text_repository: TextRepository, + ): + self._fragment_repository = fragment_repository + self._text_repository = text_repository + + def on_get(self, _req: Request, resp: Response, number: str) -> None: + ngrams = self._fragment_repository.get_ngrams(MuseumNumber.of(number)) + resp.media = self._text_repository._chapters.aggregate( + aggregate_ngram_overlaps(ngrams) + ) From 24cde92a329ae4c5771baad431fdaf4575cf2793 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 25 Sep 2023 12:15:22 +0000 Subject: [PATCH 14/30] refactoring --- .../infrastructure/mongo_fragment_repository.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py index 0cdc12932..4190b2ff0 100644 --- a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py +++ b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py @@ -112,7 +112,7 @@ def create(self, fragment, sort_key=None): **({} if sort_key is None else {"_sortKey": sort_key}), } ) - self.update_ngrams(fragment.number) + self._update_ngrams(fragment.number) return id_ @@ -301,7 +301,7 @@ def update_field(self, field, fragment): ) if field == "transliteration": - self.update_ngrams(fragment.number) + self._update_ngrams(fragment.number) def query_next_and_previous_folio(self, folio_name, folio_number, number): sort_ascending = {"$sort": {"key": 1}} @@ -423,7 +423,7 @@ def list_all_fragments( self._fragments.get_all_values("_id", match_user_scopes(user_scopes)) ) - def update_ngrams(self, number: MuseumNumber): + def _update_ngrams(self, number: MuseumNumber): self._fragments.update_one( museum_number_is(number), [ From ba8e3992b04ea7a9e2d874abe6d27e25b537212f Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 25 Sep 2023 13:39:51 +0000 Subject: [PATCH 15/30] add text repo --- ebl/fragmentarium/web/bootstrap.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ebl/fragmentarium/web/bootstrap.py b/ebl/fragmentarium/web/bootstrap.py index 43c66473f..d23f4a3e2 100644 --- a/ebl/fragmentarium/web/bootstrap.py +++ b/ebl/fragmentarium/web/bootstrap.py @@ -10,7 +10,7 @@ from ebl.fragmentarium.web.folio_pager import FolioPagerResource from ebl.fragmentarium.web.folios import FoliosResource from ebl.fragmentarium.web.fragment_genre import FragmentGenreResource -from ebl.fragmentarium.web.fragment_ngram_matcher import NgramMatchResource +from ebl.fragmentarium.web.fragment_ngram_matcher import NgramMatcherResource from ebl.fragmentarium.web.fragment_script import FragmentScriptResource from ebl.fragmentarium.web.fragment_date import ( FragmentDateResource, @@ -86,7 +86,9 @@ def create_fragmentarium_routes(api: falcon.App, context: Context): fragment_query = FragmentsQueryResource( context.fragment_repository, context.get_transliteration_query_factory() ) - ngram_matcher = NgramMatchResource(context.fragment_repository) + ngram_matcher = NgramMatcherResource( + context.fragment_repository, context.text_repository + ) genres = GenresResource() periods = PeriodsResource() lemmatization = LemmatizationResource(updater) From d5b74bc840d6deeb7c1611f764c4d158762ee333 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 25 Sep 2023 13:40:14 +0000 Subject: [PATCH 16/30] move aggregate ngrams into TextRepo --- ebl/common/application/ngram_matcher.py | 46 ------------------- ebl/corpus/application/text_repository.py | 6 +++ .../infrastructure/mongo_text_repository.py | 46 +++++++++++++++++++ 3 files changed, 52 insertions(+), 46 deletions(-) delete mode 100644 ebl/common/application/ngram_matcher.py diff --git a/ebl/common/application/ngram_matcher.py b/ebl/common/application/ngram_matcher.py deleted file mode 100644 index a7502f22d..000000000 --- a/ebl/common/application/ngram_matcher.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import Set, Tuple, Optional, Sequence, List - - -def aggregate_ngram_overlaps( - ngrams: Set[Tuple[str]], limit: Optional[int] = None -) -> Sequence[dict]: - ngram_list = list(ngrams) - pipeline: List[dict] = [ - {"$match": {"textId.category": {"$ne": 99}}}, - { - "$project": { - "_id": 0, - "textId": 1, - "name": 1, - "stage": 1, - "overlap": { - "$let": { - "vars": { - "intersection": { - "$size": {"$setIntersection": ["$ngrams", ngram_list]} - }, - "minLength": { - "$min": [ - {"$size": "$ngrams"}, - {"$size": [ngram_list]}, - ] - }, - }, - "in": { - "$cond": [ - {"$eq": ["$$minLength", 0]}, - 0.0, - {"$divide": ["$$intersection", "$$minLength"]}, - ] - }, - } - }, - } - }, - {"$sort": {"overlap": -1}}, - ] - - if limit: - pipeline.append({"$limit": limit}) - - return pipeline diff --git a/ebl/corpus/application/text_repository.py b/ebl/corpus/application/text_repository.py index a1ed7fd61..f217ad2ca 100644 --- a/ebl/corpus/application/text_repository.py +++ b/ebl/corpus/application/text_repository.py @@ -85,3 +85,9 @@ def query_manuscripts_with_joins_by_chapter( @abstractmethod def query(self, query: dict) -> CorpusQueryResult: ... + + @abstractmethod + def aggregate_ngram_overlaps( + self, ngrams: Sequence[Sequence[str]], limit: Optional[int] = None + ) -> Sequence[dict]: + ... diff --git a/ebl/corpus/infrastructure/mongo_text_repository.py b/ebl/corpus/infrastructure/mongo_text_repository.py index 459c24691..b9b6cfdfe 100644 --- a/ebl/corpus/infrastructure/mongo_text_repository.py +++ b/ebl/corpus/infrastructure/mongo_text_repository.py @@ -477,3 +477,49 @@ def _update_ngrams(self, id_: ChapterId) -> None: chapter_id_query(id_), pipeline, ) + + def aggregate_ngram_overlaps( + self, ngrams: Sequence[Sequence[str]], limit: Optional[int] = None + ) -> Sequence[dict]: + ngram_list = list(ngrams) + pipeline: List[dict] = [ + {"$match": {"textId.category": {"$ne": 99}}}, + { + "$project": { + "_id": 0, + "textId": 1, + "name": 1, + "stage": 1, + "overlap": { + "$let": { + "vars": { + "intersection": { + "$size": { + "$setIntersection": ["$ngrams", ngram_list] + } + }, + "minLength": { + "$min": [ + {"$size": "$ngrams"}, + {"$size": [ngram_list]}, + ] + }, + }, + "in": { + "$cond": [ + {"$eq": ["$$minLength", 0]}, + 0.0, + {"$divide": ["$$intersection", "$$minLength"]}, + ] + }, + } + }, + } + }, + {"$sort": {"overlap": -1}}, + ] + + if limit: + pipeline.append({"$limit": limit}) + + return list(self._chapters.aggregate(pipeline)) From 87cb373bb961c1015a4c61b147da36e774edc949 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 25 Sep 2023 13:40:27 +0000 Subject: [PATCH 17/30] add get_ngrams --- ebl/fragmentarium/application/fragment_repository.py | 4 ++++ .../infrastructure/mongo_fragment_repository.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/ebl/fragmentarium/application/fragment_repository.py b/ebl/fragmentarium/application/fragment_repository.py index f88efee6d..9911120d7 100644 --- a/ebl/fragmentarium/application/fragment_repository.py +++ b/ebl/fragmentarium/application/fragment_repository.py @@ -106,3 +106,7 @@ def fetch_date(self, number: MuseumNumber) -> Optional[Date]: @abstractmethod def list_all_fragments(self) -> Sequence[str]: ... + + @abstractmethod + def get_ngrams(self, number: MuseumNumber) -> Sequence[Sequence[str]]: + ... diff --git a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py index 4190b2ff0..15fd4fd29 100644 --- a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py +++ b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py @@ -437,3 +437,8 @@ def _update_ngrams(self, number: MuseumNumber): }, ], ) + + def get_ngrams(self, number: MuseumNumber) -> Sequence[Sequence[str]]: + return self._fragments.find_one( + museum_number_is(number), projection={"ngrams": True} + )["ngrams"] From 73c00b562bc867373e3ca2356d81e3c6ce74236f Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 25 Sep 2023 13:40:43 +0000 Subject: [PATCH 18/30] refactor --- ebl/fragmentarium/web/fragment_ngram_matcher.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ebl/fragmentarium/web/fragment_ngram_matcher.py b/ebl/fragmentarium/web/fragment_ngram_matcher.py index f60d2a82c..d86e26a14 100644 --- a/ebl/fragmentarium/web/fragment_ngram_matcher.py +++ b/ebl/fragmentarium/web/fragment_ngram_matcher.py @@ -1,4 +1,3 @@ -from ebl.common.application.ngram_matcher import aggregate_ngram_overlaps from ebl.corpus.application.text_repository import TextRepository from ebl.fragmentarium.application.fragment_repository import FragmentRepository from falcon import Request, Response @@ -6,7 +5,7 @@ from ebl.transliteration.domain.museum_number import MuseumNumber -class NgramMatchResource: +class NgramMatcherResource: def __init__( self, fragment_repository: FragmentRepository, @@ -17,6 +16,4 @@ def __init__( def on_get(self, _req: Request, resp: Response, number: str) -> None: ngrams = self._fragment_repository.get_ngrams(MuseumNumber.of(number)) - resp.media = self._text_repository._chapters.aggregate( - aggregate_ngram_overlaps(ngrams) - ) + resp.media = self._text_repository.aggregate_ngram_overlaps(ngrams) From bb3cfd9b34c2bcd08621d19074e1f977107e6999 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 25 Sep 2023 13:40:52 +0000 Subject: [PATCH 19/30] add test_match_fragment_ngrams --- .../fragmentarium/test_ngram_matcher_route.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 ebl/tests/fragmentarium/test_ngram_matcher_route.py diff --git a/ebl/tests/fragmentarium/test_ngram_matcher_route.py b/ebl/tests/fragmentarium/test_ngram_matcher_route.py new file mode 100644 index 000000000..1e9e61305 --- /dev/null +++ b/ebl/tests/fragmentarium/test_ngram_matcher_route.py @@ -0,0 +1,51 @@ +from typing import List, Sequence +from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES +from ebl.corpus.application.id_schemas import ChapterIdSchema +from ebl.corpus.application.text_repository import TextRepository +from ebl.corpus.domain.chapter import Chapter +from ebl.fragmentarium.application.fragment_repository import FragmentRepository +from ebl.fragmentarium.domain.fragment import Fragment +from ebl.tests.factories.corpus import ChapterFactory +from ebl.tests.factories.fragment import TransliteratedFragmentFactory +from ebl.tests.common.ngram_test_support import ( + ngrams_from_signs, + chapter_ngrams_from_signs, +) +import falcon + +SIGNS = ["X BA KU ABZ075", "KI DU ABZ411 BA MA TI\nX MU TA MA UD", "KU ABZ411 MA KI"] + + +def compute_overlap(fragment: Fragment, chapter: Chapter, N: Sequence[int]) -> float: + F = ngrams_from_signs(fragment.signs, N) + C = chapter_ngrams_from_signs(chapter.signs, N) + + return (len(F & C) / min(len(F), len(C))) if F and C else 0.0 + + +def test_match_fragment_ngrams( + client, + fragment_repository: FragmentRepository, + text_repository: TextRepository, +): + fragment = TransliteratedFragmentFactory.build() + fragment_id = fragment_repository.create(fragment) + chapters: List[Chapter] = [ChapterFactory.build(signs=(signs,)) for signs in SIGNS] + + for chapter in chapters: + text_repository.create_chapter(chapter) + + result = client.simulate_get(f"/fragments/{fragment_id}/ngrams") + + assert result.status == falcon.HTTP_OK + assert result.json == sorted( + ( + { + **ChapterIdSchema().dump(chapter.id_), + "overlap": compute_overlap(fragment, chapter, NGRAM_N_VALUES), + } + for chapter in chapters + ), + key=lambda item: item["overlap"], + reverse=True, + ) From 714250216e8bd34cb1aa985ef3f255cba251e8f2 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Mon, 25 Sep 2023 15:27:42 +0000 Subject: [PATCH 20/30] include textName in ngram output, refactor text join --- .../infrastructure/mongo_text_repository.py | 44 +++++------------ ebl/corpus/infrastructure/queries.py | 49 +++++++++---------- 2 files changed, 36 insertions(+), 57 deletions(-) diff --git a/ebl/corpus/infrastructure/mongo_text_repository.py b/ebl/corpus/infrastructure/mongo_text_repository.py index b9b6cfdfe..9403f4b4a 100644 --- a/ebl/corpus/infrastructure/mongo_text_repository.py +++ b/ebl/corpus/infrastructure/mongo_text_repository.py @@ -44,7 +44,7 @@ chapter_id_query, join_chapters, join_text, - join_text_title, + join_text_names, ) from ebl.errors import NotFoundError from ebl.fragmentarium.infrastructure.queries import is_in_fragmentarium, join_joins @@ -261,35 +261,8 @@ def query_by_transliteration( cursor = self._chapters.aggregate( [ {"$match": mongo_query}, - { - "$lookup": { - "from": "texts", - "let": { - "chapterGenre": "$textId.genre", - "chapterCategory": "$textId.category", - "chapterIndex": "$textId.index", - }, - "pipeline": [ - { - "$match": { - "$expr": { - "$and": [ - {"$eq": ["$genre", "$$chapterGenre"]}, - {"$eq": ["$category", "$$chapterCategory"]}, - {"$eq": ["$index", "$$chapterIndex"]}, - ] - } - } - }, - {"$project": {"name": 1, "_id": 0}}, - ], - "as": "textNames", - } - }, + *join_text_names(), {"$project": {"_id": False}}, - {"$addFields": {"textName": {"$first": "$textNames"}}}, - {"$addFields": {"textName": "$textName.name"}}, - {"$project": {"textNames": False}}, {"$skip": LIMIT * pagination_index}, {"$limit": LIMIT}, ], @@ -342,12 +315,12 @@ def query_by_lemma( }, {"$unwind": "$lines"}, {"$match": lemma_query}, - join_text_title(), + *join_text_names(), filter_manuscripts_by_lemma(lemma), { "$project": { "textId": True, - "textName": {"$first": "$textName.name"}, + "textName": True, "chapterName": "$name", "stage": True, "line": "$lines", @@ -482,8 +455,14 @@ def aggregate_ngram_overlaps( self, ngrams: Sequence[Sequence[str]], limit: Optional[int] = None ) -> Sequence[dict]: ngram_list = list(ngrams) + test_chapter_category = 99 pipeline: List[dict] = [ - {"$match": {"textId.category": {"$ne": 99}}}, + { + "$match": { + "textId.category": {"$ne": test_chapter_category}, + "ngrams": {"$exists": True, "$not": {"$size": 0}}, + } + }, { "$project": { "_id": 0, @@ -516,6 +495,7 @@ def aggregate_ngram_overlaps( }, } }, + *join_text_names(), {"$sort": {"overlap": -1}}, ] diff --git a/ebl/corpus/infrastructure/queries.py b/ebl/corpus/infrastructure/queries.py index 39de8e23f..c37fe7ca0 100644 --- a/ebl/corpus/infrastructure/queries.py +++ b/ebl/corpus/infrastructure/queries.py @@ -187,29 +187,28 @@ def join_text() -> List[dict]: ] -def join_text_title() -> dict: - return { - "$lookup": { - "from": "texts", - "let": { - "genre": "$textId.genre", - "category": "$textId.category", - "index": "$textId.index", - }, - "pipeline": [ - { - "$match": { - "$expr": { - "$and": [ - {"$eq": ["$genre", "$$genre"]}, - {"$eq": ["$category", "$$category"]}, - {"$eq": ["$index", "$$index"]}, - ] +def join_text_names() -> List[dict]: + return [ + { + "$lookup": { + "from": "texts", + "let": {"textId": "$textId"}, + "pipeline": [ + { + "$match": { + "$expr": { + "$and": [ + {"$eq": ["$genre", "$$textId.genre"]}, + {"$eq": ["$category", "$$textId.category"]}, + {"$eq": ["$index", "$$textId.index"]}, + ] + } } - } - }, - {"$project": {"_id": False, "name": True}}, - ], - "as": "textName", - } - } + }, + {"$project": {"name": 1, "_id": 0}}, + ], + "as": "textName", + } + }, + {"$addFields": {"textName": {"$first": "$textName.name"}}}, + ] From 562051fd1db0eab549ebb9d5339a00a0ef72ab01 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 26 Sep 2023 09:59:36 +0000 Subject: [PATCH 21/30] add test_get_ngrams --- ebl/tests/fragmentarium/test_fragment_repository.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ebl/tests/fragmentarium/test_fragment_repository.py b/ebl/tests/fragmentarium/test_fragment_repository.py index 6613d8154..2744e4845 100644 --- a/ebl/tests/fragmentarium/test_fragment_repository.py +++ b/ebl/tests/fragmentarium/test_fragment_repository.py @@ -1019,6 +1019,15 @@ def test_create_fragment_stores_ngrams(fragment_repository): ) +def test_get_ngrams(fragment_repository): + fragment = TransliteratedFragmentFactory.build() + fragment_repository.create(fragment) + + assert set( + map(tuple, fragment_repository.get_ngrams(fragment.number)) + ) == ngrams_from_signs(fragment.signs, NGRAM_N_VALUES) + + def test_update_transliteration_updates_ngrams(fragment_repository, user): fragment = TransliteratedFragmentFactory.build(signs="") fragment_repository.create(fragment) From 8b571cc5f5c8f71cb8f7ecebed44185af21cc8d6 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 26 Sep 2023 10:29:30 +0000 Subject: [PATCH 22/30] add test_aggregate_ngram_overlaps --- ebl/tests/common/ngram_test_support.py | 9 +++++ ebl/tests/corpus/test_corpus_query_route.py | 7 +--- .../corpus/test_mongo_text_repository.py | 34 ++++++++++++++++++- .../fragmentarium/test_ngram_matcher_route.py | 15 ++------ 4 files changed, 45 insertions(+), 20 deletions(-) diff --git a/ebl/tests/common/ngram_test_support.py b/ebl/tests/common/ngram_test_support.py index 9c670f018..7149dc05d 100644 --- a/ebl/tests/common/ngram_test_support.py +++ b/ebl/tests/common/ngram_test_support.py @@ -1,4 +1,6 @@ from typing import Sequence, Set, Tuple, TypeVar, Optional +from ebl.corpus.domain.chapter import Chapter +from ebl.fragmentarium.domain.fragment import Fragment T = TypeVar("T") @@ -19,3 +21,10 @@ def chapter_ngrams_from_signs( return set.union( *(ngrams_from_signs(signs, N) for signs in chapter_signs if signs is not None) ) + + +def compute_overlap(fragment: Fragment, chapter: Chapter, N: Sequence[int]) -> float: + F = ngrams_from_signs(fragment.signs, N) + C = chapter_ngrams_from_signs(chapter.signs, N) + + return (len(F & C) / min(len(F), len(C))) if F and C else 0.0 diff --git a/ebl/tests/corpus/test_corpus_query_route.py b/ebl/tests/corpus/test_corpus_query_route.py index 1a9bb2e68..60b0bd1e1 100644 --- a/ebl/tests/corpus/test_corpus_query_route.py +++ b/ebl/tests/corpus/test_corpus_query_route.py @@ -5,7 +5,7 @@ from ebl.corpus.application.id_schemas import TextIdSchema from ebl.corpus.domain.chapter import Chapter from ebl.dictionary.domain.word import WordId -from ebl.tests.corpus.test_mongo_text_repository import LITERATURE_TEXT +from ebl.tests.corpus.test_mongo_text_repository import LITERATURE_TEXT, SIGNS from ebl.tests.factories.corpus import ( ChapterFactory, LineFactory, @@ -122,11 +122,6 @@ def test_query_chapter_lemmas( } -SIGNS = [ - "X ABZ411 ABZ11 ABZ41", - "X X X TI BA", - "MA ŠU X\nTI BA X", -] MANUSCRIPTS = ManuscriptFactory.build_batch(3) VARIANT_LINES = [ [ diff --git a/ebl/tests/corpus/test_mongo_text_repository.py b/ebl/tests/corpus/test_mongo_text_repository.py index c490bf8c2..d97bd85a7 100644 --- a/ebl/tests/corpus/test_mongo_text_repository.py +++ b/ebl/tests/corpus/test_mongo_text_repository.py @@ -2,6 +2,7 @@ import attr import pytest from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES +from ebl.corpus.application.id_schemas import ChapterIdSchema from ebl.corpus.application.text_repository import TextRepository from ebl.corpus.application.schemas import ChapterSchema, TextSchema @@ -14,7 +15,11 @@ from ebl.fragmentarium.application.joins_schema import JoinSchema from ebl.fragmentarium.domain.fragment import Fragment from ebl.fragmentarium.domain.joins import Join, Joins -from ebl.tests.common.ngram_test_support import chapter_ngrams_from_signs +from ebl.tests.common.ngram_test_support import ( + chapter_ngrams_from_signs, + compute_overlap, + ngrams_from_signs, +) from ebl.tests.factories.corpus import ( ChapterFactory, LineFactory, @@ -140,6 +145,11 @@ ), ), ) +SIGNS = [ + "X ABZ411 ABZ11 ABZ41", + "X X X TI BA", + "MA ŠU X\nTI BA X", +] def when_text_in_collection(database, text=TEXT) -> None: @@ -516,3 +526,25 @@ def test_update_chapter_stores_ngrams(database, text_repository): assert set(map(tuple, data["ngrams"])) == chapter_ngrams_from_signs( updated_chapter.signs, NGRAM_N_VALUES ) + + +def test_aggregate_ngram_overlaps(text_repository, database): + chapters = [ChapterFactory.build(signs=(f"{signs} {signs}",)) for signs in SIGNS] + + for chapter in chapters: + text_repository.create_chapter(chapter) + + fragment = FragmentFactory.create(signs="TI BA ABZ11") + assert text_repository.aggregate_ngram_overlaps( + ngrams_from_signs(fragment.signs, NGRAM_N_VALUES) + ) == sorted( + ( + { + **ChapterIdSchema().dump(chapter.id_), + "overlap": compute_overlap(fragment, chapter, NGRAM_N_VALUES), + } + for chapter in chapters + ), + key=lambda item: item["overlap"], + reverse=True, + ) diff --git a/ebl/tests/fragmentarium/test_ngram_matcher_route.py b/ebl/tests/fragmentarium/test_ngram_matcher_route.py index 1e9e61305..65bb0259d 100644 --- a/ebl/tests/fragmentarium/test_ngram_matcher_route.py +++ b/ebl/tests/fragmentarium/test_ngram_matcher_route.py @@ -1,28 +1,17 @@ -from typing import List, Sequence +from typing import List from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES from ebl.corpus.application.id_schemas import ChapterIdSchema from ebl.corpus.application.text_repository import TextRepository from ebl.corpus.domain.chapter import Chapter from ebl.fragmentarium.application.fragment_repository import FragmentRepository -from ebl.fragmentarium.domain.fragment import Fragment +from ebl.tests.common.ngram_test_support import compute_overlap from ebl.tests.factories.corpus import ChapterFactory from ebl.tests.factories.fragment import TransliteratedFragmentFactory -from ebl.tests.common.ngram_test_support import ( - ngrams_from_signs, - chapter_ngrams_from_signs, -) import falcon SIGNS = ["X BA KU ABZ075", "KI DU ABZ411 BA MA TI\nX MU TA MA UD", "KU ABZ411 MA KI"] -def compute_overlap(fragment: Fragment, chapter: Chapter, N: Sequence[int]) -> float: - F = ngrams_from_signs(fragment.signs, N) - C = chapter_ngrams_from_signs(chapter.signs, N) - - return (len(F & C) / min(len(F), len(C))) if F and C else 0.0 - - def test_match_fragment_ngrams( client, fragment_repository: FragmentRepository, From a6df055ca7e5b38a5645201d1a0f3269ab3c9e27 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 26 Sep 2023 11:02:45 +0000 Subject: [PATCH 23/30] add ChapterNgramScore and schema --- ebl/corpus/application/display_schemas.py | 15 ++++++++- ebl/corpus/domain/chapter_display.py | 11 +++++++ .../infrastructure/mongo_text_repository.py | 4 +-- ebl/tests/common/ngram_test_support.py | 4 ++- .../corpus/test_mongo_text_repository.py | 31 ++++++++++++------ .../fragmentarium/test_ngram_matcher_route.py | 32 +++++++++++++------ 6 files changed, 74 insertions(+), 23 deletions(-) diff --git a/ebl/corpus/application/display_schemas.py b/ebl/corpus/application/display_schemas.py index d6db5c44d..daf2a576d 100644 --- a/ebl/corpus/application/display_schemas.py +++ b/ebl/corpus/application/display_schemas.py @@ -3,7 +3,11 @@ from ebl.corpus.application.id_schemas import ChapterIdSchema from ebl.corpus.application.record_schemas import RecordSchema from ebl.corpus.application.schemas import LineVariantSchema, ManuscriptSchema -from ebl.corpus.domain.chapter_display import ChapterDisplay, LineDisplay +from ebl.corpus.domain.chapter_display import ( + ChapterDisplay, + ChapterNgramScore, + LineDisplay, +) from ebl.corpus.domain.record import Record from ebl.transliteration.application.line_number_schemas import ( OneOfLineNumberSchema, @@ -89,3 +93,12 @@ def add_line_indexes(self, data: dict, **kwargs) -> dict: ] return data + + +class ChapterNgramScoreSchema(ChapterIdSchema): + text_name = fields.String(required=True, data_key="textName") + score = fields.Float(required=True) + + @post_load + def make_result(self, data: dict, **kwargs) -> ChapterNgramScore: + return ChapterNgramScore(*+data) diff --git a/ebl/corpus/domain/chapter_display.py b/ebl/corpus/domain/chapter_display.py index 948208924..fa7cfa2d1 100644 --- a/ebl/corpus/domain/chapter_display.py +++ b/ebl/corpus/domain/chapter_display.py @@ -139,3 +139,14 @@ def of_chapter(text: Text, chapter: Chapter) -> "ChapterDisplay": chapter.record, chapter.manuscripts, ) + + +@attr.s(frozen=True, auto_attribs=True) +class ChapterNgramScore(ChapterId): + text_name: str + score: float + + def of(chapter_id: ChapterId, text_name: str, score: float) -> "ChapterNgramScore": + return ChapterNgramScore( + chapter_id.text_id, chapter_id.stage, chapter_id.name, text_name, score + ) diff --git a/ebl/corpus/infrastructure/mongo_text_repository.py b/ebl/corpus/infrastructure/mongo_text_repository.py index 9403f4b4a..793fb9ae7 100644 --- a/ebl/corpus/infrastructure/mongo_text_repository.py +++ b/ebl/corpus/infrastructure/mongo_text_repository.py @@ -469,7 +469,7 @@ def aggregate_ngram_overlaps( "textId": 1, "name": 1, "stage": 1, - "overlap": { + "score": { "$let": { "vars": { "intersection": { @@ -496,7 +496,7 @@ def aggregate_ngram_overlaps( } }, *join_text_names(), - {"$sort": {"overlap": -1}}, + {"$sort": {"score": -1}}, ] if limit: diff --git a/ebl/tests/common/ngram_test_support.py b/ebl/tests/common/ngram_test_support.py index 7149dc05d..e4c6390a2 100644 --- a/ebl/tests/common/ngram_test_support.py +++ b/ebl/tests/common/ngram_test_support.py @@ -23,7 +23,9 @@ def chapter_ngrams_from_signs( ) -def compute_overlap(fragment: Fragment, chapter: Chapter, N: Sequence[int]) -> float: +def compute_ngram_score( + fragment: Fragment, chapter: Chapter, N: Sequence[int] +) -> float: F = ngrams_from_signs(fragment.signs, N) C = chapter_ngrams_from_signs(chapter.signs, N) diff --git a/ebl/tests/corpus/test_mongo_text_repository.py b/ebl/tests/corpus/test_mongo_text_repository.py index d97bd85a7..2b63b20b1 100644 --- a/ebl/tests/corpus/test_mongo_text_repository.py +++ b/ebl/tests/corpus/test_mongo_text_repository.py @@ -2,12 +2,12 @@ import attr import pytest from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES -from ebl.corpus.application.id_schemas import ChapterIdSchema +from ebl.corpus.application.display_schemas import ChapterNgramScoreSchema from ebl.corpus.application.text_repository import TextRepository from ebl.corpus.application.schemas import ChapterSchema, TextSchema from ebl.corpus.domain.chapter import Chapter -from ebl.corpus.domain.chapter_display import ChapterDisplay +from ebl.corpus.domain.chapter_display import ChapterDisplay, ChapterNgramScore from ebl.corpus.domain.dictionary_line import DictionaryLine from ebl.corpus.domain.text import Text, UncertainFragment from ebl.dictionary.domain.word import WordId @@ -17,7 +17,7 @@ from ebl.fragmentarium.domain.joins import Join, Joins from ebl.tests.common.ngram_test_support import ( chapter_ngrams_from_signs, - compute_overlap, + compute_ngram_score, ngrams_from_signs, ) from ebl.tests.factories.corpus import ( @@ -528,23 +528,34 @@ def test_update_chapter_stores_ngrams(database, text_repository): ) -def test_aggregate_ngram_overlaps(text_repository, database): - chapters = [ChapterFactory.build(signs=(f"{signs} {signs}",)) for signs in SIGNS] +def test_aggregate_ngram_overlaps(text_repository): + text_repository.create(TEXT) + chapters = [ + ChapterFactory.build( + signs=(f"{signs} {signs}",), + text_id=TextId(TEXT.genre, TEXT.category, TEXT.index), + ) + for signs in SIGNS + ] for chapter in chapters: text_repository.create_chapter(chapter) fragment = FragmentFactory.create(signs="TI BA ABZ11") + assert text_repository.aggregate_ngram_overlaps( ngrams_from_signs(fragment.signs, NGRAM_N_VALUES) ) == sorted( ( - { - **ChapterIdSchema().dump(chapter.id_), - "overlap": compute_overlap(fragment, chapter, NGRAM_N_VALUES), - } + ChapterNgramScoreSchema().dump( + ChapterNgramScore.of( + chapter.id_, + TEXT.name, + compute_ngram_score(fragment, chapter, NGRAM_N_VALUES), + ) + ) for chapter in chapters ), - key=lambda item: item["overlap"], + key=lambda item: item["score"], reverse=True, ) diff --git a/ebl/tests/fragmentarium/test_ngram_matcher_route.py b/ebl/tests/fragmentarium/test_ngram_matcher_route.py index 65bb0259d..3f0691944 100644 --- a/ebl/tests/fragmentarium/test_ngram_matcher_route.py +++ b/ebl/tests/fragmentarium/test_ngram_matcher_route.py @@ -1,15 +1,20 @@ from typing import List from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES -from ebl.corpus.application.id_schemas import ChapterIdSchema +from ebl.corpus.application.display_schemas import ChapterNgramScoreSchema from ebl.corpus.application.text_repository import TextRepository from ebl.corpus.domain.chapter import Chapter +from ebl.corpus.domain.chapter_display import ChapterNgramScore from ebl.fragmentarium.application.fragment_repository import FragmentRepository -from ebl.tests.common.ngram_test_support import compute_overlap -from ebl.tests.factories.corpus import ChapterFactory +from ebl.tests.common.ngram_test_support import compute_ngram_score +from ebl.corpus.domain.text import Text +from ebl.tests.factories.corpus import ChapterFactory, TextFactory from ebl.tests.factories.fragment import TransliteratedFragmentFactory import falcon +from ebl.transliteration.domain.text_id import TextId + SIGNS = ["X BA KU ABZ075", "KI DU ABZ411 BA MA TI\nX MU TA MA UD", "KU ABZ411 MA KI"] +TEXT: Text = TextFactory.build() def test_match_fragment_ngrams( @@ -19,7 +24,13 @@ def test_match_fragment_ngrams( ): fragment = TransliteratedFragmentFactory.build() fragment_id = fragment_repository.create(fragment) - chapters: List[Chapter] = [ChapterFactory.build(signs=(signs,)) for signs in SIGNS] + text_repository.create(TEXT) + chapters: List[Chapter] = [ + ChapterFactory.build( + signs=(signs,), text_id=TextId(TEXT.genre, TEXT.category, TEXT.index) + ) + for signs in SIGNS + ] for chapter in chapters: text_repository.create_chapter(chapter) @@ -29,12 +40,15 @@ def test_match_fragment_ngrams( assert result.status == falcon.HTTP_OK assert result.json == sorted( ( - { - **ChapterIdSchema().dump(chapter.id_), - "overlap": compute_overlap(fragment, chapter, NGRAM_N_VALUES), - } + ChapterNgramScoreSchema().dump( + ChapterNgramScore.of( + chapter.id_, + TEXT.name, + compute_ngram_score(fragment, chapter, NGRAM_N_VALUES), + ) + ) for chapter in chapters ), - key=lambda item: item["overlap"], + key=lambda item: item["score"], reverse=True, ) From 5c3ced2e7d3f8e4a83296a5b06d14f2efd03ffef Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 26 Sep 2023 11:09:02 +0000 Subject: [PATCH 24/30] fix typo --- ebl/corpus/application/display_schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ebl/corpus/application/display_schemas.py b/ebl/corpus/application/display_schemas.py index daf2a576d..0d08c9996 100644 --- a/ebl/corpus/application/display_schemas.py +++ b/ebl/corpus/application/display_schemas.py @@ -101,4 +101,4 @@ class ChapterNgramScoreSchema(ChapterIdSchema): @post_load def make_result(self, data: dict, **kwargs) -> ChapterNgramScore: - return ChapterNgramScore(*+data) + return ChapterNgramScore(**data) From f07201583e79a8b7d47c3db87219b209ee65388b Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 26 Sep 2023 11:16:15 +0000 Subject: [PATCH 25/30] bug fix --- ebl/corpus/domain/chapter_display.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ebl/corpus/domain/chapter_display.py b/ebl/corpus/domain/chapter_display.py index fa7cfa2d1..e4387040e 100644 --- a/ebl/corpus/domain/chapter_display.py +++ b/ebl/corpus/domain/chapter_display.py @@ -145,7 +145,8 @@ def of_chapter(text: Text, chapter: Chapter) -> "ChapterDisplay": class ChapterNgramScore(ChapterId): text_name: str score: float - + + @staticmethod def of(chapter_id: ChapterId, text_name: str, score: float) -> "ChapterNgramScore": return ChapterNgramScore( chapter_id.text_id, chapter_id.stage, chapter_id.name, text_name, score From f4e7f2a6ede14dd246187bfcc0c4812391bf3722 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 26 Sep 2023 11:18:24 +0000 Subject: [PATCH 26/30] reformat --- ebl/corpus/domain/chapter_display.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ebl/corpus/domain/chapter_display.py b/ebl/corpus/domain/chapter_display.py index e4387040e..490a10b66 100644 --- a/ebl/corpus/domain/chapter_display.py +++ b/ebl/corpus/domain/chapter_display.py @@ -145,7 +145,7 @@ def of_chapter(text: Text, chapter: Chapter) -> "ChapterDisplay": class ChapterNgramScore(ChapterId): text_name: str score: float - + @staticmethod def of(chapter_id: ChapterId, text_name: str, score: float) -> "ChapterNgramScore": return ChapterNgramScore( From e9d544906793fc48aa27b12379b2e30d9afb371c Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 26 Sep 2023 11:53:53 +0000 Subject: [PATCH 27/30] add None sign handling --- .../infrastructure/mongo_text_repository.py | 24 +++++++++++++++++-- .../corpus/test_mongo_text_repository.py | 5 +--- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/ebl/corpus/infrastructure/mongo_text_repository.py b/ebl/corpus/infrastructure/mongo_text_repository.py index 793fb9ae7..440326913 100644 --- a/ebl/corpus/infrastructure/mongo_text_repository.py +++ b/ebl/corpus/infrastructure/mongo_text_repository.py @@ -12,6 +12,7 @@ from ebl.common.query.util import ( drop_duplicates, extract_ngrams, + filter_array, flatten_field, replace_all, ) @@ -443,7 +444,19 @@ def _update_ngrams(self, id_: ChapterId) -> None: } } pipeline = [ - {"$set": {"ngrams": drop_duplicates(flatten_field(map_extract_ngrams))}}, + { + "$set": { + "ngrams": drop_duplicates( + flatten_field( + filter_array( + map_extract_ngrams, + "manuscriptSigns", + {"$ne": ["$$manuscriptSigns", None]}, + ) + ) + ) + } + }, ] self._chapters.update_one( @@ -454,6 +467,9 @@ def _update_ngrams(self, id_: ChapterId) -> None: def aggregate_ngram_overlaps( self, ngrams: Sequence[Sequence[str]], limit: Optional[int] = None ) -> Sequence[dict]: + if not ngrams: + raise ValueError("ngrams must not be empty") + ngram_list = list(ngrams) test_chapter_category = 99 pipeline: List[dict] = [ @@ -480,7 +496,7 @@ def aggregate_ngram_overlaps( "minLength": { "$min": [ {"$size": "$ngrams"}, - {"$size": [ngram_list]}, + len(ngram_list), ] }, }, @@ -502,4 +518,8 @@ def aggregate_ngram_overlaps( if limit: pipeline.append({"$limit": limit}) + import json + + print(json.dumps(pipeline, indent=2)) + return list(self._chapters.aggregate(pipeline)) diff --git a/ebl/tests/corpus/test_mongo_text_repository.py b/ebl/tests/corpus/test_mongo_text_repository.py index 2b63b20b1..a73b16953 100644 --- a/ebl/tests/corpus/test_mongo_text_repository.py +++ b/ebl/tests/corpus/test_mongo_text_repository.py @@ -503,10 +503,7 @@ def test_update_chapter_stores_ngrams(database, text_repository): updated_chapter = attr.evolve( CHAPTER, - signs=( - "X ABZ411 ABZ11 ABZ41", - "X X X TI BA", - ), + signs=("X ABZ411 ABZ11 ABZ41", "X X X TI BA", None), ) when_chapter_in_collection(database) From 49f1ad2af81855476c0f9cb47509325679eedbe2 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 26 Sep 2023 13:21:17 +0000 Subject: [PATCH 28/30] remove debug output --- ebl/corpus/infrastructure/mongo_text_repository.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ebl/corpus/infrastructure/mongo_text_repository.py b/ebl/corpus/infrastructure/mongo_text_repository.py index 440326913..97ad15928 100644 --- a/ebl/corpus/infrastructure/mongo_text_repository.py +++ b/ebl/corpus/infrastructure/mongo_text_repository.py @@ -518,8 +518,4 @@ def aggregate_ngram_overlaps( if limit: pipeline.append({"$limit": limit}) - import json - - print(json.dumps(pipeline, indent=2)) - return list(self._chapters.aggregate(pipeline)) From 0323d48be701549e042d2ce92547e0f506f897e3 Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 26 Sep 2023 13:21:59 +0000 Subject: [PATCH 29/30] add batch ngram update script --- ebl/io/corpus/update_ngrams.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 ebl/io/corpus/update_ngrams.py diff --git a/ebl/io/corpus/update_ngrams.py b/ebl/io/corpus/update_ngrams.py new file mode 100644 index 000000000..1c44b140a --- /dev/null +++ b/ebl/io/corpus/update_ngrams.py @@ -0,0 +1,25 @@ +import os + +from pymongo import MongoClient +from ebl.corpus.application.id_schemas import ChapterIdSchema + +from ebl.corpus.infrastructure.mongo_text_repository import MongoTextRepository +from tqdm import tqdm + + +client = MongoClient(os.environ["MONGODB_URI"]) +database = client.get_database(os.environ.get("MONGODB_DB")) + +text_repository = MongoTextRepository(database) + + +chapters_with_signs = [ + ChapterIdSchema().load(id_) + for id_ in text_repository._chapters.find_many( + {"signs": {"$exists": 1, "$ne": ""}}, + projection={"_id": False, "textId": True, "stage": True, "name": True}, + ) +] + +for id_ in tqdm(chapters_with_signs, total=len(chapters_with_signs)): + text_repository._update_ngrams(id_) From f7e79d751479c6494487bd2ee48d18f21ba45d0f Mon Sep 17 00:00:00 2001 From: fsimonjetz Date: Tue, 26 Sep 2023 13:56:19 +0000 Subject: [PATCH 30/30] add fragment batch ngram update --- ebl/io/corpus/update_ngrams.py | 61 +++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/ebl/io/corpus/update_ngrams.py b/ebl/io/corpus/update_ngrams.py index 1c44b140a..8f5f29904 100644 --- a/ebl/io/corpus/update_ngrams.py +++ b/ebl/io/corpus/update_ngrams.py @@ -1,25 +1,62 @@ import os from pymongo import MongoClient +from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES from ebl.corpus.application.id_schemas import ChapterIdSchema from ebl.corpus.infrastructure.mongo_text_repository import MongoTextRepository from tqdm import tqdm +from ebl.fragmentarium.infrastructure.mongo_fragment_repository import ( + MongoFragmentRepository, +) + +from ebl.transliteration.application.museum_number_schema import MuseumNumberSchema client = MongoClient(os.environ["MONGODB_URI"]) database = client.get_database(os.environ.get("MONGODB_DB")) -text_repository = MongoTextRepository(database) - +DO_CHAPTERS = False +DO_FRAGMENTS = True -chapters_with_signs = [ - ChapterIdSchema().load(id_) - for id_ in text_repository._chapters.find_many( - {"signs": {"$exists": 1, "$ne": ""}}, - projection={"_id": False, "textId": True, "stage": True, "name": True}, - ) -] - -for id_ in tqdm(chapters_with_signs, total=len(chapters_with_signs)): - text_repository._update_ngrams(id_) +text_repository = MongoTextRepository(database) +fragment_repository = MongoFragmentRepository(database) + + +def update_all_chapter_ngrams(): + chapters_with_signs = [ + ChapterIdSchema().load(id_) + for id_ in text_repository._chapters.find_many( + {"signs": {"$exists": 1}}, + projection={"_id": False, "textId": True, "stage": True, "name": True}, + ) + ] + + for id_ in tqdm(chapters_with_signs, total=len(chapters_with_signs)): + text_repository._update_ngrams(id_) + + +def update_all_fragment_ngrams(): + fragments_with_signs = [ + MuseumNumberSchema().load(fragment["museumNumber"]) + for fragment in fragment_repository._fragments.find_many( + {"signs": {"$exists": 1, "$ne": ""}, "ngrams": {"$exists": False}}, + projection={"museumNumber": True}, + ) + ] + for number in tqdm(fragments_with_signs, total=len(fragments_with_signs)): + fragment_repository._update_ngrams(number) + + +if __name__ == "__main__": + if DO_CHAPTERS: + print("Updating chapter ngrams with n ∈", NGRAM_N_VALUES) + update_all_chapter_ngrams() + + if DO_FRAGMENTS: + print( + "\nUpdating fragment ngrams with n ∈", + NGRAM_N_VALUES, + "(This may take a while.)", + ) + update_all_fragment_ngrams()