diff --git a/ebl/common/infrastructure/ngrams.py b/ebl/common/infrastructure/ngrams.py
new file mode 100644
index 000000000..71a624575
--- /dev/null
+++ b/ebl/common/infrastructure/ngrams.py
@@ -0,0 +1 @@
+NGRAM_N_VALUES = [1, 2, 3]
diff --git a/ebl/common/query/util.py b/ebl/common/query/util.py
index ee71de18c..b3590d9d2 100644
--- a/ebl/common/query/util.py
+++ b/ebl/common/query/util.py
@@ -1,4 +1,4 @@
-from typing import Union, Dict
+from typing import Union, Dict, Sequence
 
 
 def flatten_field(input_: Union[str, Dict], depth=1) -> Dict:
@@ -15,9 +15,9 @@ def drop_duplicates(input_: Union[str, Dict]) -> Dict:
     return {"$setUnion": [input_, []]}
 
 
-def ngrams(input_: Union[str, Dict], n) -> Dict:
-    if n <= 1:
-        raise ValueError("ngram size must be 2 or more")
+def ngrams(input_: Union[str, Dict], n: int) -> Dict:
+    if n <= 0:
+        raise ValueError("ngram size must be 1 or more")
     return {
         "$zip": {
             "inputs": [
@@ -39,3 +39,41 @@ def ngrams(input_: Union[str, Dict], n) -> Dict:
 
 def filter_array(input_, as_, cond) -> Dict:
     return {"$filter": {"input": input_, "as": as_, "cond": cond}}
+
+
+def extract_ngrams(
+    input_: Union[str, Dict],
+    N: Sequence[int],
+):
+    signs_to_exclude = ["X", ""]
+
+    exclude_empty = {
+        "$eq": [
+            {
+                "$size": {
+                    "$setIntersection": [
+                        "$$this",
+                        signs_to_exclude,
+                    ]
+                }
+            },
+            0,
+        ]
+    }
+    return drop_duplicates(
+        filter_array(
+            {"$concatArrays": [ngrams(input_, n) for n in N if n > 0]},
+            "this",
+            exclude_empty,
+        )
+    )
+
+
+def replace_all(input_: Union[str, Dict], old: str, new: str):
+    return {
+        "$replaceAll": {
+            "input": input_,
+            "find": old,
+            "replacement": new,
+        }
+    }
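Note on the helpers above: extract_ngrams composes ngrams (the classic zip-based
n-gram construction, expressed with $zip), $concatArrays, filter_array, and
drop_duplicates into a single aggregation expression. A minimal pure-Python
sketch of the intended semantics (extract_ngrams_py is a hypothetical name used
for illustration, not part of the PR):

    from itertools import chain
    from typing import Sequence, Set, Tuple


    def extract_ngrams_py(signs: str, N: Sequence[int]) -> Set[Tuple[str, ...]]:
        # Newlines become "#" tokens, matching replace_all("$signs", "\n", " # ").
        tokens = signs.replace("\n", " # ").split()
        grams = chain.from_iterable(
            zip(*(tokens[i:] for i in range(n))) for n in N if n > 0
        )
        # Drop any n-gram that touches an unreadable ("X") or empty sign.
        return {gram for gram in grams if not {"X", ""} & set(gram)}


    # extract_ngrams_py("MA ŠU X\nTI BA", [1, 2]) ->
    # {("MA",), ("ŠU",), ("#",), ("TI",), ("BA",),
    #  ("MA", "ŠU"), ("#", "TI"), ("TI", "BA")}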
diff --git a/ebl/corpus/application/display_schemas.py b/ebl/corpus/application/display_schemas.py
index d6db5c44d..0d08c9996 100644
--- a/ebl/corpus/application/display_schemas.py
+++ b/ebl/corpus/application/display_schemas.py
@@ -3,7 +3,11 @@
 from ebl.corpus.application.id_schemas import ChapterIdSchema
 from ebl.corpus.application.record_schemas import RecordSchema
 from ebl.corpus.application.schemas import LineVariantSchema, ManuscriptSchema
-from ebl.corpus.domain.chapter_display import ChapterDisplay, LineDisplay
+from ebl.corpus.domain.chapter_display import (
+    ChapterDisplay,
+    ChapterNgramScore,
+    LineDisplay,
+)
 from ebl.corpus.domain.record import Record
 from ebl.transliteration.application.line_number_schemas import (
     OneOfLineNumberSchema,
@@ -89,3 +93,12 @@ def add_line_indexes(self, data: dict, **kwargs) -> dict:
         ]
 
         return data
+
+
+class ChapterNgramScoreSchema(ChapterIdSchema):
+    text_name = fields.String(required=True, data_key="textName")
+    score = fields.Float(required=True)
+
+    @post_load
+    def make_result(self, data: dict, **kwargs) -> ChapterNgramScore:
+        return ChapterNgramScore(**data)
diff --git a/ebl/corpus/application/schemas.py b/ebl/corpus/application/schemas.py
index 2ef36b71e..53bc63653 100644
--- a/ebl/corpus/application/schemas.py
+++ b/ebl/corpus/application/schemas.py
@@ -1,5 +1,6 @@
 from ebl.corpus.domain.provenance import Provenance
 from marshmallow import (
+    EXCLUDE,
     Schema,
     ValidationError,
     fields,
@@ -260,6 +261,9 @@ class DictionaryLinePaginationSchema(Schema):
 
 
 class ChapterSchema(Schema):
+    class Meta:
+        unknown = EXCLUDE
+
     text_id = fields.Nested(TextIdSchema, required=True, data_key="textId")
     classification = ValueEnumField(Classification, required=True)
     stage = ValueEnumField(Stage, required=True)
diff --git a/ebl/corpus/application/text_repository.py b/ebl/corpus/application/text_repository.py
index a1ed7fd61..f217ad2ca 100644
--- a/ebl/corpus/application/text_repository.py
+++ b/ebl/corpus/application/text_repository.py
@@ -85,3 +85,9 @@ def query_manuscripts_with_joins_by_chapter(
     @abstractmethod
     def query(self, query: dict) -> CorpusQueryResult:
         ...
+
+    @abstractmethod
+    def aggregate_ngram_overlaps(
+        self, ngrams: Sequence[Sequence[str]], limit: Optional[int] = None
+    ) -> Sequence[dict]:
+        ...
diff --git a/ebl/corpus/domain/chapter_display.py b/ebl/corpus/domain/chapter_display.py
index 948208924..490a10b66 100644
--- a/ebl/corpus/domain/chapter_display.py
+++ b/ebl/corpus/domain/chapter_display.py
@@ -139,3 +139,15 @@ def of_chapter(text: Text, chapter: Chapter) -> "ChapterDisplay":
             chapter.record,
             chapter.manuscripts,
         )
+
+
+@attr.s(frozen=True, auto_attribs=True)
+class ChapterNgramScore(ChapterId):
+    text_name: str
+    score: float
+
+    @staticmethod
+    def of(chapter_id: ChapterId, text_name: str, score: float) -> "ChapterNgramScore":
+        return ChapterNgramScore(
+            chapter_id.text_id, chapter_id.stage, chapter_id.name, text_name, score
+        )
diff --git a/ebl/corpus/infrastructure/mongo_text_repository.py b/ebl/corpus/infrastructure/mongo_text_repository.py
index 1c61fccef..97ad15928 100644
--- a/ebl/corpus/infrastructure/mongo_text_repository.py
+++ b/ebl/corpus/infrastructure/mongo_text_repository.py
@@ -6,8 +6,16 @@
 
 from ebl.bibliography.infrastructure.bibliography import join_reference_documents
 
+from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES
 from ebl.common.query.query_result import CorpusQueryResult
 from ebl.common.query.query_schemas import CorpusQueryResultSchema
+from ebl.common.query.util import (
+    drop_duplicates,
+    extract_ngrams,
+    filter_array,
+    flatten_field,
+    replace_all,
+)
 from ebl.corpus.application.text_repository import TextRepository
 from ebl.corpus.application.display_schemas import ChapterDisplaySchema
 from ebl.corpus.application.schemas import (
@@ -37,7 +45,7 @@
     chapter_id_query,
     join_chapters,
     join_text,
-    join_text_title,
+    join_text_names,
 )
 from ebl.errors import NotFoundError
 from ebl.fragmentarium.infrastructure.queries import is_in_fragmentarium, join_joins
@@ -109,6 +117,7 @@ def create(self, text: Text) -> None:
 
     def create_chapter(self, chapter: Chapter) -> None:
         self._chapters.insert_one(ChapterSchema().dump(chapter))
+        self._update_ngrams(chapter.id_)
 
     def find(self, id_: TextId) -> Text:
         try:
@@ -243,6 +252,7 @@ def update(self, id_: ChapterId, chapter: Chapter) -> None:
                 ).dump(chapter)
             },
         )
+        self._update_ngrams(id_)
 
     def query_by_transliteration(
         self, query: TransliterationQuery, pagination_index: int
@@ -252,35 +262,8 @@ def query_by_transliteration(
         cursor = self._chapters.aggregate(
             [
                 {"$match": mongo_query},
-                {
-                    "$lookup": {
-                        "from": "texts",
-                        "let": {
-                            "chapterGenre": "$textId.genre",
-                            "chapterCategory": "$textId.category",
-                            "chapterIndex": "$textId.index",
-                        },
-                        "pipeline": [
-                            {
-                                "$match": {
-                                    "$expr": {
-                                        "$and": [
-                                            {"$eq": ["$genre", "$$chapterGenre"]},
-                                            {"$eq": ["$category", "$$chapterCategory"]},
-                                            {"$eq": ["$index", "$$chapterIndex"]},
-                                        ]
-                                    }
-                                }
-                            },
-                            {"$project": {"name": 1, "_id": 0}},
-                        ],
-                        "as": "textNames",
-                    }
-                },
+                *join_text_names(),
                 {"$project": {"_id": False}},
-                {"$addFields": {"textName": {"$first": "$textNames"}}},
-                {"$addFields": {"textName": "$textName.name"}},
-                {"$project": {"textNames": False}},
                 {"$skip": LIMIT * pagination_index},
                 {"$limit": LIMIT},
             ],
@@ -333,12 +316,12 @@ def query_by_lemma(
                 },
                 {"$unwind": "$lines"},
                 {"$match": lemma_query},
-                join_text_title(),
+                *join_text_names(),
                 filter_manuscripts_by_lemma(lemma),
                 {
                     "$project": {
                         "textId": True,
-                        "textName": {"$first": "$textName.name"},
+                        "textName": True,
                         "chapterName": "$name",
                         "stage": True,
                         "line": "$lines",
@@ -449,3 +432,90 @@ def query_corpus_by_manuscript(
             ]
         )
         return ManuscriptAttestationSchema().load(cursor, many=True)
+
+    def _update_ngrams(self, id_: ChapterId) -> None:
+        map_extract_ngrams = {
+            "$map": {
+                "input": "$signs",
+                "in": extract_ngrams(
+                    {"$split": [replace_all("$$this", "\n", " # "), " "]},
+                    NGRAM_N_VALUES,
+                ),
+            }
+        }
+        pipeline = [
+            {
+                "$set": {
+                    "ngrams": drop_duplicates(
+                        flatten_field(
+                            filter_array(
+                                map_extract_ngrams,
+                                "manuscriptSigns",
+                                {"$ne": ["$$manuscriptSigns", None]},
+                            )
+                        )
+                    )
+                }
+            },
+        ]
+
+        self._chapters.update_one(
+            chapter_id_query(id_),
+            pipeline,
+        )
+
+    def aggregate_ngram_overlaps(
+        self, ngrams: Sequence[Sequence[str]], limit: Optional[int] = None
+    ) -> Sequence[dict]:
+        if not ngrams:
+            raise ValueError("ngrams must not be empty")
+
+        ngram_list = list(ngrams)
+        test_chapter_category = 99
+        pipeline: List[dict] = [
+            {
+                "$match": {
+                    "textId.category": {"$ne": test_chapter_category},
+                    "ngrams": {"$exists": True, "$not": {"$size": 0}},
+                }
+            },
+            {
+                "$project": {
+                    "_id": 0,
+                    "textId": 1,
+                    "name": 1,
+                    "stage": 1,
+                    "score": {
+                        "$let": {
+                            "vars": {
+                                "intersection": {
+                                    "$size": {
+                                        "$setIntersection": ["$ngrams", ngram_list]
+                                    }
+                                },
+                                "minLength": {
+                                    "$min": [
+                                        {"$size": "$ngrams"},
+                                        len(ngram_list),
+                                    ]
+                                },
+                            },
+                            "in": {
+                                "$cond": [
+                                    {"$eq": ["$$minLength", 0]},
+                                    0.0,
+                                    {"$divide": ["$$intersection", "$$minLength"]},
+                                ]
+                            },
+                        }
+                    },
+                }
+            },
+            *join_text_names(),
+            {"$sort": {"score": -1}},
+        ]
+
+        if limit:
+            pipeline.append({"$limit": limit})
+
+        return list(self._chapters.aggregate(pipeline))
diff --git a/ebl/corpus/infrastructure/queries.py b/ebl/corpus/infrastructure/queries.py
index 39de8e23f..c37fe7ca0 100644
--- a/ebl/corpus/infrastructure/queries.py
+++ b/ebl/corpus/infrastructure/queries.py
@@ -187,29 +187,28 @@ def join_text() -> List[dict]:
     ]
 
 
-def join_text_title() -> dict:
-    return {
-        "$lookup": {
-            "from": "texts",
-            "let": {
-                "genre": "$textId.genre",
-                "category": "$textId.category",
-                "index": "$textId.index",
-            },
-            "pipeline": [
-                {
-                    "$match": {
-                        "$expr": {
-                            "$and": [
-                                {"$eq": ["$genre", "$$genre"]},
-                                {"$eq": ["$category", "$$category"]},
-                                {"$eq": ["$index", "$$index"]},
-                            ]
-                        }
-                    }
-                },
-                {"$project": {"_id": False, "name": True}},
-            ],
-            "as": "textName",
-        }
-    }
+def join_text_names() -> List[dict]:
+    return [
+        {
+            "$lookup": {
+                "from": "texts",
+                "let": {"textId": "$textId"},
+                "pipeline": [
+                    {
+                        "$match": {
+                            "$expr": {
+                                "$and": [
+                                    {"$eq": ["$genre", "$$textId.genre"]},
+                                    {"$eq": ["$category", "$$textId.category"]},
+                                    {"$eq": ["$index", "$$textId.index"]},
+                                ]
+                            }
+                        }
+                    },
+                    {"$project": {"name": 1, "_id": 0}},
+                ],
+                "as": "textName",
+            }
+        },
+        {"$addFields": {"textName": {"$first": "$textName.name"}}},
+    ]
diff --git a/ebl/fragmentarium/application/fragment_repository.py b/ebl/fragmentarium/application/fragment_repository.py
index f88efee6d..9911120d7 100644
--- a/ebl/fragmentarium/application/fragment_repository.py
+++ b/ebl/fragmentarium/application/fragment_repository.py
@@ -106,3 +106,7 @@ def fetch_date(self, number: MuseumNumber) -> Optional[Date]:
     @abstractmethod
     def list_all_fragments(self) -> Sequence[str]:
         ...
+
+    @abstractmethod
+    def get_ngrams(self, number: MuseumNumber) -> Sequence[Sequence[str]]:
+        ...
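The score field computed by aggregate_ngram_overlaps above is the overlap
(Szymkiewicz-Simpson) coefficient between the fragment's n-gram set F and each
chapter's n-gram set C, guarded against division by zero. The same computation
in plain Python (hypothetical helper; the sets below are illustrative):

    def overlap_coefficient(F: set, C: set) -> float:
        # Mirrors the $let / $cond / $divide expression in the pipeline.
        return len(F & C) / min(len(F), len(C)) if F and C else 0.0


    # F = {("TI",), ("BA",), ("TI", "BA")}
    # C = {("TI",), ("BA",), ("KU",), ("TI", "BA"), ("BA", "KU")}
    # overlap_coefficient(F, C) == 3 / 3 == 1.0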
diff --git a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py
index 33494a7eb..15fd4fd29 100644
--- a/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py
+++ b/ebl/fragmentarium/infrastructure/mongo_fragment_repository.py
@@ -6,8 +6,10 @@
 
 from ebl.bibliography.infrastructure.bibliography import join_reference_documents
 from ebl.common.domain.scopes import Scope
+from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES
 from ebl.common.query.query_result import QueryResult
 from ebl.common.query.query_schemas import QueryResultSchema
+from ebl.common.query.util import extract_ngrams, replace_all
 from ebl.errors import NotFoundError
 from ebl.fragmentarium.application.fragment_info_schema import FragmentInfoSchema
 from ebl.fragmentarium.application.fragment_repository import FragmentRepository
@@ -103,13 +105,16 @@ def count_lines(self):
         return 0
 
     def create(self, fragment, sort_key=None):
-        return self._fragments.insert_one(
+        id_ = self._fragments.insert_one(
             {
                 "_id": str(fragment.number),
                 **FragmentSchema(exclude=["joins"]).dump(fragment),
                 **({} if sort_key is None else {"_sortKey": sort_key}),
             }
         )
+        self._update_ngrams(fragment.number)
+
+        return id_
 
     def create_many(self, fragments: Sequence[Fragment]) -> Sequence[str]:
         schema = FragmentSchema(exclude=["joins"])
@@ -295,6 +300,9 @@ def update_field(self, field, fragment):
             {"$set": query if query else {field: None}},
         )
 
+        if field == "transliteration":
+            self._update_ngrams(fragment.number)
+
     def query_next_and_previous_folio(self, folio_name, folio_number, number):
         sort_ascending = {"$sort": {"key": 1}}
         sort_descending = {"$sort": {"key": -1}}
@@ -342,15 +350,6 @@ def get_numbers(pipeline):
         else:
             return result
 
-    def query_museum_numbers(self, prefix: str, number_regex: str) -> Sequence[dict]:
-        return self._fragments.find_many(
-            {
-                "museumNumber.prefix": prefix,
-                "museumNumber.number": {"$regex": number_regex},
-            },
-            projection={"museumNumber": True},
-        )
-
     def query_by_sort_key(self, key: int) -> MuseumNumber:
         if key < 0:
             last_fragment = next(
@@ -423,3 +422,23 @@ def list_all_fragments(
         return list(
             self._fragments.get_all_values("_id", match_user_scopes(user_scopes))
         )
+
+    def _update_ngrams(self, number: MuseumNumber):
+        self._fragments.update_one(
+            museum_number_is(number),
+            [
+                {
+                    "$set": {
+                        "ngrams": extract_ngrams(
+                            {"$split": [replace_all("$signs", "\n", " # "), " "]},
+                            NGRAM_N_VALUES,
+                        )
+                    }
+                },
+            ],
+        )
+
+    def get_ngrams(self, number: MuseumNumber) -> Sequence[Sequence[str]]:
+        return self._fragments.find_one(
+            museum_number_is(number), projection={"ngrams": True}
+        )["ngrams"]
diff --git a/ebl/fragmentarium/web/bootstrap.py b/ebl/fragmentarium/web/bootstrap.py
index 442cba654..d23f4a3e2 100644
--- a/ebl/fragmentarium/web/bootstrap.py
+++ b/ebl/fragmentarium/web/bootstrap.py
@@ -10,6 +10,7 @@
 from ebl.fragmentarium.web.folio_pager import FolioPagerResource
 from ebl.fragmentarium.web.folios import FoliosResource
 from ebl.fragmentarium.web.fragment_genre import FragmentGenreResource
+from ebl.fragmentarium.web.fragment_ngram_matcher import NgramMatcherResource
 from ebl.fragmentarium.web.fragment_script import FragmentScriptResource
 from ebl.fragmentarium.web.fragment_date import (
     FragmentDateResource,
@@ -85,6 +86,9 @@ def create_fragmentarium_routes(api: falcon.App, context: Context):
     fragment_query = FragmentsQueryResource(
         context.fragment_repository, context.get_transliteration_query_factory()
     )
+    ngram_matcher = NgramMatcherResource(
+        context.fragment_repository, context.text_repository
+    )
     genres = GenresResource()
     periods = PeriodsResource()
     lemmatization = LemmatizationResource(updater)
@@ -124,6 +128,7 @@ def create_fragmentarium_routes(api: falcon.App, context: Context):
         ("/fragments/{number}/annotations", annotations),
         ("/fragments/{number}/photo", photo),
         ("/fragments/{number}/corpus", chapters),
+        ("/fragments/{number}/ngrams", ngram_matcher),
         ("/genres", genres),
         ("/periods", periods),
         ("/statistics", statistics),
diff --git a/ebl/fragmentarium/web/fragment_ngram_matcher.py b/ebl/fragmentarium/web/fragment_ngram_matcher.py
new file mode 100644
index 000000000..d86e26a14
--- /dev/null
+++ b/ebl/fragmentarium/web/fragment_ngram_matcher.py
@@ -0,0 +1,19 @@
+from ebl.corpus.application.text_repository import TextRepository
+from ebl.fragmentarium.application.fragment_repository import FragmentRepository
+from falcon import Request, Response
+
+from ebl.transliteration.domain.museum_number import MuseumNumber
+
+
+class NgramMatcherResource:
+    def __init__(
+        self,
+        fragment_repository: FragmentRepository,
+        text_repository: TextRepository,
+    ):
+        self._fragment_repository = fragment_repository
+        self._text_repository = text_repository
+
+    def on_get(self, _req: Request, resp: Response, number: str) -> None:
+        ngrams = self._fragment_repository.get_ngrams(MuseumNumber.of(number))
+        resp.media = self._text_repository.aggregate_ngram_overlaps(ngrams)
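The resource above completes the feature: GET /fragments/{number}/ngrams loads
the fragment's precomputed n-grams and returns the chapters ranked by overlap
score. A sketch of the exchange (the museum number and all values are
illustrative; response items are ChapterNgramScoreSchema dumps):

    # GET /fragments/K.1/ngrams
    # -> [
    #      {"textId": {...}, "stage": "...", "name": "...",
    #       "textName": "...", "score": 0.42},
    #      ...
    #    ]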
diff --git a/ebl/io/corpus/update_ngrams.py b/ebl/io/corpus/update_ngrams.py
new file mode 100644
index 000000000..8f5f29904
--- /dev/null
+++ b/ebl/io/corpus/update_ngrams.py
@@ -0,0 +1,62 @@
+import os
+
+from pymongo import MongoClient
+from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES
+from ebl.corpus.application.id_schemas import ChapterIdSchema
+
+from ebl.corpus.infrastructure.mongo_text_repository import MongoTextRepository
+from tqdm import tqdm
+
+from ebl.fragmentarium.infrastructure.mongo_fragment_repository import (
+    MongoFragmentRepository,
+)
+
+from ebl.transliteration.application.museum_number_schema import MuseumNumberSchema
+
+client = MongoClient(os.environ["MONGODB_URI"])
+database = client.get_database(os.environ.get("MONGODB_DB"))
+
+DO_CHAPTERS = False
+DO_FRAGMENTS = True
+
+text_repository = MongoTextRepository(database)
+fragment_repository = MongoFragmentRepository(database)
+
+
+def update_all_chapter_ngrams():
+    chapters_with_signs = [
+        ChapterIdSchema().load(id_)
+        for id_ in text_repository._chapters.find_many(
+            {"signs": {"$exists": 1}},
+            projection={"_id": False, "textId": True, "stage": True, "name": True},
+        )
+    ]
+
+    for id_ in tqdm(chapters_with_signs, total=len(chapters_with_signs)):
+        text_repository._update_ngrams(id_)
+
+
+def update_all_fragment_ngrams():
+    fragments_with_signs = [
+        MuseumNumberSchema().load(fragment["museumNumber"])
+        for fragment in fragment_repository._fragments.find_many(
+            {"signs": {"$exists": 1, "$ne": ""}, "ngrams": {"$exists": False}},
+            projection={"museumNumber": True},
+        )
+    ]
+    for number in tqdm(fragments_with_signs, total=len(fragments_with_signs)):
+        fragment_repository._update_ngrams(number)
+
+
+if __name__ == "__main__":
+    if DO_CHAPTERS:
+        print("Updating chapter ngrams with n ∈", NGRAM_N_VALUES)
+        update_all_chapter_ngrams()
+
+    if DO_FRAGMENTS:
+        print(
+            "\nUpdating fragment ngrams with n ∈",
+            NGRAM_N_VALUES,
+            "(This may take a while.)",
+        )
+        update_all_fragment_ngrams()
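The backfill script above reads MONGODB_URI and MONGODB_DB from the environment
and is toggled by the DO_CHAPTERS and DO_FRAGMENTS flags. Given the file
location, a plausible invocation (the exact command is an assumption, not
spelled out in the PR) would be:

    # MONGODB_URI=... MONGODB_DB=... python -m ebl.io.corpus.update_ngrams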
diff --git a/ebl/io/fragments/importer.py b/ebl/io/fragments/importer.py
index bdc49051a..6840865e4 100644
--- a/ebl/io/fragments/importer.py
+++ b/ebl/io/fragments/importer.py
@@ -6,6 +6,8 @@
 from marshmallow import ValidationError
 from pymongo import MongoClient
 import pymongo
+from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES
+from ebl.common.query.util import extract_ngrams, replace_all
 from ebl.fragmentarium.application.fragment_schema import FragmentSchema
 from ebl.fragmentarium.infrastructure.fragment_search_aggregations import (
     sort_by_museum_number,
@@ -114,6 +116,24 @@ def create_sort_index(fragments_collection: MongoCollection) -> None:
     fragments_collection.create_index([("_sortKey", pymongo.ASCENDING)])
 
 
+def _create_ngrams(fragments_collection: MongoCollection, fragments: dict) -> None:
+    print(f"Extracting n-grams from {len(fragments)} fragment(s)...")
+    numbers = [fragment["_id"] for fragment in fragments.values()]
+    fragments_collection.update_many(
+        {"_id": {"$in": numbers}},
+        [
+            {
+                "$set": {
+                    "ngrams": extract_ngrams(
+                        {"$split": [replace_all("$signs", "\n", " # "), " "]},
+                        NGRAM_N_VALUES,
+                    )
+                }
+            }
+        ],
+    )
+
+
 def write_to_db(
     fragments: Sequence[dict], fragments_collection: MongoCollection
 ) -> List:
@@ -129,7 +149,7 @@ def write_to_tsv(
     writer = csv.writer(csvfile, delimiter="\t")  # pyre-ignore[6]
     if column_names:
         writer.writerow(column_names)
-    writer.writerows(FAILS)
+    writer.writerows(data)
 
 
 if __name__ == "__main__":
@@ -265,7 +285,7 @@ def _reindex_database(collection, db):
 
     if FAILS:
         print(
-            f"Skipping {fail_count} document(s), see {os.path.abspath(error_file)} for details"
+            f"Skipping {fail_count} document(s), see {os.path.abspath(error_file)} for details."
         )
         write_to_tsv(error_file, FAILS, ["file", "error"])
@@ -296,27 +316,28 @@ def _reindex_database(collection, db):
     if input(prompt) != passphrase:
         sys.exit("Aborting.")
 
-    fragments = {
+    valid_fragments = {
         filename: data
         for filename, data in fragments.items()
         if filename not in set(file for file, _ in FAILS)
     }
 
     result = write_to_db(
-        list(fragments.values()),
+        list(valid_fragments.values()),
         COLLECTION,
     )
 
     print("Result:")
     print(result)
 
+    _create_ngrams(COLLECTION, valid_fragments)
     _reindex_database(COLLECTION, args.database)
 
     write_to_tsv(
         summary_file,
-        [[data["_id"], filename] for filename, data in fragments.items()],
+        [[data["_id"], filename] for filename, data in valid_fragments.items()],
         ["id", "file"],
     )
 
     print(
-        f"Done! See {os.path.abspath(summary_file)} for a summary of added documents",
+        f"Done! See {os.path.abspath(summary_file)} for a summary of added documents.",
     )
diff --git a/ebl/tests/common/ngram_test_support.py b/ebl/tests/common/ngram_test_support.py
new file mode 100644
index 000000000..e4c6390a2
--- /dev/null
+++ b/ebl/tests/common/ngram_test_support.py
@@ -0,0 +1,32 @@
+from typing import Sequence, Set, Tuple, TypeVar, Optional
+from ebl.corpus.domain.chapter import Chapter
+from ebl.fragmentarium.domain.fragment import Fragment
+
+T = TypeVar("T")
+
+
+def _ngrams(sequence: Sequence[T], n: int) -> Set[Tuple[T]]:
+    return set(zip(*(sequence[i:] for i in range(n))))
+
+
+def ngrams_from_signs(signs: str, N: Sequence[int]) -> Set[Tuple[str]]:
+    split_signs = signs.replace("\n", " # ").split()
+    all_ngrams = set.union(*(_ngrams(split_signs, n) for n in N))
+    return {ngram for ngram in all_ngrams if "X" not in ngram}
+
+
+def chapter_ngrams_from_signs(
+    chapter_signs: Sequence[Optional[str]], N: Sequence[int]
+) -> Set[Tuple[str]]:
+    return set.union(
+        *(ngrams_from_signs(signs, N) for signs in chapter_signs if signs is not None)
+    )
+
+
+def compute_ngram_score(
+    fragment: Fragment, chapter: Chapter, N: Sequence[int]
+) -> float:
+    F = ngrams_from_signs(fragment.signs, N)
+    C = chapter_ngrams_from_signs(chapter.signs, N)
+
+    return (len(F & C) / min(len(F), len(C))) if F and C else 0.0
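Note that ngrams_from_signs only filters out n-grams containing "X", whereas the
server-side extract_ngrams also excludes the empty string; the two agree because
str.split() with no arguments never yields empty tokens. A quick sanity check of
the helper:

    # ngrams_from_signs("X X X TI BA", [1, 2]) drops every n-gram containing "X":
    # -> {("TI",), ("BA",), ("TI", "BA")}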
diff --git a/ebl/tests/corpus/test_corpus_query_route.py b/ebl/tests/corpus/test_corpus_query_route.py
index 1a9bb2e68..60b0bd1e1 100644
--- a/ebl/tests/corpus/test_corpus_query_route.py
+++ b/ebl/tests/corpus/test_corpus_query_route.py
@@ -5,7 +5,7 @@
 from ebl.corpus.application.id_schemas import TextIdSchema
 from ebl.corpus.domain.chapter import Chapter
 from ebl.dictionary.domain.word import WordId
-from ebl.tests.corpus.test_mongo_text_repository import LITERATURE_TEXT
+from ebl.tests.corpus.test_mongo_text_repository import LITERATURE_TEXT, SIGNS
 from ebl.tests.factories.corpus import (
     ChapterFactory,
     LineFactory,
@@ -122,11 +122,6 @@ def test_query_chapter_lemmas(
     }
 
 
-SIGNS = [
-    "X ABZ411 ABZ11 ABZ41",
-    "X X X TI BA",
-    "MA ŠU X\nTI BA X",
-]
 MANUSCRIPTS = ManuscriptFactory.build_batch(3)
 VARIANT_LINES = [
     [
diff --git a/ebl/tests/corpus/test_mongo_text_repository.py b/ebl/tests/corpus/test_mongo_text_repository.py
index 578c3ea3c..a73b16953 100644
--- a/ebl/tests/corpus/test_mongo_text_repository.py
+++ b/ebl/tests/corpus/test_mongo_text_repository.py
@@ -1,11 +1,13 @@
 from typing import Sequence
 import attr
 import pytest
+from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES
+from ebl.corpus.application.display_schemas import ChapterNgramScoreSchema
 from ebl.corpus.application.text_repository import TextRepository
 from ebl.corpus.application.schemas import ChapterSchema, TextSchema
 from ebl.corpus.domain.chapter import Chapter
-from ebl.corpus.domain.chapter_display import ChapterDisplay
+from ebl.corpus.domain.chapter_display import ChapterDisplay, ChapterNgramScore
 from ebl.corpus.domain.dictionary_line import DictionaryLine
 from ebl.corpus.domain.text import Text, UncertainFragment
 from ebl.dictionary.domain.word import WordId
@@ -13,6 +15,11 @@
 from ebl.fragmentarium.application.joins_schema import JoinSchema
 from ebl.fragmentarium.domain.fragment import Fragment
 from ebl.fragmentarium.domain.joins import Join, Joins
+from ebl.tests.common.ngram_test_support import (
+    chapter_ngrams_from_signs,
+    compute_ngram_score,
+    ngrams_from_signs,
+)
 from ebl.tests.factories.corpus import (
     ChapterFactory,
     LineFactory,
@@ -138,6 +145,11 @@
         ),
     ),
 )
+SIGNS = [
+    "X ABZ411 ABZ11 ABZ41",
+    "X X X TI BA",
+    "MA ŠU X\nTI BA X",
+]
 
 
 def when_text_in_collection(database, text=TEXT) -> None:
@@ -167,7 +179,7 @@ def test_creating_chapter(database, text_repository) -> None:
             "stage": CHAPTER.stage.value,
             "name": CHAPTER.name,
         },
-        projection={"_id": False},
+        projection={"_id": False, "ngrams": False},
     )
 
     assert inserted_chapter == ChapterSchema().dump(CHAPTER)
@@ -466,3 +478,81 @@ def test_query_corpus_by_manuscript(database, text_repository) -> None:
     assert text_repository.query_corpus_by_manuscript(
         [CHAPTER.manuscripts[0].museum_number]
     ) == [expected_manuscript_attestation]
+
+
+def test_create_chapter_stores_ngrams(database, text_repository):
+    text_repository.create_chapter(CHAPTER)
+
+    data = database[CHAPTERS_COLLECTION].find_one(
+        {
+            "textId.category": CHAPTER.text_id.category,
+            "textId.index": CHAPTER.text_id.index,
+            "stage": CHAPTER.stage.value,
+            "name": CHAPTER.name,
+        },
+        projection={"_id": False, "ngrams": True},
+    )
+
+    assert set(map(tuple, data["ngrams"])) == chapter_ngrams_from_signs(
+        CHAPTER.signs, NGRAM_N_VALUES
+    )
+
+
+def test_update_chapter_stores_ngrams(database, text_repository):
+    text_repository.create_chapter(CHAPTER)
+
+    updated_chapter = attr.evolve(
+        CHAPTER,
+        signs=("X ABZ411 ABZ11 ABZ41", "X X X TI BA", None),
+    )
+
+    when_chapter_in_collection(database)
+
+    text_repository.update(CHAPTER.id_, updated_chapter)
+
+    data = database[CHAPTERS_COLLECTION].find_one(
+        {
+            "textId.category": CHAPTER.text_id.category,
+            "textId.index": CHAPTER.text_id.index,
+            "stage": CHAPTER.stage.value,
+            "name": CHAPTER.name,
+        },
+        projection={"_id": False, "ngrams": True},
+    )
+
+    assert set(map(tuple, data["ngrams"])) == chapter_ngrams_from_signs(
+        updated_chapter.signs, NGRAM_N_VALUES
+    )
+
+
+def test_aggregate_ngram_overlaps(text_repository):
+    text_repository.create(TEXT)
+    chapters = [
+        ChapterFactory.build(
+            signs=(f"{signs} {signs}",),
+            text_id=TextId(TEXT.genre, TEXT.category, TEXT.index),
+        )
+        for signs in SIGNS
+    ]
+
+    for chapter in chapters:
+        text_repository.create_chapter(chapter)
+
+    fragment = FragmentFactory.create(signs="TI BA ABZ11")
+
+    assert text_repository.aggregate_ngram_overlaps(
+        ngrams_from_signs(fragment.signs, NGRAM_N_VALUES)
+    ) == sorted(
+        (
+            ChapterNgramScoreSchema().dump(
+                ChapterNgramScore.of(
+                    chapter.id_,
+                    TEXT.name,
+                    compute_ngram_score(fragment, chapter, NGRAM_N_VALUES),
+                )
+            )
+            for chapter in chapters
+        ),
+        key=lambda item: item["score"],
+        reverse=True,
+    )
diff --git a/ebl/tests/fragmentarium/test_fragment_repository.py b/ebl/tests/fragmentarium/test_fragment_repository.py
index e4817c8af..2744e4845 100644
--- a/ebl/tests/fragmentarium/test_fragment_repository.py
+++ b/ebl/tests/fragmentarium/test_fragment_repository.py
@@ -4,6 +4,7 @@
 import random
 from ebl.common.domain.period import Period, PeriodModifier
 from ebl.common.domain.scopes import Scope
+from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES
 from ebl.common.query.query_result import QueryItem, QueryResult
 from ebl.dictionary.domain.word import WordId
 
@@ -26,6 +27,7 @@
 from ebl.fragmentarium.domain.transliteration_update import TransliterationUpdate
 from ebl.common.query.query_result import LemmaQueryType
 from ebl.lemmatization.domain.lemmatization import Lemmatization, LemmatizationToken
+from ebl.tests.common.ngram_test_support import ngrams_from_signs
 from ebl.tests.factories.bibliography import ReferenceFactory
 from ebl.tests.factories.fragment import (
     FragmentFactory,
@@ -50,6 +52,7 @@
 from ebl.common.query.query_schemas import QueryResultSchema
 from ebl.tests.fragmentarium.test_fragments_search_route import query_item_of
 from ebl.transliteration.application.sign_repository import SignRepository
+from ebl.transliteration.infrastructure.queries import museum_number_is
 
 COLLECTION = "fragments"
 
@@ -110,7 +113,7 @@ def test_create(database, fragment_repository):
 
     assert fragment_id == str(fragment.number)
     assert database[COLLECTION].find_one(
-        {"_id": fragment_id}, projection={"_id": False}
+        {"_id": fragment_id}, projection={"_id": False, "ngrams": False}
     ) == FragmentSchema(exclude=["joins"]).dump(fragment)
 
 
@@ -1003,3 +1006,40 @@ def test_query_genres(fragment_repository, query, expected):
     )
 
     assert fragment_repository.query({"genre": query}) == expected_result
+
+
+def test_create_fragment_stores_ngrams(fragment_repository):
+    fragment = TransliteratedFragmentFactory.build()
+
+    fragment_repository.create(fragment)
+    data = fragment_repository._fragments.find_one(museum_number_is(fragment.number))
+
+    assert set(map(tuple, data["ngrams"])) == ngrams_from_signs(
+        fragment.signs, NGRAM_N_VALUES
+    )
+
+
+def test_get_ngrams(fragment_repository):
+    fragment = TransliteratedFragmentFactory.build()
+    fragment_repository.create(fragment)
+
+    assert set(
+        map(tuple, fragment_repository.get_ngrams(fragment.number))
+    ) == ngrams_from_signs(fragment.signs, NGRAM_N_VALUES)
+
+
+def test_update_transliteration_updates_ngrams(fragment_repository, user):
+    fragment = TransliteratedFragmentFactory.build(signs="")
+    fragment_repository.create(fragment)
+
+    updated_fragment = fragment.update_transliteration(
+        TransliterationUpdate(parse_atf_lark("1. X MU TA MA UD MI KU")), user
+    )
+
+    fragment_repository.update_field("transliteration", updated_fragment)
+
+    data = fragment_repository._fragments.find_one(museum_number_is(fragment.number))
+
+    assert set(map(tuple, data["ngrams"])) == ngrams_from_signs(
+        updated_fragment.signs, NGRAM_N_VALUES
+    )
diff --git a/ebl/tests/fragmentarium/test_ngram_matcher_route.py b/ebl/tests/fragmentarium/test_ngram_matcher_route.py
new file mode 100644
index 000000000..3f0691944
--- /dev/null
+++ b/ebl/tests/fragmentarium/test_ngram_matcher_route.py
@@ -0,0 +1,54 @@
+from typing import List
+from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES
+from ebl.corpus.application.display_schemas import ChapterNgramScoreSchema
+from ebl.corpus.application.text_repository import TextRepository
+from ebl.corpus.domain.chapter import Chapter
+from ebl.corpus.domain.chapter_display import ChapterNgramScore
+from ebl.fragmentarium.application.fragment_repository import FragmentRepository
+from ebl.tests.common.ngram_test_support import compute_ngram_score
+from ebl.corpus.domain.text import Text
+from ebl.tests.factories.corpus import ChapterFactory, TextFactory
+from ebl.tests.factories.fragment import TransliteratedFragmentFactory
+import falcon
+
+from ebl.transliteration.domain.text_id import TextId
+
+SIGNS = ["X BA KU ABZ075", "KI DU ABZ411 BA MA TI\nX MU TA MA UD", "KU ABZ411 MA KI"]
+TEXT: Text = TextFactory.build()
+
+
+def test_match_fragment_ngrams(
+    client,
+    fragment_repository: FragmentRepository,
+    text_repository: TextRepository,
+):
+    fragment = TransliteratedFragmentFactory.build()
+    fragment_id = fragment_repository.create(fragment)
+    text_repository.create(TEXT)
+    chapters: List[Chapter] = [
+        ChapterFactory.build(
+            signs=(signs,), text_id=TextId(TEXT.genre, TEXT.category, TEXT.index)
+        )
+        for signs in SIGNS
+    ]
+
+    for chapter in chapters:
+        text_repository.create_chapter(chapter)
+
+    result = client.simulate_get(f"/fragments/{fragment_id}/ngrams")
+
+    assert result.status == falcon.HTTP_OK
+    assert result.json == sorted(
+        (
+            ChapterNgramScoreSchema().dump(
+                ChapterNgramScore.of(
+                    chapter.id_,
+                    TEXT.name,
+                    compute_ngram_score(fragment, chapter, NGRAM_N_VALUES),
+                )
+            )
+            for chapter in chapters
+        ),
+        key=lambda item: item["score"],
+        reverse=True,
+    )