Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ngram endpoints #458

Open
wants to merge 30 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
ab60932
add ngram utility functions
fsimonjetz Sep 21, 2023
8fdf215
add ngram global
fsimonjetz Sep 21, 2023
c7a7dad
add update_ngrams method
fsimonjetz Sep 21, 2023
93a1e4e
add helper functions for ngram tests
fsimonjetz Sep 21, 2023
0c06bd9
add tests for fragment ngrams
fsimonjetz Sep 21, 2023
7370db2
simplify function
fsimonjetz Sep 22, 2023
22a43ed
refactoring
fsimonjetz Sep 22, 2023
4d7f7c9
add _update_ngrams
fsimonjetz Sep 22, 2023
be0eb35
allow ngram field
fsimonjetz Sep 22, 2023
bdeeecd
add chapter ngram tests
fsimonjetz Sep 22, 2023
532555f
update importer to compute ngrams
fsimonjetz Sep 25, 2023
e6cba94
refactoring
fsimonjetz Sep 25, 2023
d4e7733
add NgramMatchResource
fsimonjetz Sep 25, 2023
24cde92
refactoring
fsimonjetz Sep 25, 2023
ba8e399
add text repo
fsimonjetz Sep 25, 2023
d5b74bc
move aggregate ngrams into TextRepo
fsimonjetz Sep 25, 2023
87cb373
add get_ngrams
fsimonjetz Sep 25, 2023
73c00b5
refactor
fsimonjetz Sep 25, 2023
bb3cfd9
add test_match_fragment_ngrams
fsimonjetz Sep 25, 2023
7142502
include textName in ngram output
fsimonjetz Sep 25, 2023
562051f
add test_get_ngrams
fsimonjetz Sep 26, 2023
8b571cc
add test_aggregate_ngram_overlaps
fsimonjetz Sep 26, 2023
a6df055
add ChapterNgramScore and schema
fsimonjetz Sep 26, 2023
5c3ced2
fix typo
fsimonjetz Sep 26, 2023
f072015
bug fix
fsimonjetz Sep 26, 2023
f4e7f2a
reformat
fsimonjetz Sep 26, 2023
e9d5449
add None sign handling
fsimonjetz Sep 26, 2023
49f1ad2
remove debug output
fsimonjetz Sep 26, 2023
0323d48
add batch ngram update script
fsimonjetz Sep 26, 2023
f7e79d7
add fragment batch ngram update
fsimonjetz Sep 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ebl/common/infrastructure/ngrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# N-gram sizes that are pre-computed and stored (uni-, bi-, and trigrams).
NGRAM_N_VALUES = [1, 2, 3]
46 changes: 42 additions & 4 deletions ebl/common/query/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Union, Dict
from typing import Union, Dict, Sequence


def flatten_field(input_: Union[str, Dict], depth=1) -> Dict:
Expand All @@ -15,9 +15,9 @@ def drop_duplicates(input_: Union[str, Dict]) -> Dict:
return {"$setUnion": [input_, []]}


def ngrams(input_: Union[str, Dict], n) -> Dict:
if n <= 1:
raise ValueError("ngram size must be 2 or more")
def ngrams(input_: Union[str, Dict], n: int) -> Dict:
if n <= 0:
raise ValueError("ngram size must be 1 or more")
return {
"$zip": {
"inputs": [
Expand All @@ -39,3 +39,41 @@ def ngrams(input_: Union[str, Dict], n) -> Dict:

def filter_array(input_, as_, cond) -> Dict:
    """Build a MongoDB ``$filter`` aggregation expression.

    ``as_`` names the per-element variable and ``cond`` is the predicate
    expression evaluated for each element of ``input_``.
    """
    filter_spec = {"input": input_, "as": as_, "cond": cond}
    return {"$filter": filter_spec}


def extract_ngrams(
    input_: Union[str, Dict],
    N: Sequence[int],
) -> Dict:
    """Build a MongoDB expression extracting all n-grams of the sizes in *N*.

    Non-positive sizes in *N* are silently skipped.  N-grams containing an
    "X" (unreadable sign) or an empty-string token are filtered out, and
    duplicates are removed from the result.
    """
    # Tokens that make an n-gram useless for matching.
    signs_to_exclude = ["X", ""]

    # True iff the n-gram shares no element with signs_to_exclude.
    exclude_empty = {
        "$eq": [
            {
                "$size": {
                    "$setIntersection": [
                        "$$this",
                        signs_to_exclude,
                    ]
                }
            },
            0,
        ]
    }
    return drop_duplicates(
        filter_array(
            {"$concatArrays": [ngrams(input_, n) for n in N if n > 0]},
            "this",
            exclude_empty,
        )
    )


def replace_all(input_: Union[str, Dict], old: str, new: str) -> Dict:
    """Build a MongoDB ``$replaceAll`` expression.

    Replaces every occurrence of *old* in *input_* with *new*.
    """
    return {
        "$replaceAll": {
            "input": input_,
            "find": old,
            "replacement": new,
        }
    }
15 changes: 14 additions & 1 deletion ebl/corpus/application/display_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
from ebl.corpus.application.id_schemas import ChapterIdSchema
from ebl.corpus.application.record_schemas import RecordSchema
from ebl.corpus.application.schemas import LineVariantSchema, ManuscriptSchema
from ebl.corpus.domain.chapter_display import ChapterDisplay, LineDisplay
from ebl.corpus.domain.chapter_display import (
ChapterDisplay,
ChapterNgramScore,
LineDisplay,
)
from ebl.corpus.domain.record import Record
from ebl.transliteration.application.line_number_schemas import (
OneOfLineNumberSchema,
Expand Down Expand Up @@ -89,3 +93,12 @@ def add_line_indexes(self, data: dict, **kwargs) -> dict:
]

return data


class ChapterNgramScoreSchema(ChapterIdSchema):
    """(De)serializes a ChapterNgramScore: the chapter id fields inherited
    from ChapterIdSchema plus the text name and the n-gram overlap score."""

    text_name = fields.String(required=True, data_key="textName")
    score = fields.Float(required=True)

    @post_load
    def make_result(self, data: dict, **kwargs) -> ChapterNgramScore:
        # Build the frozen domain object from the validated payload.
        return ChapterNgramScore(**data)
4 changes: 4 additions & 0 deletions ebl/corpus/application/schemas.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ebl.corpus.domain.provenance import Provenance
from marshmallow import (
EXCLUDE,
Schema,
ValidationError,
fields,
Expand Down Expand Up @@ -260,6 +261,9 @@ class DictionaryLinePaginationSchema(Schema):


class ChapterSchema(Schema):
class Meta:
unknown = EXCLUDE

text_id = fields.Nested(TextIdSchema, required=True, data_key="textId")
classification = ValueEnumField(Classification, required=True)
stage = ValueEnumField(Stage, required=True)
Expand Down
6 changes: 6 additions & 0 deletions ebl/corpus/application/text_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,9 @@
@abstractmethod
def query(self, query: dict) -> CorpusQueryResult:
...

@abstractmethod
def aggregate_ngram_overlaps(
    self, ngrams: Sequence[Sequence[str]], limit: Optional[int] = None
) -> Sequence[dict]:
    """Score chapters by their n-gram overlap with *ngrams*.

    Returns score documents sorted by descending score, at most *limit*
    of them when *limit* is given.
    """
    ...
Dismissed Show dismissed Hide dismissed
12 changes: 12 additions & 0 deletions ebl/corpus/domain/chapter_display.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,15 @@ def of_chapter(text: Text, chapter: Chapter) -> "ChapterDisplay":
chapter.record,
chapter.manuscripts,
)


@attr.s(frozen=True, auto_attribs=True)
class ChapterNgramScore(ChapterId):
    """A chapter's n-gram overlap score: the chapter id extended with the
    name of the text it belongs to and the computed score."""

    text_name: str
    score: float

    @staticmethod
    def of(chapter_id: ChapterId, text_name: str, score: float) -> "ChapterNgramScore":
        """Build a score record from an existing ChapterId."""
        return ChapterNgramScore(
            chapter_id.text_id,
            chapter_id.stage,
            chapter_id.name,
            text_name,
            score,
        )
132 changes: 101 additions & 31 deletions ebl/corpus/infrastructure/mongo_text_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,16 @@


from ebl.bibliography.infrastructure.bibliography import join_reference_documents
from ebl.common.infrastructure.ngrams import NGRAM_N_VALUES
from ebl.common.query.query_result import CorpusQueryResult
from ebl.common.query.query_schemas import CorpusQueryResultSchema
from ebl.common.query.util import (
drop_duplicates,
extract_ngrams,
filter_array,
flatten_field,
replace_all,
)
from ebl.corpus.application.text_repository import TextRepository
from ebl.corpus.application.display_schemas import ChapterDisplaySchema
from ebl.corpus.application.schemas import (
Expand Down Expand Up @@ -37,7 +45,7 @@
chapter_id_query,
join_chapters,
join_text,
join_text_title,
join_text_names,
)
from ebl.errors import NotFoundError
from ebl.fragmentarium.infrastructure.queries import is_in_fragmentarium, join_joins
Expand Down Expand Up @@ -109,6 +117,7 @@ def create(self, text: Text) -> None:

def create_chapter(self, chapter: Chapter) -> None:
    """Insert *chapter* into the chapters collection and index its n-grams."""
    serialized = ChapterSchema().dump(chapter)
    self._chapters.insert_one(serialized)
    self._update_ngrams(chapter.id_)

def find(self, id_: TextId) -> Text:
try:
Expand Down Expand Up @@ -243,6 +252,7 @@ def update(self, id_: ChapterId, chapter: Chapter) -> None:
).dump(chapter)
},
)
self._update_ngrams(id_)

def query_by_transliteration(
self, query: TransliterationQuery, pagination_index: int
Expand All @@ -252,35 +262,8 @@ def query_by_transliteration(
cursor = self._chapters.aggregate(
[
{"$match": mongo_query},
{
"$lookup": {
"from": "texts",
"let": {
"chapterGenre": "$textId.genre",
"chapterCategory": "$textId.category",
"chapterIndex": "$textId.index",
},
"pipeline": [
{
"$match": {
"$expr": {
"$and": [
{"$eq": ["$genre", "$$chapterGenre"]},
{"$eq": ["$category", "$$chapterCategory"]},
{"$eq": ["$index", "$$chapterIndex"]},
]
}
}
},
{"$project": {"name": 1, "_id": 0}},
],
"as": "textNames",
}
},
*join_text_names(),
{"$project": {"_id": False}},
{"$addFields": {"textName": {"$first": "$textNames"}}},
{"$addFields": {"textName": "$textName.name"}},
{"$project": {"textNames": False}},
{"$skip": LIMIT * pagination_index},
{"$limit": LIMIT},
],
Expand Down Expand Up @@ -333,12 +316,12 @@ def query_by_lemma(
},
{"$unwind": "$lines"},
{"$match": lemma_query},
join_text_title(),
*join_text_names(),
filter_manuscripts_by_lemma(lemma),
{
"$project": {
"textId": True,
"textName": {"$first": "$textName.name"},
"textName": True,
"chapterName": "$name",
"stage": True,
"line": "$lines",
Expand Down Expand Up @@ -449,3 +432,90 @@ def query_corpus_by_manuscript(
]
)
return ManuscriptAttestationSchema().load(cursor, many=True)

def _update_ngrams(self, id_: ChapterId) -> None:
    """Recompute and store the ``ngrams`` field of the chapter *id_*.

    Each entry of the chapter's ``signs`` array is normalized (line breaks
    become " # " separators), split on spaces, and turned into n-grams of
    all configured sizes; the deduplicated union over all entries is
    written back via an update pipeline.

    TODO(review): colophon lines are currently included in the n-grams —
    confirm whether they should be excluded.
    """
    # Per signs entry: normalize line breaks, tokenize, extract n-grams.
    ngrams_per_manuscript = {
        "$map": {
            "input": "$signs",
            "in": extract_ngrams(
                {"$split": [replace_all("$$this", "\n", " # "), " "]},
                NGRAM_N_VALUES,
            ),
        }
    }
    # Drop null results before flattening and deduplicating.
    update_pipeline = [
        {
            "$set": {
                "ngrams": drop_duplicates(
                    flatten_field(
                        filter_array(
                            ngrams_per_manuscript,
                            "manuscriptSigns",
                            {"$ne": ["$$manuscriptSigns", None]},
                        )
                    )
                )
            }
        },
    ]

    self._chapters.update_one(
        chapter_id_query(id_),
        update_pipeline,
    )

def aggregate_ngram_overlaps(
    self, ngrams: Sequence[Sequence[str]], limit: Optional[int] = None
) -> Sequence[dict]:
    """Score every chapter by n-gram overlap with *ngrams*.

    The score is |intersection| / min(|chapter n-grams|, |query n-grams|)
    and results are sorted by descending score.  Chapters in the test
    category (99) or without stored n-grams are excluded.

    Raises ValueError when *ngrams* is empty.
    """
    if not ngrams:
        raise ValueError("ngrams must not be empty")

    candidate_ngrams = list(ngrams)
    test_chapter_category = 99  # reserved for test data; never surfaced

    match_stage = {
        "$match": {
            "textId.category": {"$ne": test_chapter_category},
            "ngrams": {"$exists": True, "$not": {"$size": 0}},
        }
    }
    score_expression = {
        "$let": {
            "vars": {
                "intersection": {
                    "$size": {"$setIntersection": ["$ngrams", candidate_ngrams]}
                },
                "minLength": {
                    "$min": [
                        {"$size": "$ngrams"},
                        len(candidate_ngrams),
                    ]
                },
            },
            # Guard against division by zero for degenerate inputs.
            "in": {
                "$cond": [
                    {"$eq": ["$$minLength", 0]},
                    0.0,
                    {"$divide": ["$$intersection", "$$minLength"]},
                ]
            },
        }
    }
    project_stage = {
        "$project": {
            "_id": 0,
            "textId": 1,
            "name": 1,
            "stage": 1,
            "score": score_expression,
        }
    }
    pipeline: List[dict] = [
        match_stage,
        project_stage,
        *join_text_names(),
        {"$sort": {"score": -1}},
    ]

    if limit:
        pipeline.append({"$limit": limit})

    return list(self._chapters.aggregate(pipeline))
49 changes: 24 additions & 25 deletions ebl/corpus/infrastructure/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,29 +187,28 @@ def join_text() -> List[dict]:
]


def join_text_title() -> dict:
return {
"$lookup": {
"from": "texts",
"let": {
"genre": "$textId.genre",
"category": "$textId.category",
"index": "$textId.index",
},
"pipeline": [
{
"$match": {
"$expr": {
"$and": [
{"$eq": ["$genre", "$$genre"]},
{"$eq": ["$category", "$$category"]},
{"$eq": ["$index", "$$index"]},
]
def join_text_names() -> List[dict]:
    """Pipeline stages that resolve each chapter's text name.

    Joins against the ``texts`` collection on the chapter's textId
    (genre, category, index) and adds the first matching name as a
    flat ``textName`` field.
    """
    id_fields = ["genre", "category", "index"]
    match_text = {
        "$match": {
            "$expr": {
                "$and": [
                    {"$eq": [f"${field}", f"$$textId.{field}"]}
                    for field in id_fields
                ]
            }
        }
    }
    lookup_stage = {
        "$lookup": {
            "from": "texts",
            "let": {"textId": "$textId"},
            "pipeline": [
                match_text,
                {"$project": {"_id": False, "name": True}},
            ],
            "as": "textName",
        }
    }
    flatten_stage = {"$addFields": {"textName": {"$first": "$textName.name"}}}
    return [lookup_stage, flatten_stage]
4 changes: 4 additions & 0 deletions ebl/fragmentarium/application/fragment_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,7 @@
@abstractmethod
def list_all_fragments(self) -> Sequence[str]:
...

@abstractmethod
def get_ngrams(self, number: MuseumNumber) -> Sequence[Sequence[str]]:
    """Return the stored sign n-grams of the fragment identified by *number*."""
    ...
Dismissed Show dismissed Hide dismissed
Loading
Loading