Skip to content

Commit

Permalink
Init refactor of custom classes to Pydantic
Browse files Browse the repository at this point in the history
Began moving code from `src/backend` into Pydantic schemas to allow validation.

Continue refactor, will need to run in notebook again, most likely as a test.

Refactor seems to cut down number of requested Stanza model downloads, need to check
  • Loading branch information
AaronWChen committed Aug 6, 2024
1 parent 8bdcdb8 commit af969dd
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 18 deletions.
File renamed without changes.
56 changes: 38 additions & 18 deletions schemas.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,44 @@
# BugBytes instructor likes to use the schemas.py file and add all the Pydantic schemas to that file
import re
from datetime import date
from enum import Enum

from pydantic import BaseModel
from pydantic.dataclasses import dataclass


class GenreURLChoices(Enum):
    """Closed set of genre slugs accepted in URLs."""

    ROCK = "rock"
    ELECTRONIC = "electronic"
    METAL = "metal"
    HIP_HOP = "hip-hop"
    SHOEGAZE = "shoegaze"
# BACKEND
#
@dataclass
class CustomSKLearnAnalyzer:
    """Allow sklearn text transformers to use a Stanza pipeline via a custom analyzer.

    The fields mirror the keyword arguments used when downloading and
    constructing a Stanza pipeline for ``stanza_lang_str``.
    """

    # Options: require a stanza_lang_str which will be used to download a Stanza model
    stanza_lang_str: str = "en"
    depparse_batch_size: int = 50
    depparse_min_length_to_batch_separately: int = 50
    verbose: bool = True
    use_gpu: bool = True
    batch_size: int = 100

    @classmethod
    def ngram_maker(cls, min_ngram_length: int, max_ngram_length: int):
        """Build an analyzer callable that yields word n-grams line by line.

        The returned ``ngrams_per_line(row)`` splits *row* on the literal
        sentinel ``" brk "``, tokenizes each line into words of at least two
        word characters, and yields every n-gram whose length is between
        ``min_ngram_length`` and ``max_ngram_length`` (inclusive).
        """

        def ngrams_per_line(row: str):
            for ln in row.split(" brk "):
                at_least_two_english_characters_whole_words = r"(?u)\b\w{2,}\b"
                terms = re.findall(at_least_two_english_characters_whole_words, ln)
                for ngram_length in range(min_ngram_length, max_ngram_length + 1):
                    # Generator expression keeps memory usage low versus
                    # materializing zip(*[terms[i:] for i in range(ngram_length)]).
                    for ngram in (
                        " ".join(terms[i : i + ngram_length])
                        for i in range(len(terms) - ngram_length + 1)
                    ):
                        yield ngram

        return ngrams_per_line

# TODO
# Is it possible to move the download of the model into the container creation, and require the Stanza model in this instantiation instead
# stanza_model: StanzaModel
# stanza documentation here (https://github.com/stanfordnlp/stanza-train) implies that the pretrained models are PyTorch models
# having some difficulty finding examples of creating a BaseModel from a PyTorch model. Switch to string option above
24 changes: 24 additions & 0 deletions schemas_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# BugBytes instructor likes to use the schemas.py file and add all the Pydantic schemas to that file
from datetime import date
from enum import Enum
from pydantic import BaseModel


class GenreURLChoices(Enum):
    """Closed set of genre slugs accepted in URLs."""

    ROCK = "rock"
    ELECTRONIC = "electronic"
    METAL = "metal"
    HIP_HOP = "hip-hop"
    SHOEGAZE = "shoegaze"


class Album(BaseModel):
    """A released album: title plus its release date."""

    title: str
    release_date: date


class Band(BaseModel):
    """A band and the albums it has released.

    ``albums`` defaults to an empty list; Pydantic copies field defaults per
    instance, so the mutable default is safe here (unlike a plain class).
    """

    id: int
    name: str
    # NOTE(review): free-form string — presumably one of the GenreURLChoices
    # values; confirm whether it should be typed as GenreURLChoices instead.
    genre: str
    albums: list[Album] = []

0 comments on commit af969dd

Please sign in to comment.