diff --git a/main.py b/main_example.py
similarity index 100%
rename from main.py
rename to main_example.py
diff --git a/schemas.py b/schemas.py
index 8e4a3d0..0292fb7 100644
--- a/schemas.py
+++ b/schemas.py
@@ -1,24 +1,49 @@
-# BugBytes instructor likes to use the schemas.py file and add all the Pydantic schemas to that file
-from datetime import date
-from enum import Enum
-from pydantic import BaseModel
+import re
+
+from pydantic.dataclasses import dataclass
 
 
-class GenreURLChoices(Enum):
-    ROCK = "rock"
-    ELECTRONIC = "electronic"
-    METAL = "metal"
-    HIP_HOP = "hip-hop"
-    SHOEGAZE = "shoegaze"
+# BACKEND
+#
+@dataclass
+class CustomSKLearnAnalyzer:
+    """
+    This class allows sklearn text transformers to incorporate a Stanza pipeline with a custom analyzer
+    """
+    # Options: require a stanza_lang_str which will be used to download a Stanza model
+    stanza_lang_str: str = "en"
+    depparse_batch_size: int = 50
+    depparse_min_length_to_batch_separately: int = 50
+    verbose: bool = True
+    use_gpu: bool = True
+    batch_size: int = 100
 
 
-class Album(BaseModel):
-    title: str
-    release_date: date
+    @classmethod
+    def ngram_maker(cls, min_ngram_length: int, max_ngram_length: int):
+        """
+        Return a generator function that yields word n-grams from a string.
+
+        The input is split on the literal " brk " marker and n-grams are built within each
+        segment only, from tokens of two or more word characters. Every length from
+        min_ngram_length to max_ngram_length (inclusive) is produced, tokens joined by a space.
+        """
+        # Hoisted out of the per-segment loop; (?u)\w matches any Unicode word character
+        two_or_more_word_characters = re.compile(r"(?u)\b\w{2,}\b")
+
+        def ngrams_per_line(row: str):
+            for ln in row.split(" brk "):
+                terms = two_or_more_word_characters.findall(ln)
+                for ngram_length in range(min_ngram_length, max_ngram_length + 1):
+                    # find and yield all ngrams of this length lazily
+                    for i in range(len(terms) - ngram_length + 1):
+                        yield " ".join(terms[i : i + ngram_length])
 
 
-class Band(BaseModel):
-    id: int
-    name: str
-    genre: str
-    albums: list[Album] = []
+        return ngrams_per_line
+
+    # TODO
+    # Is it possible to move the download of the model into the container creation, and require the Stanza model in this instantiation instead
+    # stanza_model: StanzaModel
+    # stanza documentation here (https://github.com/stanfordnlp/stanza-train) implies that the pretrained models are PyTorch models
+    # having some difficulty finding examples of creating a BaseModel from a PyTorch model. Switch to string option above
diff --git a/schemas_example.py b/schemas_example.py
new file mode 100644
index 0000000..8e4a3d0
--- /dev/null
+++ b/schemas_example.py
@@ -0,0 +1,24 @@
+# BugBytes instructor likes to use the schemas.py file and add all the Pydantic schemas to that file
+from datetime import date
+from enum import Enum
+from pydantic import BaseModel
+
+
+class GenreURLChoices(Enum):
+    ROCK = "rock"
+    ELECTRONIC = "electronic"
+    METAL = "metal"
+    HIP_HOP = "hip-hop"
+    SHOEGAZE = "shoegaze"
+
+
+class Album(BaseModel):
+    title: str
+    release_date: date
+
+
+class Band(BaseModel):
+    id: int
+    name: str
+    genre: str
+    albums: list[Album] = []