Skip to content

Commit

Permalink
Init refactor of custom classes to Pydantic
Browse files Browse the repository at this point in the history
Began moving code from `src/backend` into Pydantic schemas to allow validation.

Continue refactor, will need to run in notebook again, most likely as a test.

Refactor seems to cut down number of requested Stanza model downloads, need to check
  • Loading branch information
AaronWChen committed Aug 6, 2024
1 parent 8bdcdb8 commit af969dd
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 18 deletions.
File renamed without changes.
56 changes: 38 additions & 18 deletions schemas.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,44 @@
# BugBytes instructor likes to use the schemas.py file and add all the Pydantic schemas to that file
import re
from datetime import date
from enum import Enum

from pydantic import BaseModel
from pydantic.dataclasses import dataclass


class GenreURLChoices(Enum):
    """Closed set of genre slugs accepted in URLs."""

    ROCK = "rock"
    ELECTRONIC = "electronic"
    METAL = "metal"
    HIP_HOP = "hip-hop"
    SHOEGAZE = "shoegaze"
# BACKEND
#
@dataclass
class CustomSKLearnAnalyzer:
    """Allow sklearn text transformers to use a Stanza pipeline via a custom analyzer.

    The fields mirror the keyword arguments used when downloading and
    constructing a Stanza pipeline for ``stanza_lang_str``.
    """

    # Options: require a stanza_lang_str which will be used to download a Stanza model
    stanza_lang_str: str = "en"
    depparse_batch_size: int = 50
    depparse_min_length_to_batch_separately: int = 50
    verbose: bool = True
    use_gpu: bool = True
    batch_size: int = 100

    @classmethod
    def ngram_maker(cls, min_ngram_length: int, max_ngram_length: int):
        """Build an analyzer callable that yields word n-grams line by line.

        The returned ``ngrams_per_line(row)`` splits *row* on the literal
        sentinel ``" brk "``, tokenizes each line into words of at least two
        word characters, and yields every n-gram whose length is between
        ``min_ngram_length`` and ``max_ngram_length`` (inclusive).
        """

        def ngrams_per_line(row: str):
            for ln in row.split(" brk "):
                at_least_two_english_characters_whole_words = r"(?u)\b\w{2,}\b"
                terms = re.findall(at_least_two_english_characters_whole_words, ln)
                for ngram_length in range(min_ngram_length, max_ngram_length + 1):
                    # Generator expression keeps memory usage low versus
                    # materializing zip(*[terms[i:] for i in range(ngram_length)]).
                    for ngram in (
                        " ".join(terms[i : i + ngram_length])
                        for i in range(len(terms) - ngram_length + 1)
                    ):
                        yield ngram

        return ngrams_per_line

# TODO
# Is it possible to move the download of the model into the container creation, and require the Stanza model in this instantiation instead
# stanza_model: StanzaModel
# stanza documentation here (https://github.com/stanfordnlp/stanza-train) implies that the pretrained models are PyTorch models
# having some difficulty finding examples of creating a BaseModel from a PyTorch model. Switch to string option above
24 changes: 24 additions & 0 deletions schemas_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# BugBytes instructor likes to use the schemas.py file and add all the Pydantic schemas to that file
from datetime import date
from enum import Enum
from pydantic import BaseModel


class GenreURLChoices(Enum):
    """Closed set of genre slugs accepted in URLs."""

    ROCK = "rock"
    ELECTRONIC = "electronic"
    METAL = "metal"
    HIP_HOP = "hip-hop"
    SHOEGAZE = "shoegaze"


class Album(BaseModel):
    """A released album: title plus its release date."""

    title: str
    release_date: date


class Band(BaseModel):
    """A band and the albums it has released.

    ``albums`` defaults to an empty list; Pydantic copies field defaults per
    instance, so the mutable default is safe here (unlike a plain class).
    """

    id: int
    name: str
    # NOTE(review): free-form string — presumably one of the GenreURLChoices
    # values; confirm whether it should be typed as GenreURLChoices instead.
    genre: str
    albums: list[Album] = []

0 comments on commit af969dd

Please sign in to comment.