-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Init refactor of custom classes to Pydantic
Began moving code from `src\backend` into Pydantic schemas to allow validation. The refactor is still in progress and will need to be run in the notebook again, most likely as a test. The refactor also seems to cut down the number of requested Stanza model downloads; this still needs to be checked.
- Loading branch information
1 parent
8bdcdb8
commit af969dd
Showing
3 changed files
with
62 additions
and
18 deletions.
There are no files selected for viewing
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,24 +1,44 @@ | ||
# BugBytes instructor likes to use the schemas.py file and add all the Pydantic schemas to that file | ||
from datetime import date | ||
from enum import Enum | ||
from pydantic import BaseModel | ||
from pydantic.dataclasses import dataclass | ||
|
||
|
||
class GenreURLChoices(Enum): | ||
ROCK = "rock" | ||
ELECTRONIC = "electronic" | ||
METAL = "metal" | ||
HIP_HOP = "hip-hop" | ||
SHOEGAZE = "shoegaze" | ||
# BACKEND | ||
# | ||
@dataclass | ||
class CustomSKLearnAnalyzer(dataclass): | ||
""" | ||
This class allows sklearn text transformers to incorporate a Stanza pipeline with a custom analyzer | ||
""" | ||
|
||
# Options: require a stanza_lang_str which will be used to download a Stanza model | ||
stanza_lang_str: str = "en" | ||
depparse_batch_size: int = 50 | ||
depparse_min_length_to_batch_separately: int = 50 | ||
verbose: bool = True | ||
use_gpu: bool = True | ||
batch_size: int = 100 | ||
|
||
class Album(BaseModel): | ||
title: str | ||
release_date: date | ||
@classmethod | ||
def ngram_maker(self, min_ngram_length: int, max_ngram_length: int): | ||
def ngrams_per_line(row: str): | ||
for ln in row.split(" brk "): | ||
at_least_two_english_characters_whole_words = r"(?u)\b\w{2,}\b" | ||
terms = re.findall(at_least_two_english_characters_whole_words, ln) | ||
for ngram_length in range(min_ngram_length, max_ngram_length + 1): | ||
|
||
# find and return all ngrams | ||
# for ngram in zip(*[terms[i:] for i in range(3)]): | ||
# <-- solution without a generator (works the same but has higher memory usage) | ||
for ngram in ( | ||
word | ||
for i in range(len(terms) - ngram_length + 1) | ||
for word in (" ".join(terms[i : i + ngram_length]),) | ||
): | ||
yield ngram | ||
|
||
class Band(BaseModel): | ||
id: int | ||
name: str | ||
genre: str | ||
albums: list[Album] = [] | ||
return ngrams_per_line | ||
|
||
# TODO | ||
# Is it possible to move the download of the Stanza model into the container creation, and require the Stanza model itself in this instantiation instead? | ||
# stanza_model: StanzaModel | ||
# The Stanza documentation (https://github.com/stanfordnlp/stanza-train) implies that the pretrained models are PyTorch models. | ||
# Having some difficulty finding examples of creating a BaseModel from a PyTorch model, so switch to the string option (stanza_lang_str) above for now. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# BugBytes instructor likes to use the schemas.py file and add all the Pydantic schemas to that file | ||
from datetime import date | ||
from enum import Enum | ||
from pydantic import BaseModel | ||
|
||
|
||
class GenreURLChoices(Enum): | ||
ROCK = "rock" | ||
ELECTRONIC = "electronic" | ||
METAL = "metal" | ||
HIP_HOP = "hip-hop" | ||
SHOEGAZE = "shoegaze" | ||
|
||
|
||
class Album(BaseModel): | ||
title: str | ||
release_date: date | ||
|
||
|
||
class Band(BaseModel): | ||
id: int | ||
name: str | ||
genre: str | ||
albums: list[Album] = [] |