[REVIEW] Switch Models to use Crossfit #58

Merged
43 commits merged on May 21, 2024
Changes from 1 commit

Commits (43)
9f1c6fe
Working Domain Classifier with Crossfit
VibhuJawa May 9, 2024
db6fc48
Black on file
VibhuJawa May 9, 2024
b09fc93
fix model path
VibhuJawa May 9, 2024
accea76
style fixes
VibhuJawa May 9, 2024
b6615f0
style fixes
VibhuJawa May 9, 2024
3bb6b73
style fixes
VibhuJawa May 9, 2024
22c2e57
style fixes
VibhuJawa May 9, 2024
b43c906
style fixes
VibhuJawa May 9, 2024
9f6960b
Notebook with dask cuda cluster
VibhuJawa May 9, 2024
8139dc9
Notebook with dask cuda cluster
VibhuJawa May 9, 2024
507eab5
wip benchmark
VibhuJawa May 9, 2024
16f4689
First pass at switching to quality clasifier
VibhuJawa May 16, 2024
c863ce0
Quality Classifier working
VibhuJawa May 16, 2024
3f20781
nb fix
VibhuJawa May 16, 2024
f80f49e
Make both classifiers work with labelling
VibhuJawa May 16, 2024
09670ba
Revert domain_api_example.py to main
VibhuJawa May 16, 2024
3423748
domain classifier
VibhuJawa May 16, 2024
509c85b
Classifier update
VibhuJawa May 16, 2024
97264b0
Added setup.py
VibhuJawa May 16, 2024
3ed74b1
Add crossfit to cpu install
VibhuJawa May 16, 2024
d3e2fad
Address Reviews
VibhuJawa May 20, 2024
399712e
Remove keep_prob_column
VibhuJawa May 20, 2024
b1e2f9a
Remove distributed_data_classification module
VibhuJawa May 20, 2024
7d7d866
Address Sarah's review about read_json
VibhuJawa May 20, 2024
8c46541
Working Domain Classifier with Crossfit
VibhuJawa May 9, 2024
3856ea9
Black on file
VibhuJawa May 9, 2024
5cffb0b
style fixes
VibhuJawa May 9, 2024
1acf345
style fixes
VibhuJawa May 9, 2024
0e42106
First pass at switching to quality clasifier
VibhuJawa May 16, 2024
d010a24
Quality Classifier working
VibhuJawa May 16, 2024
a1b867d
nb fix
VibhuJawa May 16, 2024
9b1cd79
Make both classifiers work with labelling
VibhuJawa May 16, 2024
97e89a0
Revert domain_api_example.py to main
VibhuJawa May 16, 2024
c181b94
Added setup.py
VibhuJawa May 16, 2024
24d046d
Add crossfit to cpu install
VibhuJawa May 16, 2024
cc3088a
Address Reviews
VibhuJawa May 20, 2024
63af368
Fix conflicts from rebase
VibhuJawa May 20, 2024
ecd95d4
Update tutorials/distributed_data_classification/distributed_data_cla…
VibhuJawa May 20, 2024
02fe2ef
Update tutorials/distributed_data_classification/distributed_data_cla…
VibhuJawa May 20, 2024
ca713c8
Update based on reviews
VibhuJawa May 21, 2024
1f39839
Update setup.py based on reviews
VibhuJawa May 21, 2024
c6b5d9e
Add -output-file-type
VibhuJawa May 21, 2024
e1ec135
Align Model Path instead of Model File Name
VibhuJawa May 21, 2024
Black on file
Signed-off-by: Vibhu Jawa <vibhujawa@gmail.com>
VibhuJawa committed May 21, 2024
commit db6fc482ceb5b362d769ea9ab85af1733e6110c2
@@ -56,7 +56,9 @@ def main(args):
     model_file_name = "/home/nfs/syurick/LLM_domain_classifier_inference/GoogleDebertaAgree_v3b_bce_maxlen512_bs64_noRef_best.pth"
 
     # Input can be a string or list
-    input_file_path = "/home/nfs/syurick/LLM_domain_classifier_inference/4360_results_jsonl_dir/"
+    input_file_path = (
+        "/home/nfs/syurick/LLM_domain_classifier_inference/4360_results_jsonl_dir/"
+    )
     output_file_path = "/raid/vjawa/output_file.parquet"
 
     client = get_client(args, cluster_type=args.device)
@@ -68,7 +70,7 @@
     domain_classifier = DomainClassifier(
         model_file_name=model_file_name,
         labels=labels,
-        #filter_by=["Games", "Sports"],
+        # filter_by=["Games", "Sports"],
     )
     result_dataset = domain_classifier(dataset=input_dataset)
 
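Taken together, the two hunks above give the example its end-to-end shape. Below is a condensed sketch of that flow; the top-level import path, the read_json reader, and the parquet write are assumptions based on this PR's other commits, not lines from this diff.

from nemo_curator import DomainClassifier  # import path assumed
from nemo_curator.datasets import DocumentDataset

model_file_name = "/home/nfs/syurick/LLM_domain_classifier_inference/GoogleDebertaAgree_v3b_bce_maxlen512_bs64_noRef_best.pth"
labels = ["Adult", "Games", "Sports"]  # abbreviated; the script lists the full label set

# Read the JSONL directory into a DocumentDataset (reader name assumed).
input_dataset = DocumentDataset.read_json(
    "/home/nfs/syurick/LLM_domain_classifier_inference/4360_results_jsonl_dir/"
)

domain_classifier = DomainClassifier(
    model_file_name=model_file_name,
    labels=labels,
    # filter_by=["Games", "Sports"],  # optional: keep only documents in these domains
)
result_dataset = domain_classifier(dataset=input_dataset)

# result_dataset.df is a (dask-)dataframe, so the usual writers apply.
result_dataset.df.to_parquet("/raid/vjawa/output_file.parquet")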
@@ -15,6 +15,7 @@
 import os
 import time
 import warnings
+
 os.environ["RAPIDS_NO_INITIALIZE"] = "1"
 import torch
 from packaging import version
@@ -38,9 +39,9 @@
 from nemo_curator.utils.file_utils import get_remaining_files
 
 
-
 warnings.filterwarnings("ignore")
 
+
 @dataclass
 class Config:
     model = "microsoft/deberta-v3-base"
@@ -49,7 +50,9 @@ class Config:
 
 
 class CustomModel(nn.Module):
-    def __init__(self, config, out_dim, config_path=None, pretrained=False, autocast=False):
+    def __init__(
+        self, config, out_dim, config_path=None, pretrained=False, autocast=False
+    ):
         super().__init__()
         self.config = config
         if config_path is None:
@@ -95,7 +98,7 @@ def forward(self, batch):
         feature = self.feature(batch["input_ids"], batch["attention_mask"])
         output = self.fc(self.fc_dropout(feature))
         return torch.softmax(output[:, 0, :], dim=1)
-
+
 
 def load_model(config, device, model_path, autocast):
     """
@@ -111,7 +114,9 @@ def load_model(config, device, model_path, autocast):
         The loaded model.
 
     """
-    model = CustomModel(config, out_dim=27, config_path=None, pretrained=True, autocast=autocast)
+    model = CustomModel(
+        config, out_dim=27, config_path=None, pretrained=True, autocast=autocast
+    )
     model = model.to(device)
     if os.path.exists(model_path):
         sd = torch.load(os.path.join(model_path), map_location="cpu")
@@ -127,22 +132,24 @@ class DomainModel(HFModel):
     def __init__(self, config, model_path=None, autocast=False):
         self.config = config
         self.model_path = model_path
-        self.autocast=autocast
+        self.autocast = autocast
         super().__init__(self.config.model)
 
     def load_model(self, device="cuda"):
-        return load_model(self.config, device=device,
-                          model_path=self.model_path or self.path_or_name,
-                          autocast=self.autocast)
-
+        return load_model(
+            self.config,
+            device=device,
+            model_path=self.model_path or self.path_or_name,
+            autocast=self.autocast,
+        )
 
     def load_tokenizer(self):
         return DebertaV2TokenizerFast.from_pretrained(self.config.model)
 
     def load_config(self):
         return AutoConfig.from_pretrained(self.path_or_name)
 
 
-
 def main():
     labels = [
         "Adult",
@@ -205,11 +212,12 @@ def main():
         file_type=args.input_file_type,
         add_filename=add_filename,
     )
-    df['sliced_text'] = df['text'].str.slice(0, max_chars)
+    df["sliced_text"] = df["text"].str.slice(0, max_chars)
     columns_to_keep_list = df.columns.to_list()
-    columns_to_keep_list.remove('sliced_text')
+    columns_to_keep_list.remove("sliced_text")
 
-    model_path ="/home/nfs/syurick/LLM_domain_classifier_inference/GoogleDebertaAgree_v3b_bce_maxlen512_bs64_best.pth"
+    model_path = "/home/nfs/syurick/LLM_domain_classifier_inference/GoogleDebertaAgree_v3b_bce_maxlen512_bs64_best.pth"
     model = DomainModel(Config, model_path=model_path, autocast=args.autocast)
     pipe = op.Sequential(
         op.Tokenizer(model, cols=["sliced_text"], tokenizer_type="sentencepiece"),
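Structurally, both this script and the library module below wrap the fine-tuned DeBERTa checkpoint in crossfit's HFModel so that each Dask-CUDA worker loads its own copy of the model, tokenizer, and config on demand. The contract reduces to three overrides; this is a schematic sketch, with the crossfit import path an assumption and MyModel a placeholder name:

from crossfit.backend.torch.hf.model import HFModel  # import path assumed


class MyModel(HFModel):  # placeholder name
    def load_model(self, device="cuda"):
        # Return the torch.nn.Module, already moved to the worker's device.
        ...

    def load_tokenizer(self):
        # Return the tokenizer that op.Tokenizer will use.
        ...

    def load_config(self):
        # Return the HF config (vocab size, max sequence length, etc.).
        ...

Deferring the heavy loads into these hooks rather than the constructor appears to be what lets the wrapper be shipped to workers cheaply before any GPU memory is touched.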
46 changes: 29 additions & 17 deletions nemo_curator/modules/distributed_data_classifier.py
@@ -28,6 +28,7 @@
 
 from nemo_curator.datasets import DocumentDataset
 
+
 @dataclass
 class domain_Config:
     model = "microsoft/deberta-v3-base"
@@ -36,7 +37,9 @@ class domain_Config:
 
 
 class CustomModel(nn.Module):
-    def __init__(self, config, out_dim, config_path=None, pretrained=False, autocast=False):
+    def __init__(
+        self, config, out_dim, config_path=None, pretrained=False, autocast=False
+    ):
         super().__init__()
         self.config = config
         if config_path is None:
@@ -82,7 +85,7 @@ def forward(self, batch):
         feature = self.feature(batch["input_ids"], batch["attention_mask"])
         output = self.fc(self.fc_dropout(feature))
         return torch.softmax(output[:, 0, :], dim=1)
-
+
 
 def _load_model(model, device, model_path):
     """
@@ -107,7 +110,6 @@ def _load_model(model, device, model_path):
     return model
 
 
-
 class DistributedDataClassifier(ABC):
     """Abstract class for running multi-node multi-GPU data classification"""
 
@@ -144,7 +146,6 @@ def __call__(self, dataset: DocumentDataset):
     def _run_classifier(self):
         pass
 
-
     def _filter_documents(
         self,
         dataset: DocumentDataset,
@@ -167,20 +168,26 @@ def __init__(self, config, out_dim=None, model_path=None, autocast=False):
         self.config = config
         self.out_dim = out_dim
         self.model_path = model_path
-        self.autocast=autocast
+        self.autocast = autocast
         super().__init__(self.config.model)
 
     def load_model(self, device="cuda"):
-        model = CustomModel(self.config, out_dim=self.out_dim, config_path=None, pretrained=True, autocast=self.autocast)
+        model = CustomModel(
+            self.config,
+            out_dim=self.out_dim,
+            config_path=None,
+            pretrained=True,
+            autocast=self.autocast,
+        )
         return _load_model(model, device, self.model_path)
 
     def load_tokenizer(self):
         return DebertaV2TokenizerFast.from_pretrained(self.config.model)
 
     def load_config(self):
         return AutoConfig.from_pretrained(self.path_or_name)
 
 
 class DomainClassifier(DistributedDataClassifier):
     def __init__(
         self,
Expand All @@ -197,11 +204,12 @@ def __init__(
if out_dim is None:
out_dim = len(labels)

model = DomainModel(config=domain_Config,
out_dim=out_dim,
model_path=model_file_name,
autocast=autocast)

model = DomainModel(
config=domain_Config,
out_dim=out_dim,
model_path=model_file_name,
autocast=autocast,
)

super().__init__(
model=model,
@@ -219,13 +227,17 @@ def _run_classifier(self, dataset: DocumentDataset):
         print("Starting domain classifier inference", flush=True)
 
         df = dataset.df
-        df['sliced_text'] = df['text'].str.slice(0, self.max_chars)
+        df["sliced_text"] = df["text"].str.slice(0, self.max_chars)
         columns_to_keep_list = df.columns.to_list()
-        columns_to_keep_list.remove('sliced_text')
+        columns_to_keep_list.remove("sliced_text")
 
         pipe = op.Sequential(
-            op.Tokenizer(self.model, cols=["sliced_text"], tokenizer_type="sentencepiece"),
-            op.Predictor(self.model, sorted_data_loader=True, batch_size=self.batch_size),
+            op.Tokenizer(
+                self.model, cols=["sliced_text"], tokenizer_type="sentencepiece"
+            ),
+            op.Predictor(
+                self.model, sorted_data_loader=True, batch_size=self.batch_size
+            ),
             op.Labeler(self.labels, cols=["preds"]),
             repartition=df.npartitions,
             keep_cols=columns_to_keep_list,
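The op.Sequential pipeline at the end of _run_classifier is the heart of the crossfit switch. As a usage sketch, classification over a dataframe reduces to the following, where df, model, labels, max_chars, and batch_size stand in for the instance attributes above; the `from crossfit import op` import and the final pipe(df) call are assumptions, since neither appears in this hunk.

from crossfit import op  # import path assumed

# Slice long documents so tokenization stays within the model's sequence limit.
df["sliced_text"] = df["text"].str.slice(0, max_chars)
columns_to_keep_list = df.columns.to_list()
columns_to_keep_list.remove("sliced_text")

pipe = op.Sequential(
    # Tokenize the sliced text with the model's sentencepiece tokenizer.
    op.Tokenizer(model, cols=["sliced_text"], tokenizer_type="sentencepiece"),
    # Batched GPU inference; a sorted data loader groups similar lengths
    # together, cutting padding waste.
    op.Predictor(model, sorted_data_loader=True, batch_size=batch_size),
    # Attach human-readable labels derived from the "preds" column.
    op.Labeler(labels, cols=["preds"]),
    repartition=df.npartitions,
    keep_cols=columns_to_keep_list,  # temporary sliced_text column is dropped
)
df = pipe(df)

This also explains why sliced_text is removed from columns_to_keep_list first: the temporary column exists only for tokenization and is excluded from the output, while every original column survives.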