rename input_text_field, input-text-field, input_id_field, input_id_fields

Signed-off-by: Sarah Yurick <[email protected]>
sarahyurick committed Jan 17, 2025
1 parent 7cfda44 commit ab54623
Showing 21 changed files with 73 additions and 73 deletions.
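
The rename is mechanical: every call site swaps the old spelling for the new one, so the CLI flag `--input-text-field "text"` becomes `--text-field "text"`, and Python keywords such as `input_text_field` and `input_id_fields` become `text_field` and `id_fields`.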
4 changes: 2 additions & 2 deletions examples/translation_example.py
@@ -344,7 +344,7 @@ def attach_args():
     parser.set_defaults(
         pretrained_model_name_or_path="ai4bharat/indictrans2-en-indic-1B"
     )
-    parser.set_defaults(input_text_field="text")
+    parser.set_defaults(text_field="text")
     parser.set_defaults(device="gpu")
     return parser

@@ -356,7 +356,7 @@ def main(args):
     print(client.dashboard_link)
     translator_model = IndicTranslation(
         pretrained_model_name_or_path=args.pretrained_model_name_or_path,
-        input_column=args.input_text_field,
+        input_column=args.text_field,
         batch_size=args.batch_size,
         autocast=args.autocast,
     )

12 changes: 6 additions & 6 deletions nemo_curator/modules/fuzzy_dedup.py
@@ -776,19 +776,19 @@ def __init__(

     @staticmethod
     def _combine_multiple_ids(
-        input_df: cudf.DataFrame, input_id_fields: list, output_id_field: str
+        input_df: cudf.DataFrame, id_fields: list, output_id_field: str
     ) -> cudf.DataFrame:
         if output_id_field in input_df.columns:
             raise ValueError(
                 f"Input df already contains column named: {output_id_field}"
             )

-        output_df = input_df.copy()[input_df.columns.difference(input_id_fields)]
+        output_df = input_df.copy()[input_df.columns.difference(id_fields)]

-        output_df[output_id_field] = input_df[input_id_fields[0]].astype(str)
-        for input_field in input_id_fields[1:]:
+        output_df[output_id_field] = input_df[id_fields[0]].astype(str)
+        for input_field in id_fields[1:]:
             output_df[output_id_field] = output_df[output_id_field] = (
-                input_df[input_id_fields[0]].astype(str)
+                input_df[id_fields[0]].astype(str)
                 + "-"
                 + input_df[input_field].astype(str)
             )
@@ -825,7 +825,7 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset:
         if len(self.id_fields) > 1:
            buckets_df = buckets_df.map_partitions(
                 BucketsToEdges._combine_multiple_ids,
-                input_id_fields=self.id_fields,
+                id_fields=self.id_fields,
                 output_id_field=self.str_id_name,
             )

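For orientation: the renamed `id_fields` argument of `_combine_multiple_ids` drives a plain string concatenation, where each configured ID column is cast to `str` and joined with hyphens into one output column. A minimal pandas sketch of that behavior, with illustrative toy data (cuDF mirrors this API):

```python
import pandas as pd

# Toy frame with the two default ID fields used by BucketsToEdges.
df = pd.DataFrame({"dataset_id": [3, 3], "doc_id": [10, 11], "text": ["a", "b"]})

id_fields = ["dataset_id", "doc_id"]
combined = df[id_fields[0]].astype(str)  # start from the first ID column
for field in id_fields[1:]:
    combined = combined + "-" + df[field].astype(str)  # append each remaining column

print(combined.tolist())  # ['3-10', '3-11']
```
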
16 changes: 8 additions & 8 deletions nemo_curator/scripts/classifiers/README.md
@@ -27,7 +27,7 @@ domain_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --autocast \
     --max-chars 2000 \
@@ -48,7 +48,7 @@ multilingual_domain_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --autocast \
     --max-chars 2000 \
@@ -67,7 +67,7 @@ quality_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --autocast \
     --max-chars 2000 \
@@ -86,7 +86,7 @@ aegis_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --max-chars 6000 \
     --device "gpu" \
@@ -109,7 +109,7 @@ instruction_data_guard_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --max-chars 6000 \
     --device "gpu" \
@@ -130,7 +130,7 @@ fineweb_edu_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --autocast \
     --max-chars 2000 \
@@ -149,7 +149,7 @@ content_type_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --autocast \
     --max-chars 5000 \
@@ -168,7 +168,7 @@ prompt_task_complexity_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --autocast \
     --max-chars 2000 \

@@ -63,7 +63,7 @@ def main():
     aegis_classifier = AegisClassifier(
         aegis_variant=args.aegis_variant,
         token=args.token,
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_chars=args.max_chars,
         batch_size=args.batch_size,
         max_mem_gb=args.max_mem_gb_classifier,

@@ -62,7 +62,7 @@ def main():
         add_filename = True

     content_type_classifier = ContentTypeClassifier(
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_chars=args.max_chars,
         batch_size=args.batch_size,
         autocast=args.autocast,

@@ -62,7 +62,7 @@ def main():
         add_filename = True

     domain_classifier = DomainClassifier(
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_chars=args.max_chars,
         batch_size=args.batch_size,
         autocast=args.autocast,

@@ -63,7 +63,7 @@ def main():
         add_filename = True

     fineweb_edu_classifier = FineWebEduClassifier(
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         batch_size=args.batch_size,
         autocast=args.autocast,
         max_chars=args.max_chars,

@@ -62,7 +62,7 @@ def main():

     instruction_data_guard_classifier = InstructionDataGuardClassifier(
         token=args.token,
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_chars=args.max_chars,
         batch_size=args.batch_size,
         max_mem_gb=args.max_mem_gb_classifier,

@@ -62,7 +62,7 @@ def main():
         add_filename = True

     multilingual_domain_classifier = MultilingualDomainClassifier(
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_chars=args.max_chars,
         batch_size=args.batch_size,
         autocast=args.autocast,

@@ -62,7 +62,7 @@ def main():
         add_filename = True

     prompt_task_complexity_classifier = PromptTaskComplexityClassifier(
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_chars=args.max_chars,
         batch_size=args.batch_size,
         autocast=args.autocast,

@@ -63,7 +63,7 @@ def main():
         add_filename = True

     classifier = QualityClassifier(
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_chars=args.max_chars,
         batch_size=args.batch_size,
         autocast=args.autocast,

4 changes: 2 additions & 2 deletions nemo_curator/scripts/find_matching_ngrams.py
@@ -30,7 +30,7 @@ def main(args):
         task_ngrams = pickle.load(fp)

     decontaminator = nemo_curator.TaskDecontamination(
-        [], text_field=args.input_text_field, max_ngram_size=args.max_ngram_size
+        [], text_field=args.text_field, max_ngram_size=args.max_ngram_size
     )

     files = get_all_files_paths_under(args.input_data_dir)
@@ -64,7 +64,7 @@ def attach_args(

     argumentHelper.add_arg_input_data_dir()
     argumentHelper.add_arg_input_file_type()
-    argumentHelper.add_arg_input_text_field()
+    argumentHelper.add_arg_text_field()
     argumentHelper.add_distributed_args()
     parser.add_argument(
         "--input-task-ngrams",

18 changes: 9 additions & 9 deletions nemo_curator/scripts/fuzzy_deduplication/map_buckets.py
@@ -33,8 +33,8 @@ def get_anchor_and_output_map_info(
     num_workers,
     shuffle_type,
     input_bucket_field,
-    input_id_field,
-    input_text_field,
+    id_field,
+    text_field,
     input_meta,
 ):
     """
@@ -53,8 +53,8 @@ def get_anchor_and_output_map_info(
         input_data_paths=input_data_paths,
         num_files=num_files,
         blocksize=text_ddf_blocksize,
-        id_column=input_id_field,
-        text_column=input_text_field,
+        id_column=id_field,
+        text_column=text_field,
         input_meta=input_meta,
     )
     ddf_bk = get_bucket_ddf_from_parquet_path(
@@ -63,7 +63,7 @@ def get_anchor_and_output_map_info(
     map_buckets = _MapBuckets(
         id_fields=["dataset_id", "doc_id"],
         bucket_field=input_bucket_field,
-        text_field=input_text_field,
+        text_field=text_field,
     )
     ddf_anchor_docs_with_bk = map_buckets.map_buckets_with_anchors(
         documents_df=ddf_text, buckets_df=ddf_bk, shuffle_type=shuffle_type
@@ -118,8 +118,8 @@ def jaccard_get_output_map_workflow(
     num_files,
     shuffle_type,
     input_bucket_field,
-    input_id_field,
-    input_text_field,
+    id_field,
+    text_field,
     input_meta,
 ):
     """
@@ -143,8 +143,8 @@ def jaccard_get_output_map_workflow(
         num_workers,
         shuffle_type,
         input_bucket_field,
-        input_id_field,
-        input_text_field,
+        id_field,
+        text_field,
         input_meta=input_meta,
     )
     ddf_anchor_docs_with_bk.to_parquet(

4 changes: 2 additions & 2 deletions nemo_curator/scripts/remove_matching_ngrams.py
@@ -46,7 +46,7 @@ def main(args):

     decontaminator = nemo_curator.TaskDecontamination(
         [],
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_ngram_size=max_ngram_size,
         max_matches=args.match_threshold,
         max_splits=args.max_document_splits,
@@ -100,7 +100,7 @@ def attach_args(
     argumentHelper.add_arg_batch_size()
     argumentHelper.add_arg_input_data_dir()
     argumentHelper.add_arg_input_file_type()
-    argumentHelper.add_arg_input_text_field()
+    argumentHelper.add_arg_text_field()
     argumentHelper.add_arg_output_file_type()
     argumentHelper.add_distributed_args()
     parser.add_argument(

2 changes: 1 addition & 1 deletion nemo_curator/scripts/semdedup/README.md
@@ -10,7 +10,7 @@ Please edit `config/sem_dedup_config.yaml` to configure the pipeline and run it

 2) Compute embeddings:
 ```sh
-semdedup_extract_embeddings --input-data-dir "$INPUT_DATA_DIR" --input-file-type "jsonl" --input-file-extension "json" --input-text-field "text" --config-file "$CONFIG_FILE"
+semdedup_extract_embeddings --input-data-dir "$INPUT_DATA_DIR" --input-file-type "jsonl" --input-file-extension "json" --text-field "text" --config-file "$CONFIG_FILE"
 ```
 **Input:** `input_data_dir/*.jsonl` and YAML file from step (1)

4 changes: 2 additions & 2 deletions nemo_curator/scripts/semdedup/compute_embeddings.py
@@ -79,7 +79,7 @@ def main(args):
         embedding_output_dir=os.path.join(
             semdedup_config.cache_dir, semdedup_config.embeddings_save_loc
         ),
-        input_column=args.input_text_field,
+        input_column=args.text_field,
         logger=logger,
         write_to_filename=True,
     )
@@ -101,7 +101,7 @@ def attach_args():
         "--input-data-dir for the directory containing input data files, "
         '--input-file-type for the type of input files (e.g., "json", "csv"), '
         '--input-file-extension for specifying the file extension of input files (e.g., ".jsonl"), '
-        "--input-text-field for the field in the input files containing the text data to be embedded, "
+        "--text-field for the field in the input files containing the text data to be embedded, "
         "--config-file for the path to the semantic deduplication configuration file. "
         "Important configuration parameters include: "
         " cache_dir for the directory to store cache"

4 changes: 2 additions & 2 deletions nemo_curator/scripts/text_cleaning.py
@@ -29,7 +29,7 @@ def main(args):
     output_clean_dir = expand_outdir_and_mkdir(args.output_clean_dir)

     cleaner = nemo_curator.Modify(
-        UnicodeReformatter(), text_field=args.input_text_field
+        UnicodeReformatter(), text_field=args.text_field
     )

     for files in get_batched_files(
@@ -76,7 +76,7 @@ def attach_args(
     argumentHelper.add_arg_batch_size()
     argumentHelper.add_arg_input_data_dir()
     argumentHelper.add_arg_input_file_type()
-    argumentHelper.add_arg_input_text_field()
+    argumentHelper.add_arg_text_field()
     argumentHelper.add_arg_output_file_type()
     argumentHelper.add_distributed_args()
     parser.add_argument(

8 changes: 4 additions & 4 deletions nemo_curator/utils/script_utils.py
@@ -169,9 +169,9 @@ def add_arg_input_meta(self):
             "their respective data types within the JSONL input files.",
         )

-    def add_arg_input_text_field(self):
+    def add_arg_text_field(self):
         self.parser.add_argument(
-            "--input-text-field",
+            "--text-field",
             type=str,
             default="text",
             help="The name of the field within each datapoint object of the input "
@@ -495,7 +495,7 @@ def parse_distributed_classifier_args(
     argumentHelper.add_arg_input_file_type()
     argumentHelper.add_arg_input_file_extension()
     argumentHelper.add_arg_output_file_type()
-    argumentHelper.add_arg_input_text_field()
+    argumentHelper.add_arg_text_field()
     argumentHelper.add_arg_batch_size(
         help="The batch size to be used for inference."
     )

@@ -590,7 +590,7 @@ def parse_semdedup_args(
     argumentHelper.add_arg_input_data_dir()
     argumentHelper.add_arg_input_file_extension()
     argumentHelper.add_arg_input_file_type()
-    argumentHelper.add_arg_input_text_field()
+    argumentHelper.add_arg_text_field()
     argumentHelper.add_arg_id_column()
     argumentHelper.add_arg_id_column_type()

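To see the user-facing effect of the renamed helper, here is a self-contained argparse sketch of what `add_arg_text_field` now registers; the help text is illustrative, while the flag, type, and default mirror the diff above:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--text-field",  # was "--input-text-field" before this commit
    type=str,
    default="text",
    help="Field in each input datapoint containing the text (illustrative help).",
)

# argparse maps "--text-field" to args.text_field, which is why the scripts
# above can switch from args.input_text_field to args.text_field.
args = parser.parse_args(["--text-field", "content"])
print(args.text_field)  # content
```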
