rename input_text_field, input-text-field, input_id_field, input_id_fields

Signed-off-by: Sarah Yurick <[email protected]>
sarahyurick committed Jan 17, 2025
1 parent 7cfda44 commit ab54623
Showing 21 changed files with 73 additions and 73 deletions.
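
The rename is mechanical: every call site swaps the old spelling for the new one, so the CLI flag `--input-text-field "text"` becomes `--text-field "text"`, and Python keywords such as `input_text_field` and `input_id_fields` become `text_field` and `id_fields`.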
4 changes: 2 additions & 2 deletions examples/translation_example.py
@@ -344,7 +344,7 @@ def attach_args():
     parser.set_defaults(
         pretrained_model_name_or_path="ai4bharat/indictrans2-en-indic-1B"
     )
-    parser.set_defaults(input_text_field="text")
+    parser.set_defaults(text_field="text")
     parser.set_defaults(device="gpu")
     return parser

@@ -356,7 +356,7 @@ def main(args):
     print(client.dashboard_link)
     translator_model = IndicTranslation(
         pretrained_model_name_or_path=args.pretrained_model_name_or_path,
-        input_column=args.input_text_field,
+        input_column=args.text_field,
         batch_size=args.batch_size,
         autocast=args.autocast,
     )

12 changes: 6 additions & 6 deletions nemo_curator/modules/fuzzy_dedup.py
@@ -776,19 +776,19 @@ def __init__(

     @staticmethod
     def _combine_multiple_ids(
-        input_df: cudf.DataFrame, input_id_fields: list, output_id_field: str
+        input_df: cudf.DataFrame, id_fields: list, output_id_field: str
     ) -> cudf.DataFrame:
         if output_id_field in input_df.columns:
             raise ValueError(
                 f"Input df already contains column named: {output_id_field}"
             )

-        output_df = input_df.copy()[input_df.columns.difference(input_id_fields)]
+        output_df = input_df.copy()[input_df.columns.difference(id_fields)]

-        output_df[output_id_field] = input_df[input_id_fields[0]].astype(str)
-        for input_field in input_id_fields[1:]:
+        output_df[output_id_field] = input_df[id_fields[0]].astype(str)
+        for input_field in id_fields[1:]:
             output_df[output_id_field] = output_df[output_id_field] = (
-                input_df[input_id_fields[0]].astype(str)
+                input_df[id_fields[0]].astype(str)
                 + "-"
                 + input_df[input_field].astype(str)
             )
@@ -825,7 +825,7 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset:
         if len(self.id_fields) > 1:
            buckets_df = buckets_df.map_partitions(
                 BucketsToEdges._combine_multiple_ids,
-                input_id_fields=self.id_fields,
+                id_fields=self.id_fields,
                 output_id_field=self.str_id_name,
             )

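For orientation: the renamed `id_fields` argument of `_combine_multiple_ids` drives a plain string concatenation, where each configured ID column is cast to `str` and joined with hyphens into one output column. A minimal pandas sketch of that behavior, with illustrative toy data (cuDF mirrors this API):

```python
import pandas as pd

# Toy frame with the two default ID fields used by BucketsToEdges.
df = pd.DataFrame({"dataset_id": [3, 3], "doc_id": [10, 11], "text": ["a", "b"]})

id_fields = ["dataset_id", "doc_id"]
combined = df[id_fields[0]].astype(str)  # start from the first ID column
for field in id_fields[1:]:
    combined = combined + "-" + df[field].astype(str)  # append each remaining column

print(combined.tolist())  # ['3-10', '3-11']
```
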
16 changes: 8 additions & 8 deletions nemo_curator/scripts/classifiers/README.md
@@ -27,7 +27,7 @@ domain_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --autocast \
     --max-chars 2000 \
@@ -48,7 +48,7 @@ multilingual_domain_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --autocast \
     --max-chars 2000 \
@@ -67,7 +67,7 @@ quality_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --autocast \
     --max-chars 2000 \
@@ -86,7 +86,7 @@ aegis_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --max-chars 6000 \
     --device "gpu" \
@@ -109,7 +109,7 @@ instruction_data_guard_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --max-chars 6000 \
     --device "gpu" \
@@ -130,7 +130,7 @@ fineweb_edu_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --autocast \
     --max-chars 2000 \
@@ -149,7 +149,7 @@ content_type_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --autocast \
     --max-chars 5000 \
@@ -168,7 +168,7 @@ prompt_task_complexity_classifier_inference \
     --input-file-type "jsonl" \
     --input-file-extension "jsonl" \
     --output-file-type "jsonl" \
-    --input-text-field "text" \
+    --text-field "text" \
     --batch-size 64 \
     --autocast \
     --max-chars 2000 \

@@ -63,7 +63,7 @@ def main():
     aegis_classifier = AegisClassifier(
         aegis_variant=args.aegis_variant,
         token=args.token,
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_chars=args.max_chars,
         batch_size=args.batch_size,
         max_mem_gb=args.max_mem_gb_classifier,

@@ -62,7 +62,7 @@ def main():
         add_filename = True

     content_type_classifier = ContentTypeClassifier(
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_chars=args.max_chars,
         batch_size=args.batch_size,
         autocast=args.autocast,

@@ -62,7 +62,7 @@ def main():
         add_filename = True

     domain_classifier = DomainClassifier(
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_chars=args.max_chars,
         batch_size=args.batch_size,
         autocast=args.autocast,

@@ -63,7 +63,7 @@ def main():
         add_filename = True

     fineweb_edu_classifier = FineWebEduClassifier(
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         batch_size=args.batch_size,
         autocast=args.autocast,
         max_chars=args.max_chars,

@@ -62,7 +62,7 @@ def main():

     instruction_data_guard_classifier = InstructionDataGuardClassifier(
         token=args.token,
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_chars=args.max_chars,
         batch_size=args.batch_size,
         max_mem_gb=args.max_mem_gb_classifier,

@@ -62,7 +62,7 @@ def main():
         add_filename = True

     multilingual_domain_classifier = MultilingualDomainClassifier(
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_chars=args.max_chars,
         batch_size=args.batch_size,
         autocast=args.autocast,

@@ -62,7 +62,7 @@ def main():
         add_filename = True

     prompt_task_complexity_classifier = PromptTaskComplexityClassifier(
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_chars=args.max_chars,
         batch_size=args.batch_size,
         autocast=args.autocast,

@@ -63,7 +63,7 @@ def main():
         add_filename = True

     classifier = QualityClassifier(
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_chars=args.max_chars,
         batch_size=args.batch_size,
         autocast=args.autocast,

4 changes: 2 additions & 2 deletions nemo_curator/scripts/find_matching_ngrams.py
@@ -30,7 +30,7 @@ def main(args):
         task_ngrams = pickle.load(fp)

     decontaminator = nemo_curator.TaskDecontamination(
-        [], text_field=args.input_text_field, max_ngram_size=args.max_ngram_size
+        [], text_field=args.text_field, max_ngram_size=args.max_ngram_size
     )

     files = get_all_files_paths_under(args.input_data_dir)
@@ -64,7 +64,7 @@ def attach_args(

     argumentHelper.add_arg_input_data_dir()
     argumentHelper.add_arg_input_file_type()
-    argumentHelper.add_arg_input_text_field()
+    argumentHelper.add_arg_text_field()
     argumentHelper.add_distributed_args()
     parser.add_argument(
         "--input-task-ngrams",

18 changes: 9 additions & 9 deletions nemo_curator/scripts/fuzzy_deduplication/map_buckets.py
@@ -33,8 +33,8 @@ def get_anchor_and_output_map_info(
     num_workers,
     shuffle_type,
     input_bucket_field,
-    input_id_field,
-    input_text_field,
+    id_field,
+    text_field,
     input_meta,
 ):
     """
@@ -53,8 +53,8 @@ def get_anchor_and_output_map_info(
         input_data_paths=input_data_paths,
         num_files=num_files,
         blocksize=text_ddf_blocksize,
-        id_column=input_id_field,
-        text_column=input_text_field,
+        id_column=id_field,
+        text_column=text_field,
         input_meta=input_meta,
     )
     ddf_bk = get_bucket_ddf_from_parquet_path(
@@ -63,7 +63,7 @@ def get_anchor_and_output_map_info(
     map_buckets = _MapBuckets(
         id_fields=["dataset_id", "doc_id"],
         bucket_field=input_bucket_field,
-        text_field=input_text_field,
+        text_field=text_field,
     )
     ddf_anchor_docs_with_bk = map_buckets.map_buckets_with_anchors(
         documents_df=ddf_text, buckets_df=ddf_bk, shuffle_type=shuffle_type
@@ -118,8 +118,8 @@ def jaccard_get_output_map_workflow(
     num_files,
     shuffle_type,
     input_bucket_field,
-    input_id_field,
-    input_text_field,
+    id_field,
+    text_field,
     input_meta,
 ):
     """
@@ -143,8 +143,8 @@ def jaccard_get_output_map_workflow(
         num_workers,
         shuffle_type,
         input_bucket_field,
-        input_id_field,
-        input_text_field,
+        id_field,
+        text_field,
         input_meta=input_meta,
     )
     ddf_anchor_docs_with_bk.to_parquet(

4 changes: 2 additions & 2 deletions nemo_curator/scripts/remove_matching_ngrams.py
@@ -46,7 +46,7 @@ def main(args):

     decontaminator = nemo_curator.TaskDecontamination(
         [],
-        text_field=args.input_text_field,
+        text_field=args.text_field,
         max_ngram_size=max_ngram_size,
         max_matches=args.match_threshold,
         max_splits=args.max_document_splits,
@@ -100,7 +100,7 @@ def attach_args(
     argumentHelper.add_arg_batch_size()
     argumentHelper.add_arg_input_data_dir()
     argumentHelper.add_arg_input_file_type()
-    argumentHelper.add_arg_input_text_field()
+    argumentHelper.add_arg_text_field()
     argumentHelper.add_arg_output_file_type()
     argumentHelper.add_distributed_args()
     parser.add_argument(

2 changes: 1 addition & 1 deletion nemo_curator/scripts/semdedup/README.md
@@ -10,7 +10,7 @@ Please edit `config/sem_dedup_config.yaml` to configure the pipeline and run it

 2) Compute embeddings:
 ```sh
-semdedup_extract_embeddings --input-data-dir "$INPUT_DATA_DIR" --input-file-type "jsonl" --input-file-extension "json" --input-text-field "text" --config-file "$CONFIG_FILE"
+semdedup_extract_embeddings --input-data-dir "$INPUT_DATA_DIR" --input-file-type "jsonl" --input-file-extension "json" --text-field "text" --config-file "$CONFIG_FILE"
 ```
 **Input:** `input_data_dir/*.jsonl` and YAML file from step (1)

4 changes: 2 additions & 2 deletions nemo_curator/scripts/semdedup/compute_embeddings.py
@@ -79,7 +79,7 @@ def main(args):
         embedding_output_dir=os.path.join(
             semdedup_config.cache_dir, semdedup_config.embeddings_save_loc
         ),
-        input_column=args.input_text_field,
+        input_column=args.text_field,
         logger=logger,
         write_to_filename=True,
     )
@@ -101,7 +101,7 @@ def attach_args():
         "--input-data-dir for the directory containing input data files, "
         '--input-file-type for the type of input files (e.g., "json", "csv"), '
         '--input-file-extension for specifying the file extension of input files (e.g., ".jsonl"), '
-        "--input-text-field for the field in the input files containing the text data to be embedded, "
+        "--text-field for the field in the input files containing the text data to be embedded, "
         "--config-file for the path to the semantic deduplication configuration file. "
         "Important configuration parameters include: "
         " cache_dir for the directory to store cache"

4 changes: 2 additions & 2 deletions nemo_curator/scripts/text_cleaning.py
@@ -29,7 +29,7 @@ def main(args):
     output_clean_dir = expand_outdir_and_mkdir(args.output_clean_dir)

     cleaner = nemo_curator.Modify(
-        UnicodeReformatter(), text_field=args.input_text_field
+        UnicodeReformatter(), text_field=args.text_field
     )

     for files in get_batched_files(
@@ -76,7 +76,7 @@ def attach_args(
     argumentHelper.add_arg_batch_size()
     argumentHelper.add_arg_input_data_dir()
     argumentHelper.add_arg_input_file_type()
-    argumentHelper.add_arg_input_text_field()
+    argumentHelper.add_arg_text_field()
     argumentHelper.add_arg_output_file_type()
     argumentHelper.add_distributed_args()
     parser.add_argument(

8 changes: 4 additions & 4 deletions nemo_curator/utils/script_utils.py
@@ -169,9 +169,9 @@ def add_arg_input_meta(self):
             "their respective data types within the JSONL input files.",
         )

-    def add_arg_input_text_field(self):
+    def add_arg_text_field(self):
         self.parser.add_argument(
-            "--input-text-field",
+            "--text-field",
             type=str,
             default="text",
             help="The name of the field within each datapoint object of the input "
@@ -495,7 +495,7 @@ def parse_distributed_classifier_args(
     argumentHelper.add_arg_input_file_type()
     argumentHelper.add_arg_input_file_extension()
     argumentHelper.add_arg_output_file_type()
-    argumentHelper.add_arg_input_text_field()
+    argumentHelper.add_arg_text_field()
     argumentHelper.add_arg_batch_size(
         help="The batch size to be used for inference."
     )

@@ -590,7 +590,7 @@ def parse_semdedup_args(
     argumentHelper.add_arg_input_data_dir()
     argumentHelper.add_arg_input_file_extension()
     argumentHelper.add_arg_input_file_type()
-    argumentHelper.add_arg_input_text_field()
+    argumentHelper.add_arg_text_field()
     argumentHelper.add_arg_id_column()
     argumentHelper.add_arg_id_column_type()

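To see the user-facing effect of the renamed helper, here is a self-contained argparse sketch of what `add_arg_text_field` now registers; the help text is illustrative, while the flag, type, and default mirror the diff above:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--text-field",  # was "--input-text-field" before this commit
    type=str,
    default="text",
    help="Field in each input datapoint containing the text (illustrative help).",
)

# argparse maps "--text-field" to args.text_field, which is why the scripts
# above can switch from args.input_text_field to args.text_field.
args = parser.parse_args(["--text-field", "content"])
print(args.text_field)  # content
```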
