Skip to content

Commit

Permalink
input_field
Browse files Browse the repository at this point in the history
Signed-off-by: Sarah Yurick <[email protected]>
  • Loading branch information
sarahyurick committed Jan 17, 2025
1 parent 2b1fa11 commit fe461b1
Show file tree
Hide file tree
Showing 9 changed files with 14 additions and 14 deletions.
2 changes: 1 addition & 1 deletion config/fasttext_langid.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-input_field: text
+text_field: text
filters:
- name: nemo_curator.filters.classifier_filter.FastTextLangId
log_score: True
Expand Down
2 changes: 1 addition & 1 deletion config/fasttext_quality_filter.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-input_field: text
+text_field: text
filters:
- name: nemo_curator.filters.classifier_filter.FastTextQualityFilter
params:
Expand Down
2 changes: 1 addition & 1 deletion config/heuristic_filter_code.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-input_field: text
+text_field: text
filters:
# The filters below define a chain of heuristic filters to be applied to each document in a corpus.
# This particular cascade of filters is intended to filter Python code data.
Expand Down
2 changes: 1 addition & 1 deletion config/heuristic_filter_en.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-input_field: text
+text_field: text
filters:
# The filters below define a chain of heuristic filters to be applied to each document in a corpus.
# This particular cascade of filters is intended to filter English language data.
Expand Down
2 changes: 1 addition & 1 deletion config/heuristic_filter_non-en.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-input_field: text
+text_field: text
filters:
# The filters below define a chain of heuristic filters to be applied to each document in a corpus.
# This particular cascade of filters is intended to filter generic non-English data that use spaces for separating words.
Expand Down
4 changes: 2 additions & 2 deletions nemo_curator/modules/fuzzy_dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -786,11 +786,11 @@ def _combine_multiple_ids(
output_df = input_df.copy()[input_df.columns.difference(id_fields)]

output_df[output_id_field] = input_df[id_fields[0]].astype(str)
-for input_field in id_fields[1:]:
+for id_field in id_fields[1:]:
output_df[output_id_field] = output_df[output_id_field] = (
input_df[id_fields[0]].astype(str)
+ "-"
-+ input_df[input_field].astype(str)
++ input_df[id_field].astype(str)
)

return output_df
Expand Down
10 changes: 5 additions & 5 deletions nemo_curator/utils/config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def build_filter(filter_config):
doc_filter._name if filter_config.get("log_score", False) else None
)
filter_stage = nemo_curator.ScoreFilter(
-doc_filter, filter_config.get("input_field"), score_field=score_field
+doc_filter, filter_config.get("text_field"), score_field=score_field
)

return filter_stage
Expand All @@ -57,13 +57,13 @@ def build_filter_pipeline(filter_config_file):
filter_params = yaml.load(config_file, Loader=yaml.FullLoader)

filters = []
-text_field = filter_params.get("input_field")
+text_field = filter_params.get("text_field")
for nc_filter_config in filter_params.get("filters"):
if (
-"input_field" not in nc_filter_config
-or nc_filter_config["input_field"] is None
+"text_field" not in nc_filter_config
+or nc_filter_config["text_field"] is None
):
-nc_filter_config["input_field"] = text_field
+nc_filter_config["text_field"] = text_field
new_filter = build_filter(nc_filter_config)
filters.append(new_filter)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-input_field: raw_content
+text_field: raw_content
filters:
- name: nemo_curator.filters.heuristic_filter.NonAlphaNumericFilter
params:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-input_field: text
+text_field: text
filters:
# The filters below define a chain of heuristic filters to be applied to each document in a corpus.
# This particular cascade of filters is intended to filter generic non-English data that use spaces for separating words.
Expand Down

0 comments on commit fe461b1

Please sign in to comment.