Skip to content

Commit

Permalink
Disable string conversion globally (#56)
Browse files Browse the repository at this point in the history
Signed-off-by: Ryan Wolf <[email protected]>
  • Loading branch information
ryantwolf authored May 7, 2024
1 parent 8dd4255 commit 0e9b7f1
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 5 deletions.
1 change: 1 addition & 0 deletions config/fasttext_langid.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
input_field: text
filters:
- name: nemo_curator.filters.classifier_filter.FastTextLangId
log_score: True
params:
model_path: <Path to the FastText language id model (e.g., lid.176.bin)>
8 changes: 8 additions & 0 deletions nemo_curator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import dask

from .modules import *

# Without this option, Dask automatically converts the list-typed
# score column to a string.
# See https://github.com/NVIDIA/NeMo-Curator/issues/33
# The same conversion also happens when reading from and writing to files.
dask.config.set({"dataframe.convert-string": False})
5 changes: 0 additions & 5 deletions nemo_curator/filters/classifier_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,6 @@ def __init__(self, model_path=None, min_langid_score=0.3):
self._cutoff = min_langid_score
self._name = "lang_id"

# Dask will automatically convert the list score type
# to a string without this option.
# See https://github.com/NVIDIA/NeMo-Curator/issues/33
dask.config.set({"dataframe.convert-string": False})

@batched
def score_document(self, df: pd.Series):
model_attr = f"{self._name}_{self._model_path}"
Expand Down

0 comments on commit 0e9b7f1

Please sign in to comment.