From f02c75616ae0fc2ac0b7382f1b431a888e49c65f Mon Sep 17 00:00:00 2001
From: ccl-core
Date: Fri, 24 Jan 2025 14:20:51 +0000
Subject: [PATCH] Handle dash in the regex2glob function

---
 python/mlcroissant/mlcroissant/_src/core/regex.py      | 2 ++
 python/mlcroissant/mlcroissant/_src/core/regex_test.py | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/python/mlcroissant/mlcroissant/_src/core/regex.py b/python/mlcroissant/mlcroissant/_src/core/regex.py
index 468bfb6e..c7ffb3bb 100644
--- a/python/mlcroissant/mlcroissant/_src/core/regex.py
+++ b/python/mlcroissant/mlcroissant/_src/core/regex.py
@@ -65,6 +65,8 @@ def _regex_to_glob_for_str(regex: str) -> Iterable[str]:
     regex = re.sub(r"\.\*", "*", regex)
     # Interpret .+ as *
     regex = re.sub(r"\.\+", "*", regex)
+    # Interpret \\- as -
+    regex = re.sub(r"\\-", "-", regex)
     return [regex]
 
 
diff --git a/python/mlcroissant/mlcroissant/_src/core/regex_test.py b/python/mlcroissant/mlcroissant/_src/core/regex_test.py
index 3a153f01..c9a46bee 100644
--- a/python/mlcroissant/mlcroissant/_src/core/regex_test.py
+++ b/python/mlcroissant/mlcroissant/_src/core/regex_test.py
@@ -27,6 +27,12 @@
                 "*/train/*.parquet",  # ...to a valid glob pattern.
             ],
         ],
+        [
+            "^.+/my\\-train/.*\.parquet$",  # From a valid regex...
+            [
+                "*/my-train/*.parquet",  # ...to a valid glob pattern.
+            ],
+        ],
     ],
 )
 def test_regex_to_glob(regex: str, output: list[str]):