diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py index 9dd21978ce..56e6b310bd 100644 --- a/haystack/nodes/preprocessor/preprocessor.py +++ b/haystack/nodes/preprocessor/preprocessor.py @@ -120,7 +120,7 @@ def __init__( nltk.data.find("tokenizers/punkt") except LookupError: try: - nltk.download("punkt") + nltk.download("punkt_tab") except FileExistsError as error: logger.debug("NLTK punkt tokenizer seems to be already downloaded. Error message: %s", error) pass diff --git a/pyproject.toml b/pyproject.toml index 5f576ac52a..56e865482f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -147,7 +147,7 @@ crawler = [ "selenium>=4.11.0" ] preprocessing = [ - "nltk", + "nltk>=3.9", "langdetect", # for language classification ] file-conversion = [