Skip to content

Commit

Permalink
Fix broken NeMo dependencies (#372)
Browse files: browse the repository at this point in the history
* add packaging

Signed-off-by: Sarah Yurick <[email protected]>

* move to requires

Signed-off-by: Sarah Yurick <[email protected]>

* move to github ci file

Signed-off-by: Sarah Yurick <[email protected]>

* add pin

Signed-off-by: Sarah Yurick <[email protected]>

* add torch

Signed-off-by: Sarah Yurick <[email protected]>

* add suggestion from mamba readme

Signed-off-by: Sarah Yurick <[email protected]>

* try github install

Signed-off-by: Sarah Yurick <[email protected]>

* add comma

Signed-off-by: Sarah Yurick <[email protected]>

* another attempt

Signed-off-by: Sarah Yurick <[email protected]>

* remove nemo toolkit

Signed-off-by: Sarah Yurick <[email protected]>

* add datasets

Signed-off-by: Sarah Yurick <[email protected]>

* try removing cython

Signed-off-by: Sarah Yurick <[email protected]>

* remove cython

Signed-off-by: Sarah Yurick <[email protected]>

* sentencepiece

Signed-off-by: Sarah Yurick <[email protected]>

* run black

Signed-off-by: Sarah Yurick <[email protected]>

* apply ryan's suggestion

Signed-off-by: Sarah Yurick <[email protected]>

---------

Signed-off-by: Sarah Yurick <[email protected]>
  • Loading branch information
sarahyurick authored Nov 15, 2024
1 parent f7441ea commit 363a66b
Show file tree
Hide file tree
Showing 8 changed files with 11 additions and 15 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,8 @@ jobs:

# Installing wheel beforehand due to fasttext issue:
# https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666
# Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94
run: |
pip install wheel cython
pip install wheel
pip install --no-cache-dir .
pip install pytest
- name: Run tests
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ RUN conda create -y --name curator -c conda-forge -c nvidia \
libcusparse \
libcusolver && \
source activate curator && \
pip install --upgrade cython pytest pip
pip install --upgrade pytest pip

RUN \
--mount=type=bind,source=/opt/NeMo-Curator/nemo_curator/__init__.py,target=/opt/NeMo-Curator/nemo_curator/__init__.py,from=curator-update \
Expand Down
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,12 @@ You can get NeMo-Curator in 3 ways.
#### PyPI

```bash
pip install cython
pip install --extra-index-url https://pypi.nvidia.com nemo-curator[all]
```

#### Source
```bash
git clone https://github.com/NVIDIA/NeMo-Curator.git
pip install cython
pip install --extra-index-url https://pypi.nvidia.com "./NeMo-Curator[all]"
```

Expand Down
2 changes: 0 additions & 2 deletions docs/user-guide/image/gettingstarted.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ NeMo Curator's PyPi page can be found `here <https://pypi.org/project/nemo-curat

.. code-block:: bash
pip install cython
pip install nemo-curator[image]
#####################
Expand All @@ -44,7 +43,6 @@ NeMo Curator's GitHub can be found `here <https://github.com/NVIDIA/NeMo-Curator
.. code-block:: bash
git clone https://github.com/NVIDIA/NeMo-Curator.git
pip install cython
pip install ./NeMo-Curator[image]
############################
Expand Down
8 changes: 4 additions & 4 deletions nemo_curator/filters/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import csv
import warnings

import sentencepiece
from bs4 import BeautifulSoup
from comment_parser import comment_parser

Expand Down Expand Up @@ -102,19 +103,18 @@ def keep_document(self, score):
class TokenizerFertilityFilter(DocumentFilter):

def __init__(self, path_to_tokenizer=None, min_char_to_token_ratio=2.5):
from nemo.collections.common.tokenizers import SentencePieceTokenizer

if path_to_tokenizer is None:
raise ValueError(
"Must provide a valid path to a SentencePiece " "tokenizer"
)
self._tokenizer = SentencePieceTokenizer(path_to_tokenizer)
self._tokenizer = sentencepiece.SentencePieceProcessor()
self._tokenizer.Load(path_to_tokenizer)
self._threshold = min_char_to_token_ratio

self._name = "tokenizer_fertility"

def score_document(self, source):
tokens = self._tokenizer.text_to_tokens(source)
tokens = self._tokenizer.encode_as_pieces(source)
num_chars = len(source)
num_tokens = len(tokens)
if num_tokens == 0:
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ dependencies = [
"crossfit>=0.0.6",
"dask-mpi>=2021.11.0",
"dask[complete]>=2021.7.1",
"datasets",
"distributed>=2021.7.1",
"fasttext==0.9.2",
"ftfy==6.1.1",
Expand All @@ -54,14 +55,14 @@ dependencies = [
"lxml_html_clean",
"mecab-python3",
"mwparserfromhell==0.6.5",
"nemo_toolkit[nlp]>=1.23.0",
"numpy<2",
"openai",
"peft",
"presidio-analyzer==2.2.351",
"presidio-anonymizer==2.2.351",
"pycld2",
"resiliparse",
"sentencepiece",
"spacy>=3.6.0, <3.8.0",
"unidic-lite==1.0.8",
"usaddress==0.5.10",
Expand Down
2 changes: 1 addition & 1 deletion tutorials/image-curation/image-curation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
},
"outputs": [],
"source": [
"!pip install cython ipywidgets aiofiles\n",
"!pip install ipywidgets aiofiles\n",
"# Install from source by default\n",
"!pip install --extra-index-url https://pypi.nvidia.com ../../[image]\n",
"%env DASK_DATAFRAME__QUERY_PLANNING False"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,11 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install -qU wheel cython"
"!pip install -qU wheel"
]
},
{
Expand Down

0 comments on commit 363a66b

Please sign in to comment.