From 54c2c7f441947ff525f1b83ae2555493cc998deb Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Thu, 4 Apr 2024 10:21:25 -0700 Subject: [PATCH 1/4] Add dependency to fix justext Signed-off-by: Ryan Wolf --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 9652d646e..872ccc639 100644 --- a/setup.py +++ b/setup.py @@ -64,6 +64,7 @@ "presidio-anonymizer==2.2.351", "usaddress==0.5.10", "nemo_toolkit[nlp]>=1.23.0", + "lxml[html_clean]", ], entry_points={ "console_scripts": [ From 22df5c13d12cd930a81bdb3af2bf2358730f9f4f Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 9 Apr 2024 13:21:53 -0700 Subject: [PATCH 2/4] Add test for imports Signed-off-by: Ryan Wolf --- tests/test_download.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 tests/test_download.py diff --git a/tests/test_download.py b/tests/test_download.py new file mode 100644 index 000000000..0d590da8f --- /dev/null +++ b/tests/test_download.py @@ -0,0 +1,9 @@ +class TestDownload: + def test_imports(): + from nemo_curator.download import ( + download_arxiv, + download_common_crawl, + download_wikipedia, + ) + + assert True From 4c04253d69eb1eeb7b654eeff82f512542f7f1d5 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 9 Apr 2024 13:26:37 -0700 Subject: [PATCH 3/4] Add self arg Signed-off-by: Ryan Wolf --- tests/test_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_download.py b/tests/test_download.py index 0d590da8f..eea9bff50 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -1,5 +1,5 @@ class TestDownload: - def test_imports(): + def test_imports(self): from nemo_curator.download import ( download_arxiv, download_common_crawl, From 9d7b28511821b0be08cb92f7478399cdaec10787 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Wed, 10 Apr 2024 10:46:38 -0700 Subject: [PATCH 4/4] Add note about upstream issue Signed-off-by: Ryan Wolf --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 872ccc639..b47ef5c95 100644 --- a/setup.py +++ b/setup.py @@ -64,6 +64,8 @@ "presidio-anonymizer==2.2.351", "usaddress==0.5.10", "nemo_toolkit[nlp]>=1.23.0", + # justext installation breaks without lxml[html_clean] + # due to this: https://github.com/miso-belica/jusText/issues/47 "lxml[html_clean]", ], entry_points={