skip flaky tests
Signed-off-by: Sarah Yurick <[email protected]>
sarahyurick committed Dec 24, 2024
1 parent 74a17be commit df22622
Showing 1 changed file with 22 additions and 31 deletions.
tests/test_download.py: 53 changes (22 additions, 31 deletions)
@@ -82,6 +82,9 @@ def test_resiliparse_extract_text(self):
 
         assert result == expected
 
+    @pytest.mark.skip(
+        reason="Skipping until we figure out how to get this to a non flaky state"
+    )
     def test_common_crawl_urls(self):
         start_snapshot = "2021-04"
         end_snapshot = "2021-10"
@@ -103,6 +106,9 @@ def test_incorrect_snapshot_order(self):
             start_snapshot = "2021-10"
             urls = get_common_crawl_urls(start_snapshot, end_snapshot)
 
+    @pytest.mark.skip(
+        reason="Skipping until we figure out how to get this to a non flaky state"
+    )
     def test_common_crawl_news_urls(self):
         start_snapshot = "2021-04"
         end_snapshot = "2021-10"
@@ -111,52 +117,37 @@ def test_common_crawl_news_urls(self):
         assert (
             urls[0]
             == "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/04/CC-NEWS-20210401004522-01022.warc.gz"
-        ) or (
-            # Flaky test
-            urls[0]
-            == "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/05/CC-NEWS-20210501004458-01527.warc.gz"
         )
         assert (
             urls[-1]
             == "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/10/CC-NEWS-20211031225258-00089.warc.gz"
-        ) or (
-            # Flaky test
-            urls[-1]
-            == "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/09/CC-NEWS-20210930225622-00754.warc.gz"
         )
 
-        # Flaky test
-        assert len(urls) == 3838 or len(urls) == 3275
+        assert len(urls) == 3838
 
     def test_incorrect_snapshot_order_news(self):
         with pytest.raises(ValueError):
             end_snapshot = "2021-04"
             start_snapshot = "2021-10"
             urls = get_common_crawl_urls(start_snapshot, end_snapshot, news=True)
 
+    @pytest.mark.skip(
+        reason="Skipping until we figure out how to get this to a non flaky state"
+    )
     def test_uneven_common_crawl_range(self):
-        try:
-            start_snapshot = "2021-03"
-            end_snapshot = "2021-11"
-            urls = get_common_crawl_urls(start_snapshot, end_snapshot)
+        start_snapshot = "2021-03"
+        end_snapshot = "2021-11"
+        urls = get_common_crawl_urls(start_snapshot, end_snapshot)
 
-            assert (
-                urls[0]
-                == "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-10/segments/1614178347293.1/warc/CC-MAIN-20210224165708-20210224195708-00000.warc.gz"
-            )
-            assert (
-                urls[-1]
-                == "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-04/segments/1610704847953.98/warc/CC-MAIN-20210128134124-20210128164124-00799.warc.gz"
-            )
-            assert len(urls) == 143840
-
-        except Exception as exception_string:
-            # We expect this flaky error
-            if "JSONDecodeError" in exception_string:
-                pass
-            # Else, something else is going on that needs to debugged
-            else:
-                assert False
+        assert (
+            urls[0]
+            == "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-10/segments/1614178347293.1/warc/CC-MAIN-20210224165708-20210224195708-00000.warc.gz"
+        )
+        assert (
+            urls[-1]
+            == "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-04/segments/1610704847953.98/warc/CC-MAIN-20210128134124-20210128164124-00799.warc.gz"
+        )
+        assert len(urls) == 143840
 
     def test_no_urls(self):
         with pytest.raises(ValueError):
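For reference, a minimal self-contained sketch of the skip pattern this commit applies. The test names and bodies below are hypothetical and not part of tests/test_download.py; they only illustrate that @pytest.mark.skip(reason=...) keeps a test collected and reported while removing it from the run, so flaky assertions stop failing CI until they can be stabilized.

import random

import pytest


@pytest.mark.skip(
    reason="Skipping until we figure out how to get this to a non flaky state"
)
def test_flaky_crawl_count():
    # Stand-in for a test whose expected value drifts with external data
    # (simulated here with a random choice); it is collected but never executed.
    assert random.choice([3838, 3275]) == 3838


def test_stable_addition():
    # Unmarked tests continue to run as usual.
    assert 1 + 1 == 2

Running pytest with -rs prints a summary line for each skipped test along with its reason string, so the temporarily disabled coverage stays visible in the test output.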
