skip flaky tests
Signed-off-by: Sarah Yurick <[email protected]>
sarahyurick committed Dec 24, 2024
1 parent 74a17be commit df22622
Showing 1 changed file with 22 additions and 31 deletions.
tests/test_download.py: 53 changes (22 additions, 31 deletions)
@@ -82,6 +82,9 @@ def test_resiliparse_extract_text(self):
 
         assert result == expected
 
+    @pytest.mark.skip(
+        reason="Skipping until we figure out how to get this to a non flaky state"
+    )
     def test_common_crawl_urls(self):
         start_snapshot = "2021-04"
         end_snapshot = "2021-10"
@@ -103,6 +106,9 @@ def test_incorrect_snapshot_order(self):
             start_snapshot = "2021-10"
             urls = get_common_crawl_urls(start_snapshot, end_snapshot)
 
+    @pytest.mark.skip(
+        reason="Skipping until we figure out how to get this to a non flaky state"
+    )
     def test_common_crawl_news_urls(self):
         start_snapshot = "2021-04"
         end_snapshot = "2021-10"
@@ -111,52 +117,37 @@ def test_common_crawl_news_urls(self):
         assert (
             urls[0]
             == "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/04/CC-NEWS-20210401004522-01022.warc.gz"
-        ) or (
-            # Flaky test
-            urls[0]
-            == "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/05/CC-NEWS-20210501004458-01527.warc.gz"
         )
         assert (
             urls[-1]
             == "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/10/CC-NEWS-20211031225258-00089.warc.gz"
-        ) or (
-            # Flaky test
-            urls[-1]
-            == "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/09/CC-NEWS-20210930225622-00754.warc.gz"
         )
 
-        # Flaky test
-        assert len(urls) == 3838 or len(urls) == 3275
+        assert len(urls) == 3838
 
     def test_incorrect_snapshot_order_news(self):
         with pytest.raises(ValueError):
             end_snapshot = "2021-04"
             start_snapshot = "2021-10"
             urls = get_common_crawl_urls(start_snapshot, end_snapshot, news=True)
 
+    @pytest.mark.skip(
+        reason="Skipping until we figure out how to get this to a non flaky state"
+    )
     def test_uneven_common_crawl_range(self):
-        try:
-            start_snapshot = "2021-03"
-            end_snapshot = "2021-11"
-            urls = get_common_crawl_urls(start_snapshot, end_snapshot)
+        start_snapshot = "2021-03"
+        end_snapshot = "2021-11"
+        urls = get_common_crawl_urls(start_snapshot, end_snapshot)
 
-            assert (
-                urls[0]
-                == "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-10/segments/1614178347293.1/warc/CC-MAIN-20210224165708-20210224195708-00000.warc.gz"
-            )
-            assert (
-                urls[-1]
-                == "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-04/segments/1610704847953.98/warc/CC-MAIN-20210128134124-20210128164124-00799.warc.gz"
-            )
-            assert len(urls) == 143840
-
-        except Exception as exception_string:
-            # We expect this flaky error
-            if "JSONDecodeError" in exception_string:
-                pass
-            # Else, something else is going on that needs to debugged
-            else:
-                assert False
+        assert (
+            urls[0]
+            == "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-10/segments/1614178347293.1/warc/CC-MAIN-20210224165708-20210224195708-00000.warc.gz"
+        )
+        assert (
+            urls[-1]
+            == "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-04/segments/1610704847953.98/warc/CC-MAIN-20210128134124-20210128164124-00799.warc.gz"
+        )
+        assert len(urls) == 143840
 
     def test_no_urls(self):
         with pytest.raises(ValueError):
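For reference, a minimal self-contained sketch of the skip pattern this commit applies. The test names and bodies below are hypothetical and not part of tests/test_download.py; they only illustrate that @pytest.mark.skip(reason=...) keeps a test collected and reported while removing it from the run, so flaky assertions stop failing CI until they can be stabilized.

import random

import pytest


@pytest.mark.skip(
    reason="Skipping until we figure out how to get this to a non flaky state"
)
def test_flaky_crawl_count():
    # Stand-in for a test whose expected value drifts with external data
    # (simulated here with a random choice); it is collected but never executed.
    assert random.choice([3838, 3275]) == 3838


def test_stable_addition():
    # Unmarked tests continue to run as usual.
    assert 1 + 1 == 2

Running pytest with -rs prints a summary line for each skipped test along with its reason string, so the temporarily disabled coverage stays visible in the test output.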
