-
Notifications
You must be signed in to change notification settings - Fork 90
/
Copy pathtest_scrapers.py
74 lines (62 loc) · 2.57 KB
/
test_scrapers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from mlscraper.html import Page
from mlscraper.matches import AttributeValueExtractor
from mlscraper.matches import TextValueExtractor
from mlscraper.scrapers import DictScraper
from mlscraper.scrapers import ListScraper
from mlscraper.scrapers import ValueScraper
from mlscraper.selectors import CssRuleSelector
from mlscraper.selectors import PassThroughSelector
class TestListOfDictScraper:
    """Test a ``ListScraper`` whose item scraper is a ``DictScraper``."""

    def test_scrape(self, stackoverflow_samples):
        """Scrape the first Stack Overflow sample and compare with its value.

        A ``DictScraper`` with the keys ``user``, ``upvotes`` and ``when`` is
        handed to a ``ListScraper`` together with the ``.answer`` CSS rule;
        the result of ``get`` on the sample's page must equal ``sample.value``.
        """
        answer_scraper = DictScraper(
            {
                "user": ValueScraper(
                    CssRuleSelector(".user-details a"),
                    AttributeValueExtractor("href"),
                ),
                "upvotes": ValueScraper(
                    CssRuleSelector(".js-vote-count"),
                    TextValueExtractor(),
                ),
                "when": ValueScraper(
                    CssRuleSelector(".user-action-time span"),
                    AttributeValueExtractor("title"),
                ),
            }
        )
        answers_scraper = ListScraper(CssRuleSelector(".answer"), answer_scraper)

        # Only the first sample provided by the fixture is checked here.
        first_sample = stackoverflow_samples[0]
        scraped = answers_scraper.get(first_sample.page)
        assert first_sample.value == scraped
class TestDictScraper:
    """Test ``DictScraper`` on a small page rendered from a known dict."""

    def test_scrape_matches(self):
        """Render a dict into HTML and expect ``DictScraper.get`` to return it.

        The ``h`` value is placed in an ``<h1>`` and the ``t`` value in a
        ``<p>``; each key gets a ``ValueScraper`` with the matching tag rule.
        """
        expected = {"h": "no 1", "t": "the first one"}

        # Fill the %-style template from the expected dict, then wrap it.
        template = "<div><h1>%(h)s</h1><p>%(t)s</p></div>"
        elem = template % expected
        document = Page(f"<html><body>{elem}</body></html>")

        # A single extractor instance is shared by both per-key scrapers.
        extract_text = TextValueExtractor()
        heading_scraper = ValueScraper(CssRuleSelector("h1"), extract_text)
        paragraph_scraper = ValueScraper(CssRuleSelector("p"), extract_text)
        dict_scraper = DictScraper(
            scraper_per_key={"h": heading_scraper, "t": paragraph_scraper}
        )

        assert dict_scraper.get(document) == expected
class TestValueScraper:
    """Test ``ValueScraper`` with a class-based CSS rule on two pages."""

    def test_value_scraper(self):
        """Expect ``get`` to return ``"test"`` and ``"hallo"`` respectively.

        Both pages contain one ``<p class="test">``, at a different position
        among its siblings; the same ``.test`` scraper is applied to each.
        """
        cases = [
            (
                Page('<html><body><p class="test">test</p><p>bla</p></body></html>'),
                "test",
            ),
            (
                Page('<html><body><div></div><p class="test">hallo</p></body></html>'),
                "hallo",
            ),
        ]
        scraper = ValueScraper(CssRuleSelector(".test"), TextValueExtractor())

        for page, expected_text in cases:
            assert scraper.get(page) == expected_text
class TestListOfValuesScraper:
    """Test a ``ListScraper`` whose item scraper is a ``ValueScraper``."""

    def test_list_of_values_scraper(self):
        """Expect the three ``<p>`` texts, in order, from a page given as bytes.

        The page also contains an ``<i>noise</i>`` element whose text is not
        part of the expected list.
        """
        document = Page(b"<html><body><p>a</p><i>noise</i><p>b</p><p>c</p></body></html>")

        paragraph_selector = CssRuleSelector("p")
        item_scraper = ValueScraper(PassThroughSelector(), TextValueExtractor())
        list_scraper = ListScraper(paragraph_selector, item_scraper)

        expected = ["a", "b", "c"]
        assert list_scraper.get(document) == expected