-
Notifications
You must be signed in to change notification settings - Fork 90
/
Copy pathquotes_to_scrape.py
38 lines (29 loc) · 1.07 KB
/
quotes_to_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import logging
import requests
from mlscraper.html import Page
from mlscraper.samples import Sample, TrainingSet
from mlscraper.training import train_scraper
def main():
    """
    Train a scraper on one author page of quotes.toscrape.com and apply it to another.

    A single training sample is built from the Albert Einstein author page,
    labelled with his name and birth date. The scraper trained on that sample
    is then run against the J.K. Rowling author page; the result is printed
    and returned.

    Returns:
        The value produced by ``scraper.get`` for the J.K. Rowling page,
        expected to be ``{'name': 'J.K. Rowling', 'born': 'July 31, 1965'}``.

    Raises:
        requests.RequestException: if either page cannot be fetched, the
            request times out, or the server answers with an error status.
    """
    # Bound every request so a stalled server cannot hang the example forever.
    timeout_seconds = 10

    # fetch the page to train
    einstein_url = "http://quotes.toscrape.com/author/Albert-Einstein/"
    resp = requests.get(einstein_url, timeout=timeout_seconds)
    # Raise explicitly rather than assert: asserts vanish under `python -O`.
    resp.raise_for_status()

    # create a sample for Albert Einstein
    training_set = TrainingSet()
    page = Page(resp.content)
    sample = Sample(page, {"name": "Albert Einstein", "born": "March 14, 1879"})
    training_set.add_sample(sample)

    # train the scraper with the created training set
    scraper = train_scraper(training_set)

    # scrape another page
    resp = requests.get(
        "http://quotes.toscrape.com/author/J-K-Rowling", timeout=timeout_seconds
    )
    # Do not feed an error page to the scraper.
    resp.raise_for_status()
    result = scraper.get(Page(resp.content))
    print(result)
    return result
if __name__ == "__main__":
    # Configure root logging at INFO before running the example so that
    # INFO-level log records emitted during the run are shown.
    logging.basicConfig(level=logging.INFO)
    main()