forked from unpackpy/unpack-py101-webscrapping
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_data.py
74 lines (50 loc) · 1.88 KB
/
get_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from urllib.parse import quote
import pandas as pd
from bs4 import BeautifulSoup
from requests_html import HTMLSession
def url_2_soup(url):
    """Fetch a URL and return its HTML parsed into a BeautifulSoup object.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` tree built from the response HTML using the
        ``html.parser`` backend.
    """
    # The session holds pooled connections; the context manager closes them
    # once the page has been read instead of leaking one pool per call.
    with HTMLSession() as session:
        resp = session.get(url)
        return BeautifulSoup(resp.html.html, "html.parser")
def get_jd_url(keywords):
    """Build the JD search URL for the given keywords.

    The keywords are percent-encoded before being placed in the
    ``keyword`` query parameter.
    """
    encoded_keywords = quote(keywords)
    return f"https://search.jd.com/Search?keyword={encoded_keywords}"
def get_jd_image(prod):
    """Return the image URL of a JD product element.

    Looks up the ``img`` tag inside the product's ``div.p-img`` container,
    reads its ``data-lazy-img`` attribute, strips surrounding whitespace and
    prefixes the result with ``http:``.
    """
    img_tag = prod.find("div", class_="p-img").find("img")
    lazy_src = img_tag.attrs["data-lazy-img"]
    return "http:" + lazy_src.strip()
def get_df_jd(keywords):
    """Get a dataframe for the results of a JD search.

    Args:
        keywords: Search text passed to ``get_jd_url``.

    Returns:
        A ``pandas.DataFrame`` with one row per ``div.gl-i-wrap`` element on
        the results page and the columns ``name``, ``price`` (float) and
        ``image``. The columns are present even when no product is found.
    """
    soup = url_2_soup(get_jd_url(keywords))

    rows = []
    for product in soup.find_all("div", {"class": "gl-i-wrap"}):
        price_text = product.find("div", class_="p-price").find("i").text
        rows.append(
            {
                "name": product.find("div", class_="p-name").text.strip(),
                "price": float(price_text.strip()),
                "image": get_jd_image(product),
            }
        )

    # Pin the schema so an empty result still yields the expected columns
    # rather than a column-less frame that breaks callers indexing by name.
    return pd.DataFrame(rows, columns=["name", "price", "image"])
# TODO: Replace each ... placeholder below with the web scraper implementation
def get_amazon_url(keywords, domain="co.uk"):
    """Return the URL for an Amazon search based on a keyword.

    Args:
        keywords: Search text; it is percent-encoded before being placed in
            the ``k`` query parameter.
        domain: Amazon top-level domain suffix, e.g. ``"co.uk"`` or ``"com"``.

    Returns:
        The search URL as a string.
    """
    # Built the same way as get_jd_url: encode the keywords, then append them
    # to the site's search endpoint.
    return f"https://www.amazon.{domain}/s?k={quote(keywords)}"
def get_amazon_price(prod):
    """Get the price of a given Amazon product.

    Not implemented yet: the body is a placeholder, so the function
    currently returns ``None``.

    NOTE(review): ``prod`` is presumably a parsed HTML element for a single
    search result, like the argument of ``get_jd_image`` -- confirm when
    implementing.
    """
    ...
def get_df_amazon(keywords, domain="co.uk"):
    """Get a dataframe for the results of an Amazon search.

    Not implemented yet: the body is a placeholder, so the function
    currently returns ``None``.

    NOTE(review): presumably meant to mirror ``get_df_jd`` (name, price and
    image per product) for the Amazon site selected by ``domain`` -- confirm
    when implementing.
    """
    ...
def get_newegg_url(keywords):
    """Return the URL for a NewEgg search based on a keyword.

    Args:
        keywords: Search text; it is percent-encoded before being placed in
            the ``d`` query parameter.

    Returns:
        The search URL as a string.
    """
    # Built the same way as get_jd_url: encode the keywords, then append them
    # to the site's search endpoint.
    return f"https://www.newegg.com/p/pl?d={quote(keywords)}"
def get_newegg_price(price_current):
    """Extract the numeric price from a NewEgg price string.

    Args:
        price_current: Raw price text, e.g. ``"$1,299.99 (5 Offers)"``.

    Returns:
        The first number found in the text as a float, with thousands
        separators removed.

    Raises:
        ValueError: If the text contains no number.
    """
    import re  # kept local so the function stays self-contained

    # First run of digits (commas allowed as thousands separators) with an
    # optional fractional part, or a bare fraction such as ".99". Allowing
    # the comma matters: stopping at it would turn "$1,299.99" into 1.0.
    match = re.search(r"\d[\d,]*(?:\.\d+)?|\.\d+", price_current)
    if match is None:
        raise ValueError(f"no price found in {price_current!r}")
    return float(match.group().replace(",", ""))
def get_df_newegg(keywords):
    """Get a dataframe for the results of a NewEgg search.

    Not implemented yet: the body is a placeholder, so the function
    currently returns ``None``.

    NOTE(review): presumably meant to mirror ``get_df_jd`` and to parse
    prices with ``get_newegg_price`` -- confirm when implementing.
    """
    ...