-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdecathlon_scrape.py
113 lines (94 loc) · 4.49 KB
/
decathlon_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
''' Scrape the ~130 pages of Last Call items on the Decathlon website and
export them to CSV as "price|item_name|item_link|image_link".
As of 25-OCT-2018, there are 15 items per page and 133 pages in total.
A full run takes ~15 minutes, so something definitely should be improved.
It may be faster to append to the CSV page by page instead of line by line.
Usage:
    % python decathlon_scrape.py
Can Sakirt - October 2018
'''
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
def opengetpage(link):
    ''' Open link. Return list of items (products).

    link: URL of one page of the Last Call collection.
    Returns the list of product <div> elements found inside the
    <div id="bc-sf-filter-products"> container of that page.
    Raises IndexError if the page has no such container; errors raised
    by urlopen propagate unchanged.
    '''
    # Exact value of the class attribute of a product <div>.
    # Built with implicit string concatenation: a backslash continuation
    # inside the literal would also embed the leading whitespace of the
    # continued line in the value being matched.
    item_class = ("grid__item large--one-third "
                  "one-half collectionProduct "
                  "js-bln-product")
    # Open link; the with-block closes the HTTP response after parsing.
    with urlopen(link) as html:
        # Convert to a bs object
        bsObj = BeautifulSoup(html, "lxml")
    # Find all <div> elements with the id=bc-sf-filter-products,
    # this returns a list, so select the zeroth one (notice [0] at the end)
    table = bsObj.findAll("div", {"id": "bc-sf-filter-products"})[0]
    # Find all <div> elements carrying the product class string above.
    # (Decathlon displayed 15 items per page as of October 2018.)
    items = table.findAll("div", {"class": item_class})
    return items
def get_items(item):
    ''' Parse each item. These are div elements for each product.

    item: one product <div> element (a BeautifulSoup tag).
    Returns the list [price, item_name, item_link, image_link].
    Raises IndexError if the item has no price element.
    '''
    # Class attribute of the price element. Written as one string with
    # single spaces between the class names: continuing the literal with
    # a backslash right after "collectionProduct-price" fuses it with
    # the next class name (or embeds indentation), so nothing matches.
    price_class = "productPrice collectionProduct-price u-marginBottom0x"
    # Get img element's data-src and make it a proper link
    image_link = "https:" + item.img["data-src"]
    # The product name and its relative link both live in the <a> under <p>
    anchor = item.p.a
    item_name = anchor.get_text()
    # Get a element's href and make it a proper link
    item_link = "https://www.decathlon.com" + anchor["href"]
    # Get text (price) of the first element with said attributes
    price = item.findAll(attrs={"class": price_class})[0].get_text()
    # Strip price of spaces and other text.
    price = only_price(price)
    # Product info for one item, in CSV column order.
    return [price, item_name, item_link, image_link]
def only_price(t):
    ''' Strips the prices got from Decathlon's products.
    Strips different kinds of prices.
    See HTML code where the prices are.

    t: raw text of a price element.
    Returns:
      * multi-line text: the second line, with its spaces removed;
      * single-line text containing spaces: the second
        whitespace-separated token;
      * anything else: the text with surrounding whitespace stripped.
    '''
    t = t.strip()
    if "\n" in t:
        # The stripped text contains a newline, so index 1 always exists.
        return t.replace(" ", "").split("\n")[1]
    if " " in t:
        # Split BEFORE discarding the spaces: removing them first leaves
        # a single token, and indexing [1] then raises IndexError.
        # The text is stripped, so the space is interior and there are
        # always at least two tokens.
        return t.split()[1]
    return t
def write_to_file(line, path=r"C:\decathlon_last_call.csv", *, dry_run=True):
    ''' Writes each line into csv. Default location is the root of C:
    It may require run as admin.

    line: list of fields for one row, e.g. [price, name, link, image link].
    path: CSV file to append to.
    dry_run: when True (the default) the row is only printed and nothing
        is written; pass dry_run=False to append the row to the file.
    Returns None.
    '''
    if dry_run:
        # Testing mode: show the row instead of touching the disk.
        print(line)
        return
    # newline='' lets the csv module control line endings itself; an
    # explicit encoding keeps non-ASCII product names from depending on
    # the platform's default code page.
    with open(path, "a", newline='', encoding="utf-8") as f:
        writer = csv.writer(f, delimiter='|')
        writer.writerow(line)
def number_of_pages():
    ''' Looks at the last-call page and finds the last page
    by looking at the paginator on the bottom of the page.

    Returns the text of the paginator's fifth <span>, converted to int.
    Raises IndexError if the expected elements are missing and
    ValueError if that text is not a number; errors raised by urlopen
    propagate unchanged.
    '''
    decathlon_page = "https://www.decathlon.com/collections/last-call"
    # The with-block closes the HTTP response once the page is parsed.
    with urlopen(decathlon_page) as html1:
        bsObj1 = BeautifulSoup(html1, "lxml")
    # Product container -> bottom paginator -> its <span> elements.
    container = bsObj1.select('[id="bc-sf-filter-products"]')[0]
    paginator = container.select('[class="paginate-bottom"]')[0]
    # The fifth <span> (index 4) is read as the last page number.
    number = paginator.findAll('span')[4].get_text()
    return int(number)  # convert to int because it was string.
# Number of pages, i.e. the number of the last page shown by the paginator.
# (Assign a small number here instead if you wanna test a couple of pages.)
number = number_of_pages()
# Pages are numbered from 1 to `number` inclusive, so the range must stop
# at number + 1; range(1, number) would silently skip the last page.
for i in range(1, number + 1):
    print("printing page no", i)
    # build the url of the current page.
    decathlon_page = ("https://www.decathlon.com/collections/last-call?page="
                      + str(i))
    # get items of the current page.
    items = opengetpage(decathlon_page)
    for item in items:
        # iterate over each item that comes from one page.
        write_to_file(get_items(item))
# each line looks like this: [price, name, link, image link]
# ['$3.99', "Women's Hiking Short-Sleeve T-Shirt Techfresh 50", 'https://www.decathlon.com/collections/last-call/products/womens-short-sleeved-hiking-t-shirt-techfresh-50', 'https://cdn.shopify.com/s/files/1/1330/6287/products/13dc432b6d3b42f893727ca49408c140_large.jpg?v=1505958167']
# Links look like this:
# https://cdn.shopify.com/s/files/1/1330/6287/products/13dc432b6d3b42f893727ca49408c140_large.jpg?v=1505958167
# https://www.decathlon.com/collections/last-call/products/womens-short-sleeved-hiking-t-shirt-techfresh-50