scrapMSP.py
"""
AUTHOR : SANJAY KUMAR AGARWAL
OWNER : KAMPAYER.IN
DATE : MAY 22, 2014
"""
import urllib2
from bs4 import BeautifulSoup

# position of each store's link column in an output row (offset by 2 for mspId and name)
storeIndex = {"flipkart": 0, "amazon": 1, "homeshop18": 2, "infibeam": 3, "ebay": 4, "snapdeal": 5}
def getShortestLink(productURL, productStore):
    """Strip tracking/redirect parameters and return a canonical product URL."""
    productURL = urllib2.unquote(productURL)
    if productStore == "homeshop18":
        # the real product URL is embedded after the 'redirect=' parameter
        return urllib2.unquote(productURL.strip().split('redirect=')[-1]).split('?')[0]
    if productStore == "amazon":
        # rebuild the canonical /dp/<ASIN> form from the URL path
        return 'http://www.amazon.in/dp/' + productURL.strip().split('/')[5]
    if productStore == "flipkart":
        return productURL.strip().split('&')[0]
    if productStore == "infibeam":
        return productURL.strip().split('?')[0]
    if productStore == "ebay":
        # the target URL is carried in the 'mpre' query parameter
        return urllib2.unquote(productURL.strip().split('&mpre=')[-1]).split('?')[0]
    if productStore == "snapdeal":
        return "NA"
    return "1"  # sentinel for an unrecognised store
def categoryScrap(categoryURL, category, fileName):
    """Append (mspId, name, URL) rows for every product listed under a category
    page, following 'next' pagination links recursively."""
    urlSourceCode = urllib2.urlopen(categoryURL.strip())
    soup = BeautifulSoup(urlSourceCode)
    tableOfProducts = soup.findAll('div', 'listitems_rd')[0]
    products = tableOfProducts.findAll('div', 'msplistitem')
    f = open(fileName, 'a')
    for product in products:
        mspId = product['data-mspid']
        productInfo = product.findAll('a', 'item-title')[0]
        productName = productInfo.text
        productURL = productInfo['href']
        print >> f, '\t'.join([mspId.strip(), productName.strip(), urllib2.unquote(productURL).strip()])
    f.close()  # close before recursing so open handles do not pile up across pages
    nextPage = soup.find('a', 'msplistnav next')
    if nextPage:
        categoryScrap(nextPage['href'], category, fileName)
    return 1
def productScrap(productURL, fileName, mspId, mspName):
    """Append one row per product: mspId, name, then one link column per known
    store ("NA" where the store does not sell the product)."""
    urlSourceCode = urllib2.urlopen(productURL.strip())
    soup = BeautifulSoup(urlSourceCode)
    tableStores = soup.findAll('div', "price_table_in")[0]
    stores = tableStores.findAll('div', "store_pricetable")
    f = open(fileName, 'a')
    productStores = []
    for store in stores:
        # store rows are listed with available offers first; stop at the first "Not Available"
        if store.find('div', 'store_price').string == "Not Available":
            break
        storeInfo = store.find('a')
        generalStoreLinkMSP, storeName = storeInfo['href'], storeInfo.find('img')['alt']
        if storeName not in storeIndex:
            continue
        # follow the comparison site's redirect page to get the store's own URL
        generalStoreLink = BeautifulSoup(urllib2.urlopen(generalStoreLinkMSP)).find('a', "store-link")['href']
        productStores.append([storeName, generalStoreLink])
    storeInfos = [mspId, mspName, "NA", "NA", "NA", "NA", "NA", "NA"]
    for productStore in productStores:
        storeInfos[storeIndex[productStore[0]] + 2] = getShortestLink(productStore[1], productStore[0])
    print >> f, '\t'.join(storeInfos)
    f.close()
    return 1
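# Note: categoryScrap is not invoked below; it would typically be run first
# against a category page to build the product listing, e.g. (hypothetical URL
# and filename):
#   categoryScrap('http://www.example.com/mobile-phones/', 'mobile_phones', 'mobile_listing.tsv')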
# each line of mobile_phones.csv is expected to be: mspId,productName,productURL
f = open("mobile_phones.csv", 'r')
for line in f:
    line = line.strip().split(',')
    try:
        productScrap(line[2], "productStores.csv", line[0], line[1])
    except:
        print line  # log rows that could not be scraped and keep going
f.close()