Skip to content

Commit

Permalink
Formatting updates
Browse files (browse the repository at this point in the history)
  • Loading branch information
antgad committed Nov 1, 2021
1 parent b05afe3 commit 48fde02
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 15 deletions.
12 changes: 8 additions & 4 deletions src/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from datetime import datetime
import math

def formatResult(website, titles, prices, links,ratings):
def formatResult(website, titles, prices, links,ratings,df_flag):
"""
The formatResult function takes the scraped HTML as input, and extracts the
necessary values from the HTML code. Ex. extracting a price '$19.99' from
Expand All @@ -24,18 +24,21 @@ def formatResult(website, titles, prices, links,ratings):
title, price, link, rating = '', '', '', ''
if titles: title = titles[0].get_text().strip()
if prices: price = prices[0].get_text().strip()
#if links: link = links[0]['href']
if links: link = links[0]['href']
if ratings: rating = ratings[0].get_text().strip().split()[0]
if df_flag==0: title=formatTitle(title)
if df_flag==0: link=formatTitle(link)
product = {
'timestamp': datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
"title": formatTitle(title),
"title": title,
"price": price,
# "link":f'www.{website}.com{link}',
"link":f'www.{website}.com{link}',
"website": website,
"rating" : rating,
}
return product


def sortList(arr, sortBy, reverse):
"""
The sortList function is used to sort the products list based on the
Expand Down Expand Up @@ -64,6 +67,7 @@ def formatTitle(title):
return title[:40] + "..."
return title


def getNumbers(st):
"""
The getNumbers function extracts float values (price) from a string.
Expand Down
14 changes: 10 additions & 4 deletions src/full_version.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import json
import os
import pandas as pd
import scraper

class full_version:
def __init__(self):
Expand All @@ -13,6 +15,7 @@ def __init__(self):
"json",
"user_data.json"
)
self.df=pd.DataFrame()


def login(self):
Expand Down Expand Up @@ -43,10 +46,13 @@ def extract_list(self):
pass

def scrape(self,prod):
products_1 = scraper.searchAmazon(prod)
products_2 = scraper.searchWalmart(prod)
products_3 = scraper.searchEtsy(prod)

products_1 = scraper.searchAmazon(prod,1)
products_2 = scraper.searchWalmart(prod,1)
products_3 = scraper.searchEtsy(prod,1)
results=products_1+products_2+products_3
#esults = formatter.sortList(results, "ra" , True)
self.df=pd.DataFrame.from_dict(results, orient='columns')
print(self.df)



Expand Down
12 changes: 6 additions & 6 deletions src/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def httpsGet(URL):
return BeautifulSoup(soup1.prettify(), "html.parser")


def searchAmazon(query):
def searchAmazon(query, df_flag=0):
"""
The searchAmazon function scrapes amazon.com
"""
Expand All @@ -43,11 +43,11 @@ def searchAmazon(query):
for res in results:
titles, prices, links = res.select("h2 a span"), res.select("span.a-price span"), res.select("h2 a.a-link-normal")
ratings = res.select("span.a-icon-alt")
product = formatter.formatResult("amazon", titles, prices, links,ratings)
product = formatter.formatResult("amazon", titles, prices, links,ratings, df_flag)
products.append(product)
return products

def searchWalmart(query):
def searchWalmart(query, df_flag=0):
"""
The searchWalmart function scrapes walmart.com
"""
Expand All @@ -61,11 +61,11 @@ def searchWalmart(query):
for res in results:
titles, prices, links = res.select("span.lh-title"), res.select("div.lh-copy"), res.select("a")
ratings = res.findAll("span",{"class":"w_Cj"},text=pattern)
product = formatter.formatResult("walmart", titles, prices, links,ratings)
product = formatter.formatResult("walmart", titles, prices, links,ratings, df_flag)
products.append(product)
return products

def searchEtsy(query):
def searchEtsy(query, df_flag=0):
"""
The searchEtsy function scrapes Etsy.com
"""
Expand All @@ -79,6 +79,6 @@ def searchEtsy(query):
for item in soup.select('.wt-grid__item-xs-6'):
titles, prices, links = (item.select("h3")), (item.select(".currency-value")), (item.select('.width-full'))
ratings = item.select('span.screen-reader-only')
product = formatter.formatResult("Etsy", titles, prices, links, ratings)
product = formatter.formatResult("Etsy", titles, prices, links, ratings, df_flag)
products.append(product)
return products
3 changes: 2 additions & 1 deletion src/slash.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import os
import csv
import full_version
import csv_writer



Expand Down Expand Up @@ -50,7 +51,7 @@ def main():
print(tabulate(results, headers="keys", tablefmt="github"))
print()
print()
rint("CSV Saved at: ",args.cd)
print("CSV Saved at: ",args.cd)
print("File Name:", csv_writer.write_csv(results_1, args.search, args.cd))

if __name__ == '__main__':
Expand Down

0 comments on commit 48fde02

Please sign in to comment.