diff --git a/src/formatter.py b/src/formatter.py index eea54133..77544811 100644 --- a/src/formatter.py +++ b/src/formatter.py @@ -15,7 +15,7 @@ from datetime import datetime import math -def formatResult(website, titles, prices, links,ratings): +def formatResult(website, titles, prices, links,ratings,df_flag): """ The formatResult function takes the scraped HTML as input, and extracts the necessary values from the HTML code. Ex. extracting a price '$19.99' from @@ -24,18 +24,21 @@ def formatResult(website, titles, prices, links,ratings): title, price, link, rating = '', '', '', '' if titles: title = titles[0].get_text().strip() if prices: price = prices[0].get_text().strip() - #if links: link = links[0]['href'] + if links: link = links[0]['href'] if ratings: rating = ratings[0].get_text().strip().split()[0] + if df_flag==0: title=formatTitle(title) + if df_flag==0: link=formatTitle(link) product = { 'timestamp': datetime.now().strftime("%d/%m/%Y %H:%M:%S"), - "title": formatTitle(title), + "title": title, "price": price, - # "link":f'www.{website}.com{link}', + "link":f'www.{website}.com{link}', "website": website, "rating" : rating, } return product + def sortList(arr, sortBy, reverse): """ The sortList function is used to sort the products list based on the @@ -64,6 +67,7 @@ def formatTitle(title): return title[:40] + "..." return title + def getNumbers(st): """ The getNumbers function extracts float values (price) from a string. diff --git a/src/full_version.py b/src/full_version.py index bd77665f..673c67fc 100644 --- a/src/full_version.py +++ b/src/full_version.py @@ -1,5 +1,7 @@ import json import os +import pandas as pd +import scraper class full_version: def __init__(self): @@ -13,6 +15,7 @@ def __init__(self): "json", "user_data.json" ) + self.df=pd.DataFrame() def login(self): @@ -43,10 +46,13 @@ def extract_list(self): pass def scrape(self,prod): - products_1 = scraper.searchAmazon(prod) - products_2 = scraper.searchWalmart(prod) - products_3 = scraper.searchEtsy(prod) - + products_1 = scraper.searchAmazon(prod,1) + products_2 = scraper.searchWalmart(prod,1) + products_3 = scraper.searchEtsy(prod,1) + results=products_1+products_2+products_3 + #esults = formatter.sortList(results, "ra" , True) + self.df=pd.DataFrame.from_dict(results, orient='columns') + print(self.df) diff --git a/src/scraper.py b/src/scraper.py index 8f3abcbc..036de4fb 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -31,7 +31,7 @@ def httpsGet(URL): return BeautifulSoup(soup1.prettify(), "html.parser") -def searchAmazon(query): +def searchAmazon(query, df_flag=0): """ The searchAmazon function scrapes amazon.com """ @@ -43,11 +43,11 @@ def searchAmazon(query): for res in results: titles, prices, links = res.select("h2 a span"), res.select("span.a-price span"), res.select("h2 a.a-link-normal") ratings = res.select("span.a-icon-alt") - product = formatter.formatResult("amazon", titles, prices, links,ratings) + product = formatter.formatResult("amazon", titles, prices, links,ratings, df_flag) products.append(product) return products -def searchWalmart(query): +def searchWalmart(query, df_flag=0): """ The searchWalmart function scrapes walmart.com """ @@ -61,11 +61,11 @@ def searchWalmart(query): for res in results: titles, prices, links = res.select("span.lh-title"), res.select("div.lh-copy"), res.select("a") ratings = res.findAll("span",{"class":"w_Cj"},text=pattern) - product = formatter.formatResult("walmart", titles, prices, links,ratings) + product = formatter.formatResult("walmart", titles, prices, links,ratings, df_flag) products.append(product) return products -def searchEtsy(query): +def searchEtsy(query, df_flag=0): """ The searchEtsy function scrapes Etsy.com """ @@ -79,6 +79,6 @@ def searchEtsy(query): for item in soup.select('.wt-grid__item-xs-6'): titles, prices, links = (item.select("h3")), (item.select(".currency-value")), (item.select('.width-full')) ratings = item.select('span.screen-reader-only') - product = formatter.formatResult("Etsy", titles, prices, links, ratings) + product = formatter.formatResult("Etsy", titles, prices, links, ratings, df_flag) products.append(product) return products diff --git a/src/slash.py b/src/slash.py index 42e7fce6..5f507c82 100644 --- a/src/slash.py +++ b/src/slash.py @@ -14,6 +14,7 @@ import os import csv import full_version +import csv_writer @@ -50,7 +51,7 @@ def main(): print(tabulate(results, headers="keys", tablefmt="github")) print() print() - rint("CSV Saved at: ",args.cd) + print("CSV Saved at: ",args.cd) print("File Name:", csv_writer.write_csv(results_1, args.search, args.cd)) if __name__ == '__main__':