From bc38da0ab253d52d2a2dd87d701207992b130dbc Mon Sep 17 00:00:00 2001 From: antgad Date: Mon, 1 Nov 2021 13:04:40 -0700 Subject: [PATCH 01/31] addition to csv --- .gitignore | 3 +++ src/csv_writer.py | 15 +++++++++++++++ src/formatter.py | 2 +- src/slash.py | 15 +++++++++++---- 4 files changed, 30 insertions(+), 5 deletions(-) create mode 100644 src/csv_writer.py diff --git a/.gitignore b/.gitignore index af235a83..3118facd 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,6 @@ dmypy.json # Pyre type checker .pyre/ + + +.csv \ No newline at end of file diff --git a/src/csv_writer.py b/src/csv_writer.py new file mode 100644 index 00000000..e911a7a4 --- /dev/null +++ b/src/csv_writer.py @@ -0,0 +1,15 @@ +import csv +from datetime import datetime +import os + + +def write_csv(arr,product,file_path): + os.chdir(file_path) + keys = arr[0].keys() + now=datetime.now() + file_name=product+now.strftime("%m%d%y_%H%M")+'.csv' + a_file = open(file_name, "w", newline='') + dict_writer = csv.DictWriter(a_file, keys) + dict_writer.writeheader() + dict_writer.writerows(arr) + a_file.close() \ No newline at end of file diff --git a/src/formatter.py b/src/formatter.py index 11412cac..ea91efaa 100644 --- a/src/formatter.py +++ b/src/formatter.py @@ -29,7 +29,7 @@ def formatResult(website, titles, prices, links): 'timestamp': datetime.now().strftime("%d/%m/%Y %H:%M:%S"), "title": formatTitle(title), "price": price, - # "link":f'www.{website}.com{link}', + "link":f'www.{website}.com{link}', "website": website, } return product diff --git a/src/slash.py b/src/slash.py index d4655dbb..8d277b53 100644 --- a/src/slash.py +++ b/src/slash.py @@ -11,6 +11,9 @@ import scraper import formatter from tabulate import tabulate +import pandas as pd +import os +import csv_writer def main(): @@ -20,14 +23,15 @@ def main(): parser.add_argument('--sort', type=str, nargs='+', help="Sort according to re (relevance: default), pr (price) or ra (rating)", default="re") parser.add_argument('--link', action='store_true', help="Show links in the table") parser.add_argument('--des', action='store_true', help="Sort in descending (non-increasing) order") + parser.add_argument('--cd', type=str, help="Change directory to save CSV file with search results", default=os.getcwd()) args = parser.parse_args() - products1 = scraper.searchAmazon(args.search) - products2 = scraper.searchWalmart(args.search) + products_1 = scraper.searchAmazon(args.search) + products_2 = scraper.searchWalmart(args.search) for sortBy in args.sort: - products1 = formatter.sortList(products1, sortBy, args.des)[:args.num] - products2 = formatter.sortList(products2, sortBy, args.des)[:args.num] + products1 = formatter.sortList(products_1, sortBy, args.des)[:args.num] + products2 = formatter.sortList(products_2, sortBy, args.des)[:args.num] results = products1 + products2 results = formatter.sortList(results, sortBy, args.des) @@ -37,6 +41,9 @@ def main(): print(tabulate(results, headers="keys", tablefmt="github")) print() print() + print("CSV Saved at: ",os.getcwd()) + csv_writer.write_csv((products_1+products_2), args.search, args.cd) + if __name__ == '__main__': main() \ No newline at end of file From b73d7303d73a2523e5e9729cb332eb8e1928fec7 Mon Sep 17 00:00:00 2001 From: antgad Date: Mon, 1 Nov 2021 13:07:23 -0700 Subject: [PATCH 02/31] print filename --- src/csv_writer.py | 3 ++- src/slash.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/csv_writer.py b/src/csv_writer.py index e911a7a4..460bd877 100644 --- a/src/csv_writer.py +++ b/src/csv_writer.py @@ -12,4 +12,5 @@ def write_csv(arr,product,file_path): dict_writer = csv.DictWriter(a_file, keys) dict_writer.writeheader() dict_writer.writerows(arr) - a_file.close() \ No newline at end of file + a_file.close() + return file_name \ No newline at end of file diff --git a/src/slash.py b/src/slash.py index 8d277b53..41d77165 100644 --- a/src/slash.py +++ b/src/slash.py @@ -42,7 +42,8 @@ def main(): print() print() print("CSV Saved at: ",os.getcwd()) - csv_writer.write_csv((products_1+products_2), args.search, args.cd) + print("File Name:", csv_writer.write_csv((products_1+products_2), args.search, args.cd)) + if __name__ == '__main__': From 8341caa047ff3c3b912284f2f795fb2357b3c15d Mon Sep 17 00:00:00 2001 From: AnmolikaGoyal <68813421+AnmolikaGoyal@users.noreply.github.com> Date: Mon, 1 Nov 2021 16:51:30 -0400 Subject: [PATCH 03/31] Added Etsy scraper --- src/scraper.py | 45 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/src/scraper.py b/src/scraper.py index 281afff4..32aff560 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -14,15 +14,24 @@ import requests import formatter from bs4 import BeautifulSoup +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.common.exceptions import NoSuchElementException + def httpsGet(URL): """ - The httpsGet funciton makes HTTP called to the requested URL with custom headers + The httpsGet function makes HTTP called to the requested URL with custom headers """ - headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"} + + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36", + "Accept-Encoding": "gzip, deflate", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "DNT": "1", "Connection": "close", "Upgrade-Insecure-Requests": "1"} page = requests.get(URL, headers=headers) soup1 = BeautifulSoup(page.content, "html.parser") - return BeautifulSoup(soup1.prettify(), "html.parser") + return BeautifulSoup(soup1.prettify(), "html.parser") + def searchAmazon(query): """ @@ -31,11 +40,12 @@ def searchAmazon(query): query = formatter.formatSearchQuery(query) URL = f'https://www.amazon.com/s?k={query}' page = httpsGet(URL) - results = page.findAll("div", {"data-component-type":"s-search-result"}) + results = page.findAll("div", {"data-component-type": "s-search-result"}) products = [] for res in results: - titles, prices, links = res.select("h2 a span"), res.select("span.a-price span"), res.select("h2 a.a-link-normal") - product = formatter.formatResult("amazon", titles, prices, links) + titles, prices, links = res.select("h2 a span"), res.select("span.a-price span"), res.select( + "h2 a.a-link-normal") + product = formatter.formatResult("amazon", titles, prices, links) products.append(product) return products @@ -52,4 +62,25 @@ def searchWalmart(query): titles, prices, links = res.select("span.lh-title"), res.select("div.lh-copy"), res.select("a") product = formatter.formatResult("walmart", titles, prices, links) products.append(product) - return products \ No newline at end of file + return products + + +def searchEtsy(query): + """ + The searchEtsy function scrapes Etsy.com + """ + query = formatter.formatSearchQuery(query) + url = f'https://www.etsy.com/search?q={query}' + products = [] + headers = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'} + response = requests.get(url, headers=headers) + soup = BeautifulSoup(response.content, 'lxml') + for item in soup.select('.wt-grid__item-xs-6'): + titles, prices, links = (item.select("h3")), (item.select(".currency-value")), (item.select('.width-full')) + rating = item.select('span.screen-reader-only') + if rating == []: + rating = '[0 out of 5 stars]' + product = formatter.formatResult("Etsy", titles, prices, links) + products.append(product) + return products From 4c6be3a25cbec1faee9897d4518466aa87e872a4 Mon Sep 17 00:00:00 2001 From: AnmolikaGoyal <68813421+AnmolikaGoyal@users.noreply.github.com> Date: Mon, 1 Nov 2021 16:52:29 -0400 Subject: [PATCH 04/31] Added product23 --- src/slash.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/slash.py b/src/slash.py index 41d77165..ad62092c 100644 --- a/src/slash.py +++ b/src/slash.py @@ -11,9 +11,6 @@ import scraper import formatter from tabulate import tabulate -import pandas as pd -import os -import csv_writer def main(): @@ -23,28 +20,24 @@ def main(): parser.add_argument('--sort', type=str, nargs='+', help="Sort according to re (relevance: default), pr (price) or ra (rating)", default="re") parser.add_argument('--link', action='store_true', help="Show links in the table") parser.add_argument('--des', action='store_true', help="Sort in descending (non-increasing) order") - parser.add_argument('--cd', type=str, help="Change directory to save CSV file with search results", default=os.getcwd()) args = parser.parse_args() - products_1 = scraper.searchAmazon(args.search) - products_2 = scraper.searchWalmart(args.search) + products1 = scraper.searchAmazon(args.search) + products2 = scraper.searchWalmart(args.search) + products3 = scraper.searchEtsy(args.search) for sortBy in args.sort: - products1 = formatter.sortList(products_1, sortBy, args.des)[:args.num] - products2 = formatter.sortList(products_2, sortBy, args.des)[:args.num] - results = products1 + products2 + products1 = formatter.sortList(products1, sortBy, args.des)[:args.num] + products2 = formatter.sortList(products2, sortBy, args.des)[:args.num] + products3 = formatter.sortList(products3, sortBy, args.des)[:args.num] + results = products1 + products2 + products3 results = formatter.sortList(results, sortBy, args.des) - print() print() print(tabulate(results, headers="keys", tablefmt="github")) print() print() - print("CSV Saved at: ",os.getcwd()) - print("File Name:", csv_writer.write_csv((products_1+products_2), args.search, args.cd)) - - if __name__ == '__main__': - main() \ No newline at end of file + main() From 0b13c7fa30a9e6aa732e254dcbe0dbd1ad9af3ca Mon Sep 17 00:00:00 2001 From: Shubhangi Jain <48826459+shubhangij12@users.noreply.github.com> Date: Mon, 1 Nov 2021 16:58:25 -0400 Subject: [PATCH 05/31] Update formatter.py --- src/formatter.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/formatter.py b/src/formatter.py index ea91efaa..eea54133 100644 --- a/src/formatter.py +++ b/src/formatter.py @@ -15,22 +15,24 @@ from datetime import datetime import math -def formatResult(website, titles, prices, links): +def formatResult(website, titles, prices, links,ratings): """ The formatResult function takes the scraped HTML as input, and extracts the necessary values from the HTML code. Ex. extracting a price '$19.99' from a paragraph tag. """ - title, price, link = '', '', '' + title, price, link, rating = '', '', '', '' if titles: title = titles[0].get_text().strip() if prices: price = prices[0].get_text().strip() - if links: link = links[0]['href'] + #if links: link = links[0]['href'] + if ratings: rating = ratings[0].get_text().strip().split()[0] product = { 'timestamp': datetime.now().strftime("%d/%m/%Y %H:%M:%S"), "title": formatTitle(title), "price": price, - "link":f'www.{website}.com{link}', + # "link":f'www.{website}.com{link}', "website": website, + "rating" : rating, } return product @@ -43,7 +45,7 @@ def sortList(arr, sortBy, reverse): return sorted(arr, key=lambda x: getNumbers(x["price"]), reverse=reverse) # To-do: sort by rating elif sortBy == "ra": - # return sorted(arr, key=lambda x: getNumbers(x.price), reverse=reverse) + return sorted(arr, key=lambda x: getNumbers(x["rating"]), reverse=reverse) pass return arr @@ -75,4 +77,4 @@ def getNumbers(st): ans = float(ans) except: ans = math.inf - return ans \ No newline at end of file + return ans From 0669ba700614866969b9e010ff26616418f05f80 Mon Sep 17 00:00:00 2001 From: Shubhangi Jain <48826459+shubhangij12@users.noreply.github.com> Date: Mon, 1 Nov 2021 16:59:01 -0400 Subject: [PATCH 06/31] Update scraper.py --- src/scraper.py | 79 ++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 45 deletions(-) diff --git a/src/scraper.py b/src/scraper.py index 281afff4..8b8ded2e 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -7,49 +7,38 @@ """ -""" -The scraper module holds functions that actually scrape the e-commerce websites -""" - -import requests +import argparse +import scraper import formatter -from bs4 import BeautifulSoup - -def httpsGet(URL): - """ - The httpsGet funciton makes HTTP called to the requested URL with custom headers - """ - headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"} - page = requests.get(URL, headers=headers) - soup1 = BeautifulSoup(page.content, "html.parser") - return BeautifulSoup(soup1.prettify(), "html.parser") - -def searchAmazon(query): - """ - The searchAmazon function scrapes amazon.com - """ - query = formatter.formatSearchQuery(query) - URL = f'https://www.amazon.com/s?k={query}' - page = httpsGet(URL) - results = page.findAll("div", {"data-component-type":"s-search-result"}) - products = [] - for res in results: - titles, prices, links = res.select("h2 a span"), res.select("span.a-price span"), res.select("h2 a.a-link-normal") - product = formatter.formatResult("amazon", titles, prices, links) - products.append(product) - return products - -def searchWalmart(query): - """ - The searchWalmart function scrapes walmart.com - """ - query = formatter.formatSearchQuery(query) - URL = f'https://www.walmart.com/search?q={query}' - page = httpsGet(URL) - results = page.findAll("div", {"data-item-id":True}) - products = [] - for res in results: - titles, prices, links = res.select("span.lh-title"), res.select("div.lh-copy"), res.select("a") - product = formatter.formatResult("walmart", titles, prices, links) - products.append(product) - return products \ No newline at end of file +from tabulate import tabulate + + +def main(): + parser = argparse.ArgumentParser(description="Slash") + parser.add_argument('--search', type=str, help='Product search query') + parser.add_argument('--num', type=int, help="Maximum number of records", default=3) + parser.add_argument('--sort', type=str, nargs='+', help="Sort according to re (relevance: default), pr (price) or ra (rating)", default="re") + parser.add_argument('--link', action='store_true', help="Show links in the table") + parser.add_argument('--des', action='store_true', help="Sort in descending (non-increasing) order") + args = parser.parse_args() + + products1 = scraper.searchAmazon(args.search) + products2 = scraper.searchWalmart(args.search) + products3 = scraper.searchEtsy(args.search) + + for sortBy in args.sort: + products1 = formatter.sortList(products1, sortBy, args.des)[:args.num] + products2 = formatter.sortList(products2, sortBy, args.des)[:args.num] + products3 = formatter.sortList(products3, sortBy, args.des)[:args.num] + results = products1 + products2 + products3 + results = formatter.sortList(results, "ra" , args.des) + + + print() + print() + print(tabulate(results, headers="keys", tablefmt="github")) + print() + print() + +if __name__ == '__main__': + main() From 0e91c77ae61f9371f0305269911691e0b4fc48f5 Mon Sep 17 00:00:00 2001 From: Shubhangi Jain <48826459+shubhangij12@users.noreply.github.com> Date: Mon, 1 Nov 2021 16:59:54 -0400 Subject: [PATCH 07/31] Update slash.py --- src/slash.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/src/slash.py b/src/slash.py index 41d77165..8b8ded2e 100644 --- a/src/slash.py +++ b/src/slash.py @@ -11,9 +11,6 @@ import scraper import formatter from tabulate import tabulate -import pandas as pd -import os -import csv_writer def main(): @@ -23,17 +20,18 @@ def main(): parser.add_argument('--sort', type=str, nargs='+', help="Sort according to re (relevance: default), pr (price) or ra (rating)", default="re") parser.add_argument('--link', action='store_true', help="Show links in the table") parser.add_argument('--des', action='store_true', help="Sort in descending (non-increasing) order") - parser.add_argument('--cd', type=str, help="Change directory to save CSV file with search results", default=os.getcwd()) args = parser.parse_args() - products_1 = scraper.searchAmazon(args.search) - products_2 = scraper.searchWalmart(args.search) + products1 = scraper.searchAmazon(args.search) + products2 = scraper.searchWalmart(args.search) + products3 = scraper.searchEtsy(args.search) for sortBy in args.sort: - products1 = formatter.sortList(products_1, sortBy, args.des)[:args.num] - products2 = formatter.sortList(products_2, sortBy, args.des)[:args.num] - results = products1 + products2 - results = formatter.sortList(results, sortBy, args.des) + products1 = formatter.sortList(products1, sortBy, args.des)[:args.num] + products2 = formatter.sortList(products2, sortBy, args.des)[:args.num] + products3 = formatter.sortList(products3, sortBy, args.des)[:args.num] + results = products1 + products2 + products3 + results = formatter.sortList(results, "ra" , args.des) print() @@ -41,10 +39,6 @@ def main(): print(tabulate(results, headers="keys", tablefmt="github")) print() print() - print("CSV Saved at: ",os.getcwd()) - print("File Name:", csv_writer.write_csv((products_1+products_2), args.search, args.cd)) - - if __name__ == '__main__': - main() \ No newline at end of file + main() From 45d51bff19d6e272a6e887e60c1356ae00081550 Mon Sep 17 00:00:00 2001 From: Shubhangi Jain <48826459+shubhangij12@users.noreply.github.com> Date: Mon, 1 Nov 2021 17:00:36 -0400 Subject: [PATCH 08/31] Update scraper.py --- src/scraper.py | 103 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 69 insertions(+), 34 deletions(-) diff --git a/src/scraper.py b/src/scraper.py index 8b8ded2e..33693e46 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -7,38 +7,73 @@ """ -import argparse -import scraper +""" +The scraper module holds functions that actually scrape the e-commerce websites +""" + +import requests import formatter -from tabulate import tabulate - - -def main(): - parser = argparse.ArgumentParser(description="Slash") - parser.add_argument('--search', type=str, help='Product search query') - parser.add_argument('--num', type=int, help="Maximum number of records", default=3) - parser.add_argument('--sort', type=str, nargs='+', help="Sort according to re (relevance: default), pr (price) or ra (rating)", default="re") - parser.add_argument('--link', action='store_true', help="Show links in the table") - parser.add_argument('--des', action='store_true', help="Sort in descending (non-increasing) order") - args = parser.parse_args() - - products1 = scraper.searchAmazon(args.search) - products2 = scraper.searchWalmart(args.search) - products3 = scraper.searchEtsy(args.search) - - for sortBy in args.sort: - products1 = formatter.sortList(products1, sortBy, args.des)[:args.num] - products2 = formatter.sortList(products2, sortBy, args.des)[:args.num] - products3 = formatter.sortList(products3, sortBy, args.des)[:args.num] - results = products1 + products2 + products3 - results = formatter.sortList(results, "ra" , args.des) - - - print() - print() - print(tabulate(results, headers="keys", tablefmt="github")) - print() - print() - -if __name__ == '__main__': - main() +from bs4 import BeautifulSoup +import re + + +def httpsGet(URL): + """ + The httpsGet funciton makes HTTP called to the requested URL with custom headers + """ + headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"} + page = requests.get(URL, headers=headers) + soup1 = BeautifulSoup(page.content, "html.parser") + return BeautifulSoup(soup1.prettify(), "html.parser") + +def searchAmazon(query): + """ + The searchAmazon function scrapes amazon.com + """ + query = formatter.formatSearchQuery(query) + URL = f'https://www.amazon.com/s?k={query}' + page = httpsGet(URL) + results = page.findAll("div", {"data-component-type":"s-search-result"}) + products = [] + for res in results: + titles, prices, links = res.select("h2 a span"), res.select("span.a-price span"), res.select("h2 a.a-link-normal") + ratings = res.select("span.a-icon-alt") + product = formatter.formatResult("amazon", titles, prices, links,ratings) + products.append(product) + return products + +def searchWalmart(query): + """ + The searchWalmart function scrapes walmart.com + """ + query = formatter.formatSearchQuery(query) + URL = f'https://www.walmart.com/search?q={query}' + page = httpsGet(URL) + results = page.findAll("div", {"data-item-id":True}) + #print(results) + products = [] + pattern = re.compile(r'Stars') + for res in results: + titles, prices, links = res.select("span.lh-title"), res.select("div.lh-copy"), res.select("a") + ratings = res.findAll("span",{"class":"w_Cj"},text=pattern) + product = formatter.formatResult("walmart", titles, prices, links,ratings) + products.append(product) + return products + +def searchEtsy(query): + """ + The searchEtsy function scrapes Etsy.com + """ + query = formatter.formatSearchQuery(query) + url = f'https://www.etsy.com/search?q={query}' + products = [] + headers = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'} + response = requests.get(url, headers=headers) + soup = BeautifulSoup(response.content, 'lxml') + for item in soup.select('.wt-grid__item-xs-6'): + titles, prices, links = (item.select("h3")), (item.select(".currency-value")), (item.select('.width-full')) + ratings = item.select('span.screen-reader-only') + product = formatter.formatResult("Etsy", titles, prices, links, ratings) + products.append(product) + return products From 8d97a69ac4080af5825ac868f2383ea2e3e2633d Mon Sep 17 00:00:00 2001 From: antgad Date: Mon, 1 Nov 2021 14:20:16 -0700 Subject: [PATCH 09/31] Update slash.py Update to csv generator --- src/slash.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/slash.py b/src/slash.py index 8b8ded2e..f91cb5c3 100644 --- a/src/slash.py +++ b/src/slash.py @@ -11,6 +11,9 @@ import scraper import formatter from tabulate import tabulate +import os +import csv + def main(): @@ -20,17 +23,19 @@ def main(): parser.add_argument('--sort', type=str, nargs='+', help="Sort according to re (relevance: default), pr (price) or ra (rating)", default="re") parser.add_argument('--link', action='store_true', help="Show links in the table") parser.add_argument('--des', action='store_true', help="Sort in descending (non-increasing) order") + parser.add_argument('--cd', type=str, help="Change directory to save CSV file with search results", default=os.getcwd()) args = parser.parse_args() - products1 = scraper.searchAmazon(args.search) - products2 = scraper.searchWalmart(args.search) - products3 = scraper.searchEtsy(args.search) + products_1 = scraper.searchAmazon(args.search) + products_2 = scraper.searchWalmart(args.search) + products_3 = scraper.searchEtsy(args.search) for sortBy in args.sort: - products1 = formatter.sortList(products1, sortBy, args.des)[:args.num] - products2 = formatter.sortList(products2, sortBy, args.des)[:args.num] - products3 = formatter.sortList(products3, sortBy, args.des)[:args.num] + products1 = formatter.sortList(products_1, sortBy, args.des)[:args.num] + products2 = formatter.sortList(products_2, sortBy, args.des)[:args.num] + products3 = formatter.sortList(products_3, sortBy, args.des)[:args.num] results = products1 + products2 + products3 + results_1 = products_1 + products_2 + products_3 results = formatter.sortList(results, "ra" , args.des) @@ -39,6 +44,8 @@ def main(): print(tabulate(results, headers="keys", tablefmt="github")) print() print() + rint("CSV Saved at: ",args.cd)) + print("File Name:", csv_writer.write_csv(results_1, args.search, args.cd)) if __name__ == '__main__': main() From 44bdf041593e4d5112e7f354127369bb400a5145 Mon Sep 17 00:00:00 2001 From: antgad Date: Mon, 1 Nov 2021 16:28:33 -0700 Subject: [PATCH 10/31] Added functionality to save user name and email --- .gitignore | 3 ++- src/full_version.py | 43 +++++++++++++++++++++++++++++++++++++++++++ src/slash.py | 8 +++++++- 3 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 src/full_version.py diff --git a/.gitignore b/.gitignore index 3118facd..82c88c5a 100644 --- a/.gitignore +++ b/.gitignore @@ -129,4 +129,5 @@ dmypy.json .pyre/ -.csv \ No newline at end of file +.csv +user_data.json \ No newline at end of file diff --git a/src/full_version.py b/src/full_version.py new file mode 100644 index 00000000..2489b4b3 --- /dev/null +++ b/src/full_version.py @@ -0,0 +1,43 @@ +import json +import os + +class full_version: + def __init__(self): + self.data={} + self.name="" + self.email="" + self.user_data = os.path.join( + os.path.dirname( + os.path.dirname( + os.path.abspath(__file__))), + "json", + "user_data.json" + ) + + + def login(self): + if not os.path.exists(self.user_data): + print("Welcome to Slash!") + print("Please enter the following information: ") + name=input("Name: ") + email=input("Email: ") + self.data['name']=name + self.data['email']=email + with open(self.user_data, 'w') as outfile: + json.dump(self.data, outfile) + self.name=name + self.email=email + else: + with open(self.user_data) as json_file: + data = json.load(json_file) + self.name=data['name'] + self.email=data['email'] + return self.name, self.email + + + + + + def driver(self): + self.login() + print("Welcome ",self.name) diff --git a/src/slash.py b/src/slash.py index f91cb5c3..af6e0b0f 100644 --- a/src/slash.py +++ b/src/slash.py @@ -13,11 +13,13 @@ from tabulate import tabulate import os import csv +import full_version def main(): parser = argparse.ArgumentParser(description="Slash") + parser.add_argument('--full', type=str, help='T for full version of app; F for mini version of app' ,default='F') parser.add_argument('--search', type=str, help='Product search query') parser.add_argument('--num', type=int, help="Maximum number of records", default=3) parser.add_argument('--sort', type=str, nargs='+', help="Sort according to re (relevance: default), pr (price) or ra (rating)", default="re") @@ -25,6 +27,10 @@ def main(): parser.add_argument('--des', action='store_true', help="Sort in descending (non-increasing) order") parser.add_argument('--cd', type=str, help="Change directory to save CSV file with search results", default=os.getcwd()) args = parser.parse_args() + if args.full=='T': + + full_version.full_version().driver() + return products_1 = scraper.searchAmazon(args.search) products_2 = scraper.searchWalmart(args.search) @@ -44,7 +50,7 @@ def main(): print(tabulate(results, headers="keys", tablefmt="github")) print() print() - rint("CSV Saved at: ",args.cd)) + rint("CSV Saved at: ",args.cd) print("File Name:", csv_writer.write_csv(results_1, args.search, args.cd)) if __name__ == '__main__': From b05afe3772a0d926a76a97ecd51db1643011dc89 Mon Sep 17 00:00:00 2001 From: antgad Date: Mon, 1 Nov 2021 16:43:47 -0700 Subject: [PATCH 11/31] added functionality to choose what to do --- src/full_version.py | 26 ++++++++++++++++++++++++++ src/slash.py | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/full_version.py b/src/full_version.py index 2489b4b3..bd77665f 100644 --- a/src/full_version.py +++ b/src/full_version.py @@ -34,10 +34,36 @@ def login(self): self.email=data['email'] return self.name, self.email + def search_fn(self): + prod=input("Enter name of product to Search: ") + self.scrape(prod) + pass + + def extract_list(self): + pass + + def scrape(self,prod): + products_1 = scraper.searchAmazon(prod) + products_2 = scraper.searchWalmart(prod) + products_3 = scraper.searchEtsy(prod) def driver(self): self.login() + flag_loop=1 print("Welcome ",self.name) + while flag_loop==1: + print("Select from following:") + print("1. Search new product\n2. See exiting list\n3. Exit") + choice=int(input()) + if choice==1: + self.search_fn() + elif choice==2: + self.extract_list() + elif choice==3: + print("Thank You for Using Slash") + flag_loop = 0 + else: + rint("Incorrect Option") \ No newline at end of file diff --git a/src/slash.py b/src/slash.py index af6e0b0f..42e7fce6 100644 --- a/src/slash.py +++ b/src/slash.py @@ -45,7 +45,7 @@ def main(): results = formatter.sortList(results, "ra" , args.des) - print() + print(args.des) print() print(tabulate(results, headers="keys", tablefmt="github")) print() From 48fde0214681830c713027f7c83f562539340185 Mon Sep 17 00:00:00 2001 From: antgad Date: Mon, 1 Nov 2021 16:59:37 -0700 Subject: [PATCH 12/31] Formatting updates --- src/formatter.py | 12 ++++++++---- src/full_version.py | 14 ++++++++++---- src/scraper.py | 12 ++++++------ src/slash.py | 3 ++- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/src/formatter.py b/src/formatter.py index eea54133..77544811 100644 --- a/src/formatter.py +++ b/src/formatter.py @@ -15,7 +15,7 @@ from datetime import datetime import math -def formatResult(website, titles, prices, links,ratings): +def formatResult(website, titles, prices, links,ratings,df_flag): """ The formatResult function takes the scraped HTML as input, and extracts the necessary values from the HTML code. Ex. extracting a price '$19.99' from @@ -24,18 +24,21 @@ def formatResult(website, titles, prices, links,ratings): title, price, link, rating = '', '', '', '' if titles: title = titles[0].get_text().strip() if prices: price = prices[0].get_text().strip() - #if links: link = links[0]['href'] + if links: link = links[0]['href'] if ratings: rating = ratings[0].get_text().strip().split()[0] + if df_flag==0: title=formatTitle(title) + if df_flag==0: link=formatTitle(link) product = { 'timestamp': datetime.now().strftime("%d/%m/%Y %H:%M:%S"), - "title": formatTitle(title), + "title": title, "price": price, - # "link":f'www.{website}.com{link}', + "link":f'www.{website}.com{link}', "website": website, "rating" : rating, } return product + def sortList(arr, sortBy, reverse): """ The sortList function is used to sort the products list based on the @@ -64,6 +67,7 @@ def formatTitle(title): return title[:40] + "..." return title + def getNumbers(st): """ The getNumbers function extracts float values (price) from a string. diff --git a/src/full_version.py b/src/full_version.py index bd77665f..673c67fc 100644 --- a/src/full_version.py +++ b/src/full_version.py @@ -1,5 +1,7 @@ import json import os +import pandas as pd +import scraper class full_version: def __init__(self): @@ -13,6 +15,7 @@ def __init__(self): "json", "user_data.json" ) + self.df=pd.DataFrame() def login(self): @@ -43,10 +46,13 @@ def extract_list(self): pass def scrape(self,prod): - products_1 = scraper.searchAmazon(prod) - products_2 = scraper.searchWalmart(prod) - products_3 = scraper.searchEtsy(prod) - + products_1 = scraper.searchAmazon(prod,1) + products_2 = scraper.searchWalmart(prod,1) + products_3 = scraper.searchEtsy(prod,1) + results=products_1+products_2+products_3 + #esults = formatter.sortList(results, "ra" , True) + self.df=pd.DataFrame.from_dict(results, orient='columns') + print(self.df) diff --git a/src/scraper.py b/src/scraper.py index 8f3abcbc..036de4fb 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -31,7 +31,7 @@ def httpsGet(URL): return BeautifulSoup(soup1.prettify(), "html.parser") -def searchAmazon(query): +def searchAmazon(query, df_flag=0): """ The searchAmazon function scrapes amazon.com """ @@ -43,11 +43,11 @@ def searchAmazon(query): for res in results: titles, prices, links = res.select("h2 a span"), res.select("span.a-price span"), res.select("h2 a.a-link-normal") ratings = res.select("span.a-icon-alt") - product = formatter.formatResult("amazon", titles, prices, links,ratings) + product = formatter.formatResult("amazon", titles, prices, links,ratings, df_flag) products.append(product) return products -def searchWalmart(query): +def searchWalmart(query, df_flag=0): """ The searchWalmart function scrapes walmart.com """ @@ -61,11 +61,11 @@ def searchWalmart(query): for res in results: titles, prices, links = res.select("span.lh-title"), res.select("div.lh-copy"), res.select("a") ratings = res.findAll("span",{"class":"w_Cj"},text=pattern) - product = formatter.formatResult("walmart", titles, prices, links,ratings) + product = formatter.formatResult("walmart", titles, prices, links,ratings, df_flag) products.append(product) return products -def searchEtsy(query): +def searchEtsy(query, df_flag=0): """ The searchEtsy function scrapes Etsy.com """ @@ -79,6 +79,6 @@ def searchEtsy(query): for item in soup.select('.wt-grid__item-xs-6'): titles, prices, links = (item.select("h3")), (item.select(".currency-value")), (item.select('.width-full')) ratings = item.select('span.screen-reader-only') - product = formatter.formatResult("Etsy", titles, prices, links, ratings) + product = formatter.formatResult("Etsy", titles, prices, links, ratings, df_flag) products.append(product) return products diff --git a/src/slash.py b/src/slash.py index 42e7fce6..5f507c82 100644 --- a/src/slash.py +++ b/src/slash.py @@ -14,6 +14,7 @@ import os import csv import full_version +import csv_writer @@ -50,7 +51,7 @@ def main(): print(tabulate(results, headers="keys", tablefmt="github")) print() print() - rint("CSV Saved at: ",args.cd) + print("CSV Saved at: ",args.cd) print("File Name:", csv_writer.write_csv(results_1, args.search, args.cd)) if __name__ == '__main__': From 935e4ccb258945c18b99e332530c100d1d89cdc9 Mon Sep 17 00:00:00 2001 From: antgad Date: Mon, 1 Nov 2021 17:04:29 -0700 Subject: [PATCH 13/31] displaying all available ptoducts on 3 websites --- src/full_version.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/full_version.py b/src/full_version.py index 673c67fc..32eab262 100644 --- a/src/full_version.py +++ b/src/full_version.py @@ -16,6 +16,10 @@ def __init__(self): "user_data.json" ) self.df=pd.DataFrame() + pd.set_option('display.max_rows', None) + pd.set_option('display.max_columns', None) + pd.set_option('display.width', None) + pd.set_option('display.max_colwidth', 40) def login(self): From 6ab75909d7ea900e98244ac98fd9fbb8ed9d3ef5 Mon Sep 17 00:00:00 2001 From: antgad Date: Mon, 1 Nov 2021 17:19:57 -0700 Subject: [PATCH 14/31] consolidated the scraping --- src/scraper.py | 20 +++++++++++++++++--- src/slash.py | 17 +++++------------ 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/scraper.py b/src/scraper.py index 036de4fb..aaa78d56 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -15,6 +15,8 @@ import formatter from bs4 import BeautifulSoup import re +import csv_writer +import csv def httpsGet(URL): @@ -31,7 +33,7 @@ def httpsGet(URL): return BeautifulSoup(soup1.prettify(), "html.parser") -def searchAmazon(query, df_flag=0): +def searchAmazon(query, df_flag): """ The searchAmazon function scrapes amazon.com """ @@ -47,7 +49,7 @@ def searchAmazon(query, df_flag=0): products.append(product) return products -def searchWalmart(query, df_flag=0): +def searchWalmart(query, df_flag): """ The searchWalmart function scrapes walmart.com """ @@ -65,7 +67,7 @@ def searchWalmart(query, df_flag=0): products.append(product) return products -def searchEtsy(query, df_flag=0): +def searchEtsy(query, df_flag): """ The searchEtsy function scrapes Etsy.com """ @@ -82,3 +84,15 @@ def searchEtsy(query, df_flag=0): product = formatter.formatResult("Etsy", titles, prices, links, ratings, df_flag) products.append(product) return products + +def driver(product, num=None, df_flag=0,csv=False): + products_1 = searchAmazon(product,df_flag) + products_2 = searchWalmart(product,df_flag) + products_3 = searchEtsy(product,df_flag) + results=products_1+products_2+products_3 + if csv==True: + + print("CSV Saved at: ",args.cd) + print("File Name:", csv_writer.write_csv(results, args.search, args.cd)) + return products_1[:num]+products_2[:num]+products_3[:num] + diff --git a/src/slash.py b/src/slash.py index 5f507c82..42f468b5 100644 --- a/src/slash.py +++ b/src/slash.py @@ -14,7 +14,7 @@ import os import csv import full_version -import csv_writer + @@ -27,23 +27,18 @@ def main(): parser.add_argument('--link', action='store_true', help="Show links in the table") parser.add_argument('--des', action='store_true', help="Sort in descending (non-increasing) order") parser.add_argument('--cd', type=str, help="Change directory to save CSV file with search results", default=os.getcwd()) + parser.add_argument('--csv', action='store_false', help="Save results as CSV",) args = parser.parse_args() if args.full=='T': full_version.full_version().driver() return - products_1 = scraper.searchAmazon(args.search) - products_2 = scraper.searchWalmart(args.search) - products_3 = scraper.searchEtsy(args.search) + results = scraper.driver(args.search,args.num,args.csv) + for sortBy in args.sort: - products1 = formatter.sortList(products_1, sortBy, args.des)[:args.num] - products2 = formatter.sortList(products_2, sortBy, args.des)[:args.num] - products3 = formatter.sortList(products_3, sortBy, args.des)[:args.num] - results = products1 + products2 + products3 - results_1 = products_1 + products_2 + products_3 - results = formatter.sortList(results, "ra" , args.des) + results = formatter.sortList(results, sortBy , args.des) print(args.des) @@ -51,8 +46,6 @@ def main(): print(tabulate(results, headers="keys", tablefmt="github")) print() print() - print("CSV Saved at: ",args.cd) - print("File Name:", csv_writer.write_csv(results_1, args.search, args.cd)) if __name__ == '__main__': main() From 78db3c7da9053fceada8e91126165d11943a14ca Mon Sep 17 00:00:00 2001 From: antgad Date: Mon, 1 Nov 2021 18:45:44 -0700 Subject: [PATCH 15/31] updated to save favourite product to local csv --- src/full_version.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/full_version.py b/src/full_version.py index 32eab262..da59abc6 100644 --- a/src/full_version.py +++ b/src/full_version.py @@ -15,6 +15,13 @@ def __init__(self): "json", "user_data.json" ) + self.user_list = os.path.join( + os.path.dirname( + os.path.dirname( + os.path.abspath(__file__))), + "csvs", + "user_list.csv" + ) self.df=pd.DataFrame() pd.set_option('display.max_rows', None) pd.set_option('display.max_columns', None) @@ -44,6 +51,19 @@ def login(self): def search_fn(self): prod=input("Enter name of product to Search: ") self.scrape(prod) + ch=int(input("\n\nEnter 1 to save product to list \nelse enter any other key to continue")) + if ch==1: + indx=int(input("Enter row number of product to save: ")) + if indx Date: Mon, 1 Nov 2021 18:48:36 -0700 Subject: [PATCH 16/31] Update .gitignore ignroes all csv files --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 82c88c5a..c1ca7bca 100644 --- a/.gitignore +++ b/.gitignore @@ -129,5 +129,5 @@ dmypy.json .pyre/ -.csv +*.csv user_data.json \ No newline at end of file From 15fede35d72003ca596f3e4f8b2a1dc904483266 Mon Sep 17 00:00:00 2001 From: antgad Date: Mon, 1 Nov 2021 18:51:24 -0700 Subject: [PATCH 17/31] functionality to view saved products --- src/full_version.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/full_version.py b/src/full_version.py index da59abc6..efee97bc 100644 --- a/src/full_version.py +++ b/src/full_version.py @@ -67,6 +67,11 @@ def search_fn(self): pass def extract_list(self): + if os.path.exists(self.user_list): + old_data=pd.read_csv(self.user_list) + print(old_data) + else: + print("No saved data found.") pass def scrape(self,prod): From 5631a81a5af6b684381ce0acce5d2eaac92a933a Mon Sep 17 00:00:00 2001 From: antgad Date: Mon, 1 Nov 2021 18:53:42 -0700 Subject: [PATCH 18/31] saving csv in quick mode in the csv folder by default --- src/slash.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/slash.py b/src/slash.py index 42f468b5..072e7068 100644 --- a/src/slash.py +++ b/src/slash.py @@ -26,7 +26,7 @@ def main(): parser.add_argument('--sort', type=str, nargs='+', help="Sort according to re (relevance: default), pr (price) or ra (rating)", default="re") parser.add_argument('--link', action='store_true', help="Show links in the table") parser.add_argument('--des', action='store_true', help="Sort in descending (non-increasing) order") - parser.add_argument('--cd', type=str, help="Change directory to save CSV file with search results", default=os.getcwd()) + parser.add_argument('--cd', type=str, help="Change directory to save CSV file with search results", default=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),"csvs")) parser.add_argument('--csv', action='store_false', help="Save results as CSV",) args = parser.parse_args() if args.full=='T': From 1134ef2594bc166eae6cc512189000e6d0533126 Mon Sep 17 00:00:00 2001 From: AnmolikaGoyal <68813421+AnmolikaGoyal@users.noreply.github.com> Date: Tue, 2 Nov 2021 16:42:12 -0400 Subject: [PATCH 19/31] Update requirements.txt --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 76792b38..81470781 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,4 +34,5 @@ urllib3==1.26.6 Werkzeug==1.0.1 wheel==0.37.0 zipp==3.5.0 -DateTime==4.3 \ No newline at end of file +DateTime==4.3 +lxml==4.6.3 From 405d532b24e9a7a08057a9d4594441231430c20d Mon Sep 17 00:00:00 2001 From: AnmolikaGoyal <68813421+AnmolikaGoyal@users.noreply.github.com> Date: Tue, 2 Nov 2021 16:42:59 -0400 Subject: [PATCH 20/31] Update requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 81470781..a8032e3a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -36,3 +36,4 @@ wheel==0.37.0 zipp==3.5.0 DateTime==4.3 lxml==4.6.3 +requests-oauthlib==1.3.0 From 3fb20aa0c0dd1dde0bb647de21fda72bf3fe264e Mon Sep 17 00:00:00 2001 From: AnmolikaGoyal <68813421+AnmolikaGoyal@users.noreply.github.com> Date: Tue, 2 Nov 2021 16:50:18 -0400 Subject: [PATCH 21/31] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9785e94d..fced40f8 100644 --- a/README.md +++ b/README.md @@ -183,11 +183,11 @@ python slash.py --search "philips hue" --num 5 - - - - - + + + + +

Shubham Mankar

Pratik Devnani


Moksh Jain


Rahil Sarvaiya


Anushi Keswani


Anant Gadodia

Anmolika Goyal


Shubhangi Jain


Shreya Karra


Srujana Rao

From 821d8eff2f83d5cce9106bdc25d1bf6311768e15 Mon Sep 17 00:00:00 2001 From: AnmolikaGoyal <68813421+AnmolikaGoyal@users.noreply.github.com> Date: Tue, 2 Nov 2021 17:19:07 -0400 Subject: [PATCH 22/31] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index fced40f8..22467233 100644 --- a/README.md +++ b/README.md @@ -183,11 +183,11 @@ python slash.py --search "philips hue" --num 5 - - - - - + + + + +

Anant Gadodia

Anmolika Goyal


Shubhangi Jain


Shreya Karra


Srujana Rao


Anant Gadodia

Anmolika Goyal


Shubhangi Jain


Shreya Karra


Srujana Rao

From e0b9830cea0cf82c6ca4f17dc5b65a5d0da08370 Mon Sep 17 00:00:00 2001 From: AnmolikaGoyal <68813421+AnmolikaGoyal@users.noreply.github.com> Date: Tue, 2 Nov 2021 17:22:43 -0400 Subject: [PATCH 23/31] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 22467233..74781d3d 100644 --- a/README.md +++ b/README.md @@ -183,11 +183,11 @@ python slash.py --search "philips hue" --num 5 - - - - - + + + + +

Anant Gadodia

Anmolika Goyal


Shubhangi Jain


Shreya Karra


Srujana Rao


Anant Gadodia

Anmolika Goyal


Shubhangi Jain


Shreya Karra


Srujana Rao

From a467a140fa5187de82b303e1745131ec95796908 Mon Sep 17 00:00:00 2001 From: antgad Date: Tue, 2 Nov 2021 16:30:30 -0700 Subject: [PATCH 24/31] minor updates --- src/scraper.py | 6 +++--- src/slash.py | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/scraper.py b/src/scraper.py index aaa78d56..d280a8ff 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -85,14 +85,14 @@ def searchEtsy(query, df_flag): products.append(product) return products -def driver(product, num=None, df_flag=0,csv=False): +def driver(product, num=None, df_flag=0,csv=False,cd=None): products_1 = searchAmazon(product,df_flag) products_2 = searchWalmart(product,df_flag) products_3 = searchEtsy(product,df_flag) results=products_1+products_2+products_3 if csv==True: - print("CSV Saved at: ",args.cd) - print("File Name:", csv_writer.write_csv(results, args.search, args.cd)) + print("CSV Saved at: ",cd) + print("File Name:", csv_writer.write_csv(results, product, cd)) return products_1[:num]+products_2[:num]+products_3[:num] diff --git a/src/slash.py b/src/slash.py index 072e7068..e38f5be9 100644 --- a/src/slash.py +++ b/src/slash.py @@ -27,21 +27,22 @@ def main(): parser.add_argument('--link', action='store_true', help="Show links in the table") parser.add_argument('--des', action='store_true', help="Sort in descending (non-increasing) order") parser.add_argument('--cd', type=str, help="Change directory to save CSV file with search results", default=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),"csvs")) - parser.add_argument('--csv', action='store_false', help="Save results as CSV",) + parser.add_argument('--csv', action='store_true', help="Save results as CSV",) args = parser.parse_args() + if args.full=='T': full_version.full_version().driver() return - results = scraper.driver(args.search,args.num,args.csv) + results = scraper.driver(args.search,args.num,csv=args.csv,cd=args.cd) for sortBy in args.sort: results = formatter.sortList(results, sortBy , args.des) - print(args.des) + print() print(tabulate(results, headers="keys", tablefmt="github")) print() From f13ed042839ba892cd67933c4aaf198c0f851a35 Mon Sep 17 00:00:00 2001 From: Srujana Rao Date: Tue, 2 Nov 2021 21:14:47 -0400 Subject: [PATCH 25/31] Update slash.py --- src/slash.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/slash.py b/src/slash.py index e38f5be9..91d0baca 100644 --- a/src/slash.py +++ b/src/slash.py @@ -28,6 +28,7 @@ def main(): parser.add_argument('--des', action='store_true', help="Sort in descending (non-increasing) order") parser.add_argument('--cd', type=str, help="Change directory to save CSV file with search results", default=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),"csvs")) parser.add_argument('--csv', action='store_true', help="Save results as CSV",) + parser.add_argument('--currency', type=str, help="Display the amount in specified currency") args = parser.parse_args() if args.full=='T': From b5509e8af329cc1e3d1bc8c86ed914c7e263d923 Mon Sep 17 00:00:00 2001 From: srujanarao Date: Tue, 2 Nov 2021 21:44:09 -0400 Subject: [PATCH 26/31] changes to currency function --- .idea/.gitignore | 3 +++ .idea/inspectionProfiles/Project_Default.xml | 14 +++++++++++++ .../inspectionProfiles/profiles_settings.xml | 6 ++++++ .idea/misc.xml | 4 ++++ .idea/modules.xml | 8 ++++++++ .idea/slash_new.iml | 14 +++++++++++++ .idea/vcs.xml | 6 ++++++ src/formatter.py | 16 +++++++++++++-- src/scraper.py | 20 +++++++++---------- src/slash.py | 3 ++- 10 files changed, 81 insertions(+), 13 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/slash_new.iml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 00000000..26d33521 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 00000000..5064a650 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,14 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 00000000..105ce2da --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 00000000..2305f400 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 00000000..7ad07a6b --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/slash_new.iml b/.idea/slash_new.iml new file mode 100644 index 00000000..8e5446ac --- /dev/null +++ b/.idea/slash_new.iml @@ -0,0 +1,14 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 00000000..94a25f7f --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/src/formatter.py b/src/formatter.py index 77544811..2c2726f5 100644 --- a/src/formatter.py +++ b/src/formatter.py @@ -15,19 +15,21 @@ from datetime import datetime import math -def formatResult(website, titles, prices, links,ratings,df_flag): +def formatResult(website, titles, prices, links,ratings,df_flag, currency): """ The formatResult function takes the scraped HTML as input, and extracts the necessary values from the HTML code. Ex. extracting a price '$19.99' from a paragraph tag. """ - title, price, link, rating = '', '', '', '' + + title, price, currency, link, rating, converted_cur = '', '', '', '', '', '' if titles: title = titles[0].get_text().strip() if prices: price = prices[0].get_text().strip() if links: link = links[0]['href'] if ratings: rating = ratings[0].get_text().strip().split()[0] if df_flag==0: title=formatTitle(title) if df_flag==0: link=formatTitle(link) + if currency: converted_cur = getCurrency(currency, price) product = { 'timestamp': datetime.now().strftime("%d/%m/%Y %H:%M:%S"), "title": title, @@ -35,6 +37,7 @@ def formatResult(website, titles, prices, links,ratings,df_flag): "link":f'www.{website}.com{link}', "website": website, "rating" : rating, + "converted price": converted_cur } return product @@ -82,3 +85,12 @@ def getNumbers(st): except: ans = math.inf return ans + +def getCurrency(currency, price): + + converted_cur = '' + if currency == "inr": + converted_cur = 75 * price + elif currency == "euro": + converted_cur = 1.16 * price + return converted_cur diff --git a/src/scraper.py b/src/scraper.py index d280a8ff..ed3176ee 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -33,7 +33,7 @@ def httpsGet(URL): return BeautifulSoup(soup1.prettify(), "html.parser") -def searchAmazon(query, df_flag): +def searchAmazon(query, df_flag, currency): """ The searchAmazon function scrapes amazon.com """ @@ -45,11 +45,11 @@ def searchAmazon(query, df_flag): for res in results: titles, prices, links = res.select("h2 a span"), res.select("span.a-price span"), res.select("h2 a.a-link-normal") ratings = res.select("span.a-icon-alt") - product = formatter.formatResult("amazon", titles, prices, links,ratings, df_flag) + product = formatter.formatResult("amazon", titles, prices, links,ratings, df_flag, currency) products.append(product) return products -def searchWalmart(query, df_flag): +def searchWalmart(query, df_flag, currency): """ The searchWalmart function scrapes walmart.com """ @@ -63,11 +63,11 @@ def searchWalmart(query, df_flag): for res in results: titles, prices, links = res.select("span.lh-title"), res.select("div.lh-copy"), res.select("a") ratings = res.findAll("span",{"class":"w_Cj"},text=pattern) - product = formatter.formatResult("walmart", titles, prices, links,ratings, df_flag) + product = formatter.formatResult("walmart", titles, prices, links,ratings, df_flag, currency) products.append(product) return products -def searchEtsy(query, df_flag): +def searchEtsy(query, df_flag, currency): """ The searchEtsy function scrapes Etsy.com """ @@ -81,14 +81,14 @@ def searchEtsy(query, df_flag): for item in soup.select('.wt-grid__item-xs-6'): titles, prices, links = (item.select("h3")), (item.select(".currency-value")), (item.select('.width-full')) ratings = item.select('span.screen-reader-only') - product = formatter.formatResult("Etsy", titles, prices, links, ratings, df_flag) + product = formatter.formatResult("Etsy", titles, prices, links, ratings, df_flag, currency) products.append(product) return products -def driver(product, num=None, df_flag=0,csv=False,cd=None): - products_1 = searchAmazon(product,df_flag) - products_2 = searchWalmart(product,df_flag) - products_3 = searchEtsy(product,df_flag) +def driver(product, currency, num=None, df_flag=0,csv=False,cd=None): + products_1 = searchAmazon(product,df_flag, currency) + products_2 = searchWalmart(product,df_flag, currency) + products_3 = searchEtsy(product,df_flag, currency) results=products_1+products_2+products_3 if csv==True: diff --git a/src/slash.py b/src/slash.py index e38f5be9..786d330a 100644 --- a/src/slash.py +++ b/src/slash.py @@ -28,6 +28,7 @@ def main(): parser.add_argument('--des', action='store_true', help="Sort in descending (non-increasing) order") parser.add_argument('--cd', type=str, help="Change directory to save CSV file with search results", default=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),"csvs")) parser.add_argument('--csv', action='store_true', help="Save results as CSV",) + parser.add_argument('--currency', type=str, help="Display the amount in specified currency") args = parser.parse_args() if args.full=='T': @@ -35,7 +36,7 @@ def main(): full_version.full_version().driver() return - results = scraper.driver(args.search,args.num,csv=args.csv,cd=args.cd) + results = scraper.driver(args.search, args.currency, args.num,csv=args.csv,cd=args.cd,) for sortBy in args.sort: From dc8e99ee3369d1022b8cec3d4672eb812b5151a7 Mon Sep 17 00:00:00 2001 From: antgad Date: Tue, 2 Nov 2021 19:01:08 -0700 Subject: [PATCH 27/31] currency conversion debugging --- src/formatter.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/formatter.py b/src/formatter.py index 2c2726f5..a36f0091 100644 --- a/src/formatter.py +++ b/src/formatter.py @@ -22,9 +22,11 @@ def formatResult(website, titles, prices, links,ratings,df_flag, currency): a paragraph tag. """ - title, price, currency, link, rating, converted_cur = '', '', '', '', '', '' + title, price, link, rating, converted_cur = '', '', '', '', '' if titles: title = titles[0].get_text().strip() if prices: price = prices[0].get_text().strip() + if '$' not in price: + price='$'+price if links: link = links[0]['href'] if ratings: rating = ratings[0].get_text().strip().split()[0] if df_flag==0: title=formatTitle(title) @@ -39,6 +41,7 @@ def formatResult(website, titles, prices, links,ratings,df_flag, currency): "rating" : rating, "converted price": converted_cur } + return product @@ -88,9 +91,11 @@ def getNumbers(st): def getCurrency(currency, price): - converted_cur = '' - if currency == "inr": - converted_cur = 75 * price - elif currency == "euro": - converted_cur = 1.16 * price + converted_cur = 0.0 + if len(price)>1 : + if currency == "inr": + converted_cur = 75 * int(price[(price.index("$")+1):price.index(".")]) + elif currency == "euro": + converted_cur = 1.16 * int(price[(price.index("$")+1):price.index(".")]) + converted_cur=currency.upper()+' '+str(converted_cur) return converted_cur From 1243bc7602f9e45d9cb2be02c322dade22689b95 Mon Sep 17 00:00:00 2001 From: antgad Date: Tue, 2 Nov 2021 19:04:26 -0700 Subject: [PATCH 28/31] Update formatter.py --- src/formatter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/formatter.py b/src/formatter.py index a36f0091..b1ad8e8e 100644 --- a/src/formatter.py +++ b/src/formatter.py @@ -94,8 +94,8 @@ def getCurrency(currency, price): converted_cur = 0.0 if len(price)>1 : if currency == "inr": - converted_cur = 75 * int(price[(price.index("$")+1):price.index(".")]) + converted_cur = 75 * int(price[(price.index("$")+1):price.index(".")].replace(",","")) elif currency == "euro": - converted_cur = 1.16 * int(price[(price.index("$")+1):price.index(".")]) + converted_cur = 1.16 * int(price[(price.index("$")+1):price.index(".")].replace(",","")) converted_cur=currency.upper()+' '+str(converted_cur) return converted_cur From 6fd12c88c1731eab2e2e059f7866ca7b37f3d649 Mon Sep 17 00:00:00 2001 From: sskarra1234 <89954066+sskarra1234@users.noreply.github.com> Date: Wed, 3 Nov 2021 18:13:30 -0400 Subject: [PATCH 29/31] use dataframes --- src/scraper.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/scraper.py b/src/scraper.py index ed3176ee..214b6b86 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -4,7 +4,6 @@ terms of the MIT license. You should have received a copy of the MIT license with this file. If not, please write to: secheaper@gmail.com - """ """ @@ -17,7 +16,8 @@ import re import csv_writer import csv - +from datetime import datetime +import pandas as pd def httpsGet(URL): """ @@ -86,13 +86,13 @@ def searchEtsy(query, df_flag, currency): return products def driver(product, currency, num=None, df_flag=0,csv=False,cd=None): + now=datetime.now() + file_name=product+now.strftime("%m%d%y_%H%M")+'.csv' products_1 = searchAmazon(product,df_flag, currency) products_2 = searchWalmart(product,df_flag, currency) products_3 = searchEtsy(product,df_flag, currency) results=products_1+products_2+products_3 if csv==True: - - print("CSV Saved at: ",cd) - print("File Name:", csv_writer.write_csv(results, product, cd)) + df=pd.DataFrame(results) + df.to_csv(file_name, encoding='utf-8', index=False) return products_1[:num]+products_2[:num]+products_3[:num] - From 4555170056edaeb09e5b8c6f8e8d970e01a6602b Mon Sep 17 00:00:00 2001 From: sskarra1234 <89954066+sskarra1234@users.noreply.github.com> Date: Wed, 3 Nov 2021 19:21:47 -0400 Subject: [PATCH 30/31] add Docstrings for the functions --- src/csv_writer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/csv_writer.py b/src/csv_writer.py index 460bd877..41b3e654 100644 --- a/src/csv_writer.py +++ b/src/csv_writer.py @@ -4,6 +4,9 @@ def write_csv(arr,product,file_path): + ''' Returns the CSV file with the naming nomenclature as 'ProductDate_Time' + Parameters- product: product entered by the user, file_path: path where the csv needs to be stored + Returns- file_name: CSV file ''' os.chdir(file_path) keys = arr[0].keys() now=datetime.now() @@ -13,4 +16,4 @@ def write_csv(arr,product,file_path): dict_writer.writeheader() dict_writer.writerows(arr) a_file.close() - return file_name \ No newline at end of file + return file_name From 48e51957a78e1e06411a9e4ed18a80c70c8813a9 Mon Sep 17 00:00:00 2001 From: sskarra1234 <89954066+sskarra1234@users.noreply.github.com> Date: Wed, 3 Nov 2021 20:26:44 -0400 Subject: [PATCH 31/31] add docstrings --- src/scraper.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/scraper.py b/src/scraper.py index 214b6b86..7b763433 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -16,8 +16,11 @@ import re import csv_writer import csv -from datetime import datetime import pandas as pd +import os +from datetime import datetime + + def httpsGet(URL): """ @@ -79,20 +82,31 @@ def searchEtsy(query, df_flag, currency): response = requests.get(url, headers=headers) soup = BeautifulSoup(response.content, 'lxml') for item in soup.select('.wt-grid__item-xs-6'): - titles, prices, links = (item.select("h3")), (item.select(".currency-value")), (item.select('.width-full')) + str = (item.select("a")) + if str == []: + continue + else: + links = str + titles, prices = (item.select("h3")), (item.select(".currency-value")) ratings = item.select('span.screen-reader-only') product = formatter.formatResult("Etsy", titles, prices, links, ratings, df_flag, currency) products.append(product) return products def driver(product, currency, num=None, df_flag=0,csv=False,cd=None): - now=datetime.now() - file_name=product+now.strftime("%m%d%y_%H%M")+'.csv' products_1 = searchAmazon(product,df_flag, currency) products_2 = searchWalmart(product,df_flag, currency) products_3 = searchEtsy(product,df_flag, currency) results=products_1+products_2+products_3 + result_condensed=products_1[:num]+products_2[:num]+products_3[:num] + result_condensed=pd.DataFrame.from_dict(result_condensed,orient='columns') + results =pd.DataFrame.from_dict(results, orient='columns') + if currency=="" or currency==None: + results=results.drop(columns='converted price') + result_condensed=result_condensed.drop(columns='converted price') if csv==True: - df=pd.DataFrame(results) - df.to_csv(file_name, encoding='utf-8', index=False) - return products_1[:num]+products_2[:num]+products_3[:num] + file_name=os.path.join(cd,(product+datetime.now().strftime("%y%m%d_%H%M")+".csv")) + print("CSV Saved at: ",cd) + print("File Name:", file_name) + results.to_csv(file_name, index=False,header=results.columns) + return result_condensed