From f5297aabd51b51ae0447e5fa1ba26233b3b4b80e Mon Sep 17 00:00:00 2001 From: rafrox <37214120+rafrox@users.noreply.github.com> Date: Sun, 11 Mar 2018 23:35:01 +1100 Subject: [PATCH 1/8] rafchanges --- .../price_monitor/resources/urls.json | 26 ++----------------- .../price_monitor/settings.py | 2 +- .../price_monitor/spiders/amazon.py | 19 +++++++++----- 3 files changed, 15 insertions(+), 32 deletions(-) diff --git a/scrapy_price_monitor/price_monitor/resources/urls.json b/scrapy_price_monitor/price_monitor/resources/urls.json index 6bc20ba..2c7dc4b 100644 --- a/scrapy_price_monitor/price_monitor/resources/urls.json +++ b/scrapy_price_monitor/price_monitor/resources/urls.json @@ -1,27 +1,5 @@ { - "headsetlogitech": [ - "https://www.amazon.com/Logitech-Wireless-Headset-Over-Design/dp/B005GTO07O/", - "http://www.bestbuy.com/site/logitech-h600-wireless-headset-black/3436118.p", - "http://www.ebay.com/itm/N-Logitech-Wireless-Headset-H600-Over-The-Head-Design-981-000341-/110985874014" - ], - "webcamlogitech": [ - "https://www.amazon.com/Logitech-Widescreen-Calling-Recording-Desktop/dp/B006JH8T3S/", - "http://www.bestbuy.com/site/logitech-c920-pro-webcam-black/4612476.p?skuId=4612476", - "http://www.ebay.com/itm/Logitech-HD-Pro-Webcam-C920-1080p-Widescreen-Video-Calling-and-Recording-/272381890214" - ], - "amazonechodot": [ - "https://www.amazon.com/dp/B01DFKC2SO", - "http://www.bestbuy.com/site/amazon-echo-dot/5578851.p?skuId=5578851", - "http://www.ebay.com/itm/Amazon-Echo-Dot-2nd-Generation-w-Alexa-Voice-Media-Device-All-New-2016-/201668562192" - ], - "nikoncoolpix": [ - "https://www.amazon.com/Nikon-COOLPIX-B500-Digital-Camera/dp/B01C3LEE9G/", - "http://www.bestbuy.com/site/nikon-coolpix-b500-16-0-megapixel-digital-camera-red/4997500.p?skuId=4997500", - "http://www.ebay.com/itm/Nikon-COOLPIX-B500-Digital-Camera-Red-/162225974018" - ], - "bluemicrophone": [ - "https://www.amazon.com/Blue-Snowball-iCE-Condenser-Microphone/dp/B014PYGTUQ/", - "http://www.bestbuy.com/site/blue-microphones-snowball-usb-cardioid-and-omnidirectional-electret-condenser-vocal-microphone-black/9918056.p?skuId=9918056", - "http://www.ebay.com/itm/Blue-Microphones-Snowball-Black-iCE-Condenser-Microphone-/172260373002" + "On Sale Mil": [ + "http://www.milsims.com.au/catalog/1746/", ] } diff --git a/scrapy_price_monitor/price_monitor/settings.py b/scrapy_price_monitor/price_monitor/settings.py index 9888b56..2fb0115 100644 --- a/scrapy_price_monitor/price_monitor/settings.py +++ b/scrapy_price_monitor/price_monitor/settings.py @@ -9,7 +9,7 @@ SHUB_KEY = os.getenv('$SHUB_KEY') # if you want to run it locally, replace '999999' by your Scrapy Cloud project ID below -SHUB_PROJ_ID = os.getenv('SHUB_JOBKEY', '999999').split('/')[0] +SHUB_PROJ_ID = os.getenv('SHUB_JOBKEY', '291701').split('/')[0] # settings for Amazon SES email service diff --git a/scrapy_price_monitor/price_monitor/spiders/amazon.py b/scrapy_price_monitor/price_monitor/spiders/amazon.py index 0f3ec3c..c1258de 100644 --- a/scrapy_price_monitor/price_monitor/spiders/amazon.py +++ b/scrapy_price_monitor/price_monitor/spiders/amazon.py @@ -2,13 +2,18 @@ class AmazonSpider(BaseSpider): - name = "amazon.com" + name = "milsims.com" def parse(self, response): - item = response.meta.get('item', {}) - item['url'] = response.url - item['title'] = response.css("span#productTitle::text").extract_first("").strip() - item['price'] = float( - response.css("span#priceblock_ourprice::text").re_first("\$(.*)") or 0 - ) + for product in response.css(".view-advanced-catalog tr > 
td"): + + item = {} + item['title'] = product.css(".views-field-title a ::text").extract_first() + item['price'] = product.css(".views-field-phpcode span span::text").extract()[1] + item['url'] = product.css(".views-field-title a::attr(href)").extract() yield item + + next_page = response.css('li.pager-nexta::attr(href)').extract_first() + if next_page is not None: + next_page = response.urljoin(next_page) + yield scrapy.Request(next_page, callback=self.parse) From 792a573859f8522f3181b6b0e75d70138d3b4115 Mon Sep 17 00:00:00 2001 From: rafrox <37214120+rafrox@users.noreply.github.com> Date: Sun, 11 Mar 2018 23:59:29 +1100 Subject: [PATCH 2/8] Test --- bin/monitor.py | 119 +++++++++++++++++ .../__init__.py | 0 .../quotes_crawler => price_monitor}/items.py | 2 +- price_monitor/pipelines.py | 21 +++ price_monitor/resources/urls.json | 5 + price_monitor/settings.py | 27 ++++ .../spiders/__init__.py | 0 price_monitor/spiders/amazon.py | 19 +++ price_monitor/spiders/base_spider.py | 16 +++ price_monitor/spiders/bestbuy.py | 14 ++ price_monitor/spiders/ebay.py | 17 +++ price_monitor/templates/email.html | 14 ++ price_monitor/utils.py | 35 +++++ quotes_crawler/README.md | 13 -- quotes_crawler/quotes_crawler/pipelines.py | 11 -- quotes_crawler/quotes_crawler/settings.py | 90 ------------- .../spiders/toscrape-csrf-login-v1.py | 35 ----- .../spiders/toscrape-csrf-login-v2.py | 32 ----- .../quotes_crawler/spiders/toscrape-css.py | 21 --- .../spiders/toscrape-infinite-scrolling.py | 18 --- .../quotes_crawler/spiders/toscrape-js.py | 25 ---- .../spiders/toscrape-microdata.py | 19 --- .../spiders/toscrape-selenium.py | 28 ---- .../quotes_crawler/spiders/toscrape-xpath.py | 21 --- quotes_crawler/requirements.txt | 3 - requirements.txt | 5 + sc_custom_image/README.md | 4 - sc_custom_image/requirements.txt | 1 - sc_custom_image/sc_custom_image/__init__.py | 0 sc_custom_image/sc_custom_image/items.py | 14 -- sc_custom_image/sc_custom_image/pipelines.py | 11 -- sc_custom_image/sc_custom_image/settings.py | 90 ------------- .../sc_custom_image/spiders/__init__.py | 4 - .../sc_custom_image/spiders/demo.py | 23 ---- sc_custom_image/scrapinghub.yml | 3 - sc_custom_image/scrapy.cfg | 11 -- sc_scripts_demo/bin/check_jobs.py | 121 ------------------ sc_scripts_demo/requirements.txt | 1 - sc_scripts_demo/sc_scripts_demo/__init__.py | 0 sc_scripts_demo/sc_scripts_demo/settings.py | 10 -- .../sc_scripts_demo/spiders/__init__.py | 4 - .../sc_scripts_demo/spiders/bad_spider.py | 12 -- .../sc_scripts_demo/spiders/good_spider.py | 21 --- sc_scripts_demo/scrapinghub.yml | 3 - sc_scripts_demo/scrapy.cfg | 11 -- sc_scripts_demo/setup.py | 14 -- .../scrapinghub.yml => scrapinghub.yml | 2 +- quotes_crawler/scrapy.cfg => scrapy.cfg | 4 +- setup.py | 12 ++ splash_based_project/scrapy.cfg | 11 -- .../splash_based_project/__init__.py | 0 .../splash_based_project/settings.py | 20 --- .../splash_based_project/spiders/__init__.py | 4 - .../spiders/quotes-js-1.py | 27 ---- .../spiders/quotes-js-2.py | 36 ------ splash_crawlera_example/README.md | 28 ---- splash_crawlera_example/requirements.txt | 1 - splash_crawlera_example/scrapy.cfg | 11 -- splash_crawlera_example/setup.py | 10 -- .../splash_crawlera_example/__init__.py | 0 .../scripts/crawlera.lua | 49 ------- .../splash_crawlera_example/settings.py | 22 ---- .../spiders/__init__.py | 4 - .../spiders/quotes-js.py | 54 -------- 64 files changed, 308 insertions(+), 955 deletions(-) create mode 100644 bin/monitor.py rename {quotes_crawler/quotes_crawler => price_monitor}/__init__.py 
(100%) rename {quotes_crawler/quotes_crawler => price_monitor}/items.py (86%) create mode 100644 price_monitor/pipelines.py create mode 100644 price_monitor/resources/urls.json create mode 100644 price_monitor/settings.py rename {quotes_crawler/quotes_crawler => price_monitor}/spiders/__init__.py (100%) create mode 100644 price_monitor/spiders/amazon.py create mode 100644 price_monitor/spiders/base_spider.py create mode 100644 price_monitor/spiders/bestbuy.py create mode 100644 price_monitor/spiders/ebay.py create mode 100644 price_monitor/templates/email.html create mode 100644 price_monitor/utils.py delete mode 100644 quotes_crawler/README.md delete mode 100644 quotes_crawler/quotes_crawler/pipelines.py delete mode 100644 quotes_crawler/quotes_crawler/settings.py delete mode 100644 quotes_crawler/quotes_crawler/spiders/toscrape-csrf-login-v1.py delete mode 100644 quotes_crawler/quotes_crawler/spiders/toscrape-csrf-login-v2.py delete mode 100644 quotes_crawler/quotes_crawler/spiders/toscrape-css.py delete mode 100644 quotes_crawler/quotes_crawler/spiders/toscrape-infinite-scrolling.py delete mode 100644 quotes_crawler/quotes_crawler/spiders/toscrape-js.py delete mode 100644 quotes_crawler/quotes_crawler/spiders/toscrape-microdata.py delete mode 100644 quotes_crawler/quotes_crawler/spiders/toscrape-selenium.py delete mode 100644 quotes_crawler/quotes_crawler/spiders/toscrape-xpath.py delete mode 100644 quotes_crawler/requirements.txt create mode 100644 requirements.txt delete mode 100644 sc_custom_image/README.md delete mode 100644 sc_custom_image/requirements.txt delete mode 100644 sc_custom_image/sc_custom_image/__init__.py delete mode 100644 sc_custom_image/sc_custom_image/items.py delete mode 100644 sc_custom_image/sc_custom_image/pipelines.py delete mode 100644 sc_custom_image/sc_custom_image/settings.py delete mode 100644 sc_custom_image/sc_custom_image/spiders/__init__.py delete mode 100644 sc_custom_image/sc_custom_image/spiders/demo.py delete mode 100644 sc_custom_image/scrapinghub.yml delete mode 100644 sc_custom_image/scrapy.cfg delete mode 100644 sc_scripts_demo/bin/check_jobs.py delete mode 100644 sc_scripts_demo/requirements.txt delete mode 100644 sc_scripts_demo/sc_scripts_demo/__init__.py delete mode 100644 sc_scripts_demo/sc_scripts_demo/settings.py delete mode 100644 sc_scripts_demo/sc_scripts_demo/spiders/__init__.py delete mode 100644 sc_scripts_demo/sc_scripts_demo/spiders/bad_spider.py delete mode 100644 sc_scripts_demo/sc_scripts_demo/spiders/good_spider.py delete mode 100644 sc_scripts_demo/scrapinghub.yml delete mode 100644 sc_scripts_demo/scrapy.cfg delete mode 100644 sc_scripts_demo/setup.py rename splash_crawlera_example/scrapinghub.yml => scrapinghub.yml (62%) rename quotes_crawler/scrapy.cfg => scrapy.cfg (78%) create mode 100644 setup.py delete mode 100644 splash_based_project/scrapy.cfg delete mode 100644 splash_based_project/splash_based_project/__init__.py delete mode 100644 splash_based_project/splash_based_project/settings.py delete mode 100644 splash_based_project/splash_based_project/spiders/__init__.py delete mode 100644 splash_based_project/splash_based_project/spiders/quotes-js-1.py delete mode 100644 splash_based_project/splash_based_project/spiders/quotes-js-2.py delete mode 100644 splash_crawlera_example/README.md delete mode 100644 splash_crawlera_example/requirements.txt delete mode 100644 splash_crawlera_example/scrapy.cfg delete mode 100644 splash_crawlera_example/setup.py delete mode 100644 
splash_crawlera_example/splash_crawlera_example/__init__.py delete mode 100644 splash_crawlera_example/splash_crawlera_example/scripts/crawlera.lua delete mode 100644 splash_crawlera_example/splash_crawlera_example/settings.py delete mode 100644 splash_crawlera_example/splash_crawlera_example/spiders/__init__.py delete mode 100644 splash_crawlera_example/splash_crawlera_example/spiders/quotes-js.py diff --git a/bin/monitor.py b/bin/monitor.py new file mode 100644 index 0000000..a9dc370 --- /dev/null +++ b/bin/monitor.py @@ -0,0 +1,119 @@ +"""Simple price monitor built with Scrapy and Scrapy Cloud +""" +import argparse +import os +from datetime import datetime, timedelta + +import boto +from hubstorage import HubstorageClient +from jinja2 import Environment, PackageLoader +from price_monitor import settings +from price_monitor.utils import get_product_names, get_retailers_for_product +from w3lib.html import remove_tags + +jinja_env = Environment(loader=PackageLoader('price_monitor', 'templates')) + + +class DealsChecker(object): + + def __init__(self, latest_deals, previous_deals, price_threshold=0): + self.price_threshold = price_threshold + self.latest_deals = latest_deals + self.previous_deals = previous_deals + + def is_from_latest_crawl(self, deal): + """Checks whether the given deal is from the most recent execution. + """ + return deal in self.latest_deals + + def get_best_deal(self): + """Returns the item with the best overall price. self.price_threshold can be set to avoid + considering minor price drops. + """ + best_so_far = min(self.previous_deals, key=lambda x: x.get('price')) + best_from_last = min(self.latest_deals, key=lambda x: x.get('price')) + if best_from_last.get('price') + self.price_threshold < best_so_far.get('price'): + return best_from_last + else: + return best_so_far + + +class DealsFetcher(object): + + def __init__(self, product_name, apikey, project_id, hours): + self.product_name = product_name + project = HubstorageClient(apikey).get_project(project_id) + self.item_store = project.collections.new_store(product_name) + self.load_items_from_last_n_hours(hours) + + def load_items_from_last_n_hours(self, n=24): + """Load items from the last n hours, from the newest to the oldest. + """ + since_time = int((datetime.now() - timedelta(hours=n)).timestamp() * 1000) + self.deals = [item.get('value') for item in self.fetch_deals_newer_than(since_time)] + + def fetch_deals_newer_than(self, since_time): + return list(self.item_store.get(meta=['_key', '_ts'], startts=since_time)) + + def get_latest_deal_from_retailer(self, retailer): + """Returns the most recently extracted deal from a given retailer. 
+ """ + for deals in self.deals: + if retailer in deals.get('url'): + return deals + + def get_deals(self): + """Returns a tuple with (deals from latest crawl, deals from previous crawls) + """ + latest_deals = [ + self.get_latest_deal_from_retailer(retailer) + for retailer in get_retailers_for_product(self.product_name) + ] + previous_deals = [ + deal for deal in self.deals if deal not in latest_deals + ] + return latest_deals, previous_deals + + +def send_email_alert(items): + ses = boto.connect_ses(settings.AWS_ACCESS_KEY, settings.AWS_SECRET_KEY) + html_body = jinja_env.get_template('email.html').render(items=items) + + ses.send_email( + settings.EMAIL_ALERT_FROM, + 'Price drop alert', + remove_tags(html_body), + settings.EMAIL_ALERT_TO, + html_body=html_body + ) + + +def main(args): + items = [] + for prod_name in get_product_names(): + fetcher = DealsFetcher(prod_name, args.apikey, args.project, args.days * 24) + checker = DealsChecker(*fetcher.get_deals(), args.threshold) + best_deal = checker.get_best_deal() + if checker.is_from_latest_crawl(best_deal): + items.append(best_deal) + + if items: + send_email_alert(items) + + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--apikey', default=settings.SHUB_KEY or os.getenv('SHUB_KEY'), + help='API key to use for scrapinghub (fallbacks to SHUB_KEY variable)') + parser.add_argument('--days', type=int, default=1, + help='How many days back to compare with the last price') + parser.add_argument('--threshold', type=float, default=0, + help='A margin to avoid raising alerts with minor price drops') + parser.add_argument('--project', type=int, default=settings.SHUB_PROJ_ID, + help='Project ID to get info from') + + return parser.parse_args() + + +if __name__ == '__main__': + main(parse_args()) diff --git a/quotes_crawler/quotes_crawler/__init__.py b/price_monitor/__init__.py similarity index 100% rename from quotes_crawler/quotes_crawler/__init__.py rename to price_monitor/__init__.py diff --git a/quotes_crawler/quotes_crawler/items.py b/price_monitor/items.py similarity index 86% rename from quotes_crawler/quotes_crawler/items.py rename to price_monitor/items.py index 318f4fb..20a91f9 100644 --- a/quotes_crawler/quotes_crawler/items.py +++ b/price_monitor/items.py @@ -8,7 +8,7 @@ import scrapy -class QuotesCrawlerItem(scrapy.Item): +class PriceMonitorItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() pass diff --git a/price_monitor/pipelines.py b/price_monitor/pipelines.py new file mode 100644 index 0000000..18de561 --- /dev/null +++ b/price_monitor/pipelines.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +from price_monitor import settings +from hubstorage import HubstorageClient +from price_monitor.utils import reversed_timestamp, get_product_names + + +class CollectionStoragePipeline(object): + + def open_spider(self, spider): + client = HubstorageClient(auth=settings.SHUB_KEY) + project = client.get_project(settings.SHUB_PROJ_ID) + self.data_stores = {} + for product_name in get_product_names(): + self.data_stores[product_name] = project.collections.new_store(product_name) + + def process_item(self, item, spider): + key = "{}-{}-{}".format( + reversed_timestamp(), item.get('product_name'), item.get('retailer') + ) + self.data_stores[item['product_name']].set({'_key': key, 'value': item}) + return item diff --git a/price_monitor/resources/urls.json b/price_monitor/resources/urls.json new file mode 100644 index 0000000..2c7dc4b --- /dev/null +++ 
b/price_monitor/resources/urls.json @@ -0,0 +1,5 @@ +{ + "On Sale Mil": [ + "http://www.milsims.com.au/catalog/1746/", + ] +} diff --git a/price_monitor/settings.py b/price_monitor/settings.py new file mode 100644 index 0000000..2fb0115 --- /dev/null +++ b/price_monitor/settings.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +import os + +BOT_NAME = 'price_monitor' +SPIDER_MODULES = ['price_monitor.spiders'] +NEWSPIDER_MODULE = 'price_monitor.spiders' + +ROBOTSTXT_OBEY = True + +SHUB_KEY = os.getenv('$SHUB_KEY') +# if you want to run it locally, replace '999999' by your Scrapy Cloud project ID below +SHUB_PROJ_ID = os.getenv('SHUB_JOBKEY', '291701').split('/')[0] + + +# settings for Amazon SES email service +AWS_ACCESS_KEY = os.getenv('$AWS_ACCESS_KEY') +AWS_SECRET_KEY = os.getenv('$AWS_SECRET_KEY') +EMAIL_ALERT_FROM = 'Price Monitor ' +EMAIL_ALERT_TO = ['RECEIVER_EMAIL@provider.com'] + +# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'price_monitor.pipelines.CollectionStoragePipeline': 400, +} + +AUTOTHROTTLE_ENABLED = True +# HTTPCACHE_ENABLED = True diff --git a/quotes_crawler/quotes_crawler/spiders/__init__.py b/price_monitor/spiders/__init__.py similarity index 100% rename from quotes_crawler/quotes_crawler/spiders/__init__.py rename to price_monitor/spiders/__init__.py diff --git a/price_monitor/spiders/amazon.py b/price_monitor/spiders/amazon.py new file mode 100644 index 0000000..c1258de --- /dev/null +++ b/price_monitor/spiders/amazon.py @@ -0,0 +1,19 @@ +from .base_spider import BaseSpider + + +class AmazonSpider(BaseSpider): + name = "milsims.com" + + def parse(self, response): + for product in response.css(".view-advanced-catalog tr > td"): + + item = {} + item['title'] = product.css(".views-field-title a ::text").extract_first() + item['price'] = product.css(".views-field-phpcode span span::text").extract()[1] + item['url'] = product.css(".views-field-title a::attr(href)").extract() + yield item + + next_page = response.css('li.pager-nexta::attr(href)').extract_first() + if next_page is not None: + next_page = response.urljoin(next_page) + yield scrapy.Request(next_page, callback=self.parse) diff --git a/price_monitor/spiders/base_spider.py b/price_monitor/spiders/base_spider.py new file mode 100644 index 0000000..e726c9c --- /dev/null +++ b/price_monitor/spiders/base_spider.py @@ -0,0 +1,16 @@ +import json +import pkgutil +import scrapy +from datetime import datetime + + +class BaseSpider(scrapy.Spider): + + def start_requests(self): + products = json.loads(pkgutil.get_data('price_monitor', 'resources/urls.json').decode()) + for name, urls in products.items(): + for url in urls: + if self.name in url: + now = datetime.now().strftime('%Y/%m/%d %H:%M:%S') + item = {'product_name': name, 'retailer': self.name, 'when': now} + yield scrapy.Request(url, meta={'item': item}) diff --git a/price_monitor/spiders/bestbuy.py b/price_monitor/spiders/bestbuy.py new file mode 100644 index 0000000..03c49f6 --- /dev/null +++ b/price_monitor/spiders/bestbuy.py @@ -0,0 +1,14 @@ +from .base_spider import BaseSpider + + +class BestbuySpider(BaseSpider): + name = "bestbuy.com" + + def parse(self, response): + item = response.meta.get('item', {}) + item['url'] = response.url + item['title'] = response.css("div#sku-title > h1 ::text").extract_first().strip() + item['price'] = float( + response.css('div.price-block ::attr(data-customer-price)').extract_first(default=0) + ) + yield item diff --git a/price_monitor/spiders/ebay.py 
b/price_monitor/spiders/ebay.py new file mode 100644 index 0000000..7721fa6 --- /dev/null +++ b/price_monitor/spiders/ebay.py @@ -0,0 +1,17 @@ +from extruct.w3cmicrodata import MicrodataExtractor +from .base_spider import BaseSpider + + +class EbaySpider(BaseSpider): + name = "ebay.com" + + def parse(self, response): + extractor = MicrodataExtractor() + properties = extractor.extract(response.body_as_unicode()).get('items')[0].get('properties', {}) + item = response.meta.get('item', {}) + item['url'] = response.url + item['title'] = properties.get('name').replace('Details about', '').strip() + item['price'] = float( + properties.get('offers', {}).get('properties', {}).get('price', 0) + ) + yield item diff --git a/price_monitor/templates/email.html b/price_monitor/templates/email.html new file mode 100644 index 0000000..c51ef0c --- /dev/null +++ b/price_monitor/templates/email.html @@ -0,0 +1,14 @@ +

+<html>
+  <h2>🎉 Hey, we found a good deal! 🎁</h2>
+  <table>
+  {% for item in items %}
+    <tr><td>
+      <p>Product: {{item.title}}</p>
+      <p>Price: {{item.price}}</p>
+      <p>Store: {{item.retailer}}</p>
+      <p>Price obtained at: {{item.when}}</p>
+      <p>Visit the product page at {{item.retailer}}: <a href="{{item.url}}">{{item.url}}</a></p>
+    </td></tr>
+  {% endfor %}
+  </table>
+</html>
+ diff --git a/price_monitor/utils.py b/price_monitor/utils.py new file mode 100644 index 0000000..8deb616 --- /dev/null +++ b/price_monitor/utils.py @@ -0,0 +1,35 @@ +import json +import pkgutil +from datetime import datetime, timedelta + + +def timestamp_from_reversed(reversed): + return datetime(5000, 1, 1) - timedelta(seconds=float(reversed)) + + +def reversed_timestamp(): + return str((datetime(5000, 1, 1) - datetime.now()).total_seconds()) + + +def normalize_name(name): + return name.replace('-', '') + + +def get_product_names(): + return [ + normalize_name(name) + for name in json.loads( + pkgutil.get_data("price_monitor", "resources/urls.json").decode() + ).keys() + ] + + +def get_retailer_name_from_url(url): + return url.split("://")[1].split("/")[0].replace("www.", "") + + +def get_retailers_for_product(product_name): + data = json.loads( + pkgutil.get_data("price_monitor", "resources/urls.json").decode() + ) + return {get_retailer_name_from_url(url) for url in data[product_name]} diff --git a/quotes_crawler/README.md b/quotes_crawler/README.md deleted file mode 100644 index 70b417f..0000000 --- a/quotes_crawler/README.md +++ /dev/null @@ -1,13 +0,0 @@ -#Spiders for Quotes.Toscrape.com - -This project contains spiders to scrape many variations of the [quotes.toscrape.com](https://quotes.toscrape.com), such as: - -* `toscrape-css`: scrapes [quotes.toscrape.com](https://quotes.toscrape.com) using CSS selectors; -* `toscrape-xpath`: scrapes [quotes.toscrape.com](https://quotes.toscrape.com) using XPath; -* `toscrape-microdata`: read the semantic markup data from [quotes.toscrape.com](https://quotes.toscrape.com) using [extruct](https://github.com/scrapinghub/extruct); -* `toscrape-js`: scrapes the JavaScript-powered version of `Quotes to Scrape`([quotes.toscrape.com/js](https://quotes.toscrape.com/js)) using [js2xml](https://github.com/scrapinghub/js2xml) to parse the data from inside the JavaScript code; -* `toscrape-selenium`: scrapes the JavaScript-powered version of `Quotes to Scrape`([quotes.toscrape.com/js](https://quotes.toscrape.com/js)) using Selenium + PhantomJS to render the page; -* `toscrape-infinite-scrolling`: scrapes the infinite scrolling version ([quotes.toscrape.com/scroll](https://quotes.toscrape.com/scroll)) via AJAX API calls; -* `toscrape-csrf-login-v1`: authenticates into [quotes.toscrape.com/login](https://quotes.toscrape.com/login) loading the CSRF token manually into the request; -* `toscrape-csrf-login-v2`: authenticates into [quotes.toscrape.com/login](https://quotes.toscrape.com/login) using `FormRequest.from_respose()` to load automatically the CSRF token; - diff --git a/quotes_crawler/quotes_crawler/pipelines.py b/quotes_crawler/quotes_crawler/pipelines.py deleted file mode 100644 index 3d615b2..0000000 --- a/quotes_crawler/quotes_crawler/pipelines.py +++ /dev/null @@ -1,11 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html - - -class QuotesCrawlerPipeline(object): - def process_item(self, item, spider): - return item diff --git a/quotes_crawler/quotes_crawler/settings.py b/quotes_crawler/quotes_crawler/settings.py deleted file mode 100644 index e4cdc07..0000000 --- a/quotes_crawler/quotes_crawler/settings.py +++ /dev/null @@ -1,90 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for quotes_crawler project -# -# For simplicity, this file contains only settings considered important or -# 
commonly used. You can find more settings consulting the documentation: -# -# http://doc.scrapy.org/en/latest/topics/settings.html -# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html -# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'quotes_crawler' - -SPIDER_MODULES = ['quotes_crawler.spiders'] -NEWSPIDER_MODULE = 'quotes_crawler.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'quotes_crawler (+http://www.yourdomain.com)' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'quotes_crawler.middlewares.MyCustomSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'quotes_crawler.middlewares.MyCustomDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html -#ITEM_PIPELINES = { -# 'quotes_crawler.pipelines.SomePipeline': 300, -#} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See http://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/quotes_crawler/quotes_crawler/spiders/toscrape-csrf-login-v1.py b/quotes_crawler/quotes_crawler/spiders/toscrape-csrf-login-v1.py deleted file mode 100644 index 280f72f..0000000 --- a/quotes_crawler/quotes_crawler/spiders/toscrape-csrf-login-v1.py +++ /dev/null @@ -1,35 +0,0 @@ -import scrapy - - -class ToScrapeCSRFLoginSpiderV1(scrapy.Spider): - name = 'toscrape-csrf-login-v1' - start_urls = [ - 'http://quotes.toscrape.com/login' - ] - - 
def parse(self, response): - # Forms with CSRF verification generates a CSRF token for each request - # and they require that same value in the data the client sends back. - # WARNING: - # This could be done automatically using FormRequest.from_response() - # check toscrape-csrf-login-v2.py for reference - token = response.css("input[name=csrf_token] ::attr(value)").extract_first() - yield scrapy.FormRequest( - self.start_urls[0], - formdata={ - 'csrf_token': token, - 'username': 'valdir', - 'password': 'abc' - }, - callback=self.after_login - ) - - def after_login(self, response): - authenticated = response.css('div.header-box p > a::text').extract_first() == 'Logout' - for quote in response.css('div.quote'): - yield { - 'text': quote.css('span::text').extract_first(), - 'author': quote.css('small::text').extract_first(), - 'tags': quote.css('.tags a::text').extract(), - 'authenticated': authenticated, - } diff --git a/quotes_crawler/quotes_crawler/spiders/toscrape-csrf-login-v2.py b/quotes_crawler/quotes_crawler/spiders/toscrape-csrf-login-v2.py deleted file mode 100644 index bc22ced..0000000 --- a/quotes_crawler/quotes_crawler/spiders/toscrape-csrf-login-v2.py +++ /dev/null @@ -1,32 +0,0 @@ -import scrapy - - -class ToScrapeCSRFLoginSpiderV2(scrapy.Spider): - name = 'toscrape-csrf-login-v2' - start_urls = [ - 'http://quotes.toscrape.com/login' - ] - - def parse(self, response): - # FormRequest.from_response automatically loads all the form data that - # is in the form present in the response object. This way, we don't - # have to worry about explicitly loading the CSRF token in the data we - # will POST to the server. - yield scrapy.FormRequest.from_response( - response, - formdata={ - 'username': 'any', - 'password': 'doesnt matter' - }, - callback=self.after_login, - ) - - def after_login(self, response): - authenticated = response.css('div.header-box p > a::text').extract_first() == 'Logout' - for quote in response.css('div.quote'): - yield { - 'text': quote.css('span::text').extract_first(), - 'author': quote.css('small::text').extract_first(), - 'tags': quote.css('.tags a::text').extract(), - 'authenticated': authenticated, - } diff --git a/quotes_crawler/quotes_crawler/spiders/toscrape-css.py b/quotes_crawler/quotes_crawler/spiders/toscrape-css.py deleted file mode 100644 index 7d4ee81..0000000 --- a/quotes_crawler/quotes_crawler/spiders/toscrape-css.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy - - -class ToScrapeCSSSpider(scrapy.Spider): - name = "toscrape-css" - start_urls = [ - 'http://quotes.toscrape.com/', - ] - - def parse(self, response): - for quote in response.css("div.quote"): - yield { - 'text': quote.css("span.text::text").extract_first(), - 'author': quote.css("small.author::text").extract_first(), - 'tags': quote.css("div.tags > a.tag::text").extract() - } - - next_page_url = response.css("li.next > a::attr(href)").extract_first() - if next_page_url is not None: - yield scrapy.Request(response.urljoin(next_page_url)) diff --git a/quotes_crawler/quotes_crawler/spiders/toscrape-infinite-scrolling.py b/quotes_crawler/quotes_crawler/spiders/toscrape-infinite-scrolling.py deleted file mode 100644 index a5492a8..0000000 --- a/quotes_crawler/quotes_crawler/spiders/toscrape-infinite-scrolling.py +++ /dev/null @@ -1,18 +0,0 @@ -import json -import scrapy - - -# Most AJAX based websites can be scraped by reproducing the API calls made -# by the browser, as we do in this simple example that scrapes -# a website paginated via infinite scrolling 
(quotes.toscrape.com/scroll) -class ToScrapeInfiniteScrollingSpider(scrapy.Spider): - name = 'toscrape-infinite-scrolling' - base_url = 'http://quotes.toscrape.com/api/quotes?page=%d' - start_urls = [base_url % 1] - - def parse(self, response): - json_data = json.loads(response.text) - for quote in json_data['quotes']: - yield quote - if json_data['has_next']: - yield scrapy.Request(self.base_url % (int(json_data['page']) + 1)) diff --git a/quotes_crawler/quotes_crawler/spiders/toscrape-js.py b/quotes_crawler/quotes_crawler/spiders/toscrape-js.py deleted file mode 100644 index 9e2e4d5..0000000 --- a/quotes_crawler/quotes_crawler/spiders/toscrape-js.py +++ /dev/null @@ -1,25 +0,0 @@ -import scrapy -import js2xml - - -class ToScrapeJSSpider(scrapy.Spider): - name = 'toscrape-js' - start_urls = [ - 'http://quotes.toscrape.com/js/' - ] - - def parse(self, response): - script = response.xpath('//script[contains(., "var data =")]/text()').extract_first() - sel = scrapy.Selector(_root=js2xml.parse(script)) - for quote in sel.xpath('//var[@name="data"]/array/object'): - yield { - 'text': quote.xpath('string(./property[@name="text"])').extract_first(), - 'author': quote.xpath( - 'string(./property[@name="author"]//property[@name="name"])' - ).extract_first(), - 'tags': quote.xpath('./property[@name="tags"]//string/text()').extract(), - } - - link_next = response.css('li.next a::attr("href")').extract_first() - if link_next: - yield scrapy.Request(response.urljoin(link_next)) diff --git a/quotes_crawler/quotes_crawler/spiders/toscrape-microdata.py b/quotes_crawler/quotes_crawler/spiders/toscrape-microdata.py deleted file mode 100644 index 568788b..0000000 --- a/quotes_crawler/quotes_crawler/spiders/toscrape-microdata.py +++ /dev/null @@ -1,19 +0,0 @@ -import scrapy -from extruct.w3cmicrodata import LxmlMicrodataExtractor - - -class ToScrapeMicrodataSpider(scrapy.Spider): - name = "toscrape-microdata" - start_urls = [ - 'http://quotes.toscrape.com/' - ] - - def parse(self, response): - extractor = LxmlMicrodataExtractor() - items = extractor.extract(response.text, response.url)['items'] - for it in items: - yield it['properties'] - - next_page_url = response.css("li.next > a::attr(href)").extract_first() - if next_page_url is not None: - yield scrapy.Request(response.urljoin(next_page_url)) diff --git a/quotes_crawler/quotes_crawler/spiders/toscrape-selenium.py b/quotes_crawler/quotes_crawler/spiders/toscrape-selenium.py deleted file mode 100644 index 90b6ab3..0000000 --- a/quotes_crawler/quotes_crawler/spiders/toscrape-selenium.py +++ /dev/null @@ -1,28 +0,0 @@ -# -*- coding:utf-8 -*- -import scrapy -from selenium import webdriver - - -# this spider needs PhantomJS (http://phantomjs.org/) installed somewhere in your PATH -class ToScrapeSeleniumSpider(scrapy.Spider): - name = 'toscrape-selenium' - start_urls = [ - 'http://quotes.toscrape.com/js' - ] - - def __init__(self, *args, **kwargs): - self.driver = webdriver.PhantomJS() - super(ToScrapeSeleniumSpider, self).__init__(*args, **kwargs) - - def parse(self, response): - self.driver.get(response.url) - for quote in self.driver.find_elements_by_css_selector('div.quote'): - yield { - 'quote': quote.find_element_by_css_selector("span.text").text, - 'author': quote.find_element_by_css_selector("small.author").text, - 'tags': [e.text for e in quote.find_elements_by_class_name('tag')], - } - # pagination links are not generated by JS code in this page - next_page_url = response.css("li.next > a::attr(href)").extract_first() - if next_page_url is not None: 
- yield scrapy.Request(response.urljoin(next_page_url)) diff --git a/quotes_crawler/quotes_crawler/spiders/toscrape-xpath.py b/quotes_crawler/quotes_crawler/spiders/toscrape-xpath.py deleted file mode 100644 index 1ec56af..0000000 --- a/quotes_crawler/quotes_crawler/spiders/toscrape-xpath.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy - - -class ToScrapeSpiderXPath(scrapy.Spider): - name = 'toscrape-xpath' - start_urls = [ - 'http://quotes.toscrape.com/', - ] - - def parse(self, response): - for quote in response.xpath('//div[@class="quote"]'): - yield { - 'text': quote.xpath('./span[@class="text"]/text()').extract_first(), - 'author': quote.xpath('.//small[@class="author"]/text()').extract_first(), - 'tags': quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').extract() - } - - next_page_url = response.xpath('//li[@class="next"]/a/@href').extract_first() - if next_page_url is not None: - yield scrapy.Request(response.urljoin(next_page_url)) diff --git a/quotes_crawler/requirements.txt b/quotes_crawler/requirements.txt deleted file mode 100644 index b9e58ba..0000000 --- a/quotes_crawler/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -extruct -js2xml -selenium diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2567afb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +scrapy +boto +extruct +w3lib +jinja2 diff --git a/sc_custom_image/README.md b/sc_custom_image/README.md deleted file mode 100644 index ecdae91..0000000 --- a/sc_custom_image/README.md +++ /dev/null @@ -1,4 +0,0 @@ -## Scrapy Cloud Custom Image - -Sample Scrapy project demonstrating using PhantomJS and -deploying it to Scrapy Cloud using a custom Docker image. diff --git a/sc_custom_image/requirements.txt b/sc_custom_image/requirements.txt deleted file mode 100644 index 7cb6656..0000000 --- a/sc_custom_image/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -selenium diff --git a/sc_custom_image/sc_custom_image/__init__.py b/sc_custom_image/sc_custom_image/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/sc_custom_image/sc_custom_image/items.py b/sc_custom_image/sc_custom_image/items.py deleted file mode 100644 index e3e7af0..0000000 --- a/sc_custom_image/sc_custom_image/items.py +++ /dev/null @@ -1,14 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# http://doc.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class ScCustomImageItem(scrapy.Item): - # define the fields for your item here like: - # name = scrapy.Field() - pass diff --git a/sc_custom_image/sc_custom_image/pipelines.py b/sc_custom_image/sc_custom_image/pipelines.py deleted file mode 100644 index 2c4122a..0000000 --- a/sc_custom_image/sc_custom_image/pipelines.py +++ /dev/null @@ -1,11 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html - - -class ScCustomImagePipeline(object): - def process_item(self, item, spider): - return item diff --git a/sc_custom_image/sc_custom_image/settings.py b/sc_custom_image/sc_custom_image/settings.py deleted file mode 100644 index 6f84f68..0000000 --- a/sc_custom_image/sc_custom_image/settings.py +++ /dev/null @@ -1,90 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for sc_custom_image project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
You can find more settings consulting the documentation: -# -# http://doc.scrapy.org/en/latest/topics/settings.html -# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html -# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'sc_custom_image' - -SPIDER_MODULES = ['sc_custom_image.spiders'] -NEWSPIDER_MODULE = 'sc_custom_image.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'sc_custom_image (+http://www.yourdomain.com)' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'sc_custom_image.middlewares.MyCustomSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'sc_custom_image.middlewares.MyCustomDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html -#ITEM_PIPELINES = { -# 'sc_custom_image.pipelines.SomePipeline': 300, -#} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See http://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/sc_custom_image/sc_custom_image/spiders/__init__.py b/sc_custom_image/sc_custom_image/spiders/__init__.py deleted file mode 100644 index ebd689a..0000000 --- a/sc_custom_image/sc_custom_image/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. 
diff --git a/sc_custom_image/sc_custom_image/spiders/demo.py b/sc_custom_image/sc_custom_image/spiders/demo.py deleted file mode 100644 index b1073ff..0000000 --- a/sc_custom_image/sc_custom_image/spiders/demo.py +++ /dev/null @@ -1,23 +0,0 @@ -import scrapy -from selenium import webdriver - - -class DemoSpider(scrapy.Spider): - name = 'demo' - start_urls = ['http://quotes.toscrape.com/js'] - - def __init__(self, *args, **kwargs): - # XXX: needs phantomjs binary available in PATH - self.driver = webdriver.PhantomJS() - super(DemoSpider, self).__init__(*args, **kwargs) - - def parse(self, response): - self.driver.get(response.url) - for quote in self.driver.find_elements_by_css_selector('div.quote'): - yield { - 'quote': quote.find_element_by_css_selector('span').text, - 'author': quote.find_element_by_css_selector('small').text, - } - next_page_url = response.css('nav li.next a ::attr(href)').extract_first() - if next_page_url: - yield scrapy.Request(response.urljoin(next_page_url)) diff --git a/sc_custom_image/scrapinghub.yml b/sc_custom_image/scrapinghub.yml deleted file mode 100644 index ac21d6a..0000000 --- a/sc_custom_image/scrapinghub.yml +++ /dev/null @@ -1,3 +0,0 @@ -project: PUT_YOUR_PROJECT_ID_HERE -requirements_file: ./requirements.txt -image: true diff --git a/sc_custom_image/scrapy.cfg b/sc_custom_image/scrapy.cfg deleted file mode 100644 index 991d985..0000000 --- a/sc_custom_image/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.org/en/latest/deploy.html - -[settings] -default = sc_custom_image.settings - -[deploy] -#url = http://localhost:6800/ -project = sc_custom_image diff --git a/sc_scripts_demo/bin/check_jobs.py b/sc_scripts_demo/bin/check_jobs.py deleted file mode 100644 index d8a817a..0000000 --- a/sc_scripts_demo/bin/check_jobs.py +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -"""Simple monitor jobs checker for the last 24 hours -""" - -from __future__ import print_function - -import argparse -import os - -import boto -from datetime import datetime -from datetime import timedelta -from scrapinghub import Project, Connection - -# Configure your SES credentials here -AWS_ACCESS_KEY = '' -AWS_SECRET_KEY = '' - -# Configure the Mail-from here -DEFAULT_MAIL_FROM = 'Custom Notification ' - - -def send_email(recipients, subject, body, mail_from=DEFAULT_MAIL_FROM): - """Send an email using AWS Simple Email Service - """ - ses = boto.connect_ses(AWS_ACCESS_KEY, AWS_SECRET_KEY) - ses.send_email(mail_from, subject, body, recipients) - print('Email sent to %s' % ', '.join(recipients)) - - -def parse_date(date_str): - return datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - - -def has_job_error(job): - success_reason = 'no_reason' - return (job.info.get('errors_count', 0) > 0 - or job.info.get('close_reason') != success_reason) - - -def is_job_newer_than(job, since_time): - cancelled_before_starting = ('updated_time' not in job.info - and job.info.get('close_reason') == 'cancelled') - if cancelled_before_starting: - return False - return since_time <= parse_date(job.info['updated_time']) - - -def get_last_24h_jobs(apikey, project_id): - """Fetch jobs that finished in the last 24 hours - """ - project = Project(Connection(apikey), project_id) - since_time = datetime.utcnow() - timedelta(hours=24) - jobs = [ - job for job in project.jobs(state='finished') - if is_job_newer_than(job, since_time) - ] - return jobs - - 
-def render_report(jobs_with_error): - """Build a text report for the jobs with errors - """ - output = [] - for job in jobs_with_error: - errors_count = job.info.get('errors_count', 0) - close_reason = job.info.get('close_reason') - - job_id = job.info["id"].split('/') - url = 'https://app.scrapinghub.com/p/{0}/job/{1}/{2}'.format( - job_id[0], job_id[1], job_id[2]) - - error_message = ['Errors found for job "{0}" ({1}):'.format( - job.info['spider'], url)] - if errors_count > 0: - error_message.append(' There were {} error{}.'.format( - errors_count, '' if errors_count == 1 else 's')) - - success_reasons = ('no_reason', 'finished') - if close_reason not in success_reasons: - error_message.append(' Close reason should not be "{}".'.format( - close_reason)) - output.append('\n'.join(error_message)) - - return '\n\n'.join(output) - - -def main(args): - job_list = get_last_24h_jobs(args.apikey, args.project_id) - jobs_with_errors = [job for job in job_list if has_job_error(job)] - - if jobs_with_errors: - report = render_report(jobs_with_errors) - if args.mail: - subject = 'Scrapy Cloud - jobs with errors' - send_email(args.mail, subject, body=report) - else: - print(report) - else: - print('No errors found.') - - -def parse_args(): - parser = argparse.ArgumentParser(description=__doc__) - - parser.add_argument('--apikey', default=os.getenv('SHUB_APIKEY', None), - help='API key to use for scrapinghub (will fallback ' - 'to SHUB_APIKEY variable)') - parser.add_argument('project_id', type=int, - help='Project ID to get info from.') - parser.add_argument('--mail', action='append', help='Send output as email') - args = parser.parse_args() - - if not args.apikey: - parser.error('Please provide an API key with --apikey option') - return args - - -if '__main__' == __name__: - main(parse_args()) diff --git a/sc_scripts_demo/requirements.txt b/sc_scripts_demo/requirements.txt deleted file mode 100644 index 96cbf0b..0000000 --- a/sc_scripts_demo/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -scrapinghub diff --git a/sc_scripts_demo/sc_scripts_demo/__init__.py b/sc_scripts_demo/sc_scripts_demo/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/sc_scripts_demo/sc_scripts_demo/settings.py b/sc_scripts_demo/sc_scripts_demo/settings.py deleted file mode 100644 index ad7babf..0000000 --- a/sc_scripts_demo/sc_scripts_demo/settings.py +++ /dev/null @@ -1,10 +0,0 @@ -# -*- coding: utf-8 -*- - -BOT_NAME = 'sc_scripts_demo' - -SPIDER_MODULES = ['sc_scripts_demo.spiders'] -NEWSPIDER_MODULE = 'sc_scripts_demo.spiders' - -USER_AGENT = 'sc_scripts_demo (http://scrapinghub.com)' - -ROBOTSTXT_OBEY = True diff --git a/sc_scripts_demo/sc_scripts_demo/spiders/__init__.py b/sc_scripts_demo/sc_scripts_demo/spiders/__init__.py deleted file mode 100644 index ebd689a..0000000 --- a/sc_scripts_demo/sc_scripts_demo/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. 
diff --git a/sc_scripts_demo/sc_scripts_demo/spiders/bad_spider.py b/sc_scripts_demo/sc_scripts_demo/spiders/bad_spider.py deleted file mode 100644 index 6b45c75..0000000 --- a/sc_scripts_demo/sc_scripts_demo/spiders/bad_spider.py +++ /dev/null @@ -1,12 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy - - -class SpiderWithErrors(scrapy.Spider): - name = "bad" - start_urls = [ - 'http://quotes.toscrape.com/', - ] - - def parse(self, response): - raise ValueError('Oops, this spider has errors') diff --git a/sc_scripts_demo/sc_scripts_demo/spiders/good_spider.py b/sc_scripts_demo/sc_scripts_demo/spiders/good_spider.py deleted file mode 100644 index 4d6f156..0000000 --- a/sc_scripts_demo/sc_scripts_demo/spiders/good_spider.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy - - -class NiceWorkingSpider(scrapy.Spider): - name = "good" - start_urls = [ - 'http://quotes.toscrape.com/', - ] - - def parse(self, response): - for quote in response.css("div.quote"): - yield { - 'text': quote.css("span.text::text").extract_first(), - 'author': quote.css("small.author::text").extract_first(), - 'tags': quote.css("div.tags > a.tag::text").extract() - } - - next_page_url = response.css("li.next > a::attr(href)").extract_first() - if next_page_url is not None: - yield scrapy.Request(response.urljoin(next_page_url)) diff --git a/sc_scripts_demo/scrapinghub.yml b/sc_scripts_demo/scrapinghub.yml deleted file mode 100644 index 196de16..0000000 --- a/sc_scripts_demo/scrapinghub.yml +++ /dev/null @@ -1,3 +0,0 @@ -projects: - default: 105217 -requirements_file: requirements.txt diff --git a/sc_scripts_demo/scrapy.cfg b/sc_scripts_demo/scrapy.cfg deleted file mode 100644 index de78200..0000000 --- a/sc_scripts_demo/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.org/en/latest/deploy.html - -[settings] -default = sc_scripts_demo.settings - -[deploy] -#url = http://localhost:6800/ -project = sc_scripts_demo diff --git a/sc_scripts_demo/setup.py b/sc_scripts_demo/setup.py deleted file mode 100644 index 0412d07..0000000 --- a/sc_scripts_demo/setup.py +++ /dev/null @@ -1,14 +0,0 @@ -from setuptools import setup, find_packages - - -setup( - name='sc_scripts_demo', - version='1.0', - packages=find_packages(), - scripts=[ - 'bin/check_jobs.py', - ], - entry_points={ - 'scrapy': ['settings = sc_scripts_demo.settings'], - }, -) diff --git a/splash_crawlera_example/scrapinghub.yml b/scrapinghub.yml similarity index 62% rename from splash_crawlera_example/scrapinghub.yml rename to scrapinghub.yml index 745b37f..7a8527c 100644 --- a/splash_crawlera_example/scrapinghub.yml +++ b/scrapinghub.yml @@ -1,3 +1,3 @@ requirements_file: requirements.txt stacks: - default: scrapy:1.3-py3 + default: scrapy:1.1-py3 diff --git a/quotes_crawler/scrapy.cfg b/scrapy.cfg similarity index 78% rename from quotes_crawler/scrapy.cfg rename to scrapy.cfg index e62509b..d34a107 100644 --- a/quotes_crawler/scrapy.cfg +++ b/scrapy.cfg @@ -4,8 +4,8 @@ # https://scrapyd.readthedocs.org/en/latest/deploy.html [settings] -default = quotes_crawler.settings +default = price_monitor.settings [deploy] #url = http://localhost:6800/ -project = quotes_crawler +project = price_monitor diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..3e0698a --- /dev/null +++ b/setup.py @@ -0,0 +1,12 @@ +# Automatically created by: shub deploy + +from setuptools import setup, find_packages + +setup( + 
name='project', + version='1.0', + packages=find_packages(), + package_data={'price_monitor': ['resources/*.json', 'templates/*.html']}, + scripts=['bin/monitor.py'], + entry_points={'scrapy': ['settings = price_monitor.settings']}, +) diff --git a/splash_based_project/scrapy.cfg b/splash_based_project/scrapy.cfg deleted file mode 100644 index b038630..0000000 --- a/splash_based_project/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.org/en/latest/deploy.html - -[settings] -default = splash_based_project.settings - -[deploy] -#url = http://localhost:6800/ -project = splash_based_project diff --git a/splash_based_project/splash_based_project/__init__.py b/splash_based_project/splash_based_project/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/splash_based_project/splash_based_project/settings.py b/splash_based_project/splash_based_project/settings.py deleted file mode 100644 index 4ce3c47..0000000 --- a/splash_based_project/splash_based_project/settings.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- - -# Scrapy settings for splash_based_project project - -BOT_NAME = 'splash_based_project' -SPIDER_MODULES = ['splash_based_project.spiders'] -NEWSPIDER_MODULE = 'splash_based_project.spiders' - -# Splash settings -SPLASH_URL = '' # <-- Splash instance URL from Scrapy Cloud -APIKEY = '' # <-- your API key -SPIDER_MIDDLEWARES = { - 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, -} -DOWNLOADER_MIDDLEWARES = { - 'scrapy_splash.SplashCookiesMiddleware': 723, - 'scrapy_splash.SplashMiddleware': 725, - 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, -} -DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' diff --git a/splash_based_project/splash_based_project/spiders/__init__.py b/splash_based_project/splash_based_project/spiders/__init__.py deleted file mode 100644 index ebd689a..0000000 --- a/splash_based_project/splash_based_project/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. diff --git a/splash_based_project/splash_based_project/spiders/quotes-js-1.py b/splash_based_project/splash_based_project/spiders/quotes-js-1.py deleted file mode 100644 index bc9ec4d..0000000 --- a/splash_based_project/splash_based_project/spiders/quotes-js-1.py +++ /dev/null @@ -1,27 +0,0 @@ -import scrapy -from scrapy_splash import SplashRequest - - -class QuotesJs1Spider(scrapy.Spider): - """Example spider using Splash to render JavaScript-based pages. - Make sure you configure settings.py according to your Splash - credentials (available on Scrapy Cloud). 
- """ - - name = 'quotes-js-1' - http_user = '' # <-- your API key goes here - - def start_requests(self): - yield SplashRequest('http://quotes.toscrape.com/js') - - def parse(self, response): - for quote in response.css('div.quote'): - yield { - 'text': quote.css('span.text::text').extract_first(), - 'author': quote.css('span small::text').extract_first(), - 'tags': quote.css('div.tags a.tag::text').extract(), - } - - next_page = response.css('li.next > a::attr(href)').extract_first() - if next_page: - yield SplashRequest(response.urljoin(next_page)) diff --git a/splash_based_project/splash_based_project/spiders/quotes-js-2.py b/splash_based_project/splash_based_project/spiders/quotes-js-2.py deleted file mode 100644 index ab9606a..0000000 --- a/splash_based_project/splash_based_project/spiders/quotes-js-2.py +++ /dev/null @@ -1,36 +0,0 @@ -import scrapy -from scrapy_splash import SplashRequest -from w3lib.http import basic_auth_header - - -class QuotesJs2Spider(scrapy.Spider): - """Example spider using Splash to render JavaScript-based pages. - Make sure you configure settings.py with your Splash - credentials (available on Scrapy Cloud). - """ - name = 'quotes-js-2' - - def start_requests(self): - yield SplashRequest( - 'http://quotes.toscrape.com/js', - splash_headers={ - 'Authorization': basic_auth_header(self.settings['APIKEY'], ''), - }, - ) - - def parse(self, response): - for quote in response.css('div.quote'): - yield { - 'text': quote.css('span.text::text').extract_first(), - 'author': quote.css('span small::text').extract_first(), - 'tags': quote.css('div.tags a.tag::text').extract(), - } - - next_page = response.css('li.next > a::attr(href)').extract_first() - if next_page: - yield SplashRequest( - response.urljoin(next_page), - splash_headers={ - 'Authorization': basic_auth_header(self.settings['APIKEY'], ''), - }, - ) diff --git a/splash_crawlera_example/README.md b/splash_crawlera_example/README.md deleted file mode 100644 index 1f1f95f..0000000 --- a/splash_crawlera_example/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# Splash + Crawlera Example Project - -This example project shows how to use [Crawlera](http://scrapinghub.com/crawlera) -(a smart downloader) and [Splash](https://scrapinghub.com/splash) (a JavaScript -rendering service) with Scrapy spiders. - - -## How does it work? - -The integration between Splash and Crawlera is done by a -[Lua script](https://github.com/scrapinghub/sample-projects/blob/master/splash_crawlera_example/splash_crawlera_example/scripts/crawlera.lua) -that is sent to Splash with every request created by the spider. This script configures -Splash to use Crawlera as its proxy and also defines a couple rules to avoid doing -useless requests, such as analytics ones, stylesheets, images, etc. - - -## What do I need to run this project? 
- -Here's what you'll need: - -- a Splash instance and a Crawlera account: you can get both via Scrapy Cloud billing page - - you can also run Splash in your own machine following the [instructions here](http://splash.readthedocs.io/en/stable/install.html) -- set your Splash settings this project's [settings.py](https://github.com/scrapinghub/sample-projects/blob/master/splash_crawlera_example/splash_crawlera_example/settings.py) -file: - - `SPLASH_URL`: the URL where your Splash instance is available - - `SPLASH_APIKEY`: your Splash API key (required if you're using an instance from Scrapy Cloud) -- set your Crawlera settings in the same file: - - `CRAWLERA_APIKEY`: the API key for your Crawlera user diff --git a/splash_crawlera_example/requirements.txt b/splash_crawlera_example/requirements.txt deleted file mode 100644 index d15d2d9..0000000 --- a/splash_crawlera_example/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -scrapy-splash diff --git a/splash_crawlera_example/scrapy.cfg b/splash_crawlera_example/scrapy.cfg deleted file mode 100644 index 125cb7c..0000000 --- a/splash_crawlera_example/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.org/en/latest/deploy.html - -[settings] -default = splash_crawlera_example.settings - -[deploy] -#url = http://localhost:6800/ -project = splash_crawlera_example diff --git a/splash_crawlera_example/setup.py b/splash_crawlera_example/setup.py deleted file mode 100644 index 0e8e841..0000000 --- a/splash_crawlera_example/setup.py +++ /dev/null @@ -1,10 +0,0 @@ -# Automatically created by: shub deploy -from setuptools import setup, find_packages - -setup( - name = 'project', - version = '1.0', - packages = find_packages(), - package_data = {'splash_crawlera_example': ['scripts/*.lua',]}, - entry_points = {'scrapy': ['settings = splash_crawlera_example.settings']}, -) diff --git a/splash_crawlera_example/splash_crawlera_example/__init__.py b/splash_crawlera_example/splash_crawlera_example/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/splash_crawlera_example/splash_crawlera_example/scripts/crawlera.lua b/splash_crawlera_example/splash_crawlera_example/scripts/crawlera.lua deleted file mode 100644 index de11a28..0000000 --- a/splash_crawlera_example/splash_crawlera_example/scripts/crawlera.lua +++ /dev/null @@ -1,49 +0,0 @@ -function use_crawlera(splash) - -- Make sure you pass your Crawlera API key in the 'crawlera_user' arg. - -- Have a look at the file spiders/quotes-js.py to see how to do it. - -- Find your Crawlera credentials in https://app.scrapinghub.com/ - local user = splash.args.crawlera_user - - local host = 'proxy.crawlera.com' - local port = 8010 - local session_header = 'X-Crawlera-Session' - local session_id = 'create' - - splash:on_request(function (request) - -- The commented code below can be used to speed up the crawling - -- process. They filter requests to undesired domains and useless - -- resources. Uncomment the ones that make sense to your use case - -- and add your own rules. - - -- Discard requests to advertising and tracking domains. - -- if string.find(request.url, 'doubleclick%.net') or - -- string.find(request.url, 'analytics%.google%.com') then - -- request.abort() - -- return - -- end - - -- Avoid using Crawlera for subresources fetching to increase crawling - -- speed. The example below avoids using Crawlera for URLS starting - -- with 'static.' 
and the ones ending with '.png'. - -- if string.find(request.url, '://static%.') ~= nil or - -- string.find(request.url, '%.png$') ~= nil then - -- return - -- end - - request:set_header('X-Crawlera-Cookies', 'disable') - request:set_header(session_header, session_id) - request:set_proxy{host, port, username=user, password=''} - end) - - splash:on_response_headers(function (response) - if type(response.headers[session_header]) ~= nil then - session_id = response.headers[session_header] - end - end) -end - -function main(splash) - use_crawlera(splash) - splash:go(splash.args.url) - return splash:html() -end diff --git a/splash_crawlera_example/splash_crawlera_example/settings.py b/splash_crawlera_example/splash_crawlera_example/settings.py deleted file mode 100644 index 94bccb2..0000000 --- a/splash_crawlera_example/splash_crawlera_example/settings.py +++ /dev/null @@ -1,22 +0,0 @@ -# -*- coding: utf-8 -*- - -BOT_NAME = 'splash_crawlera_example' -SPIDER_MODULES = ['splash_crawlera_example.spiders'] -NEWSPIDER_MODULE = 'splash_crawlera_example.spiders' - -SPIDER_MIDDLEWARES = { - 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, -} - -DOWNLOADER_MIDDLEWARES = { - 'scrapy_splash.SplashCookiesMiddleware': 723, - 'scrapy_splash.SplashMiddleware': 725, - 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, -} - -CRAWLERA_APIKEY = '' # Your crawlera API key - -# Splash settings -SPLASH_URL = '' # Splash instance URL from Scrapy Cloud -SPLASH_APIKEY = '' # Your API key for the Splash instance hosted on Scrapy Cloud -DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' diff --git a/splash_crawlera_example/splash_crawlera_example/spiders/__init__.py b/splash_crawlera_example/splash_crawlera_example/spiders/__init__.py deleted file mode 100644 index ebd689a..0000000 --- a/splash_crawlera_example/splash_crawlera_example/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. 
diff --git a/splash_crawlera_example/splash_crawlera_example/spiders/quotes-js.py b/splash_crawlera_example/splash_crawlera_example/spiders/quotes-js.py deleted file mode 100644 index 3eb2f77..0000000 --- a/splash_crawlera_example/splash_crawlera_example/spiders/quotes-js.py +++ /dev/null @@ -1,54 +0,0 @@ -from pkgutil import get_data -import scrapy -from scrapy_splash import SplashRequest -from w3lib.http import basic_auth_header - - -class QuotesJsSpider(scrapy.Spider): - name = 'quotes-js' - - def __init__(self, *args, **kwargs): - # to be able to load the Lua script on Scrapy Cloud, make sure your - # project's setup.py file contains the "package_data" setting, similar - # to this project's setup.py - self.LUA_SOURCE = get_data( - 'splash_crawlera_example', 'scripts/crawlera.lua' - ).decode('utf-8') - super(QuotesJsSpider, self).__init__(*args, **kwargs) - - def start_requests(self): - yield SplashRequest( - url='http://quotes.toscrape.com/js/', - endpoint='execute', - splash_headers={ - 'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''), - }, - args={ - 'lua_source': self.LUA_SOURCE, - 'crawlera_user': self.settings['CRAWLERA_APIKEY'], - }, - # tell Splash to cache the lua script, to avoid sending it for every request - cache_args=['lua_source'], - ) - - def parse(self, response): - for quote in response.css('div.quote'): - yield { - 'text': quote.css('span.text::text').extract_first(), - 'author': quote.css('span small::text').extract_first(), - 'tags': quote.css('div.tags a.tag::text').extract(), - } - next_page = response.css('li.next > a::attr(href)').extract_first() - if next_page: - yield SplashRequest( - url=response.urljoin(next_page), - endpoint='execute', - splash_headers={ - 'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''), - }, - args={ - 'lua_source': self.LUA_SOURCE, - 'crawlera_user': self.settings['CRAWLERA_APIKEY'], - }, - cache_args=['lua_source'], - ) From dd6646b07919788199245e1f3ba9c6faa4eb0ed1 Mon Sep 17 00:00:00 2001 From: rafrox <37214120+rafrox@users.noreply.github.com> Date: Mon, 12 Mar 2018 00:06:31 +1100 Subject: [PATCH 3/8] Update amazon.py --- price_monitor/spiders/amazon.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/price_monitor/spiders/amazon.py b/price_monitor/spiders/amazon.py index c1258de..fc1d256 100644 --- a/price_monitor/spiders/amazon.py +++ b/price_monitor/spiders/amazon.py @@ -7,10 +7,10 @@ class AmazonSpider(BaseSpider): def parse(self, response): for product in response.css(".view-advanced-catalog tr > td"): - item = {} - item['title'] = product.css(".views-field-title a ::text").extract_first() - item['price'] = product.css(".views-field-phpcode span span::text").extract()[1] - item['url'] = product.css(".views-field-title a::attr(href)").extract() + item = {} + item['title'] = product.css(".views-field-title a ::text").extract_first() + item['price'] = product.css(".views-field-phpcode span span::text").extract()[1] + item['url'] = product.css(".views-field-title a::attr(href)").extract() yield item next_page = response.css('li.pager-nexta::attr(href)').extract_first() From b853d797356e4154f7d88fc3fdf412da389bf938 Mon Sep 17 00:00:00 2001 From: rafrox <37214120+rafrox@users.noreply.github.com> Date: Mon, 12 Mar 2018 00:09:20 +1100 Subject: [PATCH 4/8] Update amazon.py --- price_monitor/spiders/amazon.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/price_monitor/spiders/amazon.py b/price_monitor/spiders/amazon.py index fc1d256..21ab5b3 
100644 --- a/price_monitor/spiders/amazon.py +++ b/price_monitor/spiders/amazon.py @@ -6,11 +6,10 @@ class AmazonSpider(BaseSpider): def parse(self, response): for product in response.css(".view-advanced-catalog tr > td"): - - item = {} - item['title'] = product.css(".views-field-title a ::text").extract_first() - item['price'] = product.css(".views-field-phpcode span span::text").extract()[1] - item['url'] = product.css(".views-field-title a::attr(href)").extract() + item = {} + item ['title'] = product.css(".views-field-title a ::text").extract_first() + item ['price'] = product.css(".views-field-phpcode span span::text").extract()[1] + item ['url'] = product.css(".views-field-title a::attr(href)").extract() yield item next_page = response.css('li.pager-nexta::attr(href)').extract_first() From c70c14e03acb79c83227598e614231ce41810486 Mon Sep 17 00:00:00 2001 From: rafrox <37214120+rafrox@users.noreply.github.com> Date: Mon, 12 Mar 2018 00:15:12 +1100 Subject: [PATCH 5/8] Update amazon.py --- price_monitor/spiders/amazon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/price_monitor/spiders/amazon.py b/price_monitor/spiders/amazon.py index 21ab5b3..d8daa42 100644 --- a/price_monitor/spiders/amazon.py +++ b/price_monitor/spiders/amazon.py @@ -15,4 +15,4 @@ def parse(self, response): next_page = response.css('li.pager-nexta::attr(href)').extract_first() if next_page is not None: next_page = response.urljoin(next_page) - yield scrapy.Request(next_page, callback=self.parse) + From b96d23ec7caf4db32d955caa717df0c2d51689ae Mon Sep 17 00:00:00 2001 From: rafrox <37214120+rafrox@users.noreply.github.com> Date: Mon, 12 Mar 2018 00:17:14 +1100 Subject: [PATCH 6/8] Update amazon.py --- price_monitor/spiders/amazon.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/price_monitor/spiders/amazon.py b/price_monitor/spiders/amazon.py index d8daa42..6e2b41e 100644 --- a/price_monitor/spiders/amazon.py +++ b/price_monitor/spiders/amazon.py @@ -11,8 +11,3 @@ def parse(self, response): item ['price'] = product.css(".views-field-phpcode span span::text").extract()[1] item ['url'] = product.css(".views-field-title a::attr(href)").extract() yield item - - next_page = response.css('li.pager-nexta::attr(href)').extract_first() - if next_page is not None: - next_page = response.urljoin(next_page) - From 0a0c2c99bc25fb6ecd2f7c2afb12149be1dc8447 Mon Sep 17 00:00:00 2001 From: rafrox <37214120+rafrox@users.noreply.github.com> Date: Mon, 12 Mar 2018 00:18:24 +1100 Subject: [PATCH 7/8] Delete bestbuy.py --- price_monitor/spiders/bestbuy.py | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 price_monitor/spiders/bestbuy.py diff --git a/price_monitor/spiders/bestbuy.py b/price_monitor/spiders/bestbuy.py deleted file mode 100644 index 03c49f6..0000000 --- a/price_monitor/spiders/bestbuy.py +++ /dev/null @@ -1,14 +0,0 @@ -from .base_spider import BaseSpider - - -class BestbuySpider(BaseSpider): - name = "bestbuy.com" - - def parse(self, response): - item = response.meta.get('item', {}) - item['url'] = response.url - item['title'] = response.css("div#sku-title > h1 ::text").extract_first().strip() - item['price'] = float( - response.css('div.price-block ::attr(data-customer-price)').extract_first(default=0) - ) - yield item From 156ab16efbc832696b22086f462298a0d4974a97 Mon Sep 17 00:00:00 2001 From: rafrox <37214120+rafrox@users.noreply.github.com> Date: Mon, 12 Mar 2018 00:19:48 +1100 Subject: [PATCH 8/8] Delete ebay.py --- price_monitor/spiders/ebay.py | 17 ----------------- 1 
file changed, 17 deletions(-) delete mode 100644 price_monitor/spiders/ebay.py diff --git a/price_monitor/spiders/ebay.py b/price_monitor/spiders/ebay.py deleted file mode 100644 index 7721fa6..0000000 --- a/price_monitor/spiders/ebay.py +++ /dev/null @@ -1,17 +0,0 @@ -from extruct.w3cmicrodata import MicrodataExtractor -from .base_spider import BaseSpider - - -class EbaySpider(BaseSpider): - name = "ebay.com" - - def parse(self, response): - extractor = MicrodataExtractor() - properties = extractor.extract(response.body_as_unicode()).get('items')[0].get('properties', {}) - item = response.meta.get('item', {}) - item['url'] = response.url - item['title'] = properties.get('name').replace('Details about', '').strip() - item['price'] = float( - properties.get('offers', {}).get('properties', {}).get('price', 0) - ) - yield item