rafchanges #5

Open · wants to merge 8 commits into master
119 changes: 119 additions & 0 deletions bin/monitor.py
@@ -0,0 +1,119 @@
"""Simple price monitor built with Scrapy and Scrapy Cloud
"""
import argparse
import os
from datetime import datetime, timedelta

import boto
from hubstorage import HubstorageClient
from jinja2 import Environment, PackageLoader
from price_monitor import settings
from price_monitor.utils import get_product_names, get_retailers_for_product
from w3lib.html import remove_tags

jinja_env = Environment(loader=PackageLoader('price_monitor', 'templates'))


class DealsChecker(object):

def __init__(self, latest_deals, previous_deals, price_threshold=0):
self.price_threshold = price_threshold
self.latest_deals = latest_deals
self.previous_deals = previous_deals

def is_from_latest_crawl(self, deal):
"""Checks whether the given deal is from the most recent execution.
"""
return deal in self.latest_deals

def get_best_deal(self):
"""Returns the item with the best overall price. self.price_threshold can be set to avoid
considering minor price drops.
"""
best_so_far = min(self.previous_deals, key=lambda x: x.get('price'))
best_from_last = min(self.latest_deals, key=lambda x: x.get('price'))
if best_from_last.get('price') + self.price_threshold < best_so_far.get('price'):
return best_from_last
else:
return best_so_far


class DealsFetcher(object):

def __init__(self, product_name, apikey, project_id, hours):
self.product_name = product_name
project = HubstorageClient(apikey).get_project(project_id)
self.item_store = project.collections.new_store(product_name)
self.load_items_from_last_n_hours(hours)

def load_items_from_last_n_hours(self, n=24):
"""Load items from the last n hours, from the newest to the oldest.
"""
since_time = int((datetime.now() - timedelta(hours=n)).timestamp() * 1000)
self.deals = [item.get('value') for item in self.fetch_deals_newer_than(since_time)]

def fetch_deals_newer_than(self, since_time):
return list(self.item_store.get(meta=['_key', '_ts'], startts=since_time))

def get_latest_deal_from_retailer(self, retailer):
"""Returns the most recently extracted deal from a given retailer.
"""
        for deal in self.deals:
            if retailer in deal.get('url'):
                return deal

def get_deals(self):
"""Returns a tuple with (deals from latest crawl, deals from previous crawls)
"""
latest_deals = [
self.get_latest_deal_from_retailer(retailer)
for retailer in get_retailers_for_product(self.product_name)
]
previous_deals = [
deal for deal in self.deals if deal not in latest_deals
]
return latest_deals, previous_deals


def send_email_alert(items):
ses = boto.connect_ses(settings.AWS_ACCESS_KEY, settings.AWS_SECRET_KEY)
html_body = jinja_env.get_template('email.html').render(items=items)

ses.send_email(
settings.EMAIL_ALERT_FROM,
'Price drop alert',
remove_tags(html_body),
settings.EMAIL_ALERT_TO,
html_body=html_body
)


def main(args):
items = []
for prod_name in get_product_names():
fetcher = DealsFetcher(prod_name, args.apikey, args.project, args.days * 24)
checker = DealsChecker(*fetcher.get_deals(), args.threshold)
best_deal = checker.get_best_deal()
if checker.is_from_latest_crawl(best_deal):
items.append(best_deal)

if items:
send_email_alert(items)


def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--apikey', default=settings.SHUB_KEY or os.getenv('SHUB_KEY'),
                        help='API key for Scrapinghub (falls back to the SHUB_KEY environment variable)')
    parser.add_argument('--days', type=int, default=1,
                        help='how many days of price history to compare against')
    parser.add_argument('--threshold', type=float, default=0,
                        help='minimum price drop required before an alert is raised')
parser.add_argument('--project', type=int, default=settings.SHUB_PROJ_ID,
help='Project ID to get info from')

return parser.parse_args()


if __name__ == '__main__':
main(parse_args())
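For reviewers, a minimal sketch of how DealsChecker applies the threshold, using the class above with made-up prices and URLs:

latest = [{'price': 95.0, 'url': 'http://www.milsims.com.au/catalog/1746/'}]
previous = [{'price': 100.0, 'url': 'http://www.milsims.com.au/catalog/1746/'}]

checker = DealsChecker(latest, previous, price_threshold=2.0)
best = checker.get_best_deal()  # 95.0 + 2.0 < 100.0, so the fresh deal wins
assert checker.is_from_latest_crawl(best)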
File renamed without changes.
@@ -8,7 +8,7 @@
import scrapy


-class QuotesCrawlerItem(scrapy.Item):
+class PriceMonitorItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
21 changes: 21 additions & 0 deletions price_monitor/pipelines.py
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
from price_monitor import settings
from hubstorage import HubstorageClient
from price_monitor.utils import reversed_timestamp, get_product_names


class CollectionStoragePipeline(object):

def open_spider(self, spider):
client = HubstorageClient(auth=settings.SHUB_KEY)
project = client.get_project(settings.SHUB_PROJ_ID)
self.data_stores = {}
for product_name in get_product_names():
self.data_stores[product_name] = project.collections.new_store(product_name)

def process_item(self, item, spider):
key = "{}-{}-{}".format(
reversed_timestamp(), item.get('product_name'), item.get('retailer')
)
self.data_stores[item['product_name']].set({'_key': key, 'value': item})
return item
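The key prefix comes from reversed_timestamp() (see price_monitor/utils.py below), so newer items get numerically smaller keys and come back first when fetching from the collection. A standalone sketch of that ordering:

from datetime import datetime

def reversed_timestamp():
    return str((datetime(5000, 1, 1) - datetime.now()).total_seconds())

earlier = reversed_timestamp()
later = reversed_timestamp()
# the later call yields a number no larger than the earlier one
assert float(later) <= float(earlier)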
5 changes: 5 additions & 0 deletions price_monitor/resources/urls.json
@@ -0,0 +1,5 @@
{
    "On Sale Mil": [
        "http://www.milsims.com.au/catalog/1746/"
    ]
}
27 changes: 27 additions & 0 deletions price_monitor/settings.py
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
import os

BOT_NAME = 'price_monitor'
SPIDER_MODULES = ['price_monitor.spiders']
NEWSPIDER_MODULE = 'price_monitor.spiders'

ROBOTSTXT_OBEY = True

SHUB_KEY = os.getenv('SHUB_KEY')
# when running locally, replace '291701' below with your Scrapy Cloud project ID
SHUB_PROJ_ID = os.getenv('SHUB_JOBKEY', '291701').split('/')[0]


# settings for Amazon SES email service
AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY')
AWS_SECRET_KEY = os.getenv('AWS_SECRET_KEY')
EMAIL_ALERT_FROM = 'Price Monitor <[email protected]>'
EMAIL_ALERT_TO = ['[email protected]']

# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'price_monitor.pipelines.CollectionStoragePipeline': 400,
}

AUTOTHROTTLE_ENABLED = True
# HTTPCACHE_ENABLED = True
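On Scrapy Cloud, SHUB_JOBKEY has the form '<project>/<spider>/<job>', so splitting on '/' recovers the project ID; the job key below is illustrative:

assert '291701/2/15'.split('/')[0] == '291701'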
13 changes: 13 additions & 0 deletions price_monitor/spiders/amazon.py
@@ -0,0 +1,13 @@
from .base_spider import BaseSpider


class AmazonSpider(BaseSpider):
    name = "milsims.com"

    def parse(self, response):
        for product in response.css(".view-advanced-catalog tr > td"):
            # extend the metadata (product_name, retailer, when) set by BaseSpider,
            # which the pipeline and monitor rely on
            item = response.meta['item'].copy()
            item['title'] = product.css(".views-field-title a ::text").extract_first()
            item['price'] = product.css(".views-field-phpcode span span::text").extract()[1]
            item['url'] = response.urljoin(
                product.css(".views-field-title a::attr(href)").extract_first())
            yield item
16 changes: 16 additions & 0 deletions price_monitor/spiders/base_spider.py
@@ -0,0 +1,16 @@
import json
import pkgutil
import scrapy
from datetime import datetime


class BaseSpider(scrapy.Spider):

def start_requests(self):
products = json.loads(pkgutil.get_data('price_monitor', 'resources/urls.json').decode())
for name, urls in products.items():
for url in urls:
if self.name in url:
now = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
item = {'product_name': name, 'retailer': self.name, 'when': now}
yield scrapy.Request(url, meta={'item': item})
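The spider's name doubles as a URL filter here: a request is only issued when the name occurs in the URL. A small sketch using the resources/urls.json contents from this PR:

import json

products = json.loads('{"On Sale Mil": ["http://www.milsims.com.au/catalog/1746/"]}')
name = "milsims.com"
matches = [url for urls in products.values() for url in urls if name in url]
assert matches == ["http://www.milsims.com.au/catalog/1746/"]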
14 changes: 14 additions & 0 deletions price_monitor/templates/email.html
@@ -0,0 +1,14 @@
<h1>🎉 Hey, we found a good deal! 🎁</h1>
<table border="1">

{% for item in items %}
<tr><td>
<p><strong>Product:</strong> {{item.title}}</p>
<p><strong>Price:</strong> {{item.price}}</p>
<p><strong>Store:</strong> {{item.retailer}}</p>
<p><strong>Price obtained at:</strong> {{item.when}}</p>
<p>Visit the product page at {{item.retailer}}: <a href="{{item.url}}">{{item.url}}</a></p>
</td></tr>
{% endfor %}
</table>
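For reference, this is how bin/monitor.py renders the template; the item fields below are made-up values:

from jinja2 import Environment, PackageLoader

jinja_env = Environment(loader=PackageLoader('price_monitor', 'templates'))
html_body = jinja_env.get_template('email.html').render(items=[{
    'title': 'Some Game', 'price': '$49.95', 'retailer': 'milsims.com.au',
    'when': '2016/09/01 10:00:00', 'url': 'http://www.milsims.com.au/catalog/1746/',
}])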

35 changes: 35 additions & 0 deletions price_monitor/utils.py
@@ -0,0 +1,35 @@
import json
import pkgutil
from datetime import datetime, timedelta


def timestamp_from_reversed(reversed_ts):
    return datetime(5000, 1, 1) - timedelta(seconds=float(reversed_ts))


def reversed_timestamp():
return str((datetime(5000, 1, 1) - datetime.now()).total_seconds())


def normalize_name(name):
return name.replace('-', '')


def get_product_names():
return [
normalize_name(name)
for name in json.loads(
pkgutil.get_data("price_monitor", "resources/urls.json").decode()
).keys()
]


def get_retailer_name_from_url(url):
return url.split("://")[1].split("/")[0].replace("www.", "")


def get_retailers_for_product(product_name):
data = json.loads(
pkgutil.get_data("price_monitor", "resources/urls.json").decode()
)
return {get_retailer_name_from_url(url) for url in data[product_name]}
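A quick check of get_retailer_name_from_url against the URL in resources/urls.json:

assert get_retailer_name_from_url(
    "http://www.milsims.com.au/catalog/1746/") == "milsims.com.au"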
13 changes: 0 additions & 13 deletions quotes_crawler/README.md

This file was deleted.

11 changes: 0 additions & 11 deletions quotes_crawler/quotes_crawler/pipelines.py

This file was deleted.

90 changes: 0 additions & 90 deletions quotes_crawler/quotes_crawler/settings.py

This file was deleted.
