rafchanges #5

Open · wants to merge 8 commits into master
119 changes: 119 additions & 0 deletions bin/monitor.py
@@ -0,0 +1,119 @@
"""Simple price monitor built with Scrapy and Scrapy Cloud
"""
import argparse
import os
from datetime import datetime, timedelta

import boto
from hubstorage import HubstorageClient
from jinja2 import Environment, PackageLoader
from price_monitor import settings
from price_monitor.utils import get_product_names, get_retailers_for_product
from w3lib.html import remove_tags

jinja_env = Environment(loader=PackageLoader('price_monitor', 'templates'))


class DealsChecker(object):

def __init__(self, latest_deals, previous_deals, price_threshold=0):
self.price_threshold = price_threshold
self.latest_deals = latest_deals
self.previous_deals = previous_deals

def is_from_latest_crawl(self, deal):
"""Checks whether the given deal is from the most recent execution.
"""
return deal in self.latest_deals

def get_best_deal(self):
"""Returns the item with the best overall price. self.price_threshold can be set to avoid
considering minor price drops.
"""
best_so_far = min(self.previous_deals, key=lambda x: x.get('price'))
best_from_last = min(self.latest_deals, key=lambda x: x.get('price'))
if best_from_last.get('price') + self.price_threshold < best_so_far.get('price'):
return best_from_last
else:
return best_so_far


class DealsFetcher(object):

def __init__(self, product_name, apikey, project_id, hours):
self.product_name = product_name
project = HubstorageClient(apikey).get_project(project_id)
self.item_store = project.collections.new_store(product_name)
self.load_items_from_last_n_hours(hours)

def load_items_from_last_n_hours(self, n=24):
"""Load items from the last n hours, from the newest to the oldest.
"""
since_time = int((datetime.now() - timedelta(hours=n)).timestamp() * 1000)
self.deals = [item.get('value') for item in self.fetch_deals_newer_than(since_time)]

def fetch_deals_newer_than(self, since_time):
return list(self.item_store.get(meta=['_key', '_ts'], startts=since_time))

def get_latest_deal_from_retailer(self, retailer):
"""Returns the most recently extracted deal from a given retailer.
"""
        for deal in self.deals:
            if retailer in deal.get('url'):
                return deal

def get_deals(self):
"""Returns a tuple with (deals from latest crawl, deals from previous crawls)
"""
latest_deals = [
self.get_latest_deal_from_retailer(retailer)
for retailer in get_retailers_for_product(self.product_name)
]
previous_deals = [
deal for deal in self.deals if deal not in latest_deals
]
return latest_deals, previous_deals


def send_email_alert(items):
ses = boto.connect_ses(settings.AWS_ACCESS_KEY, settings.AWS_SECRET_KEY)
html_body = jinja_env.get_template('email.html').render(items=items)

ses.send_email(
settings.EMAIL_ALERT_FROM,
'Price drop alert',
remove_tags(html_body),
settings.EMAIL_ALERT_TO,
html_body=html_body
)


def main(args):
items = []
for prod_name in get_product_names():
fetcher = DealsFetcher(prod_name, args.apikey, args.project, args.days * 24)
checker = DealsChecker(*fetcher.get_deals(), args.threshold)
best_deal = checker.get_best_deal()
if checker.is_from_latest_crawl(best_deal):
items.append(best_deal)

if items:
send_email_alert(items)


def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--apikey', default=settings.SHUB_KEY or os.getenv('SHUB_KEY'),
                        help='API key for Scrapinghub (falls back to the SHUB_KEY environment variable)')
    parser.add_argument('--days', type=int, default=1,
                        help='how many days of price history to compare against')
    parser.add_argument('--threshold', type=float, default=0,
                        help='minimum price drop required before an alert is raised')
parser.add_argument('--project', type=int, default=settings.SHUB_PROJ_ID,
help='Project ID to get info from')

return parser.parse_args()


if __name__ == '__main__':
main(parse_args())
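For reviewers, a minimal sketch of how DealsChecker applies the threshold, using the class above with made-up prices and URLs:

latest = [{'price': 95.0, 'url': 'http://www.milsims.com.au/catalog/1746/'}]
previous = [{'price': 100.0, 'url': 'http://www.milsims.com.au/catalog/1746/'}]

checker = DealsChecker(latest, previous, price_threshold=2.0)
best = checker.get_best_deal()  # 95.0 + 2.0 < 100.0, so the fresh deal wins
assert checker.is_from_latest_crawl(best)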
File renamed without changes.
@@ -8,7 +8,7 @@
import scrapy


-class QuotesCrawlerItem(scrapy.Item):
+class PriceMonitorItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
21 changes: 21 additions & 0 deletions price_monitor/pipelines.py
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
from price_monitor import settings
from hubstorage import HubstorageClient
from price_monitor.utils import reversed_timestamp, get_product_names


class CollectionStoragePipeline(object):

def open_spider(self, spider):
client = HubstorageClient(auth=settings.SHUB_KEY)
project = client.get_project(settings.SHUB_PROJ_ID)
self.data_stores = {}
for product_name in get_product_names():
self.data_stores[product_name] = project.collections.new_store(product_name)

def process_item(self, item, spider):
key = "{}-{}-{}".format(
reversed_timestamp(), item.get('product_name'), item.get('retailer')
)
self.data_stores[item['product_name']].set({'_key': key, 'value': item})
return item
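The key prefix comes from reversed_timestamp() (see price_monitor/utils.py below), so newer items get numerically smaller keys and come back first when fetching from the collection. A standalone sketch of that ordering:

from datetime import datetime

def reversed_timestamp():
    return str((datetime(5000, 1, 1) - datetime.now()).total_seconds())

earlier = reversed_timestamp()
later = reversed_timestamp()
# the later call yields a number no larger than the earlier one
assert float(later) <= float(earlier)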
5 changes: 5 additions & 0 deletions price_monitor/resources/urls.json
@@ -0,0 +1,5 @@
{
    "On Sale Mil": [
        "http://www.milsims.com.au/catalog/1746/"
    ]
}
27 changes: 27 additions & 0 deletions price_monitor/settings.py
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
import os

BOT_NAME = 'price_monitor'
SPIDER_MODULES = ['price_monitor.spiders']
NEWSPIDER_MODULE = 'price_monitor.spiders'

ROBOTSTXT_OBEY = True

SHUB_KEY = os.getenv('SHUB_KEY')
# when running locally, replace '291701' below with your Scrapy Cloud project ID
SHUB_PROJ_ID = os.getenv('SHUB_JOBKEY', '291701').split('/')[0]


# settings for Amazon SES email service
AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY')
AWS_SECRET_KEY = os.getenv('AWS_SECRET_KEY')
EMAIL_ALERT_FROM = 'Price Monitor <[email protected]>'
EMAIL_ALERT_TO = ['[email protected]']

# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'price_monitor.pipelines.CollectionStoragePipeline': 400,
}

AUTOTHROTTLE_ENABLED = True
# HTTPCACHE_ENABLED = True
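On Scrapy Cloud, SHUB_JOBKEY has the form '<project>/<spider>/<job>', so splitting on '/' recovers the project ID; the job key below is illustrative:

assert '291701/2/15'.split('/')[0] == '291701'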
13 changes: 13 additions & 0 deletions price_monitor/spiders/amazon.py
@@ -0,0 +1,13 @@
from .base_spider import BaseSpider


class AmazonSpider(BaseSpider):
    name = "milsims.com"

    def parse(self, response):
        for product in response.css(".view-advanced-catalog tr > td"):
            # extend the metadata (product_name, retailer, when) set by BaseSpider,
            # which the pipeline and monitor rely on
            item = response.meta['item'].copy()
            item['title'] = product.css(".views-field-title a ::text").extract_first()
            item['price'] = product.css(".views-field-phpcode span span::text").extract()[1]
            item['url'] = response.urljoin(
                product.css(".views-field-title a::attr(href)").extract_first())
            yield item
16 changes: 16 additions & 0 deletions price_monitor/spiders/base_spider.py
@@ -0,0 +1,16 @@
import json
import pkgutil
import scrapy
from datetime import datetime


class BaseSpider(scrapy.Spider):

def start_requests(self):
products = json.loads(pkgutil.get_data('price_monitor', 'resources/urls.json').decode())
for name, urls in products.items():
for url in urls:
if self.name in url:
now = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
item = {'product_name': name, 'retailer': self.name, 'when': now}
yield scrapy.Request(url, meta={'item': item})
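The spider's name doubles as a URL filter here: a request is only issued when the name occurs in the URL. A small sketch using the resources/urls.json contents from this PR:

import json

products = json.loads('{"On Sale Mil": ["http://www.milsims.com.au/catalog/1746/"]}')
name = "milsims.com"
matches = [url for urls in products.values() for url in urls if name in url]
assert matches == ["http://www.milsims.com.au/catalog/1746/"]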
14 changes: 14 additions & 0 deletions price_monitor/templates/email.html
@@ -0,0 +1,14 @@
<h1>🎉 Hey, we found a good deal! 🎁</h1>
<table border="1">

{% for item in items %}
<tr><td>
<p><strong>Product:</strong> {{item.title}}</p>
<p><strong>Price:</strong> {{item.price}}</p>
<p><strong>Store:</strong> {{item.retailer}}</p>
<p><strong>Price obtained at:</strong> {{item.when}}</p>
<p>Visit the product page at {{item.retailer}}: <a href="{{item.url}}">{{item.url}}</a></p>
</td></tr>
{% endfor %}
</table>
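For reference, this is how bin/monitor.py renders the template; the item fields below are made-up values:

from jinja2 import Environment, PackageLoader

jinja_env = Environment(loader=PackageLoader('price_monitor', 'templates'))
html_body = jinja_env.get_template('email.html').render(items=[{
    'title': 'Some Game', 'price': '$49.95', 'retailer': 'milsims.com.au',
    'when': '2016/09/01 10:00:00', 'url': 'http://www.milsims.com.au/catalog/1746/',
}])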

35 changes: 35 additions & 0 deletions price_monitor/utils.py
@@ -0,0 +1,35 @@
import json
import pkgutil
from datetime import datetime, timedelta


def timestamp_from_reversed(reversed_ts):
    return datetime(5000, 1, 1) - timedelta(seconds=float(reversed_ts))


def reversed_timestamp():
return str((datetime(5000, 1, 1) - datetime.now()).total_seconds())


def normalize_name(name):
return name.replace('-', '')


def get_product_names():
return [
normalize_name(name)
for name in json.loads(
pkgutil.get_data("price_monitor", "resources/urls.json").decode()
).keys()
]


def get_retailer_name_from_url(url):
return url.split("://")[1].split("/")[0].replace("www.", "")


def get_retailers_for_product(product_name):
data = json.loads(
pkgutil.get_data("price_monitor", "resources/urls.json").decode()
)
return {get_retailer_name_from_url(url) for url in data[product_name]}
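A quick check of get_retailer_name_from_url against the URL in resources/urls.json:

assert get_retailer_name_from_url(
    "http://www.milsims.com.au/catalog/1746/") == "milsims.com.au"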
13 changes: 0 additions & 13 deletions quotes_crawler/README.md

This file was deleted.

11 changes: 0 additions & 11 deletions quotes_crawler/quotes_crawler/pipelines.py

This file was deleted.

90 changes: 0 additions & 90 deletions quotes_crawler/quotes_crawler/settings.py

This file was deleted.
