Skip to content
This repository has been archived by the owner on Aug 9, 2019. It is now read-only.

Commit

Permalink
Using __init__.py instead of a custom module for main classes.
Browse files Browse the repository at this point in the history
+ datCrawl:register_crawler(crawler) sort of completed.

Tests improved.
  • Loading branch information
Felipe Martín committed Mar 14, 2013
1 parent ce777b1 commit c5eb229
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 30 deletions.
36 changes: 36 additions & 0 deletions datCrawl/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
class Crawler(object):
    """Base class for all crawlers.

    Subclasses declare which pages they can process by filling in
    ``urls`` with regular-expression strings.
    """

    # Regular expressions of the URLs this crawler handles.
    urls = []


class CrawlerIsNotInstanceOfBaseCrawler(Exception):
    """Raised when a registered class does not derive from the base ``Crawler``."""


class CrawlerDontHaveUrlsToWatch(Exception):
    """Raised when a crawler class has an empty ``urls`` list."""


class datCrawl(object):
    """Core registry mapping URL regular expressions to crawler classes."""

    def __init__(self):
        self.crawlers = {}  # crawler class name -> crawler class
        self.urls = []      # list of (url_pattern, crawler_class_name) tuples

    def register_crawler(self, crawler):
        "Registers a crawler on the core to use in certain urls"
        # Instantiate exactly once (the original instantiated the class
        # three separate times just to read its name and urls).
        instance = crawler()
        class_name = instance.__class__.__name__
        if isinstance(instance, Crawler):
            urls = instance.urls
            if len(urls) > 0:
                # Plain loop: a list comprehension used only for its side
                # effects builds a throwaway list.
                for url in urls:
                    self.register_url(url, class_name)
                self.crawlers[class_name] = crawler
            else:
                raise CrawlerDontHaveUrlsToWatch('Crawler %s dont have URLs to watch for.' % class_name)
        else:
            raise CrawlerIsNotInstanceOfBaseCrawler('Crawler %s is not correctly created. (must be instance of base Crawler class)' % class_name)

    def register_url(self, url, crawler):
        "Registers a certain URL to work with a crawler"
        self.urls.append((url, crawler))
23 changes: 0 additions & 23 deletions datCrawl/base.py

This file was deleted.

43 changes: 36 additions & 7 deletions test/test_base.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,45 @@
import unittest

# NOTE(review): the old `from datCrawl import base` import was removed —
# datCrawl/base.py is deleted in this commit, so importing it would fail.
from datCrawl import *


class AwesomeGoogleCrawler(Crawler):
    """Test crawler that claims the Google .es and .de domains."""

    # Raw strings: '\.' in a non-raw literal is an invalid escape sequence
    # (DeprecationWarning, and a SyntaxWarning on modern Python). The
    # resulting string values are unchanged.
    urls = [
        r'(www\.)?google\.es',
        r'(www\.)?google\.de',
    ]


class AwesomeEmptyCrawler(Crawler):
    """Crawler with no URL patterns, used to exercise the error path."""


class datCrawlTests(unittest.TestCase):
    """Tests for crawler/URL registration on the datCrawl core."""

    def test_instance_check(self):
        # The original hunk here was fused diff residue containing the
        # deleted Python-2 body (`except Exception, e:` / `print e`),
        # which is a SyntaxError on Python 3; only the new body remains.
        core = datCrawl()
        self.assertTrue(isinstance(core, datCrawl))

    def test_register_urls(self):
        core = datCrawl()
        data = ('http://www.google.es/', 'AwesomeGoogleCrawler')
        core.register_url(data[0], data[1])
        # assertEqual: assertEquals is a deprecated alias.
        self.assertEqual(core.urls[0], data)

    def test_register_crawler_with_urls(self):
        core = datCrawl()
        core.register_crawler(AwesomeGoogleCrawler)
        self.assertEqual(core.crawlers['AwesomeGoogleCrawler'], AwesomeGoogleCrawler)
        self.assertEqual(core.urls[0][0], AwesomeGoogleCrawler().urls[0])
        self.assertEqual(core.urls[1][0], AwesomeGoogleCrawler().urls[1])

    def test_register_crawler_without_urls(self):
        core = datCrawl()
        # assertRaises can take the callable and its args directly;
        # no lambda wrapper needed.
        self.assertRaises(CrawlerDontHaveUrlsToWatch, core.register_crawler, AwesomeEmptyCrawler)

    def test_register_incorrect_crawler(self):
        core = datCrawl()
        self.assertRaises(CrawlerIsNotInstanceOfBaseCrawler, core.register_crawler, object)


# Allow running this test module directly: `python test_base.py`.
if __name__ == '__main__':
    unittest.main()

0 comments on commit c5eb229

Please sign in to comment.