This repository has been archived by the owner on Aug 9, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Using __init__.py instead of a custom module for main classes.
+ datCrawl:register_crawler(crawler) sort of completed. Tests improved.
- Loading branch information
Felipe Martín
committed
Mar 14, 2013
1 parent
ce777b1
commit c5eb229
Showing
3 changed files
with
72 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
class Crawler(object):
    """Base crawler class.

    Subclasses declare the URLs they handle by overriding ``urls``.
    """

    # List of regular expressions for the URLs that the crawler handles.
    urls = []
|
||
|
||
class CrawlerIsNotInstanceOfBaseCrawler(Exception):
    """The given class is not an instance of the base crawler."""
|
||
|
||
class CrawlerDontHaveUrlsToWatch(Exception):
    """The crawler class has an empty ``urls`` attribute."""
|
||
|
||
class datCrawl(object):
    """Core registry that associates URL patterns with crawler classes.

    ``crawlers`` maps a crawler class name to the registered class, and
    ``urls`` is a list of ``(url, crawler_name)`` tuples kept in
    registration order.
    """

    def __init__(self):
        self.crawlers = {}
        self.urls = []

    def register_crawler(self, crawler):
        """Register a crawler on the core to use in certain URLs.

        ``crawler`` is a class (called with no arguments) whose instances
        must derive from ``Crawler`` and expose a non-empty ``urls``
        attribute. Every entry of ``urls`` is registered through
        ``register_url`` under the crawler's class name, and the class is
        then stored in ``self.crawlers`` under that same name.

        Raises ``CrawlerIsNotInstanceOfBaseCrawler`` when the instance is
        not a ``Crawler`` and ``CrawlerDontHaveUrlsToWatch`` when ``urls``
        is empty.
        """
        # Instantiate once and reuse the instance for every check below.
        instance = crawler()
        class_name = instance.__class__.__name__

        if not isinstance(instance, Crawler):
            raise CrawlerIsNotInstanceOfBaseCrawler('Crawler %s is not correctly created. (must be instance of base Crawler class)' % class_name)

        urls = instance.urls
        # Truthiness also covers ``urls = None`` instead of failing in len().
        if not urls:
            raise CrawlerDontHaveUrlsToWatch('Crawler %s dont have URLs to watch for.' % class_name)

        for url in urls:
            self.register_url(url, class_name)
        self.crawlers[class_name] = crawler

    def register_url(self, url, crawler):
        """Register a certain URL to work with a crawler.

        Appends the ``(url, crawler)`` pair to ``self.urls``.
        """
        self.urls.append((url, crawler))
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,45 @@ | ||
import unittest | ||
from datCrawl import base | ||
from datCrawl import * | ||
|
||
|
||
class AwesomeGoogleCrawler(Crawler):
    """Test crawler that declares two Google URL patterns."""

    # Raw strings keep the regex backslashes literal without relying on
    # unrecognized escape sequences; the values are unchanged.
    urls = [
        r'(www\.)?google\.es',
        r'(www\.)?google\.de',
    ]
|
||
|
||
class AwesomeEmptyCrawler(Crawler):
    """Test crawler that declares no URLs of its own."""
|
||
|
||
class datCrawlTests(unittest.TestCase):
    """Tests for crawler and URL registration on the datCrawl core."""

    def test_instance_check(self):
        """The core class can be instantiated."""
        core = datCrawl()
        self.assertIsInstance(core, datCrawl)

    def test_register_urls(self):
        """register_url stores the (url, crawler name) pair."""
        core = datCrawl()
        data = ('http://www.google.es/', 'AwesomeGoogleCrawler')
        core.register_url(data[0], data[1])
        self.assertEqual(core.urls[0], data)

    def test_register_crawler_with_urls(self):
        """Registering a crawler stores the class and each of its URLs."""
        core = datCrawl()
        core.register_crawler(AwesomeGoogleCrawler)
        self.assertEqual(core.crawlers['AwesomeGoogleCrawler'], AwesomeGoogleCrawler)
        self.assertEqual(core.urls[0][0], AwesomeGoogleCrawler().urls[0])
        self.assertEqual(core.urls[1][0], AwesomeGoogleCrawler().urls[1])

    def test_register_crawler_without_urls(self):
        """A crawler with empty urls is rejected."""
        core = datCrawl()
        self.assertRaises(CrawlerDontHaveUrlsToWatch, core.register_crawler, AwesomeEmptyCrawler)

    def test_register_incorrect_crawler(self):
        """A class that does not derive from Crawler is rejected."""
        core = datCrawl()
        self.assertRaises(CrawlerIsNotInstanceOfBaseCrawler, core.register_crawler, object)
|
||
|
||
# Run the test suite when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()