+ Improved URL parsing via Regex
+ Passing the Regex matches to the crawler action to work with the groups.
Felipe Martin Garcia committed Mar 19, 2013
1 parent 77e3a55 commit f5136f7
Showing 6 changed files with 57 additions and 13 deletions.
6 changes: 6 additions & 0 deletions CHANGES.md
@@ -1,3 +1,9 @@
+## 0.2.0 (WIP)
+Modified crawler behaviour:
+- Every regular expression registered for a URL now **needs** a named group called `url`. That group is the URL sent to the associated `Downloader` (see the sketch after this diff).
+- The core passes the match object (the result of `re.match()`) to the `Crawler` as a keyword argument named `matches`, so actions can work with the other URL groups too.
+- Added an exception and test cases for this behaviour: checking that a pattern defines the `url` group, and checking that the kwargs are passed correctly.
+
 ## 0.1.1 (2013-03-16)
 - Fixing pypi package
 
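The contract described in the CHANGES.md entry above, as a minimal sketch. `MyCrawler`, the `greet` action, and the example URL are hypothetical; `datCrawl`, `register_crawler`, and `run` come from the code in this commit. Note that running this would perform a real HTTP fetch through the default `Downloader`:

```python
from datCrawl import datCrawl
from datCrawl.crawlers import Crawler

class MyCrawler(Crawler):
    # Hypothetical crawler: the pattern MUST define a named group `url`;
    # extra groups (here `name`) reach the action through `matches`.
    urls = [
        ('greet', '(?P<url>http\:\/\/example\.com\/(?P<name>\w+))', ),
    ]

    def action_greet(self, data, **kwargs):
        # `matches` is the re match object forwarded by the core.
        return 'Hello, %s' % kwargs['matches'].group('name')

core = datCrawl()
core.register_crawler(MyCrawler)
# Downloads http://example.com/world with the default Downloader,
# then returns 'Hello, world' from action_greet.
core.run('http://example.com/world')
```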
15 changes: 10 additions & 5 deletions datCrawl/__init__.py
@@ -1,7 +1,7 @@
 from datCrawl.exceptions import CrawlerDontHaveUrlsToWatch, \
     CrawlerIsNotInstanceOfBase, CrawlerForThisURLNotFound, \
     NoCrawlerRegistered, CrawlerAlreadyRegistered, DownloaderAlreadyRegistered, \
-    DownloaderIsNotInstanceOfBase, DownloaderIsNotRegistered
+    DownloaderIsNotInstanceOfBase, DownloaderIsNotRegistered, CrawlerUrlDontHaveGroupDefined
 from datCrawl.crawlers import Crawler
 from datCrawl.downloaders import Downloader
 import re
@@ -71,12 +71,17 @@ def run(self, url):
         for registered_url in self.urls:
             pattern = registered_url[0]
             regexp = re.compile(pattern)
-            if regexp.match(url):
-                action = registered_url[1]
+            matches = regexp.match(url)
+            if matches:
                 crawler = registered_url[2]
+                try:
+                    crawl_url = matches.group('url')
+                except IndexError:
+                    raise CrawlerUrlDontHaveGroupDefined('The pattern [%s] of crawler [%s] does not have a url group defined.' % (pattern, crawler))
+                action = registered_url[1]
                 downloader = getattr(self.crawlers[crawler], 'downloader')
-                data = self.download(url, downloader)
-                return self.crawlers[crawler]().do(action, data)
+                data = self.download(crawl_url, downloader)
+                return self.crawlers[crawler]().do(action, data, matches=matches)
             raise CrawlerForThisURLNotFound("No crawler registered a URL pattern for: %s" % url)
         else:
             raise NoCrawlerRegistered("You must register a Crawler in order to do something.")
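In isolation, the group handling in `run()` above behaves like plain `re`; the URLs here are illustrative:

```python
import re

# A pattern with the mandatory `url` group plus an extra `tld` group.
matches = re.match('(?P<url>http\:\/\/google\.(?P<tld>es|com))', 'http://google.es')
print(matches.group('url'))  # 'http://google.es' -- what gets downloaded
print(matches.group('tld'))  # 'es' -- reaches the action via `matches`

# Without a `url` group, .group('url') raises IndexError ("no such group"),
# which run() converts into CrawlerUrlDontHaveGroupDefined.
bad = re.match('(?P<not_url>http\:\/\/google\.es)', 'http://google.es')
try:
    bad.group('url')
except IndexError:
    print('no url group defined')
```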
8 changes: 4 additions & 4 deletions datCrawl/crawlers.py
@@ -6,10 +6,10 @@ class Crawler(object):
     urls = []  # List of tuples with regular expressions of URLs that the crawler handles
     downloader = 'Downloader'  # Name of the downloader class to use
 
-    def do(self, action, data):
+    def do(self, action, data, **kwargs):
         try:
             method = getattr(self, 'action_%s' % action)
-            result = method(data)
+            result = method(data, **kwargs)
             return result
-        except AttributeError:
-            raise CrawlerActionDoesNotExist('%s: action (%s) does not exist' % (self.__class__.__name__, action))
+        except AttributeError as error:
+            raise CrawlerActionDoesNotExist('%s: action (%s) does not exist: %s' % (self.__class__.__name__, action, error))
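The dispatch in `do()` is a plain `getattr` lookup on the `action_<name>` convention. A minimal sketch with a hypothetical `EchoCrawler`; the import location of `CrawlerActionDoesNotExist` is assumed:

```python
from datCrawl.crawlers import Crawler
from datCrawl.exceptions import CrawlerActionDoesNotExist  # assumed location

class EchoCrawler(Crawler):
    def action_echo(self, data, **kwargs):
        return (data, kwargs)

crawler = EchoCrawler()
print(crawler.do('echo', 'payload', matches=None))  # ('payload', {'matches': None})
try:
    crawler.do('missing', 'payload')  # no action_missing method defined
except CrawlerActionDoesNotExist as error:
    print(error)
```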
5 changes: 5 additions & 0 deletions datCrawl/exceptions.py
@@ -41,3 +41,8 @@ class DownloaderAlreadyRegistered(Exception):
 class DownloaderIsNotRegistered(Exception):
     "When you try to register a Crawler before its downloader."
     pass
+
+
+class CrawlerUrlDontHaveGroupDefined(Exception):
+    "When you try to use a URL whose regex does not define the <url> group."
+    pass
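Caller-side handling of the new exception, as a sketch. It reuses `CrawlerWithRegexGroupError` from the test fixtures below; the import path is an assumption:

```python
from datCrawl import datCrawl
from datCrawl.exceptions import CrawlerUrlDontHaveGroupDefined
from test.requirements import CrawlerWithRegexGroupError  # assumed import path

core = datCrawl()
core.register_crawler(CrawlerWithRegexGroupError)  # its pattern lacks the `url` group
try:
    core.run('http://google.es')
except CrawlerUrlDontHaveGroupDefined as error:
    print(error)  # "The pattern [...] of crawler [...] does not have a url group defined."
```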
24 changes: 20 additions & 4 deletions test/requirements.py
@@ -3,8 +3,24 @@
 
 class AwesomeGoogleCrawler(Crawler):
     urls = [
-        ('es', 'http\:\/\/(www\.)?google\.es', ),
-        ('de', 'http\:\/\/(www\.)?google\.de', )
+        ('es', '(?P<url>http\:\/\/google\.(?P<tld>es))', ),
+        ('de', '(?P<url>http\:\/\/google\.(?P<tld>de))', )
     ]
+
+
+# For testing kwargs
+class AwesomeGoogleKwargsCrawler(Crawler):
+    urls = [
+        ('do_things', '(?P<url>http\:\/\/google\.(?P<tld>es|com|co\.jp))', ),
+    ]
+
+    def action_do_things(self, data, **kwargs):
+        return kwargs.get('matches').group('tld')
+
+
+class CrawlerWithRegexGroupError(Crawler):
+    urls = [
+        ('es', '(?P<this_should_be_url>http\:\/\/google\.es)', ),
+    ]
 
 
@@ -14,10 +30,10 @@ class AwesomeEmptyCrawler(Crawler):
 
 class AwesomeWikipediaTitleCrawler(Crawler):
     urls = [
-        ('get_title', 'http\:\/\/en.wikipedia.org\/wiki\/(.*)', )
+        ('get_title', '(?P<url>http\:\/\/en.wikipedia.org\/wiki\/(?P<name>.*))', )
     ]
     downloader = 'Downloader'
 
-    def action_get_title(self, data):
+    def action_get_title(self, data, **kwargs):
         # LOOK, IM CRAWLING THE INTERNETS!
         return {'title': 'Python'}
12 changes: 12 additions & 0 deletions test/test_crawlers.py
@@ -43,5 +43,17 @@ def test_running_without_url_parameters(self):
         core = datCrawl()
         self.assertRaises(TypeError, lambda: core.run())
 
+    def test_kwargs_being_sent(self):
+        core = datCrawl()
+        core.register_crawler(AwesomeGoogleKwargsCrawler)
+        self.assertEqual(core.run('http://google.es'), 'es')
+        self.assertEqual(core.run('http://google.com'), 'com')
+        self.assertEqual(core.run('http://google.co.jp'), 'co.jp')
+
+    def test_crawler_url_need_regex_with_group(self):
+        core = datCrawl()
+        core.register_crawler(CrawlerWithRegexGroupError)
+        self.assertRaises(CrawlerUrlDontHaveGroupDefined, lambda: core.run('http://google.es'))
+
 if __name__ == '__main__':
     unittest.main()