+ Improved URL parsing via Regex
+ Passing the Regex matches to the crawler action to work with the groups.
Felipe Martin Garcia committed Mar 19, 2013
1 parent 77e3a55 commit f5136f7
Showing 6 changed files with 57 additions and 13 deletions.
6 changes: 6 additions & 0 deletions CHANGES.md
@@ -1,3 +1,9 @@
+## 0.2.0 (WIP)
+Modified crawler behaviour:
+- Every regular expression registered for a URL now **needs** a named group called `url`. That group is the URL sent to the associated `Downloader` (see the sketch after this diff).
+- The core passes the match object (the result of `re.match()`) to the `Crawler` as a keyword argument named `matches`, so actions can work with the other URL groups too.
+- Added an exception and test cases for this behaviour: checking that a pattern defines the `url` group, and checking that the kwargs are passed correctly.
+
 ## 0.1.1 (2013-03-16)
 - Fixing pypi package
 
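The contract described in the CHANGES.md entry above, as a minimal sketch. `MyCrawler`, the `greet` action, and the example URL are hypothetical; `datCrawl`, `register_crawler`, and `run` come from the code in this commit. Note that running this would perform a real HTTP fetch through the default `Downloader`:

```python
from datCrawl import datCrawl
from datCrawl.crawlers import Crawler

class MyCrawler(Crawler):
    # Hypothetical crawler: the pattern MUST define a named group `url`;
    # extra groups (here `name`) reach the action through `matches`.
    urls = [
        ('greet', '(?P<url>http\:\/\/example\.com\/(?P<name>\w+))', ),
    ]

    def action_greet(self, data, **kwargs):
        # `matches` is the re match object forwarded by the core.
        return 'Hello, %s' % kwargs['matches'].group('name')

core = datCrawl()
core.register_crawler(MyCrawler)
# Downloads http://example.com/world with the default Downloader,
# then returns 'Hello, world' from action_greet.
core.run('http://example.com/world')
```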
15 changes: 10 additions & 5 deletions datCrawl/__init__.py
@@ -1,7 +1,7 @@
 from datCrawl.exceptions import CrawlerDontHaveUrlsToWatch, \
     CrawlerIsNotInstanceOfBase, CrawlerForThisURLNotFound, \
     NoCrawlerRegistered, CrawlerAlreadyRegistered, DownloaderAlreadyRegistered, \
-    DownloaderIsNotInstanceOfBase, DownloaderIsNotRegistered
+    DownloaderIsNotInstanceOfBase, DownloaderIsNotRegistered, CrawlerUrlDontHaveGroupDefined
 from datCrawl.crawlers import Crawler
 from datCrawl.downloaders import Downloader
 import re
@@ -71,12 +71,17 @@ def run(self, url):
         for registered_url in self.urls:
             pattern = registered_url[0]
             regexp = re.compile(pattern)
-            if regexp.match(url):
-                action = registered_url[1]
+            matches = regexp.match(url)
+            if matches:
                 crawler = registered_url[2]
+                try:
+                    crawl_url = matches.group('url')
+                except IndexError:
+                    raise CrawlerUrlDontHaveGroupDefined('The pattern [%s] of crawler [%s] does not have a url group defined.' % (pattern, crawler))
+                action = registered_url[1]
                 downloader = getattr(self.crawlers[crawler], 'downloader')
-                data = self.download(url, downloader)
-                return self.crawlers[crawler]().do(action, data)
+                data = self.download(crawl_url, downloader)
+                return self.crawlers[crawler]().do(action, data, matches=matches)
             raise CrawlerForThisURLNotFound("No crawler registered a URL pattern for: %s" % url)
         else:
             raise NoCrawlerRegistered("You must register a Crawler in order to do something.")
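In isolation, the group handling in `run()` above behaves like plain `re`; the URLs here are illustrative:

```python
import re

# A pattern with the mandatory `url` group plus an extra `tld` group.
matches = re.match('(?P<url>http\:\/\/google\.(?P<tld>es|com))', 'http://google.es')
print(matches.group('url'))  # 'http://google.es' -- what gets downloaded
print(matches.group('tld'))  # 'es' -- reaches the action via `matches`

# Without a `url` group, .group('url') raises IndexError ("no such group"),
# which run() converts into CrawlerUrlDontHaveGroupDefined.
bad = re.match('(?P<not_url>http\:\/\/google\.es)', 'http://google.es')
try:
    bad.group('url')
except IndexError:
    print('no url group defined')
```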
8 changes: 4 additions & 4 deletions datCrawl/crawlers.py
@@ -6,10 +6,10 @@ class Crawler(object):
     urls = []  # List of tuples with regular expressions of URLs that the crawler handles
     downloader = 'Downloader'  # Name of the downloader class to use
 
-    def do(self, action, data):
+    def do(self, action, data, **kwargs):
         try:
             method = getattr(self, 'action_%s' % action)
-            result = method(data)
+            result = method(data, **kwargs)
             return result
-        except AttributeError:
-            raise CrawlerActionDoesNotExist('%s: action (%s) does not exist' % (self.__class__.__name__, action))
+        except AttributeError as error:
+            raise CrawlerActionDoesNotExist('%s: action (%s) does not exist: %s' % (self.__class__.__name__, action, error))
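The dispatch in `do()` is a plain `getattr` lookup on the `action_<name>` convention. A minimal sketch with a hypothetical `EchoCrawler`; the import location of `CrawlerActionDoesNotExist` is assumed:

```python
from datCrawl.crawlers import Crawler
from datCrawl.exceptions import CrawlerActionDoesNotExist  # assumed location

class EchoCrawler(Crawler):
    def action_echo(self, data, **kwargs):
        return (data, kwargs)

crawler = EchoCrawler()
print(crawler.do('echo', 'payload', matches=None))  # ('payload', {'matches': None})
try:
    crawler.do('missing', 'payload')  # no action_missing method defined
except CrawlerActionDoesNotExist as error:
    print(error)
```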
5 changes: 5 additions & 0 deletions datCrawl/exceptions.py
@@ -41,3 +41,8 @@ class DownloaderAlreadyRegistered(Exception):
 class DownloaderIsNotRegistered(Exception):
     "When you try to register a Crawler before its downloader."
     pass
+
+
+class CrawlerUrlDontHaveGroupDefined(Exception):
+    "When you try to use a URL whose regex does not define the <url> group."
+    pass
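Caller-side handling of the new exception, as a sketch. It reuses `CrawlerWithRegexGroupError` from the test fixtures below; the import path is an assumption:

```python
from datCrawl import datCrawl
from datCrawl.exceptions import CrawlerUrlDontHaveGroupDefined
from test.requirements import CrawlerWithRegexGroupError  # assumed import path

core = datCrawl()
core.register_crawler(CrawlerWithRegexGroupError)  # its pattern lacks the `url` group
try:
    core.run('http://google.es')
except CrawlerUrlDontHaveGroupDefined as error:
    print(error)  # "The pattern [...] of crawler [...] does not have a url group defined."
```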
24 changes: 20 additions & 4 deletions test/requirements.py
@@ -3,8 +3,24 @@
 
 class AwesomeGoogleCrawler(Crawler):
     urls = [
-        ('es', 'http\:\/\/(www\.)?google\.es', ),
-        ('de', 'http\:\/\/(www\.)?google\.de', )
+        ('es', '(?P<url>http\:\/\/google\.(?P<tld>es))', ),
+        ('de', '(?P<url>http\:\/\/google\.(?P<tld>de))', )
     ]
+
+
+# For testing kwargs
+class AwesomeGoogleKwargsCrawler(Crawler):
+    urls = [
+        ('do_things', '(?P<url>http\:\/\/google\.(?P<tld>es|com|co\.jp))', ),
+    ]
+
+    def action_do_things(self, data, **kwargs):
+        return kwargs.get('matches').group('tld')
+
+
+class CrawlerWithRegexGroupError(Crawler):
+    urls = [
+        ('es', '(?P<this_should_be_url>http\:\/\/google\.es)', ),
+    ]
 
 
@@ -14,10 +30,10 @@ class AwesomeEmptyCrawler(Crawler):
 
 class AwesomeWikipediaTitleCrawler(Crawler):
     urls = [
-        ('get_title', 'http\:\/\/en.wikipedia.org\/wiki\/(.*)', )
+        ('get_title', '(?P<url>http\:\/\/en.wikipedia.org\/wiki\/(?P<name>.*))', )
     ]
     downloader = 'Downloader'
 
-    def action_get_title(self, data):
+    def action_get_title(self, data, **kwargs):
         # LOOK, IM CRAWLING THE INTERNETS!
         return {'title': 'Python'}
12 changes: 12 additions & 0 deletions test/test_crawlers.py
@@ -43,5 +43,17 @@ def test_running_without_url_parameters(self):
         core = datCrawl()
         self.assertRaises(TypeError, lambda: core.run())
 
+    def test_kwargs_being_sent(self):
+        core = datCrawl()
+        core.register_crawler(AwesomeGoogleKwargsCrawler)
+        self.assertEqual(core.run('http://google.es'), 'es')
+        self.assertEqual(core.run('http://google.com'), 'com')
+        self.assertEqual(core.run('http://google.co.jp'), 'co.jp')
+
+    def test_crawler_url_need_regex_with_group(self):
+        core = datCrawl()
+        core.register_crawler(CrawlerWithRegexGroupError)
+        self.assertRaises(CrawlerUrlDontHaveGroupDefined, lambda: core.run('http://google.es'))
+
 if __name__ == '__main__':
     unittest.main()