Skip to content

Commit

Permalink
Add support for job_name etc.
Browse files Browse the repository at this point in the history
  • Loading branch information
anjackson committed Jul 4, 2019
1 parent 24df540 commit 6613de6
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 4 deletions.
4 changes: 3 additions & 1 deletion crawldb/hadoop.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import sys
import logging
import crawldb
Expand Down Expand Up @@ -42,6 +43,7 @@ class SendLogFileToCrawlDB(luigi.contrib.hadoop.JobTask):
"""

task_namespace = 'analyse'
crawl_job_name = luigi.Parameter(default='frequent')
log_paths = luigi.ListParameter()
from_hdfs = luigi.BoolParameter(default=False)
cdb_db = luigi.Parameter(default='crawl_db')
Expand Down Expand Up @@ -147,7 +149,7 @@ def _map_input_reporter(self, stdin, stdout):

def mapper(self, line):
# Parse:
c = CrawlLogLine(line)
c = CrawlLogLine(line, job_name=self.crawl_job_name, log_filename=os.getenv('map_input_file', None))

# Yield this line if there seems there is no key collision with the previous line:
if self.last_c and c.ssurt == self.last_c.ssurt and c.timestamp == self.last_c.timestamp:
Expand Down
11 changes: 8 additions & 3 deletions crawldb/heritrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,18 @@ class CrawlLogLine(object):
re_tries = re.compile('^\d+t$')
re_dol = re.compile('^dol:\d+') # Discarded out-links - make a total?

def __init__(self, line):
def __init__(self, line, job_name=None, job_launch=None, log_filename=None):
"""
Parse from a standard log-line.
:param line:
"""
# Store the line for reference
self.line = line
# Store the supplied metadata:
self.job_name = job_name
self.job_launch = job_launch
self.log_filename = log_filename

# Split the line up:
(self.timestamp, self.status_code, self.content_length, self.url, self.hop_path, self.via,
self.mime, self.thread, self.start_time_plus_duration, self.hash, self.source,
Expand Down Expand Up @@ -92,7 +97,7 @@ def stats(self):
"""
return self.stats

upsert_sql = """UPSERT INTO crawl_log (ssurt, timestamp, url, host, domain, content_type, content_length, content_digest, via, hop_path, status_code, ip ) VALUES %s"""
upsert_sql = """UPSERT INTO crawl_log (ssurt, timestamp, url, host, domain, content_type, content_length, content_digest, via, hop_path, status_code, ip, job_name, job_launch, log_filename ) VALUES %s"""

def upsert_values(self):
return (self.ssurt, self.timestamp, self.url, self.host, self.domain, self.mime, self.content_length, self.hash, self.via, self.hop_path, self.status_code, self.ip)
return (self.ssurt, self.timestamp, self.url, self.host, self.domain, self.mime, self.content_length, self.hash, self.via, self.hop_path, self.status_code, self.ip, self.job_name, self.job_launch, self.log_filename)

0 comments on commit 6613de6

Please sign in to comment.