Skip to content

Commit

Permalink
Bullet-proof somewhat. Recover deep crawls.
Browse files Browse the repository at this point in the history
  • Loading branch information
skington committed Jan 22, 2012
1 parent aed2899 commit 2ef3eca
Showing 1 changed file with 10 additions and 3 deletions.
13 changes: 10 additions & 3 deletions crawl.pl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
use Net::Twitter;
use LWP;

# Crude attempt at argument parsing.
$ARGV[0] ~~ ['-h', '--help'] and die "Syntax: perl crawl.pl [pagenum]\n";

# Get our configuration file, connect to Twitter.
my $conf = do(lib::abs::path('oauth'))
or die "Couldn't find oauth config file: $OS_ERROR";
Expand All @@ -25,8 +28,9 @@
# Get a user agent.
my $useragent = LWP::UserAgent->new(agent => 'YATweetArchiver/0.1');

# Find the most recent tweets.
my $page = 1;
# Find the most recent tweets, optionally starting at a given page if the
# script recently crashed.
my $page = $ARGV[0] =~ /^ \d+ $/x ? shift : 1;
page:
while (1) {
print "Page $page...\n\n";
Expand Down Expand Up @@ -92,7 +96,10 @@ sub store_tweet {
print "Fetching $url\n";
my $response = $useragent->get($url);
if (!$response->is_success) {
carp "Couldn't fetch $url:", $response->status;
carp "Couldn't fetch $url:", $response->status_line;
if (my $warning = $response->headers->{'client-warning'}) {
carp "Warning for $url: $warning";
}
next url;
}
store($url_subdir, 'origurl', $url);
Expand Down

0 comments on commit 2ef3eca

Please sign in to comment.