-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.pl
executable file
·115 lines (104 loc) · 3.05 KB
/
scrape.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/perl
use strict;
use WWW::Mechanize;
use Data::Dumper;
use JSON::Any;
use XML::Atom::SimpleFeed;
use File::Slurp;
use List::Util;
use DateTime;
use POSIX qw/strftime/;
use Text::vCard::Addressbook;
use pQuery;
# Directory (relative to the current working directory) that receives every
# generated file.
my $DIR = "data";
# Autocheck is switched off on purpose: with it enabled, WWW::Mechanize
# throws from inside get() on any failed request, so the explicit
# $mech->response->is_success() checks made after each get() would never
# run and a single unavailable page would abort the whole scrape.
my $mech = WWW::Mechanize->new(autocheck => 0);
# The parliament's site doesn't like the default libwww-perl user agent,
# so identify as a browser instead.
$mech->agent_alias('Windows IE 6');
# Return the current time in UTC, formatted as "YYYY-MM-DDTHH:MM:SSZ".
sub now {
    my @utc_parts = gmtime();
    return strftime('%Y-%m-%dT%H:%M:%SZ', @utc_parts);
}
# Timestamp shared by every "updated" element written during this run.
my $now = now();
my $json = JSON::Any->new();

# Make sure the output directory exists before anything is written to it.
unless (-d $DIR) {
    mkdir($DIR) or die("Error creating data dir $DIR: $!");
}

# Download the page that lists all deputies; nothing else can be done
# without it, so a failure here is fatal.
$mech->get(q{http://www.parlament.cat/web/composicio/ple-parlament/diputats-fotos?p_pant=CO});
die("Error getting parlament data") unless $mech->response->is_success();
my $html = $mech->content();

# Every link to a deputy's profile yields a (p_codi, link text) pair; the
# flat list of pairs becomes a hash keyed by the numeric deputy code.
my @code_and_name = ($html=~m,<a .*?href="/web/composicio/diputats-fitxa\?p_codi=(\d+)".*?>(.*?)</a>,igs);
my %dip = @code_and_name;

# Dump the code => name map as JSON next to the other generated files.
my $dip_as_json = $json->encode(\%dip);
write_file("$DIR/diputats.json", $dip_as_json);
# Build an Atom feed with one entry per deputy found in %dip and write it
# to "$DIR/diputats.atom". Each entry links back to the deputy's profile
# page on the parliament site and to the per-deputy vCard and Atom
# documents (relative hrefs under "diputats/").
my $feed = XML::Atom::SimpleFeed->new(
    title   => 'Parlament de Catalunya',
    link    => {rel=>'via',href=>q{http://www.parlament.cat/web/composicio/ple-parlament/diputats-fotos?p_pant=CO}},
    updated => $now,
    author  => 'opendatabcn.org',
    id      => "tag:cat.parlament.diputat.list",
);
# The keys are the numeric p_codi values, so a numeric sort emits the
# entries in ascending deputy-code order.
foreach my $id (sort {$a<=>$b} keys %dip) {
    $feed->add_entry(
        title    => $dip{$id},
        link     => {rel=>'via',href=>qq{http://www.parlament.cat/web/composicio/diputats-fitxa?p_codi=$id}},
        link     => {rel=>'related',href=>qq{diputats/$id.vcard},type=>'text/x-vcard',title=>'vCard'},
        # "type" carries the media type of the linked document, exactly as
        # on the vCard link above.
        link     => {rel=>'related',href=>qq{diputats/$id.atom},type=>'application/atom+xml',title=>'Atom'},
        id       => "tag:cat.parlament.diputat:$id",
        summary  => $dip{$id},
        updated  => $now,
        category => 'Atom',
        category => 'Miscellaneous',
    );
}
write_file("$DIR/diputats.atom",$feed->as_string);
# Fetch the profile page of every deputy in %dip and write one vCard per
# deputy to "$DIR/$id.vcard".
#
# For each deputy this:
#   1. downloads the profile page; a failed download is warned about and
#      the deputy is skipped (NOTE: this check is only reached when the
#      Mechanize object does not have autocheck enabled, otherwise get()
#      itself dies),
#   2. pairs the texts of the ".filiacio dt" and ".filiacio dd" elements
#      into %data and prints them with Data::Dumper,
#   3. adds one ADR node to the vCard for every <address> element,
#   4. exports the address book to disk.
foreach my $id (keys %dip) {
    print "Doing $id: $dip{$id}\n";
    $mech->get(qq{http://www.parlament.cat/web/composicio/diputats-fitxa?p_codi=$id});
    if (!$mech->response->is_success()) {
        warn("Error getting data for diputat/$id");
        next;
    }
    $html = $mech->content();
    my $pq = pQuery($html);

    # Definition-list labels become hash keys: trailing colon removed,
    # whitespace runs replaced by "_", upper-cased.
    my (%data, @key, @val);
    $pq->find('.filiacio dt')->each(sub {
        my $str = pQuery($_)->text();
        $str =~ s/:$//;
        $str =~ s/\s+/_/g;
        push @key, uc($str);
    });
    $pq->find('.filiacio dd')->each(sub { push @val, pQuery($_)->text() });
    # Labels and values are matched purely by position.
    %data = map { $_ => shift(@val) } @key;
    print Dumper(\%data);

    my $ab    = Text::vCard::Addressbook->new();
    my $vcard = $ab->add_vcard();
    my %args  = (
        FN    => $dip{$id},
        TITLE => [],
    );

    foreach my $entry ($html=~m,<address>(.*?)</address>,igs) {
        # Collapse every run of whitespace, line breaks included, into a
        # single space so words split across source lines stay separated.
        $entry =~ s,\s+, ,gs;
        my $address = $vcard->add_node({
            'node_type' => 'ADR',
        });
        # Each <br> (also <BR>, <br/>, <br />) ends one line of the address.
        foreach my $line (split(/<br\s*\/?>/i, $entry)) {
            $line =~ s,\s+$,,;
            $line =~ s,^\s+,,;
            # An empty line must not overwrite a street that was already set.
            next if $line eq '';
            if ($line =~ m,^Tel\.? (.*?)$,) {
                # Telephone lines are recognised but not stored yet.
                # $vcard->tel($1);
            } elsif ($line =~ m,^Fax\.? (.*?)$,) {
                # Fax lines are recognised but not stored yet.
            } elsif ($line =~ m,^(\d{5}) (.*)$,) {
                # "<5-digit postal code> <city>"; the pattern is anchored at
                # the end so the city capture takes the rest of the line.
                $address->post_code($1);
                $address->city($2);
            } else {
                # Anything else is taken as the street; if several such
                # lines exist the last one wins.
                $address->street($line);
            }
        }
    }

    # Every key of %args is invoked as a setter method on the vCard.
    # NOTE(review): TITLE is an empty array ref here - confirm how
    # Text::vCard exports a reference value before relying on that field.
    while (my ($key, $val) = each(%args)) {
        $vcard->$key($val);
    }

    # NOTE(review): the Atom index links to "diputats/$id.vcard" (and to a
    # "diputats/$id.atom" that is never written), while the vCard is stored
    # directly under $DIR - confirm which location is intended.
    write_file("$DIR/$id.vcard", $ab->export());
}