-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcompile-catalog.pl
executable file
·103 lines (87 loc) · 2.75 KB
/
compile-catalog.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/perl
use 5.012;
use warnings;
use File::Basename 'basename';
use File::Path 'make_path';
use Getopt::Long;
use JSON::PP;
use LWP::UserAgent::Cached;
use URI::Escape 'uri_escape_utf8';
use YAML::PP;
my ($single, $no_cache);
my $cache_dir = '/tmp/lwp-cache';
GetOptions(
's' => \$single,
'no-cache' => \$no_cache,
'cache-dir=s' => \$cache_dir,
) or die("Error in command line arguments\n");
make_path($cache_dir) unless $no_cache;
my $ypp = YAML::PP->new(
schema => [qw(Core Merge)],
boolean => 'JSON::PP',
);
my $ua = LWP::UserAgent::Cached->new( $no_cache ? () : (cache_dir => $cache_dir) );
my %stat_files = (
tokens => 'ddc.tokens.all.txt',
sentences => 'ddc.sentences.all.txt',
documents => 'ddc.files.all.txt',
);
my $dstar_base = 'https://kaskade.dwds.de/dstar/';
my %stat_queries = (
tokens => 'COUNT(* #SEP #HAS[flags,/\b%s\b/])',
sentences => 'COUNT(* #JOIN #HAS[flags,/\b%s\b/])',
documents => 'COUNT(* #WITHIN file #HAS[flags,/\b%s\b/])',
);
my %out;
my $i = 0;
FILE:
foreach my $file ( @ARGV ) {
my $base = basename $file, '.yml';
say STDERR "$base: processing ...";
my $yaml = $ypp->load_file( $file );
next unless exists $yaml->{dstar};
my $corpus = $yaml->{dstar}{corpus};
next unless $corpus;
next if exists $yaml->{status} and $yaml->{status} eq 'wip';
my $flags = $yaml->{dstar}{flags};
if ( $flags ) {
# DDC queries
while ( my ($k, $v) = each %stat_queries ) {
my $url = sprintf '%s%s/dstar.perl?q=%s&fmt=text', $dstar_base, uri_escape_utf8($corpus), uri_escape_utf8(sprintf($v, $flags));
say STDERR "fetching $url ...";
my $res = $ua->get($url);
if ( !$res->is_success ) {
say STDERR "$url: ".$res->status_line;
next FILE;
}
if ( $res->content =~ /no hits/ ) {
say STDERR "$base: no token numbers, skipping ...";
next FILE;
}
my ($number) = split /\s+/ => $res->content;
$yaml->{numbers}{$k} = $number;
}
}
else {
# grab stat files directly, should be much faster
while ( my ($k, $v) = each %stat_files ) {
my $url = sprintf '%s%s/stats/%s', $dstar_base, $corpus, $v;
say STDERR "fetching $url ...";
my $res = $ua->get($url);
if ( !$res->is_success ) {
say STDERR "$url: ".$res->status_line;
next FILE;
}
my ($number) = split /\s+/ => $res->content;
$yaml->{numbers}{$k} = $number;
}
}
if ( $single ) {
say encode_json($yaml);
exit;
}
$out{ $base } = $yaml;
$i++;
}
say encode_json(\%out);
__END__