-
Notifications
You must be signed in to change notification settings - Fork 46
/
mwtoken-stats.pl
executable file
·49 lines (47 loc) · 1.27 KB
/
mwtoken-stats.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env perl
# Collects statistics about multi-word tokens in a CoNLL-U file.
# Copyright © 2017 Dan Zeman <[email protected]>
# License: GNU GPL
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
# Updates the multi-word token (MWT) statistics with one line of CoNLL-U input.
#   $line  ... one input line (a trailing newline is allowed)
#   $state ... hash reference describing the MWT that is currently being read;
#              keys: from, to (word id range), mwt (lowercased surface form),
#              words (array reference of the lowercased word forms seen so far);
#              the hash is empty while no MWT is open
#   $stats ... hash reference; $stats->{$mwt}{$words_joined_by_space} is
#              incremented when the last word of a MWT has been read
# Lines that are neither blank, nor a range line, nor a word line (comments,
# decimal-id lines) leave both hashes untouched.
sub update_mwt_stats
{
    my ($line, $state, $stats) = @_;
    if($line =~ m/^\s*$/)
    {
        # A blank line ends the sentence. Forget a MWT whose range was never
        # completed so that its words cannot leak into the next sentence.
        %{$state} = ();
    }
    elsif($line =~ m/^(\d+)-(\d+)\t(.+?)\t/)
    {
        # Range line: a new MWT starts here. The whole state is replaced so
        # that no words of a previous, unfinished MWT survive.
        %{$state} = ('from' => $1, 'to' => $2, 'mwt' => lc($3), 'words' => []);
    }
    elsif($line =~ m/^(\d+)\t(.+?)\t.+?\t(.+?)\t/)
    {
        # Word line: ID, FORM, LEMMA (must be present but is not used), UPOS.
        my ($id, $form, $upos) = ($1, lc($2), $3);
        # Words outside of any MWT are not counted.
        return unless(defined($state->{from}));
        # Merge verb+clitic combinations in Romance languages: in a two-word
        # MWT whose first word is a VERB or AUX, the verb form is replaced by
        # the placeholder VERB both in the MWT and in the word sequence.
        # The form is data, not a pattern, hence \Q...\E: otherwise a form with
        # characters such as "(" or "." would crash or match the wrong text.
        if($state->{to} - $state->{from} == 1 && $id == $state->{from} && $upos =~ m/^(?:VERB|AUX)$/ && $state->{mwt} =~ s/^\Q$form\E/VERB/)
        {
            $form = 'VERB';
        }
        push(@{$state->{words}}, $form);
        # The last word of the range completes the MWT: count it and close it.
        if($id == $state->{to})
        {
            $stats->{$state->{mwt}}{join(' ', @{$state->{words}})}++;
            %{$state} = ();
        }
    }
    return;
}

# $hash{$mwt}{$word_sequence} = number of occurrences; read by the reporting
# code that follows the loop.
my %hash;
# State of the MWT that is currently open (see update_mwt_stats()).
my %current_mwt_state;
while(my $line = <>)
{
    update_mwt_stats($line, \%current_mwt_state, \%hash);
}
# Report: one tab-separated line per observed pair of MWT surface form and
# word sequence, followed by the number of its occurrences. Both the surface
# forms and the sequences within one surface form are printed in the default
# string sort order.
for my $surface (sort(keys(%hash)))
{
    my $count_of = $hash{$surface};
    for my $words (sort(keys(%{$count_of})))
    {
        my $row = join("\t", $surface, $words, $count_of->{$words});
        print($row."\n");
    }
}