-
Notifications
You must be signed in to change notification settings - Fork 46
/
generate_comparison_of_treebanks.pl
executable file
·84 lines (81 loc) · 2.59 KB
/
generate_comparison_of_treebanks.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env perl
# Calls conllu-stats.pl for all sets of treebanks that need comparison.
# Copyright © 2017, 2021 Dan Zeman <[email protected]>
# License: GNU GPL
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
# Make sure that the tools folder is searched for Perl modules. Then use udlib from there.
# Without it, we could not run this script from other folders.
BEGIN
{
    # Start from the path this script was invoked with and cut off the last
    # path component (the script name) together with the slash or backslash
    # that precedes it. What remains is the folder where the script resides.
    # The variable is a package variable so that it is still visible to the
    # 'use lib' statement that follows this block.
    our $toolsdir = $0;
    my $separator_found = ($toolsdir =~ s:[/\\][^/\\]*$::);
    # No path separator means the script was called from its own folder.
    $toolsdir = '.' if(!$separator_found);
}
use lib "$toolsdir";
use udlib;
#------------------------------------------------------------------------------
# Prints a brief usage message to STDERR: the command line synopsis, what the
# script generates, where the generated page is saved, and what happens when
# no UD folders are given as arguments. Takes no parameters.
#------------------------------------------------------------------------------
sub usage
{
    print STDERR ("Usage: perl generate_comparison_of_treebanks.pl [UD_XXX, UD_YYY, ...]\n");
    print STDERR (" Generates a MarkDown page with comparison of treebanks in each language where there are multiple treebanks.\n");
    # The dollar sign must be escaped. Otherwise Perl would interpolate the
    # (undefined) variable $LCODE and the message would show the path as
    # 'docs/treebanks/-comparison.md'.
    print STDERR (" Saves the page in docs/treebanks/\$LCODE-comparison.md.\n");
    print STDERR (" If no UD folders are provided as arguments, scans all UD_* subfolders of the current folder.\n");
}
#------------------------------------------------------------------------------
# Runs conllu-stats.pl in the 'hubcompare' output format on one group of UD
# folders that belong to the same language, and redirects its output to
# docs/treebanks/<lcode>-comparison.md. The command is printed to STDOUT
# before it is executed.
#
# Parameters:
#   $languages ... hash reference obtained from udlib::get_language_hash();
#                  it is indexed by language name and the code below reads
#                  the {lcode} field of the language record
#   $language .... name of the language of the group (underscores already
#                  replaced by spaces)
#   @group ....... names of the UD folders of that language
#
# Nothing is done if the group has fewer than two folders (there is nothing
# to compare). If the language is not known, a warning is printed and the
# group is skipped.
#------------------------------------------------------------------------------
sub compare_treebank_group
{
    my $languages = shift;
    my $language = shift;
    my @group = @_;
    return if(scalar(@group) < 2);
    # Without a language code we cannot name the output file. Do not run the
    # command at all: it would write to the bogus path '-comparison.md'.
    # The exists() test comes first so that the lookup of {lcode} does not
    # autovivify a record for the unknown language.
    unless(exists($languages->{$language}) && defined($languages->{$language}{lcode}))
    {
        print STDERR ("WARNING: Unknown language $language, skipping comparison of ".join(' ', @group)."\n");
        return;
    }
    my $lcode = $languages->{$language}{lcode};
    my $folders = join(' ', @group);
    # The command goes through the shell because of the output redirection.
    my $command = "perl tools/conllu-stats.pl --oformat hubcompare $folders > docs/treebanks/${lcode}-comparison.md";
    print("$command\n");
    # system() returns zero only if the command ran and exited successfully.
    if(system($command) != 0)
    {
        print STDERR ("WARNING: Command failed (status $?): $command\n");
    }
}

my $languages = udlib::get_language_hash();
# Take the folders from the command line if there are any; otherwise ask udlib
# for the list of UD folders.
my @folders;
if(scalar(@ARGV) > 0)
{
    # Sorting brings treebanks of the same language next to each other, which
    # the grouping below relies on.
    @folders = sort(@ARGV);
}
else
{
    @folders = udlib::list_ud_folders(); # the list comes sorted
}
# Walk through the sorted folders and collect adjacent folders of the same
# language in one group. Whenever the language changes, process the group
# collected so far and start a new one.
my $current_language;
my @current_group;
foreach my $folder (@folders)
{
    # Skip empty folders.
    my $tbkrecord = udlib::get_ud_files_and_codes($folder);
    next if(scalar(@{$tbkrecord->{files}})==0);
    # Derive the language name from the folder name: remove the 'UD_' prefix
    # and everything from the first hyphen on (the treebank name), then turn
    # underscores into spaces.
    my $language = $folder;
    $language =~ s/^UD_//;
    $language =~ s/-.*//;
    $language =~ s/_/ /g;
    if(defined($current_language) && $language eq $current_language)
    {
        push(@current_group, $folder);
    }
    else
    {
        # The helper ignores groups with fewer than two folders, hence it is
        # safe to call it even before the first group has been started.
        compare_treebank_group($languages, $current_language, @current_group);
        $current_language = $language;
        @current_group = ($folder);
    }
}
# The last group has not been processed inside the loop.
compare_treebank_group($languages, $current_language, @current_group);