forked from clarin-eric/ParlaMint
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parlamintp2conllu.pl
executable file
·117 lines (105 loc) · 3.94 KB
/
parlamintp2conllu.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/perl
# Convert ParlaMint .ana files to CoNLL-U and validate them
# Also produces meta-data .tsv files
# Tomaž Erjavec <[email protected]>
# License: GNU GPL
use warnings;
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
# Prints a short synopsis of the command line to STDERR.
# Takes no arguments; the caller decides whether to exit afterwards.
sub usage
{
    print STDERR (
        "Usage: parlamint2conllu.pl <InputDirectory> <OutputDirectory>\n",
        " Converts ParlaMint .ana files in the <InputDirectory> to\n",
        " .conllu and -meta.tsv files in the <OutputDirectory>\n",
        " Also validates the .conllu against the UD validation script\n",
    );
}
use FindBin qw($Bin);
use File::Spec;
use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory

# Both directories are mandatory. Without this check a missing argument is
# silently resolved to the current working directory, which the clean-up
# step further down would then run "rm -f" in.
if (@ARGV < 2) {
    usage();
    exit 1;
}
$inDir = File::Spec->rel2abs(shift);
$outDir = File::Spec->rel2abs(shift);

# Scratch files live in a private directory under <script dir>/tmp, which is
# removed again on exit (CLEANUP => 1). tempdir() fails when the parent
# directory is missing, so create it on first use.
my $tempdirroot = "$Bin/tmp";
unless (-d $tempdirroot) {
    mkdir($tempdirroot)
        or die "ERROR: Can't create temporary directory root $tempdirroot: $!\n";
}
my $DIR = tempdir(DIR => $tempdirroot, CLEANUP => 1);

# External tools; the stylesheets and the validator are expected next to this script.
$Para = 'parallel --gnu --halt 0 --jobs 8';      # GNU parallel, 8 jobs at a time
$Saxon = 'java -jar /usr/share/java/saxon.jar';  # XSLT processor
$Convert = "$Bin/parlamint2conllu.xsl";          # .ana.xml -> CoNLL-U
$Meta = "$Bin/parlamint2meta.xsl";               # .ana.xml -> metadata TSV
$Valid = "$Bin/tools/validate.py";               # UD validation script
# Default language code(s) of each ParlaMint corpus, keyed by the corpus
# country/region code. It is consulted when the root file name carries no
# language suffix of its own. A corpus with several languages lists them
# separated by a comma (see BE).
%country2lang = (
    'AT'    => 'de',
    'BE'    => 'fr, nl',
    'BG'    => 'bg',
    'CZ'    => 'cs',
    'DK'    => 'da',
    'EE'    => 'et',
    'ES'    => 'es',
    'ES-CT' => 'ca',
    'ES-GA' => 'gl',
    'ES-PV' => 'eu',
    'FI'    => 'fi',
    'FR'    => 'fr',
    'GB'    => 'en',
    'GR'    => 'el',
    'HR'    => 'hr',
    'HU'    => 'hu',
    'IS'    => 'is',
    'IT'    => 'it',
    'LT'    => 'lt',
    'LV'    => 'lv',
    'NL'    => 'nl',
    'NO'    => 'no',
    'PL'    => 'pl',
    'PT'    => 'pt',
    'RO'    => 'ro',
    'SE'    => 'sv',
    'SI'    => 'sl',
    'TR'    => 'tr',
);
print STDERR "INFO: Converting directory $inDir\n";
my $rootAnaFile = '';
my @compAnaFiles = ();
# If a specific filename is given, get rid of it
$inDir =~ s|[^/]+\.xml$||;

# Corpus identifier shared by root and component file names:
# "ParlaMint-", a two-letter country code, an optional region part and an
# optional language part.
my $corpusIdRe = qr/ParlaMint-[A-Z]{2}(?:-[A-Z0-9]{1,3})?(?:-[a-z]{2,3})?/;

# Look in the input directory itself and one level below it. The root file
# is the identifier directly followed by ".ana.xml"; component files have
# "_<something>" in between.
$corpusFiles = "$inDir/*.ana.xml $inDir/*/*.ana.xml";
foreach my $candidate (glob($corpusFiles)) {
    if ($candidate =~ m|$corpusIdRe\.ana\.xml|) {
        $rootAnaFile = $candidate;
        next;
    }
    push(@compAnaFiles, $candidate)
        if $candidate =~ m|${corpusIdRe}_.+\.ana\.xml|;
}

# The root file name yields the country code and, optionally, the language.
my ($country, $langs) =
    $rootAnaFile =~ /ParlaMint-([A-Z]{2}(?:-[A-Z0-9]{1,3})?)(?:-([a-z]{2,3}))?\.ana\.xml/;
die "Can't find country code in root file $rootAnaFile!\n"
    unless defined $country;

# No language in the file name: fall back to the per-country default.
if (!defined $langs) {
    $langs = $country2lang{$country};
}
defined $langs
    or die "ERROR: Language is not defined for $country";
#Store all files to be processed in $fileFile
# The list (one component file per line) is what gets piped into GNU parallel
# later on, so failing to write it must not go unnoticed: an empty or missing
# list would make the whole run a silent no-op.
$fileFile = "$DIR/files.lst";
open(my $listFh, '>:utf8', $fileFile)
    or die "ERROR: Can't open $fileFile for writing: $!\n";
foreach my $compFile (@compAnaFiles) {
    print $listFh "$compFile\n";
}
# Buffered write errors only surface at close time.
close($listFh)
    or die "ERROR: Can't write $fileFile: $!\n";
# Removes the first ".ana" from the name of every file in $dir whose name
# matches $nameRe (e.g. "X.ana-meta.tsv" becomes "X-meta.tsv").
# Done in Perl rather than with the external "rename" tool for two reasons:
# a backslash-escaped dot inside backticks is consumed by Perl's own
# interpolation before the shell sees it, and the tool applies its
# substitution to the whole path instead of just the file name.
# Problems are reported as warnings; nothing is returned.
sub strip_ana_infix
{
    my ($dir, $nameRe) = @_;
    opendir(my $dh, $dir) or do {
        warn "WARN: Can't read directory $dir: $!\n";
        return;
    };
    my @names = grep { $_ =~ $nameRe } readdir($dh);
    closedir($dh);
    foreach my $name (@names) {
        (my $stripped = $name) =~ s/\.ana//;
        next if $stripped eq $name;
        rename("$dir/$name", "$dir/$stripped")
            or warn "WARN: Can't rename $dir/$name to $dir/$stripped: $!\n";
    }
    return;
}

# Make sure the output directory exists and drop the results of earlier runs.
unless (-e $outDir) {
    mkdir($outDir)
        or die "ERROR: Can't create output directory $outDir: $!\n";
}
`rm -f $outDir/*-meta.tsv`;
`rm -f $outDir/*.conllu`;

# Metadata: one -meta.tsv per component file. In the GNU parallel command
# {} stands for the input file and {/.} for its base name without the last
# extension, which is why ".ana" has to be stripped from the result afterwards.
my $metaCommand = "$Saxon hdr=$rootAnaFile -xsl:$Meta {} > $outDir/{/.}-meta.tsv";
`cat $fileFile | $Para '$metaCommand'`;
strip_ana_infix($outDir, qr/-meta\.tsv$/);

# One conversion run per language. A monolingual corpus is converted as a
# whole into <name>.conllu; a multilingual one (comma-separated $langs) is
# converted once per language into <name>-<lang>.conllu, with seg-lang
# passed to the stylesheet.
my @runs;
if ($langs =~ /,/) {
    foreach my $lang (split(/,\s*/, $langs)) {
        push(@runs, [$lang, " seg-lang=$lang", "-$lang"]);
    }
}
else {
    push(@runs, [$langs, '', '']);
}

foreach my $run (@runs) {
    my ($lang, $segParam, $suffix) = @$run;
    my $convertCommand = "$Saxon meta=$rootAnaFile$segParam -xsl:$Convert {} > $outDir/{/.}$suffix.conllu";
    `cat $fileFile | $Para '$convertCommand'`;
    strip_ana_infix($outDir, qr/\.conllu$/);
    # Validate only the files of this run: in a multilingual corpus the
    # output directory also holds the files of the languages converted
    # earlier, and those must not be checked against the wrong --lang.
    foreach my $level (1, 2) {
        my $validateCommand = "python3 $Valid --lang $lang --level $level {}";
        `ls $outDir/*$suffix.conllu | $Para '$validateCommand'`;
    }
}