forked from clarin-eric/ParlaMint
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pack-parlamint.pl
executable file
·80 lines (67 loc) · 2.27 KB
/
pack-parlamint.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/perl
# Pack ParlaMint corpora
# Tomaž Erjavec <[email protected]>
# License: GNU GPL
use warnings;
use utf8;
use open ':utf8';
# Put the :utf8 layer on all three standard streams, so that the
# diagnostics printed below are written as UTF-8.
foreach my $stdHandle (\*STDIN, \*STDOUT, \*STDERR) {
    binmode($stdHandle, ':utf8');
}
# Print the command-line synopsis and a description of each option to STDERR.
# Takes no arguments and returns nothing useful; the caller decides whether
# to exit afterwards.
sub usage {
    print STDERR ("Usage:\n");
    print STDERR ("pack-parlamint.pl -help\n");
    print STDERR ("pack-parlamint.pl -codes '<Codes>' -in <Input> -out <Output>\n");
    print STDERR (" Packs ParlaMint corpora into .tgz.\n");
    print STDERR (" <Codes> is the list of country codes of the corpora to be processed,\n");
    print STDERR ("   separated by commas and/or spaces.\n");
    # The directory names below must match what the packing loop looks for:
    # the country code follows "ParlaMint-", and the type suffix comes last.
    print STDERR (" <Input> is the directory where ParlaMint-XX.TEI/ and ParlaMint-XX.TEI.ana/ are,\n");
    print STDERR ("   together with ParlaMint-XX.txt/, ParlaMint-XX.conllu/ and ParlaMint-XX.vert/.\n");
    print STDERR (" <Output> is the directory where output .tgz are written.\n");
    return;
}
use Getopt::Long;
use FindBin qw($Bin);
use File::Spec;
use File::Copy;
# ---------------------------------------------------------------------------
# Main script.
#
# For every country code XX given with -codes, two archives are written to
# the -out directory:
#   ParlaMint-XX.tgz      from ParlaMint-XX.TEI and ParlaMint-XX.txt
#   ParlaMint-XX.ana.tgz  from ParlaMint-XX.TEI.ana, ParlaMint-XX.conllu and
#                         ParlaMint-XX.vert (only when the root file
#                         ParlaMint-XX.TEI.ana/ParlaMint-XX.ana.xml exists)
# The script dies on the first missing input directory or failed tar run.
# ---------------------------------------------------------------------------

# Strictures apply from this point to the end of the file; every variable
# below is therefore declared with "my".
use strict;

my ($help, $countryCodes, $inDir, $outDir);

# GetOptions has already reported the offending option on STDERR when it
# returns false, so only the synopsis needs to be added.
GetOptions
    (
     'help'    => \$help,
     'codes=s' => \$countryCodes,
     'in=s'    => \$inDir,
     'out=s'   => \$outDir,
    ) or do { usage(); exit 1; };

if ($help) {
    usage();
    exit;
}

# All three options are required; without them rel2abs() and split() below
# would operate on undef.
unless (defined $countryCodes && defined $inDir && defined $outDir) {
    usage();
    exit 1;
}

$inDir  = File::Spec->rel2abs($inDir);
$outDir = File::Spec->rel2abs($outDir);
die "Can't find $inDir\n"  unless -d $inDir;
die "Can't find $outDir\n" unless -d $outDir;

# Remove any previous $dstDir/$archive, check that every directory in @dirs
# exists under $srcDir, then tar+gzip those directories into $dstDir/$archive.
#   $srcDir  - absolute directory containing the directories to pack
#   $dstDir  - absolute directory receiving the archive
#   $archive - file name of the archive (no directory part)
#   @dirs    - directory names relative to $srcDir
# Dies if a directory is missing or if rm/tar exit with a non-zero status.
sub pack_archive {
    my ($srcDir, $dstDir, $archive, @dirs) = @_;
    my $target = "$dstDir/$archive";

    # The stale archive is removed before the inputs are checked, so a failed
    # run never leaves an outdated archive behind. List-form system() keeps
    # the shell out of the picture, so paths may contain spaces.
    system('rm', '-fr', $target) == 0
        or die "Can't remove old $target\n";

    foreach my $dir (@dirs) {
        die "Can't find $srcDir/$dir\n" unless -e "$srcDir/$dir";
    }

    # -C makes the member names relative to $srcDir; the archive is written
    # straight to its final (absolute) location, so no move is needed.
    my @tarCmd = ('tar', '-C', $srcDir, '-czf', $target, '--mode=a+rwX', @dirs);
    if (system(@tarCmd) != 0) {
        unlink $target;    # do not leave a truncated archive behind
        die "tar failed for $target (status $?)\n";
    }
    return;
}

my $XX_template = "ParlaMint-XX";

# grep drops the empty field that split() yields when the list starts with
# a separator.
foreach my $countryCode (grep { length } split(/[, ]+/, $countryCodes)) {
    print STDERR "INFO: ***Packing $countryCode\n";
    (my $XX = $XX_template) =~ s|XX|$countryCode|g;

    my $teiDir  = "$XX.TEI";
    my $anaDir  = "$XX.TEI.ana";
    my $TxtDir  = "$XX.txt";
    my $ConlDir = "$XX.conllu";
    my $VertDir = "$XX.vert";
    my $outTxt  = "$XX.tgz";
    my $outAna  = "$XX.ana.tgz";

    print STDERR "INFO: *Packing $teiDir, $TxtDir\n";
    pack_archive($inDir, $outDir, $outTxt, $teiDir, $TxtDir);

    print STDERR "INFO: *Packing $anaDir, $ConlDir, $VertDir\n";
    # The annotated archive is optional: it is built only when the root file
    # of the annotated corpus is present.
    if (-e "$inDir/$anaDir/$XX.ana.xml") {
        pack_archive($inDir, $outDir, $outAna, $anaDir, $ConlDir, $VertDir);
    }
    else {
        print STDERR "WARN: No ana root file, skipping\n";
    }
}