forked from clarin-eric/ParlaMint
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparlamintp-tei2text.pl
executable file
·47 lines (39 loc) · 1.5 KB
/
parlamintp-tei2text.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/perl
use warnings;
use utf8;
use FindBin qw($Bin);
use File::Spec;
use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory
use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory
use strict;

# Convert the plain (non-.ana) ParlaMint TEI component files of a corpus into
# plain-text files and per-file metadata tables.
#
# Usage: parlamintp-tei2text.pl <inDir> <outDir>
#
#   <inDir>   directory holding the corpus root file and the component files,
#             either directly (<inDir>/*_*.xml) or one level down
#             (<inDir>/*/*_*.xml)
#   <outDir>  existing directory receiving <name>.txt and <name>-meta.tsv
#
# The conversions are run through GNU parallel and Saxon with the stylesheets
# located next to this script.  The script exits with a non-zero status if
# any stage reported a failure.
#
# NOTE: paths are interpolated into shell command lines, so directories whose
# names contain whitespace, quotes or other shell metacharacters are not
# supported.

binmode(STDERR, ':utf8');

@ARGV >= 2
    or die "Usage: $0 <inDir> <outDir>\n";
my $inDir  = File::Spec->rel2abs(shift);
my $outDir = File::Spec->rel2abs(shift);
-d $inDir  or die "ERROR: Input directory $inDir does not exist\n";
-d $outDir or die "ERROR: Output directory $outDir does not exist\n";

# Scratch space lives under the script directory; create the parent on first
# use, since tempdir() refuses to work if it is missing.
my $tempdirroot = "$Bin/tmp";
-d $tempdirroot
    or mkdir $tempdirroot
    or die "ERROR: Cannot create scratch directory $tempdirroot: $!\n";
my $DIR = tempdir(DIR => $tempdirroot, CLEANUP => 1);

my $Para    = 'parallel --gnu --halt 2 --jobs 10';
my $Saxon   = 'java -jar /usr/share/java/saxon.jar';
my $Meta    = "$Bin/parlamint2meta.xsl";
my $Convert = "$Bin/parlamint-tei2text.xsl";

my $failures = 0;

print STDERR "INFO: Converting directory $inDir\n";

# Store all files to be processed in $fileFile.
# The names returned by glob are byte strings, so the list is written without
# an encoding layer; a :utf8 layer would double-encode non-ASCII paths.
my $fileFile    = "$DIR/files.lst";
my $corpusFiles = "$inDir/*_*.xml $inDir/*/*_*.xml";
my $listed      = 0;
open(my $list_fh, '>', $fileFile)
    or die "ERROR: Cannot write file list $fileFile: $!\n";
foreach my $inFile (glob $corpusFiles) {
    # We convert only plain files, not .ana!  Only the file name is tested,
    # so a ".ana" somewhere in the directory path does not hide plain files.
    my $name = (File::Spec->splitpath($inFile))[2];
    next if $name =~ /\.ana/;
    print {$list_fh} "$inFile\n";
    $listed++;
}
close($list_fh)
    or die "ERROR: Cannot finish writing file list $fileFile: $!\n";
print STDERR "WARN: No plain component files found in $inDir\n"
    unless $listed;

print STDERR "INFO: Making text files\n";
# {} is the input path and {/.} its basename without extension (GNU parallel
# replacement strings).
my $command = "$Saxon -xsl:$Convert {} > $outDir/{/.}.txt";
run_stage('text conversion', "cat $fileFile | $Para '$command'")
    or $failures++;
strip_ana_from_names($outDir, qr/\.txt$/);

print STDERR "INFO: Making metadata files\n";
# The corpus root file is handed to the metadata stylesheet as its hdr
# parameter.
opendir(my $corpus_dh, $inDir)
    or die "ERROR: Cannot read directory $inDir: $!\n";
my @rootFile = grep {/ParlaMint-[A-Z]{2}(?:-[A-Z0-9]{1,3})?(?:-[a-z]{2,3})?\.xml$/} readdir($corpus_dh);
closedir($corpus_dh);
# readdir order is unspecified; sort so the same root file is always chosen.
@rootFile = sort @rootFile;

if (@rootFile) {
    print STDERR "WARN: Several corpus root files found, using $rootFile[0]\n"
        if @rootFile > 1;
    $command = "$Saxon hdr=".File::Spec->catfile($inDir,$rootFile[0])." -xsl:$Meta {} > $outDir/{/.}-meta.tsv";
    run_stage('metadata extraction', "cat $fileFile | $Para '$command'")
        or $failures++;
}
else {
    print STDERR "ERROR: No corpus root file found in $inDir, metadata files not made\n";
    $failures++;
}

# Drop metadata tables that belong to .ana files, then normalise what is left.
foreach my $stale (matching_entries($outDir, qr/\.ana-meta\.tsv$/)) {
    my $path = File::Spec->catfile($outDir, $stale);
    unlink($path)
        or print STDERR "WARN: Cannot remove $path: $!\n";
}
strip_ana_from_names($outDir, qr/-meta\.tsv$/);

exit($failures ? 1 : 0);

# Run a shell command line, discarding its standard output.
# Returns true if the command exited with status 0, otherwise reports the
# problem on STDERR and returns false.
sub run_stage {
    my ($label, $shell_command) = @_;
    my $discarded = readpipe($shell_command);
    return 1 if $? == 0;
    if ($? == -1) {
        print STDERR "ERROR: $label could not be started: $!\n";
    }
    else {
        print STDERR "ERROR: $label failed (wait status $?)\n";
    }
    return 0;
}

# Return the sorted names of the non-hidden entries of $dir matching $pattern.
sub matching_entries {
    my ($dir, $pattern) = @_;
    my $dh;
    unless (opendir($dh, $dir)) {
        print STDERR "WARN: Cannot read directory $dir: $!\n";
        return;
    }
    my @names = grep { !/^\./ && /$pattern/ } readdir($dh);
    closedir($dh);
    @names = sort @names;
    return @names;
}

# Remove the first ".ana" from the name of every entry of $dir that matches
# $pattern.  This is done in Perl rather than with the external rename tool:
# inside backticks "\." is reduced to ".", so the tool was handed the regex
# s/.ana//, and it applied it to the whole path instead of the file name.
# An existing target file is never overwritten.
sub strip_ana_from_names {
    my ($dir, $pattern) = @_;
    foreach my $name (matching_entries($dir, $pattern)) {
        (my $plain = $name) =~ s/\.ana//;
        next if $plain eq $name;
        my $from = File::Spec->catfile($dir, $name);
        my $to   = File::Spec->catfile($dir, $plain);
        if (-e $to) {
            print STDERR "WARN: $from not renamed: $to already exists\n";
            next;
        }
        rename($from, $to)
            or print STDERR "WARN: Cannot rename $from to $to: $!\n";
    }
    return;
}