-
Notifications
You must be signed in to change notification settings - Fork 46
/
conllu-tenfold.pl
executable file
·87 lines (85 loc) · 2.56 KB
/
conllu-tenfold.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env perl
# Reads a CoNLL-U file and splits it to ten parts with approximately the same
# number of tokens.
# Copyright © 2018 Dan Zeman <[email protected]>
# License: GNU GPL
# Strictures: catch undeclared variables and report suspicious constructs.
use strict;
use warnings;
# The source code is UTF-8 and so is everything we read or write: files opened
# in this script get the :utf8 layer by default, the standard streams get it
# explicitly.
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
# The input is processed in two passes (one to count the tokens, one to write
# the parts), which is not possible with STDIN. Hence the input file must be
# named as the first command-line argument.
my $infile = shift(@ARGV);
unless(defined($infile))
{
    die("Usage: conllu-tenfold.pl inputfile.conllu");
}
# First pass: count the words and the sentences in the whole input file.
#   $nw ... number of lines that start with an integer followed by a tab;
#           lines whose first field is a range ("1-2") or a decimal ("1.1")
#           and comment lines ("# ...") do not match and are not counted
#   $ns ... number of blank (empty or whitespace-only) lines, i.e., sentence
#           terminators
my $nw = 0;
my $ns = 0;
# Three-argument open with an explicit read mode: the filename comes from the
# command line and must never be interpreted as a mode or a pipe (a leading
# '>' or a trailing '|' would be, in the two-argument form).
open(my $in_count, '<', $infile) or die("Cannot read $infile: $!");
while(my $line = <$in_count>)
{
    if($line =~ m/^\d+\t/)
    {
        $nw++;
    }
    elsif($line =~ m/^\s*$/)
    {
        $ns++;
    }
}
close($in_count);
# Ten parts need at least ten sentences because a part boundary can only be
# placed at a sentence boundary. Note that this is a necessary condition only.
# If the split is driven purely by token counts, the sentences may run out
# before the tenth part is reached: with 10 sentences where sentences 1-9 have
# 1 token each and sentence 10 has 91 tokens, all of them are consumed by the
# first part of 10+ tokens. Whether that is prevented depends on the splitting
# code that follows.
die("Cannot create 10 folds when there are only $ns sentences.") if($ns<10);
# Target size of one part: one tenth of all words, rounded to the nearest integer.
my $np = sprintf("%d", $nw/10+0.5);
print STDERR ("Found $ns sentences and $nw words. Will try to create 10 files with approximately $np words each.\n");
# Second pass: copy the input to 01.conllu .. 10.conllu. The output file can
# only be switched at a sentence boundary (blank line).
# Remember the total number of sentences found in the first pass. It tells us
# how many sentences are still to come, which is needed to guarantee that none
# of the ten parts stays empty.
my $ns_total = $ns;
$nw = 0;     # words read so far in this pass
$ns = 0;     # sentences read so far in this pass
my $ip = 1;  # number of the part currently being written (1..10)
my $nwp = 0; # words written to the current part
my $nsp = 0; # sentences written to the current part
# Three-argument open: the user-supplied filename is never interpreted as a
# mode or a pipe.
open(my $in_split, '<', $infile) or die("Cannot read $infile: $!");
my $filename = sprintf("%02d.conllu", $ip);
open(my $out, '>', $filename) or die("Cannot write $filename: $!");
while(my $line = <$in_split>)
{
    print {$out} ($line);
    if($line =~ m/^\d+\t/)
    {
        $nw++;
        $nwp++;
    }
    # We can switch to another output file at sentence boundary.
    elsif($line =~ m/^\s*$/)
    {
        $ns++;
        $nsp++;
        # Regular case: the cumulative number of words has reached the quota
        # for parts 1..$ip.
        my $quota_reached = $nw >= $ip * $np;
        # Safety net: if only as many sentences are left as there are parts
        # still to be created, every remaining sentence must start a new part.
        # Without this, a few long sentences could exhaust the input before
        # part 10 exists (e.g., nine 1-token sentences followed by one 91-token
        # sentence), and later steps would miss some of the part files.
        my $must_switch = $ns_total - $ns <= 10 - $ip;
        if($ip < 10 && ($quota_reached || $must_switch))
        {
            # Buffered write errors surface at close, so check it.
            close($out) or die("Cannot write $filename: $!");
            print STDERR ("Part $ip: $nsp sentences, $nwp words.\n");
            $ip++;
            $filename = sprintf("%02d.conllu", $ip);
            open($out, '>', $filename) or die("Cannot write $filename: $!");
            $nwp = 0;
            $nsp = 0;
        }
    }
}
close($out) or die("Cannot write $filename: $!");
close($in_split);
# Report the last part; the other parts were reported when they were closed.
print STDERR ("Part $ip: $nsp sentences, $nwp words.\n");
# The training data for each part is the concatenation of the other parts.
# For every part NN, trainNN.conllu is created from the nine remaining part
# files by the external command "cat" run through the shell. All filenames in
# the command are generated here, so nothing from the user enters the shell.
foreach my $i (1..10)
{
    my @files = map {sprintf("%02d.conllu", $_)} (grep {$_ != $i} (1..10));
    my $command = sprintf("cat %s > train%02d.conllu", join(' ', @files), $i);
    print STDERR ("$command\n");
    # system() returns the wait status of the command; anything but zero means
    # that the training file is missing or incomplete (e.g., because one of the
    # part files does not exist). Do not continue silently in that case.
    my $status = system($command);
    if($status != 0)
    {
        die("Command failed (wait status $status): $command");
    }
}
# for i in 01 02 03 04 05 06 07 08 09 10 ; do udpipe --train --tokenizer=none --tagger=none model$i.udpipe train$i.conllu ; done
# for i in 01 02 03 04 05 06 07 08 09 10 ; do udpipe --accuracy --parse model$i.udpipe $i.conllu ; done