-
Notifications
You must be signed in to change notification settings - Fork 46
/
check_overlaps.pl
executable file
·62 lines (60 loc) · 1.87 KB
/
check_overlaps.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env perl
# Checks for possible overlaps between training and dev/test sentences.
# Takes a list of treebanks (UD repositories) and cross-checks all treebanks
# of the same language. Note that non-empty overlap does not automatically
# mean an error. There might be naturally recurring sentences, especially short
# ones.
# Copyright © 2018 Dan Zeman <[email protected]>
# License: GNU GPL
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
# The command-line arguments are paths to treebank folders. Clean them up
# without touching @ARGV itself.
my @treebanks = clean_treebank_list(@ARGV);
# The language of a treebank is the part of its name before the first hyphen
# (for example, "UD_Czech-PDT" yields "UD_Czech"). Only the keys of %languages
# are used below; the values merely count the treebanks of each language.
my %languages;
foreach my $treebank (@treebanks)
{
    my $language = $treebank;
    $language =~ s/-.*//;
    $languages{$language}++;
}
my @languages = sort(keys(%languages));

# Takes a list of treebank paths. Returns a new list in which one trailing
# slash has been removed from each path and a path that occurs repeatedly
# (after the slash removal) is kept only at its first position. The input list
# is not modified.
sub clean_treebank_list
{
    my %seen;
    my @clean;
    foreach my $path (@_)
    {
        # $path is an alias to the caller's element, so edit a copy instead.
        my $treebank = $path;
        $treebank =~ s:/$::;
        push(@clean, $treebank) unless($seen{$treebank}++);
    }
    return @clean;
}
# For every language, collect the train/dev/test CoNLL-U files of all its
# treebanks and run overlap.py on each pair of files of different types
# (including pairs that come from two different treebanks).
foreach my $language (@languages)
{
    # Get all CoNLL-U files of all treebanks of the current language.
    my @ltreebanks = select_language_treebanks($language, @treebanks);
    my @files;
    foreach my $treebank (@ltreebanks)
    {
        # A lexical handle cannot clash with a global DIR handle used elsewhere.
        opendir(my $dh, $treebank) or die("Cannot read $treebank: $!");
        my @tfiles = map {"$treebank/$_"} (grep {get_split($_) ne ''} (readdir($dh)));
        closedir($dh);
        push(@files, @tfiles);
    }
    # Test each pair of files where unnatural overlap is undesirable.
    for(my $i = 0; $i <= $#files; $i++)
    {
        my $ifile = $files[$i];
        my $itype = get_split($ifile);
        for(my $j = $i+1; $j <= $#files; $j++)
        {
            my $jfile = $files[$j];
            my $jtype = get_split($jfile);
            # Two files of the same type (e.g. train vs. train) are not compared.
            next if($itype eq $jtype);
            my @command = ('overlap.py', '--raw', $ifile, $jfile);
            print(join(' ', @command), "\n");
            # The list form of system() bypasses the shell, so paths that
            # contain spaces or shell metacharacters are passed on intact.
            my $status = system(@command);
            # -1 means that the program could not be started at all. Other
            # exit statuses are left to the user to read from overlap.py's
            # own output.
            if($status == -1)
            {
                warn("Cannot run overlap.py: $!\n");
            }
        }
    }
}

# Takes a language name and a list of treebank names. Returns those treebanks
# whose name is either identical to the language name or consists of the
# language name followed by a hyphen and anything else. A mere common prefix is
# not enough: "UD_Swedish_Sign_Language-SSLC" does not belong to "UD_Swedish".
# The language name is matched literally, not as a regular expression.
sub select_language_treebanks
{
    my $language = shift;
    return grep {m/^\Q$language\E(?:-|\z)/} (@_);
}

# Takes a file name or path. Returns 'train', 'dev' or 'test' if the name ends
# in "-ud-train.conllu", "-ud-dev.conllu" or "-ud-test.conllu", respectively.
# Returns the empty string for any other name.
sub get_split
{
    my $path = shift;
    return ($path =~ m/-ud-(train|dev|test)\.conllu$/) ? $1 : '';
}