-
Notifications
You must be signed in to change notification settings - Fork 46
/
find_duplicate_sentences.pl
executable file
·61 lines (58 loc) · 1.49 KB
/
find_duplicate_sentences.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env perl
# Looks for duplicate sentences within CoNLL-U input. Compares the text attribute.
# Copyright © 2017 Dan Zeman <[email protected]>
# License: GNU GPL
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
use Getopt::Long;
# An extra file can be designated as "training" data. This is useful if we are
# not concerned about duplicates within one file but rather about overlap
# between the training file and the test file(s). If --train is given, only
# sentences that occur both in the training file and in the remaining input
# will be reported.
my $train;
# GetOptions() prints its own warning and returns false when it meets an
# unknown or malformed option. Stop in that case: otherwise a mistyped --train
# would be dropped, its argument would be read as ordinary input, and the
# script would silently check for intra-file duplicates instead of overlap.
GetOptions
(
    'train=s' => \$train
)
or die("Usage: $0 [--train TRAINFILE] [FILE ...]\n");
# Maps the text of a sentence (value of the "# text = ..." comment, trailing
# whitespace removed) to the number of times it has been counted.
my %h;
if (defined($train))
{
    # Three-argument open with an explicit read mode and a lexical handle:
    # the user-supplied path is always treated as a plain file name, never as
    # a mode prefix or a pipe, and the handle cannot clash with other globals.
    open(my $train_fh, '<', $train) or die("Cannot read $train: $!");
    while (my $line = <$train_fh>)
    {
        # Strip the line terminator, whether LF or CRLF.
        $line =~ s/\r?\n$//;
        if ($line =~ m/^\#\s*text\s*=\s*(.+)/)
        {
            my $text = $1;
            $text =~ s/\s+$//;
            # Count only one occurrence of every sentence in the training data. No intra-train duplicates are counted!
            $h{$text} = 1;
        }
    }
    close($train_fh);
}
# Scan the remaining input (files named on the command line, or STDIN) and
# count the sentence texts found in "# text = ..." comment lines.
while (my $line = <>)
{
    # Strip the line terminator, whether LF or CRLF.
    $line =~ s/\r?\n$//;
    next unless ($line =~ m/^\#\s*text\s*=\s*(.+)/);
    my $sentence = $1;
    $sentence =~ s/\s+$//;
    # With a training file, only sentences already known from it are counted;
    # anything else is ignored so that it can never be reported.
    next if (defined($train) && !exists($h{$sentence}));
    $h{$sentence}++;
}
# Report every sentence that was counted more than once. With --train, the
# count is one (for the training file) plus the number of occurrences in the
# remaining input. Longest sentences come first, then the most frequent ones;
# the final string comparison makes the order of otherwise equal entries
# deterministic instead of dependent on the hash ordering of the current run.
my @duplicates =
    sort { length($b) <=> length($a) || $h{$b} <=> $h{$a} || $a cmp $b }
    grep { $h{$_} > 1 }
    keys(%h);
foreach my $d (@duplicates)
{
    printf("%d × %s\n", $h{$d}, $d);
}