-
Notifications
You must be signed in to change notification settings - Fork 46
/
restore_conllu_lines.pl
executable file
·51 lines (48 loc) · 1.81 KB
/
restore_conllu_lines.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env perl
# Merges a CoNLL-X and a CoNLL-U file. CoNLL-X is an output from an old parser,
# CoNLL-U is the desired output format, which will be compared to the gold
# standard. All node lines will be copied from the CoNLL-X file, except for the
# FORM field, which will be taken from the CoNLL-U file, and any CoNLL-U
# specific lines will also be taken from the CoNLL-U file. These include sentence
# level comments, empty nodes and, most importantly, multi-word token lines.
# Copyright © 2017, 2020 Dan Zeman <[email protected]>
# License: GNU GPL
use strict;
use warnings;
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');

# Usage: restore_conllu_lines.pl x.conll x.conllu > x-merged.conllu
#
# The CoNLL-U file drives the merge. Its comment lines, empty-node lines and
# multi-word token lines are copied to STDOUT unchanged. For every other line
# (a node line or the empty line that ends a sentence) exactly one line is
# consumed from the CoNLL-X file and printed instead, with its FORM column
# replaced by the FORM from the CoNLL-U line.
if(scalar(@ARGV) < 2)
{
    die("Usage: restore_conllu_lines.pl x.conll x.conllu > x-merged.conllu\n");
}
my ($xin, $uin) = @ARGV;
# Three-argument open with lexical handles: the file name can never be
# interpreted as an open mode or as a command to pipe from.
open(my $xfh, '<', $xin) or die("Cannot read $xin: $!");
open(my $ufh, '<', $uin) or die("Cannot read $uin: $!");
while(my $uline = <$ufh>)
{
    # Remember the CoNLL-U line number now; reading from the other handle
    # below makes $. refer to the CoNLL-X file.
    my $uline_number = $.;
    # Sentence-level comment, empty node (ID like 3.1) or multi-word token
    # range (ID like 3-4): these exist only in CoNLL-U, so copy them as they are.
    if($uline =~ m/^\#/ ||
       $uline =~ m/^\d+\./ ||
       $uline =~ m/^\d+-/)
    {
        print($uline);
        next;
    }
    # Node line or empty line after a sentence: take the parallel CoNLL-X line.
    my $xline = <$xfh>;
    if(!defined($xline))
    {
        # Without this check the rest of the output would silently lose all
        # node lines while still looking like a successful run.
        die("$xin ended prematurely: no line corresponds to line $uline_number of $uin");
    }
    # If the original CoNLL-U file contains a word with spaces, the spaces have been replaced by underscores
    # in CoNLL-X because spaces are not allowed there. Furthermore, if the CoNLL-X parser performed any
    # token normalization in the FORM field, the resulting file would now be invalid because the FORM would
    # not match the value of the sentence text comment. Therefore we take the FORM (but not the LEMMA) from
    # the original file (we do it with multi-word tokens anyway).
    if($xline =~ m/^\d+\t/)
    {
        # Limit -1 keeps trailing empty fields, so join() reproduces every
        # column of the line even when the last line lacks a newline.
        my @xf = split(/\t/, $xline, -1);
        my @uf = split(/\t/, $uline, -1);
        if(scalar(@uf) > 1)
        {
            $xf[1] = $uf[1];
        }
        else
        {
            # The two files are out of step (e.g. CoNLL-U has an empty line
            # here). Keep the CoNLL-X FORM instead of erasing it.
            warn("Line $uline_number of $uin has no FORM column; keeping the FORM from $xin\n");
        }
        $xline = join("\t", @xf);
    }
    print($xline);
}
close($xfh);
close($ufh);