-
Notifications
You must be signed in to change notification settings - Fork 46
/
text_without_spaces.pl
executable file
·116 lines (102 loc) · 2.72 KB
/
text_without_spaces.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env perl
# Outputs raw text without spaces. Used to verify that one file is a tokenization of another.
# Copyright © 2017 Dan Zeman <[email protected]>
# License: GNU GPL
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
use Getopt::Long;
# How to interpret the input. Values starting with "conlluform" or "conllform"
# select the token forms of a CoNLL-U file, values starting with "conllutext"
# or "conlltext" select its "# text = ..." comments (matching is
# case-insensitive). Any other value means plain text.
my $input = 'plaintext'; # conllutext | conlluform(s)
# GetOptions() returns false if the command line could not be parsed (e.g. an
# unknown option, or --input without a value). Stop in that case instead of
# silently processing the input in a mode the user did not ask for.
GetOptions
(
    'input=s' => \$input
)
or die("Usage: $0 [--input plaintext|conllutext|conlluforms] [file ...]\n");
if($input =~ m/^conllu?form/i)
{
    process_conllu_forms();
}
elsif($input =~ m/^conllu?text/i)
{
    process_conllu_sentence_text();
}
else
{
    process_plain_text();
}
#------------------------------------------------------------------------------
# Takes new text, removes all whitespace from it and appends the result to the
# buffer. While the buffer is longer than the line width, prints its leading
# part as one complete line and keeps the remainder. Returns the new buffer;
# the caller must print whatever is left in it after the last call.
#
# Parameters:
#   $buffer .... text carried over from previous calls (undef means empty)
#   $newtext ... text to be added (undef means empty)
#   $width ..... optional maximum line length; defaults to 80, which is also
#                used when the supplied value is not a positive integer
#------------------------------------------------------------------------------
sub buffer
{
    my $buffer = shift;
    my $newtext = shift;
    my $width = shift;
    # The first call typically passes an undefined buffer.
    $buffer = '' if(!defined($buffer));
    $newtext = '' if(!defined($newtext));
    # A zero or negative width would make the loop below run forever.
    if(!defined($width) || $width !~ m/^\d+\z/ || $width < 1)
    {
        $width = 80;
    }
    # Remove spaces.
    $newtext =~ s/\s//g;
    $buffer .= $newtext;
    # Print lines until the buffer fits in the line width. A buffer of exactly
    # $width characters is kept because more text may still follow.
    while(length($buffer) > $width)
    {
        # Four-argument substr() cuts the prefix off the buffer and returns it.
        my $line = substr($buffer, 0, $width, '');
        print("$line\n");
    }
    # We must not forget to flush the rest of the buffer at the end!
    return $buffer;
}
#------------------------------------------------------------------------------
# Reads plain text from the files named on the command line (or from standard
# input), passes every line through buffer() and finally prints what is left
# in the buffer. The final line is printed even if it is empty.
#------------------------------------------------------------------------------
sub process_plain_text
{
    my $pending = '';
    while(my $line = <>)
    {
        $pending = buffer($pending, $line);
    }
    # buffer() only prints complete lines; the remainder is flushed here.
    print($pending."\n");
}
#------------------------------------------------------------------------------
# Reads a CoNLL-U file and collects the values of the "# text = ..." sentence
# comments. All other lines are ignored. Each value is passed through buffer();
# what is left in the buffer is printed at the end, even if it is empty.
#------------------------------------------------------------------------------
sub process_conllu_sentence_text
{
    my $pending = '';
    while(my $line = <>)
    {
        # Skip everything except sentence text comments with a non-empty value.
        next unless($line =~ m/^\#\s*text\s*=\s*(.+)$/);
        my $sentence = $1;
        $pending = buffer($pending, $sentence);
    }
    # buffer() only prints complete lines; the remainder is flushed here.
    print($pending."\n");
}
#------------------------------------------------------------------------------
# Reads a CoNLL-U file and collects the surface forms (second column). For a
# multi-word token range line (e.g. "3-4") the form of the range is taken and
# the following lines whose ids fall within the range are skipped. Lines that
# start with a digit but match neither pattern (e.g. ids such as "2.1") are
# ignored. Each form is passed through buffer(); what is left in the buffer is
# printed at the end, even if it is empty.
#------------------------------------------------------------------------------
sub process_conllu_forms
{
    my $pending = '';
    # Last id covered by the most recent multi-word token; undef if none.
    my $range_end;
    while(my $line = <>)
    {
        # A line that does not start with a digit (comment, blank line) closes
        # any open multi-word token range.
        if($line =~ m/^\D/)
        {
            $range_end = undef;
            next;
        }
        # Multi-word token: remember where the range ends and take its form.
        if($line =~ m/^\d+-(\d+)\t(.+?)\t/)
        {
            my $form = $2;
            $range_end = $1;
            $pending = buffer($pending, $form);
            next;
        }
        # Anything else must be a regular token line to be of interest.
        next unless($line =~ m/^(\d+)\t(.+?)\t/);
        my ($id, $form) = ($1, $2);
        # Words inside the current range are already covered by the range form.
        next if(defined($range_end) && $id <= $range_end);
        $range_end = undef;
        $pending = buffer($pending, $form);
    }
    # buffer() only prints complete lines; the remainder is flushed here.
    print($pending."\n");
}