-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patheustagger_lite.cc
190 lines (158 loc) · 5.86 KB
/
eustagger_lite.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
//////////////////////////////////////////////////////////////////
//
// EUSTAGGER LITE
//
// Copyright (C) 1996-2013 IXA Taldea
// EHU-UPV
//
// This file is part of EUSTAGGER LITE.
//
// EUSTAGGER LITE is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// EUSTAGGER LITE is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with EUSTAGGER LITE. If not, see <http://www.gnu.org/licenses/>.
//
// Contact: Ixa Taldea ([email protected])
// 649 Posta kutxa
// 20080 Donostia (Basque Country)
//
//////////////////////////////////////////////////////////////////
#include <unistd.h>
#include <stdio.h>
#include <iostream>
#include <sstream>
#include <string>
#include "dat_orok.h"
#include "constants_decl.h"
#include "iconv.hpp"
#include "formatua.h"
#ifdef _USE_SWI_
#include "SWI-cpp.h"
#endif
using namespace std;
// Global with external linkage; main() assigns 4 to it again right before
// calling segHasieraketak().
// NOTE(review): presumably the version of the lexical database in use — confirm.
int edbl_bertsioa = 4;
// Selectors for the output format; main() stores one of them in out_format
// (chosen by the -f option) and passes it to prozesatuCG3Raw().
const int PAROLE_OUTPUT = 1; // default when -f is absent or not "mg"/"naf"
const int MG_OUTPUT = 2;     // selected by "-f mg"
const int NAF_OUTPUT = 3;    // selected by "-f naf"
// Processing stages defined in other translation units. main() calls
// segHasieraketak() once, then for each input file segmentazioaSortuRaw(),
// morfosintaxiaSortuRaw() and prozesatuCG3Raw() in that order, and finally
// segAmaierakoak().
// NOTE(review): the meaning of each parameter is defined by those
// implementations and is not visible here — confirm before changing call sites.
extern void segHasieraketak(int sar_lem, int lex_uzei, int bigarren, int ez_est, int erab_lex, string &lexiko_izena, int parentizatua, int deslokala);
extern void segAmaierakoak();
extern void segmentazioaSortuRaw(string &fitxategiIzena, string &segIrteera, int zuriuneetan, bool utf8in);
extern void morfosintaxiaSortuRaw(string &fitxategiIzena, string &segIrteera, bool haul_seguruak, bool cg3form);
extern int prozesatuCG3Raw(int maila, string &oinIzen, int zuriuneetan, int format, bool utf8out);
/**
 * Print the usage text followed by a description of the compile-time
 * configuration to stderr, then terminate the process with EXIT_FAILURE.
 * This function never returns.
 */
void help() {
  // Fixed usage lines, emitted in this order, one per line.
  static const char *const kUsageLines[] = {
    "Erabilera:",
    "eustagger_lite [-hsz] [-m maila] [-i enc] [-o enc]",
    "-h laguntza hau",
    "-i [iso-8859-15|utf-8] Sarrerako testuaren kodeketa (defektuz iso-8859-15)",
    "-o [iso-8859-15|utf-8] Irteerako testuaren kodeketa (defektuz iso-8859-15)",
    "-s HAUL seguruak prozesatu (defektuz ez)",
    "-z Sarreran testu tokenizatua eman -zuriunez bereiztutako tokenak- (defektuz ez)",
    "-m [0|1|2|3|4|5] (defektuz 2)",
    "-m 0 denean ez du desanbiguatuko",
    "-m 4 denean bakarrik aplikatuko du CG3 desanbiguatzeko",
    "-m 5 denean CG 3. maila + HMM + CGren sekzio guztiak + funtzio sintaktikoak",
    "-f [mg|naf] (defektuz freeling/parole formatua)"
  };

  // Build description: one line per compile-time switch, selected by the
  // preprocessor macros that were defined when this file was compiled.
  std::string buildInfo = "eustagger honako aukerekin konpilatu da:\n";
#ifdef _USE_FOMA_
  buildInfo += "\tFOMAko transduktoreak erabilita\n";
#else
  buildInfo += "\tXEROXeko transduktoreak erabilita\n";
#endif // _USE_FOMA_
#ifdef _USE_SWI_
  buildInfo += "\tSWI-prolog erabilita morfosintaxian\n";
#else
  buildInfo += "\tsicstus-prolog erabilita morfosintaxian\n";
#endif // _USE_SWI_
#ifdef __BRIDGE_VARIANTS__
  buildInfo += "\taldaeren analisirik ez da egingo\n";
#else
  buildInfo += "\taldaeren analisia egingo da\n";
#endif //__BRIDGE_VARIANTS__

  const size_t lineCount = sizeof(kUsageLines) / sizeof(kUsageLines[0]);
  for (size_t idx = 0; idx < lineCount; ++idx) {
    std::cerr << kUsageLines[idx] << std::endl;
  }
  std::cerr << buildInfo;

  exit(EXIT_FAILURE);
}
// Forward declaration only: neither a definition nor a call of getEnvVar
// appears in the visible part of this file.
// NOTE(review): presumably returns the value of the environment variable
// named by `key` — confirm against its definition.
std::string getEnvVar(std::string const& key);
/**
 * Command-line entry point of eustagger_lite.
 *
 * Parses the options -s, -z, -m, -f, -i, -o and -h (the upper-case spelling
 * of each is accepted as well), calls segHasieraketak() once and then, for
 * every non-option argument (an input file name), calls
 * segmentazioaSortuRaw(), morfosintaxiaSortuRaw() and prozesatuCG3Raw() in
 * that order. If no file is given, or an option or encoding name is not
 * recognized, help() prints the usage text and terminates the process.
 *
 * @param argc number of command-line arguments
 * @param argv command-line arguments; non-option arguments are input files
 * @return 0 once every input file has been processed
 */
int main(int argc, char *argv[])
{
  int Sarrera_berezia = 1;  /*** 99/9/2 for the -L option */
  int lexiko_uzei = 0;
  int bigarren_aukera = 0;  /*** 2000/11/20 for the -2 option */
  int lex_ald = 0;          /*** 2000/12/20 for the user's lexicon; -L and -2 are not accepted */
  int ez_estandarrak = 0;
  int zuriune_token = 0;
  int maila = 2;
  int deslokala = 1;
  int parentizatua = 1;
  int out_format = PAROLE_OUTPUT;
  bool haul_seguruak = false;
  string lex_izena;
  string format;
  string inenc_str = "";
  string outenc_str = "";
  int inenc, outenc;

  // eustagger works internally with the Latin encoding. So that the
  // morphosyntax step (prolog?) handles special characters properly, set a
  // suitable LANG in our environment.
  // putenv() takes a non-const char* and keeps the pointer, so the string
  // must be writable and must outlive this call: use a static array rather
  // than a string literal.
  static char lang_env[] = "LANG=en_US.ISO-8859-15";
  putenv(lang_env);

  // getopt() returns int and signals the end of the options with -1. Storing
  // the result in a plain char would never compare equal to -1 on platforms
  // where char is unsigned, producing an endless loop.
  // Every option that reads optarg (including -O) carries a ':' so that
  // optarg is never null when it is used.
  int opt;
  while ((opt = getopt(argc, argv, "sShHzZm:M:f:F:i:I:o:O:")) != -1) {
    switch (opt) {
    case 'I':
    case 'i': inenc_str = optarg; break;
    case 'O':
    case 'o': outenc_str = optarg; break;
    case 'S':
    case 's': haul_seguruak = true; break;
    case 'Z':
    case 'z': zuriune_token = 1; break;
    case 'M':
    case 'm': maila = atoi(optarg); break;  // non-numeric text yields 0
    case 'F':
    case 'f': format = optarg; break;
    case 'H':
    case 'h':
    default: help();  // does not return
    }
  }

  // Any -f value other than "naf" or "mg" selects the default format.
  if (format == "naf") out_format = NAF_OUTPUT;
  else if (format == "mg") out_format = MG_OUTPUT;
  else out_format = PAROLE_OUTPUT;

  // An absent encoding option means iso-8859-15; unknown names are rejected.
  if (inenc_str == "" || inenc_str == "iso-8859-15") inenc = EUSTAGGER_ISO_8859_15;
  else if (inenc_str == "utf-8") inenc = EUSTAGGER_UTF_8;
  else help();

  if (outenc_str == "" || outenc_str == "iso-8859-15") outenc = EUSTAGGER_ISO_8859_15;
  else if (outenc_str == "utf-8") outenc = EUSTAGGER_UTF_8;
  else help();

#ifdef _USE_SWI_
  PlEngine e(argv[0]);
#endif

  if (Sarrera_berezia) parentizatua = 1;
  edbl_bertsioa = 4;
  segHasieraketak(Sarrera_berezia, lexiko_uzei, bigarren_aukera, ez_estandarrak, lex_ald, lex_izena, parentizatua, deslokala);

  if (optind < argc) {
    // Decimal process id, appended to each file name after segmentation.
    // The buffer is large enough for any pid_t value and snprintf() bounds
    // the write.
    char pid[32];
    snprintf(pid, sizeof(pid), "%ld", static_cast<long>(getpid()));

    // The encoding codes are handed over as booleans (non-zero means true).
    // NOTE(review): assumes EUSTAGGER_ISO_8859_15 is 0 and EUSTAGGER_UTF_8 is
    // non-zero — confirm in constants_decl.h.
    const bool utf8in = static_cast<bool>(inenc);
    const bool utf8out = static_cast<bool>(outenc);

    for (int i = optind; i < argc; i++) {
      string fitxategiIzena = argv[i];
      string segIrteera;
      segmentazioaSortuRaw(fitxategiIzena, segIrteera, zuriune_token, utf8in);
      fitxategiIzena += pid;
      morfosintaxiaSortuRaw(fitxategiIzena, segIrteera, haul_seguruak, OUT_MG);
      prozesatuCG3Raw(maila, fitxategiIzena, zuriune_token, out_format, utf8out);
    }
  }
  else {
    help();
  }

  segAmaierakoak();
  return 0;
}