-
Notifications
You must be signed in to change notification settings - Fork 0
/
loctrans
executable file
·668 lines (582 loc) · 23.9 KB
/
loctrans
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
#!/usr/bin/env python
# loctrans, translation tool for YAML files generated by loc2yaml
# https://github.com/sergeuz/locode
#
# Dependencies:
# Python 3.3 or higher (str.casefold() and FileNotFoundError are used);
# pyyaml, https://bitbucket.org/xi/pyyaml
#
# This file is subject to the terms and conditions defined in LICENSE file,
# which is part of this source code package.
import yaml, json, urllib.request, urllib.parse, tempfile, getopt, re, copy, os, sys
from collections import OrderedDict, namedtuple
import locode
from locode import print_q, print_v
# Certain definitions of Google Translate API
# Endpoint that translation requests are sent to; query parameters are
# appended to it when a request URL is built
API_URL_BASE = "https://www.googleapis.com/language/translate/v2"
# Upper bound for the length of a request URL: entries are added to a batch
# only while the resulting URL stays shorter than this value
API_MAX_URL_LENGTH = 2000
# Maximum number of entries put into a single request (avoids the
# "Too many text segments" error)
API_MAX_REQUEST_ENTRIES = 128
class Config:
    """Settings of the current run; every option starts out unset."""

    def __init__(self):
        # Input and output YAML files
        self.src_file = self.dest_file = None
        # Optional filters: country code and set of region codes
        self.ctry_code = None
        self.region_codes = set()
        # Languages: default, source and the set of targets
        self.def_lang = self.src_lang = None
        self.dest_langs = set()
        # Google API key
        self.api_key = None
        # Behaviour switches
        self.retry = self.force = False
# Lightweight record types passed between the functions below
TransResult = namedtuple('TransResult', 'items skipped')
NameItem = namedtuple('NameItem', 'name lang')
TransItemTuple = namedtuple('TransItemTuple', 'name lang country region city')


def TransItem(name, lang=None, country=None, region=None, city=None):
    """Create a TransItemTuple; every field except the name defaults to None."""
    return TransItemTuple(name=name, lang=lang, country=country, region=region, city=city)
# Current settings: the Config instance read by the functions of this script;
# it is filled in by parse_cmd_args()
cfg = Config()
# Supported languages, keyed by lower-case language code and mapped to the
# human-readable name shown in messages. Language codes given on the command
# line are validated against this table. Reference list:
# https://developers.google.com/translate/v2/using_rest#language-params
supp_langs = {
'af': "Afrikaans",
'sq': "Albanian",
'ar': "Arabic",
'az': "Azerbaijani",
'eu': "Basque",
'bn': "Bengali",
'be': "Belarusian",
'bg': "Bulgarian",
'ca': "Catalan",
'zh-cn': "Chinese Simplified",
'zh-tw': "Chinese Traditional",
'hr': "Croatian",
'cs': "Czech",
'da': "Danish",
'nl': "Dutch",
'en': "English",
'eo': "Esperanto",
'et': "Estonian",
'tl': "Filipino",
'fi': "Finnish",
'fr': "French",
'gl': "Galician",
'ka': "Georgian",
'de': "German",
'el': "Greek",
'gu': "Gujarati",
'ht': "Haitian Creole",
'iw': "Hebrew",
'hi': "Hindi",
'hu': "Hungarian",
'is': "Icelandic",
'id': "Indonesian",
'ga': "Irish",
'it': "Italian",
'ja': "Japanese",
'kn': "Kannada",
'ko': "Korean",
'la': "Latin",
'lv': "Latvian",
'lt': "Lithuanian",
'mk': "Macedonian",
'ms': "Malay",
'mt': "Maltese",
'no': "Norwegian",
'fa': "Persian",
'pl': "Polish",
'pt': "Portuguese",
'ro': "Romanian",
'ru': "Russian",
'sr': "Serbian",
'sk': "Slovak",
'sl': "Slovenian",
'es': "Spanish",
'sw': "Swahili",
'sv': "Swedish",
'ta': "Tamil",
'te': "Telugu",
'th': "Thai",
'tr': "Turkish",
'uk': "Ukrainian",
'ur': "Urdu",
'vi': "Vietnamese",
'cy': "Welsh",
'yi': "Yiddish"}
def print_usage():
    """Print the help text, with version and project homepage filled in."""
    help_template = """Translation tool for files generated by loc2yaml, version {version}
{homepage}
Usage:
loctrans -k key [-c code] -t language [-o file.yaml] file.yaml
Arguments:
-k key
--key=key
API key to use with Google Translate service.
-d code
--default=code
Language of default name translations provided in YAML file. Two-letter
codes as defined in ISO 639-1 are recognized. Default translations are
usually given in national language of particular country (see UN/LOCODE
manual). Google Translate will try to autodetect language if necessary,
but it's recommended to always specify default language.
-s code
--source=code
Language of existent translation entries to use as source text. In case
particular name doesn't provide such entry, default translation will be
used.
-t code[,code[,...]]
--target=code[,code[,...]]
Target language(s) for translation. Only names without corresponding
translation entries will be processed.
-c code
--country=code
Generate translations for specified country only. Two-letter codes as
defined in ISO 3166-1 are recognized.
-r code[,code[,...]]
--region=code[,code[,...]]
Generate translations for specified regions only.
-o file
--output=file
Destination file to save updated YAML contents. Original file will be
updated if no custom file specified.
--retry
For better performance requests sent to Google Translate service contain
batch of entries for translation. Same time particular request may be
rejected with all its contents, for example, when service is unable to
determine source language. This parameter allows to split failed batch
request into separate requests, trying to translate original entries
one by one.
--force
Don't ask any questions while processing.
-v
--verbose
Print various statistics while processing data. It's recommended to
enable this option for large datasets, especially when using language
autodetection.
-q
--quiet
Suppress any normal output. Implies --force.
--version
Show version string.
-h
--help
Show this message.
Examples:
loctrans -k GOOGLE_API_KEY -t uk -o dest/ua.yaml src/ua.yaml
Generate translations for Ukrainian language and save results as
separate file. Default translations will be used as source texts.
loctrans -k GOOGLE_API_KEY -c ru -s en -t ru world.yaml
Assuming 'world.yaml' contains location data for several countries,
generate translations only for Russia (target language is also Russian)
and update original YAML file. Existent English translations will be
used as source texts when possible.
"""
    help_text = help_template.format(
        version=locode.VERSION_STRING,
        homepage=locode.PROJECT_HOMEPAGE)
    print(help_text)
def parse_cmd_args(argv):
    """Parse the command line into the global ``cfg`` and report the settings.

    ``argv`` is the complete argument vector; element 0 (the program name) is
    ignored. Options are stored in ``cfg``, validated, and the effective
    settings are printed through print_q().

    Raises getopt.GetoptError for unknown options or missing option values,
    and RuntimeError when the input file, target language(s) or API key are
    missing or a language code is not listed in ``supp_langs``. Exits the
    process after handling --help or --version.
    """
    # cfg is only mutated, never rebound, so no "global" statement is needed
    args, src_files = getopt.getopt(
        argv[1:],
        "k:c:r:d:s:t:o:vqh",
        ["key=", "country=", "region=", "default=", "source=", "target=", "output=",
         "verbose", "quiet", "help",
         "retry", "force", "version"])  # The last three have no short alternatives
    for param, val in args:
        if param in ("-h", "--help"):
            print_usage()
            sys.exit()
        elif param == "--version":
            print(locode.VERSION_STRING)
            sys.exit()
        elif param in ("-c", "--country"):
            if len(val) == 2:
                cfg.ctry_code = val.upper()
            elif val:
                sys.stderr.write("Warning: \"{}\" doesn't look like ISO 3166-1 country code\n".format(val))
        elif param in ("-r", "--region"):
            # Comma-separated list; empty elements are ignored
            cfg.region_codes.update(
                code.strip().upper() for code in val.split(',') if code.strip())
        elif param in ("-t", "--target"):
            cfg.dest_langs.update(
                code.strip().casefold() for code in val.split(',') if code.strip())
        elif param in ("-d", "--default"):
            cfg.def_lang = val.casefold()
        elif param in ("-s", "--source"):
            cfg.src_lang = val.casefold()
        elif param in ("-o", "--output"):
            cfg.dest_file = val
        elif param in ("-k", "--key"):
            cfg.api_key = val
        elif param in ("-v", "--verbose"):
            locode.verbose = True  # See locode.py
        elif param in ("-q", "--quiet"):
            locode.quiet = True  # See locode.py
            cfg.force = True  # Implies --force
        elif param == "--force":
            cfg.force = True
        elif param == "--retry":
            cfg.retry = True
        else:
            # getopt only returns options declared above, so this indicates a
            # programming error; raised explicitly so it survives "python -O"
            raise AssertionError("Unhandled option: {}".format(param))

    # Source file
    if not src_files:
        raise RuntimeError("No input file specified.\nTry 'loctrans --help' for list of supported options.")
    cfg.src_file = os.path.abspath(src_files[0])
    if len(src_files) > 1:
        sys.stderr.write("Warning: Multiple input files are not supported\n")

    if not cfg.dest_langs:
        raise RuntimeError("No target language(s) specified.")

    # Check if Google Translate supports all specified languages
    all_langs = list(cfg.dest_langs)
    if cfg.def_lang:
        all_langs.append(cfg.def_lang)
    if cfg.src_lang:
        all_langs.append(cfg.src_lang)
    for lang in all_langs:
        if lang not in supp_langs:
            raise RuntimeError("Unsupported language: {}".format(lang))

    if not cfg.api_key:
        raise RuntimeError("No Google API key specified.")

    # Destination file; falls back to updating the source file in place
    if cfg.dest_file:
        # Fixed: this used the undefined name "dest_file", which raised
        # NameError whenever -o/--output was given
        cfg.dest_file = os.path.abspath(cfg.dest_file)
        if not cfg.dest_file.endswith(".yaml"):
            cfg.dest_file += ".yaml"
    else:
        cfg.dest_file = cfg.src_file

    print_q("Source file:", cfg.src_file)
    print_q("Destination file:", cfg.dest_file)
    # Default language (optional)
    if cfg.def_lang:
        print_q("Default language: {} ({})".format(supp_langs[cfg.def_lang], cfg.def_lang))
    # Source language (optional)
    if cfg.src_lang:
        print_q("Source language: {} ({})".format(supp_langs[cfg.src_lang], cfg.src_lang))
    # Target language(s)
    if len(cfg.dest_langs) > 1:
        print_q("Target languages:")
        for lang in sorted(cfg.dest_langs):
            print_q(" {} ({})".format(supp_langs[lang], lang))
    else:
        lang = next(iter(cfg.dest_langs))  # The only element of the set
        print_q("Target language: {} ({})".format(supp_langs[lang], lang))

    # Just in case...
    if cfg.def_lang and cfg.def_lang in cfg.dest_langs:
        sys.stderr.write("Warning: Target language matches default language: {} ({})\n".format(
            supp_langs[cfg.def_lang], cfg.def_lang))
    if cfg.src_lang and cfg.src_lang in cfg.dest_langs:
        sys.stderr.write("Warning: Target language matches source language: {} ({})\n".format(
            supp_langs[cfg.src_lang], cfg.src_lang))

    # Country/region filter
    if cfg.ctry_code:
        print_q("Country code:", cfg.ctry_code)
    if cfg.region_codes:
        print_q("Region code(s):", ", ".join(sorted(cfg.region_codes)))
    else:
        print_q("Using all region codes")
def get_name_item(yml_node, dest_lang):
    """Select the text that has to be translated for a single YAML node.

    ``yml_node`` is the node of a country, region or city: either a plain
    string (in-place naming) or a mapping with a "name" element, which in turn
    is a string or a mapping of language code to name. ``dest_lang`` is the
    target language code.

    Returns NameItem(name, lang), where ``lang`` is cfg.src_lang if the name
    was taken from the source-language entry and cfg.def_lang otherwise.
    Returns None when the node already has a non-blank entry for
    ``dest_lang``, has no usable name, or the name contains the TODO marker.
    """
    name = None
    lang = cfg.def_lang
    if isinstance(yml_node, dict):
        yml_name = yml_node.get("name")
        if isinstance(yml_name, dict):
            existing = yml_name.get(dest_lang)
            # An absent, null (e.g. "ru:" without a value) or blank entry
            # means there is no translation yet; previously a null entry
            # crashed on .strip()
            if existing is None or not str(existing).strip():
                if cfg.src_lang and cfg.src_lang in yml_name:
                    name = yml_name[cfg.src_lang]
                    lang = cfg.src_lang
                elif "default" in yml_name:
                    name = yml_name["default"]
        elif isinstance(yml_name, str):
            # Separate element:
            # MOW:
            #   name: Moscow
            name = yml_name
    elif isinstance(yml_node, str):
        # In-place naming:
        # MOW: Moscow
        name = yml_node
    # Only non-empty strings can be sent for translation
    if not isinstance(name, str) or not name:
        return None
    if locode.TODO_MARKER in name:
        return None  # Skipping incomplete names
    return NameItem(name=name, lang=lang)
def updated_name_item(yml_src_node, item):
    """Return a copy of a YAML node with one translation added to its name.

    ``yml_src_node`` is left untouched (a deep copy is modified). ``item``
    provides the translated text (``item.name``) and its language code
    (``item.lang``).

    A string node, or a string "name" element, is converted to a mapping that
    keeps the original text under "default". If the node is a mapping whose
    "name" element is missing or is neither a mapping nor a string, the name
    is replaced by a mapping holding only the translation, so a translation
    is never silently dropped. Nodes of any other type are returned as an
    unmodified copy.
    """
    yml_node = copy.deepcopy(yml_src_node)  # FIXME: Make caller to update node if yml_src_node is a string
    if isinstance(yml_node, dict):
        yml_name = yml_node.get("name")
        if isinstance(yml_name, dict):  # Translated name
            yml_name[item.lang] = item.name
        elif isinstance(yml_name, str):  # Separate element
            yml_node["name"] = {"default": yml_name, item.lang: item.name}
        else:
            # No name, or a name of unexpected type. Should be unreachable
            # for items produced from this same node, but keep the
            # translation anyway
            yml_node["name"] = {item.lang: item.name}
    elif isinstance(yml_node, str):  # In-place naming
        yml_node = {"name": {"default": yml_node, item.lang: item.name}}
    return yml_node
def is_odd_entry(yml_city):
    """Tell whether a city node carries the "odd" parser hint flag.

    Only mapping nodes can carry hints; the hint element is a comma-separated
    list of flags compared case-insensitively.
    """
    if type(yml_city) is not dict or locode.PARSER_HINT_TAG not in yml_city:
        return False
    # Getting parser hint flags
    hint_flags = yml_city[locode.PARSER_HINT_TAG].split(',')
    return any(
        hint.strip().casefold() == locode.PARSER_HINT_ODD
        for hint in hint_flags)
def get_trans_items(yml_root, dest_lang):
    """Collect all names of one country that lack a ``dest_lang`` translation.

    The country is cfg.ctry_code if set, otherwise the first country found in
    ``yml_root["country"]``. Regions are limited to cfg.region_codes when that
    set is not empty, and cities marked as "odd" are skipped.

    Returns a list of TransItem tuples (country first, then each region
    followed by its cities); the list is empty if the country is not present.
    """
    yml_ctry_root = yml_root["country"]
    ctry_code = None
    if cfg.ctry_code:
        if cfg.ctry_code in yml_ctry_root:
            ctry_code = cfg.ctry_code
    elif yml_ctry_root:
        ctry_code = next(iter(yml_ctry_root))  # First country in the file
    if ctry_code not in yml_ctry_root:
        return []

    items = []
    # Country name
    yml_ctry = yml_ctry_root[ctry_code]
    item = get_name_item(yml_ctry, dest_lang)
    if item:
        items.append(TransItem(
            country=ctry_code,
            name=item.name,
            lang=item.lang))

    # A node using in-place naming is a plain string: testing it with "in"
    # would be a substring search, so sub-elements are only looked up in
    # mappings
    yml_region_root = yml_ctry.get("region") if isinstance(yml_ctry, dict) else None
    if not isinstance(yml_region_root, dict):
        return items

    for region_code, yml_region in yml_region_root.items():
        if cfg.region_codes and region_code not in cfg.region_codes:
            continue
        # Region name
        item = get_name_item(yml_region, dest_lang)
        if item:
            items.append(TransItem(
                country=ctry_code,
                region=region_code,
                name=item.name,
                lang=item.lang))
        yml_city_root = yml_region.get("city") if isinstance(yml_region, dict) else None
        if not isinstance(yml_city_root, dict):
            continue
        for city_code, yml_city in yml_city_root.items():
            # Skipping entries marked as "odd"
            if is_odd_entry(yml_city):
                continue
            # City name
            item = get_name_item(yml_city, dest_lang)
            if item:
                items.append(TransItem(
                    country=ctry_code,
                    region=region_code,
                    city=city_code,
                    name=item.name,
                    lang=item.lang))
    return items
def update_trans_items(yml_root, trans_items):
    """Store translated names in the YAML tree, modifying it in place.

    Every item addresses its node by country code and, optionally, region and
    city codes; the addressed node is replaced by the result of
    updated_name_item().
    """
    countries = yml_root["country"]
    for entry in trans_items:
        translation = NameItem(name=entry.name, lang=entry.lang)
        country_node = countries[entry.country]  # Always specified
        if not entry.region:
            # Country name
            countries[entry.country] = updated_name_item(country_node, translation)
            continue
        regions = country_node["region"]
        region_node = regions[entry.region]
        if not entry.city:
            # Region name
            regions[entry.region] = updated_name_item(region_node, translation)
            continue
        # City name
        cities = region_node["city"]
        cities[entry.city] = updated_name_item(cities[entry.city], translation)
def is_lang_pair_error(error):
    """Tell whether an HTTP error is the service's "bad language pair" reply.

    Only responses with status 400 are inspected; for those the response body
    is read from ``error`` and searched case-insensitively.
    """
    if error.code != 400:
        return False
    # NOTE(review): this reads the error body; presumably a later read() on
    # the same error object returns nothing - confirm before relying on it
    body = error.read().decode("utf-8")
    return "bad language pair" in locode.simplify_str(body).casefold()
def parse_api_resp(resp):
    """Extract translated texts from a response of the translation service.

    ``resp`` must provide ``status``, ``reason`` and ``read()``; the body is
    expected to be UTF-8 encoded JSON. Returns the list of "translatedText"
    values in the order given in the response.

    Raises RuntimeError if the status is not 200 or the JSON document has no
    data/translations element.
    """
    if resp.status != 200:
        raise RuntimeError("Unexpected HTTP response: {} ({})".format(resp.reason, resp.status))
    body = resp.read().decode("utf-8")
    # Parse JSON, preserving order of elements
    doc = json.loads(body, object_pairs_hook=OrderedDict)
    if "data" not in doc or "translations" not in doc["data"]:
        raise RuntimeError("Unexpected response data.")
    return [entry["translatedText"] for entry in doc["data"]["translations"]]
def _fetch_translations(url):
    """Send one request to the translation service and return the parsed texts."""
    # The response is closed as soon as it has been parsed
    with urllib.request.urlopen(url) as resp:
        return parse_api_resp(resp)


def do_trans(items, dest_lang):
    """Translate names to ``dest_lang`` using batched requests.

    ``items`` is a sequence of TransItem tuples; their ``lang`` field is the
    source language (a false value lets the service detect it). Items are
    grouped by source language and sent in batches limited by
    API_MAX_REQUEST_ENTRIES and API_MAX_URL_LENGTH.

    Returns TransResult: ``items`` holds copies of the source items with name
    and language replaced by the translation, ``skipped`` holds the names that
    could not be translated. A batch rejected with a "bad language pair" error
    is skipped as a whole, or retried entry by entry when cfg.retry is set.
    An entry too long to fit into a request URL on its own is skipped as well.

    Raises urllib.error.HTTPError for any other HTTP error and RuntimeError
    for unexpected response contents.
    """
    # Group items by source language
    items_by_lang = {}
    for item in items:
        items_by_lang.setdefault(item.lang, []).append(item)

    dest_items = []
    skipped_names = []

    def add_translation(src_item, dest_name):
        # Copy original item, replacing name and language according to translation
        dest_items.append(TransItem(
            name=dest_name,
            lang=dest_lang,
            country=src_item.country,
            region=src_item.region,
            city=src_item.city))

    for src_lang, src_items in items_by_lang.items():
        # The fixed part of the URL depends on the language pair only
        get_params = [
            ("key", cfg.api_key),
            ("target", dest_lang),
            ("format", "text")]
        if src_lang:
            get_params.append(("source", src_lang))
        url_base = API_URL_BASE + '?' + urllib.parse.urlencode(get_params)

        pos = 0  # Index of the first item that has not been processed yet
        while pos < len(src_items):
            if dest_items or skipped_names:
                print_v("{} ({}): Translated {} entries ({} errors)...".format(
                    supp_langs[dest_lang], dest_lang, len(dest_items), len(skipped_names)))

            # Compose next batch
            url = url_base
            req_items = []
            while pos < len(src_items) and len(req_items) < API_MAX_REQUEST_ENTRIES:  # Avoid "Too many text segments" error
                q = '&' + urllib.parse.urlencode({"q": src_items[pos].name})
                if len(url) + len(q) >= API_MAX_URL_LENGTH:  # Maximum URL length is also limited
                    break
                url += q
                req_items.append(src_items[pos])
                pos += 1

            if not req_items:
                # The entry doesn't fit into a URL even on its own. Sending a
                # request without any text would fail (or never make
                # progress), so the entry is reported as skipped instead
                skipped_names.append(src_items[pos].name)
                pos += 1
                continue

            # TODO: Taking in account that translation is a paid service, would be
            # good to preserve already translated entries regardless of any errors,
            # which may happen while processing further requests
            try:
                dest_names = _fetch_translations(url)
            except urllib.error.HTTPError as batch_error:
                # Checking some special cases here
                if not is_lang_pair_error(batch_error):
                    raise
                if not cfg.retry:
                    skipped_names.extend(src_item.name for src_item in req_items)
                    continue  # Proceed with next batch
                # Trying to split failed request and translate its entries one by one
                for src_item in req_items:
                    item_url = url_base + '&' + urllib.parse.urlencode({"q": src_item.name})
                    try:
                        item_names = _fetch_translations(item_url)
                    except urllib.error.HTTPError as item_error:
                        if not is_lang_pair_error(item_error):
                            raise
                        skipped_names.append(src_item.name)
                        continue
                    if len(item_names) != 1:
                        raise RuntimeError("Response contains unexpected number of elements.")
                    add_translation(src_item, item_names[0])
                continue  # Proceed with next batch

            # Order of elements matches req_items
            if len(dest_names) != len(req_items):
                raise RuntimeError("Response contains unexpected number of elements.")
            for src_item, dest_name in zip(req_items, dest_names):
                add_translation(src_item, dest_name)

    print_v("{} ({}): Summary: Translated {} entries ({} errors)".format(
        supp_langs[dest_lang], dest_lang, len(dest_items), len(skipped_names)))
    return TransResult(items=dest_items, skipped=skipped_names)
def save_yml_file(yml_root, file_name):
    """Write YAML data to ``file_name``, keeping the header of the source file.

    The leading run of blank and comment ('#') lines of cfg.src_file is copied
    first, then ``yml_root`` is written with locode.write_yml_data(). The file
    is written as UTF-8 with '\\n' line endings.
    """
    # Both files are closed even if reading or writing fails
    with open(file_name, 'w', encoding="utf-8", newline='\n') as dest_file:
        # Copy all header comments from original file
        with open(cfg.src_file, 'r', encoding="utf-8") as src_file:
            for line in src_file:
                s = line.strip()
                if s and not s.startswith('#'):
                    break  # First line of actual data: header is over
                dest_file.write(line)
        locode.write_yml_data(yml_root, dest_file)
def main(argv):
    """Run the tool: parse arguments, translate missing names, save the result.

    ``argv`` is the complete argument vector including the program name.
    Errors that are expected during normal use (bad options, missing or
    malformed input file, HTTP/network failures, RuntimeError raised by the
    processing functions) are reported on stderr and terminate the process
    with exit status 1.
    """
    try:
        # Parsing command-line arguments
        parse_cmd_args(argv)

        # Parsing source YAML file into dictionary
        yml_root = {"country": {}}
        locode.parse_yml_file(cfg.src_file, yml_root, cfg.ctry_code)
        if len(yml_root["country"]) > 1 and not cfg.ctry_code:
            raise RuntimeError("No country code specified for file with multiple country data.")
        if cfg.ctry_code and cfg.ctry_code not in yml_root["country"]:
            sys.stderr.write("Warning: Country code not found in YAML data: {}\n".format(cfg.ctry_code))

        # Getting untranslated names for each target language
        src_items = {}
        total_items = 0
        total_chars = 0
        for dest_lang in cfg.dest_langs:
            items = get_trans_items(yml_root, dest_lang)
            src_items[dest_lang] = items
            total_items += len(items)
            total_chars += sum(len(item.name) for item in items)
        if total_items > 0:
            print_q("Total characters for translation: {} ({} names)".format(total_chars, total_items))
        else:
            print_q("No untranslated entries found.")
            sys.exit()

        # Asking for confirmation, unless disabled
        if not cfg.force:
            s = None
            while s != 'y' and s != 'n':
                s = input("Proceed with translation (y/n)? ").strip().casefold()
            if s != 'y':
                sys.exit()

        # Translating collected names
        print_q("Translating...")
        dest_items = []
        skipped_names = []
        for dest_lang, items in src_items.items():
            trans = do_trans(items, dest_lang)
            dest_items += trans.items
            skipped_names += trans.skipped

        # Updating YAML data
        update_trans_items(yml_root, dest_items)

        # Ensure destination path exists
        dest_path = os.path.dirname(cfg.dest_file)
        if not os.path.exists(dest_path):
            os.makedirs(dest_path)
        elif not os.path.isdir(dest_path):
            raise RuntimeError("Destination path is not a directory: {}".format(dest_path))

        # All destination files are created in temporary directory first; the
        # directory is removed when the block is left, also in case of errors
        with tempfile.TemporaryDirectory() as temp_dir:
            # Saving YAML file and copying it to destination directory
            save_yml_file(yml_root, os.path.join(temp_dir, os.path.basename(cfg.dest_file)))
            locode.transact_copy(temp_dir, dest_path)

        if skipped_names:
            sys.stderr.write(
                "Warning: Translation errors occured while processing some location names. Try\n"
                "to change default and source languages if necessary (see -d and -s options).\n")
            if cfg.retry:
                print_v("Skipped location names:", "; ".join(skipped_names))
            else:
                sys.stderr.write(
                    "Additional information can be obtained with '--retry' and '--verbose' options\n"
                    "enabled.\n")
        print_q("Summary: Translated {} entries ({} errors)".format(len(dest_items), len(skipped_names)))
    except getopt.GetoptError:
        print_usage()
        sys.exit(1)
    except FileNotFoundError as e:
        sys.stderr.write("Error: Unable to open file: {}\n".format(e.filename))
        sys.exit(1)
    except yaml.parser.ParserError as e:
        mark = e.problem_mark
        sys.stderr.write("Error: Unable to parse YAML file: {}, line {}: {}\n".format(mark.name, mark.line + 1, e.problem))
        sys.exit(1)
    except yaml.YAMLError as e:
        sys.stderr.write("Error: Unable to process YAML data: {}\n".format(e))
        sys.exit(1)
    except urllib.error.HTTPError as e:
        # Must precede URLError, which is listed as a separate handler below
        sys.stderr.write("HTTP Error: {} ({})\n".format(e.reason, e.code))
        resp = e.read().decode("utf-8").strip()
        if resp:
            sys.stderr.write("Response data:\n{}\n".format(resp))
        sys.exit(1)
    except urllib.error.URLError as e:
        sys.stderr.write("Error: {}\n".format(e))
        sys.exit(1)
    except RuntimeError as e:
        sys.stderr.write("Error: {}\n".format(e))
        sys.exit(1)


if __name__ == "__main__":
    main(sys.argv)