From 0b64085e87ad58ebea337638ea3ca7edf0a41215 Mon Sep 17 00:00:00 2001 From: Emerson Rocha Date: Tue, 29 Jun 2021 19:31:49 -0300 Subject: [PATCH] mvp-HXLTM (#16): hxltm-cli v0.8, MVP of Translation Memory eXchange format (TMX) exporter (refs EticaAI/HXL-Data-Science-file-formats#19) --- _hxltm/.gitignore | 2 + _systema/programma/hxltm2xliff.py | 206 +++++++++++------------------- 2 files changed, 80 insertions(+), 128 deletions(-) diff --git a/_hxltm/.gitignore b/_hxltm/.gitignore index feeccf3..55ac0fe 100644 --- a/_hxltm/.gitignore +++ b/_hxltm/.gitignore @@ -7,3 +7,5 @@ out/* !out/.gitkeep *.zip +*.tmx +*.dtd diff --git a/_systema/programma/hxltm2xliff.py b/_systema/programma/hxltm2xliff.py index 326416f..5864f46 100755 --- a/_systema/programma/hxltm2xliff.py +++ b/_systema/programma/hxltm2xliff.py @@ -28,11 +28,13 @@ # COMPANY: EticaAI # LICENSE: Public Domain dedication # SPDX-License-Identifier: Unlicense -# VERSION: v0.7 +# VERSION: v0.8 # CREATED: 2021-06-27 19:50 UTC v0.5, de github.com/EticaAI # /HXL-Data-Science-file-formats/blob/main/bin/hxl2example # REVISION: 2021-06-27 21:16 UTC v0.6 de hxl2tab # REVISION: 2021-06-27 23:53 UTC v0.7 --archivum-extensionem=.csv +# 2021-06-29 22:29 UTC v0.8 MVP of --archivum-extensionem=.tmx +# Translation Memory eXchange format (TMX). # ============================================================================== # Tests @@ -42,9 +44,10 @@ # ./_systema/programma/hxltm2xliff.py _hxltm/schemam-un-htcds-5items.tm.hxl.csv # ./_systema/programma/hxltm2xliff.py _hxltm/schemam-un-htcds.tm.hxl.csv --fontem-linguam=eng-Latn # ./_systema/programma/hxltm2xliff.py _hxltm/schemam-un-htcds-5items.tm.hxl.csv --fontem-linguam=eng-Latn --archivum-extensionem=.tmx +# ./_systema/programma/hxltm2xliff.py _hxltm/schemam-un-htcds-5items.tm.hxl.csv _hxltm/schemam-un-htcds-5items.tmx --fontem-linguam=eng-Latn --archivum-extensionem=.tmx # python3 -m doctest ./_systema/programma/hxltm2xliff.py -__VERSION__ = "v0.7" +__VERSION__ = "v0.8" import sys import os @@ -345,6 +348,7 @@ def hxltm2tmx(self, hxlated_input, tmx_output, is_stdout, args): '
') + # TODO: make source and adminlang configurable resultatum.append(' ') num = 0 @@ -356,31 +360,47 @@ def hxltm2tmx(self, hxlated_input, tmx_output, is_stdout, args): # print(rem) unit_id = rem['#item+id'] if '#item+id' in rem else num - resultatum.append(' ') - if '#item+wikidata+code' in rem: + resultatum.append(' ') + if '#item+wikidata+code' in rem and rem['#item+wikidata+code']: resultatum.append( - ' ' + rem['#item+wikidata+code'] + '') + ' ' + rem['#item+wikidata+code'] + '') + + if '#meta+item+url+list' in rem and rem['#meta+item+url+list']: + resultatum.append( + # TODO: improve naming + ' ' + rem['#meta+item+url+list'] + '') # TODO: reduzir repetitividade; os valores estao hardcoded. Não ideal. - if '#item+i_la+i_lat+is_latn' in rem: - resultatum.append(' ') - resultatum.append( - ' ' + rem['#item+i_la+i_lat+is_latn'] + '') + hattrsl = HXLTMUtil.hxllangattrs_list_from_item(rem) + # print(hattrsl) + for langattrs in hattrsl: + # print(langattrs) - if '#item+i_pt+i_por+is_latn' in rem: - resultatum.append(' ') - resultatum.append( - ' ' + rem['#item+i_pt+i_por+is_latn'] + '') + if '#item' + langattrs in rem: + bcp47 = HXLTMUtil.bcp47_from_hxlattrs(langattrs) + resultatum.append(' ') + resultatum.append( + ' ' + rem['#item' + langattrs] + '') + resultatum.append(' ') - resultatum.append(' ') - if '#item+i_en+i_eng+is_latn' in rem: - resultatum.append(' ') - resultatum.append( - ' ' + rem['#item+i_en+i_eng+is_latn'] + '') - resultatum.append(' ') + # if '#item+i_la+i_lat+is_latn' in rem: + # resultatum.append(' ') + # resultatum.append( + # ' ' + rem['#item+i_la+i_lat+is_latn'] + '') + + # if '#item+i_pt+i_por+is_latn' in rem: + # resultatum.append(' ') + # resultatum.append( + # ' ' + rem['#item+i_pt+i_por+is_latn'] + '') + + # resultatum.append(' ') + # if '#item+i_en+i_eng+is_latn' in rem: + # resultatum.append(' ') + # resultatum.append( + # ' ' + rem['#item+i_en+i_eng+is_latn'] + '') - resultatum.append(' ') + resultatum.append(' ') resultatum.append(' ') resultatum.append('') @@ -620,18 +640,38 @@ def bcp47_from_hxlattrs(hashtag): return '' - # def hxlattrlangs_list_from_item(item): - # result = [] + def hxllangattrs_list_from_item(item): + """hxllangattrs_list_from_item get only the raw attr string part + that is repeated severa times and mean the same logical group. - # def hxlattrlangs_list_from_item(item): - # result = [] + Example: + >>> item = {'#item+i_pt+i_por+is_latn': + ... '','#item+i_pt+i_por+is_latn+alt+list': '', + ... '#meta+item+i_pt+i_por+is_latn': ''} + >>> HXLTMUtil.hxllangattrs_list_from_item(item) + {'+i_pt+i_por+is_latn'} - # for k in item: - # if k.startswith('#x_xliff'): - # if item[k] == '∅': - # item_neo[k] = None - # else: - # item_neo[k] = item[k] + Args: + item ([Dict]): An dict item + Returns: + [Set]: Set of unique HXL language attributes + """ + result = set() + + for k in item: + rawstr = '' + bcp47 = HXLTMUtil.bcp47_from_hxlattrs(k) + iso6393 = HXLTMUtil.iso6393_from_hxlattrs(k) + iso115924 = HXLTMUtil.iso115924_from_hxlattrs(k) + if bcp47: + rawstr += '+i_' + bcp47 + if iso6393: + rawstr += '+i_' + iso6393 + if iso115924: + rawstr += '+is_' + iso115924 + # print(' ', k, ' ', rawstr) + result.add(rawstr) + return result def iso6393_from_hxlattrs(hashtag): """From a typical HXLTM hashtag, return only the ISO 639-3 language code @@ -642,6 +682,8 @@ def iso6393_from_hxlattrs(hashtag): 'arb' >>> HXLTMUtil.iso6393_from_hxlattrs('#item+i_ar') '' + >>> HXLTMUtil.iso6393_from_hxlattrs('#item+i_pt+i_por+is_latn+alt+list') + 'por' Args: hashtag ([String]): A hashtag string @@ -650,14 +692,15 @@ def iso6393_from_hxlattrs(hashtag): [String]: HXL Attributes """ if hashtag: - parts = hashtag.lower().split('+i_') + # parts = hashtag.lower().split('+i_') + parts = hashtag.lower().split('+') # '#item+i_ar+i_arb+is_arab' => ['#item', 'ar', 'arb+is_arab'] # print(parts) for k in parts: - if len(k) == 3: - return k - if len(k) == 11 and k.find('+is_') > -1: - return k.split('+is_')[0] + # if len(k) == 5 and k.find('+i_') == 0: + if len(k) == 5 and k.startswith('i_'): + # print(k.find('i_')) + return k.replace('i_', '') return '' @@ -702,10 +745,6 @@ def item_linguam_keys_grouped(item): Returns: [String]: HXL Attributes """ - - pattern = hxl.model.TagPattern.parse("#*+i_por") - print(pattern) - # print(item) alllangs = set() for k in item: @@ -717,95 +756,6 @@ def item_linguam_keys_grouped(item): # @see https://github.com/HXLStandard/libhxl-python/blob/main/hxl/model.py#L29 return '' - def item_to_hxlrow(item): - """Syntatic sugar for ad hoc usage of - HXLStandard/libhxl-python/blob/main/hxl/model.py. - - TODO: consider optimize the speed on how this is used on HXLTM script. - - Example: - >>> item = {'#item+i_pt+i_por+is_latn': '','#item+i_pt+i_por+is_latn+alt+list': '', '#meta+item+i_pt+i_por+is_latn': ''} - >>> HXLTMUtil.item_to_hxlrow(item) - 'arab' - - Args: - item (Union[Dict, List[Dict]]): [description] - """ - hxlcolumns = [] - hxlcolumnsvals = [] - for key in item: - hxlcolumns.append(hxl.model.Column.parse(key)) - if item[key] == '∅': - hxlcolumnsvals.append(None) - else: - hxlcolumnsvals.append(item[key]) - - HXLRow = hxl.model.Row(hxlcolumns, hxlcolumnsvals) - # queries = [hxl.model.RowQuery.parse('#item')] - # queries = hxl.model.RowQuery.parse('item+i_pt') - # print(queries) - - # result = hxl.data("https://example.org/data.csv") - # result = hxl.data([item]) - # result = hxl.data([ - # ["#item+i_pt+i_por+is_latn", '#item+i_pt+i_por+is_latn+alt+list', '#meta+item+i_pt+i_por+is_latn'], - # ["teste", "teste2|teste3", "Exemplo de teste"] - # ]) - result = hxl.io.make_input([ - # ["#item+i_pt+i_por+is_latn", '#item+i_pt+i_por+is_latn+alt+list', - # '#meta+item+i_pt+i_por+is_latn'], - # ["teste", "teste2|teste3", "Exemplo de teste"] - ['#item+i_pt+i_por+is_latn,#item+i_pt+i_por+is_latn+alt+list,#meta+item+i_pt+i_por+is_latn'], - ['#item+i_pt+i_por+is_latn,#item+i_pt+i_por+is_latn+alt+list,#meta+item+i_pt+i_por+is_latn'], - ["teste,teste2|teste3,Exemplo de teste"], - ["teste,teste2|teste3,Exemplo de teste"] - ]) - result2 = hxl.data({'data': result}) - - print(result) - print(result2) - print(result2.values()) - for i in result2: - print(i) - result3 = result2.with_columns('#meta') - print(result3) - print(result3.values()) - for i2 in result3: - print(i2) - # print(result2.columns()) - # print(result.columns()) - return None - - # hxl.model.RowQuery.match_list(HXLRow) - - return HXLRow - - # def item_to_hxlrow(item_or_items): - # """Syntatic sugar for ad hoc usage of - # HXLStandard/libhxl-python/blob/main/hxl/model.py. - - # Likely to not be optimized. - - # Args: - # item (Union[Dict, List[Dict]]): [description] - # """ - # hxlcolumns = [] - # hxlcolumnsvals = [] - # if isinstance(item_or_items, dict): - # items = item_or_items - # else: - # items = item_or_items - - # for index, item in enumerate(items): - # for key in item: - # hxlcolumns.append(hxl.model.Column.parse(key)) - # if item[key] == '∅': - # hxlcolumnsvals.append(None) - # else: - # hxlcolumnsvals.append(item[key]) - - # HXLRow = hxl.model.Row(hxlcolumns, hxlcolumnsvals) - def linguam_2_hxlattrs(linguam): """linguam_2_hxlattrs