-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplain2sgm.py
executable file
·54 lines (45 loc) · 2.06 KB
/
plain2sgm.py
1
"""Wrap one or more plain-text files (one segment per line) in an SGML set.

Each input file becomes one <DOC> element whose sysid is E01, E02, ... in
the order the files were given; each line of a file becomes one <seg>
wrapped in its own <p>.
"""
import argparse
import codecs
import io


def build_set_header(set_type, setid, srclang, tgtlang):
    """Return the opening tag of the set element, terminated by a newline.

    Args:
        set_type: element name, e.g. "refset" or "srcset".
        setid: value of the ``setid`` attribute.
        srclang: value of the ``srclang`` attribute.
        tgtlang: value of the ``trglang`` attribute.
    """
    # Every attribute needs both its name and a balanced pair of quotes;
    # otherwise the tag cannot be parsed as name="value" pairs.
    return '<{0} setid = "{1}" srclang = "{2}" trglang = "{3}">\n'.format(
        set_type, setid, srclang, tgtlang
    )


def write_sgm(inputs, output, docid="AFC20030102.0015", set_type="refset",
              setid="mt03_chinese_eval", srclang="Chinese",
              tgtlang="English"):
    """Write the SGML set built from ``inputs`` to ``output`` (UTF-8).

    Args:
        inputs: sequence of paths to UTF-8 plain-text files, one segment
            per line.
        output: path of the SGML file to create (overwritten if present).
        docid: ``docid`` attribute shared by every <DOC> element.
        set_type: name of the enclosing element ("refset" or "srcset").
        setid: ``setid`` attribute of the enclosing element.
        srclang: ``srclang`` attribute of the enclosing element.
        tgtlang: ``trglang`` attribute of the enclosing element.

    Raises:
        IOError/OSError: if an input cannot be read or the output cannot
            be written.
        UnicodeDecodeError: if an input file is not valid UTF-8.
    """
    # The context manager closes the output even if reading an input fails.
    with codecs.open(output, "w", encoding="utf-8") as file_out:
        file_out.write(build_set_header(set_type, setid, srclang, tgtlang))
        for doc_index, path in enumerate(inputs, start=1):
            file_out.write(
                '<DOC docid = "{0}" sysid = "E{1:02d}">\n'.format(
                    docid, doc_index
                )
            )
            # io.open only treats \n, \r and \r\n as line ends. A codecs
            # stream reader splits with str.splitlines(), which also breaks
            # on characters such as U+2028, U+0085 or form feed and would
            # silently turn one sentence into several segments.
            with io.open(path, "r", encoding="utf-8") as file_in:
                for line_id, line in enumerate(file_in):
                    # Drop the line terminator so </seg> stays on the same
                    # line as its text, whether or not the final line of
                    # the file ends with a newline.
                    text = line.rstrip("\r\n")
                    # NOTE(review): the text is written verbatim; '&' and
                    # '<' are not entity-escaped. Confirm what the
                    # downstream scorer expects before changing this.
                    file_out.write("<p>\n")
                    file_out.write(
                        "<seg id=" + str(line_id) + "> " + text + " </seg>\n"
                    )
                    file_out.write("</p>\n")
            file_out.write("</DOC>\n")
        file_out.write("</" + set_type + ">\n")


def parse_args(argv=None):
    """Parse command-line options; ``argv=None`` means ``sys.argv[1:]``."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--inputs", type=str, nargs="+", required=True,
                        help="plain text")
    parser.add_argument("--docid", type=str, default="AFC20030102.0015",
                        help="doc id")
    parser.add_argument("--type", type=str, default="refset",
                        help="refset or srcset")
    parser.add_argument("--setid", type=str, default="mt03_chinese_eval",
                        help="set id")
    parser.add_argument("--srclang", type=str, default="Chinese",
                        help="source langauge")
    parser.add_argument("--tgtlang", type=str, default="English",
                        help="target langauge")
    parser.add_argument("--output", type=str, required=True)
    return parser.parse_args(argv)


def main(argv=None):
    """Command-line entry point: parse options and write the SGML file."""
    args = parse_args(argv)
    write_sgm(
        args.inputs,
        args.output,
        docid=args.docid,
        set_type=args.type,
        setid=args.setid,
        srclang=args.srclang,
        tgtlang=args.tgtlang,
    )


if __name__ == '__main__':
    main()