-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfix_2024_mojibake.py
executable file
·129 lines (115 loc) · 2.97 KB
/
fix_2024_mojibake.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python
import argparse
import re
import sys
import traceback
# Description shown by ``--help``; passed to argparse as the parser description.
HELP = """
Try to fix the mojibake found in the 2024 CAMP23 collection. It contains many
'Korean' characters like 큄 which are actually Czech characters like š. This
script uses a simple mapping table to convert wrong into right characters. The
original file is then replaced with a fixed version. Korean characters that
are not covered by the mapping are printed at the end. Run with --dry-run to
only print them without modifying any file. You may extend the mapping by
adding entries to the char_map table of this script. Be careful to keep the
encoding of the script as UTF-8.
"""
# Translation table for str.translate(): maps each wrongly decoded 'Korean'
# character to the text it should have been. Keys must be single characters;
# values may be any string, and an empty string deletes the character.
# Every key is listed exactly once: in a dict literal a repeated key silently
# overrides the earlier entry, which hides conflicting replacements.
# NOTE(review): the pairs look like UTF-8 bytes that were decoded with a
# Korean code page (CP949) - confirm before deriving new entries that way.
char_map = str.maketrans(
    {
        "큄": "š",
        "처": "ó",
        "챕": "é",
        "챌": "ç",
        "찾": "ã",
        "첵": "ý",
        "챠": "í",
        "찼": "á",
        "탑": "Ž",
        "흫": "ň",
        "휁": "ď",
        "척": "ô",
        "첬": "ú",
        "컁": "ľ",
        # Lowercase "č"; the uppercase "Č" is the separate key "훻" below.
        "훾": "č",
        "큐": "ť",
        "흦": "ń",
        "철": "ö",
        "첫": "ù",
        "챘": "ë",
        # NOTE(review): "혻" looks like a mis-decoded no-break space (U+00A0)
        # rather than "é" - confirm against the collection data.
        "혻": "é",
        "훳": "ą",
        "탉": "ż",
        "탄": "ź",
        "휌": "ę",
        "챵": "ò",
        "횪": "à",
        "체": "ü",
        "횩": "ß",
        "채": "ä",
        "횥": "Ü",
        "횉": "Ç",
        "횙": "Ó",
        "횒": "Í",
        "횁": "Á",
        "창": "â",
        "챗": "ê",
        "청": "û",
        "챦": "ï",
        "챙": "ì",
        "챤": "î",
        "챔": "è",
        "쨉": "µ",
        "힄": "ś",
        "횖": "Ð",
        "챰": "ñ",
        "첩": "ø",
        "횠": "Ø",
        "천": "õ",
        "책": "å",
        "훶": "ć",
        "쨈": "'",
        "횗": "Ñ",
        "횋": "É",
        "횞": "×",
        "훻": "Č",
        "흹": "œ",
        "흢": "ł",
        "째": "°",
        "횊": "È",
        # NOTE(review): multi-character replacement, presumably specific to
        # one spot in the collection - confirm.
        "횣": "Po",
        # The following characters are removed entirely.
        "쩔": "",
        "쨩": "",
        "징": "",
    }
)
# Matches one precomposed Hangul syllable (U+AC00 "가" through U+D7A3 "힣").
# Used after translation to find characters the mapping does not cover yet.
RE_KOREAN_CHARACTER = re.compile(r"[가-힣]")
def fix_2024_mojibake(path, dry_run):
    """Repair the mojibake in one file and report what could not be repaired.

    The file is read as UTF-8 and every character listed in ``char_map`` is
    replaced. Unless *dry_run* is true, the repaired text is written back to
    the same path (only if it actually differs from the original).

    Args:
        path: Path of the text file to repair.
        dry_run: If true, the file is analysed but never modified.

    Returns:
        A list of the characters matched by ``RE_KOREAN_CHARACTER`` that are
        still present after translation, one entry per occurrence.

    Raises:
        OSError: If the file cannot be read or written.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # newline="" disables newline translation so the file keeps its original
    # line endings; only the mapped characters are supposed to change.
    with open(path, "r", encoding="utf-8", newline="") as f:
        original = f.read()
    fixed = original.translate(char_map)
    unknown_chars = RE_KOREAN_CHARACTER.findall(fixed)
    if not dry_run and fixed != original:
        # Write explicitly as UTF-8. With the locale default encoding the
        # write could fail on the repaired characters after the file has
        # already been truncated, or silently change the file's encoding.
        with open(path, "w", encoding="utf-8", newline="") as f:
            f.write(fixed)
    return unknown_chars
def main(argv):
    """Command-line entry point: repair every file named in *argv*.

    Files are processed independently. If one of them fails, the traceback is
    printed and processing continues with the next file. At the end, the
    distinct unknown characters collected from all files are printed on a
    single line in sorted order.

    Args:
        argv: Command-line arguments without the program name.

    Returns:
        0 if every file was processed, 1 if at least one file failed.
    """
    parser = argparse.ArgumentParser(description=HELP)
    parser.add_argument("files", nargs="+")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args(argv)

    unknown_chars = set()
    failed = False
    for path in args.files:
        try:
            unknown_chars.update(fix_2024_mojibake(path, args.dry_run))
        except Exception:
            # Best effort: report the problem and carry on with the other
            # files, but remember the failure for the exit status.
            traceback.print_exc()
            failed = True

    # Sorted so the report is stable between runs.
    print("unknown characters: " + " ".join(sorted(unknown_chars)))
    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))