From 13d4e38387b2ed48c2c07a4c5d9e37be0bd0ef5c Mon Sep 17 00:00:00 2001 From: popcion Date: Wed, 25 Dec 2024 20:16:33 +0800 Subject: [PATCH 1/6] Update manga_translator.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 之前的代码复制少了 --- manga_translator/manga_translator.py | 63 ++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/manga_translator/manga_translator.py b/manga_translator/manga_translator.py index dd7343bb..de638575 100644 --- a/manga_translator/manga_translator.py +++ b/manga_translator/manga_translator.py @@ -352,6 +352,69 @@ async def _run_textline_merge(self, config: Config, ctx: Context): new_text_regions = [] for region in text_regions: + # Remove leading spaces and specified characters (after pre-translation dictionary replacement) + original_text = region.text + stripped_text = original_text.lstrip('、?!') + + # Record removed leading characters + removed_start_chars = original_text[:len(original_text) - len(stripped_text)] + if removed_start_chars: + logger.info(f'Removed leading characters: "{removed_start_chars}" from "{original_text}"') + + # Modified filtering condition: handle incomplete parentheses + # Combine left parentheses and left quotation marks into one list + left_symbols = ['(', '(', '[', '【', '{', '〔', '〈', '「', + '“', '‘', '《', '『', '"', '〝', '﹁', '﹃', + '⸂', '⸄', '⸉', '⸌', '⸜', '⸠', '‹', '«'] + + # Combine right parentheses and right quotation marks into one list + right_symbols = [')', ')', ']', '】', '}', '〕', '〉', '」', + '”', '’', '》', '』', '"', '〞', '﹂', '﹄', + '⸃', '⸅', '⸊', '⸍', '⸝', '⸡', '›', '»'] + # Combine all symbols + all_symbols = left_symbols + right_symbols + + # Count the number of left and right symbols + left_count = sum(stripped_text.count(s) for s in left_symbols) + right_count = sum(stripped_text.count(s) for s in right_symbols) + + # Check if the number of left and right symbols match + if left_count != right_count: + # Symbols don't match, remove 
all symbols + for s in all_symbols: + stripped_text = stripped_text.replace(s, '') + logger.info(f'Removed unpaired symbols from "{stripped_text}"') + + stripped_text = stripped_text.rstrip() + + # Replace double quotes with 『』, for translators often incorrectly change quotation marks from the source language to those commonly used in the target language, which is not align with some established translation conventions. + #while True: + # double_quote_index = -1 + + # for i in range(len(stripped_text)): + # if stripped_text[i] in ['"',"“","”"]: + # double_quote_index = i + # break + + # if double_quote_index == -1: + # break # No more double quotes found + + # left_quote_index = -1 + # for i in range(double_quote_index -1, -2, -1): + # if i == -1 or stripped_text[i] in all_symbols : + # left_quote_index = i + # break + + # right_quote_index = -1 + # for i in range(double_quote_index + 1, len(stripped_text) + 1): + # if i == len(stripped_text) or stripped_text[i] in all_symbols: + # right_quote_index = i + # break + + # if left_quote_index != -1 and right_quote_index != -1: + # stripped_text = stripped_text[:left_quote_index + 1] + '『' + stripped_text[left_quote_index + 2:right_quote_index-1] + '』' + stripped_text[right_quote_index:] + + region.text = stripped_text.strip() if len(region.text) >= config.ocr.min_text_length \ and not is_valuable_text(region.text) \ or (not config.translator.no_text_lang_skip and langcodes.tag_distance(region.source_lang, config.translator.target_lang) == 0): From ca5d4b111424a9b850deacf6f43436a2c3ac98e2 Mon Sep 17 00:00:00 2001 From: popcion Date: Wed, 25 Dec 2024 20:20:23 +0800 Subject: [PATCH 2/6] Update manga_translator.py --- manga_translator/manga_translator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manga_translator/manga_translator.py b/manga_translator/manga_translator.py index de638575..ffa19cfa 100644 --- a/manga_translator/manga_translator.py +++ b/manga_translator/manga_translator.py @@ -385,7 
+385,7 @@ async def _run_textline_merge(self, config: Config, ctx: Context): stripped_text = stripped_text.replace(s, '') logger.info(f'Removed unpaired symbols from "{stripped_text}"') - stripped_text = stripped_text.rstrip() + stripped_text = stripped_text.strip() # Replace double quotes with 『』, for translators often incorrectly change quotation marks from the source language to those commonly used in the target language, which is not align with some established translation conventions. #while True: From c6b54efa6e9a6c94a0f8bb5155e37e6edb13682b Mon Sep 17 00:00:00 2001 From: popcion Date: Wed, 25 Dec 2024 21:24:58 +0800 Subject: [PATCH 3/6] Update manga_translator.py remove JPN detection as it is not working --- manga_translator/manga_translator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manga_translator/manga_translator.py b/manga_translator/manga_translator.py index ffa19cfa..e8b6a641 100644 --- a/manga_translator/manga_translator.py +++ b/manga_translator/manga_translator.py @@ -477,7 +477,7 @@ async def _run_text_translation(self, config: Config, ctx: Context): new_text_regions = [] # List of languages with specific language detection - special_langs = ['CHS', 'CHT', 'JPN', 'KOR', 'IND', 'UKR', 'RUS', 'THA', 'ARA'] + special_langs = ['CHS', 'CHT', 'KOR', 'IND', 'UKR', 'RUS', 'THA', 'ARA'] # Process special language scenarios if config.translator.target_lang in special_langs: From 734083b0abffdf2e89ae86f796636631f4e08b0c Mon Sep 17 00:00:00 2001 From: popcion Date: Thu, 26 Dec 2024 23:22:23 +0800 Subject: [PATCH 4/6] Punctuation correction after translation --- manga_translator/manga_translator.py | 80 ++++++++++++++++++---------- 1 file changed, 51 insertions(+), 29 deletions(-) diff --git a/manga_translator/manga_translator.py b/manga_translator/manga_translator.py index e8b6a641..4de19aae 100644 --- a/manga_translator/manga_translator.py +++ b/manga_translator/manga_translator.py @@ -385,36 +385,8 @@ async def 
_run_textline_merge(self, config: Config, ctx: Context): stripped_text = stripped_text.replace(s, '') logger.info(f'Removed unpaired symbols from "{stripped_text}"') - stripped_text = stripped_text.strip() + region.text = stripped_text.strip() - # Replace double quotes with 『』, for translators often incorrectly change quotation marks from the source language to those commonly used in the target language, which is not align with some established translation conventions. - #while True: - # double_quote_index = -1 - - # for i in range(len(stripped_text)): - # if stripped_text[i] in ['"',"“","”"]: - # double_quote_index = i - # break - - # if double_quote_index == -1: - # break # No more double quotes found - - # left_quote_index = -1 - # for i in range(double_quote_index -1, -2, -1): - # if i == -1 or stripped_text[i] in all_symbols : - # left_quote_index = i - # break - - # right_quote_index = -1 - # for i in range(double_quote_index + 1, len(stripped_text) + 1): - # if i == len(stripped_text) or stripped_text[i] in all_symbols: - # right_quote_index = i - # break - - # if left_quote_index != -1 and right_quote_index != -1: - # stripped_text = stripped_text[:left_quote_index + 1] + '『' + stripped_text[left_quote_index + 2:right_quote_index-1] + '』' + stripped_text[right_quote_index:] - - region.text = stripped_text.strip() if len(region.text) >= config.ocr.min_text_length \ and not is_valuable_text(region.text) \ or (not config.translator.no_text_lang_skip and langcodes.tag_distance(region.source_lang, config.translator.target_lang) == 0): @@ -457,6 +429,56 @@ async def _run_text_translation(self, config: Config, ctx: Context): region._alignment = config.render.alignment region._direction = config.render.direction + # Punctuation correction logic: translators often incorrectly change quotation marks from the source language to those commonly used in the target language.
+ check_items = [ + ["(", "(", "「"], + ["(", "(", "「"], + [")", ")", "」"], + [")", ")", "」"], + ["「", "“", "‘", "『"], + ["」", "”", "’", "』"], + ["『", "“", "‘", "「"], + ["』", "”", "’", "」"], + ] + + replace_items = [ + ["「", "“"], + ["「", "‘"], + ["」", "”"], + ["」", "’"], + ] + + for region in ctx.text_regions: + if region.text and region.translation: + # Detect 「」 or 『』 in the source text + if '「' in region.text and '」' in region.text: + quote_type = '「」' + elif '『' in region.text and '』' in region.text: + quote_type = '『』' + else: + quote_type = None + + # If the source text has 「」 or 『』, and the translation has "", replace them + if quote_type and '"' in region.translation: + # Replace "" with 「」 or 『』 + if quote_type == '「」': + region.translation = re.sub(r'"([^"]*)"', r'「\1」', region.translation) + elif quote_type == '『』': + region.translation = re.sub(r'"([^"]*)"', r'『\1』', region.translation) + + # Correct ellipsis + region.translation = re.sub(r'\.{3}', '…', region.translation) + + # Check and replace other symbols + for v in check_items: + num_s = region.text.count(v[0]) + num_t = sum(region.translation.count(t) for t in v[1:]) + if num_s == num_t: + for t in v[1:]: + region.translation = region.translation.replace(t, v[0]) + for v in replace_items: + region.translation = region.translation.replace(v[1], v[0]) + # Apply post dictionary after translating post_dict = load_dictionary(self.post_dict) post_replacements = [] From a61be487207cfd56295e5bd013b3168e5e9936db Mon Sep 17 00:00:00 2001 From: popcion Date: Fri, 27 Dec 2024 01:01:57 +0800 Subject: [PATCH 5/6] Update manga_translator.py --- manga_translator/manga_translator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manga_translator/manga_translator.py b/manga_translator/manga_translator.py index 4de19aae..9ecf4eb1 100644 --- a/manga_translator/manga_translator.py +++ b/manga_translator/manga_translator.py @@ -352,9 +352,9 @@ async def _run_textline_merge(self, config: Config, 
ctx: Context): new_text_regions = [] for region in text_regions: - # Remove leading spaces and specified characters (after pre-translation dictionary replacement) + # Remove leading spaces after pre-translation dictionary replacement original_text = region.text - stripped_text = original_text.lstrip('、?!') + stripped_text = original_text.strip() # Record removed leading characters removed_start_chars = original_text[:len(original_text) - len(stripped_text)] From 56e423105b6e7b6053a17c742e239fdf328b234f Mon Sep 17 00:00:00 2001 From: popcion Date: Mon, 30 Dec 2024 20:39:42 +0800 Subject: [PATCH 6/6] Update chatgpt.py --- manga_translator/translators/chatgpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manga_translator/translators/chatgpt.py b/manga_translator/translators/chatgpt.py index 836276e9..fec375ae 100644 --- a/manga_translator/translators/chatgpt.py +++ b/manga_translator/translators/chatgpt.py @@ -357,7 +357,7 @@ async def _request_translation(self, to_lang: str, prompt: str) -> str: try: response = await self.client.chat.completions.create( - model='gpt-4o-mini', + model='gpt-4o', messages=messages, max_tokens=self._MAX_TOKENS // 2, temperature=self.temperature,