Skip to content

Commit

Permalink
Merge pull request #739 from popcion/main
Browse files Browse the repository at this point in the history
Add Pre-translation and Post-translation Replacement Dictionary; Fix The problem of filtering of text that cannot be disabled when both source and target languages are same; add filtered_out reason
  • Loading branch information
zyddnys authored Nov 13, 2024
2 parents 2ab7f74 + 12681db commit 8095938
Show file tree
Hide file tree
Showing 5 changed files with 209 additions and 12 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,13 @@ Colorizer: **mc2**
--save-text-file SAVE_TEXT_FILE Like --save-text but with a specified file path.
--filter-text FILTER_TEXT Filter regions by their text with a regex. Example
usage: --text-filter ".*badtext.*"
--pre-dict FILE_PATH        Path to the pre-translation dictionary file. One entry per line,
Comments can be added with `#` and `//`.
usage: //Example
dog cat #Example
abc def
abc
--post-dict FILE_PATH Path to the post-translation dictionary file. Same as above.
--skip-lang                      Skip translation if the source image is in one of the provided languages,
use comma to separate multiple languages. Example: JPN,ENG
--prep-manual Prepare for manual typesetting by outputting blank,
Expand Down
9 changes: 9 additions & 0 deletions README_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,15 @@ FIL: Filipino (Tagalog)
--save-text-file SAVE_TEXT_FILE Like --save-text but with a specified file path.
--filter-text FILTER_TEXT Filter regions by their text with a regex. Example
usage: --text-filter ".*badtext.*"
--filter-text FILTER_TEXT Filter regions by their text with a regex. Example
usage: --text-filter ".*badtext.*"
--pre-dict FILE_PATH        Path to the pre-translation dictionary file. One entry per line,
Comments can be added with `#` and `//`.
usage: //Example
dog cat #Example
abc def
abc
--post-dict FILE_PATH       Path to the post-translation dictionary file. Same as above.
--skip-lang                      Skip translation if the source image is in one of the provided languages,
use comma to separate multiple languages. Example: JPN,ENG
--prep-manual Prepare for manual typesetting by outputting blank,
Expand Down
25 changes: 25 additions & 0 deletions manga_translator/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,41 @@ async def dispatch(args: Namespace):
if not args.input:
raise Exception('No input image was supplied. Use -i <image_path>')
translator = MangaTranslator(args_dict)

# Load pre-translation and post-translation dictionaries
pre_dict = translator.load_dictionary(args.pre_dict)
post_dict = translator.load_dictionary(args.post_dict)

if args.mode == 'demo':
if len(args.input) != 1 or not os.path.isfile(args.input[0]):
raise FileNotFoundError(f'Invalid single image file path for demo mode: "{" ".join(args.input)}". Use `-m batch`.')
dest = os.path.join(BASE_PATH, 'result/final.png')
args.overwrite = True # Do overwrite result/final.png file

# Apply pre-translation dictionaries
await translator.translate_path(args.input[0], dest, args_dict)
for textline in translator.textlines:
textline.text = translator.apply_dictionary(textline.text, pre_dict)
logger.info(f'Pre-translation dictionary applied: {textline.text}')

# Apply post-translation dictionaries
for textline in translator.textlines:
textline.translation = translator.apply_dictionary(textline.translation, post_dict)
logger.info(f'Post-translation dictionary applied: {textline.translation}')

else: # batch
dest = args.dest
for path in natural_sort(args.input):
# Apply pre-translation dictionaries
await translator.translate_path(path, dest, args_dict)
for textline in translator.textlines:
textline.text = translator.apply_dictionary(textline.text, pre_dict)
logger.info(f'Pre-translation dictionary applied: {textline.text}')

# Apply post-translation dictionaries
for textline in translator.textlines:
textline.translation = translator.apply_dictionary(textline.translation, post_dict)
logger.info(f'Post-translation dictionary applied: {textline.translation}')

elif args.mode == 'web':
from .server.web_main import dispatch
Expand Down
2 changes: 2 additions & 0 deletions manga_translator/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ def _format_action_invocation(self, action: argparse.Action) -> str:

parser.add_argument('--kernel-size', default=3, type=int, help='Set the convolution kernel size of the text erasure area to completely clean up text residues')

# Replacement dictionaries applied before / after translation (one regex entry per line).
parser.add_argument('--pre-dict', default=None, type=file_path, help='Path to the pre-translation dictionary file')
parser.add_argument('--post-dict', default=None, type=file_path, help='Path to the post-translation dictionary file')

# Generates dict with a default value for each argument
DEFAULT_ARGS = vars(parser.parse_args([]))
178 changes: 166 additions & 12 deletions manga_translator/manga_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,35 @@ async def translate(self, image: Image.Image, params: Union[dict, Context] = Non
# translate
return await self._translate(ctx)

def load_dictionary(self, file_path):
    """Load a regex replacement dictionary from a text file.

    Each non-empty line holds a regex pattern and an optional replacement,
    separated by whitespace. A single field means "delete matches" (the
    replacement defaults to ''). Blank lines and lines starting with '#' or
    '//' are comments; trailing '#'/'//' comments are stripped as well.

    Args:
        file_path: Path to the dictionary file, or None/missing path (returns []).

    Returns:
        A list of (compiled_pattern, replacement_str) tuples, in file order.
    """
    dictionary = []
    if file_path and os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, start=1):
                stripped = line.strip()
                # Skip blank lines and full-line comments
                if not stripped or stripped.startswith('#') or stripped.startswith('//'):
                    continue
                # Remove trailing comment parts
                stripped = stripped.split('#')[0].split('//')[0].strip()
                parts = stripped.split()
                if not parts:
                    # Line held only whitespace once the comment was removed
                    continue
                if len(parts) > 2:
                    logger.error(f'Invalid dictionary entry at line {line_number}: {stripped}')
                    continue
                try:
                    pattern = re.compile(parts[0])
                except re.error as e:
                    # A malformed regex previously crashed the entire load;
                    # report the offending line and keep the rest of the file.
                    logger.error(f'Invalid regex at line {line_number}: {parts[0]} ({e})')
                    continue
                # One field -> delete matches; two fields -> replace matches
                dictionary.append((pattern, parts[1] if len(parts) == 2 else ''))
    return dictionary

def apply_dictionary(self, text, dictionary):
    """Apply every (pattern, replacement) pair in *dictionary* to *text*.

    Pairs are applied in order, so later entries see the output of earlier
    ones. Returns the fully substituted string.
    """
    result = text
    for regex, replacement in dictionary:
        result = regex.sub(replacement, result)
    return result

def _preprocess_params(self, ctx: Context):
# params auto completion
# TODO: Move args into ctx.args and only calculate once, or just copy into ctx
Expand Down Expand Up @@ -411,6 +440,22 @@ async def _translate(self, ctx: Context) -> Context:
# If no text was found result is intermediate image product
ctx.result = ctx.upscaled
return await self._revert_upscale(ctx)

# Apply pre-dictionary after OCR
pre_dict = self.load_dictionary(ctx.pre_dict)
pre_replacements = []
for textline in ctx.textlines:
original = textline.text
textline.text = self.apply_dictionary(textline.text, pre_dict)
if original != textline.text:
pre_replacements.append(f"{original} => {textline.text}")

if pre_replacements:
logger.info("Pre-translation replacements:")
for replacement in pre_replacements:
logger.info(replacement)
else:
logger.info("No pre-translation replacements made.")

# -- Textline merge
await self._report_progress('textline_merge')
Expand Down Expand Up @@ -510,13 +555,20 @@ async def _run_textline_merge(self, ctx: Context):
or (not ctx.no_text_lang_skip and langcodes.tag_distance(region.source_lang, ctx.target_lang) == 0):
if region.text.strip():
logger.info(f'Filtered out: {region.text}')
if len(region.text) < ctx.min_text_length:
logger.info('Reason: Text length is less than the minimum required length.')
elif not is_valuable_text(region.text):
logger.info('Reason: Text is not considered valuable.')
elif langcodes.tag_distance(region.source_lang, ctx.target_lang) == 0:
logger.info('Reason: Text language matches the target language and no_text_lang_skip is False.')
else:
if ctx.font_color_fg or ctx.font_color_bg:
if ctx.font_color_bg:
region.adjust_bg_color = False
new_text_regions.append(region)
text_regions = new_text_regions


# Sort ctd (comic text detector) regions left to right. Otherwise right to left.
# Sorting will improve text translation quality.
text_regions = sort_regions(text_regions, right_to_left=True if ctx.detector != 'ctd' else False)
Expand All @@ -539,18 +591,120 @@ async def _run_text_translation(self, ctx: Context):
region._alignment = ctx.alignment
region._direction = ctx.direction

# Filter out regions by their translations
new_text_regions = []
for region in ctx.text_regions:
# TODO: Maybe print reasons for filtering
if not ctx.translator == 'none' and (region.translation.isnumeric() \
or ctx.filter_text and re.search(ctx.filter_text, region.translation)
or not ctx.translator == 'original' and region.text.lower().strip() == region.translation.lower().strip()):
if region.translation.strip():
logger.info(f'Filtered out: {region.translation}')
else:
new_text_regions.append(region)
return new_text_regions
# Apply post dictionary after translating
post_dict = self.load_dictionary(ctx.post_dict)
post_replacements = []
for region in ctx.text_regions:
original = region.translation
region.translation = self.apply_dictionary(region.translation, post_dict)
if original != region.translation:
post_replacements.append(f"{original} => {region.translation}")

if post_replacements:
logger.info("Post-translation replacements:")
for replacement in post_replacements:
logger.info(replacement)
else:
logger.info("No post-translation replacements made.")

# Filter out regions by their translations
new_text_regions = []

# List of languages with specific language detection
special_langs = ['CHS', 'CHT', 'JPN', 'KOR', 'IND', 'UKR', 'RUS', 'THA', 'ARA']

# Process special language scenarios
if ctx.target_lang in special_langs:
# Categorize regions
same_target_regions = [] # Target language regions with identical translation
diff_target_regions = [] # Target language regions with different translation
same_non_target_regions = [] # Non-target language regions with identical translation
diff_non_target_regions = [] # Non-target language regions with different translation

for region in ctx.text_regions:
text_equal = region.text.lower().strip() == region.translation.lower().strip()
has_target_lang = False

# Target language detection
if ctx.target_lang in ['CHS', 'CHT']: # Chinese
has_target_lang = bool(re.search('[\u4e00-\u9fff]', region.text))
elif ctx.target_lang == 'JPN': # Japanese
has_target_lang = bool(re.search('[\u3040-\u309f\u30a0-\u30ff\u4e00-\u9fff]', region.text))
elif ctx.target_lang == 'KOR': # Korean
has_target_lang = bool(re.search('[\uac00-\ud7af\u1100-\u11ff]', region.text))
elif ctx.target_lang == 'ARA': # Arabic
has_target_lang = bool(re.search('[\u0600-\u06ff]', region.text))
elif ctx.target_lang == 'THA': # Thai
has_target_lang = bool(re.search('[\u0e00-\u0e7f]', region.text))
elif ctx.target_lang == 'RUS': # Russian
has_target_lang = bool(re.search('[\u0400-\u04ff]', region.text))
elif ctx.target_lang == 'UKR': # Ukrainian
has_target_lang = bool(re.search('[\u0400-\u04ff]', region.text))
elif ctx.target_lang == 'IND': # Indonesian
has_target_lang = bool(re.search('[A-Za-z]', region.text))

# Skip numeric translations and filtered text
if region.translation.isnumeric():
logger.info(f'Filtered out: {region.translation}')
logger.info('Reason: Numeric translation')
continue

if ctx.filter_text and re.search(ctx.filter_text, region.translation):
logger.info(f'Filtered out: {region.translation}')
logger.info(f'Reason: Matched filter text: {ctx.filter_text}')
continue

if has_target_lang:
if text_equal:
logger.info(f'Filtered out: {region.translation}')
logger.info('Reason: Translation identical to original')
same_target_regions.append(region)
else:
diff_target_regions.append(region)
else:
if text_equal:
logger.info(f'Filtered out: {region.translation}')
logger.info('Reason: Translation identical to original')
same_non_target_regions.append(region)
else:
diff_non_target_regions.append(region)

# If any different translations exist, retain all target language regions
if diff_target_regions or diff_non_target_regions:
new_text_regions.extend(same_target_regions)
new_text_regions.extend(diff_target_regions)

# Retain all non-target language regions with different translations (It appears empty, it clears all contents.)
new_text_regions.extend(diff_non_target_regions)

else:
# Process non-special language scenarios using original logic
for region in ctx.text_regions:
should_filter = False
filter_reason = ""

if not ctx.translator == 'none':
if region.translation.isnumeric():
should_filter = True
filter_reason = "Numeric translation"
elif ctx.filter_text and re.search(ctx.filter_text, region.translation):
should_filter = True
filter_reason = f"Matched filter text: {ctx.filter_text}"
elif not ctx.translator == 'original':
text_equal = region.text.lower().strip() == region.translation.lower().strip()
if text_equal:
should_filter = True
filter_reason = "Translation identical to original"

if should_filter:
if region.translation.strip():
logger.info(f'Filtered out: {region.translation}')
logger.info(f'Reason: {filter_reason}')
else:
new_text_regions.append(region)

return new_text_regions


async def _run_mask_refinement(self, ctx: Context):
return await dispatch_mask_refinement(ctx.text_regions, ctx.img_rgb, ctx.mask_raw, 'fit_text',
Expand Down

0 comments on commit 8095938

Please sign in to comment.