From a3e22c1581842bcd37cc5a7585b686477933f12f Mon Sep 17 00:00:00 2001
From: Benjamin Kiessling <mittagessen@l.unchti.me>
Date: Mon, 22 Apr 2024 17:26:52 +0200
Subject: [PATCH] Add WER calculation to `ketos test` report

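Report word accuracy alongside character accuracy in the `ketos test`
output, computed with torchmetrics' CharErrorRate and WordErrorRate.

Also remove the `--repolygonize` option from `ketos pretrain`,
`ketos train`, and `ketos test`, together with the matching
`repolygonize` parameter in kraken/lib/train.py and
kraken/lib/pretrain/model.py.

Fixes #559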
---
 kraken/ketos/pretrain.py     | 12 +------
 kraken/ketos/recognition.py  | 61 +++++++++++++++++-------------------
 kraken/lib/pretrain/model.py |  7 -----
 kraken/lib/train.py          |  7 -----
 kraken/serialization.py      |  5 ++-
 kraken/templates/report      |  3 +-
 6 files changed, 36 insertions(+), 59 deletions(-)

diff --git a/kraken/ketos/pretrain.py b/kraken/ketos/pretrain.py
index 841d39191..89fdd92c2 100644
--- a/kraken/ketos/pretrain.py
+++ b/kraken/ketos/pretrain.py
@@ -141,15 +141,6 @@
 @click.option('--threads', show_default=True, default=1, type=click.IntRange(1), help='Maximum size of OpenMP/BLAS thread pool.')
 @click.option('--load-hyper-parameters/--no-load-hyper-parameters', show_default=True, default=False,
               help='When loading an existing model, retrieve hyperparameters from the model')
-@click.option('--repolygonize/--no-repolygonize', show_default=True,
-              default=False, help='Repolygonizes line data in ALTO/PageXML '
-              'files. This ensures that the trained model is compatible with the '
-              'segmenter in kraken even if the original image files either do '
-              'not contain anything but transcriptions and baseline information '
-              'or the polygon data was created using a different method. Will '
-              'be ignored in `path` mode. Note that this option will be slow '
-              'and will not scale input images to the same size as the segmenter '
-              'does.')
 @click.option('--force-binarization/--no-binarization', show_default=True,
               default=False, help='Forces input images to be binary, otherwise '
               'the appropriate color format will be auto-determined through the '
@@ -188,7 +179,7 @@ def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs,
              min_epochs, lag, min_delta, device, precision, optimizer, lrate, momentum,
              weight_decay, warmup, schedule, gamma, step_size, sched_patience,
              cos_max, cos_min_lr, partition, fixed_splits, training_files,
-             evaluation_files, workers, threads, load_hyper_parameters, repolygonize,
+             evaluation_files, workers, threads, load_hyper_parameters,
              force_binarization, format_type, augment,
              mask_probability, mask_width, num_negatives, logit_temp,
              ground_truth, legacy_polygons):
@@ -278,7 +269,6 @@ def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs,
                                      height=model.height,
                                      width=model.width,
                                      channels=model.channels,
-                                     repolygonize=repolygonize,
                                      force_binarization=force_binarization,
                                      format_type=format_type,
                                      legacy_polygons=legacy_polygons,)
diff --git a/kraken/ketos/recognition.py b/kraken/ketos/recognition.py
index e4e0b76ea..1296135c8 100644
--- a/kraken/ketos/recognition.py
+++ b/kraken/ketos/recognition.py
@@ -167,15 +167,6 @@
 @click.option('--threads', show_default=True, default=1, type=click.IntRange(1), help='Maximum size of OpenMP/BLAS thread pool.')
 @click.option('--load-hyper-parameters/--no-load-hyper-parameters', show_default=True, default=False,
               help='When loading an existing model, retrieve hyperparameters from the model')
-@click.option('--repolygonize/--no-repolygonize', show_default=True,
-              default=False, help='Repolygonizes line data in ALTO/PageXML '
-              'files. This ensures that the trained model is compatible with the '
-              'segmenter in kraken even if the original image files either do '
-              'not contain anything but transcriptions and baseline information '
-              'or the polygon data was created using a different method. Will '
-              'be ignored in `path` mode. Note that this option will be slow '
-              'and will not scale input images to the same size as the segmenter '
-              'does.')
 @click.option('--force-binarization/--no-binarization', show_default=True,
               default=False, help='Forces input images to be binary, otherwise '
               'the appropriate color format will be auto-determined through the '
@@ -203,7 +194,7 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
           step_size, sched_patience, cos_max, cos_min_lr, partition,
           fixed_splits, normalization, normalize_whitespace, codec, resize,
           reorder, base_dir, training_files, evaluation_files, workers,
-          threads, load_hyper_parameters, repolygonize, force_binarization,
+          threads, load_hyper_parameters, force_binarization,
           format_type, augment, pl_logger, log_dir, ground_truth,
           legacy_polygons):
     """
@@ -305,7 +296,6 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
                              binary_dataset_split=fixed_splits,
                              num_workers=workers,
                              load_hyper_parameters=load_hyper_parameters,
-                             repolygonize=repolygonize,
                              force_binarization=force_binarization,
                              format_type=format_type,
                              codec=codec,
@@ -385,15 +375,6 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
               default=None, help='Ground truth normalization')
 @click.option('-n', '--normalize-whitespace/--no-normalize-whitespace',
               show_default=True, default=True, help='Normalizes unicode whitespace')
-@click.option('--repolygonize/--no-repolygonize', show_default=True,
-              default=False, help='Repolygonizes line data in ALTO/PageXML '
-              'files. This ensures that the trained model is compatible with the '
-              'segmenter in kraken even if the original image files either do '
-              'not contain anything but transcriptions and baseline information '
-              'or the polygon data was created using a different method. Will '
-              'be ignored in `path` mode. Note, that this option will be slow '
-              'and will not scale input images to the same size as the segmenter '
-              'does.')
 @click.option('--force-binarization/--no-binarization', show_default=True,
               default=False, help='Forces input images to be binary, otherwise '
               'the appropriate color format will be auto-determined through the '
@@ -411,7 +392,7 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
 @click.option('--no-legacy-polygons', show_default=True, default=False, is_flag=True, help='Force disable the legacy polygon extractor.')
 def test(ctx, batch_size, model, evaluation_files, device, pad, workers,
          threads, reorder, base_dir, normalization, normalize_whitespace,
-         repolygonize, force_binarization, format_type, fixed_splits, test_set, no_legacy_polygons):
+         force_binarization, format_type, fixed_splits, test_set, no_legacy_polygons):
     """
     Evaluate on a test set.
     """
@@ -421,6 +402,8 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers,
     import numpy as np
     from torch.utils.data import DataLoader
 
+    from torchmetrics.text import CharErrorRate, WordErrorRate
+
     from kraken.lib import models, util
     from kraken.lib.dataset import (ArrowIPCRecognitionDataset,
                                     GroundTruthDataset, ImageInputTransforms,
@@ -475,15 +458,11 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers,
             dataset_kwargs["split_filter"] = "test"
 
     if format_type in ['xml', 'page', 'alto']:
-        if repolygonize:
-            message('Repolygonizing data')
         test_set = [{'page': XMLPage(file, filetype=format_type).to_container()} for file in test_set]
         valid_norm = False
         DatasetClass = partial(PolygonGTDataset, legacy_polygons=legacy_polygons)
     elif format_type == 'binary':
         DatasetClass = ArrowIPCRecognitionDataset
-        if repolygonize:
-            logger.warning('Repolygonization enabled in `binary` mode. Will be ignored.')
         test_set = [{'file': file} for file in test_set]
         valid_norm = False
     else:
@@ -491,8 +470,6 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers,
         if force_binarization:
             logger.warning('Forced binarization enabled in `path` mode. Will be ignored.')
             force_binarization = False
-        if repolygonize:
-            logger.warning('Repolygonization enabled in `path` mode. Will be ignored.')
         test_set = [{'line': util.parse_gt_path(img)} for img in test_set]
         valid_norm = True
 
@@ -502,7 +479,8 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers,
     if reorder and base_dir != 'auto':
         reorder = base_dir
 
-    acc_list = []
+    cer_list = []
+    wer_list = []
 
     with threadpool_limits(limits=threads):
         for p, net in nn.items():
@@ -539,6 +517,9 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers,
                                    pin_memory=pin_ds_mem,
                                    collate_fn=collate_sequences)
 
+            test_cer = CharErrorRate()
+            test_wer = WordErrorRate()
+
             with KrakenProgressBar() as progress:
                 batches = len(ds_loader)
                 pred_task = progress.add_task('Evaluating', total=batches, visible=True if not ctx.meta['verbose'] else False)
@@ -555,6 +536,9 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers,
                             algn_gt.extend(algn1)
                             algn_pred.extend(algn2)
                             error += c
+                            test_cer.update(x, y)
+                            test_wer.update(x, y)
+
                     except FileNotFoundError as e:
                         batches -= 1
                         progress.update(pred_task, total=batches)
@@ -565,10 +549,23 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers,
                         logger.warning(str(e))
                     progress.update(pred_task, advance=1)
 
-            acc_list.append((chars - error) / chars)
+            cer_list.append(1.0 - test_cer.compute())
+            wer_list.append(1.0 - test_wer.compute())
             confusions, scripts, ins, dels, subs = compute_confusions(algn_gt, algn_pred)
-            rep = render_report(p, chars, error, confusions, scripts, ins, dels, subs)
+            rep = render_report(p,
+                                chars,
+                                error,
+                                cer_list[-1],
+                                wer_list[-1],
+                                confusions,
+                                scripts,
+                                ins,
+                                dels,
+                                subs)
             logger.info(rep)
             message(rep)
-    logger.info('Average accuracy: {:0.2f}%, (stddev: {:0.2f})'.format(np.mean(acc_list) * 100, np.std(acc_list) * 100))
-    message('Average accuracy: {:0.2f}%, (stddev: {:0.2f})'.format(np.mean(acc_list) * 100, np.std(acc_list) * 100))
+
+    logger.info('Average character accuracy: {:0.2f}%, (stddev: {:0.2f})'.format(np.mean(cer_list) * 100, np.std(cer_list) * 100))
+    message('Average character accuracy: {:0.2f}%, (stddev: {:0.2f})'.format(np.mean(cer_list) * 100, np.std(cer_list) * 100))
+    logger.info('Average word accuracy: {:0.2f}%, (stddev: {:0.2f})'.format(np.mean(wer_list) * 100, np.std(wer_list) * 100))
+    message('Average word accuracy: {:0.2f}%, (stddev: {:0.2f})'.format(np.mean(wer_list) * 100, np.std(wer_list) * 100))
diff --git a/kraken/lib/pretrain/model.py b/kraken/lib/pretrain/model.py
index cd1d12e32..eb0d3cc57 100644
--- a/kraken/lib/pretrain/model.py
+++ b/kraken/lib/pretrain/model.py
@@ -84,7 +84,6 @@ def __init__(self,
                  width: int = 0,
                  channels: int = 1,
                  num_workers: int = 1,
-                 repolygonize: bool = False,
                  force_binarization: bool = False,
                  format_type: str = 'path',
                  pad: int = 16,
@@ -125,8 +124,6 @@ def __init__(self,
             valid_norm = False
         elif format_type == 'binary':
             DatasetClass = ArrowIPCRecognitionDataset
-            if repolygonize:
-                logger.warning('Repolygonization enabled in `binary` mode. Will be ignored.')
             valid_norm = False
             logger.info(f'Got {len(training_data)} binary dataset files for training data')
             training_data = [{'file': file} for file in training_data]
@@ -137,8 +134,6 @@ def __init__(self,
             if force_binarization:
                 logger.warning('Forced binarization enabled in `path` mode. Will be ignored.')
                 force_binarization = False
-            if repolygonize:
-                logger.warning('Repolygonization enabled in `path` mode. Will be ignored.')
             if binary_dataset_split:
                 logger.warning('Internal binary dataset splits are enabled but using non-binary dataset files. Will be ignored.')
                 binary_dataset_split = False
@@ -157,8 +152,6 @@ def __init__(self,
                 if force_binarization:
                     logger.warning('Forced binarization enabled with box lines. Will be ignored.')
                     force_binarization = False
-                if repolygonize:
-                    logger.warning('Repolygonization enabled with box lines. Will be ignored.')
                 if binary_dataset_split:
                     logger.warning('Internal binary dataset splits are enabled but using non-binary dataset files. Will be ignored.')
                     binary_dataset_split = False
diff --git a/kraken/lib/train.py b/kraken/lib/train.py
index ec9685aff..0a8bc7665 100644
--- a/kraken/lib/train.py
+++ b/kraken/lib/train.py
@@ -213,7 +213,6 @@ def __init__(self,
                  binary_dataset_split: bool = False,
                  num_workers: int = 1,
                  load_hyper_parameters: bool = False,
-                 repolygonize: bool = False,
                  force_binarization: bool = False,
                  format_type: Literal['path', 'alto', 'page', 'xml', 'binary'] = 'path',
                  codec: Optional[Dict] = None,
@@ -291,8 +290,6 @@ def __init__(self,
             valid_norm = False
         elif format_type == 'binary':
             DatasetClass = ArrowIPCRecognitionDataset
-            if repolygonize:
-                logger.warning('Repolygonization enabled in `binary` mode. Will be ignored.')
             valid_norm = False
             logger.info(f'Got {len(training_data)} binary dataset files for training data')
             training_data = [{'file': file} for file in training_data]
@@ -303,8 +300,6 @@ def __init__(self,
             if force_binarization:
                 logger.warning('Forced binarization enabled in `path` mode. Will be ignored.')
                 force_binarization = False
-            if repolygonize:
-                logger.warning('Repolygonization enabled in `path` mode. Will be ignored.')
             if binary_dataset_split:
                 logger.warning('Internal binary dataset splits are enabled but using non-binary dataset files. Will be ignored.')
                 binary_dataset_split = False
@@ -323,8 +318,6 @@ def __init__(self,
                 if force_binarization:
                     logger.warning('Forced binarization enabled with box lines. Will be ignored.')
                     force_binarization = False
-                if repolygonize:
-                    logger.warning('Repolygonization enabled with box lines. Will be ignored.')
                 if binary_dataset_split:
                     logger.warning('Internal binary dataset splits are enabled but using non-binary dataset files. Will be ignored.')
                     binary_dataset_split = False
diff --git a/kraken/serialization.py b/kraken/serialization.py
index b1e7ab9e8..f4fd4432d 100644
--- a/kraken/serialization.py
+++ b/kraken/serialization.py
@@ -247,6 +247,8 @@ def _load_template(name):
 def render_report(model: str,
                   chars: int,
                   errors: int,
+                  char_accuracy: float,
+                  word_accuracy: float,
                   char_confusions: 'Counter',
                   scripts: 'Counter',
                   insertions: 'Counter',
@@ -275,7 +277,8 @@ def render_report(model: str,
     report = {'model': model,
               'chars': chars,
               'errors': errors,
-              'accuracy': (chars-errors)/chars * 100,
+              'character_accuracy': char_accuracy * 100,
+              'word_accuracy': word_accuracy * 100,
               'insertions': sum(insertions.values()),
               'deletions': deletions,
               'substitutions': sum(substitutions.values()),
diff --git a/kraken/templates/report b/kraken/templates/report
index 264b8b6aa..abd81fbb2 100644
--- a/kraken/templates/report
+++ b/kraken/templates/report
@@ -2,7 +2,8 @@
 
 {{ report.chars }}	Characters
 {{ report.errors }}	Errors
-{{ '%0.2f'| format(report.accuracy) }}%	Accuracy
+{{ '%0.2f'| format(report.character_accuracy) }}%	Character Accuracy
+{{ '%0.2f'| format(report.word_accuracy) }}%	Word Accuracy
 
 {{ report.insertions }}	Insertions
 {{ report.deletions }}	Deletions