From a3e22c1581842bcd37cc5a7585b686477933f12f Mon Sep 17 00:00:00 2001 From: Benjamin Kiessling Date: Mon, 22 Apr 2024 17:26:52 +0200 Subject: [PATCH] Add WER calculation to `ketos test` report Fixes #559 --- kraken/ketos/pretrain.py | 12 +------ kraken/ketos/recognition.py | 61 +++++++++++++++++------------------- kraken/lib/pretrain/model.py | 7 ----- kraken/lib/train.py | 7 ----- kraken/serialization.py | 5 ++- kraken/templates/report | 3 +- 6 files changed, 36 insertions(+), 59 deletions(-) diff --git a/kraken/ketos/pretrain.py b/kraken/ketos/pretrain.py index 841d39191..89fdd92c2 100644 --- a/kraken/ketos/pretrain.py +++ b/kraken/ketos/pretrain.py @@ -141,15 +141,6 @@ @click.option('--threads', show_default=True, default=1, type=click.IntRange(1), help='Maximum size of OpenMP/BLAS thread pool.') @click.option('--load-hyper-parameters/--no-load-hyper-parameters', show_default=True, default=False, help='When loading an existing model, retrieve hyperparameters from the model') -@click.option('--repolygonize/--no-repolygonize', show_default=True, - default=False, help='Repolygonizes line data in ALTO/PageXML ' - 'files. This ensures that the trained model is compatible with the ' - 'segmenter in kraken even if the original image files either do ' - 'not contain anything but transcriptions and baseline information ' - 'or the polygon data was created using a different method. Will ' - 'be ignored in `path` mode. Note that this option will be slow ' - 'and will not scale input images to the same size as the segmenter ' - 'does.') @click.option('--force-binarization/--no-binarization', show_default=True, default=False, help='Forces input images to be binary, otherwise ' 'the appropriate color format will be auto-determined through the ' @@ -188,7 +179,7 @@ def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs, min_epochs, lag, min_delta, device, precision, optimizer, lrate, momentum, weight_decay, warmup, schedule, gamma, step_size, sched_patience, cos_max, cos_min_lr, partition, fixed_splits, training_files, - evaluation_files, workers, threads, load_hyper_parameters, repolygonize, + evaluation_files, workers, threads, load_hyper_parameters, force_binarization, format_type, augment, mask_probability, mask_width, num_negatives, logit_temp, ground_truth, legacy_polygons): @@ -278,7 +269,6 @@ def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs, height=model.height, width=model.width, channels=model.channels, - repolygonize=repolygonize, force_binarization=force_binarization, format_type=format_type, legacy_polygons=legacy_polygons,) diff --git a/kraken/ketos/recognition.py b/kraken/ketos/recognition.py index e4e0b76ea..1296135c8 100644 --- a/kraken/ketos/recognition.py +++ b/kraken/ketos/recognition.py @@ -167,15 +167,6 @@ @click.option('--threads', show_default=True, default=1, type=click.IntRange(1), help='Maximum size of OpenMP/BLAS thread pool.') @click.option('--load-hyper-parameters/--no-load-hyper-parameters', show_default=True, default=False, help='When loading an existing model, retrieve hyperparameters from the model') -@click.option('--repolygonize/--no-repolygonize', show_default=True, - default=False, help='Repolygonizes line data in ALTO/PageXML ' - 'files. This ensures that the trained model is compatible with the ' - 'segmenter in kraken even if the original image files either do ' - 'not contain anything but transcriptions and baseline information ' - 'or the polygon data was created using a different method. Will ' - 'be ignored in `path` mode. Note that this option will be slow ' - 'and will not scale input images to the same size as the segmenter ' - 'does.') @click.option('--force-binarization/--no-binarization', show_default=True, default=False, help='Forces input images to be binary, otherwise ' 'the appropriate color format will be auto-determined through the ' @@ -203,7 +194,7 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs, step_size, sched_patience, cos_max, cos_min_lr, partition, fixed_splits, normalization, normalize_whitespace, codec, resize, reorder, base_dir, training_files, evaluation_files, workers, - threads, load_hyper_parameters, repolygonize, force_binarization, + threads, load_hyper_parameters, force_binarization, format_type, augment, pl_logger, log_dir, ground_truth, legacy_polygons): """ @@ -305,7 +296,6 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs, binary_dataset_split=fixed_splits, num_workers=workers, load_hyper_parameters=load_hyper_parameters, - repolygonize=repolygonize, force_binarization=force_binarization, format_type=format_type, codec=codec, @@ -385,15 +375,6 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs, default=None, help='Ground truth normalization') @click.option('-n', '--normalize-whitespace/--no-normalize-whitespace', show_default=True, default=True, help='Normalizes unicode whitespace') -@click.option('--repolygonize/--no-repolygonize', show_default=True, - default=False, help='Repolygonizes line data in ALTO/PageXML ' - 'files. This ensures that the trained model is compatible with the ' - 'segmenter in kraken even if the original image files either do ' - 'not contain anything but transcriptions and baseline information ' - 'or the polygon data was created using a different method. Will ' - 'be ignored in `path` mode. Note, that this option will be slow ' - 'and will not scale input images to the same size as the segmenter ' - 'does.') @click.option('--force-binarization/--no-binarization', show_default=True, default=False, help='Forces input images to be binary, otherwise ' 'the appropriate color format will be auto-determined through the ' @@ -411,7 +392,7 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs, @click.option('--no-legacy-polygons', show_default=True, default=False, is_flag=True, help='Force disable the legacy polygon extractor.') def test(ctx, batch_size, model, evaluation_files, device, pad, workers, threads, reorder, base_dir, normalization, normalize_whitespace, - repolygonize, force_binarization, format_type, fixed_splits, test_set, no_legacy_polygons): + force_binarization, format_type, fixed_splits, test_set, no_legacy_polygons): """ Evaluate on a test set. """ @@ -421,6 +402,8 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers, import numpy as np from torch.utils.data import DataLoader + from torchmetrics.text import CharErrorRate, WordErrorRate + from kraken.lib import models, util from kraken.lib.dataset import (ArrowIPCRecognitionDataset, GroundTruthDataset, ImageInputTransforms, @@ -475,15 +458,11 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers, dataset_kwargs["split_filter"] = "test" if format_type in ['xml', 'page', 'alto']: - if repolygonize: - message('Repolygonizing data') test_set = [{'page': XMLPage(file, filetype=format_type).to_container()} for file in test_set] valid_norm = False DatasetClass = partial(PolygonGTDataset, legacy_polygons=legacy_polygons) elif format_type == 'binary': DatasetClass = ArrowIPCRecognitionDataset - if repolygonize: - logger.warning('Repolygonization enabled in `binary` mode. Will be ignored.') test_set = [{'file': file} for file in test_set] valid_norm = False else: @@ -491,8 +470,6 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers, if force_binarization: logger.warning('Forced binarization enabled in `path` mode. Will be ignored.') force_binarization = False - if repolygonize: - logger.warning('Repolygonization enabled in `path` mode. Will be ignored.') test_set = [{'line': util.parse_gt_path(img)} for img in test_set] valid_norm = True @@ -502,7 +479,8 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers, if reorder and base_dir != 'auto': reorder = base_dir - acc_list = [] + cer_list = [] + wer_list = [] with threadpool_limits(limits=threads): for p, net in nn.items(): @@ -539,6 +517,9 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers, pin_memory=pin_ds_mem, collate_fn=collate_sequences) + test_cer = CharErrorRate() + test_wer = WordErrorRate() + with KrakenProgressBar() as progress: batches = len(ds_loader) pred_task = progress.add_task('Evaluating', total=batches, visible=True if not ctx.meta['verbose'] else False) @@ -555,6 +536,9 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers, algn_gt.extend(algn1) algn_pred.extend(algn2) error += c + test_cer.update(x, y) + test_wer.update(x, y) + except FileNotFoundError as e: batches -= 1 progress.update(pred_task, total=batches) @@ -565,10 +549,23 @@ def test(ctx, batch_size, model, evaluation_files, device, pad, workers, logger.warning(str(e)) progress.update(pred_task, advance=1) - acc_list.append((chars - error) / chars) + cer_list.append(1.0 - test_cer.compute()) + wer_list.append(1.0 - test_wer.compute()) confusions, scripts, ins, dels, subs = compute_confusions(algn_gt, algn_pred) - rep = render_report(p, chars, error, confusions, scripts, ins, dels, subs) + rep = render_report(p, + chars, + error, + cer_list[-1], + wer_list[-1], + confusions, + scripts, + ins, + dels, + subs) logger.info(rep) message(rep) - logger.info('Average accuracy: {:0.2f}%, (stddev: {:0.2f})'.format(np.mean(acc_list) * 100, np.std(acc_list) * 100)) - message('Average accuracy: {:0.2f}%, (stddev: {:0.2f})'.format(np.mean(acc_list) * 100, np.std(acc_list) * 100)) + + logger.info('Average character accuracy: {:0.2f}%, (stddev: {:0.2f})'.format(np.mean(cer_list) * 100, np.std(cer_list) * 100)) + message('Average character accuracy: {:0.2f}%, (stddev: {:0.2f})'.format(np.mean(cer_list) * 100, np.std(cer_list) * 100)) + logger.info('Average word accuracy: {:0.2f}%, (stddev: {:0.2f})'.format(np.mean(wer_list) * 100, np.std(wer_list) * 100)) + message('Average word accuracy: {:0.2f}%, (stddev: {:0.2f})'.format(np.mean(wer_list) * 100, np.std(wer_list) * 100)) diff --git a/kraken/lib/pretrain/model.py b/kraken/lib/pretrain/model.py index cd1d12e32..eb0d3cc57 100644 --- a/kraken/lib/pretrain/model.py +++ b/kraken/lib/pretrain/model.py @@ -84,7 +84,6 @@ def __init__(self, width: int = 0, channels: int = 1, num_workers: int = 1, - repolygonize: bool = False, force_binarization: bool = False, format_type: str = 'path', pad: int = 16, @@ -125,8 +124,6 @@ def __init__(self, valid_norm = False elif format_type == 'binary': DatasetClass = ArrowIPCRecognitionDataset - if repolygonize: - logger.warning('Repolygonization enabled in `binary` mode. Will be ignored.') valid_norm = False logger.info(f'Got {len(training_data)} binary dataset files for training data') training_data = [{'file': file} for file in training_data] @@ -137,8 +134,6 @@ def __init__(self, if force_binarization: logger.warning('Forced binarization enabled in `path` mode. Will be ignored.') force_binarization = False - if repolygonize: - logger.warning('Repolygonization enabled in `path` mode. Will be ignored.') if binary_dataset_split: logger.warning('Internal binary dataset splits are enabled but using non-binary dataset files. Will be ignored.') binary_dataset_split = False @@ -157,8 +152,6 @@ def __init__(self, if force_binarization: logger.warning('Forced binarization enabled with box lines. Will be ignored.') force_binarization = False - if repolygonize: - logger.warning('Repolygonization enabled with box lines. Will be ignored.') if binary_dataset_split: logger.warning('Internal binary dataset splits are enabled but using non-binary dataset files. Will be ignored.') binary_dataset_split = False diff --git a/kraken/lib/train.py b/kraken/lib/train.py index ec9685aff..0a8bc7665 100644 --- a/kraken/lib/train.py +++ b/kraken/lib/train.py @@ -213,7 +213,6 @@ def __init__(self, binary_dataset_split: bool = False, num_workers: int = 1, load_hyper_parameters: bool = False, - repolygonize: bool = False, force_binarization: bool = False, format_type: Literal['path', 'alto', 'page', 'xml', 'binary'] = 'path', codec: Optional[Dict] = None, @@ -291,8 +290,6 @@ def __init__(self, valid_norm = False elif format_type == 'binary': DatasetClass = ArrowIPCRecognitionDataset - if repolygonize: - logger.warning('Repolygonization enabled in `binary` mode. Will be ignored.') valid_norm = False logger.info(f'Got {len(training_data)} binary dataset files for training data') training_data = [{'file': file} for file in training_data] @@ -303,8 +300,6 @@ def __init__(self, if force_binarization: logger.warning('Forced binarization enabled in `path` mode. Will be ignored.') force_binarization = False - if repolygonize: - logger.warning('Repolygonization enabled in `path` mode. Will be ignored.') if binary_dataset_split: logger.warning('Internal binary dataset splits are enabled but using non-binary dataset files. Will be ignored.') binary_dataset_split = False @@ -323,8 +318,6 @@ def __init__(self, if force_binarization: logger.warning('Forced binarization enabled with box lines. Will be ignored.') force_binarization = False - if repolygonize: - logger.warning('Repolygonization enabled with box lines. Will be ignored.') if binary_dataset_split: logger.warning('Internal binary dataset splits are enabled but using non-binary dataset files. Will be ignored.') binary_dataset_split = False diff --git a/kraken/serialization.py b/kraken/serialization.py index b1e7ab9e8..f4fd4432d 100644 --- a/kraken/serialization.py +++ b/kraken/serialization.py @@ -247,6 +247,8 @@ def _load_template(name): def render_report(model: str, chars: int, errors: int, + char_accuracy: float, + word_accuracy: float, char_confusions: 'Counter', scripts: 'Counter', insertions: 'Counter', @@ -275,7 +277,8 @@ def render_report(model: str, report = {'model': model, 'chars': chars, 'errors': errors, - 'accuracy': (chars-errors)/chars * 100, + 'character_accuracy': char_accuracy * 100, + 'word_accuracy': word_accuracy * 100, 'insertions': sum(insertions.values()), 'deletions': deletions, 'substitutions': sum(substitutions.values()), diff --git a/kraken/templates/report b/kraken/templates/report index 264b8b6aa..abd81fbb2 100644 --- a/kraken/templates/report +++ b/kraken/templates/report @@ -2,7 +2,8 @@ {{ report.chars }} Characters {{ report.errors }} Errors -{{ '%0.2f'| format(report.accuracy) }}% Accuracy +{{ '%0.2f'| format(report.character_accuracy) }}% Character Accuracy +{{ '%0.2f'| format(report.word_accuracy) }}% Word Accuracy {{ report.insertions }} Insertions {{ report.deletions }} Deletions