-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcaptcha_ocr.py
188 lines (156 loc) · 6.03 KB
/
captcha_ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
"""
Title: OCR model for reading Captchas
Author: [A_K_Nain](https://twitter.com/A_K_Nain)
Date created: 2020/06/14
Last modified: 2020/06/26
Description: How to implement an OCR model using CNNs, RNNs and CTC loss.
This example demonstrates a simple OCR model built with the Functional API. Apart from
combining CNN and RNN, it also illustrates how you can instantiate a new layer
and use it as an "Endpoint layer" for implementing CTC loss. For a detailed
guide to layer subclassing, please check out
[this page](https://keras.io/guides/making_new_layers_and_models_via_subclassing/)
in the developer guides.
"""
import os
import keras
import tensorflow as tf
from keras import layers
os.environ["KERAS_BACKEND"] = "tensorflow"
def ctc_batch_cost(
y_true, y_pred, input_length, label_length, num_to_char: layers.StringLookup
):
label_length = tf.cast(tf.squeeze(label_length, axis=-1), tf.int32)
input_length = tf.cast(tf.squeeze(input_length, axis=-1), tf.int32)
sparse_labels = tf.cast(ctc_label_dense_to_sparse(y_true, label_length), tf.int32)
results = keras.backend.ctc_decode(y_pred, input_length=input_length)[0][0][
:, : tf.shape(y_true)[1]
]
results = tf.map_fn(
tf.strings.reduce_join,
tf.map_fn(num_to_char, results, dtype="string"),
)
labels = tf.map_fn(tf.strings.reduce_join, num_to_char(y_true))
y_pred = tf.math.log(tf.transpose(y_pred, perm=[1, 0, 2]) + keras.backend.epsilon())
return (
tf.expand_dims(
tf.compat.v1.nn.ctc_loss(
inputs=y_pred, labels=sparse_labels, sequence_length=input_length
),
1,
),
tf.cast(
tf.equal(
tf.map_fn(tf.strings.upper, results),
tf.map_fn(tf.strings.upper, labels),
),
tf.int32,
),
)
def ctc_label_dense_to_sparse(labels, label_lengths):
label_shape = tf.shape(labels)
num_batches_tns = tf.stack([label_shape[0]])
max_num_labels_tns = tf.stack([label_shape[1]])
def range_less_than(old_input, current_input):
return tf.expand_dims(tf.range(tf.shape(old_input)[1]), 0) < tf.fill(
max_num_labels_tns, current_input
)
init = tf.cast(tf.fill([1, label_shape[1]], 0), tf.bool)
dense_mask = tf.compat.v1.scan(
range_less_than, label_lengths, initializer=init, parallel_iterations=1
)
dense_mask = dense_mask[:, 0, :]
label_array = tf.reshape(
tf.tile(tf.range(0, label_shape[1]), num_batches_tns), label_shape
)
label_ind = tf.compat.v1.boolean_mask(label_array, dense_mask)
batch_array = tf.transpose(
tf.reshape(
tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns),
tf.reverse(label_shape, [0]),
)
)
batch_ind = tf.compat.v1.boolean_mask(batch_array, dense_mask)
indices = tf.transpose(
tf.reshape(tf.concat([batch_ind, label_ind], axis=0), [2, -1])
)
vals_sparse = tf.compat.v1.gather_nd(labels, indices)
return tf.SparseTensor(
tf.cast(indices, tf.int64), vals_sparse, tf.cast(label_shape, tf.int64)
)
class CTCLayer(layers.Layer):
def __init__(self, name, num_to_char: layers.StringLookup):
super().__init__(name=name)
self.loss_fn = ctc_batch_cost
self.num_to_char = num_to_char
def call(self, y_true, y_pred):
# Compute the training-time loss value and add it
# to the layer using `self.add_loss()`.
batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
loss, acc = self.loss_fn(
y_true, y_pred, input_length, label_length, self.num_to_char
)
self.add_loss(loss)
self.add_metric(acc, "acc")
# At test time, just return the computed predictions
return y_pred
def build_model(
img_width: int,
img_height: int,
channels: int,
num_to_char: layers.StringLookup,
):
# Inputs to the model
input_img = layers.Input(
shape=(img_width, img_height, channels), name="image", dtype="float32"
)
labels = layers.Input(name="label", shape=(None,), dtype="float32")
# First conv block
x = layers.Conv2D(
32,
(3, 3),
activation="relu",
kernel_initializer="he_normal",
padding="same",
name="Conv1",
)(input_img)
x = layers.MaxPooling2D((2, 2), name="pool1")(x)
# Second conv block
x = layers.Conv2D(
64,
(3, 3),
activation="relu",
kernel_initializer="he_normal",
padding="same",
name="Conv2",
)(x)
x = layers.MaxPooling2D((2, 2), name="pool2")(x)
# We have used two max pool with pool size and strides 2.
# Hence, downsampled feature maps are 4x smaller. The number of
# filters in the last layer is 64. Reshape accordingly before
# passing the output to the RNN part of the model
new_shape = ((img_width // 4), (img_height // 4) * 64)
x = layers.Reshape(target_shape=new_shape, name="reshape")(x)
x = layers.Dense(64, activation="relu", name="dense1")(x)
x = layers.Dropout(0.2)(x)
# RNNs
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(x)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.25))(x)
# Output layer
x = layers.Dense(
len(num_to_char.get_vocabulary()) + 1, activation="softmax", name="dense2"
)(x)
# Add CTC layer for calculating CTC loss at each step
output = CTCLayer("ctc_loss", num_to_char)(labels, x)
# Define the model
model = keras.models.Model(
inputs=[input_img, labels], outputs=output, name="ocr_model_v1"
)
# Optimizer
opt = keras.optimizers.Adam()
# Compile the model and return
model.compile(optimizer=opt)
return model