Merge pull request #187 from eragonruan/dev

Dev
eragonruan · Jun 26, 2018 · ba431d9 · ba431d9
2 parents 37993c4 + afa58b3
commit ba431d9
Show file tree

Hide file tree

Showing 53 changed files with 4,413 additions and 1,644 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,9 +6,10 @@ logs/
 output/
 build/
 dist/
+checkpoints/
 .idea/
 *.py[cod]
 *.c[cod]
 *.so
 *.swp
-
+*.pb
diff --git a/README.md b/README.md
@@ -2,14 +2,20 @@
 
 text detection mainly based on ctpn (connectionist text proposal network). It is implemented in tensorflow. I use id card detect as an example to demonstrate the results, but it should be noticing that this model can be used in almost every horizontal scene text detection task. The origin paper can be found [here](https://arxiv.org/abs/1609.03605). Also, the origin repo in caffe can be found in [here](https://github.com/tianzhi0549/CTPN). For more detail about the paper and code, see this [blog](http://slade-ruan.me/2017/10/22/text-detection-ctpn/). If you got any questions, check the issue first, if the problem persists, open a new issue.
 ***
-# setup
-- requirements: tensorflow1.3, cython0.24, opencv-python, easydict,(recommend to install Anaconda)
-- if you do not have a gpu device,follow here to [setup](https://github.com/eragonruan/text-detection-ctpn/issues/43)
-- if you have a gpu device, build the library by
+# roadmap
+- [x] freeze the graph for convenient inference
+- [x] pure python, cython nms and cuda nms
+- [x] loss function as referred in paper
+- [x] oriented text connector
+- [x] BLSTM
+***
+# demo
+- for a quick demo, you don't have to build the library; simply use demo_pb.py for inference.
+- download the pb file from [release](https://github.com/eragonruan/text-detection-ctpn/releases)
+- put ctpn.pb in data/
+- put your images in data/demo and run the demo from the repository root; the results will be saved in data/results
 ```shell
-cd lib/utils
-chmod +x make.sh
-./make.sh
+python ./ctpn/demo_pb.py
 ```
 ***
 # parameters
@@ -18,14 +24,16 @@ there are some parameters you may need to modify according to your requirement,
 - DETECT_MODE # H represents horizontal mode, O represents oriented mode, default is H
 - checkpoints_path # the model I provided is in checkpoints/, if you train the model by yourself,it will be saved in output/
 ***
-# demo
-- download the checkpoints from release, unzip it in checkpoints/
-- put your images in data/demo, the results will be saved in data/results, and run demo in the root 
+# training
+## setup
+- requirements: python2.7, tensorflow1.3, cython0.24, opencv-python, easydict (installing Anaconda is recommended)
+- if you do not have a gpu device, follow the [setup](https://github.com/eragonruan/text-detection-ctpn/issues/43) instructions in this issue
+- if you have a gpu device, build the library by
 ```shell
-python ./ctpn/demo.py
+cd lib/utils
+chmod +x make.sh
+./make.sh
 ```
-***
-# training
 ## prepare data
 - First, download the pre-trained model of VGG net and put it in data/pretrain/VGG_imagenet.npy. you can download it from [google drive](https://drive.google.com/open?id=0B_WmJoEtfQhDRl82b1dJTjB2ZGc) or [baidu yun](https://pan.baidu.com/s/1kUNTl1l). 
 - Second, prepare the training data as referred in paper, or you can download the data I prepared from [google drive](https://drive.google.com/open?id=0B_WmJoEtfGhDRl82b1dJTjB2ZGc) or [baidu yun](https://pan.baidu.com/s/1kUNTl1l). Or you can prepare your own data according to the following steps. 
@@ -52,17 +60,6 @@ python ./ctpn/train_net.py
 - The model I provided in checkpoints is trained on GTX1070 for 50k iters.
 - If you are using cuda nms, it takes about 0.2s per iter. So it will takes about 2.5 hours to finished 50k iterations.
 ***
-# roadmap
-- [x] cython nms
-- [x] cuda nms
-- [x] python2/python3 compatblity
-- [x] tensorflow1.3
-- [x] delete useless code
-- [x] loss function as referred in paper
-- [x] oriented text connector
-- [x] BLSTM
-- [ ] side refinement
-***
 # some results
 `NOTICE:` all the photos used below are collected from the internet. If it affects you, please contact me to delete them.
 <img src="/data/results/001.jpg" width=320 height=240 /><img src="/data/results/002.jpg" width=320 height=240 />

diff --git a/ctpn/__init__.py b/ctpn/__init__.py
@@ -1,2 +1 @@
-from . import text_proposal_connector
-from . import text_connect
+
diff --git a/ctpn/demo.py b/ctpn/demo.py
@@ -1,26 +1,31 @@
 from __future__ import print_function
-import tensorflow as tf
-import numpy as np
-import os, sys, cv2
+
+import cv2
 import glob
+import os
 import shutil
+import sys
+
+import numpy as np
+import tensorflow as tf
+
 sys.path.append(os.getcwd())
 from lib.networks.factory import get_network
-from lib.fast_rcnn.config import cfg,cfg_from_file
+from lib.fast_rcnn.config import cfg, cfg_from_file
 from lib.fast_rcnn.test import test_ctpn
 from lib.utils.timer import Timer
 from lib.text_connector.detectors import TextDetector
 from lib.text_connector.text_connect_cfg import Config as TextLineCfg
 
 
 def resize_im(im, scale, max_scale=None):
-    f=float(scale)/min(im.shape[0], im.shape[1])
-    if max_scale!=None and f*max(im.shape[0], im.shape[1])>max_scale:
-        f=float(max_scale)/max(im.shape[0], im.shape[1])
-    return cv2.resize(im, None,None, fx=f, fy=f,interpolation=cv2.INTER_LINEAR), f
+    f = float(scale) / min(im.shape[0], im.shape[1])
+    if max_scale != None and f * max(im.shape[0], im.shape[1]) > max_scale:
+        f = float(max_scale) / max(im.shape[0], im.shape[1])
+    return cv2.resize(im, None, None, fx=f, fy=f, interpolation=cv2.INTER_LINEAR), f
 
 
-def draw_boxes(img,image_name,boxes,scale):
+def draw_boxes(img, image_name, boxes, scale):
     base_name = image_name.split('/')[-1]
     with open('data/results/' + 'res_{}.txt'.format(base_name.split('.')[0]), 'w') as f:
         for box in boxes:
@@ -35,17 +40,18 @@ def draw_boxes(img,image_name,boxes,scale):
             cv2.line(img, (int(box[6]), int(box[7])), (int(box[2]), int(box[3])), color, 2)
             cv2.line(img, (int(box[4]), int(box[5])), (int(box[6]), int(box[7])), color, 2)
 
-            min_x = min(int(box[0]/scale),int(box[2]/scale),int(box[4]/scale),int(box[6]/scale))
-            min_y = min(int(box[1]/scale),int(box[3]/scale),int(box[5]/scale),int(box[7]/scale))
-            max_x = max(int(box[0]/scale),int(box[2]/scale),int(box[4]/scale),int(box[6]/scale))
-            max_y = max(int(box[1]/scale),int(box[3]/scale),int(box[5]/scale),int(box[7]/scale))
+            min_x = min(int(box[0] / scale), int(box[2] / scale), int(box[4] / scale), int(box[6] / scale))
+            min_y = min(int(box[1] / scale), int(box[3] / scale), int(box[5] / scale), int(box[7] / scale))
+            max_x = max(int(box[0] / scale), int(box[2] / scale), int(box[4] / scale), int(box[6] / scale))
+            max_y = max(int(box[1] / scale), int(box[3] / scale), int(box[5] / scale), int(box[7] / scale))
 
-            line = ','.join([str(min_x),str(min_y),str(max_x),str(max_y)])+'\r\n'
+            line = ','.join([str(min_x), str(min_y), str(max_x), str(max_y)]) + '\r\n'
             f.write(line)
 
-    img=cv2.resize(img, None, None, fx=1.0/scale, fy=1.0/scale, interpolation=cv2.INTER_LINEAR)
+    img = cv2.resize(img, None, None, fx=1.0 / scale, fy=1.0 / scale, interpolation=cv2.INTER_LINEAR)
     cv2.imwrite(os.path.join("data/results", base_name), img)
 
+
 def ctpn(sess, net, image_name):
     timer = Timer()
     timer.tic()
@@ -62,7 +68,6 @@ def ctpn(sess, net, image_name):
            '{:d} object proposals').format(timer.total_time, boxes.shape[0]))
 
 
-
 if __name__ == '__main__':
     if os.path.exists("data/results/"):
         shutil.rmtree("data/results/")
@@ -98,4 +103,3 @@ def ctpn(sess, net, image_name):
         print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
         print(('Demo for {:s}'.format(im_name)))
         ctpn(sess, net, im_name)
-
diff --git a/ctpn/demo_pb.py b/ctpn/demo_pb.py
@@ -0,0 +1,98 @@
+from __future__ import print_function
+
+import glob
+import os
+import shutil
+import sys
+
+import cv2
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.platform import gfile
+
+sys.path.append(os.getcwd())
+from lib.fast_rcnn.config import cfg, cfg_from_file
+from lib.fast_rcnn.test import _get_blobs
+from lib.text_connector.detectors import TextDetector
+from lib.text_connector.text_connect_cfg import Config as TextLineCfg
+from lib.rpn_msr.proposal_layer_tf import proposal_layer
+
+
+def resize_im(im, scale, max_scale=None):
+    f = float(scale) / min(im.shape[0], im.shape[1])
+    if max_scale != None and f * max(im.shape[0], im.shape[1]) > max_scale:
+        f = float(max_scale) / max(im.shape[0], im.shape[1])
+    return cv2.resize(im, None, None, fx=f, fy=f, interpolation=cv2.INTER_LINEAR), f
+
+
+def draw_boxes(img, image_name, boxes, scale):
+    base_name = image_name.split('/')[-1]
+    with open('data/results/' + 'res_{}.txt'.format(base_name.split('.')[0]), 'w') as f:
+        for box in boxes:
+            if np.linalg.norm(box[0] - box[1]) < 5 or np.linalg.norm(box[3] - box[0]) < 5:
+                continue
+            if box[8] >= 0.9:
+                color = (0, 255, 0)
+            elif box[8] >= 0.8:
+                color = (255, 0, 0)
+            cv2.line(img, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])), color, 2)
+            cv2.line(img, (int(box[0]), int(box[1])), (int(box[4]), int(box[5])), color, 2)
+            cv2.line(img, (int(box[6]), int(box[7])), (int(box[2]), int(box[3])), color, 2)
+            cv2.line(img, (int(box[4]), int(box[5])), (int(box[6]), int(box[7])), color, 2)
+
+            min_x = min(int(box[0] / scale), int(box[2] / scale), int(box[4] / scale), int(box[6] / scale))
+            min_y = min(int(box[1] / scale), int(box[3] / scale), int(box[5] / scale), int(box[7] / scale))
+            max_x = max(int(box[0] / scale), int(box[2] / scale), int(box[4] / scale), int(box[6] / scale))
+            max_y = max(int(box[1] / scale), int(box[3] / scale), int(box[5] / scale), int(box[7] / scale))
+
+            line = ','.join([str(min_x), str(min_y), str(max_x), str(max_y)]) + '\r\n'
+            f.write(line)
+
+    img = cv2.resize(img, None, None, fx=1.0 / scale, fy=1.0 / scale, interpolation=cv2.INTER_LINEAR)
+    cv2.imwrite(os.path.join("data/results", base_name), img)
+
+
+if __name__ == '__main__':
+
+    if os.path.exists("data/results/"):
+        shutil.rmtree("data/results/")
+    os.makedirs("data/results/")
+
+    cfg_from_file('ctpn/text.yml')
+
+    # init session
+    config = tf.ConfigProto(allow_soft_placement=True)
+    sess = tf.Session(config=config)
+    with gfile.FastGFile('data/ctpn.pb', 'rb') as f:
+        graph_def = tf.GraphDef()
+        graph_def.ParseFromString(f.read())
+        sess.graph.as_default()
+        tf.import_graph_def(graph_def, name='')
+    sess.run(tf.global_variables_initializer())
+
+    input_img = sess.graph.get_tensor_by_name('Placeholder:0')
+    output_cls_prob = sess.graph.get_tensor_by_name('Reshape_2:0')
+    output_box_pred = sess.graph.get_tensor_by_name('rpn_bbox_pred/Reshape_1:0')
+
+    im_names = glob.glob(os.path.join(cfg.DATA_DIR, 'demo', '*.png')) + \
+               glob.glob(os.path.join(cfg.DATA_DIR, 'demo', '*.jpg'))
+
+    for im_name in im_names:
+        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
+        print(('Demo for {:s}'.format(im_name)))
+        img = cv2.imread(im_name)
+        img, scale = resize_im(img, scale=TextLineCfg.SCALE, max_scale=TextLineCfg.MAX_SCALE)
+        blobs, im_scales = _get_blobs(img, None)
+        if cfg.TEST.HAS_RPN:
+            im_blob = blobs['data']
+            blobs['im_info'] = np.array(
+                [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],
+                dtype=np.float32)
+        cls_prob, box_pred = sess.run([output_cls_prob, output_box_pred], feed_dict={input_img: blobs['data']})
+        rois, _ = proposal_layer(cls_prob, box_pred, blobs['im_info'], 'TEST', anchor_scales=cfg.ANCHOR_SCALES)
+
+        scores = rois[:, 0]
+        boxes = rois[:, 1:5] / im_scales[0]
+        textdetector = TextDetector()
+        boxes = textdetector.detect(boxes, scores[:, np.newaxis], img.shape[:2])
+        draw_boxes(img, im_name, boxes, scale)
diff --git a/ctpn/generate_pb.py b/ctpn/generate_pb.py
@@ -0,0 +1,41 @@
+from __future__ import print_function
+
+import os
+import sys
+
+import tensorflow as tf
+from tensorflow.python.framework.graph_util import convert_variables_to_constants
+
+sys.path.append(os.getcwd())
+from lib.networks.factory import get_network
+from lib.fast_rcnn.config import cfg, cfg_from_file
+
+if __name__ == "__main__":
+    cfg_from_file('ctpn/text.yml')
+
+    config = tf.ConfigProto(allow_soft_placement=True)
+    sess = tf.Session(config=config)
+    net = get_network("VGGnet_test")
+    print(('Loading network {:s}... '.format("VGGnet_test")), end=' ')
+    saver = tf.train.Saver()
+    try:
+        ckpt = tf.train.get_checkpoint_state(cfg.TEST.checkpoints_path)
+        print('Restoring from {}...'.format(ckpt.model_checkpoint_path), end=' ')
+        saver.restore(sess, ckpt.model_checkpoint_path)
+        print('done')
+    except:
+        raise 'Check your pretrained {:s}'.format(ckpt.model_checkpoint_path)
+    print(' done.')
+
+    print('all nodes are:\n')
+    graph = tf.get_default_graph()
+    input_graph_def = graph.as_graph_def()
+    node_names = [node.name for node in input_graph_def.node]
+    for x in node_names:
+        print(x)
+    output_node_names = 'Reshape_2,rpn_bbox_pred/Reshape_1'
+    output_graph_def = convert_variables_to_constants(sess, input_graph_def, output_node_names.split(','))
+    output_graph = 'data/ctpn.pb'
+    with tf.gfile.GFile(output_graph, 'wb') as f:
+        f.write(output_graph_def.SerializeToString())
+    sess.close()
diff --git a/ctpn/train_net.py b/ctpn/train_net.py
@@ -1,10 +1,8 @@
+import os.path
 import pprint
 import sys
-import os.path
 
 sys.path.append(os.getcwd())
-this_dir = os.path.dirname(__file__)
-
 from lib.fast_rcnn.train import get_training_roidb, train_net
 from lib.fast_rcnn.config import cfg_from_file, get_output_dir, get_log_dir
 from lib.datasets.factory import get_imdb

diff --git a/data/VOCdevkit2007 b/data/VOCdevkit2007
@@ -0,0 +1 @@
+/media/D/code/OCR/CTPN_LSTM/data/VOCdevkit
diff --git a/data/results/001.jpg b/data/results/001.jpg
diff --git a/data/results/002.jpg b/data/results/002.jpg
diff --git a/data/results/003.jpg b/data/results/003.jpg
diff --git a/data/results/004.jpg b/data/results/004.jpg
diff --git a/data/results/005.jpg b/data/results/005.jpg
diff --git a/data/results/006.jpg b/data/results/006.jpg
diff --git a/data/results/007.jpg b/data/results/007.jpg
diff --git a/data/results/008.jpg b/data/results/008.jpg
diff --git a/data/results/009.jpg b/data/results/009.jpg
diff --git a/data/results/010.png b/data/results/010.png
diff --git a/data/results/res_001.txt b/data/results/res_001.txt
@@ -1,6 +1,6 @@
-54,190,433,234
+54,191,433,234
 54,267,505,311
-54,349,595,393
+54,350,595,392
 54,105,306,155
-36,548,884,594
+36,549,884,592
 162,398,469,447
diff --git a/data/results/res_002.txt b/data/results/res_002.txt
@@ -1,6 +1,6 @@
 104,172,192,193
 48,238,416,260
-48,77,224,102
+48,78,80,94
 48,148,296,168
 48,39,192,63
 48,113,256,131
diff --git a/data/results/res_003.txt b/data/results/res_003.txt
@@ -1,6 +1,6 @@
 28,72,200,92
 78,159,135,179
 28,138,257,157
-28,223,378,240
-28,105,221,120
+28,224,378,240
+28,105,221,119
 21,35,142,58
diff --git a/data/results/res_004.txt b/data/results/res_004.txt
@@ -1,6 +1,6 @@
-56,414,668,446
-142,306,355,338
-56,268,469,298
-56,209,397,238
+56,415,668,445
+142,307,355,337
+56,269,469,298
+56,210,397,237
 56,95,270,130
 56,153,341,190
diff --git a/data/results/res_005.txt b/data/results/res_005.txt
@@ -1,6 +1,6 @@
-184,546,768,604
-184,651,890,708
-153,338,522,422
-153,444,706,515
-307,715,522,776
-153,899,1228,962
+184,548,768,602
+184,653,890,707
+153,337,522,422
+153,446,706,514
+307,717,522,775
+153,901,1228,960
diff --git a/data/results/res_006.txt b/data/results/res_006.txt
@@ -3,5 +3,5 @@
 179,118,691,237
 179,623,614,740
 153,952,947,1069
-102,0,512,30
+102,0,512,29
 230,800,921,906
diff --git a/data/results/res_007.txt b/data/results/res_007.txt
@@ -1,5 +1,5 @@
-0,653,254,675
-872,654,1018,676
-181,373,836,558
-436,287,545,387
+0,654,254,674
+872,655,1018,675
+181,374,836,556
+436,288,545,386
 345,100,654,310
Original file line number	Diff line number	Diff line change
		@@ -1,2 +1 @@
		from . import text_proposal_connector
		from . import text_connect