diff --git a/.travis.yml b/.travis.yml index c7c9c1e5087..6478c866904 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,7 @@ compiler: gcc env: global: - - NUM_THREADS=4 + - NUM_THREADS=8 matrix: # Use a build matrix to test many builds in parallel # envvar defaults: diff --git a/CMakeLists.txt b/CMakeLists.txt index 64aef17340e..6e18fd62b39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,8 +10,8 @@ endif() project(Caffe C CXX) # ---[ Caffe version -set(CAFFE_TARGET_VERSION "0.16.6") -set(CAFFE_TARGET_SOVERSION "0.16") +set(CAFFE_TARGET_VERSION "0.17.0") +set(CAFFE_TARGET_SOVERSION "0.17") add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION}) # Skip `typedef __half half;` @@ -53,7 +53,6 @@ caffe_option(BUILD_docs "Build documentation" ON IF UNIX OR APPLE) caffe_option(BUILD_python_layer "Build the Caffe Python layer" ON) caffe_option(USE_LEVELDB "Build with levelDB" ON) caffe_option(USE_LMDB "Build with lmdb" ON) -caffe_option(ALLOW_LMDB_NOLOCK "Allow MDB_NOLOCK when reading LMDB files (only if necessary)" OFF) caffe_option(TEST_FP16 "Build Caffe Tests with 16 bit mode included" OFF) caffe_option(NO_NVML "Build Caffe Tests without NVML (i.e. no CPU affinity)" OFF) diff --git a/LICENSE b/LICENSE index d69d16f5bc7..6b6633dfc1f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,11 @@ COPYRIGHT +All changes from Caffe SSD (https://github.com/weiliu89/caffe/tree/ssd) +Copyright (c) 2015, 2016 Wei Liu (UNC Chapel Hill), Dragomir Anguelov (Zoox), +Dumitru Erhan (Google), Christian Szegedy (Google), Scott Reed (UMich Ann Arbor), +Cheng-Yang Fu (UNC Chapel Hill), Alexander C. Berg (UNC Chapel Hill). +All rights reserved. + All contributions by the University of California: Copyright (c) 2014, 2015, The Regents of the University of California (Regents) All rights reserved. 
diff --git a/Makefile b/Makefile index 42a5b6cb167..858886cc775 100644 --- a/Makefile +++ b/Makefile @@ -35,8 +35,8 @@ LIBRARY_NAME := $(PROJECT)$(LIBRARY_NAME_SUFFIX) LIB_BUILD_DIR := $(BUILD_DIR)/lib STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a DYNAMIC_VERSION_MAJOR := 0 -DYNAMIC_VERSION_MINOR := 16 -DYNAMIC_VERSION_REVISION := 6 +DYNAMIC_VERSION_MINOR := 17 +DYNAMIC_VERSION_REVISION := 0 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR) DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_SONAME_SHORT).$(DYNAMIC_VERSION_REVISION) @@ -216,7 +216,7 @@ ifeq ($(USE_OPENCV), 1) LIBRARIES += opencv_core opencv_highgui opencv_imgproc ifeq ($(OPENCV_VERSION), 3) - LIBRARIES += opencv_imgcodecs + LIBRARIES += opencv_imgcodecs opencv_videoio endif endif @@ -292,7 +292,7 @@ ifeq ($(LINUX), 1) endif # boost::thread is reasonably called boost_thread (compare OS X) # We will also explicitly add stdc++ to the link target. - LIBRARIES += boost_thread stdc++ + LIBRARIES += boost_thread boost_regex stdc++ VERSIONFLAGS += -Wl,-soname,$(DYNAMIC_SONAME_SHORT) -Wl,-rpath,$(ORIGIN)/../lib endif @@ -376,9 +376,6 @@ ifeq ($(USE_LEVELDB), 1) endif ifeq ($(USE_LMDB), 1) COMMON_FLAGS += -DUSE_LMDB -ifeq ($(ALLOW_LMDB_NOLOCK), 1) - COMMON_FLAGS += -DALLOW_LMDB_NOLOCK -endif endif # New place for HDF5 diff --git a/README.md b/README.md index f95295482f3..2eed99237da 100644 --- a/README.md +++ b/README.md @@ -13,17 +13,19 @@ Here are the major features: * **Mixed-precision support**. It allows to store and/or compute data in either 64, 32 or 16 bit formats. Precision can be defined for every layer (forward and backward passes might be different too), or it can be set for the whole Net. +* **Layer-wise Adaptive Rate Control (LARC) and adaptive global gradient scaler** for better + accuracy, especially in 16-bit training. * **Integration with [cuDNN](https://developer.nvidia.com/cudnn) v7**. 
* **Automatic selection of the best cuDNN convolution algorithm**. * **Integration with v2.2 of [NCCL library](https://github.com/NVIDIA/nccl)** for improved multi-GPU scaling. * **Optimized GPU memory management** for data and parameters storage, I/O buffers and workspace for convolutional layers. -* **Parallel data parser and transformer** for improved I/O performance. +* **Parallel data parser, transformer and image reader** for improved I/O performance. * **Parallel back propagation and gradient reduction** on multi-GPU systems. * **Fast solvers implementation with fused CUDA kernels for weights and history update**. * **Multi-GPU test phase** for even memory load across multiple GPUs. -* **Backward compatibility with BVLC Caffe and NVCaffe 0.15**. +* **Backward compatibility with BVLC Caffe and NVCaffe 0.15 and higher**. * **Extended set of optimized models** (including 16 bit floating point examples). @@ -45,6 +47,6 @@ Please cite Caffe in your publications if it helps your research: Libturbojpeg library is used since 0.16.5. It has a packaging bug. 
Please execute the following (required for Makefile, optional for CMake): ``` -sudo apt-get install libturbojpeg libturbojpeg-dev +sudo apt-get install libturbojpeg sudo ln -s /usr/lib/x86_64-linux-gnu/libturbojpeg.so.0.1.0 /usr/lib/x86_64-linux-gnu/libturbojpeg.so ``` \ No newline at end of file diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake index 1db099279f8..5870984dd5a 100644 --- a/cmake/ConfigGen.cmake +++ b/cmake/ConfigGen.cmake @@ -53,14 +53,10 @@ function(caffe_generate_export_configs) set(Caffe_DEFINITIONS "") if(NOT HAVE_CUDA) set(HAVE_CUDA FALSE) - list(APPEND Caffe_DEFINITIONS -DCPU_ONLY) endif() if(USE_LMDB) list(APPEND Caffe_DEFINITIONS -DUSE_LMDB) - if (ALLOW_LMDB_NOLOCK) - list(APPEND Caffe_DEFINITIONS -DALLOW_LMDB_NOLOCK) - endif() endif() if(USE_LEVELDB) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 386333fd20e..c55c1116552 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -34,9 +34,6 @@ if(USE_LMDB) include_directories(SYSTEM ${LMDB_INCLUDE_DIR}) list(APPEND Caffe_LINKER_LIBS ${LMDB_LIBRARIES}) add_definitions(-DUSE_LMDB) - if(ALLOW_LMDB_NOLOCK) - add_definitions(-DALLOW_LMDB_NOLOCK) - endif() endif() # ---[ LevelDB @@ -62,14 +59,12 @@ list(APPEND Caffe_LINKER_LIBS ${JPEGTurbo_LIBRARIES}) include(cmake/Cuda.cmake) if(NOT HAVE_CUDA) message(SEND_ERROR "-- CUDA is not detected by cmake. Building without it...") - # TODO: remove this not cross platform define in future. Use caffe_config.h instead. 
- add_definitions(-DCPU_ONLY) endif() # ---[ OpenCV find_package(OpenCV QUIET COMPONENTS imgcodecs) if(OPENCV_IMGCODECS_FOUND) - find_package(OpenCV REQUIRED COMPONENTS core imgcodecs imgproc) + find_package(OpenCV REQUIRED COMPONENTS core imgcodecs highgui imgproc videoio) message(STATUS "Found OpenCV 3.x: ${OpenCV_CONFIG_PATH}") else() find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 467440761d5..b7505614c89 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -115,7 +115,6 @@ function(caffe_print_configuration_summary) caffe_status(" BUILD_docs : ${BUILD_docs}") caffe_status(" USE_LEVELDB : ${USE_LEVELDB}") caffe_status(" USE_LMDB : ${USE_LMDB}") - caffe_status(" ALLOW_LMDB_NOLOCK : ${ALLOW_LMDB_NOLOCK}") caffe_status(" TEST_FP16 : ${TEST_FP16}") caffe_status("") caffe_status("Dependencies:") diff --git a/data/ILSVRC2016/README.md b/data/ILSVRC2016/README.md new file mode 100644 index 00000000000..c4e5a6fc5a7 --- /dev/null +++ b/data/ILSVRC2016/README.md @@ -0,0 +1,29 @@ +### Preparation +#### ILSVRC2016 +We encourage you to register [ILSVRC2016](http://image-net.org/challenges/LSVRC/2016) and download the DET dataset. By default, we assume the data is stored in `$HOME/data/ILSVRC` and will call it `$ILSVRC_ROOT`. + +#### ILSVRC2015 +If you choose to use ILSVRC2015 DET dataset, here are a few noticeable steps before running the following scripts: + +1. There are a few problematic images. You can download the fixed ones [here](http://www.cs.unc.edu/~wliu/projects/SSD/ILSVRC2015_DET_fix.tar.gz). + +2. You should download the [val1/val2 split](http://www.cs.unc.edu/~wliu/projects/SSD/ILSVRC2015_DET_val1_val2.tar.gz), courtesy of [Ross Girshick](http://people.eecs.berkeley.edu/~rbg), and put it in `$ILSVRC_ROOT/ImageSets/DET`. + +### Remove an invalid file +Find the invalid image file `Data/DET/val/ILSVRC2013_val_00004542.JPEG`, and remove it. + +### Create the LMDB file. 
+After you have downloaded the dataset, we can create the lmdb files. + + ```Shell + cd $CAFFE_ROOT + # Create the trainval1.txt, val2.txt, val2_name_size.txt, test.txt and test_name_size.txt in data/ILSVRC2016/ + python data/ILSVRC2016/create_list.py + # You can modify the parameters in create_data.sh if needed. + # It will create lmdb files for trainval1, val2 and test with encoded original image: + # - $HOME/data/ILSVRC/lmdb/DET/ILSVRC2016_trainval1_lmdb + # - $HOME/data/ILSVRC/lmdb/DET/ILSVRC2016_val2_lmdb + # - $HOME/data/ILSVRC/lmdb/DET/ILSVRC2016_test_lmdb + # and make soft links at examples/ILSVRC2016/ + ./data/ILSVRC2016/create_data.sh + ``` diff --git a/data/ILSVRC2016/create_data.sh b/data/ILSVRC2016/create_data.sh new file mode 100644 index 00000000000..688db535824 --- /dev/null +++ b/data/ILSVRC2016/create_data.sh @@ -0,0 +1,30 @@ +cur_dir=$(cd $( dirname ${BASH_SOURCE[0]} ) && pwd ) +root_dir=$cur_dir/../.. + +cd $root_dir + +redo=false +data_root_dir="$HOME/data/ILSVRC" +dataset_name="ILSVRC2016" +mapfile="$root_dir/data/$dataset_name/labelmap_ilsvrc_det.prototxt" +db="lmdb" +min_dim=0 +max_dim=0 +width=0 +height=0 + +extra_cmd="--encode-type=jpg --encoded" +if $redo +then + extra_cmd="$extra_cmd --redo" +fi + +for dataset in test +do + python $root_dir/scripts/create_annoset.py --anno-type="classification" --label-map-file=$mapfile --min-dim=$min_dim --max-dim=$max_dim --resize-width=$width --resize-height=$height --check-label $extra_cmd $data_root_dir $root_dir/data/$dataset_name/$dataset".txt" $data_root_dir/$db/DET/$dataset_name"_"$dataset"_"$db examples/$dataset_name 2>&1 | tee $root_dir/data/$dataset_name/$dataset.log +done + +for dataset in val2 trainval1 +do + python $root_dir/scripts/create_annoset.py --anno-type="detection" --label-map-file=$mapfile --min-dim=$min_dim --max-dim=$max_dim --resize-width=$width --resize-height=$height --check-label $extra_cmd $data_root_dir $root_dir/data/$dataset_name/$dataset".txt" 
$data_root_dir/$db/DET/$dataset_name"_"$dataset"_"$db examples/$dataset_name 2>&1 | tee $root_dir/data/$dataset_name/$dataset.log +done diff --git a/data/ILSVRC2016/create_list.py b/data/ILSVRC2016/create_list.py new file mode 100644 index 00000000000..8e6dce2b8fc --- /dev/null +++ b/data/ILSVRC2016/create_list.py @@ -0,0 +1,109 @@ +import argparse +import os +from random import shuffle +import shutil +import subprocess +import sys + +HOMEDIR = os.path.expanduser("~") +CURDIR = os.path.dirname(os.path.realpath(__file__)) + +# If true, re-create all list files. +redo = False +# The root directory which holds all information of the dataset. +data_dir = "{}/data/ILSVRC".format(HOMEDIR) +# The directory name which holds the image sets. +imgset_dir = "ImageSets/DET" +# The direcotry which contains the images. +img_dir = "Data/DET" +img_ext = "JPEG" +# The directory which contains the annotations. +anno_dir = "Annotations/DET" +anno_ext = "xml" + +train_list_file = "{}/trainval1.txt".format(CURDIR) +val_list_file = "{}/val2.txt".format(CURDIR) +val_name_size_file = "{}/val2_name_size.txt".format(CURDIR) +test_list_file = "{}/test.txt".format(CURDIR) +test_name_size_file = "{}/test_name_size.txt".format(CURDIR) + +# Create training set. +# We follow Ross Girschick's split in R-CNN. +if redo or not os.path.exists(train_list_file): + datasets = ["train", "val1"] + img_files = [] + anno_files = [] + for dataset in datasets: + imgset_file = "{}/{}/{}.txt".format(data_dir, imgset_dir, dataset) + with open(imgset_file, "r") as f: + for line in f.readlines(): + name = line.strip("\n").split(" ")[0] + subset = name.split("/")[0].split("_")[1] + anno_file = "{}/{}/{}.{}".format(anno_dir, subset, name, anno_ext) + # Ignore image if it does not have annotation. These are the negative images in ILSVRC. 
+ if not os.path.exists("{}/{}".format(data_dir, anno_file)): + continue + img_file = "{}/{}/{}.{}".format(img_dir, subset, name, img_ext) + assert os.path.exists("{}/{}".format(data_dir, img_file)) + img_files.append(img_file) + anno_files.append(anno_file) + # Shuffle the images. + idx = [i for i in xrange(len(img_files))] + shuffle(idx) + with open(train_list_file, "w") as f: + for i in idx: + f.write("{} {}\n".format(img_files[i], anno_files[i])) + +if redo or not os.path.exists(val_list_file): + datasets = ["val2"] + subset = "val" + img_files = [] + anno_files = [] + for dataset in datasets: + imgset_file = "{}/{}/{}.txt".format(data_dir, imgset_dir, dataset) + with open(imgset_file, "r") as f: + for line in f.readlines(): + name = line.strip("\n").split(" ")[0] + img_file = "{}/{}/{}.{}".format(img_dir, subset, name, img_ext) + assert os.path.exists("{}/{}".format(data_dir, img_file)) + anno_file = "{}/{}/{}.{}".format(anno_dir, subset, name, anno_ext) + assert os.path.exists("{}/{}".format(data_dir, anno_file)) + img_files.append(img_file) + anno_files.append(anno_file) + with open(val_list_file, "w") as f: + for i in xrange(len(img_files)): + f.write("{} {}\n".format(img_files[i], anno_files[i])) + +if redo or not os.path.exists(val_name_size_file): + dataset = 'val2' + imgset_file = "{}/{}/{}.txt".format(data_dir, imgset_dir, dataset) + cmd = "{}/../../build/tools/get_image_size --name_id_file={} {} {} {}".format( + CURDIR, imgset_file, data_dir, val_list_file, val_name_size_file) + print cmd + process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) + output = process.communicate()[0] + +if redo or not os.path.exists(test_list_file): + datasets = ["test"] + subset = "test" + img_files = [] + for dataset in datasets: + imgset_file = "{}/{}/{}.txt".format(data_dir, imgset_dir, dataset) + with open(imgset_file, "r") as f: + for line in f.readlines(): + name = line.strip("\n").split(" ")[0] + img_file = "{}/{}/{}.{}".format(img_dir, subset, name, 
img_ext) + assert os.path.exists("{}/{}".format(data_dir, img_file)) + img_files.append(img_file) + with open(test_list_file, "w") as f: + for i in xrange(len(img_files)): + f.write("{} 0\n".format(img_files[i])) + +if redo or not os.path.exists(test_name_size_file): + dataset = 'test' + imgset_file = "{}/{}/{}.txt".format(data_dir, imgset_dir, dataset) + cmd = "{}/../../build/tools/get_image_size --name_id_file={} {} {} {}".format( + CURDIR, imgset_file, data_dir, test_list_file, test_name_size_file) + print cmd + process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) + output = process.communicate()[0] diff --git a/data/ILSVRC2016/labelmap_ilsvrc_clsloc.prototxt b/data/ILSVRC2016/labelmap_ilsvrc_clsloc.prototxt new file mode 100644 index 00000000000..1a27998cd53 --- /dev/null +++ b/data/ILSVRC2016/labelmap_ilsvrc_clsloc.prototxt @@ -0,0 +1,5005 @@ +item { + name: "none_of_the_above" + label: 0 + display_name: "background" +} +item { + name: "n02119789" + label: 1 + display_name: "kit_fox" +} +item { + name: "n02100735" + label: 2 + display_name: "English_setter" +} +item { + name: "n02110185" + label: 3 + display_name: "Siberian_husky" +} +item { + name: "n02096294" + label: 4 + display_name: "Australian_terrier" +} +item { + name: "n02102040" + label: 5 + display_name: "English_springer" +} +item { + name: "n02066245" + label: 6 + display_name: "grey_whale" +} +item { + name: "n02509815" + label: 7 + display_name: "lesser_panda" +} +item { + name: "n02124075" + label: 8 + display_name: "Egyptian_cat" +} +item { + name: "n02417914" + label: 9 + display_name: "ibex" +} +item { + name: "n02123394" + label: 10 + display_name: "Persian_cat" +} +item { + name: "n02125311" + label: 11 + display_name: "cougar" +} +item { + name: "n02423022" + label: 12 + display_name: "gazelle" +} +item { + name: "n02346627" + label: 13 + display_name: "porcupine" +} +item { + name: "n02077923" + label: 14 + display_name: "sea_lion" +} +item { + name: "n02110063" + label: 15 + 
display_name: "malamute" +} +item { + name: "n02447366" + label: 16 + display_name: "badger" +} +item { + name: "n02109047" + label: 17 + display_name: "Great_Dane" +} +item { + name: "n02089867" + label: 18 + display_name: "Walker_hound" +} +item { + name: "n02102177" + label: 19 + display_name: "Welsh_springer_spaniel" +} +item { + name: "n02091134" + label: 20 + display_name: "whippet" +} +item { + name: "n02092002" + label: 21 + display_name: "Scottish_deerhound" +} +item { + name: "n02071294" + label: 22 + display_name: "killer_whale" +} +item { + name: "n02442845" + label: 23 + display_name: "mink" +} +item { + name: "n02504458" + label: 24 + display_name: "African_elephant" +} +item { + name: "n02092339" + label: 25 + display_name: "Weimaraner" +} +item { + name: "n02098105" + label: 26 + display_name: "soft-coated_wheaten_terrier" +} +item { + name: "n02096437" + label: 27 + display_name: "Dandie_Dinmont" +} +item { + name: "n02114712" + label: 28 + display_name: "red_wolf" +} +item { + name: "n02105641" + label: 29 + display_name: "Old_English_sheepdog" +} +item { + name: "n02128925" + label: 30 + display_name: "jaguar" +} +item { + name: "n02091635" + label: 31 + display_name: "otterhound" +} +item { + name: "n02088466" + label: 32 + display_name: "bloodhound" +} +item { + name: "n02096051" + label: 33 + display_name: "Airedale" +} +item { + name: "n02117135" + label: 34 + display_name: "hyena" +} +item { + name: "n02138441" + label: 35 + display_name: "meerkat" +} +item { + name: "n02097130" + label: 36 + display_name: "giant_schnauzer" +} +item { + name: "n02493509" + label: 37 + display_name: "titi" +} +item { + name: "n02457408" + label: 38 + display_name: "three-toed_sloth" +} +item { + name: "n02389026" + label: 39 + display_name: "sorrel" +} +item { + name: "n02443484" + label: 40 + display_name: "black-footed_ferret" +} +item { + name: "n02110341" + label: 41 + display_name: "dalmatian" +} +item { + name: "n02089078" + label: 42 + display_name: 
"black-and-tan_coonhound" +} +item { + name: "n02086910" + label: 43 + display_name: "papillon" +} +item { + name: "n02445715" + label: 44 + display_name: "skunk" +} +item { + name: "n02093256" + label: 45 + display_name: "Staffordshire_bullterrier" +} +item { + name: "n02113978" + label: 46 + display_name: "Mexican_hairless" +} +item { + name: "n02106382" + label: 47 + display_name: "Bouvier_des_Flandres" +} +item { + name: "n02441942" + label: 48 + display_name: "weasel" +} +item { + name: "n02113712" + label: 49 + display_name: "miniature_poodle" +} +item { + name: "n02113186" + label: 50 + display_name: "Cardigan" +} +item { + name: "n02105162" + label: 51 + display_name: "malinois" +} +item { + name: "n02415577" + label: 52 + display_name: "bighorn" +} +item { + name: "n02356798" + label: 53 + display_name: "fox_squirrel" +} +item { + name: "n02488702" + label: 54 + display_name: "colobus" +} +item { + name: "n02123159" + label: 55 + display_name: "tiger_cat" +} +item { + name: "n02098413" + label: 56 + display_name: "Lhasa" +} +item { + name: "n02422699" + label: 57 + display_name: "impala" +} +item { + name: "n02114855" + label: 58 + display_name: "coyote" +} +item { + name: "n02094433" + label: 59 + display_name: "Yorkshire_terrier" +} +item { + name: "n02111277" + label: 60 + display_name: "Newfoundland" +} +item { + name: "n02132136" + label: 61 + display_name: "brown_bear" +} +item { + name: "n02119022" + label: 62 + display_name: "red_fox" +} +item { + name: "n02091467" + label: 63 + display_name: "Norwegian_elkhound" +} +item { + name: "n02106550" + label: 64 + display_name: "Rottweiler" +} +item { + name: "n02422106" + label: 65 + display_name: "hartebeest" +} +item { + name: "n02091831" + label: 66 + display_name: "Saluki" +} +item { + name: "n02120505" + label: 67 + display_name: "grey_fox" +} +item { + name: "n02104365" + label: 68 + display_name: "schipperke" +} +item { + name: "n02086079" + label: 69 + display_name: "Pekinese" +} +item { + name: 
"n02112706" + label: 70 + display_name: "Brabancon_griffon" +} +item { + name: "n02098286" + label: 71 + display_name: "West_Highland_white_terrier" +} +item { + name: "n02095889" + label: 72 + display_name: "Sealyham_terrier" +} +item { + name: "n02484975" + label: 73 + display_name: "guenon" +} +item { + name: "n02137549" + label: 74 + display_name: "mongoose" +} +item { + name: "n02500267" + label: 75 + display_name: "indri" +} +item { + name: "n02129604" + label: 76 + display_name: "tiger" +} +item { + name: "n02090721" + label: 77 + display_name: "Irish_wolfhound" +} +item { + name: "n02396427" + label: 78 + display_name: "wild_boar" +} +item { + name: "n02108000" + label: 79 + display_name: "EntleBucher" +} +item { + name: "n02391049" + label: 80 + display_name: "zebra" +} +item { + name: "n02412080" + label: 81 + display_name: "ram" +} +item { + name: "n02108915" + label: 82 + display_name: "French_bulldog" +} +item { + name: "n02480495" + label: 83 + display_name: "orangutan" +} +item { + name: "n02110806" + label: 84 + display_name: "basenji" +} +item { + name: "n02128385" + label: 85 + display_name: "leopard" +} +item { + name: "n02107683" + label: 86 + display_name: "Bernese_mountain_dog" +} +item { + name: "n02085936" + label: 87 + display_name: "Maltese_dog" +} +item { + name: "n02094114" + label: 88 + display_name: "Norfolk_terrier" +} +item { + name: "n02087046" + label: 89 + display_name: "toy_terrier" +} +item { + name: "n02100583" + label: 90 + display_name: "vizsla" +} +item { + name: "n02096177" + label: 91 + display_name: "cairn" +} +item { + name: "n02494079" + label: 92 + display_name: "squirrel_monkey" +} +item { + name: "n02105056" + label: 93 + display_name: "groenendael" +} +item { + name: "n02101556" + label: 94 + display_name: "clumber" +} +item { + name: "n02123597" + label: 95 + display_name: "Siamese_cat" +} +item { + name: "n02481823" + label: 96 + display_name: "chimpanzee" +} +item { + name: "n02105505" + label: 97 + display_name: 
"komondor" +} +item { + name: "n02088094" + label: 98 + display_name: "Afghan_hound" +} +item { + name: "n02085782" + label: 99 + display_name: "Japanese_spaniel" +} +item { + name: "n02489166" + label: 100 + display_name: "proboscis_monkey" +} +item { + name: "n02364673" + label: 101 + display_name: "guinea_pig" +} +item { + name: "n02114548" + label: 102 + display_name: "white_wolf" +} +item { + name: "n02134084" + label: 103 + display_name: "ice_bear" +} +item { + name: "n02480855" + label: 104 + display_name: "gorilla" +} +item { + name: "n02090622" + label: 105 + display_name: "borzoi" +} +item { + name: "n02113624" + label: 106 + display_name: "toy_poodle" +} +item { + name: "n02093859" + label: 107 + display_name: "Kerry_blue_terrier" +} +item { + name: "n02403003" + label: 108 + display_name: "ox" +} +item { + name: "n02097298" + label: 109 + display_name: "Scotch_terrier" +} +item { + name: "n02108551" + label: 110 + display_name: "Tibetan_mastiff" +} +item { + name: "n02493793" + label: 111 + display_name: "spider_monkey" +} +item { + name: "n02107142" + label: 112 + display_name: "Doberman" +} +item { + name: "n02096585" + label: 113 + display_name: "Boston_bull" +} +item { + name: "n02107574" + label: 114 + display_name: "Greater_Swiss_Mountain_dog" +} +item { + name: "n02107908" + label: 115 + display_name: "Appenzeller" +} +item { + name: "n02086240" + label: 116 + display_name: "Shih-Tzu" +} +item { + name: "n02102973" + label: 117 + display_name: "Irish_water_spaniel" +} +item { + name: "n02112018" + label: 118 + display_name: "Pomeranian" +} +item { + name: "n02093647" + label: 119 + display_name: "Bedlington_terrier" +} +item { + name: "n02397096" + label: 120 + display_name: "warthog" +} +item { + name: "n02437312" + label: 121 + display_name: "Arabian_camel" +} +item { + name: "n02483708" + label: 122 + display_name: "siamang" +} +item { + name: "n02097047" + label: 123 + display_name: "miniature_schnauzer" +} +item { + name: "n02106030" + 
label: 124 + display_name: "collie" +} +item { + name: "n02099601" + label: 125 + display_name: "golden_retriever" +} +item { + name: "n02093991" + label: 126 + display_name: "Irish_terrier" +} +item { + name: "n02110627" + label: 127 + display_name: "affenpinscher" +} +item { + name: "n02106166" + label: 128 + display_name: "Border_collie" +} +item { + name: "n02326432" + label: 129 + display_name: "hare" +} +item { + name: "n02108089" + label: 130 + display_name: "boxer" +} +item { + name: "n02097658" + label: 131 + display_name: "silky_terrier" +} +item { + name: "n02088364" + label: 132 + display_name: "beagle" +} +item { + name: "n02111129" + label: 133 + display_name: "Leonberg" +} +item { + name: "n02100236" + label: 134 + display_name: "German_short-haired_pointer" +} +item { + name: "n02486261" + label: 135 + display_name: "patas" +} +item { + name: "n02115913" + label: 136 + display_name: "dhole" +} +item { + name: "n02486410" + label: 137 + display_name: "baboon" +} +item { + name: "n02487347" + label: 138 + display_name: "macaque" +} +item { + name: "n02099849" + label: 139 + display_name: "Chesapeake_Bay_retriever" +} +item { + name: "n02108422" + label: 140 + display_name: "bull_mastiff" +} +item { + name: "n02104029" + label: 141 + display_name: "kuvasz" +} +item { + name: "n02492035" + label: 142 + display_name: "capuchin" +} +item { + name: "n02110958" + label: 143 + display_name: "pug" +} +item { + name: "n02099429" + label: 144 + display_name: "curly-coated_retriever" +} +item { + name: "n02094258" + label: 145 + display_name: "Norwich_terrier" +} +item { + name: "n02099267" + label: 146 + display_name: "flat-coated_retriever" +} +item { + name: "n02395406" + label: 147 + display_name: "hog" +} +item { + name: "n02112350" + label: 148 + display_name: "keeshond" +} +item { + name: "n02109961" + label: 149 + display_name: "Eskimo_dog" +} +item { + name: "n02101388" + label: 150 + display_name: "Brittany_spaniel" +} +item { + name: "n02113799" + 
label: 151 + display_name: "standard_poodle" +} +item { + name: "n02095570" + label: 152 + display_name: "Lakeland_terrier" +} +item { + name: "n02128757" + label: 153 + display_name: "snow_leopard" +} +item { + name: "n02101006" + label: 154 + display_name: "Gordon_setter" +} +item { + name: "n02115641" + label: 155 + display_name: "dingo" +} +item { + name: "n02097209" + label: 156 + display_name: "standard_schnauzer" +} +item { + name: "n02342885" + label: 157 + display_name: "hamster" +} +item { + name: "n02097474" + label: 158 + display_name: "Tibetan_terrier" +} +item { + name: "n02120079" + label: 159 + display_name: "Arctic_fox" +} +item { + name: "n02095314" + label: 160 + display_name: "wire-haired_fox_terrier" +} +item { + name: "n02088238" + label: 161 + display_name: "basset" +} +item { + name: "n02408429" + label: 162 + display_name: "water_buffalo" +} +item { + name: "n02133161" + label: 163 + display_name: "American_black_bear" +} +item { + name: "n02328150" + label: 164 + display_name: "Angora" +} +item { + name: "n02410509" + label: 165 + display_name: "bison" +} +item { + name: "n02492660" + label: 166 + display_name: "howler_monkey" +} +item { + name: "n02398521" + label: 167 + display_name: "hippopotamus" +} +item { + name: "n02112137" + label: 168 + display_name: "chow" +} +item { + name: "n02510455" + label: 169 + display_name: "giant_panda" +} +item { + name: "n02093428" + label: 170 + display_name: "American_Staffordshire_terrier" +} +item { + name: "n02105855" + label: 171 + display_name: "Shetland_sheepdog" +} +item { + name: "n02111500" + label: 172 + display_name: "Great_Pyrenees" +} +item { + name: "n02085620" + label: 173 + display_name: "Chihuahua" +} +item { + name: "n02123045" + label: 174 + display_name: "tabby" +} +item { + name: "n02490219" + label: 175 + display_name: "marmoset" +} +item { + name: "n02099712" + label: 176 + display_name: "Labrador_retriever" +} +item { + name: "n02109525" + label: 177 + display_name: 
"Saint_Bernard" +} +item { + name: "n02454379" + label: 178 + display_name: "armadillo" +} +item { + name: "n02111889" + label: 179 + display_name: "Samoyed" +} +item { + name: "n02088632" + label: 180 + display_name: "bluetick" +} +item { + name: "n02090379" + label: 181 + display_name: "redbone" +} +item { + name: "n02443114" + label: 182 + display_name: "polecat" +} +item { + name: "n02361337" + label: 183 + display_name: "marmot" +} +item { + name: "n02105412" + label: 184 + display_name: "kelpie" +} +item { + name: "n02483362" + label: 185 + display_name: "gibbon" +} +item { + name: "n02437616" + label: 186 + display_name: "llama" +} +item { + name: "n02107312" + label: 187 + display_name: "miniature_pinscher" +} +item { + name: "n02325366" + label: 188 + display_name: "wood_rabbit" +} +item { + name: "n02091032" + label: 189 + display_name: "Italian_greyhound" +} +item { + name: "n02129165" + label: 190 + display_name: "lion" +} +item { + name: "n02102318" + label: 191 + display_name: "cocker_spaniel" +} +item { + name: "n02100877" + label: 192 + display_name: "Irish_setter" +} +item { + name: "n02074367" + label: 193 + display_name: "dugong" +} +item { + name: "n02504013" + label: 194 + display_name: "Indian_elephant" +} +item { + name: "n02363005" + label: 195 + display_name: "beaver" +} +item { + name: "n02102480" + label: 196 + display_name: "Sussex_spaniel" +} +item { + name: "n02113023" + label: 197 + display_name: "Pembroke" +} +item { + name: "n02086646" + label: 198 + display_name: "Blenheim_spaniel" +} +item { + name: "n02497673" + label: 199 + display_name: "Madagascar_cat" +} +item { + name: "n02087394" + label: 200 + display_name: "Rhodesian_ridgeback" +} +item { + name: "n02127052" + label: 201 + display_name: "lynx" +} +item { + name: "n02116738" + label: 202 + display_name: "African_hunting_dog" +} +item { + name: "n02488291" + label: 203 + display_name: "langur" +} +item { + name: "n02091244" + label: 204 + display_name: "Ibizan_hound" +} 
+item { + name: "n02114367" + label: 205 + display_name: "timber_wolf" +} +item { + name: "n02130308" + label: 206 + display_name: "cheetah" +} +item { + name: "n02089973" + label: 207 + display_name: "English_foxhound" +} +item { + name: "n02105251" + label: 208 + display_name: "briard" +} +item { + name: "n02134418" + label: 209 + display_name: "sloth_bear" +} +item { + name: "n02093754" + label: 210 + display_name: "Border_terrier" +} +item { + name: "n02106662" + label: 211 + display_name: "German_shepherd" +} +item { + name: "n02444819" + label: 212 + display_name: "otter" +} +item { + name: "n01882714" + label: 213 + display_name: "koala" +} +item { + name: "n01871265" + label: 214 + display_name: "tusker" +} +item { + name: "n01872401" + label: 215 + display_name: "echidna" +} +item { + name: "n01877812" + label: 216 + display_name: "wallaby" +} +item { + name: "n01873310" + label: 217 + display_name: "platypus" +} +item { + name: "n01883070" + label: 218 + display_name: "wombat" +} +item { + name: "n04086273" + label: 219 + display_name: "revolver" +} +item { + name: "n04507155" + label: 220 + display_name: "umbrella" +} +item { + name: "n04147183" + label: 221 + display_name: "schooner" +} +item { + name: "n04254680" + label: 222 + display_name: "soccer_ball" +} +item { + name: "n02672831" + label: 223 + display_name: "accordion" +} +item { + name: "n02219486" + label: 224 + display_name: "ant" +} +item { + name: "n02317335" + label: 225 + display_name: "starfish" +} +item { + name: "n01968897" + label: 226 + display_name: "chambered_nautilus" +} +item { + name: "n03452741" + label: 227 + display_name: "grand_piano" +} +item { + name: "n03642806" + label: 228 + display_name: "laptop" +} +item { + name: "n07745940" + label: 229 + display_name: "strawberry" +} +item { + name: "n02690373" + label: 230 + display_name: "airliner" +} +item { + name: "n04552348" + label: 231 + display_name: "warplane" +} +item { + name: "n02692877" + label: 232 + display_name: 
"airship" +} +item { + name: "n02782093" + label: 233 + display_name: "balloon" +} +item { + name: "n04266014" + label: 234 + display_name: "space_shuttle" +} +item { + name: "n03344393" + label: 235 + display_name: "fireboat" +} +item { + name: "n03447447" + label: 236 + display_name: "gondola" +} +item { + name: "n04273569" + label: 237 + display_name: "speedboat" +} +item { + name: "n03662601" + label: 238 + display_name: "lifeboat" +} +item { + name: "n02951358" + label: 239 + display_name: "canoe" +} +item { + name: "n04612504" + label: 240 + display_name: "yawl" +} +item { + name: "n02981792" + label: 241 + display_name: "catamaran" +} +item { + name: "n04483307" + label: 242 + display_name: "trimaran" +} +item { + name: "n03095699" + label: 243 + display_name: "container_ship" +} +item { + name: "n03673027" + label: 244 + display_name: "liner" +} +item { + name: "n03947888" + label: 245 + display_name: "pirate" +} +item { + name: "n02687172" + label: 246 + display_name: "aircraft_carrier" +} +item { + name: "n04347754" + label: 247 + display_name: "submarine" +} +item { + name: "n04606251" + label: 248 + display_name: "wreck" +} +item { + name: "n03478589" + label: 249 + display_name: "half_track" +} +item { + name: "n04389033" + label: 250 + display_name: "tank" +} +item { + name: "n03773504" + label: 251 + display_name: "missile" +} +item { + name: "n02860847" + label: 252 + display_name: "bobsled" +} +item { + name: "n03218198" + label: 253 + display_name: "dogsled" +} +item { + name: "n02835271" + label: 254 + display_name: "bicycle-built-for-two" +} +item { + name: "n03792782" + label: 255 + display_name: "mountain_bike" +} +item { + name: "n03393912" + label: 256 + display_name: "freight_car" +} +item { + name: "n03895866" + label: 257 + display_name: "passenger_car" +} +item { + name: "n02797295" + label: 258 + display_name: "barrow" +} +item { + name: "n04204347" + label: 259 + display_name: "shopping_cart" +} +item { + name: "n03791053" + label: 260 
+ display_name: "motor_scooter" +} +item { + name: "n03384352" + label: 261 + display_name: "forklift" +} +item { + name: "n03272562" + label: 262 + display_name: "electric_locomotive" +} +item { + name: "n04310018" + label: 263 + display_name: "steam_locomotive" +} +item { + name: "n02704792" + label: 264 + display_name: "amphibian" +} +item { + name: "n02701002" + label: 265 + display_name: "ambulance" +} +item { + name: "n02814533" + label: 266 + display_name: "beach_wagon" +} +item { + name: "n02930766" + label: 267 + display_name: "cab" +} +item { + name: "n03100240" + label: 268 + display_name: "convertible" +} +item { + name: "n03594945" + label: 269 + display_name: "jeep" +} +item { + name: "n03670208" + label: 270 + display_name: "limousine" +} +item { + name: "n03770679" + label: 271 + display_name: "minivan" +} +item { + name: "n03777568" + label: 272 + display_name: "Model_T" +} +item { + name: "n04037443" + label: 273 + display_name: "racer" +} +item { + name: "n04285008" + label: 274 + display_name: "sports_car" +} +item { + name: "n03444034" + label: 275 + display_name: "go-kart" +} +item { + name: "n03445924" + label: 276 + display_name: "golfcart" +} +item { + name: "n03785016" + label: 277 + display_name: "moped" +} +item { + name: "n04252225" + label: 278 + display_name: "snowplow" +} +item { + name: "n03345487" + label: 279 + display_name: "fire_engine" +} +item { + name: "n03417042" + label: 280 + display_name: "garbage_truck" +} +item { + name: "n03930630" + label: 281 + display_name: "pickup" +} +item { + name: "n04461696" + label: 282 + display_name: "tow_truck" +} +item { + name: "n04467665" + label: 283 + display_name: "trailer_truck" +} +item { + name: "n03796401" + label: 284 + display_name: "moving_van" +} +item { + name: "n03977966" + label: 285 + display_name: "police_van" +} +item { + name: "n04065272" + label: 286 + display_name: "recreational_vehicle" +} +item { + name: "n04335435" + label: 287 + display_name: "streetcar" +} +item 
{ + name: "n04252077" + label: 288 + display_name: "snowmobile" +} +item { + name: "n04465501" + label: 289 + display_name: "tractor" +} +item { + name: "n03776460" + label: 290 + display_name: "mobile_home" +} +item { + name: "n04482393" + label: 291 + display_name: "tricycle" +} +item { + name: "n04509417" + label: 292 + display_name: "unicycle" +} +item { + name: "n03538406" + label: 293 + display_name: "horse_cart" +} +item { + name: "n03599486" + label: 294 + display_name: "jinrikisha" +} +item { + name: "n03868242" + label: 295 + display_name: "oxcart" +} +item { + name: "n02804414" + label: 296 + display_name: "bassinet" +} +item { + name: "n03125729" + label: 297 + display_name: "cradle" +} +item { + name: "n03131574" + label: 298 + display_name: "crib" +} +item { + name: "n03388549" + label: 299 + display_name: "four-poster" +} +item { + name: "n02870880" + label: 300 + display_name: "bookcase" +} +item { + name: "n03018349" + label: 301 + display_name: "china_cabinet" +} +item { + name: "n03742115" + label: 302 + display_name: "medicine_chest" +} +item { + name: "n03016953" + label: 303 + display_name: "chiffonier" +} +item { + name: "n04380533" + label: 304 + display_name: "table_lamp" +} +item { + name: "n03337140" + label: 305 + display_name: "file" +} +item { + name: "n03891251" + label: 306 + display_name: "park_bench" +} +item { + name: "n02791124" + label: 307 + display_name: "barber_chair" +} +item { + name: "n04429376" + label: 308 + display_name: "throne" +} +item { + name: "n03376595" + label: 309 + display_name: "folding_chair" +} +item { + name: "n04099969" + label: 310 + display_name: "rocking_chair" +} +item { + name: "n04344873" + label: 311 + display_name: "studio_couch" +} +item { + name: "n04447861" + label: 312 + display_name: "toilet_seat" +} +item { + name: "n03179701" + label: 313 + display_name: "desk" +} +item { + name: "n03982430" + label: 314 + display_name: "pool_table" +} +item { + name: "n03201208" + label: 315 + 
display_name: "dining_table" +} +item { + name: "n03290653" + label: 316 + display_name: "entertainment_center" +} +item { + name: "n04550184" + label: 317 + display_name: "wardrobe" +} +item { + name: "n07742313" + label: 318 + display_name: "Granny_Smith" +} +item { + name: "n07747607" + label: 319 + display_name: "orange" +} +item { + name: "n07749582" + label: 320 + display_name: "lemon" +} +item { + name: "n07753113" + label: 321 + display_name: "fig" +} +item { + name: "n07753275" + label: 322 + display_name: "pineapple" +} +item { + name: "n07753592" + label: 323 + display_name: "banana" +} +item { + name: "n07754684" + label: 324 + display_name: "jackfruit" +} +item { + name: "n07760859" + label: 325 + display_name: "custard_apple" +} +item { + name: "n07768694" + label: 326 + display_name: "pomegranate" +} +item { + name: "n12267677" + label: 327 + display_name: "acorn" +} +item { + name: "n12620546" + label: 328 + display_name: "hip" +} +item { + name: "n13133613" + label: 329 + display_name: "ear" +} +item { + name: "n11879895" + label: 330 + display_name: "rapeseed" +} +item { + name: "n12144580" + label: 331 + display_name: "corn" +} +item { + name: "n12768682" + label: 332 + display_name: "buckeye" +} +item { + name: "n03854065" + label: 333 + display_name: "organ" +} +item { + name: "n04515003" + label: 334 + display_name: "upright" +} +item { + name: "n03017168" + label: 335 + display_name: "chime" +} +item { + name: "n03249569" + label: 336 + display_name: "drum" +} +item { + name: "n03447721" + label: 337 + display_name: "gong" +} +item { + name: "n03720891" + label: 338 + display_name: "maraca" +} +item { + name: "n03721384" + label: 339 + display_name: "marimba" +} +item { + name: "n04311174" + label: 340 + display_name: "steel_drum" +} +item { + name: "n02787622" + label: 341 + display_name: "banjo" +} +item { + name: "n02992211" + label: 342 + display_name: "cello" +} +item { + name: "n04536866" + label: 343 + display_name: "violin" +} +item { 
+ name: "n03495258" + label: 344 + display_name: "harp" +} +item { + name: "n02676566" + label: 345 + display_name: "acoustic_guitar" +} +item { + name: "n03272010" + label: 346 + display_name: "electric_guitar" +} +item { + name: "n03110669" + label: 347 + display_name: "cornet" +} +item { + name: "n03394916" + label: 348 + display_name: "French_horn" +} +item { + name: "n04487394" + label: 349 + display_name: "trombone" +} +item { + name: "n03494278" + label: 350 + display_name: "harmonica" +} +item { + name: "n03840681" + label: 351 + display_name: "ocarina" +} +item { + name: "n03884397" + label: 352 + display_name: "panpipe" +} +item { + name: "n02804610" + label: 353 + display_name: "bassoon" +} +item { + name: "n03838899" + label: 354 + display_name: "oboe" +} +item { + name: "n04141076" + label: 355 + display_name: "sax" +} +item { + name: "n03372029" + label: 356 + display_name: "flute" +} +item { + name: "n11939491" + label: 357 + display_name: "daisy" +} +item { + name: "n12057211" + label: 358 + display_name: "yellow_lady\'s_slipper" +} +item { + name: "n09246464" + label: 359 + display_name: "cliff" +} +item { + name: "n09468604" + label: 360 + display_name: "valley" +} +item { + name: "n09193705" + label: 361 + display_name: "alp" +} +item { + name: "n09472597" + label: 362 + display_name: "volcano" +} +item { + name: "n09399592" + label: 363 + display_name: "promontory" +} +item { + name: "n09421951" + label: 364 + display_name: "sandbar" +} +item { + name: "n09256479" + label: 365 + display_name: "coral_reef" +} +item { + name: "n09332890" + label: 366 + display_name: "lakeside" +} +item { + name: "n09428293" + label: 367 + display_name: "seashore" +} +item { + name: "n09288635" + label: 368 + display_name: "geyser" +} +item { + name: "n03498962" + label: 369 + display_name: "hatchet" +} +item { + name: "n03041632" + label: 370 + display_name: "cleaver" +} +item { + name: "n03658185" + label: 371 + display_name: "letter_opener" +} +item { + name: 
"n03954731" + label: 372 + display_name: "plane" +} +item { + name: "n03995372" + label: 373 + display_name: "power_drill" +} +item { + name: "n03649909" + label: 374 + display_name: "lawn_mower" +} +item { + name: "n03481172" + label: 375 + display_name: "hammer" +} +item { + name: "n03109150" + label: 376 + display_name: "corkscrew" +} +item { + name: "n02951585" + label: 377 + display_name: "can_opener" +} +item { + name: "n03970156" + label: 378 + display_name: "plunger" +} +item { + name: "n04154565" + label: 379 + display_name: "screwdriver" +} +item { + name: "n04208210" + label: 380 + display_name: "shovel" +} +item { + name: "n03967562" + label: 381 + display_name: "plow" +} +item { + name: "n03000684" + label: 382 + display_name: "chain_saw" +} +item { + name: "n01514668" + label: 383 + display_name: "cock" +} +item { + name: "n01514859" + label: 384 + display_name: "hen" +} +item { + name: "n01518878" + label: 385 + display_name: "ostrich" +} +item { + name: "n01530575" + label: 386 + display_name: "brambling" +} +item { + name: "n01531178" + label: 387 + display_name: "goldfinch" +} +item { + name: "n01532829" + label: 388 + display_name: "house_finch" +} +item { + name: "n01534433" + label: 389 + display_name: "junco" +} +item { + name: "n01537544" + label: 390 + display_name: "indigo_bunting" +} +item { + name: "n01558993" + label: 391 + display_name: "robin" +} +item { + name: "n01560419" + label: 392 + display_name: "bulbul" +} +item { + name: "n01580077" + label: 393 + display_name: "jay" +} +item { + name: "n01582220" + label: 394 + display_name: "magpie" +} +item { + name: "n01592084" + label: 395 + display_name: "chickadee" +} +item { + name: "n01601694" + label: 396 + display_name: "water_ouzel" +} +item { + name: "n01608432" + label: 397 + display_name: "kite" +} +item { + name: "n01614925" + label: 398 + display_name: "bald_eagle" +} +item { + name: "n01616318" + label: 399 + display_name: "vulture" +} +item { + name: "n01622779" + label: 400 
+ display_name: "great_grey_owl" +} +item { + name: "n01795545" + label: 401 + display_name: "black_grouse" +} +item { + name: "n01796340" + label: 402 + display_name: "ptarmigan" +} +item { + name: "n01797886" + label: 403 + display_name: "ruffed_grouse" +} +item { + name: "n01798484" + label: 404 + display_name: "prairie_chicken" +} +item { + name: "n01806143" + label: 405 + display_name: "peacock" +} +item { + name: "n01806567" + label: 406 + display_name: "quail" +} +item { + name: "n01807496" + label: 407 + display_name: "partridge" +} +item { + name: "n01817953" + label: 408 + display_name: "African_grey" +} +item { + name: "n01818515" + label: 409 + display_name: "macaw" +} +item { + name: "n01819313" + label: 410 + display_name: "sulphur-crested_cockatoo" +} +item { + name: "n01820546" + label: 411 + display_name: "lorikeet" +} +item { + name: "n01824575" + label: 412 + display_name: "coucal" +} +item { + name: "n01828970" + label: 413 + display_name: "bee_eater" +} +item { + name: "n01829413" + label: 414 + display_name: "hornbill" +} +item { + name: "n01833805" + label: 415 + display_name: "hummingbird" +} +item { + name: "n01843065" + label: 416 + display_name: "jacamar" +} +item { + name: "n01843383" + label: 417 + display_name: "toucan" +} +item { + name: "n01847000" + label: 418 + display_name: "drake" +} +item { + name: "n01855032" + label: 419 + display_name: "red-breasted_merganser" +} +item { + name: "n01855672" + label: 420 + display_name: "goose" +} +item { + name: "n01860187" + label: 421 + display_name: "black_swan" +} +item { + name: "n02002556" + label: 422 + display_name: "white_stork" +} +item { + name: "n02002724" + label: 423 + display_name: "black_stork" +} +item { + name: "n02006656" + label: 424 + display_name: "spoonbill" +} +item { + name: "n02007558" + label: 425 + display_name: "flamingo" +} +item { + name: "n02009912" + label: 426 + display_name: "American_egret" +} +item { + name: "n02009229" + label: 427 + display_name: 
"little_blue_heron" +} +item { + name: "n02011460" + label: 428 + display_name: "bittern" +} +item { + name: "n02012849" + label: 429 + display_name: "crane" +} +item { + name: "n02013706" + label: 430 + display_name: "limpkin" +} +item { + name: "n02018207" + label: 431 + display_name: "American_coot" +} +item { + name: "n02018795" + label: 432 + display_name: "bustard" +} +item { + name: "n02025239" + label: 433 + display_name: "ruddy_turnstone" +} +item { + name: "n02027492" + label: 434 + display_name: "red-backed_sandpiper" +} +item { + name: "n02028035" + label: 435 + display_name: "redshank" +} +item { + name: "n02033041" + label: 436 + display_name: "dowitcher" +} +item { + name: "n02037110" + label: 437 + display_name: "oystercatcher" +} +item { + name: "n02017213" + label: 438 + display_name: "European_gallinule" +} +item { + name: "n02051845" + label: 439 + display_name: "pelican" +} +item { + name: "n02056570" + label: 440 + display_name: "king_penguin" +} +item { + name: "n02058221" + label: 441 + display_name: "albatross" +} +item { + name: "n01484850" + label: 442 + display_name: "great_white_shark" +} +item { + name: "n01491361" + label: 443 + display_name: "tiger_shark" +} +item { + name: "n01494475" + label: 444 + display_name: "hammerhead" +} +item { + name: "n01496331" + label: 445 + display_name: "electric_ray" +} +item { + name: "n01498041" + label: 446 + display_name: "stingray" +} +item { + name: "n02514041" + label: 447 + display_name: "barracouta" +} +item { + name: "n02536864" + label: 448 + display_name: "coho" +} +item { + name: "n01440764" + label: 449 + display_name: "tench" +} +item { + name: "n01443537" + label: 450 + display_name: "goldfish" +} +item { + name: "n02526121" + label: 451 + display_name: "eel" +} +item { + name: "n02606052" + label: 452 + display_name: "rock_beauty" +} +item { + name: "n02607072" + label: 453 + display_name: "anemone_fish" +} +item { + name: "n02643566" + label: 454 + display_name: "lionfish" +} +item 
{ + name: "n02655020" + label: 455 + display_name: "puffer" +} +item { + name: "n02640242" + label: 456 + display_name: "sturgeon" +} +item { + name: "n02641379" + label: 457 + display_name: "gar" +} +item { + name: "n01664065" + label: 458 + display_name: "loggerhead" +} +item { + name: "n01665541" + label: 459 + display_name: "leatherback_turtle" +} +item { + name: "n01667114" + label: 460 + display_name: "mud_turtle" +} +item { + name: "n01667778" + label: 461 + display_name: "terrapin" +} +item { + name: "n01669191" + label: 462 + display_name: "box_turtle" +} +item { + name: "n01675722" + label: 463 + display_name: "banded_gecko" +} +item { + name: "n01677366" + label: 464 + display_name: "common_iguana" +} +item { + name: "n01682714" + label: 465 + display_name: "American_chameleon" +} +item { + name: "n01685808" + label: 466 + display_name: "whiptail" +} +item { + name: "n01687978" + label: 467 + display_name: "agama" +} +item { + name: "n01688243" + label: 468 + display_name: "frilled_lizard" +} +item { + name: "n01689811" + label: 469 + display_name: "alligator_lizard" +} +item { + name: "n01692333" + label: 470 + display_name: "Gila_monster" +} +item { + name: "n01693334" + label: 471 + display_name: "green_lizard" +} +item { + name: "n01694178" + label: 472 + display_name: "African_chameleon" +} +item { + name: "n01695060" + label: 473 + display_name: "Komodo_dragon" +} +item { + name: "n01704323" + label: 474 + display_name: "triceratops" +} +item { + name: "n01697457" + label: 475 + display_name: "African_crocodile" +} +item { + name: "n01698640" + label: 476 + display_name: "American_alligator" +} +item { + name: "n01728572" + label: 477 + display_name: "thunder_snake" +} +item { + name: "n01728920" + label: 478 + display_name: "ringneck_snake" +} +item { + name: "n01729322" + label: 479 + display_name: "hognose_snake" +} +item { + name: "n01729977" + label: 480 + display_name: "green_snake" +} +item { + name: "n01734418" + label: 481 + display_name: 
"king_snake" +} +item { + name: "n01735189" + label: 482 + display_name: "garter_snake" +} +item { + name: "n01737021" + label: 483 + display_name: "water_snake" +} +item { + name: "n01739381" + label: 484 + display_name: "vine_snake" +} +item { + name: "n01740131" + label: 485 + display_name: "night_snake" +} +item { + name: "n01742172" + label: 486 + display_name: "boa_constrictor" +} +item { + name: "n01744401" + label: 487 + display_name: "rock_python" +} +item { + name: "n01748264" + label: 488 + display_name: "Indian_cobra" +} +item { + name: "n01749939" + label: 489 + display_name: "green_mamba" +} +item { + name: "n01751748" + label: 490 + display_name: "sea_snake" +} +item { + name: "n01753488" + label: 491 + display_name: "horned_viper" +} +item { + name: "n01755581" + label: 492 + display_name: "diamondback" +} +item { + name: "n01756291" + label: 493 + display_name: "sidewinder" +} +item { + name: "n01629819" + label: 494 + display_name: "European_fire_salamander" +} +item { + name: "n01630670" + label: 495 + display_name: "common_newt" +} +item { + name: "n01631663" + label: 496 + display_name: "eft" +} +item { + name: "n01632458" + label: 497 + display_name: "spotted_salamander" +} +item { + name: "n01632777" + label: 498 + display_name: "axolotl" +} +item { + name: "n01641577" + label: 499 + display_name: "bullfrog" +} +item { + name: "n01644373" + label: 500 + display_name: "tree_frog" +} +item { + name: "n01644900" + label: 501 + display_name: "tailed_frog" +} +item { + name: "n04579432" + label: 502 + display_name: "whistle" +} +item { + name: "n04592741" + label: 503 + display_name: "wing" +} +item { + name: "n03876231" + label: 504 + display_name: "paintbrush" +} +item { + name: "n03483316" + label: 505 + display_name: "hand_blower" +} +item { + name: "n03868863" + label: 506 + display_name: "oxygen_mask" +} +item { + name: "n04251144" + label: 507 + display_name: "snorkel" +} +item { + name: "n03691459" + label: 508 + display_name: 
"loudspeaker" +} +item { + name: "n03759954" + label: 509 + display_name: "microphone" +} +item { + name: "n04152593" + label: 510 + display_name: "screen" +} +item { + name: "n03793489" + label: 511 + display_name: "mouse" +} +item { + name: "n03271574" + label: 512 + display_name: "electric_fan" +} +item { + name: "n03843555" + label: 513 + display_name: "oil_filter" +} +item { + name: "n04332243" + label: 514 + display_name: "strainer" +} +item { + name: "n04265275" + label: 515 + display_name: "space_heater" +} +item { + name: "n04330267" + label: 516 + display_name: "stove" +} +item { + name: "n03467068" + label: 517 + display_name: "guillotine" +} +item { + name: "n02794156" + label: 518 + display_name: "barometer" +} +item { + name: "n04118776" + label: 519 + display_name: "rule" +} +item { + name: "n03841143" + label: 520 + display_name: "odometer" +} +item { + name: "n04141975" + label: 521 + display_name: "scale" +} +item { + name: "n02708093" + label: 522 + display_name: "analog_clock" +} +item { + name: "n03196217" + label: 523 + display_name: "digital_clock" +} +item { + name: "n04548280" + label: 524 + display_name: "wall_clock" +} +item { + name: "n03544143" + label: 525 + display_name: "hourglass" +} +item { + name: "n04355338" + label: 526 + display_name: "sundial" +} +item { + name: "n03891332" + label: 527 + display_name: "parking_meter" +} +item { + name: "n04328186" + label: 528 + display_name: "stopwatch" +} +item { + name: "n03197337" + label: 529 + display_name: "digital_watch" +} +item { + name: "n04317175" + label: 530 + display_name: "stethoscope" +} +item { + name: "n04376876" + label: 531 + display_name: "syringe" +} +item { + name: "n03706229" + label: 532 + display_name: "magnetic_compass" +} +item { + name: "n02841315" + label: 533 + display_name: "binoculars" +} +item { + name: "n04009552" + label: 534 + display_name: "projector" +} +item { + name: "n04356056" + label: 535 + display_name: "sunglasses" +} +item { + name: "n03692522" 
+ label: 536 + display_name: "loupe" +} +item { + name: "n04044716" + label: 537 + display_name: "radio_telescope" +} +item { + name: "n02879718" + label: 538 + display_name: "bow" +} +item { + name: "n02950826" + label: 539 + display_name: "cannon" +} +item { + name: "n02749479" + label: 540 + display_name: "assault_rifle" +} +item { + name: "n04090263" + label: 541 + display_name: "rifle" +} +item { + name: "n04008634" + label: 542 + display_name: "projectile" +} +item { + name: "n03085013" + label: 543 + display_name: "computer_keyboard" +} +item { + name: "n04505470" + label: 544 + display_name: "typewriter_keyboard" +} +item { + name: "n03126707" + label: 545 + display_name: "crane" +} +item { + name: "n03666591" + label: 546 + display_name: "lighter" +} +item { + name: "n02666196" + label: 547 + display_name: "abacus" +} +item { + name: "n02977058" + label: 548 + display_name: "cash_machine" +} +item { + name: "n04238763" + label: 549 + display_name: "slide_rule" +} +item { + name: "n03180011" + label: 550 + display_name: "desktop_computer" +} +item { + name: "n03485407" + label: 551 + display_name: "hand-held_computer" +} +item { + name: "n03832673" + label: 552 + display_name: "notebook" +} +item { + name: "n06359193" + label: 553 + display_name: "web_site" +} +item { + name: "n03496892" + label: 554 + display_name: "harvester" +} +item { + name: "n04428191" + label: 555 + display_name: "thresher" +} +item { + name: "n04004767" + label: 556 + display_name: "printer" +} +item { + name: "n04243546" + label: 557 + display_name: "slot" +} +item { + name: "n04525305" + label: 558 + display_name: "vending_machine" +} +item { + name: "n04179913" + label: 559 + display_name: "sewing_machine" +} +item { + name: "n03602883" + label: 560 + display_name: "joystick" +} +item { + name: "n04372370" + label: 561 + display_name: "switch" +} +item { + name: "n03532672" + label: 562 + display_name: "hook" +} +item { + name: "n02974003" + label: 563 + display_name: "car_wheel" 
+} +item { + name: "n03874293" + label: 564 + display_name: "paddlewheel" +} +item { + name: "n03944341" + label: 565 + display_name: "pinwheel" +} +item { + name: "n03992509" + label: 566 + display_name: "potter\'s_wheel" +} +item { + name: "n03425413" + label: 567 + display_name: "gas_pump" +} +item { + name: "n02966193" + label: 568 + display_name: "carousel" +} +item { + name: "n04371774" + label: 569 + display_name: "swing" +} +item { + name: "n04067472" + label: 570 + display_name: "reel" +} +item { + name: "n04040759" + label: 571 + display_name: "radiator" +} +item { + name: "n04019541" + label: 572 + display_name: "puck" +} +item { + name: "n03492542" + label: 573 + display_name: "hard_disc" +} +item { + name: "n04355933" + label: 574 + display_name: "sunglass" +} +item { + name: "n03929660" + label: 575 + display_name: "pick" +} +item { + name: "n02965783" + label: 576 + display_name: "car_mirror" +} +item { + name: "n04258138" + label: 577 + display_name: "solar_dish" +} +item { + name: "n04074963" + label: 578 + display_name: "remote_control" +} +item { + name: "n03208938" + label: 579 + display_name: "disk_brake" +} +item { + name: "n02910353" + label: 580 + display_name: "buckle" +} +item { + name: "n03476684" + label: 581 + display_name: "hair_slide" +} +item { + name: "n03627232" + label: 582 + display_name: "knot" +} +item { + name: "n03075370" + label: 583 + display_name: "combination_lock" +} +item { + name: "n03874599" + label: 584 + display_name: "padlock" +} +item { + name: "n03804744" + label: 585 + display_name: "nail" +} +item { + name: "n04127249" + label: 586 + display_name: "safety_pin" +} +item { + name: "n04153751" + label: 587 + display_name: "screw" +} +item { + name: "n03803284" + label: 588 + display_name: "muzzle" +} +item { + name: "n04162706" + label: 589 + display_name: "seat_belt" +} +item { + name: "n04228054" + label: 590 + display_name: "ski" +} +item { + name: "n02948072" + label: 591 + display_name: "candle" +} +item { + 
name: "n03590841" + label: 592 + display_name: "jack-o\'-lantern" +} +item { + name: "n04286575" + label: 593 + display_name: "spotlight" +} +item { + name: "n04456115" + label: 594 + display_name: "torch" +} +item { + name: "n03814639" + label: 595 + display_name: "neck_brace" +} +item { + name: "n03933933" + label: 596 + display_name: "pier" +} +item { + name: "n04485082" + label: 597 + display_name: "tripod" +} +item { + name: "n03733131" + label: 598 + display_name: "maypole" +} +item { + name: "n03794056" + label: 599 + display_name: "mousetrap" +} +item { + name: "n04275548" + label: 600 + display_name: "spider_web" +} +item { + name: "n01768244" + label: 601 + display_name: "trilobite" +} +item { + name: "n01770081" + label: 602 + display_name: "harvestman" +} +item { + name: "n01770393" + label: 603 + display_name: "scorpion" +} +item { + name: "n01773157" + label: 604 + display_name: "black_and_gold_garden_spider" +} +item { + name: "n01773549" + label: 605 + display_name: "barn_spider" +} +item { + name: "n01773797" + label: 606 + display_name: "garden_spider" +} +item { + name: "n01774384" + label: 607 + display_name: "black_widow" +} +item { + name: "n01774750" + label: 608 + display_name: "tarantula" +} +item { + name: "n01775062" + label: 609 + display_name: "wolf_spider" +} +item { + name: "n01776313" + label: 610 + display_name: "tick" +} +item { + name: "n01784675" + label: 611 + display_name: "centipede" +} +item { + name: "n01990800" + label: 612 + display_name: "isopod" +} +item { + name: "n01978287" + label: 613 + display_name: "Dungeness_crab" +} +item { + name: "n01978455" + label: 614 + display_name: "rock_crab" +} +item { + name: "n01980166" + label: 615 + display_name: "fiddler_crab" +} +item { + name: "n01981276" + label: 616 + display_name: "king_crab" +} +item { + name: "n01983481" + label: 617 + display_name: "American_lobster" +} +item { + name: "n01984695" + label: 618 + display_name: "spiny_lobster" +} +item { + name: "n01985128" + 
label: 619 + display_name: "crayfish" +} +item { + name: "n01986214" + label: 620 + display_name: "hermit_crab" +} +item { + name: "n02165105" + label: 621 + display_name: "tiger_beetle" +} +item { + name: "n02165456" + label: 622 + display_name: "ladybug" +} +item { + name: "n02167151" + label: 623 + display_name: "ground_beetle" +} +item { + name: "n02168699" + label: 624 + display_name: "long-horned_beetle" +} +item { + name: "n02169497" + label: 625 + display_name: "leaf_beetle" +} +item { + name: "n02172182" + label: 626 + display_name: "dung_beetle" +} +item { + name: "n02174001" + label: 627 + display_name: "rhinoceros_beetle" +} +item { + name: "n02177972" + label: 628 + display_name: "weevil" +} +item { + name: "n02190166" + label: 629 + display_name: "fly" +} +item { + name: "n02206856" + label: 630 + display_name: "bee" +} +item { + name: "n02226429" + label: 631 + display_name: "grasshopper" +} +item { + name: "n02229544" + label: 632 + display_name: "cricket" +} +item { + name: "n02231487" + label: 633 + display_name: "walking_stick" +} +item { + name: "n02233338" + label: 634 + display_name: "cockroach" +} +item { + name: "n02236044" + label: 635 + display_name: "mantis" +} +item { + name: "n02256656" + label: 636 + display_name: "cicada" +} +item { + name: "n02259212" + label: 637 + display_name: "leafhopper" +} +item { + name: "n02264363" + label: 638 + display_name: "lacewing" +} +item { + name: "n02268443" + label: 639 + display_name: "dragonfly" +} +item { + name: "n02268853" + label: 640 + display_name: "damselfly" +} +item { + name: "n02276258" + label: 641 + display_name: "admiral" +} +item { + name: "n02277742" + label: 642 + display_name: "ringlet" +} +item { + name: "n02279972" + label: 643 + display_name: "monarch" +} +item { + name: "n02280649" + label: 644 + display_name: "cabbage_butterfly" +} +item { + name: "n02281406" + label: 645 + display_name: "sulphur_butterfly" +} +item { + name: "n02281787" + label: 646 + display_name: 
"lycaenid" +} +item { + name: "n01910747" + label: 647 + display_name: "jellyfish" +} +item { + name: "n01914609" + label: 648 + display_name: "sea_anemone" +} +item { + name: "n01917289" + label: 649 + display_name: "brain_coral" +} +item { + name: "n01924916" + label: 650 + display_name: "flatworm" +} +item { + name: "n01930112" + label: 651 + display_name: "nematode" +} +item { + name: "n01943899" + label: 652 + display_name: "conch" +} +item { + name: "n01944390" + label: 653 + display_name: "snail" +} +item { + name: "n01945685" + label: 654 + display_name: "slug" +} +item { + name: "n01950731" + label: 655 + display_name: "sea_slug" +} +item { + name: "n01955084" + label: 656 + display_name: "chiton" +} +item { + name: "n02319095" + label: 657 + display_name: "sea_urchin" +} +item { + name: "n02321529" + label: 658 + display_name: "sea_cucumber" +} +item { + name: "n03584829" + label: 659 + display_name: "iron" +} +item { + name: "n03297495" + label: 660 + display_name: "espresso_maker" +} +item { + name: "n03761084" + label: 661 + display_name: "microwave" +} +item { + name: "n03259280" + label: 662 + display_name: "Dutch_oven" +} +item { + name: "n04111531" + label: 663 + display_name: "rotisserie" +} +item { + name: "n04442312" + label: 664 + display_name: "toaster" +} +item { + name: "n04542943" + label: 665 + display_name: "waffle_iron" +} +item { + name: "n04517823" + label: 666 + display_name: "vacuum" +} +item { + name: "n03207941" + label: 667 + display_name: "dishwasher" +} +item { + name: "n04070727" + label: 668 + display_name: "refrigerator" +} +item { + name: "n04554684" + label: 669 + display_name: "washer" +} +item { + name: "n03133878" + label: 670 + display_name: "Crock_Pot" +} +item { + name: "n03400231" + label: 671 + display_name: "frying_pan" +} +item { + name: "n04596742" + label: 672 + display_name: "wok" +} +item { + name: "n02939185" + label: 673 + display_name: "caldron" +} +item { + name: "n03063689" + label: 674 + display_name: 
"coffeepot" +} +item { + name: "n04398044" + label: 675 + display_name: "teapot" +} +item { + name: "n04270147" + label: 676 + display_name: "spatula" +} +item { + name: "n02699494" + label: 677 + display_name: "altar" +} +item { + name: "n04486054" + label: 678 + display_name: "triumphal_arch" +} +item { + name: "n03899768" + label: 679 + display_name: "patio" +} +item { + name: "n04311004" + label: 680 + display_name: "steel_arch_bridge" +} +item { + name: "n04366367" + label: 681 + display_name: "suspension_bridge" +} +item { + name: "n04532670" + label: 682 + display_name: "viaduct" +} +item { + name: "n02793495" + label: 683 + display_name: "barn" +} +item { + name: "n03457902" + label: 684 + display_name: "greenhouse" +} +item { + name: "n03877845" + label: 685 + display_name: "palace" +} +item { + name: "n03781244" + label: 686 + display_name: "monastery" +} +item { + name: "n03661043" + label: 687 + display_name: "library" +} +item { + name: "n02727426" + label: 688 + display_name: "apiary" +} +item { + name: "n02859443" + label: 689 + display_name: "boathouse" +} +item { + name: "n03028079" + label: 690 + display_name: "church" +} +item { + name: "n03788195" + label: 691 + display_name: "mosque" +} +item { + name: "n04346328" + label: 692 + display_name: "stupa" +} +item { + name: "n03956157" + label: 693 + display_name: "planetarium" +} +item { + name: "n04081281" + label: 694 + display_name: "restaurant" +} +item { + name: "n03032252" + label: 695 + display_name: "cinema" +} +item { + name: "n03529860" + label: 696 + display_name: "home_theater" +} +item { + name: "n03697007" + label: 697 + display_name: "lumbermill" +} +item { + name: "n03065424" + label: 698 + display_name: "coil" +} +item { + name: "n03837869" + label: 699 + display_name: "obelisk" +} +item { + name: "n04458633" + label: 700 + display_name: "totem_pole" +} +item { + name: "n02980441" + label: 701 + display_name: "castle" +} +item { + name: "n04005630" + label: 702 + display_name: 
"prison" +} +item { + name: "n03461385" + label: 703 + display_name: "grocery_store" +} +item { + name: "n02776631" + label: 704 + display_name: "bakery" +} +item { + name: "n02791270" + label: 705 + display_name: "barbershop" +} +item { + name: "n02871525" + label: 706 + display_name: "bookshop" +} +item { + name: "n02927161" + label: 707 + display_name: "butcher_shop" +} +item { + name: "n03089624" + label: 708 + display_name: "confectionery" +} +item { + name: "n04200800" + label: 709 + display_name: "shoe_shop" +} +item { + name: "n04443257" + label: 710 + display_name: "tobacco_shop" +} +item { + name: "n04462240" + label: 711 + display_name: "toyshop" +} +item { + name: "n03388043" + label: 712 + display_name: "fountain" +} +item { + name: "n03042490" + label: 713 + display_name: "cliff_dwelling" +} +item { + name: "n04613696" + label: 714 + display_name: "yurt" +} +item { + name: "n03216828" + label: 715 + display_name: "dock" +} +item { + name: "n02892201" + label: 716 + display_name: "brass" +} +item { + name: "n03743016" + label: 717 + display_name: "megalith" +} +item { + name: "n02788148" + label: 718 + display_name: "bannister" +} +item { + name: "n02894605" + label: 719 + display_name: "breakwater" +} +item { + name: "n03160309" + label: 720 + display_name: "dam" +} +item { + name: "n03000134" + label: 721 + display_name: "chainlink_fence" +} +item { + name: "n03930313" + label: 722 + display_name: "picket_fence" +} +item { + name: "n04604644" + label: 723 + display_name: "worm_fence" +} +item { + name: "n04326547" + label: 724 + display_name: "stone_wall" +} +item { + name: "n03459775" + label: 725 + display_name: "grille" +} +item { + name: "n04239074" + label: 726 + display_name: "sliding_door" +} +item { + name: "n04501370" + label: 727 + display_name: "turnstile" +} +item { + name: "n03792972" + label: 728 + display_name: "mountain_tent" +} +item { + name: "n04149813" + label: 729 + display_name: "scoreboard" +} +item { + name: "n03530642" + 
label: 730 + display_name: "honeycomb" +} +item { + name: "n03961711" + label: 731 + display_name: "plate_rack" +} +item { + name: "n03903868" + label: 732 + display_name: "pedestal" +} +item { + name: "n02814860" + label: 733 + display_name: "beacon" +} +item { + name: "n07711569" + label: 734 + display_name: "mashed_potato" +} +item { + name: "n07720875" + label: 735 + display_name: "bell_pepper" +} +item { + name: "n07714571" + label: 736 + display_name: "head_cabbage" +} +item { + name: "n07714990" + label: 737 + display_name: "broccoli" +} +item { + name: "n07715103" + label: 738 + display_name: "cauliflower" +} +item { + name: "n07716358" + label: 739 + display_name: "zucchini" +} +item { + name: "n07716906" + label: 740 + display_name: "spaghetti_squash" +} +item { + name: "n07717410" + label: 741 + display_name: "acorn_squash" +} +item { + name: "n07717556" + label: 742 + display_name: "butternut_squash" +} +item { + name: "n07718472" + label: 743 + display_name: "cucumber" +} +item { + name: "n07718747" + label: 744 + display_name: "artichoke" +} +item { + name: "n07730033" + label: 745 + display_name: "cardoon" +} +item { + name: "n07734744" + label: 746 + display_name: "mushroom" +} +item { + name: "n04209239" + label: 747 + display_name: "shower_curtain" +} +item { + name: "n03594734" + label: 748 + display_name: "jean" +} +item { + name: "n02971356" + label: 749 + display_name: "carton" +} +item { + name: "n03485794" + label: 750 + display_name: "handkerchief" +} +item { + name: "n04133789" + label: 751 + display_name: "sandal" +} +item { + name: "n02747177" + label: 752 + display_name: "ashcan" +} +item { + name: "n04125021" + label: 753 + display_name: "safe" +} +item { + name: "n07579787" + label: 754 + display_name: "plate" +} +item { + name: "n03814906" + label: 755 + display_name: "necklace" +} +item { + name: "n03134739" + label: 756 + display_name: "croquet_ball" +} +item { + name: "n03404251" + label: 757 + display_name: "fur_coat" +} +item { 
+ name: "n04423845" + label: 758 + display_name: "thimble" +} +item { + name: "n03877472" + label: 759 + display_name: "pajama" +} +item { + name: "n04120489" + label: 760 + display_name: "running_shoe" +} +item { + name: "n03062245" + label: 761 + display_name: "cocktail_shaker" +} +item { + name: "n03014705" + label: 762 + display_name: "chest" +} +item { + name: "n03717622" + label: 763 + display_name: "manhole_cover" +} +item { + name: "n03777754" + label: 764 + display_name: "modem" +} +item { + name: "n04493381" + label: 765 + display_name: "tub" +} +item { + name: "n04476259" + label: 766 + display_name: "tray" +} +item { + name: "n02777292" + label: 767 + display_name: "balance_beam" +} +item { + name: "n07693725" + label: 768 + display_name: "bagel" +} +item { + name: "n03998194" + label: 769 + display_name: "prayer_rug" +} +item { + name: "n03617480" + label: 770 + display_name: "kimono" +} +item { + name: "n07590611" + label: 771 + display_name: "hot_pot" +} +item { + name: "n04579145" + label: 772 + display_name: "whiskey_jug" +} +item { + name: "n03623198" + label: 773 + display_name: "knee_pad" +} +item { + name: "n07248320" + label: 774 + display_name: "book_jacket" +} +item { + name: "n04277352" + label: 775 + display_name: "spindle" +} +item { + name: "n04229816" + label: 776 + display_name: "ski_mask" +} +item { + name: "n02823428" + label: 777 + display_name: "beer_bottle" +} +item { + name: "n03127747" + label: 778 + display_name: "crash_helmet" +} +item { + name: "n02877765" + label: 779 + display_name: "bottlecap" +} +item { + name: "n04435653" + label: 780 + display_name: "tile_roof" +} +item { + name: "n03724870" + label: 781 + display_name: "mask" +} +item { + name: "n03710637" + label: 782 + display_name: "maillot" +} +item { + name: "n03920288" + label: 783 + display_name: "Petri_dish" +} +item { + name: "n03379051" + label: 784 + display_name: "football_helmet" +} +item { + name: "n02807133" + label: 785 + display_name: "bathing_cap" +} 
+item { + name: "n04399382" + label: 786 + display_name: "teddy" +} +item { + name: "n03527444" + label: 787 + display_name: "holster" +} +item { + name: "n03983396" + label: 788 + display_name: "pop_bottle" +} +item { + name: "n03924679" + label: 789 + display_name: "photocopier" +} +item { + name: "n04532106" + label: 790 + display_name: "vestment" +} +item { + name: "n06785654" + label: 791 + display_name: "crossword_puzzle" +} +item { + name: "n03445777" + label: 792 + display_name: "golf_ball" +} +item { + name: "n07613480" + label: 793 + display_name: "trifle" +} +item { + name: "n04350905" + label: 794 + display_name: "suit" +} +item { + name: "n04562935" + label: 795 + display_name: "water_tower" +} +item { + name: "n03325584" + label: 796 + display_name: "feather_boa" +} +item { + name: "n03045698" + label: 797 + display_name: "cloak" +} +item { + name: "n07892512" + label: 798 + display_name: "red_wine" +} +item { + name: "n03250847" + label: 799 + display_name: "drumstick" +} +item { + name: "n04192698" + label: 800 + display_name: "shield" +} +item { + name: "n03026506" + label: 801 + display_name: "Christmas_stocking" +} +item { + name: "n03534580" + label: 802 + display_name: "hoopskirt" +} +item { + name: "n07565083" + label: 803 + display_name: "menu" +} +item { + name: "n04296562" + label: 804 + display_name: "stage" +} +item { + name: "n02869837" + label: 805 + display_name: "bonnet" +} +item { + name: "n07871810" + label: 806 + display_name: "meat_loaf" +} +item { + name: "n02799071" + label: 807 + display_name: "baseball" +} +item { + name: "n03314780" + label: 808 + display_name: "face_powder" +} +item { + name: "n04141327" + label: 809 + display_name: "scabbard" +} +item { + name: "n04357314" + label: 810 + display_name: "sunscreen" +} +item { + name: "n02823750" + label: 811 + display_name: "beer_glass" +} +item { + name: "n13052670" + label: 812 + display_name: "hen-of-the-woods" +} +item { + name: "n07583066" + label: 813 + display_name: 
"guacamole" +} +item { + name: "n03637318" + label: 814 + display_name: "lampshade" +} +item { + name: "n04599235" + label: 815 + display_name: "wool" +} +item { + name: "n07802026" + label: 816 + display_name: "hay" +} +item { + name: "n02883205" + label: 817 + display_name: "bow_tie" +} +item { + name: "n03709823" + label: 818 + display_name: "mailbag" +} +item { + name: "n04560804" + label: 819 + display_name: "water_jug" +} +item { + name: "n02909870" + label: 820 + display_name: "bucket" +} +item { + name: "n03207743" + label: 821 + display_name: "dishrag" +} +item { + name: "n04263257" + label: 822 + display_name: "soup_bowl" +} +item { + name: "n07932039" + label: 823 + display_name: "eggnog" +} +item { + name: "n03786901" + label: 824 + display_name: "mortar" +} +item { + name: "n04479046" + label: 825 + display_name: "trench_coat" +} +item { + name: "n03873416" + label: 826 + display_name: "paddle" +} +item { + name: "n02999410" + label: 827 + display_name: "chain" +} +item { + name: "n04367480" + label: 828 + display_name: "swab" +} +item { + name: "n03775546" + label: 829 + display_name: "mixing_bowl" +} +item { + name: "n07875152" + label: 830 + display_name: "potpie" +} +item { + name: "n04591713" + label: 831 + display_name: "wine_bottle" +} +item { + name: "n04201297" + label: 832 + display_name: "shoji" +} +item { + name: "n02916936" + label: 833 + display_name: "bulletproof_vest" +} +item { + name: "n03240683" + label: 834 + display_name: "drilling_platform" +} +item { + name: "n02840245" + label: 835 + display_name: "binder" +} +item { + name: "n02963159" + label: 836 + display_name: "cardigan" +} +item { + name: "n04370456" + label: 837 + display_name: "sweatshirt" +} +item { + name: "n03991062" + label: 838 + display_name: "pot" +} +item { + name: "n02843684" + label: 839 + display_name: "birdhouse" +} +item { + name: "n03482405" + label: 840 + display_name: "hamper" +} +item { + name: "n03942813" + label: 841 + display_name: "ping-pong_ball" +} 
+item { + name: "n03908618" + label: 842 + display_name: "pencil_box" +} +item { + name: "n03902125" + label: 843 + display_name: "pay-phone" +} +item { + name: "n07584110" + label: 844 + display_name: "consomme" +} +item { + name: "n02730930" + label: 845 + display_name: "apron" +} +item { + name: "n04023962" + label: 846 + display_name: "punching_bag" +} +item { + name: "n02769748" + label: 847 + display_name: "backpack" +} +item { + name: "n10148035" + label: 848 + display_name: "groom" +} +item { + name: "n02817516" + label: 849 + display_name: "bearskin" +} +item { + name: "n03908714" + label: 850 + display_name: "pencil_sharpener" +} +item { + name: "n02906734" + label: 851 + display_name: "broom" +} +item { + name: "n03788365" + label: 852 + display_name: "mosquito_net" +} +item { + name: "n02667093" + label: 853 + display_name: "abaya" +} +item { + name: "n03787032" + label: 854 + display_name: "mortarboard" +} +item { + name: "n03980874" + label: 855 + display_name: "poncho" +} +item { + name: "n03141823" + label: 856 + display_name: "crutch" +} +item { + name: "n03976467" + label: 857 + display_name: "Polaroid_camera" +} +item { + name: "n04264628" + label: 858 + display_name: "space_bar" +} +item { + name: "n07930864" + label: 859 + display_name: "cup" +} +item { + name: "n04039381" + label: 860 + display_name: "racket" +} +item { + name: "n06874185" + label: 861 + display_name: "traffic_light" +} +item { + name: "n04033901" + label: 862 + display_name: "quill" +} +item { + name: "n04041544" + label: 863 + display_name: "radio" +} +item { + name: "n07860988" + label: 864 + display_name: "dough" +} +item { + name: "n03146219" + label: 865 + display_name: "cuirass" +} +item { + name: "n03763968" + label: 866 + display_name: "military_uniform" +} +item { + name: "n03676483" + label: 867 + display_name: "lipstick" +} +item { + name: "n04209133" + label: 868 + display_name: "shower_cap" +} +item { + name: "n03782006" + label: 869 + display_name: "monitor" +} 
+item { + name: "n03857828" + label: 870 + display_name: "oscilloscope" +} +item { + name: "n03775071" + label: 871 + display_name: "mitten" +} +item { + name: "n02892767" + label: 872 + display_name: "brassiere" +} +item { + name: "n07684084" + label: 873 + display_name: "French_loaf" +} +item { + name: "n04522168" + label: 874 + display_name: "vase" +} +item { + name: "n03764736" + label: 875 + display_name: "milk_can" +} +item { + name: "n04118538" + label: 876 + display_name: "rugby_ball" +} +item { + name: "n03887697" + label: 877 + display_name: "paper_towel" +} +item { + name: "n13044778" + label: 878 + display_name: "earthstar" +} +item { + name: "n03291819" + label: 879 + display_name: "envelope" +} +item { + name: "n03770439" + label: 880 + display_name: "miniskirt" +} +item { + name: "n03124170" + label: 881 + display_name: "cowboy_hat" +} +item { + name: "n04487081" + label: 882 + display_name: "trolleybus" +} +item { + name: "n03916031" + label: 883 + display_name: "perfume" +} +item { + name: "n02808440" + label: 884 + display_name: "bathtub" +} +item { + name: "n07697537" + label: 885 + display_name: "hotdog" +} +item { + name: "n12985857" + label: 886 + display_name: "coral_fungus" +} +item { + name: "n02917067" + label: 887 + display_name: "bullet_train" +} +item { + name: "n03938244" + label: 888 + display_name: "pillow" +} +item { + name: "n15075141" + label: 889 + display_name: "toilet_tissue" +} +item { + name: "n02978881" + label: 890 + display_name: "cassette" +} +item { + name: "n02966687" + label: 891 + display_name: "carpenter\'s_kit" +} +item { + name: "n03633091" + label: 892 + display_name: "ladle" +} +item { + name: "n13040303" + label: 893 + display_name: "stinkhorn" +} +item { + name: "n03690938" + label: 894 + display_name: "lotion" +} +item { + name: "n03476991" + label: 895 + display_name: "hair_spray" +} +item { + name: "n02669723" + label: 896 + display_name: "academic_gown" +} +item { + name: "n03220513" + label: 897 + 
display_name: "dome" +} +item { + name: "n03127925" + label: 898 + display_name: "crate" +} +item { + name: "n04584207" + label: 899 + display_name: "wig" +} +item { + name: "n07880968" + label: 900 + display_name: "burrito" +} +item { + name: "n03937543" + label: 901 + display_name: "pill_bottle" +} +item { + name: "n03000247" + label: 902 + display_name: "chain_mail" +} +item { + name: "n04418357" + label: 903 + display_name: "theater_curtain" +} +item { + name: "n04590129" + label: 904 + display_name: "window_shade" +} +item { + name: "n02795169" + label: 905 + display_name: "barrel" +} +item { + name: "n04553703" + label: 906 + display_name: "washbasin" +} +item { + name: "n02783161" + label: 907 + display_name: "ballpoint" +} +item { + name: "n02802426" + label: 908 + display_name: "basketball" +} +item { + name: "n02808304" + label: 909 + display_name: "bath_towel" +} +item { + name: "n03124043" + label: 910 + display_name: "cowboy_boot" +} +item { + name: "n03450230" + label: 911 + display_name: "gown" +} +item { + name: "n04589890" + label: 912 + display_name: "window_screen" +} +item { + name: "n12998815" + label: 913 + display_name: "agaric" +} +item { + name: "n02992529" + label: 914 + display_name: "cellular_telephone" +} +item { + name: "n03825788" + label: 915 + display_name: "nipple" +} +item { + name: "n02790996" + label: 916 + display_name: "barbell" +} +item { + name: "n03710193" + label: 917 + display_name: "mailbox" +} +item { + name: "n03630383" + label: 918 + display_name: "lab_coat" +} +item { + name: "n03347037" + label: 919 + display_name: "fire_screen" +} +item { + name: "n03769881" + label: 920 + display_name: "minibus" +} +item { + name: "n03871628" + label: 921 + display_name: "packet" +} +item { + name: "n03733281" + label: 922 + display_name: "maze" +} +item { + name: "n03976657" + label: 923 + display_name: "pole" +} +item { + name: "n03535780" + label: 924 + display_name: "horizontal_bar" +} +item { + name: "n04259630" + label: 925 
+ display_name: "sombrero" +} +item { + name: "n03929855" + label: 926 + display_name: "pickelhaube" +} +item { + name: "n04049303" + label: 927 + display_name: "rain_barrel" +} +item { + name: "n04548362" + label: 928 + display_name: "wallet" +} +item { + name: "n02979186" + label: 929 + display_name: "cassette_player" +} +item { + name: "n06596364" + label: 930 + display_name: "comic_book" +} +item { + name: "n03935335" + label: 931 + display_name: "piggy_bank" +} +item { + name: "n06794110" + label: 932 + display_name: "street_sign" +} +item { + name: "n02825657" + label: 933 + display_name: "bell_cote" +} +item { + name: "n03388183" + label: 934 + display_name: "fountain_pen" +} +item { + name: "n04591157" + label: 935 + display_name: "Windsor_tie" +} +item { + name: "n04540053" + label: 936 + display_name: "volleyball" +} +item { + name: "n03866082" + label: 937 + display_name: "overskirt" +} +item { + name: "n04136333" + label: 938 + display_name: "sarong" +} +item { + name: "n04026417" + label: 939 + display_name: "purse" +} +item { + name: "n02865351" + label: 940 + display_name: "bolo_tie" +} +item { + name: "n02834397" + label: 941 + display_name: "bib" +} +item { + name: "n03888257" + label: 942 + display_name: "parachute" +} +item { + name: "n04235860" + label: 943 + display_name: "sleeping_bag" +} +item { + name: "n04404412" + label: 944 + display_name: "television" +} +item { + name: "n04371430" + label: 945 + display_name: "swimming_trunks" +} +item { + name: "n03733805" + label: 946 + display_name: "measuring_cup" +} +item { + name: "n07920052" + label: 947 + display_name: "espresso" +} +item { + name: "n07873807" + label: 948 + display_name: "pizza" +} +item { + name: "n02895154" + label: 949 + display_name: "breastplate" +} +item { + name: "n04204238" + label: 950 + display_name: "shopping_basket" +} +item { + name: "n04597913" + label: 951 + display_name: "wooden_spoon" +} +item { + name: "n04131690" + label: 952 + display_name: "saltshaker" +} 
+item { + name: "n07836838" + label: 953 + display_name: "chocolate_sauce" +} +item { + name: "n09835506" + label: 954 + display_name: "ballplayer" +} +item { + name: "n03443371" + label: 955 + display_name: "goblet" +} +item { + name: "n13037406" + label: 956 + display_name: "gyromitra" +} +item { + name: "n04336792" + label: 957 + display_name: "stretcher" +} +item { + name: "n04557648" + label: 958 + display_name: "water_bottle" +} +item { + name: "n03187595" + label: 959 + display_name: "dial_telephone" +} +item { + name: "n04254120" + label: 960 + display_name: "soap_dispenser" +} +item { + name: "n03595614" + label: 961 + display_name: "jersey" +} +item { + name: "n04146614" + label: 962 + display_name: "school_bus" +} +item { + name: "n03598930" + label: 963 + display_name: "jigsaw_puzzle" +} +item { + name: "n03958227" + label: 964 + display_name: "plastic_bag" +} +item { + name: "n04069434" + label: 965 + display_name: "reflex_camera" +} +item { + name: "n03188531" + label: 966 + display_name: "diaper" +} +item { + name: "n02786058" + label: 967 + display_name: "Band_Aid" +} +item { + name: "n07615774" + label: 968 + display_name: "ice_lolly" +} +item { + name: "n04525038" + label: 969 + display_name: "velvet" +} +item { + name: "n04409515" + label: 970 + display_name: "tennis_ball" +} +item { + name: "n03424325" + label: 971 + display_name: "gasmask" +} +item { + name: "n03223299" + label: 972 + display_name: "doormat" +} +item { + name: "n03680355" + label: 973 + display_name: "Loafer" +} +item { + name: "n07614500" + label: 974 + display_name: "ice_cream" +} +item { + name: "n07695742" + label: 975 + display_name: "pretzel" +} +item { + name: "n04033995" + label: 976 + display_name: "quilt" +} +item { + name: "n03710721" + label: 977 + display_name: "maillot" +} +item { + name: "n04392985" + label: 978 + display_name: "tape_player" +} +item { + name: "n03047690" + label: 979 + display_name: "clog" +} +item { + name: "n03584254" + label: 980 + 
display_name: "iPod" +} +item { + name: "n13054560" + label: 981 + display_name: "bolete" +} +item { + name: "n10565667" + label: 982 + display_name: "scuba_diver" +} +item { + name: "n03950228" + label: 983 + display_name: "pitcher" +} +item { + name: "n03729826" + label: 984 + display_name: "matchstick" +} +item { + name: "n02837789" + label: 985 + display_name: "bikini" +} +item { + name: "n04254777" + label: 986 + display_name: "sock" +} +item { + name: "n02988304" + label: 987 + display_name: "CD_player" +} +item { + name: "n03657121" + label: 988 + display_name: "lens_cap" +} +item { + name: "n04417672" + label: 989 + display_name: "thatch" +} +item { + name: "n04523525" + label: 990 + display_name: "vault" +} +item { + name: "n02815834" + label: 991 + display_name: "beaker" +} +item { + name: "n09229709" + label: 992 + display_name: "bubble" +} +item { + name: "n07697313" + label: 993 + display_name: "cheeseburger" +} +item { + name: "n03888605" + label: 994 + display_name: "parallel_bars" +} +item { + name: "n03355925" + label: 995 + display_name: "flagpole" +} +item { + name: "n03063599" + label: 996 + display_name: "coffee_mug" +} +item { + name: "n04116512" + label: 997 + display_name: "rubber_eraser" +} +item { + name: "n04325704" + label: 998 + display_name: "stole" +} +item { + name: "n07831146" + label: 999 + display_name: "carbonara" +} +item { + name: "n03255030" + label: 1000 + display_name: "dumbbell" +} diff --git a/data/ILSVRC2016/labelmap_ilsvrc_det.prototxt b/data/ILSVRC2016/labelmap_ilsvrc_det.prototxt new file mode 100644 index 00000000000..54c493e340f --- /dev/null +++ b/data/ILSVRC2016/labelmap_ilsvrc_det.prototxt @@ -0,0 +1,1005 @@ +item { + name: "none_of_the_above" + label: 0 + display_name: "background" +} +item { + name: "n02672831" + label: 1 + display_name: "accordion" +} +item { + name: "n02691156" + label: 2 + display_name: "airplane" +} +item { + name: "n02219486" + label: 3 + display_name: "ant" +} +item { + name: "n02419796" + 
label: 4 + display_name: "antelope" +} +item { + name: "n07739125" + label: 5 + display_name: "apple" +} +item { + name: "n02454379" + label: 6 + display_name: "armadillo" +} +item { + name: "n07718747" + label: 7 + display_name: "artichoke" +} +item { + name: "n02764044" + label: 8 + display_name: "axe" +} +item { + name: "n02766320" + label: 9 + display_name: "baby_bed" +} +item { + name: "n02769748" + label: 10 + display_name: "backpack" +} +item { + name: "n07693725" + label: 11 + display_name: "bagel" +} +item { + name: "n02777292" + label: 12 + display_name: "balance_beam" +} +item { + name: "n07753592" + label: 13 + display_name: "banana" +} +item { + name: "n02786058" + label: 14 + display_name: "band_aid" +} +item { + name: "n02787622" + label: 15 + display_name: "banjo" +} +item { + name: "n02799071" + label: 16 + display_name: "baseball" +} +item { + name: "n02802426" + label: 17 + display_name: "basketball" +} +item { + name: "n02807133" + label: 18 + display_name: "bathing_cap" +} +item { + name: "n02815834" + label: 19 + display_name: "beaker" +} +item { + name: "n02131653" + label: 20 + display_name: "bear" +} +item { + name: "n02206856" + label: 21 + display_name: "bee" +} +item { + name: "n07720875" + label: 22 + display_name: "bell_pepper" +} +item { + name: "n02828884" + label: 23 + display_name: "bench" +} +item { + name: "n02834778" + label: 24 + display_name: "bicycle" +} +item { + name: "n02840245" + label: 25 + display_name: "binder" +} +item { + name: "n01503061" + label: 26 + display_name: "bird" +} +item { + name: "n02870880" + label: 27 + display_name: "bookshelf" +} +item { + name: "n02883205" + label: 28 + display_name: "bow_tie" +} +item { + name: "n02879718" + label: 29 + display_name: "bow" +} +item { + name: "n02880940" + label: 30 + display_name: "bowl" +} +item { + name: "n02892767" + label: 31 + display_name: "brassiere" +} +item { + name: "n07880968" + label: 32 + display_name: "burrito" +} +item { + name: "n02924116" + label: 
33 + display_name: "bus" +} +item { + name: "n02274259" + label: 34 + display_name: "butterfly" +} +item { + name: "n02437136" + label: 35 + display_name: "camel" +} +item { + name: "n02951585" + label: 36 + display_name: "can_opener" +} +item { + name: "n02958343" + label: 37 + display_name: "car" +} +item { + name: "n02970849" + label: 38 + display_name: "cart" +} +item { + name: "n02402425" + label: 39 + display_name: "cattle" +} +item { + name: "n02992211" + label: 40 + display_name: "cello" +} +item { + name: "n01784675" + label: 41 + display_name: "centipede" +} +item { + name: "n03000684" + label: 42 + display_name: "chain_saw" +} +item { + name: "n03001627" + label: 43 + display_name: "chair" +} +item { + name: "n03017168" + label: 44 + display_name: "chime" +} +item { + name: "n03062245" + label: 45 + display_name: "cocktail_shaker" +} +item { + name: "n03063338" + label: 46 + display_name: "coffee_maker" +} +item { + name: "n03085013" + label: 47 + display_name: "computer_keyboard" +} +item { + name: "n03793489" + label: 48 + display_name: "computer_mouse" +} +item { + name: "n03109150" + label: 49 + display_name: "corkscrew" +} +item { + name: "n03128519" + label: 50 + display_name: "cream" +} +item { + name: "n03134739" + label: 51 + display_name: "croquet_ball" +} +item { + name: "n03141823" + label: 52 + display_name: "crutch" +} +item { + name: "n07718472" + label: 53 + display_name: "cucumber" +} +item { + name: "n03797390" + label: 54 + display_name: "cup_or_mug" +} +item { + name: "n03188531" + label: 55 + display_name: "diaper" +} +item { + name: "n03196217" + label: 56 + display_name: "digital_clock" +} +item { + name: "n03207941" + label: 57 + display_name: "dishwasher" +} +item { + name: "n02084071" + label: 58 + display_name: "dog" +} +item { + name: "n02121808" + label: 59 + display_name: "domestic_cat" +} +item { + name: "n02268443" + label: 60 + display_name: "dragonfly" +} +item { + name: "n03249569" + label: 61 + display_name: "drum" +} 
+item { + name: "n03255030" + label: 62 + display_name: "dumbbell" +} +item { + name: "n03271574" + label: 63 + display_name: "electric_fan" +} +item { + name: "n02503517" + label: 64 + display_name: "elephant" +} +item { + name: "n03314780" + label: 65 + display_name: "face_powder" +} +item { + name: "n07753113" + label: 66 + display_name: "fig" +} +item { + name: "n03337140" + label: 67 + display_name: "filing_cabinet" +} +item { + name: "n03991062" + label: 68 + display_name: "flower_pot" +} +item { + name: "n03372029" + label: 69 + display_name: "flute" +} +item { + name: "n02118333" + label: 70 + display_name: "fox" +} +item { + name: "n03394916" + label: 71 + display_name: "french_horn" +} +item { + name: "n01639765" + label: 72 + display_name: "frog" +} +item { + name: "n03400231" + label: 73 + display_name: "frying_pan" +} +item { + name: "n02510455" + label: 74 + display_name: "giant_panda" +} +item { + name: "n01443537" + label: 75 + display_name: "goldfish" +} +item { + name: "n03445777" + label: 76 + display_name: "golf_ball" +} +item { + name: "n03445924" + label: 77 + display_name: "golfcart" +} +item { + name: "n07583066" + label: 78 + display_name: "guacamole" +} +item { + name: "n03467517" + label: 79 + display_name: "guitar" +} +item { + name: "n03483316" + label: 80 + display_name: "hair_dryer" +} +item { + name: "n03476991" + label: 81 + display_name: "hair_spray" +} +item { + name: "n07697100" + label: 82 + display_name: "hamburger" +} +item { + name: "n03481172" + label: 83 + display_name: "hammer" +} +item { + name: "n02342885" + label: 84 + display_name: "hamster" +} +item { + name: "n03494278" + label: 85 + display_name: "harmonica" +} +item { + name: "n03495258" + label: 86 + display_name: "harp" +} +item { + name: "n03124170" + label: 87 + display_name: "hat_with_a_wide_brim" +} +item { + name: "n07714571" + label: 88 + display_name: "head_cabbage" +} +item { + name: "n03513137" + label: 89 + display_name: "helmet" +} +item { + name: 
"n02398521" + label: 90 + display_name: "hippopotamus" +} +item { + name: "n03535780" + label: 91 + display_name: "horizontal_bar" +} +item { + name: "n02374451" + label: 92 + display_name: "horse" +} +item { + name: "n07697537" + label: 93 + display_name: "hotdog" +} +item { + name: "n03584254" + label: 94 + display_name: "iPod" +} +item { + name: "n01990800" + label: 95 + display_name: "isopod" +} +item { + name: "n01910747" + label: 96 + display_name: "jellyfish" +} +item { + name: "n01882714" + label: 97 + display_name: "koala_bear" +} +item { + name: "n03633091" + label: 98 + display_name: "ladle" +} +item { + name: "n02165456" + label: 99 + display_name: "ladybug" +} +item { + name: "n03636649" + label: 100 + display_name: "lamp" +} +item { + name: "n03642806" + label: 101 + display_name: "laptop" +} +item { + name: "n07749582" + label: 102 + display_name: "lemon" +} +item { + name: "n02129165" + label: 103 + display_name: "lion" +} +item { + name: "n03676483" + label: 104 + display_name: "lipstick" +} +item { + name: "n01674464" + label: 105 + display_name: "lizard" +} +item { + name: "n01982650" + label: 106 + display_name: "lobster" +} +item { + name: "n03710721" + label: 107 + display_name: "maillot" +} +item { + name: "n03720891" + label: 108 + display_name: "maraca" +} +item { + name: "n03759954" + label: 109 + display_name: "microphone" +} +item { + name: "n03761084" + label: 110 + display_name: "microwave" +} +item { + name: "n03764736" + label: 111 + display_name: "milk_can" +} +item { + name: "n03770439" + label: 112 + display_name: "miniskirt" +} +item { + name: "n02484322" + label: 113 + display_name: "monkey" +} +item { + name: "n03790512" + label: 114 + display_name: "motorcycle" +} +item { + name: "n07734744" + label: 115 + display_name: "mushroom" +} +item { + name: "n03804744" + label: 116 + display_name: "nail" +} +item { + name: "n03814639" + label: 117 + display_name: "neck_brace" +} +item { + name: "n03838899" + label: 118 + display_name: 
"oboe" +} +item { + name: "n07747607" + label: 119 + display_name: "orange" +} +item { + name: "n02444819" + label: 120 + display_name: "otter" +} +item { + name: "n03908618" + label: 121 + display_name: "pencil_box" +} +item { + name: "n03908714" + label: 122 + display_name: "pencil_sharpener" +} +item { + name: "n03916031" + label: 123 + display_name: "perfume" +} +item { + name: "n00007846" + label: 124 + display_name: "person" +} +item { + name: "n03928116" + label: 125 + display_name: "piano" +} +item { + name: "n07753275" + label: 126 + display_name: "pineapple" +} +item { + name: "n03942813" + label: 127 + display_name: "ping-pong_ball" +} +item { + name: "n03950228" + label: 128 + display_name: "pitcher" +} +item { + name: "n07873807" + label: 129 + display_name: "pizza" +} +item { + name: "n03958227" + label: 130 + display_name: "plastic_bag" +} +item { + name: "n03961711" + label: 131 + display_name: "plate_rack" +} +item { + name: "n07768694" + label: 132 + display_name: "pomegranate" +} +item { + name: "n07615774" + label: 133 + display_name: "popsicle" +} +item { + name: "n02346627" + label: 134 + display_name: "porcupine" +} +item { + name: "n03995372" + label: 135 + display_name: "power_drill" +} +item { + name: "n07695742" + label: 136 + display_name: "pretzel" +} +item { + name: "n04004767" + label: 137 + display_name: "printer" +} +item { + name: "n04019541" + label: 138 + display_name: "puck" +} +item { + name: "n04023962" + label: 139 + display_name: "punching_bag" +} +item { + name: "n04026417" + label: 140 + display_name: "purse" +} +item { + name: "n02324045" + label: 141 + display_name: "rabbit" +} +item { + name: "n04039381" + label: 142 + display_name: "racket" +} +item { + name: "n01495701" + label: 143 + display_name: "ray" +} +item { + name: "n02509815" + label: 144 + display_name: "red_panda" +} +item { + name: "n04070727" + label: 145 + display_name: "refrigerator" +} +item { + name: "n04074963" + label: 146 + display_name: 
"remote_control" +} +item { + name: "n04116512" + label: 147 + display_name: "rubber_eraser" +} +item { + name: "n04118538" + label: 148 + display_name: "rugby_ball" +} +item { + name: "n04118776" + label: 149 + display_name: "ruler" +} +item { + name: "n04131690" + label: 150 + display_name: "salt_or_pepper_shaker" +} +item { + name: "n04141076" + label: 151 + display_name: "saxophone" +} +item { + name: "n01770393" + label: 152 + display_name: "scorpion" +} +item { + name: "n04154565" + label: 153 + display_name: "screwdriver" +} +item { + name: "n02076196" + label: 154 + display_name: "seal" +} +item { + name: "n02411705" + label: 155 + display_name: "sheep" +} +item { + name: "n04228054" + label: 156 + display_name: "ski" +} +item { + name: "n02445715" + label: 157 + display_name: "skunk" +} +item { + name: "n01944390" + label: 158 + display_name: "snail" +} +item { + name: "n01726692" + label: 159 + display_name: "snake" +} +item { + name: "n04252077" + label: 160 + display_name: "snowmobile" +} +item { + name: "n04252225" + label: 161 + display_name: "snowplow" +} +item { + name: "n04254120" + label: 162 + display_name: "soap_dispenser" +} +item { + name: "n04254680" + label: 163 + display_name: "soccer_ball" +} +item { + name: "n04256520" + label: 164 + display_name: "sofa" +} +item { + name: "n04270147" + label: 165 + display_name: "spatula" +} +item { + name: "n02355227" + label: 166 + display_name: "squirrel" +} +item { + name: "n02317335" + label: 167 + display_name: "starfish" +} +item { + name: "n04317175" + label: 168 + display_name: "stethoscope" +} +item { + name: "n04330267" + label: 169 + display_name: "stove" +} +item { + name: "n04332243" + label: 170 + display_name: "strainer" +} +item { + name: "n07745940" + label: 171 + display_name: "strawberry" +} +item { + name: "n04336792" + label: 172 + display_name: "stretcher" +} +item { + name: "n04356056" + label: 173 + display_name: "sunglasses" +} +item { + name: "n04371430" + label: 174 + 
display_name: "swimming_trunks" +} +item { + name: "n02395003" + label: 175 + display_name: "swine" +} +item { + name: "n04376876" + label: 176 + display_name: "syringe" +} +item { + name: "n04379243" + label: 177 + display_name: "table" +} +item { + name: "n04392985" + label: 178 + display_name: "tape_player" +} +item { + name: "n04409515" + label: 179 + display_name: "tennis_ball" +} +item { + name: "n01776313" + label: 180 + display_name: "tick" +} +item { + name: "n04591157" + label: 181 + display_name: "tie" +} +item { + name: "n02129604" + label: 182 + display_name: "tiger" +} +item { + name: "n04442312" + label: 183 + display_name: "toaster" +} +item { + name: "n06874185" + label: 184 + display_name: "traffic_light" +} +item { + name: "n04468005" + label: 185 + display_name: "train" +} +item { + name: "n04487394" + label: 186 + display_name: "trombone" +} +item { + name: "n03110669" + label: 187 + display_name: "trumpet" +} +item { + name: "n01662784" + label: 188 + display_name: "turtle" +} +item { + name: "n03211117" + label: 189 + display_name: "tv_or_monitor" +} +item { + name: "n04509417" + label: 190 + display_name: "unicycle" +} +item { + name: "n04517823" + label: 191 + display_name: "vacuum" +} +item { + name: "n04536866" + label: 192 + display_name: "violin" +} +item { + name: "n04540053" + label: 193 + display_name: "volleyball" +} +item { + name: "n04542943" + label: 194 + display_name: "waffle_iron" +} +item { + name: "n04554684" + label: 195 + display_name: "washer" +} +item { + name: "n04557648" + label: 196 + display_name: "water_bottle" +} +item { + name: "n04530566" + label: 197 + display_name: "watercraft" +} +item { + name: "n02062744" + label: 198 + display_name: "whale" +} +item { + name: "n04591713" + label: 199 + display_name: "wine_bottle" +} +item { + name: "n02391049" + label: 200 + display_name: "zebra" +} diff --git a/data/VOC0712/coco_voc_map.txt b/data/VOC0712/coco_voc_map.txt new file mode 100644 index 00000000000..7ff84d19b61 
--- /dev/null +++ b/data/VOC0712/coco_voc_map.txt @@ -0,0 +1,21 @@ +0,0,background +5,1,aeroplane +2,2,bicycle +15,3,bird +9,4,boat +40,5,bottle +6,6,bus +3,7,car +16,8,cat +57,9,chair +20,10,cow +61,11,diningtable +17,12,dog +18,13,horse +4,14,motorbike +1,15,person +59,16,pottedplant +19,17,sheep +58,18,sofa +7,19,train +63,20,tvmonitor diff --git a/data/VOC0712/create_data.sh b/data/VOC0712/create_data.sh new file mode 100644 index 00000000000..ffcc97d747b --- /dev/null +++ b/data/VOC0712/create_data.sh @@ -0,0 +1,25 @@ +cur_dir=$(cd $( dirname ${BASH_SOURCE[0]} ) && pwd ) +root_dir=$cur_dir/../.. + +cd $root_dir + +redo=1 +data_root_dir="$HOME/data/VOCdevkit" +dataset_name="VOC0712" +mapfile="$root_dir/data/$dataset_name/labelmap_voc.prototxt" +anno_type="detection" +db="lmdb" +min_dim=0 +max_dim=0 +width=0 +height=0 + +extra_cmd="--encode-type=jpg --encoded" +if [ $redo ] +then + extra_cmd="$extra_cmd --redo" +fi +for subset in test trainval +do + python $root_dir/scripts/create_annoset.py --anno-type=$anno_type --label-map-file=$mapfile --min-dim=$min_dim --max-dim=$max_dim --resize-width=$width --resize-height=$height --check-label $extra_cmd $data_root_dir $root_dir/data/$dataset_name/$subset.txt $data_root_dir/$dataset_name/$db/$dataset_name"_"$subset"_"$db examples/$dataset_name +done diff --git a/data/VOC0712/create_list.sh b/data/VOC0712/create_list.sh new file mode 100644 index 00000000000..67eb1007b2b --- /dev/null +++ b/data/VOC0712/create_list.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +root_dir=$HOME/data/VOCdevkit/ +sub_dir=ImageSets/Main +bash_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +for dataset in trainval test +do + dst_file=$bash_dir/$dataset.txt + if [ -f $dst_file ] + then + rm -f $dst_file + fi + for name in VOC2007 VOC2012 + do + if [[ $dataset == "test" && $name == "VOC2012" ]] + then + continue + fi + echo "Create list for $name $dataset..." 
+ dataset_file=$root_dir/$name/$sub_dir/$dataset.txt + + img_file=$bash_dir/$dataset"_img.txt" + cp $dataset_file $img_file + sed -i "s/^/$name\/JPEGImages\//g" $img_file + sed -i "s/$/.jpg/g" $img_file + + label_file=$bash_dir/$dataset"_label.txt" + cp $dataset_file $label_file + sed -i "s/^/$name\/Annotations\//g" $label_file + sed -i "s/$/.xml/g" $label_file + + paste -d' ' $img_file $label_file >> $dst_file + + rm -f $label_file + rm -f $img_file + done + + # Generate image name and size infomation. + if [ $dataset == "test" ] + then + $bash_dir/../../build/tools/get_image_size $root_dir $dst_file $bash_dir/$dataset"_name_size.txt" + fi + + # Shuffle trainval file. + if [ $dataset == "trainval" ] + then + rand_file=$dst_file.random + cat $dst_file | perl -MList::Util=shuffle -e 'print shuffle();' > $rand_file + mv $rand_file $dst_file + fi +done diff --git a/data/VOC0712/labelmap_voc.prototxt b/data/VOC0712/labelmap_voc.prototxt new file mode 100644 index 00000000000..b5c177b7245 --- /dev/null +++ b/data/VOC0712/labelmap_voc.prototxt @@ -0,0 +1,105 @@ +item { + name: "none_of_the_above" + label: 0 + display_name: "background" +} +item { + name: "aeroplane" + label: 1 + display_name: "aeroplane" +} +item { + name: "bicycle" + label: 2 + display_name: "bicycle" +} +item { + name: "bird" + label: 3 + display_name: "bird" +} +item { + name: "boat" + label: 4 + display_name: "boat" +} +item { + name: "bottle" + label: 5 + display_name: "bottle" +} +item { + name: "bus" + label: 6 + display_name: "bus" +} +item { + name: "car" + label: 7 + display_name: "car" +} +item { + name: "cat" + label: 8 + display_name: "cat" +} +item { + name: "chair" + label: 9 + display_name: "chair" +} +item { + name: "cow" + label: 10 + display_name: "cow" +} +item { + name: "diningtable" + label: 11 + display_name: "diningtable" +} +item { + name: "dog" + label: 12 + display_name: "dog" +} +item { + name: "horse" + label: 13 + display_name: "horse" +} +item { + name: "motorbike" + label: 
14 + display_name: "motorbike" +} +item { + name: "person" + label: 15 + display_name: "person" +} +item { + name: "pottedplant" + label: 16 + display_name: "pottedplant" +} +item { + name: "sheep" + label: 17 + display_name: "sheep" +} +item { + name: "sofa" + label: 18 + display_name: "sofa" +} +item { + name: "train" + label: 19 + display_name: "train" +} +item { + name: "tvmonitor" + label: 20 + display_name: "tvmonitor" +} diff --git a/data/coco/README.md b/data/coco/README.md new file mode 100644 index 00000000000..7589a7cbd3c --- /dev/null +++ b/data/coco/README.md @@ -0,0 +1,38 @@ +### Preparation +1. Download Images and Annotations from [MSCOCO](http://mscoco.org/dataset/#download). By default, we assume the data is stored in `$HOME/data/coco` + +2. Get the coco code. We will call the directory that you cloned coco into `$COCO_ROOT` + ```Shell + git clone https://github.com/weiliu89/coco.git + cd coco + git checkout dev + ``` + +3. Build the coco code. + ```Shell + cd PythonAPI + python setup.py build_ext --inplace + ``` + +4. Split the annotation to many files per image and get the image size info. + ```Shell + # Check scripts/batch_split_annotation.py and change settings accordingly. + python scripts/batch_split_annotation.py + # Create the minival2014_name_size.txt and test-dev2015_name_size.txt in $CAFFE_ROOT/data/coco + python scripts/batch_get_image_size.py + ``` + +5. Create the LMDB file. + ```Shell + cd $CAFFE_ROOT + # Create the minival.txt, testdev.txt, test.txt, train.txt in data/coco/ + python data/coco/create_list.py + # You can modify the parameters in create_data.sh if needed. 
+ # It will create lmdb files for minival, testdev, test, and train with encoded original image: + # - $HOME/data/coco/lmdb/coco_minival_lmdb + # - $HOME/data/coco/lmdb/coco_testdev_lmdb + # - $HOME/data/coco/lmdb/coco_test_lmdb + # - $HOME/data/coco/lmdb/coco_train_lmdb + # and make soft links at examples/coco/ + ./data/coco/create_data.sh + ``` diff --git a/data/coco/create_data.sh b/data/coco/create_data.sh new file mode 100644 index 00000000000..32f45e6d7d9 --- /dev/null +++ b/data/coco/create_data.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +cur_dir=$(cd $( dirname ${BASH_SOURCE[0]} ) && pwd ) +root_dir=$cur_dir/../.. + +cd $root_dir + +redo=true +data_root_dir="$HOME/data/coco" +dataset_name="coco" +mapfile="$root_dir/data/$dataset_name/labelmap_coco.prototxt" +anno_type="detection" +label_type="json" #"xml" +db="lmdb" +min_dim=0 +max_dim=0 +width=0 +height=0 + +extra_cmd="--encode-type=jpg --encoded" +if $redo +then + extra_cmd="$extra_cmd --redo" +fi +for subset in minival testdev train test +do + python $root_dir/scripts/create_annoset.py --anno-type=$anno_type --label-type=$label_type --label-map-file=$mapfile --min-dim=$min_dim --max-dim=$max_dim --resize-width=$width --resize-height=$height --check-label $extra_cmd $data_root_dir $root_dir/data/$dataset_name/$subset.txt $data_root_dir/$db/$dataset_name"_"$subset"_"$db examples/$dataset_name 2>&1 | tee $root_dir/data/$dataset_name/$subset.log +done diff --git a/data/coco/create_list.py b/data/coco/create_list.py new file mode 100644 index 00000000000..cdd86f3b2d3 --- /dev/null +++ b/data/coco/create_list.py @@ -0,0 +1,125 @@ +import argparse +import os +from random import shuffle +import shutil +import subprocess +import sys + +HOMEDIR = os.path.expanduser("~") +CURDIR = os.path.dirname(os.path.realpath(__file__)) + +# If true, re-create all list files. +redo = True +# The root directory which holds all information of the dataset. 
+data_dir = "{}/data/coco".format(HOMEDIR) +# The directory name which holds the image sets. +imgset_dir = "ImageSets" +# The direcotry which contains the images. +img_dir = "images" +img_ext = "jpg" +# The directory which contains the annotations. +anno_dir = "Annotations" +anno_ext = "json" + +train_list_file = "{}/train.txt".format(CURDIR) +minival_list_file = "{}/minival.txt".format(CURDIR) +testdev_list_file = "{}/testdev.txt".format(CURDIR) +test_list_file = "{}/test.txt".format(CURDIR) + +# Create training set. +# We follow Ross Girschick's split. +if redo or not os.path.exists(train_list_file): + datasets = ["train2014"]#, "valminusminival2014"] + img_files = [] + anno_files = [] + for dataset in datasets: + imgset_file = "{}/{}/{}.txt".format(data_dir, imgset_dir, dataset) + with open(imgset_file, "r") as f: + for line in f.readlines(): + name = line.strip("\n") + subset = name.split("_")[1] + img_file = "{}/{}/{}.{}".format(img_dir, subset, name, img_ext) + assert os.path.exists("{}/{}".format(data_dir, img_file)), \ + "{}/{} does not exist".format(data_dir, img_file) + anno_file = "{}/{}/{}.{}".format(anno_dir, subset, name, anno_ext) + assert os.path.exists("{}/{}".format(data_dir, anno_file)), \ + "{}/{} does not exist".format(data_dir, anno_file) + img_files.append(img_file) + anno_files.append(anno_file) + # Shuffle the images. 
+ idx = [i for i in xrange(len(img_files))] + shuffle(idx) + with open(train_list_file, "w") as f: + for i in idx: + f.write("{} {}\n".format(img_files[i], anno_files[i])) + +if redo or not os.path.exists(minival_list_file): + datasets = ["val2014"] + subset = "val2014" + img_files = [] + anno_files = [] + for dataset in datasets: + imgset_file = "{}/{}/{}.txt".format(data_dir, imgset_dir, dataset) + with open(imgset_file, "r") as f: + for line in f.readlines(): + name = line.strip("\n") + img_file = "{}/{}/{}.{}".format(img_dir, subset, name, img_ext) + assert os.path.exists("{}/{}".format(data_dir, img_file)), \ + "{}/{} does not exist".format(data_dir, img_file) + anno_file = "{}/{}/{}.{}".format(anno_dir, subset, name, anno_ext) + assert os.path.exists("{}/{}".format(data_dir, anno_file)), \ + "{}/{} does not exist".format(data_dir, anno_file) + img_files.append(img_file) + anno_files.append(anno_file) + with open(minival_list_file, "w") as f: + for i in xrange(len(img_files)): + f.write("{} {}\n".format(img_files[i], anno_files[i])) + +if redo or not os.path.exists(testdev_list_file): +# datasets = ["test-dev2015"] +# subset = "test2015" + datasets = ["test2014"] + subset = "test2014" + img_files = [] + anno_files = [] + for dataset in datasets: + imgset_file = "{}/{}/{}.txt".format(data_dir, imgset_dir, dataset) + with open(imgset_file, "r") as f: + for line in f.readlines(): + name = line.strip("\n") + img_file = "{}/{}/{}.{}".format(img_dir, subset, name, img_ext) + assert os.path.exists("{}/{}".format(data_dir, img_file)), \ + "{}/{} does not exist".format(data_dir, img_file) + anno_file = "{}/{}/{}.{}".format(anno_dir, subset, name, anno_ext) + assert os.path.exists("{}/{}".format(data_dir, anno_file)), \ + "{}/{} does not exist".format(data_dir, anno_file) + img_files.append(img_file) + anno_files.append(anno_file) + with open(testdev_list_file, "w") as f: + for i in xrange(len(img_files)): + f.write("{} {}\n".format(img_files[i], anno_files[i])) + +if 
redo or not os.path.exists(test_list_file): + # datasets = ["test2015"] + # subset = "test2015" + datasets = ["test2014"] + subset = "test2014" + img_files = [] + anno_files = [] + for dataset in datasets: + imgset_file = "{}/{}/{}.txt".format(data_dir, imgset_dir, dataset) + with open(imgset_file, "r") as f: + for line in f.readlines(): + name = line.strip("\n") + img_file = "{}/{}/{}.{}".format(img_dir, subset, name, img_ext) + assert os.path.exists("{}/{}".format(data_dir, img_file)), \ + "{}/{} does not exist".format(data_dir, img_file) + anno_file = "{}/{}/{}.{}".format(anno_dir, subset, name, anno_ext) + assert os.path.exists("{}/{}".format(data_dir, anno_file)), \ + "{}/{} does not exist".format(data_dir, anno_file) + img_files.append(img_file) + anno_files.append(anno_file) + with open(test_list_file, "w") as f: + for i in xrange(len(img_files)): + f.write("{} {}\n".format(img_files[i], anno_files[i])) + diff --git a/data/coco/labelmap_coco.prototxt b/data/coco/labelmap_coco.prototxt new file mode 100644 index 00000000000..82252d2e9dc --- /dev/null +++ b/data/coco/labelmap_coco.prototxt @@ -0,0 +1,405 @@ +item { + name: "none_of_the_above" + label: 0 + display_name: "background" +} +item { + name: "1" + label: 1 + display_name: "person" +} +item { + name: "2" + label: 2 + display_name: "bicycle" +} +item { + name: "3" + label: 3 + display_name: "car" +} +item { + name: "4" + label: 4 + display_name: "motorcycle" +} +item { + name: "5" + label: 5 + display_name: "airplane" +} +item { + name: "6" + label: 6 + display_name: "bus" +} +item { + name: "7" + label: 7 + display_name: "train" +} +item { + name: "8" + label: 8 + display_name: "truck" +} +item { + name: "9" + label: 9 + display_name: "boat" +} +item { + name: "10" + label: 10 + display_name: "traffic light" +} +item { + name: "11" + label: 11 + display_name: "fire hydrant" +} +item { + name: "13" + label: 12 + display_name: "stop sign" +} +item { + name: "14" + label: 13 + display_name: "parking 
meter" +} +item { + name: "15" + label: 14 + display_name: "bench" +} +item { + name: "16" + label: 15 + display_name: "bird" +} +item { + name: "17" + label: 16 + display_name: "cat" +} +item { + name: "18" + label: 17 + display_name: "dog" +} +item { + name: "19" + label: 18 + display_name: "horse" +} +item { + name: "20" + label: 19 + display_name: "sheep" +} +item { + name: "21" + label: 20 + display_name: "cow" +} +item { + name: "22" + label: 21 + display_name: "elephant" +} +item { + name: "23" + label: 22 + display_name: "bear" +} +item { + name: "24" + label: 23 + display_name: "zebra" +} +item { + name: "25" + label: 24 + display_name: "giraffe" +} +item { + name: "27" + label: 25 + display_name: "backpack" +} +item { + name: "28" + label: 26 + display_name: "umbrella" +} +item { + name: "31" + label: 27 + display_name: "handbag" +} +item { + name: "32" + label: 28 + display_name: "tie" +} +item { + name: "33" + label: 29 + display_name: "suitcase" +} +item { + name: "34" + label: 30 + display_name: "frisbee" +} +item { + name: "35" + label: 31 + display_name: "skis" +} +item { + name: "36" + label: 32 + display_name: "snowboard" +} +item { + name: "37" + label: 33 + display_name: "sports ball" +} +item { + name: "38" + label: 34 + display_name: "kite" +} +item { + name: "39" + label: 35 + display_name: "baseball bat" +} +item { + name: "40" + label: 36 + display_name: "baseball glove" +} +item { + name: "41" + label: 37 + display_name: "skateboard" +} +item { + name: "42" + label: 38 + display_name: "surfboard" +} +item { + name: "43" + label: 39 + display_name: "tennis racket" +} +item { + name: "44" + label: 40 + display_name: "bottle" +} +item { + name: "46" + label: 41 + display_name: "wine glass" +} +item { + name: "47" + label: 42 + display_name: "cup" +} +item { + name: "48" + label: 43 + display_name: "fork" +} +item { + name: "49" + label: 44 + display_name: "knife" +} +item { + name: "50" + label: 45 + display_name: "spoon" +} +item { + name: 
"51" + label: 46 + display_name: "bowl" +} +item { + name: "52" + label: 47 + display_name: "banana" +} +item { + name: "53" + label: 48 + display_name: "apple" +} +item { + name: "54" + label: 49 + display_name: "sandwich" +} +item { + name: "55" + label: 50 + display_name: "orange" +} +item { + name: "56" + label: 51 + display_name: "broccoli" +} +item { + name: "57" + label: 52 + display_name: "carrot" +} +item { + name: "58" + label: 53 + display_name: "hot dog" +} +item { + name: "59" + label: 54 + display_name: "pizza" +} +item { + name: "60" + label: 55 + display_name: "donut" +} +item { + name: "61" + label: 56 + display_name: "cake" +} +item { + name: "62" + label: 57 + display_name: "chair" +} +item { + name: "63" + label: 58 + display_name: "couch" +} +item { + name: "64" + label: 59 + display_name: "potted plant" +} +item { + name: "65" + label: 60 + display_name: "bed" +} +item { + name: "67" + label: 61 + display_name: "dining table" +} +item { + name: "70" + label: 62 + display_name: "toilet" +} +item { + name: "72" + label: 63 + display_name: "tv" +} +item { + name: "73" + label: 64 + display_name: "laptop" +} +item { + name: "74" + label: 65 + display_name: "mouse" +} +item { + name: "75" + label: 66 + display_name: "remote" +} +item { + name: "76" + label: 67 + display_name: "keyboard" +} +item { + name: "77" + label: 68 + display_name: "cell phone" +} +item { + name: "78" + label: 69 + display_name: "microwave" +} +item { + name: "79" + label: 70 + display_name: "oven" +} +item { + name: "80" + label: 71 + display_name: "toaster" +} +item { + name: "81" + label: 72 + display_name: "sink" +} +item { + name: "82" + label: 73 + display_name: "refrigerator" +} +item { + name: "84" + label: 74 + display_name: "book" +} +item { + name: "85" + label: 75 + display_name: "clock" +} +item { + name: "86" + label: 76 + display_name: "vase" +} +item { + name: "87" + label: 77 + display_name: "scissors" +} +item { + name: "88" + label: 78 + display_name: 
"teddy bear" +} +item { + name: "89" + label: 79 + display_name: "hair drier" +} +item { + name: "90" + label: 80 + display_name: "toothbrush" +} diff --git a/data/coco/labels.txt b/data/coco/labels.txt new file mode 100644 index 00000000000..146dd8daae0 --- /dev/null +++ b/data/coco/labels.txt @@ -0,0 +1,80 @@ +1,1,person +2,2,bicycle +3,3,car +4,4,motorcycle +5,5,airplane +6,6,bus +7,7,train +8,8,truck +9,9,boat +10,10,traffic light +11,11,fire hydrant +13,12,stop sign +14,13,parking meter +15,14,bench +16,15,bird +17,16,cat +18,17,dog +19,18,horse +20,19,sheep +21,20,cow +22,21,elephant +23,22,bear +24,23,zebra +25,24,giraffe +27,25,backpack +28,26,umbrella +31,27,handbag +32,28,tie +33,29,suitcase +34,30,frisbee +35,31,skis +36,32,snowboard +37,33,sports ball +38,34,kite +39,35,baseball bat +40,36,baseball glove +41,37,skateboard +42,38,surfboard +43,39,tennis racket +44,40,bottle +46,41,wine glass +47,42,cup +48,43,fork +49,44,knife +50,45,spoon +51,46,bowl +52,47,banana +53,48,apple +54,49,sandwich +55,50,orange +56,51,broccoli +57,52,carrot +58,53,hot dog +59,54,pizza +60,55,donut +61,56,cake +62,57,chair +63,58,couch +64,59,potted plant +65,60,bed +67,61,dining table +70,62,toilet +72,63,tv +73,64,laptop +74,65,mouse +75,66,remote +76,67,keyboard +77,68,cell phone +78,69,microwave +79,70,oven +80,71,toaster +81,72,sink +82,73,refrigerator +84,74,book +85,75,clock +86,76,vase +87,77,scissors +88,78,teddy bear +89,79,hair drier +90,80,toothbrush diff --git a/examples/02-fine-tuning.ipynb b/examples/02-fine-tuning.ipynb index 07ca8df4d74..3372f549cd9 100644 --- a/examples/02-fine-tuning.ipynb +++ b/examples/02-fine-tuning.ipynb @@ -762,7 +762,7 @@ }, { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYcAAAEPCAYAAACp/QjLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnXd8VFX6/z8nJBAS0iAkQJCuoCC6IJbFEjvWtay6rq79\nu/ysW3R13XVXEAsdFZFFFFxXwYYVEQSlKChFWpBOSEgjpJOeSeb5/fF4cu/cuXfmzmRa4nm/XrxI\nJndumbn3fM7neZ5zjiAiKBQKhUKhJyrcJ6BQKBSKyEOJg0KhUCjcUOKgUCgUCjeUOCgUCoXCDSUO\nCoVCoXBDiYNCoVAo3AiqOAghFgghioUQWRZ/TxVCLBdCbBdC7BJC3BXM81EoFAqFPYLtHBYCGOfh\n7w8B2EZEpwPIBDBDCBEd5HNSKBQKhReCKg5E9C2ACg+bFAFI/PnnRABlRNQczHNSKBQKhXfC3Uuf\nD+AbIUQhgAQAN4f5fBQKhUKB8Cek/wFgOxH1AXA6gDlCiIQwn5NCoVD84gm3c/g1gOcAgIgOCSEO\nAxgKYIt+IyGEmgBKoVAo/ICIhD/vC7dz2AvgEgAQQqSDhSHbbEMiUv8C9O/pp58O+zl0lH/qs1Sf\nZyT/awtBdQ5CiMUALgCQKoTIA/A0gBgAIKJ5AJ4HsFAIsQMsVI8TUXkwz0mhUCgU3gmqOBDRrV7+\nXgrgmmCeg0KhUCh8J9xhJUUYyMzMDPcpdBjUZxlY1OcZOYi2xqVCgRCC2sN5KhQKRSQhhAC104S0\nQqFQKCIQJQ4KhUKhcEOJg0KhUCjcUOKgUCgUCjeUOCgUCoXCDSUOCoVCoXBDiYNCoVAo3FDioFAo\nFAo3lDgoFAqFwg0lDgqFQqFwQ4mDQqFQKNxQ4qBQKBQKN5Q4KBQKhcINJQ4KhUKhcEOJg0KhUCjc\nUOKgUCgUCjeUOCgUCoXCDSUONtlftj/cp6BQKBQhQ4mDDbKKs3Dq3FNR56gL96koFApFSFDiYINl\nB5ahqaUJP+T/EO5TCSgNzQ2od9SH+zRcOFB2ALVNteE+DYXiF88vQhyaWpra9P5lB5fh9F6nY23O\nWp/eV+eoAxG16diShuaG1p9bnC1t3l9jcyMu+u9FeHzl423eV6CobapF5n8z8e/V//Z7H6V1pbj3\n03tR01QTwDMLP9WN1ThaczTcp6H4BRFUcRBCLBBCFAshsjxskymE2CaE2CWEWNPWYxob42O1xzDg\nxQH4bN9nfu2vsqES24q24d/n/xtrc+2LQ01TDU7/z+mYv3W+5TaldaXYVLDJ674+2vMRLn7rYgBA\ns7MZvWf0RvLkZIx7exxyK3NdtrUjHESEB5c9iE5RnbBkzxI4yQlHiwMb8zd6fW8wmfH9DAzvORwL\nty9EcU2xz+8vry/Hpf+7FB/t/Qif7/scADB9w3R8svcTn/flaHH4/B4AqHfUY3HW4tbfa5tq8Y+v\n/4FhrwxD/vF8v47xfd73GPmfkbjto9taXyurK/Pr/PzlSNWR1mOW15dj1vez2hRm3VywGf/4+h9Y\nsG1B6+ewLncdPt7zcUDO1y6Hyg8FbF9EhIeWPYSdxTtdXvf3Xgo3wXYOCwGMs/qjECIZwBwA1xDR\nCAC/9ecg0hnUNNVg6CtDW3v4RIQ/fv5HDEsdhr+v+juanc0AuNf86IpHbd0YKw+txHn9z8Mlgy7B\nlsItqG2qxbi3x2H6huke3/fXFX9F15iuWLBtAQAgpzIHS/cvddlm/NLxuPvTu03fP2fTHHyd/TUA\nYPGuxfg+73uU1pViY/5G9Enog+w/ZePigRdjzPwxmLR2ElZlr8LVi67Gma+f6fWa5myeg40FG7Hs\n98uQGpeKDXkb8Ma2NzB2wVjsOLrDbfuaphpUNlQC4B7swm0LQURwkhN3fHyHm6gcrTkKJzm9nsdL\nP7yEPSV7AABF1UV4aeNLmHf1PNx26m2YtmEaAH6w/vXNv7C1a
KvX/d372b24oP8FmHX5LLy/+33U\nNtXi2XXP4l+r/+XVwe0u2d16PyzKWoQRc0fYugYjn+77FLd/fDsq6isAAFe8cwUOVRzCdcOuww3v\n3YCG5gYQEaaun4qMmRmt96QVOZU5uHrx1Xjh4hewpXALSutKsb9sP3rN6IXdJbvdtt91bBfmbp7r\n83l74+EvH8aTXz8JAHhrx1uYsn4KRrw6ArO+n4W1OWtbOyWbCjbhYPlBj/tqbG7ErUtuRU1TDWb9\nMAvvZL0DAJi0bhIe/vJhr07/ze1v4p9f/7P1d39Do0drjmLI7CHYVrQNALcjB8oOYHPB5tbQZk1T\njW33vzJ7JT7e+zEueesSfHngSwDcmRsyewh+LPzRp3MjIr+jDiW1JYHpPMiTCNY/AAMAZFn87QEA\nz9jYB/1l+V/on1//k178/kVam7OWSmpLqKK+gu74+A5KnZpK+0r30RMrn6BBLw2icxecS06nk+Zt\nmUcj546kBkcDXbDwAnpty2tUUV9BV71zFfWf1Z+ue/c6smLN4TV01yd30ah5o+iVja8QEdGY18bQ\nNYuuoXMXnEvDXhlG/1j1D3I6na3vqW2qpTmb5tDNH9xMA18cSOV15dR7em/aU7KHLn3rUuoyqQvt\nOLqDiIi+2P8FDX5pMHWf0p3yq/Jdju1ocVDatDQa89oYqmmsocQXEunXb/yaFmctpqe+foqeWPlE\n67Y7ju6gR5Y9QqfNPY2mrZ9GadPS6HDFYcvrWn14NaVNS6ODZQeJiGjC6gn0wNIH6ISZJ9D/ffZ/\ndM7r51CLs6V1+8r6Sho9bzQNemkQHSo/RFe8fQV1fbYrzdsyj17d9Cr1nNqTzpp/VuvnkFWcRYkv\nJNJLP7zkduyyujK6/aPbqbyunLYXbaeoiVF058d3EhHRYyseo0eWPUJERPlV+dR9Sne665O76OL/\nXkxp09Jo/OfjLa+JiOjHwh+pz4w+VO+op4r6Ckp8IZFmbJhB1yy6hobPGU6rDq1y2b7F2UKbCzZT\nc0sz1TvqacjLQ+iEmSfQzqM7KX1aOvWZ0YdWH17dun3B8QL6Nvdb02Ovy1lHKw6uICKi69+9nro+\n25Xe2/Ue5VTkUI8pPcjR4iCn00k3f3AzDXl5CGW+mUkjXh1Bg14aRBvzN7rtr7qxuvWe+NtXf6O/\nLv8rERHd+N6NtGDrAnpsxWOUMSODbvngFrf33vT+TRT9TDQdKDtA9Y56mrB6ApXUlrQ+D1sLtxIR\n0YqDK2jc2+OopLbE4+dKRFTvqKeE5xOo+5Tu1OBooLNfP5uWH1hOKw+tpAeWPkCnzT2Nhs8ZTnd+\nfCd1mdSFfr/k9x73N/W7qXT1oquJiOij3R/R+QvPp8LjhZT0QhKdt+A8+u/2/7q9J68qjxwtDqpp\nrKFe03tR4guJVFJbQj/k/UDdnu9GhccLWz87/f1rJKcip/W7WrRzEXWZ1IVuW3IbOVocdPbrZ1P/\nWf3ptLmnUfxz8ZQ+LZ2in4mmG9+7kZqam0z31+JsoaLqIiIiuvKdK+n1H1+ntTlrKX1aOjlaHPRN\n9jeECaB7P73X6+e8tXArVdZXUkV9BV3034tanwci/g7s8uSqJ+nRFY8SERE38X623f6+0fYBPIvD\nLACvAFgNYAuAP1hsR8+tnk7PrHmG7l96P501/yxKnpxMUROj6IGlD9DLP7xM/Wf1px5TelBeVR4N\nnT2Unlz1JKVPS6e9JXuJiGhj/kaKey6O4p+Lp9s/up2qG6tpwIsDXBoASXFNMfWa3oumfDeFpnw3\nhUprS4mIG7CE5xMopyKHjtUco1HzRtEDSx+g2qZaWrRzEQ14cQBd/+71tGDrAio4XkBE/HCfOf9M\nOvmVk1vFav6P86nvzL60/MBy+u37v6U3t73pcvyvDn5Fo+aNoiEvD6E/ffknuvStS2n2xtl09yd3\n05jXxpies+SuT+6il3942
fRvP+T9QGnT0mjloZWtr2UVZ5GYIGjc2+OoxdlCZ79+Nj279llyOp1U\nWltK5y04jx784kGa9f0sin02li7732WUVZxFqVNTqceUHpRVnEWj5o2ixVmLaWvhVuo3qx89sfIJ\nypiR4XZDP/jFg5QxI4MufetSuvDNC2nC6gmU9EIS5VXlUfcp3Sm7PLt126PVR2nqd1Pp2bXP0u5j\nu6n39N7U4myhktoSWpuzloiIyuvK6dwF59KS3Uvo2sXXulz31Yuups6TOtOXB76k17a81toYEREt\n27+Mhs8ZTsmTk+mOj++gCasn0HXvXkfPrXuOYp6JoT9/+Weavn463fXJXeRocdCdH99JyZOTKXly\ncuv9JGlxttApc06h3tN709Hqo5T4QiI9s+YZuuuTu+jF71+kuz65q3Xb5pZm2lq4ld7f9T5V1FfQ\nQ188RJO/nez2PY3/fDxlzMigwxWHKXVqKh0qP0RERO/sfIcu+99l1HNqT9pWtI3SpqXRruJdre87\nUnmEUian0ONfPU63fHAL3fnxnXTS7JPopNkn0T2f3EMZMzJo+JzhVNNYQ0NnD6VrFl1DQ2cPbd2/\npKyujNbmrKXNBZvJ6XTSlwe+pHMXnEsXLLyAZm6YSalTU10aS6fTSV/s/4Ke+vop2lW8i1Imp7T+\nvbqxunW759Y9R7/78HfUfUr31s+xsbmRek7tSQ9+8SDd8fEdtPzAcjr11VNbOxu1TbX0xMonqPOk\nznTt4mtpwuoJdPMHN9Pdn9xNz659ls5feD6dNPskenTFo1TdWE2DXxpMp809zeUe13PzBzdTv1n9\nqMXZQvd9eh9NXDORUian0ENfPESXvHVJ63HrmuroSOURqnfU01XvXEU3vnejqei8/uPrFPtsLD23\n7jlKm5ZGdU11RER01vyzaNn+ZfTIskfo4WUPU/LkZKqor2j9vF7/8XWauGYivbrpVSqtLaVp66dR\n6tRU6j6lOw18cSDd9+l9lDI5hQqOF9CS3Uuo2/PdWkXNDHluLc4WOmHmCa2d0PYsDq8A2ACgK4Ae\nAPYDONFkOxo9+ml6+mn+t3r1aiIianA0tH4409dPpze2vkFERIuzFlPnSZ1pXc46lw9Q38snIno3\n613qObUnDXppEA2dPZRu/fBWmrZ+Gl3+v8tdeueS7PJsl4a5sr6Szl94PsU+G0sX/fcit94pEdGu\n4l2ECaAVB1eQ0+mkuz+5m2547wZaum8pERHN2zKPbltym8t77v7kbpqxYQbN2TSHMAH02pbXaF/p\nPkqdmkqJLyRSY3Oj23EkS3YvoUvfupSqG6vpkrcuoeUHlhMRN4ipU1Nbj6v/TC5961LalL+p9RpP\n/8/pdPn/Lqf0aen06IpHW2+8lYdWUlVDFRFxj++1La8REbuRqIlRNODFAfTqpleJiHtR8mciop1H\nd1LPqT2puKaYLv/f5XTyKyeTo8VBt3xwC/3qP7+iG967wfKaiIiGvTKMNuZvpHs+uYe6TOpCO4/u\npAeWPkDXLLqGBr00qNU1SN7e8TYNfmkwtThbqK6pjjJmZNC9n95LT656kvrO7Etf7P+Cahpr6MI3\nL6S45+IopyKHnE4nvbntTapurKai6iJKeiGJ7l96P132v8voeMNxmrFhBl3+v8vJ6XRSdnk2Nbc0\n05LdS+iM186gm96/ic6cfyZd8fYVdLDsIPWa3ovOX3g+fbr3U4/f1bi3xxERO7h1OesotzKXUian\n0ANLH6CeU3vSNYuuad2+sr6SYp6JoYv+exEREc3cMJMSX0ikfrP60d+++hv9Zflf6OFlD7f2rk//\nz+lU01hDczfPpRvfu5Eq6yvpyneupNHzRtOlb11KTqeTZm+cTWnT0mjZ/mVExAI2et5oOuO1M6jX\n9F70zs536KEvHqIXvn2B5v84n7pM6uLVxY15bQytOrSKluxeQsmTk6ngeAGtPryaBrw4gBb
tXEQb\njmxw2f4vy//i8oyMnjeaHlj6AO0p2UOnzT2Nbnr/JjpSeYR+9+HvKGpiFO0+tpu2F22nrs92peFz\nhlNORQ6lTE6h25bcRnd8fAd9+NOH1H9Wf/rr8r+6iNjekr3Uc2pPGjp7KH2b+y0NemkQZRVn0Z+/\n/HNrp88M6Zbmbp7r8rrT6aTT5p5GMzfMpMEvDaZ/f/Pv1r/N2TSHbvngFuo3qx/tKt5Ft3xwS2vn\n5d/f/JtGzh1JT339FP1+ye8p4fkEOvmVk+lI5RHKrcylLw98SU6nk/785Z/prk/uol7Te9HMDTMp\nbVoaTVs/jSrrK1uP878d/6OTXzmZukzqQm/veJtmLppJ6Velt7aV7VkcngAwQff76wB+a7IdpacT\n7d9v+t254XQ6Ka8qz9Z2Wwu30v7S/bTj6A56c9ub9MiyR+i+T+/z2ADraWpuanUWVmwv2m75t+zy\nbEqflk7birbRKXNOoefWPUcpk1MovyqfaptqaewbY+lYzTFyOp004MUBLo2FGdWN1ZTwfAL97sPf\n0WX/u4zSpqXRje/dSH1n9nUTSyvqHfU05bsprYJhh+KaYhfx/SHvB0qenEwXLLyAzltwHvWY0qNV\nLOqa6uho9VEiolbbbRWykTyx8gm68b0bKXVqKs36fhYNeHEApU9Lp/K6cjrecJwOlB1w2d7pdFJ5\nXXnr72V1ZfTU10/Rje/d2BoGIOKeqQy1GLnqnato0EuDqKyujIj4uz75lZPplDmnULfnu9GZ88+k\n4XOG0yd7PqEDZQco+ploWrhtIRERnfjyiRT/XHxrT9KM0tpSSng+gTblb6IeU3pQz6k96Yq3r6An\nVj5BzS3N9IeP/kDrj6x3ec8fPvoDfb7v89ZrLDheQHtL9tItH9xCYoKgfaX7iIhoS8EWl+uUHK44\nTOnT0imrOKv1tXU566j39N70xtY3aO7mua1h2U35myh9Wjr1ndmXdh7dSRX1FdR5Umf6Jvsby2si\nInp27bN0/9L7aejsoXTpW5fStYuvpdHzRtPirMWm2+8q3kXD5wwnR4uDiNgR3vzBzRT9TDS9+P2L\nrfdVi7OFthVta33f7R/d3uoQ7vv0PsqYkdHaOy+rK6Mr37mSRs4dSe9mvUt5VXn0h4/+QBPXTKTn\n1z1PV75zJaVNS2u9T7zd69Itf5v7LY15bQzdv/R+WpuzlobOHkotzhZqam6i5pbm1u1La0sp9tlY\nGvLyEHI6nbT+yHqKfy6eRs4dSSe+fCIV1xS3bltRX2F6n+RX5VPnSZ3pT1/+iYiIdh/bTbd8cAul\nTE6hyd9Opi/2f0Hp09Lpu9zv6Ie8H6jn1J407u1xNH399NZ9tGdxGAZgFYBOAOIAZAE4xWQ7uvVW\nogULPH5/7ZYhLw+h5MnJ9PIPL9N1715HV71zlel2T656kt7a/pbX/Y17exwNemkQHW84TtuLttNf\nl//VpaEMFftL99M32d/QN9nfuIUuJE6nkz7d+6mbqzOy/sh6wgTQ5G8nk9PppIe+eIje3/V+ME67\nlezybLfeZFZxFi3dt5QcLQ6a/+N8+u37v211Vl9nf936kD+y7BG68b0bvR5j5NyRNOTlIfTi9y/S\nBz99QD2m9KBjNcf8Ol87HSIiMg2P7C3ZS31n9qWkF5JaQxJEHEo9YeYJrd/PvtJ9Xr+rrOIsipoY\nRZlvZlKDo4FOmXMKjXltjMdcgHGfTqfTVNysqKivaM2j6ffx+b7P6YKFF1CfGX3opNknUXldOWWX\nZxMmwDRn44knVz1JnSd1ppkbZtI5r59D3ad0p9kbZ1tuf/2719Pfvvpb6+/HG47Td7nf+fT9rstZ\n5yYch8oP0aVvXeoWHZm0dhJFTYxqzb8QRbA4AFgMoBBAE4A8APcAGA9gvG6bxwD89LMwPGKxH5o9\nm+i++2x/pu2KeVvm0Ue7PwrY/nYc3UF7SvYEbH+RQHN
LMz224jGqbaoN96nYoqqhylYj8Kcv/+SS\nn9H3PkPNwbKDtGjnIpfXGpsb3VyZN5xOJ2W+mdmabD9ccdglnxQJnLvgXJr/43yf3tPU3NTaWais\nr6QHlj7QGmY1o6qhyqdEsi84nU4XESDiQhZjdKAt4iD4/ZGNEIJ+/JHwhz8AP/0U7rNRKAJHdkU2\nCqsLcW6/c8N9Kr8ojjceR7fO3RAlOvY4YCEEiEj49d72Ig4OByElBThyBEhJCfcZKRQKReTTFnFo\nN7IZHQ2MGQP80LGmN1IoFIqIpN2IAwCMHQts2BDus1AoFIqOT7sSh3POUeKgUCgUoaBdicOIEcC+\nfeE+C4VCoej4tJuENBGhpQWIjwcqK4HY2HCflUKhUEQ2v4iENAB06gT06wfk5IT7TBQKhaJj067E\nAQAGDQKys8N9FgqFQtGxaXfiMHCgEgeFQqEINu1OHJRzUCgUiuCjxEGhUCgUbihxUCgUCoUb7VYc\nIrECt64O+L//C/dZKBQKRdtpd+KQlAR06QKUlIT7TNxZsgRYsCDcZ6FQKBRtp92JAxC5oaXXXwec\nTqClJdxnolAoFG2j3YrD4cPhPgtX9u8H9u4FOncGGhvDfTYKhULRNtqtOBw8GO6zcGXBAuDOO4Gu\nXYGmpnCfjUKhULSNdikOQ4dG3gR8O3YAmZnsHJQ4KBSK9k67FIdhwzRxKCsDZswI7/kAHEqKjVXi\noFAoOgbtUhyGDuX4PhGwejUwe3a4z4jFoXNnrqRS4qBQKNo77VIcUlKAuDigsBD48Ufg2LHQjHs4\nehTYtcv8b42NLAzKOSgUio5AuxQHQAstbd0K1NcDtbXBP+bHHwOzZpn/TS8OqlpJoVC0d9qtOAwd\nCuzZw86hWzd2D8GmoQFobjb/m3IOCoWiIxFUcRBCLBBCFAshsrxsN0YI0SyEuMHuvocNA1auBGJi\ngFNOCY041NcrcVAoFL8Mgu0cFgIY52kDIUQnAFMALAdgezm7YcOAZcuA0aOBtDTlHBQKhSKQBFUc\niOhbABVeNnsYwIcAfJotaehQwOEIvTg4HOZ/U+KgUCg6EtHhPLgQIgPAbwBcBGAMANs1RwMGcEM8\nejQ3xso5KOzidAJC8D+FQmFOWMUBwIsA/k5EJIQQ8BBWmjBhQuvPmZmZyMzMxEMPAWPH8lQaR44E\n/2TtiIMa5xD5/OtfQL9+wPjx4T4ThSKwrFmzBmvWrAnIvsItDqMBvMu6gFQAVwghHET0mXFDvThI\n5MjotDSuWgok330HFBQAt9yivWYlDvK16GhVytoeqKhg96BQdDRkx1kyceJEv/cV1lJWIhpERAOJ\naCA473C/mTB4Ixg5hxUrgK++cn2tvt4859DUxI4BUGGl9kBTU2SuB6JQRBJBdQ5CiMUALgCQKoTI\nA/A0gBgAIKJ5gTpOMMQhN9e9kbdyDjKkBChxaA84HEBlZbjPQqGIbIIqDkR0qw/b3u3vcYIlDomJ\nrq8pcegYOByhKWBQKNoz7XaEtJ7UVKC0NLBx5Nxc9yk5rEpZlTi0LxwOFVZSKLzRIcShc2cgIYET\njYGguRnIzzcXB+Uc2j9KHBQK73QIcQACG1oqKuJ1oH0Rh86d+WdVyhr5OBzA8eOqqkyh8IQSBxNy\nc3l/RnGwmlvJ6BxUoxPZyNCgcg8KhTVKHEzIzeXJ/FTOoWOixEGh8I4SBxM8iYPKObR/HA5e0lVV\nLCkU1nQYcejbl6fRCARHjgAnn8xhJH0FlBKHjoHDAWRkKOegUHiiw4jDuHHA0qWBWS40NxcYOJAb\n/Pp67XUVVuoYOBxAnz5KHBQKT3QYcfjVr7iR3rOn7fvKzeWJ2eLjgbo6fq25mSuYlHNo/0hxUGEl\nhcKaDiMOQgC/+Q3w6adt2w8Ri0P//iwOMu8gHYQSh/aPcg4KhXc6jDgAgRGH8nJeejQx0VUcGhqA\nrl29i0OXLqqUNdJ
ROQeFwjsdShwuuADYvx84etT1dX2eYOtWYP16632UlgI9e/LPRnFISFCzsnYE\nVFhJofBOhxKHmBhePjQ7W3utpQUYPhzYsoV//89/gHfesd5HRQXQvTv/bBSHbt3YORiT3uEMK/3t\nb8CCBaE7XkdAhZUUCu90KHEAuNev7xF+9RVw4ACwejX/vnGj50ahogJISeGfjeIQFwdERblP8BdO\ncdi0CcjLC93xOgJKHBQK73RIcdA/9G+8AVx4IYeSamqAXbv8F4fYWF7tzRhaCpc4EAE//aTWJvAV\nh4Nn8q2vV/khhcKKcC8TGnDS0rTG/9gx4OuvecnPCy8ENm/mRLOnWHN5ubk41Ndr4mBMSjc2ams/\nhFIcjh0DysqUOPiKw8HfU48e/Pn16RPuM1IoIo8O6Rxk4//++8C113LOITaWcw1XXNE25xATYy4O\nclbWUIrDTz/x/0ocfMPh4O9RVZYpFNZ0SHGQjf/evcAZZ/DPY8cCb78NXHklN6YtLebv95SQ9uQc\nwlHKuns3cOKJShx8gYi/v5gYVVmmUHiiw4mDPqyUn89zLgHAr3/NjfbYsUBSEoePzNA7h7g493EO\nkZRz+Oknvp6OKg47dwKzZwd2nw4Hf4dC8HdlVpqsUCg6oDjonYNeHM4/H+jdGxgwwL2iSY9VzsFb\nWEmJQ+DZuLHtgxqNyJASwP8r56BQmNMhxUE2/Hl5mjicdhqQlcU9RmNFkx471UqRIA6yUqkji8Ox\nYzwoMZDoxaEjOoejR7WybYWiLXRIcSgt5Qa7spLDTJIePfh/fejJiJU46KuVIiGsJAXwpJOA6mr3\nsRcdgWCLQ0d0Dl99Bbz0UrjPQtER6HDiEBvLDfWePRxG6tTJfRtvzqEtCelQicOePbzmRKdOfJ7V\n1YE/RmkpT4UeLo4d4+8pENOwSzq6cygtDc81lZUB06eH/riK4BFUcRBCLBBCFAshsiz+fpsQYocQ\nYqcQYr0QYmQgjtuzJ8+hJENKZn8/dowb8TffdO11G52DnLLbbs5B9kYD2aCZceQI508AIDk5OKGl\no0eBFSsCt4iSHZYs0aY/kd9RTU3g9t/RnUO4xGH/fuCtt0J/XEXwCLZzWAjAU98zG8D5RDQSwCQA\nrwXioGlpwLZtnsWhpIQTnnffDfz97/x6YyM3FvHx/LvZrKxmzkE/8V5UlHnoKdAcOcJrTgAsDhUV\ngT+GFMYVKwK/byteeglYuZJ/lqGzQIaWfgnOwWzm4GBTX9/xPstIgyj4nU49QRUHIvoWgGWzRUTf\nE1HVz7+C3BnvAAAgAElEQVRuBGDRnPuGN+cgcw6bNwO/+x2vILdwoeYahODt/Jk+A+Cfg90jNYpD\nMJxDOMTh8GG+NoDFoW/fwM6BpJxDcKirU+IQbGbPBp59NnTHi6Scw70AlgViRz17Atu3e3cOW7YA\nl18OTJgAfPaZa0gJ8K+UFQhN3iFU4nDGGcDataFpRBsbgYICvraWFi4rHjZMOQdfCKc4hMOxBIsf\nf+QZjyOJgoLQThYZEXMrCSEuBHAPgLFW20yYMKH158zMTGRmZlruLy2Nb1Zv4pCfD/zzn+wUdu1y\nTUYDvs2tFA5xOOEE/jmY4tC/P/+8YQPg4SMPCLm5bJuPHGFhSE4GevUKnjiE2jnU1vLx9B2QQFNa\nyoM3Q01HCytlZ3P0IZKoqrKe2UGyZs0arFmzJiDHC7s4/JyEng9gHBFZhqD04uANuViPJ3HIyWFR\nGDaME9L5+azM3pyDnbBSsMWBiMdwhMI5xMUBl1zCtfPBFofDh1mMjhzhkFJaGs+eGqywUqidw/z5\nwLJlXG4aLEpLuUov1HS0sFJDgxZWjRSOH+ecpieMHeeJEyf6fbywhpWEEP0AfATgdiIKWE2MFAfZ\nszaSmsqN/q9+xaWgMTE8R9GGDfbEIdzOobKSb5KkJP492OKQnh6agXaHD/NI9oICo
KhIE4eO4hxK\nSjjZvm1bcPbvdHJJaXsIK1VXR/bgzYYG7dmPFKqq+LxCRbBLWRcD2ABgqBAiTwhxjxBivBBi/M+b\n/BtACoC5QohtQohNgThuWho3+r16mf89OprDR3JSPgAYMQL49ltXcZAzrTY1adVKdnMOwZx8T59v\nAOyJw/ff+26TpTjExobmpjx8mJ1c9+48r1JamjaoMVAYnUMoxaGiAhgyBJg2LTj7r6xkgQiHOPga\nVnrlleB9DoGgvj7ynENVFZ9XqAhqWImIbvXy9/sA3Bfo46alsTCYDYCT9OwJjBmj/T5iBPDeezyl\ntx7pHrw5BykkQPAbHTNx2LHD83vef5+vZdQo+8eR4hCqmWYPHwauv56vbcuW0DiHUDakFRXAX/8K\n/Otf7t9hICgt5Xs+HIlhX8NKlZWRPV16pIaV9O1MsImkaqWAMXIk8OGHnrd5/HHgssu030eM4F6X\nMVloJg7hLmXV5xsAe86hutp9JtqWFs8JLr04hMo5DBzI17Z5c2hyDqF2Dv378/154EDg919ayiHA\n9hBWksn5SKW+PjLDSqF0Dh1SHDp1As4+2/M299yjzbUEsDgArtVKgCYOVtVK8udonQcLhXPQ51Ps\niMPx4+7iMHUqMGWK9Xv0YSV/e3l1dfYbd704HDzYMZ1DSgqHJ4MhtjIZHa6wUkuL/UFaNTXKOfhK\nxImDEKKbEKLTzz8PFUJcK4SICf6phZYBA7gh9OYcZM6hqQn4zW/cXQMQnrCSHXEwjqIuKuKqLStq\na9vuHBYtAv7xD8/bELGzqa9nQZDX1hFzDikpwcvhSHEIV1gJsC9MkS4O9fX8OUZKBRYRP8MRJQ4A\n1gHoIoTIALACwB8AvBnMkwoHUVHA6NFARobr61ZhpepqHjhXWNg+xMEsrFRZCRQXW78nEM6hutrz\npIBNTSwCzz/PAi2ENrYiLY0bU08r9/lKJDiH2NjgPOThdA4dTRykeAcrtPTmm751EGpqWCAiTRwE\nEdUBuAHAq0R0E4ARwT2t8LB6tWsFEwAMHcpVPsaEtPxid+8Ojzj4E1YyOoeqKnvi0JaEdF2dZ3t+\n/Dj/ffFiDikBrs4hOtrzyn2+Ei7nQMTfUbCdQ69e4QsrAfZdS6SLg7yeYIWWHn3Us2s3UlXF922k\niQOEEOcAuA3AF768r71hVt109dXAF1+4l7J6E4dg3vgVFa75ksRE72s6WDmHo0et3xOIUlZvJYHV\n1Rw62rwZmDWLX9OLAxDY0FJTk1bxEUrnUFPDx+3cOfg5h/YSVorkhLT8foIhDvX1/Cz60kYcP87F\nBpEmDn8G8CSAj4noJyHEYAC/mLWmLrmExwjU1kaGcyDiG1bOHAuwqHXrxjeQFcePa3XwEukcrJKI\ngXIOnqx5dTWQkMACcOKJ/FpqKpd7JiZqvwdKHMLlHPTzdgXbOTQ3h3b2TsB3caitbR/OIRhhpcJC\n/t+X66+q0sQhVN+tV3EgorVEdC0RTRFCRAEoIaJHQnBuEUFCAvDrX3PMu3NnLecgH+49e0IrDg0N\n3LgZXU5KCo+ONUMmfGNj+SaTVFby/qxEJRClrHacQ0KC62tCAM88o82OG8hy1nDlHIziEKycQ8+e\nnD8LtXuQ33FHCSsF0zkUFPD/vopDjx783YbqnrVTrbRYCJEohIgHsAvAHiHE48E/tcjhqqv4gRbC\n1TkIYS4OwRznUFvr6hokqanW4tDQwDdVr16uoaWqKi7dtco7BKqU1VdxMNK9u5YvaW5um4sIpHNw\nOIB16+xtGyrnkJpqPoo/2Eix6ygJ6fp6ftYjSRwSEzkkGarQkp2w0ilEdBzAdQC+BDAAXLH0i+Hq\nq/mhA1xzDoMG8c0TSufgSRysGs3qar6xUlK0RtbpZMdw4onexSGYCWl5bp7QJ9yXLeMxKv4SyIn3\nli8H7rrL3rZ6cQhmzkGKQ6iT0jLUaee4Tmfkh
5UaGrhTEoywkj/icPw4F2ZEmjhE/zyu4ToAnxOR\nA0CII5rhZdAgYN8+/lkfVho8mBuYSBEH6Ry2bdPOF+AbKyGBb3bpHGpqeD8ZGdZJ6VAlpL05h6Qk\nTRyOHfNcYeWNQE6899ln9h/UYDuHlhbuXaakhGYlQiN1dfw92XEsMm4eyeJQX89hnEhyDpEoDvMA\n5ADoBmCdEGIAgCoP23dI5Bz5+rBSfDyXXwZDHBoazGv77TiH//yHV7aTmDmHykq+2dLTPTuH+Pjg\nl7J6E4fkZC1XUlnZtrLWQDkHpxP4/HP7jUewcw7V1fxdyVmGQy0O9fV8j9k5ruyNR0K10g8/mCd4\nGxqCKw6dOnUAcSCil4kog4iuICIngFwAFwX/1CITGVZqbOSHfPDgwJWyPvig1qO8917g7bfdt6mt\n5cokI3pxOHqUF3yXmDmHqipudK3EQQ646dpVy6F4KpW1oq6O32vVo7TjHPRhpYoK69yKHQLlHDZt\n4u/BH+cQjLCSDDsAoc85OBzckbEbVqqp4c/Al2ckGBVYTidwzjnA3Lnuf5POIVhhpX79fA8rJSaG\nboZkwF5COlkIMUsI8aMQ4kcA0wGEYa2pyEDvHDyJg7HR2bMHWLDAer9NTcCrr3LZLBGwahUvQGTE\njnMwioN0DvrErnQOvXqZh5UaGvi6oqI48e6vG5KNp1Uj6o84tGXEdKCcw2efATfdxPeCnYY42GEl\n2XgAoQ8r1dezs7YrSjU1fC/60jheeSWPhQkkDgff308/DWRluf4tmM6hsJBD1b48TxHpHAAsAHAc\nwE0AbgZQDWChx3d0YPQ5h9hYnp/fjjhs3QosWWK9X9kb/vprYO9ejq2bJZhlrsCIXhyKinjiOtnT\nl84hJcW+c5D5Bom/5azy4bJ6yHwVh8pKbbSxPwTKOaxcyQ2W3Yc12OIgq1mA0IeVpMO0e9yaGm54\nfRGH3Fx7uaamJvsOo6mJ7/GnngKefdb1bzIhHWhxIOLnc+DADhBWAjCYiJ4momwiOkREEwAMDvJ5\nRSxG53DHHdzz0GNWylpf7zrGwEhpKffQv/4aWLuWb1qz2n5vzoGIH6KEBJ7aG/DPORjFwd9yVm8l\ngf44B8D/vEOgnENVFY/g9vawHjvG30mwcw7hDCvJe8WuY/HHOZSUeJ6jS3LLLfwM2UGOlh82zLWz\nIUOqwRCH0lIORyYmdoxqpXohxHnyFyHEuQAibDLb0KEvZY2N5YZrsEEqzZyDp8FmAN80Z50F7NrF\nic6rrjJ3Dt7EoaKCH9QRI7Q1A2TIwegcPCWkzZyDP+JQV+fZnvsjDlFR/ucdAuUc5AJPcXGeH9Zr\nr+XvMxQ5h3A5B3mv+OIckpP5Zzsi1tzMn58dcSgp0aqBvCHFIS7O9f50ODhhnJgY+JxDQQFXCNp9\nnqT7j9RxDv8PwBwhRK4QIhfAKz+/9otE7xyM4SSJmTjU13sXh4wMFohly4Abb/TNOfTowfs4epTd\nwEknaXkH2QDrE9KVla5hJaMVN3MO/oaVUlPbJg5JSa7VSiecEH7n0NTE33/Xrp57l0VFvPBUKMJK\n0jkEO+dQUgK8/LL2uz6sZKexl0UVdgeLyo6AHXGorbXfcbASB3k98fGBdw6+isP55wMbN0ZoWImI\nthPRSAAjAYwkotMBXBj0M4tQjDkHM7p25d6RHjthpdRU4OKLufE780zfnEOPHtxgFhWxOJx4orlz\nkGEZebPFxfEDIs/tk094DYZAOAenkx9AT/bcjjgkJvLn2dLC5z94cGDEoa3OQYqDp4e1tBRYupQb\n1FAlpIMdVtq2jee+kr1af8JKUhzs3FOyk2RXHOzeG/JeMIqDfLaNrweCggKgTx/7115cDLz7buSG\nlQAARFRFRLJ5ezRI5xPxGMNKZpx8MoeH9MiwklWyTIrDrbcCkyZZz0RqJQ6yB7R3r2/OAXANLW3b\nxpVSgXAOc
vU8Tz0wO+IQFcXbVFWxOAwZEpiwUludg7ewUl0dN6AnnxyanEOowkolJXy83bv5d3/C\nSt262S/5ls+BHXGoqQmcc4iLC39YqaGBC1ki0jkoXDEmpM045RSurtDfzPX13FBYNZJSHAYMAO68\nkxvUlhb37a3EAeD379rlLg6y4dAnpPVhCH3j3dTEiWy5CpzEH+cgSxzlw+d0crmuHjviALCQFRVx\nLLhPn8hyDt6+0xtv1MaLAMHJOYQyrHTsGP+/YQP/72tYKVKcg5U4yGfbn7DSzTd7jhDIUey+iEND\nAz87dpxqIFHi4CPywZOD4MyIieGE8I4d2mvyC7W6cWRDIhHCfL4kO+LQuzfXUefl8bnKUta4OO3c\n9c5BH/ttbOTFhNpSylpaqgmh7IHJtaQfftjVPfkiDjk5/GB17x5e5+B0ciMYE+P5YS0pYQf4299y\nmFASqWGlI0eAOXO8b1dSAvTty2NyAP+cgy8j70tKtDVLPEEU2JyDr2Glhgbggw9cp64xIvdt1zXV\n1wO/+x0LvxARIg5CiBohRLXZPwB9QnN6kYcd5wDwkqNbt2q/y8bAKiltFAeAGxZjUtquc+jcmXMX\nBw5opaxCaO5B39PU36hNTdxIGJ2DL6WsN9zA5biy0ZAPWXk5N6wyHyMfZrMR30aSk4HDh/l/mV/x\nh0A4B9moCOE5rCS/0379gDVrtNeDJQ76UlZ/RG/dOo5ve+PYMV4/3SgOdh2LPiFtN6w0aJB3cWhs\nZLftq3OQDa7stOhzDr6EleTKbtnZ1tv4OpllQwNP7ihXfowIcSCibkSUYPHPZM00d4QQC4QQxUKI\nLA/bvCyEOCCE2CGE+JU/FxFK7OQcAGDUKODHH7Xf5RfqizhYOQerxjQ1lR1Br17aOWzd6jp/UUYG\nD5Dz5BwaGlgg/A0rHTrEDYgxrKQvo5XXEhtrvgKfkUA6B7kSnL/OQYaUAM9hJekcjJhNR/LII8D2\n7drveXk8G7D+NU/oB8H5G1Y6cMCeaJWUABdeyCN9y8pcnYOvYSU74lxSwoPGjEUeRmRD7qtziIpy\ndcb+OgcpCocPW2/jizjIz3LUKGDLFv45IsQhQCwEMM7qj0KIKwEMIaITAfwRgMksJ5GFv87B17AS\nYO4crEZIA9r7pTiccQbfVPppsS+7jKea9uQcALbG/iSkm5o4N1Be7hpW0seC5WdgN6QE8LkePqyJ\ng7fe4c6dHMIyO79AOQfA88Nq9p0C7DiMjcOqVcA33/DP27cDY8bwNezcae+cApGQPnjQ3nd87BiH\nLs88kyeu82eEtK/OYeBA786htpbvE1/FAXAVAn9zDtnZ/DkEyjno25ion1tqvcvRh62DQVDFgYi+\nBVDhYZNrAfz35203AkgWQqQH85zaip1SVgAYPpx70PobztNSnr44B1/FQe8crryS18SWI6QB1x6c\n/H/vXnvO4YYb3Hu8ckSwMaykT4YDvomDDCulpHBYyVsDkJWl9bb06MNKnTpxQ+3rPE1652AnrGSG\nUWwLCjSn+fbbwPjxnNz0tMa3Hr3Y+5tzsCsO0hGddRZPQBiKUlY7YaXaWr73Gxvt7ddKHOxWK61e\nDbz/vjYvU3Y2cN553sVBFid4O0d5Hnpkpdu+fTxQNpiEOyGdASBP93s+gL5hOhdb6MNKVoPgAP7b\nySdr6l5fzyWjZs6hrk6b1VKPPzmHTp20BmnUKG649Y3wOedwJVVzs9b4651DYyM/YPv323MOBQW8\nP4n8ubzc3Z7ry2gB/8QhOdmecygqMu/16cUB8M89GJ2Dr2ElwLWctaaGBVyKw/r1HLbp1cv+2hWB\ncA52w0rHjvHUISNHAj/9FPxSVrviIPdr5/4AvDsH+ZpV+fl11wFvvMFTdgAsCpdc4i4On32mibUM\ntfrqHCTSORQUeA+ztZXo4O7eFsLwu+lXMWHChNafMzMzkZmZGbwz8oDdsBL
Ag7Vyc7lBluJg5hzK\nyrhBF4ZPIjXVPebsTRzS0zULmpTEOYbcXO0hiI7m0NKqVdrxjM5hyBDgu+9cj2N1Mzc1uT6Iubks\nUGbOoS1hJTmFRkoKX1dNDX8P0RZ3cGGhea/PKA5yNLuxh+YJuzkHu86hsJCT1nl53PDu3Mkhm7w8\nHndih7bOylpezt9LlJfuYkMDX39iIlfkPf0033PBLGX1JawUH68VLPTu7Xl7b84hOpr/ydHweoj4\nOj7/XBuTlJ0N/POfPJGf/j67915O3g8Zoj0Tctp/T1iJQ0OD9f29Zs0arNFXP7SBcItDAYATdL/3\n/fk1N/TiEE7shpUAbvjkF9jQwA2AmThYNSK+OodevbjEUM8ZZ2jhHMmVV7qGXIw5BykOdkpZm5pc\n95+byxOZSecQF6fFbtsqDgCLQ1QU/15RYd0zLyqyJw7+9LKNYSWrEJcn56Af61BQAPTvz43Z/Pnc\n6MbFeV6I6fvvudMhr6mxUbsv/AkrHTzI37u3eYnkNQnBo/Bzc/l79bVaSZayenNtRPx89O+vLYBl\nVcAg99vSYi/v4M05yNdra93Fob6eX+vcmb+H775jcTjpJP4e8/LY7RDx/S7vRSkOdkJfZmElvXNo\nbna9BsC94zxx4kTvH4QF4Q4rfQbgDgAQQpwNoJKI2rAIZPCRzsHTOAdJfLxm/TyFlazEQeYc1q7l\nGLQs/bQSh7PP5p6MnjPOcG+Ar7uOezgSY7XSkCH8s51SVofD3TmcfrrmHIxhpd692yYO8n99Oeu3\n3wJ//KPr9nbFwZ91KuyGlew6BzlqdvRoHiR47rn8utWMuTU1wNixWrWT/BylE/RH8A4eBE491XtY\nSS94nTvzvbJ1q39hJTvO4fhxrdxU/zx52q/dUmd95ZrROejFwez7lccC+Pv66CN+T1ISi4IMLdXX\n83HkefuSc/AUVios1M4jWARVHIQQiwFsADBUCJEnhLhHCDFeCDEeAIhoGYBsIcRB8HKkDwTzfAKB\n3VJWgG8e2UB5Cit5E4cXXuA6+cZGzeqaIQTHgvWcc4577zUxEbj7bu13M+cA2EtIG51DTg6Lg6xW\nMoaVBg5su3MAXMtZP/yQ4/R6CgvN48WBdg7eqpXs5BwKC1kcRo3in8eO5detnEN5udZRAFzHOAD+\nhZUOHGDHB3h2HTLfIBkxgvNTwQor6T/DhATPoSXZcbJb6qyvXDM6B9ljt6pY0ovDeefxFBeDBvHv\nenGQ97q+kyhzDt46JXbEIRgr1UmCXa10KxH1IaLORHQCES0gonlENE+3zUNENISITiOirZ72Fwn4\nknPo1k27KRoafBeHnj254mnbNh534KmM1Yqzz+b8gieMzmHAAA7d2ElIm+UcfvUrFgyzcQ7+ioNs\n/KQ46HuHK1bw56MXgqIiDi8YH8BAOwdjtVJ1NfDEE9yrr6jghsoMK+cAaOIgx60YG3p53fLe0o9x\nANoWVvJWsmwMlY0Ywf/bCSvdfTeXvjY18XHsiENJifZs2BUHO9VsgG9hJSN6cTjzTL5uM3GQxRfG\nsJLdaiVPCWn9foNBuMNK7Q5fcg56cfAnrNS9O9/AjzzC+8rJ8V0c5Hl4wugc4uK4sbLjHPRhpZYW\nvmlPO819nIMUh0GDAuMcMjI4cZuTw/uNinKtgmpp4fcYH55gOAd9zzIvD5g6lavUEhKsXZ4x55CR\nwQ3t7Nl8nwAcW+/Rwz3vJD9v2VDqk9H+XpNdcTBzDoC9sNKKFcB99/H9KJeeDaRzkJ0ns7BSczMX\nYujLlr0lpI2vG48ln6uuXTl8qxcHORBO7xzkZxMTYz+sZJVzKCw0v78DiRIHH5EPgFkFgxEpDkR8\nI6SluTqHnBxgwgQOiZiJQ0wMz8szfjwn5H76yd5UE75irFbq3Bl48kktzCC38ZaQLirSxiE4HNxY\nBzqsJP//85+BWbN4uofLL+fPR5bRFhX
x5Hz6kMCGDfw9BMo5WIWV5PEWLrQOKQHuzqFPHxaShx5y\n3c4s72AmDm0NKx06xNV1/joHb2ElmViOjdXuYbvOwZewkixlNTqHsjJe2vXIEe01u87BmzgA/Ixe\ndhn/rL8XZYelpkbrLNm9dquwUl0d3xNDhihxiCiio/kLkXPreEIm0OSqYcnJruJw//08F1JiIg8o\nMuODD1g4+vfnKZL9cQ7eMI5z6NKFz002xIB1QlofVsrN5fMUgkWioEBzDtXVfO39+/snDrJnLJ3D\nyScD11/P6wpcfjlXgskHv6iIE9/x8drDc9VVbPXNxMEf52AVVqqt5et/5x3rZDTgmnOQzsEMs7xD\noMNKRLzP1FTfncPAgVoHwJMoVVXxvTB3Loc6AXtxd72rTkjwnIA1lrLqke5LPymeHedgJ+cA8EzK\nskhIP92+fqoY/WSWbRGHykr+LHr0UOIQUURHc6PmzTUAWkJaxg4TE7Wb5csvube2aBFXOowZ43lf\nwRQHM+dgto1VWEk6h9xczlcAmjjIhuPoUW1NCX/EITqawxL67SdM4B73ZZe5ikNhoeYcamu58Tt+\nnP8eiEFwnsJKtbVcBFBTY885OJ382fSxmMpSDoTbvJndCKB93lbOwdewUl0df+cxMb47h6goYMEC\nDqV4Oq5835gxnLwF7DWQZWXcCAL2w0pmzsEXcfA152BEP7OBPqwkc3CA9sxZDbADrEdIA9yZ0Hd+\ngoESBx+RCWlv+QZACyvJLzkxUVvw57HHgOnTzRtiM0LlHKzCZWaNhtPJMdyqKv5ZOgfAXRyam/mh\n1S/5WV6uOQE7SNsu6dOHY7tpaebOQT7Yci2JnBz+X18n749z8JSQrqvjBj0z07NzkDmHkhL+TKw6\nG+npLB4LF3JVFhD4nIN+6g1fnQPAI4SluHgTBz1W4rBrl7afsjItqd+tm72wUiCdgz5vqMeTOCQl\n8bnI0GqnTlpYSYpDVBS/7ul7MnMOcpJAfecnWChx8BHZ6/RFHGRiSdriPXv4/2uusX/cfv24IQy2\nc9CHTIzbGB9kWSceH88NlF4cunfnhqRrV+1BM4pDTo62vb/IEb1WYaW6Ou14hw7x96cPBwbCORjD\nSvHxwP/7f1qYwQzZCHsKKQFaWOmrr/i6AC0BLxtKY1jJ15yD/v12xMHKEenDWfv2ufaKfRGHe+7h\nQWUAX6td5+CplLWkhGP0e/dqr9lxDvr7VY8ncRBCq5iqquLOguyk6J2AN+dkVfTStasmDu12nENH\nRFaf+Ooc5NTUcXE88d0FF3jPWejp358ftlA4BzNxkI0GkTalh6wTl3PZGJ0DwNfbqRM/CN27a4u2\ntLRwYy7DUG1FnwQ0hpVknkeKg/Ha2+IczMJKcXGcD7n9dut9yJyDN3Ho1YuT6YWFWmK6vJzfIxsG\ns7CSLzkH/fs9iUNjI5/DCSeY/10vSr/5jTYhHWAuDlbVSsePa6vN+RJW0ouDmXM491zfnYMncfD0\nLMrQUmWl9l35ui67WVgJcBUH5RwiCF/EQSq7/ktOTOSJuHydGko2usF0Dk6n66hR4zaNjTxYSs4G\nKR+ulBSOgxudA6A9DHFx/Fp0NH92Bw7w+3yZ08gTnsJKnsShrc7BLCFt5zvSOwerfAPAzmHTJh7V\nXlKiLWbTr1/ow0r797OYW4VC9cetqdGcDuCbc6iu1sJA/uQc5PQU+rLVkhIef1NZqe0jWM4B0MSh\nqspaHLyV8lo5h9hYlXOISPxxDvp65aQk7glecIFvx+3enW+GYDoHmaw1czSylDUvT2s8pJBIG2/m\nHPT14lIwkpLYfQwcGLhr6N2bGwC5noS+lPX4ce6BHzwYGOdgDCtJRwXw8ex8R/J92dlafbwZcvr1\nq67iz7SkhMWhf3+tkdNPvw74F1ayIw579nCVmBV6x1JXp/X+AWtxMBPm6mpX5yDvG7ulrGbLaZaU\nsNC
eeKLmHjxNn2HHOdgRh8pKnu9MFqb44hzshJWUOEQQQnCYxK5z0FcrAdzDk2s8+3rc/v2D6xw8\njd2Qpaz5+a4hKBlWOniQHzTZg7VyDkBwxCE6mj/X777j3njfvq5hpeHD2d0Ewjnoe5xRUbwP2aAa\nl1e1QjbCcnyBFVIcLrlEG/MgxUGGlfS9a3lNnsJKy5e75gP0zsOTOOzeDZxyivV+9aJUV+dagmvX\nOcjZTktKtBJbX8NKgPv4BHn8oUM1cfA0fYbeOcixCnrsiENJibtz8CXnYBVWOvVUHoOkn54nGChx\n8AMZGvFGTAxvW1HhGlbKzPQt3yAJljhI52CVjAa0Gzkvzz0/kZLCU3zok8uhdg4Ah1puvhmYOJE/\nZ31YKSODz8l4fWbOITcXuOMO6+PonQPgGlryJaxUX+9dHFJTeWLB9HQWP7nKnj6sZCYOVs7B4eBZ\nefWzrwbSOTgcHM5pbPRPHGTp8bFj3JhGRWn3kN2wEuDeq5aJ9GHDXMUhFM7B35yDlXN4912e/VU5\nh0fe6kMAAB5GSURBVAjErjgAfAPJkaEAlwFefLF/x/31r7VJ8QKJ3jlYiYNsNPLzuVeqz090786N\nvZk4hMo5AOzGxo7l0dOAa1gpKYkbVDvOIT+f5wCywvg56UMYdsNKdsUB0GZp7d2bK7yamvhnK3Hw\nFFYqLOTGV78gTaDDSvL9dsJKxsZRXtOxY64hJcB+WAmwdg5DhrDLBdzFQT+9fqBzDsZBcGbXP3eu\na/WRtyl6lDhEIHJuFDvEx/ONKXsi8+Z57pV64qmnODEZaKRz8BRWkjdyfj7/LrePiWEhyMpyFQdj\nWEmO6AT4gSsuDrw4vPgiL9soXZk+rJSYaC4OZs6hvt56OVfA3TnoK5bshpW6dmUX1qWL60h0T/Tu\nzaGd7t21smi5JKu+EfUUVpLfn6/i0NzsOnOrGVKU5GfhzTmYJWRravj7KylxFz1PI6SdTteYvr7h\ndDq18FRSkveEdLCcg7ecw5Qp3FmQWIWVJEocIhB/nIN+OL7VYiXhQjoHT2El2Wjk/byoqxQH6Rwa\nG92dQ6dOWmP82mvaIDbZEAVaHFJS3MM93sTBzDl4EwejiPobVvrpJ++uQU+vXpo4yAFhVVXapHf6\na5KC99vfugqF/P70jZA+52A1Bfnhw3x8T8Inj2sUByLX2VUlVs6hb192Dvp8A+DZOdTV8Wcqx73o\nG/vycn5vTIxrg2omDvX1rkvoJif7Lw7FxXys3r3thZWOH3d1O3acgxrnEGG0JawUifjiHPLyuNGX\n1U0y5wC4ikOPHq6NZP/+2oOYnMz7sKqXDxRG59C/vz3n0NCgNRJmGEXU37BSZaVv4tC7NwuKdA7V\n1e69a0DrwTscPFWFvkHNz+eGy1fn4C0ZDWiOpa6O73spDnK+KePnYlatVF2t5VOKi12vTc4wYIZR\nlPVhIr1r0YuGmTgcPszlurIDp5/VQI8dccjO5u8pPp7vmepq64S0nOLFV3GQ12hnuVVfUeLgB3IO\nGjt06+YaVopE7DgHWcJbV8fJUZmjkNVKgKs4pKdzItWMpCQWBqvprAOFPufgq3MArHup3sJKdktZ\nAd/FobjYNaxkjMsDWiMte5X63mVeHnD++b6Lg7d8A6CJUn093wulpRzSsVou1co5JCWxKOzd63pt\nKSksqPrxCxJ9vgFwnTDPKA6enIMxByRDyMbwjR1xOHKEr0Um1UtL3Z2DvPfq6/m6/BWHyy4DNm60\n3tYflDj4ga/OIdLFQe8cPM31FBvLll+WteoT0oD7VBgjR5rvJykp8CElM4xhpdGj3ceXWOUcAK2X\n+uKLrqEF4+dkDCvZLWUFfA8rAa5hJTPnIMM7UhT0DVt+Pn8GenGwM0Lazmh2fVgpOZnPsaLCd3FI\nSODt9+51T7QnJZkvAWocsax3CPrj60VD/z127syN89697t+JWd7Bj
jjINUUArR2wCivJjoheHHzJ\nORw65Dr6OxAocfADX8QhPj7yw0p2xjnI7fr21W5qfc6ha1fPs5DqOfVUYNy4wJy7J4xhpSFDgMmT\nXbfx5BykOEybxnMbScycgz9hJcB35wBo4lBTw/eWlTjIBsfoHMaM4b+ZTfltJQ6Vld4nSdSLQ1wc\nV+YVF/smDjU1LA5paexWjNeWluZaBSUxOjZ9w2knrCQE/y0ry70i0CgOcjZVTx2p+Hi+Pim63sRB\n3mu+OAc5zqGlhce/5ORYb+sPShz8oC0J6UjEzjgHQHMOenGIieHXVq+2P3bjgguAxx8PzLl7whhW\nMsMq5wBojWtFBfDNN9rfjSLqT1jJH3Ho1o3/paTwPdilCzsBq5yDVVjphBPcVyvz5hwqK71XVckZ\ni+VgLzlpoJU4mFUrVVfzNfbsydN1mImDcWU8wD2s5Mk5mIWV5Ht27jR3DpWVwNatwIMPaq7B0/0u\nBLsHvXOQE1FK2ioO8lrktCry+wwUShz8wNecQ1NTZIuDv85BhpWEsF6sKJwYnYMZ3pxDYyP/rhcH\no4j6E1aSU5lLN2CXXr1cp7DOzTV3Dvqcg74xLCvTRujLiiU74lBV5V0cpHOQJZvp6dwgenIOZglp\n6RyamtzzKT17mjsHY1jJm3MgMheHPXusncPWrTy63FtISZKaqn2usqTdF+fgLawkz106BuUcIgBf\nnQMQ2WEl+VA3NHh3Diec4B5WilSMOQczvOUcKiq4YSkrcx3jYRZWInKvZbciPZ0H2vk6Ur53b9e5\nhnJy7IeVCgv5uJ06sThkZ/N32NysNUJtcQ5WYaX1683zT95yDoD/YSUr5xATwwnipib3SSbj4vg1\nY25FlrMeOcKfd1mZfXHQOweHI7BhJYCvef9+Hn8ixSEvj8ektBUlDn4gLb0d9IuQRypysfeaGs+N\nvXQO+gS2sfonkpC9x6oq6xXnvDkHOcDswgs5dAaYj5CWNfJdutgfx3Lqqb5dDwD86U/aiGkrcbAK\nK+Xna+XDUhykcEqRaqs4GMNKBw+y6/rNb9y3l8KsLxPVOwfAflipuNh1HIVxnIN+P/Jvxvs3Lo4r\n2ozPtnQOublcfbVzpz1x6NnTNecgjyHxJA5EvonDWWex+Dc388p8CxZ4Pz9vKHHwA1/DSkBkOweA\nH1Rvy59ecglw+unuYaVIJS6OG0a5noQZVjkHue51RQXH+C+8UAstmc2tVFdnP6TUFm68kRswwH5Y\nSf4v8w0Ax9UPHHANKQGexUG/nRn6EdIyrPTOO1w6ayYsQriLszfnYBVW2raN702JPqxkTKbLXJRZ\nWMksB6QXh4QEDi/ZEYdevTTBkq7GU85BCE0cHA6+b711NOLj+XscMIA/7/x8nvX5nHO8n583gioO\nQohxQoi9QogDQognTP6eKoRYLoTYLoTYJYS4K5jnEyh8rVYCIts5AHyjVld7buynT+dyVTtzMUUC\n0dGuM8WaYeUc0tI055CSwvNabdrEfzfmZuTiMnYrlQJFQgI35N7CSrKRzM9n5wdwQ7p1qz1xkGNg\nvF2bWVipvJwnQ7TCGFrSVysB7hVSVmGlbduAUaO03/XOweh6ZLjRzDmYzV2mF4fMTD6WHXF45hlg\n/Hj+2Y5zSE01nxnWE1Ic+vRhgcjO5vEOES0OQohOAF4BMA7AKQBuFUIYh9E8BGAbEZ0OIBPADCFE\nkIdGtR1/cg6RLg7SOdhp7I3VSpFMfLxncbDKOaSn8wMr17lOT9eWnjQmpOVsqXYrlQKFDJUZk7b6\nsFJMjLlzyMjgv+3c6V0cZDLaW34kKor/1dRoziEmBrj2Wuv3GCuWZLVSWhqfl3GgpJk41NVxozh8\nuPaa0TkYxaGqivcdFeX6upVzqKjgsM3FF9sXh4QE17Wo5TEkRnHo1ct/ccjIYHFYtozdld2yck8E\n0zmcCeAgEeUQkQPAuwCMkcciA
PLRTQRQRkQ+LHAYHh57zH51TnsJK3Xpwg+1nVxKewkrAd7Fwco5\npKe7hpVSUlgoZJWL/nPq3ZsbjlCLg7y3rMJK1dV8HVIciov5d8mYMcCqVa6fj5k42Mk3SKKjueHt\n2hU47TTgv//1/F5jxZIMKw0aBLzxhvv2ZjmHrCxOyBpDRLIqyXj+8fH8vRrv3auuAi691P2YSUna\nmItTT7VfraRHv86ERC+MZuJgp0Mpx7tI5/Duu+xyA0EwxSEDQJ7u9/yfX9MzH8BwIUQhgB0A/hTE\n8wkY48a5TyJmhXIO4SUuzrtzMIpDQ4N7WCk2lhu+ujp359CnDzsH48RqwSYhgc/JeH36EdK9emni\nYJy9VYqDXedgh5gY/txkqe6tt3re3hhWkuIQHc35FSNmOYdt23gJUD0ydNTQwI5H3zmLi2PBMN7r\n99zjvh+AP5+dOzmketJJ/Jqv4iDHReiP6ck56BcI84QUnT59eNaBgoLAhJQAIJghHPK+Cf4BYDsR\nZQohBgNYKYQ4jYjcZrWZMGFC68+ZmZnI9HUR5jDRXsRB5hx+ac7BKqyUlsYPWkWF67rY5eXuCWkZ\nVvK26HygSUjgczKGe4ziIMMrUugkY8awm/AmDnaS0fpjy5li7WAlDlakpPA2cklbgHMnxkZdJp3N\nXI+Vc7BCTtnRrx83wnFx/olDXJzrd2UUh6FDtbEnvoSVoqNZNLkEdw127FgDXXPpN8EUhwIA+nk3\nTwC7Bz2/BvAcABDRISHEYQBDAWwx7mxCIK42DMjGItLDSr44B30pq68PSaixE1YqLgbeew847zx+\n+I1hJVkFk5KiTSanj4XHxvJx8vJCH1YyhpQAbaRydTULl1z1zSgOZ5zB/9sRB1/CStI52MFXcYiK\n0tZKkAMIt20D7rrLdTsZVjJes/ybmXOwQl57//58/CFD/AsrGT+TQOUcevfm8xo8GEhOzsQrr2S2\nVjlNnDjRtxPVEcyw0hYAJwohBgghOgO4BcBnhm32ArgEAIQQ6WBhyEYHor05h19aWGnAAO4RPv00\n8Oab/Jo+Ia1vXLp35zls5KhwPX36cGIw1GElM3Ho1IkFTDY4+rCSvqHs0YNj+2Y5B/3YA1/EQToH\nu/d7aqoWJnI67VV86UNLDgdPY24cZCcT0lbOwRdxkOIpHeRJJ/nvHPR4CyvZ+Qzj4/neA/g+PnAg\ncOvFBE0cfk4sPwRgBYDdAN4joj1CiPFCiJ8LvPA8gDOEEDsArALwOBGZzLnYfmkv4mBnnIOkI4WV\n+vblKRHuuksbiKTPOchqJUATB7PPqHdvHvAV6rCSmTjI8QMVFZo4OJ3mDeU557hWtsja+uZmYOZM\n/p59FQdfnIN+Gg+5YI+3xk1fsVRUpE1EqEc6B7Nz99U5GMVh0iTgppvsvVfSrZt7GxAo5yDFAbCf\nC7VDUMtGiehLAF8aXpun+7kUwDXBPIdwExcHXHNN5PewfXUOVVWRP84B8C4OksRErmMHXMc5xMVp\n4pCSojkHI336cOz7xBMDd+7euOYa8wQqwOEdKQ5yChEZn9bzn/+4X09sLN8Ljz/OAx99DStVVNgX\nh8GDtenDvYWUJPqKpcJCLuM0Ip1DRYW5c8jJsX/vyvtHDj70tFSqFSkp7nmbQIhDcnLwFs2K+DEF\n7Z2oKOAzYzAtAvHVOegX+4lkevSwV/OtX2VMn3Po0sW+czh0KLRhJVlia4bROZjF3gHz8EhsLK9r\n0NLC11RZqVXpeCMmhj83X5yDXKTGrjjow0qFha49Z/15yLWozZxDRYX9ezc6mq+/LWuQjBrl3g7o\ny3j9DSuNH+9eUBEolDgoAPiXc2gPYaXJk+2tOJeU5CoOSUncuBw75ioOO3ZYi0NDQ2jDSp6IieFB\ne97EwQy5vjXAvXpfS1kB+2FUf52DN3EAWAQKCtqecwDavpCOENqob/35yZHazc38/TQ2cgjQrnM
I\nZmdEza2kAKCthOVrQjrSxUGOT/CGdA5y3eiYGH4tKkp7AFNSOMZtFVYCIkcc5DXLsJI/4hATozkH\nX8JKgO85ByL7g8t69WJRAPh/q2nP4+P5722tVgoWQ4bwtZeXa5Mfyhl+7YpDMFHioACg9YZ9SUi3\nh7CSXaQ46AcfJSa6jiPwFlYCIkcc5NrHsrb+6FHfxGHXLmDsWN/FQd4PdsUhOZkb6dJS+85BzigL\neHcOhYXWYaVwi0N8POcLNm/W8hoykW43rBRMlDgoALiupWtn2/YSVrKLXhzkQ5mQ4NqgekpIS3EI\nZc7BEzExWi+8Wzceg+Grc7jsMv/FwZeGTboHu+Ige9yAZ3GIj7cOK0WCOAA8hmbdOu26pTgo56CI\nGPx1DpHwgAUCM3FITHRtULt3t07aR5pziI7WGpxu3XhGVuMEfVbExrIIXnwxv6+01Ddx6NLFdUI7\nb8i8g11xyMjQZsH1J+cgF/WJhHv39NOBb791dw4VFfZHpQcLJQ4KAL45B31CuqOFlfQTnpmJA2D+\nGcmS2UgRB71zkKO3fXEOAHDyyVy1dfSo/YYqOtp39ySdw6FD9gQsKkqbnrqoyLNzqKszdw5A5IjD\nli2u4lBby5+5r0vIBhpVraQAoPWGfRGH5ubIeMACgbz+ykrXnIO+BywbVyt3lZERWeIgG2kZVho3\nzt57Y2O5skbOjpqXZ69HbzyuXQYPBhYt4kqwLW4T51i/56efOIltNhAQ0M7DzDkAkXHvnn46F4IY\nncPRo5x4DyfKOSgAaA+Kr+McIuEBCxSJifxQWuUcZHmr1Wf0wQfA6NHBP087tDXnIBe9GTyYr9tu\nmMgfcRg0iFfZe+QR9/WbrRg8GPjuO+5dW60zoa8y0xNJzqFXLxZiozgUFYXfOShxUADwzzl0pLAS\nwA9ocbEmDklJrmGOqChuaKw+o+HDAzevTVuJjnYVh9JS38RBLnrDk7n5dlxfq2xGjACuvJJHZNtl\n8GCO1XtqQKUIGENikeQchGD3EInOQYWVFAB8dw6NjVybHgkPWKBISuLBVbJxe/hh9zESKSn2PqNw\nExOjhYJkI+mLOMgpKQYN8k0c/HEOPXsCX3zh23uGDOE1Fq6/3nobuaaE8R6NJOcAAGef7VqtVFnJ\n+S+rcFmoUOKgAOCfc7C7fXtBOgeZczCbs6Z79/ZxzcawEmBfHO69V2uYxo4Fbr/dt+OGopx38GDu\nnFglowEWATNhiyTnAPCMwDI0FhfH8z717OlbxVcwUOKgAODfOIeoqI4dVjKjvTgHY1gJsC8O+iVw\n+/YFHn3Ut+OGQhwGDOAG1ZM4xMWZi4MU/0i5d43rWB86FP58A6ByDoqf+aWPcwDsiUP37u1DHPRh\nJSkOvoSH2nLcUIzs7dKFnZ0/4iCnRInEezcujkt0w51vAJQ4KH7Gn3EOv1RxaA/XbBznINdlDsVx\nQzVKfMwYz7PFWoWVACUOdlBhJQUArTdsx2rrE9KRYs0DgTHnYEZ6evhjwXYwhpXshpQCcdxQicOH\nH3r+e1qa+VoPAAtHpIrDsWNKHBQRROfO3NDbafg6d+Yy1o5WrZSYyJUinpzDY4+F7nzawkUXaQsB\nhVIcYmIip8Nwww3W1UyR7ByAyMg5KHFQAGA3YPdhkYnojjjOAfAsDpEysZ437r9f+zk+/pcpDkJY\nD5CLZOcAKOegiCA6d/Yt0dqlCzsHq4evPWJHHNojZ56plR4HmwsuiBxx8ESkOwclDoqIwRfnILd3\nOoN3PuFAjqQN91TJgWbAAPvTUrSVK64IzXHainIO3lHioADAD4ovD0vnztqqaR2FjuocFO6kpoam\ntNdX5OhtJQ6KiKFbN3tLNEq6dFHioGi/vPFGaEp7fSUujsuOI2F23wj8eBThYMgQnhnTLl26cEK6\nI6HE4ZdDpOZFkpPNp20JB0Gt2BZCjBNC7BVCHBBCPGGxTaY
QYpsQYpcQYk0wz0fhmfR0+9t26RK5\nD5i/SHHoaDkHRfvhpJOA9evDfRZM0JyDEKITgFcAXAKgAMBmIcRnRLRHt00ygDkALieifCFEarDO\nRxFYfE1gtweUc1BEApGSCwmmczgTwEEiyiEiB4B3AfzGsM3vASwhonwAIKLSIJ6PIoB0RHGIjfVv\nPQKFoiMSTHHIAJCn+z3/59f0nAiguxBitRBiixDiD0E8H0UA6YhhJSHYPShxUCiCm5AmG9vEABgF\n4GIAcQC+F0L8QEQHjBtOmDCh9efMzExkZmYG5iwVftERnQPAy3ymquCmop2yZs0arFmzJiD7EkR2\n2nA/dizE2QAmENG4n39/EoCTiKbotnkCQFcimvDz768DWE5EHxr2RcE6T4V/XH89UFYGrFsX7jNR\nKBRWCCFARH7NYxDMsNIWACcKIQYIIToDuAXAZ4ZtPgVwrhCikxAiDsBZAHYH8ZwUAaIjhpUUCoVG\n0MJKRNQshHgIwAoAnQC8QUR7hBDjf/77PCLaK4RYDmAnACeA+USkxKEd0FHDSgqFgglaWCmQqLBS\n5PHHP/LaB59+Gu4zUSgUVkRqWEnRgVFhJYWiY6PEQeEXKqykUHRslDgo/EKJg0LRsVHioPALFVZS\nKDo2ShwUfuHr+g8KhaJ9oabsVvjFwIGROR++QqEIDKqUVaFQKDooqpRVoVAoFAFFiYNCoVAo3FDi\noFAoFAo3lDgoFAqFwg0lDgqFQqFwQ4mDQqFQKNxQ4qBQKBQKN5Q4KBQKhcINJQ4KhUKhcEOJg0Kh\nUCjcUOKgUCgUCjeUOCgUCoXCDSUOCoVCoXBDiYNCoVAo3FDioFAoFAo3gioOQohxQoi9QogDQogn\nPGw3RgjRLMT/b+/uYuQq6ziOf3+ygqA1QGiqYmObWCglMfRCbKxbmpCUcqH1JYI1Ri4MaBBoTDCh\nXig3hjZEw4Wx8aUgqYqpL63FRKASihXEtbGvbpUQrQHBloteFI2k4M+L8wwc9sx0pu3M7uz297nZ\nOc+cfc6z/zxz/nPO2ed59PFBticiInozsOQg6SzgW8BKYBGwWtJlHfZbDzwEnNKiFHFyduzYMdVN\nmDESy/5KPIfHIK8crgSesX3I9nHgJ8CqNvvdCvwMeHGAbYmafAD7J7Hsr8RzeAwyOVwMPFvbfq6U\nvUbSxVQJY0MpylqgERFDYJDJoZcT/T3AHWWBaJHbShERQ0HVeXkAFUtLgDttryzba4H/2V5f2+dv\nvJ4QLgL+A9xoe9uEunJFERFxCmyf0pfuQSaHEeCvwNXA88AYsNr2wQ773wc8aPsXA2lQRET0bGRQ\nFdt+RdItwMPAWcBG2wclfb68/51BHTsiIk7PwK4cIiJi+hrqEdK9DqKLziQdkrRP0m5JY6XsQknb\nJT0t6RFJ5091O4eVpHslHZa0v1bWMX6S1pb++hdJK6am1cOpQyzvlPRc6Z+7JV1bey+xPAFJcyU9\nJunPkg5Iuq2U96V/Dm1y6HUQXXRlYLntxbavLGV3ANttXwI8Wrajvfuo+mBd2/hJWgRcT9VfVwLf\nljS0n7Ep0C6WBr5Z+udi27+GxLJHx4Ev2b4cWAJ8sZwj+9I/hznYvQ6ii+4m/rfCR4D7y+v7gY9O\nbnOmD9s7gaMTijvFbxXwgO3jtg8Bz1D146BjLKH9v7Anll3Y/pftPeX1S8BBqrFkfemfw5wcug6i\ni54Y+I2kXZJuLGVzbB8urw8Dc6amadNWp/i9i6qftqTP9uZWSXslbazdAkksT4KkecBi4A/0qX8O\nc3LIk/L+WGp7MXAt1WXnaP3NMgAxsT5FPcQvsT2xDcB84ArgBeAbJ9g3sWxD0tuAnwNrbB+rv3c6\n/XOYk8M/gbm17bm8MetFD2y/UH6+CGyhuow8LOkdAJLeCRyZuhZOS53iN7HPvruURQe2j7gAvs/r\ntzkSyx5IejNVYthke2s
p7kv/HObksAtYIGmepLOpHqRs6/I7USPpPEmzyuu3AiuA/VRxvKHsdgOw\ntX0N0UGn+G0DPiXpbEnzgQVUgz+jg3LyavkYVf+ExLIrSQI2AuO276m91Zf+ObBBcKer0yC6KW7W\ndDMH2FL1IUaAH9l+RNIuYLOkzwGHgOumronDTdIDwFXARZKeBb4KrKNN/GyPS9oMjAOvADc7A4le\n0yaWXwOWS7qC6vbG34HWINnEsrulwGeAfZJ2l7K19Kl/ZhBcREQ0DPNtpYiImCJJDhER0ZDkEBER\nDUkOERHRkOQQERENSQ4REdGQ5BAzlqSXys/3SFrd57q/MmH7iT7Xf6mkH6jyZD/rjuhFkkPMZK1B\nPPOBT5/ML5Zlbk9k7RsOZC89mfp7MAr8FngfcKDPdUd0leQQZ4J1wGhZTGaNpDdJulvSWJkN9CYA\nScsl7ZT0S8oJWdLWMqPtgdastpLWAeeW+jaVstZVikrd+1UtsnRdre4dkn4q6aCkH7ZrqKTRMtp1\nPXA78CvgGpWFmiImS0ZIx4wl6ZjtWZKuAm63/eFSfhMw2/bXJZ0D/A74JDCP6mR8ue1/lH0vsH1U\n0rlU89AsK9vHbM9qc6xPUE0BcQ0wG/gj8AFgIdUcN4uoZh99Aviy7ba3oyQ9afuDku4F7s7UMTHZ\ncuUQZ4KJi8msAD5bvqE/BVwIvLe8N9ZKDMUaSXuA31PNaLmgy7E+BPy4TDR6BHgceD/VLa4x28+X\n+Wz2UCWjZmOl84CXy+YC4Onuf2JEfw3txHsRA3aL7e31AknLgX9P2L4aWGL7v5IeA97SpV7TTEat\ny/OXa2Wv0ubzV25pLQTOl7SXKoHsknSX7c1djh3RN7lyiDPBMWBWbfth4ObWQ2dJl5Rv6xO9HTha\nEsNCqnV6W453eGi9E7i+PNeYDSyjuh3VbinMBturgO8BXwBuAzaUtZWTGGJSJTnETNb6xr4XeFXS\nHklrqBaVGQf+JGk/1WpkI2X/+kO4h4ARSePAXVS3llq+SzVV8qb6sWxvAfaVYz5K9VzhSJu6abPd\nsozqmcQo1W2piEmXB9IREdGQK4eIiGhIcoiIiIYkh4iIaEhyiIiIhiSHiIhoSHKIiIiGJIeIiGhI\ncoiIiIb/AxSD6Sq0YLMCAAAAAElFTkSuQmCC\n", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYcAAAEPCAYAAACp/QjLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnXd8VFX6/z8nJBAS0iAkQJCuoCC6IJbFEjvWtay6rq79\nu/ysW3R13XVXEAsdFZFFFFxXwYYVEQSlKChFWpBOSEgjpJOeSeb5/fF4cu/cuXfmzmRa4nm/XrxI\nJndumbn3fM7neZ5zjiAiKBQKhUKhJyrcJ6BQKBSKyEOJg0KhUCjcUOKgUCgUCjeUOCgUCoXCDSUO\nCoVCoXBDiYNCoVAo3AiqOAghFgghioUQWRZ/TxVCLBdCbBdC7BJC3BXM81EoFAqFPYLtHBYCGOfh\n7w8B2EZEpwPIBDBDCBEd5HNSKBQKhReCKg5E9C2ACg+bFAFI/PnnRABlRNQczHNSKBQKhXfC3Uuf\nD+AbIUQhgAQAN4f5fBQKhUKB8Cek/wFgOxH1AXA6gDlCiIQwn5NCoVD84gm3c/g1gOcAgIgOCSEO\nAxgKYIt+IyGEmgBKoVAo/ICIhD/vC7dz2AvgEgAQQqSDhSHbbEMiUv8C9O/pp58O+zl0lH/qs1Sf\nZyT/awtBdQ5CiMUALgCQKoTIA/A0gBgAIKJ5AJ4HsFAIsQMsVI8TUXkwz0mhUCgU3gmqOBDRrV7+\nXgrgmmCeg0KhUCh8J9xhJUUYyMzMDPcpdBjUZxlY1OcZOYi2xqVCgRCC2sN5KhQKRSQhhAC104S0\nQqFQKCIQJQ4KhUKhcEOJg0KhUCjcUOKgUCgUCjeUOCgUCoXCDSUOCoVCoXBDiYNCoVAo3FDioFAo\nFAo3lDgoFAqFwg0lDgqFQqFwQ4mDQqFQKNxQ4qBQKBQKN5Q4KBQKhcINJQ4KhUKhcEOJg0KhUCjc\nUOKgUCgUCjeUOCgUCoXCDSUONtlftj/cp6BQKBQhQ4mDDbKKs3Dq3FNR56gL96koFApFSFDiYINl\nB5ahqaUJP+T/EO5TCSgNzQ2od9SH+zRcOFB2ALVNteE+DYXiF88vQhyaWpra9P5lB5fh9F6nY23O\nWp/eV+eoAxG16diShuaG1p9bnC1t3l9jcyMu+u9FeHzl423eV6CobapF5n8z8e/V//Z7H6V1pbj3\n03tR01QTwDMLP9WN1ThaczTcp6H4BRFUcRBCLBBCFAshsjxskymE2CaE2CWEWNPWYxob42O1xzDg\nxQH4bN9nfu2vsqES24q24d/n/xtrc+2LQ01TDU7/z+mYv3W+5TaldaXYVLDJ674+2vMRLn7rYgBA\ns7MZvWf0RvLkZIx7exxyK3NdtrUjHESEB5c9iE5RnbBkzxI4yQlHiwMb8zd6fW8wmfH9DAzvORwL\nty9EcU2xz+8vry/Hpf+7FB/t/Qif7/scADB9w3R8svcTn/flaHH4/B4AqHfUY3HW4tbfa5tq8Y+v\n/4FhrwxD/vF8v47xfd73GPmfkbjto9taXyurK/Pr/PzlSNWR1mOW15dj1vez2hRm3VywGf/4+h9Y\nsG1B6+ewLncdPt7zcUDO1y6Hyg8FbF9EhIeWPYSdxTtdXvf3Xgo3wXYOCwGMs/qjECIZwBwA1xDR\nCAC/9ecg0hnUNNVg6CtDW3v4RIQ/fv5HDEsdhr+v+juanc0AuNf86IpHbd0YKw+txHn9z8Mlgy7B\nlsItqG2qxbi3x2H6huke3/fXFX9F15iuWLBtAQAgpzIHS/cvddlm/NLxuPvTu03fP2fTHHyd/TUA\nYPGuxfg+73uU1pViY/5G9Enog+w/ZePigRdjzPwxmLR2ElZlr8LVi67Gma+f6fWa5myeg40FG7Hs\n98uQGpeKDXkb8Ma2NzB2wVjsOLrDbfuaphpUNlQC4B7swm0LQURwkhN3fHyHm6gcrTkKJzm9nsdL\nP7yEPSV7AABF1UV4aeNLmHf1PNx26m2YtmEaAH6w/vXNv7C1a
KvX/d372b24oP8FmHX5LLy/+33U\nNtXi2XXP4l+r/+XVwe0u2d16PyzKWoQRc0fYugYjn+77FLd/fDsq6isAAFe8cwUOVRzCdcOuww3v\n3YCG5gYQEaaun4qMmRmt96QVOZU5uHrx1Xjh4hewpXALSutKsb9sP3rN6IXdJbvdtt91bBfmbp7r\n83l74+EvH8aTXz8JAHhrx1uYsn4KRrw6ArO+n4W1OWtbOyWbCjbhYPlBj/tqbG7ErUtuRU1TDWb9\nMAvvZL0DAJi0bhIe/vJhr07/ze1v4p9f/7P1d39Do0drjmLI7CHYVrQNALcjB8oOYHPB5tbQZk1T\njW33vzJ7JT7e+zEueesSfHngSwDcmRsyewh+LPzRp3MjIr+jDiW1JYHpPMiTCNY/AAMAZFn87QEA\nz9jYB/1l+V/on1//k178/kVam7OWSmpLqKK+gu74+A5KnZpK+0r30RMrn6BBLw2icxecS06nk+Zt\nmUcj546kBkcDXbDwAnpty2tUUV9BV71zFfWf1Z+ue/c6smLN4TV01yd30ah5o+iVja8QEdGY18bQ\nNYuuoXMXnEvDXhlG/1j1D3I6na3vqW2qpTmb5tDNH9xMA18cSOV15dR7em/aU7KHLn3rUuoyqQvt\nOLqDiIi+2P8FDX5pMHWf0p3yq/Jdju1ocVDatDQa89oYqmmsocQXEunXb/yaFmctpqe+foqeWPlE\n67Y7ju6gR5Y9QqfNPY2mrZ9GadPS6HDFYcvrWn14NaVNS6ODZQeJiGjC6gn0wNIH6ISZJ9D/ffZ/\ndM7r51CLs6V1+8r6Sho9bzQNemkQHSo/RFe8fQV1fbYrzdsyj17d9Cr1nNqTzpp/VuvnkFWcRYkv\nJNJLP7zkduyyujK6/aPbqbyunLYXbaeoiVF058d3EhHRYyseo0eWPUJERPlV+dR9Sne665O76OL/\nXkxp09Jo/OfjLa+JiOjHwh+pz4w+VO+op4r6Ckp8IZFmbJhB1yy6hobPGU6rDq1y2b7F2UKbCzZT\nc0sz1TvqacjLQ+iEmSfQzqM7KX1aOvWZ0YdWH17dun3B8QL6Nvdb02Ovy1lHKw6uICKi69+9nro+\n25Xe2/Ue5VTkUI8pPcjR4iCn00k3f3AzDXl5CGW+mUkjXh1Bg14aRBvzN7rtr7qxuvWe+NtXf6O/\nLv8rERHd+N6NtGDrAnpsxWOUMSODbvngFrf33vT+TRT9TDQdKDtA9Y56mrB6ApXUlrQ+D1sLtxIR\n0YqDK2jc2+OopLbE4+dKRFTvqKeE5xOo+5Tu1OBooLNfP5uWH1hOKw+tpAeWPkCnzT2Nhs8ZTnd+\nfCd1mdSFfr/k9x73N/W7qXT1oquJiOij3R/R+QvPp8LjhZT0QhKdt+A8+u/2/7q9J68qjxwtDqpp\nrKFe03tR4guJVFJbQj/k/UDdnu9GhccLWz87/f1rJKcip/W7WrRzEXWZ1IVuW3IbOVocdPbrZ1P/\nWf3ptLmnUfxz8ZQ+LZ2in4mmG9+7kZqam0z31+JsoaLqIiIiuvKdK+n1H1+ntTlrKX1aOjlaHPRN\n9jeECaB7P73X6+e8tXArVdZXUkV9BV3034tanwci/g7s8uSqJ+nRFY8SERE38X623f6+0fYBPIvD\nLACvAFgNYAuAP1hsR8+tnk7PrHmG7l96P501/yxKnpxMUROj6IGlD9DLP7xM/Wf1px5TelBeVR4N\nnT2Unlz1JKVPS6e9JXuJiGhj/kaKey6O4p+Lp9s/up2qG6tpwIsDXBoASXFNMfWa3oumfDeFpnw3\nhUprS4mIG7CE5xMopyKHjtUco1HzRtEDSx+g2qZaWrRzEQ14cQBd/+71tGDrAio4XkBE/HCfOf9M\nOvmVk1vFav6P86nvzL60/MBy+u37v6U3t73pcvyvDn5Fo+aNoiEvD6E/ffknuvStS2n2xtl09yd3\n05jXxpies+SuT+6il3942
fRvP+T9QGnT0mjloZWtr2UVZ5GYIGjc2+OoxdlCZ79+Nj279llyOp1U\nWltK5y04jx784kGa9f0sin02li7732WUVZxFqVNTqceUHpRVnEWj5o2ixVmLaWvhVuo3qx89sfIJ\nypiR4XZDP/jFg5QxI4MufetSuvDNC2nC6gmU9EIS5VXlUfcp3Sm7PLt126PVR2nqd1Pp2bXP0u5j\nu6n39N7U4myhktoSWpuzloiIyuvK6dwF59KS3Uvo2sXXulz31Yuups6TOtOXB76k17a81toYEREt\n27+Mhs8ZTsmTk+mOj++gCasn0HXvXkfPrXuOYp6JoT9/+Weavn463fXJXeRocdCdH99JyZOTKXly\ncuv9JGlxttApc06h3tN709Hqo5T4QiI9s+YZuuuTu+jF71+kuz65q3Xb5pZm2lq4ld7f9T5V1FfQ\nQ188RJO/nez2PY3/fDxlzMigwxWHKXVqKh0qP0RERO/sfIcu+99l1HNqT9pWtI3SpqXRruJdre87\nUnmEUian0ONfPU63fHAL3fnxnXTS7JPopNkn0T2f3EMZMzJo+JzhVNNYQ0NnD6VrFl1DQ2cPbd2/\npKyujNbmrKXNBZvJ6XTSlwe+pHMXnEsXLLyAZm6YSalTU10aS6fTSV/s/4Ke+vop2lW8i1Imp7T+\nvbqxunW759Y9R7/78HfUfUr31s+xsbmRek7tSQ9+8SDd8fEdtPzAcjr11VNbOxu1TbX0xMonqPOk\nznTt4mtpwuoJdPMHN9Pdn9xNz659ls5feD6dNPskenTFo1TdWE2DXxpMp809zeUe13PzBzdTv1n9\nqMXZQvd9eh9NXDORUian0ENfPESXvHVJ63HrmuroSOURqnfU01XvXEU3vnejqei8/uPrFPtsLD23\n7jlKm5ZGdU11RER01vyzaNn+ZfTIskfo4WUPU/LkZKqor2j9vF7/8XWauGYivbrpVSqtLaVp66dR\n6tRU6j6lOw18cSDd9+l9lDI5hQqOF9CS3Uuo2/PdWkXNDHluLc4WOmHmCa2d0PYsDq8A2ACgK4Ae\nAPYDONFkOxo9+ml6+mn+t3r1aiIianA0tH4409dPpze2vkFERIuzFlPnSZ1pXc46lw9Q38snIno3\n613qObUnDXppEA2dPZRu/fBWmrZ+Gl3+v8tdeueS7PJsl4a5sr6Szl94PsU+G0sX/fcit94pEdGu\n4l2ECaAVB1eQ0+mkuz+5m2547wZaum8pERHN2zKPbltym8t77v7kbpqxYQbN2TSHMAH02pbXaF/p\nPkqdmkqJLyRSY3Oj23EkS3YvoUvfupSqG6vpkrcuoeUHlhMRN4ipU1Nbj6v/TC5961LalL+p9RpP\n/8/pdPn/Lqf0aen06IpHW2+8lYdWUlVDFRFxj++1La8REbuRqIlRNODFAfTqpleJiHtR8mciop1H\nd1LPqT2puKaYLv/f5XTyKyeTo8VBt3xwC/3qP7+iG967wfKaiIiGvTKMNuZvpHs+uYe6TOpCO4/u\npAeWPkDXLLqGBr00qNU1SN7e8TYNfmkwtThbqK6pjjJmZNC9n95LT656kvrO7Etf7P+Cahpr6MI3\nL6S45+IopyKHnE4nvbntTapurKai6iJKeiGJ7l96P132v8voeMNxmrFhBl3+v8vJ6XRSdnk2Nbc0\n05LdS+iM186gm96/ic6cfyZd8fYVdLDsIPWa3ovOX3g+fbr3U4/f1bi3xxERO7h1OesotzKXUian\n0ANLH6CeU3vSNYuuad2+sr6SYp6JoYv+exEREc3cMJMSX0ikfrP60d+++hv9Zflf6OFlD7f2rk//\nz+lU01hDczfPpRvfu5Eq6yvpyneupNHzRtOlb11KTqeTZm+cTWnT0mjZ/mVExAI2et5oOuO1M6jX\n9F70zs536KEvHqIXvn2B5v84n7pM6uLVxY15bQytOrSKluxeQsmTk6ngeAGtPryaBrw4gBb
tXEQb\njmxw2f4vy//i8oyMnjeaHlj6AO0p2UOnzT2Nbnr/JjpSeYR+9+HvKGpiFO0+tpu2F22nrs92peFz\nhlNORQ6lTE6h25bcRnd8fAd9+NOH1H9Wf/rr8r+6iNjekr3Uc2pPGjp7KH2b+y0NemkQZRVn0Z+/\n/HNrp88M6Zbmbp7r8rrT6aTT5p5GMzfMpMEvDaZ/f/Pv1r/N2TSHbvngFuo3qx/tKt5Ft3xwS2vn\n5d/f/JtGzh1JT339FP1+ye8p4fkEOvmVk+lI5RHKrcylLw98SU6nk/785Z/prk/uol7Te9HMDTMp\nbVoaTVs/jSrrK1uP878d/6OTXzmZukzqQm/veJtmLppJ6Velt7aV7VkcngAwQff76wB+a7IdpacT\n7d9v+t254XQ6Ka8qz9Z2Wwu30v7S/bTj6A56c9ub9MiyR+i+T+/z2ADraWpuanUWVmwv2m75t+zy\nbEqflk7birbRKXNOoefWPUcpk1MovyqfaptqaewbY+lYzTFyOp004MUBLo2FGdWN1ZTwfAL97sPf\n0WX/u4zSpqXRje/dSH1n9nUTSyvqHfU05bsprYJhh+KaYhfx/SHvB0qenEwXLLyAzltwHvWY0qNV\nLOqa6uho9VEiolbbbRWykTyx8gm68b0bKXVqKs36fhYNeHEApU9Lp/K6cjrecJwOlB1w2d7pdFJ5\nXXnr72V1ZfTU10/Rje/d2BoGIOKeqQy1GLnqnato0EuDqKyujIj4uz75lZPplDmnULfnu9GZ88+k\n4XOG0yd7PqEDZQco+ploWrhtIRERnfjyiRT/XHxrT9KM0tpSSng+gTblb6IeU3pQz6k96Yq3r6An\nVj5BzS3N9IeP/kDrj6x3ec8fPvoDfb7v89ZrLDheQHtL9tItH9xCYoKgfaX7iIhoS8EWl+uUHK44\nTOnT0imrOKv1tXU566j39N70xtY3aO7mua1h2U35myh9Wjr1ndmXdh7dSRX1FdR5Umf6Jvsby2si\nInp27bN0/9L7aejsoXTpW5fStYuvpdHzRtPirMWm2+8q3kXD5wwnR4uDiNgR3vzBzRT9TDS9+P2L\nrfdVi7OFthVta33f7R/d3uoQ7vv0PsqYkdHaOy+rK6Mr37mSRs4dSe9mvUt5VXn0h4/+QBPXTKTn\n1z1PV75zJaVNS2u9T7zd69Itf5v7LY15bQzdv/R+WpuzlobOHkotzhZqam6i5pbm1u1La0sp9tlY\nGvLyEHI6nbT+yHqKfy6eRs4dSSe+fCIV1xS3bltRX2F6n+RX5VPnSZ3pT1/+iYiIdh/bTbd8cAul\nTE6hyd9Opi/2f0Hp09Lpu9zv6Ie8H6jn1J407u1xNH399NZ9tGdxGAZgFYBOAOIAZAE4xWQ7uvVW\nogULPH5/7ZYhLw+h5MnJ9PIPL9N1715HV71zlel2T656kt7a/pbX/Y17exwNemkQHW84TtuLttNf\nl//VpaEMFftL99M32d/QN9nfuIUuJE6nkz7d+6mbqzOy/sh6wgTQ5G8nk9PppIe+eIje3/V+ME67\nlezybLfeZFZxFi3dt5QcLQ6a/+N8+u37v211Vl9nf936kD+y7BG68b0bvR5j5NyRNOTlIfTi9y/S\nBz99QD2m9KBjNcf8Ol87HSIiMg2P7C3ZS31n9qWkF5JaQxJEHEo9YeYJrd/PvtJ9Xr+rrOIsipoY\nRZlvZlKDo4FOmXMKjXltjMdcgHGfTqfTVNysqKivaM2j6ffx+b7P6YKFF1CfGX3opNknUXldOWWX\nZxMmwDRn44knVz1JnSd1ppkbZtI5r59D3ad0p9kbZ1tuf/2719Pfvvpb6+/HG47Td7nf+fT9rstZ\n5yYch8oP0aVvXeoWHZm0dhJFTYxqzb8QRbA4AFgMoBBAE4A8APcAGA9gvG6bxwD89LMwPGKxH5o9\nm+i++2x/pu2KeVvm0Ue7PwrY/nYc3UF7SvYEbH+RQHN
LMz224jGqbaoN96nYoqqhylYj8Kcv/+SS\nn9H3PkPNwbKDtGjnIpfXGpsb3VyZN5xOJ2W+mdmabD9ccdglnxQJnLvgXJr/43yf3tPU3NTaWais\nr6QHlj7QGmY1o6qhyqdEsi84nU4XESDiQhZjdKAt4iD4/ZGNEIJ+/JHwhz8AP/0U7rNRKAJHdkU2\nCqsLcW6/c8N9Kr8ojjceR7fO3RAlOvY4YCEEiEj49d72Ig4OByElBThyBEhJCfcZKRQKReTTFnFo\nN7IZHQ2MGQP80LGmN1IoFIqIpN2IAwCMHQts2BDus1AoFIqOT7sSh3POUeKgUCgUoaBdicOIEcC+\nfeE+C4VCoej4tJuENBGhpQWIjwcqK4HY2HCflUKhUEQ2v4iENAB06gT06wfk5IT7TBQKhaJj067E\nAQAGDQKys8N9FgqFQtGxaXfiMHCgEgeFQqEINu1OHJRzUCgUiuCjxEGhUCgUbihxUCgUCoUb7VYc\nIrECt64O+L//C/dZKBQKRdtpd+KQlAR06QKUlIT7TNxZsgRYsCDcZ6FQKBRtp92JAxC5oaXXXwec\nTqClJdxnolAoFG2j3YrD4cPhPgtX9u8H9u4FOncGGhvDfTYKhULRNtqtOBw8GO6zcGXBAuDOO4Gu\nXYGmpnCfjUKhULSNdikOQ4dG3gR8O3YAmZnsHJQ4KBSK9k67FIdhwzRxKCsDZswI7/kAHEqKjVXi\noFAoOgbtUhyGDuX4PhGwejUwe3a4z4jFoXNnrqRS4qBQKNo77VIcUlKAuDigsBD48Ufg2LHQjHs4\nehTYtcv8b42NLAzKOSgUio5AuxQHQAstbd0K1NcDtbXBP+bHHwOzZpn/TS8OqlpJoVC0d9qtOAwd\nCuzZw86hWzd2D8GmoQFobjb/m3IOCoWiIxFUcRBCLBBCFAshsrxsN0YI0SyEuMHuvocNA1auBGJi\ngFNOCY041NcrcVAoFL8Mgu0cFgIY52kDIUQnAFMALAdgezm7YcOAZcuA0aOBtDTlHBQKhSKQBFUc\niOhbABVeNnsYwIcAfJotaehQwOEIvTg4HOZ/U+KgUCg6EtHhPLgQIgPAbwBcBGAMANs1RwMGcEM8\nejQ3xso5KOzidAJC8D+FQmFOWMUBwIsA/k5EJIQQ8BBWmjBhQuvPmZmZyMzMxEMPAWPH8lQaR44E\n/2TtiIMa5xD5/OtfQL9+wPjx4T4ThSKwrFmzBmvWrAnIvsItDqMBvMu6gFQAVwghHET0mXFDvThI\n5MjotDSuWgok330HFBQAt9yivWYlDvK16GhVytoeqKhg96BQdDRkx1kyceJEv/cV1lJWIhpERAOJ\naCA473C/mTB4Ixg5hxUrgK++cn2tvt4859DUxI4BUGGl9kBTU2SuB6JQRBJBdQ5CiMUALgCQKoTI\nA/A0gBgAIKJ5gTpOMMQhN9e9kbdyDjKkBChxaA84HEBlZbjPQqGIbIIqDkR0qw/b3u3vcYIlDomJ\nrq8pcegYOByhKWBQKNoz7XaEtJ7UVKC0NLBx5Nxc9yk5rEpZlTi0LxwOFVZSKLzRIcShc2cgIYET\njYGguRnIzzcXB+Uc2j9KHBQK73QIcQACG1oqKuJ1oH0Rh86d+WdVyhr5OBzA8eOqqkyh8IQSBxNy\nc3l/RnGwmlvJ6BxUoxPZyNCgcg8KhTVKHEzIzeXJ/FTOoWOixEGh8I4SBxM8iYPKObR/HA5e0lVV\nLCkU1nQYcejbl6fRCARHjgAnn8xhJH0FlBKHjoHDAWRkKOegUHiiw4jDuHHA0qWBWS40NxcYOJAb\n/Pp67XUVVuoYOBxAnz5KHBQKT3QYcfjVr7iR3rOn7fvKzeWJ2eLjgbo6fq25mSuYlHNo/0hxUGEl\nhcKaDiMOQgC/+Q3w6adt2w8Ri0P//iwOMu8gHYQSh/aPcg4KhXc6jDgAgRGH8nJeejQx0VUcGhqA\nrl29i0OXLqqUNdJ
ROQeFwjsdShwuuADYvx84etT1dX2eYOtWYP16632UlgI9e/LPRnFISFCzsnYE\nVFhJofBOhxKHmBhePjQ7W3utpQUYPhzYsoV//89/gHfesd5HRQXQvTv/bBSHbt3YORiT3uEMK/3t\nb8CCBaE7XkdAhZUUCu90KHEAuNev7xF+9RVw4ACwejX/vnGj50ahogJISeGfjeIQFwdERblP8BdO\ncdi0CcjLC93xOgJKHBQK73RIcdA/9G+8AVx4IYeSamqAXbv8F4fYWF7tzRhaCpc4EAE//aTWJvAV\nh4Nn8q2vV/khhcKKcC8TGnDS0rTG/9gx4OuvecnPCy8ENm/mRLOnWHN5ubk41Ndr4mBMSjc2ams/\nhFIcjh0DysqUOPiKw8HfU48e/Pn16RPuM1IoIo8O6Rxk4//++8C113LOITaWcw1XXNE25xATYy4O\nclbWUIrDTz/x/0ocfMPh4O9RVZYpFNZ0SHGQjf/evcAZZ/DPY8cCb78NXHklN6YtLebv95SQ9uQc\nwlHKuns3cOKJShx8gYi/v5gYVVmmUHiiw4mDPqyUn89zLgHAr3/NjfbYsUBSEoePzNA7h7g493EO\nkZRz+Oknvp6OKg47dwKzZwd2nw4Hf4dC8HdlVpqsUCg6oDjonYNeHM4/H+jdGxgwwL2iSY9VzsFb\nWEmJQ+DZuLHtgxqNyJASwP8r56BQmNMhxUE2/Hl5mjicdhqQlcU9RmNFkx471UqRIA6yUqkji8Ox\nYzwoMZDoxaEjOoejR7WybYWiLXRIcSgt5Qa7spLDTJIePfh/fejJiJU46KuVIiGsJAXwpJOA6mr3\nsRcdgWCLQ0d0Dl99Bbz0UrjPQtER6HDiEBvLDfWePRxG6tTJfRtvzqEtCelQicOePbzmRKdOfJ7V\n1YE/RmkpT4UeLo4d4+8pENOwSzq6cygtDc81lZUB06eH/riK4BFUcRBCLBBCFAshsiz+fpsQYocQ\nYqcQYr0QYmQgjtuzJ8+hJENKZn8/dowb8TffdO11G52DnLLbbs5B9kYD2aCZceQI508AIDk5OKGl\no0eBFSsCt4iSHZYs0aY/kd9RTU3g9t/RnUO4xGH/fuCtt0J/XEXwCLZzWAjAU98zG8D5RDQSwCQA\nrwXioGlpwLZtnsWhpIQTnnffDfz97/x6YyM3FvHx/LvZrKxmzkE/8V5UlHnoKdAcOcJrTgAsDhUV\ngT+GFMYVKwK/byteeglYuZJ/lqGzQIaWfgnOwWzm4GBTX9/xPstIgyj4nU49QRUHIvoWgGWzRUTf\nE1HVz7+C3BnvAAAgAElEQVRuBGDRnPuGN+cgcw6bNwO/+x2vILdwoeYahODt/Jk+A+Cfg90jNYpD\nMJxDOMTh8GG+NoDFoW/fwM6BpJxDcKirU+IQbGbPBp59NnTHi6Scw70AlgViRz17Atu3e3cOW7YA\nl18OTJgAfPaZa0gJ8K+UFQhN3iFU4nDGGcDataFpRBsbgYICvraWFi4rHjZMOQdfCKc4hMOxBIsf\nf+QZjyOJgoLQThYZEXMrCSEuBHAPgLFW20yYMKH158zMTGRmZlruLy2Nb1Zv4pCfD/zzn+wUdu1y\nTUYDvs2tFA5xOOEE/jmY4tC/P/+8YQPg4SMPCLm5bJuPHGFhSE4GevUKnjiE2jnU1vLx9B2QQFNa\nyoM3Q01HCytlZ3P0IZKoqrKe2UGyZs0arFmzJiDHC7s4/JyEng9gHBFZhqD04uANuViPJ3HIyWFR\nGDaME9L5+azM3pyDnbBSsMWBiMdwhMI5xMUBl1zCtfPBFofDh1mMjhzhkFJaGs+eGqywUqidw/z5\nwLJlXG4aLEpLuUov1HS0sFJDgxZWjRSOH+ecpieMHeeJEyf6fbywhpWEEP0AfATgdiIKWE2MFAfZ\nszaSmsqN/q9+xaWgMTE8R9GGDfbEIdzOobKSb5KkJP492OKQnh6agXaHD/NI9oICo
KhIE4eO4hxK\nSjjZvm1bcPbvdHJJaXsIK1VXR/bgzYYG7dmPFKqq+LxCRbBLWRcD2ABgqBAiTwhxjxBivBBi/M+b\n/BtACoC5QohtQohNgThuWho3+r16mf89OprDR3JSPgAYMQL49ltXcZAzrTY1adVKdnMOwZx8T59v\nAOyJw/ff+26TpTjExobmpjx8mJ1c9+48r1JamjaoMVAYnUMoxaGiAhgyBJg2LTj7r6xkgQiHOPga\nVnrlleB9DoGgvj7ynENVFZ9XqAhqWImIbvXy9/sA3Bfo46alsTCYDYCT9OwJjBmj/T5iBPDeezyl\ntx7pHrw5BykkQPAbHTNx2LHD83vef5+vZdQo+8eR4hCqmWYPHwauv56vbcuW0DiHUDakFRXAX/8K\n/Otf7t9hICgt5Xs+HIlhX8NKlZWRPV16pIaV9O1MsImkaqWAMXIk8OGHnrd5/HHgssu030eM4F6X\nMVloJg7hLmXV5xsAe86hutp9JtqWFs8JLr04hMo5DBzI17Z5c2hyDqF2Dv378/154EDg919ayiHA\n9hBWksn5SKW+PjLDSqF0Dh1SHDp1As4+2/M299yjzbUEsDgArtVKgCYOVtVK8udonQcLhXPQ51Ps\niMPx4+7iMHUqMGWK9Xv0YSV/e3l1dfYbd704HDzYMZ1DSgqHJ4MhtjIZHa6wUkuL/UFaNTXKOfhK\nxImDEKKbEKLTzz8PFUJcK4SICf6phZYBA7gh9OYcZM6hqQn4zW/cXQMQnrCSHXEwjqIuKuKqLStq\na9vuHBYtAv7xD8/bELGzqa9nQZDX1hFzDikpwcvhSHEIV1gJsC9MkS4O9fX8OUZKBRYRP8MRJQ4A\n1gHoIoTIALACwB8AvBnMkwoHUVHA6NFARobr61ZhpepqHjhXWNg+xMEsrFRZCRQXW78nEM6hutrz\npIBNTSwCzz/PAi2ENrYiLY0bU08r9/lKJDiH2NjgPOThdA4dTRykeAcrtPTmm751EGpqWCAiTRwE\nEdUBuAHAq0R0E4ARwT2t8LB6tWsFEwAMHcpVPsaEtPxid+8Ojzj4E1YyOoeqKnvi0JaEdF2dZ3t+\n/Dj/ffFiDikBrs4hOtrzyn2+Ei7nQMTfUbCdQ69e4QsrAfZdS6SLg7yeYIWWHn3Us2s3UlXF922k\niQOEEOcAuA3AF768r71hVt109dXAF1+4l7J6E4dg3vgVFa75ksRE72s6WDmHo0et3xOIUlZvJYHV\n1Rw62rwZmDWLX9OLAxDY0FJTk1bxEUrnUFPDx+3cOfg5h/YSVorkhLT8foIhDvX1/Cz60kYcP87F\nBpEmDn8G8CSAj4noJyHEYAC/mLWmLrmExwjU1kaGcyDiG1bOHAuwqHXrxjeQFcePa3XwEukcrJKI\ngXIOnqx5dTWQkMACcOKJ/FpqKpd7JiZqvwdKHMLlHPTzdgXbOTQ3h3b2TsB3caitbR/OIRhhpcJC\n/t+X66+q0sQhVN+tV3EgorVEdC0RTRFCRAEoIaJHQnBuEUFCAvDrX3PMu3NnLecgH+49e0IrDg0N\n3LgZXU5KCo+ONUMmfGNj+SaTVFby/qxEJRClrHacQ0KC62tCAM88o82OG8hy1nDlHIziEKycQ8+e\nnD8LtXuQ33FHCSsF0zkUFPD/vopDjx783YbqnrVTrbRYCJEohIgHsAvAHiHE48E/tcjhqqv4gRbC\n1TkIYS4OwRznUFvr6hokqanW4tDQwDdVr16uoaWqKi7dtco7BKqU1VdxMNK9u5YvaW5um4sIpHNw\nOIB16+xtGyrnkJpqPoo/2Eix6ygJ6fp6ftYjSRwSEzkkGarQkp2w0ilEdBzAdQC+BDAAXLH0i+Hq\nq/mhA1xzDoMG8c0TSufgSRysGs3qar6xUlK0RtbpZMdw4onexSGYCWl5bp7QJ9yXLeMxKv4SyIn3\nli8H7rrL3rZ6cQhmzkGKQ6iT0jLUaee4Tmfkh
5UaGrhTEoywkj/icPw4F2ZEmjhE/zyu4ToAnxOR\nA0CII5rhZdAgYN8+/lkfVho8mBuYSBEH6Ry2bdPOF+AbKyGBb3bpHGpqeD8ZGdZJ6VAlpL05h6Qk\nTRyOHfNcYeWNQE6899ln9h/UYDuHlhbuXaakhGYlQiN1dfw92XEsMm4eyeJQX89hnEhyDpEoDvMA\n5ADoBmCdEGIAgCoP23dI5Bz5+rBSfDyXXwZDHBoazGv77TiH//yHV7aTmDmHykq+2dLTPTuH+Pjg\nl7J6E4fkZC1XUlnZtrLWQDkHpxP4/HP7jUewcw7V1fxdyVmGQy0O9fV8j9k5ruyNR0K10g8/mCd4\nGxqCKw6dOnUAcSCil4kog4iuICIngFwAFwX/1CITGVZqbOSHfPDgwJWyPvig1qO8917g7bfdt6mt\n5cokI3pxOHqUF3yXmDmHqipudK3EQQ646dpVy6F4KpW1oq6O32vVo7TjHPRhpYoK69yKHQLlHDZt\n4u/BH+cQjLCSDDsAoc85OBzckbEbVqqp4c/Al2ckGBVYTidwzjnA3Lnuf5POIVhhpX79fA8rJSaG\nboZkwF5COlkIMUsI8aMQ4kcA0wGEYa2pyEDvHDyJg7HR2bMHWLDAer9NTcCrr3LZLBGwahUvQGTE\njnMwioN0DvrErnQOvXqZh5UaGvi6oqI48e6vG5KNp1Uj6o84tGXEdKCcw2efATfdxPeCnYY42GEl\n2XgAoQ8r1dezs7YrSjU1fC/60jheeSWPhQkkDgff308/DWRluf4tmM6hsJBD1b48TxHpHAAsAHAc\nwE0AbgZQDWChx3d0YPQ5h9hYnp/fjjhs3QosWWK9X9kb/vprYO9ejq2bJZhlrsCIXhyKinjiOtnT\nl84hJcW+c5D5Bom/5azy4bJ6yHwVh8pKbbSxPwTKOaxcyQ2W3Yc12OIgq1mA0IeVpMO0e9yaGm54\nfRGH3Fx7uaamJvsOo6mJ7/GnngKefdb1bzIhHWhxIOLnc+DADhBWAjCYiJ4momwiOkREEwAMDvJ5\nRSxG53DHHdzz0GNWylpf7zrGwEhpKffQv/4aWLuWb1qz2n5vzoGIH6KEBJ7aG/DPORjFwd9yVm8l\ngf44B8D/vEOgnENVFY/g9vawHjvG30mwcw7hDCvJe8WuY/HHOZSUeJ6jS3LLLfwM2UGOlh82zLWz\nIUOqwRCH0lIORyYmdoxqpXohxHnyFyHEuQAibDLb0KEvZY2N5YZrsEEqzZyDp8FmAN80Z50F7NrF\nic6rrjJ3Dt7EoaKCH9QRI7Q1A2TIwegcPCWkzZyDP+JQV+fZnvsjDlFR/ucdAuUc5AJPcXGeH9Zr\nr+XvMxQ5h3A5B3mv+OIckpP5Zzsi1tzMn58dcSgp0aqBvCHFIS7O9f50ODhhnJgY+JxDQQFXCNp9\nnqT7j9RxDv8PwBwhRK4QIhfAKz+/9otE7xyM4SSJmTjU13sXh4wMFohly4Abb/TNOfTowfs4epTd\nwEknaXkH2QDrE9KVla5hJaMVN3MO/oaVUlPbJg5JSa7VSiecEH7n0NTE33/Xrp57l0VFvPBUKMJK\n0jkEO+dQUgK8/LL2uz6sZKexl0UVdgeLyo6AHXGorbXfcbASB3k98fGBdw6+isP55wMbN0ZoWImI\nthPRSAAjAYwkotMBXBj0M4tQjDkHM7p25d6RHjthpdRU4OKLufE780zfnEOPHtxgFhWxOJx4orlz\nkGEZebPFxfEDIs/tk094DYZAOAenkx9AT/bcjjgkJvLn2dLC5z94cGDEoa3OQYqDp4e1tBRYupQb\n1FAlpIMdVtq2jee+kr1af8JKUhzs3FOyk2RXHOzeG/JeMIqDfLaNrweCggKgTx/7115cDLz7buSG\nlQAARFRFRLJ5ezRI5xPxGMNKZpx8MoeH9MiwklWyTIrDrbcCkyZZz0RqJQ6yB7R3r2/OAXANLW3b\nxpVSgXAOc
vU8Tz0wO+IQFcXbVFWxOAwZEpiwUludg7ewUl0dN6AnnxyanEOowkolJXy83bv5d3/C\nSt262S/5ls+BHXGoqQmcc4iLC39YqaGBC1ki0jkoXDEmpM045RSurtDfzPX13FBYNZJSHAYMAO68\nkxvUlhb37a3EAeD379rlLg6y4dAnpPVhCH3j3dTEiWy5CpzEH+cgSxzlw+d0crmuHjviALCQFRVx\nLLhPn8hyDt6+0xtv1MaLAMHJOYQyrHTsGP+/YQP/72tYKVKcg5U4yGfbn7DSzTd7jhDIUey+iEND\nAz87dpxqIFHi4CPywZOD4MyIieGE8I4d2mvyC7W6cWRDIhHCfL4kO+LQuzfXUefl8bnKUta4OO3c\n9c5BH/ttbOTFhNpSylpaqgmh7IHJtaQfftjVPfkiDjk5/GB17x5e5+B0ciMYE+P5YS0pYQf4299y\nmFASqWGlI0eAOXO8b1dSAvTty2NyAP+cgy8j70tKtDVLPEEU2JyDr2Glhgbggw9cp64xIvdt1zXV\n1wO/+x0LvxARIg5CiBohRLXZPwB9QnN6kYcd5wDwkqNbt2q/y8bAKiltFAeAGxZjUtquc+jcmXMX\nBw5opaxCaO5B39PU36hNTdxIGJ2DL6WsN9zA5biy0ZAPWXk5N6wyHyMfZrMR30aSk4HDh/l/mV/x\nh0A4B9moCOE5rCS/0379gDVrtNeDJQ76UlZ/RG/dOo5ve+PYMV4/3SgOdh2LPiFtN6w0aJB3cWhs\nZLftq3OQDa7stOhzDr6EleTKbtnZ1tv4OpllQwNP7ihXfowIcSCibkSUYPHPZM00d4QQC4QQxUKI\nLA/bvCyEOCCE2CGE+JU/FxFK7OQcAGDUKODHH7Xf5RfqizhYOQerxjQ1lR1Br17aOWzd6jp/UUYG\nD5Dz5BwaGlgg/A0rHTrEDYgxrKQvo5XXEhtrvgKfkUA6B7kSnL/OQYaUAM9hJekcjJhNR/LII8D2\n7drveXk8G7D+NU/oB8H5G1Y6cMCeaJWUABdeyCN9y8pcnYOvYSU74lxSwoPGjEUeRmRD7qtziIpy\ndcb+OgcpCocPW2/jizjIz3LUKGDLFv45IsQhQCwEMM7qj0KIKwEMIaITAfwRgMksJ5GFv87B17AS\nYO4crEZIA9r7pTiccQbfVPppsS+7jKea9uQcALbG/iSkm5o4N1Be7hpW0seC5WdgN6QE8LkePqyJ\ng7fe4c6dHMIyO79AOQfA88Nq9p0C7DiMjcOqVcA33/DP27cDY8bwNezcae+cApGQPnjQ3nd87BiH\nLs88kyeu82eEtK/OYeBA786htpbvE1/FAXAVAn9zDtnZ/DkEyjno25ion1tqvcvRh62DQVDFgYi+\nBVDhYZNrAfz35203AkgWQqQH85zaip1SVgAYPpx70PobztNSnr44B1/FQe8crryS18SWI6QB1x6c\n/H/vXnvO4YYb3Hu8ckSwMaykT4YDvomDDCulpHBYyVsDkJWl9bb06MNKnTpxQ+3rPE1652AnrGSG\nUWwLCjSn+fbbwPjxnNz0tMa3Hr3Y+5tzsCsO0hGddRZPQBiKUlY7YaXaWr73Gxvt7ddKHOxWK61e\nDbz/vjYvU3Y2cN553sVBFid4O0d5Hnpkpdu+fTxQNpiEOyGdASBP93s+gL5hOhdb6MNKVoPgAP7b\nySdr6l5fzyWjZs6hrk6b1VKPPzmHTp20BmnUKG649Y3wOedwJVVzs9b4651DYyM/YPv323MOBQW8\nP4n8ubzc3Z7ry2gB/8QhOdmecygqMu/16cUB8M89GJ2Dr2ElwLWctaaGBVyKw/r1HLbp1cv+2hWB\ncA52w0rHjvHUISNHAj/9FPxSVrviIPdr5/4AvDsH+ZpV+fl11wFvvMFTdgAsCpdc4i4On32mibUM\ntfrqHCTSORQUeA+ztZXo4O7eFsLwu+lXMWHChNafMzMzkZmZGbwz8oDdsBL
Ag7Vyc7lBluJg5hzK\nyrhBF4ZPIjXVPebsTRzS0zULmpTEOYbcXO0hiI7m0NKqVdrxjM5hyBDgu+9cj2N1Mzc1uT6Iubks\nUGbOoS1hJTmFRkoKX1dNDX8P0RZ3cGGhea/PKA5yNLuxh+YJuzkHu86hsJCT1nl53PDu3Mkhm7w8\nHndih7bOylpezt9LlJfuYkMDX39iIlfkPf0033PBLGX1JawUH68VLPTu7Xl7b84hOpr/ydHweoj4\nOj7/XBuTlJ0N/POfPJGf/j67915O3g8Zoj0Tctp/T1iJQ0OD9f29Zs0arNFXP7SBcItDAYATdL/3\n/fk1N/TiEE7shpUAbvjkF9jQwA2AmThYNSK+OodevbjEUM8ZZ2jhHMmVV7qGXIw5BykOdkpZm5pc\n95+byxOZSecQF6fFbtsqDgCLQ1QU/15RYd0zLyqyJw7+9LKNYSWrEJcn56Af61BQAPTvz43Z/Pnc\n6MbFeV6I6fvvudMhr6mxUbsv/AkrHTzI37u3eYnkNQnBo/Bzc/l79bVaSZayenNtRPx89O+vLYBl\nVcAg99vSYi/v4M05yNdra93Fob6eX+vcmb+H775jcTjpJP4e8/LY7RDx/S7vRSkOdkJfZmElvXNo\nbna9BsC94zxx4kTvH4QF4Q4rfQbgDgAQQpwNoJKI2rAIZPCRzsHTOAdJfLxm/TyFlazEQeYc1q7l\nGLQs/bQSh7PP5p6MnjPOcG+Ar7uOezgSY7XSkCH8s51SVofD3TmcfrrmHIxhpd692yYO8n99Oeu3\n3wJ//KPr9nbFwZ91KuyGlew6BzlqdvRoHiR47rn8utWMuTU1wNixWrWT/BylE/RH8A4eBE491XtY\nSS94nTvzvbJ1q39hJTvO4fhxrdxU/zx52q/dUmd95ZrROejFwez7lccC+Pv66CN+T1ISi4IMLdXX\n83HkefuSc/AUVios1M4jWARVHIQQiwFsADBUCJEnhLhHCDFeCDEeAIhoGYBsIcRB8HKkDwTzfAKB\n3VJWgG8e2UB5Cit5E4cXXuA6+cZGzeqaIQTHgvWcc4577zUxEbj7bu13M+cA2EtIG51DTg6Lg6xW\nMoaVBg5su3MAXMtZP/yQ4/R6CgvN48WBdg7eqpXs5BwKC1kcRo3in8eO5detnEN5udZRAFzHOAD+\nhZUOHGDHB3h2HTLfIBkxgvNTwQor6T/DhATPoSXZcbJb6qyvXDM6B9ljt6pY0ovDeefxFBeDBvHv\nenGQ97q+kyhzDt46JXbEIRgr1UmCXa10KxH1IaLORHQCES0gonlENE+3zUNENISITiOirZ72Fwn4\nknPo1k27KRoafBeHnj254mnbNh534KmM1Yqzz+b8gieMzmHAAA7d2ElIm+UcfvUrFgyzcQ7+ioNs\n/KQ46HuHK1bw56MXgqIiDi8YH8BAOwdjtVJ1NfDEE9yrr6jghsoMK+cAaOIgx60YG3p53fLe0o9x\nANoWVvJWsmwMlY0Ywf/bCSvdfTeXvjY18XHsiENJifZs2BUHO9VsgG9hJSN6cTjzTL5uM3GQxRfG\nsJLdaiVPCWn9foNBuMNK7Q5fcg56cfAnrNS9O9/AjzzC+8rJ8V0c5Hl4wugc4uK4sbLjHPRhpZYW\nvmlPO819nIMUh0GDAuMcMjI4cZuTw/uNinKtgmpp4fcYH55gOAd9zzIvD5g6lavUEhKsXZ4x55CR\nwQ3t7Nl8nwAcW+/Rwz3vJD9v2VDqk9H+XpNdcTBzDoC9sNKKFcB99/H9KJeeDaRzkJ0ns7BSczMX\nYujLlr0lpI2vG48ln6uuXTl8qxcHORBO7xzkZxMTYz+sZJVzKCw0v78DiRIHH5EPgFkFgxEpDkR8\nI6SluTqHnBxgwgQOiZiJQ0wMz8szfjwn5H76yd5UE75irFbq3Bl48kktzCC38ZaQLirSxiE4HNxY\nBzqsJP//85+BWbN4uofLL+fPR5bRFhX
x5Hz6kMCGDfw9BMo5WIWV5PEWLrQOKQHuzqFPHxaShx5y\n3c4s72AmDm0NKx06xNV1/joHb2ElmViOjdXuYbvOwZewkixlNTqHsjJe2vXIEe01u87BmzgA/Ixe\ndhn/rL8XZYelpkbrLNm9dquwUl0d3xNDhihxiCiio/kLkXPreEIm0OSqYcnJruJw//08F1JiIg8o\nMuODD1g4+vfnKZL9cQ7eMI5z6NKFz002xIB1QlofVsrN5fMUgkWioEBzDtXVfO39+/snDrJnLJ3D\nyScD11/P6wpcfjlXgskHv6iIE9/x8drDc9VVbPXNxMEf52AVVqqt5et/5x3rZDTgmnOQzsEMs7xD\noMNKRLzP1FTfncPAgVoHwJMoVVXxvTB3Loc6AXtxd72rTkjwnIA1lrLqke5LPymeHedgJ+cA8EzK\nskhIP92+fqoY/WSWbRGHykr+LHr0UOIQUURHc6PmzTUAWkJaxg4TE7Wb5csvube2aBFXOowZ43lf\nwRQHM+dgto1VWEk6h9xczlcAmjjIhuPoUW1NCX/EITqawxL67SdM4B73ZZe5ikNhoeYcamu58Tt+\nnP8eiEFwnsJKtbVcBFBTY885OJ382fSxmMpSDoTbvJndCKB93lbOwdewUl0df+cxMb47h6goYMEC\nDqV4Oq5835gxnLwF7DWQZWXcCAL2w0pmzsEXcfA152BEP7OBPqwkc3CA9sxZDbADrEdIA9yZ0Hd+\ngoESBx+RCWlv+QZACyvJLzkxUVvw57HHgOnTzRtiM0LlHKzCZWaNhtPJMdyqKv5ZOgfAXRyam/mh\n1S/5WV6uOQE7SNsu6dOHY7tpaebOQT7Yci2JnBz+X18n749z8JSQrqvjBj0z07NzkDmHkhL+TKw6\nG+npLB4LF3JVFhD4nIN+6g1fnQPAI4SluHgTBz1W4rBrl7afsjItqd+tm72wUiCdgz5vqMeTOCQl\n8bnI0GqnTlpYSYpDVBS/7ul7MnMOcpJAfecnWChx8BHZ6/RFHGRiSdriPXv4/2uusX/cfv24IQy2\nc9CHTIzbGB9kWSceH88NlF4cunfnhqRrV+1BM4pDTo62vb/IEb1WYaW6Ou14hw7x96cPBwbCORjD\nSvHxwP/7f1qYwQzZCHsKKQFaWOmrr/i6AC0BLxtKY1jJ15yD/v12xMHKEenDWfv2ufaKfRGHe+7h\nQWUAX6td5+CplLWkhGP0e/dqr9lxDvr7VY8ncRBCq5iqquLOguyk6J2AN+dkVfTStasmDu12nENH\nRFaf+Ooc5NTUcXE88d0FF3jPWejp358ftlA4BzNxkI0GkTalh6wTl3PZGJ0DwNfbqRM/CN27a4u2\ntLRwYy7DUG1FnwQ0hpVknkeKg/Ha2+IczMJKcXGcD7n9dut9yJyDN3Ho1YuT6YWFWmK6vJzfIxsG\ns7CSLzkH/fs9iUNjI5/DCSeY/10vSr/5jTYhHWAuDlbVSsePa6vN+RJW0ouDmXM491zfnYMncfD0\nLMrQUmWl9l35ui67WVgJcBUH5RwiCF/EQSq7/ktOTOSJuHydGko2usF0Dk6n66hR4zaNjTxYSs4G\nKR+ulBSOgxudA6A9DHFx/Fp0NH92Bw7w+3yZ08gTnsJKnsShrc7BLCFt5zvSOwerfAPAzmHTJh7V\nXlKiLWbTr1/ow0r797OYW4VC9cetqdGcDuCbc6iu1sJA/uQc5PQU+rLVkhIef1NZqe0jWM4B0MSh\nqspaHLyV8lo5h9hYlXOISPxxDvp65aQk7glecIFvx+3enW+GYDoHmaw1czSylDUvT2s8pJBIG2/m\nHPT14lIwkpLYfQwcGLhr6N2bGwC5noS+lPX4ce6BHzwYGOdgDCtJRwXw8ex8R/J92dlafbwZcvr1\nq67iz7SkhMWhf3+tkdNPvw74F1ayIw579nCVmBV6x1JXp/X+AWtxMBPm6mpX5yDvG7ulrGbLaZaU\nsNC
eeKLmHjxNn2HHOdgRh8pKnu9MFqb44hzshJWUOEQQQnCYxK5z0FcrAdzDk2s8+3rc/v2D6xw8\njd2Qpaz5+a4hKBlWOniQHzTZg7VyDkBwxCE6mj/X777j3njfvq5hpeHD2d0Ewjnoe5xRUbwP2aAa\nl1e1QjbCcnyBFVIcLrlEG/MgxUGGlfS9a3lNnsJKy5e75gP0zsOTOOzeDZxyivV+9aJUV+dagmvX\nOcjZTktKtBJbX8NKgPv4BHn8oUM1cfA0fYbeOcixCnrsiENJibtz8CXnYBVWOvVUHoOkn54nGChx\n8AMZGvFGTAxvW1HhGlbKzPQt3yAJljhI52CVjAa0Gzkvzz0/kZLCU3zok8uhdg4Ah1puvhmYOJE/\nZ31YKSODz8l4fWbOITcXuOMO6+PonQPgGlryJaxUX+9dHFJTeWLB9HQWP7nKnj6sZCYOVs7B4eBZ\nefWzrwbSOTgcHM5pbPRPHGTp8bFj3JhGRWn3kN2wEuDeq5aJ9GHDXMUhFM7B35yDlXN4912e/VU5\nh0fe6kMAAB5GSURBVAjErjgAfAPJkaEAlwFefLF/x/31r7VJ8QKJ3jlYiYNsNPLzuVeqz090786N\nvZk4hMo5AOzGxo7l0dOAa1gpKYkbVDvOIT+f5wCywvg56UMYdsNKdsUB0GZp7d2bK7yamvhnK3Hw\nFFYqLOTGV78gTaDDSvL9dsJKxsZRXtOxY64hJcB+WAmwdg5DhrDLBdzFQT+9fqBzDsZBcGbXP3eu\na/WRtyl6lDhEIHJuFDvEx/ONKXsi8+Z57pV64qmnODEZaKRz8BRWkjdyfj7/LrePiWEhyMpyFQdj\nWEmO6AT4gSsuDrw4vPgiL9soXZk+rJSYaC4OZs6hvt56OVfA3TnoK5bshpW6dmUX1qWL60h0T/Tu\nzaGd7t21smi5JKu+EfUUVpLfn6/i0NzsOnOrGVKU5GfhzTmYJWRravj7KylxFz1PI6SdTteYvr7h\ndDq18FRSkveEdLCcg7ecw5Qp3FmQWIWVJEocIhB/nIN+OL7VYiXhQjoHT2El2Wjk/byoqxQH6Rwa\nG92dQ6dOWmP82mvaIDbZEAVaHFJS3MM93sTBzDl4EwejiPobVvrpJ++uQU+vXpo4yAFhVVXapHf6\na5KC99vfugqF/P70jZA+52A1Bfnhw3x8T8Inj2sUByLX2VUlVs6hb192Dvp8A+DZOdTV8Wcqx73o\nG/vycn5vTIxrg2omDvX1rkvoJif7Lw7FxXys3r3thZWOH3d1O3acgxrnEGG0JawUifjiHPLyuNGX\n1U0y5wC4ikOPHq6NZP/+2oOYnMz7sKqXDxRG59C/vz3n0NCgNRJmGEXU37BSZaVv4tC7NwuKdA7V\n1e69a0DrwTscPFWFvkHNz+eGy1fn4C0ZDWiOpa6O73spDnK+KePnYlatVF2t5VOKi12vTc4wYIZR\nlPVhIr1r0YuGmTgcPszlurIDp5/VQI8dccjO5u8pPp7vmepq64S0nOLFV3GQ12hnuVVfUeLgB3IO\nGjt06+YaVopE7DgHWcJbV8fJUZmjkNVKgKs4pKdzItWMpCQWBqvprAOFPufgq3MArHup3sJKdktZ\nAd/FobjYNaxkjMsDWiMte5X63mVeHnD++b6Lg7d8A6CJUn093wulpRzSsVou1co5JCWxKOzd63pt\nKSksqPrxCxJ9vgFwnTDPKA6enIMxByRDyMbwjR1xOHKEr0Um1UtL3Z2DvPfq6/m6/BWHyy4DNm60\n3tYflDj4ga/OIdLFQe8cPM31FBvLll+WteoT0oD7VBgjR5rvJykp8CElM4xhpdGj3ceXWOUcAK2X\n+uKLrqEF4+dkDCvZLWUFfA8rAa5hJTPnIMM7UhT0DVt+Pn8GenGwM0Lazmh2fVgpOZnPsaLCd3FI\nSODt9+51T7QnJZkvAWocsax3CPrj60VD/z127syN89697t+JWd7Bj
jjINUUArR2wCivJjoheHHzJ\nORw65Dr6OxAocfADX8QhPj7yw0p2xjnI7fr21W5qfc6ha1fPs5DqOfVUYNy4wJy7J4xhpSFDgMmT\nXbfx5BykOEybxnMbScycgz9hJcB35wBo4lBTw/eWlTjIBsfoHMaM4b+ZTfltJQ6Vld4nSdSLQ1wc\nV+YVF/smDjU1LA5paexWjNeWluZaBSUxOjZ9w2knrCQE/y0ry70i0CgOcjZVTx2p+Hi+Pim63sRB\n3mu+OAc5zqGlhce/5ORYb+sPShz8oC0J6UjEzjgHQHMOenGIieHXVq+2P3bjgguAxx8PzLl7whhW\nMsMq5wBojWtFBfDNN9rfjSLqT1jJH3Ho1o3/paTwPdilCzsBq5yDVVjphBPcVyvz5hwqK71XVckZ\ni+VgLzlpoJU4mFUrVVfzNfbsydN1mImDcWU8wD2s5Mk5mIWV5Ht27jR3DpWVwNatwIMPaq7B0/0u\nBLsHvXOQE1FK2ioO8lrktCry+wwUShz8wNecQ1NTZIuDv85BhpWEsF6sKJwYnYMZ3pxDYyP/rhcH\no4j6E1aSU5lLN2CXXr1cp7DOzTV3Dvqcg74xLCvTRujLiiU74lBV5V0cpHOQJZvp6dwgenIOZglp\n6RyamtzzKT17mjsHY1jJm3MgMheHPXusncPWrTy63FtISZKaqn2usqTdF+fgLawkz106BuUcIgBf\nnQMQ2WEl+VA3NHh3Diec4B5WilSMOQczvOUcKiq4YSkrcx3jYRZWInKvZbciPZ0H2vk6Ur53b9e5\nhnJy7IeVCgv5uJ06sThkZ/N32NysNUJtcQ5WYaX1683zT95yDoD/YSUr5xATwwnipib3SSbj4vg1\nY25FlrMeOcKfd1mZfXHQOweHI7BhJYCvef9+Hn8ixSEvj8ektBUlDn4gLb0d9IuQRypysfeaGs+N\nvXQO+gS2sfonkpC9x6oq6xXnvDkHOcDswgs5dAaYj5CWNfJdutgfx3Lqqb5dDwD86U/aiGkrcbAK\nK+Xna+XDUhykcEqRaqs4GMNKBw+y6/rNb9y3l8KsLxPVOwfAflipuNh1HIVxnIN+P/Jvxvs3Lo4r\n2ozPtnQOublcfbVzpz1x6NnTNecgjyHxJA5EvonDWWex+Dc388p8CxZ4Pz9vKHHwA1/DSkBkOweA\nH1Rvy59ecglw+unuYaVIJS6OG0a5noQZVjkHue51RQXH+C+8UAstmc2tVFdnP6TUFm68kRswwH5Y\nSf4v8w0Ax9UPHHANKQGexUG/nRn6EdIyrPTOO1w6ayYsQriLszfnYBVW2raN702JPqxkTKbLXJRZ\nWMksB6QXh4QEDi/ZEYdevTTBkq7GU85BCE0cHA6+b711NOLj+XscMIA/7/x8nvX5nHO8n583gioO\nQohxQoi9QogDQognTP6eKoRYLoTYLoTYJYS4K5jnEyh8rVYCIts5AHyjVld7buynT+dyVTtzMUUC\n0dGuM8WaYeUc0tI055CSwvNabdrEfzfmZuTiMnYrlQJFQgI35N7CSrKRzM9n5wdwQ7p1qz1xkGNg\nvF2bWVipvJwnQ7TCGFrSVysB7hVSVmGlbduAUaO03/XOweh6ZLjRzDmYzV2mF4fMTD6WHXF45hlg\n/Hj+2Y5zSE01nxnWE1Ic+vRhgcjO5vEOES0OQohOAF4BMA7AKQBuFUIYh9E8BGAbEZ0OIBPADCFE\nkIdGtR1/cg6RLg7SOdhp7I3VSpFMfLxncbDKOaSn8wMr17lOT9eWnjQmpOVsqXYrlQKFDJUZk7b6\nsFJMjLlzyMjgv+3c6V0cZDLaW34kKor/1dRoziEmBrj2Wuv3GCuWZLVSWhqfl3GgpJk41NVxozh8\nuPaa0TkYxaGqivcdFeX6upVzqKjgsM3FF9sXh4QE17Wo5TEkRnHo1ct/ccjIYHFYtozdld2yck8E\n0zmcCeAgEeUQkQPAuwCMkcciA
PLRTQRQRkQ+LHAYHh57zH51TnsJK3Xpwg+1nVxKewkrAd7Fwco5\npKe7hpVSUlgoZJWL/nPq3ZsbjlCLg7y3rMJK1dV8HVIciov5d8mYMcCqVa6fj5k42Mk3SKKjueHt\n2hU47TTgv//1/F5jxZIMKw0aBLzxhvv2ZjmHrCxOyBpDRLIqyXj+8fH8vRrv3auuAi691P2YSUna\nmItTT7VfraRHv86ERC+MZuJgp0Mpx7tI5/Duu+xyA0EwxSEDQJ7u9/yfX9MzH8BwIUQhgB0A/hTE\n8wkY48a5TyJmhXIO4SUuzrtzMIpDQ4N7WCk2lhu+ujp359CnDzsH48RqwSYhgc/JeH36EdK9emni\nYJy9VYqDXedgh5gY/txkqe6tt3re3hhWkuIQHc35FSNmOYdt23gJUD0ydNTQwI5H3zmLi2PBMN7r\n99zjvh+AP5+dOzmketJJ/Jqv4iDHReiP6ck56BcI84QUnT59eNaBgoLAhJQAIJghHPK+Cf4BYDsR\nZQohBgNYKYQ4jYjcZrWZMGFC68+ZmZnI9HUR5jDRXsRB5hx+ac7BKqyUlsYPWkWF67rY5eXuCWkZ\nVvK26HygSUjgczKGe4ziIMMrUugkY8awm/AmDnaS0fpjy5li7WAlDlakpPA2cklbgHMnxkZdJp3N\nXI+Vc7BCTtnRrx83wnFx/olDXJzrd2UUh6FDtbEnvoSVoqNZNLkEdw127FgDXXPpN8EUhwIA+nk3\nTwC7Bz2/BvAcABDRISHEYQBDAWwx7mxCIK42DMjGItLDSr44B30pq68PSaixE1YqLgbeew847zx+\n+I1hJVkFk5KiTSanj4XHxvJx8vJCH1YyhpQAbaRydTULl1z1zSgOZ5zB/9sRB1/CStI52MFXcYiK\n0tZKkAMIt20D7rrLdTsZVjJes/ybmXOwQl57//58/CFD/AsrGT+TQOUcevfm8xo8GEhOzsQrr2S2\nVjlNnDjRtxPVEcyw0hYAJwohBgghOgO4BcBnhm32ArgEAIQQ6WBhyEYHor05h19aWGnAAO4RPv00\n8Oab/Jo+Ia1vXLp35zls5KhwPX36cGIw1GElM3Ho1IkFTDY4+rCSvqHs0YNj+2Y5B/3YA1/EQToH\nu/d7aqoWJnI67VV86UNLDgdPY24cZCcT0lbOwRdxkOIpHeRJJ/nvHPR4CyvZ+Qzj4/neA/g+PnAg\ncOvFBE0cfk4sPwRgBYDdAN4joj1CiPFCiJ8LvPA8gDOEEDsArALwOBGZzLnYfmkv4mBnnIOkI4WV\n+vblKRHuuksbiKTPOchqJUATB7PPqHdvHvAV6rCSmTjI8QMVFZo4OJ3mDeU557hWtsja+uZmYOZM\n/p59FQdfnIN+Gg+5YI+3xk1fsVRUpE1EqEc6B7Nz99U5GMVh0iTgppvsvVfSrZt7GxAo5yDFAbCf\nC7VDUMtGiehLAF8aXpun+7kUwDXBPIdwExcHXHNN5PewfXUOVVWRP84B8C4OksRErmMHXMc5xMVp\n4pCSojkHI336cOz7xBMDd+7euOYa8wQqwOEdKQ5yChEZn9bzn/+4X09sLN8Ljz/OAx99DStVVNgX\nh8GDtenDvYWUJPqKpcJCLuM0Ip1DRYW5c8jJsX/vyvtHDj70tFSqFSkp7nmbQIhDcnLwFs2K+DEF\n7Z2oKOAzYzAtAvHVOegX+4lkevSwV/OtX2VMn3Po0sW+czh0KLRhJVlia4bROZjF3gHz8EhsLK9r\n0NLC11RZqVXpeCMmhj83X5yDXKTGrjjow0qFha49Z/15yLWozZxDRYX9ezc6mq+/LWuQjBrl3g7o\ny3j9DSuNH+9eUBEolDgoAPiXc2gPYaXJk+2tOJeU5CoOSUncuBw75ioOO3ZYi0NDQ2jDSp6IieFB\ne97EwQy5vjXAvXpfS1kB+2FUf52DN3EAWAQKCtqecwDavpCOENqob/35yZHazc38/TQ2cgjQrnM
I\nZmdEza2kAKCthOVrQjrSxUGOT/CGdA5y3eiYGH4tKkp7AFNSOMZtFVYCIkcc5DXLsJI/4hATozkH\nX8JKgO85ByL7g8t69WJRAPh/q2nP4+P5722tVgoWQ4bwtZeXa5Mfyhl+7YpDMFHioACg9YZ9SUi3\nh7CSXaQ46AcfJSa6jiPwFlYCIkcc5NrHsrb+6FHfxGHXLmDsWN/FQd4PdsUhOZkb6dJS+85BzigL\neHcOhYXWYaVwi0N8POcLNm/W8hoykW43rBRMlDgoALiupWtn2/YSVrKLXhzkQ5mQ4NqgekpIS3EI\nZc7BEzExWi+8Wzceg+Grc7jsMv/FwZeGTboHu+Ige9yAZ3GIj7cOK0WCOAA8hmbdOu26pTgo56CI\nGPx1DpHwgAUCM3FITHRtULt3t07aR5pziI7WGpxu3XhGVuMEfVbExrIIXnwxv6+01Ddx6NLFdUI7\nb8i8g11xyMjQZsH1J+cgF/WJhHv39NOBb791dw4VFfZHpQcLJQ4KAL45B31CuqOFlfQTnpmJA2D+\nGcmS2UgRB71zkKO3fXEOAHDyyVy1dfSo/YYqOtp39ySdw6FD9gQsKkqbnrqoyLNzqKszdw5A5IjD\nli2u4lBby5+5r0vIBhpVraQAoPWGfRGH5ubIeMACgbz+ykrXnIO+BywbVyt3lZERWeIgG2kZVho3\nzt57Y2O5skbOjpqXZ69HbzyuXQYPBhYt4kqwLW4T51i/56efOIltNhAQ0M7DzDkAkXHvnn46F4IY\nncPRo5x4DyfKOSgAaA+Kr+McIuEBCxSJifxQWuUcZHmr1Wf0wQfA6NHBP087tDXnIBe9GTyYr9tu\nmMgfcRg0iFfZe+QR9/WbrRg8GPjuO+5dW60zoa8y0xNJzqFXLxZiozgUFYXfOShxUADwzzl0pLAS\nwA9ocbEmDklJrmGOqChuaKw+o+HDAzevTVuJjnYVh9JS38RBLnrDk7n5dlxfq2xGjACuvJJHZNtl\n8GCO1XtqQKUIGENikeQchGD3EInOQYWVFAB8dw6NjVybHgkPWKBISuLBVbJxe/hh9zESKSn2PqNw\nExOjhYJkI+mLOMgpKQYN8k0c/HEOPXsCX3zh23uGDOE1Fq6/3nobuaaE8R6NJOcAAGef7VqtVFnJ\n+S+rcFmoUOKgAOCfc7C7fXtBOgeZczCbs6Z79/ZxzcawEmBfHO69V2uYxo4Fbr/bdt+OGopx38GDu\nnFglowEWATNhiyTnAPCMwDI0FhfH8z717OlbxVcwUOKgAODfOIeoqI4dVjKjvTgHY1gJsC8O+iVw\n+/YFHn3Ut+OGQhwGDOAG1ZM4xMWZi4MU/0i5d43rWB86FP58A6ByDoqf+aWPcwDsiUP37u1DHPRh\nJSkOvoSH2nLcUIzs7dKFnZ0/4iCnRInEezcujkt0w51vAJQ4KH7Gn3EOv1RxaA/XbBznINdlDsVx\nQzVKfMwYz7PFWoWVACUOdlBhJQUArTdsx2rrE9KRYs0DgTHnYEZ6evhjwXYwhpXshpQCcdxQicOH\nH3r+e1qa+VoPAAtHpIrDsWNKHBQRROfO3NDbafg6d+Yy1o5WrZSYyJUinpzDY4+F7nzawkUXaQsB\nhVIcYmIip8Nwww3W1UyR7ByAyMg5KHFQAGA3YPdhkYnojjjOAfAsDpEysZ437r9f+zk+/pcpDkJY\nD5CLZOcAKOegiCA6d/Yt0dqlCzsHq4evPWJHHNojZ56plR4HmwsuiBxx8ESkOwclDoqIwRfnILd3\nOoN3PuFAjqQN91TJgWbAAPvTUrSVK64IzXHainIO3lHioADAD4ovD0vnztqqaR2FjuocFO6kpoam\ntNdX5OhtJQ6KiKFbN3tLNEq6dFHioGi/vPFGaEp7fSUujsuOI2F23wj8eBThYMgQnhnTLl26cEK6\nI6HE4ZdDpOZFkpPNp20JB0Gt2BZCjBNC7BVCHBBCPGGxTa
YQYpsQYpcQYk0wz0fhmfR0+9t26RK5\nD5i/SHHoaDkHRfvhpJOA9evDfRZM0JyDEKITgFcAXAKgAMBmIcRnRLRHt00ygDkALieifCFEarDO\nRxFYfE1gtweUc1BEApGSCwmmczgTwEEiyiEiB4B3AfzGsM3vASwhonwAIKLSIJ6PIoB0RHGIjfVv\nPQKFoiMSTHHIAJCn+z3/59f0nAiguxBitRBiixDiD0E8H0UA6YhhJSHYPShxUCiCm5AmG9vEABgF\n4GIAcQC+F0L8QEQHjBtOmDCh9efMzExkZmYG5iwVftERnQPAy3ymquCmop2yZs0arFmzJiD7EkR2\n2nA/dizE2QAmENG4n39/EoCTiKbotnkCQFcimvDz768DWE5EHxr2RcE6T4V/XH89UFYGrFsX7jNR\nKBRWCCFARH7NYxDMsNIWACcKIQYIIToDuAXAZ4ZtPgVwrhCikxAiDsBZAHYH8ZwUAaIjhpUUCoVG\n0MJKRNQshHgIwAoAnQC8QUR7hBDjf/77PCLaK4RYDmAnACeA+USkxKEd0FHDSgqFgglaWCmQqLBS\n5PHHP/LaB59+Gu4zUSgUVkRqWEnRgVFhJYWiY6PEQeEXKqykUHRslDgo/EKJg0LRsVHioPALFVZS\nKDo2ShwUfuHr+g8KhaJ9oabsVvjFwIGROR++QqEIDKqUVaFQKDooqpRVoVAoFAFFiYNCoVAo3FDi\noFAoFAo3lDgoFAqFwg0lDgqFQqFwQ4mDQqFQKNxQ4qBQKBQKN5Q4KBQKhcINJQ4KhUKhcEOJg0Kh\nUCjcUOKgUCgUCjeUOCgUCoXCDSUOCoVCoXBDiYNCoVAo3FDioFAoFAo3gioOQohxQoi9QogDQogn\nPGw3RgjRLMT/b+/uYuQq6ziOf3+ygqA1QGiqYmObWCglMfRCbKxbmpCUcqH1JYI1Ri4MaBBoTDCh\nXig3hjZEw4Wx8aUgqYqpL63FRKASihXEtbGvbpUQrQHBloteFI2k4M+L8wwc9sx0pu3M7uz297nZ\nOc+cfc6z/zxz/nPO2ed59PFBticiInozsOQg6SzgW8BKYBGwWtJlHfZbDzwEnNKiFHFyduzYMdVN\nmDESy/5KPIfHIK8crgSesX3I9nHgJ8CqNvvdCvwMeHGAbYmafAD7J7Hsr8RzeAwyOVwMPFvbfq6U\nvUbSxVQJY0MpylqgERFDYJDJoZcT/T3AHWWBaJHbShERQ0HVeXkAFUtLgDttryzba4H/2V5f2+dv\nvJ4QLgL+A9xoe9uEunJFERFxCmyf0pfuQSaHEeCvwNXA88AYsNr2wQ773wc8aPsXA2lQRET0bGRQ\nFdt+RdItwMPAWcBG2wclfb68/51BHTsiIk7PwK4cIiJi+hrqEdK9DqKLziQdkrRP0m5JY6XsQknb\nJT0t6RFJ5091O4eVpHslHZa0v1bWMX6S1pb++hdJK6am1cOpQyzvlPRc6Z+7JV1bey+xPAFJcyU9\nJunPkg5Iuq2U96V/Dm1y6HUQXXRlYLntxbavLGV3ANttXwI8Wrajvfuo+mBd2/hJWgRcT9VfVwLf\nljS0n7Ep0C6WBr5Z+udi27+GxLJHx4Ev2b4cWAJ8sZwj+9I/hznYvQ6ii+4m/rfCR4D7y+v7gY9O\nbnOmD9s7gaMTijvFbxXwgO3jtg8Bz1D146BjLKH9v7Anll3Y/pftPeX1S8BBqrFkfemfw5wcug6i\ni54Y+I2kXZJuLGVzbB8urw8Dc6amadNWp/i9i6qftqTP9uZWSXslbazdAkksT4KkecBi4A/0qX8O\nc3LIk/L+WGp7MXAt1WXnaP3NMgAxsT5FPcQvsT2xDcB84ArgBeAbJ9g3sWxD0tuAnwNrbB+rv3c6\n/XOYk8M/gbm17bm8MetFD2y/UH6+CGyhuow8LOkdAJLeCRyZuhZOS53iN7HPvruURQe2j7gAvs/r\ntzkSyx5IejNVYthke2
sp7kv/HObksAtYIGmepLOpHqRs6/I7USPpPEmzyuu3AiuA/VRxvKHsdgOw\ntX0N0UGn+G0DPiXpbEnzgQVUgz+jg3LyavkYVf+ExLIrSQI2AuO276m91Zf+ObBBcKer0yC6KW7W\ndDMH2FL1IUaAH9l+RNIuYLOkzwGHgOumronDTdIDwFXARZKeBb4KrKNN/GyPS9oMjAOvADc7A4le\n0yaWXwOWS7qC6vbG34HWINnEsrulwGeAfZJ2l7K19Kl/ZhBcREQ0DPNtpYiImCJJDhER0ZDkEBER\nDUkOERHRkOQQERENSQ4REdGQ5BAzlqSXys/3SFrd57q/MmH7iT7Xf6mkH6jyZD/rjuhFkkPMZK1B\nPPOBT5/ML5Zlbk9k7RsOZC89mfp7MAr8FngfcKDPdUd0leQQZ4J1wGhZTGaNpDdJulvSWJkN9CYA\nScsl7ZT0S8oJWdLWMqPtgdastpLWAeeW+jaVstZVikrd+1UtsnRdre4dkn4q6aCkH7ZrqKTRMtp1\nPXA78CvgGpWFmiImS0ZIx4wl6ZjtWZKuAm63/eFSfhMw2/bXJZ0D/A74JDCP6mR8ue1/lH0vsH1U\n0rlU89AsK9vHbM9qc6xPUE0BcQ0wG/gj8AFgIdUcN4uoZh99Aviy7ba3oyQ9afuDku4F7s7UMTHZ\ncuUQZ4KJi8msAD5bvqE/BVwIvLe8N9ZKDMUaSXuA31PNaLmgy7E+BPy4TDR6BHgceD/VLa4x28+X\n+Wz2UCWjZmOl84CXy+YC4Onuf2JEfw3txHsRA3aL7e31AknLgX9P2L4aWGL7v5IeA97SpV7TTEat\ny/OXa2Wv0ubzV25pLQTOl7SXKoHsknSX7c1djh3RN7lyiDPBMWBWbfth4ObWQ2dJl5Rv6xO9HTha\nEsNCqnV6W453eGi9E7i+PNeYDSyjuh3VbinMBturgO8BXwBuAzaUtZWTGGJSJTnETNb6xr4XeFXS\nHklrqBaVGQf+JGk/1WpkI2X/+kO4h4ARSePAXVS3llq+SzVV8qb6sWxvAfaVYz5K9VzhSJu6abPd\nsozqmcQo1W2piEmXB9IREdGQK4eIiGhIcoiIiIYkh4iIaEhyiIiIhiSHiIhoSHKIiIiGJIeIiGhI\ncoiIiIb/AxSD6Sq0YLMCAAAAAElFTkSuQmCC\n", "text/plain": [ "" ] diff --git a/examples/detection.ipynb b/examples/detection.ipynb index 6a03c996245..36f419f834e 100644 --- a/examples/detection.ipynb +++ b/examples/detection.ipynb @@ -1806,7 +1806,7 @@ "dLvdaza+eH10u11LmYWWOe5QCjKDTnJ9fd3iLsrlslKplIbDoSlIQDLG7RMQD4AzozpHpYJ/HfU4\n", "aV2Li4tGanrT61YNTVgYUCsdx9H8/Lw1K3BtUTQsLi6aQThaQZosHOXn5uZMv5ZIJLS7u2s7PU0Z\n", "qmdqZmpRFiA7FaPuyclJ86qD7ww/hIEGqabjkB8BnKhKxnfLy8tL87igJ4AZyOtA5GHRj9vzIlJg\n", - "kWIOiXHO8vKywXfU/XiLzM3NmXqFwQeuUdJVPuN4LiGYfyqVMgHxcDjU2traje7/rdqZganm5+dt\n", + "kWIOiXHO8vKywXfU/XiLzM3NmXqFwQeuUdJVPuN4LiGYfyqVMgHxcDjU2traje7/rdqZganm5+bdt\n", "6kT2M0GPHN00c/l83gYY9XpdpVLJ0I5ut2v13Dh0BfmdLp8d/fT0VOVy2WrdcDis8/Nz40wwZOl2\n", "uzbuHvdwAxtnOrm3t6dyuWziV5Qf8DtOT08Vi8WMQffy5Ut7T9LVWBu73mazqVwuZycJtlrS1QNU\n", 
"KBTsBKnVauYeSkNNn8FpVq/Xbdw9GAz08uVLM1qv1WrGQDw5OTGolO+gVCoZlo76u9Pp6MWLFze6\n", diff --git a/examples/faceboxes/SSD.prototxt b/examples/faceboxes/SSD.prototxt new file mode 100644 index 00000000000..5b73fb021d4 --- /dev/null +++ b/examples/faceboxes/SSD.prototxt @@ -0,0 +1,1540 @@ +name: "VGG_WIDER_FACE_SFD_deploy" + +#default_forward_type: FLOAT16 +#default_backward_type: FLOAT16 +#default_forward_math: FLOAT16 +#default_backward_math: FLOAT16 + +#global_grad_scale: 10 +#global_grad_scale_adaptive: true + +input: "data" +input_shape { + dim: 1 + dim: 3 + dim: 1080 + dim: 1920 +} +layer { + name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: 
"conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + name: "pool3" + type: "Pooling" + bottom: "conv3_3" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3" 
+ top: "conv4_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu4_3" + type: "ReLU" + bottom: "conv4_3" + top: "conv4_3" +} +layer { + name: "pool4" + type: "Pooling" + bottom: "conv4_3" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + dilation: 1 + } +} +layer { + name: "relu5_1" + type: "ReLU" + bottom: "conv5_1" + top: "conv5_1" +} +layer { + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + top: "conv5_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + 
weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + dilation: 1 + } +} +layer { + name: "relu5_2" + type: "ReLU" + bottom: "conv5_2" + top: "conv5_2" +} +layer { + name: "conv5_3" + type: "Convolution" + bottom: "conv5_2" + top: "conv5_3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + dilation: 1 + } +} +layer { + name: "relu5_3" + type: "ReLU" + bottom: "conv5_3" + top: "conv5_3" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5_3" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "fc6" + type: "Convolution" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 1024 + pad: 3 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + dilation: 1 + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "Convolution" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 1024 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "conv6_1" + type: "Convolution" + bottom: "fc7" + top: "conv6_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_1_relu" + type: "ReLU" + bottom: "conv6_1" + top: "conv6_1" +} +layer { 
+ name: "conv6_2" + type: "Convolution" + bottom: "conv6_1" + top: "conv6_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_relu" + type: "ReLU" + bottom: "conv6_2" + top: "conv6_2" +} +layer { + name: "conv7_1" + type: "Convolution" + bottom: "conv6_2" + top: "conv7_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_1_relu" + type: "ReLU" + bottom: "conv7_1" + top: "conv7_1" +} +layer { + name: "conv7_2" + type: "Convolution" + bottom: "conv7_1" + top: "conv7_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_relu" + type: "ReLU" + bottom: "conv7_2" + top: "conv7_2" +} +layer { + name: "conv3_3_norm" + type: "Normalize" + bottom: "conv3_3" + top: "conv3_3_norm" + norm_param { + across_spatial: false + scale_filler { + type: "constant" + value: 10 + } + channel_shared: false + } +} +layer { + name: "conv3_3_norm_mbox_loc" + type: "Convolution" + bottom: "conv3_3_norm" + top: "conv3_3_norm_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 4 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv3_3_norm_mbox_loc_perm" + type: "Permute" + bottom: "conv3_3_norm_mbox_loc" + top: 
"conv3_3_norm_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv3_3_norm_mbox_loc_flat" + type: "Flatten" + bottom: "conv3_3_norm_mbox_loc_perm" + top: "conv3_3_norm_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv3_3_norm_mbox_conf" + type: "Convolution" + bottom: "conv3_3_norm" + top: "conv3_3_norm_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 4 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv3_3_norm_mbox_conf_slice" + type: "Slice" + bottom: "conv3_3_norm_mbox_conf" + top: "conv3_3_norm_mbox_conf1" + top: "conv3_3_norm_mbox_conf2" + top: "conv3_3_norm_mbox_conf3" + top: "conv3_3_norm_mbox_conf4" + slice_param { + axis: 1 + slice_point: 1 + slice_point: 2 + slice_point: 3 + } +} +layer { + name: "conv3_3_norm_mbox_conf_maxout" + type: "Eltwise" + bottom: "conv3_3_norm_mbox_conf1" + bottom: "conv3_3_norm_mbox_conf2" + bottom: "conv3_3_norm_mbox_conf3" + top: "conv3_3_norm_mbox_conf_maxout" + eltwise_param { + operation: MAX + } +} +layer { + name: "conv3_3_norm_mbox_conf_out" + type: "Concat" + bottom: "conv3_3_norm_mbox_conf_maxout" + bottom: "conv3_3_norm_mbox_conf4" + top: "conv3_3_norm_mbox_conf_out" + concat_param { + axis: 1 + } +} +layer { + name: "conv3_3_norm_mbox_conf_perm" + type: "Permute" + bottom: "conv3_3_norm_mbox_conf_out" + top: "conv3_3_norm_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv3_3_norm_mbox_conf_flat" + type: "Flatten" + bottom: "conv3_3_norm_mbox_conf_perm" + top: "conv3_3_norm_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv3_3_norm_mbox_priorbox" + type: "PriorBox" + bottom: "conv3_3_norm" + bottom: "data" + top: "conv3_3_norm_mbox_priorbox" + prior_box_param { + min_size: 16 + 
clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 4 + offset: 0.5 + } +} +layer { + name: "conv4_3_norm" + type: "Normalize" + bottom: "conv4_3" + top: "conv4_3_norm" + norm_param { + across_spatial: false + scale_filler { + type: "constant" + value: 8 + } + channel_shared: false + } +} +layer { + name: "conv4_3_norm_mbox_loc" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 4 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv4_3_norm_mbox_loc_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_loc" + top: "conv4_3_norm_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_loc_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_loc_perm" + top: "conv4_3_norm_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 2 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv4_3_norm_mbox_conf_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_conf" + top: "conv4_3_norm_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_conf_perm" + top: "conv4_3_norm_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_priorbox" + type: "PriorBox" + bottom: "conv4_3_norm" + bottom: "data" + top: "conv4_3_norm_mbox_priorbox" + prior_box_param 
{ + min_size: 32 + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 8 + offset: 0.5 + } +} +layer { + name: "conv5_3_norm" + type: "Normalize" + bottom: "conv5_3" + top: "conv5_3_norm" + norm_param { + across_spatial: false + scale_filler { + type: "constant" + value: 5 + } + channel_shared: false + } +} +layer { + name: "conv5_3_norm_mbox_loc" + type: "Convolution" + bottom: "conv5_3_norm" + top: "conv5_3_norm_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 4 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv5_3_norm_mbox_loc_perm" + type: "Permute" + bottom: "conv5_3_norm_mbox_loc" + top: "conv5_3_norm_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv5_3_norm_mbox_loc_flat" + type: "Flatten" + bottom: "conv5_3_norm_mbox_loc_perm" + top: "conv5_3_norm_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv5_3_norm_mbox_conf" + type: "Convolution" + bottom: "conv5_3_norm" + top: "conv5_3_norm_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 2 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv5_3_norm_mbox_conf_perm" + type: "Permute" + bottom: "conv5_3_norm_mbox_conf" + top: "conv5_3_norm_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv5_3_norm_mbox_conf_flat" + type: "Flatten" + bottom: "conv5_3_norm_mbox_conf_perm" + top: "conv5_3_norm_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv5_3_norm_mbox_priorbox" + type: "PriorBox" + bottom: "conv5_3_norm" + bottom: "data" + top: "conv5_3_norm_mbox_priorbox" 
+ prior_box_param { + min_size: 64 + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 16 + offset: 0.5 + } +} +layer { + name: "fc7_mbox_loc" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 4 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "fc7_mbox_loc_perm" + type: "Permute" + bottom: "fc7_mbox_loc" + top: "fc7_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_loc_flat" + type: "Flatten" + bottom: "fc7_mbox_loc_perm" + top: "fc7_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_conf" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 2 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "fc7_mbox_conf_perm" + type: "Permute" + bottom: "fc7_mbox_conf" + top: "fc7_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_conf_flat" + type: "Flatten" + bottom: "fc7_mbox_conf_perm" + top: "fc7_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_priorbox" + type: "PriorBox" + bottom: "fc7" + bottom: "data" + top: "fc7_mbox_priorbox" + prior_box_param { + min_size: 128 + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 32 + offset: 0.5 + } +} +layer { + name: "conv6_2_mbox_loc" + type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 4 + pad: 1 + 
kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_mbox_loc_perm" + type: "Permute" + bottom: "conv6_2_mbox_loc" + top: "conv6_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv6_2_mbox_loc_perm" + top: "conv6_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_conf" + type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 2 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_mbox_conf_perm" + type: "Permute" + bottom: "conv6_2_mbox_conf" + top: "conv6_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv6_2_mbox_conf_perm" + top: "conv6_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv6_2" + bottom: "data" + top: "conv6_2_mbox_priorbox" + prior_box_param { + min_size: 256 + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 64 + offset: 0.5 + } +} +layer { + name: "conv7_2_mbox_loc" + type: "Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 4 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_mbox_loc_perm" + type: "Permute" + bottom: "conv7_2_mbox_loc" + top: "conv7_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + 
order: 1 + } +} +layer { + name: "conv7_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv7_2_mbox_loc_perm" + top: "conv7_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_conf" + type: "Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 2 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_mbox_conf_perm" + type: "Permute" + bottom: "conv7_2_mbox_conf" + top: "conv7_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv7_2_mbox_conf_perm" + top: "conv7_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv7_2" + bottom: "data" + top: "conv7_2_mbox_priorbox" + prior_box_param { + min_size: 512 + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 128 + offset: 0.5 + } +} +layer { + name: "mbox_loc" + type: "Concat" + bottom: "conv3_3_norm_mbox_loc_flat" + bottom: "conv4_3_norm_mbox_loc_flat" + bottom: "conv5_3_norm_mbox_loc_flat" + bottom: "fc7_mbox_loc_flat" + bottom: "conv6_2_mbox_loc_flat" + bottom: "conv7_2_mbox_loc_flat" + top: "mbox_loc" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_conf" + type: "Concat" + bottom: "conv3_3_norm_mbox_conf_flat" + bottom: "conv4_3_norm_mbox_conf_flat" + bottom: "conv5_3_norm_mbox_conf_flat" + bottom: "fc7_mbox_conf_flat" + bottom: "conv6_2_mbox_conf_flat" + bottom: "conv7_2_mbox_conf_flat" + top: "mbox_conf" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_priorbox" + type: "Concat" + bottom: "conv3_3_norm_mbox_priorbox" + bottom: "conv4_3_norm_mbox_priorbox" + bottom: "conv5_3_norm_mbox_priorbox" + bottom: "fc7_mbox_priorbox" + 
bottom: "conv6_2_mbox_priorbox" + bottom: "conv7_2_mbox_priorbox" + top: "mbox_priorbox" + concat_param { + axis: 2 + } +} +layer { + name: "mbox_conf_reshape" + type: "Reshape" + bottom: "mbox_conf" + top: "mbox_conf_reshape" + reshape_param { + shape { + dim: 0 + dim: -1 + dim: 2 + } + } +} +layer { + name: "mbox_conf_softmax" + type: "Softmax" + bottom: "mbox_conf_reshape" + top: "mbox_conf_softmax" + softmax_param { + axis: 2 + } +} +layer { + name: "mbox_conf_flatten" + type: "Flatten" + bottom: "mbox_conf_softmax" + top: "mbox_conf_flatten" + flatten_param { + axis: 1 + } +} +layer { + name: "detection_out" + type: "DetectionOutput" + bottom: "mbox_loc" + bottom: "mbox_conf_flatten" + bottom: "mbox_priorbox" + top: "detection_out" + include { + phase: TEST + } + detection_output_param { + num_classes: 2 + share_location: true + background_label_id: 0 + nms_param { + nms_threshold: 0.3 + top_k: 5000 + } + code_type: CENTER_SIZE + keep_top_k: 750 + confidence_threshold: 0.6 + } +} + diff --git a/examples/faceboxes/faceboxes_test.py b/examples/faceboxes/faceboxes_test.py new file mode 100644 index 00000000000..ace7b1c91c8 --- /dev/null +++ b/examples/faceboxes/faceboxes_test.py @@ -0,0 +1,61 @@ +import numpy as np +import sys, os +import cv2 + +sys.path.insert(0, '../../python') +import caffe +import time + +net_file = 'SSD.prototxt' +caffe_model = 'SSD.caffemodel' +test_dir = "images" + +if not os.path.exists(caffe_model): + print("SSD.caffemodel does not exist, see https://github.com/sfzhang15/SFD") + exit() +caffe.set_mode_gpu() +net = caffe.Net(net_file, caffe_model, caffe.TEST) + +CLASSES = ('background', + 'face') + +transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape}) +transformer.set_transpose('data', (2, 0, 1)) +transformer.set_mean('data', np.array([104, 117, 123])) # mean pixel + + +def postprocess(img, out): + h = img.shape[0] + w = img.shape[1] + box = out['detection_out'][0, 0, :, 3:7] * np.array([w, h, w, h]) + cls = 
out['detection_out'][0, 0, :, 1] + conf = out['detection_out'][0, 0, :, 2] + return (box.astype(np.int32), conf, cls) + + +def detect(imgfile): + frame = cv2.imread(imgfile) + transformed_image = transformer.preprocess('data', frame) + net.blobs['data'].data[...] = transformed_image + time_start = time.time() + out = net.forward() + time_end = time.time() + print (time_end - time_start), + print ("s") + + box, conf, cls = postprocess(frame, out) + + for i in range(len(box)): + p1 = (box[i][0], box[i][1]) + p2 = (box[i][2], box[i][3]) + cv2.rectangle(frame, p1, p2, (0, 255, 0)) + p3 = (max(p1[0], 15), max(p1[1], 15)) + title = "%s:%.2f" % (CLASSES[int(cls[i])], conf[i]) + cv2.putText(frame, title, p3, cv2.FONT_ITALIC, 0.6, (0, 255, 0), 1) + cv2.imshow("SSD, %d boxes" % len(box), frame) + cv2.waitKey() + # if cv2.waitKey(100) & 0xFF == ord('q'): + # break + + +detect("pepper.jpg") diff --git a/examples/faceboxes/pepper.jpg b/examples/faceboxes/pepper.jpg new file mode 100644 index 00000000000..c019ed4651b Binary files /dev/null and b/examples/faceboxes/pepper.jpg differ diff --git a/examples/ssd/model_libs.py b/examples/ssd/model_libs.py new file mode 100644 index 00000000000..84c8bf27d71 --- /dev/null +++ b/examples/ssd/model_libs.py @@ -0,0 +1,933 @@ +import os + +import caffe +from caffe import layers as L +from caffe import params as P +from caffe.proto import caffe_pb2 + +def check_if_exist(path): + return os.path.exists(path) + +def make_if_not_exist(path): + if not os.path.exists(path): + os.makedirs(path) + +def UnpackVariable(var, num): + assert len > 0 + if type(var) is list and len(var) == num: + return var + else: + ret = [] + if type(var) is list: + assert len(var) == 1 + for i in xrange(0, num): + ret.append(var[0]) + else: + for i in xrange(0, num): + ret.append(var) + return ret + +def ConvBNLayer(net, from_layer, out_layer, use_bn, use_relu, num_output, + kernel_size, pad, stride, dilation=1, use_scale=True, lr_mult=1, + conv_prefix='', 
conv_postfix='', bn_prefix='', bn_postfix='_bn', + scale_prefix='', scale_postfix='_scale', bias_prefix='', bias_postfix='_bias', + **bn_params): + if use_bn: + # parameters for convolution layer with batchnorm. + kwargs = { + 'param': [dict(lr_mult=lr_mult, decay_mult=1)], + 'weight_filler': dict(type='gaussian', std=0.01), + 'bias_term': False, + } + eps = bn_params.get('eps', 0.001) + moving_average_fraction = bn_params.get('moving_average_fraction', 0.999) + use_global_stats = bn_params.get('use_global_stats', False) + # parameters for batchnorm layer. + bn_kwargs = { + 'param': [ + dict(lr_mult=0, decay_mult=0), + dict(lr_mult=0, decay_mult=0), + dict(lr_mult=0, decay_mult=0)], + 'eps': eps, + 'moving_average_fraction': moving_average_fraction, + } + bn_lr_mult = lr_mult + if use_global_stats: + # only specify if use_global_stats is explicitly provided; + # otherwise, use_global_stats_ = this->phase_ == TEST; + bn_kwargs = { + 'param': [ + dict(lr_mult=0, decay_mult=0), + dict(lr_mult=0, decay_mult=0), + dict(lr_mult=0, decay_mult=0)], + 'eps': eps, + 'use_global_stats': use_global_stats, + } + # not updating scale/bias parameters + bn_lr_mult = 0 + # parameters for scale bias layer after batchnorm. 
+ if use_scale: + sb_kwargs = { + 'bias_term': True, + 'param': [ + dict(lr_mult=bn_lr_mult, decay_mult=0), + dict(lr_mult=bn_lr_mult, decay_mult=0)], + 'filler': dict(type='constant', value=1.0), + 'bias_filler': dict(type='constant', value=0.0), + } + else: + bias_kwargs = { + 'param': [dict(lr_mult=bn_lr_mult, decay_mult=0)], + 'filler': dict(type='constant', value=0.0), + } + else: + kwargs = { + 'param': [ + dict(lr_mult=lr_mult, decay_mult=1), + dict(lr_mult=2 * lr_mult, decay_mult=0)], + 'weight_filler': dict(type='xavier'), + 'bias_filler': dict(type='constant', value=0) + } + + conv_name = '{}{}{}'.format(conv_prefix, out_layer, conv_postfix) + [kernel_h, kernel_w] = UnpackVariable(kernel_size, 2) + [pad_h, pad_w] = UnpackVariable(pad, 2) + [stride_h, stride_w] = UnpackVariable(stride, 2) + if kernel_h == kernel_w: + net[conv_name] = L.Convolution(net[from_layer], num_output=num_output, + kernel_size=kernel_h, pad=pad_h, stride=stride_h, **kwargs) + else: + net[conv_name] = L.Convolution(net[from_layer], num_output=num_output, + kernel_h=kernel_h, kernel_w=kernel_w, pad_h=pad_h, pad_w=pad_w, + stride_h=stride_h, stride_w=stride_w, **kwargs) + if dilation > 1: + net.update(conv_name, {'dilation': dilation}) + if use_bn: + bn_name = '{}{}{}'.format(bn_prefix, out_layer, bn_postfix) + net[bn_name] = L.BatchNorm(net[conv_name], in_place=True, **bn_kwargs) + if use_scale: + sb_name = '{}{}{}'.format(scale_prefix, out_layer, scale_postfix) + net[sb_name] = L.Scale(net[bn_name], in_place=True, **sb_kwargs) + else: + bias_name = '{}{}{}'.format(bias_prefix, out_layer, bias_postfix) + net[bias_name] = L.Bias(net[bn_name], in_place=True, **bias_kwargs) + if use_relu: + relu_name = '{}_relu'.format(conv_name) + net[relu_name] = L.ReLU(net[conv_name], in_place=True) + +def ResBody(net, from_layer, block_name, out2a, out2b, out2c, stride, use_branch1, dilation=1, **bn_param): + # ResBody(net, 'pool1', '2a', 64, 64, 256, 1, True) + + conv_prefix = 
'res{}_'.format(block_name) + conv_postfix = '' + bn_prefix = 'bn{}_'.format(block_name) + bn_postfix = '' + scale_prefix = 'scale{}_'.format(block_name) + scale_postfix = '' + use_scale = True + + if use_branch1: + branch_name = 'branch1' + ConvBNLayer(net, from_layer, branch_name, use_bn=True, use_relu=False, + num_output=out2c, kernel_size=1, pad=0, stride=stride, use_scale=use_scale, + conv_prefix=conv_prefix, conv_postfix=conv_postfix, + bn_prefix=bn_prefix, bn_postfix=bn_postfix, + scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param) + branch1 = '{}{}'.format(conv_prefix, branch_name) + else: + branch1 = from_layer + + branch_name = 'branch2a' + ConvBNLayer(net, from_layer, branch_name, use_bn=True, use_relu=True, + num_output=out2a, kernel_size=1, pad=0, stride=stride, use_scale=use_scale, + conv_prefix=conv_prefix, conv_postfix=conv_postfix, + bn_prefix=bn_prefix, bn_postfix=bn_postfix, + scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param) + out_name = '{}{}'.format(conv_prefix, branch_name) + + branch_name = 'branch2b' + if dilation == 1: + ConvBNLayer(net, out_name, branch_name, use_bn=True, use_relu=True, + num_output=out2b, kernel_size=3, pad=1, stride=1, use_scale=use_scale, + conv_prefix=conv_prefix, conv_postfix=conv_postfix, + bn_prefix=bn_prefix, bn_postfix=bn_postfix, + scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param) + else: + pad = int((3 + (dilation - 1) * 2) - 1) / 2 + ConvBNLayer(net, out_name, branch_name, use_bn=True, use_relu=True, + num_output=out2b, kernel_size=3, pad=pad, stride=1, use_scale=use_scale, + dilation=dilation, conv_prefix=conv_prefix, conv_postfix=conv_postfix, + bn_prefix=bn_prefix, bn_postfix=bn_postfix, + scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param) + out_name = '{}{}'.format(conv_prefix, branch_name) + + branch_name = 'branch2c' + ConvBNLayer(net, out_name, branch_name, use_bn=True, use_relu=False, + num_output=out2c, kernel_size=1, pad=0, 
stride=1, use_scale=use_scale, + conv_prefix=conv_prefix, conv_postfix=conv_postfix, + bn_prefix=bn_prefix, bn_postfix=bn_postfix, + scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param) + branch2 = '{}{}'.format(conv_prefix, branch_name) + + res_name = 'res{}'.format(block_name) + net[res_name] = L.Eltwise(net[branch1], net[branch2]) + relu_name = '{}_relu'.format(res_name) + net[relu_name] = L.ReLU(net[res_name], in_place=True) + + +def InceptionTower(net, from_layer, tower_name, layer_params, **bn_param): + use_scale = False + for param in layer_params: + tower_layer = '{}/{}'.format(tower_name, param['name']) + del param['name'] + if 'pool' in tower_layer: + net[tower_layer] = L.Pooling(net[from_layer], **param) + else: + param.update(bn_param) + ConvBNLayer(net, from_layer, tower_layer, use_bn=True, use_relu=True, + use_scale=use_scale, **param) + from_layer = tower_layer + return net[from_layer] + +def CreateAnnotatedDataLayer(source, batch_size=32, backend=P.Data.LMDB, + output_label=True, train=True, label_map_file='', anno_type=None, + transform_param={}, batch_sampler=[{}]): + if train: + kwargs = { + 'include': dict(phase=caffe_pb2.Phase.Value('TRAIN')), + 'transform_param': transform_param, + } + else: + kwargs = { + 'include': dict(phase=caffe_pb2.Phase.Value('TEST')), + 'transform_param': transform_param, + } + ntop = 1 + if output_label: + ntop = 2 + annotated_data_param = { + 'label_map_file': label_map_file, + 'batch_sampler': batch_sampler, + } + if anno_type is not None: + annotated_data_param.update({'anno_type': anno_type}) + return L.AnnotatedData(name="data", annotated_data_param=annotated_data_param, + data_param=dict(batch_size=batch_size, backend=backend, source=source), + ntop=ntop, **kwargs) + + +def ZFNetBody(net, from_layer, need_fc=True, fully_conv=False, reduced=False, + dilated=False, dropout=True, need_fc8=False, freeze_layers=[]): + kwargs = { + 'param': [dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)], 
+ 'weight_filler': dict(type='xavier'), + 'bias_filler': dict(type='constant', value=0)} + + assert from_layer in net.keys() + net.conv1 = L.Convolution(net[from_layer], num_output=96, pad=3, kernel_size=7, stride=2, **kwargs) + net.relu1 = L.ReLU(net.conv1, in_place=True) + + net.norm1 = L.LRN(net.relu1, local_size=3, alpha=0.00005, beta=0.75, + norm_region=P.LRN.WITHIN_CHANNEL, engine=P.LRN.CAFFE) + + net.pool1 = L.Pooling(net.norm1, pool=P.Pooling.MAX, pad=1, kernel_size=3, stride=2) + + net.conv2 = L.Convolution(net.pool1, num_output=256, pad=2, kernel_size=5, stride=2, **kwargs) + net.relu2 = L.ReLU(net.conv2, in_place=True) + + net.norm2 = L.LRN(net.relu2, local_size=3, alpha=0.00005, beta=0.75, + norm_region=P.LRN.WITHIN_CHANNEL, engine=P.LRN.CAFFE) + + net.pool2 = L.Pooling(net.norm2, pool=P.Pooling.MAX, pad=1, kernel_size=3, stride=2) + + net.conv3 = L.Convolution(net.pool2, num_output=384, pad=1, kernel_size=3, **kwargs) + net.relu3 = L.ReLU(net.conv3, in_place=True) + net.conv4 = L.Convolution(net.relu3, num_output=384, pad=1, kernel_size=3, **kwargs) + net.relu4 = L.ReLU(net.conv4, in_place=True) + net.conv5 = L.Convolution(net.relu4, num_output=256, pad=1, kernel_size=3, **kwargs) + net.relu5 = L.ReLU(net.conv5, in_place=True) + + if need_fc: + if dilated: + name = 'pool5' + net[name] = L.Pooling(net.relu5, pool=P.Pooling.MAX, pad=1, kernel_size=3, stride=1) + else: + name = 'pool5' + net[name] = L.Pooling(net.relu5, pool=P.Pooling.MAX, pad=1, kernel_size=3, stride=2) + + if fully_conv: + if dilated: + if reduced: + net.fc6 = L.Convolution(net[name], num_output=1024, pad=5, kernel_size=3, dilation=5, **kwargs) + else: + net.fc6 = L.Convolution(net[name], num_output=4096, pad=5, kernel_size=6, dilation=2, **kwargs) + else: + if reduced: + net.fc6 = L.Convolution(net[name], num_output=1024, pad=2, kernel_size=3, dilation=2, **kwargs) + else: + net.fc6 = L.Convolution(net[name], num_output=4096, pad=2, kernel_size=6, **kwargs) + + net.relu6 = 
L.ReLU(net.fc6, in_place=True) + if dropout: + net.drop6 = L.Dropout(net.relu6, dropout_ratio=0.5, in_place=True) + + if reduced: + net.fc7 = L.Convolution(net.relu6, num_output=1024, kernel_size=1, **kwargs) + else: + net.fc7 = L.Convolution(net.relu6, num_output=4096, kernel_size=1, **kwargs) + net.relu7 = L.ReLU(net.fc7, in_place=True) + if dropout: + net.drop7 = L.Dropout(net.relu7, dropout_ratio=0.5, in_place=True) + else: + net.fc6 = L.InnerProduct(net.pool5, num_output=4096) + net.relu6 = L.ReLU(net.fc6, in_place=True) + if dropout: + net.drop6 = L.Dropout(net.relu6, dropout_ratio=0.5, in_place=True) + net.fc7 = L.InnerProduct(net.relu6, num_output=4096) + net.relu7 = L.ReLU(net.fc7, in_place=True) + if dropout: + net.drop7 = L.Dropout(net.relu7, dropout_ratio=0.5, in_place=True) + if need_fc8: + from_layer = net.keys()[-1] + if fully_conv: + net.fc8 = L.Convolution(net[from_layer], num_output=1000, kernel_size=1, **kwargs) + else: + net.fc8 = L.InnerProduct(net[from_layer], num_output=1000) + net.prob = L.Softmax(net.fc8) + + # Update freeze layers. 
+ kwargs['param'] = [dict(lr_mult=0, decay_mult=0), dict(lr_mult=0, decay_mult=0)] + layers = net.keys() + for freeze_layer in freeze_layers: + if freeze_layer in layers: + net.update(freeze_layer, kwargs) + + return net + + +def VGGNetBody(net, from_layer, need_fc=True, fully_conv=False, reduced=False, + dilated=False, nopool=False, dropout=True, freeze_layers=[], dilate_pool4=False): + kwargs = { + 'param': [dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)], + 'weight_filler': dict(type='xavier'), + 'bias_filler': dict(type='constant', value=0)} + + assert from_layer in net.keys() + net.conv1_1 = L.Convolution(net[from_layer], num_output=64, pad=1, kernel_size=3, **kwargs) + + net.relu1_1 = L.ReLU(net.conv1_1, in_place=True) + net.conv1_2 = L.Convolution(net.relu1_1, num_output=64, pad=1, kernel_size=3, **kwargs) + net.relu1_2 = L.ReLU(net.conv1_2, in_place=True) + + if nopool: + name = 'conv1_3' + net[name] = L.Convolution(net.relu1_2, num_output=64, pad=1, kernel_size=3, stride=2, **kwargs) + else: + name = 'pool1' + net.pool1 = L.Pooling(net.relu1_2, pool=P.Pooling.MAX, kernel_size=2, stride=2) + + net.conv2_1 = L.Convolution(net[name], num_output=128, pad=1, kernel_size=3, **kwargs) + net.relu2_1 = L.ReLU(net.conv2_1, in_place=True) + net.conv2_2 = L.Convolution(net.relu2_1, num_output=128, pad=1, kernel_size=3, **kwargs) + net.relu2_2 = L.ReLU(net.conv2_2, in_place=True) + + if nopool: + name = 'conv2_3' + net[name] = L.Convolution(net.relu2_2, num_output=128, pad=1, kernel_size=3, stride=2, **kwargs) + else: + name = 'pool2' + net[name] = L.Pooling(net.relu2_2, pool=P.Pooling.MAX, kernel_size=2, stride=2) + + net.conv3_1 = L.Convolution(net[name], num_output=256, pad=1, kernel_size=3, **kwargs) + net.relu3_1 = L.ReLU(net.conv3_1, in_place=True) + net.conv3_2 = L.Convolution(net.relu3_1, num_output=256, pad=1, kernel_size=3, **kwargs) + net.relu3_2 = L.ReLU(net.conv3_2, in_place=True) + net.conv3_3 = L.Convolution(net.relu3_2, num_output=256, 
pad=1, kernel_size=3, **kwargs) + net.relu3_3 = L.ReLU(net.conv3_3, in_place=True) + + if nopool: + name = 'conv3_4' + net[name] = L.Convolution(net.relu3_3, num_output=256, pad=1, kernel_size=3, stride=2, **kwargs) + else: + name = 'pool3' + net[name] = L.Pooling(net.relu3_3, pool=P.Pooling.MAX, kernel_size=2, stride=2) + + net.conv4_1 = L.Convolution(net[name], num_output=512, pad=1, kernel_size=3, **kwargs) + net.relu4_1 = L.ReLU(net.conv4_1, in_place=True) + net.conv4_2 = L.Convolution(net.relu4_1, num_output=512, pad=1, kernel_size=3, **kwargs) + net.relu4_2 = L.ReLU(net.conv4_2, in_place=True) + net.conv4_3 = L.Convolution(net.relu4_2, num_output=512, pad=1, kernel_size=3, **kwargs) + net.relu4_3 = L.ReLU(net.conv4_3, in_place=True) + + if nopool: + name = 'conv4_4' + net[name] = L.Convolution(net.relu4_3, num_output=512, pad=1, kernel_size=3, stride=2, **kwargs) + else: + name = 'pool4' + if dilate_pool4: + net[name] = L.Pooling(net.relu4_3, pool=P.Pooling.MAX, kernel_size=3, stride=1, pad=1) + dilation = 2 + else: + net[name] = L.Pooling(net.relu4_3, pool=P.Pooling.MAX, kernel_size=2, stride=2) + dilation = 1 + + kernel_size = 3 + pad = int((kernel_size + (dilation - 1) * (kernel_size - 1)) - 1) / 2 + net.conv5_1 = L.Convolution(net[name], num_output=512, pad=pad, kernel_size=kernel_size, dilation=dilation, **kwargs) + net.relu5_1 = L.ReLU(net.conv5_1, in_place=True) + net.conv5_2 = L.Convolution(net.relu5_1, num_output=512, pad=pad, kernel_size=kernel_size, dilation=dilation, **kwargs) + net.relu5_2 = L.ReLU(net.conv5_2, in_place=True) + net.conv5_3 = L.Convolution(net.relu5_2, num_output=512, pad=pad, kernel_size=kernel_size, dilation=dilation, **kwargs) + net.relu5_3 = L.ReLU(net.conv5_3, in_place=True) + + if need_fc: + if dilated: + if nopool: + name = 'conv5_4' + net[name] = L.Convolution(net.relu5_3, num_output=512, pad=1, kernel_size=3, stride=1, **kwargs) + else: + name = 'pool5' + net[name] = L.Pooling(net.relu5_3, pool=P.Pooling.MAX, pad=1, 
kernel_size=3, stride=1) + else: + if nopool: + name = 'conv5_4' + net[name] = L.Convolution(net.relu5_3, num_output=512, pad=1, kernel_size=3, stride=2, **kwargs) + else: + name = 'pool5' + net[name] = L.Pooling(net.relu5_3, pool=P.Pooling.MAX, kernel_size=2, stride=2) + + if fully_conv: + if dilated: + if reduced: + dilation = dilation * 6 + kernel_size = 3 + num_output = 1024 + else: + dilation = dilation * 2 + kernel_size = 7 + num_output = 4096 + else: + if reduced: + dilation = dilation * 3 + kernel_size = 3 + num_output = 1024 + else: + kernel_size = 7 + num_output = 4096 + pad = int((kernel_size + (dilation - 1) * (kernel_size - 1)) - 1) / 2 + net.fc6 = L.Convolution(net[name], num_output=num_output, pad=pad, kernel_size=kernel_size, dilation=dilation, **kwargs) + + net.relu6 = L.ReLU(net.fc6, in_place=True) + if dropout: + net.drop6 = L.Dropout(net.relu6, dropout_ratio=0.5, in_place=True) + + if reduced: + net.fc7 = L.Convolution(net.relu6, num_output=1024, kernel_size=1, **kwargs) + else: + net.fc7 = L.Convolution(net.relu6, num_output=4096, kernel_size=1, **kwargs) + net.relu7 = L.ReLU(net.fc7, in_place=True) + if dropout: + net.drop7 = L.Dropout(net.relu7, dropout_ratio=0.5, in_place=True) + else: + net.fc6 = L.InnerProduct(net.pool5, num_output=4096) + net.relu6 = L.ReLU(net.fc6, in_place=True) + if dropout: + net.drop6 = L.Dropout(net.relu6, dropout_ratio=0.5, in_place=True) + net.fc7 = L.InnerProduct(net.relu6, num_output=4096) + net.relu7 = L.ReLU(net.fc7, in_place=True) + if dropout: + net.drop7 = L.Dropout(net.relu7, dropout_ratio=0.5, in_place=True) + + # Update freeze layers. 
+ kwargs['param'] = [dict(lr_mult=0, decay_mult=0), dict(lr_mult=0, decay_mult=0)] + layers = net.keys() + for freeze_layer in freeze_layers: + if freeze_layer in layers: + net.update(freeze_layer, kwargs) + + return net + + +def ResNet101Body(net, from_layer, use_pool5=True, use_dilation_conv5=False, **bn_param): + conv_prefix = '' + conv_postfix = '' + bn_prefix = 'bn_' + bn_postfix = '' + scale_prefix = 'scale_' + scale_postfix = '' + ConvBNLayer(net, from_layer, 'conv1', use_bn=True, use_relu=True, + num_output=64, kernel_size=7, pad=3, stride=2, + conv_prefix=conv_prefix, conv_postfix=conv_postfix, + bn_prefix=bn_prefix, bn_postfix=bn_postfix, + scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param) + + net.pool1 = L.Pooling(net.conv1, pool=P.Pooling.MAX, kernel_size=3, stride=2) + + ResBody(net, 'pool1', '2a', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=True, **bn_param) + ResBody(net, 'res2a', '2b', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=False, **bn_param) + ResBody(net, 'res2b', '2c', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=False, **bn_param) + + ResBody(net, 'res2c', '3a', out2a=128, out2b=128, out2c=512, stride=2, use_branch1=True, **bn_param) + + from_layer = 'res3a' + for i in xrange(1, 4): + block_name = '3b{}'.format(i) + ResBody(net, from_layer, block_name, out2a=128, out2b=128, out2c=512, stride=1, use_branch1=False, **bn_param) + from_layer = 'res{}'.format(block_name) + + ResBody(net, from_layer, '4a', out2a=256, out2b=256, out2c=1024, stride=2, use_branch1=True, **bn_param) + + from_layer = 'res4a' + for i in xrange(1, 23): + block_name = '4b{}'.format(i) + ResBody(net, from_layer, block_name, out2a=256, out2b=256, out2c=1024, stride=1, use_branch1=False, **bn_param) + from_layer = 'res{}'.format(block_name) + + stride = 2 + dilation = 1 + if use_dilation_conv5: + stride = 1 + dilation = 2 + + ResBody(net, from_layer, '5a', out2a=512, out2b=512, out2c=2048, stride=stride, use_branch1=True, 
dilation=dilation, **bn_param) + ResBody(net, 'res5a', '5b', out2a=512, out2b=512, out2c=2048, stride=1, use_branch1=False, dilation=dilation, **bn_param) + ResBody(net, 'res5b', '5c', out2a=512, out2b=512, out2c=2048, stride=1, use_branch1=False, dilation=dilation, **bn_param) + + if use_pool5: + net.pool5 = L.Pooling(net.res5c, pool=P.Pooling.AVE, global_pooling=True) + + return net + + +def ResNet152Body(net, from_layer, use_pool5=True, use_dilation_conv5=False, **bn_param): + conv_prefix = '' + conv_postfix = '' + bn_prefix = 'bn_' + bn_postfix = '' + scale_prefix = 'scale_' + scale_postfix = '' + ConvBNLayer(net, from_layer, 'conv1', use_bn=True, use_relu=True, + num_output=64, kernel_size=7, pad=3, stride=2, + conv_prefix=conv_prefix, conv_postfix=conv_postfix, + bn_prefix=bn_prefix, bn_postfix=bn_postfix, + scale_prefix=scale_prefix, scale_postfix=scale_postfix, **bn_param) + + net.pool1 = L.Pooling(net.conv1, pool=P.Pooling.MAX, kernel_size=3, stride=2) + + ResBody(net, 'pool1', '2a', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=True, **bn_param) + ResBody(net, 'res2a', '2b', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=False, **bn_param) + ResBody(net, 'res2b', '2c', out2a=64, out2b=64, out2c=256, stride=1, use_branch1=False, **bn_param) + + ResBody(net, 'res2c', '3a', out2a=128, out2b=128, out2c=512, stride=2, use_branch1=True, **bn_param) + + from_layer = 'res3a' + for i in xrange(1, 8): + block_name = '3b{}'.format(i) + ResBody(net, from_layer, block_name, out2a=128, out2b=128, out2c=512, stride=1, use_branch1=False, **bn_param) + from_layer = 'res{}'.format(block_name) + + ResBody(net, from_layer, '4a', out2a=256, out2b=256, out2c=1024, stride=2, use_branch1=True, **bn_param) + + from_layer = 'res4a' + for i in xrange(1, 36): + block_name = '4b{}'.format(i) + ResBody(net, from_layer, block_name, out2a=256, out2b=256, out2c=1024, stride=1, use_branch1=False, **bn_param) + from_layer = 'res{}'.format(block_name) + + stride = 2 + dilation 
= 1 + if use_dilation_conv5: + stride = 1 + dilation = 2 + + ResBody(net, from_layer, '5a', out2a=512, out2b=512, out2c=2048, stride=stride, use_branch1=True, dilation=dilation, **bn_param) + ResBody(net, 'res5a', '5b', out2a=512, out2b=512, out2c=2048, stride=1, use_branch1=False, dilation=dilation, **bn_param) + ResBody(net, 'res5b', '5c', out2a=512, out2b=512, out2c=2048, stride=1, use_branch1=False, dilation=dilation, **bn_param) + + if use_pool5: + net.pool5 = L.Pooling(net.res5c, pool=P.Pooling.AVE, global_pooling=True) + + return net + + +def InceptionV3Body(net, from_layer, output_pred=False, **bn_param): + # scale is fixed to 1, thus we ignore it. + use_scale = False + + out_layer = 'conv' + ConvBNLayer(net, from_layer, out_layer, use_bn=True, use_relu=True, + num_output=32, kernel_size=3, pad=0, stride=2, use_scale=use_scale, + **bn_param) + from_layer = out_layer + + out_layer = 'conv_1' + ConvBNLayer(net, from_layer, out_layer, use_bn=True, use_relu=True, + num_output=32, kernel_size=3, pad=0, stride=1, use_scale=use_scale, + **bn_param) + from_layer = out_layer + + out_layer = 'conv_2' + ConvBNLayer(net, from_layer, out_layer, use_bn=True, use_relu=True, + num_output=64, kernel_size=3, pad=1, stride=1, use_scale=use_scale, + **bn_param) + from_layer = out_layer + + out_layer = 'pool' + net[out_layer] = L.Pooling(net[from_layer], pool=P.Pooling.MAX, + kernel_size=3, stride=2, pad=0) + from_layer = out_layer + + out_layer = 'conv_3' + ConvBNLayer(net, from_layer, out_layer, use_bn=True, use_relu=True, + num_output=80, kernel_size=1, pad=0, stride=1, use_scale=use_scale, + **bn_param) + from_layer = out_layer + + out_layer = 'conv_4' + ConvBNLayer(net, from_layer, out_layer, use_bn=True, use_relu=True, + num_output=192, kernel_size=3, pad=0, stride=1, use_scale=use_scale, + **bn_param) + from_layer = out_layer + + out_layer = 'pool_1' + net[out_layer] = L.Pooling(net[from_layer], pool=P.Pooling.MAX, + kernel_size=3, stride=2, pad=0) + from_layer = 
out_layer + + # inceptions with 1x1, 3x3, 5x5 convolutions + for inception_id in xrange(0, 3): + if inception_id == 0: + out_layer = 'mixed' + tower_2_conv_num_output = 32 + else: + out_layer = 'mixed_{}'.format(inception_id) + tower_2_conv_num_output = 64 + towers = [] + tower_name = '{}'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='conv', num_output=64, kernel_size=1, pad=0, stride=1), + ], **bn_param) + towers.append(tower) + tower_name = '{}/tower'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='conv', num_output=48, kernel_size=1, pad=0, stride=1), + dict(name='conv_1', num_output=64, kernel_size=5, pad=2, stride=1), + ], **bn_param) + towers.append(tower) + tower_name = '{}/tower_1'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='conv', num_output=64, kernel_size=1, pad=0, stride=1), + dict(name='conv_1', num_output=96, kernel_size=3, pad=1, stride=1), + dict(name='conv_2', num_output=96, kernel_size=3, pad=1, stride=1), + ], **bn_param) + towers.append(tower) + tower_name = '{}/tower_2'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='pool', pool=P.Pooling.AVE, kernel_size=3, pad=1, stride=1), + dict(name='conv', num_output=tower_2_conv_num_output, kernel_size=1, pad=0, stride=1), + ], **bn_param) + towers.append(tower) + out_layer = '{}/join'.format(out_layer) + net[out_layer] = L.Concat(*towers, axis=1) + from_layer = out_layer + + # inceptions with 1x1, 3x3(in sequence) convolutions + out_layer = 'mixed_3' + towers = [] + tower_name = '{}'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='conv', num_output=384, kernel_size=3, pad=0, stride=2), + ], **bn_param) + towers.append(tower) + tower_name = '{}/tower'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='conv', num_output=64, kernel_size=1, pad=0, stride=1), + 
dict(name='conv_1', num_output=96, kernel_size=3, pad=1, stride=1), + dict(name='conv_2', num_output=96, kernel_size=3, pad=0, stride=2), + ], **bn_param) + towers.append(tower) + tower_name = '{}'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='pool', pool=P.Pooling.MAX, kernel_size=3, pad=0, stride=2), + ], **bn_param) + towers.append(tower) + out_layer = '{}/join'.format(out_layer) + net[out_layer] = L.Concat(*towers, axis=1) + from_layer = out_layer + + # inceptions with 1x1, 7x1, 1x7 convolutions + for inception_id in xrange(4, 8): + if inception_id == 4: + num_output = 128 + elif inception_id == 5 or inception_id == 6: + num_output = 160 + elif inception_id == 7: + num_output = 192 + out_layer = 'mixed_{}'.format(inception_id) + towers = [] + tower_name = '{}'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='conv', num_output=192, kernel_size=1, pad=0, stride=1), + ], **bn_param) + towers.append(tower) + tower_name = '{}/tower'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='conv', num_output=num_output, kernel_size=1, pad=0, stride=1), + dict(name='conv_1', num_output=num_output, kernel_size=[1, 7], pad=[0, 3], stride=[1, 1]), + dict(name='conv_2', num_output=192, kernel_size=[7, 1], pad=[3, 0], stride=[1, 1]), + ], **bn_param) + towers.append(tower) + tower_name = '{}/tower_1'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='conv', num_output=num_output, kernel_size=1, pad=0, stride=1), + dict(name='conv_1', num_output=num_output, kernel_size=[7, 1], pad=[3, 0], stride=[1, 1]), + dict(name='conv_2', num_output=num_output, kernel_size=[1, 7], pad=[0, 3], stride=[1, 1]), + dict(name='conv_3', num_output=num_output, kernel_size=[7, 1], pad=[3, 0], stride=[1, 1]), + dict(name='conv_4', num_output=192, kernel_size=[1, 7], pad=[0, 3], stride=[1, 1]), + ], **bn_param) + towers.append(tower) + tower_name = 
'{}/tower_2'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='pool', pool=P.Pooling.AVE, kernel_size=3, pad=1, stride=1), + dict(name='conv', num_output=192, kernel_size=1, pad=0, stride=1), + ], **bn_param) + towers.append(tower) + out_layer = '{}/join'.format(out_layer) + net[out_layer] = L.Concat(*towers, axis=1) + from_layer = out_layer + + # inceptions with 1x1, 3x3, 1x7, 7x1 filters + out_layer = 'mixed_8' + towers = [] + tower_name = '{}/tower'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='conv', num_output=192, kernel_size=1, pad=0, stride=1), + dict(name='conv_1', num_output=320, kernel_size=3, pad=0, stride=2), + ], **bn_param) + towers.append(tower) + tower_name = '{}/tower_1'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='conv', num_output=192, kernel_size=1, pad=0, stride=1), + dict(name='conv_1', num_output=192, kernel_size=[1, 7], pad=[0, 3], stride=[1, 1]), + dict(name='conv_2', num_output=192, kernel_size=[7, 1], pad=[3, 0], stride=[1, 1]), + dict(name='conv_3', num_output=192, kernel_size=3, pad=0, stride=2), + ], **bn_param) + towers.append(tower) + tower_name = '{}'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='pool', pool=P.Pooling.MAX, kernel_size=3, pad=0, stride=2), + ], **bn_param) + towers.append(tower) + out_layer = '{}/join'.format(out_layer) + net[out_layer] = L.Concat(*towers, axis=1) + from_layer = out_layer + + for inception_id in xrange(9, 11): + num_output = 384 + num_output2 = 448 + if inception_id == 9: + pool = P.Pooling.AVE + else: + pool = P.Pooling.MAX + out_layer = 'mixed_{}'.format(inception_id) + towers = [] + tower_name = '{}'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='conv', num_output=320, kernel_size=1, pad=0, stride=1), + ], **bn_param) + towers.append(tower) + + tower_name = '{}/tower'.format(out_layer) + tower = 
InceptionTower(net, from_layer, tower_name, [ + dict(name='conv', num_output=num_output, kernel_size=1, pad=0, stride=1), + ], **bn_param) + subtowers = [] + subtower_name = '{}/mixed'.format(tower_name) + subtower = InceptionTower(net, '{}/conv'.format(tower_name), subtower_name, [ + dict(name='conv', num_output=num_output, kernel_size=[1, 3], pad=[0, 1], stride=[1, 1]), + ], **bn_param) + subtowers.append(subtower) + subtower = InceptionTower(net, '{}/conv'.format(tower_name), subtower_name, [ + dict(name='conv_1', num_output=num_output, kernel_size=[3, 1], pad=[1, 0], stride=[1, 1]), + ], **bn_param) + subtowers.append(subtower) + net[subtower_name] = L.Concat(*subtowers, axis=1) + towers.append(net[subtower_name]) + + tower_name = '{}/tower_1'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='conv', num_output=num_output2, kernel_size=1, pad=0, stride=1), + dict(name='conv_1', num_output=num_output, kernel_size=3, pad=1, stride=1), + ], **bn_param) + subtowers = [] + subtower_name = '{}/mixed'.format(tower_name) + subtower = InceptionTower(net, '{}/conv_1'.format(tower_name), subtower_name, [ + dict(name='conv', num_output=num_output, kernel_size=[1, 3], pad=[0, 1], stride=[1, 1]), + ], **bn_param) + subtowers.append(subtower) + subtower = InceptionTower(net, '{}/conv_1'.format(tower_name), subtower_name, [ + dict(name='conv_1', num_output=num_output, kernel_size=[3, 1], pad=[1, 0], stride=[1, 1]), + ], **bn_param) + subtowers.append(subtower) + net[subtower_name] = L.Concat(*subtowers, axis=1) + towers.append(net[subtower_name]) + + tower_name = '{}/tower_2'.format(out_layer) + tower = InceptionTower(net, from_layer, tower_name, [ + dict(name='pool', pool=pool, kernel_size=3, pad=1, stride=1), + dict(name='conv', num_output=192, kernel_size=1, pad=0, stride=1), + ], **bn_param) + towers.append(tower) + out_layer = '{}/join'.format(out_layer) + net[out_layer] = L.Concat(*towers, axis=1) + from_layer = out_layer + + if 
output_pred: + net.pool_3 = L.Pooling(net[from_layer], pool=P.Pooling.AVE, kernel_size=8, pad=0, stride=1) + net.softmax = L.InnerProduct(net.pool_3, num_output=1008) + net.softmax_prob = L.Softmax(net.softmax) + + return net + +def CreateMultiBoxHead(net, data_layer="data", num_classes=[], from_layers=[], + use_objectness=False, normalizations=[], use_batchnorm=True, lr_mult=1, + use_scale=True, min_sizes=[], max_sizes=[], prior_variance = [0.1], + aspect_ratios=[], steps=[], img_height=0, img_width=0, share_location=True, + flip=True, clip=True, offset=0.5, inter_layer_depth=[], kernel_size=1, pad=0, + conf_postfix='', loc_postfix='', **bn_param): + assert num_classes, "must provide num_classes" + assert num_classes > 0, "num_classes must be positive number" + if normalizations: + assert len(from_layers) == len(normalizations), "from_layers and normalizations should have same length" + assert len(from_layers) == len(min_sizes), "from_layers and min_sizes should have same length" + if max_sizes: + assert len(from_layers) == len(max_sizes), "from_layers and max_sizes should have same length" + if aspect_ratios: + assert len(from_layers) == len(aspect_ratios), "from_layers and aspect_ratios should have same length" + if steps: + assert len(from_layers) == len(steps), "from_layers and steps should have same length" + net_layers = net.keys() + assert data_layer in net_layers, "data_layer is not in net's layers" + if inter_layer_depth: + assert len(from_layers) == len(inter_layer_depth), "from_layers and inter_layer_depth should have same length" + + num = len(from_layers) + priorbox_layers = [] + loc_layers = [] + conf_layers = [] + objectness_layers = [] + for i in range(0, num): + from_layer = from_layers[i] + + # Get the normalize value. 
+ if normalizations: + if normalizations[i] != -1: + norm_name = "{}_norm".format(from_layer) + net[norm_name] = L.Normalize(net[from_layer], scale_filler=dict(type="constant", value=normalizations[i]), + across_spatial=False, channel_shared=False) + from_layer = norm_name + + # Add intermediate layers. + if inter_layer_depth: + if inter_layer_depth[i] > 0: + inter_name = "{}_inter".format(from_layer) + ConvBNLayer(net, from_layer, inter_name, use_bn=use_batchnorm, use_relu=True, lr_mult=lr_mult, + num_output=inter_layer_depth[i], kernel_size=3, pad=1, stride=1, **bn_param) + from_layer = inter_name + + # Estimate number of priors per location given provided parameters. + min_size = min_sizes[i] + if type(min_size) is not list: + min_size = [min_size] + aspect_ratio = [] + if len(aspect_ratios) > i: + aspect_ratio = aspect_ratios[i] + if type(aspect_ratio) is not list: + aspect_ratio = [aspect_ratio] + max_size = [] + if len(max_sizes) > i: + max_size = max_sizes[i] + if type(max_size) is not list: + max_size = [max_size] + if max_size: + assert len(max_size) == len(min_size), "max_size and min_size should have same length." + if max_size: + num_priors_per_location = (2 + len(aspect_ratio)) * len(min_size) + else: + num_priors_per_location = (1 + len(aspect_ratio)) * len(min_size) + if flip: + num_priors_per_location += len(aspect_ratio) * len(min_size) + step = [] + if len(steps) > i: + step = steps[i] + + # Create location prediction layer. 
+ name = "{}_mbox_loc{}".format(from_layer, loc_postfix) + num_loc_output = num_priors_per_location * 4; + if not share_location: + num_loc_output *= num_classes + ConvBNLayer(net, from_layer, name, use_bn=use_batchnorm, use_relu=False, lr_mult=lr_mult, + num_output=num_loc_output, kernel_size=kernel_size, pad=pad, stride=1, **bn_param) + permute_name = "{}_perm".format(name) + net[permute_name] = L.Permute(net[name], order=[0, 2, 3, 1]) + flatten_name = "{}_flat".format(name) + net[flatten_name] = L.Flatten(net[permute_name], axis=1) + loc_layers.append(net[flatten_name]) + + # Create confidence prediction layer. + name = "{}_mbox_conf{}".format(from_layer, conf_postfix) + num_conf_output = num_priors_per_location * num_classes; + ConvBNLayer(net, from_layer, name, use_bn=use_batchnorm, use_relu=False, lr_mult=lr_mult, + num_output=num_conf_output, kernel_size=kernel_size, pad=pad, stride=1, **bn_param) + permute_name = "{}_perm".format(name) + net[permute_name] = L.Permute(net[name], order=[0, 2, 3, 1]) + flatten_name = "{}_flat".format(name) + net[flatten_name] = L.Flatten(net[permute_name], axis=1) + conf_layers.append(net[flatten_name]) + + # Create prior generation layer. + name = "{}_mbox_priorbox".format(from_layer) + net[name] = L.PriorBox(net[from_layer], net[data_layer], min_size=min_size, + clip=clip, variance=prior_variance, offset=offset) + if max_size: + net.update(name, {'max_size': max_size}) + if aspect_ratio: + net.update(name, {'aspect_ratio': aspect_ratio, 'flip': flip}) + if step: + net.update(name, {'step': step}) + if img_height != 0 and img_width != 0: + if img_height == img_width: + net.update(name, {'img_size': img_height}) + else: + net.update(name, {'img_h': img_height, 'img_w': img_width}) + priorbox_layers.append(net[name]) + + # Create objectness prediction layer. 
+ if use_objectness: + name = "{}_mbox_objectness".format(from_layer) + num_obj_output = num_priors_per_location * 2; + ConvBNLayer(net, from_layer, name, use_bn=use_batchnorm, use_relu=False, lr_mult=lr_mult, + num_output=num_obj_output, kernel_size=kernel_size, pad=pad, stride=1, **bn_param) + permute_name = "{}_perm".format(name) + net[permute_name] = L.Permute(net[name], order=[0, 2, 3, 1]) + flatten_name = "{}_flat".format(name) + net[flatten_name] = L.Flatten(net[permute_name], axis=1) + objectness_layers.append(net[flatten_name]) + + # Concatenate priorbox, loc, and conf layers. + mbox_layers = [] + name = "mbox_loc" + net[name] = L.Concat(*loc_layers, axis=1) + mbox_layers.append(net[name]) + name = "mbox_conf" + net[name] = L.Concat(*conf_layers, axis=1) + mbox_layers.append(net[name]) + name = "mbox_priorbox" + net[name] = L.Concat(*priorbox_layers, axis=2) + mbox_layers.append(net[name]) + if use_objectness: + name = "mbox_objectness" + net[name] = L.Concat(*objectness_layers, axis=1) + mbox_layers.append(net[name]) + + return mbox_layers diff --git a/examples/ssd/plot_detections.py b/examples/ssd/plot_detections.py new file mode 100644 index 00000000000..e682b4b5e07 --- /dev/null +++ b/examples/ssd/plot_detections.py @@ -0,0 +1,124 @@ +''' +Plot the detection results output by ssd_detect.cpp. +''' + +import argparse +from collections import OrderedDict +from google.protobuf import text_format +import matplotlib +# Force matplotlib to not use any Xwindows backend. 
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import skimage.io as io
+import sys
+
+import caffe
+from caffe.proto import caffe_pb2
+
+def get_labelname(labelmap, labels):  # map numeric label(s) -> display_name(s)
+    num_labels = len(labelmap.item)
+    labelnames = []
+    if type(labels) is not list:
+        labels = [labels]
+    for label in labels:
+        found = False
+        for i in xrange(0, num_labels):
+            if label == labelmap.item[i].label:
+                found = True
+                labelnames.append(labelmap.item[i].display_name)
+                break
+        assert found == True
+    return labelnames
+
+def showResults(img_file, results, labelmap=None, threshold=None, display=None):
+    if not os.path.exists(img_file):
+        print "{} does not exist".format(img_file)
+        return
+    img = io.imread(img_file)
+    plt.clf()
+    plt.imshow(img)
+    plt.axis('off')
+    ax = plt.gca()
+    if labelmap:
+        # generate same number of colors as classes in labelmap.
+        num_classes = len(labelmap.item)
+    else:
+        # generate 20 colors.
+        num_classes = 20
+    colors = plt.cm.hsv(np.linspace(0, 1, num_classes)).tolist()
+    for res in results:
+        if 'score' in res and threshold and float(res["score"]) < threshold:
+            continue
+        label = res['label']
+        name = "class " + str(label)
+        if labelmap:
+            name = get_labelname(labelmap, label)[0]
+        if display and name not in display:  # fix: filter on the parameter, not a global
+            continue
+        color = colors[label % num_classes]
+        bbox = res['bbox']
+        coords = (bbox[0], bbox[1]), bbox[2] - bbox[0], bbox[3] - bbox[1]
+        ax.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=3))
+        if 'score' in res:
+            score = res['score']
+            display_text = '%s: %.2f' % (name, score)
+        else:
+            display_text = name
+        ax.text(bbox[0], bbox[1], display_text, bbox={'facecolor':color, 'alpha':0.5})
+    if len(results) > 0 and "out_file" in results[0]:
+        plt.savefig(results[0]["out_file"], bbox_inches="tight")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description = "Plot the detection results output by ssd_detect.")
+    parser.add_argument("resultfile",
+        help = "A file which contains all the detection results.")
+    parser.add_argument("imgdir",
+        help = "A directory which contains the images.")
+    parser.add_argument("--labelmap-file", default="",
+        help = "A file which contains the LabelMap.")
+    parser.add_argument("--visualize-threshold", default=0.01, type=float,
+        help = "Display detections with score higher than the threshold.")
+    parser.add_argument("--save-dir", default="",
+        help = "A directory which saves the image with detection results.")
+    parser.add_argument("--display-classes", default=None,
+        help = "If provided, only display specified class. Separate by ','")
+
+    args = parser.parse_args()
+    result_file = args.resultfile
+    img_dir = args.imgdir
+    if not os.path.exists(img_dir):
+        print "{} does not exist".format(img_dir)
+        sys.exit()
+    labelmap_file = args.labelmap_file
+    labelmap = None
+    if labelmap_file and os.path.exists(labelmap_file):
+        with open(labelmap_file, 'r') as lm_file:  # fix: close the file (was leaked)
+            labelmap = caffe_pb2.LabelMap()
+            text_format.Merge(str(lm_file.read()), labelmap)
+    visualize_threshold = args.visualize_threshold
+    save_dir = args.save_dir
+    if save_dir and not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+    display_classes = args.display_classes
+
+    img_results = OrderedDict()
+    with open(result_file, "r") as f:
+        for line in f.readlines():
+            img_name, label, score, xmin, ymin, xmax, ymax = line.strip("\n").split()
+            img_file = "{}/{}".format(img_dir, img_name)
+            result = dict()
+            result["label"] = int(label)
+            result["score"] = float(score)
+            result["bbox"] = [float(xmin), float(ymin), float(xmax), float(ymax)]
+            if save_dir:
+                out_file = "{}/{}.png".format(save_dir, os.path.basename(img_name))
+                result["out_file"] = out_file
+            if img_file not in img_results:
+                img_results[img_file] = [result]
+            else:
+                img_results[img_file].append(result)
+    for img_file, results in img_results.iteritems():
+        showResults(img_file, results, labelmap, visualize_threshold, display_classes)
diff --git a/examples/ssd/score_ssd_coco.py b/examples/ssd/score_ssd_coco.py new file mode 100644 index 00000000000..5d817092f07 --- /dev/null +++ b/examples/ssd/score_ssd_coco.py @@ -0,0 +1,558 @@ +from __future__ import print_function +import caffe +from google.protobuf import text_format + +import math +import os +import shutil +import stat +import subprocess +import sys + +sys.path.insert(0, 'examples/ssd') +from model_libs import * + +# Add extra layers on top of a "base" network (e.g. VGGNet or Inception). +def AddExtraLayers(net, use_batchnorm=True, lr_mult=1): + use_relu = True + + # Add additional convolutional layers. + # 19 x 19 + from_layer = net.keys()[-1] + + # TODO(weiliu89): Construct the name using the last layer to avoid duplication. + # 10 x 10 + out_layer = "conv6_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv6_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2, + lr_mult=lr_mult) + + # 5 x 5 + from_layer = out_layer + out_layer = "conv7_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv7_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 1, 2, + lr_mult=lr_mult) + + # 3 x 3 + from_layer = out_layer + out_layer = "conv8_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv8_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + # 1 x 1 + from_layer = out_layer + out_layer = "conv9_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv9_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + return net 
+ + +### Modify the following parameters accordingly ### +# Notice: we do evaluation by setting the solver parameters approximately. +# The reason that we do not use ./build/tools/caffe test ... is because it +# only supports testing for classification problem now. +# The directory which contains the caffe code. +# We assume you are running the script at the CAFFE_ROOT. +caffe_root = os.getcwd() + +# Set true if you want to start training right after generating all files. +run_soon = True + +# The database file for training data. Created by data/coco/create_data.sh +train_data = "examples/coco/coco_train_lmdb" +# The database file for testing data. Created by data/coco/create_data.sh +test_data = "examples/coco/coco_minival_lmdb" +# Specify the batch sampler. +resize_width = 300 +resize_height = 300 +resize = "{}x{}".format(resize_width, resize_height) +batch_sampler = [ + { + 'sampler': { + }, + 'max_trials': 1, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.1, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.3, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.5, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.7, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 
'min_jaccard_overlap': 0.9, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'max_jaccard_overlap': 1.0, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + ] +train_transform_param = { + 'mirror': True, + 'mean_value': [104, 117, 123], + 'force_color': True, + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [ + P.Resize.LINEAR, + P.Resize.AREA, + P.Resize.NEAREST, + P.Resize.CUBIC, + P.Resize.LANCZOS4, + ], + }, + 'distort_param': { + 'brightness_prob': 0.5, + 'brightness_delta': 32, + 'contrast_prob': 0.5, + 'contrast_lower': 0.5, + 'contrast_upper': 1.5, + 'hue_prob': 0.5, + 'hue_delta': 18, + 'saturation_prob': 0.5, + 'saturation_lower': 0.5, + 'saturation_upper': 1.5, + 'random_order_prob': 0.0, + }, + 'expand_param': { + 'prob': 0.5, + 'max_expand_ratio': 4.0, + }, + 'emit_constraint': { + 'emit_type': caffe_pb2.EmitConstraint.CENTER, + } + } +test_transform_param = { + 'mean_value': [104, 117, 123], + 'force_color': True, + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [P.Resize.LINEAR], + }, + } + +# If true, use batch norm for all newly added layers. +# Currently only the non batch norm version has been tested. +use_batchnorm = False +lr_mult = 1 +# Use different initial learning rate. +if use_batchnorm: + base_lr = 0.0004 +else: + # A learning rate for batch_size = 1, num_gpus = 1. + base_lr = 0.00004 + +# The job name should be same as the name used in examples/ssd/ssd_coco.py. +job_name = "SSD_{}".format(resize) +# The name of the model. Modify it if you want. +model_name = "VGG_coco_{}".format(job_name) + +# Directory which stores the model .prototxt file. 
+save_dir = "models/VGGNet/coco/{}_score".format(job_name) +# Directory which stores the snapshot of trained models. +snapshot_dir = "models/VGGNet/coco/{}".format(job_name) +# Directory which stores the job script and log file. +job_dir = "jobs/VGGNet/coco/{}_score".format(job_name) +# Directory which stores the detection results. +output_result_dir = "{}/data/mscoco/results/{}_score/".format(os.environ['HOME'], job_name) + +# model definition files. +train_net_file = "{}/train.prototxt".format(save_dir) +test_net_file = "{}/test.prototxt".format(save_dir) +deploy_net_file = "{}/deploy.prototxt".format(save_dir) +solver_file = "{}/solver.prototxt".format(save_dir) +# snapshot prefix. +snapshot_prefix = "{}/{}".format(snapshot_dir, model_name) +# job script path. +job_file = "{}/{}.sh".format(job_dir, model_name) + +# Find most recent snapshot. +max_iter = 0 +for file in os.listdir(snapshot_dir): + if file.endswith(".caffemodel"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if iter > max_iter: + max_iter = iter + +if max_iter == 0: + print("Cannot find snapshot in {}".format(snapshot_dir)) + sys.exit() + +# Stores the test image names and sizes. Created by data/coco/create_list.sh +name_size_file = "data/coco/minival2014_name_size.txt" +# The resume model. +pretrain_model = "{}_iter_{}.caffemodel".format(snapshot_prefix, max_iter) +# Stores LabelMapItem. +label_map_file = "data/coco/labelmap_coco.prototxt" + +# MultiBoxLoss parameters. +num_classes = 81 +share_location = True +background_label_id=0 +train_on_diff_gt = False +normalization_mode = P.Loss.VALID +code_type = P.PriorBox.CENTER_SIZE +ignore_cross_boundary_bbox = False +mining_type = P.MultiBoxLoss.MAX_NEGATIVE +neg_pos_ratio = 3. +loc_weight = (neg_pos_ratio + 1.) / 4. 
+multibox_loss_param = { + 'loc_loss_type': P.MultiBoxLoss.SMOOTH_L1, + 'conf_loss_type': P.MultiBoxLoss.SOFTMAX, + 'loc_weight': loc_weight, + 'num_classes': num_classes, + 'share_location': share_location, + 'match_type': P.MultiBoxLoss.PER_PREDICTION, + 'overlap_threshold': 0.5, + 'use_prior_for_matching': True, + 'background_label_id': background_label_id, + 'use_difficult_gt': train_on_diff_gt, + 'mining_type': mining_type, + 'neg_pos_ratio': neg_pos_ratio, + 'neg_overlap': 0.5, + 'code_type': code_type, + 'ignore_cross_boundary_bbox': ignore_cross_boundary_bbox, + } +loss_param = { + 'normalization': normalization_mode, + } + +# parameters for generating priors. +# minimum dimension of input image +min_dim = 300 +# conv4_3 ==> 38 x 38 +# fc7 ==> 19 x 19 +# conv6_2 ==> 10 x 10 +# conv7_2 ==> 5 x 5 +# conv8_2 ==> 3 x 3 +# conv9_2 ==> 1 x 1 +mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2'] +# in percent % +min_ratio = 15 +max_ratio = 90 +step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2))) +min_sizes = [] +max_sizes = [] +for ratio in xrange(min_ratio, max_ratio + 1, step): + min_sizes.append(min_dim * ratio / 100.) + max_sizes.append(min_dim * (ratio + step) / 100.) +min_sizes = [min_dim * 7 / 100.] + min_sizes +max_sizes = [min_dim * 15 / 100.] + max_sizes +steps = [8, 16, 32, 64, 100, 300] +aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] +# L2 normalize conv4_3. +normalizations = [20, -1, -1, -1, -1, -1] +# variance used to encode/decode prior bboxes. +if code_type == P.PriorBox.CENTER_SIZE: + prior_variance = [0.1, 0.1, 0.2, 0.2] +else: + prior_variance = [0.1] +flip = True +clip = False + +# Solver parameters. +# Defining which GPUs to use. +gpus = "0" +gpulist = gpus.split(",") +num_gpus = len(gpulist) + +# The number does not matter since we do not do training with this script. 
+batch_size = 1 +accum_batch_size = 1 +iter_size = accum_batch_size / batch_size +solver_mode = P.Solver.CPU +device_id = 0 +batch_size_per_device = batch_size +if num_gpus > 0: + batch_size_per_device = int(math.ceil(float(batch_size) / num_gpus)) + iter_size = int(math.ceil(float(accum_batch_size) / (batch_size_per_device * num_gpus))) + solver_mode = P.Solver.GPU + device_id = int(gpulist[0]) + +if normalization_mode == P.Loss.NONE: + base_lr /= batch_size_per_device +elif normalization_mode == P.Loss.VALID: + base_lr *= 25. / loc_weight +elif normalization_mode == P.Loss.FULL: + # Roughly there are 2000 prior bboxes per image. + # TODO(weiliu89): Estimate the exact # of priors. + base_lr *= 2000. + +# Evaluate on whole test set. +num_test_image = 5000 +test_batch_size = 8 +test_iter = num_test_image / test_batch_size + +solver_param = { + # Train parameters + 'base_lr': base_lr, + 'weight_decay': 0.0005, + 'lr_policy': "multistep", + 'stepvalue': [280000, 360000, 400000], + 'gamma': 0.1, + 'momentum': 0.9, + 'iter_size': iter_size, + 'max_iter': 0, + 'snapshot': 0, + 'display': 10, + 'average_loss': 10, + 'type': "SGD", + 'solver_mode': solver_mode, + 'device_id': device_id, + 'debug_info': False, + 'snapshot_after_train': False, + # Test parameters + 'test_iter': [test_iter], + 'test_interval': 10000, + 'eval_type': "detection", + 'ap_version': "11point", + 'test_initialization': True, + } + +# parameters for generating detection output. 
+det_out_param = { + 'num_classes': num_classes, + 'share_location': share_location, + 'background_label_id': background_label_id, + 'nms_param': {'nms_threshold': 0.45, 'top_k': 400}, + 'save_output_param': { + 'output_directory': output_result_dir, + 'output_name_prefix': "detections_minival_ssd300_results", + 'output_format': "COCO", + 'label_map_file': label_map_file, + 'name_size_file': name_size_file, + 'num_test_image': num_test_image, + }, + 'keep_top_k': 200, + 'confidence_threshold': 0.01, + 'code_type': code_type, + } + +# parameters for evaluating detection results. +det_eval_param = { + 'num_classes': num_classes, + 'background_label_id': background_label_id, + 'overlap_threshold': 0.5, + 'evaluate_difficult_gt': False, + 'name_size_file': name_size_file, + } + +### Hopefully you don't need to change the following ### +# Check file. +check_if_exist(train_data) +check_if_exist(test_data) +check_if_exist(label_map_file) +check_if_exist(pretrain_model) +make_if_not_exist(save_dir) +make_if_not_exist(job_dir) +make_if_not_exist(snapshot_dir) + +# Create train net. +net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(train_data, batch_size=batch_size_per_device, + train=True, output_label=True, label_map_file=label_map_file, + transform_param=train_transform_param, batch_sampler=batch_sampler) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +# Create the MultiBoxLossLayer. 
+name = "mbox_loss" +mbox_layers.append(net.label) +net[name] = L.MultiBoxLoss(*mbox_layers, multibox_loss_param=multibox_loss_param, + loss_param=loss_param, include=dict(phase=caffe_pb2.Phase.Value('TRAIN')), + propagate_down=[True, True, False, False]) + +with open(train_net_file, 'w') as f: + print('name: "{}_train"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(train_net_file, job_dir) + +# Create test net. +net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(test_data, batch_size=test_batch_size, + train=False, output_label=True, label_map_file=label_map_file, + transform_param=test_transform_param) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +conf_name = "mbox_conf" +if multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.SOFTMAX: + reshape_name = "{}_reshape".format(conf_name) + net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes])) + softmax_name = "{}_softmax".format(conf_name) + net[softmax_name] = L.Softmax(net[reshape_name], axis=2) + flatten_name = "{}_flatten".format(conf_name) + net[flatten_name] = L.Flatten(net[softmax_name], axis=1) + mbox_layers[1] = net[flatten_name] +elif multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.LOGISTIC: + sigmoid_name = "{}_sigmoid".format(conf_name) + net[sigmoid_name] = L.Sigmoid(net[conf_name]) + mbox_layers[1] = net[sigmoid_name] + +net.detection_out = L.DetectionOutput(*mbox_layers, + detection_output_param=det_out_param, + 
include=dict(phase=caffe_pb2.Phase.Value('TEST'))) +net.detection_eval = L.DetectionEvaluate(net.detection_out, net.label, + detection_evaluate_param=det_eval_param, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) + +with open(test_net_file, 'w') as f: + print('name: "{}_test"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(test_net_file, job_dir) + +# Create deploy net. +# Remove the first and last layer from test net. +deploy_net = net +with open(deploy_net_file, 'w') as f: + net_param = deploy_net.to_proto() + # Remove the first (AnnotatedData) and last (DetectionEvaluate) layer from test net. + del net_param.layer[0] + del net_param.layer[-1] + net_param.name = '{}_deploy'.format(model_name) + net_param.input.extend(['data']) + net_param.input_shape.extend([ + caffe_pb2.BlobShape(dim=[1, 3, resize_height, resize_width])]) + print(net_param, file=f) +shutil.copy(deploy_net_file, job_dir) + +# Create solver. +solver = caffe_pb2.SolverParameter( + train_net=train_net_file, + test_net=[test_net_file], + snapshot_prefix=snapshot_prefix, + **solver_param) + +with open(solver_file, 'w') as f: + print(solver, file=f) +shutil.copy(solver_file, job_dir) + +# Create job file. +with open(job_file, 'w') as f: + f.write('cd {}\n'.format(caffe_root)) + f.write('./build/tools/caffe train \\\n') + f.write('--solver="{}" \\\n'.format(solver_file)) + f.write('--weights="{}" \\\n'.format(pretrain_model)) + if solver_param['solver_mode'] == P.Solver.GPU: + f.write('--gpu {} 2>&1 | tee {}/{}_test{}.log\n'.format(gpus, job_dir, model_name, max_iter)) + else: + f.write('2>&1 | tee {}/{}.log\n'.format(job_dir, model_name)) + +# Copy the python script to job_dir. +py_file = os.path.abspath(__file__) +shutil.copy(py_file, job_dir) + +# Run the job. 
+os.chmod(job_file, stat.S_IRWXU) +if run_soon: + subprocess.call(job_file, shell=True) diff --git a/examples/ssd/score_ssd_pascal.py b/examples/ssd/score_ssd_pascal.py new file mode 100644 index 00000000000..cc13125e32b --- /dev/null +++ b/examples/ssd/score_ssd_pascal.py @@ -0,0 +1,558 @@ +from __future__ import print_function +import caffe +from google.protobuf import text_format + +import math +import os +import shutil +import stat +import subprocess +import sys + +sys.path.insert(0, 'examples/ssd') +from model_libs import * + +# Add extra layers on top of a "base" network (e.g. VGGNet or Inception). +def AddExtraLayers(net, use_batchnorm=True, lr_mult=1): + use_relu = True + + # Add additional convolutional layers. + # 19 x 19 + from_layer = net.keys()[-1] + + # TODO(weiliu89): Construct the name using the last layer to avoid duplication. + # 10 x 10 + out_layer = "conv6_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv6_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2, + lr_mult=lr_mult) + + # 5 x 5 + from_layer = out_layer + out_layer = "conv7_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv7_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 1, 2, + lr_mult=lr_mult) + + # 3 x 3 + from_layer = out_layer + out_layer = "conv8_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv8_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + # 1 x 1 + from_layer = out_layer + out_layer = "conv9_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv9_2" + ConvBNLayer(net, 
from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + return net + + +### Modify the following parameters accordingly ### +# Notice: we do evaluation by setting the solver parameters approximately. +# The reason that we do not use ./build/tools/caffe test ... is because it +# only supports testing for classification problem now. +# The directory which contains the caffe code. +# We assume you are running the script at the CAFFE_ROOT. +caffe_root = os.getcwd() + +# Set true if you want to start training right after generating all files. +run_soon = True + +# The database file for training data. Created by data/VOC0712/create_data.sh +train_data = "examples/VOC0712/VOC0712_trainval_lmdb" +# The database file for testing data. Created by data/VOC0712/create_data.sh +test_data = "examples/VOC0712/VOC0712_test_lmdb" +# Specify the batch sampler. +resize_width = 300 +resize_height = 300 +resize = "{}x{}".format(resize_width, resize_height) +batch_sampler = [ + { + 'sampler': { + }, + 'max_trials': 1, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.1, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.3, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.5, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.7, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 
'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.9, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'max_jaccard_overlap': 1.0, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + ] +train_transform_param = { + 'mirror': True, + 'mean_value': [104, 117, 123], + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [ + P.Resize.LINEAR, + P.Resize.AREA, + P.Resize.NEAREST, + P.Resize.CUBIC, + P.Resize.LANCZOS4, + ], + }, + 'distort_param': { + 'brightness_prob': 0.5, + 'brightness_delta': 32, + 'contrast_prob': 0.5, + 'contrast_lower': 0.5, + 'contrast_upper': 1.5, + 'hue_prob': 0.5, + 'hue_delta': 18, + 'saturation_prob': 0.5, + 'saturation_lower': 0.5, + 'saturation_upper': 1.5, + 'random_order_prob': 0.0, + }, + 'expand_param': { + 'prob': 0.5, + 'max_expand_ratio': 4.0, + }, + 'emit_constraint': { + 'emit_type': caffe_pb2.EmitConstraint.CENTER, + } + } +test_transform_param = { + 'mean_value': [104, 117, 123], + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [P.Resize.LINEAR], + }, + } + +# If true, use batch norm for all newly added layers. +# Currently only the non batch norm version has been tested. +use_batchnorm = False +lr_mult = 1 +# Use different initial learning rate. +if use_batchnorm: + base_lr = 0.0004 +else: + # A learning rate for batch_size = 1, num_gpus = 1. + base_lr = 0.00004 + +# The job name should be same as the name used in examples/ssd/ssd_pascal.py. +job_name = "SSD_{}".format(resize) +# The name of the model. Modify it if you want. +model_name = "VGG_VOC0712_{}".format(job_name) + +# Directory which stores the model .prototxt file. 
+save_dir = "models/VGGNet/VOC0712/{}_score".format(job_name) +# Directory which stores the snapshot of trained models. +snapshot_dir = "models/VGGNet/VOC0712/{}".format(job_name) +# Directory which stores the job script and log file. +job_dir = "jobs/VGGNet/VOC0712/{}_score".format(job_name) +# Directory which stores the detection results. +output_result_dir = "{}/data/VOCdevkit/results/VOC2007/{}_score/Main".format(os.environ['HOME'], job_name) + +# model definition files. +train_net_file = "{}/train.prototxt".format(save_dir) +test_net_file = "{}/test.prototxt".format(save_dir) +deploy_net_file = "{}/deploy.prototxt".format(save_dir) +solver_file = "{}/solver.prototxt".format(save_dir) +# snapshot prefix. +snapshot_prefix = "{}/{}".format(snapshot_dir, model_name) +# job script path. +job_file = "{}/{}.sh".format(job_dir, model_name) + +# Find most recent snapshot. +max_iter = 0 +for file in os.listdir(snapshot_dir): + if file.endswith(".caffemodel"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if iter > max_iter: + max_iter = iter + +if max_iter == 0: + print("Cannot find snapshot in {}".format(snapshot_dir)) + sys.exit() + +# Stores the test image names and sizes. Created by data/VOC0712/create_list.sh +name_size_file = "data/VOC0712/test_name_size.txt" +# The resume model. +pretrain_model = "{}_iter_{}.caffemodel".format(snapshot_prefix, max_iter) +# Stores LabelMapItem. +label_map_file = "data/VOC0712/labelmap_voc.prototxt" + +# MultiBoxLoss parameters. +num_classes = 21 +share_location = True +background_label_id=0 +train_on_diff_gt = True +normalization_mode = P.Loss.VALID +code_type = P.PriorBox.CENTER_SIZE +ignore_cross_boundary_bbox = False +mining_type = P.MultiBoxLoss.MAX_NEGATIVE +neg_pos_ratio = 3. +loc_weight = (neg_pos_ratio + 1.) / 4. 
+multibox_loss_param = { + 'loc_loss_type': P.MultiBoxLoss.SMOOTH_L1, + 'conf_loss_type': P.MultiBoxLoss.SOFTMAX, + 'loc_weight': loc_weight, + 'num_classes': num_classes, + 'share_location': share_location, + 'match_type': P.MultiBoxLoss.PER_PREDICTION, + 'overlap_threshold': 0.5, + 'use_prior_for_matching': True, + 'background_label_id': background_label_id, + 'use_difficult_gt': train_on_diff_gt, + 'mining_type': mining_type, + 'neg_pos_ratio': neg_pos_ratio, + 'neg_overlap': 0.5, + 'code_type': code_type, + 'ignore_cross_boundary_bbox': ignore_cross_boundary_bbox, + } +loss_param = { + 'normalization': normalization_mode, + } + +# parameters for generating priors. +# minimum dimension of input image +min_dim = 300 +# conv4_3 ==> 38 x 38 +# fc7 ==> 19 x 19 +# conv6_2 ==> 10 x 10 +# conv7_2 ==> 5 x 5 +# conv8_2 ==> 3 x 3 +# conv9_2 ==> 1 x 1 +mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2'] +# in percent % +min_ratio = 20 +max_ratio = 90 +step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2))) +min_sizes = [] +max_sizes = [] +for ratio in xrange(min_ratio, max_ratio + 1, step): + min_sizes.append(min_dim * ratio / 100.) + max_sizes.append(min_dim * (ratio + step) / 100.) +min_sizes = [min_dim * 10 / 100.] + min_sizes +max_sizes = [min_dim * 20 / 100.] + max_sizes +steps = [8, 16, 32, 64, 100, 300] +aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] +# L2 normalize conv4_3. +normalizations = [20, -1, -1, -1, -1, -1] +# variance used to encode/decode prior bboxes. +if code_type == P.PriorBox.CENTER_SIZE: + prior_variance = [0.1, 0.1, 0.2, 0.2] +else: + prior_variance = [0.1] +flip = True +clip = False + +# Solver parameters. +# Defining which GPUs to use. +gpus = "0" +gpulist = gpus.split(",") +num_gpus = len(gpulist) + +# The number does not matter since we do not do training with this script. 
+batch_size = 1 +accum_batch_size = 1 +iter_size = accum_batch_size / batch_size +solver_mode = P.Solver.CPU +device_id = 0 +batch_size_per_device = batch_size +if num_gpus > 0: + batch_size_per_device = int(math.ceil(float(batch_size) / num_gpus)) + iter_size = int(math.ceil(float(accum_batch_size) / (batch_size_per_device * num_gpus))) + solver_mode = P.Solver.GPU + device_id = int(gpulist[0]) + +if normalization_mode == P.Loss.NONE: + base_lr /= batch_size_per_device +elif normalization_mode == P.Loss.VALID: + base_lr *= 25. / loc_weight +elif normalization_mode == P.Loss.FULL: + # Roughly there are 2000 prior bboxes per image. + # TODO(weiliu89): Estimate the exact # of priors. + base_lr *= 2000. + +# Evaluate on whole test set. +num_test_image = 4952 +test_batch_size = 8 +# Ideally test_batch_size should be divisible by num_test_image, +# otherwise mAP will be slightly off the true value. +test_iter = int(math.ceil(float(num_test_image) / test_batch_size)) + +solver_param = { + # Train parameters + 'base_lr': base_lr, + 'weight_decay': 0.0005, + 'lr_policy': "multistep", + 'stepvalue': [80000, 100000, 120000], + 'gamma': 0.1, + 'momentum': 0.9, + 'iter_size': iter_size, + 'max_iter': 0, + 'snapshot': 0, + 'display': 10, + 'average_loss': 10, + 'type': "SGD", + 'solver_mode': solver_mode, + 'device_id': device_id, + 'debug_info': False, + 'snapshot_after_train': False, + # Test parameters + 'test_iter': [test_iter], + 'test_interval': 10000, + 'eval_type': "detection", + 'ap_version': "11point", + 'test_initialization': True, + } + +# parameters for generating detection output. 
+det_out_param = { + 'num_classes': num_classes, + 'share_location': share_location, + 'background_label_id': background_label_id, + 'nms_param': {'nms_threshold': 0.45, 'top_k': 400}, + 'save_output_param': { + 'output_directory': output_result_dir, + 'output_name_prefix': "comp4_det_test_", + 'output_format': "VOC", + 'label_map_file': label_map_file, + 'name_size_file': name_size_file, + 'num_test_image': num_test_image, + }, + 'keep_top_k': 200, + 'confidence_threshold': 0.01, + 'code_type': code_type, + } + +# parameters for evaluating detection results. +det_eval_param = { + 'num_classes': num_classes, + 'background_label_id': background_label_id, + 'overlap_threshold': 0.5, + 'evaluate_difficult_gt': False, + 'name_size_file': name_size_file, + } + +### Hopefully you don't need to change the following ### +# Check file. +check_if_exist(train_data) +check_if_exist(test_data) +check_if_exist(label_map_file) +check_if_exist(pretrain_model) +make_if_not_exist(save_dir) +make_if_not_exist(job_dir) +make_if_not_exist(snapshot_dir) + +# Create train net. +net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(train_data, batch_size=batch_size_per_device, + train=True, output_label=True, label_map_file=label_map_file, + transform_param=train_transform_param, batch_sampler=batch_sampler) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +# Create the MultiBoxLossLayer. 
+name = "mbox_loss" +mbox_layers.append(net.label) +net[name] = L.MultiBoxLoss(*mbox_layers, multibox_loss_param=multibox_loss_param, + loss_param=loss_param, include=dict(phase=caffe_pb2.Phase.Value('TRAIN')), + propagate_down=[True, True, False, False]) + +with open(train_net_file, 'w') as f: + print('name: "{}_train"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(train_net_file, job_dir) + +# Create test net. +net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(test_data, batch_size=test_batch_size, + train=False, output_label=True, label_map_file=label_map_file, + transform_param=test_transform_param) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +conf_name = "mbox_conf" +if multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.SOFTMAX: + reshape_name = "{}_reshape".format(conf_name) + net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes])) + softmax_name = "{}_softmax".format(conf_name) + net[softmax_name] = L.Softmax(net[reshape_name], axis=2) + flatten_name = "{}_flatten".format(conf_name) + net[flatten_name] = L.Flatten(net[softmax_name], axis=1) + mbox_layers[1] = net[flatten_name] +elif multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.LOGISTIC: + sigmoid_name = "{}_sigmoid".format(conf_name) + net[sigmoid_name] = L.Sigmoid(net[conf_name]) + mbox_layers[1] = net[sigmoid_name] + +net.detection_out = L.DetectionOutput(*mbox_layers, + detection_output_param=det_out_param, + 
include=dict(phase=caffe_pb2.Phase.Value('TEST'))) +net.detection_eval = L.DetectionEvaluate(net.detection_out, net.label, + detection_evaluate_param=det_eval_param, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) + +with open(test_net_file, 'w') as f: + print('name: "{}_test"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(test_net_file, job_dir) + +# Create deploy net. +# Remove the first and last layer from test net. +deploy_net = net +with open(deploy_net_file, 'w') as f: + net_param = deploy_net.to_proto() + # Remove the first (AnnotatedData) and last (DetectionEvaluate) layer from test net. + del net_param.layer[0] + del net_param.layer[-1] + net_param.name = '{}_deploy'.format(model_name) + net_param.input.extend(['data']) + net_param.input_shape.extend([ + caffe_pb2.BlobShape(dim=[1, 3, resize_height, resize_width])]) + print(net_param, file=f) +shutil.copy(deploy_net_file, job_dir) + +# Create solver. +solver = caffe_pb2.SolverParameter( + train_net=train_net_file, + test_net=[test_net_file], + snapshot_prefix=snapshot_prefix, + **solver_param) + +with open(solver_file, 'w') as f: + print(solver, file=f) +shutil.copy(solver_file, job_dir) + +# Create job file. +with open(job_file, 'w') as f: + f.write('cd {}\n'.format(caffe_root)) + f.write('./build/tools/caffe train \\\n') + f.write('--solver="{}" \\\n'.format(solver_file)) + f.write('--weights="{}" \\\n'.format(pretrain_model)) + if solver_param['solver_mode'] == P.Solver.GPU: + f.write('--gpu {} 2>&1 | tee {}/{}_test{}.log\n'.format(gpus, job_dir, model_name, max_iter)) + else: + f.write('2>&1 | tee {}/{}.log\n'.format(job_dir, model_name)) + +# Copy the python script to job_dir. +py_file = os.path.abspath(__file__) +shutil.copy(py_file, job_dir) + +# Run the job. 
+os.chmod(job_file, stat.S_IRWXU) +if run_soon: + subprocess.call(job_file, shell=True) diff --git a/examples/ssd/ssd_coco.py b/examples/ssd/ssd_coco.py new file mode 100644 index 00000000000..3296d7abb32 --- /dev/null +++ b/examples/ssd/ssd_coco.py @@ -0,0 +1,576 @@ +from __future__ import print_function +import caffe +from google.protobuf import text_format + +import math +import os +import shutil +import stat +import subprocess +import sys + +sys.path.insert(0, 'examples/ssd') +from model_libs import * + +# Add extra layers on top of a "base" network (e.g. VGGNet or Inception). +def AddExtraLayers(net, use_batchnorm=True, lr_mult=1): + use_relu = True + + # Add additional convolutional layers. + # 19 x 19 + from_layer = net.keys()[-1] + + # TODO(weiliu89): Construct the name using the last layer to avoid duplication. + # 10 x 10 + out_layer = "conv6_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv6_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2, + lr_mult=lr_mult) + + # 5 x 5 + from_layer = out_layer + out_layer = "conv7_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv7_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 1, 2, + lr_mult=lr_mult) + + # 3 x 3 + from_layer = out_layer + out_layer = "conv8_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv8_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + # 1 x 1 + from_layer = out_layer + out_layer = "conv9_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv9_2" + ConvBNLayer(net, from_layer, out_layer, 
use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + return net + + +### Modify the following parameters accordingly ### +# The directory which contains the caffe code. +# We assume you are running the script at the CAFFE_ROOT. +caffe_root = os.getcwd() + +# Set true if you want to start training right after generating all files. +run_soon = True +# Set true if you want to load from most recently saved snapshot. +# Otherwise, we will load from the pretrain_model defined below. +resume_training = True +# If true, Remove old model files. +remove_old_models = False + +# The database file for training data. Created by data/coco/create_data.sh +train_data = "examples/coco/coco_train_lmdb" +# The database file for testing data. Created by data/coco/create_data.sh +test_data = "examples/coco/coco_minival_lmdb" +# Specify the batch sampler. +resize_width = 300 +resize_height = 300 +resize = "{}x{}".format(resize_width, resize_height) +batch_sampler = [ + { + 'sampler': { + }, + 'max_trials': 1, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.1, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.3, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.5, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.7, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 
'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.9, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'max_jaccard_overlap': 1.0, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + ] +train_transform_param = { + 'mirror': True, + 'mean_value': [104, 117, 123], + 'force_color': True, + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [ + P.Resize.LINEAR, + P.Resize.AREA, + P.Resize.NEAREST, + P.Resize.CUBIC, + P.Resize.LANCZOS4, + ], + }, + 'distort_param': { + 'brightness_prob': 0.5, + 'brightness_delta': 32, + 'contrast_prob': 0.5, + 'contrast_lower': 0.5, + 'contrast_upper': 1.5, + 'hue_prob': 0.5, + 'hue_delta': 18, + 'saturation_prob': 0.5, + 'saturation_lower': 0.5, + 'saturation_upper': 1.5, + 'random_order_prob': 0.0, + }, + 'expand_param': { + 'prob': 0.5, + 'max_expand_ratio': 4.0, + }, + 'emit_constraint': { + 'emit_type': caffe_pb2.EmitConstraint.CENTER, + } + } +test_transform_param = { + 'mean_value': [104, 117, 123], + 'force_color': True, + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [P.Resize.LINEAR], + }, + } + +# If true, use batch norm for all newly added layers. +# Currently only the non batch norm version has been tested. +use_batchnorm = False +lr_mult = 1 +# Use different initial learning rate. +if use_batchnorm: + base_lr = 0.0004 +else: + # A learning rate for batch_size = 1, num_gpus = 1. + base_lr = 0.00004 + +# Modify the job name if you want. +job_name = "SSD_{}".format(resize) +# The name of the model. Modify it if you want. +model_name = "VGG_coco_{}".format(job_name) + +# Directory which stores the model .prototxt file. 
+save_dir = "models/VGGNet/coco/{}".format(job_name) +# Directory which stores the snapshot of models. +snapshot_dir = "models/VGGNet/coco/{}".format(job_name) +# Directory which stores the job script and log file. +job_dir = "jobs/VGGNet/coco/{}".format(job_name) +# Directory which stores the detection results. +output_result_dir = "{}/data/mscoco/results/{}".format(os.environ['HOME'], job_name) + +# model definition files. +train_net_file = "{}/train.prototxt".format(save_dir) +test_net_file = "{}/test.prototxt".format(save_dir) +deploy_net_file = "{}/deploy.prototxt".format(save_dir) +solver_file = "{}/solver.prototxt".format(save_dir) +# snapshot prefix. +snapshot_prefix = "{}/{}".format(snapshot_dir, model_name) +# job script path. +job_file = "{}/{}.sh".format(job_dir, model_name) + +# Stores the test image names and sizes. Created by data/coco/create_list.sh +name_size_file = "data/coco/test2014_name_size.txt" +# The pretrained model. We use the Fully convolutional reduced (atrous) VGGNet. +pretrain_model = "models/VGGNet/VGG_ILSVRC_16_layers_fc_reduced.caffemodel" +# Stores LabelMapItem. +label_map_file = "data/coco/labelmap_coco.prototxt" + +# MultiBoxLoss parameters. +num_classes = 81 +share_location = True +background_label_id=0 +train_on_diff_gt = False +normalization_mode = P.Loss.VALID +code_type = P.PriorBox.CENTER_SIZE +ignore_cross_boundary_bbox = False +mining_type = P.MultiBoxLoss.MAX_NEGATIVE +neg_pos_ratio = 3. +loc_weight = (neg_pos_ratio + 1.) / 4. 
+multibox_loss_param = { + 'loc_loss_type': P.MultiBoxLoss.SMOOTH_L1, + 'conf_loss_type': P.MultiBoxLoss.SOFTMAX, + 'loc_weight': loc_weight, + 'num_classes': num_classes, + 'share_location': share_location, + 'match_type': P.MultiBoxLoss.PER_PREDICTION, + 'overlap_threshold': 0.5, + 'use_prior_for_matching': True, + 'background_label_id': background_label_id, + 'use_difficult_gt': train_on_diff_gt, + 'mining_type': mining_type, + 'neg_pos_ratio': neg_pos_ratio, + 'neg_overlap': 0.5, + 'code_type': code_type, + 'ignore_cross_boundary_bbox': ignore_cross_boundary_bbox, + } +loss_param = { + 'normalization': normalization_mode, + } + +# parameters for generating priors. +# minimum dimension of input image +min_dim = 300 +# conv4_3 ==> 38 x 38 +# fc7 ==> 19 x 19 +# conv6_2 ==> 10 x 10 +# conv7_2 ==> 5 x 5 +# conv8_2 ==> 3 x 3 +# conv9_2 ==> 1 x 1 +mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2'] +# in percent % +min_ratio = 15 +max_ratio = 90 +step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2))) +min_sizes = [] +max_sizes = [] +for ratio in xrange(min_ratio, max_ratio + 1, step): + min_sizes.append(min_dim * ratio / 100.) + max_sizes.append(min_dim * (ratio + step) / 100.) +min_sizes = [min_dim * 7 / 100.] + min_sizes +max_sizes = [min_dim * 15 / 100.] + max_sizes +steps = [8, 16, 32, 64, 100, 300] +aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] +# L2 normalize conv4_3. +normalizations = [20, -1, -1, -1, -1, -1] +# variance used to encode/decode prior bboxes. +if code_type == P.PriorBox.CENTER_SIZE: + prior_variance = [0.1, 0.1, 0.2, 0.2] +else: + prior_variance = [0.1] +flip = True +clip = False + +# Solver parameters. +# Defining which GPUs to use. +gpus = "0,1" +gpulist = gpus.split(",") +num_gpus = len(gpulist) + +# Divide the mini-batch to different GPUs. 
+batch_size = 32 +accum_batch_size = 32 +iter_size = accum_batch_size / batch_size +solver_mode = P.Solver.CPU +device_id = 0 +batch_size_per_device = batch_size +if num_gpus > 0: + batch_size_per_device = int(math.ceil(float(batch_size) / num_gpus)) + iter_size = int(math.ceil(float(accum_batch_size) / (batch_size_per_device * num_gpus))) + solver_mode = P.Solver.GPU + device_id = int(gpulist[0]) + +if normalization_mode == P.Loss.NONE: + base_lr /= batch_size_per_device +elif normalization_mode == P.Loss.VALID: + base_lr *= 25. / loc_weight +elif normalization_mode == P.Loss.FULL: + # Roughly there are 2000 prior bboxes per image. + # TODO(weiliu89): Estimate the exact # of priors. + base_lr *= 2000. + +# Evaluate on whole test set. +num_test_image = 5000 +test_batch_size = 8 +test_iter = num_test_image / test_batch_size + +solver_param = { + # Train parameters + 'base_lr': base_lr, + 'weight_decay': 0.0005, + 'lr_policy': "multistep", + 'stepvalue': [280000, 360000, 400000], + 'gamma': 0.1, + 'momentum': 0.9, + 'iter_size': iter_size, + 'max_iter': 400000, + 'snapshot': 40000, + 'display': 10, + 'average_loss': 10, + 'type': "SGD", + 'solver_mode': solver_mode, + 'device_id': device_id, + 'debug_info': False, + 'snapshot_after_train': True, + # Test parameters + 'test_iter': [test_iter], + 'test_interval': 10000, + 'eval_type': "detection", + 'ap_version': "11point", + 'test_initialization': False, + } + +# parameters for generating detection output. 
+det_out_param = { + 'num_classes': num_classes, + 'share_location': share_location, + 'background_label_id': background_label_id, + 'nms_param': {'nms_threshold': 0.45, 'top_k': 400}, + 'save_output_param': { + 'output_directory': output_result_dir, + 'output_name_prefix': "detections_minival_ssd300_results", + 'output_format': "COCO", + 'label_map_file': label_map_file, + 'name_size_file': name_size_file, + 'num_test_image': num_test_image, + }, + 'keep_top_k': 200, + 'confidence_threshold': 0.01, + 'code_type': code_type, + } + +# parameters for evaluating detection results. +det_eval_param = { + 'num_classes': num_classes, + 'background_label_id': background_label_id, + 'overlap_threshold': 0.5, + 'evaluate_difficult_gt': False, + 'name_size_file': name_size_file, + } + +### Hopefully you don't need to change the following ### +# Check file. +check_if_exist(train_data) +check_if_exist(test_data) +check_if_exist(label_map_file) +check_if_exist(pretrain_model) +make_if_not_exist(save_dir) +make_if_not_exist(job_dir) +make_if_not_exist(snapshot_dir) + +# Create train net. +net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(train_data, batch_size=batch_size_per_device, + train=True, output_label=True, label_map_file=label_map_file, + transform_param=train_transform_param, batch_sampler=batch_sampler) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +# Create the MultiBoxLossLayer. 
+name = "mbox_loss" +mbox_layers.append(net.label) +net[name] = L.MultiBoxLoss(*mbox_layers, multibox_loss_param=multibox_loss_param, + loss_param=loss_param, include=dict(phase=caffe_pb2.Phase.Value('TRAIN')), + propagate_down=[True, True, False, False]) + +with open(train_net_file, 'w') as f: + print('name: "{}_train"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(train_net_file, job_dir) + +# Create test net. +net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(test_data, batch_size=test_batch_size, + train=False, output_label=True, label_map_file=label_map_file, + transform_param=test_transform_param) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +conf_name = "mbox_conf" +if multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.SOFTMAX: + reshape_name = "{}_reshape".format(conf_name) + net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes])) + softmax_name = "{}_softmax".format(conf_name) + net[softmax_name] = L.Softmax(net[reshape_name], axis=2) + flatten_name = "{}_flatten".format(conf_name) + net[flatten_name] = L.Flatten(net[softmax_name], axis=1) + mbox_layers[1] = net[flatten_name] +elif multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.LOGISTIC: + sigmoid_name = "{}_sigmoid".format(conf_name) + net[sigmoid_name] = L.Sigmoid(net[conf_name]) + mbox_layers[1] = net[sigmoid_name] + +net.detection_out = L.DetectionOutput(*mbox_layers, + detection_output_param=det_out_param, + 
include=dict(phase=caffe_pb2.Phase.Value('TEST'))) +net.detection_eval = L.DetectionEvaluate(net.detection_out, net.label, + detection_evaluate_param=det_eval_param, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) + +with open(test_net_file, 'w') as f: + print('name: "{}_test"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(test_net_file, job_dir) + +# Create deploy net. +# Remove the first and last layer from test net. +deploy_net = net +with open(deploy_net_file, 'w') as f: + net_param = deploy_net.to_proto() + # Remove the first (AnnotatedData) and last (DetectionEvaluate) layer from test net. + del net_param.layer[0] + del net_param.layer[-1] + net_param.name = '{}_deploy'.format(model_name) + net_param.input.extend(['data']) + net_param.input_shape.extend([ + caffe_pb2.BlobShape(dim=[1, 3, resize_height, resize_width])]) + print(net_param, file=f) +shutil.copy(deploy_net_file, job_dir) + +# Create solver. +solver = caffe_pb2.SolverParameter( + train_net=train_net_file, + test_net=[test_net_file], + snapshot_prefix=snapshot_prefix, + **solver_param) + +with open(solver_file, 'w') as f: + print(solver, file=f) +shutil.copy(solver_file, job_dir) + +max_iter = 0 +# Find most recent snapshot. +for file in os.listdir(snapshot_dir): + if file.endswith(".solverstate"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if iter > max_iter: + max_iter = iter + +train_src_param = '--weights="{}" \\\n'.format(pretrain_model) +if resume_training: + if max_iter > 0: + train_src_param = '--snapshot="{}_iter_{}.solverstate" \\\n'.format(snapshot_prefix, max_iter) + +if remove_old_models: + # Remove any snapshots smaller than max_iter. 
+ for file in os.listdir(snapshot_dir): + if file.endswith(".solverstate"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if max_iter > iter: + os.remove("{}/{}".format(snapshot_dir, file)) + if file.endswith(".caffemodel"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if max_iter > iter: + os.remove("{}/{}".format(snapshot_dir, file)) + +# Create job file. +with open(job_file, 'w') as f: + f.write('cd {}\n'.format(caffe_root)) + f.write('./build/tools/caffe train \\\n') +# f.write('./cmake-build-debug/tools/caffe-d train \\\n') + f.write('--solver="{}" \\\n'.format(solver_file)) + f.write(train_src_param) + if solver_param['solver_mode'] == P.Solver.GPU: + f.write('--gpu {} 2>&1 | tee {}/{}.log\n'.format(gpus, job_dir, model_name)) + else: + f.write('2>&1 | tee {}/{}.log\n'.format(job_dir, model_name)) + +# Copy the python script to job_dir. +py_file = os.path.abspath(__file__) +shutil.copy(py_file, job_dir) + +# Run the job. +os.chmod(job_file, stat.S_IRWXU) +if run_soon: + subprocess.call(job_file, shell=True) diff --git a/examples/ssd/ssd_detect.cpp b/examples/ssd/ssd_detect.cpp new file mode 100644 index 00000000000..11d3dc3bd5e --- /dev/null +++ b/examples/ssd/ssd_detect.cpp @@ -0,0 +1,346 @@ +// This is a demo code for using a SSD model to do detection. +// The code is modified from examples/cpp_classification/classification.cpp. 
+// Usage: +// ssd_detect [FLAGS] model_file weights_file list_file +// +// where model_file is the .prototxt file defining the network architecture, and +// weights_file is the .caffemodel file containing the network parameters, and +// list_file contains a list of image files with the format as follows: +// folder/img1.JPEG +// folder/img2.JPEG +// list_file can also contain a list of video files with the format as follows: +// folder/video1.mp4 +// folder/video2.mp4 +// +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace caffe; // NOLINT(build/namespaces) + +class Detector { + public: + Detector(const string& model_file, + const string& weights_file, + const string& mean_file, + const string& mean_value); + + std::vector > Detect(const cv::Mat& img); + + private: + void SetMean(const string& mean_file, const string& mean_value); + + void WrapInputLayer(std::vector* input_channels); + + void Preprocess(const cv::Mat& img, + std::vector* input_channels); + + private: + shared_ptr net_; + cv::Size input_geometry_; + int num_channels_; + cv::Mat mean_; +}; + +Detector::Detector(const string& model_file, + const string& weights_file, + const string& mean_file, + const string& mean_value) { + Caffe::set_mode(Caffe::GPU); + + /* Load the network. */ + net_.reset(new Net(model_file, TEST)); + net_->CopyTrainedLayersFrom(weights_file); + + CHECK_EQ(net_->num_inputs(), 1) << "Network should have exactly one input."; + CHECK_EQ(net_->num_outputs(), 1) << "Network should have exactly one output."; + + Blob* input_layer = net_->input_blobs()[0]; + num_channels_ = input_layer->channels(); + CHECK(num_channels_ == 3 || num_channels_ == 1) + << "Input layer should have 1 or 3 channels."; + input_geometry_ = cv::Size(input_layer->width(), input_layer->height()); + + /* Load the binaryproto mean file. 
*/ + SetMean(mean_file, mean_value); +} + +std::vector > Detector::Detect(const cv::Mat& img) { + Blob* input_layer = net_->input_blobs()[0]; + input_layer->Reshape(1, num_channels_, + input_geometry_.height, input_geometry_.width); + /* Forward dimension change to all layers. */ + net_->Reshape(); + + std::vector input_channels; + WrapInputLayer(&input_channels); + + Preprocess(img, &input_channels); + + net_->Forward(); + + /* Copy the output layer to a std::vector */ + Blob* result_blob = net_->output_blobs()[0]; + const float* result = result_blob->cpu_data(); + const int num_det = result_blob->height(); + vector > detections; + for (int k = 0; k < num_det; ++k) { + if (result[0] == -1) { + // Skip invalid detection. + result += 7; + continue; + } + vector detection(result, result + 7); + detections.push_back(detection); + result += 7; + } + return detections; +} + +/* Load the mean file in binaryproto format. */ +void Detector::SetMean(const string& mean_file, const string& mean_value) { + cv::Scalar channel_mean; + if (!mean_file.empty()) { + CHECK(mean_value.empty()) << + "Cannot specify mean_file and mean_value at the same time"; + BlobProto blob_proto; + ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); + + /* Convert from BlobProto to Blob */ + TBlob mean_blob; + mean_blob.FromProto(blob_proto); + CHECK_EQ(mean_blob.channels(), num_channels_) + << "Number of channels of mean file doesn't match input layer."; + + /* The format of the mean file is planar 32-bit float BGR or grayscale. */ + std::vector channels; + float* data = mean_blob.mutable_cpu_data(); + for (int i = 0; i < num_channels_; ++i) { + /* Extract an individual channel. */ + cv::Mat channel(mean_blob.height(), mean_blob.width(), CV_32FC1, data); + channels.push_back(channel); + data += mean_blob.height() * mean_blob.width(); + } + + /* Merge the separate channels into a single image. 
*/ + cv::Mat mean; + cv::merge(channels, mean); + + /* Compute the global mean pixel value and create a mean image + * filled with this value. */ + channel_mean = cv::mean(mean); + mean_ = cv::Mat(input_geometry_, mean.type(), channel_mean); + } + if (!mean_value.empty()) { + CHECK(mean_file.empty()) << + "Cannot specify mean_file and mean_value at the same time"; + stringstream ss(mean_value); + vector values; + string item; + while (getline(ss, item, ',')) { + float value = std::atof(item.c_str()); + values.push_back(value); + } + CHECK(values.size() == 1 || values.size() == num_channels_) << + "Specify either 1 mean_value or as many as channels: " << num_channels_; + + std::vector channels; + for (int i = 0; i < num_channels_; ++i) { + /* Extract an individual channel. */ + cv::Mat channel(input_geometry_.height, input_geometry_.width, CV_32FC1, + cv::Scalar(values[i])); + channels.push_back(channel); + } + cv::merge(channels, mean_); + } +} + +/* Wrap the input layer of the network in separate cv::Mat objects + * (one per channel). This way we save one memcpy operation and we + * don't need to rely on cudaMemcpy2D. The last preprocessing + * operation will write the separate channels directly to the input + * layer. */ +void Detector::WrapInputLayer(std::vector* input_channels) { + Blob* input_layer = net_->input_blobs()[0]; + + int width = input_layer->width(); + int height = input_layer->height(); + float* input_data = input_layer->mutable_cpu_data(); + for (int i = 0; i < input_layer->channels(); ++i) { + cv::Mat channel(height, width, CV_32FC1, input_data); + input_channels->push_back(channel); + input_data += width * height; + } +} + +void Detector::Preprocess(const cv::Mat& img, + std::vector* input_channels) { + /* Convert the input image to the input image format of the network. 
*/ + cv::Mat sample; + if (img.channels() == 3 && num_channels_ == 1) + cv::cvtColor(img, sample, cv::COLOR_BGR2GRAY); + else if (img.channels() == 4 && num_channels_ == 1) + cv::cvtColor(img, sample, cv::COLOR_BGRA2GRAY); + else if (img.channels() == 4 && num_channels_ == 3) + cv::cvtColor(img, sample, cv::COLOR_BGRA2BGR); + else if (img.channels() == 1 && num_channels_ == 3) + cv::cvtColor(img, sample, cv::COLOR_GRAY2BGR); + else + sample = img; + + cv::Mat sample_resized; + if (sample.size() != input_geometry_) + cv::resize(sample, sample_resized, input_geometry_); + else + sample_resized = sample; + + cv::Mat sample_float; + if (num_channels_ == 3) + sample_resized.convertTo(sample_float, CV_32FC3); + else + sample_resized.convertTo(sample_float, CV_32FC1); + + cv::Mat sample_normalized; + cv::subtract(sample_float, mean_, sample_normalized); + + /* This operation will write the separate BGR planes directly to the + * input layer of the network because it is wrapped by the cv::Mat + * objects in input_channels. */ + cv::split(sample_normalized, *input_channels); + + CHECK(reinterpret_cast(input_channels->at(0).data) + == net_->input_blobs()[0]->cpu_data()) + << "Input channels are not wrapping the input layer of the network."; +} + +DEFINE_string(mean_file, "", + "The mean file used to subtract from the input image."); +DEFINE_string(mean_value, "104,117,123", + "If specified, can be one value or can be same as image channels" + " - would subtract from the corresponding channel). Separated by ','." + "Either mean_file or mean_value should be provided, not both."); +DEFINE_string(file_type, "image", + "The file type in the list_file. 
Currently support image and video."); +DEFINE_string(out_file, "", + "If provided, store the detection results in the out_file."); +DEFINE_double(confidence_threshold, 0.01, + "Only store detections with score higher than the threshold."); + +int main(int argc, char** argv) { + ::google::InitGoogleLogging(argv[0]); + // Print output to stderr (while still logging) + FLAGS_alsologtostderr = 1; + +#ifndef GFLAGS_GFLAGS_H_ + namespace gflags = google; +#endif + + gflags::SetUsageMessage("Do detection using SSD mode.\n" + "Usage:\n" + " ssd_detect [FLAGS] model_file weights_file list_file\n"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + if (argc < 4) { + gflags::ShowUsageWithFlagsRestrict(argv[0], "examples/ssd/ssd_detect"); + return 1; + } + + const string& model_file = argv[1]; + const string& weights_file = argv[2]; + const string& mean_file = FLAGS_mean_file; + const string& mean_value = FLAGS_mean_value; + const string& file_type = FLAGS_file_type; + const string& out_file = FLAGS_out_file; + const float confidence_threshold = FLAGS_confidence_threshold; + + // Initialize the network. + Detector detector(model_file, weights_file, mean_file, mean_value); + + // Set the output mode. + std::streambuf* buf = std::cout.rdbuf(); + std::ofstream outfile; + if (!out_file.empty()) { + outfile.open(out_file.c_str()); + if (outfile.good()) { + buf = outfile.rdbuf(); + } + } + std::ostream out(buf); + + // Process image one by one. + std::ifstream infile(argv[3]); + std::string file; + while (infile >> file) { + if (file_type == "image") { + cv::Mat img = cv::imread(file, -1); + CHECK(!img.empty()) << "Unable to decode image " << file; + std::vector > detections = detector.Detect(img); + + /* Print the detection results. */ + for (int i = 0; i < detections.size(); ++i) { + const vector& d = detections[i]; + // Detection format: [image_id, label, score, xmin, ymin, xmax, ymax]. 
+ CHECK_EQ(d.size(), 7); + const float score = d[2]; + if (score >= confidence_threshold) { + out << file << " "; + out << static_cast(d[1]) << " "; + out << score << " "; + out << static_cast(d[3] * img.cols) << " "; + out << static_cast(d[4] * img.rows) << " "; + out << static_cast(d[5] * img.cols) << " "; + out << static_cast(d[6] * img.rows) << std::endl; + } + } + } else if (file_type == "video") { + cv::VideoCapture cap(file); + if (!cap.isOpened()) { + LOG(FATAL) << "Failed to open video: " << file; + } + cv::Mat img; + int frame_count = 0; + while (true) { + bool success = cap.read(img); + if (!success) { + LOG(INFO) << "Process " << frame_count << " frames from " << file; + break; + } + CHECK(!img.empty()) << "Error when read frame"; + std::vector > detections = detector.Detect(img); + + /* Print the detection results. */ + for (int i = 0; i < detections.size(); ++i) { + const vector& d = detections[i]; + // Detection format: [image_id, label, score, xmin, ymin, xmax, ymax]. + CHECK_EQ(d.size(), 7); + const float score = d[2]; + if (score >= confidence_threshold) { + out << file << "_"; + out << std::setfill('0') << std::setw(6) << frame_count << " "; + out << static_cast(d[1]) << " "; + out << score << " "; + out << static_cast(d[3] * img.cols) << " "; + out << static_cast(d[4] * img.rows) << " "; + out << static_cast(d[5] * img.cols) << " "; + out << static_cast(d[6] * img.rows) << std::endl; + } + } + ++frame_count; + } + if (cap.isOpened()) { + cap.release(); + } + } else { + LOG(FATAL) << "Unknown file_type: " << file_type; + } + } + return 0; +} diff --git a/examples/ssd/ssd_detect.py b/examples/ssd/ssd_detect.py new file mode 100644 index 00000000000..945930bea03 --- /dev/null +++ b/examples/ssd/ssd_detect.py @@ -0,0 +1,149 @@ +#encoding=utf8 +''' +Detection with SSD +In this example, we will load a SSD model and use it to detect objects. 
+''' + +import os +import sys +import argparse +import numpy as np +from PIL import Image, ImageDraw +# Make sure that caffe is on the python path: +caffe_root = './' +os.chdir(caffe_root) +sys.path.insert(0, os.path.join(caffe_root, 'python')) +import caffe + +from google.protobuf import text_format +from caffe.proto import caffe_pb2 + + +def get_labelname(labelmap, labels): + num_labels = len(labelmap.item) + labelnames = [] + if type(labels) is not list: + labels = [labels] + for label in labels: + found = False + for i in xrange(0, num_labels): + if label == labelmap.item[i].label: + found = True + labelnames.append(labelmap.item[i].display_name) + break + assert found == True + return labelnames + +class CaffeDetection: + def __init__(self, gpu_id, model_def, model_weights, image_resize, labelmap_file): + caffe.set_device(gpu_id) + caffe.set_mode_gpu() + + self.image_resize = image_resize + # Load the net in the test phase for inference, and configure input preprocessing. + self.net = caffe.Net(model_def, # defines the structure of the model + model_weights, # contains the trained weights + caffe.TEST) # use test mode (e.g., don't perform dropout) + # input preprocessing: 'data' is the name of the input blob == net.inputs[0] + self.transformer = caffe.io.Transformer({'data': self.net.blobs['data'].data.shape}) + self.transformer.set_transpose('data', (2, 0, 1)) + self.transformer.set_mean('data', np.array([104, 117, 123])) # mean pixel + # the reference model operates on images in [0,255] range instead of [0,1] + self.transformer.set_raw_scale('data', 255) + # the reference model has channels in BGR order instead of RGB + self.transformer.set_channel_swap('data', (2, 1, 0)) + + # load PASCAL VOC labels + file = open(labelmap_file, 'r') + self.labelmap = caffe_pb2.LabelMap() + text_format.Merge(str(file.read()), self.labelmap) + + def detect(self, image_file, conf_thresh=0.5, topn=5): + ''' + SSD detection + ''' + # set net to batch size of 1 + # image_resize = 
300 + self.net.blobs['data'].reshape(1, 3, self.image_resize, self.image_resize) + image = caffe.io.load_image(image_file) + + #Run the net and examine the top_k results + transformed_image = self.transformer.preprocess('data', image) + self.net.blobs['data'].data[...] = transformed_image + + # Forward pass. + detections = self.net.forward()['detection_out'] + + # Parse the outputs. + det_label = detections[0,0,:,1] + det_conf = detections[0,0,:,2] + det_xmin = detections[0,0,:,3] + det_ymin = detections[0,0,:,4] + det_xmax = detections[0,0,:,5] + det_ymax = detections[0,0,:,6] + + # Get detections with confidence higher than 0.6. + top_indices = [i for i, conf in enumerate(det_conf) if conf >= conf_thresh] + + top_conf = det_conf[top_indices] + top_label_indices = det_label[top_indices].tolist() + top_labels = get_labelname(self.labelmap, top_label_indices) + top_xmin = det_xmin[top_indices] + top_ymin = det_ymin[top_indices] + top_xmax = det_xmax[top_indices] + top_ymax = det_ymax[top_indices] + + result = [] + for i in xrange(min(topn, top_conf.shape[0])): + xmin = top_xmin[i] # xmin = int(round(top_xmin[i] * image.shape[1])) + ymin = top_ymin[i] # ymin = int(round(top_ymin[i] * image.shape[0])) + xmax = top_xmax[i] # xmax = int(round(top_xmax[i] * image.shape[1])) + ymax = top_ymax[i] # ymax = int(round(top_ymax[i] * image.shape[0])) + score = top_conf[i] + label = int(top_label_indices[i]) + label_name = top_labels[i] + result.append([xmin, ymin, xmax, ymax, label, score, label_name]) + return result + +def main(args): + '''main ''' + detection = CaffeDetection(args.gpu_id, + args.model_def, args.model_weights, + args.image_resize, args.labelmap_file) + result = detection.detect(args.image_file) + print result + + img = Image.open(args.image_file) + draw = ImageDraw.Draw(img) + width, height = img.size + print width, height + for item in result: + xmin = int(round(item[0] * width)) + ymin = int(round(item[1] * height)) + xmax = int(round(item[2] * width)) + 
ymax = int(round(item[3] * height)) + draw.rectangle([xmin, ymin, xmax, ymax], outline=(255, 0, 0)) + draw.text([xmin, ymin], item[-1] + str(item[-2]), (0, 0, 255)) + print item + print [xmin, ymin, xmax, ymax] + print [xmin, ymin], item[-1] + img.save('detect_result.jpg') + + +def parse_args(): + '''parse args''' + parser = argparse.ArgumentParser() + parser.add_argument('--gpu_id', type=int, default=0, help='gpu id') + parser.add_argument('--labelmap_file', + default='data/VOC0712/labelmap_voc.prototxt') + parser.add_argument('--model_def', + default='models/VGGNet/VOC0712/SSD_300x300/deploy.prototxt') + parser.add_argument('--image_resize', default=300, type=int) + parser.add_argument('--model_weights', + default='models/VGGNet/VOC0712/SSD_300x300/' + 'VGG_VOC0712_SSD_300x300_iter_120000.caffemodel') + parser.add_argument('--image_file', default='examples/images/fish-bike.jpg') + return parser.parse_args() + +if __name__ == '__main__': + main(parse_args()) diff --git a/examples/ssd/ssd_ilsvrc.py b/examples/ssd/ssd_ilsvrc.py new file mode 100644 index 00000000000..932bc0da2e4 --- /dev/null +++ b/examples/ssd/ssd_ilsvrc.py @@ -0,0 +1,575 @@ +from __future__ import print_function +import caffe +from google.protobuf import text_format + +import math +import os +import shutil +import stat +import subprocess +import sys + +sys.path.insert(0, 'examples/ssd') +from model_libs import * + +# Add extra layers on top of a "base" network (e.g. VGGNet or Inception). +def AddExtraLayers(net, use_batchnorm=True, lr_mult=1): + use_relu = True + + # Add additional convolutional layers. + # 19 x 19 + from_layer = net.keys()[-1] + + # TODO(weiliu89): Construct the name using the last layer to avoid duplication. 
+ # 10 x 10 + out_layer = "conv6_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv6_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2, + lr_mult=lr_mult) + + # 5 x 5 + from_layer = out_layer + out_layer = "conv7_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv7_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 1, 2, + lr_mult=lr_mult) + + # 3 x 3 + from_layer = out_layer + out_layer = "conv8_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv8_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + # 1 x 1 + from_layer = out_layer + out_layer = "conv9_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv9_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + return net + + +### Modify the following parameters accordingly ### +# The directory which contains the caffe code. +# We assume you are running the script at the CAFFE_ROOT. +caffe_root = os.getcwd() + +# Set true if you want to start training right after generating all files. +run_soon = True +# Set true if you want to load from most recently saved snapshot. +# Otherwise, we will load from the pretrain_model defined below. +resume_training = True +# If true, Remove old model files. +remove_old_models = True + +# The database file for training data. Created by data/ILSVRC2016/create_data.sh +train_data = "examples/ILSVRC2016/ILSVRC2016_trainval1_lmdb" +# The database file for testing data. 
Created by data/ILSVRC2016/create_data.sh +test_data = "examples/ILSVRC2016/ILSVRC2016_val2_lmdb" +# Specify the batch sampler. +resize_width = 300 +resize_height = 300 +resize = "{}x{}".format(resize_width, resize_height) +batch_sampler = [ + { + 'sampler': { + }, + 'max_trials': 1, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.1, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.3, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.5, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.7, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.9, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'max_jaccard_overlap': 1.0, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + ] +train_transform_param = { + 'mirror': True, + 'mean_value': [104, 117, 123], + 'force_color': True, + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [ + P.Resize.LINEAR, + P.Resize.AREA, + P.Resize.NEAREST, + P.Resize.CUBIC, + P.Resize.LANCZOS4, + ], + }, + 
'distort_param': { + 'brightness_prob': 0.5, + 'brightness_delta': 32, + 'contrast_prob': 0.5, + 'contrast_lower': 0.5, + 'contrast_upper': 1.5, + 'hue_prob': 0.5, + 'hue_delta': 18, + 'saturation_prob': 0.5, + 'saturation_lower': 0.5, + 'saturation_upper': 1.5, + 'random_order_prob': 0.0, + }, + 'expand_param': { + 'prob': 0.5, + 'max_expand_ratio': 4.0, + }, + 'emit_constraint': { + 'emit_type': caffe_pb2.EmitConstraint.CENTER, + } + } +test_transform_param = { + 'mean_value': [104, 117, 123], + 'force_color': True, + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [P.Resize.LINEAR], + }, + } + +# If true, use batch norm for all newly added layers. +# Currently only the non batch norm version has been tested. +use_batchnorm = False +lr_mult = 1 +# Use different initial learning rate. +if use_batchnorm: + base_lr = 0.0004 +else: + # A learning rate for batch_size = 1, num_gpus = 1. + base_lr = 0.00004 + +# Modify the job name if you want. +job_name = "SSD_{}".format(resize) +# The name of the model. Modify it if you want. +model_name = "VGG_ILSVRC2016_{}".format(job_name) + +# Directory which stores the model .prototxt file. +save_dir = "models/VGGNet/ILSVRC2016/{}".format(job_name) +# Directory which stores the snapshot of models. +snapshot_dir = "models/VGGNet/ILSVRC2016/{}".format(job_name) +# Directory which stores the job script and log file. +job_dir = "jobs/VGGNet/ILSVRC2016/{}".format(job_name) +# Directory which stores the detection results. +output_result_dir = "{}/data/ILSVRC2016/results/{}".format(os.environ['HOME'], job_name) + +# model definition files. +train_net_file = "{}/train.prototxt".format(save_dir) +test_net_file = "{}/test.prototxt".format(save_dir) +deploy_net_file = "{}/deploy.prototxt".format(save_dir) +solver_file = "{}/solver.prototxt".format(save_dir) +# snapshot prefix. +snapshot_prefix = "{}/{}".format(snapshot_dir, model_name) +# job script path. 
+job_file = "{}/{}.sh".format(job_dir, model_name) + +# Stores the test image names and sizes. Created by data/ILSVRC2016/create_list.py +name_size_file = "data/ILSVRC2016/val2_name_size.txt" +# The pretrained model. We use the Fully convolutional reduced (atrous) VGGNet. +pretrain_model = "models/VGGNet/VGG_ILSVRC_16_layers_fc_reduced.caffemodel" +# Stores LabelMapItem. +label_map_file = "data/ILSVRC2016/labelmap_ilsvrc_det.prototxt" + +# MultiBoxLoss parameters. +num_classes = 201 +share_location = True +background_label_id=0 +train_on_diff_gt = False +normalization_mode = P.Loss.VALID +code_type = P.PriorBox.CENTER_SIZE +ignore_cross_boundary_bbox = False +mining_type = P.MultiBoxLoss.MAX_NEGATIVE +neg_pos_ratio = 3. +loc_weight = (neg_pos_ratio + 1.) / 4. +multibox_loss_param = { + 'loc_loss_type': P.MultiBoxLoss.SMOOTH_L1, + 'conf_loss_type': P.MultiBoxLoss.SOFTMAX, + 'loc_weight': loc_weight, + 'num_classes': num_classes, + 'share_location': share_location, + 'match_type': P.MultiBoxLoss.PER_PREDICTION, + 'overlap_threshold': 0.5, + 'use_prior_for_matching': True, + 'background_label_id': background_label_id, + 'use_difficult_gt': train_on_diff_gt, + 'mining_type': mining_type, + 'neg_pos_ratio': neg_pos_ratio, + 'neg_overlap': 0.5, + 'code_type': code_type, + 'ignore_cross_boundary_bbox': ignore_cross_boundary_bbox, + } +loss_param = { + 'normalization': normalization_mode, + } + +# parameters for generating priors. +# minimum dimension of input image +min_dim = 300 +# conv4_3 ==> 38 x 38 +# fc7 ==> 19 x 19 +# conv6_2 ==> 10 x 10 +# conv7_2 ==> 5 x 5 +# conv8_2 ==> 3 x 3 +# conv9_2 ==> 1 x 1 +mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2'] +# in percent % +min_ratio = 20 +max_ratio = 90 +step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2))) +min_sizes = [] +max_sizes = [] +for ratio in xrange(min_ratio, max_ratio + 1, step): + min_sizes.append(min_dim * ratio / 100.) 
+ max_sizes.append(min_dim * (ratio + step) / 100.) +min_sizes = [min_dim * 10 / 100.] + min_sizes +max_sizes = [min_dim * 20 / 100.] + max_sizes +steps = [8, 16, 32, 64, 100, 300] +aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] +# L2 normalize conv4_3. +normalizations = [20, -1, -1, -1, -1, -1] +# variance used to encode/decode prior bboxes. +if code_type == P.PriorBox.CENTER_SIZE: + prior_variance = [0.1, 0.1, 0.2, 0.2] +else: + prior_variance = [0.1] +flip = True +clip = False + +# Solver parameters. +# Defining which GPUs to use. +gpus = "0,1,2,3" +gpulist = gpus.split(",") +num_gpus = len(gpulist) + +# Divide the mini-batch to different GPUs. +batch_size = 32 +accum_batch_size = 32 +iter_size = accum_batch_size / batch_size +solver_mode = P.Solver.CPU +device_id = 0 +batch_size_per_device = batch_size +if num_gpus > 0: + batch_size_per_device = int(math.ceil(float(batch_size) / num_gpus)) + iter_size = int(math.ceil(float(accum_batch_size) / (batch_size_per_device * num_gpus))) + solver_mode = P.Solver.GPU + device_id = int(gpulist[0]) + +if normalization_mode == P.Loss.NONE: + base_lr /= batch_size_per_device +elif normalization_mode == P.Loss.VALID: + base_lr *= 25. / loc_weight +elif normalization_mode == P.Loss.FULL: + # Roughly there are 2000 prior bboxes per image. + # TODO(weiliu89): Estimate the exact # of priors. + base_lr *= 2000. + +# Evaluate on whole test set. 
+num_test_image = 9917 +test_batch_size = 1 +test_iter = num_test_image / test_batch_size + +solver_param = { + # Train parameters + 'base_lr': base_lr, + 'weight_decay': 0.0005, + 'lr_policy': "multistep", + 'stepvalue': [320000, 400000, 440000], + 'gamma': 0.1, + 'momentum': 0.9, + 'iter_size': iter_size, + 'max_iter': 440000, + 'snapshot': 30000, + 'display': 10, + 'average_loss': 10, + 'type': "SGD", + 'solver_mode': solver_mode, + 'device_id': device_id, + 'debug_info': False, + 'snapshot_after_train': True, + # Test parameters + 'test_iter': [test_iter], + 'test_interval': 10000, + 'eval_type': "detection", + 'ap_version': "MaxIntegral", + 'test_initialization': False, + } + +# parameters for generating detection output. +det_out_param = { + 'num_classes': num_classes, + 'share_location': share_location, + 'background_label_id': background_label_id, + 'nms_param': {'nms_threshold': 0.45, 'top_k': 400}, + 'save_output_param': { + 'output_directory': output_result_dir, + 'output_name_prefix': "val2_ssd{}_results".format(min_dim), + 'output_format': "ILSVRC", + 'label_map_file': label_map_file, + 'name_size_file': name_size_file, + 'num_test_image': num_test_image, + }, + 'keep_top_k': 200, + 'confidence_threshold': 0.01, + 'code_type': code_type, + } + +# parameters for evaluating detection results. +det_eval_param = { + 'num_classes': num_classes, + 'background_label_id': background_label_id, + 'overlap_threshold': 0.5, + 'evaluate_difficult_gt': False, + 'name_size_file': name_size_file, + } + +### Hopefully you don't need to change the following ### +# Check file. +check_if_exist(train_data) +check_if_exist(test_data) +check_if_exist(label_map_file) +check_if_exist(pretrain_model) +make_if_not_exist(save_dir) +make_if_not_exist(job_dir) +make_if_not_exist(snapshot_dir) + +# Create train net. 
+net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(train_data, batch_size=batch_size_per_device, + train=True, output_label=True, label_map_file=label_map_file, + transform_param=train_transform_param, batch_sampler=batch_sampler) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +# Create the MultiBoxLossLayer. +name = "mbox_loss" +mbox_layers.append(net.label) +net[name] = L.MultiBoxLoss(*mbox_layers, multibox_loss_param=multibox_loss_param, + loss_param=loss_param, include=dict(phase=caffe_pb2.Phase.Value('TRAIN')), + propagate_down=[True, True, False, False]) + +with open(train_net_file, 'w') as f: + print('name: "{}_train"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(train_net_file, job_dir) + +# Create test net. 
+net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(test_data, batch_size=test_batch_size, + train=False, output_label=True, label_map_file=label_map_file, + transform_param=test_transform_param) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +conf_name = "mbox_conf" +if multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.SOFTMAX: + reshape_name = "{}_reshape".format(conf_name) + net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes])) + softmax_name = "{}_softmax".format(conf_name) + net[softmax_name] = L.Softmax(net[reshape_name], axis=2) + flatten_name = "{}_flatten".format(conf_name) + net[flatten_name] = L.Flatten(net[softmax_name], axis=1) + mbox_layers[1] = net[flatten_name] +elif multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.LOGISTIC: + sigmoid_name = "{}_sigmoid".format(conf_name) + net[sigmoid_name] = L.Sigmoid(net[conf_name]) + mbox_layers[1] = net[sigmoid_name] + +net.detection_out = L.DetectionOutput(*mbox_layers, + detection_output_param=det_out_param, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) +net.detection_eval = L.DetectionEvaluate(net.detection_out, net.label, + detection_evaluate_param=det_eval_param, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) + +with open(test_net_file, 'w') as f: + print('name: "{}_test"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(test_net_file, job_dir) + +# Create deploy net. +# Remove the first and last layer from test net. 
+deploy_net = net +with open(deploy_net_file, 'w') as f: + net_param = deploy_net.to_proto() + # Remove the first (AnnotatedData) and last (DetectionEvaluate) layer from test net. + del net_param.layer[0] + del net_param.layer[-1] + net_param.name = '{}_deploy'.format(model_name) + net_param.input.extend(['data']) + net_param.input_shape.extend([ + caffe_pb2.BlobShape(dim=[1, 3, resize_height, resize_width])]) + print(net_param, file=f) +shutil.copy(deploy_net_file, job_dir) + +# Create solver. +solver = caffe_pb2.SolverParameter( + train_net=train_net_file, + test_net=[test_net_file], + snapshot_prefix=snapshot_prefix, + **solver_param) + +with open(solver_file, 'w') as f: + print(solver, file=f) +shutil.copy(solver_file, job_dir) + +max_iter = 0 +# Find most recent snapshot. +for file in os.listdir(snapshot_dir): + if file.endswith(".solverstate"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if iter > max_iter: + max_iter = iter + +train_src_param = '--weights="{}" \\\n'.format(pretrain_model) +if resume_training: + if max_iter > 0: + train_src_param = '--snapshot="{}_iter_{}.solverstate" \\\n'.format(snapshot_prefix, max_iter) + +if remove_old_models: + # Remove any snapshots smaller than max_iter. + for file in os.listdir(snapshot_dir): + if file.endswith(".solverstate"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if max_iter > iter: + os.remove("{}/{}".format(snapshot_dir, file)) + if file.endswith(".caffemodel"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if max_iter > iter: + os.remove("{}/{}".format(snapshot_dir, file)) + +# Create job file. 
+with open(job_file, 'w') as f: + f.write('cd {}\n'.format(caffe_root)) + f.write('./build/tools/caffe train \\\n') + f.write('--solver="{}" \\\n'.format(solver_file)) + f.write(train_src_param) + if solver_param['solver_mode'] == P.Solver.GPU: + f.write('--gpu {} 2>&1 | tee {}/{}.log\n'.format(gpus, job_dir, model_name)) + else: + f.write('2>&1 | tee {}/{}.log\n'.format(job_dir, model_name)) + +# Copy the python script to job_dir. +py_file = os.path.abspath(__file__) +shutil.copy(py_file, job_dir) + +# Run the job. +os.chmod(job_file, stat.S_IRWXU) +if run_soon: + subprocess.call(job_file, shell=True) diff --git a/examples/ssd/ssd_pascal.py b/examples/ssd/ssd_pascal.py new file mode 100644 index 00000000000..e7a98174b82 --- /dev/null +++ b/examples/ssd/ssd_pascal.py @@ -0,0 +1,575 @@ +from __future__ import print_function +import caffe +from google.protobuf import text_format + +import math +import os +import shutil +import stat +import subprocess +import sys + +sys.path.insert(0, 'examples/ssd') +from model_libs import * + +# Add extra layers on top of a "base" network (e.g. VGGNet or Inception). +def AddExtraLayers(net, use_batchnorm=True, lr_mult=1): + use_relu = True + + # Add additional convolutional layers. + # 19 x 19 + from_layer = net.keys()[-1] + + # TODO(weiliu89): Construct the name using the last layer to avoid duplication. 
+ # 10 x 10 + out_layer = "conv6_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv6_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2, + lr_mult=lr_mult) + + # 5 x 5 + from_layer = out_layer + out_layer = "conv7_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv7_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 1, 2, + lr_mult=lr_mult) + + # 3 x 3 + from_layer = out_layer + out_layer = "conv8_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv8_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + # 1 x 1 + from_layer = out_layer + out_layer = "conv9_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv9_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + return net + + +### Modify the following parameters accordingly ### +# The directory which contains the caffe code. +# We assume you are running the script at the CAFFE_ROOT. +caffe_root = os.getcwd() + +# Set true if you want to start training right after generating all files. +run_soon = True +# Set true if you want to load from most recently saved snapshot. +# Otherwise, we will load from the pretrain_model defined below. +resume_training = True +# If true, Remove old model files. +remove_old_models = False + +# The database file for training data. Created by data/VOC0712/create_data.sh +train_data = "examples/VOC0712/VOC0712_trainval_lmdb" +# The database file for testing data. 
Created by data/VOC0712/create_data.sh +test_data = "examples/VOC0712/VOC0712_test_lmdb" +# Specify the batch sampler. +resize_width = 300 +resize_height = 300 +resize = "{}x{}".format(resize_width, resize_height) +batch_sampler = [ + { + 'sampler': { + }, + 'max_trials': 1, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.1, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.3, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.5, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.7, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.9, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'max_jaccard_overlap': 1.0, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + ] +train_transform_param = { + 'mirror': True, + 'mean_value': [104, 117, 123], + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [ + P.Resize.LINEAR, + P.Resize.AREA, + P.Resize.NEAREST, + P.Resize.CUBIC, + P.Resize.LANCZOS4, + ], + }, + 'distort_param': { + 'brightness_prob': 
0.5, + 'brightness_delta': 32, + 'contrast_prob': 0.5, + 'contrast_lower': 0.5, + 'contrast_upper': 1.5, + 'hue_prob': 0.5, + 'hue_delta': 18, + 'saturation_prob': 0.5, + 'saturation_lower': 0.5, + 'saturation_upper': 1.5, + 'random_order_prob': 0.0, + }, + 'expand_param': { + 'prob': 0.5, + 'max_expand_ratio': 4.0, + }, + 'emit_constraint': { + 'emit_type': caffe_pb2.EmitConstraint.CENTER, + } + } +test_transform_param = { + 'mean_value': [104, 117, 123], + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [P.Resize.LINEAR], + }, + } + +# If true, use batch norm for all newly added layers. +# Currently only the non batch norm version has been tested. +use_batchnorm = False +lr_mult = 1 +# Use different initial learning rate. +if use_batchnorm: + base_lr = 0.0004 +else: + # A learning rate for batch_size = 1, num_gpus = 1. + base_lr = 0.00004 + +# Modify the job name if you want. +job_name = "SSD_{}".format(resize) +# The name of the model. Modify it if you want. +model_name = "VGG_VOC0712_{}".format(job_name) + +# Directory which stores the model .prototxt file. +save_dir = "models/VGGNet/VOC0712/{}".format(job_name) +# Directory which stores the snapshot of models. +snapshot_dir = "models/VGGNet/VOC0712/{}".format(job_name) +# Directory which stores the job script and log file. +job_dir = "jobs/VGGNet/VOC0712/{}".format(job_name) +# Directory which stores the detection results. +output_result_dir = "{}/data/VOCdevkit/results/VOC2007/{}/Main".format(os.environ['HOME'], job_name) + +# model definition files. +train_net_file = "{}/train.prototxt".format(save_dir) +test_net_file = "{}/test.prototxt".format(save_dir) +deploy_net_file = "{}/deploy.prototxt".format(save_dir) +solver_file = "{}/solver.prototxt".format(save_dir) +# snapshot prefix. +snapshot_prefix = "{}/{}".format(snapshot_dir, model_name) +# job script path. 
+job_file = "{}/{}.sh".format(job_dir, model_name) + +# Stores the test image names and sizes. Created by data/VOC0712/create_list.sh +name_size_file = "data/VOC0712/test_name_size.txt" +# The pretrained model. We use the Fully convolutional reduced (atrous) VGGNet. +pretrain_model = "models/VGGNet/VGG_ILSVRC_16_layers_fc_reduced.caffemodel" +# Stores LabelMapItem. +label_map_file = "data/VOC0712/labelmap_voc.prototxt" + +# MultiBoxLoss parameters. +num_classes = 21 +share_location = True +background_label_id=0 +train_on_diff_gt = True +normalization_mode = P.Loss.VALID +code_type = P.PriorBox.CENTER_SIZE +ignore_cross_boundary_bbox = False +mining_type = P.MultiBoxLoss.MAX_NEGATIVE +neg_pos_ratio = 3. +loc_weight = (neg_pos_ratio + 1.) / 4. +multibox_loss_param = { + 'loc_loss_type': P.MultiBoxLoss.SMOOTH_L1, + 'conf_loss_type': P.MultiBoxLoss.SOFTMAX, + 'loc_weight': loc_weight, + 'num_classes': num_classes, + 'share_location': share_location, + 'match_type': P.MultiBoxLoss.PER_PREDICTION, + 'overlap_threshold': 0.5, + 'use_prior_for_matching': True, + 'background_label_id': background_label_id, + 'use_difficult_gt': train_on_diff_gt, + 'mining_type': mining_type, + 'neg_pos_ratio': neg_pos_ratio, + 'neg_overlap': 0.5, + 'code_type': code_type, + 'ignore_cross_boundary_bbox': ignore_cross_boundary_bbox, + } +loss_param = { + 'normalization': normalization_mode, + } + +# parameters for generating priors. +# minimum dimension of input image +min_dim = 300 +# conv4_3 ==> 38 x 38 +# fc7 ==> 19 x 19 +# conv6_2 ==> 10 x 10 +# conv7_2 ==> 5 x 5 +# conv8_2 ==> 3 x 3 +# conv9_2 ==> 1 x 1 +mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2'] +# in percent % +min_ratio = 20 +max_ratio = 90 +step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2))) +min_sizes = [] +max_sizes = [] +for ratio in xrange(min_ratio, max_ratio + 1, step): + min_sizes.append(min_dim * ratio / 100.) 
+  max_sizes.append(min_dim * (ratio + step) / 100.)
+min_sizes = [min_dim * 10 / 100.] + min_sizes
+max_sizes = [min_dim * 20 / 100.] + max_sizes
+steps = [8, 16, 32, 64, 100, 300]
+aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
+# L2 normalize conv4_3.
+normalizations = [20, -1, -1, -1, -1, -1]
+# variance used to encode/decode prior bboxes.
+if code_type == P.PriorBox.CENTER_SIZE:
+  prior_variance = [0.1, 0.1, 0.2, 0.2]
+else:
+  prior_variance = [0.1]
+flip = True
+clip = False
+
+# Solver parameters.
+# Defining which GPUs to use.
+gpus = "0,1,2,3"
+gpulist = gpus.split(",")
+num_gpus = len(gpulist)
+
+# Divide the mini-batch to different GPUs.
+batch_size = 32
+accum_batch_size = 32
+iter_size = accum_batch_size / batch_size
+solver_mode = P.Solver.CPU
+device_id = 0
+batch_size_per_device = batch_size
+if num_gpus > 0:
+  batch_size_per_device = int(math.ceil(float(batch_size) / num_gpus))
+  iter_size = int(math.ceil(float(accum_batch_size) / (batch_size_per_device * num_gpus)))
+  solver_mode = P.Solver.GPU
+  device_id = int(gpulist[0])
+
+if normalization_mode == P.Loss.NONE:
+  base_lr /= batch_size_per_device
+elif normalization_mode == P.Loss.VALID:
+  base_lr *= 25. / loc_weight
+elif normalization_mode == P.Loss.FULL:
+  # Roughly there are 2000 prior bboxes per image.
+  # TODO(weiliu89): Estimate the exact # of priors.
+  base_lr *= 2000.
+
+# Evaluate on whole test set.
+num_test_image = 4952
+test_batch_size = 8
+# Ideally num_test_image should be divisible by test_batch_size,
+# otherwise mAP will be slightly off the true value.
+test_iter = int(math.ceil(float(num_test_image) / test_batch_size)) + +solver_param = { + # Train parameters + 'base_lr': base_lr, + 'weight_decay': 0.0005, + 'lr_policy': "multistep", + 'stepvalue': [80000, 100000, 120000], + 'gamma': 0.1, + 'momentum': 0.9, + 'iter_size': iter_size, + 'max_iter': 120000, + 'snapshot': 80000, + 'display': 10, + 'average_loss': 10, + 'type': "SGD", + 'solver_mode': solver_mode, + 'device_id': device_id, + 'debug_info': False, + 'snapshot_after_train': True, + # Test parameters + 'test_iter': [test_iter], + 'test_interval': 10000, + 'eval_type': "detection", + 'ap_version': "11point", + 'test_initialization': False, + } + +# parameters for generating detection output. +det_out_param = { + 'num_classes': num_classes, + 'share_location': share_location, + 'background_label_id': background_label_id, + 'nms_param': {'nms_threshold': 0.45, 'top_k': 400}, + 'save_output_param': { + 'output_directory': output_result_dir, + 'output_name_prefix': "comp4_det_test_", + 'output_format': "VOC", + 'label_map_file': label_map_file, + 'name_size_file': name_size_file, + 'num_test_image': num_test_image, + }, + 'keep_top_k': 200, + 'confidence_threshold': 0.01, + 'code_type': code_type, + } + +# parameters for evaluating detection results. +det_eval_param = { + 'num_classes': num_classes, + 'background_label_id': background_label_id, + 'overlap_threshold': 0.5, + 'evaluate_difficult_gt': False, + 'name_size_file': name_size_file, + } + +### Hopefully you don't need to change the following ### +# Check file. +check_if_exist(train_data) +check_if_exist(test_data) +check_if_exist(label_map_file) +check_if_exist(pretrain_model) +make_if_not_exist(save_dir) +make_if_not_exist(job_dir) +make_if_not_exist(snapshot_dir) + +# Create train net. 
+net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(train_data, batch_size=batch_size_per_device, + train=True, output_label=True, label_map_file=label_map_file, + transform_param=train_transform_param, batch_sampler=batch_sampler) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +# Create the MultiBoxLossLayer. +name = "mbox_loss" +mbox_layers.append(net.label) +net[name] = L.MultiBoxLoss(*mbox_layers, multibox_loss_param=multibox_loss_param, + loss_param=loss_param, include=dict(phase=caffe_pb2.Phase.Value('TRAIN')), + propagate_down=[True, True, False, False]) + +with open(train_net_file, 'w') as f: + print('name: "{}_train"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(train_net_file, job_dir) + +# Create test net. 
+net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(test_data, batch_size=test_batch_size, + train=False, output_label=True, label_map_file=label_map_file, + transform_param=test_transform_param) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +conf_name = "mbox_conf" +if multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.SOFTMAX: + reshape_name = "{}_reshape".format(conf_name) + net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes])) + softmax_name = "{}_softmax".format(conf_name) + net[softmax_name] = L.Softmax(net[reshape_name], axis=2) + flatten_name = "{}_flatten".format(conf_name) + net[flatten_name] = L.Flatten(net[softmax_name], axis=1) + mbox_layers[1] = net[flatten_name] +elif multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.LOGISTIC: + sigmoid_name = "{}_sigmoid".format(conf_name) + net[sigmoid_name] = L.Sigmoid(net[conf_name]) + mbox_layers[1] = net[sigmoid_name] + +net.detection_out = L.DetectionOutput(*mbox_layers, + detection_output_param=det_out_param, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) +net.detection_eval = L.DetectionEvaluate(net.detection_out, net.label, + detection_evaluate_param=det_eval_param, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) + +with open(test_net_file, 'w') as f: + print('name: "{}_test"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(test_net_file, job_dir) + +# Create deploy net. +# Remove the first and last layer from test net. 
+deploy_net = net +with open(deploy_net_file, 'w') as f: + net_param = deploy_net.to_proto() + # Remove the first (AnnotatedData) and last (DetectionEvaluate) layer from test net. + del net_param.layer[0] + del net_param.layer[-1] + net_param.name = '{}_deploy'.format(model_name) + net_param.input.extend(['data']) + net_param.input_shape.extend([ + caffe_pb2.BlobShape(dim=[1, 3, resize_height, resize_width])]) + print(net_param, file=f) +shutil.copy(deploy_net_file, job_dir) + +# Create solver. +solver = caffe_pb2.SolverParameter( + train_net=train_net_file, + test_net=[test_net_file], + snapshot_prefix=snapshot_prefix, + **solver_param) + +with open(solver_file, 'w') as f: + print(solver, file=f) +shutil.copy(solver_file, job_dir) + +max_iter = 0 +# Find most recent snapshot. +for file in os.listdir(snapshot_dir): + if file.endswith(".solverstate"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if iter > max_iter: + max_iter = iter + +train_src_param = '--weights="{}" \\\n'.format(pretrain_model) +if resume_training: + if max_iter > 0: + train_src_param = '--snapshot="{}_iter_{}.solverstate" \\\n'.format(snapshot_prefix, max_iter) + +if remove_old_models: + # Remove any snapshots smaller than max_iter. + for file in os.listdir(snapshot_dir): + if file.endswith(".solverstate"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if max_iter > iter: + os.remove("{}/{}".format(snapshot_dir, file)) + if file.endswith(".caffemodel"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if max_iter > iter: + os.remove("{}/{}".format(snapshot_dir, file)) + +# Create job file. 
+with open(job_file, 'w') as f: + f.write('cd {}\n'.format(caffe_root)) + f.write('./build/tools/caffe train \\\n') + f.write('--solver="{}" \\\n'.format(solver_file)) + f.write(train_src_param) + if solver_param['solver_mode'] == P.Solver.GPU: + f.write('--gpu {} 2>&1 | tee {}/{}.log\n'.format(gpus, job_dir, model_name)) + else: + f.write('2>&1 | tee {}/{}.log\n'.format(job_dir, model_name)) + +# Copy the python script to job_dir. +py_file = os.path.abspath(__file__) +shutil.copy(py_file, job_dir) + +# Run the job. +os.chmod(job_file, stat.S_IRWXU) +if run_soon: + subprocess.call(job_file, shell=True) diff --git a/examples/ssd/ssd_pascal_orig.py b/examples/ssd/ssd_pascal_orig.py new file mode 100644 index 00000000000..a927d4490e8 --- /dev/null +++ b/examples/ssd/ssd_pascal_orig.py @@ -0,0 +1,582 @@ +from __future__ import print_function +import caffe +from google.protobuf import text_format + +import math +import os +import shutil +import stat +import subprocess +import sys + +sys.path.insert(0, 'examples/ssd') +from model_libs import * + +# Add extra layers on top of a "base" network (e.g. VGGNet or Inception). +def AddExtraLayers(net, use_batchnorm=True, lr_mult=1): + use_relu = True + + # Add additional convolutional layers. + # 19 x 19 + from_layer = net.keys()[-1] + + # TODO(weiliu89): Construct the name using the last layer to avoid duplication. 
+ # 10 x 10 + out_layer = "conv6_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv6_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2, + lr_mult=lr_mult) + + # 5 x 5 + from_layer = out_layer + out_layer = "conv7_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv7_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 1, 2, + lr_mult=lr_mult) + + # 3 x 3 + from_layer = out_layer + out_layer = "conv8_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv8_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + # 1 x 1 + from_layer = out_layer + out_layer = "conv9_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv9_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + return net + + +### Modify the following parameters accordingly ### +# The directory which contains the caffe code. +# We assume you are running the script at the CAFFE_ROOT. +caffe_root = os.getcwd() + +# Set true if you want to start training right after generating all files. +run_soon = True +# Set true if you want to load from most recently saved snapshot. +# Otherwise, we will load from the pretrain_model defined below. +resume_training = True +# If true, Remove old model files. +remove_old_models = False + +# The database file for training data. Created by data/VOC0712/create_data.sh +train_data = "examples/VOC0712/VOC0712_trainval_lmdb" +# The database file for testing data. 
Created by data/VOC0712/create_data.sh +test_data = "examples/VOC0712/VOC0712_test_lmdb" +# Specify the batch sampler. +resize_width = 300 +resize_height = 300 +resize = "{}x{}".format(resize_width, resize_height) +batch_sampler = [ + { + 'sampler': { + }, + 'max_trials': 1, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.1, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.3, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.5, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.7, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.9, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'max_jaccard_overlap': 1.0, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + ] +train_transform_param = { + 'mirror': True, + 'mean_value': [104, 117, 123], + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.FIT_SMALL_SIZE, + 'height': resize_height, + 'width': resize_width, + 'height_scale': resize_height, + 'width_scale': resize_width, + 'interp_mode': [ + P.Resize.LINEAR, + P.Resize.AREA, + P.Resize.NEAREST, + P.Resize.CUBIC, + 
P.Resize.LANCZOS4, + ], + }, + 'distort_param': { + 'brightness_prob': 0.5, + 'brightness_delta': 32, + 'contrast_prob': 0.5, + 'contrast_lower': 0.5, + 'contrast_upper': 1.5, + 'hue_prob': 0.5, + 'hue_delta': 18, + 'saturation_prob': 0.5, + 'saturation_lower': 0.5, + 'saturation_upper': 1.5, + 'random_order_prob': 0.0, + }, + 'expand_param': { + 'prob': 0.5, + 'max_expand_ratio': 4.0, + }, + 'emit_constraint': { + 'emit_type': caffe_pb2.EmitConstraint.CENTER, + } + } +resize_param = { + 'prob': 1, + 'resize_mode': P.Resize.FIT_SMALL_SIZE, + 'height': resize_height, + 'width': resize_width, + 'height_scale': resize_height, + 'width_scale': resize_height, + 'interp_mode': [P.Resize.LINEAR], + } +test_transform_param = { + 'mean_value': [104, 117, 123], + 'resize_param': resize_param, + } + +# If true, use batch norm for all newly added layers. +# Currently only the non batch norm version has been tested. +use_batchnorm = False +lr_mult = 1 +# Use different initial learning rate. +if use_batchnorm: + base_lr = 0.0004 +else: + # A learning rate for batch_size = 1, num_gpus = 1. + base_lr = 0.00004 + +# Modify the job name if you want. +job_name = "SSD_{}_orig".format(resize) +# The name of the model. Modify it if you want. +model_name = "VGG_VOC0712_{}".format(job_name) + +# Directory which stores the model .prototxt file. +save_dir = "models/VGGNet/VOC0712/{}".format(job_name) +# Directory which stores the snapshot of models. +snapshot_dir = "models/VGGNet/VOC0712/{}".format(job_name) +# Directory which stores the job script and log file. +job_dir = "jobs/VGGNet/VOC0712/{}".format(job_name) +# Directory which stores the detection results. +output_result_dir = "{}/data/VOCdevkit/results/VOC2007/{}/Main".format(os.environ['HOME'], job_name) + +# model definition files. 
+train_net_file = "{}/train.prototxt".format(save_dir) +test_net_file = "{}/test.prototxt".format(save_dir) +deploy_net_file = "{}/deploy.prototxt".format(save_dir) +solver_file = "{}/solver.prototxt".format(save_dir) +# snapshot prefix. +snapshot_prefix = "{}/{}".format(snapshot_dir, model_name) +# job script path. +job_file = "{}/{}.sh".format(job_dir, model_name) + +# Stores the test image names and sizes. Created by data/VOC0712/create_list.sh +name_size_file = "data/VOC0712/test_name_size.txt" +# The pretrained model. We use the Fully convolutional reduced (atrous) VGGNet. +pretrain_model = "models/VGGNet/VGG_ILSVRC_16_layers_fc_reduced.caffemodel" +# Stores LabelMapItem. +label_map_file = "data/VOC0712/labelmap_voc.prototxt" + +# MultiBoxLoss parameters. +num_classes = 21 +share_location = True +background_label_id=0 +train_on_diff_gt = True +normalization_mode = P.Loss.VALID +code_type = P.PriorBox.CENTER_SIZE +ignore_cross_boundary_bbox = False +mining_type = P.MultiBoxLoss.MAX_NEGATIVE +neg_pos_ratio = 3. +loc_weight = (neg_pos_ratio + 1.) / 4. +multibox_loss_param = { + 'loc_loss_type': P.MultiBoxLoss.SMOOTH_L1, + 'conf_loss_type': P.MultiBoxLoss.SOFTMAX, + 'loc_weight': loc_weight, + 'num_classes': num_classes, + 'share_location': share_location, + 'match_type': P.MultiBoxLoss.PER_PREDICTION, + 'overlap_threshold': 0.5, + 'use_prior_for_matching': True, + 'background_label_id': background_label_id, + 'use_difficult_gt': train_on_diff_gt, + 'mining_type': mining_type, + 'neg_pos_ratio': neg_pos_ratio, + 'neg_overlap': 0.5, + 'code_type': code_type, + 'ignore_cross_boundary_bbox': ignore_cross_boundary_bbox, + } +loss_param = { + 'normalization': normalization_mode, + } + +# parameters for generating priors. 
+# minimum dimension of input image +min_dim = 300 +# conv4_3 ==> 38 x 38 +# fc7 ==> 19 x 19 +# conv6_2 ==> 10 x 10 +# conv7_2 ==> 5 x 5 +# conv8_2 ==> 3 x 3 +# conv9_2 ==> 1 x 1 +mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2'] +# in percent % +min_ratio = 20 +max_ratio = 90 +step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2))) +min_sizes = [] +max_sizes = [] +for ratio in xrange(min_ratio, max_ratio + 1, step): + min_sizes.append(min_dim * ratio / 100.) + max_sizes.append(min_dim * (ratio + step) / 100.) +min_sizes = [min_dim * 10 / 100.] + min_sizes +max_sizes = [min_dim * 20 / 100.] + max_sizes +steps = [8, 16, 32, 64, 100, 300] +aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] +# L2 normalize conv4_3. +normalizations = [20, -1, -1, -1, -1, -1] +# variance used to encode/decode prior bboxes. +if code_type == P.PriorBox.CENTER_SIZE: + prior_variance = [0.1, 0.1, 0.2, 0.2] +else: + prior_variance = [0.1] +flip = True +clip = False + +# Solver parameters. +# Defining which GPUs to use. +gpus = "0,1,2,3" +gpulist = gpus.split(",") +num_gpus = len(gpulist) + +# Divide the mini-batch to different GPUs. +batch_size = num_gpus +accum_batch_size = 32 +iter_size = accum_batch_size / batch_size +solver_mode = P.Solver.CPU +device_id = 0 +batch_size_per_device = batch_size +if num_gpus > 0: + batch_size_per_device = int(math.ceil(float(batch_size) / num_gpus)) + iter_size = int(math.ceil(float(accum_batch_size) / (batch_size_per_device * num_gpus))) + solver_mode = P.Solver.GPU + device_id = int(gpulist[0]) + +if normalization_mode == P.Loss.NONE: + base_lr /= batch_size_per_device +elif normalization_mode == P.Loss.VALID: + base_lr *= 25. / loc_weight +elif normalization_mode == P.Loss.FULL: + # Roughly there are 2000 prior bboxes per image. + # TODO(weiliu89): Estimate the exact # of priors. + base_lr *= 2000. + +# Evaluate on whole test set. 
+num_test_image = 4952 +test_batch_size = 1 +test_iter = num_test_image / test_batch_size + +solver_param = { + # Train parameters + 'base_lr': base_lr, + 'weight_decay': 0.0005, + 'lr_policy': "multistep", + 'stepvalue': [80000, 100000, 120000], + 'gamma': 0.1, + 'momentum': 0.9, + 'iter_size': iter_size, + 'max_iter': 120000, + 'snapshot': 80000, + 'display': 10, + 'average_loss': 10, + 'type': "SGD", + 'solver_mode': solver_mode, + 'device_id': device_id, + 'debug_info': False, + 'snapshot_after_train': True, + # Test parameters + 'test_iter': [test_iter], + 'test_interval': 10000, + 'eval_type': "detection", + 'ap_version': "11point", + 'test_initialization': False, + } + +# parameters for generating detection output. +det_out_param = { + 'num_classes': num_classes, + 'share_location': share_location, + 'background_label_id': background_label_id, + 'nms_param': {'nms_threshold': 0.45, 'top_k': 400}, + 'save_output_param': { + 'output_directory': output_result_dir, + 'output_name_prefix': "comp4_det_test_", + 'output_format': "VOC", + 'label_map_file': label_map_file, + 'name_size_file': name_size_file, + 'num_test_image': num_test_image, + 'resize_param': resize_param, + }, + 'keep_top_k': 200, + 'confidence_threshold': 0.01, + 'code_type': code_type, + } + +# parameters for evaluating detection results. +det_eval_param = { + 'num_classes': num_classes, + 'background_label_id': background_label_id, + 'overlap_threshold': 0.5, + 'evaluate_difficult_gt': False, + 'name_size_file': name_size_file, + 'resize_param': resize_param, + } + +### Hopefully you don't need to change the following ### +# Check file. +check_if_exist(train_data) +check_if_exist(test_data) +check_if_exist(label_map_file) +check_if_exist(pretrain_model) +make_if_not_exist(save_dir) +make_if_not_exist(job_dir) +make_if_not_exist(snapshot_dir) + +# Create train net. 
+net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(train_data, batch_size=batch_size_per_device, + train=True, output_label=True, label_map_file=label_map_file, + transform_param=train_transform_param, batch_sampler=batch_sampler) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, img_height=resize_height, + img_width=resize_width, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +# Create the MultiBoxLossLayer. +name = "mbox_loss" +mbox_layers.append(net.label) +net[name] = L.MultiBoxLoss(*mbox_layers, multibox_loss_param=multibox_loss_param, + loss_param=loss_param, include=dict(phase=caffe_pb2.Phase.Value('TRAIN')), + propagate_down=[True, True, False, False]) + +with open(train_net_file, 'w') as f: + print('name: "{}_train"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(train_net_file, job_dir) + +# Create test net. 
+net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(test_data, batch_size=test_batch_size, + train=False, output_label=True, label_map_file=label_map_file, + transform_param=test_transform_param) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, img_height=resize_height, + img_width=resize_width, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +conf_name = "mbox_conf" +if multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.SOFTMAX: + reshape_name = "{}_reshape".format(conf_name) + net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes])) + softmax_name = "{}_softmax".format(conf_name) + net[softmax_name] = L.Softmax(net[reshape_name], axis=2) + flatten_name = "{}_flatten".format(conf_name) + net[flatten_name] = L.Flatten(net[softmax_name], axis=1) + mbox_layers[1] = net[flatten_name] +elif multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.LOGISTIC: + sigmoid_name = "{}_sigmoid".format(conf_name) + net[sigmoid_name] = L.Sigmoid(net[conf_name]) + mbox_layers[1] = net[sigmoid_name] + +net.detection_out = L.DetectionOutput(*mbox_layers, + detection_output_param=det_out_param, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) +net.detection_eval = L.DetectionEvaluate(net.detection_out, net.label, + detection_evaluate_param=det_eval_param, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) + +with open(test_net_file, 'w') as f: + print('name: "{}_test"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(test_net_file, job_dir) + +# Create deploy 
net. +# Remove the first and last layer from test net. +deploy_net = net +with open(deploy_net_file, 'w') as f: + net_param = deploy_net.to_proto() + # Remove the first (AnnotatedData) and last (DetectionEvaluate) layer from test net. + del net_param.layer[0] + del net_param.layer[-1] + net_param.name = '{}_deploy'.format(model_name) + net_param.input.extend(['data']) + net_param.input_shape.extend([ + caffe_pb2.BlobShape(dim=[1, 3, resize_height, resize_width])]) + print(net_param, file=f) +shutil.copy(deploy_net_file, job_dir) + +# Create solver. +solver = caffe_pb2.SolverParameter( + train_net=train_net_file, + test_net=[test_net_file], + snapshot_prefix=snapshot_prefix, + **solver_param) + +with open(solver_file, 'w') as f: + print(solver, file=f) +shutil.copy(solver_file, job_dir) + +max_iter = 0 +# Find most recent snapshot. +for file in os.listdir(snapshot_dir): + if file.endswith(".solverstate"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if iter > max_iter: + max_iter = iter + +train_src_param = '--weights="{}" \\\n'.format(pretrain_model) +if resume_training: + if max_iter > 0: + train_src_param = '--snapshot="{}_iter_{}.solverstate" \\\n'.format(snapshot_prefix, max_iter) + +if remove_old_models: + # Remove any snapshots smaller than max_iter. + for file in os.listdir(snapshot_dir): + if file.endswith(".solverstate"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if max_iter > iter: + os.remove("{}/{}".format(snapshot_dir, file)) + if file.endswith(".caffemodel"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if max_iter > iter: + os.remove("{}/{}".format(snapshot_dir, file)) + +# Create job file. 
+with open(job_file, 'w') as f: + f.write('cd {}\n'.format(caffe_root)) + f.write('./build/tools/caffe train \\\n') + f.write('--solver="{}" \\\n'.format(solver_file)) + f.write(train_src_param) + if solver_param['solver_mode'] == P.Solver.GPU: + f.write('--gpu {} 2>&1 | tee {}/{}.log\n'.format(gpus, job_dir, model_name)) + else: + f.write('2>&1 | tee {}/{}.log\n'.format(job_dir, model_name)) + +# Copy the python script to job_dir. +py_file = os.path.abspath(__file__) +shutil.copy(py_file, job_dir) + +# Run the job. +os.chmod(job_file, stat.S_IRWXU) +if run_soon: + subprocess.call(job_file, shell=True) diff --git a/examples/ssd/ssd_pascal_resnet.py b/examples/ssd/ssd_pascal_resnet.py new file mode 100644 index 00000000000..c721be8b34a --- /dev/null +++ b/examples/ssd/ssd_pascal_resnet.py @@ -0,0 +1,522 @@ +from __future__ import print_function +import caffe +from google.protobuf import text_format + +import math +import os +import shutil +import stat +import subprocess +import sys + +sys.path.insert(0, 'examples/ssd') +from model_libs import * + +# Add extra layers on top of a "base" network (e.g. VGGNet or Inception). +def AddExtraLayers(net, use_batchnorm=True): + use_relu = True + + # Add additional convolutional layers. 
+ # 19 x 19 + last_layer = net.keys()[-1] + + # 10 x 10 + from_layer = last_layer + out_layer = "{}/conv1_1".format(last_layer) + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1) + from_layer = out_layer + + out_layer = "{}/conv1_2".format(last_layer) + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2) + from_layer = out_layer + + for i in xrange(2, 4): + out_layer = "{}/conv{}_1".format(last_layer, i) + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1) + from_layer = out_layer + + out_layer = "{}/conv{}_2".format(last_layer, i) + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2) + from_layer = out_layer + + # Add global pooling layer. + name = net.keys()[-1] + net.pool6 = L.Pooling(net[name], pool=P.Pooling.AVE, global_pooling=True) + + return net + + +### Modify the following parameters accordingly ### +# The directory which contains the caffe code. +# We assume you are running the script at the CAFFE_ROOT. +caffe_root = os.getcwd() + +# Set true if you want to start training right after generating all files. +run_soon = True +# Set true if you want to load from most recently saved snapshot. +# Otherwise, we will load from the pretrain_model defined below. +resume_training = True +# If true, Remove old model files. +remove_old_models = False + +# The database file for training data. Created by data/VOC0712/create_data.sh +train_data = "examples/VOC0712/VOC0712_trainval_lmdb" +# The database file for testing data. Created by data/VOC0712/create_data.sh +test_data = "examples/VOC0712/VOC0712_test_lmdb" +# Specify the batch sampler. 
+resize_width = 300 +resize_height = 300 +resize = "{}x{}".format(resize_width, resize_height) +batch_sampler = [ + { + 'sampler': { + }, + 'max_trials': 1, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.1, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.3, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.5, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.7, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.9, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'max_jaccard_overlap': 1.0, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + ] +train_transform_param = { + 'mirror': True, + 'mean_value': [104, 117, 123], + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [ + P.Resize.LINEAR, + P.Resize.AREA, + P.Resize.NEAREST, + P.Resize.CUBIC, + P.Resize.LANCZOS4, + ], + }, + 'emit_constraint': { + 'emit_type': caffe_pb2.EmitConstraint.CENTER, + } + } +test_transform_param = { + 'mean_value': [104, 117, 123], + 'resize_param': { + 
'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [P.Resize.LINEAR], + }, + } + +# A learning rate for batch_size = 1, num_gpus = 1. +base_lr = 0.00004 + +# Modify the job name if you want. +job_name = "SSD_{}".format(resize) +# The name of the model. Modify it if you want. +model_name = "ResNet_VOC0712_{}".format(job_name) + +# Directory which stores the model .prototxt file. +save_dir = "models/ResNet/VOC0712/{}".format(job_name) +# Directory which stores the snapshot of models. +snapshot_dir = "models/ResNet/VOC0712/{}".format(job_name) +# Directory which stores the job script and log file. +job_dir = "jobs/ResNet/VOC0712/{}".format(job_name) +# Directory which stores the detection results. +output_result_dir = "{}/data/VOCdevkit/results/VOC2007/{}/Main".format(os.environ['HOME'], job_name) + +# model definition files. +train_net_file = "{}/train.prototxt".format(save_dir) +test_net_file = "{}/test.prototxt".format(save_dir) +deploy_net_file = "{}/deploy.prototxt".format(save_dir) +solver_file = "{}/solver.prototxt".format(save_dir) +# snapshot prefix. +snapshot_prefix = "{}/{}".format(snapshot_dir, model_name) +# job script path. +job_file = "{}/{}.sh".format(job_dir, model_name) + +# Stores the test image names and sizes. Created by data/VOC0712/create_list.sh +name_size_file = "data/VOC0712/test_name_size.txt" +# The pretrained ResNet101 model from https://github.com/KaimingHe/deep-residual-networks. +pretrain_model = "models/ResNet/ResNet-101-model.caffemodel" +# Stores LabelMapItem. +label_map_file = "data/VOC0712/labelmap_voc.prototxt" + +# MultiBoxLoss parameters. +num_classes = 21 +share_location = True +background_label_id=0 +train_on_diff_gt = True +normalization_mode = P.Loss.VALID +code_type = P.PriorBox.CENTER_SIZE +neg_pos_ratio = 3. +loc_weight = (neg_pos_ratio + 1.) / 4. 
+mining_type = P.MultiBoxLoss.MAX_NEGATIVE +multibox_loss_param = { + 'loc_loss_type': P.MultiBoxLoss.SMOOTH_L1, + 'conf_loss_type': P.MultiBoxLoss.SOFTMAX, + 'loc_weight': loc_weight, + 'num_classes': num_classes, + 'share_location': share_location, + 'match_type': P.MultiBoxLoss.PER_PREDICTION, + 'overlap_threshold': 0.5, + 'use_prior_for_matching': True, + 'background_label_id': background_label_id, + 'use_difficult_gt': train_on_diff_gt, + 'mining_type': mining_type, + 'neg_pos_ratio': neg_pos_ratio, + 'neg_overlap': 0.5, + 'code_type': code_type, + } +loss_param = { + 'normalization': normalization_mode, + } + +# parameters for generating priors. +# minimum dimension of input image +min_dim = 300 +# res3b3_relu ==> 38 x 38 +# res5c_relu ==> 19 x 19 +# res5c_relu/conv1_2 ==> 10 x 10 +# res5c_relu/conv2_2 ==> 5 x 5 +# res5c_relu/conv3_2 ==> 3 x 3 +# pool6 ==> 1 x 1 +mbox_source_layers = ['res3b3_relu', 'res5c_relu', 'res5c_relu/conv1_2', 'res5c_relu/conv2_2', 'res5c_relu/conv3_2', 'pool6'] +# in percent % +min_ratio = 20 +max_ratio = 95 +step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2))) +min_sizes = [] +max_sizes = [] +for ratio in xrange(min_ratio, max_ratio + 1, step): + min_sizes.append(min_dim * ratio / 100.) + max_sizes.append(min_dim * (ratio + step) / 100.) +min_sizes = [min_dim * 10 / 100.] + min_sizes +max_sizes = [[]] + max_sizes +aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]] +# variance used to encode/decode prior bboxes. +if code_type == P.PriorBox.CENTER_SIZE: + prior_variance = [0.1, 0.1, 0.2, 0.2] +else: + prior_variance = [0.1] +flip = True +clip = True + +# Solver parameters. +# Defining which GPUs to use. +gpus = "0,1,2,3" +gpulist = gpus.split(",") +num_gpus = len(gpulist) + +# Divide the mini-batch to different GPUs. 
+batch_size = 32 +accum_batch_size = 32 +iter_size = accum_batch_size / batch_size +solver_mode = P.Solver.CPU +device_id = 0 +batch_size_per_device = batch_size +if num_gpus > 0: + batch_size_per_device = int(math.ceil(float(batch_size) / num_gpus)) + iter_size = int(math.ceil(float(accum_batch_size) / (batch_size_per_device * num_gpus))) + solver_mode = P.Solver.GPU + device_id = int(gpulist[0]) + +if normalization_mode == P.Loss.NONE: + base_lr /= batch_size_per_device +elif normalization_mode == P.Loss.VALID: + base_lr *= 25. / loc_weight +elif normalization_mode == P.Loss.FULL: + # Roughly there are 2000 prior bboxes per image. + # TODO(weiliu89): Estimate the exact # of priors. + base_lr *= 2000. + +# Evaluate on whole test set. +num_test_image = 4952 +test_batch_size = 1 +test_iter = num_test_image / test_batch_size + +solver_param = { + # Train parameters + 'base_lr': base_lr, + 'weight_decay': 0.0005, + 'lr_policy': "step", + 'stepsize': 40000, + 'gamma': 0.1, + 'momentum': 0.9, + 'iter_size': iter_size, + 'max_iter': 60000, + 'snapshot': 40000, + 'display': 10, + 'average_loss': 10, + 'type': "SGD", + 'solver_mode': solver_mode, + 'device_id': device_id, + 'debug_info': False, + 'snapshot_after_train': True, + # Test parameters + 'test_iter': [test_iter], + 'test_interval': 10000, + 'eval_type': "detection", + 'ap_version': "11point", + 'test_initialization': False, + } + +# parameters for generating detection output. 
+det_out_param = { + 'num_classes': num_classes, + 'share_location': share_location, + 'background_label_id': background_label_id, + 'nms_param': {'nms_threshold': 0.45, 'top_k': 400}, + 'save_output_param': { + 'output_directory': output_result_dir, + 'output_name_prefix': "comp4_det_test_", + 'output_format': "VOC", + 'label_map_file': label_map_file, + 'name_size_file': name_size_file, + 'num_test_image': num_test_image, + }, + 'keep_top_k': 200, + 'confidence_threshold': 0.01, + 'code_type': code_type, + } + +# parameters for evaluating detection results. +det_eval_param = { + 'num_classes': num_classes, + 'background_label_id': background_label_id, + 'overlap_threshold': 0.5, + 'evaluate_difficult_gt': False, + 'name_size_file': name_size_file, + } + +### Hopefully you don't need to change the following ### +# Check file. +check_if_exist(train_data) +check_if_exist(test_data) +check_if_exist(label_map_file) +check_if_exist(pretrain_model) +make_if_not_exist(save_dir) +make_if_not_exist(job_dir) +make_if_not_exist(snapshot_dir) + +# Create train net. +net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(train_data, batch_size=batch_size_per_device, + train=True, output_label=True, label_map_file=label_map_file, + transform_param=train_transform_param, batch_sampler=batch_sampler) + +ResNet101Body(net, from_layer='data', use_pool5=False, use_dilation_conv5=True) + +# Use batch norm for the newly added layers. +AddExtraLayers(net, use_batchnorm=True) + +# Don't use batch norm for location/confidence prediction layers. +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=False, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, num_classes=num_classes, share_location=share_location, + flip=flip, clip=clip, prior_variance=prior_variance, kernel_size=3, pad=1) + +# Create the MultiBoxLossLayer. 
+name = "mbox_loss" +mbox_layers.append(net.label) +net[name] = L.MultiBoxLoss(*mbox_layers, multibox_loss_param=multibox_loss_param, + loss_param=loss_param, include=dict(phase=caffe_pb2.Phase.Value('TRAIN')), + propagate_down=[True, True, False, False]) + +with open(train_net_file, 'w') as f: + print('name: "{}_train"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(train_net_file, job_dir) + +# Create test net. +net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(test_data, batch_size=test_batch_size, + train=False, output_label=True, label_map_file=label_map_file, + transform_param=test_transform_param) + +ResNet101Body(net, from_layer='data', use_pool5=False, use_dilation_conv5=True) + +# Use batch norm for the newly added layers. +AddExtraLayers(net, use_batchnorm=True) + +# Don't use batch norm for location/confidence prediction layers. +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=False, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, num_classes=num_classes, share_location=share_location, + flip=flip, clip=clip, prior_variance=prior_variance, kernel_size=3, pad=1) + +conf_name = "mbox_conf" +if multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.SOFTMAX: + reshape_name = "{}_reshape".format(conf_name) + net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes])) + softmax_name = "{}_softmax".format(conf_name) + net[softmax_name] = L.Softmax(net[reshape_name], axis=2) + flatten_name = "{}_flatten".format(conf_name) + net[flatten_name] = L.Flatten(net[softmax_name], axis=1) + mbox_layers[1] = net[flatten_name] +elif multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.LOGISTIC: + sigmoid_name = "{}_sigmoid".format(conf_name) + net[sigmoid_name] = L.Sigmoid(net[conf_name]) + mbox_layers[1] = net[sigmoid_name] + +net.detection_out = L.DetectionOutput(*mbox_layers, + detection_output_param=det_out_param, 
+ include=dict(phase=caffe_pb2.Phase.Value('TEST'))) +net.detection_eval = L.DetectionEvaluate(net.detection_out, net.label, + detection_evaluate_param=det_eval_param, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) + +with open(test_net_file, 'w') as f: + print('name: "{}_test"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(test_net_file, job_dir) + +# Create deploy net. +# Remove the first and last layer from test net. +deploy_net = net +with open(deploy_net_file, 'w') as f: + net_param = deploy_net.to_proto() + # Remove the first (AnnotatedData) and last (DetectionEvaluate) layer from test net. + del net_param.layer[0] + del net_param.layer[-1] + net_param.name = '{}_deploy'.format(model_name) + net_param.input.extend(['data']) + net_param.input_shape.extend([ + caffe_pb2.BlobShape(dim=[1, 3, resize_height, resize_width])]) + print(net_param, file=f) +shutil.copy(deploy_net_file, job_dir) + +# Create solver. +solver = caffe_pb2.SolverParameter( + train_net=train_net_file, + test_net=[test_net_file], + snapshot_prefix=snapshot_prefix, + **solver_param) + +with open(solver_file, 'w') as f: + print(solver, file=f) +shutil.copy(solver_file, job_dir) + +max_iter = 0 +# Find most recent snapshot. +for file in os.listdir(snapshot_dir): + if file.endswith(".solverstate"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if iter > max_iter: + max_iter = iter + +train_src_param = '--weights="{}" \\\n'.format(pretrain_model) +if resume_training: + if max_iter > 0: + train_src_param = '--snapshot="{}_iter_{}.solverstate" \\\n'.format(snapshot_prefix, max_iter) + +if remove_old_models: + # Remove any snapshots smaller than max_iter. 
+ for file in os.listdir(snapshot_dir): + if file.endswith(".solverstate"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if max_iter > iter: + os.remove("{}/{}".format(snapshot_dir, file)) + if file.endswith(".caffemodel"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if max_iter > iter: + os.remove("{}/{}".format(snapshot_dir, file)) + +# Create job file. +with open(job_file, 'w') as f: + f.write('cd {}\n'.format(caffe_root)) + f.write('./build/tools/caffe train \\\n') + f.write('--solver="{}" \\\n'.format(solver_file)) + f.write(train_src_param) + if solver_param['solver_mode'] == P.Solver.GPU: + f.write('--gpu {} 2>&1 | tee {}/{}.log\n'.format(gpus, job_dir, model_name)) + else: + f.write('2>&1 | tee {}/{}.log\n'.format(job_dir, model_name)) + +# Copy the python script to job_dir. +py_file = os.path.abspath(__file__) +shutil.copy(py_file, job_dir) + +# Run the job. +os.chmod(job_file, stat.S_IRWXU) +if run_soon: + subprocess.call(job_file, shell=True) diff --git a/examples/ssd/ssd_pascal_speed.py b/examples/ssd/ssd_pascal_speed.py new file mode 100644 index 00000000000..fd766a62322 --- /dev/null +++ b/examples/ssd/ssd_pascal_speed.py @@ -0,0 +1,556 @@ +from __future__ import print_function +import caffe +from google.protobuf import text_format + +import math +import os +import shutil +import stat +import subprocess +import sys + +sys.path.insert(0, 'examples/ssd') +from model_libs import * + +# Add extra layers on top of a "base" network (e.g. VGGNet or Inception). +def AddExtraLayers(net, use_batchnorm=True, lr_mult=1): + use_relu = True + + # Add additional convolutional layers. + # 19 x 19 + from_layer = net.keys()[-1] + + # TODO(weiliu89): Construct the name using the last layer to avoid duplication. 
+ # 10 x 10 + out_layer = "conv6_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv6_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2, + lr_mult=lr_mult) + + # 5 x 5 + from_layer = out_layer + out_layer = "conv7_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv7_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 1, 2, + lr_mult=lr_mult) + + # 3 x 3 + from_layer = out_layer + out_layer = "conv8_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv8_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + # 1 x 1 + from_layer = out_layer + out_layer = "conv9_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv9_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + return net + + +### Modify the following parameters accordingly ### +# Notice: we do evaluation by setting the solver parameters approximately. +# The reason that we do not use ./build/tools/caffe test ... is because it +# only supports testing for classification problem now. +# The directory which contains the caffe code. +# We assume you are running the script at the CAFFE_ROOT. +caffe_root = os.getcwd() + +# Set true if you want to start training right after generating all files. +run_soon = True + +# The database file for training data. Created by data/VOC0712/create_data.sh +train_data = "examples/VOC0712/VOC0712_trainval_lmdb" +# The database file for testing data. 
Created by data/VOC0712/create_data.sh +test_data = "examples/VOC0712/VOC0712_test_lmdb" +# Specify the batch sampler. +resize_width = 300 +resize_height = 300 +resize = "{}x{}".format(resize_width, resize_height) +batch_sampler = [ + { + 'sampler': { + }, + 'max_trials': 1, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.1, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.3, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.5, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.7, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.9, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'max_jaccard_overlap': 1.0, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + ] +train_transform_param = { + 'mirror': True, + 'mean_value': [104, 117, 123], + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [ + P.Resize.LINEAR, + P.Resize.AREA, + P.Resize.NEAREST, + P.Resize.CUBIC, + P.Resize.LANCZOS4, + ], + }, + 'distort_param': { + 'brightness_prob': 
0.5, + 'brightness_delta': 32, + 'contrast_prob': 0.5, + 'contrast_lower': 0.5, + 'contrast_upper': 1.5, + 'hue_prob': 0.5, + 'hue_delta': 18, + 'saturation_prob': 0.5, + 'saturation_lower': 0.5, + 'saturation_upper': 1.5, + 'random_order_prob': 0.0, + }, + 'expand_param': { + 'prob': 0.5, + 'max_expand_ratio': 4.0, + }, + 'emit_constraint': { + 'emit_type': caffe_pb2.EmitConstraint.CENTER, + } + } +test_transform_param = { + 'mean_value': [104, 117, 123], + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [P.Resize.LINEAR], + }, + } + +# If true, use batch norm for all newly added layers. +# Currently only the non batch norm version has been tested. +use_batchnorm = False +lr_mult = 1 +# Use different initial learning rate. +if use_batchnorm: + base_lr = 0.0004 +else: + # A learning rate for batch_size = 1, num_gpus = 1. + base_lr = 0.00004 + +# The job name should be same as the name used in examples/ssd/ssd_pascal.py. +job_name = "SSD_{}".format(resize) +# The name of the model. Modify it if you want. +model_name = "VGG_VOC0712_{}".format(job_name) + +# Directory which stores the model .prototxt file. +save_dir = "models/VGGNet/VOC0712/{}_speed".format(job_name) +# Directory which stores the snapshot of trained models. +snapshot_dir = "models/VGGNet/VOC0712/{}".format(job_name) +# Directory which stores the job script and log file. +job_dir = "jobs/VGGNet/VOC0712/{}_speed".format(job_name) +# Directory which stores the detection results. +output_result_dir = "{}/data/VOCdevkit/results/VOC2007/{}_speed/Main".format(os.environ['HOME'], job_name) + +# model definition files. +train_net_file = "{}/train.prototxt".format(save_dir) +test_net_file = "{}/test.prototxt".format(save_dir) +deploy_net_file = "{}/deploy.prototxt".format(save_dir) +solver_file = "{}/solver.prototxt".format(save_dir) +# snapshot prefix. 
+snapshot_prefix = "{}/{}".format(snapshot_dir, model_name) +# job script path. +job_file = "{}/{}.sh".format(job_dir, model_name) + +# Find most recent snapshot. +max_iter = 0 +for file in os.listdir(snapshot_dir): + if file.endswith(".caffemodel"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if iter > max_iter: + max_iter = iter + +if max_iter == 0: + print("Cannot find snapshot in {}".format(snapshot_dir)) + sys.exit() + +# Stores the test image names and sizes. Created by data/VOC0712/create_list.sh +name_size_file = "data/VOC0712/test_name_size.txt" +# The resume model. +pretrain_model = "{}_iter_{}.caffemodel".format(snapshot_prefix, max_iter) +# Stores LabelMapItem. +label_map_file = "data/VOC0712/labelmap_voc.prototxt" + +# MultiBoxLoss parameters. +num_classes = 21 +share_location = True +background_label_id=0 +train_on_diff_gt = True +normalization_mode = P.Loss.VALID +code_type = P.PriorBox.CENTER_SIZE +ignore_cross_boundary_bbox = False +mining_type = P.MultiBoxLoss.MAX_NEGATIVE +neg_pos_ratio = 3. +loc_weight = (neg_pos_ratio + 1.) / 4. +multibox_loss_param = { + 'loc_loss_type': P.MultiBoxLoss.SMOOTH_L1, + 'conf_loss_type': P.MultiBoxLoss.SOFTMAX, + 'loc_weight': loc_weight, + 'num_classes': num_classes, + 'share_location': share_location, + 'match_type': P.MultiBoxLoss.PER_PREDICTION, + 'overlap_threshold': 0.5, + 'use_prior_for_matching': True, + 'background_label_id': background_label_id, + 'use_difficult_gt': train_on_diff_gt, + 'mining_type': mining_type, + 'neg_pos_ratio': neg_pos_ratio, + 'neg_overlap': 0.5, + 'code_type': code_type, + 'ignore_cross_boundary_bbox': ignore_cross_boundary_bbox, + } +loss_param = { + 'normalization': normalization_mode, + } + +# parameters for generating priors. 
+# minimum dimension of input image +min_dim = 300 +# conv4_3 ==> 38 x 38 +# fc7 ==> 19 x 19 +# conv6_2 ==> 10 x 10 +# conv7_2 ==> 5 x 5 +# conv8_2 ==> 3 x 3 +# conv9_2 ==> 1 x 1 +mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2'] +# in percent % +min_ratio = 20 +max_ratio = 90 +step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2))) +min_sizes = [] +max_sizes = [] +for ratio in xrange(min_ratio, max_ratio + 1, step): + min_sizes.append(min_dim * ratio / 100.) + max_sizes.append(min_dim * (ratio + step) / 100.) +min_sizes = [min_dim * 10 / 100.] + min_sizes +max_sizes = [min_dim * 20 / 100.] + max_sizes +steps = [8, 16, 32, 64, 100, 300] +aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] +# L2 normalize conv4_3. +normalizations = [20, -1, -1, -1, -1, -1] +# variance used to encode/decode prior bboxes. +if code_type == P.PriorBox.CENTER_SIZE: + prior_variance = [0.1, 0.1, 0.2, 0.2] +else: + prior_variance = [0.1] +flip = True +clip = False + +# Solver parameters. +# Defining which GPUs to use. +gpus = "0" +gpulist = gpus.split(",") +num_gpus = len(gpulist) + +# The number does not matter since we do not do training with this script. +batch_size = 1 +accum_batch_size = 1 +iter_size = accum_batch_size / batch_size +solver_mode = P.Solver.CPU +device_id = 0 +batch_size_per_device = batch_size +if num_gpus > 0: + batch_size_per_device = int(math.ceil(float(batch_size) / num_gpus)) + iter_size = int(math.ceil(float(accum_batch_size) / (batch_size_per_device * num_gpus))) + solver_mode = P.Solver.GPU + device_id = int(gpulist[0]) + +if normalization_mode == P.Loss.NONE: + base_lr /= batch_size_per_device +elif normalization_mode == P.Loss.VALID: + base_lr *= 25. / loc_weight +elif normalization_mode == P.Loss.FULL: + # Roughly there are 2000 prior bboxes per image. + # TODO(weiliu89): Estimate the exact # of priors. + base_lr *= 2000. + +# Evaluate on whole test set. 
+num_test_image = 4952 +test_batch_size = 8 +# Ideally test_batch_size should be divisible by num_test_image, +# otherwise mAP will be slightly off the true value. +test_iter = int(math.ceil(float(num_test_image) / test_batch_size)) + +solver_param = { + # Train parameters + 'base_lr': base_lr, + 'weight_decay': 0.0005, + 'lr_policy': "multistep", + 'stepvalue': [80000, 100000, 120000], + 'gamma': 0.1, + 'momentum': 0.9, + 'iter_size': iter_size, + 'max_iter': 0, + 'snapshot': 0, + 'display': 10, + 'average_loss': 10, + 'type': "SGD", + 'solver_mode': solver_mode, + 'device_id': device_id, + 'debug_info': False, + 'snapshot_after_train': False, + # Test parameters + 'test_iter': [test_iter], + 'test_interval': 10000, + 'test_initialization': True, + } + +# parameters for generating detection output. +det_out_param = { + 'num_classes': num_classes, + 'share_location': share_location, + 'background_label_id': background_label_id, + 'nms_param': {'nms_threshold': 0.45, 'top_k': 400}, + 'save_output_param': { + # Not saving results when testing speed. + # 'output_directory': output_result_dir, + 'output_name_prefix': "comp4_det_test_", + 'output_format': "VOC", + 'label_map_file': label_map_file, + 'name_size_file': name_size_file, + 'num_test_image': num_test_image, + }, + 'keep_top_k': 200, + 'confidence_threshold': 0.01, + 'code_type': code_type, + } + +# parameters for evaluating detection results. +det_eval_param = { + 'num_classes': num_classes, + 'background_label_id': background_label_id, + 'overlap_threshold': 0.5, + 'evaluate_difficult_gt': False, + 'name_size_file': name_size_file, + } + +### Hopefully you don't need to change the following ### +# Check file. +check_if_exist(train_data) +check_if_exist(test_data) +check_if_exist(label_map_file) +check_if_exist(pretrain_model) +make_if_not_exist(save_dir) +make_if_not_exist(job_dir) +make_if_not_exist(snapshot_dir) + +# Create train net. 
+net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(train_data, batch_size=batch_size_per_device, + train=True, output_label=True, label_map_file=label_map_file, + transform_param=train_transform_param, batch_sampler=batch_sampler) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +# Create the MultiBoxLossLayer. +name = "mbox_loss" +mbox_layers.append(net.label) +net[name] = L.MultiBoxLoss(*mbox_layers, multibox_loss_param=multibox_loss_param, + loss_param=loss_param, include=dict(phase=caffe_pb2.Phase.Value('TRAIN')), + propagate_down=[True, True, False, False]) + +with open(train_net_file, 'w') as f: + print('name: "{}_train"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(train_net_file, job_dir) + +# Create test net. 
+net = caffe.NetSpec() +net.data = CreateAnnotatedDataLayer(test_data, batch_size=test_batch_size, + train=False, output_label=False, label_map_file=label_map_file, + transform_param=test_transform_param) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +conf_name = "mbox_conf" +if multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.SOFTMAX: + reshape_name = "{}_reshape".format(conf_name) + net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes])) + softmax_name = "{}_softmax".format(conf_name) + net[softmax_name] = L.Softmax(net[reshape_name], axis=2) + flatten_name = "{}_flatten".format(conf_name) + net[flatten_name] = L.Flatten(net[softmax_name], axis=1) + mbox_layers[1] = net[flatten_name] +elif multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.LOGISTIC: + sigmoid_name = "{}_sigmoid".format(conf_name) + net[sigmoid_name] = L.Sigmoid(net[conf_name]) + mbox_layers[1] = net[sigmoid_name] + +net.detection_out = L.DetectionOutput(*mbox_layers, + detection_output_param=det_out_param, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) +net.slience = L.Silence(net.detection_out, ntop=0, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) + +with open(test_net_file, 'w') as f: + print('name: "{}_test"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(test_net_file, job_dir) + +# Create deploy net. +# Remove the first and last layer from test net. 
+deploy_net = net +with open(deploy_net_file, 'w') as f: + net_param = deploy_net.to_proto() + # Remove the first (AnnotatedData) and last (DetectionEvaluate) layer from test net. + del net_param.layer[0] + del net_param.layer[-1] + net_param.name = '{}_deploy'.format(model_name) + net_param.input.extend(['data']) + net_param.input_shape.extend([ + caffe_pb2.BlobShape(dim=[1, 3, resize_height, resize_width])]) + print(net_param, file=f) +shutil.copy(deploy_net_file, job_dir) + +# Create solver. +solver = caffe_pb2.SolverParameter( + train_net=train_net_file, + test_net=[test_net_file], + snapshot_prefix=snapshot_prefix, + **solver_param) + +with open(solver_file, 'w') as f: + print(solver, file=f) +shutil.copy(solver_file, job_dir) + +# Create job file. +with open(job_file, 'w') as f: + f.write('cd {}\n'.format(caffe_root)) + f.write('./build/tools/caffe train \\\n') + f.write('--solver="{}" \\\n'.format(solver_file)) + f.write('--weights="{}" \\\n'.format(pretrain_model)) + if solver_param['solver_mode'] == P.Solver.GPU: + f.write('--gpu {} 2>&1 | tee {}/{}_test{}.log\n'.format(gpus, job_dir, model_name, max_iter)) + else: + f.write('2>&1 | tee {}/{}.log\n'.format(job_dir, model_name)) + +# Copy the python script to job_dir. +py_file = os.path.abspath(__file__) +shutil.copy(py_file, job_dir) + +# Run the job. +os.chmod(job_file, stat.S_IRWXU) +if run_soon: + subprocess.call(job_file, shell=True) diff --git a/examples/ssd/ssd_pascal_video.py b/examples/ssd/ssd_pascal_video.py new file mode 100644 index 00000000000..ac2463d97db --- /dev/null +++ b/examples/ssd/ssd_pascal_video.py @@ -0,0 +1,293 @@ +from __future__ import print_function +import caffe +from google.protobuf import text_format + +import math +import os +import shutil +import stat +import subprocess +import sys + +sys.path.insert(0, 'examples/ssd') +from model_libs import * + +# Add extra layers on top of a "base" network (e.g. VGGNet or Inception). 
+def AddExtraLayers(net, use_batchnorm=True, lr_mult=1): + use_relu = True + + # Add additional convolutional layers. + # 19 x 19 + from_layer = net.keys()[-1] + + # TODO(weiliu89): Construct the name using the last layer to avoid duplication. + # 10 x 10 + out_layer = "conv6_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv6_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2, + lr_mult=lr_mult) + + # 5 x 5 + from_layer = out_layer + out_layer = "conv7_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv7_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 1, 2, + lr_mult=lr_mult) + + # 3 x 3 + from_layer = out_layer + out_layer = "conv8_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv8_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + # 1 x 1 + from_layer = out_layer + out_layer = "conv9_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv9_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + return net + + +### Modify the following parameters accordingly ### +# The directory which contains the caffe code. +# We assume you are running the script at the CAFFE_ROOT. +caffe_root = os.getcwd() + +# Set true if you want to start training right after generating all files. +run_soon = True +# The video file path +video_file = "examples/videos/ILSVRC2015_train_00755001.mp4" + +# The parameters for the video demo + +# Key parameters used in training +# If true, use batch norm for all newly added layers. 
+# Currently only the non batch norm version has been tested. +use_batchnorm = False +num_classes = 21 +share_location = True +background_label_id=0 +conf_loss_type = P.MultiBoxLoss.SOFTMAX +code_type = P.PriorBox.CENTER_SIZE +lr_mult = 1. +# Stores LabelMapItem. +label_map_file = "data/VOC0712/labelmap_voc.prototxt" +# The resized image size +resize_width = 300 +resize_height = 300 + +# Parameters needed for test. +# Set the number of test iterations to the maximum integer number. +test_iter = int(math.pow(2, 29) - 1) +# Use GPU or CPU +solver_mode = P.Solver.GPU +# Defining which GPUs to use. +gpus = "0" +# Number of frames to be processed per batch. +test_batch_size = 1 +# Only display high quality detections whose scores are higher than a threshold. +visualize_threshold = 0.3 +# Size of video image. +video_width = 1280 +video_height = 720 +# Scale the image size for display. +scale = 0.8 + +### Hopefully you don't need to change the following ### +resize = "{}x{}".format(resize_width, resize_height) +video_data_param = { + 'video_type': P.VideoData.VIDEO, + 'video_file': video_file, + } +test_transform_param = { + 'mean_value': [104, 117, 123], + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [P.Resize.LINEAR], + }, + } +output_transform_param = { + 'mean_value': [104, 117, 123], + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': int(video_height * scale), + 'width': int(video_width * scale), + 'interp_mode': [P.Resize.LINEAR], + }, + } +# parameters for generating detection output. 
+det_out_param = { + 'num_classes': num_classes, + 'share_location': share_location, + 'background_label_id': background_label_id, + 'nms_param': {'nms_threshold': 0.45, 'top_k': 400}, + 'save_output_param': { + 'label_map_file': label_map_file, + }, + 'keep_top_k': 200, + 'confidence_threshold': 0.01, + 'code_type': code_type, + 'visualize': True, + 'visualize_threshold': visualize_threshold, + } + +# The job name should be same as the name used in examples/ssd/ssd_pascal.py. +job_name = "SSD_{}".format(resize) +# The name of the model. Modify it if you want. +model_name = "VGG_VOC0712_{}".format(job_name) + +# Directory which stores the model .prototxt file. +save_dir = "models/VGGNet/VOC0712/{}_video".format(job_name) +# Directory which stores the snapshot of trained models. +snapshot_dir = "models/VGGNet/VOC0712/{}".format(job_name) +# Directory which stores the job script and log file. +job_dir = "jobs/VGGNet/VOC0712/{}_video".format(job_name) + +# model definition files. +test_net_file = "{}/test.prototxt".format(save_dir) +# snapshot prefix. +snapshot_prefix = "{}/{}".format(snapshot_dir, model_name) +# job script path. +job_file = "{}/{}.sh".format(job_dir, model_name) + +# Find most recent snapshot. +max_iter = 0 +for file in os.listdir(snapshot_dir): + if file.endswith(".caffemodel"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if iter > max_iter: + max_iter = iter + +if max_iter == 0: + print("Cannot find snapshot in {}".format(snapshot_dir)) + sys.exit() + +# The resume model. +pretrain_model = "{}_iter_{}.caffemodel".format(snapshot_prefix, max_iter) + +# parameters for generating priors. 
+# minimum dimension of input image +min_dim = 300 +# conv4_3 ==> 38 x 38 +# fc7 ==> 19 x 19 +# conv6_2 ==> 10 x 10 +# conv7_2 ==> 5 x 5 +# conv8_2 ==> 3 x 3 +# conv9_2 ==> 1 x 1 +mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2'] +# in percent % +min_ratio = 20 +max_ratio = 90 +step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2))) +min_sizes = [] +max_sizes = [] +for ratio in xrange(min_ratio, max_ratio + 1, step): + min_sizes.append(min_dim * ratio / 100.) + max_sizes.append(min_dim * (ratio + step) / 100.) +min_sizes = [min_dim * 10 / 100.] + min_sizes +max_sizes = [min_dim * 20 / 100.] + max_sizes +steps = [8, 16, 32, 64, 100, 300] +aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] +# L2 normalize conv4_3. +normalizations = [20, -1, -1, -1, -1, -1] +# variance used to encode/decode prior bboxes. +if code_type == P.PriorBox.CENTER_SIZE: + prior_variance = [0.1, 0.1, 0.2, 0.2] +else: + prior_variance = [0.1] +flip = True +clip = False + +# Check file. +check_if_exist(label_map_file) +check_if_exist(pretrain_model) +make_if_not_exist(save_dir) +make_if_not_exist(job_dir) +make_if_not_exist(snapshot_dir) + +# Create test net. 
+net = caffe.NetSpec() +net.data = L.VideoData(video_data_param=video_data_param, + data_param=dict(batch_size=test_batch_size), + transform_param=test_transform_param) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +conf_name = "mbox_conf" +if conf_loss_type == P.MultiBoxLoss.SOFTMAX: + reshape_name = "{}_reshape".format(conf_name) + net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes])) + softmax_name = "{}_softmax".format(conf_name) + net[softmax_name] = L.Softmax(net[reshape_name], axis=2) + flatten_name = "{}_flatten".format(conf_name) + net[flatten_name] = L.Flatten(net[softmax_name], axis=1) + mbox_layers[1] = net[flatten_name] +elif conf_loss_type == P.MultiBoxLoss.LOGISTIC: + sigmoid_name = "{}_sigmoid".format(conf_name) + net[sigmoid_name] = L.Sigmoid(net[conf_name]) + mbox_layers[1] = net[sigmoid_name] + +mbox_layers.append(net.data) +net.detection_out = L.DetectionOutput(*mbox_layers, + detection_output_param=det_out_param, + transform_param=output_transform_param, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) +net.slience = L.Silence(net.detection_out, ntop=0, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) + +with open(test_net_file, 'w') as f: + print('name: "{}_test"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(test_net_file, job_dir) + +# Create job file. 
+with open(job_file, 'w') as f: + f.write('cd {}\n'.format(caffe_root)) + f.write('./build/tools/caffe test \\\n') + f.write('--model="{}" \\\n'.format(test_net_file)) + f.write('--weights="{}" \\\n'.format(pretrain_model)) + f.write('--iterations="{}" \\\n'.format(test_iter)) + if solver_mode == P.Solver.GPU: + f.write('--gpu {}\n'.format(gpus)) + +# Copy the python script to job_dir. +py_file = os.path.abspath(__file__) +shutil.copy(py_file, job_dir) + +# Run the job. +os.chmod(job_file, stat.S_IRWXU) +if run_soon: + subprocess.call(job_file, shell=True) diff --git a/examples/ssd/ssd_pascal_webcam.py b/examples/ssd/ssd_pascal_webcam.py new file mode 100644 index 00000000000..3b58e5437b2 --- /dev/null +++ b/examples/ssd/ssd_pascal_webcam.py @@ -0,0 +1,296 @@ +from __future__ import print_function +import caffe +from google.protobuf import text_format + +import math +import os +import shutil +import stat +import subprocess +import sys + +sys.path.insert(0, 'examples/ssd') +from model_libs import * + +# Add extra layers on top of a "base" network (e.g. VGGNet or Inception). +def AddExtraLayers(net, use_batchnorm=True, lr_mult=1): + use_relu = True + + # Add additional convolutional layers. + # 19 x 19 + from_layer = net.keys()[-1] + + # TODO(weiliu89): Construct the name using the last layer to avoid duplication. 
+ # 10 x 10 + out_layer = "conv6_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv6_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2, + lr_mult=lr_mult) + + # 5 x 5 + from_layer = out_layer + out_layer = "conv7_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv7_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 1, 2, + lr_mult=lr_mult) + + # 3 x 3 + from_layer = out_layer + out_layer = "conv8_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv8_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + # 1 x 1 + from_layer = out_layer + out_layer = "conv9_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv9_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + return net + + +### Modify the following parameters accordingly ### +# The directory which contains the caffe code. +# We assume you are running the script at the CAFFE_ROOT. +caffe_root = os.getcwd() + +# Set true if you want to start training right after generating all files. +run_soon = True +# The device id for webcam +webcam_id = 0 +# Number of frames to be skipped. +skip_frames = 0 + +# The parameters for the webcam demo + +# Key parameters used in training +# If true, use batch norm for all newly added layers. +# Currently only the non batch norm version has been tested. +use_batchnorm = False +num_classes = 21 +share_location = True +background_label_id=0 +conf_loss_type = P.MultiBoxLoss.SOFTMAX +code_type = P.PriorBox.CENTER_SIZE +lr_mult = 1. 
+# Stores LabelMapItem. +label_map_file = "data/VOC0712/labelmap_voc.prototxt" +# The resized image size +resize_width = 300 +resize_height = 300 + +# Parameters needed for test. +# Set the number of test iterations to the maximum integer number. +test_iter = int(math.pow(2, 29) - 1) +# Use GPU or CPU +solver_mode = P.Solver.GPU +# Defining which GPUs to use. +gpus = "0" +# Number of frames to be processed per batch. +test_batch_size = 1 +# Only display high quality detections whose scores are higher than a threshold. +visualize_threshold = 0.6 +# Size of webcam image. +webcam_width = 640 +webcam_height = 480 +# Scale the image size for display. +scale = 1.5 + +### Hopefully you don't need to change the following ### +resize = "{}x{}".format(resize_width, resize_height) +video_data_param = { + 'video_type': P.VideoData.WEBCAM, + 'device_id': webcam_id, + 'skip_frames': skip_frames, + } +test_transform_param = { + 'mean_value': [104, 117, 123], + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [P.Resize.LINEAR], + }, + } +output_transform_param = { + 'mean_value': [104, 117, 123], + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': int(webcam_height * scale), + 'width': int(webcam_width * scale), + 'interp_mode': [P.Resize.LINEAR], + }, + } +# parameters for generating detection output. +det_out_param = { + 'num_classes': num_classes, + 'share_location': share_location, + 'background_label_id': background_label_id, + 'nms_param': {'nms_threshold': 0.45, 'top_k': 400}, + 'save_output_param': { + 'label_map_file': label_map_file, + }, + 'keep_top_k': 200, + 'confidence_threshold': 0.01, + 'code_type': code_type, + 'visualize': True, + 'visualize_threshold': visualize_threshold, + } + +# The job name should be same as the name used in examples/ssd/ssd_pascal.py. +job_name = "SSD_{}".format(resize) +# The name of the model. Modify it if you want. 
+model_name = "VGG_VOC0712_{}".format(job_name) + +# Directory which stores the model .prototxt file. +save_dir = "models/VGGNet/VOC0712/{}_webcam".format(job_name) +# Directory which stores the snapshot of trained models. +snapshot_dir = "models/VGGNet/VOC0712/{}".format(job_name) +# Directory which stores the job script and log file. +job_dir = "jobs/VGGNet/VOC0712/{}_webcam".format(job_name) + +# model definition files. +test_net_file = "{}/test.prototxt".format(save_dir) +# snapshot prefix. +snapshot_prefix = "{}/{}".format(snapshot_dir, model_name) +# job script path. +job_file = "{}/{}.sh".format(job_dir, model_name) + +# Find most recent snapshot. +max_iter = 0 +for file in os.listdir(snapshot_dir): + if file.endswith(".caffemodel"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if iter > max_iter: + max_iter = iter + +if max_iter == 0: + print("Cannot find snapshot in {}".format(snapshot_dir)) + sys.exit() + +# The resume model. +pretrain_model = "{}_iter_{}.caffemodel".format(snapshot_prefix, max_iter) + +# parameters for generating priors. +# minimum dimension of input image +min_dim = 300 +# conv4_3 ==> 38 x 38 +# fc7 ==> 19 x 19 +# conv6_2 ==> 10 x 10 +# conv7_2 ==> 5 x 5 +# conv8_2 ==> 3 x 3 +# conv9_2 ==> 1 x 1 +mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2'] +# in percent % +min_ratio = 20 +max_ratio = 90 +step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2))) +min_sizes = [] +max_sizes = [] +for ratio in xrange(min_ratio, max_ratio + 1, step): + min_sizes.append(min_dim * ratio / 100.) + max_sizes.append(min_dim * (ratio + step) / 100.) +min_sizes = [min_dim * 10 / 100.] + min_sizes +max_sizes = [min_dim * 20 / 100.] + max_sizes +steps = [8, 16, 32, 64, 100, 300] +aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] +# L2 normalize conv4_3. +normalizations = [20, -1, -1, -1, -1, -1] +# variance used to encode/decode prior bboxes. 
+if code_type == P.PriorBox.CENTER_SIZE: + prior_variance = [0.1, 0.1, 0.2, 0.2] +else: + prior_variance = [0.1] +flip = True +clip = False + +# Check file. +check_if_exist(label_map_file) +check_if_exist(pretrain_model) +make_if_not_exist(save_dir) +make_if_not_exist(job_dir) +make_if_not_exist(snapshot_dir) + +# Create test net. +net = caffe.NetSpec() +net.data = L.VideoData(video_data_param=video_data_param, + data_param=dict(batch_size=test_batch_size), + transform_param=test_transform_param) + +VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +conf_name = "mbox_conf" +if conf_loss_type == P.MultiBoxLoss.SOFTMAX: + reshape_name = "{}_reshape".format(conf_name) + net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes])) + softmax_name = "{}_softmax".format(conf_name) + net[softmax_name] = L.Softmax(net[reshape_name], axis=2) + flatten_name = "{}_flatten".format(conf_name) + net[flatten_name] = L.Flatten(net[softmax_name], axis=1) + mbox_layers[1] = net[flatten_name] +elif conf_loss_type == P.MultiBoxLoss.LOGISTIC: + sigmoid_name = "{}_sigmoid".format(conf_name) + net[sigmoid_name] = L.Sigmoid(net[conf_name]) + mbox_layers[1] = net[sigmoid_name] + +mbox_layers.append(net.data) +net.detection_out = L.DetectionOutput(*mbox_layers, + detection_output_param=det_out_param, + transform_param=output_transform_param, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) +net.slience = L.Silence(net.detection_out, ntop=0, + 
include=dict(phase=caffe_pb2.Phase.Value('TEST'))) + +with open(test_net_file, 'w') as f: + print('name: "{}_test"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(test_net_file, job_dir) + +# Create job file. +with open(job_file, 'w') as f: + f.write('cd {}\n'.format(caffe_root)) + f.write('./build/tools/caffe test \\\n') + f.write('--model="{}" \\\n'.format(test_net_file)) + f.write('--weights="{}" \\\n'.format(pretrain_model)) + f.write('--iterations="{}" \\\n'.format(test_iter)) + if solver_mode == P.Solver.GPU: + f.write('--gpu {}\n'.format(gpus)) + +# Copy the python script to job_dir. +py_file = os.path.abspath(__file__) +shutil.copy(py_file, job_dir) + +# Run the job. +os.chmod(job_file, stat.S_IRWXU) +if run_soon: + subprocess.call(job_file, shell=True) diff --git a/examples/ssd/ssd_pascal_zf.py b/examples/ssd/ssd_pascal_zf.py new file mode 100644 index 00000000000..1531c26560d --- /dev/null +++ b/examples/ssd/ssd_pascal_zf.py @@ -0,0 +1,580 @@ +''' +Before running this script, you should download the fully convolutional reduced (atrous) ZFNet at: + http://cs.unc.edu/~wliu/projects/SSD/ZF_conv_reduced.caffemodel +By default, we assume the model is stored in `$CAFFE_ROOT/models/ZFNet/` +''' +from __future__ import print_function +import caffe +from google.protobuf import text_format + +import math +import os +import shutil +import stat +import subprocess +import sys + +sys.path.insert(0, 'examples/ssd') +from model_libs import * + +# Add extra layers on top of a "base" network (e.g. VGGNet or Inception). +def AddExtraLayers(net, use_batchnorm=True, lr_mult=1): + use_relu = True + + # Add additional convolutional layers. + # 19 x 19 + from_layer = net.keys()[-1] + + # TODO(weiliu89): Construct the name using the last layer to avoid duplication. 
+ # 10 x 10 + out_layer = "conv6_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv6_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2, + lr_mult=lr_mult) + + # 5 x 5 + from_layer = out_layer + out_layer = "conv7_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv7_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 1, 2, + lr_mult=lr_mult) + + # 3 x 3 + from_layer = out_layer + out_layer = "conv8_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv8_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + # 1 x 1 + from_layer = out_layer + out_layer = "conv9_1" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, + lr_mult=lr_mult) + + from_layer = out_layer + out_layer = "conv9_2" + ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, + lr_mult=lr_mult) + + return net + + +### Modify the following parameters accordingly ### +# The directory which contains the caffe code. +# We assume you are running the script at the CAFFE_ROOT. +caffe_root = os.getcwd() + +# Set true if you want to start training right after generating all files. +run_soon = True +# Set true if you want to load from most recently saved snapshot. +# Otherwise, we will load from the pretrain_model defined below. +resume_training = True +# If true, Remove old model files. +remove_old_models = False + +# The database file for training data. Created by data/VOC0712/create_data.sh +train_data = "examples/VOC0712/VOC0712_trainval_lmdb" +# The database file for testing data. 
Created by data/VOC0712/create_data.sh +test_data = "examples/VOC0712/VOC0712_test_lmdb" +# Specify the batch sampler. +resize_width = 300 +resize_height = 300 +resize = "{}x{}".format(resize_width, resize_height) +batch_sampler = [ + { + 'sampler': { + }, + 'max_trials': 1, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.1, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.3, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.5, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.7, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'min_jaccard_overlap': 0.9, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + { + 'sampler': { + 'min_scale': 0.3, + 'max_scale': 1.0, + 'min_aspect_ratio': 0.5, + 'max_aspect_ratio': 2.0, + }, + 'sample_constraint': { + 'max_jaccard_overlap': 1.0, + }, + 'max_trials': 50, + 'max_sample': 1, + }, + ] +train_transform_param = { + 'mirror': True, + 'mean_value': [104, 117, 123], + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [ + P.Resize.LINEAR, + P.Resize.AREA, + P.Resize.NEAREST, + P.Resize.CUBIC, + P.Resize.LANCZOS4, + ], + }, + 'distort_param': { + 'brightness_prob': 
0.5, + 'brightness_delta': 32, + 'contrast_prob': 0.5, + 'contrast_lower': 0.5, + 'contrast_upper': 1.5, + 'hue_prob': 0.5, + 'hue_delta': 18, + 'saturation_prob': 0.5, + 'saturation_lower': 0.5, + 'saturation_upper': 1.5, + 'random_order_prob': 0.0, + }, + 'expand_param': { + 'prob': 0.5, + 'max_expand_ratio': 4.0, + }, + 'emit_constraint': { + 'emit_type': caffe_pb2.EmitConstraint.CENTER, + } + } +test_transform_param = { + 'mean_value': [104, 117, 123], + 'resize_param': { + 'prob': 1, + 'resize_mode': P.Resize.WARP, + 'height': resize_height, + 'width': resize_width, + 'interp_mode': [P.Resize.LINEAR], + }, + } + +# If true, use batch norm for all newly added layers. +# Currently only the non batch norm version has been tested. +use_batchnorm = False +lr_mult = 1 +# Use different initial learning rate. +if use_batchnorm: + base_lr = 0.0004 +else: + # A learning rate for batch_size = 1, num_gpus = 1. + base_lr = 0.00004 + +# Modify the job name if you want. +job_name = "SSD_{}".format(resize) +# The name of the model. Modify it if you want. +model_name = "ZF_VOC0712_{}".format(job_name) + +# Directory which stores the model .prototxt file. +save_dir = "models/ZFNet/VOC0712/{}".format(job_name) +# Directory which stores the snapshot of models. +snapshot_dir = "models/ZFNet/VOC0712/{}".format(job_name) +# Directory which stores the job script and log file. +job_dir = "jobs/ZFNet/VOC0712/{}".format(job_name) +# Directory which stores the detection results. +output_result_dir = "{}/data/VOCdevkit/results/VOC2007/{}/Main".format(os.environ['HOME'], job_name) + +# model definition files. +train_net_file = "{}/train.prototxt".format(save_dir) +test_net_file = "{}/test.prototxt".format(save_dir) +deploy_net_file = "{}/deploy.prototxt".format(save_dir) +solver_file = "{}/solver.prototxt".format(save_dir) +# snapshot prefix. +snapshot_prefix = "{}/{}".format(snapshot_dir, model_name) +# job script path. 
+job_file = "{}/{}.sh".format(job_dir, model_name) + +# Stores the test image names and sizes. Created by data/VOC0712/create_list.sh +name_size_file = "data/VOC0712/test_name_size.txt" +# The pretrained model. We use the Fully convolutional reduced (atrous) ZFNet. +pretrain_model = "models/ZFNet/ZF_conv_reduced.caffemodel" +# Stores LabelMapItem. +label_map_file = "data/VOC0712/labelmap_voc.prototxt" + +# MultiBoxLoss parameters. +num_classes = 21 +share_location = True +background_label_id=0 +train_on_diff_gt = True +normalization_mode = P.Loss.VALID +code_type = P.PriorBox.CENTER_SIZE +ignore_cross_boundary_bbox = False +mining_type = P.MultiBoxLoss.MAX_NEGATIVE +neg_pos_ratio = 3. +loc_weight = (neg_pos_ratio + 1.) / 4. +multibox_loss_param = { + 'loc_loss_type': P.MultiBoxLoss.SMOOTH_L1, + 'conf_loss_type': P.MultiBoxLoss.SOFTMAX, + 'loc_weight': loc_weight, + 'num_classes': num_classes, + 'share_location': share_location, + 'match_type': P.MultiBoxLoss.PER_PREDICTION, + 'overlap_threshold': 0.5, + 'use_prior_for_matching': True, + 'background_label_id': background_label_id, + 'use_difficult_gt': train_on_diff_gt, + 'mining_type': mining_type, + 'neg_pos_ratio': neg_pos_ratio, + 'neg_overlap': 0.5, + 'code_type': code_type, + 'ignore_cross_boundary_bbox': ignore_cross_boundary_bbox, + } +loss_param = { + 'normalization': normalization_mode, + } + +# parameters for generating priors. +# minimum dimension of input image +min_dim = 300 +# conv2 ==> 38 x 38 +# fc7 ==> 19 x 19 +# conv6_2 ==> 10 x 10 +# conv7_2 ==> 5 x 5 +# conv8_2 ==> 3 x 3 +# conv9_2 ==> 1 x 1 +mbox_source_layers = ['conv2', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2'] +# in percent % +min_ratio = 20 +max_ratio = 90 +step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2))) +min_sizes = [] +max_sizes = [] +for ratio in xrange(min_ratio, max_ratio + 1, step): + min_sizes.append(min_dim * ratio / 100.) + max_sizes.append(min_dim * (ratio + step) / 100.) 
+min_sizes = [min_dim * 10 / 100.] + min_sizes +max_sizes = [min_dim * 20 / 100.] + max_sizes +steps = [8, 16, 32, 64, 100, 300] +aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] +# L2 normalize conv2. +normalizations = [20, -1, -1, -1, -1, -1] +# variance used to encode/decode prior bboxes. +if code_type == P.PriorBox.CENTER_SIZE: + prior_variance = [0.1, 0.1, 0.2, 0.2] +else: + prior_variance = [0.1] +flip = True +clip = False + +# Solver parameters. +# Defining which GPUs to use. +gpus = "0,1,2,3" +gpulist = gpus.split(",") +num_gpus = len(gpulist) + +# Divide the mini-batch to different GPUs. +batch_size = 32 +accum_batch_size = 32 +iter_size = accum_batch_size / batch_size +solver_mode = P.Solver.CPU +device_id = 0 +batch_size_per_device = batch_size +if num_gpus > 0: + batch_size_per_device = int(math.ceil(float(batch_size) / num_gpus)) + iter_size = int(math.ceil(float(accum_batch_size) / (batch_size_per_device * num_gpus))) + solver_mode = P.Solver.GPU + device_id = int(gpulist[0]) + +if normalization_mode == P.Loss.NONE: + base_lr /= batch_size_per_device +elif normalization_mode == P.Loss.VALID: + base_lr *= 25. / loc_weight +elif normalization_mode == P.Loss.FULL: + # Roughly there are 2000 prior bboxes per image. + # TODO(weiliu89): Estimate the exact # of priors. + base_lr *= 2000. + +# Evaluate on whole test set. +num_test_image = 4952 +test_batch_size = 8 +# Ideally test_batch_size should be divisible by num_test_image, +# otherwise mAP will be slightly off the true value. 
+test_iter = int(math.ceil(float(num_test_image) / test_batch_size)) + +solver_param = { + # Train parameters + 'base_lr': base_lr, + 'weight_decay': 0.0005, + 'lr_policy': "multistep", + 'stepvalue': [80000, 100000, 120000], + 'gamma': 0.1, + 'momentum': 0.9, + 'iter_size': iter_size, + 'max_iter': 120000, + 'snapshot': 80000, + 'display': 10, + 'average_loss': 10, + 'type': "SGD", + 'solver_mode': solver_mode, + 'device_id': device_id, + 'debug_info': False, + 'snapshot_after_train': True, + # Test parameters + 'test_iter': [test_iter], + 'test_interval': 10000, + 'eval_type': "detection", + 'ap_version': "11point", + 'test_initialization': False, + } + +# parameters for generating detection output. +det_out_param = { + 'num_classes': num_classes, + 'share_location': share_location, + 'background_label_id': background_label_id, + 'nms_param': {'nms_threshold': 0.45, 'top_k': 400}, + 'save_output_param': { + 'output_directory': output_result_dir, + 'output_name_prefix': "comp4_det_test_", + 'output_format': "VOC", + 'label_map_file': label_map_file, + 'name_size_file': name_size_file, + 'num_test_image': num_test_image, + }, + 'keep_top_k': 200, + 'confidence_threshold': 0.01, + 'code_type': code_type, + } + +# parameters for evaluating detection results. +det_eval_param = { + 'num_classes': num_classes, + 'background_label_id': background_label_id, + 'overlap_threshold': 0.5, + 'evaluate_difficult_gt': False, + 'name_size_file': name_size_file, + } + +### Hopefully you don't need to change the following ### +# Check file. +check_if_exist(train_data) +check_if_exist(test_data) +check_if_exist(label_map_file) +check_if_exist(pretrain_model) +make_if_not_exist(save_dir) +make_if_not_exist(job_dir) +make_if_not_exist(snapshot_dir) + +# Create train net. 
+net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(train_data, batch_size=batch_size_per_device, + train=True, output_label=True, label_map_file=label_map_file, + transform_param=train_transform_param, batch_sampler=batch_sampler) + +ZFNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +# Create the MultiBoxLossLayer. +name = "mbox_loss" +mbox_layers.append(net.label) +net[name] = L.MultiBoxLoss(*mbox_layers, multibox_loss_param=multibox_loss_param, + loss_param=loss_param, include=dict(phase=caffe_pb2.Phase.Value('TRAIN')), + propagate_down=[True, True, False, False]) + +with open(train_net_file, 'w') as f: + print('name: "{}_train"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(train_net_file, job_dir) + +# Create test net. 
+net = caffe.NetSpec() +net.data, net.label = CreateAnnotatedDataLayer(test_data, batch_size=test_batch_size, + train=False, output_label=True, label_map_file=label_map_file, + transform_param=test_transform_param) + +ZFNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, + dropout=False) + +AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) + +mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, + use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, + aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, + num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, + prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) + +conf_name = "mbox_conf" +if multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.SOFTMAX: + reshape_name = "{}_reshape".format(conf_name) + net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes])) + softmax_name = "{}_softmax".format(conf_name) + net[softmax_name] = L.Softmax(net[reshape_name], axis=2) + flatten_name = "{}_flatten".format(conf_name) + net[flatten_name] = L.Flatten(net[softmax_name], axis=1) + mbox_layers[1] = net[flatten_name] +elif multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.LOGISTIC: + sigmoid_name = "{}_sigmoid".format(conf_name) + net[sigmoid_name] = L.Sigmoid(net[conf_name]) + mbox_layers[1] = net[sigmoid_name] + +net.detection_out = L.DetectionOutput(*mbox_layers, + detection_output_param=det_out_param, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) +net.detection_eval = L.DetectionEvaluate(net.detection_out, net.label, + detection_evaluate_param=det_eval_param, + include=dict(phase=caffe_pb2.Phase.Value('TEST'))) + +with open(test_net_file, 'w') as f: + print('name: "{}_test"'.format(model_name), file=f) + print(net.to_proto(), file=f) +shutil.copy(test_net_file, job_dir) + +# Create deploy net. +# Remove the first and last layer from test net. 
+deploy_net = net +with open(deploy_net_file, 'w') as f: + net_param = deploy_net.to_proto() + # Remove the first (AnnotatedData) and last (DetectionEvaluate) layer from test net. + del net_param.layer[0] + del net_param.layer[-1] + net_param.name = '{}_deploy'.format(model_name) + net_param.input.extend(['data']) + net_param.input_shape.extend([ + caffe_pb2.BlobShape(dim=[1, 3, resize_height, resize_width])]) + print(net_param, file=f) +shutil.copy(deploy_net_file, job_dir) + +# Create solver. +solver = caffe_pb2.SolverParameter( + train_net=train_net_file, + test_net=[test_net_file], + snapshot_prefix=snapshot_prefix, + **solver_param) + +with open(solver_file, 'w') as f: + print(solver, file=f) +shutil.copy(solver_file, job_dir) + +max_iter = 0 +# Find most recent snapshot. +for file in os.listdir(snapshot_dir): + if file.endswith(".solverstate"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if iter > max_iter: + max_iter = iter + +train_src_param = '--weights="{}" \\\n'.format(pretrain_model) +if resume_training: + if max_iter > 0: + train_src_param = '--snapshot="{}_iter_{}.solverstate" \\\n'.format(snapshot_prefix, max_iter) + +if remove_old_models: + # Remove any snapshots smaller than max_iter. + for file in os.listdir(snapshot_dir): + if file.endswith(".solverstate"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if max_iter > iter: + os.remove("{}/{}".format(snapshot_dir, file)) + if file.endswith(".caffemodel"): + basename = os.path.splitext(file)[0] + iter = int(basename.split("{}_iter_".format(model_name))[1]) + if max_iter > iter: + os.remove("{}/{}".format(snapshot_dir, file)) + +# Create job file. 
+with open(job_file, 'w') as f: + f.write('cd {}\n'.format(caffe_root)) + f.write('./build/tools/caffe train \\\n') + f.write('--solver="{}" \\\n'.format(solver_file)) + f.write(train_src_param) + if solver_param['solver_mode'] == P.Solver.GPU: + f.write('--gpu {} 2>&1 | tee {}/{}.log\n'.format(gpus, job_dir, model_name)) + else: + f.write('2>&1 | tee {}/{}.log\n'.format(job_dir, model_name)) + +# Copy the python script to job_dir. +py_file = os.path.abspath(__file__) +shutil.copy(py_file, job_dir) + +# Run the job. +os.chmod(job_file, stat.S_IRWXU) +if run_soon: + subprocess.call(job_file, shell=True) diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 36c497c0fa9..19ee97f4d54 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -350,6 +350,12 @@ class Caffe { static curandGenerator_t curand_generator() { return Get().curand_generator_; } + static cudaStream_t curand_stream() { + return Get().curand_stream_->get(); + } + static shared_ptr thread_pstream(int group = 0) { + return Get().pstream(group); + } static shared_ptr short_term_cublas_phandle() { return make_shared(); } @@ -432,10 +438,14 @@ class Caffe { static void set_restored_iter(int val); static void set_gpus(const std::vector& gpus) { - props().gpus_ = gpus; + std::lock_guard lock(caffe_mutex_); + gpus_ = gpus; + if (gpus_.empty()) { + gpus_.push_back(root_device_); + } } static const std::vector& gpus() { - return props().gpus_; + return gpus_; } static const std::string& caffe_version() { return props().caffe_version(); @@ -505,9 +515,10 @@ class Caffe { // Default device chosen by a user and associated with the main thread. // For example, if user runs `caffe train -gpu=1,0,3` then it has to be set to 1. 
+ static int root_device_; static Brew mode_; static int solver_count_; - static int root_device_; + static std::vector gpus_; static int thread_count_; static int restored_iter_; static std::atomic root_seed_; @@ -526,10 +537,14 @@ class Caffe { DISABLE_COPY_MOVE_AND_ASSIGN(Caffe); + public: // Caffe Properties singleton class Properties { friend class Caffe; + public: + Properties(); + const std::string& caffe_version() const { return caffe_version_; } @@ -560,7 +575,6 @@ class Caffe { } private: - std::vector gpus_; std::time_t init_time_; std::uint32_t main_thread_id_; std::string caffe_version_; @@ -570,15 +584,10 @@ class Caffe { std::string cuda_driver_version_; std::vector compute_capabilities_; - Properties(); DISABLE_COPY_MOVE_AND_ASSIGN(Properties); }; - static Properties props_; - - static Properties& props() { - return props_; - } + static Properties& props(); }; // Yet another Event implementation diff --git a/include/caffe/data_reader.hpp b/include/caffe/data_reader.hpp index 0887315dea6..9c2fccc48a4 100644 --- a/include/caffe/data_reader.hpp +++ b/include/caffe/data_reader.hpp @@ -23,6 +23,7 @@ namespace caffe { * to allow deterministic ordering down the road. Data is distributed to solvers * in a round-robin way to keep parallel training deterministic. 
*/ +template class DataReader : public InternalThread { private: class CursorManager { @@ -43,8 +44,8 @@ class DataReader : public InternalThread { size_t solver_rank, size_t parser_threads, size_t parser_thread_id, size_t batch_size_, bool cache, bool shuffle, bool epoch_count_required); ~CursorManager(); - void next(shared_ptr& datum); - void fetch(Datum* datum); + void next(shared_ptr& datum); + void fetch(DatumType* datum); void rewind(); size_t full_cycle() const { @@ -66,8 +67,8 @@ class DataReader : public InternalThread { return data_cache_inst_.get(); } - shared_ptr& next_new(); - shared_ptr& next_cached(DataReader& reader); + shared_ptr& next_new(); + shared_ptr& next_cached(DataReader& reader); bool check_memory(); void check_db(const std::string& db_source) { std::lock_guard lock(cache_mutex_); @@ -92,7 +93,7 @@ class DataReader : public InternalThread { just_cached_(false) {} std::string db_source_; - vector> cache_buffer_; + vector> cache_buffer_; size_t cache_idx_; boost::barrier cache_bar_; bool shuffle_; @@ -120,37 +121,37 @@ class DataReader : public InternalThread { start_reading_flag_.set(); } - void free_push(size_t queue_id, const shared_ptr& datum) { + void free_push(size_t queue_id, const shared_ptr& datum) { if (!sample_only_) { free_[queue_id]->push(datum); } } - shared_ptr free_pop(size_t queue_id) { + shared_ptr free_pop(size_t queue_id) { return free_[queue_id]->pop(); } - shared_ptr sample() { + shared_ptr sample() { return init_->peek(); } - void full_push(size_t queue_id, const shared_ptr& datum) { + void full_push(size_t queue_id, const shared_ptr& datum) { full_[queue_id]->push(datum); } - shared_ptr full_peek(size_t queue_id) { + shared_ptr full_peek(size_t queue_id) { return full_[queue_id]->peek(); } - shared_ptr full_pop(size_t queue_id, const char* log_on_wait) { + shared_ptr full_pop(size_t queue_id, const char* log_on_wait) { return full_[queue_id]->pop(log_on_wait); } - shared_ptr& next_new() { + shared_ptr& next_new() { 
return data_cache_->next_new(); } - shared_ptr& next_cached() { + shared_ptr& next_cached() { return data_cache_->next_cached(*this); } @@ -174,9 +175,9 @@ class DataReader : public InternalThread { const bool skip_one_batch_; DataParameter_DB backend_; - shared_ptr>> init_; - vector>>> free_; - vector>>> full_; + shared_ptr>> init_; + vector>>> free_; + vector>>> full_; private: int current_rec_; @@ -192,6 +193,10 @@ class DataReader : public InternalThread { DISABLE_COPY_MOVE_AND_ASSIGN(DataReader); }; +template +unique_ptr::DataCache> + DataReader::DataCache::data_cache_inst_; + } // namespace caffe #endif // CAFFE_DATA_READER_HPP_ diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp index 3d2658d7af2..853a1f281c8 100644 --- a/include/caffe/data_transformer.hpp +++ b/include/caffe/data_transformer.hpp @@ -11,24 +11,31 @@ #include "caffe/util/blocking_queue.hpp" #include "caffe/util/io.hpp" +#include "google/protobuf/repeated_field.h" +using google::protobuf::RepeatedPtrField; + namespace caffe { /** * @brief Applies common transformations to the input data, such as - * scaling, mirroring, substracting the image mean... + * scaling, mirroring, subtracting the image mean... */ +template class DataTransformer { public: DataTransformer(const TransformationParameter& param, Phase phase); ~DataTransformer() = default; + const TransformationParameter& transform_param() const { + return param_; + } + /** * @brief Initialize the Random number generations if needed by the * transformation. */ void InitRand(); - template void TransformGPU(int N, int C, int H, int W, size_t sizeof_element, const void* in, Dtype* out, const unsigned int* rands, bool signed_data); @@ -45,42 +52,8 @@ class DataTransformer { * shape. If nullptr passed then only shape vector is computed. 
* @return Output shape */ - template vector Transform(const Datum* datum, Dtype* buf, size_t buf_len, - Packing& out_packing, bool repack = true) { - vector shape; - const bool shape_only = buf == nullptr; - CHECK(!(param_.force_color() && param_.force_gray())) - << "cannot set both force_color and force_gray"; - const int color_mode = param_.force_color() ? 1 : (param_.force_gray() ? -1 : 0); - cv::Mat img; - bool v1_path = false; - if (datum->encoded()) { - shape = DecodeDatumToCVMat(*datum, color_mode, img, shape_only, false); - out_packing = NHWC; - } else { - if (image_random_resize_enabled() || buf == nullptr || buf_len == 0UL) { - shape = DatumToCVMat(*datum, img, shape_only); - out_packing = NHWC; - } else { - // here we can use fast V1 path - TransformV1(*datum, buf, buf_len); - shape = vector{1, datum->channels(), datum->height(), datum->width()}; - v1_path = true; - out_packing = NCHW; - } - } - if (param_.crop_size() > 0) { - shape[2] = param_.crop_size(); - shape[3] = param_.crop_size(); - } - if (!shape_only && !v1_path) { - CHECK_NOTNULL(img.data); - Transform(img, buf, buf_len, repack); - out_packing = NHWC; - } - return shape; - } + Packing& out_packing, bool repack = true); /** * @brief Applies transformations defined in the image data layer's @@ -94,96 +67,23 @@ class DataTransformer { * The destination array that will store transformed data of a fixed * shape. 
*/ - template - void Transform(const cv::Mat& src, Dtype* buf, size_t buf_len, bool repack = true) { - cv::Mat tmp, dst; - - image_random_resize(src, tmp); - - if (image_random_crop_enabled()) { - image_random_crop(param_.crop_size(), param_.crop_size(), tmp); // TODO - } else if (image_center_crop_enabled()) { - image_center_crop(param_.crop_size(), param_.crop_size(), tmp); - } - apply_mean_scale_mirror(tmp, dst); - FloatCVMatToBuf(dst, buf_len, buf, repack); - } + void Transform(const cv::Mat& src, Dtype* buf, size_t buf_len, bool repack = true) const; - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a vector of Mat. - * - * @param mat_vector - * A vector of Mat containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See memory_layer.cpp for an example. - */ - template - void Transform(const vector& mat_vector, TBlob* transformed_blob) { - const size_t mat_num = mat_vector.size(); - const int num = transformed_blob->num(); - CHECK_GT(mat_num, 0) << "There is no MAT to add"; - CHECK_EQ(mat_num, num) << "The size of mat_vector must be equals to transformed_blob->num()"; - cv::Mat dst; - size_t buf_len = transformed_blob->offset(1); - for (size_t item_id = 0; item_id < mat_num; ++item_id) { - size_t offset = transformed_blob->offset(item_id); - apply_mean_scale_mirror(mat_vector[item_id], dst); - FloatCVMatToBuf(dst, buf_len, transformed_blob->mutable_cpu_data(false) + offset); - } - } + void Transform(const cv::Mat& img, TBlob *transformed_blob) const; - template - void Transform(const cv::Mat& img, TBlob *transformed_blob) { - const int crop_size = param_.crop_size(); - const int img_channels = img.channels(); - const int img_height = img.rows; - const int img_width = img.cols; - - // Check dimensions. 
- const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - const int num = transformed_blob->num(); - - CHECK_EQ(channels, img_channels); - CHECK_LE(height, img_height); - CHECK_LE(width, img_width); - CHECK_GE(num, 1); - // TODO - if (crop_size > 0) { - CHECK_EQ(crop_size, height); - CHECK_EQ(crop_size, width); - } - Transform(img, transformed_blob->mutable_cpu_data(false), transformed_blob->count()); - } + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to a vector of Mat. + * + * @param mat_vector + * A vector of Mat containing the data to be transformed. + * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See memory_layer.cpp for an example. + */ + void Transform(const vector& mat_vector, TBlob* transformed_blob) const; - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a vector of Datum. - * - * @param datum_vector - * A vector of Datum containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See memory_layer.cpp for an example. 
- */ - template - void Transform(const vector& datum_vector, TBlob* transformed_blob) { - const size_t datum_num = datum_vector.size(); - const int num = transformed_blob->num(); - CHECK_GT(datum_num, 0) << "There is no datum to add"; - CHECK_LE(datum_num, num) - << "The size of datum_vector must be not greater than transformed_blob->num()"; - cv::Mat dst; - size_t buf_len = transformed_blob->offset(1); - for (size_t item_id = 0; item_id < datum_num; ++item_id) { - size_t offset = transformed_blob->offset(item_id); - DatumToCVMat(datum_vector[item_id], dst, false); - FloatCVMatToBuf(dst, buf_len, transformed_blob->mutable_cpu_data(false) + offset); - } - } + void Transform(const vector& datum_vector, TBlob *transformed_blob) const; /** * @brief Generates a random integer from Uniform({0, 1, ..., n-1}). @@ -199,173 +99,194 @@ class DataTransformer { } // tests only, TODO: clean - template - void Transform(Datum& datum, TBlob* transformed_blob) { + void Transform(const Datum& datum, TBlob* transformed_blob) const { cv::Mat img; DatumToCVMat(datum, img, false); Transform(img, transformed_blob); } - void VariableSizedTransforms(Datum* datum); void Fill3Randoms(unsigned int *rand) const; - protected: + void TransformInv(const Blob* blob, vector* cv_imgs); + void TransformInv(const Dtype* data, cv::Mat* cv_img, const int height, + const int width, const int channels); + + vector InferBlobShape(const cv::Mat& cv_img); + vector InferDatumShape(const Datum& datum); + vector InferCVMatShape(const cv::Mat& img); + + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * + * @param bottom_shape + * The shape of the data to be transformed. + */ + vector InferBlobShape(const vector& bottom_shape, bool use_gpu = false); + + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * + * @param datum + * Datum containing the data to be transformed. 
+ */ + vector InferBlobShape(const Datum& datum); + + /** + * @brief Crops the datum according to bbox. + */ + + void CropImage(const Datum& datum, const NormalizedBBox& bbox, Datum* crop_datum); + + /** + * @brief Crops the datum and AnnotationGroup according to bbox. + */ + void CropImage(const AnnotatedDatum& anno_datum, const NormalizedBBox& bbox, + AnnotatedDatum* cropped_anno_datum); + + /** + * @brief Expand the datum. + */ + void ExpandImage(const Datum& datum, const float expand_ratio, + NormalizedBBox* expand_bbox, Datum* expanded_datum); + + /** + * @brief Expand the datum and adjust AnnotationGroup. + */ + void ExpandImage(const AnnotatedDatum& anno_datum, AnnotatedDatum* expanded_anno_datum); + + /** + * @brief Apply distortion to the datum. + */ + void DistortImage(const Datum& datum, Datum* distort_datum); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to the annotated data. + * + * @param anno_datum + * AnnotatedDatum containing the data and annotation to be transformed. + * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See annotated_data_layer.cpp for an example. + * @param transformed_anno_vec + * This is destination annotation. 
+ */ + void Transform(const AnnotatedDatum& anno_datum, + TBlob* transformed_blob, + RepeatedPtrField* transformed_anno_vec); + + void Transform(const AnnotatedDatum& anno_datum, + TBlob* transformed_blob, + RepeatedPtrField* transformed_anno_vec, + bool* do_mirror); + + void Transform(const AnnotatedDatum& anno_datum, + TBlob* transformed_blob, + vector* transformed_anno_vec, + bool* do_mirror); + + void Transform(const AnnotatedDatum& anno_datum, + TBlob* transformed_blob, + vector* transformed_anno_vec); + bool image_random_resize_enabled() const; - bool image_random_crop_enabled() const; bool image_center_crop_enabled() const; + bool image_random_crop_enabled() const; + void image_random_resize(const cv::Mat& src, cv::Mat& dst) const; + void image_center_crop(int crop_w, int crop_h, cv::Mat& img) const; + void image_random_crop(int crop_w, int crop_h, cv::Mat& img) const; - void apply_mean_scale_mirror(const cv::Mat& src, cv::Mat& dst); - void image_random_crop(int crop_w, int crop_h, cv::Mat& img); - - template - void TransformV1(const Datum& datum, Dtype* buf, size_t buf_len) { - const string& data = datum.data(); - const int datum_channels = datum.channels(); - const int datum_height = datum.height(); - const int datum_width = datum.width(); - - const int crop_size = param_.crop_size(); - const float scale = param_.scale(); - const bool do_mirror = param_.mirror() && (Rand() % 2); - const bool has_mean_file = param_.has_mean_file(); - const bool has_uint8 = data.size() > 0; - const bool has_mean_values = mean_values_.size() > 0; - - CHECK_GT(datum_channels, 0); - CHECK_GE(datum_height, crop_size); - CHECK_GE(datum_width, crop_size); - - const float* mean = NULL; - if (has_mean_file) { - CHECK_EQ(datum_channels, data_mean_.channels()); - CHECK_EQ(datum_height, data_mean_.height()); - CHECK_EQ(datum_width, data_mean_.width()); - mean = data_mean_.cpu_data(); - } - if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) 
- << "Specify either 1 mean_value or as many as channels: " << datum_channels; - if (datum_channels > 1 && mean_values_.size() == 1) { - // Replicate the mean_value for simplicity - for (int c = 1; c < datum_channels; ++c) { - mean_values_.push_back(mean_values_[0]); - } - } - } - - int height = datum_height; - int width = datum_width; - - int h_off = 0; - int w_off = 0; - if (crop_size) { - height = crop_size; - width = crop_size; - // We only do random crop when we do training. - if (phase_ == TRAIN) { - h_off = Rand() % (datum_height - crop_size + 1); - w_off = Rand() % (datum_width - crop_size + 1); - } else { - h_off = (datum_height - crop_size) / 2; - w_off = (datum_width - crop_size) / 2; - } - } - - int top_index, data_index, ch, cdho; - const int m = do_mirror ? -1 : 1; - - if (has_uint8) { - float datum_element, mnv; - - if (scale == 1.F) { - for (int c = 0; c < datum_channels; ++c) { - cdho = c * datum_height + h_off; - ch = c * height; - mnv = has_mean_values && !has_mean_file ? mean_values_[c] : 0.F; - for (int h = 0; h < height; ++h) { - top_index = do_mirror ? (ch + h + 1) * width - 1 : (ch + h) * width; - data_index = (cdho + h) * datum_width + w_off; - for (int w = 0; w < width; ++w) { - datum_element = static_cast(data[data_index]); - if (has_mean_file) { - buf[top_index] = datum_element - mean[data_index]; - } else { - if (has_mean_values) { - buf[top_index] = datum_element - mnv; - } else { - buf[top_index] = datum_element; - } - } - ++data_index; - top_index += m; - } - } - } - } else { - for (int c = 0; c < datum_channels; ++c) { - cdho = c * datum_height + h_off; - ch = c * height; - mnv = has_mean_values && !has_mean_file ? mean_values_[c] : 0.F; - for (int h = 0; h < height; ++h) { - top_index = do_mirror ? 
(ch + h + 1) * width - 1 : (ch + h) * width; - data_index = (cdho + h) * datum_width + w_off; - for (int w = 0; w < width; ++w) { - datum_element = static_cast(data[data_index]); - if (has_mean_file) { - buf[top_index] = (datum_element - mean[data_index]) * scale; - } else { - if (has_mean_values) { - buf[top_index] = (datum_element - mnv) * scale; - } else { - buf[top_index] = datum_element * scale; - } - } - ++data_index; - top_index += m; - } - } - } - } - } else { - float datum_element; - for (int c = 0; c < datum_channels; ++c) { - cdho = c * datum_height + h_off; - ch = c * height; - for (int h = 0; h < height; ++h) { - top_index = do_mirror ? (ch + h + 1) * width - 1 : (ch + h) * width; - data_index = (cdho + h) * datum_width + w_off; - for (int w = 0; w < width; ++w) { - datum_element = datum.float_data(data_index); - if (has_mean_file) { - buf[top_index] = (datum_element - mean[data_index]) * scale; - } else { - if (has_mean_values) { - buf[top_index] = (datum_element - mean_values_[c]) * scale; - } else { - buf[top_index] = datum_element * scale; - } - } - ++data_index; - top_index += m; - } - } - } - } - } + protected: + void apply_mean_scale_mirror(const cv::Mat& src, cv::Mat& dst) const; + + void TransformV1(const Datum& datum, Dtype* buf, size_t buf_len); - void image_random_resize(const cv::Mat& src, cv::Mat& dst); - static void image_center_crop(int crop_w, int crop_h, cv::Mat& img); unsigned int Rand() const; float Rand(float lo, float up) const; + void Copy(const Datum& datum, Dtype* data, size_t& out_sizeof_element); + void Copy(const cv::Mat& datum, Dtype* data); + + /** + * @brief Transform the annotation according to the transformation applied + * to the datum. + * + * @param anno_datum + * AnnotatedDatum containing the data and annotation to be transformed. + * @param do_resize + * If true, resize the annotation accordingly before crop. 
+ * @param crop_bbox + * The cropped region applied to anno_datum.datum() + * @param do_mirror + * If true, meaning the datum has mirrored. + * @param transformed_anno_group_all + * Stores all transformed AnnotationGroup. + */ + void TransformAnnotation( + const AnnotatedDatum& anno_datum, const bool do_resize, + const NormalizedBBox& crop_bbox, const bool do_mirror, + RepeatedPtrField* transformed_anno_group_all); + + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to a cv::Mat + * + * @param cv_img + * cv::Mat containing the data to be transformed. + * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See image_data_layer.cpp for an example. + */ + void Transform(const cv::Mat& cv_img, TBlob* transformed_blob, + NormalizedBBox* crop_bbox, bool* do_mirror); + + /** + * @brief Crops img according to bbox. + */ + void CropImage(const cv::Mat& img, const NormalizedBBox& bbox, cv::Mat* crop_img); + + /** + * @brief Expand img to include mean value as background. + */ + void ExpandImage(const cv::Mat& img, const float expand_ratio, + NormalizedBBox* expand_bbox, cv::Mat* expand_img); + + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * It uses the first element to infer the shape of the blob. + * + * @param datum_vector + * A vector of Datum containing the data to be transformed. + */ + vector InferBlobShape(const vector & datum_vector); + + void Transform(const Datum& datum, + Dtype *transformed_data, const std::array& rand); + + protected: + // Transform and return the transformation information. + void Transform(const Datum& datum, Dtype* transformed_data, + NormalizedBBox* crop_bbox, bool* do_mirror); + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to the data and return transform information. 
+ */ + void Transform(const Datum& datum, TBlob* transformed_blob, + NormalizedBBox* crop_bbox, bool* do_mirror); + // Tranformation parameters TransformationParameter param_; shared_ptr rng_; Phase phase_; TBlob data_mean_; vector mean_values_; - cv::Mat mean_mat_orig_, mean_mat_; - cv::Mat tmp_; + cv::Mat mean_mat_orig_; + mutable cv::Mat mean_mat_; + mutable cv::Mat tmp_; const float rand_resize_ratio_lower_, rand_resize_ratio_upper_; const float vertical_stretch_lower_; diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index 6b1bf079b86..552654e33ab 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -54,7 +54,6 @@ class LayerBase { bm_by_user_(false), parent_net_(nullptr), net_inititialized_flag_(nullptr), - net_iteration0_flag_(nullptr), is_shared_(false) { InitMutex(); } @@ -137,7 +136,7 @@ class LayerBase { // Iteration counter maintained by Solver int iter() const; - int relative_iter() const; + int parent_rank() const; Net* parent_net() { return parent_net_; @@ -356,10 +355,6 @@ class LayerBase { net_inititialized_flag_ = init_flag; } - void set_net_iteration0_flag(Flag* iter0_flag) { - net_iteration0_flag_ = iter0_flag; - } - /** * Some layers need to be initialized after first iteration * They should override this function and return a flag @@ -444,9 +439,6 @@ class LayerBase { /** Gets set when Net::Init is over */ Flag* net_inititialized_flag_; - /** Gets set when Net::Init is over */ - Flag* net_iteration0_flag_; - private: /** Whether this layer is actually shared by other nets*/ bool is_shared_; @@ -567,16 +559,11 @@ inline float Layer::Forward(const vector& bottom, const vec for (int top_id = 0; top_id < top.size(); ++top_id) { if (this->loss(top_id) == 0.F) { continue; } const int count = top[top_id]->count(); - if (count < 16 && is_precise()) { - loss += caffe_cpu_dot(count, top[top_id]->cpu_data(), - top[top_id]->cpu_diff()); - } else { - const Ftype* data = top[top_id]->gpu_data(); - const Ftype* loss_weights = 
top[top_id]->gpu_diff(); - float blob_loss = 0.F; - caffe_gpu_dot(count, data, loss_weights, &blob_loss); - loss += blob_loss; - } + const Ftype* data = top[top_id]->gpu_data(); + const Ftype* loss_weights = top[top_id]->gpu_diff(); + float blob_loss = 0.F; + caffe_gpu_dot(count, data, loss_weights, &blob_loss); + loss += blob_loss; } break; default: diff --git a/include/caffe/layers/accuracy_layer.hpp b/include/caffe/layers/accuracy_layer.hpp index 4b2009fedaa..08990f4e297 100644 --- a/include/caffe/layers/accuracy_layer.hpp +++ b/include/caffe/layers/accuracy_layer.hpp @@ -28,17 +28,16 @@ class AccuracyLayer : public Layer { */ explicit AccuracyLayer(const LayerParameter& param) : Layer(param) {} - virtual void LayerSetUp(const vector& bottom, - const vector& top); - virtual void Reshape(const vector& bottom, - const vector& top); + void LayerSetUp(const vector& bottom, const vector& top) override; + void Reshape(const vector& bottom, const vector& top) override; - virtual inline const char* type() const { return "Accuracy"; } - virtual inline int ExactNumBottomBlobs() const { return 2; } + inline const char* type() const override { return "Accuracy"; } + inline int ExactNumBottomBlobs() const override { return 2; } // If there are two top blobs, then the second blob will contain // accuracies per class. - virtual inline int MinTopBlobs() const { return 1; } + inline int MinTopBlobs() const override { return 1; } + inline int MaxTopBlobs() const override { return 2; } protected: /** @@ -65,16 +64,18 @@ class AccuracyLayer : public Layer { * \end{array} \right. * @f$ */ - virtual void Forward_cpu(const vector& bottom, - const vector& top); + void Forward_cpu(const vector& bottom, const vector& top) override; + void Forward_gpu(const vector& bottom, const vector& top) override; /// @brief Not implemented -- AccuracyLayer cannot be used as a loss. 
- virtual void Backward_cpu(const vector& top, - const vector& propagate_down, const vector& bottom) { + void Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom) override { for (int i = 0; i < propagate_down.size(); ++i) { if (propagate_down[i]) { NOT_IMPLEMENTED; } } } + void Backward_gpu(const vector& top, + const vector& propagate_down, const vector& bottom) override; int label_axis_, outer_num_, inner_num_; int top_k_; diff --git a/include/caffe/layers/annotated_data_layer.hpp b/include/caffe/layers/annotated_data_layer.hpp new file mode 100644 index 00000000000..bdaf95b638d --- /dev/null +++ b/include/caffe/layers/annotated_data_layer.hpp @@ -0,0 +1,51 @@ +#ifndef CAFFE_ANNOTATED_DATA_LAYER_HPP_ +#define CAFFE_ANNOTATED_DATA_LAYER_HPP_ + +#include +#include +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/data_reader.hpp" +#include "caffe/data_transformer.hpp" +#include "caffe/internal_thread.hpp" +#include "caffe/layer.hpp" +#include "caffe/layers/data_layer.hpp" +#include "caffe/proto/caffe.pb.h" +#include "caffe/util/db.hpp" + +namespace caffe { + +template +class AnnotatedDataLayer : public DataLayer { + public: + AnnotatedDataLayer(const LayerParameter& param, size_t solver_rank); + void DataLayerSetUp(const vector& bottom, const vector& top) override; + + const char* type() const override { + return "AnnotatedData"; + } + int ExactNumBottomBlobs() const override { + return 0; + } + int MinTopBlobs() const override { + return 1; + } + + protected: + void load_batch(Batch* batch, int thread_id, size_t queue_id) override; + void start_reading() override { + areader_->start_reading(); + } + + std::shared_ptr> sample_areader_, areader_; + bool has_anno_type_; + AnnotatedDatum_AnnotationType anno_type_; + vector batch_samplers_; + string label_map_file_; +}; + +} // namespace caffe + +#endif // CAFFE_DATA_LAYER_HPP_ diff --git a/include/caffe/layers/axpy_layer.hpp b/include/caffe/layers/axpy_layer.hpp new file 
mode 100644 index 00000000000..908206cbea8 --- /dev/null +++ b/include/caffe/layers/axpy_layer.hpp @@ -0,0 +1,60 @@ +/* + * Axpy Layer + * + * Created on: May 1, 2017 + * Author: hujie + */ + +#ifndef CAFFE_AXPY_LAYER_HPP_ +#define CAFFE_AXPY_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" + +namespace caffe { + +/** + * @brief For reduce memory and time both on training and testing, we combine + * channel-wise scale operation and element-wise addition operation + * into a single layer called "axpy". + * + */ +template +class AxpyLayer: public Layer { + public: + explicit AxpyLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector& bottom, const vector& top) {} + virtual void Reshape(const vector& bottom, const vector& top); + + virtual inline const char* type() const { return "Axpy"; } + virtual inline int ExactNumBottomBlobs() const { return 3; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: +/** + * @param Formulation: + * F = a * X + Y + * Shape info: + * a: N x C --> bottom[0] + * X: N x C x H x W --> bottom[1] + * Y: N x C x H x W --> bottom[2] + * F: N x C x H x W --> top[0] + */ + virtual void Forward_cpu(const vector& bottom, + const vector& top); + virtual void Forward_gpu(const vector& bottom, + const vector& top); + virtual void Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom); + virtual void Backward_gpu(const vector& top, + const vector& propagate_down, const vector& bottom); + + TBlob spatial_sum_multiplier_; +}; + +} // namespace caffe + +#endif // CAFFE_AXPY_LAYER_HPP_ diff --git a/include/caffe/layers/base_data_layer.hpp b/include/caffe/layers/base_data_layer.hpp index 77053042be2..a25b52956d0 100644 --- a/include/caffe/layers/base_data_layer.hpp +++ b/include/caffe/layers/base_data_layer.hpp @@ -61,8 +61,11 @@ class BasePrefetchingDataLayer : public BaseDataLayer, public Inte void Forward_cpu(const vector& 
bottom, const vector& top) override; void Forward_gpu(const vector& bottom, const vector& top) override; - DataTransformer* dt(int id) { - return data_transformers_.at(id).get(); + DataTransformer* bdt(int id) { + return bwd_data_transformers_.at(id).get(); + } + DataTransformer* fdt(int id) { + return fwd_data_transformers_.at(id).get(); } bool is_gpu_transform() const override { @@ -110,18 +113,23 @@ class BasePrefetchingDataLayer : public BaseDataLayer, public Inte static bool auto_mode(const LayerParameter& param); std::vector batch_ids_; - const bool auto_mode_; + bool auto_mode_; size_t parsers_num_, transf_num_, queues_num_; // These two are for delayed init only std::vector bottom_init_; std::vector top_init_; - vector> data_transformers_; + // Use Btype as a transformer type (i.e. better float 32) + // since data layers don't have bottom channels + std::vector>> bwd_data_transformers_; + // TransformGPU may do this in-place + std::vector>> fwd_data_transformers_; - boost::shared_ptr> batch_transformer_; + shared_ptr> batch_transformer_; std::vector last_shape_; int batch_size_; + Flag iter0_; }; } // namespace caffe diff --git a/include/caffe/layers/cudnn_conv_layer.hpp b/include/caffe/layers/cudnn_conv_layer.hpp index 048ee558e60..70783ae4b4a 100644 --- a/include/caffe/layers/cudnn_conv_layer.hpp +++ b/include/caffe/layers/cudnn_conv_layer.hpp @@ -41,13 +41,11 @@ class CuDNNConvolutionLayer : public ConvolutionLayer { static constexpr int MAX_PARALLEL_GROUPS = Caffe::MAX_CONV_GROUPS; static constexpr int REQUEST_ALGO_COUNT = 1; static constexpr int ATTEMPTS_TO_RESERVE_WS = 3; - static std::mutex m_; + static constexpr int REALLOC_COUNT = 3; - static ThreadSafeMap> ws_allocated_; - static ThreadSafeMap> train_mem_req_all_grps_; - static ThreadSafeMap> test_mem_req_all_grps_; - static ThreadSafeMap> train_tmp_weights_mem_; - static ThreadSafeMap> ws_released_; + static std::atomic train_mem_req_all_grps_; + static std::atomic test_mem_req_all_grps_; + 
static std::atomic train_tmp_weights_mem_; public: explicit CuDNNConvolutionLayer(const LayerParameter& param) @@ -109,7 +107,7 @@ class CuDNNConvolutionLayer : public ConvolutionLayer { return fwd_count_ < 2UL; } bool ok_to_release() const { - return bwd_count_ == 3UL; + return bwd_count_ == REALLOC_COUNT; } void FindExConvAlgo(const vector& bottom, const vector& top); @@ -164,29 +162,15 @@ template constexpr int CuDNNConvolutionLayer::REQUEST_ALGO_COUNT; template constexpr int CuDNNConvolutionLayer::ATTEMPTS_TO_RESERVE_WS; - -template -std::mutex CuDNNConvolutionLayer::m_; template -ThreadSafeMap> -CuDNNConvolutionLayer::ws_allocated_( - CuDNNConvolutionLayer::m_); -template -ThreadSafeMap> -CuDNNConvolutionLayer::ws_released_( - CuDNNConvolutionLayer::m_); +constexpr int CuDNNConvolutionLayer::REALLOC_COUNT; + template -ThreadSafeMap> -CuDNNConvolutionLayer::train_mem_req_all_grps_( - CuDNNConvolutionLayer::m_); +std::atomic CuDNNConvolutionLayer::train_mem_req_all_grps_; template -ThreadSafeMap> -CuDNNConvolutionLayer::test_mem_req_all_grps_( - CuDNNConvolutionLayer::m_); +std::atomic CuDNNConvolutionLayer::test_mem_req_all_grps_; template -ThreadSafeMap> -CuDNNConvolutionLayer::train_tmp_weights_mem_( - CuDNNConvolutionLayer::m_); +std::atomic CuDNNConvolutionLayer::train_tmp_weights_mem_; #endif diff --git a/include/caffe/layers/cudnn_deconv_layer.hpp b/include/caffe/layers/cudnn_deconv_layer.hpp new file mode 100644 index 00000000000..9abdcd589ef --- /dev/null +++ b/include/caffe/layers/cudnn_deconv_layer.hpp @@ -0,0 +1,69 @@ +#ifndef CAFFE_CUDNN_DECONV_LAYER_HPP_ +#define CAFFE_CUDNN_DECONV_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/deconv_layer.hpp" + +namespace caffe { + +#ifdef USE_CUDNN +/* + * @brief cuDNN implementation of DeConvolutionLayer. + * Fallback to DeConvolutionLayer for CPU mode. 
+ * + * cuDNN accelerates deconvolution through forward kernels for filtering and + * bias plus backward kernels for the gradient w.r.t. the filters, biases, and + * inputs. Caffe + cuDNN further speeds up the computation through forward + * parallelism across groups and backward parallelism across gradients. +*/ +template +class CuDNNDeconvolutionLayer : public DeconvolutionLayer { + public: + explicit CuDNNDeconvolutionLayer(const LayerParameter& param) + : DeconvolutionLayer(param), + handles_setup_(false), + forward_math_(tpmax()), + backward_data_math_(tpmax()), + backward_filter_math_(tpmax()) {} + virtual ~CuDNNDeconvolutionLayer(); + void LayerSetUp(const vector& bottom, const vector& top) override; + void Reshape(const vector& bottom, const vector& top) override; + + protected: + void Forward_gpu(const vector& bottom, const vector& top) override; + void Backward_gpu(const vector& top, const vector& propagate_down, + const vector& bottom) override; + + bool handles_setup_; + cudnnHandle_t* handle_; + cudaStream_t* stream_; + + // algorithms for forward and backwards convolutions + cudnnConvolutionFwdAlgo_t *fwd_algo_; + cudnnConvolutionBwdFilterAlgo_t *bwd_filter_algo_; + cudnnConvolutionBwdDataAlgo_t *bwd_data_algo_; + + vector bottom_descs_, top_descs_; + cudnnTensorDescriptor_t bias_desc_; + cudnnFilterDescriptor_t filter_desc_; + vector conv_descs_; + int bottom_offset_, top_offset_, bias_offset_; + Type forward_math_, backward_data_math_, backward_filter_math_; + + size_t *workspace_fwd_sizes_; + size_t *workspace_bwd_data_sizes_; + size_t *workspace_bwd_filter_sizes_; + size_t workspaceSizeInBytes; // size of underlying storage + void *workspaceData; // underlying storage + void **workspace; // aliases into workspaceData +}; +#endif + +} // namespace caffe + +#endif // CAFFE_CUDNN_DECONV_LAYER_HPP_ diff --git a/include/caffe/layers/data_layer.hpp b/include/caffe/layers/data_layer.hpp index 661d28441b2..49f63837936 100644 --- 
a/include/caffe/layers/data_layer.hpp +++ b/include/caffe/layers/data_layer.hpp @@ -57,12 +57,12 @@ class DataLayer : public BasePrefetchingDataLayer { reader_->start_reading(); } - std::shared_ptr sample_reader_, reader_; - vector> tmp_gpu_buffer_; + std::shared_ptr> sample_reader_, reader_; + std::vector> tmp_gpu_buffer_; // stored random numbers for this batch - vector>> random_vectors_; - mutable vector parser_offsets_, queue_ids_; + std::vector>> random_vectors_; + mutable std::vector parser_offsets_, queue_ids_; Flag layer_inititialized_flag_; std::atomic_bool sample_only_; const bool cache_, shuffle_; diff --git a/include/caffe/layers/detection_evaluate_layer.hpp b/include/caffe/layers/detection_evaluate_layer.hpp new file mode 100644 index 00000000000..e783b0ac3c8 --- /dev/null +++ b/include/caffe/layers/detection_evaluate_layer.hpp @@ -0,0 +1,73 @@ +#ifndef CAFFE_DETECTION_EVALUATE_LAYER_HPP_ +#define CAFFE_DETECTION_EVALUATE_LAYER_HPP_ + +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +/** + * @brief Generate the detection evaluation based on DetectionOutputLayer and + * ground truth bounding box labels. + * + * Intended for use with MultiBox detection method. + * + * NOTE: does not implement Backwards operation. + */ +template +class DetectionEvaluateLayer : public Layer { + typedef Ftype Dtype; + + public: + explicit DetectionEvaluateLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector& bottom, + const vector& top); + virtual void Reshape(const vector& bottom, + const vector& top); + + virtual inline const char* type() const { return "DetectionEvaluate"; } + virtual inline int ExactBottomBlobs() const { return 2; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: + /** + * @brief Evaluate the detection output. 
+ * + * @param bottom input Blob vector (exact 2) + * -# @f$ (1 \times 1 \times N \times 7) @f$ + * N detection results. + * -# @f$ (1 \times 1 \times M \times 7) @f$ + * M ground truth. + * @param top Blob vector (length 1) + * -# @f$ (1 \times 1 \times N \times 4) @f$ + * N is the number of detections, and each row is: + * [image_id, label, confidence, true_pos, false_pos] + */ + virtual void Forward_cpu(const vector& bottom, + const vector& top); + /// @brief Not implemented + virtual void Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom) { + NOT_IMPLEMENTED; + } + + int num_classes_; + int background_label_id_; + float overlap_threshold_; + bool evaluate_difficult_gt_; + vector > sizes_; + int count_; + bool use_normalized_bbox_; + + bool has_resize_; + ResizeParameter resize_param_; +}; + +} // namespace caffe + +#endif // CAFFE_DETECTION_EVALUATE_LAYER_HPP_ diff --git a/include/caffe/layers/detection_output_layer.hpp b/include/caffe/layers/detection_output_layer.hpp new file mode 100644 index 00000000000..a8c7e413bc1 --- /dev/null +++ b/include/caffe/layers/detection_output_layer.hpp @@ -0,0 +1,124 @@ +#ifndef CAFFE_DETECTION_OUTPUT_LAYER_HPP_ +#define CAFFE_DETECTION_OUTPUT_LAYER_HPP_ + +#include "caffe/common.hpp" + +#if (__GNUC__ >= 5) && (BOOST_VERSION >= 105800) +#define WRITE_JSON_SUPPORTED +#include +#endif + +#include +#include + +#include +#include +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/data_transformer.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" +#include "caffe/util/bbox_util.hpp" + +using namespace boost::property_tree; // NOLINT(build/namespaces) + +namespace caffe { + +/** + * @brief Generate the detection output based on location and confidence + * predictions by doing non maximum suppression. + * + * Intended for use with MultiBox detection method. + * + * NOTE: does not implement Backwards operation. 
+ */ +template +class DetectionOutputLayer : public Layer { + public: + explicit DetectionOutputLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector& bottom, + const vector& top); + virtual void Reshape(const vector& bottom, + const vector& top); + + virtual inline const char* type() const { return "DetectionOutput"; } + virtual inline int MinBottomBlobs() const { return 3; } + virtual inline int MaxBottomBlobs() const { return 4; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: + /** + * @brief Do non maximum suppression (nms) on prediction results. + * + * @param bottom input Blob vector (at least 2) + * -# @f$ (N \times C1 \times 1 \times 1) @f$ + * the location predictions with C1 predictions. + * -# @f$ (N \times C2 \times 1 \times 1) @f$ + * the confidence predictions with C2 predictions. + * -# @f$ (N \times 2 \times C3 \times 1) @f$ + * the prior bounding boxes with C3 values. + * @param top output Blob vector (length 1) + * -# @f$ (1 \times 1 \times N \times 7) @f$ + * N is the number of detections after nms, and each row is: + * [image_id, label, confidence, xmin, ymin, xmax, ymax] + */ + virtual void Forward_cpu(const vector& bottom, + const vector& top); + virtual void Forward_gpu(const vector& bottom, + const vector& top); + /// @brief Not implemented + virtual void Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom) { + NOT_IMPLEMENTED; + } + virtual void Backward_gpu(const vector& top, + const vector& propagate_down, const vector& bottom) { + NOT_IMPLEMENTED; + } + + int num_classes_; + bool share_location_; + int num_loc_classes_; + int background_label_id_; + CodeType code_type_; + bool variance_encoded_in_target_; + int keep_top_k_; + float confidence_threshold_; + + int num_; + int num_priors_; + + float nms_threshold_; + int top_k_; + float eta_; + + bool need_save_; + string output_directory_; + string output_name_prefix_; + string 
output_format_; + map label_to_name_; + map label_to_display_name_; + vector names_; + vector > sizes_; + int num_test_image_; + int name_count_; + bool has_resize_; + ResizeParameter resize_param_; + + ptree detections_; + + bool visualize_; + float visualize_threshold_; + shared_ptr> data_transformer_; + string save_file_; + TBlob bbox_preds_; + TBlob bbox_permute_; + TBlob conf_permute_; +}; + +} // namespace caffe + +#endif // CAFFE_DETECTION_OUTPUT_LAYER_HPP_ diff --git a/include/caffe/layers/loss_layer.hpp b/include/caffe/layers/loss_layer.hpp index bd0659f3325..232ee742e4e 100644 --- a/include/caffe/layers/loss_layer.hpp +++ b/include/caffe/layers/loss_layer.hpp @@ -21,6 +21,8 @@ const float kLOG_THRESHOLD = 1e-20; */ template class LossLayer : public Layer { + typedef Ftype Dtype; + public: explicit LossLayer(const LayerParameter& param) : Layer(param) {} @@ -29,6 +31,16 @@ class LossLayer : public Layer { virtual void Reshape( const vector& bottom, const vector& top); + /** + * Read the normalization mode parameter and compute the normalizer based + * on the blob size. If normalization_mode is VALID, the count of valid + * outputs will be read from valid_count, unless it is -1 in which case + * all outputs are assumed to be valid. 
+ */ + Ftype GetNormalizer( + const LossParameter_NormalizationMode normalization_mode, + const int outer_num, const int inner_num, const int valid_count); + virtual inline int ExactNumBottomBlobs() const { return 2; } /** diff --git a/include/caffe/layers/lstm_layer.hpp b/include/caffe/layers/lstm_layer.hpp new file mode 100644 index 00000000000..402236f429e --- /dev/null +++ b/include/caffe/layers/lstm_layer.hpp @@ -0,0 +1,149 @@ +#ifndef CAFFE_LSTM_LAYER_HPP_ +#define CAFFE_LSTM_LAYER_HPP_ + +#include +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/layers/recurrent_layer.hpp" +#include "caffe/net.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +/** + * @brief Processes sequential inputs using a "Long Short-Term Memory" (LSTM) + * [1] style recurrent neural network (RNN). Implemented by unrolling + * the LSTM computation through time. + * + * The specific architecture used in this implementation is as described in + * "Learning to Execute" [2], reproduced below: + * i_t := \sigmoid[ W_{hi} * h_{t-1} + W_{xi} * x_t + b_i ] + * f_t := \sigmoid[ W_{hf} * h_{t-1} + W_{xf} * x_t + b_f ] + * o_t := \sigmoid[ W_{ho} * h_{t-1} + W_{xo} * x_t + b_o ] + * g_t := \tanh[ W_{hg} * h_{t-1} + W_{xg} * x_t + b_g ] + * c_t := (f_t .* c_{t-1}) + (i_t .* g_t) + * h_t := o_t .* \tanh[c_t] + * In the implementation, the i, f, o, and g computations are performed as a + * single inner product. + * + * Notably, this implementation lacks the "diagonal" gates, as used in the + * LSTM architectures described by Alex Graves [3] and others. + * + * [1] Hochreiter, Sepp, and Schmidhuber, Jürgen. "Long short-term memory." + * Neural Computation 9, no. 8 (1997): 1735-1780. + * + * [2] Zaremba, Wojciech, and Sutskever, Ilya. "Learning to execute." + * arXiv preprint arXiv:1410.4615 (2014). + * + * [3] Graves, Alex. "Generating sequences with recurrent neural networks." 
+ * arXiv preprint arXiv:1308.0850 (2013). + */ +template +class LSTMLayer : public RecurrentLayer { + public: + explicit LSTMLayer(const LayerParameter& param) + : RecurrentLayer(param) {} + + virtual inline const char* type() const { return "LSTM"; } + + protected: + void FillUnrolledNet(NetParameter* net_param) const override; + void RecurrentInputBlobNames(vector* names) const override; + void RecurrentOutputBlobNames(vector* names) const override; + void RecurrentInputShapes(vector* shapes) const override; + void OutputBlobNames(vector* names) const override; +}; + +/** + * @brief A helper for LSTMLayer: computes a single timestep of the + * non-linearity of the LSTM, producing the updated cell and hidden + * states. + */ +template +class LSTMUnitLayer : public Layer { + public: + explicit LSTMUnitLayer(const LayerParameter& param) + : Layer(param), hidden_dim_(0), X_acts_(Blob::create()) {} + void Reshape(const vector& bottom, const vector& top) override; + + const char* type() const override { return "LSTMUnit"; } + int ExactNumBottomBlobs() const override { return 3; } + int ExactNumTopBlobs() const override { return 2; } + + bool AllowForceBackward(const int bottom_index) const override { + // Can't propagate to sequence continuation indicators. 
+ return bottom_index != 2; + } + + protected: + /** + * @param bottom input Blob vector (length 3) + * -# @f$ (1 \times N \times D) @f$ + * the previous timestep cell state @f$ c_{t-1} @f$ + * -# @f$ (1 \times N \times 4D) @f$ + * the "gate inputs" @f$ [i_t', f_t', o_t', g_t'] @f$ + * -# @f$ (1 \times N) @f$ + * the sequence continuation indicators @f$ \delta_t @f$ + * @param top output Blob vector (length 2) + * -# @f$ (1 \times N \times D) @f$ + * the updated cell state @f$ c_t @f$, computed as: + * i_t := \sigmoid[i_t'] + * f_t := \sigmoid[f_t'] + * o_t := \sigmoid[o_t'] + * g_t := \tanh[g_t'] + * c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t) + * -# @f$ (1 \times N \times D) @f$ + * the updated hidden state @f$ h_t @f$, computed as: + * h_t := o_t .* \tanh[c_t] + */ + void Forward_cpu(const vector& bottom, const vector& top) override; + void Forward_gpu(const vector& bottom, const vector& top) override; + + /** + * @brief Computes the error gradient w.r.t. the LSTMUnit inputs. + * + * @param top output Blob vector (length 2), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times N \times D) @f$: + * containing error gradients @f$ \frac{\partial E}{\partial c_t} @f$ + * with respect to the updated cell state @f$ c_t @f$ + * -# @f$ (1 \times N \times D) @f$: + * containing error gradients @f$ \frac{\partial E}{\partial h_t} @f$ + * with respect to the updated cell state @f$ h_t @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 3), into which the error gradients + * with respect to the LSTMUnit inputs @f$ c_{t-1} @f$ and the gate + * inputs are computed. Computatation of the error gradients w.r.t. + * the sequence indicators is not implemented. + * -# @f$ (1 \times N \times D) @f$ + * the error gradient w.r.t. the previous timestep cell state + * @f$ c_{t-1} @f$ + * -# @f$ (1 \times N \times 4D) @f$ + * the error gradient w.r.t. 
the "gate inputs" + * @f$ [ + * \frac{\partial E}{\partial i_t} + * \frac{\partial E}{\partial f_t} + * \frac{\partial E}{\partial o_t} + * \frac{\partial E}{\partial g_t} + * ] @f$ + * -# @f$ (1 \times 1 \times N) @f$ + * the gradient w.r.t. the sequence continuation indicators + * @f$ \delta_t @f$ is currently not computed. + */ + void Backward_cpu(const vector& top, const vector& propagate_down, + const vector& bottom) override; + void Backward_gpu(const vector& top, const vector& propagate_down, + const vector& bottom) override; + + /// @brief The hidden and output dimension. + int hidden_dim_; + shared_ptr X_acts_; +}; + +} // namespace caffe + +#endif // CAFFE_LSTM_LAYER_HPP_ diff --git a/include/caffe/layers/memory_data_layer.hpp b/include/caffe/layers/memory_data_layer.hpp index 8bb75dfe718..e879e784ac2 100644 --- a/include/caffe/layers/memory_data_layer.hpp +++ b/include/caffe/layers/memory_data_layer.hpp @@ -21,7 +21,7 @@ class MemoryDataLayer : public BaseDataLayer { public: explicit MemoryDataLayer(const LayerParameter& param) : BaseDataLayer(param, 1), has_new_data_(false) { - dt_ = make_shared(this->transform_param_, this->phase_); + dt_ = make_shared>(this->transform_param_, this->phase_); } virtual void DataLayerSetUp(const vector& bottom, const vector& top); @@ -58,7 +58,7 @@ class MemoryDataLayer : public BaseDataLayer { TBlob added_data_; TBlob added_label_; bool has_new_data_; - shared_ptr dt_; + shared_ptr> dt_; }; } // namespace caffe diff --git a/include/caffe/layers/multibox_loss_layer.hpp b/include/caffe/layers/multibox_loss_layer.hpp new file mode 100644 index 00000000000..d7236565a91 --- /dev/null +++ b/include/caffe/layers/multibox_loss_layer.hpp @@ -0,0 +1,114 @@ +#ifndef CAFFE_MULTIBOX_LOSS_LAYER_HPP_ +#define CAFFE_MULTIBOX_LOSS_LAYER_HPP_ + +#include +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" +#include "caffe/util/bbox_util.hpp" + +#include 
"caffe/layers/loss_layer.hpp" + +namespace caffe { + +/** + * @brief Perform MultiBox operations. Including the following: + * + * - decode the predictions. + * - perform matching between priors/predictions and ground truth. + * - use matched boxes and confidences to compute loss. + * + */ +template +class MultiBoxLossLayer : public LossLayer { + typedef Ftype Dtype; + + public: + explicit MultiBoxLossLayer(const LayerParameter& param) + : LossLayer(param) {} + virtual void LayerSetUp(const vector& bottom, + const vector& top); + virtual void Reshape(const vector& bottom, + const vector& top); + + virtual inline const char* type() const { return "MultiBoxLoss"; } + // bottom[0] stores the location predictions. + // bottom[1] stores the confidence predictions. + // bottom[2] stores the prior bounding boxes. + // bottom[3] stores the ground truth bounding boxes. + virtual inline int ExactNumBottomBlobs() const { return 4; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: + virtual void Forward_cpu(const vector& bottom, + const vector& top); + virtual void Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom); + + // The internal localization loss layer. + shared_ptr loc_loss_layer_; + LocLossType loc_loss_type_; + float loc_weight_; + // bottom vector holder used in Forward function. + vector loc_bottom_vec_; + // top vector holder used in Forward function. + vector loc_top_vec_; + // blob which stores the matched location prediction. + shared_ptr loc_pred_; + // blob which stores the corresponding matched ground truth. + shared_ptr loc_gt_; + // localization loss. + shared_ptr loc_loss_; + + // The internal confidence loss layer. + shared_ptr conf_loss_layer_; + ConfLossType conf_loss_type_; + // bottom vector holder used in Forward function. + vector conf_bottom_vec_; + // top vector holder used in Forward function. + vector conf_top_vec_; + // blob which stores the confidence prediction. 
+ shared_ptr conf_pred_; + // blob which stores the corresponding ground truth label. + shared_ptr conf_gt_; + // confidence loss. + shared_ptr conf_loss_; + + MultiBoxLossParameter multibox_loss_param_; + int num_classes_; + bool share_location_; + MatchType match_type_; + float overlap_threshold_; + bool use_prior_for_matching_; + int background_label_id_; + bool use_difficult_gt_; + bool do_neg_mining_; + float neg_pos_ratio_; + float neg_overlap_; + CodeType code_type_; + bool encode_variance_in_target_; + bool map_object_to_agnostic_; + bool ignore_cross_boundary_bbox_; + bool bp_inside_; + MiningType mining_type_; + + int loc_classes_; + int num_gt_; + int num_; + int num_priors_; + + int num_matches_; + int num_conf_; + vector > > all_match_indices_; + vector > all_neg_indices_; + + // How to normalize the loss. + LossParameter_NormalizationMode normalization_; +}; + +} // namespace caffe + +#endif // CAFFE_MULTIBOX_LOSS_LAYER_HPP_ diff --git a/include/caffe/layers/normalize_layer.hpp b/include/caffe/layers/normalize_layer.hpp new file mode 100644 index 00000000000..0cfb22cadf0 --- /dev/null +++ b/include/caffe/layers/normalize_layer.hpp @@ -0,0 +1,53 @@ +#ifndef CAFFE_NORMALIZE_LAYER_HPP_ +#define CAFFE_NORMALIZE_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +/** + * @brief Normalizes the input to have L_p norm of 1 with scale learnable. + * + * TODO(weiliu89): thorough documentation for Forward, Backward, and proto params. 
+ */ +template +class NormalizeLayer : public Layer { + typedef Ftype Dtype; + + public: + explicit NormalizeLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector& bottom, + const vector& top); + virtual void Reshape(const vector& bottom, + const vector& top); + + virtual inline const char* type() const { return "Normalize"; } + virtual inline int ExactNumBottomBlobs() const { return 1; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: + virtual void Forward_cpu(const vector& bottom, + const vector& top); + virtual void Forward_gpu(const vector& bottom, + const vector& top); + virtual void Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom); + virtual void Backward_gpu(const vector& top, + const vector& propagate_down, const vector& bottom); + + TBlob norm_; + TBlob sum_channel_multiplier_, sum_spatial_multiplier_; + TBlob buffer_, buffer_channel_, buffer_spatial_; + bool across_spatial_; + bool channel_shared_; + Dtype eps_; +}; + +} // namespace caffe + +#endif // CAFFE_NORMALIZE_LAYER_HPP_ diff --git a/include/caffe/layers/permute_layer.hpp b/include/caffe/layers/permute_layer.hpp new file mode 100644 index 00000000000..1dff70f0059 --- /dev/null +++ b/include/caffe/layers/permute_layer.hpp @@ -0,0 +1,61 @@ +#ifndef CAFFE_PERMUTE_LAYER_HPP_ +#define CAFFE_PERMUTE_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +/** + * @brief Permute the input blob by changing the memory order of the data. + * + * TODO(weiliu89): thorough documentation for Forward, Backward, and proto params. + */ + +// The main function which does the permute.
+template +void Permute(const int count, Dtype* bottom_data, const bool forward, + const int* permute_order, const int* old_steps, const int* new_steps, + const int num_axes, Dtype* top_data); + +template +class PermuteLayer : public Layer { + typedef Ftype Dtype; + + public: + explicit PermuteLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector& bottom, + const vector& top); + virtual void Reshape(const vector& bottom, + const vector& top); + + virtual inline const char* type() const { return "Permute"; } + virtual inline int ExactNumBottomBlobs() const { return 1; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: + virtual void Forward_cpu(const vector& bottom, + const vector& top); + virtual void Forward_gpu(const vector& bottom, + const vector& top); + virtual void Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom); + virtual void Backward_gpu(const vector& top, + const vector& propagate_down, const vector& bottom); + + int num_axes_; + bool need_permute_; + + // Use Blob because it is convenient to be accessible in .cu file. 
+ TBlob permute_order_; + TBlob old_steps_; + TBlob new_steps_; +}; + +} // namespace caffe + +#endif // CAFFE_PERMUTE_LAYER_HPP_ diff --git a/include/caffe/layers/pooling_layer.hpp b/include/caffe/layers/pooling_layer.hpp index 6cf7930300f..48e2d21d111 100644 --- a/include/caffe/layers/pooling_layer.hpp +++ b/include/caffe/layers/pooling_layer.hpp @@ -18,7 +18,7 @@ template class PoolingLayer : public Layer { public: explicit PoolingLayer(const LayerParameter& param) - : Layer(param) {} + : Layer(param), rand_idx_(Blob::create()) {} virtual void LayerSetUp(const vector& bottom, const vector& top); virtual void Reshape(const vector& bottom, @@ -52,7 +52,7 @@ class PoolingLayer : public Layer { int pooled_height_, pooled_width_; bool global_pooling_; bool is_max_pooling_; - TBlob rand_idx_; + shared_ptr rand_idx_; TBlob max_idx_; }; diff --git a/include/caffe/layers/prior_box_layer.hpp b/include/caffe/layers/prior_box_layer.hpp new file mode 100644 index 00000000000..0a33f625ec0 --- /dev/null +++ b/include/caffe/layers/prior_box_layer.hpp @@ -0,0 +1,86 @@ +#ifndef CAFFE_PRIORBOX_LAYER_HPP_ +#define CAFFE_PRIORBOX_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +/** + * @brief Generate the prior boxes of designated sizes and aspect ratios across + * all dimensions @f$ (H \times W) @f$. + * + * Intended for use with MultiBox detection method to generate prior (template). + * + * NOTE: does not implement Backwards operation. + */ +template +class PriorBoxLayer : public Layer { + typedef Ftype Dtype; + + public: + /** + * @param param provides PriorBoxParameter prior_box_param, + * with PriorBoxLayer options: + * - min_size (\b minimum box size in pixels. can be multiple. required!). + * - max_size (\b maximum box size in pixels. can be ignored or same as the + * # of min_size.). + * - aspect_ratio (\b optional aspect ratios of the boxes. can be multiple). 
+ * - flip (\b optional bool, default true). + * if set, flip the aspect ratio. + */ + explicit PriorBoxLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector& bottom, + const vector& top); + virtual void Reshape(const vector& bottom, + const vector& top); + + virtual inline const char* type() const { return "PriorBox"; } + virtual inline int ExactNumBottomBlobs() const { return 2; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: + /** + * @brief Generates prior boxes for a layer with specified parameters. + * + * @param bottom input Blob vector (at least 2) + * -# @f$ (N \times C \times H_i \times W_i) @f$ + * the input layer @f$ x_i @f$ + * -# @f$ (N \times C \times H_0 \times W_0) @f$ + * the data layer @f$ x_0 @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times 2 \times K*4) @f$ where @f$ K @f$ is the prior numbers + * By default, a box of aspect ratio 1 and min_size and a box of aspect + * ratio 1 and sqrt(min_size * max_size) are created.
+ */ + virtual void Forward_cpu(const vector& bottom, + const vector& top); + /// @brief Not implemented + virtual void Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom) { + return; + } + + vector min_sizes_; + vector max_sizes_; + vector aspect_ratios_; + bool flip_; + int num_priors_; + bool clip_; + vector variance_; + + int img_w_; + int img_h_; + float step_w_; + float step_h_; + + float offset_; +}; + +} // namespace caffe + +#endif // CAFFE_PRIORBOX_LAYER_HPP_ diff --git a/include/caffe/layers/recurrent_layer.hpp b/include/caffe/layers/recurrent_layer.hpp new file mode 100644 index 00000000000..c772b264dc7 --- /dev/null +++ b/include/caffe/layers/recurrent_layer.hpp @@ -0,0 +1,181 @@ +#ifndef CAFFE_RECURRENT_LAYER_HPP_ +#define CAFFE_RECURRENT_LAYER_HPP_ + +#include +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/net.hpp" +#include "caffe/proto/caffe.pb.h" +#include "caffe/util/format.hpp" + +namespace caffe { + +/** + * @brief An abstract class for implementing recurrent behavior inside of an + * unrolled network. This Layer type cannot be instantiated -- instead, + * you should use one of its implementations which defines the recurrent + * architecture, such as RNNLayer or LSTMLayer. 
+ */ +template +class RecurrentLayer : public Layer { + public: + explicit RecurrentLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector& bottom, const vector& top); + virtual void Reshape(const vector& bottom, const vector& top); + virtual void Reset(); + + virtual inline const char* type() const { return "Recurrent"; } + virtual inline int MinBottomBlobs() const { + int min_bottoms = 2; + if (this->layer_param_.recurrent_param().expose_hidden()) { + vector inputs; + this->RecurrentInputBlobNames(&inputs); + min_bottoms += inputs.size(); + } + return min_bottoms; + } + virtual inline int MaxBottomBlobs() const { return MinBottomBlobs() + 1; } + virtual inline int ExactNumTopBlobs() const { + int num_tops = 1; + if (this->layer_param_.recurrent_param().expose_hidden()) { + vector outputs; + this->RecurrentOutputBlobNames(&outputs); + num_tops += outputs.size(); + } + return num_tops; + } + + virtual inline bool AllowForceBackward(const int bottom_index) const { + // Can't propagate to sequence continuation indicators. + return bottom_index != 1; + } + + protected: + /** + * @brief Fills net_param with the recurrent network architecture. Subclasses + * should define this -- see RNNLayer and LSTMLayer for examples. + */ + virtual void FillUnrolledNet(NetParameter* net_param) const = 0; + + /** + * @brief Fills names with the names of the 0th timestep recurrent input + * Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. + */ + virtual void RecurrentInputBlobNames(vector* names) const = 0; + + /** + * @brief Fills shapes with the shapes of the recurrent input Blob&s. + * Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. + */ + virtual void RecurrentInputShapes(vector* shapes) const = 0; + + /** + * @brief Fills names with the names of the Tth timestep recurrent output + * Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. 
+ */ + virtual void RecurrentOutputBlobNames(vector* names) const = 0; + + /** + * @brief Fills names with the names of the output blobs, concatenated across + * all timesteps. Should return a name for each top Blob. + * Subclasses should define this -- see RNNLayer and LSTMLayer for + * examples. + */ + virtual void OutputBlobNames(vector* names) const = 0; + + /** + * @param bottom input Blob vector (length 2-3) + * + * -# @f$ (T \times N \times ...) @f$ + * the time-varying input @f$ x @f$. After the first two axes, whose + * dimensions must correspond to the number of timesteps @f$ T @f$ and + * the number of independent streams @f$ N @f$, respectively, its + * dimensions may be arbitrary. Note that the ordering of dimensions -- + * @f$ (T \times N \times ...) @f$, rather than + * @f$ (N \times T \times ...) @f$ -- means that the @f$ N @f$ + * independent input streams must be "interleaved". + * + * -# @f$ (T \times N) @f$ + * the sequence continuation indicators @f$ \delta @f$. + * These inputs should be binary (0 or 1) indicators, where + * @f$ \delta_{t,n} = 0 @f$ means that timestep @f$ t @f$ of stream + * @f$ n @f$ is the beginning of a new sequence, and hence the previous + * hidden state @f$ h_{t-1} @f$ is multiplied by @f$ \delta_t = 0 @f$ + * and has no effect on the cell's output at timestep @f$ t @f$, and + * a value of @f$ \delta_{t,n} = 1 @f$ means that timestep @f$ t @f$ of + * stream @f$ n @f$ is a continuation from the previous timestep + * @f$ t-1 @f$, and the previous hidden state @f$ h_{t-1} @f$ affects the + * updated hidden state and output. + * + * -# @f$ (N \times ...) @f$ (optional) + * the static (non-time-varying) input @f$ x_{static} @f$. + * After the first axis, whose dimension must be the number of + * independent streams, its dimensions may be arbitrary. 
+ * This is mathematically equivalent to using a time-varying input of + * @f$ x'_t = [x_t; x_{static}] @f$ -- i.e., tiling the static input + * across the @f$ T @f$ timesteps and concatenating with the time-varying + * input. Note that if this input is used, all timesteps in a single + * batch within a particular one of the @f$ N @f$ streams must share the + * same static input, even if the sequence continuation indicators + * suggest that different sequences are ending and beginning within a + * single batch. This may require padding and/or truncation for uniform + * length. + * + * @param top output Blob vector (length 1) + * -# @f$ (T \times N \times D) @f$ + * the time-varying output @f$ y @f$, where @f$ D @f$ is + * recurrent_param.num_output(). + * Refer to documentation for particular RecurrentLayer implementations + * (such as RNNLayer and LSTMLayer) for the definition of @f$ y @f$. + */ + virtual void Forward_cpu(const vector& bottom, const vector& top); + virtual void Forward_gpu(const vector& bottom, const vector& top); + virtual void Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom); + + /// @brief A Net to implement the Recurrent functionality. + shared_ptr unrolled_net_; + + /// @brief The number of independent streams to process simultaneously. + int N_; + + /** + * @brief The number of timesteps in the layer's input, and the number of + * timesteps over which to backpropagate through time. + */ + int T_; + + /// @brief Whether the layer has a "static" input copied across all timesteps. + bool static_input_; + + /** + * @brief The last layer to run in the network. (Any later layers are losses + * added to force the recurrent net to do backprop.) + */ + int last_layer_index_; + + /** + * @brief Whether the layer's hidden state at the first and last timesteps + * are layer inputs and outputs, respectively.
+ */ + bool expose_hidden_; + + vector recur_input_blobs_; + vector recur_output_blobs_; + vector output_blobs_; + Blob* x_input_blob_; + Blob* x_static_input_blob_; + Blob* cont_input_blob_; +}; + +} // namespace caffe + +#endif // CAFFE_RECURRENT_LAYER_HPP_ diff --git a/include/caffe/layers/rnn_layer.hpp b/include/caffe/layers/rnn_layer.hpp new file mode 100644 index 00000000000..bd36e68f437 --- /dev/null +++ b/include/caffe/layers/rnn_layer.hpp @@ -0,0 +1,45 @@ +#ifndef CAFFE_RNN_LAYER_HPP_ +#define CAFFE_RNN_LAYER_HPP_ + +#include +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/layers/recurrent_layer.hpp" +#include "caffe/net.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +/** + * @brief Processes time-varying inputs using a simple recurrent neural network + * (RNN). Implemented as a network unrolling the RNN computation in time. + * + * Given time-varying inputs @f$ x_t @f$, computes hidden state @f$ + * h_t := \tanh[ W_{hh} h_{t_1} + W_{xh} x_t + b_h ] + * @f$, and outputs @f$ + * o_t := \tanh[ W_{ho} h_t + b_o ] + * @f$. 
+ */ +template +class RNNLayer : public RecurrentLayer { + public: + explicit RNNLayer(const LayerParameter& param) + : RecurrentLayer(param) {} + + virtual inline const char* type() const { return "RNN"; } + + protected: + void FillUnrolledNet(NetParameter* net_param) const override; + void RecurrentInputBlobNames(vector* names) const override; + void RecurrentOutputBlobNames(vector* names) const override; + void RecurrentInputShapes(vector* shapes) const override; + void OutputBlobNames(vector* names) const override; +}; + +} // namespace caffe + +#endif // CAFFE_RNN_LAYER_HPP_ diff --git a/include/caffe/layers/smooth_L1_loss_layer.hpp b/include/caffe/layers/smooth_L1_loss_layer.hpp new file mode 100644 index 00000000000..889ef4ebf8b --- /dev/null +++ b/include/caffe/layers/smooth_L1_loss_layer.hpp @@ -0,0 +1,70 @@ +// ------------------------------------------------------------------ +// Fast R-CNN +// copyright (c) 2015 Microsoft +// Licensed under The MIT License [see fast-rcnn/LICENSE for details] +// Written by Ross Girshick +// Modified by Wei Liu +// ------------------------------------------------------------------ + +#ifndef CAFFE_SMOOTH_L1_LOSS_LAYER_HPP_ +#define CAFFE_SMOOTH_L1_LOSS_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/loss_layer.hpp" + +namespace caffe { + +/** + * @brief Computes the SmoothL1 loss as introduced in:@f$ + * Fast R-CNN, Ross Girshick, ICCV 2015. 
+ */ +template +class SmoothL1LossLayer : public LossLayer { + typedef Ftype Dtype; + + public: + explicit SmoothL1LossLayer(const LayerParameter& param) + : LossLayer(param), diff_() {} + virtual void LayerSetUp(const vector& bottom, + const vector& top); + virtual void Reshape(const vector& bottom, + const vector& top); + + virtual inline const char* type() const { return "SmoothL1Loss"; } + + virtual inline int MinBottomBlobs() const { return 2; } + virtual inline int MaxBottomBlobs() const { return 3; } + + /** + * Unlike most loss layers, in the SmoothL1LossLayer we can backpropagate + * to both inputs -- override to return true and always allow force_backward. + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return true; + } + + protected: + /// @copydoc SmoothL1LossLayer + virtual void Forward_cpu(const vector& bottom, + const vector& top); + virtual void Forward_gpu(const vector& bottom, + const vector& top); + + virtual void Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom); + virtual void Backward_gpu(const vector& top, + const vector& propagate_down, const vector& bottom); + + TBlob diff_; + TBlob errors_; + bool has_weights_; +}; + +} // namespace caffe + +#endif // CAFFE_SMOOTH_L1_LOSS_LAYER_HPP_ diff --git a/include/caffe/layers/video_data_layer.hpp b/include/caffe/layers/video_data_layer.hpp new file mode 100644 index 00000000000..a4e09b62515 --- /dev/null +++ b/include/caffe/layers/video_data_layer.hpp @@ -0,0 +1,56 @@ +#ifndef CAFFE_VIDEO_DATA_LAYER_HPP_ +#define CAFFE_VIDEO_DATA_LAYER_HPP_ + +#if OPENCV_VERSION == 3 +#include +#else +#include +#endif // OPENCV_VERSION == 3 + +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/data_transformer.hpp" +#include "caffe/internal_thread.hpp" +#include "caffe/layer.hpp" +#include "caffe/layers/base_data_layer.hpp" +#include "caffe/proto/caffe.pb.h" +#include "caffe/util/db.hpp" + +namespace caffe { + +/** + * @brief Provides 
data to the Net from webcam or video files. + * + * TODO(weiliu89): thorough documentation for Forward and proto params. + */ +template +class VideoDataLayer : public BasePrefetchingDataLayer { + public: + VideoDataLayer(const LayerParameter& param, size_t solver_rank); + virtual ~VideoDataLayer(); + virtual void DataLayerSetUp(const vector& bottom, + const vector& top); + virtual inline bool ShareInParallel() const { return false; } + virtual inline const char* type() const { return "VideoData"; } + virtual inline int ExactNumBottomBlobs() const { return 0; } + virtual inline int MinTopBlobs() const { return 1; } + + protected: + void load_batch(Batch* batch, int thread_id, size_t queue_id) override; + void start_reading() override {} + + VideoDataParameter_VideoType video_type_; + cv::VideoCapture cap_; + + int skip_frames_; + + int total_frames_; + int processed_frames_; + vector top_shape_; +}; + +} // namespace caffe + +#endif // CAFFE_VIDEO_DATA_LAYER_HPP_ diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index 31821c0ac6a..ab02a2652e2 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -16,6 +16,7 @@ #include "caffe/util/blocking_queue.hpp" #include "caffe/util/thread_pool.hpp" #include "caffe/layers/data_layer.hpp" +#include "caffe/layers/annotated_data_layer.hpp" namespace caffe { @@ -32,14 +33,18 @@ class Net { explicit Net(const NetParameter& param, size_t solver_rank = 0U, Flag* solver_init_flag = nullptr, - Flag* solver_iter0_flag = nullptr, - const Net* root_net = nullptr); + const Net* root_net = nullptr, + bool inner_net = false, + int level = 0, + const vector* stages = NULL); Net(const string& param_file, Phase phase, size_t solver_rank = 0U, Flag* solver_init_flag = nullptr, - Flag* solver_iter0_flag = nullptr, - const Net* root_net = nullptr); + const Net* root_net = nullptr, + bool inner_net = false, + int level = 0, + const vector* stages = NULL); ~Net(); /// @brief Initialize a network with a NetParameter. 
@@ -275,8 +280,16 @@ class Net { return infer_count_; } + size_t solver_rank() const { + return solver_rank_; + } + bool global_grad_scale_enabled() const { - return global_grad_scale_param_ > 1.F; + return has_global_grad_scale_param_ && global_grad_scale_param_ > 0.F; + } + + bool inner_net() const { + return inner_net_; } void update_grad_scale(); @@ -293,7 +306,8 @@ class Net { size_t prefetch_bytes() { size_t bytes = 0UL; for (const shared_ptr& layer : layers_) { - if (typeid(*layer) == typeid(DataLayer)) { + if (typeid(*layer) == typeid(DataLayer) || + typeid(*layer) == typeid(AnnotatedDataLayer)) { bytes += reinterpret_cast*>(layer.get())->prefetch_bytes(); } } @@ -420,14 +434,15 @@ class Net { size_t solver_rank_; BlockingQueue reduction_queue_[2]; Flag* solver_init_flag_; - Flag* solver_iter0_flag_; vector layer_inititialized_flags_; NetParameter net_param_; size_t infer_count_; std::atomic_llong wgrad_sq_; float global_grad_scale_coeff_, global_grad_scale_param_; - bool global_grad_scale_adaptive_; + bool has_global_grad_scale_param_, global_grad_scale_adaptive_; + /// Inner net runs on singe GPU (see recurrent layers) + const bool inner_net_; static constexpr float GRAD_FACTOR = 1.E6F; diff --git a/include/caffe/parallel.hpp b/include/caffe/parallel.hpp index 9df87d525fe..b7cab7f4c4a 100644 --- a/include/caffe/parallel.hpp +++ b/include/caffe/parallel.hpp @@ -36,7 +36,7 @@ struct SharedScores { private: vector> memory_; - static constexpr size_t MAX_SCORES = 1000; + static constexpr size_t MAX_SCORES = 1000*10; }; template diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index 3c9cc493dab..30f2395847d 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -197,6 +197,7 @@ class Solver { // The test routine vector TestAll(const int iters = 0, bool use_multi_gpu = false); vector Test(const int test_net_id = 0, const int iters = 0, bool use_multi_gpu = false); + vector TestDetection(const int test_net_id = 0); virtual void 
SnapshotSolverState(const string& model_filename) = 0; virtual void RestoreSolverStateFromHDF5(const string& state_file) = 0; virtual void RestoreSolverStateFromBinaryProto(const string& state_file) = 0; @@ -238,7 +239,7 @@ class Solver { bool requested_early_exit_; // some layers like Data have to wait for this one - Flag init_flag_, iter0_flag_; + Flag init_flag_; // Timing information shared_ptr iteration_timer_; diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 7c99d5015be..daf8b469daa 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -73,6 +73,7 @@ class SyncedMemory { bool own_gpu_data_; int device_; bool valid_; + shared_ptr pstream_; DISABLE_COPY_MOVE_AND_ASSIGN(SyncedMemory); }; // class SyncedMemory diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp index 4513d61a56e..a668d639d01 100644 --- a/include/caffe/test/test_gradient_check_util.hpp +++ b/include/caffe/test/test_gradient_check_util.hpp @@ -22,9 +22,12 @@ class GradientChecker { // kink - kink_range <= |feature value| <= kink + kink_range, // which accounts for all nonsmoothness in use by caffe GradientChecker(const float stepsize, const float threshold, const unsigned int seed = 1701, - const float kink = 0., const float kink_range = -1) : stepsize_(stepsize), - threshold_(threshold), kink_(kink), - kink_range_(kink_range), seed_(seed) {} + const float kink = 0.F, const float kink_range = -1.F) + : stepsize_(stepsize), + threshold_(threshold), + kink_(kink), + kink_range_(kink_range), + seed_(seed) {} // Checks the gradient of a layer, with provided bottom layers and top // layers. @@ -132,9 +135,9 @@ void GradientChecker::CheckGradientSingle(LayerBase* layer, const vector< // bottom[blob_id][i] only for i == top_data_id. For any other // i != top_data_id, we know the derivative is 0 by definition, and simply // check that that's true. 
- float estimated_gradient = 0; - float positive_objective = 0; - float negative_objective = 0; + float estimated_gradient = 0.F; + float positive_objective = 0.F; + float negative_objective = 0.F; if (!element_wise || (feat_id == top_data_id)) { // Do finite differencing. // Compute loss with stepsize_ added to input. @@ -186,7 +189,31 @@ void GradientChecker::CheckGradientExhaustive(LayerBase* layer, const vec template void GradientChecker::CheckGradientEltwise(LayerBase* layer, const vector& bottom, const vector& top) { - layer->SetUp(bottom, top); + vector bottom_copy(bottom.size()), top_copy(top.size()); + vector> bottom_scopy(bottom.size()), top_scopy(top.size()); + for (int i = 0; i < bottom.size(); ++i) { + if (bottom[i]->count() > 0) { + bottom_scopy[i] = Blob::create(bottom[i]->shape()); + bottom_scopy[i]->CopyDataFrom(*bottom[i]); + bottom_scopy[i]->CopyDiffFrom(*bottom[i]); + } else { + bottom_scopy[i] = Blob::create(); + } + bottom_copy[i] = bottom_scopy[i].get(); + } + for (int i = 0; i < top.size(); ++i) { + if (top[i]->count() > 0) { + top_scopy[i] = Blob::create(top[i]->shape()); + top_scopy[i]->CopyDataFrom(*top[i]); + top_scopy[i]->CopyDiffFrom(*top[i]); + } else { + top_scopy[i] = Blob::create(bottom[0]->shape()); + top_scopy[i]->CopyDataFrom(*bottom[0]); + top_scopy[i]->CopyDiffFrom(*bottom[0]); + } + top_copy[i] = top_scopy[i].get(); + } + layer->SetUp(bottom_copy, top_copy); CHECK_GT(top.size(), 0) << "Eltwise mode requires at least one top blob."; const int check_bottom = -1; const bool element_wise = true; diff --git a/include/caffe/type.hpp b/include/caffe/type.hpp index 2c2e5207b53..23cf368364d 100644 --- a/include/caffe/type.hpp +++ b/include/caffe/type.hpp @@ -34,6 +34,11 @@ template <> inline constexpr Type tp() { return UINT; } +template <> +inline constexpr Type tp() { + //CHECK(false) << "Should not reach here: tp()"; + return BOOL; +} #ifdef USE_CUDNN template diff --git a/include/caffe/util/bbox_util.hpp 
b/include/caffe/util/bbox_util.hpp new file mode 100644 index 00000000000..b41ab0dc529 --- /dev/null +++ b/include/caffe/util/bbox_util.hpp @@ -0,0 +1,518 @@ +#include +#include +#include + +#ifndef CAFFE_UTIL_BBOX_UTIL_H_ +#define CAFFE_UTIL_BBOX_UTIL_H_ + +#include +#include // for std::fabs and std::signbit +#include +#include +#include +#include + +#include "glog/logging.h" + +#include "caffe/caffe.hpp" + +namespace caffe { + +typedef EmitConstraint_EmitType EmitType; +typedef PriorBoxParameter_CodeType CodeType; +typedef MultiBoxLossParameter_MatchType MatchType; +typedef MultiBoxLossParameter_LocLossType LocLossType; +typedef MultiBoxLossParameter_ConfLossType ConfLossType; +typedef MultiBoxLossParameter_MiningType MiningType; + +typedef map > LabelBBox; + +// Function used to sort NormalizedBBox, stored in STL container (e.g. vector), +// in ascend order based on the score value. +bool SortBBoxAscend(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2); + +// Function used to sort NormalizedBBox, stored in STL container (e.g. vector), +// in descend order based on the score value. +bool SortBBoxDescend(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2); + +// Function used to sort pair, stored in STL container (e.g. vector) +// in ascend order based on the score (first) value. +template +bool SortScorePairAscend(const pair& pair1, + const pair& pair2); + +// Function used to sort pair, stored in STL container (e.g. vector) +// in descend order based on the score (first) value. +template +bool SortScorePairDescend(const pair& pair1, + const pair& pair2); + +// Generate unit bbox [0, 0, 1, 1] +NormalizedBBox UnitBBox(); + +// Check if a bbox is cross boundary or not. +bool IsCrossBoundaryBBox(const NormalizedBBox& bbox); + +// Compute the intersection between two bboxes. +void IntersectBBox(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2, + NormalizedBBox* intersect_bbox); + +// Compute bbox size.
+float BBoxSize(const NormalizedBBox& bbox, const bool normalized = true); + +template +Dtype BBoxSize(const Dtype* bbox, const bool normalized = true); + +// Clip the NormalizedBBox such that the range for each corner is [0, 1]. +void ClipBBox(const NormalizedBBox& bbox, NormalizedBBox* clip_bbox); + +// Clip the bbox such that the bbox is within [0, 0; width, height]. +void ClipBBox(const NormalizedBBox& bbox, const float height, const float width, + NormalizedBBox* clip_bbox); + +// Scale the NormalizedBBox w.r.t. height and width. +void ScaleBBox(const NormalizedBBox& bbox, const int height, const int width, + NormalizedBBox* scale_bbox); + +// Output predicted bbox on the actual image. +void OutputBBox(const NormalizedBBox& bbox, const pair& img_size, + const bool has_resize, const ResizeParameter& resize_param, + NormalizedBBox* out_bbox); + +// Locate bbox in the coordinate system that src_bbox sits. +void LocateBBox(const NormalizedBBox& src_bbox, const NormalizedBBox& bbox, + NormalizedBBox* loc_bbox); + +// Project bbox onto the coordinate system defined by src_bbox. +bool ProjectBBox(const NormalizedBBox& src_bbox, const NormalizedBBox& bbox, + NormalizedBBox* proj_bbox); + +// Extrapolate the transformed bbox if height_scale and width_scale is +// explicitly provided, and it is only effective for FIT_SMALL_SIZE case. +void ExtrapolateBBox(const ResizeParameter& param, const int height, + const int width, const NormalizedBBox& crop_bbox, NormalizedBBox* bbox); + +// Compute the jaccard (intersection over union IoU) overlap between two bboxes. +float JaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2, + const bool normalized = true); + +template +Dtype JaccardOverlap(const Dtype* bbox1, const Dtype* bbox2); + +// Compute the coverage of bbox1 by bbox2. +float BBoxCoverage(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2); + +// Encode a bbox according to a prior bbox. 
+void EncodeBBox(const NormalizedBBox& prior_bbox, + const vector& prior_variance, const CodeType code_type, + const bool encode_variance_in_target, const NormalizedBBox& bbox, + NormalizedBBox* encode_bbox); + +// Check if a bbox meet emit constraint w.r.t. src_bbox. +bool MeetEmitConstraint(const NormalizedBBox& src_bbox, + const NormalizedBBox& bbox, const EmitConstraint& emit_constraint); + +// Decode a bbox according to a prior bbox. +void DecodeBBox(const NormalizedBBox& prior_bbox, + const vector& prior_variance, const CodeType code_type, + const bool variance_encoded_in_target, const bool clip_bbox, + const NormalizedBBox& bbox, NormalizedBBox* decode_bbox); + +// Decode a set of bboxes according to a set of prior bboxes. +void DecodeBBoxes(const vector& prior_bboxes, + const vector >& prior_variances, + const CodeType code_type, const bool variance_encoded_in_target, + const bool clip_bbox, const vector& bboxes, + vector* decode_bboxes); + +// Decode all bboxes in a batch. +void DecodeBBoxesAll(const vector& all_loc_pred, + const vector& prior_bboxes, + const vector >& prior_variances, + const int num, const bool share_location, + const int num_loc_classes, const int background_label_id, + const CodeType code_type, const bool variance_encoded_in_target, + const bool clip, vector* all_decode_bboxes); + +// Match prediction bboxes with ground truth bboxes. +void MatchBBox(const vector& gt, + const vector& pred_bboxes, const int label, + const MatchType match_type, const float overlap_threshold, + const bool ignore_cross_boundary_bbox, + vector* match_indices, vector* match_overlaps); + +// Find matches between prediction bboxes and ground truth bboxes. +// all_loc_preds: stores the location prediction, where each item contains +// location prediction for an image. +// all_gt_bboxes: stores ground truth bboxes for the batch. +// prior_bboxes: stores all the prior bboxes in the format of NormalizedBBox. 
+// prior_variances: stores all the variances needed by prior bboxes. +// multibox_loss_param: stores the parameters for MultiBoxLossLayer. +// all_match_overlaps: stores jaccard overlaps between predictions and gt. +// all_match_indices: stores mapping between predictions and ground truth. +void FindMatches(const vector& all_loc_preds, + const map >& all_gt_bboxes, + const vector& prior_bboxes, + const vector >& prior_variances, + const MultiBoxLossParameter& multibox_loss_param, + vector > >* all_match_overlaps, + vector > >* all_match_indices); + +// Count the number of matches from the match indices. +int CountNumMatches(const vector > >& all_match_indices, + const int num); + +// Mine the hard examples from the batch. +// conf_blob: stores the confidence prediction. +// all_loc_preds: stores the location prediction, where each item contains +// location prediction for an image. +// all_gt_bboxes: stores ground truth bboxes for the batch. +// prior_bboxes: stores all the prior bboxes in the format of NormalizedBBox. +// prior_variances: stores all the variances needed by prior bboxes. +// all_match_overlaps: stores jaccard overlap between predictions and gt. +// multibox_loss_param: stores the parameters for MultiBoxLossLayer. +// all_match_indices: stores mapping between predictions and ground truth. +// all_loc_loss: stores the confidence loss per location for each image. +template +void MineHardExamples(const TBlob& conf_blob, + const vector& all_loc_preds, + const map >& all_gt_bboxes, + const vector& prior_bboxes, + const vector >& prior_variances, + const vector > >& all_match_overlaps, + const MultiBoxLossParameter& multibox_loss_param, + int* num_matches, int* num_negs, + vector > >* all_match_indices, + vector >* all_neg_indices); + +// Retrieve bounding box ground truth from gt_data. +// gt_data: 1 x 1 x num_gt x 7 blob. +// num_gt: the number of ground truth. 
+// background_label_id: the label for background class which is used to do +// santity check so that no ground truth contains it. +// all_gt_bboxes: stores ground truth for each image. Label of each bbox is +// stored in NormalizedBBox. +template +void GetGroundTruth(const Dtype* gt_data, const int num_gt, + const int background_label_id, const bool use_difficult_gt, + map >* all_gt_bboxes); +// Store ground truth bboxes of same label in a group. +template +void GetGroundTruth(const Dtype* gt_data, const int num_gt, + const int background_label_id, const bool use_difficult_gt, + map* all_gt_bboxes); + +// Get location predictions from loc_data. +// loc_data: num x num_preds_per_class * num_loc_classes * 4 blob. +// num: the number of images. +// num_preds_per_class: number of predictions per class. +// num_loc_classes: number of location classes. It is 1 if share_location is +// true; and is equal to number of classes needed to predict otherwise. +// share_location: if true, all classes share the same location prediction. +// loc_preds: stores the location prediction, where each item contains +// location prediction for an image. +template +void GetLocPredictions(const Dtype* loc_data, const int num, + const int num_preds_per_class, const int num_loc_classes, + const bool share_location, vector* loc_preds); + +// Encode the localization prediction and ground truth for each matched prior. +// all_loc_preds: stores the location prediction, where each item contains +// location prediction for an image. +// all_gt_bboxes: stores ground truth bboxes for the batch. +// all_match_indices: stores mapping between predictions and ground truth. +// prior_bboxes: stores all the prior bboxes in the format of NormalizedBBox. +// prior_variances: stores all the variances needed by prior bboxes. +// multibox_loss_param: stores the parameters for MultiBoxLossLayer. +// loc_pred_data: stores the location prediction results. +// loc_gt_data: stores the encoded location ground truth. 
+template +void EncodeLocPrediction(const vector& all_loc_preds, + const map >& all_gt_bboxes, + const vector > >& all_match_indices, + const vector& prior_bboxes, + const vector >& prior_variances, + const MultiBoxLossParameter& multibox_loss_param, + Dtype* loc_pred_data, Dtype* loc_gt_data); + +// Compute the localization loss per matched prior. +// loc_pred: stores the location prediction results. +// loc_gt: stores the encoded location ground truth. +// all_match_indices: stores mapping between predictions and ground truth. +// num: number of images in the batch. +// num_priors: total number of priors. +// loc_loss_type: type of localization loss, Smooth_L1 or L2. +// all_loc_loss: stores the localization loss for all priors in a batch. +template +void ComputeLocLoss(const TBlob& loc_pred, const TBlob& loc_gt, + const vector > >& all_match_indices, + const int num, const int num_priors, const LocLossType loc_loss_type, + vector >* all_loc_loss); + +// Get confidence predictions from conf_data. +// conf_data: num x num_preds_per_class * num_classes blob. +// num: the number of images. +// num_preds_per_class: number of predictions per class. +// num_classes: number of classes. +// conf_preds: stores the confidence prediction, where each item contains +// confidence prediction for an image. +template +void GetConfidenceScores(const Dtype* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + vector > >* conf_scores); + +// Get confidence predictions from conf_data. +// conf_data: num x num_preds_per_class * num_classes blob. +// num: the number of images. +// num_preds_per_class: number of predictions per class. +// num_classes: number of classes. +// class_major: if true, data layout is +// num x num_classes x num_preds_per_class; otherwise, data layerout is +// num x num_preds_per_class * num_classes. +// conf_preds: stores the confidence prediction, where each item contains +// confidence prediction for an image. 
+template +void GetConfidenceScores(const Dtype* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const bool class_major, vector > >* conf_scores); + +// Compute the confidence loss for each prior from conf_data. +// conf_data: num x num_preds_per_class * num_classes blob. +// num: the number of images. +// num_preds_per_class: number of predictions per class. +// num_classes: number of classes. +// background_label_id: it is used to skip selecting max scores from +// background class. +// loss_type: compute the confidence loss according to the loss type. +// all_match_indices: stores mapping between predictions and ground truth. +// all_gt_bboxes: stores ground truth bboxes from the batch. +// all_conf_loss: stores the confidence loss per location for each image. +template +void ComputeConfLoss(const Dtype* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const int background_label_id, const ConfLossType loss_type, + const vector > >& all_match_indices, + const map >& all_gt_bboxes, + vector >* all_conf_loss); + +// Compute the negative confidence loss for each prior from conf_data. +// conf_data: num x num_preds_per_class * num_classes blob. +// num: the number of images. +// num_preds_per_class: number of predictions per class. +// num_classes: number of classes. +// background_label_id: it is used to skip selecting max scores from +// background class. +// loss_type: compute the confidence loss according to the loss type. +// all_conf_loss: stores the confidence loss per location for each image. +template +void ComputeConfLoss(const Dtype* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const int background_label_id, const ConfLossType loss_type, + vector >* all_conf_loss); + +// Encode the confidence predictions and ground truth for each matched prior. +// conf_data: num x num_priors * num_classes blob. +// num: number of images. 
+// num_priors: number of priors (predictions) per image. +// multibox_loss_param: stores the parameters for MultiBoxLossLayer. +// all_match_indices: stores mapping between predictions and ground truth. +// all_neg_indices: stores the indices for negative samples. +// all_gt_bboxes: stores ground truth bboxes for the batch. +// conf_pred_data: stores the confidence prediction results. +// conf_gt_data: stores the confidence ground truth. +template +void EncodeConfPrediction(const Dtype* conf_data, const int num, + const int num_priors, const MultiBoxLossParameter& multibox_loss_param, + const vector > >& all_match_indices, + const vector >& all_neg_indices, + const map >& all_gt_bboxes, + Dtype* conf_pred_data, Dtype* conf_gt_data); + +// Get prior bounding boxes from prior_data. +// prior_data: 1 x 2 x num_priors * 4 x 1 blob. +// num_priors: number of priors. +// prior_bboxes: stores all the prior bboxes in the format of NormalizedBBox. +// prior_variances: stores all the variances needed by prior bboxes. +template +void GetPriorBBoxes(const Dtype* prior_data, const int num_priors, + vector* prior_bboxes, + vector >* prior_variances); + +// Get detection results from det_data. +// det_data: 1 x 1 x num_det x 7 blob. +// num_det: the number of detections. +// background_label_id: the label for background class which is used to do +// santity check so that no detection contains it. +// all_detections: stores detection results for each class from each image. +template +void GetDetectionResults(const Dtype* det_data, const int num_det, + const int background_label_id, + map* all_detections); + +// Get top_k scores with corresponding indices. +// scores: a set of scores. +// indices: a set of corresponding indices. +// top_k: if -1, keep all; otherwise, keep at most top_k. +// score_index_vec: store the sorted (score, index) pair. 
+void GetTopKScoreIndex(const vector& scores, const vector& indices, + const int top_k, vector >* score_index_vec); + +// Get max scores with corresponding indices. +// scores: a set of scores. +// threshold: only consider scores higher than the threshold. +// top_k: if -1, keep all; otherwise, keep at most top_k. +// score_index_vec: store the sorted (score, index) pair. +void GetMaxScoreIndex(const vector& scores, const float threshold, + const int top_k, vector >* score_index_vec); + +// Get max scores with corresponding indices. +// scores: an array of scores. +// num: number of total scores in the array. +// threshold: only consider scores higher than the threshold. +// top_k: if -1, keep all; otherwise, keep at most top_k. +// score_index_vec: store the sorted (score, index) pair. +template +void GetMaxScoreIndex(const Dtype* scores, const int num, const float threshold, + const int top_k, vector >* score_index_vec); + +// Get max scores with corresponding indices. +// scores: a set of scores. +// threshold: only consider scores higher than the threshold. +// top_k: if -1, keep all; otherwise, keep at most top_k. +// score_index_vec: store the sorted (score, index) pair. +void GetMaxScoreIndex(const vector& scores, const float threshold, + const int top_k, vector >* score_index_vec); + +// Do non maximum suppression given bboxes and scores. +// bboxes: a set of bounding boxes. +// scores: a set of corresponding confidences. +// threshold: the threshold used in non maximum suppression. +// top_k: if not -1, keep at most top_k picked indices. +// reuse_overlaps: if true, use and update overlaps; otherwise, always +// compute overlap. +// overlaps: a temp place to optionally store the overlaps between pairs of +// bboxes if reuse_overlaps is true. +// indices: the kept indices of bboxes after nms. 
+void ApplyNMS(const vector& bboxes, const vector& scores, + const float threshold, const int top_k, const bool reuse_overlaps, + map >* overlaps, vector* indices); + +void ApplyNMS(const vector& bboxes, const vector& scores, + const float threshold, const int top_k, vector* indices); + +void ApplyNMS(const bool* overlapped, const int num, vector* indices); + +// Do non maximum suppression given bboxes and scores. +// Inspired by Piotr Dollar's NMS implementation in EdgeBox. +// https://goo.gl/jV3JYS +// bboxes: a set of bounding boxes. +// scores: a set of corresponding confidences. +// score_threshold: a threshold used to filter detection results. +// nms_threshold: a threshold used in non maximum suppression. +// eta: adaptation rate for nms threshold (see Piotr's paper). +// top_k: if not -1, keep at most top_k picked indices. +// indices: the kept indices of bboxes after nms. +void ApplyNMSFast(const vector& bboxes, + const vector& scores, const float score_threshold, + const float nms_threshold, const float eta, const int top_k, + vector* indices); + +// Do non maximum suppression based on raw bboxes and scores data. +// Inspired by Piotr Dollar's NMS implementation in EdgeBox. +// https://goo.gl/jV3JYS +// bboxes: an array of bounding boxes. +// scores: an array of corresponding confidences. +// num: number of total boxes/confidences in the array. +// score_threshold: a threshold used to filter detection results. +// nms_threshold: a threshold used in non maximum suppression. +// eta: adaptation rate for nms threshold (see Piotr's paper). +// top_k: if not -1, keep at most top_k picked indices. +// indices: the kept indices of bboxes after nms. +template +void ApplyNMSFast(const Dtype* bboxes, const Dtype* scores, const int num, + const float score_threshold, const float nms_threshold, + const float eta, const int top_k, vector* indices); + +// Compute cumsum of a set of pairs. 
+void CumSum(const vector >& pairs, vector* cumsum); + +// Compute average precision given true positive and false positive vectors. +// tp: contains pairs of scores and true positive. +// num_pos: number of positives. +// fp: contains pairs of scores and false positive. +// ap_version: different ways of computing Average Precision. +// Check https://sanchom.wordpress.com/tag/average-precision/ for details. +// 11point: the 11-point interpolated average precision. Used in VOC2007. +// MaxIntegral: maximally interpolated AP. Used in VOC2012/ILSVRC. +// Integral: the natural integral of the precision-recall curve. +// prec: stores the computed precisions. +// rec: stores the computed recalls. +// ap: the computed Average Precision. +void ComputeAP(const vector >& tp, const int num_pos, + const vector >& fp, const string ap_version, + vector* prec, vector* rec, float* ap); + +template +__host__ __device__ Dtype BBoxSizeGPU(const Dtype* bbox, + const bool normalized = true); + +template +__host__ __device__ Dtype JaccardOverlapGPU(const Dtype* bbox1, + const Dtype* bbox2); + +template +void DecodeBBoxesGPU(const int nthreads, + const Dtype* loc_data, const Dtype* prior_data, + const CodeType code_type, const bool variance_encoded_in_target, + const int num_priors, const bool share_location, + const int num_loc_classes, const int background_label_id, + const bool clip_bbox, Dtype* bbox_data); + +template +void PermuteDataGPU(const int nthreads, + const Dtype* data, const int num_classes, const int num_data, + const int num_dim, Dtype* new_data); + +template +void SoftMaxGPU(const Dtype* data, const int outer_num, const int channels, + const int inner_num, Dtype* prob); + +template +void ComputeOverlappedGPU(const int nthreads, + const Dtype* bbox_data, const int num_bboxes, const int num_classes, + const Dtype overlap_threshold, bool* overlapped_data); + +template +void ComputeOverlappedByIdxGPU(const int nthreads, + const Dtype* bbox_data, const Dtype 
overlap_threshold, + const int* idx, const int num_idx, bool* overlapped_data); + +template +void ApplyNMSGPU(const Dtype* bbox_data, const Dtype* conf_data, + const int num_bboxes, const float confidence_threshold, + const int top_k, const float nms_threshold, vector* indices); + +template +void GetDetectionsGPU(const Dtype* bbox_data, const Dtype* conf_data, + const int image_id, const int label, const vector& indices, + const bool clip_bbox, TBlob* detection_blob); + +template + void ComputeConfLossGPU(const TBlob& conf_blob, const int num, + const int num_preds_per_class, const int num_classes, + const int background_label_id, const ConfLossType loss_type, + const vector > >& all_match_indices, + const map >& all_gt_bboxes, + vector >* all_conf_loss); + +vector GetColors(const int n); + +template +void VisualizeBBox(const vector& images, const Blob* detections, + const float threshold, const vector& colors, + const map& label_to_display_name, + const string& save_file); + +} // namespace caffe + +#endif // CAFFE_UTIL_BBOX_UTIL_H_ diff --git a/include/caffe/util/benchmark.hpp b/include/caffe/util/benchmark.hpp index 842812ea04c..61c223fae64 100644 --- a/include/caffe/util/benchmark.hpp +++ b/include/caffe/util/benchmark.hpp @@ -17,9 +17,9 @@ class Timer { virtual float MicroSeconds(); virtual float Seconds(); - inline bool initted() { return initted_; } - inline bool running() { return running_; } - inline bool has_run_at_least_once() { return has_run_at_least_once_; } + bool initted() { return initted_; } + bool running() { return running_; } + bool has_run_at_least_once() { return has_run_at_least_once_; } protected: void Init(); @@ -33,16 +33,17 @@ class Timer { boost::posix_time::ptime stop_cpu_; float elapsed_milliseconds_; float elapsed_microseconds_; + int device_; }; class CPUTimer : public Timer { public: explicit CPUTimer(); - virtual ~CPUTimer() {} - virtual void Start(); - virtual void Stop(); - virtual float MilliSeconds(); - virtual float 
MicroSeconds(); + ~CPUTimer() override {} + void Start() override; + void Stop() override; + float MilliSeconds() override; + float MicroSeconds() override; }; } // namespace caffe diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp index 51978d7b148..0c5de457fb6 100644 --- a/include/caffe/util/cudnn.hpp +++ b/include/caffe/util/cudnn.hpp @@ -127,6 +127,22 @@ cudnnDataType_t cudnn_data_type(Type math) { return ret; } +template +inline void createFilterDesc(cudnnFilterDescriptor_t* desc, int n, int c, int h, int w) { + CUDNN_CHECK(cudnnCreateFilterDescriptor(desc)); + CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, cudnn::dataType::type, + CUDNN_TENSOR_NCHW, n, c, h, w)); +} + +inline void setConvolutionDesc(Type math, cudnnConvolutionDescriptor_t conv, + int pad_h, int pad_w, int stride_h, int stride_w, int dilation_h, int dilation_w) { + int padA[2] = {pad_h, pad_w}; + int strideA[2] = {stride_h, stride_w}; + int upscaleA[2] = {dilation_h, dilation_w}; + CUDNN_CHECK(cudnnSetConvolutionNdDescriptor(conv, 2, padA, strideA, upscaleA, + CUDNN_CROSS_CORRELATION, cudnn::cudnn_data_type(math))); +} + template inline void createTensor4dDesc(cudnnTensorDescriptor_t *desc) { CUDNN_CHECK(cudnnCreateTensorDescriptor(desc)); diff --git a/include/caffe/util/db.hpp b/include/caffe/util/db.hpp index 1239d579e2f..4d0186ac2f6 100644 --- a/include/caffe/util/db.hpp +++ b/include/caffe/util/db.hpp @@ -21,6 +21,7 @@ class Cursor { virtual const void* data() const = 0; virtual size_t size() const = 0; virtual bool parse(Datum* datum) const = 0; + virtual bool parse(AnnotatedDatum* datum) const = 0; virtual bool parse(C2TensorProtos* c2p) const = 0; virtual bool valid() const = 0; diff --git a/include/caffe/util/db_leveldb.hpp b/include/caffe/util/db_leveldb.hpp index e228ffdeffb..92c7f234ec2 100644 --- a/include/caffe/util/db_leveldb.hpp +++ b/include/caffe/util/db_leveldb.hpp @@ -20,9 +20,13 @@ class LevelDBCursor : public Cursor { void Next() override { 
iter_->Next(); } string key() const override { return iter_->key().ToString(); } string value() const override { return iter_->value().ToString(); } + bool parse(Datum* datum) const override { return datum->ParseFromArray(iter_->value().data(), iter_->value().size()); } + bool parse(AnnotatedDatum* adatum) const override { + return adatum->ParseFromArray(iter_->value().data(), iter_->value().size()); + } bool parse(C2TensorProtos* c2p) const override { return c2p->ParseFromArray(iter_->value().data(), iter_->value().size()); } diff --git a/include/caffe/util/db_lmdb.hpp b/include/caffe/util/db_lmdb.hpp index be8b77b8d1a..e7609ba2f6e 100644 --- a/include/caffe/util/db_lmdb.hpp +++ b/include/caffe/util/db_lmdb.hpp @@ -38,6 +38,9 @@ class LMDBCursor : public Cursor { bool parse(Datum* datum) const override { return datum->ParseFromArray(mdb_value_.mv_data, mdb_value_.mv_size); } + bool parse(AnnotatedDatum* adatum) const override { + return adatum->ParseFromArray(mdb_value_.mv_data, mdb_value_.mv_size); + } bool parse(C2TensorProtos* c2p) const override { return c2p->ParseFromArray(mdb_value_.mv_data, mdb_value_.mv_size); } diff --git a/include/caffe/util/device_alternate.hpp b/include/caffe/util/device_alternate.hpp index 467721baaba..654d8c31d68 100644 --- a/include/caffe/util/device_alternate.hpp +++ b/include/caffe/util/device_alternate.hpp @@ -98,11 +98,10 @@ namespace nvml { struct NVMLInit { NVMLInit(); ~NVMLInit(); - nvmlDevice_t device_; static std::mutex m_; }; -void setCpuAffinity(); +void setCpuAffinity(int device); } #endif // NO_NVML diff --git a/include/caffe/util/gpu_math_functions.cuh b/include/caffe/util/gpu_math_functions.cuh index 5cbd2ccb694..a1d706aa0ce 100644 --- a/include/caffe/util/gpu_math_functions.cuh +++ b/include/caffe/util/gpu_math_functions.cuh @@ -26,6 +26,15 @@ half hmul(half a, half b) { #endif } +__device__ __inline__ +half hdiv(half a, half b) { +#if __CUDA_ARCH__ >= 530 + return __hdiv(a, b); +#else + return 
float2half_clip(__half2float(a) / __half2float(b)); +#endif +} + __device__ __inline__ half2 hmul2(half2 a, half2 b) { #if __CUDA_ARCH__ >= 530 diff --git a/include/caffe/util/gpu_memory.hpp b/include/caffe/util/gpu_memory.hpp index 19d988e1688..4072d2c6da4 100644 --- a/include/caffe/util/gpu_memory.hpp +++ b/include/caffe/util/gpu_memory.hpp @@ -28,8 +28,8 @@ struct GPUMemory { } template - static void allocate(Any** ptr, size_t size, int device = current_device(), int group = 0) { - if (!try_allocate(reinterpret_cast(ptr), size, device, group)) { + static void allocate(Any** ptr, size_t size, int device, const shared_ptr& pstream) { + if (!try_allocate(reinterpret_cast(ptr), size, device, pstream)) { LOG(FATAL) << "Failed to allocate " << size << " bytes on device " << device << ". " << mgr_.report_dev_info(device); } @@ -39,8 +39,9 @@ struct GPUMemory { mgr_.deallocate(ptr, device); } - static bool try_allocate(void** ptr, size_t size, int device = current_device(), int group = 0) { - return mgr_.try_allocate(ptr, size, device, group); + static bool try_allocate(void** ptr, size_t size, int device, + const shared_ptr& pstream) { + return mgr_.try_allocate(ptr, size, device, pstream); } static shared_mutex& read_write_mutex() { @@ -108,10 +109,26 @@ struct GPUMemory { void* ptr_; size_t size_; int device_; + shared_ptr pstream_; DISABLE_COPY_MOVE_AND_ASSIGN(Workspace); }; + struct PinnedBuffer { + explicit PinnedBuffer(size_t size); + ~PinnedBuffer(); + + void* get() { + return dptr_; + } + + private: + void* hptr_; + void* dptr_; + + DISABLE_COPY_MOVE_AND_ASSIGN(PinnedBuffer); + }; + private: struct Manager { Manager(); @@ -119,7 +136,7 @@ struct GPUMemory { void lazy_init(int device); void GetInfo(size_t* free_mem, size_t* used_mem, bool with_update); void deallocate(void* ptr, int device); - bool try_allocate(void** ptr, size_t size, int device, int group = 0); + bool try_allocate(void** ptr, size_t size, int device, const shared_ptr& pstream); void init(const 
std::vector&, bool); void reset(); std::string report_dev_info(int device); diff --git a/include/caffe/util/im_transforms.hpp b/include/caffe/util/im_transforms.hpp new file mode 100644 index 00000000000..dbbc1a5ba8e --- /dev/null +++ b/include/caffe/util/im_transforms.hpp @@ -0,0 +1,86 @@ +#ifndef IM_TRANSFORMS_HPP +#define IM_TRANSFORMS_HPP + +#include +#include + +#include + +#include "caffe/common.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +// Generate random number given the probablities for each number. +int roll_weighted_die(const std::vector& probabilities); + +void UpdateBBoxByResizePolicy(const ResizeParameter& param, + const int old_width, const int old_height, + NormalizedBBox* bbox); + +void InferNewSize(const ResizeParameter& resize_param, + const int old_width, const int old_height, + int* new_width, int* new_height); + +template +bool is_border(const cv::Mat& edge, T color); + +// Auto cropping image. +template +cv::Rect CropMask(const cv::Mat& src, T point, int padding = 2); + +cv::Mat colorReduce(const cv::Mat& image, int div = 64); + +void fillEdgeImage(const cv::Mat& edgesIn, cv::Mat* filledEdgesOut); + +void CenterObjectAndFillBg(const cv::Mat& in_img, const bool fill_bg, + cv::Mat* out_img); + +cv::Mat AspectKeepingResizeAndPad(const cv::Mat& in_img, + const int new_width, const int new_height, + const int pad_type = cv::BORDER_CONSTANT, + const cv::Scalar pad = cv::Scalar(0, 0, 0), + const int interp_mode = cv::INTER_LINEAR); + +cv::Mat AspectKeepingResizeBySmall(const cv::Mat& in_img, + const int new_width, const int new_height, + const int interp_mode = cv::INTER_LINEAR); + +void constantNoise(const int n, const vector& val, cv::Mat* image); + +cv::Mat ApplyResize(const cv::Mat& in_img, const ResizeParameter& param); + +cv::Mat ApplyNoise(const cv::Mat& in_img, const NoiseParameter& param); + + +void RandomBrightness(const cv::Mat& in_img, cv::Mat* out_img, + const float brightness_prob, const float brightness_delta); + 
+void AdjustBrightness(const cv::Mat& in_img, const float delta, + cv::Mat* out_img); + +void RandomContrast(const cv::Mat& in_img, cv::Mat* out_img, + const float contrast_prob, const float lower, const float upper); + +void AdjustContrast(const cv::Mat& in_img, const float delta, + cv::Mat* out_img); + +void RandomSaturation(const cv::Mat& in_img, cv::Mat* out_img, + const float saturation_prob, const float lower, const float upper); + +void AdjustSaturation(const cv::Mat& in_img, const float delta, + cv::Mat* out_img); + +void RandomHue(const cv::Mat& in_img, cv::Mat* out_img, + const float hue_prob, const float hue_delta); + +void AdjustHue(const cv::Mat& in_img, const float delta, cv::Mat* out_img); + +void RandomOrderChannels(const cv::Mat& in_img, cv::Mat* out_img, + const float random_order_prob); + +cv::Mat ApplyDistort(const cv::Mat& in_img, const DistortionParameter& param); + +} // namespace caffe + +#endif // IM_TRANSFORMS_HPP diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index dacffcd96eb..ec98f5a43c0 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -7,6 +7,7 @@ #include #include #include // NOLINT(readability/streams) +#include #include #include "google/protobuf/message.h" @@ -131,6 +132,24 @@ inline bool ReadFileToDatum(const string& filename, Datum* datum) { return ReadFileToDatum(filename, -1, datum); } +bool ReadImageToDatum(const string& filename, const int label, + const int height, const int width, const int min_dim, const int max_dim, + const bool is_color, const std::string & encoding, Datum* datum); + +inline bool ReadImageToDatum(const string& filename, const int label, + const int height, const int width, const int min_dim, const int max_dim, + const bool is_color, Datum* datum) { + return ReadImageToDatum(filename, label, height, width, min_dim, max_dim, + is_color, "", datum); +} + +inline bool ReadImageToDatum(const string& filename, const int label, + const int height, const int width, 
const int min_dim, const int max_dim, + Datum* datum) { + return ReadImageToDatum(filename, label, height, width, min_dim, max_dim, + true, datum); +} + bool ReadImageToDatum(const string& filename, const int label, const int height, const int width, const bool is_color, const std::string & encoding, Datum* datum); @@ -161,6 +180,78 @@ inline bool ReadImageToDatum(const string& filename, const int label, return ReadImageToDatum(filename, label, 0, 0, true, encoding, datum); } + +void GetImageSize(const string& filename, int* height, int* width); + +bool ReadRichImageToAnnotatedDatum(const string& filename, + const string& labelname, const int height, const int width, + const int min_dim, const int max_dim, const bool is_color, + const std::string& encoding, const AnnotatedDatum_AnnotationType type, + const string& labeltype, const std::map& name_to_label, + AnnotatedDatum* anno_datum); + +inline bool ReadRichImageToAnnotatedDatum(const string& filename, + const string& labelname, const int height, const int width, + const bool is_color, const std::string & encoding, + const AnnotatedDatum_AnnotationType type, const string& labeltype, + const std::map& name_to_label, AnnotatedDatum* anno_datum) { + return ReadRichImageToAnnotatedDatum(filename, labelname, height, width, 0, 0, + is_color, encoding, type, labeltype, name_to_label, + anno_datum); +} + +bool ReadXMLToAnnotatedDatum(const string& labelname, const int img_height, + const int img_width, const std::map& name_to_label, + AnnotatedDatum* anno_datum); + +bool ReadJSONToAnnotatedDatum(const string& labelname, const int img_height, + const int img_width, const std::map& name_to_label, + AnnotatedDatum* anno_datum); + +bool ReadTxtToAnnotatedDatum(const string& labelname, const int height, + const int width, AnnotatedDatum* anno_datum); + +bool ReadLabelFileToLabelMap(const string& filename, bool include_background, + const string& delimiter, LabelMap* map); + +inline bool ReadLabelFileToLabelMap(const string& 
filename, + bool include_background, LabelMap* map) { + return ReadLabelFileToLabelMap(filename, include_background, " ", map); +} + +inline bool ReadLabelFileToLabelMap(const string& filename, LabelMap* map) { + return ReadLabelFileToLabelMap(filename, true, map); +} + +bool MapNameToLabel(const LabelMap& map, const bool strict_check, + std::map* name_to_label); + +inline bool MapNameToLabel(const LabelMap& map, + std::map* name_to_label) { + return MapNameToLabel(map, true, name_to_label); +} + +bool MapLabelToName(const LabelMap& map, const bool strict_check, + std::map* label_to_name); + +inline bool MapLabelToName(const LabelMap& map, + std::map* label_to_name) { + return MapLabelToName(map, true, label_to_name); +} + +bool MapLabelToDisplayName(const LabelMap& map, const bool strict_check, + std::map* label_to_display_name); + +inline bool MapLabelToDisplayName(const LabelMap& map, + std::map* label_to_display_name) { + return MapLabelToDisplayName(map, true, label_to_display_name); +} +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width, const int min_dim, const int max_dim, const bool is_color); + +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width, const int min_dim, const int max_dim); + cv::Mat ReadImageToCVMat(const string& filename, int height, int width, bool is_color, int short_side = 0); @@ -177,6 +268,9 @@ cv::Mat DecodeDatumToCVMatNative(const Datum& datum); cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color); bool DecodeDatumNative(Datum* datum); bool DecodeDatum(Datum* datum, bool is_color); +void EncodeCVMatToDatum(const cv::Mat& cv_img, const string& encoding, + Datum* datum); + void CVMatToDatum(const cv::Mat& cv_img, Datum& datum); vector DatumToCVMat(const Datum& datum, cv::Mat& img, bool shape_only); vector DecodeDatumToCVMat(const Datum& datum, int color_mode, cv::Mat& cv_img, diff --git a/include/caffe/util/sampler.hpp b/include/caffe/util/sampler.hpp new file 
mode 100644 index 00000000000..9f0280d79f6 --- /dev/null +++ b/include/caffe/util/sampler.hpp @@ -0,0 +1,39 @@ +#ifndef CAFFE_UTIL_SAMPLER_H_ +#define CAFFE_UTIL_SAMPLER_H_ + +#include + +#include "glog/logging.h" + +#include "caffe/caffe.hpp" + +namespace caffe { + +// Find all annotated NormalizedBBox. +void GroupObjectBBoxes(const AnnotatedDatum& anno_datum, + vector* object_bboxes); + +// Check if a sampled bbox satisfy the constraints with all object bboxes. +bool SatisfySampleConstraint(const NormalizedBBox& sampled_bbox, + const vector& object_bboxes, + const SampleConstraint& sample_constraint); + +// Sample a NormalizedBBox given the specifictions. +void SampleBBox(const Sampler& sampler, NormalizedBBox* sampled_bbox); + +// Generate samples from NormalizedBBox using the BatchSampler. +void GenerateSamples(const NormalizedBBox& source_bbox, + const vector& object_bboxes, + const BatchSampler& batch_sampler, + vector* sampled_bboxes); + +// Generate samples from AnnotatedDatum using the BatchSampler. +// All sampled bboxes which satisfy the constraints defined in BatchSampler +// is stored in sampled_bboxes. 
+void GenerateBatchSamples(const AnnotatedDatum& anno_datum, + const vector& batch_samplers, + vector* sampled_bboxes); + +} // namespace caffe + +#endif // CAFFE_UTIL_SAMPLER_H_ diff --git a/models/VGGNet/coco/SSD_300x300/deploy.prototxt b/models/VGGNet/coco/SSD_300x300/deploy.prototxt new file mode 100644 index 00000000000..d5e2116ad8a --- /dev/null +++ b/models/VGGNet/coco/SSD_300x300/deploy.prototxt @@ -0,0 +1,1629 @@ +name: "VGG_coco_SSD_300x300_deploy" +input: "data" +input_shape { + dim: 1 + dim: 3 + dim: 300 + dim: 300 +} +layer { + name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: "conv2_2" + type: "Convolution" 
+ bottom: "conv2_1" + top: "conv2_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + name: "pool3" + type: "Pooling" + bottom: "conv3_3" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: 
"pool3" + top: "conv4_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu4_3" + type: "ReLU" + bottom: "conv4_3" + top: "conv4_3" +} +layer { + name: "pool4" + type: "Pooling" + bottom: "conv4_3" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + dilation: 1 + } +} +layer { + name: "relu5_1" + type: "ReLU" + bottom: "conv5_1" + top: "conv5_1" +} +layer { + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + top: "conv5_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param 
{ + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + dilation: 1 + } +} +layer { + name: "relu5_2" + type: "ReLU" + bottom: "conv5_2" + top: "conv5_2" +} +layer { + name: "conv5_3" + type: "Convolution" + bottom: "conv5_2" + top: "conv5_3" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + dilation: 1 + } +} +layer { + name: "relu5_3" + type: "ReLU" + bottom: "conv5_3" + top: "conv5_3" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5_3" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "fc6" + type: "Convolution" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 1024 + pad: 6 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + dilation: 6 + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "Convolution" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 1024 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "conv6_1" + type: "Convolution" + bottom: "fc7" + top: "conv6_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } 
+} +layer { + name: "conv6_1_relu" + type: "ReLU" + bottom: "conv6_1" + top: "conv6_1" +} +layer { + name: "conv6_2" + type: "Convolution" + bottom: "conv6_1" + top: "conv6_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv6_2_relu" + type: "ReLU" + bottom: "conv6_2" + top: "conv6_2" +} +layer { + name: "conv7_1" + type: "Convolution" + bottom: "conv6_2" + top: "conv7_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv7_1_relu" + type: "ReLU" + bottom: "conv7_1" + top: "conv7_1" +} +layer { + name: "conv7_2" + type: "Convolution" + bottom: "conv7_1" + top: "conv7_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv7_2_relu" + type: "ReLU" + bottom: "conv7_2" + top: "conv7_2" +} +layer { + name: "conv8_1" + type: "Convolution" + bottom: "conv7_2" + top: "conv8_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv8_1_relu" + type: "ReLU" + bottom: "conv8_1" + top: "conv8_1" +} +layer { + name: "conv8_2" + type: "Convolution" + bottom: "conv8_1" + top: "conv8_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + 
param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv8_2_relu" + type: "ReLU" + bottom: "conv8_2" + top: "conv8_2" +} +layer { + name: "conv9_1" + type: "Convolution" + bottom: "conv8_2" + top: "conv9_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv9_1_relu" + type: "ReLU" + bottom: "conv9_1" + top: "conv9_1" +} +layer { + name: "conv9_2" + type: "Convolution" + bottom: "conv9_1" + top: "conv9_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv9_2_relu" + type: "ReLU" + bottom: "conv9_2" + top: "conv9_2" +} +layer { + name: "conv4_3_norm" + type: "Normalize" + bottom: "conv4_3" + top: "conv4_3_norm" + norm_param { + across_spatial: false + scale_filler { + type: "constant" + value: 20.0 + } + channel_shared: false + } +} +layer { + name: "conv4_3_norm_mbox_loc" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv4_3_norm_mbox_loc_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_loc" + top: "conv4_3_norm_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } 
+} +layer { + name: "conv4_3_norm_mbox_loc_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_loc_perm" + top: "conv4_3_norm_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 324 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv4_3_norm_mbox_conf_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_conf" + top: "conv4_3_norm_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_conf_perm" + top: "conv4_3_norm_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_priorbox" + type: "PriorBox" + bottom: "conv4_3_norm" + bottom: "data" + top: "conv4_3_norm_mbox_priorbox" + prior_box_param { + min_size: 21.0 + max_size: 45.0 + aspect_ratio: 2.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 8.0 + offset: 0.5 + } +} +layer { + name: "fc7_mbox_loc" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "fc7_mbox_loc_perm" + type: "Permute" + bottom: "fc7_mbox_loc" + top: "fc7_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_loc_flat" + type: "Flatten" + bottom: "fc7_mbox_loc_perm" + top: "fc7_mbox_loc_flat" + flatten_param { 
+ axis: 1 + } +} +layer { + name: "fc7_mbox_conf" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 486 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "fc7_mbox_conf_perm" + type: "Permute" + bottom: "fc7_mbox_conf" + top: "fc7_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_conf_flat" + type: "Flatten" + bottom: "fc7_mbox_conf_perm" + top: "fc7_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_priorbox" + type: "PriorBox" + bottom: "fc7" + bottom: "data" + top: "fc7_mbox_priorbox" + prior_box_param { + min_size: 45.0 + max_size: 99.0 + aspect_ratio: 2.0 + aspect_ratio: 3.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 16.0 + offset: 0.5 + } +} +layer { + name: "conv6_2_mbox_loc" + type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv6_2_mbox_loc_perm" + type: "Permute" + bottom: "conv6_2_mbox_loc" + top: "conv6_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv6_2_mbox_loc_perm" + top: "conv6_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_conf" + type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 
0.0 + } + convolution_param { + num_output: 486 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv6_2_mbox_conf_perm" + type: "Permute" + bottom: "conv6_2_mbox_conf" + top: "conv6_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv6_2_mbox_conf_perm" + top: "conv6_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv6_2" + bottom: "data" + top: "conv6_2_mbox_priorbox" + prior_box_param { + min_size: 99.0 + max_size: 153.0 + aspect_ratio: 2.0 + aspect_ratio: 3.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 32.0 + offset: 0.5 + } +} +layer { + name: "conv7_2_mbox_loc" + type: "Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv7_2_mbox_loc_perm" + type: "Permute" + bottom: "conv7_2_mbox_loc" + top: "conv7_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv7_2_mbox_loc_perm" + top: "conv7_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_conf" + type: "Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 486 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + 
value: 0.0 + } + } +} +layer { + name: "conv7_2_mbox_conf_perm" + type: "Permute" + bottom: "conv7_2_mbox_conf" + top: "conv7_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv7_2_mbox_conf_perm" + top: "conv7_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv7_2" + bottom: "data" + top: "conv7_2_mbox_priorbox" + prior_box_param { + min_size: 153.0 + max_size: 207.0 + aspect_ratio: 2.0 + aspect_ratio: 3.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 64.0 + offset: 0.5 + } +} +layer { + name: "conv8_2_mbox_loc" + type: "Convolution" + bottom: "conv8_2" + top: "conv8_2_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv8_2_mbox_loc_perm" + type: "Permute" + bottom: "conv8_2_mbox_loc" + top: "conv8_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv8_2_mbox_loc_perm" + top: "conv8_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_conf" + type: "Convolution" + bottom: "conv8_2" + top: "conv8_2_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 324 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv8_2_mbox_conf_perm" + type: "Permute" + bottom: "conv8_2_mbox_conf" + top: "conv8_2_mbox_conf_perm" + permute_param { 
+ order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv8_2_mbox_conf_perm" + top: "conv8_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv8_2" + bottom: "data" + top: "conv8_2_mbox_priorbox" + prior_box_param { + min_size: 207.0 + max_size: 261.0 + aspect_ratio: 2.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 100.0 + offset: 0.5 + } +} +layer { + name: "conv9_2_mbox_loc" + type: "Convolution" + bottom: "conv9_2" + top: "conv9_2_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv9_2_mbox_loc_perm" + type: "Permute" + bottom: "conv9_2_mbox_loc" + top: "conv9_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv9_2_mbox_loc_perm" + top: "conv9_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv9_2_mbox_conf" + type: "Convolution" + bottom: "conv9_2" + top: "conv9_2_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 324 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv9_2_mbox_conf_perm" + type: "Permute" + bottom: "conv9_2_mbox_conf" + top: "conv9_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv9_2_mbox_conf_perm" + top: "conv9_2_mbox_conf_flat" + 
flatten_param { + axis: 1 + } +} +layer { + name: "conv9_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv9_2" + bottom: "data" + top: "conv9_2_mbox_priorbox" + prior_box_param { + min_size: 261.0 + max_size: 315.0 + aspect_ratio: 2.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 300.0 + offset: 0.5 + } +} +layer { + name: "mbox_loc" + type: "Concat" + bottom: "conv4_3_norm_mbox_loc_flat" + bottom: "fc7_mbox_loc_flat" + bottom: "conv6_2_mbox_loc_flat" + bottom: "conv7_2_mbox_loc_flat" + bottom: "conv8_2_mbox_loc_flat" + bottom: "conv9_2_mbox_loc_flat" + top: "mbox_loc" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_conf" + type: "Concat" + bottom: "conv4_3_norm_mbox_conf_flat" + bottom: "fc7_mbox_conf_flat" + bottom: "conv6_2_mbox_conf_flat" + bottom: "conv7_2_mbox_conf_flat" + bottom: "conv8_2_mbox_conf_flat" + bottom: "conv9_2_mbox_conf_flat" + top: "mbox_conf" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_priorbox" + type: "Concat" + bottom: "conv4_3_norm_mbox_priorbox" + bottom: "fc7_mbox_priorbox" + bottom: "conv6_2_mbox_priorbox" + bottom: "conv7_2_mbox_priorbox" + bottom: "conv8_2_mbox_priorbox" + bottom: "conv9_2_mbox_priorbox" + top: "mbox_priorbox" + concat_param { + axis: 2 + } +} +layer { + name: "mbox_conf_reshape" + type: "Reshape" + bottom: "mbox_conf" + top: "mbox_conf_reshape" + reshape_param { + shape { + dim: 0 + dim: -1 + dim: 81 + } + } +} +layer { + name: "mbox_conf_softmax" + type: "Softmax" + bottom: "mbox_conf_reshape" + top: "mbox_conf_softmax" + softmax_param { + axis: 2 + } +} +layer { + name: "mbox_conf_flatten" + type: "Flatten" + bottom: "mbox_conf_softmax" + top: "mbox_conf_flatten" + flatten_param { + axis: 1 + } +} +layer { + name: "detection_out" + type: "DetectionOutput" + bottom: "mbox_loc" + bottom: "mbox_conf_flatten" + bottom: "mbox_priorbox" + top: "detection_out" + include { + phase: TEST + } + 
detection_output_param { + num_classes: 81 + share_location: true + background_label_id: 0 + nms_param { + nms_threshold: 0.449999988079 + top_k: 400 + } + save_output_param { + output_directory: "/home/snikolaev/data/mscoco/results/SSD_300x300" + output_name_prefix: "detections_minival_ssd300_results" + output_format: "COCO" + label_map_file: "data/coco/labelmap_coco.prototxt" + name_size_file: "data/coco/test2014_name_size.txt" + num_test_image: 5000 + } + code_type: CENTER_SIZE + keep_top_k: 200 + confidence_threshold: 0.00999999977648 + } +} + diff --git a/models/VGGNet/coco/SSD_300x300/solver.prototxt b/models/VGGNet/coco/SSD_300x300/solver.prototxt new file mode 100644 index 00000000000..1e5cc781df3 --- /dev/null +++ b/models/VGGNet/coco/SSD_300x300/solver.prototxt @@ -0,0 +1,33 @@ +train_net: "models/VGGNet/coco/SSD_300x300/train.prototxt" +test_net: "models/VGGNet/coco/SSD_300x300/test.prototxt" +test_iter: 156 #625 +test_interval: 1000 + +base_lr: 0.001 + +lr_policy: "poly" +power: 2 + +display: 10 +max_iter: 5000 #400000 + +#lr_policy: "multistep" +#gamma: 0.10000000149 + +momentum: 0.9 +weight_decay: 0.0005 +snapshot: 1000 #40000 +snapshot_prefix: "models/VGGNet/coco/SSD_300x300/VGG_coco_SSD_300x300" +solver_mode: GPU +device_id: 0 +debug_info: false +snapshot_after_train: true +test_initialization: false +average_loss: 10 +#stepvalue: 3500 #280000 +#stepvalue: 4500 #360000 +#stepvalue: 5000 #400000 +#iter_size: 1 +type: "SGD" +eval_type: "detection" +ap_version: "11point" diff --git a/models/VGGNet/coco/SSD_300x300/test.prototxt b/models/VGGNet/coco/SSD_300x300/test.prototxt new file mode 100644 index 00000000000..bd7c91b0041 --- /dev/null +++ b/models/VGGNet/coco/SSD_300x300/test.prototxt @@ -0,0 +1,1671 @@ +name: "VGG_coco_SSD_300x300_test" +layer { + name: "data" + type: "AnnotatedData" + top: "data" + top: "label" + include { + phase: TEST + } + transform_param { + mean_value: 104.0 + mean_value: 117.0 + mean_value: 123.0 + force_color: true + 
resize_param { + prob: 1.0 + resize_mode: WARP + height: 300 + width: 300 + interp_mode: LINEAR + } + } + data_param { + source: "examples/coco/coco_minival_lmdb" + batch_size: 128 #8 + backend: LMDB + } + annotated_data_param { + batch_sampler { + } + label_map_file: "data/coco/labelmap_coco.prototxt" + } +} +layer { + name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: 
"xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + name: "pool3" + type: "Pooling" + bottom: "conv3_3" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3" + top: "conv4_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + 
} + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu4_3" + type: "ReLU" + bottom: "conv4_3" + top: "conv4_3" +} +layer { + name: "pool4" + type: "Pooling" + bottom: "conv4_3" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + dilation: 1 + } +} +layer { + name: "relu5_1" + type: "ReLU" + bottom: "conv5_1" + top: "conv5_1" +} +layer { + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + top: "conv5_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + dilation: 1 + } +} +layer { + name: "relu5_2" + type: "ReLU" + bottom: "conv5_2" + 
top: "conv5_2" +} +layer { + name: "conv5_3" + type: "Convolution" + bottom: "conv5_2" + top: "conv5_3" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + dilation: 1 + } +} +layer { + name: "relu5_3" + type: "ReLU" + bottom: "conv5_3" + top: "conv5_3" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5_3" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "fc6" + type: "Convolution" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 1024 + pad: 6 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + dilation: 6 + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "Convolution" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 1024 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "conv6_1" + type: "Convolution" + bottom: "fc7" + top: "conv6_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv6_1_relu" + type: "ReLU" + bottom: "conv6_1" + top: "conv6_1" +} +layer { + name: "conv6_2" + type: "Convolution" + bottom: "conv6_1" + top: "conv6_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } 
+ param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv6_2_relu" + type: "ReLU" + bottom: "conv6_2" + top: "conv6_2" +} +layer { + name: "conv7_1" + type: "Convolution" + bottom: "conv6_2" + top: "conv7_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv7_1_relu" + type: "ReLU" + bottom: "conv7_1" + top: "conv7_1" +} +layer { + name: "conv7_2" + type: "Convolution" + bottom: "conv7_1" + top: "conv7_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv7_2_relu" + type: "ReLU" + bottom: "conv7_2" + top: "conv7_2" +} +layer { + name: "conv8_1" + type: "Convolution" + bottom: "conv7_2" + top: "conv8_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv8_1_relu" + type: "ReLU" + bottom: "conv8_1" + top: "conv8_1" +} +layer { + name: "conv8_2" + type: "Convolution" + bottom: "conv8_1" + top: "conv8_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} 
+layer { + name: "conv8_2_relu" + type: "ReLU" + bottom: "conv8_2" + top: "conv8_2" +} +layer { + name: "conv9_1" + type: "Convolution" + bottom: "conv8_2" + top: "conv9_1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv9_1_relu" + type: "ReLU" + bottom: "conv9_1" + top: "conv9_1" +} +layer { + name: "conv9_2" + type: "Convolution" + bottom: "conv9_1" + top: "conv9_2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv9_2_relu" + type: "ReLU" + bottom: "conv9_2" + top: "conv9_2" +} +layer { + name: "conv4_3_norm" + type: "Normalize" + bottom: "conv4_3" + top: "conv4_3_norm" + norm_param { + across_spatial: false + scale_filler { + type: "constant" + value: 20.0 + } + channel_shared: false + } +} +layer { + name: "conv4_3_norm_mbox_loc" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv4_3_norm_mbox_loc_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_loc" + top: "conv4_3_norm_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_loc_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_loc_perm" + top: "conv4_3_norm_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf" + 
type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 324 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv4_3_norm_mbox_conf_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_conf" + top: "conv4_3_norm_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_conf_perm" + top: "conv4_3_norm_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_priorbox" + type: "PriorBox" + bottom: "conv4_3_norm" + bottom: "data" + top: "conv4_3_norm_mbox_priorbox" + prior_box_param { + min_size: 21.0 + max_size: 45.0 + aspect_ratio: 2.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 8.0 + offset: 0.5 + } +} +layer { + name: "fc7_mbox_loc" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "fc7_mbox_loc_perm" + type: "Permute" + bottom: "fc7_mbox_loc" + top: "fc7_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_loc_flat" + type: "Flatten" + bottom: "fc7_mbox_loc_perm" + top: "fc7_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_conf" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + 
convolution_param { + num_output: 486 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "fc7_mbox_conf_perm" + type: "Permute" + bottom: "fc7_mbox_conf" + top: "fc7_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_conf_flat" + type: "Flatten" + bottom: "fc7_mbox_conf_perm" + top: "fc7_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_priorbox" + type: "PriorBox" + bottom: "fc7" + bottom: "data" + top: "fc7_mbox_priorbox" + prior_box_param { + min_size: 45.0 + max_size: 99.0 + aspect_ratio: 2.0 + aspect_ratio: 3.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 16.0 + offset: 0.5 + } +} +layer { + name: "conv6_2_mbox_loc" + type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv6_2_mbox_loc_perm" + type: "Permute" + bottom: "conv6_2_mbox_loc" + top: "conv6_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv6_2_mbox_loc_perm" + top: "conv6_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_conf" + type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 486 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: 
"conv6_2_mbox_conf_perm" + type: "Permute" + bottom: "conv6_2_mbox_conf" + top: "conv6_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv6_2_mbox_conf_perm" + top: "conv6_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv6_2" + bottom: "data" + top: "conv6_2_mbox_priorbox" + prior_box_param { + min_size: 99.0 + max_size: 153.0 + aspect_ratio: 2.0 + aspect_ratio: 3.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 32.0 + offset: 0.5 + } +} +layer { + name: "conv7_2_mbox_loc" + type: "Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv7_2_mbox_loc_perm" + type: "Permute" + bottom: "conv7_2_mbox_loc" + top: "conv7_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv7_2_mbox_loc_perm" + top: "conv7_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_conf" + type: "Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 486 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv7_2_mbox_conf_perm" + type: "Permute" + bottom: "conv7_2_mbox_conf" + top: "conv7_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + 
order: 1 + } +} +layer { + name: "conv7_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv7_2_mbox_conf_perm" + top: "conv7_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv7_2" + bottom: "data" + top: "conv7_2_mbox_priorbox" + prior_box_param { + min_size: 153.0 + max_size: 207.0 + aspect_ratio: 2.0 + aspect_ratio: 3.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 64.0 + offset: 0.5 + } +} +layer { + name: "conv8_2_mbox_loc" + type: "Convolution" + bottom: "conv8_2" + top: "conv8_2_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv8_2_mbox_loc_perm" + type: "Permute" + bottom: "conv8_2_mbox_loc" + top: "conv8_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv8_2_mbox_loc_perm" + top: "conv8_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_conf" + type: "Convolution" + bottom: "conv8_2" + top: "conv8_2_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 324 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv8_2_mbox_conf_perm" + type: "Permute" + bottom: "conv8_2_mbox_conf" + top: "conv8_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv8_2_mbox_conf_perm" + top: "conv8_2_mbox_conf_flat" + flatten_param { 
+ axis: 1 + } +} +layer { + name: "conv8_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv8_2" + bottom: "data" + top: "conv8_2_mbox_priorbox" + prior_box_param { + min_size: 207.0 + max_size: 261.0 + aspect_ratio: 2.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 100.0 + offset: 0.5 + } +} +layer { + name: "conv9_2_mbox_loc" + type: "Convolution" + bottom: "conv9_2" + top: "conv9_2_mbox_loc" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv9_2_mbox_loc_perm" + type: "Permute" + bottom: "conv9_2_mbox_loc" + top: "conv9_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv9_2_mbox_loc_perm" + top: "conv9_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv9_2_mbox_conf" + type: "Convolution" + bottom: "conv9_2" + top: "conv9_2_mbox_conf" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 0.0 + } + convolution_param { + num_output: 324 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv9_2_mbox_conf_perm" + type: "Permute" + bottom: "conv9_2_mbox_conf" + top: "conv9_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv9_2_mbox_conf_perm" + top: "conv9_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv9_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv9_2" + bottom: "data" + top: "conv9_2_mbox_priorbox" + prior_box_param { + min_size: 
261.0 + max_size: 315.0 + aspect_ratio: 2.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 300.0 + offset: 0.5 + } +} +layer { + name: "mbox_loc" + type: "Concat" + bottom: "conv4_3_norm_mbox_loc_flat" + bottom: "fc7_mbox_loc_flat" + bottom: "conv6_2_mbox_loc_flat" + bottom: "conv7_2_mbox_loc_flat" + bottom: "conv8_2_mbox_loc_flat" + bottom: "conv9_2_mbox_loc_flat" + top: "mbox_loc" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_conf" + type: "Concat" + bottom: "conv4_3_norm_mbox_conf_flat" + bottom: "fc7_mbox_conf_flat" + bottom: "conv6_2_mbox_conf_flat" + bottom: "conv7_2_mbox_conf_flat" + bottom: "conv8_2_mbox_conf_flat" + bottom: "conv9_2_mbox_conf_flat" + top: "mbox_conf" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_priorbox" + type: "Concat" + bottom: "conv4_3_norm_mbox_priorbox" + bottom: "fc7_mbox_priorbox" + bottom: "conv6_2_mbox_priorbox" + bottom: "conv7_2_mbox_priorbox" + bottom: "conv8_2_mbox_priorbox" + bottom: "conv9_2_mbox_priorbox" + top: "mbox_priorbox" + concat_param { + axis: 2 + } +} +layer { + name: "mbox_conf_reshape" + type: "Reshape" + bottom: "mbox_conf" + top: "mbox_conf_reshape" + reshape_param { + shape { + dim: 0 + dim: -1 + dim: 81 + } + } +} +layer { + name: "mbox_conf_softmax" + type: "Softmax" + bottom: "mbox_conf_reshape" + top: "mbox_conf_softmax" + softmax_param { + axis: 2 + } +} +layer { + name: "mbox_conf_flatten" + type: "Flatten" + bottom: "mbox_conf_softmax" + top: "mbox_conf_flatten" + flatten_param { + axis: 1 + } +} +layer { + name: "detection_out" + type: "DetectionOutput" + bottom: "mbox_loc" + bottom: "mbox_conf_flatten" + bottom: "mbox_priorbox" + top: "detection_out" + include { + phase: TEST + } + detection_output_param { + num_classes: 81 + share_location: true + background_label_id: 0 + nms_param { + nms_threshold: 0.449999988079 + top_k: 400 + } + save_output_param { + output_directory: 
"/home/snikolaev/data/mscoco/results/SSD_300x300" + output_name_prefix: "detections_minival_ssd300_results" + output_format: "COCO" + label_map_file: "data/coco/labelmap_coco.prototxt" + name_size_file: "data/coco/test2014_name_size.txt" + num_test_image: 5000 + } + code_type: CENTER_SIZE + keep_top_k: 200 + confidence_threshold: 0.00999999977648 + } +} +layer { + name: "detection_eval" + type: "DetectionEvaluate" + bottom: "detection_out" + bottom: "label" + top: "detection_eval" + include { + phase: TEST + } + detection_evaluate_param { + num_classes: 81 + background_label_id: 0 + overlap_threshold: 0.5 + evaluate_difficult_gt: false + name_size_file: "data/coco/test2014_name_size.txt" + } +} + diff --git a/models/VGGNet/coco/SSD_300x300/train.prototxt b/models/VGGNet/coco/SSD_300x300/train.prototxt new file mode 100644 index 00000000000..2fd7179ca3b --- /dev/null +++ b/models/VGGNet/coco/SSD_300x300/train.prototxt @@ -0,0 +1,1486 @@ +name: "VGG_coco_SSD_300x300_train" +layer { + name: "data" + type: "AnnotatedData" + top: "data" + top: "label" + include { + phase: TRAIN + } + transform_param { + mirror: true + mean_value: 104.0 + mean_value: 117.0 + mean_value: 123.0 + force_color: true + resize_param { + prob: 1.0 + resize_mode: WARP + height: 300 + width: 300 + interp_mode: LINEAR + interp_mode: AREA + interp_mode: NEAREST + interp_mode: CUBIC + interp_mode: LANCZOS4 + } + emit_constraint { + emit_type: CENTER + } + distort_param { + brightness_prob: 0.5 + brightness_delta: 32.0 + contrast_prob: 0.5 + contrast_lower: 0.5 + contrast_upper: 1.5 + hue_prob: 0.5 + hue_delta: 18.0 + saturation_prob: 0.5 + saturation_lower: 0.5 + saturation_upper: 1.5 + random_order_prob: 0.0 + } + expand_param { + prob: 0.5 + max_expand_ratio: 4.0 + } + } + data_param { + source: "examples/coco/coco_train_lmdb" + batch_size: 128 #16 + backend: LMDB + } + annotated_data_param { + batch_sampler { + max_sample: 1 + max_trials: 1 + } + batch_sampler { + sampler { + min_scale: 
0.300000011921 + max_scale: 1.0 + min_aspect_ratio: 0.5 + max_aspect_ratio: 2.0 + } + sample_constraint { + min_jaccard_overlap: 0.10000000149 + } + max_sample: 1 + max_trials: 50 + } + batch_sampler { + sampler { + min_scale: 0.300000011921 + max_scale: 1.0 + min_aspect_ratio: 0.5 + max_aspect_ratio: 2.0 + } + sample_constraint { + min_jaccard_overlap: 0.300000011921 + } + max_sample: 1 + max_trials: 50 + } + batch_sampler { + sampler { + min_scale: 0.300000011921 + max_scale: 1.0 + min_aspect_ratio: 0.5 + max_aspect_ratio: 2.0 + } + sample_constraint { + min_jaccard_overlap: 0.5 + } + max_sample: 1 + max_trials: 50 + } + batch_sampler { + sampler { + min_scale: 0.300000011921 + max_scale: 1.0 + min_aspect_ratio: 0.5 + max_aspect_ratio: 2.0 + } + sample_constraint { + min_jaccard_overlap: 0.699999988079 + } + max_sample: 1 + max_trials: 50 + } + batch_sampler { + sampler { + min_scale: 0.300000011921 + max_scale: 1.0 + min_aspect_ratio: 0.5 + max_aspect_ratio: 2.0 + } + sample_constraint { + min_jaccard_overlap: 0.899999976158 + } + max_sample: 1 + max_trials: 50 + } + batch_sampler { + sampler { + min_scale: 0.300000011921 + max_scale: 1.0 + min_aspect_ratio: 0.5 + max_aspect_ratio: 2.0 + } + sample_constraint { + max_jaccard_overlap: 1.0 + } + max_sample: 1 + max_trials: 50 + } + label_map_file: "data/coco/labelmap_coco.prototxt" + } +} +layer { + name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: 
"conv1_2" + top: "conv1_2" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + 
name: "pool3" + type: "Pooling" + bottom: "conv3_3" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3" + top: "conv4_1" + + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3" + + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu4_3" + type: "ReLU" + bottom: "conv4_3" + top: "conv4_3" +} +layer { + name: "pool4" + type: "Pooling" + bottom: "conv4_3" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + dilation: 1 + } +} +layer { + name: "relu5_1" + type: "ReLU" + bottom: "conv5_1" + top: "conv5_1" +} +layer { + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + top: "conv5_2" + + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + dilation: 1 + } +} +layer { + name: "relu5_2" + type: "ReLU" + bottom: "conv5_2" + top: "conv5_2" +} +layer { + name: "conv5_3" + 
type: "Convolution" + bottom: "conv5_2" + top: "conv5_3" + + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + dilation: 1 + } +} +layer { + name: "relu5_3" + type: "ReLU" + bottom: "conv5_3" + top: "conv5_3" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5_3" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "fc6" + type: "Convolution" + bottom: "pool5" + top: "fc6" + convolution_param { + num_output: 1024 + pad: 6 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + dilation: 6 + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "Convolution" + bottom: "fc6" + top: "fc7" + + convolution_param { + num_output: 1024 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "conv6_1" + type: "Convolution" + bottom: "fc7" + top: "conv6_1" + + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv6_1_relu" + type: "ReLU" + bottom: "conv6_1" + top: "conv6_1" +} +layer { + name: "conv6_2" + type: "Convolution" + bottom: "conv6_1" + top: "conv6_2" + + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv6_2_relu" + type: "ReLU" + bottom: "conv6_2" + top: "conv6_2" +} +layer { + name: "conv7_1" + type: "Convolution" + bottom: "conv6_2" + top: "conv7_1" + + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: 
"xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv7_1_relu" + type: "ReLU" + bottom: "conv7_1" + top: "conv7_1" +} +layer { + name: "conv7_2" + type: "Convolution" + bottom: "conv7_1" + top: "conv7_2" + + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv7_2_relu" + type: "ReLU" + bottom: "conv7_2" + top: "conv7_2" +} +layer { + name: "conv8_1" + type: "Convolution" + bottom: "conv7_2" + top: "conv8_1" + + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv8_1_relu" + type: "ReLU" + bottom: "conv8_1" + top: "conv8_1" +} +layer { + name: "conv8_2" + type: "Convolution" + bottom: "conv8_1" + top: "conv8_2" + + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv8_2_relu" + type: "ReLU" + bottom: "conv8_2" + top: "conv8_2" +} +layer { + name: "conv9_1" + type: "Convolution" + bottom: "conv8_2" + top: "conv9_1" + + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv9_1_relu" + type: "ReLU" + bottom: "conv9_1" + top: "conv9_1" +} +layer { + name: "conv9_2" + type: "Convolution" + bottom: "conv9_1" + top: "conv9_2" + + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv9_2_relu" + type: "ReLU" + bottom: "conv9_2" + top: "conv9_2" +} +layer { + name: "conv4_3_norm" + type: "Normalize" + bottom: "conv4_3" + top: 
"conv4_3_norm" + norm_param { + across_spatial: false + scale_filler { + type: "constant" + value: 20.0 + } + channel_shared: false + } +} +layer { + name: "conv4_3_norm_mbox_loc" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_loc" + + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv4_3_norm_mbox_loc_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_loc" + top: "conv4_3_norm_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_loc_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_loc_perm" + top: "conv4_3_norm_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_conf" + + convolution_param { + num_output: 324 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv4_3_norm_mbox_conf_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_conf" + top: "conv4_3_norm_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_conf_perm" + top: "conv4_3_norm_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_priorbox" + type: "PriorBox" + bottom: "conv4_3_norm" + bottom: "data" + top: "conv4_3_norm_mbox_priorbox" + prior_box_param { + min_size: 21.0 + max_size: 45.0 + aspect_ratio: 2.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 8.0 + offset: 0.5 + } +} +layer { + name: "fc7_mbox_loc" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_loc" + + convolution_param { + 
num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "fc7_mbox_loc_perm" + type: "Permute" + bottom: "fc7_mbox_loc" + top: "fc7_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_loc_flat" + type: "Flatten" + bottom: "fc7_mbox_loc_perm" + top: "fc7_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_conf" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_conf" + + convolution_param { + num_output: 486 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "fc7_mbox_conf_perm" + type: "Permute" + bottom: "fc7_mbox_conf" + top: "fc7_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_conf_flat" + type: "Flatten" + bottom: "fc7_mbox_conf_perm" + top: "fc7_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_priorbox" + type: "PriorBox" + bottom: "fc7" + bottom: "data" + top: "fc7_mbox_priorbox" + prior_box_param { + min_size: 45.0 + max_size: 99.0 + aspect_ratio: 2.0 + aspect_ratio: 3.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 16.0 + offset: 0.5 + } +} +layer { + name: "conv6_2_mbox_loc" + type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_loc" + + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv6_2_mbox_loc_perm" + type: "Permute" + bottom: "conv6_2_mbox_loc" + top: "conv6_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_loc_flat" + type: "Flatten" + bottom: 
"conv6_2_mbox_loc_perm" + top: "conv6_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_conf" + type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_conf" + + convolution_param { + num_output: 486 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv6_2_mbox_conf_perm" + type: "Permute" + bottom: "conv6_2_mbox_conf" + top: "conv6_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv6_2_mbox_conf_perm" + top: "conv6_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv6_2" + bottom: "data" + top: "conv6_2_mbox_priorbox" + prior_box_param { + min_size: 99.0 + max_size: 153.0 + aspect_ratio: 2.0 + aspect_ratio: 3.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 32.0 + offset: 0.5 + } +} +layer { + name: "conv7_2_mbox_loc" + type: "Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_loc" + + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv7_2_mbox_loc_perm" + type: "Permute" + bottom: "conv7_2_mbox_loc" + top: "conv7_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv7_2_mbox_loc_perm" + top: "conv7_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_conf" + type: "Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_conf" + + convolution_param { + num_output: 486 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: 
"constant" + value: 0.0 + } + } +} +layer { + name: "conv7_2_mbox_conf_perm" + type: "Permute" + bottom: "conv7_2_mbox_conf" + top: "conv7_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv7_2_mbox_conf_perm" + top: "conv7_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv7_2" + bottom: "data" + top: "conv7_2_mbox_priorbox" + prior_box_param { + min_size: 153.0 + max_size: 207.0 + aspect_ratio: 2.0 + aspect_ratio: 3.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 64.0 + offset: 0.5 + } +} +layer { + name: "conv8_2_mbox_loc" + type: "Convolution" + bottom: "conv8_2" + top: "conv8_2_mbox_loc" + + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv8_2_mbox_loc_perm" + type: "Permute" + bottom: "conv8_2_mbox_loc" + top: "conv8_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv8_2_mbox_loc_perm" + top: "conv8_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_conf" + type: "Convolution" + bottom: "conv8_2" + top: "conv8_2_mbox_conf" + + convolution_param { + num_output: 324 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv8_2_mbox_conf_perm" + type: "Permute" + bottom: "conv8_2_mbox_conf" + top: "conv8_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv8_2_mbox_conf_perm" + top: 
"conv8_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv8_2" + bottom: "data" + top: "conv8_2_mbox_priorbox" + prior_box_param { + min_size: 207.0 + max_size: 261.0 + aspect_ratio: 2.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 0.20000000298 + variance: 0.20000000298 + step: 100.0 + offset: 0.5 + } +} +layer { + name: "conv9_2_mbox_loc" + type: "Convolution" + bottom: "conv9_2" + top: "conv9_2_mbox_loc" + + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv9_2_mbox_loc_perm" + type: "Permute" + bottom: "conv9_2_mbox_loc" + top: "conv9_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv9_2_mbox_loc_perm" + top: "conv9_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv9_2_mbox_conf" + type: "Convolution" + bottom: "conv9_2" + top: "conv9_2_mbox_conf" + + convolution_param { + num_output: 324 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv9_2_mbox_conf_perm" + type: "Permute" + bottom: "conv9_2_mbox_conf" + top: "conv9_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv9_2_mbox_conf_perm" + top: "conv9_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv9_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv9_2" + bottom: "data" + top: "conv9_2_mbox_priorbox" + prior_box_param { + min_size: 261.0 + max_size: 315.0 + aspect_ratio: 2.0 + flip: true + clip: false + variance: 0.10000000149 + variance: 0.10000000149 + variance: 
0.20000000298 + variance: 0.20000000298 + step: 300.0 + offset: 0.5 + } +} +layer { + name: "mbox_loc" + type: "Concat" + bottom: "conv4_3_norm_mbox_loc_flat" + bottom: "fc7_mbox_loc_flat" + bottom: "conv6_2_mbox_loc_flat" + bottom: "conv7_2_mbox_loc_flat" + bottom: "conv8_2_mbox_loc_flat" + bottom: "conv9_2_mbox_loc_flat" + top: "mbox_loc" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_conf" + type: "Concat" + bottom: "conv4_3_norm_mbox_conf_flat" + bottom: "fc7_mbox_conf_flat" + bottom: "conv6_2_mbox_conf_flat" + bottom: "conv7_2_mbox_conf_flat" + bottom: "conv8_2_mbox_conf_flat" + bottom: "conv9_2_mbox_conf_flat" + top: "mbox_conf" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_priorbox" + type: "Concat" + bottom: "conv4_3_norm_mbox_priorbox" + bottom: "fc7_mbox_priorbox" + bottom: "conv6_2_mbox_priorbox" + bottom: "conv7_2_mbox_priorbox" + bottom: "conv8_2_mbox_priorbox" + bottom: "conv9_2_mbox_priorbox" + top: "mbox_priorbox" + concat_param { + axis: 2 + } +} +layer { + name: "mbox_loss" + type: "MultiBoxLoss" + bottom: "mbox_loc" + bottom: "mbox_conf" + bottom: "mbox_priorbox" + bottom: "label" + top: "mbox_loss" + include { + phase: TRAIN + } + propagate_down: true + propagate_down: true + propagate_down: false + propagate_down: false + loss_param { + normalization: VALID + } + multibox_loss_param { + loc_loss_type: SMOOTH_L1 + conf_loss_type: SOFTMAX + loc_weight: 1.0 + num_classes: 81 + share_location: true + match_type: PER_PREDICTION + overlap_threshold: 0.5 + use_prior_for_matching: true + background_label_id: 0 + use_difficult_gt: false + neg_pos_ratio: 3.0 + neg_overlap: 0.5 + code_type: CENTER_SIZE + ignore_cross_boundary_bbox: false + mining_type: MAX_NEGATIVE + } +} + diff --git a/models/modelBuilder/build_resnet.py b/models/modelBuilder/build_resnet.py index 27be766dd82..8effdc5e01f 100755 --- a/models/modelBuilder/build_resnet.py +++ b/models/modelBuilder/build_resnet.py @@ -163,5 +163,32 @@ def main(): fp = 
open("resnet_50.prototxt", 'w') fp.write(model) + netConfig = numpy.matrix([ + [ 64, 3, 1, 0], + [128, 4, 1, 1], + [256, 23, 1, 1], + [512, 3, 1, 1]]) + model = buildResidualModel(netConfig, name="Resnet101", net_type="large") + fp = open("resnet_101.prototxt", 'w') + fp.write(model) + + netConfig = numpy.matrix([ + [ 64, 3, 1, 0], + [128, 8, 1, 1], + [256, 36, 1, 1], + [512, 3, 1, 1]]) + model = buildResidualModel(netConfig, name="Resnet152", net_type="large") + fp = open("resnet_152.prototxt", 'w') + fp.write(model) + + netConfig = numpy.matrix([ + [ 64, 3, 1, 0], + [128, 8, 1, 1], + [256, 52, 1, 1], + [512, 3, 1, 1]]) + model = buildResidualModel(netConfig, name="Resnet200", net_type="large") + fp = open("resnet_200.prototxt", 'w') + fp.write(model) + if __name__ == '__main__': main() diff --git a/models/modelBuilder/build_se_resnet.py b/models/modelBuilder/build_se_resnet.py new file mode 100755 index 00000000000..623eb778f07 --- /dev/null +++ b/models/modelBuilder/build_se_resnet.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +# ref: Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun -- "Deep Residual Learning for Image Recognition" +# https://arxiv.org/pdf/1512.03385v1.pdf +# +# ref: By Jie Hu, Li Shen, Gang Sun -- "Squeeze-and-Excitation Networks" +# https://arxiv.org/pdf/1709.01507.pdf + +import numpy +from layers import * + +#------------------------------------------------------------------------------ +def addSEBlock(model, name, bottom, num_output, reduction=1): + + model, top = addPool(model=model, name='{}.global_pool'.format(name), bottom=bottom, pool_type="AVE", kernel_size=0, stride=1, global_pooling=True) + + model, top = addConvRelu(model=model, name='{}.conv_down'.format(name), bottom=top, num_output=num_output//reduction, kernel_size=1) + model, top = addConv(model=model, name='{}.conv_up'.format(name), bottom=top, num_output=num_output, kernel_size=1) + model, top = addSigmoid(model=model, name='{}.prob'.format(name), bottom=top) + return model, 
top + +#------------------------------------------------------------------------------ +def addSERes(model, name , bottom, num_output, group, j, fix_dim, dilation = False): + + prefix="{name}.{j}.".format(name=name,j=str(j)) + block="" + block, top = addConvBnRelu(model=block, name='{}conv1'.format(prefix), bottom=bottom, num_output=num_output, + kernel_size=1, group=1, stride=2 if (fix_dim and (j==1)) else 1, pad=0) + block, top = addConvBnRelu(model=block, name='{}conv2'.format(prefix), bottom=top, num_output=num_output, + kernel_size=3, group=group, stride=1, pad=1) + block, top = addConvBn(model=block, name='{}conv3'.format(prefix), bottom=top, num_output=(num_output * 4), + kernel_size=1, group=1, stride=1, pad=0) + + block, se_top = addSEBlock(model=block, name='{}se'.format(prefix), bottom=top, num_output=(num_output*4), reduction=16) + + if (j == 1): + block, res_top = addConvBn(model=block, name='{}skipConv'.format(prefix), bottom=bottom, + num_output=(num_output * 4), + kernel_size=1, group=1, stride=2 if fix_dim else 1, pad=0) + else: + res_top = bottom + + + + block, top = addAxpy(model=block, name='{}axpy'.format(prefix), bottom_1=se_top, bottom_2= top, bottom_3=res_top) + + block, top = addActivation(model=block, name="{}relu".format(prefix), bottom=top) + + model += block + return model, top + +#------------------------------------------------------------------------------ +# [ 64, 3, 1, 0], + +def addResSuperBlock(model, bottom, i, num_subBlocks, num_output, group, fix_dim, net_type, dilation = False): + name = "res{i}".format(i=str(i)) + model = addComment(model, comment=name) + top=bottom + for j in xrange(1, num_subBlocks + 1): + model, top = addSERes(model, name, bottom=top, num_output=num_output, group=group, + j=j, fix_dim=fix_dim, dilation=dilation) + return model, top + +#------------------------------------------------------------------------------ + +def print_netconfig(netConfig): + + header_str=''' +# n: ch\t: s\t: g\t: skip\t:''' + + 
num_blocks=netConfig.shape[0] + for i in xrange(0, num_blocks): + header_str += ''' +# {i}: {num_outputs}\t: {subblocks}\t: {group}\t: {skip}\t:'''.format( + i=str(i), + num_outputs=str(netConfig[i,0]), + subblocks =str(netConfig[i,1]), + group =str(netConfig[i,2]), + skip =str(netConfig[i,3]) ) + + header_str += "\n" + return header_str + +#------------------------------------------------------------------------------ + +def buildResidualModel(netConfig, name, net_type): + + model = "" + model = addHeader(model, name=name) + model += print_netconfig(netConfig) + print(model) + + train_batch = 128 + test_batch = 32 + model, last_top = addData(model, train_batch, test_batch) + + model, top = addConvBnRelu(model, name="conv1", bottom="data", num_output=64, + kernel_size=7, group=1, stride=2, pad=3) + model, top = addPool(model, name="pool1", bottom=top, + kernel_size=3, stride=2, pool_type="MAX") + + num_blocks = len(netConfig) + for i in xrange(1, num_blocks+1): + num_output = netConfig[i-1,0] + num_subBlocks = netConfig[i-1,1] + group = netConfig[i-1,2] + fix_dim = (netConfig[i-1,3]==1) + bottom=top + model, top = addResSuperBlock(model, bottom, i+1, num_subBlocks, num_output, group, fix_dim, net_type) + + model, top = addPool(model, name="pool2", bottom=top, kernel_size=7, stride=1, pool_type="AVE") +# model, top = addDropout(model, name="dropout", bottom=top, ratio=0.5) + model, top = addFC(model, name="fc", bottom=top, num_output=1000, filler='msra') + + fc_top = top + model, top = addSoftmaxLoss(model, name="loss", bottom_1=fc_top) + model, top = addAccuracy(model, name="accuracy/top-1", bottom_1=fc_top, k=1) + model, top = addAccuracy(model, name="accuracy/top-5", bottom_1=fc_top, k=5) + + return model + +#------------------------------------------------------------------------------ + +def main(): + + netConfig = numpy.matrix([ + [ 64, 3, 1, 0], + [128, 4, 1, 1], + [256, 6, 1, 1], + [512, 3, 1, 1]]) + model = buildResidualModel(netConfig, name="SEResnet50", 
net_type="large") + fp = open("se_resnet_50.prototxt", 'w') + fp.write(model) + +if __name__ == '__main__': + main() diff --git a/models/modelBuilder/layers.py b/models/modelBuilder/layers.py index 068d66ef3ee..6fabb49632f 100755 --- a/models/modelBuilder/layers.py +++ b/models/modelBuilder/layers.py @@ -15,6 +15,33 @@ def addHeader(model, name): #------------------------------------------------------------------------------ +def addSigmoid(model, name, bottom): + layer = ''' +layer {{ + name: "{name}" + type: "Sigmoid" + bottom: "{bottom}" + top: "{top}" +}}'''.format(name=name, bottom=bottom, top=bottom) + model += layer + return model, bottom + + +def addAxpy(model, name, bottom_1, bottom_2, bottom_3): + layer = ''' +layer {{ + name: "{name}" + type: "Axpy" + bottom: "{bottom_1}" + bottom: "{bottom_2}" + bottom: "{bottom_3}" + top: "{top}" +}}'''.format(name=name, bottom_1=bottom_1, bottom_2=bottom_2, bottom_3=bottom_3, top=name) + top = name + model += layer + return model, top + + def addData(model, train_batch=32, test_batch=32, train_file="examples/imagenet/ilsvrc12_train_lmdb", test_file = "examples/imagenet/ilsvrc12_val_lmdb", @@ -337,7 +364,7 @@ def addConvBnSelu(model, name, bottom, num_output, #--------------------------------------------------------------------------------- -def addPool(model, name, bottom, kernel_size, stride, pool_type, pad=0): +def addPool(model, name, bottom, kernel_size, stride, pool_type, pad=0, global_pooling=False): layer = ''' layer {{ @@ -346,9 +373,11 @@ def addPool(model, name, bottom, kernel_size, stride, pool_type, pad=0): bottom: "{bottom}" top: "{top}" pooling_param {{ - pool: {pool_type} - kernel_size: {kernel_size}\n'''.format(name=name, top=name, bottom=bottom, - pool_type=pool_type, kernel_size=kernel_size) + pool: {pool_type}'''.format(name=name, top=name, bottom=bottom, + pool_type=pool_type) + + if (kernel_size > 0): + layer += ''' kernel_size: {kernel_size}\n'''.format(kernel_size=kernel_size) if (stride>1): 
layer += ''' stride: {}\n'''.format(stride) @@ -356,6 +385,9 @@ def addPool(model, name, bottom, kernel_size, stride, pool_type, pad=0): if (pad>0): layer += ''' pad: {}\n'''.format(pad) + if (global_pooling): + layer += ''' global_pooling: true\n''' + layer+=''' }\n}''' model += layer return model, name @@ -552,7 +584,7 @@ def addAccuracy(model, name, bottom_1, bottom_2="label", k=1): bottom: "{bottom_2}" top: "{top}" accuracy_param {{ top_k: {k} }} -# include {{ phase: TEST }} + include {{ phase: TEST }} }}'''.format(name=name, top=name, bottom_1=bottom_1, bottom_2=bottom_2, k=k) model += layer return model, name diff --git a/packaging/deb/templates/rules b/packaging/deb/templates/rules index defa8e32522..fc8122c2319 100755 --- a/packaging/deb/templates/rules +++ b/packaging/deb/templates/rules @@ -17,7 +17,6 @@ endif override_dh_auto_configure: dh_auto_configure -- \ - -DALLOW_LMDB_NOLOCK=ON \ -DBLAS=Open \ -DCMAKE_BUILD_TYPE="Release" \ -DCMAKE_SKIP_RPATH=TRUE \ diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index de733e8dfe6..d663e5b33b2 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -81,6 +81,7 @@ void set_mode_cpu() { } void set_mode_gpu() { + PyGILRelease gil; Caffe::set_mode(Caffe::GPU); vector gpus(1, 0); initialize_gpu_memory_scope(gpus); @@ -142,14 +143,37 @@ void CheckContiguousArray(PyArrayObject* arr, string name, } // Net constructor for passing phase as int -shared_ptr Net_Init(string param_file, int phase) { +shared_ptr Net_Init(string param_file, int phase, + const int level, const bp::object& stages, + const bp::object& weights) { + // Convert stages from list to vector + vector stages_vector; + if (!stages.is_none()) { + for (int i = 0; i < bp::len(stages); i++) { + stages_vector.push_back(bp::extract(stages[i])); + } + } + PyGILRelease gil; CheckFile(param_file); - shared_ptr net(new Net(param_file, static_cast(phase))); + shared_ptr net(new Net(param_file, static_cast(phase), + 0U, nullptr, nullptr, 
false, level, &stages_vector)); + // Load weights + if (!weights.is_none()) { + std::string weights_file_str = bp::extract(weights); + CheckFile(weights_file_str); + net->CopyTrainedLayersFrom(weights_file_str); + } return net; } // Net construct-and-load convenience constructor shared_ptr Net_Init_Load(string param_file, string pretrained_param_file, int phase) { + PyGILRelease gil; + LOG(WARNING) << "DEPRECATION WARNING - deprecated use of Python interface"; + LOG(WARNING) << "Use this instead (with the named \"weights\"" + << " parameter):"; + LOG(WARNING) << "Net('" << param_file << "', " << phase + << ", weights='" << pretrained_param_file << "')"; CheckFile(param_file); CheckFile(pretrained_param_file); shared_ptr net(new Net(param_file, static_cast(phase))); @@ -412,7 +436,12 @@ BOOST_PYTHON_MODULE(_caffe) { bp::class_, boost::noncopyable >("Net", bp::no_init) - .def("__init__", bp::make_constructor(&Net_Init)) + // Constructor + .def("__init__", bp::make_constructor(&Net_Init, + bp::default_call_policies(), (bp::arg("network_file"), "phase", + bp::arg("level")=0, bp::arg("stages")=bp::object(), + bp::arg("weights")=bp::object()))) + // Legacy constructor .def("__init__", bp::make_constructor(&Net_Init_Load)) .def("_forward", &Net_ForwardFromToNoGIL) .def("_backward", &Net_BackwardFromToNoGIL) diff --git a/python/caffe/io.py b/python/caffe/io.py index 5f933876cc0..dcb24cc31f4 100644 --- a/python/caffe/io.py +++ b/python/caffe/io.py @@ -330,7 +330,7 @@ def resize_image(im, new_dims, interp_order=1): # skimage is fast but only understands {1,3} channel images # in [0, 1]. 
im_std = (im - im_min) / (im_max - im_min) - resized_std = resize(im_std, new_dims, order=interp_order) + resized_std = resize(im_std, new_dims, mode = 'reflect', order=interp_order) resized_im = resized_std * (im_max - im_min) + im_min else: # the image is a constant -- avoid divide by 0 diff --git a/python/caffe/net_spec.py b/python/caffe/net_spec.py index 5fb1f0b3fb1..5d13067382b 100644 --- a/python/caffe/net_spec.py +++ b/python/caffe/net_spec.py @@ -93,6 +93,9 @@ def to_proto(self): return to_proto(self) + def _update(self, params): + self.fn._update(params) + def _to_proto(self, layers, names, autonames): return self.fn._to_proto(layers, names, autonames) @@ -128,6 +131,9 @@ def _get_top_name(self, top, names, autonames): names[top] = top.fn.type_name + str(autonames[top.fn.type_name]) return names[top] + def _update(self, params): + self.params.update(params) + def _to_proto(self, layers, names, autonames): if self in layers: return @@ -181,7 +187,21 @@ def __setitem__(self, key, value): def __getitem__(self, item): return self.__getattr__(item) - def to_proto(self): + def __delitem__(self, name): + del self.tops[name] + + def keys(self): + keys = [k for k, v in six.iteritems(self.tops)] + return keys + + def vals(self): + vals = [v for k, v in six.iteritems(self.tops)] + return vals + + def update(self, name, params): + self.tops[name]._update(params) + + def to_proto(self, verbose=False): names = {v: k for k, v in six.iteritems(self.tops)} autonames = Counter() layers = OrderedDict() diff --git a/python/caffe/test/test_net.py b/python/caffe/test/test_net.py index 3abc171c679..92b4d5ee7bb 100644 --- a/python/caffe/test/test_net.py +++ b/python/caffe/test/test_net.py @@ -83,10 +83,239 @@ def test_save_and_read(self): f.close() self.net.save(f.name) net_file = simple_net_file(self.num_output) - net2 = caffe.Net(net_file, f.name, caffe.TRAIN) + # Test legacy constructor + # should print deprecation warning + caffe.Net(net_file, f.name, caffe.TRAIN) + # Test 
named constructor + net2 = caffe.Net(net_file, caffe.TRAIN, weights=f.name) +# net2 = caffe.Net(net_file, f.name, caffe.TRAIN) os.remove(net_file) os.remove(f.name) for name in self.net.params: for i in range(len(self.net.params[name])): self.assertEqual(abs(self.net.params[name][i].data - net2.params[name][i].data).sum(), 0) + +class TestLevels(unittest.TestCase): + + TEST_NET = """ +layer { + name: "data" + type: "DummyData" + top: "data" + dummy_data_param { shape { dim: 1 dim: 1 dim: 10 dim: 10 } } +} +layer { + name: "NoLevel" + type: "InnerProduct" + bottom: "data" + top: "NoLevel" + inner_product_param { num_output: 1 } +} +layer { + name: "Level0Only" + type: "InnerProduct" + bottom: "data" + top: "Level0Only" + include { min_level: 0 max_level: 0 } + inner_product_param { num_output: 1 } +} +layer { + name: "Level1Only" + type: "InnerProduct" + bottom: "data" + top: "Level1Only" + include { min_level: 1 max_level: 1 } + inner_product_param { num_output: 1 } +} +layer { + name: "Level>=0" + type: "InnerProduct" + bottom: "data" + top: "Level>=0" + include { min_level: 0 } + inner_product_param { num_output: 1 } +} +layer { + name: "Level>=1" + type: "InnerProduct" + bottom: "data" + top: "Level>=1" + include { min_level: 1 } + inner_product_param { num_output: 1 } +} +""" + + def setUp(self): + self.f = tempfile.NamedTemporaryFile(mode='w+', delete=False) + self.f.write(self.TEST_NET) + self.f.close() + + def tearDown(self): + os.remove(self.f.name) + + def check_net(self, net, blobs): + net_blobs = [b for b in net.blobs.keys() if 'data' not in b] + self.assertEqual(net_blobs, blobs) + + def test_0(self): + net = caffe.Net(self.f.name, caffe.TEST) + self.check_net(net, ['NoLevel', 'Level0Only', 'Level>=0']) + + def test_1(self): + net = caffe.Net(self.f.name, caffe.TEST, level=1) + self.check_net(net, ['NoLevel', 'Level1Only', 'Level>=0', 'Level>=1']) + + +class TestStages(unittest.TestCase): + + TEST_NET = """ +layer { + name: "data" + type: "DummyData" + 
top: "data" + dummy_data_param { shape { dim: 1 dim: 1 dim: 10 dim: 10 } } +} +layer { + name: "A" + type: "InnerProduct" + bottom: "data" + top: "A" + include { stage: "A" } + inner_product_param { num_output: 1 } +} +layer { + name: "B" + type: "InnerProduct" + bottom: "data" + top: "B" + include { stage: "B" } + inner_product_param { num_output: 1 } +} +layer { + name: "AorB" + type: "InnerProduct" + bottom: "data" + top: "AorB" + include { stage: "A" } + include { stage: "B" } + inner_product_param { num_output: 1 } +} +layer { + name: "AandB" + type: "InnerProduct" + bottom: "data" + top: "AandB" + include { stage: "A" stage: "B" } + inner_product_param { num_output: 1 } +} +""" + + def setUp(self): + self.f = tempfile.NamedTemporaryFile(mode='w+', delete=False) + self.f.write(self.TEST_NET) + self.f.close() + + def tearDown(self): + os.remove(self.f.name) + + def check_net(self, net, blobs): + net_blobs = [b for b in net.blobs.keys() if 'data' not in b] + self.assertEqual(net_blobs, blobs) + + def test_A(self): + net = caffe.Net(self.f.name, caffe.TEST, stages=['A']) + self.check_net(net, ['A', 'AorB']) + + def test_B(self): + net = caffe.Net(self.f.name, caffe.TEST, stages=['B']) + self.check_net(net, ['B', 'AorB']) + + def test_AandB(self): + net = caffe.Net(self.f.name, caffe.TEST, stages=['A', 'B']) + self.check_net(net, ['A', 'B', 'AorB', 'AandB']) + + +class TestAllInOne(unittest.TestCase): + + TEST_NET = """ +layer { + name: "train_data" + type: "DummyData" + top: "data" + top: "label" + dummy_data_param { + shape { dim: 1 dim: 1 dim: 10 dim: 10 } + shape { dim: 1 dim: 1 dim: 1 dim: 1 } + } + include { phase: TRAIN stage: "train" } +} +layer { + name: "val_data" + type: "DummyData" + top: "data" + top: "label" + dummy_data_param { + shape { dim: 1 dim: 1 dim: 10 dim: 10 } + shape { dim: 1 dim: 1 dim: 1 dim: 1 } + } + include { phase: TEST stage: "val" } +} +layer { + name: "deploy_data" + type: "Input" + top: "data" + input_param { shape { dim: 1 dim: 
1 dim: 10 dim: 10 } } + include { phase: TEST stage: "deploy" } +} +layer { + name: "ip" + type: "InnerProduct" + bottom: "data" + top: "ip" + inner_product_param { num_output: 2 } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "ip" + bottom: "label" + top: "loss" + include: { phase: TRAIN stage: "train" } + include: { phase: TEST stage: "val" } +} +layer { + name: "pred" + type: "Softmax" + bottom: "ip" + top: "pred" + include: { phase: TEST stage: "deploy" } +} +""" + + def setUp(self): + self.f = tempfile.NamedTemporaryFile(mode='w+', delete=False) + self.f.write(self.TEST_NET) + self.f.close() + + def tearDown(self): + os.remove(self.f.name) + + def check_net(self, net, outputs): + self.assertEqual(list(net.blobs['data'].shape), [1,1,10,10]) + self.assertEqual(net.outputs, outputs) + + def test_train(self): + net = caffe.Net(self.f.name, caffe.TRAIN, stages=['train']) + self.check_net(net, ['loss']) + + def test_val(self): + net = caffe.Net(self.f.name, caffe.TEST, stages=['val']) + self.check_net(net, ['loss']) + + def test_deploy(self): + net = caffe.Net(self.f.name, caffe.TEST, stages=['deploy']) + self.check_net(net, ['pred']) + +# if __name__ == '__main__': +# unittest.main() \ No newline at end of file diff --git a/scripts/create_annoset.py b/scripts/create_annoset.py new file mode 100644 index 00000000000..eed11ab9556 --- /dev/null +++ b/scripts/create_annoset.py @@ -0,0 +1,167 @@ +import argparse +import os +import shutil +import subprocess +import sys + +from caffe.proto import caffe_pb2 +from google.protobuf import text_format + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Create AnnotatedDatum database") + parser.add_argument("root", + help="The root directory which contains the images and annotations.") + parser.add_argument("listfile", + help="The file which contains image paths and annotation info.") + parser.add_argument("outdir", + help="The output directory which stores the database file.") + 
parser.add_argument("exampledir", + help="The directory to store the link of the database files.") + parser.add_argument("--redo", default = False, action = "store_true", + help="Recreate the database.") + parser.add_argument("--anno-type", default = "classification", + help="The type of annotation {classification, detection}.") + parser.add_argument("--label-type", default = "xml", + help="The type of label file format for detection {xml, json, txt}.") + parser.add_argument("--backend", default = "lmdb", + help="The backend {lmdb, leveldb} for storing the result") + parser.add_argument("--check-size", default = False, action = "store_true", + help="Check that all the datum have the same size.") + parser.add_argument("--encode-type", default = "", + help="What type should we encode the image as ('png','jpg',...).") + parser.add_argument("--encoded", default = False, action = "store_true", + help="The encoded image will be save in datum.") + parser.add_argument("--gray", default = False, action = "store_true", + help="Treat images as grayscale ones.") + parser.add_argument("--label-map-file", default = "", + help="A file with LabelMap protobuf message.") + parser.add_argument("--min-dim", default = 0, type = int, + help="Minimum dimension images are resized to.") + parser.add_argument("--max-dim", default = 0, type = int, + help="Maximum dimension images are resized to.") + parser.add_argument("--resize-height", default = 0, type = int, + help="Height images are resized to.") + parser.add_argument("--resize-width", default = 0, type = int, + help="Width images are resized to.") + parser.add_argument("--shuffle", default = False, action = "store_true", + help="Randomly shuffle the order of images and their labels.") + parser.add_argument("--check-label", default = False, action = "store_true", + help="Check that there is no duplicated name/label.") + + args = parser.parse_args() + root_dir = args.root + list_file = args.listfile + out_dir = args.outdir + example_dir 
= args.exampledir + + redo = args.redo + anno_type = args.anno_type + label_type = args.label_type + backend = args.backend + check_size = args.check_size + encode_type = args.encode_type + encoded = args.encoded + gray = args.gray + label_map_file = args.label_map_file + min_dim = args.min_dim + max_dim = args.max_dim + resize_height = args.resize_height + resize_width = args.resize_width + shuffle = args.shuffle + check_label = args.check_label + + # check if root directory exists + if not os.path.exists(root_dir): + print "root directory: {} does not exist".format(root_dir) + sys.exit() + # add "/" to root directory if needed + if root_dir[-1] != "/": + root_dir += "/" + # check if list file exists + if not os.path.exists(list_file): + print "list file: {} does not exist".format(list_file) + sys.exit() + # check list file format is correct + with open(list_file, "r") as lf: + for line in lf.readlines(): + img_file, anno = line.strip("\n").split(" ") + if not os.path.exists(root_dir + img_file): + print "image file: {} does not exist".format(root_dir + img_file) + if anno_type == "classification": + if not anno.isdigit(): + print "annotation: {} is not an integer".format(anno) + elif anno_type == "detection": + if not os.path.exists(root_dir + anno): + print "annotation file: {} does not exist".format(root_dir + anno) + sys.exit() + break + # check if label map file exists + if anno_type == "detection": + if not os.path.exists(label_map_file): + print "label map file: {} does not exist".format(label_map_file) + sys.exit() + label_map = caffe_pb2.LabelMap() + lmf = open(label_map_file, "r") + try: + text_format.Merge(str(lmf.read()), label_map) + except: + print "Cannot parse label map file: {}".format(label_map_file) + sys.exit()
+ if os.path.exists(out_dir): + shutil.rmtree(out_dir) + + # get caffe root directory + caffe_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + if anno_type == "detection": + cmd = "{}/build/tools/convert_annoset" \ + " --anno_type={}" \ + " --label_type={}" \ + " --label_map_file={}" \ + " --check_label={}" \ + " --min_dim={}" \ + " --max_dim={}" \ + " --resize_height={}" \ + " --resize_width={}" \ + " --backend={}" \ + " --shuffle={}" \ + " --check_size={}" \ + " --encode_type={}" \ + " --encoded={}" \ + " --gray={}" \ + " {} {} {}" \ + .format(caffe_root, anno_type, label_type, label_map_file, check_label, + min_dim, max_dim, resize_height, resize_width, backend, shuffle, + check_size, encode_type, encoded, gray, root_dir, list_file, out_dir) + elif anno_type == "classification": + cmd = "{}/build/tools/convert_annoset" \ + " --anno_type={}" \ + " --min_dim={}" \ + " --max_dim={}" \ + " --resize_height={}" \ + " --resize_width={}" \ + " --backend={}" \ + " --shuffle={}" \ + " --check_size={}" \ + " --encode_type={}" \ + " --encoded={}" \ + " --gray={}" \ + " {} {} {}" \ + .format(caffe_root, anno_type, min_dim, max_dim, resize_height, + resize_width, backend, shuffle, check_size, encode_type, encoded, + gray, root_dir, list_file, out_dir) + print cmd + process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) + output = process.communicate()[0] + + if not os.path.exists(example_dir): + os.makedirs(example_dir) + link_dir = os.path.join(example_dir, os.path.basename(out_dir)) + if os.path.exists(link_dir): + os.unlink(link_dir) + os.symlink(out_dir, link_dir) diff --git a/scripts/travis/build.sh b/scripts/travis/build.sh index ca459a2531b..980a392ab68 100755 --- a/scripts/travis/build.sh +++ b/scripts/travis/build.sh @@ -5,9 +5,9 @@ BASEDIR=$(dirname $0) source $BASEDIR/defaults.sh if ! 
$WITH_CMAKE ; then - make --jobs $NUM_THREADS all test pycaffe warn + make -j"$(nproc)" all test pycaffe warn else cd build - make --jobs $NUM_THREADS all test.testbin + make -j"$(nproc)" all test.testbin fi make lint diff --git a/scripts/travis/configure-cmake.sh b/scripts/travis/configure-cmake.sh index adafa7eca69..76fb65ad329 100644 --- a/scripts/travis/configure-cmake.sh +++ b/scripts/travis/configure-cmake.sh @@ -19,7 +19,7 @@ fi if $WITH_CUDA ; then # Only build SM50 - ARGS="$ARGS -DCPU_ONLY=Off -DCUDA_ARCH_NAME=Manual -DCUDA_ARCH_BIN=\"50\" -DCUDA_ARCH_PTX=\"\"" + ARGS="$ARGS -DCUDA_ARCH_NAME=Manual -DCUDA_ARCH_BIN=\"50\" -DCUDA_ARCH_PTX=\"\"" fi if $WITH_CUDNN ; then diff --git a/scripts/travis/test.sh b/scripts/travis/test.sh index 24e8ea48b3e..05a35cf7dfe 100755 --- a/scripts/travis/test.sh +++ b/scripts/travis/test.sh @@ -10,10 +10,12 @@ if $WITH_CUDA ; then fi if ! $WITH_CMAKE ; then + make -j"$(nproc)" make runtest make pytest else cd build + make -j"$(nproc)" make runtest make pytest fi diff --git a/src/caffe/batch_transformer.cpp b/src/caffe/batch_transformer.cpp index bb3b15a384f..0d3cdd0b1a0 100644 --- a/src/caffe/batch_transformer.cpp +++ b/src/caffe/batch_transformer.cpp @@ -23,7 +23,7 @@ void BatchTransformer::ResizeQueues(size_t queues_num) { if (queues_num_ > prefetches_free_.size()) { resize(true); } - StartInternalThread(); + StartInternalThread(true); } template @@ -97,7 +97,7 @@ void BatchTransformer::reshape(const vector& data_shape, if (processed_free_.try_peek(&processed_batch)) { processed_batch->data_->Reshape(data_shape); processed_batch->label_->Reshape(label_shape); - if (Caffe::mode() == Caffe::GPU) { + if (preallocate && Caffe::mode() == Caffe::GPU) { processed_batch->data_->template mutable_gpu_data_c(false); processed_batch->label_->template mutable_gpu_data_c(false); } diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index c6e1cc443d7..522d981489d 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -1,7 +1,8 @@ 
#include #include +#if defined(USE_CUDNN) #include - +#endif #include "caffe/blob.hpp" namespace caffe { diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 3f51d19d77f..f5a1a2f7812 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -17,6 +17,7 @@ namespace caffe { // Must be set before brewing Caffe::Brew Caffe::mode_ = Caffe::GPU; int Caffe::solver_count_ = 1; +std::vector Caffe::gpus_; int Caffe::root_device_ = -1; int Caffe::thread_count_ = 0; int Caffe::restored_iter_ = -1; @@ -143,12 +144,10 @@ Caffe::Caffe() void Caffe::init() { if (mode_ == GPU && curand_generator_ == nullptr) { - CURAND_CHECK_ARG(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT), - current_device()); - CURAND_CHECK_ARG(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()), - current_device()); curand_stream_ = CudaStream::create(); - CURAND_CHECK_ARG(curandSetStream(curand_generator_, curand_stream_->get()), current_device()); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); + CURAND_CHECK(curandSetStream(curand_generator_, curand_stream_->get())); } } @@ -162,6 +161,7 @@ Caffe::~Caffe() { } size_t Caffe::min_avail_device_memory() { + std::lock_guard lock(caffe_mutex_); size_t ret = 0UL; const std::vector& cur_gpus = gpus(); int cur_device; @@ -453,23 +453,27 @@ CuDNNHandle::~CuDNNHandle() { } #endif -Caffe::Properties Caffe::props_; +Caffe::Properties& Caffe::props() { + static Caffe::Properties props_; + return props_; +} Caffe::Properties::Properties() : init_time_(std::time(nullptr)), main_thread_id_(lwp_id()), caffe_version_(AS_STRING(CAFFE_VERSION)) { - const int count = Caffe::device_count(); + const std::vector& gpus = Caffe::gpus(); + const int count = gpus.size(); if (count == 0) { return; } compute_capabilities_.resize(count); cudaDeviceProp device_prop; for (int gpu = 0; gpu < 
compute_capabilities_.size(); ++gpu) { - CUDA_CHECK(cudaGetDeviceProperties(&device_prop, gpu)); + CUDA_CHECK(cudaGetDeviceProperties(&device_prop, gpus[gpu])); compute_capabilities_[gpu] = device_prop.major * 100 + device_prop.minor; - DLOG(INFO) << "GPU " << gpu << " '" << device_prop.name << "' has compute capability " - << device_prop.major << "." << device_prop.minor; + DLOG(INFO) << "GPU " << gpus[gpu] << " '" << device_prop.name + << "' has compute capability " << device_prop.major << "." << device_prop.minor; } #ifdef USE_CUDNN cudnn_version_ = std::to_string(cudnnGetVersion()); @@ -520,22 +524,9 @@ std::mutex NVMLInit::m_; NVMLInit::NVMLInit() { if (nvmlInit() != NVML_SUCCESS) { LOG(ERROR) << "NVML failed to initialize"; - return; } else { LOG(INFO) << "NVML initialized, thread " << lwp_id(); } - unsigned int deviceCount = 0U; - if (nvmlDeviceGetCount(&deviceCount) == NVML_SUCCESS) { - for (unsigned int id = 0; id < deviceCount; ++id) { - if (nvmlDeviceGetHandleByIndex(id, &device_) != NVML_SUCCESS || - nvmlDeviceSetCpuAffinity(device_) != NVML_SUCCESS) { - LOG(ERROR) << "NVML failed to set CPU affinity on device " << id - << ", thread " << lwp_id(); - } - } - } else { - LOG(ERROR) << "nvmlDeviceGetCount failed, thread " << lwp_id(); - } } NVMLInit::~NVMLInit() { @@ -543,9 +534,22 @@ NVMLInit::~NVMLInit() { } // set the CPU affinity for this thread -void setCpuAffinity() { +void setCpuAffinity(int device) { std::lock_guard lock(NVMLInit::m_); - static thread_local NVMLInit nvml_init_; + static NVMLInit nvml_init_; + + char pciBusId[16]; + CUDA_CHECK(cudaDeviceGetPCIBusId(pciBusId, 16, device)); + nvmlDevice_t nvml_device; + + if (nvmlDeviceGetHandleByPciBusId(pciBusId, &nvml_device) != NVML_SUCCESS || + nvmlDeviceSetCpuAffinity(nvml_device) != NVML_SUCCESS) { + LOG(ERROR) << "NVML failed to set CPU affinity on device " << device + << ", thread " << lwp_id(); + } else { + LOG(INFO) << "NVML succeeded to set CPU affinity on device " << device + << ", thread 
" << lwp_id(); + } } } // namespace nvml diff --git a/src/caffe/data_reader.cpp b/src/caffe/data_reader.cpp index ba94cbc6e8d..4e23ced613f 100644 --- a/src/caffe/data_reader.cpp +++ b/src/caffe/data_reader.cpp @@ -2,17 +2,20 @@ #include #include "caffe/util/rng.hpp" -#include "caffe/common.hpp" #include "caffe/parallel.hpp" #include "caffe/data_reader.hpp" namespace caffe { -std::mutex DataReader::db_mutex_; -std::mutex DataReader::DataCache::cache_mutex_; -unique_ptr DataReader::DataCache::data_cache_inst_; +template +std::mutex DataReader::db_mutex_; +// https://stackoverflow.com/questions/26935824/gcc-gives-an-undefined-reference-error-to-static-data-members-in-templated-cla -DataReader::DataReader(const LayerParameter& param, +template +std::mutex DataReader::DataCache::cache_mutex_{}; + +template +DataReader::DataReader(const LayerParameter& param, size_t solver_count, size_t solver_rank, size_t parser_threads_num, @@ -55,26 +58,29 @@ DataReader::DataReader(const LayerParameter& param, LOG(INFO) << (sample_only ? 
"Sample " : "") << "Data Reader threads: " << this->threads_num() << ", out queues: " << queues_num_ << ", depth: " << queue_depth_; for (size_t i = 0; i < queues_num_; ++i) { - full_[i] = make_shared>>(); - free_[i] = make_shared>>(); - for (size_t j = 0; j < queue_depth_ - 1U; ++j) { // +1 in InternalThreadEntryN - free_[i]->push(make_shared()); + full_[i] = make_shared>>(); + free_[i] = make_shared>>(); + for (size_t j = 0; j < queue_depth_; ++j) { + free_[i]->push(make_shared()); } } db_source_ = param.data_param().source(); - init_ = make_shared>>(); + init_ = make_shared>>(); StartInternalThread(false, Caffe::next_seed()); } -DataReader::~DataReader() { +template +DataReader::~DataReader() { StopInternalThread(); } -void DataReader::InternalThreadEntry() { +template +void DataReader::InternalThreadEntry() { InternalThreadEntryN(0U); } -void DataReader::InternalThreadEntryN(size_t thread_id) { +template +void DataReader::InternalThreadEntryN(size_t thread_id) { if (cache_) { data_cache_->check_db(db_source_); data_cache_->register_new_thread(); @@ -97,7 +103,7 @@ void DataReader::InternalThreadEntryN(size_t thread_id) { cache_ && !sample_only_, shuffle_ && !sample_only_, epoch_count_required_); - shared_ptr init_datum = make_shared(); + shared_ptr init_datum = make_shared(); cm.fetch(init_datum.get()); init_->push(init_datum); @@ -108,7 +114,7 @@ void DataReader::InternalThreadEntryN(size_t thread_id) { size_t skip = skip_one_batch_ ? 
batch_size_ : 0UL; size_t queue_id, ranked_rec, batch_on_solver, sample_count = 0UL; - shared_ptr datum = make_shared(); + shared_ptr datum = make_shared(); try { while (!must_stop(thread_id)) { cm.next(datum); @@ -137,13 +143,15 @@ void DataReader::InternalThreadEntryN(size_t thread_id) { } } -shared_ptr& DataReader::DataCache::next_new() { +template +shared_ptr& DataReader::DataCache::next_new() { std::lock_guard lock(cache_mutex_); - cache_buffer_.emplace_back(make_shared()); + cache_buffer_.emplace_back(make_shared()); return cache_buffer_.back(); } -shared_ptr& DataReader::DataCache::next_cached(DataReader& reader) { +template +shared_ptr& DataReader::DataCache::next_cached(DataReader& reader) { if (just_cached_.load()) { cache_bar_.wait(); just_cached_.store(false); @@ -170,19 +178,21 @@ shared_ptr& DataReader::DataCache::next_cached(DataReader& reader) { LOG(INFO) << "Shuffling " << cache_buffer_.size() << " records..."; caffe::shuffle(cache_buffer_.begin(), cache_buffer_.end()); } - shared_ptr& datum = cache_buffer_[cache_idx_++]; + shared_ptr& datum = cache_buffer_[cache_idx_++]; if (cache_idx_ >= cache_buffer_.size()) { cache_idx_= 0UL; } return datum; } -void DataReader::DataCache::just_cached() { +template +void DataReader::DataCache::just_cached() { just_cached_.store(true); cached_flags_[lwp_id()]->set(); } -bool DataReader::DataCache::check_memory() { +template +bool DataReader::DataCache::check_memory() { #ifdef __APPLE__ return true; #else @@ -240,7 +250,8 @@ bool DataReader::DataCache::check_memory() { #endif } -DataReader::CursorManager::CursorManager(db::DB* db, DataReader* reader, +template +DataReader::CursorManager::CursorManager(db::DB* db, DataReader* reader, size_t solver_count, size_t solver_rank, size_t parser_threads, size_t parser_thread_id, size_t batch_size, bool cache, bool shuffle, bool epoch_count_required) : db_(db), @@ -261,12 +272,14 @@ DataReader::CursorManager::CursorManager(db::DB* db, DataReader* reader, epoch_count_(0UL), 
epoch_count_required_(epoch_count_required) {} -DataReader::CursorManager::~CursorManager() { +template +DataReader::CursorManager::~CursorManager() { cursor_.reset(); db_->Close(); } -void DataReader::CursorManager::next(shared_ptr& datum) { +template +void DataReader::CursorManager::next(shared_ptr& datum) { if (cached_all_) { datum = reader_->next_cached(); } else { @@ -332,7 +345,8 @@ S1 | r1pt1.q1 --> S1.tr0 S1.q2 <-- rank cycle -> <---------- full cycle -----------> */ -void DataReader::CursorManager::rewind() { +template +void DataReader::CursorManager::rewind() { CHECK(parser_threads_); size_t rank_cycle_begin = rank_cycle_ * solver_rank_; rec_id_ = rank_cycle_begin + parser_thread_id_ * batch_size_; @@ -346,7 +360,8 @@ void DataReader::CursorManager::rewind() { } } -void DataReader::CursorManager::fetch(Datum* datum) { +template<> +void DataReader::CursorManager::fetch(Datum* datum) { C2TensorProtos protos; if (cursor_->parse(&protos) && protos.protos_size() >= 2) { C2TensorProto* image_proto = protos.mutable_protos(0); @@ -378,4 +393,14 @@ void DataReader::CursorManager::fetch(Datum* datum) { // DLOG(INFO) << cursor_->key() << " " << datum->label(); } +template<> +void DataReader::CursorManager::fetch(AnnotatedDatum* datum) { + if (!cursor_->parse(datum)) { + LOG(ERROR) << "Database cursor failed to parse Datum record"; + } +} + +template class DataReader; +template class DataReader; + } // namespace caffe diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp index d0454efa15b..b8894e1f607 100644 --- a/src/caffe/data_transformer.cpp +++ b/src/caffe/data_transformer.cpp @@ -1,23 +1,28 @@ #include #include #include -#include -#include #include "caffe/data_transformer.hpp" +#include "caffe/util/bbox_util.hpp" +#include "caffe/util/im_transforms.hpp" #include "caffe/util/rng.hpp" +#include "caffe/proto/caffe.pb.h" + namespace caffe { -bool DataTransformer::image_random_crop_enabled() const { +template +bool 
DataTransformer::image_random_crop_enabled() const { return this->phase_ == TRAIN && param_.crop_size() > 0; } -bool DataTransformer::image_center_crop_enabled() const { +template +bool DataTransformer::image_center_crop_enabled() const { return this->phase_ == TEST && param_.crop_size() > 0; } -DataTransformer::DataTransformer(const TransformationParameter& param, Phase phase) +template +DataTransformer::DataTransformer(const TransformationParameter& param, Phase phase) : param_(param), phase_(phase), rand_resize_ratio_lower_(param_.rand_resize_ratio_lower()), rand_resize_ratio_upper_(param_.rand_resize_ratio_upper()), @@ -49,7 +54,8 @@ DataTransformer::DataTransformer(const TransformationParameter& param, Phase pha InitRand(); } -void DataTransformer::image_random_resize(const cv::Mat& src, cv::Mat& dst) { +template +void DataTransformer::image_random_resize(const cv::Mat& src, cv::Mat& dst) const { if (!image_random_resize_enabled()) { dst = src; return; @@ -114,7 +120,8 @@ void DataTransformer::image_random_resize(const cv::Mat& src, cv::Mat& dst) { } } -bool DataTransformer::image_random_resize_enabled() const { +template +bool DataTransformer::image_random_resize_enabled() const { const int resize_lower = param_.img_rand_resize_lower(); const int resize_upper = param_.img_rand_resize_upper(); const bool use_rand_resize = @@ -127,7 +134,8 @@ bool DataTransformer::image_random_resize_enabled() const { return resize_lower != 0 || resize_upper != 0 || use_rand_resize; } -void DataTransformer::image_random_crop(int crop_w, int crop_h, cv::Mat& img) { +template +void DataTransformer::image_random_crop(int crop_w, int crop_h, cv::Mat& img) const { CHECK_GT(crop_w, 0) << "crop_w parameter must be positive"; CHECK_GT(crop_h, 0) << "crop_h parameter must be positive"; const int img_width = img.cols; @@ -143,7 +151,8 @@ void DataTransformer::image_random_crop(int crop_w, int crop_h, cv::Mat& img) { img = img(roi).clone(); } -void DataTransformer::image_center_crop(int 
crop_w, int crop_h, cv::Mat& img) { +template +void DataTransformer::image_center_crop(int crop_w, int crop_h, cv::Mat& img) const { CHECK_GT(crop_w, 0) << "center crop_w parameter must be positive"; CHECK_GT(crop_h, 0) << "center crop_h parameter must be positive"; const int img_width = img.cols; @@ -159,7 +168,8 @@ void DataTransformer::image_center_crop(int crop_w, int crop_h, cv::Mat& img) { img = img(roi).clone(); } -void DataTransformer::apply_mean_scale_mirror(const cv::Mat& src, cv::Mat& dst) { +template +void DataTransformer::apply_mean_scale_mirror(const cv::Mat& src, cv::Mat& dst) const { const float scale = param_.scale(); const bool has_mean_file = param_.has_mean_file(); const bool has_mean_values = !mean_values_.empty(); @@ -202,21 +212,24 @@ void DataTransformer::apply_mean_scale_mirror(const cv::Mat& src, cv::Mat& dst) } } -void DataTransformer::InitRand() { +template +void DataTransformer::InitRand() { // Use random_seed setting for deterministic transformations const uint64_t random_seed = param_.random_seed() >= 0 ? static_cast(param_.random_seed()) : Caffe::next_seed(); rng_.reset(new Caffe::RNG(random_seed)); } -unsigned int DataTransformer::Rand() const { +template +unsigned int DataTransformer::Rand() const { CHECK(rng_); caffe::rng_t *rng = static_cast(rng_->generator()); // this doesn't actually produce a uniform distribution return (*rng)(); } -float DataTransformer::Rand(float a, float b) const { +template +float DataTransformer::Rand(float a, float b) const { if (a == b) { return a; } @@ -226,28 +239,71 @@ float DataTransformer::Rand(float a, float b) const { return static_cast(lo + (up - lo) * r / UM); } -// tests only, TODO: clean -void DataTransformer::VariableSizedTransforms(Datum* datum) { - cv::Mat img1, img2; - const int color_mode = param_.force_color() ? 1 : (param_.force_gray() ? 
-1 : 0); - if (datum->encoded()) { - DecodeDatumToCVMat(*datum, color_mode, img1, false); - } else { - DatumToCVMat(*datum, img1, false); - } +template +void DataTransformer::Copy(const cv::Mat& cv_img, Dtype *data) { + const int channels = cv_img.channels(); + const int height = cv_img.rows; + const int width = cv_img.cols; - image_random_resize(img1, img2); + CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte"; - if (image_random_crop_enabled()) { - image_random_crop(param_.crop_size(), param_.crop_size(), img2); + int top_index; + for (int c = 0; c < channels; ++c) { + for (int h = 0; h < height; ++h) { + const uchar *ptr = cv_img.ptr(h); + for (int w = 0; w < width; ++w) { + int img_index = w * channels + c; + top_index = (c * height + h) * width + w; + data[top_index] = static_cast(ptr[img_index]); + } + } + } +} + +template +void DataTransformer::Copy(const Datum& datum, Dtype* data, size_t& out_sizeof_element) { + // If datum is encoded, decoded and transform the cv::image. + if (datum.encoded()) { + CHECK(!(param_.force_color() && param_.force_gray())) + << "cannot set both force_color and force_gray"; + cv::Mat cv_img; + if (param_.force_color() || param_.force_gray()) { + // If force_color then decode in color otherwise decode in gray. + cv_img = DecodeDatumToCVMat(datum, param_.force_color()); + } else { + cv_img = DecodeDatumToCVMatNative(datum); + } + // Transform the cv::image into blob. 
+ Copy(cv_img, data); + out_sizeof_element = sizeof(Dtype); + return; + } else { + if (param_.force_color() || param_.force_gray()) { + LOG(ERROR) << "Force_color and force_gray are for encoded datum only"; + } } - if (image_center_crop_enabled()) { - image_center_crop(param_.crop_size(), param_.crop_size(), img2); + + const string& datum_data = datum.data(); + const int N = datum.channels() * datum.height() * datum.width(); + const void* src_ptr; + if (datum_data.size() > 0) { + CHECK_LE(sizeof(uint8_t), sizeof(Dtype)); + CHECK_EQ(N, datum_data.size()); + out_sizeof_element = sizeof(uint8_t); + src_ptr = &datum_data.front(); + } else { + CHECK_LE(sizeof(float), sizeof(Dtype)); + out_sizeof_element = sizeof(float); + src_ptr = &datum.float_data().Get(0); } - CVMatToDatum(img2, *datum); + cudaStream_t stream = Caffe::thread_stream(); + CUDA_CHECK(cudaMemcpyAsync(data, src_ptr, N * out_sizeof_element, + cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } -void DataTransformer::Fill3Randoms(unsigned int *rand) const { +template +void DataTransformer::Fill3Randoms(unsigned int *rand) const { rand[0] = rand[1] = rand[2] = 0; if (param_.mirror()) { rand[0] = Rand() + 1; @@ -258,4 +314,1330 @@ void DataTransformer::Fill3Randoms(unsigned int *rand) const { } } +template +vector DataTransformer::Transform(const Datum* datum, Dtype* buf, size_t buf_len, + Packing& out_packing, bool repack) { + vector shape; + const bool shape_only = buf == nullptr; + CHECK(!(param_.force_color() && param_.force_gray())) + << "cannot set both force_color and force_gray"; + const int color_mode = param_.force_color() ? 1 : (param_.force_gray() ? 
-1 : 0); + cv::Mat img; + bool v1_path = false; + if (datum->encoded()) { + shape = DecodeDatumToCVMat(*datum, color_mode, img, shape_only, false); + out_packing = NHWC; + } else { + if (image_random_resize_enabled() || buf == nullptr || buf_len == 0UL) { + shape = DatumToCVMat(*datum, img, shape_only); + out_packing = NHWC; + } else { + // here we can use fast V1 path + TransformV1(*datum, buf, buf_len); + shape = vector{1, datum->channels(), datum->height(), datum->width()}; + v1_path = true; + out_packing = NCHW; + } + } + if (param_.crop_size() > 0) { + shape[2] = param_.crop_size(); + shape[3] = param_.crop_size(); + } + if (!shape_only && !v1_path) { + CHECK_NOTNULL(img.data); + Transform(img, buf, buf_len, repack); + out_packing = NHWC; + } + return shape; +} + + +template +void DataTransformer::Transform(const cv::Mat& src, Dtype* buf, size_t buf_len, + bool repack) const { + cv::Mat tmp, dst; + + image_random_resize(src, tmp); + + const int crop_w = param_.crop_w() > 0 ? param_.crop_w() : param_.crop_size(); + const int crop_h = param_.crop_h() > 0 ? 
param_.crop_h() : param_.crop_size(); + if (image_random_crop_enabled()) { + image_random_crop(crop_w, crop_h, tmp); + } else if (image_center_crop_enabled()) { + image_center_crop(crop_w, crop_h, tmp); + } + apply_mean_scale_mirror(tmp, dst); + FloatCVMatToBuf(dst, buf_len, buf, repack); +} + +template +void DataTransformer::Transform(const vector& mat_vector, + TBlob* transformed_blob) const { + const size_t mat_num = mat_vector.size(); + const int num = transformed_blob->num(); + CHECK_GT(mat_num, 0) << "There is no MAT to add"; + CHECK_EQ(mat_num, num) << "The size of mat_vector must be equals to transformed_blob->num()"; + cv::Mat dst; + size_t buf_len = transformed_blob->offset(1); + for (size_t item_id = 0; item_id < mat_num; ++item_id) { + size_t offset = transformed_blob->offset(item_id); + apply_mean_scale_mirror(mat_vector[item_id], dst); + FloatCVMatToBuf(dst, buf_len, transformed_blob->mutable_cpu_data(false) + offset); + } +} + +template +void DataTransformer::Transform(const vector& datum_vector, + TBlob *transformed_blob) const { + const int datum_num = datum_vector.size(); + const int num = transformed_blob->num(); + const int channels = transformed_blob->channels(); + const int height = transformed_blob->height(); + const int width = transformed_blob->width(); + + CHECK_GT(datum_num, 0) << "There is no datum to add"; + CHECK_LE(datum_num, num) + << "The size of datum_vector must be no greater than transformed_blob->num()"; + TBlob uni_blob(1, channels, height, width); + for (int item_id = 0; item_id < datum_num; ++item_id) { + int offset = transformed_blob->offset(item_id); + uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset); + Transform(datum_vector[item_id], &uni_blob); + } +} + +template +void DataTransformer::Transform(const Datum& datum, + TBlob* transformed_blob, + NormalizedBBox* crop_bbox, + bool* do_mirror) { + // If datum is encoded, decoded and transform the cv::image. 
+ if (datum.encoded()) { + CHECK(!(param_.force_color() && param_.force_gray())) + << "cannot set both force_color and force_gray"; + cv::Mat cv_img; + if (param_.force_color() || param_.force_gray()) { + // If force_color then decode in color otherwise decode in gray. + cv_img = DecodeDatumToCVMat(datum, param_.force_color()); + } else { + cv_img = DecodeDatumToCVMatNative(datum); + } + // Transform the cv::image into blob. + return Transform(cv_img, transformed_blob, crop_bbox, do_mirror); + } else { + if (param_.force_color() || param_.force_gray()) { + LOG(ERROR) << "force_color and force_gray only for encoded datum"; + } + } + + const int crop_size = param_.crop_size(); + const int datum_channels = datum.channels(); + const int datum_height = datum.height(); + const int datum_width = datum.width(); + + // Check dimensions. + const int channels = transformed_blob->channels(); + const int height = transformed_blob->height(); + const int width = transformed_blob->width(); + const int num = transformed_blob->num(); + + CHECK_EQ(channels, datum_channels); + CHECK_LE(height, datum_height); + CHECK_LE(width, datum_width); + CHECK_GE(num, 1); + + if (crop_size) { + CHECK_EQ(crop_size, height); + CHECK_EQ(crop_size, width); + } else { + CHECK_EQ(datum_height, height); + CHECK_EQ(datum_width, width); + } + + Dtype* transformed_data = transformed_blob->mutable_cpu_data(); + Transform(datum, transformed_data, crop_bbox, do_mirror); +} + + +template +void DataTransformer::Transform( + const AnnotatedDatum& anno_datum, TBlob* transformed_blob, + RepeatedPtrField* transformed_anno_group_all, + bool* do_mirror) { + // Transform datum. + const Datum& datum = anno_datum.datum(); + NormalizedBBox crop_bbox; + Transform(datum, transformed_blob, &crop_bbox, do_mirror); + + // Transform annotation. 
+ const bool do_resize = true; + TransformAnnotation(anno_datum, do_resize, crop_bbox, *do_mirror, + transformed_anno_group_all); +} + +template +void DataTransformer::Transform( + const AnnotatedDatum& anno_datum, TBlob* transformed_blob, + RepeatedPtrField* transformed_anno_group_all) { + bool do_mirror; + Transform(anno_datum, transformed_blob, transformed_anno_group_all, + &do_mirror); +} + +template +void DataTransformer::Transform( + const AnnotatedDatum& anno_datum, TBlob* transformed_blob, + vector* transformed_anno_vec, bool* do_mirror) { + RepeatedPtrField transformed_anno_group_all; + Transform(anno_datum, transformed_blob, &transformed_anno_group_all, + do_mirror); + for (int g = 0; g < transformed_anno_group_all.size(); ++g) { + transformed_anno_vec->push_back(transformed_anno_group_all.Get(g)); + } +} + +template +void DataTransformer::Transform( + const AnnotatedDatum& anno_datum, TBlob* transformed_blob, + vector* transformed_anno_vec) { + bool do_mirror; + Transform(anno_datum, transformed_blob, transformed_anno_vec, &do_mirror); +} + +// Transform and return the transformation information. 
+template +void DataTransformer::Transform(const Datum& datum, Dtype* transformed_data, + NormalizedBBox* crop_bbox, bool* do_mirror) { + const string& data = datum.data(); + const int datum_channels = datum.channels(); + const int datum_height = datum.height(); + const int datum_width = datum.width(); + + const int crop_size = param_.crop_size(); + const Dtype scale = param_.scale(); + *do_mirror = param_.mirror() && Rand(2); + const bool has_mean_file = param_.has_mean_file(); + const bool has_uint8 = data.size() > 0; + const bool has_mean_values = mean_values_.size() > 0; + + CHECK_GT(datum_channels, 0); + CHECK_GE(datum_height, crop_size); + CHECK_GE(datum_width, crop_size); + + float* mean = NULL; + if (has_mean_file) { + CHECK_EQ(datum_channels, data_mean_.channels()); + CHECK_EQ(datum_height, data_mean_.height()); + CHECK_EQ(datum_width, data_mean_.width()); + mean = data_mean_.mutable_cpu_data(); + } + if (has_mean_values) { + CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) << + "Specify either 1 mean_value or as many as channels: " << datum_channels; + if (datum_channels > 1 && mean_values_.size() == 1) { + // Replicate the mean_value for simplicity + for (int c = 1; c < datum_channels; ++c) { + mean_values_.push_back(mean_values_[0]); + } + } + } + + int height = datum_height; + int width = datum_width; + + int h_off = 0; + int w_off = 0; + if (crop_size) { + height = crop_size; + width = crop_size; + // We only do random crop when we do training. + if (phase_ == TRAIN) { + h_off = Rand(datum_height - crop_size + 1); + w_off = Rand(datum_width - crop_size + 1); + } else { + h_off = (datum_height - crop_size) / 2; + w_off = (datum_width - crop_size) / 2; + } + } + + // Return the normalized crop bbox. 
+ crop_bbox->set_xmin(Dtype(w_off) / datum_width); + crop_bbox->set_ymin(Dtype(h_off) / datum_height); + crop_bbox->set_xmax(Dtype(w_off + width) / datum_width); + crop_bbox->set_ymax(Dtype(h_off + height) / datum_height); + + Dtype datum_element; + int top_index, data_index; + for (int c = 0; c < datum_channels; ++c) { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + data_index = (c * datum_height + h_off + h) * datum_width + w_off + w; + if (*do_mirror) { + top_index = (c * height + h) * width + (width - 1 - w); + } else { + top_index = (c * height + h) * width + w; + } + if (has_uint8) { + datum_element = + static_cast(static_cast(data[data_index])); + } else { + datum_element = datum.float_data(data_index); + } + if (has_mean_file) { + transformed_data[top_index] = + (datum_element - mean[data_index]) * scale; + } else { + if (has_mean_values) { + transformed_data[top_index] = + (datum_element - mean_values_[c]) * scale; + } else { + transformed_data[top_index] = datum_element * scale; + } + } + } + } + } +} + + +template +void DataTransformer::TransformAnnotation( + const AnnotatedDatum& anno_datum, const bool do_resize, + const NormalizedBBox& crop_bbox, const bool do_mirror, + RepeatedPtrField* transformed_anno_group_all) { + const int img_height = anno_datum.datum().height(); + const int img_width = anno_datum.datum().width(); + if (anno_datum.type() == AnnotatedDatum_AnnotationType_BBOX) { + // Go through each AnnotationGroup. + for (int g = 0; g < anno_datum.annotation_group_size(); ++g) { + const AnnotationGroup& anno_group = anno_datum.annotation_group(g); + AnnotationGroup transformed_anno_group; + // Go through each Annotation. + bool has_valid_annotation = false; + for (int a = 0; a < anno_group.annotation_size(); ++a) { + const Annotation& anno = anno_group.annotation(a); + const NormalizedBBox& bbox = anno.bbox(); + // Adjust bounding box annotation. 
+ NormalizedBBox resize_bbox = bbox; + if (do_resize && param_.has_resize_param()) { + CHECK_GT(img_height, 0); + CHECK_GT(img_width, 0); + UpdateBBoxByResizePolicy(param_.resize_param(), img_width, img_height, + &resize_bbox); + } + if (param_.has_emit_constraint() && + !MeetEmitConstraint(crop_bbox, resize_bbox, + param_.emit_constraint())) { + continue; + } + NormalizedBBox proj_bbox; + if (ProjectBBox(crop_bbox, resize_bbox, &proj_bbox)) { + has_valid_annotation = true; + Annotation* transformed_anno = + transformed_anno_group.add_annotation(); + transformed_anno->set_instance_id(anno.instance_id()); + NormalizedBBox* transformed_bbox = transformed_anno->mutable_bbox(); + transformed_bbox->CopyFrom(proj_bbox); + if (do_mirror) { + Dtype temp = transformed_bbox->xmin(); + transformed_bbox->set_xmin(1 - transformed_bbox->xmax()); + transformed_bbox->set_xmax(1 - temp); + } + if (do_resize && param_.has_resize_param()) { + ExtrapolateBBox(param_.resize_param(), img_height, img_width, + crop_bbox, transformed_bbox); + } + } + } + // Save for output. + if (has_valid_annotation) { + transformed_anno_group.set_group_label(anno_group.group_label()); + transformed_anno_group_all->Add()->CopyFrom(transformed_anno_group); + } + } + } else { + LOG(FATAL) << "Unknown annotation type."; + } +} + +template +void DataTransformer::CropImage(const Datum& datum, + const NormalizedBBox& bbox, + Datum* crop_datum) { + // If datum is encoded, decode and crop the cv::image. + if (datum.encoded()) { + CHECK(!(param_.force_color() && param_.force_gray())) + << "cannot set both force_color and force_gray"; + cv::Mat cv_img; + if (param_.force_color() || param_.force_gray()) { + // If force_color then decode in color otherwise decode in gray. + cv_img = DecodeDatumToCVMat(datum, param_.force_color()); + } else { + cv_img = DecodeDatumToCVMatNative(datum); + } + // Crop the image. + cv::Mat crop_img; + CropImage(cv_img, bbox, &crop_img); + // Save the image into datum. 
+ EncodeCVMatToDatum(crop_img, "jpg", crop_datum); + crop_datum->set_label(datum.label()); + return; + } else { + if (param_.force_color() || param_.force_gray()) { + LOG(ERROR) << "force_color and force_gray only for encoded datum"; + } + } + + const int datum_channels = datum.channels(); + const int datum_height = datum.height(); + const int datum_width = datum.width(); + + // Get the bbox dimension. + NormalizedBBox clipped_bbox; + ClipBBox(bbox, &clipped_bbox); + NormalizedBBox scaled_bbox; + ScaleBBox(clipped_bbox, datum_height, datum_width, &scaled_bbox); + const int w_off = static_cast(scaled_bbox.xmin()); + const int h_off = static_cast(scaled_bbox.ymin()); + const int width = static_cast(scaled_bbox.xmax() - scaled_bbox.xmin()); + const int height = static_cast(scaled_bbox.ymax() - scaled_bbox.ymin()); + + // Crop the image using bbox. + crop_datum->set_channels(datum_channels); + crop_datum->set_height(height); + crop_datum->set_width(width); + crop_datum->set_label(datum.label()); + crop_datum->clear_data(); + crop_datum->clear_float_data(); + crop_datum->set_encoded(false); + const int crop_datum_size = datum_channels * height * width; + const std::string& datum_buffer = datum.data(); + std::string buffer(crop_datum_size, ' '); + for (int h = h_off; h < h_off + height; ++h) { + for (int w = w_off; w < w_off + width; ++w) { + for (int c = 0; c < datum_channels; ++c) { + int datum_index = (c * datum_height + h) * datum_width + w; + int crop_datum_index = (c * height + h - h_off) * width + w - w_off; + buffer[crop_datum_index] = datum_buffer[datum_index]; + } + } + } + crop_datum->set_data(buffer); +} + +template +void DataTransformer::CropImage(const AnnotatedDatum& anno_datum, + const NormalizedBBox& bbox, + AnnotatedDatum* cropped_anno_datum) { + // Crop the datum. + CropImage(anno_datum.datum(), bbox, cropped_anno_datum->mutable_datum()); + cropped_anno_datum->set_type(anno_datum.type()); + + // Transform the annotation according to crop_bbox. 
+ const bool do_resize = false; + const bool do_mirror = false; + NormalizedBBox crop_bbox; + ClipBBox(bbox, &crop_bbox); + TransformAnnotation(anno_datum, do_resize, crop_bbox, do_mirror, + cropped_anno_datum->mutable_annotation_group()); +} + +template +void DataTransformer::ExpandImage(const Datum& datum, + const float expand_ratio, + NormalizedBBox* expand_bbox, + Datum* expand_datum) { + // If datum is encoded, decode and crop the cv::image. + if (datum.encoded()) { + CHECK(!(param_.force_color() && param_.force_gray())) + << "cannot set both force_color and force_gray"; + cv::Mat cv_img; + if (param_.force_color() || param_.force_gray()) { + // If force_color then decode in color otherwise decode in gray. + cv_img = DecodeDatumToCVMat(datum, param_.force_color()); + } else { + cv_img = DecodeDatumToCVMatNative(datum); + } + // Expand the image. + cv::Mat expand_img; + ExpandImage(cv_img, expand_ratio, expand_bbox, &expand_img); + // Save the image into datum. + EncodeCVMatToDatum(expand_img, "jpg", expand_datum); + expand_datum->set_label(datum.label()); + return; + } else { + if (param_.force_color() || param_.force_gray()) { + LOG(ERROR) << "force_color and force_gray only for encoded datum"; + } + } + + const int datum_channels = datum.channels(); + const int datum_height = datum.height(); + const int datum_width = datum.width(); + + // Get the bbox dimension. + int height = static_cast(datum_height * expand_ratio); + int width = static_cast(datum_width * expand_ratio); + float h_off, w_off; + caffe_rng_uniform(1, 0.f, static_cast(height - datum_height), &h_off); + caffe_rng_uniform(1, 0.f, static_cast(width - datum_width), &w_off); + h_off = floor(h_off); + w_off = floor(w_off); + expand_bbox->set_xmin(-w_off/datum_width); + expand_bbox->set_ymin(-h_off/datum_height); + expand_bbox->set_xmax((width - w_off)/datum_width); + expand_bbox->set_ymax((height - h_off)/datum_height); + + // Crop the image using bbox. 
+ expand_datum->set_channels(datum_channels); + expand_datum->set_height(height); + expand_datum->set_width(width); + expand_datum->set_label(datum.label()); + expand_datum->clear_data(); + expand_datum->clear_float_data(); + expand_datum->set_encoded(false); + const int expand_datum_size = datum_channels * height * width; + const std::string& datum_buffer = datum.data(); + std::string buffer(expand_datum_size, ' '); + for (int h = h_off; h < h_off + datum_height; ++h) { + for (int w = w_off; w < w_off + datum_width; ++w) { + for (int c = 0; c < datum_channels; ++c) { + int datum_index = + (c * datum_height + h - h_off) * datum_width + w - w_off; + int expand_datum_index = (c * height + h) * width + w; + buffer[expand_datum_index] = datum_buffer[datum_index]; + } + } + } + expand_datum->set_data(buffer); +} + +template +void DataTransformer::ExpandImage(const AnnotatedDatum& anno_datum, + AnnotatedDatum* expanded_anno_datum) { + if (!param_.has_expand_param()) { + expanded_anno_datum->CopyFrom(anno_datum); + return; + } + const ExpansionParameter& expand_param = param_.expand_param(); + const float expand_prob = expand_param.prob(); + float prob; + caffe_rng_uniform(1, 0.f, 1.f, &prob); + if (prob > expand_prob) { + expanded_anno_datum->CopyFrom(anno_datum); + return; + } + const float max_expand_ratio = expand_param.max_expand_ratio(); + if (fabs(max_expand_ratio - 1.) < 1e-2) { + expanded_anno_datum->CopyFrom(anno_datum); + return; + } + float expand_ratio; + caffe_rng_uniform(1, 1.f, max_expand_ratio, &expand_ratio); + // Expand the datum. + NormalizedBBox expand_bbox; + ExpandImage(anno_datum.datum(), expand_ratio, &expand_bbox, + expanded_anno_datum->mutable_datum()); + expanded_anno_datum->set_type(anno_datum.type()); + + // Transform the annotation according to crop_bbox. 
+ const bool do_resize = false; + const bool do_mirror = false; + TransformAnnotation(anno_datum, do_resize, expand_bbox, do_mirror, + expanded_anno_datum->mutable_annotation_group()); +} + +template +void DataTransformer::DistortImage(const Datum& datum, + Datum* distort_datum) { + if (!param_.has_distort_param()) { + distort_datum->CopyFrom(datum); + return; + } + // If datum is encoded, decode and crop the cv::image. + if (datum.encoded()) { + CHECK(!(param_.force_color() && param_.force_gray())) + << "cannot set both force_color and force_gray"; + cv::Mat cv_img; + if (param_.force_color() || param_.force_gray()) { + // If force_color then decode in color otherwise decode in gray. + cv_img = DecodeDatumToCVMat(datum, param_.force_color()); + } else { + cv_img = DecodeDatumToCVMatNative(datum); + } + // Distort the image. + cv::Mat distort_img = ApplyDistort(cv_img, param_.distort_param()); + // Save the image into datum. + EncodeCVMatToDatum(distort_img, "jpg", distort_datum); + distort_datum->set_label(datum.label()); + return; + } else { + LOG(ERROR) << "Only support encoded datum now"; + } +} + +template +void DataTransformer::Transform(const cv::Mat& cv_img, + TBlob* transformed_blob, + NormalizedBBox* crop_bbox, + bool* do_mirror) { + // Check dimensions. 
+ const int img_channels = cv_img.channels(); + const int channels = transformed_blob->channels(); + const int height = transformed_blob->height(); + const int width = transformed_blob->width(); + const int num = transformed_blob->num(); + + CHECK_GT(img_channels, 0); + CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte"; + CHECK_EQ(channels, img_channels); + CHECK_GE(num, 1); + + const int crop_size = param_.crop_size(); + const Dtype scale = param_.scale(); + *do_mirror = param_.mirror() && Rand(2); + const bool has_mean_file = param_.has_mean_file(); + const bool has_mean_values = mean_values_.size() > 0; + + float* mean = NULL; + if (has_mean_file) { + CHECK_EQ(img_channels, data_mean_.channels()); + mean = data_mean_.mutable_cpu_data(); + } + if (has_mean_values) { + CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) << + "Specify either 1 mean_value or as many as channels: " << img_channels; + if (img_channels > 1 && mean_values_.size() == 1) { + // Replicate the mean_value for simplicity + for (int c = 1; c < img_channels; ++c) { + mean_values_.push_back(mean_values_[0]); + } + } + } + + int crop_h = param_.crop_h(); + int crop_w = param_.crop_w(); + if (crop_size) { + crop_h = crop_size; + crop_w = crop_size; + } + + cv::Mat cv_resized_image, cv_noised_image, cv_cropped_image; + if (param_.has_resize_param()) { + cv_resized_image = ApplyResize(cv_img, param_.resize_param()); + } else { + cv_resized_image = cv_img; + } + if (param_.has_noise_param()) { + cv_noised_image = ApplyNoise(cv_resized_image, param_.noise_param()); + } else { + cv_noised_image = cv_resized_image; + } + int img_height = cv_noised_image.rows; + int img_width = cv_noised_image.cols; + CHECK_GE(img_height, crop_h); + CHECK_GE(img_width, crop_w); + + int h_off = 0; + int w_off = 0; + if ((crop_h > 0) && (crop_w > 0)) { + CHECK_EQ(crop_h, height); + CHECK_EQ(crop_w, width); + // We only do random crop when we do training. 
+ if (phase_ == TRAIN) { + h_off = Rand(img_height - crop_h + 1); + w_off = Rand(img_width - crop_w + 1); + } else { + h_off = (img_height - crop_h) / 2; + w_off = (img_width - crop_w) / 2; + } + cv::Rect roi(w_off, h_off, crop_w, crop_h); + cv_cropped_image = cv_noised_image(roi); + } else { + cv_cropped_image = cv_noised_image; + } + + // Return the normalized crop bbox. + crop_bbox->set_xmin(Dtype(w_off) / img_width); + crop_bbox->set_ymin(Dtype(h_off) / img_height); + crop_bbox->set_xmax(Dtype(w_off + width) / img_width); + crop_bbox->set_ymax(Dtype(h_off + height) / img_height); + + if (has_mean_file) { + CHECK_EQ(cv_cropped_image.rows, data_mean_.height()); + CHECK_EQ(cv_cropped_image.cols, data_mean_.width()); + } + CHECK(cv_cropped_image.data); + + Dtype* transformed_data = transformed_blob->mutable_cpu_data(); + int top_index; + for (int h = 0; h < height; ++h) { + const uchar* ptr = cv_cropped_image.ptr(h); + int img_index = 0; + int h_idx = h; + for (int w = 0; w < width; ++w) { + int w_idx = w; + if (*do_mirror) { + w_idx = (width - 1 - w); + } + int h_idx_real = h_idx; + int w_idx_real = w_idx; + for (int c = 0; c < img_channels; ++c) { + top_index = (c * height + h_idx_real) * width + w_idx_real; + Dtype pixel = static_cast(ptr[img_index++]); + if (has_mean_file) { + int mean_index = (c * img_height + h_off + h_idx_real) * img_width + + w_off + w_idx_real; + transformed_data[top_index] = + (pixel - mean[mean_index]) * scale; + } else { + if (has_mean_values) { + transformed_data[top_index] = + (pixel - mean_values_[c]) * scale; + } else { + transformed_data[top_index] = pixel * scale; + } + } + } + } + } +} + +template +void DataTransformer::TransformInv(const Dtype* data, cv::Mat* cv_img, + const int height, const int width, + const int channels) { + const Dtype scale = param_.scale(); + const bool has_mean_file = param_.has_mean_file(); + const bool has_mean_values = mean_values_.size() > 0; + + float* mean = NULL; + if (has_mean_file) { + 
CHECK_EQ(channels, data_mean_.channels()); + CHECK_EQ(height, data_mean_.height()); + CHECK_EQ(width, data_mean_.width()); + mean = data_mean_.mutable_cpu_data(); + } + if (has_mean_values) { + CHECK(mean_values_.size() == 1 || mean_values_.size() == channels) << + "Specify either 1 mean_value or as many as channels: " << channels; + if (channels > 1 && mean_values_.size() == 1) { + // Replicate the mean_value for simplicity + for (int c = 1; c < channels; ++c) { + mean_values_.push_back(mean_values_[0]); + } + } + } + + const int img_type = channels == 3 ? CV_8UC3 : CV_8UC1; + cv::Mat orig_img(height, width, img_type, cv::Scalar(0, 0, 0)); + for (int h = 0; h < height; ++h) { + uchar* ptr = orig_img.ptr(h); + int img_idx = 0; + for (int w = 0; w < width; ++w) { + for (int c = 0; c < channels; ++c) { + int idx = (c * height + h) * width + w; + if (has_mean_file) { + ptr[img_idx++] = static_cast(data[idx] / scale + mean[idx]); + } else { + if (has_mean_values) { + ptr[img_idx++] = + static_cast(data[idx] / scale + mean_values_[c]); + } else { + ptr[img_idx++] = static_cast(data[idx] / scale); + } + } + } + } + } + + if (param_.has_resize_param()) { + *cv_img = ApplyResize(orig_img, param_.resize_param()); + } else { + *cv_img = orig_img; + } +} + +template +void DataTransformer::TransformInv(const Blob* blob, + vector* cv_imgs) { + const int channels = blob->channels(); + const int height = blob->height(); + const int width = blob->width(); + const int num = blob->num(); + CHECK_GE(num, 1); + const Dtype* image_data = blob->cpu_data(); + + for (int i = 0; i < num; ++i) { + cv::Mat cv_img; + TransformInv(image_data, &cv_img, height, width, channels); + cv_imgs->push_back(cv_img); + image_data += blob->offset(1); + } +} + +template +void DataTransformer::CropImage(const cv::Mat& img, + const NormalizedBBox& bbox, + cv::Mat* crop_img) { + const int img_height = img.rows; + const int img_width = img.cols; + + // Get the bbox dimension. 
+ NormalizedBBox clipped_bbox; + ClipBBox(bbox, &clipped_bbox); + NormalizedBBox scaled_bbox; + ScaleBBox(clipped_bbox, img_height, img_width, &scaled_bbox); + + // Crop the image using bbox. + int w_off = static_cast(scaled_bbox.xmin()); + int h_off = static_cast(scaled_bbox.ymin()); + int width = static_cast(scaled_bbox.xmax() - scaled_bbox.xmin()); + int height = static_cast(scaled_bbox.ymax() - scaled_bbox.ymin()); + cv::Rect bbox_roi(w_off, h_off, width, height); + + img(bbox_roi).copyTo(*crop_img); +} + +template +void DataTransformer::ExpandImage(const cv::Mat& img, + const float expand_ratio, + NormalizedBBox* expand_bbox, + cv::Mat* expand_img) { + const int img_height = img.rows; + const int img_width = img.cols; + const int img_channels = img.channels(); + + // Get the bbox dimension. + int height = static_cast(img_height * expand_ratio); + int width = static_cast(img_width * expand_ratio); + float h_off, w_off; + caffe_rng_uniform(1, 0.f, static_cast(height - img_height), &h_off); + caffe_rng_uniform(1, 0.f, static_cast(width - img_width), &w_off); + h_off = floor(h_off); + w_off = floor(w_off); + expand_bbox->set_xmin(-w_off/img_width); + expand_bbox->set_ymin(-h_off/img_height); + expand_bbox->set_xmax((width - w_off)/img_width); + expand_bbox->set_ymax((height - h_off)/img_height); + + expand_img->create(height, width, img.type()); + expand_img->setTo(cv::Scalar(0)); + const bool has_mean_file = param_.has_mean_file(); + const bool has_mean_values = mean_values_.size() > 0; + + if (has_mean_file) { + CHECK_EQ(img_channels, data_mean_.channels()); + CHECK_EQ(height, data_mean_.height()); + CHECK_EQ(width, data_mean_.width()); + float* mean = data_mean_.mutable_cpu_data(); + for (int h = 0; h < height; ++h) { + uchar* ptr = expand_img->ptr(h); + int img_index = 0; + for (int w = 0; w < width; ++w) { + for (int c = 0; c < img_channels; ++c) { + int blob_index = (c * height + h) * width + w; + ptr[img_index++] = static_cast(mean[blob_index]); + } + } + } 
+ } + if (has_mean_values) { + CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) << + "Specify either 1 mean_value or as many as channels: " << img_channels; + if (img_channels > 1 && mean_values_.size() == 1) { + // Replicate the mean_value for simplicity + for (int c = 1; c < img_channels; ++c) { + mean_values_.push_back(mean_values_[0]); + } + } + vector channels(img_channels); + cv::split(*expand_img, channels); + CHECK_EQ(channels.size(), mean_values_.size()); + for (int c = 0; c < img_channels; ++c) { + channels[c] = mean_values_[c]; + } + cv::merge(channels, *expand_img); + } + + cv::Rect bbox_roi(w_off, h_off, img_width, img_height); + img.copyTo((*expand_img)(bbox_roi)); +} + +template +vector DataTransformer::InferBlobShape(const Datum& datum) { + if (datum.encoded()) { + CHECK(!(param_.force_color() && param_.force_gray())) + << "cannot set both force_color and force_gray"; + cv::Mat cv_img; + if (param_.force_color() || param_.force_gray()) { + // If force_color then decode in color otherwise decode in gray. + cv_img = DecodeDatumToCVMat(datum, param_.force_color()); + } else { + cv_img = DecodeDatumToCVMatNative(datum); + } + // InferBlobShape using the cv::image. + return InferBlobShape(cv_img); + } + const int crop_size = param_.crop_size(); + int crop_h = param_.crop_h(); + int crop_w = param_.crop_w(); + if (crop_size) { + crop_h = crop_size; + crop_w = crop_size; + } + const int datum_channels = datum.channels(); + int datum_height = datum.height(); + int datum_width = datum.width(); + + // Check dimensions. + CHECK_GT(datum_channels, 0); + if (param_.has_resize_param()) { + InferNewSize(param_.resize_param(), datum_width, datum_height, + &datum_width, &datum_height); + } + CHECK_GE(datum_height, crop_h); + CHECK_GE(datum_width, crop_w); + + // Build BlobShape. + vector shape(4); + shape[0] = 1; + shape[1] = datum_channels; + shape[2] = (crop_h)? crop_h: datum_height; + shape[3] = (crop_w)? 
crop_w: datum_width; + return shape; +} + +template +vector DataTransformer::InferBlobShape( + const vector & datum_vector) { + const int num = datum_vector.size(); + CHECK_GT(num, 0) << "There is no datum to in the vector"; + // Use first datum in the vector to InferBlobShape. + vector shape = InferBlobShape(datum_vector[0]); + // Adjust num to the size of the vector. + shape[0] = num; + return shape; +} + +template +vector DataTransformer::InferBlobShape(const cv::Mat& cv_img) { + const int crop_size = param_.crop_size(); + int crop_h = param_.crop_h(); + int crop_w = param_.crop_w(); + if (crop_size) { + crop_h = crop_size; + crop_w = crop_size; + } + const int img_channels = cv_img.channels(); + int img_height = cv_img.rows; + int img_width = cv_img.cols; + // Check dimensions. + CHECK_GT(img_channels, 0); + if (param_.has_resize_param()) { + InferNewSize(param_.resize_param(), img_width, img_height, + &img_width, &img_height); + } + CHECK_GE(img_height, crop_h); + CHECK_GE(img_width, crop_w); + + // Build BlobShape. + vector shape(4); + shape[0] = 1; + shape[1] = img_channels; + shape[2] = (crop_h)? crop_h: img_height; + shape[3] = (crop_w)? crop_w: img_width; + return shape; +} + +template +vector DataTransformer::InferDatumShape(const Datum& datum) { + if (datum.encoded()) { + CHECK(!(param_.force_color() && param_.force_gray())) + << "cannot set both force_color and force_gray"; + cv::Mat cv_img; + if (param_.force_color() || param_.force_gray()) { + // If force_color then decode in color otherwise decode in gray. + cv_img = DecodeDatumToCVMat(datum, param_.force_color()); + } else { + cv_img = DecodeDatumToCVMatNative(datum); + } + // Infer shape using the cv::image. 
+ return InferCVMatShape(cv_img); + } + const int datum_channels = datum.channels(); + const int datum_height = datum.height(); + const int datum_width = datum.width(); + vector datum_shape(4); + datum_shape[0] = 1; + datum_shape[1] = datum_channels; + datum_shape[2] = datum_height; + datum_shape[3] = datum_width; + return datum_shape; +} + +template +vector DataTransformer::InferCVMatShape(const cv::Mat& cv_img) { + int img_channels = cv_img.channels(); + int img_height = cv_img.rows; + int img_width = cv_img.cols; + vector shape(4); + shape[0] = 1; + shape[1] = img_channels; + shape[2] = img_height; + shape[3] = img_width; + + const int crop_size = param_.crop_size(); + int crop_h = param_.crop_h(); + int crop_w = param_.crop_w(); + if (crop_size) { + crop_h = crop_size; + crop_w = crop_size; + } + + // Check dimensions. + if (param_.has_resize_param()) { + InferNewSize(param_.resize_param(), img_width, img_height, + &img_width, &img_height); + } + CHECK_GE(img_height, crop_h); + CHECK_GE(img_width, crop_w); + + // Build BlobShape. + shape[0] = 1; + shape[1] = img_channels; + shape[2] = (crop_h)? crop_h: img_height; + shape[3] = (crop_w)? crop_w: img_width; + + return shape; +} + + +template +vector DataTransformer::InferBlobShape(const vector& bottom_shape, bool use_gpu) { + const int crop_size = param_.crop_size(); + CHECK_EQ(bottom_shape.size(), 4); + CHECK_EQ(bottom_shape[0], 1); + const int bottom_channels = bottom_shape[1]; + const int bottom_height = bottom_shape[2]; + const int bottom_width = bottom_shape[3]; + // Check dimensions. + CHECK_GT(bottom_channels, 0); + CHECK_GE(bottom_height, crop_size); + CHECK_GE(bottom_width, crop_size); + // Build BlobShape. + vector top_shape(4); + top_shape[0] = 1; + top_shape[1] = bottom_channels; + // if using GPU transform, don't crop + if (use_gpu) { + top_shape[2] = bottom_height; + top_shape[3] = bottom_width; + } else { + top_shape[2] = (crop_size) ? crop_size : bottom_height; + top_shape[3] = (crop_size) ? 
crop_size : bottom_width; + } + return top_shape; +} + +template +void DataTransformer::Transform(const Datum& datum, + Dtype *transformed_data, const std::array& rand) { + const string& data = datum.data(); + const int datum_channels = datum.channels(); + const int datum_height = datum.height(); + const int datum_width = datum.width(); + + const int crop_size = param_.crop_size(); + const float scale = param_.scale(); + const bool do_mirror = param_.mirror() && (rand[0] % 2); + const bool has_mean_file = param_.has_mean_file(); + const bool has_uint8 = data.size() > 0; + const bool has_mean_values = mean_values_.size() > 0; + + CHECK_GT(datum_channels, 0); + CHECK_GE(datum_height, crop_size); + CHECK_GE(datum_width, crop_size); + + const float* mean = NULL; + if (has_mean_file) { + CHECK_EQ(datum_channels, data_mean_.channels()); + CHECK_EQ(datum_height, data_mean_.height()); + CHECK_EQ(datum_width, data_mean_.width()); + mean = data_mean_.cpu_data(); + } + if (has_mean_values) { + CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) + << "Specify either 1 mean_value or as many as channels: " << datum_channels; + if (datum_channels > 1 && mean_values_.size() == 1) { + // Replicate the mean_value for simplicity + for (int c = 1; c < datum_channels; ++c) { + mean_values_.push_back(mean_values_[0]); + } + } + } + + int height = datum_height; + int width = datum_width; + + int h_off = 0; + int w_off = 0; + if (crop_size) { + height = crop_size; + width = crop_size; + // We only do random crop when we do training. + if (phase_ == TRAIN) { + h_off = rand[1] % (datum_height - crop_size + 1); + w_off = rand[2] % (datum_width - crop_size + 1); + } else { + h_off = (datum_height - crop_size) / 2; + w_off = (datum_width - crop_size) / 2; + } + } + + int top_index, data_index, ch, cdho; + const int m = do_mirror ? 
-1 : 1; + + if (has_uint8) { + Dtype datum_element, mnv; + + if (scale == 1.F) { + for (int c = 0; c < datum_channels; ++c) { + cdho = c * datum_height + h_off; + ch = c * height; + mnv = has_mean_values && !has_mean_file ? mean_values_[c] : 0.F; + for (int h = 0; h < height; ++h) { + top_index = do_mirror ? (ch + h + 1) * width - 1 : (ch + h) * width; + data_index = (cdho + h) * datum_width + w_off; + for (int w = 0; w < width; ++w) { + datum_element = static_cast(data[data_index]); + if (has_mean_file) { + transformed_data[top_index] = datum_element - mean[data_index]; + } else { + if (has_mean_values) { + transformed_data[top_index] = datum_element - mnv; + } else { + transformed_data[top_index] = datum_element; + } + } + ++data_index; + top_index += m; + } + } + } + } else { + for (int c = 0; c < datum_channels; ++c) { + cdho = c * datum_height + h_off; + ch = c * height; + mnv = has_mean_values && !has_mean_file ? mean_values_[c] : 0.F; + for (int h = 0; h < height; ++h) { + top_index = do_mirror ? (ch + h + 1) * width - 1 : (ch + h) * width; + data_index = (cdho + h) * datum_width + w_off; + for (int w = 0; w < width; ++w) { + datum_element = static_cast(data[data_index]); + if (has_mean_file) { + transformed_data[top_index] = (datum_element - mean[data_index]) * scale; + } else { + if (has_mean_values) { + transformed_data[top_index] = (datum_element - mnv) * scale; + } else { + transformed_data[top_index] = datum_element * scale; + } + } + ++data_index; + top_index += m; + } + } + } + } + } else { + Dtype datum_element; + for (int c = 0; c < datum_channels; ++c) { + cdho = c * datum_height + h_off; + ch = c * height; + for (int h = 0; h < height; ++h) { + top_index = do_mirror ? 
(ch + h + 1) * width - 1 : (ch + h) * width; + data_index = (cdho + h) * datum_width + w_off; + for (int w = 0; w < width; ++w) { + datum_element = datum.float_data(data_index); + if (has_mean_file) { + transformed_data[top_index] = (datum_element - mean[data_index]) * scale; + } else { + if (has_mean_values) { + transformed_data[top_index] = (datum_element - mean_values_[c]) * scale; + } else { + transformed_data[top_index] = datum_element * scale; + } + } + ++data_index; + top_index += m; + } + } + } + } +} + +template +void DataTransformer::Transform(const cv::Mat& img, TBlob *transformed_blob) const { + const int crop_size = param_.crop_size(); + const int img_channels = img.channels(); + const int img_height = img.rows; + const int img_width = img.cols; + + // Check dimensions. + const int channels = transformed_blob->channels(); + const int height = transformed_blob->height(); + const int width = transformed_blob->width(); + const int num = transformed_blob->num(); + + CHECK_EQ(channels, img_channels); + CHECK_LE(height, img_height); + CHECK_LE(width, img_width); + CHECK_GE(num, 1); + // TODO + if (crop_size > 0) { + CHECK_EQ(crop_size, height); + CHECK_EQ(crop_size, width); + } + Transform(img, transformed_blob->mutable_cpu_data(false), transformed_blob->count()); +} + +template +void DataTransformer::TransformV1(const Datum& datum, Dtype* buf, size_t buf_len) { + const string& data = datum.data(); + const int datum_channels = datum.channels(); + const int datum_height = datum.height(); + const int datum_width = datum.width(); + + const int crop_size = param_.crop_size(); + const float scale = param_.scale(); + const bool do_mirror = param_.mirror() && (Rand() % 2); + const bool has_mean_file = param_.has_mean_file(); + const bool has_uint8 = data.size() > 0; + const bool has_mean_values = mean_values_.size() > 0; + + CHECK_GT(datum_channels, 0); + CHECK_GE(datum_height, crop_size); + CHECK_GE(datum_width, crop_size); + + const float* mean = NULL; + if 
(has_mean_file) { + CHECK_EQ(datum_channels, data_mean_.channels()); + CHECK_EQ(datum_height, data_mean_.height()); + CHECK_EQ(datum_width, data_mean_.width()); + mean = data_mean_.cpu_data(); + } + if (has_mean_values) { + CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) + << "Specify either 1 mean_value or as many as channels: " << datum_channels; + if (datum_channels > 1 && mean_values_.size() == 1) { + // Replicate the mean_value for simplicity + for (int c = 1; c < datum_channels; ++c) { + mean_values_.push_back(mean_values_[0]); + } + } + } + + int height = datum_height; + int width = datum_width; + + int h_off = 0; + int w_off = 0; + if (crop_size) { + height = crop_size; + width = crop_size; + // We only do random crop when we do training. + if (phase_ == TRAIN) { + h_off = Rand() % (datum_height - crop_size + 1); + w_off = Rand() % (datum_width - crop_size + 1); + } else { + h_off = (datum_height - crop_size) / 2; + w_off = (datum_width - crop_size) / 2; + } + } + + int top_index, data_index, ch, cdho; + const int m = do_mirror ? -1 : 1; + + if (has_uint8) { + float datum_element, mnv; + + if (scale == 1.F) { + for (int c = 0; c < datum_channels; ++c) { + cdho = c * datum_height + h_off; + ch = c * height; + mnv = has_mean_values && !has_mean_file ? mean_values_[c] : 0.F; + for (int h = 0; h < height; ++h) { + top_index = do_mirror ? (ch + h + 1) * width - 1 : (ch + h) * width; + data_index = (cdho + h) * datum_width + w_off; + for (int w = 0; w < width; ++w) { + datum_element = static_cast(data[data_index]); + if (has_mean_file) { + buf[top_index] = datum_element - mean[data_index]; + } else { + if (has_mean_values) { + buf[top_index] = datum_element - mnv; + } else { + buf[top_index] = datum_element; + } + } + ++data_index; + top_index += m; + } + } + } + } else { + for (int c = 0; c < datum_channels; ++c) { + cdho = c * datum_height + h_off; + ch = c * height; + mnv = has_mean_values && !has_mean_file ? 
mean_values_[c] : 0.F; + for (int h = 0; h < height; ++h) { + top_index = do_mirror ? (ch + h + 1) * width - 1 : (ch + h) * width; + data_index = (cdho + h) * datum_width + w_off; + for (int w = 0; w < width; ++w) { + datum_element = static_cast(data[data_index]); + if (has_mean_file) { + buf[top_index] = (datum_element - mean[data_index]) * scale; + } else { + if (has_mean_values) { + buf[top_index] = (datum_element - mnv) * scale; + } else { + buf[top_index] = datum_element * scale; + } + } + ++data_index; + top_index += m; + } + } + } + } + } else { + float datum_element; + for (int c = 0; c < datum_channels; ++c) { + cdho = c * datum_height + h_off; + ch = c * height; + for (int h = 0; h < height; ++h) { + top_index = do_mirror ? (ch + h + 1) * width - 1 : (ch + h) * width; + data_index = (cdho + h) * datum_width + w_off; + for (int w = 0; w < width; ++w) { + datum_element = datum.float_data(data_index); + if (has_mean_file) { + buf[top_index] = (datum_element - mean[data_index]) * scale; + } else { + if (has_mean_values) { + buf[top_index] = (datum_element - mean_values_[c]) * scale; + } else { + buf[top_index] = datum_element * scale; + } + } + ++data_index; + top_index += m; + } + } + } + } +} + +INSTANTIATE_CLASS(DataTransformer); + } // namespace caffe diff --git a/src/caffe/data_transformer.cu b/src/caffe/data_transformer.cu index 88a6cfbd48d..c682b0d04d9 100644 --- a/src/caffe/data_transformer.cu +++ b/src/caffe/data_transformer.cu @@ -190,7 +190,7 @@ void transform_kernel<__half>(int N, int C, template -void DataTransformer::TransformGPU(int N, int C, int H, int W, +void DataTransformer::TransformGPU(int N, int C, int H, int W, size_t sizeof_element, const void *in, Dtype *out, const unsigned int *random_numbers, bool signed_data) { @@ -274,11 +274,11 @@ void DataTransformer::TransformGPU(int N, int C, int H, int W, CUDA_CHECK(cudaStreamSynchronize(stream)); } -template void DataTransformer::TransformGPU(int, int, int, int, +template void 
DataTransformer::TransformGPU(int, int, int, int, size_t, const void*, float*, const unsigned int*, bool); -template void DataTransformer::TransformGPU(int, int, int, int, +template void DataTransformer::TransformGPU(int, int, int, int, size_t, const void*, double*, const unsigned int*, bool); -template void DataTransformer::TransformGPU(int, int, int, int, +template void DataTransformer::TransformGPU(int, int, int, int, size_t, const void*, float16*, const unsigned int*, bool); } // namespace caffe diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp index 64f133e2cb7..8fd09a49946 100644 --- a/src/caffe/internal_thread.cpp +++ b/src/caffe/internal_thread.cpp @@ -79,7 +79,7 @@ void InternalThread::entry(int thread_id, int device, Caffe::Brew mode, uint64_t << " on device " << device << ", rank " << rank_; if (mode == Caffe::GPU && set_cpu_affinity) { #ifndef NO_NVML - nvml::setCpuAffinity(); + nvml::setCpuAffinity(device); #endif } if (threads_.size() == 1) { diff --git a/src/caffe/layer.cpp b/src/caffe/layer.cpp index 13867ecaf8b..09fddf8bcc7 100644 --- a/src/caffe/layer.cpp +++ b/src/caffe/layer.cpp @@ -88,9 +88,8 @@ int LayerBase::iter() const { return psolver == nullptr ? 0 : psolver->iter(); } -int LayerBase::relative_iter() const { - const Solver* psolver = parent_solver(); - return psolver == nullptr ? 0 : psolver->relative_iter(); +int LayerBase::parent_rank() const { + return parent_net_ == nullptr ? 
0 : parent_net_->solver_rank(); } std::string LayerBase::print_current_device() const { diff --git a/src/caffe/layers/accuracy_layer.cu b/src/caffe/layers/accuracy_layer.cu new file mode 100644 index 00000000000..365d63f09ce --- /dev/null +++ b/src/caffe/layers/accuracy_layer.cu @@ -0,0 +1,153 @@ +#include +#include + +#include "caffe/layers/accuracy_layer.hpp" +#include "caffe/util/math_functions.hpp" + + +namespace caffe { + +template +__global__ void AccuracyForwardGPU(const int nthreads, + const Dtype* bottom_data, const Dtype* label, Dtype* acc, + const int num, const int dim, const int spatial_dim, + const int num_labels, const int top_k, + const bool has_ignore_label_, const int ignore_label_, + Dtype* counts) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); + const Dtype prob_of_true_class = bottom_data[n * dim + + label_value * spatial_dim + + s]; + int num_better_predictions = -1; // true_class also counts as "better" + if (has_ignore_label_ && label_value == ignore_label_) { + acc[index] = 0; + counts[index] = 0; + } else { + for (int k = 0; k < num_labels & num_better_predictions < top_k; k++) { + num_better_predictions += + (bottom_data[n * dim + k * spatial_dim + s] >= prob_of_true_class); + } + acc[index] = (num_better_predictions < top_k); + counts[index] = 1; + } + } +} + +template +__global__ void AccuracyForwardWithPerClassGPU(const int nthreads, + const Dtype* bottom_data, const Dtype* label, + Dtype* acc, Dtype* counts, + const int num, const int dim, const int spatial_dim, + const int num_labels, const int top_k, + const bool has_ignore_label_, const int ignore_label_) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); + const Dtype prob_of_true_class = bottom_data[n * dim + + 
label_value * spatial_dim + + s]; + if (has_ignore_label_ && label_value == ignore_label_) { + // nothing to be done. + } else { + int num_better_predictions = -1; // true_class also counts as "better" + for (int k = 0; k < num_labels & num_better_predictions < top_k; k++) { + num_better_predictions += + (bottom_data[n * dim + k * spatial_dim + s] >= prob_of_true_class); + } + acc[label_value*nthreads + index] += (num_better_predictions < top_k); + counts[label_value*nthreads + index] = 1; + } + } +} + +template +void AccuracyLayer::Forward_gpu( + const vector& bottom, const vector& top) { + const Ftype* bottom_data = bottom[0]->gpu_data(); + const Ftype* bottom_label = bottom[1]->gpu_data(); + const int dim = bottom[0]->count() / outer_num_; + const int num_labels = bottom[0]->shape(label_axis_); + const int nthreads = outer_num_ * inner_num_; + cudaStream_t stream = Caffe::thread_stream(); + // Since this memory is not used for anything, we use it here to avoid having + // to allocate new GPU memory to accumulate intermediate results. + Ftype* acc_data = bottom[0]->mutable_gpu_diff(); + if (top.size() == 1) { + // simple case - report only global accuracy. + + // Similarly, this memory is never used elsewhere, and thus we can use it + // to avoid having to allocate additional GPU memory. 
+ Ftype* counts = bottom[1]->mutable_gpu_diff(); + // NOLINT_NEXT_LINE(whitespace/operators) + AccuracyForwardGPU<<>>(nthreads, bottom_data, bottom_label, + acc_data, outer_num_, dim, inner_num_, num_labels, top_k_, + has_ignore_label_, ignore_label_, counts); + CUDA_CHECK(cudaStreamSynchronize(stream)); + Ftype acc; + caffe_gpu_asum(nthreads, acc_data, &acc, 0); + Ftype valid_count; + caffe_gpu_asum(nthreads, counts, &valid_count, 0); + if (valid_count > 0) { + top[0]->mutable_cpu_data()[0] = acc / valid_count; + } else { + top[0]->mutable_cpu_data()[0] = 0; + } + } else { + // need to report per-class accuracy as well + + // allocate space for more detailed "counts" + nums_buffer_.ReshapeLike(*bottom[0]); + Ftype* counts = nums_buffer_.mutable_gpu_data(); + + caffe_gpu_set(bottom[0]->count(), Ftype(0), acc_data); + caffe_gpu_set(nums_buffer_.count(), Ftype(0), counts); + + // NOLINT_NEXT_LINE(whitespace/operators) + AccuracyForwardWithPerClassGPU<<>>(nthreads, bottom_data, bottom_label, + acc_data, counts, outer_num_, dim, inner_num_, num_labels, top_k_, + has_ignore_label_, ignore_label_); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // get the overall accuracy + Ftype acc; + caffe_gpu_asum(bottom[0]->count(), acc_data, &acc, 0); + Ftype valid_count; + caffe_gpu_asum(nums_buffer_.count(), counts, &valid_count, 0); + if (valid_count > 0) { + top[0]->mutable_cpu_data()[0] = acc / valid_count; + } else { + top[0]->mutable_cpu_data()[0] = 0; + } + + // get per-class accuracy + Ftype* per_class_acc = top[1]->mutable_cpu_data(); + for (int l = 0; l < num_labels; l++) { + caffe_gpu_asum(nthreads, acc_data + l*nthreads, per_class_acc+l, 0); + caffe_gpu_asum(nthreads, counts + l*nthreads, &valid_count, 0); + if (valid_count > 0) { + per_class_acc[l] /= valid_count; + } else { + per_class_acc[l] = 0; + } + } + } + // Clear scratch memory to prevent interfering with backward (see #6202). 
+ caffe_gpu_set(bottom[0]->count(), Ftype(0), bottom[0]->mutable_gpu_diff()); +} + + +template +void AccuracyLayer::Backward_gpu(const vector& top, + const vector& propagate_down, const vector& bottom) { + if (propagate_down[1]) { NOT_IMPLEMENTED; } +} + +INSTANTIATE_LAYER_GPU_FUNCS_FB(AccuracyLayer); + +} // namespace caffe diff --git a/src/caffe/layers/annotated_data_layer.cpp b/src/caffe/layers/annotated_data_layer.cpp new file mode 100644 index 00000000000..6e14b9ec188 --- /dev/null +++ b/src/caffe/layers/annotated_data_layer.cpp @@ -0,0 +1,337 @@ +#include +#include + +#include +#include +#include + +#include "caffe/data_transformer.hpp" +#include "caffe/layers/annotated_data_layer.hpp" +#include "caffe/util/benchmark.hpp" +#include "caffe/util/sampler.hpp" +#include "caffe/parallel.hpp" + +namespace caffe { + +template +AnnotatedDataLayer::AnnotatedDataLayer(const LayerParameter& param, + size_t solver_rank) + : DataLayer(param, solver_rank) {} + +template +void AnnotatedDataLayer::DataLayerSetUp( + const vector& bottom, const vector& top) { + const LayerParameter& param = this->layer_param(); + const AnnotatedDataParameter& anno_data_param = param.annotated_data_param(); + const int batch_size = param.data_param().batch_size(); + const bool cache = this->cache_ && this->phase_ == TRAIN; + const bool shuffle = cache && this->shuffle_ && this->phase_ == TRAIN; + TBlob transformed_datum; + + for (int i = 0; i < anno_data_param.batch_sampler_size(); ++i) { + batch_samplers_.push_back(anno_data_param.batch_sampler(i)); + } + + if (this->auto_mode()) { + if (!sample_areader_) { + sample_areader_ = std::make_shared>(param, + Caffe::solver_count(), + this->rank_, + this->parsers_num_, + this->threads_num(), + batch_size, + true, + false, + cache, + shuffle, + false); + } else if (!areader_) { + areader_ = std::make_shared>(param, + Caffe::solver_count(), + this->rank_, + this->parsers_num_, + this->threads_num(), + batch_size, + false, + true, + cache, + shuffle, + 
this->phase_ == TRAIN); + } + } else if (!areader_) { + areader_ = std::make_shared>(param, + Caffe::solver_count(), + this->rank_, + this->parsers_num_, + this->threads_num(), + batch_size, + false, + false, + cache, + shuffle, + this->phase_ == TRAIN); + start_reading(); + } + + label_map_file_ = anno_data_param.label_map_file(); + // Make sure dimension is consistent within batch. + const TransformationParameter& transform_param = + this->layer_param_.transform_param(); + if (transform_param.has_resize_param()) { + if (transform_param.resize_param().resize_mode() == + ResizeParameter_Resize_mode_FIT_SMALL_SIZE) { + CHECK_EQ(batch_size, 1) + << "Only support batch size of 1 for FIT_SMALL_SIZE."; + } + } + + // Read a data point, and use it to initialize the top blob. + shared_ptr sample_datum = + this->sample_only_ ? this->sample_areader_->sample() : this->areader_->sample(); + AnnotatedDatum& anno_datum = *sample_datum; + this->ResizeQueues(); + this->init_offsets(); + + // Calculate the variable sized transformed datum shape. + vector sample_datum_shape = this->bdt(0)->InferDatumShape(sample_datum->datum()); + // Reshape top[0] and prefetch_data according to the batch_size. + // Note: all these reshapings here in load_batch are needed only in case of + // different datum shapes coming from database. 
+ vector top_shape = this->bdt(0)->InferBlobShape(sample_datum_shape); + transformed_datum.Reshape(top_shape); + top_shape[0] = batch_size; + top[0]->Reshape(top_shape); + + LOG(INFO) << "output data size: " << top[0]->num() << "," + << top[0]->channels() << "," << top[0]->height() << "," + << top[0]->width(); + // label + vector label_shape(4, 1); + if (this->output_labels_) { + has_anno_type_ = anno_datum.has_type() || anno_data_param.has_anno_type(); + if (has_anno_type_) { + anno_type_ = anno_datum.type(); + if (anno_data_param.has_anno_type()) { + // If anno_type is provided in AnnotatedDataParameter, replace + // the type stored in each individual AnnotatedDatum. + LOG(WARNING) << "type stored in AnnotatedDatum is shadowed."; + anno_type_ = anno_data_param.anno_type(); + } + // Infer the label shape from anno_datum.AnnotationGroup(). + int num_bboxes = 0; + if (anno_type_ == AnnotatedDatum_AnnotationType_BBOX) { + // Since the number of bboxes can be different for each image, + // we store the bbox information in a specific format. In specific: + // All bboxes are stored in one spatial plane (num and channels are 1) + // And each row contains one and only one box in the following format: + // [item_id, group_label, instance_id, xmin, ymin, xmax, ymax, diff] + // Note: Refer to caffe.proto for details about group_label and + // instance_id. + for (int g = 0; g < anno_datum.annotation_group_size(); ++g) { + num_bboxes += anno_datum.annotation_group(g).annotation_size(); + } + label_shape[0] = 1; + label_shape[1] = 1; + // BasePrefetchingDataLayer::LayerSetUp() requires to call + // cpu_data and gpu_data for consistent prefetch thread. Thus we make + // sure there is at least one bbox. 
+ label_shape[2] = std::max(num_bboxes, 1); + label_shape[3] = 8; + } else { + LOG(FATAL) << "Unknown annotation type."; + } + } else { + label_shape[0] = batch_size; + } + top[1]->Reshape(label_shape); + } + this->batch_transformer_->reshape(top_shape, label_shape, this->is_gpu_transform()); + + LOG(INFO) << this->print_current_device() << " Output data size: " + << top[0]->num() << ", " + << top[0]->channels() << ", " + << top[0]->height() << ", " + << top[0]->width(); +} + +// This function is called on prefetch thread +template +void AnnotatedDataLayer::load_batch(Batch* batch, int thread_id, size_t queue_id) { + const bool sample_only = this->sample_only_.load(); + TBlob transformed_datum; + + //const bool use_gpu_transform = false;//this->is_gpu_transform(); + // Reshape according to the first anno_datum of each batch + // on single input batches allows for inputs of varying dimension. + const int batch_size = this->layer_param_.data_param().batch_size(); + const AnnotatedDataParameter& anno_data_param = + this->layer_param_.annotated_data_param(); + const TransformationParameter& transform_param = + this->layer_param_.transform_param(); + + const size_t qid = sample_only ? 0UL : queue_id; + DataReader* reader = sample_only ? sample_areader_.get() : areader_.get(); + shared_ptr init_datum = reader->full_peek(qid); + CHECK(init_datum); + + // Use data_transformer to infer the expected blob shape from datum. + vector top_shape = this->bdt(thread_id)->InferBlobShape(init_datum->datum()); + transformed_datum.Reshape(top_shape); + // Reshape batch according to the batch_size. + top_shape[0] = batch_size; + batch->data_->Reshape(top_shape); + + Ftype* top_data = batch->data_->mutable_cpu_data(); + Ftype* top_label = NULL; // suppress warnings about uninitialized variables + if (this->output_labels_ && !has_anno_type_) { + top_label = batch->label_->mutable_cpu_data(); + } + + // Store transformed annotation. 
+ map > all_anno; + int num_bboxes = 0; + + size_t current_batch_id = 0UL; + for (size_t entry = 0; entry < batch_size; ++entry) { + // get an anno_datum + shared_ptr anno_datum = reader->full_pop(qid, "Waiting for data"); + size_t item_id = anno_datum->record_id() % batch_size; + if (item_id == 0UL) { + current_batch_id = anno_datum->record_id() / batch_size; + } + AnnotatedDatum distort_datum; + AnnotatedDatum expand_datum; + if (transform_param.has_distort_param()) { + distort_datum.CopyFrom(*anno_datum); + this->bdt(thread_id)->DistortImage(anno_datum->datum(), distort_datum.mutable_datum()); + if (transform_param.has_expand_param()) { + this->bdt(thread_id)->ExpandImage(distort_datum, &expand_datum); + } else { + expand_datum = distort_datum; + } + } else { + if (transform_param.has_expand_param()) { + this->bdt(thread_id)->ExpandImage(*anno_datum, &expand_datum); + } else { + expand_datum = *anno_datum; + } + } + AnnotatedDatum sampled_datum; + if (batch_samplers_.size() > 0) { + // Generate sampled bboxes from expand_datum. + vector sampled_bboxes; + GenerateBatchSamples(expand_datum, batch_samplers_, &sampled_bboxes); + if (sampled_bboxes.size() > 0) { + // Randomly pick a sampled bbox and crop the expand_datum. 
+ int rand_idx = caffe_rng_rand() % sampled_bboxes.size(); + this->bdt(thread_id)->CropImage(expand_datum, sampled_bboxes[rand_idx], &sampled_datum); + } else { + sampled_datum = expand_datum; + } + } else { + sampled_datum = expand_datum; + } + vector shape = + this->bdt(thread_id)->InferBlobShape(sampled_datum.datum()); + if (transform_param.has_resize_param()) { + if (transform_param.resize_param().resize_mode() == + ResizeParameter_Resize_mode_FIT_SMALL_SIZE) { + transformed_datum.Reshape(shape); + batch->data_->Reshape(shape); + top_data = batch->data_->mutable_cpu_data(); + } else { + CHECK(std::equal(top_shape.begin() + 1, top_shape.begin() + 4, + shape.begin() + 1)); + } + } else { + CHECK(std::equal(top_shape.begin() + 1, top_shape.begin() + 4, + shape.begin() + 1)); + } + // Apply data transformations (mirror, scale, crop...) + int offset = batch->data_->offset(item_id); + transformed_datum.set_cpu_data(top_data + offset); + vector transformed_anno_vec; + if (this->output_labels_) { + if (has_anno_type_) { + // Make sure all data have same annotation type. + CHECK(sampled_datum.has_type()) << "Some datum misses AnnotationType."; + if (anno_data_param.has_anno_type()) { + sampled_datum.set_type(anno_type_); + } else { + CHECK_EQ(anno_type_, sampled_datum.type()) << "Different AnnotationType."; + } + // Transform datum and annotation_group at the same time + transformed_anno_vec.clear(); + this->fdt(thread_id)->Transform(sampled_datum, &transformed_datum, &transformed_anno_vec); + if (anno_type_ == AnnotatedDatum_AnnotationType_BBOX) { + // Count the number of bboxes. + for (int g = 0; g < transformed_anno_vec.size(); ++g) { + num_bboxes += transformed_anno_vec[g].annotation_size(); + } + } else { + LOG(FATAL) << "Unknown annotation type."; + } + all_anno[item_id] = transformed_anno_vec; + } else { + this->fdt(thread_id)->Transform(sampled_datum.datum(), &(transformed_datum)); + // Otherwise, store the label from datum. 
+ CHECK(sampled_datum.datum().has_label()) << "Cannot find any label."; + top_label[item_id] = sampled_datum.datum().label(); + } + } else { + this->fdt(thread_id)->Transform(sampled_datum.datum(), &transformed_datum); + } + + reader->free_push(queue_id, anno_datum); + } + + // Store "rich" annotation if needed. + if (this->output_labels_ && has_anno_type_) { + vector label_shape(4); + if (anno_type_ == AnnotatedDatum_AnnotationType_BBOX) { + label_shape[0] = 1; + label_shape[1] = 1; + label_shape[3] = 8; + if (num_bboxes == 0) { + // Store all -1 in the label. + label_shape[2] = 1; + batch->label_->Reshape(label_shape); + caffe_set(8, -1, batch->label_->mutable_cpu_data()); + } else { + // Reshape the label and store the annotation. + label_shape[2] = num_bboxes; + batch->label_->Reshape(label_shape); + top_label = batch->label_->mutable_cpu_data(); + int idx = 0; + for (int item_id = 0; item_id < batch_size; ++item_id) { + const vector& anno_vec = all_anno[item_id]; + for (int g = 0; g < anno_vec.size(); ++g) { + const AnnotationGroup& anno_group = anno_vec[g]; + for (int a = 0; a < anno_group.annotation_size(); ++a) { + const Annotation& anno = anno_group.annotation(a); + const NormalizedBBox& bbox = anno.bbox(); + top_label[idx++] = item_id; + top_label[idx++] = anno_group.group_label(); + top_label[idx++] = anno.instance_id(); + top_label[idx++] = bbox.xmin(); + top_label[idx++] = bbox.ymin(); + top_label[idx++] = bbox.xmax(); + top_label[idx++] = bbox.ymax(); + top_label[idx++] = bbox.difficult(); + } + } + } + } + } else { + LOG(FATAL) << "Unknown annotation type."; + } + } +// batch->set_data_packing(packing); todo + batch->set_id(current_batch_id); + this->sample_only_.store(false); +} + +INSTANTIATE_CLASS_FB(AnnotatedDataLayer); +REGISTER_LAYER_CLASS_R(AnnotatedData); + +} // namespace caffe diff --git a/src/caffe/layers/axpy_layer.cpp b/src/caffe/layers/axpy_layer.cpp new file mode 100644 index 00000000000..476af93f5bd --- /dev/null +++ 
b/src/caffe/layers/axpy_layer.cpp @@ -0,0 +1,90 @@ +/* + * Axpy Layer + * + * Created on: May 1, 2017 + * Author: hujie + */ + +#include "caffe/layers/axpy_layer.hpp" + +namespace caffe { + +template +void AxpyLayer::Reshape(const vector& bottom, + const vector& top) { + CHECK_EQ(bottom[0]->shape(0), bottom[1]->shape(0)); + CHECK_EQ(bottom[0]->shape(1), bottom[1]->shape(1)); + if (bottom[0]->num_axes() == 4) { + CHECK_EQ(bottom[0]->shape(2), 1); + CHECK_EQ(bottom[0]->shape(3), 1); + } + CHECK(bottom[1]->shape() == bottom[2]->shape()); + top[0]->ReshapeLike(*bottom[1]); + int spatial_dim = bottom[1]->count(2); + if (spatial_sum_multiplier_.count() < spatial_dim) { + spatial_sum_multiplier_.Reshape(vector(1, spatial_dim)); + caffe_set(spatial_dim, Btype(1), + spatial_sum_multiplier_.mutable_cpu_data()); + } +} + +template +void AxpyLayer::Forward_cpu(const vector& bottom, + const vector& top) { + int channel_dim = bottom[1]->channels(); + int spatial_dim = bottom[1]->count(2); + const Ftype* scale_data = bottom[0]->cpu_data(); + const Ftype* x_data = bottom[1]->cpu_data(); + Ftype* top_data = top[0]->mutable_cpu_data(); + caffe_copy(bottom[2]->count(), bottom[2]->cpu_data(), top_data); + for (int n = 0; n < bottom[1]->num(); ++n) { + for (int c = 0; c < channel_dim; ++c) { + int scale_offset = n * channel_dim + c; + caffe_axpy(spatial_dim, scale_data[scale_offset], + x_data + scale_offset * spatial_dim, + top_data + scale_offset * spatial_dim); + } + } +} + +template +void AxpyLayer::Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom) { + const int count = top[0]->count(); + const Btype* top_diff = top[0]->cpu_diff(); + if (propagate_down[0]) { + int spatial_dim = bottom[1]->count(2); + const Btype* x_data = bottom[1]->cpu_data(); + Btype* x_diff = bottom[1]->mutable_cpu_diff(); + Btype* scale_diff = bottom[0]->mutable_cpu_diff(); + caffe_mul(count, top_diff, x_data, x_diff); + caffe_set(bottom[0]->count(), Btype(0), scale_diff); + 
caffe_cpu_gemv(CblasNoTrans, bottom[0]->count(), spatial_dim, Btype(1), + x_diff, spatial_sum_multiplier_.cpu_data(), Btype(1), scale_diff); + if (!propagate_down[1]) { + caffe_set(bottom[1]->count(), Btype(0), x_diff); + } + } + if (propagate_down[1]) { + int channel_dim = bottom[1]->channels(); + int spatial_dim = bottom[1]->count(2); + const Btype* scale_data = bottom[0]->cpu_data(); + Btype* x_diff = bottom[1]->mutable_cpu_diff(); + for (int n = 0; n < bottom[1]->num(); ++n) { + for (int c = 0; c < channel_dim; ++c) { + int scale_offset = n * channel_dim + c; + caffe_cpu_scale(spatial_dim, scale_data[scale_offset], + top_diff + scale_offset * spatial_dim, + x_diff + scale_offset * spatial_dim); + } + } + } + if (propagate_down[2]) { + caffe_copy(count, top_diff, bottom[2]->mutable_cpu_diff()); + } +} + +INSTANTIATE_CLASS_FB(AxpyLayer); +REGISTER_LAYER_CLASS(Axpy); + +} // namespace diff --git a/src/caffe/layers/axpy_layer.cu b/src/caffe/layers/axpy_layer.cu new file mode 100644 index 00000000000..79ed130745e --- /dev/null +++ b/src/caffe/layers/axpy_layer.cu @@ -0,0 +1,108 @@ +/* + * Axpy Layer + * + * Created on: May 1, 2017 + * Author: hujie + */ + +#include +#include "caffe/util/half.cuh" +#include "caffe/util/gpu_math_functions.cuh" +#include "caffe/layers/axpy_layer.hpp" + +namespace caffe { + +template +__global__ void AxpyForward(const int count, const int spatial_dim, + const Dtype* scale_data, const Dtype* x_data, const Dtype* y_data, + Dtype* out_data) { + CUDA_KERNEL_LOOP(index, count) { + out_data[index] = scale_data[index / spatial_dim] * x_data[index] + + y_data[index]; + } +} + +template +void AxpyLayer::Forward_gpu( + const vector& bottom, const vector& top) { + const Ftype* scale_data = bottom[0]->gpu_data(); + const Ftype* x_data = bottom[1]->gpu_data(); + const Ftype* y_data = bottom[2]->gpu_data(); + Ftype* out_data = top[0]->mutable_gpu_data(); + const int count = bottom[1]->count(); + cudaStream_t stream = Caffe::thread_stream(); + //
NOLINT_NEXT_LINE(whitespace/operators) + AxpyForward<<>>( + count, bottom[1]->count(2), scale_data, x_data, y_data, out_data); + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +template +__global__ void AxpyBackwardScale(const int outer_num, const int spatial_dim, + const Dtype* x_data, const Dtype* top_diff, Dtype* scale_diff) { + __shared__ char axpy_buffer[CAFFE_CUDA_NUM_THREADS * sizeof(Dtype)]; + Dtype* buffer = reinterpret_cast(axpy_buffer); + unsigned int tid = threadIdx.x; + buffer[tid] = 0; + __syncthreads(); + + for (int j = tid; j < spatial_dim; j += blockDim.x) { + int offset = blockIdx.x * spatial_dim + j; + buffer[tid] += top_diff[offset] * x_data[offset]; + } + __syncthreads(); + + for (int i = blockDim.x / 2; i > 0; i >>= 1) { + if (tid < i) { + buffer[threadIdx.x] += buffer[threadIdx.x + i]; + } + __syncthreads(); + } + + if (tid == 0) { + scale_diff[blockIdx.x] = buffer[0]; + } +} + +template +__global__ void AxpyBackwardX(const int count, const int spatial_dim, + const Dtype* scale_data, const Dtype* top_diff, Dtype* out) { + CUDA_KERNEL_LOOP(index, count) { + out[index] = scale_data[index / spatial_dim] * top_diff[index]; + } +} + +template +void AxpyLayer::Backward_gpu(const vector& top, + const vector& propagate_down, const vector& bottom) { + const int count = top[0]->count(); + const Btype* top_diff = top[0]->gpu_diff(); + if (propagate_down[0]) { + cudaStream_t stream = Caffe::thread_stream(); + int outer_num = bottom[1]->count(0, 2); + // NOLINT_NEXT_LINE(whitespace/operators) + AxpyBackwardScale<<>>( + outer_num, bottom[1]->count(2), + bottom[1]->gpu_data(), top_diff, + bottom[0]->mutable_gpu_diff()); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + if (propagate_down[1]) { + cudaStream_t stream = Caffe::thread_stream(); + // NOLINT_NEXT_LINE(whitespace/operators) + AxpyBackwardX<<>>( + count, top[0]->count(2), + bottom[0]->gpu_data(), top_diff, + bottom[1]->mutable_gpu_diff()); + CUDA_POST_KERNEL_CHECK; + 
CUDA_CHECK(cudaStreamSynchronize(stream)); + } + if (propagate_down[2]) { + caffe_copy(count, top_diff, bottom[2]->mutable_gpu_diff()); + } +} + +INSTANTIATE_LAYER_GPU_FUNCS_FB(AxpyLayer); + +} // namespace caffe diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index cb68035a61b..050167ea42b 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -13,10 +13,9 @@ namespace caffe { template size_t BasePrefetchingDataLayer::threads(const LayerParameter& param) { - if (param.has_image_data_param()) { + if (param.type().compare("ImageData") == 0 && param.has_image_data_param()) { return param.image_data_param().threads(); } - // Check user's override in prototxt file size_t threads = param.data_param().threads(); if (!auto_mode(param) && threads == 0U) { @@ -67,7 +66,8 @@ BasePrefetchingDataLayer::BasePrefetchingDataLayer(const LayerPara transf_num_(threads(param)), queues_num_(transf_num_ * parsers_num_), batch_transformer_(make_shared>(Caffe::current_device(), - solver_rank, queues_num_, param.transform_param(), is_gpu_transform())) { + solver_rank, queues_num_, param.transform_param(), is_gpu_transform())), + iter0_(true) { CHECK_EQ(transf_num_, threads_num()); batch_size_ = param.data_param().batch_size(); // We begin with minimum required @@ -87,14 +87,16 @@ void BasePrefetchingDataLayer::LayerSetUp(const vector& bot BaseDataLayer::LayerSetUp(bottom, top); for (int i = 0; i < transf_num_; ++i) { - data_transformers_.emplace_back( - make_shared(this->transform_param_, this->phase_)); + bwd_data_transformers_.emplace_back( + make_shared>(this->transform_param_, this->phase_)); + fwd_data_transformers_.emplace_back( + make_shared>(this->transform_param_, this->phase_)); } const Solver* psolver = this->parent_solver(); const uint64_t random_seed = (psolver == nullptr || static_cast(psolver->param().random_seed()) == Caffe::SEED_NOT_SET) ? 
Caffe::next_seed() : static_cast(psolver->param().random_seed()); - StartInternalThread(true, random_seed); + StartInternalThread(false, random_seed); } template @@ -104,14 +106,18 @@ void BasePrefetchingDataLayer::InternalThreadEntry() { template void BasePrefetchingDataLayer::InternalThreadEntryN(size_t thread_id) { - static thread_local bool iter0 = this->phase_ == TRAIN; - if (iter0 && this->net_inititialized_flag_ != nullptr) { + const bool auto_mode = this->auto_mode(); + if (auto_mode) { + iter0_.wait_reset(); // sample reader first + } else if (this->phase_ == TRAIN) { + iter0_.wait(); + } + if (auto_mode && this->net_inititialized_flag_ != nullptr) { this->net_inititialized_flag_->wait(); - } else { // nothing to wait -> initialize and start pumping - InitializePrefetch(); - start_reading(); - iter0 = false; } + InitializePrefetch(); + start_reading(); + try { while (!must_stop(thread_id)) { const size_t qid = this->queue_id(thread_id); @@ -122,19 +128,10 @@ void BasePrefetchingDataLayer::InternalThreadEntryN(size_t thread_ break; } batch_transformer_->prefetched_push_full(qid, batch); - if (iter0) { - if (this->net_iteration0_flag_ != nullptr) { - this->net_iteration0_flag_->wait(); - } - if (this->net_inititialized_flag_ != nullptr) { - this->net_inititialized_flag_ = nullptr; // no wait on the second round - InitializePrefetch(); - start_reading(); - } - if (this->auto_mode()) { - break; - } // manual otherwise, thus keep rolling - iter0 = false; + + if (auto_mode) { + iter0_.set(); + break; } } } catch (boost::thread_interrupted&) { @@ -150,11 +147,18 @@ void BasePrefetchingDataLayer::ResizeQueues() { batch_ids_[i] = i; } } - size = this->data_transformers_.size(); + size = this->bwd_data_transformers_.size(); + if (transf_num_ > size) { + for (size_t i = size; i < transf_num_; ++i) { + this->bwd_data_transformers_.emplace_back( + make_shared>(this->transform_param_, this->phase_)); + } + } + size = this->fwd_data_transformers_.size(); if (transf_num_ 
> size) { for (size_t i = size; i < transf_num_; ++i) { - this->data_transformers_.emplace_back( - make_shared(this->transform_param_, this->phase_)); + this->fwd_data_transformers_.emplace_back( + make_shared>(this->transform_param_, this->phase_)); } } } diff --git a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/concat_layer.cu index bf67a44d947..5b5a520efb9 100644 --- a/src/caffe/layers/concat_layer.cu +++ b/src/caffe/layers/concat_layer.cu @@ -37,8 +37,6 @@ void ConcatLayer::Forward_gpu(const vector& bottom, if (bottom.size() == 1) { return; } - - for (int i = 0; i < bottom.size(); ++i) { bottom_data = bottom[i]->gpu_data(); const int bottom_concat_axis = bottom[i]->shape(concat_axis_); @@ -56,10 +54,9 @@ void ConcatLayer::Forward_gpu(const vector& bottom, nthreads, bottom_data, kForward, num_concats_, concat_input_size_, top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); } + CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream())); offset_concat_axis += bottom_concat_axis; } - - CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream())); } template @@ -83,6 +80,7 @@ void ConcatLayer::Backward_gpu(const vector& top, <<>>( nthreads, top_diff, kForward, num_concats_, concat_input_size_, top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); + CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream())); } offset_concat_axis += bottom_concat_axis; } diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp index 093cca3cebf..d7e17e751a1 100644 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ b/src/caffe/layers/cudnn_conv_layer.cpp @@ -20,23 +20,6 @@ namespace caffe { (CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED + 1) #endif -template -void createFilterDesc(cudnnFilterDescriptor_t* desc, int n, int c, int h, int w) { - CUDNN_CHECK(cudnnCreateFilterDescriptor(desc)); - CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, cudnn::dataType::type, - CUDNN_TENSOR_NCHW, n, c, h, w)); -} - -void 
setConvolutionDesc(Type math, cudnnConvolutionDescriptor_t conv, - int pad_h, int pad_w, int stride_h, int stride_w, int dilation_h, int dilation_w) { - int padA[2] = {pad_h, pad_w}; - int strideA[2] = {stride_h, stride_w}; - int upscaleA[2] = {dilation_h, dilation_w}; - CUDNN_CHECK(cudnnSetConvolutionNdDescriptor(conv, - 2, padA, strideA, upscaleA, CUDNN_CROSS_CORRELATION, - cudnn::cudnn_data_type(math))); -} - void setConvolutionDescMath(Type math, cudnnConvolutionDescriptor_t conv) { int padA[2]; int strideA[2]; @@ -155,19 +138,19 @@ void CuDNNConvolutionLayer::LayerSetUp( const int kernel_w = kernel_shape_data[1]; if (use_v7grouping()) { - createFilterDesc(&fwd_filter_desc_, + cudnn::createFilterDesc(&fwd_filter_desc_, this->num_output_, this->channels_ / groups(), kernel_h, kernel_w); - createFilterDesc(&bwd_filter_desc_, + cudnn::createFilterDesc(&bwd_filter_desc_, this->num_output_, this->channels_ / groups(), kernel_h, kernel_w); this->weight_offset_ = this->num_output_ * (this->channels_ / groups()) * kernel_h * kernel_w; } else { - createFilterDesc(&fwd_filter_desc_, + cudnn::createFilterDesc(&fwd_filter_desc_, this->num_output_ / groups(), this->channels_ / groups(), kernel_h, kernel_w); - createFilterDesc(&bwd_filter_desc_, + cudnn::createFilterDesc(&bwd_filter_desc_, this->num_output_ / groups(), this->channels_ / groups(), kernel_h, kernel_w); this->weight_offset_ = (this->num_output_ / groups()) * @@ -175,8 +158,8 @@ void CuDNNConvolutionLayer::LayerSetUp( } if (this->phase_ == TRAIN) { - train_tmp_weights_mem_.insert_max(Caffe::current_device(), - align_up<7>(this->weight_offset_ * tsize(tpmax()))); + atomic_maximum(train_tmp_weights_mem_, + align_up<8>(this->weight_offset_ * tsize(tpmax()))); } // Create tensor descriptor(s) for data and corresponding convolution(s). 
@@ -252,34 +235,30 @@ void CuDNNConvolutionLayer::LayerSetUp( template void CuDNNConvolutionLayer::AllocateFindExWorkspace() { const int dev = Caffe::current_device(); - if (ws_released_[dev]) { - return; - } - shared_ptr ws = GPUMemory::workspace_[dev]; + shared_ptr& ws = GPUMemory::workspace_[dev]; size_t bytes_available, bytes_total; GPUMemory::GetInfo(&bytes_available, &bytes_total, true); bytes_available = std::min(bytes_available + ws->size(), bytes_total / 2UL); - const size_t tmp_weights_size = train_tmp_weights_mem_[dev]; + const size_t tmp_weights_size = train_tmp_weights_mem_.load(); if (bytes_available > tmp_weights_size) { bytes_available -= tmp_weights_size; } else { bytes_available = 0UL; } // 2+ pages => reallocate - size_t req_bytes = align_down<7>(bytes_available > 2UL * PAGE_SIZE ? + size_t req_bytes = align_down<8>(bytes_available > 2UL * PAGE_SIZE ? bytes_available - 2UL * PAGE_SIZE : 0UL); if (static_cast(req_bytes) <= PAGE_SIZE) { return; } int attempts = ATTEMPTS_TO_RESERVE_WS; while (!ws->try_reserve(req_bytes) && attempts > 0) { - req_bytes = align_down<7>(req_bytes > PAGE_SIZE ? req_bytes - PAGE_SIZE : 0UL); + req_bytes = align_down<8>(req_bytes > PAGE_SIZE ? 
req_bytes - PAGE_SIZE : 0UL); --attempts; LOG(INFO) << this->print_current_device() << " Retrying to allocate " << req_bytes << " bytes, attempts left: " << attempts; } - ws_allocated_[dev] = ws->size(); } template @@ -303,20 +282,20 @@ size_t CuDNNConvolutionLayer::AllocateWorkspace(size_t bottom_size for (int i = 0; i < bottom_size; ++i) { if (this->phase_ == TRAIN) { - train_mem_req_all_grps_.insert_max(dev, - align_up<7>(workspace_bwd_data_sizes_[i]) * ws_groups()); - train_mem_req_all_grps_.insert_max(dev, - align_up<7>(workspace_bwd_filter_sizes_[i]) * ws_groups()); - train_mem_req_all_grps_.insert_max(dev, - align_up<7>(workspace_fwd_sizes_[i]) * ws_groups()); + atomic_maximum(train_mem_req_all_grps_, + align_up<8>(workspace_bwd_data_sizes_[i]) * ws_groups()); + atomic_maximum(train_mem_req_all_grps_, + align_up<8>(workspace_bwd_filter_sizes_[i]) * ws_groups()); + atomic_maximum(train_mem_req_all_grps_, + align_up<8>(workspace_fwd_sizes_[i]) * ws_groups()); } else { - test_mem_req_all_grps_.insert_max(dev, - align_up<7>(workspace_fwd_sizes_[i]) * ws_groups()); + atomic_maximum(test_mem_req_all_grps_, + align_up<8>(workspace_fwd_sizes_[i]) * ws_groups()); } } shared_ptr& ws = GPUMemory::workspace_[dev]; ws->safe_reserve(this->phase_ == TRAIN ? 
- train_mem_req_all_grps_[dev] : test_mem_req_all_grps_[dev]); + train_mem_req_all_grps_.load() : test_mem_req_all_grps_.load()); return ws->size(); } @@ -397,17 +376,17 @@ void CuDNNConvolutionLayer::Reshape( this->num_output_ * this->out_spatial_dim_, this->out_spatial_dim_, width_out, 1); - setConvolutionDesc(forward_math_, fwd_conv_descs_[i], + cudnn::setConvolutionDesc(forward_math_, fwd_conv_descs_[i], pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w); - setConvolutionDesc(forward_math_, fwd_cached_conv_descs_[i], + cudnn::setConvolutionDesc(forward_math_, fwd_cached_conv_descs_[i], pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w); - setConvolutionDesc(backward_data_math_, bwd_conv_data_descs_[i], + cudnn::setConvolutionDesc(backward_data_math_, bwd_conv_data_descs_[i], pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w); - setConvolutionDesc(backward_filter_math_, bwd_conv_filter_descs_[i], + cudnn::setConvolutionDesc(backward_filter_math_, bwd_conv_filter_descs_[i], pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w); - setConvolutionDesc(backward_data_math_, bwd_cached_conv_data_descs_[i], + cudnn::setConvolutionDesc(backward_data_math_, bwd_cached_conv_data_descs_[i], pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w); - setConvolutionDesc(backward_filter_math_, bwd_cached_conv_filter_descs_[i], + cudnn::setConvolutionDesc(backward_filter_math_, bwd_cached_conv_filter_descs_[i], pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w); // Set cached descriptors @@ -436,6 +415,7 @@ void CuDNNConvolutionLayer::Reshape( 1, 1); } + size_t workspace_bytes = AllocateWorkspace(bottom.size()); // Ask cuDNN to find the best algorithm // When batch is small and every image is different we don't want to call Find* over and over if (use_algo_seeker_) { @@ -444,12 +424,6 @@ void CuDNNConvolutionLayer::Reshape( // FindEx-backward-filter. The size of buffer is as big as weights. 
// Get: workspace_bytes is only used as a workspace limit by Get. // (no allocation happens before Get or by Get). - size_t workspace_bytes = 0UL; - if (fwd_count_ == 0) { - // In iteration 0, use a small amount of memory in order to leave - // most of memory for allocating layer blobs. - workspace_bytes = AllocateWorkspace(bottom.size()); - } switch (this->layer_param_.convolution_param().cudnn_convolution_algo_seeker()) { case ConvolutionParameter_CuDNNConvolutionAlgorithmSeeker_GET: GetConvAlgo(bottom, top, workspace_bytes, pad_h, pad_w, stride_h, stride_w); @@ -461,9 +435,9 @@ void CuDNNConvolutionLayer::Reshape( // Now taking the rest for running FindEx calls // We'll release what's possible in BW pass AllocateFindExWorkspace(); + // Also used by Test Net but based on shared space taken by Train: + FindExConvAlgo(bottom, top); } - // Also used by Test Net but based on shared space taken by Train: - FindExConvAlgo(bottom, top); use_algo_seeker_ = false; } break; @@ -471,27 +445,6 @@ void CuDNNConvolutionLayer::Reshape( LOG(FATAL) << "Wrong value for cudnn_convolution_algo_seeker"; } } - - if (ok_to_release() && this->phase_ == TRAIN) { - const int dev = Caffe::current_device(); - shared_ptr& ws = GPUMemory::workspace_[dev]; - if (!ws_released_[dev] && ws_allocated_[dev] > 0UL) { - // Housekeeping: release excessive amount of device memory after FindEx calls - size_t mem_req = align_up<7>(std::max(train_mem_req_all_grps_[dev], - test_mem_req_all_grps_[dev]) + PAGE_SIZE); - if (mem_req > 0UL && ws->size() > mem_req) { - // Winner needs half less - release the rest - LOG(INFO) << this->print_current_device() - << " Layer '" << this->name() << "' reallocating workspace " - << mem_fmt(ws->size()) << " to " << mem_fmt(mem_req); - // TRAIN only - ws->release(); - ws->reserve(mem_req); - ws_released_[dev] = true; - GPUMemory::weights_workspace_[dev]->release(); - } - } - } } template @@ -504,14 +457,14 @@ void CuDNNConvolutionLayer::GetConvAlgo(const vector& botto 
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(Caffe::cudnn_handle(0), bwd_filter_desc_, bwd_top_descs_[i], bwd_conv_data_descs_[i], bwd_bottom_descs_[i], CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - align_down<7>(workspace_bytes / ws_groups()), &bwd_data_algo_[i])); + align_down<8>(workspace_bytes / ws_groups()), &bwd_data_algo_[i])); } // Get forward algorithm (if not set by user) if (user_algos_override_[0] < 0) { CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(Caffe::cudnn_handle(0), fwd_bottom_descs_[i], fwd_filter_desc_, fwd_conv_descs_[i], fwd_top_descs_[i], CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - align_down<7>(workspace_bytes / ws_groups()), &fwd_algo_[i])); + align_down<8>(workspace_bytes / ws_groups()), &fwd_algo_[i])); CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream(0))); } // Get backward filter algorithm (if not set by user) @@ -519,7 +472,7 @@ void CuDNNConvolutionLayer::GetConvAlgo(const vector& botto CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(Caffe::cudnn_handle(0), bwd_bottom_descs_[i], bwd_top_descs_[i], bwd_conv_filter_descs_[i], bwd_filter_desc_, CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - align_down<7>(workspace_bytes / ws_groups()), &bwd_filter_algo_[i])); + align_down<8>(workspace_bytes / ws_groups()), &bwd_filter_algo_[i])); CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream(0))); } LOG(INFO) << Phase_Name(this->phase_) @@ -556,7 +509,7 @@ void CuDNNConvolutionLayer::FindExConvAlgo( cudaStream_t stream = Caffe::thread_stream(0); const int dev = Caffe::current_device(); - shared_ptr ws = GPUMemory::workspace_[dev]; + shared_ptr& ws = GPUMemory::workspace_[dev]; const size_t gsize = ws->size() / ws_groups(); CHECK(is_even(gsize)) << ws->size() << " / " << ws_groups() << " -> " << gsize; @@ -636,11 +589,11 @@ void CuDNNConvolutionLayer::FindExConvAlgo( #endif workspace_fwd_sizes_[i] = fwd_results[k].memory; if (this->phase_ == TRAIN) { - train_mem_req_all_grps_.insert_max(dev, - 
align_up<7>(workspace_fwd_sizes_[i]) * ws_groups()); + atomic_maximum(train_mem_req_all_grps_, + align_up<8>(workspace_fwd_sizes_[i]) * ws_groups()); } else { - test_mem_req_all_grps_.insert_max(dev, - align_up<7>(workspace_fwd_sizes_[i]) * ws_groups()); + atomic_maximum(test_mem_req_all_grps_, + align_up<8>(workspace_fwd_sizes_[i]) * ws_groups()); } fwd_pseudo = is_precise(forward_math_) && !is_precise(tp()); break; @@ -667,7 +620,7 @@ void CuDNNConvolutionLayer::FindExConvAlgo( } #endif if (user_algos_override_[2] < 0) { - const size_t tmp_weights_size = train_tmp_weights_mem_[dev]; + const size_t tmp_weights_size = train_tmp_weights_mem_.load(); shared_ptr& tmp_ws = GPUMemory::weights_workspace_[dev]; tmp_ws->safe_reserve(tmp_weights_size); float algo_time = 0.F; @@ -736,8 +689,8 @@ void CuDNNConvolutionLayer::FindExConvAlgo( } #endif workspace_bwd_filter_sizes_[i] = bwd_filter_results[k].memory; - train_mem_req_all_grps_.insert_max(dev, - align_up<7>(workspace_bwd_filter_sizes_[i]) * ws_groups()); + atomic_maximum(train_mem_req_all_grps_, + align_up<8>(workspace_bwd_filter_sizes_[i]) * ws_groups()); bwd_filter_pseudo = is_precise(backward_filter_math_) && !is_precise(tp()); bftime = bwd_filter_results[k].time; break; @@ -829,8 +782,8 @@ void CuDNNConvolutionLayer::FindExConvAlgo( } #endif workspace_bwd_data_sizes_[i] = bwd_data_results[k].memory; - train_mem_req_all_grps_.insert_max(dev, - align_up<7>(workspace_bwd_data_sizes_[i]) * ws_groups()); + atomic_maximum(train_mem_req_all_grps_, + align_up<8>(workspace_bwd_data_sizes_[i]) * ws_groups()); bwd_data_pseudo = is_precise(backward_data_math_) && !is_precise(tp()); bdtime = bwd_data_results[k].time; break; @@ -848,6 +801,7 @@ void CuDNNConvolutionLayer::FindExConvAlgo( } } CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream())); + ws->release(); AllocateWorkspace(bottom.size()); // if user overrides size_t available_memory, total_memory; @@ -884,7 +838,7 @@ void CuDNNConvolutionLayer::FindExConvAlgo( os << 
"\t(avail " << mem_fmt(available_memory) << ", req " << mem_fmt(this->phase_ == TRAIN ? - train_mem_req_all_grps_[dev] : test_mem_req_all_grps_[dev]) + train_mem_req_all_grps_.load() : test_mem_req_all_grps_.load()) << ")\tt: " << f_round2(ftime); if (this->phase_ == TRAIN) { @@ -982,8 +936,6 @@ bool CuDNNConvolutionLayer::IsConvDescChanged( template CuDNNConvolutionLayer::~CuDNNConvolutionLayer() { - const int dev = Caffe::current_device(); - ws_released_[dev] = false; // For next unit test // Check that handles have been setup before destroying. if (!handles_setup_) { return; } diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu index 569fcf0058a..5a898662692 100644 --- a/src/caffe/layers/cudnn_conv_layer.cu +++ b/src/caffe/layers/cudnn_conv_layer.cu @@ -12,9 +12,6 @@ template void CuDNNConvolutionLayer::Forward_gpu(const vector& bottom, const vector& top) { const Ftype* weight = this->blobs_[0]->template gpu_data(); - if (fwd_count_ < 4) { - AllocateWorkspace(bottom.size()); - } shared_ptr& ws = GPUMemory::workspace_[Caffe::current_device()]; if (use_v7grouping()) { for (int i = 0; i < bottom.size(); ++i) { @@ -83,9 +80,6 @@ template void CuDNNConvolutionLayer::Backward_gpu(const vector& top, const vector& propagate_down, const vector& bottom) { propagate_down_ = propagate_down; - if (bwd_count_ < 4) { - AllocateWorkspace(bottom.size()); - } shared_ptr& ws = GPUMemory::workspace_[Caffe::current_device()]; if (use_v7grouping()) { // compute dE/dB = sum_c(dE/dy) diff --git a/src/caffe/layers/cudnn_deconv_layer.cpp b/src/caffe/layers/cudnn_deconv_layer.cpp new file mode 100644 index 00000000000..6834dfda3b6 --- /dev/null +++ b/src/caffe/layers/cudnn_deconv_layer.cpp @@ -0,0 +1,328 @@ +#ifdef USE_CUDNN +#include +#include + +#include "caffe/layers/cudnn_deconv_layer.hpp" + +namespace caffe { + +// Set to three for the benefit of the backward pass, which +// can use separate streams for calculating the gradient w.r.t. 
+// bias, filter weights, and bottom data for each group independently +#define CUDNN_STREAMS_PER_GROUP 3 + +/** + * TODO(dox) explain cuDNN interface + */ +template +void CuDNNDeconvolutionLayer::LayerSetUp( + const vector& bottom, const vector& top) { + DeconvolutionLayer::LayerSetUp(bottom, top); + // Initialize CUDA streams and cuDNN. + stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; + handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; + + // Initialize algorithm arrays + fwd_algo_ = new cudnnConvolutionFwdAlgo_t[bottom.size()]; + bwd_filter_algo_= new cudnnConvolutionBwdFilterAlgo_t[bottom.size()]; + bwd_data_algo_ = new cudnnConvolutionBwdDataAlgo_t[bottom.size()]; + + // initialize size arrays + workspace_fwd_sizes_ = new size_t[bottom.size()]; + workspace_bwd_filter_sizes_ = new size_t[bottom.size()]; + workspace_bwd_data_sizes_ = new size_t[bottom.size()]; + + // workspace data + workspaceSizeInBytes = 0; + workspaceData = NULL; + workspace = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP]; + + for (size_t i = 0; i < bottom.size(); ++i) { + // initialize all to default algorithms + fwd_algo_[i] = (cudnnConvolutionFwdAlgo_t)0; + bwd_filter_algo_[i] = (cudnnConvolutionBwdFilterAlgo_t)0; + bwd_data_algo_[i] = (cudnnConvolutionBwdDataAlgo_t)0; + // default algorithms don't require workspace + workspace_fwd_sizes_[i] = 0; + workspace_bwd_data_sizes_[i] = 0; + workspace_bwd_filter_sizes_[i] = 0; + } + + for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { + CUDA_CHECK(cudaStreamCreate(&stream_[g])); + CUDNN_CHECK(cudnnCreate(&handle_[g])); + CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g])); + workspace[g] = NULL; + } + + // Set the indexing parameters. + bias_offset_ = (this->num_output_ / this->group_); + + // Create filter descriptor. 
+ const int* kernel_shape_data = this->kernel_shape_.cpu_data(); + const int kernel_h = kernel_shape_data[0]; + const int kernel_w = kernel_shape_data[1]; + cudnn::createFilterDesc(&filter_desc_, ///////// FIXME!!!!!!!!!!!!! + this->channels_ / this->group_, + this->num_output_ / this->group_, + kernel_h, + kernel_w); + + // Create tensor descriptor(s) for data and corresponding convolution(s). + for (int i = 0; i < bottom.size(); i++) { + cudnnTensorDescriptor_t bottom_desc; + cudnn::createTensor4dDesc(&bottom_desc); + bottom_descs_.push_back(bottom_desc); + cudnnTensorDescriptor_t top_desc; + cudnn::createTensor4dDesc(&top_desc); + top_descs_.push_back(top_desc); + cudnnConvolutionDescriptor_t conv_desc; + cudnnCreateConvolutionDescriptor(&conv_desc); + conv_descs_.push_back(conv_desc); + } + + // Tensor descriptor for bias. + if (this->bias_term_) { + cudnn::createTensor4dDesc(&bias_desc_); + } + + handles_setup_ = true; +} + +template +void CuDNNDeconvolutionLayer::Reshape( + const vector& bottom, const vector& top) { + DeconvolutionLayer::Reshape(bottom, top); + CHECK_EQ(2, this->num_spatial_axes_) + << "CuDNNDeconvolutionLayer input must have 2 spatial axes " + << "(e.g., height and width). 
" + << "Use 'engine: CAFFE' for general ND convolution."; + bottom_offset_ = this->bottom_dim_ / this->group_; + top_offset_ = this->top_dim_ / this->group_; + const int height = bottom[0]->shape(this->channel_axis_ + 1); + const int width = bottom[0]->shape(this->channel_axis_ + 2); + const int height_out = top[0]->shape(this->channel_axis_ + 1); + const int width_out = top[0]->shape(this->channel_axis_ + 2); + const int* pad_data = this->pad_.cpu_data(); + const int pad_h = pad_data[0]; + const int pad_w = pad_data[1]; + const int* stride_data = this->stride_.cpu_data(); + const int stride_h = stride_data[0]; + const int stride_w = stride_data[1]; + + // Specify workspace limit for kernels directly until we have a + // planning strategy and a rewrite of Caffe's GPU memory mangagement + size_t workspace_limit_bytes = 8*1024*1024; + + for (int i = 0; i < bottom.size(); i++) { + cudnn::setTensor4dDesc(&bottom_descs_[i], + this->num_, + this->channels_ / this->group_, + height, + width, + this->channels_ * height * width, + height * width, + width, + 1); + cudnn::setTensor4dDesc(&top_descs_[i], + this->num_, + this->num_output_ / this->group_, + height_out, + width_out, + this->num_output_ * height_out * width_out, + height_out * width_out, + width_out, + 1); + cudnn::setConvolutionDesc(forward_math_, + conv_descs_[i], +// top_descs_[i], +// filter_desc_, + pad_h, + pad_w, + stride_h, + stride_w, 1, 1); + + // choose forward and backward algorithms + workspace(s) + CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm( + handle_[0], + top_descs_[i], + filter_desc_, + conv_descs_[i], + bottom_descs_[i], + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_limit_bytes, + &fwd_algo_[i])); + + // We have found that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is + // buggy. Thus, if this algo was chosen, choose winograd instead. If + // winograd is not supported or workspace is larger than threshold, choose + // implicit_gemm instead. 
+// if (fwd_algo_[i] == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) { +// size_t winograd_workspace_size; +// cudnnStatus_t status = cudnnGetConvolutionForwardWorkspaceSize( +// handle_[0], +// top_descs_[i], +// filter_desc_, +// conv_descs_[i], +// bottom_descs_[i], +// CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, +// &winograd_workspace_size); +// if (status != CUDNN_STATUS_SUCCESS || +// winograd_workspace_size >= workspace_limit_bytes) { +// fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; +// } else { +// fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD; +// } +// } + + CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( + handle_[0], + top_descs_[i], + filter_desc_, + conv_descs_[i], + bottom_descs_[i], + fwd_algo_[i], + &(workspace_fwd_sizes_[i]))); + + // choose backward algorithm for filter + CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm( + handle_[0], + top_descs_[i], + bottom_descs_[i], + conv_descs_[i], + filter_desc_, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_limit_bytes, + &bwd_filter_algo_[i])); + + // get workspace for backwards filter algorithm + CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle_[0], + top_descs_[i], + bottom_descs_[i], + conv_descs_[i], + filter_desc_, + bwd_filter_algo_[i], + &workspace_bwd_filter_sizes_[i])); + + // choose backward algo for data + CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm( + handle_[0], + filter_desc_, + bottom_descs_[i], + conv_descs_[i], + top_descs_[i], + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_limit_bytes, + &bwd_data_algo_[i])); + + // get workspace size + CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( + handle_[0], + filter_desc_, + bottom_descs_[i], + conv_descs_[i], + top_descs_[i], + bwd_data_algo_[i], + &workspace_bwd_data_sizes_[i])); + } + + // reduce over all workspace sizes to get a maximum to allocate / reallocate + size_t total_workspace_fwd = 0; + size_t total_workspace_bwd_data = 0; + 
size_t total_workspace_bwd_filter = 0; + + for (size_t i = 0; i < bottom.size(); i++) { + total_workspace_fwd = std::max(total_workspace_fwd, + workspace_fwd_sizes_[i]); + total_workspace_bwd_data = std::max(total_workspace_bwd_data, + workspace_bwd_data_sizes_[i]); + total_workspace_bwd_filter = std::max(total_workspace_bwd_filter, + workspace_bwd_filter_sizes_[i]); + } + // get max over all operations + size_t max_workspace = std::max(total_workspace_fwd, + total_workspace_bwd_data); + max_workspace = std::max(max_workspace, total_workspace_bwd_filter); + // ensure all groups have enough workspace + size_t total_max_workspace = max_workspace * + (this->group_ * CUDNN_STREAMS_PER_GROUP); + + // this is the total amount of storage needed over all groups + streams + if (total_max_workspace > workspaceSizeInBytes) { + DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace; + workspaceSizeInBytes = total_max_workspace; + + // free the existing workspace and allocate a new (larger) one + cudaFree(this->workspaceData); + + cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes); + if (err != cudaSuccess) { + // force zero memory path + for (int i = 0; i < bottom.size(); i++) { + workspace_fwd_sizes_[i] = 0; + workspace_bwd_filter_sizes_[i] = 0; + workspace_bwd_data_sizes_[i] = 0; + fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING; + bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; + bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; + } + + // NULL out all workspace pointers + for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) { + workspace[g] = NULL; + } + // NULL out underlying data + workspaceData = NULL; + workspaceSizeInBytes = 0; + } + + // if we succeed in the allocation, set pointer aliases for workspaces + for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) { + workspace[g] = reinterpret_cast(workspaceData) + g*max_workspace; + } + } + + // Tensor descriptor for bias. 
+ if (this->bias_term_) { + cudnn::setTensor4dDesc( + &bias_desc_, 1, this->num_output_ / this->group_, 1, 1); + } +} + +template +CuDNNDeconvolutionLayer::~CuDNNDeconvolutionLayer() { + // Check that handles have been setup before destroying. + if (!handles_setup_) { return; } + + for (int i = 0; i < bottom_descs_.size(); i++) { + cudnnDestroyTensorDescriptor(bottom_descs_[i]); + cudnnDestroyTensorDescriptor(top_descs_[i]); + cudnnDestroyConvolutionDescriptor(conv_descs_[i]); + } + if (this->bias_term_) { + cudnnDestroyTensorDescriptor(bias_desc_); + } + cudnnDestroyFilterDescriptor(filter_desc_); + + for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { + cudaStreamDestroy(stream_[g]); + cudnnDestroy(handle_[g]); + } + + cudaFree(workspaceData); + delete [] workspace; + delete [] stream_; + delete [] handle_; + delete [] fwd_algo_; + delete [] bwd_filter_algo_; + delete [] bwd_data_algo_; + delete [] workspace_fwd_sizes_; + delete [] workspace_bwd_data_sizes_; + delete [] workspace_bwd_filter_sizes_; +} + +INSTANTIATE_CLASS_FB(CuDNNDeconvolutionLayer); + +} // namespace caffe +#endif diff --git a/src/caffe/layers/cudnn_deconv_layer.cu b/src/caffe/layers/cudnn_deconv_layer.cu new file mode 100644 index 00000000000..377a23e0303 --- /dev/null +++ b/src/caffe/layers/cudnn_deconv_layer.cu @@ -0,0 +1,138 @@ +#ifdef USE_CUDNN +#include + +#include "caffe/layers/cudnn_deconv_layer.hpp" + +namespace caffe { + +__global__ void sync_deconv_groups() {} + +template +void CuDNNDeconvolutionLayer::Forward_gpu( + const vector& bottom, const vector& top) { + const Ftype* weight = this->blobs_[0]->template gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Ftype* bottom_data = bottom[i]->gpu_data(); + Ftype* top_data = top[i]->mutable_gpu_data(); + + // Forward through cuDNN in parallel over groups. + for (int g = 0; g < this->group_; g++) { + // Filters. 
+ CUDNN_CHECK(cudnnConvolutionBackwardData( + handle_[g], + cudnn::dataType::one, + filter_desc_, + weight + this->weight_offset_ * g, + bottom_descs_[i], + bottom_data + bottom_offset_ * g, + conv_descs_[i], + bwd_data_algo_[i], + workspace[g], + workspace_bwd_data_sizes_[i], + cudnn::dataType::zero, + top_descs_[i], + top_data + top_offset_ * g)); + + // Bias. + if (this->bias_term_) { + const Ftype* bias_data = this->blobs_[1]->template gpu_data(); + CUDNN_CHECK(cudnnAddTensor(handle_[g], + cudnn::dataType::one, + bias_desc_, + bias_data + bias_offset_ * g, + cudnn::dataType::one, + top_descs_[i], + top_data + top_offset_ * g)); + } + } + + // Synchronize the work across groups, each of which went into its own + // stream, by launching an empty kernel into the default (null) stream. + // NOLINT_NEXT_LINE(whitespace/operators) + sync_deconv_groups<<<1, 1>>>(); // FIXME + } +} + +template +void CuDNNDeconvolutionLayer::Backward_gpu( + const vector& top, + const vector& propagate_down, + const vector& bottom) { + const Btype* weight = NULL; + Btype* weight_diff = NULL; + if (this->param_propagate_down_[0]) { + weight = this->blobs_[0]->template gpu_data(); + weight_diff = this->blobs_[0]->template mutable_gpu_diff(); + } + Btype* bias_diff = NULL; + if (this->bias_term_ && this->param_propagate_down_[1]) { + bias_diff = this->blobs_[1]->template mutable_gpu_diff(); + } + for (int i = 0; i < top.size(); ++i) { + const Btype* top_diff = top[i]->gpu_diff(); + // Backward through cuDNN in parallel over groups and gradients. + for (int g = 0; g < this->group_; g++) { + // Gradient w.r.t. bias. + if (this->bias_term_ && this->param_propagate_down_[1]) { + CUDNN_CHECK(cudnnConvolutionBackwardBias(handle_[0 * this->group_ + g], + cudnn::dataType::one, + top_descs_[i], + top_diff + top_offset_ * g, + cudnn::dataType::one, + bias_desc_, + bias_diff + bias_offset_ * g)); + } + + // Gradient w.r.t. weights. 
+ if (this->param_propagate_down_[0]) { + const Btype* bottom_data = bottom[i]->gpu_data(); + CUDNN_CHECK(cudnnConvolutionBackwardFilter( + handle_[1 * this->group_ + g], + cudnn::dataType::one, + top_descs_[i], + top_diff + top_offset_ * g, + bottom_descs_[i], + bottom_data + bottom_offset_ * g, + conv_descs_[i], + bwd_filter_algo_[i], + workspace[1 * this->group_ + g], + workspace_bwd_filter_sizes_[i], + cudnn::dataType::one, + filter_desc_, + weight_diff + this->weight_offset_ * g)); + } + + // Gradient w.r.t. bottom data. + if (propagate_down[i]) { + if (weight == NULL) { + weight = this->blobs_[0]->template gpu_data(); + } + Btype* bottom_diff = bottom[i]->mutable_gpu_diff(); + CUDNN_CHECK( + cudnnConvolutionForward(handle_[2 * this->group_ + g], + cudnn::dataType::one, + top_descs_[i], + top_diff + top_offset_ * g, + filter_desc_, + weight + this->weight_offset_ * g, + conv_descs_[i], + fwd_algo_[i], + workspace[2 * this->group_ + g], + workspace_fwd_sizes_[i], + cudnn::dataType::zero, + bottom_descs_[i], + bottom_diff + bottom_offset_ * g)); + } + } + + // Synchronize the work across groups, each of which went into its own + // stream, by launching an empty kernel into the default (null) stream. 
+ // NOLINT_NEXT_LINE(whitespace/operators) + sync_deconv_groups<<<1, 1>>>(); + } +} + +INSTANTIATE_LAYER_GPU_FUNCS_FB(CuDNNDeconvolutionLayer); + +} // namespace caffe +#endif diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 2a18ee483c8..98447acb675 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -10,7 +10,7 @@ DataLayer::DataLayer(const LayerParameter& param, size_t solver_ra : BasePrefetchingDataLayer(param, solver_rank), cache_(param.data_param().cache()), shuffle_(param.data_param().shuffle()) { - sample_only_.store(this->auto_mode_ && this->phase_ == TRAIN); + sample_only_.store(this->auto_mode_); init_offsets(); datum_encoded_ = false; } @@ -44,7 +44,8 @@ DataLayer::InitializePrefetch() { if (layer_inititialized_flag_.is_set()) { return; } - if (this->auto_mode()) { + const bool auto_mode = this->auto_mode_; + if (auto_mode) { // Here we try to optimize memory split between prefetching and convolution. // All data and parameter blobs are allocated at this moment. // Now let's find out what's left... @@ -81,7 +82,7 @@ DataLayer::InitializePrefetch() { } else { // in this mode memory demand is O(1) if (batches_fit > 0) { - parsers_num = cache_ ? 1 : 2; + parsers_num = cache_ ? 1 : 3; transf_num = 4; } } @@ -95,14 +96,16 @@ DataLayer::InitializePrefetch() { if (this->parsers_num_ > 1) { parser_offsets_[0]++; // 0th already processed } + this->auto_mode_ = false; + layer_inititialized_flag_.set(); this->go(); // kick off new threads if any } CHECK_EQ(this->threads_num(), this->transf_num_); LOG(INFO) << this->print_current_device() << " Parser threads: " - << this->parsers_num_ << (this->auto_mode_ ? " (auto)" : ""); + << this->parsers_num_ << (auto_mode ? " (auto)" : ""); LOG(INFO) << this->print_current_device() << " Transformer threads: " - << this->transf_num_ << (this->auto_mode_ ? " (auto)" : ""); + << this->transf_num_ << (auto_mode ? 
" (auto)" : ""); layer_inititialized_flag_.set(); } @@ -127,7 +130,7 @@ DataLayer::DataLayerSetUp(const vector& bottom, const vecto if (this->auto_mode_) { if (!sample_reader_) { - sample_reader_ = std::make_shared(param, Caffe::solver_count(), + sample_reader_ = std::make_shared>(param, Caffe::solver_count(), this->rank_, this->parsers_num_, this->threads_num(), @@ -138,7 +141,7 @@ DataLayer::DataLayerSetUp(const vector& bottom, const vecto shuffle, false); } else if (!reader_) { - reader_ = std::make_shared(param, + reader_ = std::make_shared>(param, Caffe::solver_count(), this->rank_, this->parsers_num_, @@ -151,7 +154,7 @@ DataLayer::DataLayerSetUp(const vector& bottom, const vecto this->phase_ == TRAIN); } } else if (!reader_) { - reader_ = std::make_shared(param, + reader_ = std::make_shared>(param, Caffe::solver_count(), this->rank_, this->parsers_num_, @@ -174,8 +177,7 @@ DataLayer::DataLayerSetUp(const vector& bottom, const vecto // Note: all these reshapings here in load_batch are needed only in case of // different datum shapes coming from database. Packing packing = NHWC; // OpenCV - vector top_shape = this->dt(0)->template Transform(sample_datum.get(), - nullptr, 0, packing); + vector top_shape = this->bdt(0)->Transform(sample_datum.get(), nullptr, 0, packing); top_shape[0] = batch_size; top[0]->Reshape(top_shape); @@ -209,14 +211,14 @@ void DataLayer::load_batch(Batch* batch, int thread_id, size_t que const int batch_size = this->layer_param_.data_param().batch_size(); const size_t qid = sample_only ? 0UL : queue_id; - DataReader* reader = sample_only ? sample_reader_.get() : reader_.get(); + DataReader* reader = sample_only ? sample_reader_.get() : reader_.get(); shared_ptr init_datum = reader->full_peek(qid); CHECK(init_datum); const bool use_gpu_transform = this->is_gpu_transform(); Packing packing = NHWC; // OpenCV // Use data_transformer to infer the expected blob shape from datum. 
vector top_shape = - this->dt(thread_id)->template Transform(init_datum.get(), nullptr, 0, packing); + this->bdt(thread_id)->Transform(init_datum.get(), nullptr, 0, packing); // Reshape batch according to the batch_size. top_shape[0] = batch_size; if (top_shape != batch->data_->shape()) { @@ -303,19 +305,19 @@ void DataLayer::load_batch(Batch* batch, int thread_id, size_t que CUDA_CHECK(cudaMemcpyAsync(static_cast(dst_gptr) + item_id * datum_size, src_buf.data(), datum_size, cudaMemcpyHostToDevice, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); - this->dt(thread_id)->Fill3Randoms(&random_vectors_[thread_id]-> + this->bdt(thread_id)->Fill3Randoms(&random_vectors_[thread_id]-> mutable_cpu_data()[item_id * 3]); } else { // Get data offset for this datum to hand off to transform thread const size_t offset = batch->data_->offset(item_id); CHECK_EQ(0, offset % buf_len); #if defined(USE_CUDNN) - vector shape = this->dt(thread_id)->Transform(datum.get(), dst_cptr + offset, + vector shape = this->bdt(thread_id)->Transform(datum.get(), dst_cptr + offset, buf_len, packing, false); #else vector tmp(top_shape[1] * top_shape[2] * top_shape[3]); CHECK_EQ(buf_len, tmp.size()); - vector shape = this->dt(thread_id)->Transform(datum.get(), tmp.data(), buf_len, + vector shape = this->bdt(thread_id)->Transform(datum.get(), tmp.data(), buf_len, packing, false); if (packing == NHWC) { hwc2chw(top_shape[1], top_shape[3], top_shape[2], tmp.data(), dst_cptr + offset); @@ -333,7 +335,7 @@ void DataLayer::load_batch(Batch* batch, int thread_id, size_t que } if (use_gpu_transform) { - this->dt(thread_id)->TransformGPU(top_shape[0], top_shape[1], + this->fdt(thread_id)->TransformGPU(top_shape[0], top_shape[1], init_datum_height, // non-crop init_datum_width, // non-crop datum_sizeof_element, diff --git a/src/caffe/layers/detection_evaluate_layer.cpp b/src/caffe/layers/detection_evaluate_layer.cpp new file mode 100644 index 00000000000..6b53f04342f --- /dev/null +++ 
b/src/caffe/layers/detection_evaluate_layer.cpp @@ -0,0 +1,250 @@ +#include +#include +#include +#include + +#include "caffe/layers/detection_evaluate_layer.hpp" +#include "caffe/util/bbox_util.hpp" + +namespace caffe { + +template +void DetectionEvaluateLayer::LayerSetUp( + const vector& bottom, const vector& top) { + const DetectionEvaluateParameter& detection_evaluate_param = + this->layer_param_.detection_evaluate_param(); + CHECK(detection_evaluate_param.has_num_classes()) + << "Must provide num_classes."; + num_classes_ = detection_evaluate_param.num_classes(); + background_label_id_ = detection_evaluate_param.background_label_id(); + overlap_threshold_ = detection_evaluate_param.overlap_threshold(); + CHECK_GT(overlap_threshold_, 0.) << "overlap_threshold must be non negative."; + evaluate_difficult_gt_ = detection_evaluate_param.evaluate_difficult_gt(); + if (detection_evaluate_param.has_name_size_file()) { + string name_size_file = detection_evaluate_param.name_size_file(); + std::ifstream infile(name_size_file.c_str()); + CHECK(infile.good()) + << "Failed to open name size file: " << name_size_file; + // The file is in the following format: + // name height width + // ... + string name; + int height, width; + while (infile >> name >> height >> width) { + sizes_.push_back(std::make_pair(height, width)); + } + infile.close(); + } + count_ = 0; + // If there is no name_size_file provided, use normalized bbox to evaluate. + use_normalized_bbox_ = sizes_.size() == 0; + + // Retrieve resize parameter if there is any provided. 
+ has_resize_ = detection_evaluate_param.has_resize_param(); + if (has_resize_) { + resize_param_ = detection_evaluate_param.resize_param(); + } +} + +template +void DetectionEvaluateLayer::Reshape(const vector& bottom, + const vector& top) { + CHECK_LE(count_, sizes_.size()); + CHECK_EQ(bottom[0]->num(), 1); + CHECK_EQ(bottom[0]->channels(), 1); + CHECK_EQ(bottom[0]->width(), 7); + CHECK_EQ(bottom[1]->num(), 1); + CHECK_EQ(bottom[1]->channels(), 1); + CHECK_EQ(bottom[1]->width(), 8); + + // num() and channels() are 1. + vector top_shape(2, 1); + int num_pos_classes = background_label_id_ == -1 ? + num_classes_ : num_classes_ - 1; + int num_valid_det = 0; + const Dtype* det_data = bottom[0]->cpu_data(); + for (int i = 0; i < bottom[0]->height(); ++i) { + if (det_data[1] != -1) { + ++num_valid_det; + } + det_data += 7; + } + top_shape.push_back(num_pos_classes + num_valid_det); + // Each row is a 5 dimension vector, which stores + // [image_id, label, confidence, true_pos, false_pos] + top_shape.push_back(5); + top[0]->Reshape(top_shape); +} + +template +void DetectionEvaluateLayer::Forward_cpu( + const vector& bottom, const vector& top) { + const Dtype* det_data = bottom[0]->cpu_data(); + const Dtype* gt_data = bottom[1]->cpu_data(); + + // Retrieve all detection results. + map all_detections; + GetDetectionResults(det_data, bottom[0]->height(), background_label_id_, + &all_detections); + + // Retrieve all ground truth (including difficult ones). + map all_gt_bboxes; + GetGroundTruth(gt_data, bottom[1]->height(), background_label_id_, + true, &all_gt_bboxes); + + Dtype* top_data = top[0]->mutable_cpu_data(); + caffe_set(top[0]->count(), Dtype(0.), top_data); + int num_det = 0; + + // Insert number of ground truth for each label. 
+ map num_pos; + for (map::iterator it = all_gt_bboxes.begin(); + it != all_gt_bboxes.end(); ++it) { + for (LabelBBox::iterator iit = it->second.begin(); iit != it->second.end(); + ++iit) { + int count = 0; + if (evaluate_difficult_gt_) { + count = iit->second.size(); + } else { + // Get number of non difficult ground truth. + for (int i = 0; i < iit->second.size(); ++i) { + if (!iit->second[i].difficult()) { + ++count; + } + } + } + if (num_pos.find(iit->first) == num_pos.end()) { + num_pos[iit->first] = count; + } else { + num_pos[iit->first] += count; + } + } + } + for (int c = 0; c < num_classes_; ++c) { + if (c == background_label_id_) { + continue; + } + top_data[num_det * 5] = -1; + top_data[num_det * 5 + 1] = c; + if (num_pos.find(c) == num_pos.end()) { + top_data[num_det * 5 + 2] = 0; + } else { + top_data[num_det * 5 + 2] = num_pos.find(c)->second; + } + top_data[num_det * 5 + 3] = -1; + top_data[num_det * 5 + 4] = -1; + ++num_det; + } + + // Insert detection evaluate status. + for (map::iterator it = all_detections.begin(); + it != all_detections.end(); ++it) { + int image_id = it->first; + LabelBBox& detections = it->second; + if (all_gt_bboxes.find(image_id) == all_gt_bboxes.end()) { + // No ground truth for current image. All detections become false_pos. 
+ for (LabelBBox::iterator iit = detections.begin(); + iit != detections.end(); ++iit) { + int label = iit->first; + if (label == -1) { + continue; + } + const vector& bboxes = iit->second; + for (int i = 0; i < bboxes.size(); ++i) { + top_data[num_det * 5] = image_id; + top_data[num_det * 5 + 1] = label; + top_data[num_det * 5 + 2] = bboxes[i].score(); + top_data[num_det * 5 + 3] = 0; + top_data[num_det * 5 + 4] = 1; + ++num_det; + } + } + } else { + LabelBBox& label_bboxes = all_gt_bboxes.find(image_id)->second; + for (LabelBBox::iterator iit = detections.begin(); + iit != detections.end(); ++iit) { + int label = iit->first; + if (label == -1) { + continue; + } + vector& bboxes = iit->second; + if (label_bboxes.find(label) == label_bboxes.end()) { + // No ground truth for current label. All detections become false_pos. + for (int i = 0; i < bboxes.size(); ++i) { + top_data[num_det * 5] = image_id; + top_data[num_det * 5 + 1] = label; + top_data[num_det * 5 + 2] = bboxes[i].score(); + top_data[num_det * 5 + 3] = 0; + top_data[num_det * 5 + 4] = 1; + ++num_det; + } + } else { + vector& gt_bboxes = label_bboxes.find(label)->second; + // Scale ground truth if needed. + if (!use_normalized_bbox_) { + CHECK_LT(count_, sizes_.size()); + for (int i = 0; i < gt_bboxes.size(); ++i) { + OutputBBox(gt_bboxes[i], sizes_[count_], has_resize_, + resize_param_, &(gt_bboxes[i])); + } + } + vector visited(gt_bboxes.size(), false); + // Sort detections in descend order based on scores. + std::sort(bboxes.begin(), bboxes.end(), SortBBoxDescend); + for (int i = 0; i < bboxes.size(); ++i) { + top_data[num_det * 5] = image_id; + top_data[num_det * 5 + 1] = label; + top_data[num_det * 5 + 2] = bboxes[i].score(); + if (!use_normalized_bbox_) { + OutputBBox(bboxes[i], sizes_[count_], has_resize_, + resize_param_, &(bboxes[i])); + } + // Compare with each ground truth bbox. 
+ float overlap_max = -1; + int jmax = -1; + for (int j = 0; j < gt_bboxes.size(); ++j) { + float overlap = JaccardOverlap(bboxes[i], gt_bboxes[j], + use_normalized_bbox_); + if (overlap > overlap_max) { + overlap_max = overlap; + jmax = j; + } + } + if (overlap_max >= overlap_threshold_) { + if (evaluate_difficult_gt_ || + (!evaluate_difficult_gt_ && !gt_bboxes[jmax].difficult())) { + if (!visited[jmax]) { + // true positive. + top_data[num_det * 5 + 3] = 1; + top_data[num_det * 5 + 4] = 0; + visited[jmax] = true; + } else { + // false positive (multiple detection). + top_data[num_det * 5 + 3] = 0; + top_data[num_det * 5 + 4] = 1; + } + } + } else { + // false positive. + top_data[num_det * 5 + 3] = 0; + top_data[num_det * 5 + 4] = 1; + } + ++num_det; + } + } + } + } + if (sizes_.size() > 0) { + ++count_; + if (count_ == sizes_.size()) { + // reset count after a full iterations through the DB. + count_ = 0; + } + } + } +} + +INSTANTIATE_CLASS_FB(DetectionEvaluateLayer); +REGISTER_LAYER_CLASS(DetectionEvaluate); + +} // namespace caffe diff --git a/src/caffe/layers/detection_output_layer.cpp b/src/caffe/layers/detection_output_layer.cpp new file mode 100644 index 00000000000..0e9b619346a --- /dev/null +++ b/src/caffe/layers/detection_output_layer.cpp @@ -0,0 +1,472 @@ +#include +#include // NOLINT(readability/streams) +#include +#include +#include +#include + +#include "boost/filesystem.hpp" +#include "boost/foreach.hpp" + +#include "caffe/layers/detection_output_layer.hpp" + +namespace caffe { + +template +void DetectionOutputLayer::LayerSetUp(const vector& bottom, + const vector& top) { + const DetectionOutputParameter& detection_output_param = + this->layer_param_.detection_output_param(); + CHECK(detection_output_param.has_num_classes()) << "Must specify num_classes"; + num_classes_ = detection_output_param.num_classes(); + share_location_ = detection_output_param.share_location(); + num_loc_classes_ = share_location_ ? 
1 : num_classes_; + background_label_id_ = detection_output_param.background_label_id(); + code_type_ = detection_output_param.code_type(); + variance_encoded_in_target_ = + detection_output_param.variance_encoded_in_target(); + keep_top_k_ = detection_output_param.keep_top_k(); + confidence_threshold_ = detection_output_param.has_confidence_threshold() ? + detection_output_param.confidence_threshold() : -FLT_MAX; + // Parameters used in nms. + nms_threshold_ = detection_output_param.nms_param().nms_threshold(); + CHECK_GE(nms_threshold_, 0.) << "nms_threshold must be non negative."; + eta_ = detection_output_param.nms_param().eta(); + CHECK_GT(eta_, 0.); + CHECK_LE(eta_, 1.); + top_k_ = -1; + if (detection_output_param.nms_param().has_top_k()) { + top_k_ = detection_output_param.nms_param().top_k(); + } + const SaveOutputParameter& save_output_param = + detection_output_param.save_output_param(); + output_directory_ = save_output_param.output_directory(); + if (!output_directory_.empty()) { + if (boost::filesystem::is_directory(output_directory_)) { + boost::filesystem::remove_all(output_directory_); + } + if (!boost::filesystem::create_directories(output_directory_)) { + LOG(WARNING) << "Failed to create directory: " << output_directory_; + } + } + output_name_prefix_ = save_output_param.output_name_prefix(); + need_save_ = output_directory_ == "" ? false : true; + output_format_ = save_output_param.output_format(); + if (save_output_param.has_label_map_file()) { + string label_map_file = save_output_param.label_map_file(); + if (label_map_file.empty()) { + // Ignore saving if there is no label_map_file provided. 
+ LOG(WARNING) << "Provide label_map_file if output results to files."; + need_save_ = false; + } else { + LabelMap label_map; + CHECK(ReadProtoFromTextFile(label_map_file, &label_map)) + << "Failed to read label map file: " << label_map_file; + CHECK(MapLabelToName(label_map, true, &label_to_name_)) + << "Failed to convert label to name."; + CHECK(MapLabelToDisplayName(label_map, true, &label_to_display_name_)) + << "Failed to convert label to display name."; + } + } else { + need_save_ = false; + } + if (save_output_param.has_name_size_file()) { + string name_size_file = save_output_param.name_size_file(); + if (name_size_file.empty()) { + // Ignore saving if there is no name_size_file provided. + LOG(WARNING) << "Provide name_size_file if output results to files."; + need_save_ = false; + } else { + std::ifstream infile(name_size_file.c_str()); + CHECK(infile.good()) + << "Failed to open name size file: " << name_size_file; + // The file is in the following format: + // name height width + // ... 
+ string name; + int height, width; + while (infile >> name >> height >> width) { + names_.push_back(name); + sizes_.push_back(std::make_pair(height, width)); + } + infile.close(); + if (save_output_param.has_num_test_image()) { + num_test_image_ = save_output_param.num_test_image(); + } else { + num_test_image_ = names_.size(); + } + CHECK_LE(num_test_image_, names_.size()); + } + } else { + need_save_ = false; + } + has_resize_ = save_output_param.has_resize_param(); + if (has_resize_) { + resize_param_ = save_output_param.resize_param(); + } + name_count_ = 0; + visualize_ = detection_output_param.visualize(); + if (visualize_) { + visualize_threshold_ = 0.6; + if (detection_output_param.has_visualize_threshold()) { + visualize_threshold_ = detection_output_param.visualize_threshold(); + } + data_transformer_.reset(new DataTransformer(this->layer_param_.transform_param(), + this->phase_)); + data_transformer_->InitRand(); + save_file_ = detection_output_param.save_file(); + } + bbox_preds_.ReshapeLike(*(bottom[0])); + if (!share_location_) { + bbox_permute_.ReshapeLike(*(bottom[0])); + } + conf_permute_.ReshapeLike(*(bottom[1])); +} + +template +void DetectionOutputLayer::Reshape(const vector& bottom, + const vector& top) { + if (need_save_) { + CHECK_LE(name_count_, names_.size()); + if (name_count_ % num_test_image_ == 0) { + // Clean all outputs. 
+ if (output_format_ == "VOC") { + boost::filesystem::path output_directory(output_directory_); + for (map::iterator it = label_to_name_.begin(); + it != label_to_name_.end(); ++it) { + if (it->first == background_label_id_) { + continue; + } + std::ofstream outfile; + boost::filesystem::path file( + output_name_prefix_ + it->second + ".txt"); + boost::filesystem::path out_file = output_directory / file; + outfile.open(out_file.string().c_str(), std::ofstream::out); + } + } + } + } + CHECK_EQ(bottom[0]->num(), bottom[1]->num()); + if (bbox_preds_.num() != bottom[0]->num() || + bbox_preds_.count(1) != bottom[0]->count(1)) { + bbox_preds_.ReshapeLike(*(bottom[0])); + } + if (!share_location_ && (bbox_permute_.num() != bottom[0]->num() || + bbox_permute_.count(1) != bottom[0]->count(1))) { + bbox_permute_.ReshapeLike(*(bottom[0])); + } + if (conf_permute_.num() != bottom[1]->num() || + conf_permute_.count(1) != bottom[1]->count(1)) { + conf_permute_.ReshapeLike(*(bottom[1])); + } + num_priors_ = bottom[2]->height() / 4; + CHECK_EQ(num_priors_ * num_loc_classes_ * 4, bottom[0]->channels()) + << "Number of priors must match number of location predictions."; + CHECK_EQ(num_priors_ * num_classes_, bottom[1]->channels()) + << "Number of priors must match number of confidence predictions."; + // num() and channels() are 1. + vector top_shape(2, 1); + // Since the number of bboxes to be kept is unknown before nms, we manually + // set it to (fake) 1. + top_shape.push_back(1); + // Each row is a 7 dimension vector, which stores + // [image_id, label, confidence, xmin, ymin, xmax, ymax] + top_shape.push_back(7); + top[0]->Reshape(top_shape); +} + +template +void DetectionOutputLayer::Forward_cpu( + const vector& bottom, const vector& top) { + const Ftype* loc_data = bottom[0]->cpu_data(); + const Ftype* conf_data = bottom[1]->cpu_data(); + const Ftype* prior_data = bottom[2]->cpu_data(); + const int num = bottom[0]->num(); + + // Retrieve all location predictions. 
+ vector all_loc_preds; + GetLocPredictions(loc_data, num, num_priors_, num_loc_classes_, + share_location_, &all_loc_preds); + + // Retrieve all confidences. + vector > > all_conf_scores; + GetConfidenceScores(conf_data, num, num_priors_, num_classes_, + &all_conf_scores); + + // Retrieve all prior bboxes. It is same within a batch since we assume all + // images in a batch are of same dimension. + vector prior_bboxes; + vector > prior_variances; + GetPriorBBoxes(prior_data, num_priors_, &prior_bboxes, &prior_variances); + + // Decode all loc predictions to bboxes. + vector all_decode_bboxes; + const bool clip_bbox = false; + DecodeBBoxesAll(all_loc_preds, prior_bboxes, prior_variances, num, + share_location_, num_loc_classes_, background_label_id_, + code_type_, variance_encoded_in_target_, clip_bbox, + &all_decode_bboxes); + + int num_kept = 0; + vector > > all_indices; + for (int i = 0; i < num; ++i) { + const LabelBBox& decode_bboxes = all_decode_bboxes[i]; + const map >& conf_scores = all_conf_scores[i]; + map > indices; + int num_det = 0; + for (int c = 0; c < num_classes_; ++c) { + if (c == background_label_id_) { + // Ignore background class. + continue; + } + if (conf_scores.find(c) == conf_scores.end()) { + // Something bad happened if there are no predictions for current label. + LOG(FATAL) << "Could not find confidence predictions for label " << c; + } + const vector& scores = conf_scores.find(c)->second; + int label = share_location_ ? -1 : c; + if (decode_bboxes.find(label) == decode_bboxes.end()) { + // Something bad happened if there are no predictions for current label. 
+ LOG(FATAL) << "Could not find location predictions for label " << label; + continue; + } + const vector& bboxes = decode_bboxes.find(label)->second; + ApplyNMSFast(bboxes, scores, confidence_threshold_, nms_threshold_, eta_, + top_k_, &(indices[c])); + num_det += indices[c].size(); + } + if (keep_top_k_ > -1 && num_det > keep_top_k_) { + vector > > score_index_pairs; + for (map >::iterator it = indices.begin(); + it != indices.end(); ++it) { + int label = it->first; + const vector& label_indices = it->second; + if (conf_scores.find(label) == conf_scores.end()) { + // Something bad happened for current label. + LOG(FATAL) << "Could not find location predictions for " << label; + continue; + } + const vector& scores = conf_scores.find(label)->second; + for (int j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + CHECK_LT(idx, scores.size()); + score_index_pairs.push_back(std::make_pair( + scores[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend >); + score_index_pairs.resize(keep_top_k_); + // Store the new indices. + map > new_indices; + for (int j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + all_indices.push_back(new_indices); + num_kept += keep_top_k_; + } else { + all_indices.push_back(indices); + num_kept += num_det; + } + } + + vector top_shape(2, 1); + top_shape.push_back(num_kept); + top_shape.push_back(7); + Ftype* top_data; + if (num_kept == 0) { + LOG(INFO) << "Couldn't find any detections"; + top_shape[2] = num; + top[0]->Reshape(top_shape); + top_data = top[0]->mutable_cpu_data(); + caffe_set(top[0]->count(), static_cast(-1), top_data); + // Generate fake results per image. 
+ for (int i = 0; i < num; ++i) { + top_data[0] = i; + top_data += 7; + } + } else { + top[0]->Reshape(top_shape); + top_data = top[0]->mutable_cpu_data(); + } + + int count = 0; + boost::filesystem::path output_directory(output_directory_); + for (int i = 0; i < num; ++i) { + const map >& conf_scores = all_conf_scores[i]; + const LabelBBox& decode_bboxes = all_decode_bboxes[i]; + for (map >::iterator it = all_indices[i].begin(); + it != all_indices[i].end(); ++it) { + int label = it->first; + if (conf_scores.find(label) == conf_scores.end()) { + // Something bad happened if there are no predictions for current label. + LOG(FATAL) << "Could not find confidence predictions for " << label; + continue; + } + const vector& scores = conf_scores.find(label)->second; + int loc_label = share_location_ ? -1 : label; + if (decode_bboxes.find(loc_label) == decode_bboxes.end()) { + // Something bad happened if there are no predictions for current label. + LOG(FATAL) << "Could not find location predictions for " << loc_label; + continue; + } + const vector& bboxes = + decode_bboxes.find(loc_label)->second; + vector& indices = it->second; + if (need_save_) { + CHECK(label_to_name_.find(label) != label_to_name_.end()) + << "Cannot find label: " << label << " in the label map."; + CHECK_LT(name_count_, names_.size()); + } + for (int j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + top_data[count * 7] = i; + top_data[count * 7 + 1] = label; + top_data[count * 7 + 2] = scores[idx]; + const NormalizedBBox& bbox = bboxes[idx]; + top_data[count * 7 + 3] = bbox.xmin(); + top_data[count * 7 + 4] = bbox.ymin(); + top_data[count * 7 + 5] = bbox.xmax(); + top_data[count * 7 + 6] = bbox.ymax(); + if (need_save_) { + NormalizedBBox out_bbox; + OutputBBox(bbox, sizes_[name_count_], has_resize_, resize_param_, + &out_bbox); + float score = top_data[count * 7 + 2]; + float xmin = out_bbox.xmin(); + float ymin = out_bbox.ymin(); + float xmax = out_bbox.xmax(); + float ymax = 
out_bbox.ymax(); + ptree pt_xmin, pt_ymin, pt_width, pt_height; + pt_xmin.put("", round(xmin * 100) / 100.); + pt_ymin.put("", round(ymin * 100) / 100.); + pt_width.put("", round((xmax - xmin) * 100) / 100.); + pt_height.put("", round((ymax - ymin) * 100) / 100.); + + ptree cur_bbox; + cur_bbox.push_back(std::make_pair("", pt_xmin)); + cur_bbox.push_back(std::make_pair("", pt_ymin)); + cur_bbox.push_back(std::make_pair("", pt_width)); + cur_bbox.push_back(std::make_pair("", pt_height)); + + ptree cur_det; + cur_det.put("image_id", names_[name_count_]); + if (output_format_ == "ILSVRC") { + cur_det.put("category_id", label); + } else { + cur_det.put("category_id", label_to_name_[label].c_str()); + } + cur_det.add_child("bbox", cur_bbox); + cur_det.put("score", score); + + detections_.push_back(std::make_pair("", cur_det)); + } + ++count; + } + } + if (need_save_) { + ++name_count_; + if (name_count_ % num_test_image_ == 0) { + if (output_format_ == "VOC") { + map outfiles; + for (int c = 0; c < num_classes_; ++c) { + if (c == background_label_id_) { + continue; + } + string label_name = label_to_name_[c]; + boost::filesystem::path file( + output_name_prefix_ + label_name + ".txt"); + boost::filesystem::path out_file = output_directory / file; + outfiles[label_name] = new std::ofstream(out_file.string().c_str(), + std::ofstream::out); + } + BOOST_FOREACH(ptree::value_type &det, detections_.get_child("")) { + ptree pt = det.second; + string label_name = pt.get("category_id"); + if (outfiles.find(label_name) == outfiles.end()) { + std::cout << "Cannot find " << label_name << std::endl; + continue; + } + string image_name = pt.get("image_id"); + float score = pt.get("score"); + vector bbox; + BOOST_FOREACH(ptree::value_type &elem, pt.get_child("bbox")) { + bbox.push_back(static_cast(elem.second.get_value())); + } + *(outfiles[label_name]) << image_name; + *(outfiles[label_name]) << " " << score; + *(outfiles[label_name]) << " " << bbox[0] << " " << bbox[1]; + 
*(outfiles[label_name]) << " " << bbox[0] + bbox[2]; + *(outfiles[label_name]) << " " << bbox[1] + bbox[3]; + *(outfiles[label_name]) << std::endl; + } + for (int c = 0; c < num_classes_; ++c) { + if (c == background_label_id_) { + continue; + } + string label_name = label_to_name_[c]; + outfiles[label_name]->flush(); + outfiles[label_name]->close(); + delete outfiles[label_name]; + } + } else if (output_format_ == "COCO") { + boost::filesystem::path output_directory(output_directory_); + boost::filesystem::path file(output_name_prefix_ + ".json"); + boost::filesystem::path out_file = output_directory / file; + std::ofstream outfile; + outfile.open(out_file.string().c_str(), std::ofstream::out); + + boost::regex exp("\"(null|true|false|-?[0-9]+(\\.[0-9]+)?)\""); + ptree output; + output.add_child("detections", detections_); + std::stringstream ss; +#ifdef WRITE_JSON_SUPPORTED + write_json(ss, output); +#endif + std::string rv = boost::regex_replace(ss.str(), exp, "$1"); + outfile << rv.substr(rv.find("["), rv.rfind("]") - rv.find("[")) + << std::endl << "]" << std::endl; + } else if (output_format_ == "ILSVRC") { + boost::filesystem::path output_directory(output_directory_); + boost::filesystem::path file(output_name_prefix_ + ".txt"); + boost::filesystem::path out_file = output_directory / file; + std::ofstream outfile; + outfile.open(out_file.string().c_str(), std::ofstream::out); + + BOOST_FOREACH(ptree::value_type &det, detections_.get_child("")) { + ptree pt = det.second; + int label = pt.get("category_id"); + string image_name = pt.get("image_id"); + float score = pt.get("score"); + vector bbox; + BOOST_FOREACH(ptree::value_type &elem, pt.get_child("bbox")) { + bbox.push_back(static_cast(elem.second.get_value())); + } + outfile << image_name << " " << label << " " << score; + outfile << " " << bbox[0] << " " << bbox[1]; + outfile << " " << bbox[0] + bbox[2]; + outfile << " " << bbox[1] + bbox[3]; + outfile << std::endl; + } + } + name_count_ = 0; + 
detections_.clear(); + } + } + } + if (visualize_) { + vector cv_imgs; + this->data_transformer_->TransformInv(bottom[3], &cv_imgs); + vector colors = GetColors(label_to_display_name_.size()); + VisualizeBBox(cv_imgs, top[0], visualize_threshold_, colors, + label_to_display_name_, save_file_); + } +} + +INSTANTIATE_CLASS_FB(DetectionOutputLayer); +REGISTER_LAYER_CLASS(DetectionOutput); + +} // namespace caffe diff --git a/src/caffe/layers/detection_output_layer.cu b/src/caffe/layers/detection_output_layer.cu new file mode 100644 index 00000000000..1b02185e07f --- /dev/null +++ b/src/caffe/layers/detection_output_layer.cu @@ -0,0 +1,306 @@ +#include +#include +#include +#include // NOLINT(readability/streams) +#include +#include +#include +#include + +#include "caffe/layers/detection_output_layer.hpp" + +#include +#include + +namespace caffe { + +template +void DetectionOutputLayer::Forward_gpu( + const vector& bottom, const vector& top) { + const Ftype* loc_data = bottom[0]->gpu_data(); + const Ftype* prior_data = bottom[2]->gpu_data(); + const int num = bottom[0]->num(); + + // Decode predictions. + Ftype* bbox_data = bbox_preds_.mutable_gpu_data(); + const int loc_count = bbox_preds_.count(); + const bool clip_bbox = false; + DecodeBBoxesGPU(loc_count, loc_data, prior_data, code_type_, + variance_encoded_in_target_, num_priors_, share_location_, + num_loc_classes_, background_label_id_, clip_bbox, bbox_data); + // Retrieve all decoded location predictions. + const Ftype* bbox_cpu_data; + if (!share_location_) { + Ftype* bbox_permute_data = bbox_permute_.mutable_gpu_data(); + PermuteDataGPU(loc_count, bbox_data, num_loc_classes_, num_priors_, + 4, bbox_permute_data); + bbox_cpu_data = bbox_permute_.cpu_data(); + } else { + bbox_cpu_data = bbox_preds_.cpu_data(); + } + + // Retrieve all confidences. 
+ Ftype* conf_permute_data = conf_permute_.mutable_gpu_data(); + PermuteDataGPU(bottom[1]->count(), bottom[1]->gpu_data(), + num_classes_, num_priors_, 1, conf_permute_data); + const Ftype* conf_cpu_data = conf_permute_.cpu_data(); + + int num_kept = 0; + vector > > all_indices; + for (int i = 0; i < num; ++i) { + map > indices; + int num_det = 0; + const int conf_idx = i * num_classes_ * num_priors_; + int bbox_idx; + if (share_location_) { + bbox_idx = i * num_priors_ * 4; + } else { + bbox_idx = conf_idx * 4; + } + for (int c = 0; c < num_classes_; ++c) { + if (c == background_label_id_) { + // Ignore background class. + continue; + } + const Ftype* cur_conf_data = conf_cpu_data + conf_idx + c * num_priors_; + const Ftype* cur_bbox_data = bbox_cpu_data + bbox_idx; + if (!share_location_) { + cur_bbox_data += c * num_priors_ * 4; + } + ApplyNMSFast(cur_bbox_data, cur_conf_data, num_priors_, + confidence_threshold_, nms_threshold_, eta_, top_k_, &(indices[c])); + num_det += indices[c].size(); + } + if (keep_top_k_ > -1 && num_det > keep_top_k_) { + vector > > score_index_pairs; + for (map >::iterator it = indices.begin(); + it != indices.end(); ++it) { + int label = it->first; + const vector& label_indices = it->second; + for (int j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + float score = conf_cpu_data[conf_idx + label * num_priors_ + idx]; + score_index_pairs.push_back(std::make_pair( + score, std::make_pair(label, idx))); + } + } + // Keep top k results per image. +// std::sort(score_index_pairs.begin(), score_index_pairs.end(), +// SortScorePairDescend >); +// score_index_pairs.resize(keep_top_k_); + std::partial_sort( + score_index_pairs.begin(), score_index_pairs.begin() + + std::min(score_index_pairs.size(), (size_t)keep_top_k_), + score_index_pairs.end(), SortScorePairDescend>); + // Store the new indices. 
+ map > new_indices; + for (int j = 0; j < keep_top_k_; ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + all_indices.push_back(new_indices); + num_kept += keep_top_k_; + } else { + all_indices.push_back(indices); + num_kept += num_det; + } + } + + vector top_shape(2, 1); + top_shape.push_back(num_kept); + top_shape.push_back(7); + Ftype* top_data; + if (num_kept == 0) { + LOG(INFO) << "Couldn't find any detections"; + top_shape[2] = num; + top[0]->Reshape(top_shape); + top_data = top[0]->mutable_cpu_data(); + caffe_set(top[0]->count(), -1, top_data); + // Generate fake results per image. + for (int i = 0; i < num; ++i) { + top_data[0] = i; + top_data += 7; + } + } else { + top[0]->Reshape(top_shape); + top_data = top[0]->mutable_cpu_data(); + } + + int count = 0; + boost::filesystem::path output_directory(output_directory_); + for (int i = 0; i < num; ++i) { + const int conf_idx = i * num_classes_ * num_priors_; + int bbox_idx; + if (share_location_) { + bbox_idx = i * num_priors_ * 4; + } else { + bbox_idx = conf_idx * 4; + } + for (map >::iterator it = all_indices[i].begin(); + it != all_indices[i].end(); ++it) { + int label = it->first; + vector& indices = it->second; + if (need_save_) { + CHECK(label_to_name_.find(label) != label_to_name_.end()) + << "Cannot find label: " << label << " in the label map."; + CHECK_LT(name_count_, names_.size()); + } + const Ftype* cur_conf_data = + conf_cpu_data + conf_idx + label * num_priors_; + const Ftype* cur_bbox_data = bbox_cpu_data + bbox_idx; + if (!share_location_) { + cur_bbox_data += label * num_priors_ * 4; + } + for (int j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + top_data[count * 7] = i; + top_data[count * 7 + 1] = label; + top_data[count * 7 + 2] = cur_conf_data[idx]; + for (int k = 0; k < 4; ++k) { + top_data[count * 7 + 3 + k] = cur_bbox_data[idx * 4 + k]; + } + if (need_save_) { + // Generate 
output bbox. + NormalizedBBox bbox; + bbox.set_xmin(top_data[count * 7 + 3]); + bbox.set_ymin(top_data[count * 7 + 4]); + bbox.set_xmax(top_data[count * 7 + 5]); + bbox.set_ymax(top_data[count * 7 + 6]); + NormalizedBBox out_bbox; + OutputBBox(bbox, sizes_[name_count_], has_resize_, resize_param_, + &out_bbox); + float score = top_data[count * 7 + 2]; + float xmin = out_bbox.xmin(); + float ymin = out_bbox.ymin(); + float xmax = out_bbox.xmax(); + float ymax = out_bbox.ymax(); + ptree pt_xmin, pt_ymin, pt_width, pt_height; + pt_xmin.put("", round(xmin * 100) / 100.); + pt_ymin.put("", round(ymin * 100) / 100.); + pt_width.put("", round((xmax - xmin) * 100) / 100.); + pt_height.put("", round((ymax - ymin) * 100) / 100.); + + ptree cur_bbox; + cur_bbox.push_back(std::make_pair("", pt_xmin)); + cur_bbox.push_back(std::make_pair("", pt_ymin)); + cur_bbox.push_back(std::make_pair("", pt_width)); + cur_bbox.push_back(std::make_pair("", pt_height)); + + ptree cur_det; + cur_det.put("image_id", names_[name_count_]); + if (output_format_ == "ILSVRC") { + cur_det.put("category_id", label); + } else { + cur_det.put("category_id", label_to_name_[label].c_str()); + } + cur_det.add_child("bbox", cur_bbox); + cur_det.put("score", score); + + detections_.push_back(std::make_pair("", cur_det)); + } + ++count; + } + } + if (need_save_) { + ++name_count_; + if (name_count_ % num_test_image_ == 0) { + if (output_format_ == "VOC") { + map outfiles; + for (int c = 0; c < num_classes_; ++c) { + if (c == background_label_id_) { + continue; + } + string label_name = label_to_name_[c]; + boost::filesystem::path file( + output_name_prefix_ + label_name + ".txt"); + boost::filesystem::path out_file = output_directory / file; + outfiles[label_name] = new std::ofstream(out_file.string().c_str(), + std::ofstream::out); + } + BOOST_FOREACH(ptree::value_type &det, detections_.get_child("")) { + ptree pt = det.second; + string label_name = pt.get("category_id"); + if (outfiles.find(label_name) == 
outfiles.end()) { + std::cout << "Cannot find " << label_name << std::endl; + continue; + } + string image_name = pt.get("image_id"); + float score = pt.get("score"); + vector bbox; + BOOST_FOREACH(ptree::value_type &elem, pt.get_child("bbox")) { + bbox.push_back(static_cast(elem.second.get_value())); + } + *(outfiles[label_name]) << image_name; + *(outfiles[label_name]) << " " << score; + *(outfiles[label_name]) << " " << bbox[0] << " " << bbox[1]; + *(outfiles[label_name]) << " " << bbox[0] + bbox[2]; + *(outfiles[label_name]) << " " << bbox[1] + bbox[3]; + *(outfiles[label_name]) << std::endl; + } + for (int c = 0; c < num_classes_; ++c) { + if (c == background_label_id_) { + continue; + } + string label_name = label_to_name_[c]; + outfiles[label_name]->flush(); + outfiles[label_name]->close(); + delete outfiles[label_name]; + } + } else if (output_format_ == "COCO") { + boost::filesystem::path output_directory(output_directory_); + boost::filesystem::path file(output_name_prefix_ + ".json"); + boost::filesystem::path out_file = output_directory / file; + std::ofstream outfile; + outfile.open(out_file.string().c_str(), std::ofstream::out); + + boost::regex exp("\"(null|true|false|-?[0-9]+(\\.[0-9]+)?)\""); + ptree output; + output.add_child("detections", detections_); + std::stringstream ss; +#ifdef WRITE_JSON_SUPPORTED + write_json(ss, output); +#endif + std::string rv = boost::regex_replace(ss.str(), exp, "$1"); + outfile << rv.substr(rv.find("["), rv.rfind("]") - rv.find("[")) + << std::endl << "]" << std::endl; + } else if (output_format_ == "ILSVRC") { + boost::filesystem::path output_directory(output_directory_); + boost::filesystem::path file(output_name_prefix_ + ".txt"); + boost::filesystem::path out_file = output_directory / file; + std::ofstream outfile; + outfile.open(out_file.string().c_str(), std::ofstream::out); + + BOOST_FOREACH(ptree::value_type &det, detections_.get_child("")) { + ptree pt = det.second; + int label = pt.get("category_id"); + 
string image_name = pt.get("image_id"); + float score = pt.get("score"); + vector bbox; + BOOST_FOREACH(ptree::value_type &elem, pt.get_child("bbox")) { + bbox.push_back(static_cast(elem.second.get_value())); + } + outfile << image_name << " " << label << " " << score; + outfile << " " << bbox[0] << " " << bbox[1]; + outfile << " " << bbox[0] + bbox[2]; + outfile << " " << bbox[1] + bbox[3]; + outfile << std::endl; + } + } + name_count_ = 0; + detections_.clear(); + } + } + } + if (visualize_) { + vector cv_imgs; + this->data_transformer_->TransformInv(bottom[3], &cv_imgs); + vector colors = GetColors(label_to_display_name_.size()); + VisualizeBBox(cv_imgs, top[0], visualize_threshold_, colors, + label_to_display_name_, save_file_); + } +} + +INSTANTIATE_LAYER_GPU_FUNCS_FB(DetectionOutputLayer); + +} // namespace caffe diff --git a/src/caffe/layers/dropout_layer.cu b/src/caffe/layers/dropout_layer.cu index 5d46514af9c..d3703afea3d 100644 --- a/src/caffe/layers/dropout_layer.cu +++ b/src/caffe/layers/dropout_layer.cu @@ -11,11 +11,7 @@ __global__ void DropoutForward(const int n, const Dtype* in, const unsigned int* mask, const unsigned int threshold, const float scale, Dtype* out) { CUDA_KERNEL_LOOP(index, n) { - // out[index] = in[index] * (mask[index] > threshold) * scale; - if (mask[index] > threshold) - out[index] = Dtype(static_cast(in[index]) * scale); - else - out[index] = 0.; + out[index] = in[index] * (mask[index] > threshold ? 
Dtype(1) : Dtype(0)) * scale; } } @@ -36,7 +32,7 @@ DropoutLayer::Forward_gpu(const vector& bottom, const vecto CUDA_POST_KERNEL_CHECK; CUDA_CHECK(cudaStreamSynchronize(stream)); } else { - caffe_copy(count, bottom_data, top_data); + caffe_copy(count, bottom_data, top_data); } } @@ -44,24 +40,19 @@ template __global__ void DropoutBackward(const int n, const Dtype* in_diff, const unsigned int* mask, const unsigned int threshold, const float scale, Dtype* out_diff) { CUDA_KERNEL_LOOP(index, n) { - // out_diff[index] = in_diff[index] * (mask[index] > threshold) * scale; - if (mask[index] > threshold) - out_diff[index] = Dtype(static_cast(in_diff[index]) * scale); - else - out_diff[index] = 0.; + out_diff[index] = in_diff[index] * (mask[index] > threshold ? Dtype(1) : Dtype(0)) * scale; } } template void DropoutLayer::Backward_gpu(const vector& top, const vector& propagate_down, const vector& bottom) { - const Btype* top_diff = top[0]->gpu_diff(); - Btype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (propagate_down[0]) { + const Btype* top_diff = top[0]->gpu_diff(); + Btype* bottom_diff = bottom[0]->mutable_gpu_diff(); if (this->phase_ == TRAIN) { // Needed for TEST cudaStream_t stream = Caffe::thread_stream(); - const unsigned int* mask = static_cast(rand_vec_.gpu_data()); + const unsigned int* mask = rand_vec_.gpu_data(); const int count = bottom[0]->count(); // NOLINT_NEXT_LINE(whitespace/operators) DropoutBackward<<>> diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index ba04cae21fd..1b37d80c177 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -2,7 +2,7 @@ #include #include "caffe/layers/eltwise_layer.hpp" -#include "caffe/util/math_functions.hpp" +#include "caffe/net.hpp" namespace caffe { @@ -30,7 +30,9 @@ void EltwiseLayer::LayerSetUp(const vector& bottom, template void EltwiseLayer::Reshape(const vector& bottom, const vector& top) { - no_coeffs_ = true; + const Net* pnet = 
this->parent_net(); + // Inner nets are usually cyclic + no_coeffs_ = pnet != nullptr && !pnet->inner_net(); for (int i = 0; i < bottom.size(); ++i) { no_coeffs_ = no_coeffs_ && coeffs_[i] == 1.F; } @@ -44,7 +46,10 @@ void EltwiseLayer::Reshape(const vector& bottom, max_idx_.Reshape(bottom[0]->shape()); } if (op_ == EltwiseParameter_EltwiseOp_SUM && no_coeffs_) { - bottom[0]->ShareDiff(*top[0]); + for (int i = 0; i < bottom.size(); ++i) { + bottom[i]->ShareDiff(*top[0]); + } + top[0]->ShareData(*bottom[0]); } } @@ -64,10 +69,16 @@ void EltwiseLayer::Forward_cpu( } break; case EltwiseParameter_EltwiseOp_SUM: - caffe_set(count, Ftype(0), top_data); - // TODO(shelhamer) does BLAS optimize to sum for coeff = 1? - for (int i = 0; i < bottom.size(); ++i) { - caffe_axpy(count, Ftype(coeffs_[i]), bottom[i]->cpu_data(), top_data); + if (no_coeffs_) { + for (int i = 1; i < bottom.size(); ++i) { + caffe_axpy(count, Ftype(1), bottom[i]->cpu_data(), top_data); + } + } else { + caffe_set(count, Ftype(0), top_data); + // TODO(shelhamer) does BLAS optimize to sum for coeff = 1? 
+ for (int i = 0; i < bottom.size(); ++i) { + caffe_axpy(count, Ftype(coeffs_[i]), bottom[i]->cpu_data(), top_data); + } } break; case EltwiseParameter_EltwiseOp_MAX: @@ -134,11 +145,7 @@ void EltwiseLayer::Backward_cpu(const vector& top, caffe_mul(count, bottom_diff, top_diff, bottom_diff); break; case EltwiseParameter_EltwiseOp_SUM: - if (no_coeffs_) { - if (i > 0) { - caffe_copy(count, top_diff, bottom_diff); - } - } else { + if (!no_coeffs_) { caffe_cpu_scale(count, Btype(coeffs_[i]), top_diff, bottom_diff); } break; diff --git a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/eltwise_layer.cu index acc22e24be8..fb469ac7e70 100644 --- a/src/caffe/layers/eltwise_layer.cu +++ b/src/caffe/layers/eltwise_layer.cu @@ -47,13 +47,8 @@ void EltwiseLayer::Forward_gpu(const vector& bottom, break; case EltwiseParameter_EltwiseOp_SUM: if (no_coeffs_) { - if (bottom.size() >= 2) { - caffe_gpu_add(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), top_data); - for (int i = 2; i < bottom.size(); ++i) { - caffe_gpu_incr(count, bottom[i]->gpu_data(), top_data); - } - } else if (bottom.size() == 1) { - caffe_copy(count, bottom[0]->gpu_data(), top_data); + for (int i = 1; i < bottom.size(); ++i) { + caffe_gpu_incr(count, bottom[i]->gpu_data(), top_data); } } else { caffe_gpu_set(count, Ftype(0.), top_data); @@ -127,11 +122,7 @@ void EltwiseLayer::Backward_gpu(const vector& top, } break; case EltwiseParameter_EltwiseOp_SUM: - if (no_coeffs_) { - if (i > 0) { - caffe_copy(count, top_diff, bottom[i]->mutable_gpu_diff()); - } - } else { + if (!no_coeffs_) { caffe_gpu_scale(count, Btype(coeffs_[i]), top_diff, bottom[i]->mutable_gpu_diff()); } break; diff --git a/src/caffe/layers/embed_layer.cu b/src/caffe/layers/embed_layer.cu index 8fa153ff855..bf721244931 100644 --- a/src/caffe/layers/embed_layer.cu +++ b/src/caffe/layers/embed_layer.cu @@ -9,26 +9,19 @@ namespace caffe { template __global__ void EmbedForward(const int nthreads, const Dtype* bottom_data, - const Dtype* 
weight, const int M, const int N, const int K, - Dtype* top_data) { + const Dtype* weight, const int N, Dtype* top_data) { CUDA_KERNEL_LOOP(top_index, nthreads) { const int n = top_index / N; const int d = top_index % N; - const int index = static_cast(static_cast(bottom_data[n])); - const int weight_index = index * N + d; + const int index = static_cast(bottom_data[n]); + const int weight_index = abs(index * N + d); top_data[top_index] = weight[weight_index]; } } template __global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data, - const Dtype* top_diff, const int M, const int N, const int K, - Dtype* weight_diff); - -template -__global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data, - const Dtype* top_diff, const int M, const int N, const int K, - Dtype* weight_diff) { + const Dtype* top_diff, const int N, Dtype* weight_diff) { CUDA_KERNEL_LOOP(top_index, nthreads) { const int n = top_index / N; const int d = top_index % N; @@ -45,15 +38,16 @@ void EmbedLayer::Forward_gpu(const vector& bottom, Ftype* top_data = top[0]->mutable_gpu_data(); const Ftype* weight = this->blobs_[0]->template gpu_data(); const int count = top[0]->count(); + cudaStream_t stream = Caffe::thread_stream(); EmbedForward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - count, bottom_data, weight, M_, N_, K_, top_data); + <<>>( + count, bottom_data, weight, N_, top_data); + CUDA_CHECK(cudaStreamSynchronize(stream)); if (bias_term_) { caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, Ftype(1), bias_multiplier_.template gpu_data(), this->blobs_[1]->template gpu_data(), Ftype(1), top_data); } - CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream())); } template @@ -65,10 +59,11 @@ void EmbedLayer::Backward_gpu(const vector& top, const Btype* top_diff = top[0]->gpu_diff(); const Btype* bottom_data = bottom[0]->gpu_data(); Btype* weight_diff = this->blobs_[0]->template mutable_gpu_diff(); + cudaStream_t stream = Caffe::thread_stream(); EmbedBackward // 
NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - top_count, bottom_data, top_diff, M_, N_, K_, weight_diff); - CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream())); + <<>>( + top_count, bottom_data, top_diff, N_, weight_diff); + CUDA_CHECK(cudaStreamSynchronize(stream)); } if (bias_term_ && this->param_propagate_down_[1]) { const Btype* top_diff = top[0]->gpu_diff(); diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp index c5025c27007..18ede195854 100644 --- a/src/caffe/layers/flatten_layer.cpp +++ b/src/caffe/layers/flatten_layer.cpp @@ -23,7 +23,11 @@ void FlattenLayer::Reshape(const vector& bottom, top_shape.push_back(bottom[0]->shape(i)); } top[0]->Reshape(top_shape); + bottom[0]->cpu_data(); + top[0]->cpu_data(); CHECK_EQ(top[0]->count(), bottom[0]->count()); + top[0]->ShareData(*bottom[0]); + bottom[0]->ShareDiff(*top[0]); } template diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 6f778e9cb07..0c3bfc1fb6e 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -62,10 +62,10 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { // Shuffle if needed. 
if (this->layer_param_.hdf5_data_param().shuffle()) { caffe::shuffle(data_permutation_.begin(), data_permutation_.end()); - DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) + LOG(INFO) << "Successfully loaded " << hdf_blobs_[0]->shape(0) << " rows (shuffled)"; } else { - DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows"; + LOG(INFO) << "Successfully loaded " << hdf_blobs_[0]->shape(0) << " rows"; } } diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index 0ec9c3988e1..fe47e3cd1c0 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -68,6 +68,7 @@ void ImageDataLayer::DataLayerSetUp(const vector& bottom, const string &source = image_data_param.source(); LOG(INFO) << "Opening file " << source; std::ifstream infile(source.c_str()); + CHECK(infile.good()) << "File " << source; string filename; int label; while (infile >> filename >> label) { @@ -206,10 +207,10 @@ void ImageDataLayer::load_batch(Batch* batch, int thread_id, size_ if (cv_img.data) { int offset = batch->data_->offset(item_id); #if defined(USE_CUDNN) - this->dt(thread_id)->Transform(cv_img, prefetch_data + offset, buf_len, false); + this->bdt(thread_id)->Transform(cv_img, prefetch_data + offset, buf_len, false); #else CHECK_EQ(buf_len, tmp.size()); - this->dt(thread_id)->Transform(cv_img, prefetch_data + offset, buf_len, false); + this->bdt(thread_id)->Transform(cv_img, prefetch_data + offset, buf_len, false); hwc2chw(top_shape[1], top_shape[3], top_shape[2], tmp.data(), prefetch_data + offset); packing = NCHW; #endif @@ -242,16 +243,23 @@ void ImageDataLayer::load_batch(Batch* batch, int thread_id, size_ // go to the next iter line_ids_[thread_id] += line_bucket; if (line_ids_[thread_id] >= lines_size) { - // We have reached the end. Restart from the first.
- DLOG(INFO) << this->print_current_device() << " Restarting data prefetching from start."; while (line_ids_[thread_id] >= lines_size) { line_ids_[thread_id] -= lines_size; } - if (thread_id == 0 && this->rank_ == 0 && shuffle) { - LOG(INFO) << "Shuffling data"; - ShuffleImages(); - epoch_count_ += lines_size; - Caffe::report_epoch_count(epoch_count_); + if (thread_id == 0 && this->rank_ == 0) { + if (this->phase_ == TRAIN) { + // We have reached the end. Restart from the first. + LOG(INFO) << this->print_current_device() << " Restarting data prefetching (" + << lines_size << ")"; + if (epoch_count_ == 0UL) { + epoch_count_ += lines_size; + Caffe::report_epoch_count(epoch_count_); + } + } + if (shuffle) { + LOG(INFO) << "Shuffling data"; + ShuffleImages(); + } } } line_id = line_ids_[thread_id]; diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 56bf2a0574d..7fa4d7c4e65 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -37,7 +37,7 @@ InnerProductLayer::LayerSetUp(const vector& bottom, const v weight_shape[1] = K_; } // CPU filler always 32 bits - this->blobs_[0] = Blob::create(weight_shape); + this->blobs_[0] = Blob::create(weight_shape); shared_ptr> weight_filler( GetFiller(this->layer_param_.inner_product_param().weight_filler())); diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp index 848e101a35a..54fb58306cb 100644 --- a/src/caffe/layers/loss_layer.cpp +++ b/src/caffe/layers/loss_layer.cpp @@ -22,6 +22,37 @@ void LossLayer::Reshape( top[0]->Reshape(loss_shape); } +template +Ftype LossLayer::GetNormalizer( + const LossParameter_NormalizationMode normalization_mode, + const int outer_num, const int inner_num, const int valid_count) { + Dtype normalizer; + switch (normalization_mode) { + case LossParameter_NormalizationMode_FULL: + normalizer = Dtype(outer_num * inner_num); + break; + case LossParameter_NormalizationMode_VALID: + 
if (valid_count == -1) { + normalizer = Dtype(outer_num * inner_num); + } else { + normalizer = Dtype(valid_count); + } + break; + case LossParameter_NormalizationMode_BATCH_SIZE: + normalizer = Dtype(outer_num); + break; + case LossParameter_NormalizationMode_NONE: + normalizer = Dtype(1); + break; + default: + LOG(FATAL) << "Unknown normalization mode: " + << LossParameter_NormalizationMode_Name(normalization_mode); + } + // Some users will have no labels for some examples in order to 'turn off' a + // particular loss in a multi-task setup. The max prevents NaNs in that case. + return std::max(Dtype(1.0), normalizer); +} + INSTANTIATE_CLASS_FB(LossLayer); } // namespace caffe diff --git a/src/caffe/layers/lstm_layer.cpp b/src/caffe/layers/lstm_layer.cpp new file mode 100644 index 00000000000..81eb6f32cac --- /dev/null +++ b/src/caffe/layers/lstm_layer.cpp @@ -0,0 +1,244 @@ +#include +#include + +#include "caffe/common.hpp" +#include "caffe/blob.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/layers/lstm_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void LSTMLayer::RecurrentInputBlobNames(vector* names) const { + names->resize(2); + (*names)[0] = "h_0"; + (*names)[1] = "c_0"; +} + +template +void LSTMLayer::RecurrentOutputBlobNames(vector* names) const { + names->resize(2); + (*names)[0] = "h_" + format_int(this->T_); + (*names)[1] = "c_T"; +} + +template +void LSTMLayer::RecurrentInputShapes(vector* shapes) const { + const int num_output = this->layer_param_.recurrent_param().num_output(); + const int num_blobs = 2; + shapes->resize(num_blobs); + for (int i = 0; i < num_blobs; ++i) { + (*shapes)[i].Clear(); + (*shapes)[i].add_dim(1); // a single timestep + (*shapes)[i].add_dim(this->N_); + (*shapes)[i].add_dim(num_output); + } +} + +template +void LSTMLayer::OutputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "h"; +} + +template +void 
LSTMLayer::FillUnrolledNet(NetParameter* net_param) const { + const int num_output = this->layer_param_.recurrent_param().num_output(); + CHECK_GT(num_output, 0) << "num_output must be positive"; + const FillerParameter& weight_filler = + this->layer_param_.recurrent_param().weight_filler(); + const FillerParameter& bias_filler = + this->layer_param_.recurrent_param().bias_filler(); + + // Add generic LayerParameter's (without bottoms/tops) of layer types we'll + // use to save redundant code. + LayerParameter hidden_param; + hidden_param.set_type("InnerProduct"); + hidden_param.mutable_inner_product_param()->set_num_output(num_output * 4); + hidden_param.mutable_inner_product_param()->set_bias_term(false); + hidden_param.mutable_inner_product_param()->set_axis(2); + hidden_param.mutable_inner_product_param()-> + mutable_weight_filler()->CopyFrom(weight_filler); + + LayerParameter biased_hidden_param(hidden_param); + biased_hidden_param.mutable_inner_product_param()->set_bias_term(true); + biased_hidden_param.mutable_inner_product_param()-> + mutable_bias_filler()->CopyFrom(bias_filler); + + LayerParameter sum_param; + sum_param.set_type("Eltwise"); + sum_param.mutable_eltwise_param()->set_operation( + EltwiseParameter_EltwiseOp_SUM); + + LayerParameter scale_param; + scale_param.set_type("Scale"); + scale_param.mutable_scale_param()->set_axis(0); + + LayerParameter slice_param; + slice_param.set_type("Slice"); + slice_param.mutable_slice_param()->set_axis(0); + + LayerParameter split_param; + split_param.set_type("Split"); + + vector input_shapes; + RecurrentInputShapes(&input_shapes); + CHECK_EQ(2, input_shapes.size()); + + LayerParameter* input_layer_param = net_param->add_layer(); + input_layer_param->set_type("Input"); + InputParameter* input_param = input_layer_param->mutable_input_param(); + + input_layer_param->add_top("c_0"); + input_param->add_shape()->CopyFrom(input_shapes[0]); + + input_layer_param->add_top("h_0"); + 
input_param->add_shape()->CopyFrom(input_shapes[1]); + + LayerParameter* cont_slice_param = net_param->add_layer(); + cont_slice_param->CopyFrom(slice_param); + cont_slice_param->set_name("cont_slice"); + cont_slice_param->add_bottom("cont"); + cont_slice_param->mutable_slice_param()->set_axis(0); + + // Add layer to transform all timesteps of x to the hidden state dimension. + // W_xc_x = W_xc * x + b_c + { + LayerParameter* x_transform_param = net_param->add_layer(); + x_transform_param->CopyFrom(biased_hidden_param); + x_transform_param->set_name("x_transform"); + x_transform_param->add_param()->set_name("W_xc"); + x_transform_param->add_param()->set_name("b_c"); + x_transform_param->add_bottom("x"); + x_transform_param->add_top("W_xc_x"); + x_transform_param->add_propagate_down(true); + } + + if (this->static_input_) { + // Add layer to transform x_static to the gate dimension. + // W_xc_x_static = W_xc_static * x_static + LayerParameter* x_static_transform_param = net_param->add_layer(); + x_static_transform_param->CopyFrom(hidden_param); + x_static_transform_param->mutable_inner_product_param()->set_axis(1); + x_static_transform_param->set_name("W_xc_x_static"); + x_static_transform_param->add_param()->set_name("W_xc_static"); + x_static_transform_param->add_bottom("x_static"); + x_static_transform_param->add_top("W_xc_x_static_preshape"); + x_static_transform_param->add_propagate_down(true); + + LayerParameter* reshape_param = net_param->add_layer(); + reshape_param->set_type("Reshape"); + BlobShape* new_shape = + reshape_param->mutable_reshape_param()->mutable_shape(); + new_shape->add_dim(1); // One timestep. + // Should infer this->N as the dimension so we can reshape on batch size. 
+ new_shape->add_dim(-1); + new_shape->add_dim( + x_static_transform_param->inner_product_param().num_output()); + reshape_param->set_name("W_xc_x_static_reshape"); + reshape_param->add_bottom("W_xc_x_static_preshape"); + reshape_param->add_top("W_xc_x_static"); + } + + LayerParameter* x_slice_param = net_param->add_layer(); + x_slice_param->CopyFrom(slice_param); + x_slice_param->add_bottom("W_xc_x"); + x_slice_param->set_name("W_xc_x_slice"); + + LayerParameter output_concat_layer; + output_concat_layer.set_name("h_concat"); + output_concat_layer.set_type("Concat"); + output_concat_layer.add_top("h"); + output_concat_layer.mutable_concat_param()->set_axis(0); + + for (int t = 1; t <= this->T_; ++t) { + string tm1s = format_int(t - 1); + string ts = format_int(t); + + cont_slice_param->add_top("cont_" + ts); + x_slice_param->add_top("W_xc_x_" + ts); + + // Add layers to flush the hidden state when beginning a new + // sequence, as indicated by cont_t. + // h_conted_{t-1} := cont_t * h_{t-1} + // + // Normally, cont_t is binary (i.e., 0 or 1), so: + // h_conted_{t-1} := h_{t-1} if cont_t == 1 + // 0 otherwise + { + LayerParameter* cont_h_param = net_param->add_layer(); + cont_h_param->CopyFrom(scale_param); + cont_h_param->set_name("h_conted_" + tm1s); + cont_h_param->add_bottom("h_" + tm1s); + cont_h_param->add_bottom("cont_" + ts); + cont_h_param->add_top("h_conted_" + tm1s); + } + + // Add layer to compute + // W_hc_h_{t-1} := W_hc * h_conted_{t-1} + { + LayerParameter* w_param = net_param->add_layer(); + w_param->CopyFrom(hidden_param); + w_param->set_name("transform_" + ts); + w_param->add_param()->set_name("W_hc"); + w_param->add_bottom("h_conted_" + tm1s); + w_param->add_top("W_hc_h_" + tm1s); + w_param->mutable_inner_product_param()->set_axis(2); + } + + // Add the outputs of the linear transformations to compute the gate input. 
+ // gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c + // = W_hc_h_{t-1} + W_xc_x_t + b_c + { + LayerParameter* input_sum_layer = net_param->add_layer(); + input_sum_layer->CopyFrom(sum_param); + input_sum_layer->set_name("gate_input_" + ts); + input_sum_layer->add_bottom("W_hc_h_" + tm1s); + input_sum_layer->add_bottom("W_xc_x_" + ts); + if (this->static_input_) { + input_sum_layer->add_bottom("W_xc_x_static"); + } + input_sum_layer->add_top("gate_input_" + ts); + } + + // Add LSTMUnit layer to compute the cell & hidden vectors c_t and h_t. + // Inputs: c_{t-1}, gate_input_t = (i_t, f_t, o_t, g_t), cont_t + // Outputs: c_t, h_t + // [ i_t' ] + // [ f_t' ] := gate_input_t + // [ o_t' ] + // [ g_t' ] + // i_t := \sigmoid[i_t'] + // f_t := \sigmoid[f_t'] + // o_t := \sigmoid[o_t'] + // g_t := \tanh[g_t'] + // c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t) + // h_t := o_t .* \tanh[c_t] + { + LayerParameter* lstm_unit_param = net_param->add_layer(); + lstm_unit_param->set_type("LSTMUnit"); + lstm_unit_param->add_bottom("c_" + tm1s); + lstm_unit_param->add_bottom("gate_input_" + ts); + lstm_unit_param->add_bottom("cont_" + ts); + lstm_unit_param->add_top("c_" + ts); + lstm_unit_param->add_top("h_" + ts); + lstm_unit_param->set_name("unit_" + ts); + } + output_concat_layer.add_bottom("h_" + ts); + } // for (int t = 1; t <= this->T_; ++t) + + { + LayerParameter* c_T_copy_param = net_param->add_layer(); + c_T_copy_param->CopyFrom(split_param); + c_T_copy_param->add_bottom("c_" + format_int(this->T_)); + c_T_copy_param->add_top("c_T"); + } + net_param->add_layer()->CopyFrom(output_concat_layer); +} + +INSTANTIATE_CLASS_FB(LSTMLayer); +REGISTER_LAYER_CLASS(LSTM); + +} // namespace caffe diff --git a/src/caffe/layers/lstm_unit_layer.cpp b/src/caffe/layers/lstm_unit_layer.cpp new file mode 100644 index 00000000000..4f093f83801 --- /dev/null +++ b/src/caffe/layers/lstm_unit_layer.cpp @@ -0,0 +1,126 @@ +#include +#include +#include + +#include "caffe/layer.hpp" 
+#include "caffe/layers/lstm_layer.hpp" + +namespace caffe { + +template +inline Dtype sigmoid(Dtype x) { + return 1. / (1. + exp(-x)); +} + +template +inline Dtype tanh(Dtype x) { + return 2. * sigmoid(2. * x) - 1.; +} + +template +void LSTMUnitLayer::Reshape(const vector& bottom, + const vector& top) { + const int num_instances = bottom[0]->shape(1); + for (int i = 0; i < bottom.size(); ++i) { + if (i == 2) { + CHECK_EQ(2, bottom[i]->num_axes()); + } else { + CHECK_EQ(3, bottom[i]->num_axes()); + } + CHECK_EQ(1, bottom[i]->shape(0)); + CHECK_EQ(num_instances, bottom[i]->shape(1)); + } + hidden_dim_ = bottom[0]->shape(2); + CHECK_EQ(4 * hidden_dim_, bottom[1]->shape(2)); + top[0]->ReshapeLike(*bottom[0]); + top[1]->ReshapeLike(*bottom[0]); + X_acts_->ReshapeLike(*bottom[1]); +} + +template +void LSTMUnitLayer::Forward_cpu(const vector& bottom, + const vector& top) { + const int num = bottom[0]->shape(1); + const int x_dim = hidden_dim_ * 4; + const Ftype* C_prev = bottom[0]->cpu_data(); + const Ftype* X = bottom[1]->cpu_data(); + const Ftype* cont = bottom[2]->cpu_data(); + Ftype* C = top[0]->mutable_cpu_data(); + Ftype* H = top[1]->mutable_cpu_data(); + for (int n = 0; n < num; ++n) { + for (int d = 0; d < hidden_dim_; ++d) { + const Ftype i = sigmoid(X[d]); + const Ftype f = (*cont == 0) ? 
0 : + (*cont * sigmoid(X[1 * hidden_dim_ + d])); + const Ftype o = sigmoid(X[2 * hidden_dim_ + d]); + const Ftype g = tanh(X[3 * hidden_dim_ + d]); + const Ftype c_prev = C_prev[d]; + const Ftype c = f * c_prev + i * g; + C[d] = c; + const Ftype tanh_c = tanh(c); + H[d] = o * tanh_c; + } + C_prev += hidden_dim_; + X += x_dim; + C += hidden_dim_; + H += hidden_dim_; + ++cont; + } +} + +template +void LSTMUnitLayer::Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom) { + CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators."; + if (!propagate_down[0] && !propagate_down[1]) { return; } + + const int num = bottom[0]->shape(1); + const int x_dim = hidden_dim_ * 4; + const Btype* C_prev = bottom[0]->cpu_data(); + const Btype* X = bottom[1]->cpu_data(); + const Btype* cont = bottom[2]->cpu_data(); + const Btype* C = top[0]->cpu_data(); + const Btype* H = top[1]->cpu_data(); + const Btype* C_diff = top[0]->cpu_diff(); + const Btype* H_diff = top[1]->cpu_diff(); + Btype* C_prev_diff = bottom[0]->mutable_cpu_diff(); + Btype* X_diff = bottom[1]->mutable_cpu_diff(); + for (int n = 0; n < num; ++n) { + for (int d = 0; d < hidden_dim_; ++d) { + const Btype i = sigmoid(X[d]); + const Btype f = (*cont == 0) ? 
0 : + (*cont * sigmoid(X[1 * hidden_dim_ + d])); + const Btype o = sigmoid(X[2 * hidden_dim_ + d]); + const Btype g = tanh(X[3 * hidden_dim_ + d]); + const Btype c_prev = C_prev[d]; + const Btype c = C[d]; + const Btype tanh_c = tanh(c); + Btype* c_prev_diff = C_prev_diff + d; + Btype* i_diff = X_diff + d; + Btype* f_diff = X_diff + 1 * hidden_dim_ + d; + Btype* o_diff = X_diff + 2 * hidden_dim_ + d; + Btype* g_diff = X_diff + 3 * hidden_dim_ + d; + const Btype c_term_diff = + C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c); + *c_prev_diff = c_term_diff * f; + *i_diff = c_term_diff * g * i * (1 - i); + *f_diff = c_term_diff * c_prev * f * (1 - f); + *o_diff = H_diff[d] * tanh_c * o * (1 - o); + *g_diff = c_term_diff * i * (1 - g * g); + } + C_prev += hidden_dim_; + X += x_dim; + C += hidden_dim_; + H += hidden_dim_; + C_diff += hidden_dim_; + H_diff += hidden_dim_; + X_diff += x_dim; + C_prev_diff += hidden_dim_; + ++cont; + } +} + +INSTANTIATE_CLASS_FB(LSTMUnitLayer); +REGISTER_LAYER_CLASS(LSTMUnit); + +} // namespace caffe diff --git a/src/caffe/layers/lstm_unit_layer.cu b/src/caffe/layers/lstm_unit_layer.cu new file mode 100644 index 00000000000..d8ee81d0ef3 --- /dev/null +++ b/src/caffe/layers/lstm_unit_layer.cu @@ -0,0 +1,162 @@ +#include +#include +#include +#include + +#include "caffe/layer.hpp" +#include "caffe/layers/lstm_layer.hpp" + +namespace caffe { + +template +__device__ Dtype sigmoid(const Dtype x) { + return Dtype(1) / (Dtype(1) + exp(-x)); +} + +template +__device__ Dtype tanh(const Dtype x) { + return Dtype(2) * sigmoid(Dtype(2) * x) - Dtype(1); +} + +template +__global__ void LSTMActsForward(const int nthreads, const int dim, + const Dtype* X, Dtype* X_acts) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int x_dim = 4 * dim; + const int d = index % x_dim; + if (d < 3 * dim) { + X_acts[index] = sigmoid(X[index]); + } else { + X_acts[index] = tanh(X[index]); + } + } +} + +template +__global__ void LSTMUnitForward(const int nthreads, const int 
dim, + const Dtype* C_prev, const Dtype* X, const Dtype* cont, + Dtype* C, Dtype* H) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + const Dtype* X_offset = X + 4 * dim * n; + const Dtype i = X_offset[d]; + const Dtype f = X_offset[1 * dim + d]; + const Dtype o = X_offset[2 * dim + d]; + const Dtype g = X_offset[3 * dim + d]; + const Dtype c_prev = C_prev[index]; + const Dtype c = cont[n] * f * c_prev + i * g; + C[index] = c; + const Dtype tanh_c = tanh(c); + H[index] = o * tanh_c; + } +} + +template +void LSTMUnitLayer::Forward_gpu(const vector& bottom, + const vector& top) { + const int count = top[1]->count(); + const Ftype* C_prev = bottom[0]->gpu_data(); + const Ftype* X = bottom[1]->gpu_data(); + const Ftype* cont = bottom[2]->gpu_data(); + Ftype* X_acts = X_acts_->mutable_gpu_data(); + Ftype* C = top[0]->mutable_gpu_data(); + Ftype* H = top[1]->mutable_gpu_data(); + const int X_count = bottom[1]->count(); + cudaStream_t stream = Caffe::thread_stream(); + // NOLINT_NEXT_LINE(whitespace/operators) + LSTMActsForward<<>>( + X_count, hidden_dim_, X, X_acts); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); + // NOLINT_NEXT_LINE(whitespace/operators) + LSTMUnitForward<<>>( + count, hidden_dim_, C_prev, X_acts, cont, C, H); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +template +__global__ void LSTMUnitBackward(const int nthreads, const int dim, + const Dtype* C_prev, const Dtype* X, const Dtype* C, const Dtype* H, + const Dtype* cont, const Dtype* C_diff, const Dtype* H_diff, + Dtype* C_prev_diff, Dtype* X_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + const Dtype* X_offset = X + 4 * dim * n; + const Dtype i = X_offset[d]; + const Dtype f = X_offset[1 * dim + d]; + const Dtype o = X_offset[2 * dim + d]; + const Dtype g = X_offset[3 * dim + d]; + const Dtype c_prev = C_prev[index]; + const Dtype c = 
C[index]; + const Dtype tanh_c = tanh(c); + Dtype* c_prev_diff = C_prev_diff + index; + Dtype* X_diff_offset = X_diff + 4 * dim * n; + Dtype* i_diff = X_diff_offset + d; + Dtype* f_diff = X_diff_offset + 1 * dim + d; + Dtype* o_diff = X_diff_offset + 2 * dim + d; + Dtype* g_diff = X_diff_offset + 3 * dim + d; + const Dtype c_term_diff = + C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); + const Dtype cont_n = cont[n]; + *c_prev_diff = cont_n * c_term_diff * f; + *i_diff = c_term_diff * g; + *f_diff = cont_n * c_term_diff * c_prev; + *o_diff = H_diff[index] * tanh_c; + *g_diff = c_term_diff * i; + } +} + +template +__global__ void LSTMActsBackward(const int nthreads, const int dim, + const Dtype* X_acts, const Dtype* X_acts_diff, Dtype* X_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int x_dim = 4 * dim; + const int d = index % x_dim; + const Dtype X_act = X_acts[index]; + if (d < 3 * dim) { + X_diff[index] = X_acts_diff[index] * X_act * (Dtype(1) - X_act); + } else { + X_diff[index] = X_acts_diff[index] * (Dtype(1) - X_act * X_act); + } + } +} + +template +void LSTMUnitLayer::Backward_gpu(const vector& top, + const vector& propagate_down, + const vector& bottom) { + CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators."; + if (!propagate_down[0] && !propagate_down[1]) { return; } + + const int count = top[1]->count(); + const Btype* C_prev = bottom[0]->gpu_data(); + const Btype* X_acts = X_acts_->gpu_data(); + const Btype* cont = bottom[2]->gpu_data(); + const Btype* C = top[0]->gpu_data(); + const Btype* H = top[1]->gpu_data(); + const Btype* C_diff = top[0]->gpu_diff(); + const Btype* H_diff = top[1]->gpu_diff(); + Btype* C_prev_diff = bottom[0]->mutable_gpu_diff(); + Btype* X_acts_diff = X_acts_->mutable_gpu_diff(); + + cudaStream_t stream = Caffe::thread_stream(); + LSTMUnitBackward // NOLINT_NEXT_LINE(whitespace/operators) + <<>>(count, hidden_dim_, + C_prev, X_acts, C, H, cont, C_diff, H_diff, C_prev_diff, X_acts_diff); 
+ CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); + const int X_count = bottom[1]->count(); + Btype* X_diff = bottom[1]->mutable_gpu_diff(); + LSTMActsBackward // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + X_count, hidden_dim_, X_acts, X_acts_diff, X_diff); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +INSTANTIATE_LAYER_GPU_FUNCS_FB(LSTMUnitLayer); + +} // namespace caffe diff --git a/src/caffe/layers/multibox_loss_layer.cpp b/src/caffe/layers/multibox_loss_layer.cpp new file mode 100644 index 00000000000..951b4a54fff --- /dev/null +++ b/src/caffe/layers/multibox_loss_layer.cpp @@ -0,0 +1,383 @@ +#include +#include +#include +#include + +#include "caffe/layers/multibox_loss_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void MultiBoxLossLayer::LayerSetUp(const vector& bottom, + const vector& top) { + LossLayer::LayerSetUp(bottom, top); + if (this->layer_param_.propagate_down_size() == 0) { + this->layer_param_.add_propagate_down(true); + this->layer_param_.add_propagate_down(true); + this->layer_param_.add_propagate_down(false); + this->layer_param_.add_propagate_down(false); + } + const MultiBoxLossParameter& multibox_loss_param = + this->layer_param_.multibox_loss_param(); + multibox_loss_param_ = this->layer_param_.multibox_loss_param(); + + num_ = bottom[0]->num(); + num_priors_ = bottom[2]->height() / 4; + // Get other parameters. + CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes."; + num_classes_ = multibox_loss_param.num_classes(); + CHECK_GE(num_classes_, 1) << "num_classes should not be less than 1."; + share_location_ = multibox_loss_param.share_location(); + loc_classes_ = share_location_ ? 
1 : num_classes_; + background_label_id_ = multibox_loss_param.background_label_id(); + use_difficult_gt_ = multibox_loss_param.use_difficult_gt(); + mining_type_ = multibox_loss_param.mining_type(); + if (multibox_loss_param.has_do_neg_mining()) { + LOG(WARNING) << "do_neg_mining is deprecated, use mining_type instead."; + do_neg_mining_ = multibox_loss_param.do_neg_mining(); + CHECK_EQ(do_neg_mining_, + mining_type_ != MultiBoxLossParameter_MiningType_NONE); + } + do_neg_mining_ = mining_type_ != MultiBoxLossParameter_MiningType_NONE; + + if (!this->layer_param_.loss_param().has_normalization() && + this->layer_param_.loss_param().has_normalize()) { + normalization_ = this->layer_param_.loss_param().normalize() ? + LossParameter_NormalizationMode_VALID : + LossParameter_NormalizationMode_BATCH_SIZE; + } else { + normalization_ = this->layer_param_.loss_param().normalization(); + } + + if (do_neg_mining_) { + CHECK(share_location_) + << "Currently only support negative mining if share_location is true."; + } + + vector loss_shape(1, 1); + // Set up localization loss layer. + loc_weight_ = multibox_loss_param.loc_weight(); + loc_loss_type_ = multibox_loss_param.loc_loss_type(); + // fake shape. 
+ vector loc_shape(1, 1); + loc_shape.push_back(4); + loc_pred_ = Blob::create(); + loc_pred_->Reshape(loc_shape); + loc_gt_ = Blob::create(); + loc_gt_->Reshape(loc_shape); + loc_bottom_vec_.push_back(loc_pred_.get()); + loc_bottom_vec_.push_back(loc_gt_.get()); + loc_loss_ = Blob::create(); + loc_loss_->Reshape(loss_shape); + loc_top_vec_.push_back(loc_loss_.get()); + if (loc_loss_type_ == MultiBoxLossParameter_LocLossType_L2) { + LayerParameter layer_param; + layer_param.set_name(this->layer_param_.name() + "_l2_loc"); + layer_param.set_type("EuclideanLoss"); + layer_param.add_loss_weight(loc_weight_); + loc_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank()); + loc_loss_layer_->SetUp(loc_bottom_vec_, loc_top_vec_); + } else if (loc_loss_type_ == MultiBoxLossParameter_LocLossType_SMOOTH_L1) { + LayerParameter layer_param; + layer_param.set_name(this->layer_param_.name() + "_smooth_L1_loc"); + layer_param.set_type("SmoothL1Loss"); + layer_param.add_loss_weight(loc_weight_); + loc_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank()); + loc_loss_layer_->SetUp(loc_bottom_vec_, loc_top_vec_); + } else { + LOG(FATAL) << "Unknown localization loss type."; + } + // Set up confidence loss layer. 
+ conf_loss_type_ = multibox_loss_param.conf_loss_type(); + conf_pred_ = Blob::create(); + conf_gt_ = Blob::create(); + conf_loss_ = Blob::create(); + conf_loss_->Reshape(loss_shape); + conf_top_vec_.push_back(conf_loss_.get()); + if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_SOFTMAX) { + CHECK_GE(background_label_id_, 0) + << "background_label_id should be within [0, num_classes) for Softmax."; + CHECK_LT(background_label_id_, num_classes_) + << "background_label_id should be within [0, num_classes) for Softmax."; + LayerParameter layer_param; + layer_param.set_name(this->layer_param_.name() + "_softmax_conf"); + layer_param.set_type("SoftmaxWithLoss"); + layer_param.add_loss_weight(Dtype(1.)); + layer_param.mutable_loss_param()->set_normalization( + LossParameter_NormalizationMode_NONE); + SoftmaxParameter* softmax_param = layer_param.mutable_softmax_param(); + softmax_param->set_axis(1); + // Fake reshape. + vector conf_shape(1, 1); + conf_gt_->Reshape(conf_shape); + conf_shape.push_back(num_classes_); + conf_pred_->Reshape(conf_shape); + conf_bottom_vec_.push_back(conf_pred_.get()); + conf_bottom_vec_.push_back(conf_gt_.get()); + conf_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank()); + conf_loss_layer_->SetUp(conf_bottom_vec_, conf_top_vec_); + } else if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_LOGISTIC) { + LayerParameter layer_param; + layer_param.set_name(this->layer_param_.name() + "_logistic_conf"); + layer_param.set_type("SigmoidCrossEntropyLoss"); + layer_param.add_loss_weight(Dtype(1.)); + // Fake reshape. 
+ vector conf_shape(1, 1); + conf_shape.push_back(num_classes_); + conf_gt_->Reshape(conf_shape); + conf_pred_->Reshape(conf_shape); + conf_bottom_vec_.push_back(conf_pred_.get()); + conf_bottom_vec_.push_back(conf_gt_.get()); + conf_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank()); + conf_loss_layer_->SetUp(conf_bottom_vec_, conf_top_vec_); + } else { + LOG(FATAL) << "Unknown confidence loss type."; + } +} + +template +void MultiBoxLossLayer::Reshape(const vector& bottom, + const vector& top) { + LossLayer::Reshape(bottom, top); + num_ = bottom[0]->num(); + num_priors_ = bottom[2]->height() / 4; + num_gt_ = bottom[3]->height(); + CHECK_EQ(bottom[0]->num(), bottom[1]->num()); + CHECK_EQ(num_priors_ * loc_classes_ * 4, bottom[0]->channels()) + << "Number of priors must match number of location predictions."; + CHECK_EQ(num_priors_ * num_classes_, bottom[1]->channels()) + << "Number of priors must match number of confidence predictions."; +} + +template +void MultiBoxLossLayer::Forward_cpu(const vector& bottom, + const vector& top) { + const Dtype* loc_data = bottom[0]->cpu_data(); + const Dtype* conf_data = bottom[1]->cpu_data(); + const Dtype* prior_data = bottom[2]->cpu_data(); + const Dtype* gt_data = bottom[3]->cpu_data(); + + // Retrieve all ground truth. + map > all_gt_bboxes; + GetGroundTruth(gt_data, num_gt_, background_label_id_, use_difficult_gt_, + &all_gt_bboxes); + + // Retrieve all prior bboxes. It is same within a batch since we assume all + // images in a batch are of same dimension. + vector prior_bboxes; + vector > prior_variances; + GetPriorBBoxes(prior_data, num_priors_, &prior_bboxes, &prior_variances); + + // Retrieve all predictions. + vector all_loc_preds; + GetLocPredictions(loc_data, num_, num_priors_, loc_classes_, share_location_, + &all_loc_preds); + + // Find matches between source bboxes and ground truth bboxes. 
+ vector > > all_match_overlaps; + FindMatches(all_loc_preds, all_gt_bboxes, prior_bboxes, prior_variances, + multibox_loss_param_, &all_match_overlaps, &all_match_indices_); + + num_matches_ = 0; + int num_negs = 0; + // Sample hard negative (and positive) examples based on mining type. + MineHardExamples(*static_cast*>(bottom[1]), + all_loc_preds, all_gt_bboxes, prior_bboxes, + prior_variances, all_match_overlaps, multibox_loss_param_, + &num_matches_, &num_negs, &all_match_indices_, &all_neg_indices_); + + if (num_matches_ >= 1) { + // Form data to pass on to loc_loss_layer_. + vector loc_shape(2); + loc_shape[0] = 1; + loc_shape[1] = num_matches_ * 4; + loc_pred_->Reshape(loc_shape); + loc_gt_->Reshape(loc_shape); + Dtype* loc_pred_data = loc_pred_->mutable_cpu_data(); + Dtype* loc_gt_data = loc_gt_->mutable_cpu_data(); + EncodeLocPrediction(all_loc_preds, all_gt_bboxes, all_match_indices_, + prior_bboxes, prior_variances, multibox_loss_param_, + loc_pred_data, loc_gt_data); + loc_loss_layer_->Reshape(loc_bottom_vec_, loc_top_vec_); + loc_loss_layer_->Forward(loc_bottom_vec_, loc_top_vec_); + } else { + loc_loss_->mutable_cpu_data()[0] = 0; + } + + // Form data to pass on to conf_loss_layer_. + if (do_neg_mining_) { + num_conf_ = num_matches_ + num_negs; + } else { + num_conf_ = num_ * num_priors_; + } + if (num_conf_ >= 1) { + // Reshape the confidence data. 
+ vector conf_shape; + if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_SOFTMAX) { + conf_shape.push_back(num_conf_); + conf_bottom_vec_[1]->Reshape(conf_shape); + conf_shape.push_back(num_classes_); + conf_bottom_vec_[0]->Reshape(conf_shape); + } else if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_LOGISTIC) { + conf_shape.push_back(1); + conf_shape.push_back(num_conf_); + conf_shape.push_back(num_classes_); + conf_bottom_vec_[0]->Reshape(conf_shape); + conf_bottom_vec_[1]->Reshape(conf_shape); + } else { + LOG(FATAL) << "Unknown confidence loss type."; + } + if (!do_neg_mining_) { + // Consider all scores. + // Share data and diff with bottom[1]. + CHECK_EQ(conf_pred_->count(), bottom[1]->count()); + conf_pred_->ShareData(*(bottom[1])); + } + Dtype* conf_pred_data = conf_pred_->mutable_cpu_data(); + Dtype* conf_gt_data = conf_gt_->mutable_cpu_data(); + caffe_set(conf_gt_->count(), Dtype(background_label_id_), conf_gt_data); + EncodeConfPrediction(conf_data, num_, num_priors_, multibox_loss_param_, + all_match_indices_, all_neg_indices_, all_gt_bboxes, + conf_pred_data, conf_gt_data); + conf_loss_layer_->Reshape(conf_bottom_vec_, conf_top_vec_); + conf_loss_layer_->Forward(conf_bottom_vec_, conf_top_vec_); + } else { + conf_loss_->mutable_cpu_data()[0] = 0; + } + + top[0]->mutable_cpu_data()[0] = 0; + if (this->layer_param_.propagate_down(0)) { + Dtype normalizer = LossLayer::GetNormalizer( + normalization_, num_, num_priors_, num_matches_); + top[0]->mutable_cpu_data()[0] += + loc_weight_ * loc_loss_->cpu_data()[0] / normalizer; + } + if (this->layer_param_.propagate_down(1)) { + Dtype normalizer = LossLayer::GetNormalizer( + normalization_, num_, num_priors_, num_matches_); + top[0]->mutable_cpu_data()[0] += conf_loss_->cpu_data()[0] / normalizer; + } +} + +template +void MultiBoxLossLayer::Backward_cpu(const vector& top, + const vector& propagate_down, + const vector& bottom) { + + if (propagate_down[2]) { + LOG(FATAL) << this->type() + << " 
Layer cannot backpropagate to prior inputs."; + } + if (propagate_down[3]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + + // Back propagate on location prediction. + if (propagate_down[0]) { + Dtype* loc_bottom_diff = bottom[0]->mutable_cpu_diff(); + caffe_set(bottom[0]->count(), Dtype(0), loc_bottom_diff); + if (num_matches_ >= 1) { + vector loc_propagate_down; + // Only back propagate on prediction, not ground truth. + loc_propagate_down.push_back(true); + loc_propagate_down.push_back(false); + loc_loss_layer_->Backward(loc_top_vec_, loc_propagate_down, + loc_bottom_vec_); + // Scale gradient. + Dtype normalizer = LossLayer::GetNormalizer( + normalization_, num_, num_priors_, num_matches_); + Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer; + caffe_scal(loc_pred_->count(), loss_weight, loc_pred_->mutable_cpu_diff()); + // Copy gradient back to bottom[0]. + const Dtype* loc_pred_diff = loc_pred_->cpu_diff(); + int count = 0; + for (int i = 0; i < num_; ++i) { + for (map >::iterator it = + all_match_indices_[i].begin(); + it != all_match_indices_[i].end(); ++it) { + const int label = share_location_ ? 0 : it->first; + const vector& match_index = it->second; + for (int j = 0; j < match_index.size(); ++j) { + if (match_index[j] <= -1) { + continue; + } + // Copy the diff to the right place. + int start_idx = loc_classes_ * 4 * j + label * 4; + caffe_copy(4, loc_pred_diff + count * 4, + loc_bottom_diff + start_idx); + ++count; + } + } + loc_bottom_diff += bottom[0]->offset(1); + } + } + } + + // Back propagate on confidence prediction. + if (propagate_down[1]) { + Dtype* conf_bottom_diff = bottom[1]->mutable_cpu_diff(); + caffe_set(bottom[1]->count(), Dtype(0), conf_bottom_diff); + if (num_conf_ >= 1) { + vector conf_propagate_down; + // Only back propagate on prediction, not ground truth. 
+ conf_propagate_down.push_back(true); + conf_propagate_down.push_back(false); + conf_loss_layer_->Backward(conf_top_vec_, conf_propagate_down, + conf_bottom_vec_); + // Scale gradient. + Dtype normalizer = LossLayer::GetNormalizer( + normalization_, num_, num_priors_, num_matches_); + Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer; + caffe_scal(conf_pred_->count(), loss_weight, + conf_pred_->mutable_cpu_diff()); + // Copy gradient back to bottom[1]. + const Dtype* conf_pred_diff = conf_pred_->cpu_diff(); + if (do_neg_mining_) { + int count = 0; + for (int i = 0; i < num_; ++i) { + // Copy matched (positive) bboxes scores' diff. + const map >& match_indices = all_match_indices_[i]; + for (map >::const_iterator it = + match_indices.begin(); it != match_indices.end(); ++it) { + const vector& match_index = it->second; + CHECK_EQ(match_index.size(), num_priors_); + for (int j = 0; j < num_priors_; ++j) { + if (match_index[j] <= -1) { + continue; + } + // Copy the diff to the right place. + caffe_copy(num_classes_, + conf_pred_diff + count * num_classes_, + conf_bottom_diff + j * num_classes_); + ++count; + } + } + // Copy negative bboxes scores' diff. + for (int n = 0; n < all_neg_indices_[i].size(); ++n) { + int j = all_neg_indices_[i][n]; + CHECK_LT(j, num_priors_); + caffe_copy(num_classes_, + conf_pred_diff + count * num_classes_, + conf_bottom_diff + j * num_classes_); + ++count; + } + conf_bottom_diff += bottom[1]->offset(1); + } + } else { + // The diff is already computed and stored. + bottom[1]->ShareDiff(*conf_pred_); + } + } + } + + // After backward, remove match statistics. 
+ all_match_indices_.clear(); + all_neg_indices_.clear(); +} + +INSTANTIATE_CLASS_FB(MultiBoxLossLayer); +REGISTER_LAYER_CLASS(MultiBoxLoss); + +} // namespace caffe diff --git a/src/caffe/layers/normalize_layer.cpp b/src/caffe/layers/normalize_layer.cpp new file mode 100644 index 00000000000..e11ef1d9795 --- /dev/null +++ b/src/caffe/layers/normalize_layer.cpp @@ -0,0 +1,229 @@ +#include + +#include "caffe/filler.hpp" +#include "caffe/layers/normalize_layer.hpp" + +namespace caffe { + +template +void NormalizeLayer::LayerSetUp(const vector& bottom, + const vector& top) { + CHECK_GE(bottom[0]->num_axes(), 2) + << "Number of axes of bottom blob must be >=2."; + buffer_.Reshape(1, bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + buffer_channel_.Reshape(1, bottom[0]->channels(), 1, 1); + buffer_spatial_.Reshape(1, 1, bottom[0]->height(), bottom[0]->width()); + NormalizeParameter norm_param = this->layer_param().norm_param(); + across_spatial_ = norm_param.across_spatial(); + if (across_spatial_) { + norm_.Reshape(bottom[0]->num(), 1, 1, 1); + } else { + norm_.Reshape(bottom[0]->num(), 1, bottom[0]->height(), bottom[0]->width()); + } + eps_ = std::max((Dtype)norm_param.eps(), min_dtype()); + int channels = bottom[0]->channels(); + int spatial_dim = bottom[0]->width() * bottom[0]->height(); + sum_channel_multiplier_.Reshape(1, channels, 1, 1); + caffe_set(channels, Dtype(1), sum_channel_multiplier_.mutable_cpu_data()); + sum_spatial_multiplier_.Reshape( + 1, 1, bottom[0]->height(), bottom[0]->width()); + caffe_set(spatial_dim, Dtype(1), sum_spatial_multiplier_.mutable_cpu_data()); + channel_shared_ = norm_param.channel_shared(); + if (this->blobs_.size() > 0) { + LOG(INFO) << "Skipping parameter initialization"; + } else { + this->blobs_.resize(1); + if (channel_shared_) { + this->blobs_[0] = Blob::create(vector(0)); + } else { + this->blobs_[0] = Blob::create(vector(1, channels)); + } + shared_ptr > scale_filler; + if (norm_param.has_scale_filler()) 
{ + scale_filler.reset(GetFiller(norm_param.scale_filler())); + } else { + FillerParameter filler_param; + filler_param.set_type("constant"); + filler_param.set_value(1.0); + scale_filler.reset(GetFiller(filler_param)); + } + scale_filler->Fill(this->blobs_[0].get()); + } + if (channel_shared_) { + CHECK_EQ(this->blobs_[0]->count(), 1) + << "Scale size is inconsistent with prototxt config"; + } else { + CHECK_EQ(this->blobs_[0]->count(), channels) + << "Scale size is inconsistent with prototxt config"; + } + this->param_propagate_down_.resize(this->blobs_.size(), true); +} + +template +void NormalizeLayer::Reshape(const vector& bottom, + const vector& top) { + CHECK_GE(bottom[0]->num_axes(), 2) + << "Number of axes of bottom blob must be >=2."; + top[0]->ReshapeLike(*bottom[0]); + buffer_.Reshape(1, bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + if (!across_spatial_) { + norm_.Reshape(bottom[0]->num(), 1, bottom[0]->height(), bottom[0]->width()); + } + int spatial_dim = bottom[0]->height() * bottom[0]->width(); + if (spatial_dim != sum_spatial_multiplier_.count()) { + sum_spatial_multiplier_.Reshape( + 1, 1, bottom[0]->height(), bottom[0]->width()); + caffe_set(spatial_dim, Dtype(1), + sum_spatial_multiplier_.mutable_cpu_data()); + buffer_spatial_.Reshape(1, 1, bottom[0]->height(), bottom[0]->width()); + } +} + +template +void NormalizeLayer::Forward_cpu(const vector& bottom, + const vector& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const Dtype* scale = this->blobs_[0]->template cpu_data(); + Dtype* buffer_data = buffer_.mutable_cpu_data(); + Dtype* norm_data = norm_.mutable_cpu_data(); + // add eps to avoid overflow + caffe_set(norm_.count(), Dtype(eps_), norm_data); + const Dtype* sum_channel_multiplier = sum_channel_multiplier_.cpu_data(); + const Dtype* sum_spatial_multiplier = sum_spatial_multiplier_.cpu_data(); + int num = bottom[0]->num(); + int dim = bottom[0]->count() 
/ num; + int spatial_dim = bottom[0]->height() * bottom[0]->width(); + int channels = bottom[0]->channels(); + for (int n = 0; n < num; ++n) { + caffe_sqr(dim, bottom_data, buffer_data); + if (across_spatial_) { + // add eps to avoid overflow + norm_data[n] = pow(Dtype(caffe_cpu_asum(dim, buffer_data)+eps_), + Dtype(0.5)); + caffe_cpu_scale(dim, Dtype(1.0 / norm_data[n]), bottom_data, + top_data); + } else { + caffe_cpu_gemv(CblasTrans, channels, spatial_dim, Dtype(1), + buffer_data, sum_channel_multiplier, Dtype(1), + norm_data); + // compute norm + caffe_powx(spatial_dim, norm_data, Dtype(0.5), norm_data); + // scale the layer + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, spatial_dim, + 1, Dtype(1), sum_channel_multiplier, norm_data, + Dtype(0), buffer_data); + caffe_div(dim, bottom_data, buffer_data, top_data); + norm_data += spatial_dim; + } + // scale the output + if (channel_shared_) { + caffe_scal(dim, scale[0], top_data); + } else { + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, spatial_dim, + 1, Dtype(1), scale, sum_spatial_multiplier, + Dtype(0), + buffer_data); + caffe_mul(dim, top_data, buffer_data, top_data); + } + bottom_data += dim; + top_data += dim; + } +} + +template +void NormalizeLayer::Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom) { + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_data = top[0]->cpu_data(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const Dtype* scale = this->blobs_[0]->template cpu_data(); + const Dtype* norm_data = norm_.cpu_data(); + Dtype* buffer_data = buffer_.mutable_cpu_data(); + Dtype* buffer_channel = buffer_channel_.mutable_cpu_data(); + Dtype* buffer_spatial = buffer_spatial_.mutable_cpu_data(); + const Dtype* sum_channel_multiplier = sum_channel_multiplier_.cpu_data(); + const Dtype* sum_spatial_multiplier = sum_spatial_multiplier_.cpu_data(); + int count = top[0]->count(); + int 
num = top[0]->num(); + int dim = count / num; + int spatial_dim = top[0]->height() * top[0]->width(); + int channels = top[0]->channels(); + + // Propagate to param + if (this->param_propagate_down_[0]) { + Dtype* scale_diff = this->blobs_[0]->template mutable_cpu_diff(); + if (channel_shared_) { + scale_diff[0] += + caffe_cpu_dot(count, top_data, top_diff) / scale[0]; + } else { + for (int n = 0; n < num; ++n) { + caffe_mul(dim, top_data+n*dim, top_diff+n*dim, buffer_data); + caffe_cpu_gemv(CblasNoTrans, channels, spatial_dim, Dtype(1), + buffer_data, sum_spatial_multiplier, Dtype(0), + buffer_channel); + // store a / scale[i] in buffer_data temporary + caffe_div(channels, buffer_channel, scale, buffer_channel); + caffe_add(channels, buffer_channel, scale_diff, scale_diff); + } + } + } + + // Propagate to bottom + if (propagate_down[0]) { + for (int n = 0; n < num; ++n) { + if (across_spatial_) { + Dtype a = caffe_cpu_dot(dim, bottom_data, top_diff); + caffe_cpu_scale(dim, Dtype(a / norm_data[n] / norm_data[n]), + bottom_data, bottom_diff); + caffe_sub(dim, top_diff, bottom_diff, bottom_diff); + caffe_scal(dim, Dtype(1.0 / norm_data[n]), bottom_diff); + } else { + // dot product between bottom_data and top_diff + caffe_mul(dim, bottom_data, top_diff, buffer_data); + caffe_cpu_gemv(CblasTrans, channels, spatial_dim, Dtype(1), + buffer_data, sum_channel_multiplier, Dtype(0), + buffer_spatial); + // scale bottom_diff + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, spatial_dim, + 1, Dtype(1), sum_channel_multiplier, + buffer_spatial, Dtype(0), buffer_data); + caffe_mul(dim, bottom_data, buffer_data, bottom_diff); + // divide by square of norm + caffe_powx(spatial_dim, norm_data, Dtype(2), buffer_spatial); + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, spatial_dim, + 1, Dtype(1), sum_channel_multiplier, + buffer_spatial, Dtype(0), buffer_data); + caffe_div(dim, bottom_diff, buffer_data, bottom_diff); + // subtract + caffe_sub(dim, top_diff, bottom_diff, 
bottom_diff); + // divide by norm + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, spatial_dim, + 1, Dtype(1), sum_channel_multiplier, norm_data, + Dtype(0), buffer_data); + caffe_div(dim, bottom_diff, buffer_data, bottom_diff); + norm_data += spatial_dim; + } + // scale the diff + if (channel_shared_) { + caffe_scal(dim, scale[0], bottom_diff); + } else { + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, spatial_dim, + 1, Dtype(1), scale, sum_spatial_multiplier, + Dtype(0), buffer_data); + caffe_mul(dim, bottom_diff, buffer_data, bottom_diff); + } + bottom_data += dim; + top_diff += dim; + bottom_diff += dim; + } + } +} + +INSTANTIATE_CLASS_FB(NormalizeLayer); +REGISTER_LAYER_CLASS(Normalize); + +} // namespace caffe diff --git a/src/caffe/layers/normalize_layer.cu b/src/caffe/layers/normalize_layer.cu new file mode 100644 index 00000000000..631bb0f59db --- /dev/null +++ b/src/caffe/layers/normalize_layer.cu @@ -0,0 +1,245 @@ +#include +#include +#include +#include + +#include "caffe/util/half.cuh" +#include "caffe/filler.hpp" +#include "caffe/layers/normalize_layer.hpp" +#include "caffe/util/gpu_math_functions.cuh" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +// divid a matrix with vector +template +__global__ void DivBsx(const int nthreads, const Dtype* A, + const Dtype* v, const int rows, const int cols, + Dtype* B) { + CUDA_KERNEL_LOOP(index, nthreads) { + B[index] = A[index] / v[index % cols]; + } +} + +template <> +__global__ void DivBsx(const int nthreads, const float16* A, + const float16* v, const int rows, const int cols, float16* B) { + CUDA_KERNEL_LOOP(index, nthreads) { + const half* ah = reinterpret_cast(A); + const half* vh = reinterpret_cast(v); + half* bh = reinterpret_cast(B); + bh[index] = hdiv(ah[index], vh[index % cols]); + } +} + +template +__global__ void MulBsx(const int nthreads, const Dtype* A, + const Dtype* v, const int rows, const int cols, const bool notrans, + Dtype* B) { + CUDA_KERNEL_LOOP(index, 
nthreads) { + int c = index % cols; + int r = (index / cols) % rows; + if (notrans) { + B[index] = A[index] * v[c]; + } else { + B[index] = A[index] * v[r]; + } + } +} + +template <> +__global__ void MulBsx(const int nthreads, const float16* A, const float16* v, + const int rows, const int cols, const bool notrans, float16* B) { + CUDA_KERNEL_LOOP(index, nthreads) { + int c = index % cols; + int r = (index / cols) % rows; + const half* ah = reinterpret_cast(A); + const half* vh = reinterpret_cast(v); + half* bh = reinterpret_cast(B); + bh[index] = hmul(ah[index], vh[notrans ? c : r]); + } +} + + +template +void NormalizeLayer::Forward_gpu(const vector& bottom, + const vector& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + Dtype* buffer_data = buffer_.mutable_gpu_data(); + Dtype* norm_data; + if (across_spatial_) { + // need to index it + norm_data = norm_.mutable_cpu_data(); + } else { + norm_data = norm_.mutable_gpu_data(); + // add eps to avoid overflow + caffe_gpu_set(norm_.count(), Dtype(eps_), norm_data); + } + const Dtype* scale; + if (channel_shared_) { + scale = this->blobs_[0]->template cpu_data(); + } else { + scale = this->blobs_[0]->template gpu_data(); + } + const Dtype* sum_channel_multiplier = sum_channel_multiplier_.gpu_data(); + int num = bottom[0]->num(); + int dim = bottom[0]->count() / num; + int spatial_dim = bottom[0]->height() * bottom[0]->width(); + int channels = bottom[0]->channels(); + cudaStream_t stream = Caffe::thread_stream(); + for (int n = 0; n < num; ++n) { + caffe_gpu_powx(dim, bottom_data, Dtype(2), buffer_data); + if (across_spatial_) { + Dtype normsqr; + caffe_gpu_asum(dim, buffer_data, &normsqr, 0); + // add eps to avoid overflow + norm_data[n] = pow(normsqr+eps_, Dtype(0.5)); + caffe_gpu_scale(dim, Dtype(1.0 / norm_data[n]), bottom_data, + top_data); + } else { + // compute norm + caffe_gpu_gemv(CblasTrans, channels, spatial_dim, Dtype(1), + buffer_data, 
sum_channel_multiplier, Dtype(1), + norm_data); + caffe_gpu_powx(spatial_dim, norm_data, Dtype(0.5), norm_data); + // scale the layer + // NOLINT_NEXT_LINE(whitespace/operators) + DivBsx<<>>( + dim, bottom_data, norm_data, channels, spatial_dim, top_data); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); + norm_data += spatial_dim; + } + // scale the output + if (channel_shared_) { + caffe_gpu_scal(dim, scale[0], top_data); + } else { + // NOLINT_NEXT_LINE(whitespace/operators) + MulBsx<<>>( + dim, top_data, scale, channels, spatial_dim, false, + top_data); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + bottom_data += dim; + top_data += dim; + } +} + +template +void NormalizeLayer::Backward_gpu(const vector& top, + const vector& propagate_down, const vector& bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* bottom_data = bottom[0]->mutable_gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const Dtype* norm_data; + if (across_spatial_) { + // need to index it + norm_data = norm_.cpu_data(); + } else { + norm_data = norm_.gpu_data(); + } + const Dtype* scale; + if (channel_shared_) { + scale = this->blobs_[0]->template cpu_data(); + } else { + scale = this->blobs_[0]->template gpu_data(); + } + Dtype* buffer_data = buffer_.mutable_gpu_data(); + Dtype* buffer_channel = buffer_channel_.mutable_gpu_data(); + Dtype* buffer_spatial = buffer_spatial_.mutable_gpu_data(); + const Dtype* sum_channel_multiplier = sum_channel_multiplier_.gpu_data(); + const Dtype* sum_spatial_multiplier = sum_spatial_multiplier_.gpu_data(); + int count = top[0]->count(); + int num = top[0]->num(); + int dim = count / num; + int spatial_dim = top[0]->height() * top[0]->width(); + int channels = top[0]->channels(); + + // Propagate to param + if (this->param_propagate_down_[0]) { + if (channel_shared_) { + Dtype* scale_diff = this->blobs_[0]->template 
mutable_cpu_diff(); + Dtype a; + caffe_gpu_dot(count, top_data, top_diff, &a); + scale_diff[0] += a / scale[0]; + } else { + Dtype* scale_diff = this->blobs_[0]->template mutable_gpu_diff(); + for (int n = 0; n < num; ++n) { + // compute a + caffe_gpu_mul(dim, top_data+n*dim, top_diff+n*dim, buffer_data); + caffe_gpu_gemv(CblasNoTrans, channels, spatial_dim, Dtype(1), + buffer_data, sum_spatial_multiplier, Dtype(0), + buffer_channel); + // store a / scale[i] in buffer_data temporary + caffe_gpu_div(channels, buffer_channel, scale, buffer_channel); + caffe_gpu_add(channels, buffer_channel, scale_diff, scale_diff); + } + } + } + + // Propagate to bottom + if (propagate_down[0]) { + cudaStream_t stream = Caffe::thread_stream(); + for (int n = 0; n < num; ++n) { + if (across_spatial_) { + Dtype a; + caffe_gpu_dot(dim, bottom_data, top_diff, &a); + caffe_gpu_scale(dim, Dtype(a / norm_data[n] / norm_data[n]), + bottom_data, bottom_diff); + caffe_gpu_sub(dim, top_diff, bottom_diff, bottom_diff); + caffe_gpu_scale(dim, Dtype(1.0 / norm_data[n]), bottom_diff, + bottom_diff); + } else { + // dot product between bottom_data and top_diff + caffe_gpu_mul(dim, bottom_data, top_diff, buffer_data); + caffe_gpu_gemv(CblasTrans, channels, spatial_dim, Dtype(1), + buffer_data, sum_channel_multiplier, Dtype(0), + buffer_spatial); + // scale botom_diff + // NOLINT_NEXT_LINE(whitespace/operators) + MulBsx<<>>( + dim, bottom_data, buffer_spatial, channels, spatial_dim, + true, bottom_diff); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); + // divide by square of norm + caffe_gpu_powx(spatial_dim, norm_data, Dtype(2), buffer_spatial); + // NOLINT_NEXT_LINE(whitespace/operators) + DivBsx <<>>( + dim, bottom_diff, buffer_spatial, channels, spatial_dim, + bottom_diff); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); + caffe_gpu_sub(dim, top_diff, bottom_diff, bottom_diff); + // divide by norm + // NOLINT_NEXT_LINE(whitespace/operators) + DivBsx 
<<>>( + dim, bottom_diff, norm_data, channels, spatial_dim, + bottom_diff); + CUDA_POST_KERNEL_CHECK; + norm_data += spatial_dim; + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + // scale the diff + if (channel_shared_) { + caffe_gpu_scal(dim, scale[0], bottom_diff); + } else { + // NOLINT_NEXT_LINE(whitespace/operators) + MulBsx<<>>( + dim, bottom_diff, scale, channels, spatial_dim, false, bottom_diff); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + bottom_data += dim; + top_diff += dim; + bottom_diff += dim; + } + } +} + +INSTANTIATE_LAYER_GPU_FUNCS_FB(NormalizeLayer); + +} // namespace caffe diff --git a/src/caffe/layers/permute_layer.cpp b/src/caffe/layers/permute_layer.cpp new file mode 100644 index 00000000000..58cd496bf67 --- /dev/null +++ b/src/caffe/layers/permute_layer.cpp @@ -0,0 +1,139 @@ +#include + +#include "caffe/layers/permute_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void Permute(const int count, Dtype* bottom_data, const bool forward, + const int* permute_order, const int* old_steps, const int* new_steps, + const int num_axes, Dtype* top_data) { + for (int i = 0; i < count; ++i) { + int old_idx = 0; + int idx = i; + for (int j = 0; j < num_axes; ++j) { + int order = permute_order[j]; + old_idx += (idx / new_steps[j]) * old_steps[order]; + idx %= new_steps[j]; + } + if (forward) { + top_data[i] = bottom_data[old_idx]; + } else { + bottom_data[old_idx] = top_data[i]; + } + } +} + + +template +void PermuteLayer::LayerSetUp(const vector& bottom, + const vector& top) { + PermuteParameter permute_param = this->layer_param_.permute_param(); + CHECK_EQ(bottom.size(), 1); + num_axes_ = bottom[0]->num_axes(); + vector orders; + // Push the specified new orders. 
+ for (int i = 0; i < permute_param.order_size(); ++i) { + int order = permute_param.order(i); + CHECK_LT(order, num_axes_) + << "order should be less than the input dimension."; + if (std::find(orders.begin(), orders.end(), order) != orders.end()) { + LOG(FATAL) << "there are duplicate orders"; + } + orders.push_back(order); + } + // Push the rest orders. And save original step sizes for each axis. + for (int i = 0; i < num_axes_; ++i) { + if (std::find(orders.begin(), orders.end(), i) == orders.end()) { + orders.push_back(i); + } + } + CHECK_EQ(num_axes_, orders.size()); + // Check if we need to reorder the data or keep it. + need_permute_ = false; + for (int i = 0; i < num_axes_; ++i) { + if (orders[i] != i) { + // As long as there is one order which is different from the natural order + // of the data, we need to permute. Otherwise, we share the data and diff. + need_permute_ = true; + break; + } + } + + vector top_shape(num_axes_, 1); + permute_order_.Reshape(num_axes_, 1, 1, 1); + old_steps_.Reshape(num_axes_, 1, 1, 1); + new_steps_.Reshape(num_axes_, 1, 1, 1); + for (int i = 0; i < num_axes_; ++i) { + permute_order_.mutable_cpu_data()[i] = orders[i]; + top_shape[i] = bottom[0]->shape(orders[i]); + } + top[0]->Reshape(top_shape); +} + +template +void PermuteLayer::Reshape(const vector& bottom, + const vector& top) { + vector top_shape; + for (int i = 0; i < num_axes_; ++i) { + if (i == num_axes_ - 1) { + old_steps_.mutable_cpu_data()[i] = 1; + } else { + old_steps_.mutable_cpu_data()[i] = bottom[0]->count(i + 1); + } + top_shape.push_back(bottom[0]->shape(permute_order_.cpu_data()[i])); + } + top[0]->Reshape(top_shape); + + for (int i = 0; i < num_axes_; ++i) { + if (i == num_axes_ - 1) { + new_steps_.mutable_cpu_data()[i] = 1; + } else { + new_steps_.mutable_cpu_data()[i] = top[0]->count(i + 1); + } + } +} + +template +void PermuteLayer::Forward_cpu(const vector& bottom, + const vector& top) { + if (need_permute_) { + Dtype* bottom_data = 
bottom[0]->mutable_cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int top_count = top[0]->count(); + const int* permute_order = permute_order_.cpu_data(); + const int* old_steps = old_steps_.cpu_data(); + const int* new_steps = new_steps_.cpu_data(); + bool forward = true; + Permute(top_count, bottom_data, forward, permute_order, old_steps, + new_steps, num_axes_, top_data); + } else { + // If there is no need to permute, we share data to save memory. + top[0]->ShareData(*bottom[0]); + } +} + +template +void PermuteLayer::Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom) { + if (need_permute_) { + Dtype* top_diff = top[0]->mutable_cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int top_count = top[0]->count(); + const int* permute_order = permute_order_.cpu_data(); + const int* old_steps = old_steps_.cpu_data(); + const int* new_steps = new_steps_.cpu_data(); + bool forward = false; + Permute(top_count, bottom_diff, forward, permute_order, old_steps, + new_steps, num_axes_, top_diff); + } else { + // If there is no need to permute, we share diff to save memory. 
+ bottom[0]->ShareDiff(*top[0]); + } +} + +INSTANTIATE_CLASS_FB(PermuteLayer); +REGISTER_LAYER_CLASS(Permute); + +} // namespace caffe diff --git a/src/caffe/layers/permute_layer.cu b/src/caffe/layers/permute_layer.cu new file mode 100644 index 00000000000..dff9ba2bfb4 --- /dev/null +++ b/src/caffe/layers/permute_layer.cu @@ -0,0 +1,104 @@ +#include +#include +#include +#include + +#include "caffe/layers/permute_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +__global__ void PermuteKernel(const int nthreads, + Dtype* bottom_data, const bool forward, const int* permute_order, + const int* old_steps, const int* new_steps, const int num_axes, + Dtype* top_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int temp_idx = index; + int old_idx = 0; + for (int i = 0; i < num_axes; ++i) { + int order = permute_order[i]; + old_idx += (temp_idx / new_steps[i]) * old_steps[order]; + temp_idx %= new_steps[i]; + } + if (forward) { + top_data[index] = bottom_data[old_idx]; + } else { + bottom_data[old_idx] = top_data[index]; + } + } +} + +template <> +__global__ void PermuteKernel(const int nthreads, float16* bottom_data, + const bool forward, const int* permute_order, + const int* old_steps, const int* new_steps, + const int num_axes, float16* top_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int temp_idx = index; + int old_idx = 0; + for (int i = 0; i < num_axes; ++i) { + int order = permute_order[i]; + old_idx += (temp_idx / new_steps[i]) * old_steps[order]; + temp_idx %= new_steps[i]; + } + if (forward) { + top_data[index] = bottom_data[old_idx]; + } else { + bottom_data[old_idx] = top_data[index]; + } + } +} + + +template +void PermuteLayer::Forward_gpu(const vector& bottom, + const vector& top) { + if (need_permute_) { + Dtype* bottom_data = const_cast(bottom[0]->gpu_data()); + Dtype* top_data = top[0]->mutable_gpu_data(); + int count = top[0]->count(); + const int* permute_order = permute_order_.gpu_data(); + const int* new_steps = 
new_steps_.gpu_data(); + const int* old_steps = old_steps_.gpu_data(); + bool foward = true; + cudaStream_t stream = Caffe::thread_stream(); + // NOLINT_NEXT_LINE(whitespace/operators) + PermuteKernel<<>>( + count, bottom_data, foward, permute_order, old_steps, new_steps, + num_axes_, top_data); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); + } else { + // If there is no need to permute, we share data to save memory. + top[0]->ShareData(*bottom[0]); + } +} + +template +void PermuteLayer::Backward_gpu(const vector& top, + const vector& propagate_down, const vector& bottom) { + if (need_permute_) { + Dtype* top_diff = top[0]->mutable_gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + const int* permute_order = permute_order_.gpu_data(); + const int* new_steps = new_steps_.gpu_data(); + const int* old_steps = old_steps_.gpu_data(); + bool foward = false; + cudaStream_t stream = Caffe::thread_stream(); + // NOLINT_NEXT_LINE(whitespace/operators) + PermuteKernel<<>>( + count, bottom_diff, foward, permute_order, old_steps, new_steps, + num_axes_, top_diff); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); + } else { + // If there is no need to permute, we share diff to save memory. + bottom[0]->ShareDiff(*top[0]); + } +} + +INSTANTIATE_LAYER_GPU_FUNCS_FB(PermuteLayer); + +} // namespace caffe diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index 6611f5b73b8..8dff65ba8c4 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -118,7 +118,7 @@ void PoolingLayer::Reshape(const vector& bottom, // If stochastic pooling, we will initialize the random index part. 
if (this->layer_param_.pooling_param().pool() == PoolingParameter_PoolMethod_STOCHASTIC) { - rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, + rand_idx_->Reshape(bottom[0]->num(), channels_, pooled_height_, pooled_width_); } } diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/pooling_layer.cu index 57c05d564ac..70469d88ab4 100644 --- a/src/caffe/layers/pooling_layer.cu +++ b/src/caffe/layers/pooling_layer.cu @@ -25,7 +25,7 @@ __global__ void MaxPoolForward(const int nthreads, const int wend = min(wstart + kernel_w, width); hstart = max(hstart, 0); wstart = max(wstart, 0); - float maxval = -static_cast(max_dtype()); // TODO Ftype? + float maxval = -static_cast(max_dtype()); int maxidx = -1; const Ftype* const bottom_slice = bottom_data + (n * channels + c) * height * width; @@ -87,7 +87,7 @@ __global__ void StoPoolForwardTrain(const int nthreads, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, float* const rand_idx, Ftype* const top_data) { + const int stride_w, Ftype* const rand_idx, Ftype* const top_data) { CUDA_KERNEL_LOOP(index, nthreads) { const int pw = index % pooled_width; const int ph = (index / pooled_width) % pooled_height; @@ -97,7 +97,7 @@ __global__ void StoPoolForwardTrain(const int nthreads, const int hend = min(hstart + kernel_h, height); const int wstart = pw * stride_w; const int wend = min(wstart + kernel_w, width); - float cumsum = 0.; + float cumsum = 0.F; const Ftype* const bottom_slice = bottom_data + (n * channels + c) * height * width; // First pass: get sum @@ -106,7 +106,7 @@ __global__ void StoPoolForwardTrain(const int nthreads, cumsum += bottom_slice[h * width + w]; } } - const float thres = rand_idx[index] * cumsum; + const Ftype thres = rand_idx[index] * cumsum; // Second pass: get value, and set index. 
cumsum = 0; for (int h = hstart; h < hend; ++h) { @@ -190,15 +190,14 @@ void PoolingLayer::Forward_gpu(const vector& bottom, case PoolingParameter_PoolMethod_STOCHASTIC: if (this->phase_ == TRAIN) { // We need to create the random index as well. - caffe_gpu_rng_uniform(count, 0.F, 1.F, - rand_idx_.mutable_gpu_data()); + caffe_gpu_rng_uniform(count, Ftype(0), Ftype(1), rand_idx_->mutable_gpu_data()); // NOLINT_NEXT_LINE(whitespace/operators) StoPoolForwardTrain<<>>( count, bottom_data, bottom[0]->num(), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, - rand_idx_.mutable_gpu_data(), top[0]->mutable_gpu_data()); + rand_idx_->mutable_gpu_data(), top[0]->mutable_gpu_data()); } else { // NOLINT_NEXT_LINE(whitespace/operators) StoPoolForwardTest<< __global__ void StoPoolBackward(const int nthreads, - const float* const rand_idx, const Btype* const top_diff, + const Btype* const rand_idx, const Btype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, @@ -319,8 +318,8 @@ __global__ void StoPoolBackward(const int nthreads, const int phend = min(h / stride_h + 1, pooled_height); const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; const int pwend = min(w / stride_w + 1, pooled_width); - float gradient = 0; - const float* const rand_idx_slice = + float gradient = 0.F; + const Btype* const rand_idx_slice = rand_idx + (n * channels + c) * pooled_height * pooled_width; const Btype* const top_diff_slice = top_diff + (n * channels + c) * pooled_height * pooled_width; @@ -378,7 +377,7 @@ void PoolingLayer::Backward_gpu(const vector& top, case PoolingParameter_PoolMethod_STOCHASTIC: // NOLINT_NEXT_LINE(whitespace/operators) StoPoolBackward<<>>( - count, rand_idx_.gpu_data(), top_diff, + count, rand_idx_->gpu_data(), top_diff, top[0]->num(), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, bottom_diff); diff --git a/src/caffe/layers/prior_box_layer.cpp b/src/caffe/layers/prior_box_layer.cpp new file mode 100644 index 00000000000..b557a99e879 --- /dev/null +++ b/src/caffe/layers/prior_box_layer.cpp @@ -0,0 +1,224 @@ +#include +#include +#include +#include + +#include "caffe/layers/prior_box_layer.hpp" + +namespace caffe { + +template +void PriorBoxLayer::LayerSetUp(const vector& bottom, + const vector& top) { + const PriorBoxParameter& prior_box_param = + this->layer_param_.prior_box_param(); + CHECK_GT(prior_box_param.min_size_size(), 0) << "must provide min_size."; + for (int i = 0; i < prior_box_param.min_size_size(); ++i) { + min_sizes_.push_back(prior_box_param.min_size(i)); + CHECK_GT(min_sizes_.back(), 0) << "min_size must be positive."; + } + aspect_ratios_.clear(); + aspect_ratios_.push_back(1.); + flip_ = prior_box_param.flip(); + for (int i = 0; i < prior_box_param.aspect_ratio_size(); ++i) { + float ar = prior_box_param.aspect_ratio(i); + bool already_exist = false; + for (int j = 0; j < aspect_ratios_.size(); ++j) { + if (fabs(ar - aspect_ratios_[j]) < 1e-6) { + already_exist = true; + break; + } + } + if (!already_exist) { + aspect_ratios_.push_back(ar); + if (flip_) { + 
aspect_ratios_.push_back(1./ar); + } + } + } + num_priors_ = aspect_ratios_.size() * min_sizes_.size(); + if (prior_box_param.max_size_size() > 0) { + CHECK_EQ(prior_box_param.min_size_size(), prior_box_param.max_size_size()); + for (int i = 0; i < prior_box_param.max_size_size(); ++i) { + max_sizes_.push_back(prior_box_param.max_size(i)); + CHECK_GT(max_sizes_[i], min_sizes_[i]) + << "max_size must be greater than min_size."; + num_priors_ += 1; + } + } + clip_ = prior_box_param.clip(); + if (prior_box_param.variance_size() > 1) { + // Must and only provide 4 variance. + CHECK_EQ(prior_box_param.variance_size(), 4); + for (int i = 0; i < prior_box_param.variance_size(); ++i) { + CHECK_GT(prior_box_param.variance(i), 0); + variance_.push_back(prior_box_param.variance(i)); + } + } else if (prior_box_param.variance_size() == 1) { + CHECK_GT(prior_box_param.variance(0), 0); + variance_.push_back(prior_box_param.variance(0)); + } else { + // Set default to 0.1. + variance_.push_back(0.1); + } + + if (prior_box_param.has_img_h() || prior_box_param.has_img_w()) { + CHECK(!prior_box_param.has_img_size()) + << "Either img_size or img_h/img_w should be specified; not both."; + img_h_ = prior_box_param.img_h(); + CHECK_GT(img_h_, 0) << "img_h should be larger than 0."; + img_w_ = prior_box_param.img_w(); + CHECK_GT(img_w_, 0) << "img_w should be larger than 0."; + } else if (prior_box_param.has_img_size()) { + const int img_size = prior_box_param.img_size(); + CHECK_GT(img_size, 0) << "img_size should be larger than 0."; + img_h_ = img_size; + img_w_ = img_size; + } else { + img_h_ = 0; + img_w_ = 0; + } + + if (prior_box_param.has_step_h() || prior_box_param.has_step_w()) { + CHECK(!prior_box_param.has_step()) + << "Either step or step_h/step_w should be specified; not both."; + step_h_ = prior_box_param.step_h(); + CHECK_GT(step_h_, 0.) << "step_h should be larger than 0."; + step_w_ = prior_box_param.step_w(); + CHECK_GT(step_w_, 0.) 
<< "step_w should be larger than 0."; + } else if (prior_box_param.has_step()) { + const float step = prior_box_param.step(); + CHECK_GT(step, 0) << "step should be larger than 0."; + step_h_ = step; + step_w_ = step; + } else { + step_h_ = 0; + step_w_ = 0; + } + + offset_ = prior_box_param.offset(); +} + +template +void PriorBoxLayer::Reshape(const vector& bottom, + const vector& top) { + const int layer_width = bottom[0]->width(); + const int layer_height = bottom[0]->height(); + vector top_shape(3, 1); + // Since all images in a batch has same height and width, we only need to + // generate one set of priors which can be shared across all images. + top_shape[0] = 1; + // 2 channels. First channel stores the mean of each prior coordinate. + // Second channel stores the variance of each prior coordinate. + top_shape[1] = 2; + top_shape[2] = layer_width * layer_height * num_priors_ * 4; + CHECK_GT(top_shape[2], 0); + top[0]->Reshape(top_shape); +} + +template +void PriorBoxLayer::Forward_cpu(const vector& bottom, + const vector& top) { + const int layer_width = bottom[0]->width(); + const int layer_height = bottom[0]->height(); + int img_width, img_height; + if (img_h_ == 0 || img_w_ == 0) { + img_width = bottom[1]->width(); + img_height = bottom[1]->height(); + } else { + img_width = img_w_; + img_height = img_h_; + } + float step_w, step_h; + if (step_w_ == 0 || step_h_ == 0) { + step_w = static_cast(img_width) / layer_width; + step_h = static_cast(img_height) / layer_height; + } else { + step_w = step_w_; + step_h = step_h_; + } + Ftype* top_data = top[0]->mutable_cpu_data(); + int dim = layer_height * layer_width * num_priors_ * 4; + int idx = 0; + for (int h = 0; h < layer_height; ++h) { + for (int w = 0; w < layer_width; ++w) { + float center_x = (w + offset_) * step_w; + float center_y = (h + offset_) * step_h; + float box_width, box_height; + for (int s = 0; s < min_sizes_.size(); ++s) { + int min_size_ = min_sizes_[s]; + // first prior: aspect_ratio = 1, 
size = min_size + box_width = box_height = min_size_; + // xmin + top_data[idx++] = (center_x - box_width / 2.) / img_width; + // ymin + top_data[idx++] = (center_y - box_height / 2.) / img_height; + // xmax + top_data[idx++] = (center_x + box_width / 2.) / img_width; + // ymax + top_data[idx++] = (center_y + box_height / 2.) / img_height; + + if (max_sizes_.size() > 0) { + CHECK_EQ(min_sizes_.size(), max_sizes_.size()); + int max_size_ = max_sizes_[s]; + // second prior: aspect_ratio = 1, size = sqrt(min_size * max_size) + box_width = box_height = sqrt(min_size_ * max_size_); + // xmin + top_data[idx++] = (center_x - box_width / 2.) / img_width; + // ymin + top_data[idx++] = (center_y - box_height / 2.) / img_height; + // xmax + top_data[idx++] = (center_x + box_width / 2.) / img_width; + // ymax + top_data[idx++] = (center_y + box_height / 2.) / img_height; + } + + // rest of priors + for (int r = 0; r < aspect_ratios_.size(); ++r) { + float ar = aspect_ratios_[r]; + if (fabs(ar - 1.) < 1e-6) { + continue; + } + box_width = min_size_ * sqrt(ar); + box_height = min_size_ / sqrt(ar); + // xmin + top_data[idx++] = (center_x - box_width / 2.) / img_width; + // ymin + top_data[idx++] = (center_y - box_height / 2.) / img_height; + // xmax + top_data[idx++] = (center_x + box_width / 2.) / img_width; + // ymax + top_data[idx++] = (center_y + box_height / 2.) / img_height; + } + } + } + } + // clip the prior's coordidate such that it is within [0, 1] + if (clip_) { + for (int d = 0; d < dim; ++d) { + top_data[d] = std::min(std::max(top_data[d], 0.), 1.); + } + } + // set the variance. 
+ top_data += top[0]->offset(0, 1); + if (variance_.size() == 1) { + caffe_set(dim, Dtype(variance_[0]), top_data); + } else { + int count = 0; + for (int h = 0; h < layer_height; ++h) { + for (int w = 0; w < layer_width; ++w) { + for (int i = 0; i < num_priors_; ++i) { + for (int j = 0; j < 4; ++j) { + top_data[count] = variance_[j]; + ++count; + } + } + } + } + } +} + +INSTANTIATE_CLASS_FB(PriorBoxLayer); +REGISTER_LAYER_CLASS(PriorBox); + +} // namespace caffe diff --git a/src/caffe/layers/recurrent_layer.cpp b/src/caffe/layers/recurrent_layer.cpp new file mode 100644 index 00000000000..344d399a9e6 --- /dev/null +++ b/src/caffe/layers/recurrent_layer.cpp @@ -0,0 +1,296 @@ +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/layers/recurrent_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void RecurrentLayer::LayerSetUp(const vector& bottom, + const vector& top) { + CHECK_GE(bottom[0]->num_axes(), 2) + << "bottom[0] must have at least 2 axes -- (#timesteps, #streams, ...)"; + T_ = bottom[0]->shape(0); + N_ = bottom[0]->shape(1); + LOG(INFO) << "Initializing recurrent layer: assuming input batch contains " + << T_ << " timesteps of " << N_ << " independent streams."; + + CHECK_EQ(bottom[1]->num_axes(), 2) + << "bottom[1] must have exactly 2 axes -- (#timesteps, #streams)"; + CHECK_EQ(T_, bottom[1]->shape(0)); + CHECK_EQ(N_, bottom[1]->shape(1)); + + // If expose_hidden is set, we take as input and produce as output + // the hidden state blobs at the first and last timesteps. + expose_hidden_ = this->layer_param_.recurrent_param().expose_hidden(); + + // Get (recurrent) input/output names. 
+ vector output_names; + OutputBlobNames(&output_names); + vector recur_input_names; + RecurrentInputBlobNames(&recur_input_names); + vector recur_output_names; + RecurrentOutputBlobNames(&recur_output_names); + const int num_recur_blobs = recur_input_names.size(); + CHECK_EQ(num_recur_blobs, recur_output_names.size()); + + // If provided, bottom[2] is a static input to the recurrent net. + const int num_hidden_exposed = expose_hidden_ * num_recur_blobs; + static_input_ = (bottom.size() > 2 + num_hidden_exposed); + if (static_input_) { + CHECK_GE(bottom[2]->num_axes(), 1); + CHECK_EQ(N_, bottom[2]->shape(0)); + } + + // Create a NetParameter; setup the inputs that aren't unique to particular + // recurrent architectures. + NetParameter net_param; + net_param.set_default_forward_type(tp()); + net_param.set_default_backward_type(tp()); + net_param.set_default_forward_math(tp()); + net_param.set_default_backward_math(tp()); + + LayerParameter* input_layer_param = net_param.add_layer(); + input_layer_param->set_type("Input"); + InputParameter* input_param = input_layer_param->mutable_input_param(); + input_layer_param->add_top("x"); + BlobShape input_shape; + for (int i = 0; i < bottom[0]->num_axes(); ++i) { + input_shape.add_dim(bottom[0]->shape(i)); + } + input_param->add_shape()->CopyFrom(input_shape); + + input_shape.Clear(); + for (int i = 0; i < bottom[1]->num_axes(); ++i) { + input_shape.add_dim(bottom[1]->shape(i)); + } + input_layer_param->add_top("cont"); + input_param->add_shape()->CopyFrom(input_shape); + + if (static_input_) { + input_shape.Clear(); + for (int i = 0; i < bottom[2]->num_axes(); ++i) { + input_shape.add_dim(bottom[2]->shape(i)); + } + input_layer_param->add_top("x_static"); + input_param->add_shape()->CopyFrom(input_shape); + } + + // Call the child's FillUnrolledNet implementation to specify the unrolled + // recurrent architecture. 
+ this->FillUnrolledNet(&net_param); + + // Prepend this layer's name to the names of each layer in the unrolled net. + const string& layer_name = this->layer_param_.name(); + if (layer_name.size()) { + for (int i = 0; i < net_param.layer_size(); ++i) { + LayerParameter* layer = net_param.mutable_layer(i); + layer->set_name(layer_name + "_" + layer->name()); + } + } + + // Add "pseudo-losses" to all outputs to force backpropagation. + // (Setting force_backward is too aggressive as we may not need to backprop to + // all inputs, e.g., the sequence continuation indicators.) + vector pseudo_losses(output_names.size()); + for (int i = 0; i < output_names.size(); ++i) { + LayerParameter* layer = net_param.add_layer(); + pseudo_losses[i] = output_names[i] + "_pseudoloss"; + layer->set_name(pseudo_losses[i]); + layer->set_type("Reduction"); + layer->add_bottom(output_names[i]); + layer->add_top(pseudo_losses[i]); + layer->add_loss_weight(1); + } + + const size_t rank = this->parent_rank(); + // Create the unrolled net. + unrolled_net_.reset(new Net(net_param, rank, nullptr, nullptr, true)); + unrolled_net_->set_debug_info( + this->layer_param_.recurrent_param().debug_info()); + + // Setup pointers to the inputs. + x_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("x").get()); + cont_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("cont").get()); + if (static_input_) { + x_static_input_blob_ = + CHECK_NOTNULL(unrolled_net_->blob_by_name("x_static").get()); + } + + // Setup pointers to paired recurrent inputs/outputs. + recur_input_blobs_.resize(num_recur_blobs); + recur_output_blobs_.resize(num_recur_blobs); + for (int i = 0; i < recur_input_names.size(); ++i) { + recur_input_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_input_names[i]).get()); + recur_output_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_output_names[i]).get()); + } + + // Setup pointers to outputs. 
+ CHECK_EQ(top.size() - num_hidden_exposed, output_names.size()) + << "OutputBlobNames must provide an output blob name for each top."; + output_blobs_.resize(output_names.size()); + for (int i = 0; i < output_names.size(); ++i) { + output_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(output_names[i]).get()); + } + + // We should have 2 inputs (x and cont), plus a number of recurrent inputs, + // plus maybe a static input. + CHECK_EQ(2 + num_recur_blobs + static_input_, + unrolled_net_->input_blobs().size()); + + // This layer's parameters are any parameters in the layers of the unrolled + // net. We only want one copy of each parameter, so check that the parameter + // is "owned" by the layer, rather than shared with another. + this->blobs_.clear(); + for (int i = 0; i < unrolled_net_->params().size(); ++i) { + if (unrolled_net_->param_owners()[i] == -1) { + LOG(INFO) << "Adding parameter " << i << ": " + << unrolled_net_->param_display_names()[i]; + this->blobs_.push_back(unrolled_net_->params()[i]); + } + } + // Check that param_propagate_down is set for all of the parameters in the + // unrolled net; set param_propagate_down to true in this layer. + for (int i = 0; i < unrolled_net_->layers().size(); ++i) { + for (int j = 0; j < unrolled_net_->layers()[i]->blobs().size(); ++j) { + CHECK(unrolled_net_->layers()[i]->param_propagate_down(j)) + << "param_propagate_down not set for layer " << i << ", param " << j; + } + } + this->param_propagate_down_.clear(); + this->param_propagate_down_.resize(this->blobs_.size(), true); + + // Set the diffs of recurrent outputs to 0 -- we can't backpropagate across + // batches. + for (int i = 0; i < recur_output_blobs_.size(); ++i) { + caffe_set(recur_output_blobs_[i]->count(), Ftype(0), + recur_output_blobs_[i]->mutable_cpu_diff()); + } + + // Check that the last output_names.size() layers are the pseudo-losses; + // set last_layer_index so that we don't actually run these layers. 
+ const vector& layer_names = unrolled_net_->layer_names(); + last_layer_index_ = layer_names.size() - 1 - pseudo_losses.size(); + for (int i = last_layer_index_ + 1, j = 0; i < layer_names.size(); ++i, ++j) { + CHECK_EQ(layer_names[i], pseudo_losses[j]); + } +} + +template +void RecurrentLayer::Reshape(const vector& bottom, + const vector& top) { + CHECK_GE(bottom[0]->num_axes(), 2) + << "bottom[0] must have at least 2 axes -- (#timesteps, #streams, ...)"; + CHECK_EQ(T_, bottom[0]->shape(0)) << "input number of timesteps changed"; + N_ = bottom[0]->shape(1); + CHECK_EQ(bottom[1]->num_axes(), 2) + << "bottom[1] must have exactly 2 axes -- (#timesteps, #streams)"; + CHECK_EQ(T_, bottom[1]->shape(0)); + CHECK_EQ(N_, bottom[1]->shape(1)); + x_input_blob_->ReshapeLike(*bottom[0]); + vector cont_shape = bottom[1]->shape(); + cont_input_blob_->Reshape(cont_shape); + if (static_input_) { + x_static_input_blob_->ReshapeLike(*bottom[2]); + } + vector recur_input_shapes; + RecurrentInputShapes(&recur_input_shapes); + CHECK_EQ(recur_input_shapes.size(), recur_input_blobs_.size()); + for (int i = 0; i < recur_input_shapes.size(); ++i) { + recur_input_blobs_[i]->Reshape(recur_input_shapes[i]); + } + unrolled_net_->Reshape(); + x_input_blob_->ShareData(*bottom[0]); + x_input_blob_->ShareDiff(*bottom[0]); + cont_input_blob_->ShareData(*bottom[1]); + if (static_input_) { + x_static_input_blob_->ShareData(*bottom[2]); + x_static_input_blob_->ShareDiff(*bottom[2]); + } + if (expose_hidden_) { + const int bottom_offset = 2 + static_input_; + for (int i = bottom_offset, j = 0; i < bottom.size(); ++i, ++j) { + CHECK(recur_input_blobs_[j]->shape() == bottom[i]->shape()) + << "bottom[" << i << "] shape must match hidden state input shape: " + << recur_input_blobs_[j]->shape_string(); + recur_input_blobs_[j]->ShareData(*bottom[i]); + } + } + for (int i = 0; i < output_blobs_.size(); ++i) { + top[i]->ReshapeLike(*output_blobs_[i]); + top[i]->ShareData(*output_blobs_[i]); + 
top[i]->ShareDiff(*output_blobs_[i]); + } + if (expose_hidden_) { + const int top_offset = output_blobs_.size(); + for (int i = top_offset, j = 0; i < top.size(); ++i, ++j) { + top[i]->ReshapeLike(*recur_output_blobs_[j]); + } + } +} + +template +void RecurrentLayer::Reset() { + // "Reset" the hidden state of the net by zeroing out all recurrent outputs. + for (int i = 0; i < recur_output_blobs_.size(); ++i) { + caffe_set(recur_output_blobs_[i]->count(), Ftype(0), + recur_output_blobs_[i]->mutable_cpu_data()); + } +} + +template +void RecurrentLayer::Forward_cpu(const vector& bottom, + const vector& top) { + // Hacky fix for test time: reshare all the internal shared blobs, which may + // currently point to a stale owner blob that was dropped when Solver::Test + // called test_net->ShareTrainedLayersWith(net_.get()). + // TODO: somehow make this work non-hackily. + if (this->phase_ == TEST) { + unrolled_net_->ShareWeights(); + } + + DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size()); + if (!expose_hidden_) { + for (int i = 0; i < recur_input_blobs_.size(); ++i) { + const int count = recur_input_blobs_[i]->count(); + DCHECK_EQ(count, recur_output_blobs_[i]->count()); + const Ftype* timestep_T_data = recur_output_blobs_[i]->cpu_data(); + Ftype* timestep_0_data = recur_input_blobs_[i]->mutable_cpu_data(); + caffe_copy(count, timestep_T_data, timestep_0_data); + } + } + + unrolled_net_->ForwardTo(last_layer_index_); + + if (expose_hidden_) { + const int top_offset = output_blobs_.size(); + for (int i = top_offset, j = 0; i < top.size(); ++i, ++j) { + top[i]->ShareData(*recur_output_blobs_[j]); + } + } +} + +template +void RecurrentLayer::Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom) { + CHECK(!propagate_down[1]) << "Cannot backpropagate to sequence indicators."; + + // TODO: skip backpropagation to inputs and parameters inside the unrolled + // net according to propagate_down[0] and propagate_down[2]. 
For now just + // backprop to inputs and parameters unconditionally, as either the inputs or + // the parameters do need backward (or Net would have set + // layer_needs_backward_[i] == false for this layer). + unrolled_net_->BackwardFromToAu(last_layer_index_, 0, false); +} + +INSTANTIATE_CLASS_FB(RecurrentLayer); + +} // namespace caffe diff --git a/src/caffe/layers/recurrent_layer.cu b/src/caffe/layers/recurrent_layer.cu new file mode 100644 index 00000000000..102ba15ebad --- /dev/null +++ b/src/caffe/layers/recurrent_layer.cu @@ -0,0 +1,44 @@ +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/layers/recurrent_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void RecurrentLayer::Forward_gpu(const vector& bottom, + const vector& top) { + // Hacky fix for test time... reshare all the shared blobs. + // TODO: somehow make this work non-hackily. + if (this->phase_ == TEST) { + unrolled_net_->ShareWeights(); + } + + DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size()); + if (!expose_hidden_) { + for (int i = 0; i < recur_input_blobs_.size(); ++i) { + const int count = recur_input_blobs_[i]->count(); + DCHECK_EQ(count, recur_output_blobs_[i]->count()); + const Ftype* timestep_T_data = recur_output_blobs_[i]->gpu_data(); + Ftype* timestep_0_data = recur_input_blobs_[i]->mutable_gpu_data(); + caffe_copy(count, timestep_T_data, timestep_0_data); + } + } + + unrolled_net_->ForwardTo(last_layer_index_); + + if (expose_hidden_) { + const int top_offset = output_blobs_.size(); + for (int i = top_offset, j = 0; i < top.size(); ++i, ++j) { + top[i]->ShareData(*recur_output_blobs_[j]); + } + } +} + +INSTANTIATE_LAYER_GPU_FORWARD_ONLY_FB(RecurrentLayer); + +} // namespace caffe diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp index 7f3952dd5cc..b573f27c583 100644 --- 
a/src/caffe/layers/reduction_layer.cpp +++ b/src/caffe/layers/reduction_layer.cpp @@ -29,8 +29,10 @@ void ReductionLayer::Reshape(const vector& bottom, if (op_ == ReductionParameter_ReductionOp_SUM || op_ == ReductionParameter_ReductionOp_MEAN) { vector sum_mult_shape(1, dim_); - sum_multiplier_.Reshape(sum_mult_shape); - caffe_set(dim_, Ftype(1), sum_multiplier_.template mutable_cpu_data()); + if (sum_multiplier_.shape() != sum_mult_shape) { + sum_multiplier_.Reshape(sum_mult_shape); + caffe_set(dim_, Ftype(1), sum_multiplier_.mutable_cpu_data()); + } } coeff_ = this->layer_param().reduction_param().coeff(); if (op_ == ReductionParameter_ReductionOp_MEAN) { diff --git a/src/caffe/layers/reshape_layer.cpp b/src/caffe/layers/reshape_layer.cpp index f448124f462..1e9a88b96b3 100644 --- a/src/caffe/layers/reshape_layer.cpp +++ b/src/caffe/layers/reshape_layer.cpp @@ -87,7 +87,7 @@ void ReshapeLayer::Reshape(const vector& bottom, CHECK_EQ(top[0]->count(), bottom[0]->count()) << "output count must match input count"; top[0]->ShareData(*bottom[0]); - top[0]->ShareDiff(*bottom[0]); + bottom[0]->ShareDiff(*top[0]); } INSTANTIATE_CLASS_FB(ReshapeLayer); diff --git a/src/caffe/layers/rnn_layer.cpp b/src/caffe/layers/rnn_layer.cpp new file mode 100644 index 00000000000..d4af9e1c99f --- /dev/null +++ b/src/caffe/layers/rnn_layer.cpp @@ -0,0 +1,236 @@ +#include +#include + +#include "caffe/common.hpp" +#include "caffe/blob.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/layers/rnn_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void RNNLayer::RecurrentInputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "h_0"; +} + +template +void RNNLayer::RecurrentOutputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "h_" + format_int(this->T_); +} + +template +void RNNLayer::RecurrentInputShapes(vector* shapes) const { + const int num_output = 
this->layer_param_.recurrent_param().num_output(); + shapes->resize(1); + (*shapes)[0].Clear(); + (*shapes)[0].add_dim(1); // a single timestep + (*shapes)[0].add_dim(this->N_); + (*shapes)[0].add_dim(num_output); +} + +template +void RNNLayer::OutputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "o"; +} + +template +void RNNLayer::FillUnrolledNet(NetParameter* net_param) const { + const int num_output = this->layer_param_.recurrent_param().num_output(); + CHECK_GT(num_output, 0) << "num_output must be positive"; + const FillerParameter& weight_filler = + this->layer_param_.recurrent_param().weight_filler(); + const FillerParameter& bias_filler = + this->layer_param_.recurrent_param().bias_filler(); + + // Add generic LayerParameter's (without bottoms/tops) of layer types we'll + // use to save redundant code. + LayerParameter hidden_param; + hidden_param.set_type("InnerProduct"); + hidden_param.mutable_inner_product_param()->set_num_output(num_output); + hidden_param.mutable_inner_product_param()->set_bias_term(false); + hidden_param.mutable_inner_product_param()->set_axis(2); + hidden_param.mutable_inner_product_param()-> + mutable_weight_filler()->CopyFrom(weight_filler); + + LayerParameter biased_hidden_param(hidden_param); + biased_hidden_param.mutable_inner_product_param()->set_bias_term(true); + biased_hidden_param.mutable_inner_product_param()-> + mutable_bias_filler()->CopyFrom(bias_filler); + + LayerParameter sum_param; + sum_param.set_type("Eltwise"); + sum_param.mutable_eltwise_param()->set_operation( + EltwiseParameter_EltwiseOp_SUM); + + LayerParameter tanh_param; + tanh_param.set_type("TanH"); + + LayerParameter scale_param; + scale_param.set_type("Scale"); + scale_param.mutable_scale_param()->set_axis(0); + + LayerParameter slice_param; + slice_param.set_type("Slice"); + slice_param.mutable_slice_param()->set_axis(0); + + vector input_shapes; + RecurrentInputShapes(&input_shapes); + CHECK_EQ(1, input_shapes.size()); + + 
LayerParameter* input_layer_param = net_param->add_layer(); + input_layer_param->set_type("Input"); + InputParameter* input_param = input_layer_param->mutable_input_param(); + input_layer_param->add_top("h_0"); + input_param->add_shape()->CopyFrom(input_shapes[0]); + + LayerParameter* cont_slice_param = net_param->add_layer(); + cont_slice_param->CopyFrom(slice_param); + cont_slice_param->set_name("cont_slice"); + cont_slice_param->add_bottom("cont"); + cont_slice_param->mutable_slice_param()->set_axis(0); + + // Add layer to transform all timesteps of x to the hidden state dimension. + // W_xh_x = W_xh * x + b_h + { + LayerParameter* x_transform_param = net_param->add_layer(); + x_transform_param->CopyFrom(biased_hidden_param); + x_transform_param->set_name("x_transform"); + x_transform_param->add_param()->set_name("W_xh"); + x_transform_param->add_param()->set_name("b_h"); + x_transform_param->add_bottom("x"); + x_transform_param->add_top("W_xh_x"); + x_transform_param->add_propagate_down(true); + } + + if (this->static_input_) { + // Add layer to transform x_static to the hidden state dimension. + // W_xh_x_static = W_xh_static * x_static + LayerParameter* x_static_transform_param = net_param->add_layer(); + x_static_transform_param->CopyFrom(hidden_param); + x_static_transform_param->mutable_inner_product_param()->set_axis(1); + x_static_transform_param->set_name("W_xh_x_static"); + x_static_transform_param->add_param()->set_name("W_xh_static"); + x_static_transform_param->add_bottom("x_static"); + x_static_transform_param->add_top("W_xh_x_static_preshape"); + x_static_transform_param->add_propagate_down(true); + + LayerParameter* reshape_param = net_param->add_layer(); + reshape_param->set_type("Reshape"); + BlobShape* new_shape = + reshape_param->mutable_reshape_param()->mutable_shape(); + new_shape->add_dim(1); // One timestep. + // Should infer this->N as the dimension so we can reshape on batch size. 
+ new_shape->add_dim(-1); + new_shape->add_dim( + x_static_transform_param->inner_product_param().num_output()); + reshape_param->set_name("W_xh_x_static_reshape"); + reshape_param->add_bottom("W_xh_x_static_preshape"); + reshape_param->add_top("W_xh_x_static"); + } + + LayerParameter* x_slice_param = net_param->add_layer(); + x_slice_param->CopyFrom(slice_param); + x_slice_param->set_name("W_xh_x_slice"); + x_slice_param->add_bottom("W_xh_x"); + + LayerParameter output_concat_layer; + output_concat_layer.set_name("o_concat"); + output_concat_layer.set_type("Concat"); + output_concat_layer.add_top("o"); + output_concat_layer.mutable_concat_param()->set_axis(0); + + for (int t = 1; t <= this->T_; ++t) { + string tm1s = format_int(t - 1); + string ts = format_int(t); + + cont_slice_param->add_top("cont_" + ts); + x_slice_param->add_top("W_xh_x_" + ts); + + // Add layer to flush the hidden state when beginning a new sequence, + // as indicated by cont_t. + // h_conted_{t-1} := cont_t * h_{t-1} + // + // Normally, cont_t is binary (i.e., 0 or 1), so: + // h_conted_{t-1} := h_{t-1} if cont_t == 1 + // 0 otherwise + { + LayerParameter* cont_h_param = net_param->add_layer(); + cont_h_param->CopyFrom(scale_param); + cont_h_param->set_name("h_conted_" + tm1s); + cont_h_param->add_bottom("h_" + tm1s); + cont_h_param->add_bottom("cont_" + ts); + cont_h_param->add_top("h_conted_" + tm1s); + } + + // Add layer to compute + // W_hh_h_{t-1} := W_hh * h_conted_{t-1} + { + LayerParameter* w_param = net_param->add_layer(); + w_param->CopyFrom(hidden_param); + w_param->set_name("W_hh_h_" + tm1s); + w_param->add_param()->set_name("W_hh"); + w_param->add_bottom("h_conted_" + tm1s); + w_param->add_top("W_hh_h_" + tm1s); + w_param->mutable_inner_product_param()->set_axis(2); + } + + // Add layers to compute + // h_t := \tanh( W_hh * h_conted_{t-1} + W_xh * x_t + b_h ) + // = \tanh( W_hh_h_{t-1} + W_xh_t ) + { + LayerParameter* h_input_sum_param = net_param->add_layer(); + 
h_input_sum_param->CopyFrom(sum_param); + h_input_sum_param->set_name("h_input_sum_" + ts); + h_input_sum_param->add_bottom("W_hh_h_" + tm1s); + h_input_sum_param->add_bottom("W_xh_x_" + ts); + if (this->static_input_) { + h_input_sum_param->add_bottom("W_xh_x_static"); + } + h_input_sum_param->add_top("h_neuron_input_" + ts); + } + { + LayerParameter* h_neuron_param = net_param->add_layer(); + h_neuron_param->CopyFrom(tanh_param); + h_neuron_param->set_name("h_neuron_" + ts); + h_neuron_param->add_bottom("h_neuron_input_" + ts); + h_neuron_param->add_top("h_" + ts); + } + + // Add layer to compute + // W_ho_h_t := W_ho * h_t + b_o + { + LayerParameter* w_param = net_param->add_layer(); + w_param->CopyFrom(biased_hidden_param); + w_param->set_name("W_ho_h_" + ts); + w_param->add_param()->set_name("W_ho"); + w_param->add_param()->set_name("b_o"); + w_param->add_bottom("h_" + ts); + w_param->add_top("W_ho_h_" + ts); + w_param->mutable_inner_product_param()->set_axis(2); + } + + // Add layers to compute + // o_t := \tanh( W_ho * h_t + b_o) + // = \tanh( W_ho_h_t ) + { + LayerParameter* o_neuron_param = net_param->add_layer(); + o_neuron_param->CopyFrom(tanh_param); + o_neuron_param->set_name("o_neuron_" + ts); + o_neuron_param->add_bottom("W_ho_h_" + ts); + o_neuron_param->add_top("o_" + ts); + } + output_concat_layer.add_bottom("o_" + ts); + } // for (int t = 1; t <= this->T_; ++t) + + net_param->add_layer()->CopyFrom(output_concat_layer); +} + +INSTANTIATE_CLASS_FB(RNNLayer); +REGISTER_LAYER_CLASS(RNN); + +} // namespace caffe diff --git a/src/caffe/layers/scale_layer.cpp b/src/caffe/layers/scale_layer.cpp index e9045f8b779..200ed680d3f 100644 --- a/src/caffe/layers/scale_layer.cpp +++ b/src/caffe/layers/scale_layer.cpp @@ -4,7 +4,7 @@ #include "caffe/filler.hpp" #include "caffe/layer_factory.hpp" #include "caffe/layers/scale_layer.hpp" -#include "caffe/util/math_functions.hpp" +#include "caffe/net.hpp" namespace caffe { @@ -53,7 +53,7 @@ void 
ScaleLayer::LayerSetUp(const vector& bottom, bias_param->set_num_axes(param.num_axes()); } bias_param->mutable_filler()->CopyFrom(param.bias_filler()); - bias_layer_ = LayerRegistry::CreateLayer(layer_param, 0UL); + bias_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank()); bias_bottom_vec_.resize(1); bias_bottom_vec_[0] = bottom[0]; bias_layer_->SetUp(bias_bottom_vec_, top); diff --git a/src/caffe/layers/slice_layer.cu b/src/caffe/layers/slice_layer.cu index 0c6eb8a8fb7..d7a7c13ba7b 100644 --- a/src/caffe/layers/slice_layer.cu +++ b/src/caffe/layers/slice_layer.cu @@ -42,9 +42,9 @@ void SliceLayer::Forward_gpu(const vector& bottom, <<>>( nthreads, bottom_data, kForward, num_slices_, slice_size_, bottom_slice_axis, top_slice_axis, offset_slice_axis, top_data); + CUDA_CHECK(cudaStreamSynchronize(stream)); offset_slice_axis += top_slice_axis; } - CUDA_CHECK(cudaStreamSynchronize(stream)); } template @@ -55,15 +55,17 @@ void SliceLayer::Backward_gpu(const vector& top, Btype* bottom_diff = bottom[0]->mutable_gpu_diff(); const int bottom_slice_axis = bottom[0]->shape(slice_axis_); const bool kForward = false; + cudaStream_t stream = Caffe::thread_stream(); for (int i = 0; i < top.size(); ++i) { const Btype* top_diff = top[i]->gpu_diff(); const int top_slice_axis = top[i]->shape(slice_axis_); const int top_slice_size = top_slice_axis * slice_size_; const int nthreads = top_slice_size * num_slices_; Slice // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( + <<>>( nthreads, top_diff, kForward, num_slices_, slice_size_, bottom_slice_axis, top_slice_axis, offset_slice_axis, bottom_diff); + CUDA_CHECK(cudaStreamSynchronize(stream)); offset_slice_axis += top_slice_axis; } } diff --git a/src/caffe/layers/smooth_L1_loss_layer.cpp b/src/caffe/layers/smooth_L1_loss_layer.cpp new file mode 100644 index 00000000000..6cfea641e51 --- /dev/null +++ b/src/caffe/layers/smooth_L1_loss_layer.cpp @@ -0,0 +1,104 @@ +// 
------------------------------------------------------------------ +// Fast R-CNN +// copyright (c) 2015 Microsoft +// Licensed under The MIT License [see fast-rcnn/LICENSE for details] +// Written by Ross Girshick +// Modified by Wei Liu +// ------------------------------------------------------------------ + +#include + +#include "caffe/layers/smooth_L1_loss_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void SmoothL1LossLayer::LayerSetUp( + const vector& bottom, const vector& top) { + LossLayer::LayerSetUp(bottom, top); + has_weights_ = (bottom.size() == 3); +} + +template +void SmoothL1LossLayer::Reshape( + const vector& bottom, const vector& top) { + LossLayer::Reshape(bottom, top); + CHECK_EQ(bottom[0]->channels(), bottom[1]->channels()); + CHECK_EQ(bottom[0]->height(), bottom[1]->height()); + CHECK_EQ(bottom[0]->width(), bottom[1]->width()); + if (has_weights_) { + CHECK_EQ(bottom[0]->channels(), bottom[2]->channels()); + CHECK_EQ(bottom[0]->height(), bottom[2]->height()); + CHECK_EQ(bottom[0]->width(), bottom[2]->width()); + } + diff_.Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + errors_.Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); +} + +template +void SmoothL1LossLayer::Forward_cpu(const vector& bottom, + const vector& top) { + int count = bottom[0]->count(); + caffe_sub( + count, + bottom[0]->cpu_data(), + bottom[1]->cpu_data(), + diff_.mutable_cpu_data()); + if (has_weights_) { + caffe_mul( + count, + bottom[2]->cpu_data(), + diff_.cpu_data(), + diff_.mutable_cpu_data()); // d := w * (b0 - b1) + } + const Dtype* diff_data = diff_.cpu_data(); + Dtype* error_data = errors_.mutable_cpu_data(); + for (int i = 0; i < count; ++i) { + Dtype val = diff_data[i]; + Dtype abs_val = fabs(val); + if (abs_val < 1.) 
{ + error_data[i] = 0.5 * val * val; + } else { + error_data[i] = abs_val - 0.5; + } + } + top[0]->mutable_cpu_data()[0] = + caffe_cpu_asum(count, errors_.cpu_data()) / bottom[0]->num(); +} + +template +void SmoothL1LossLayer::Backward_cpu(const vector& top, + const vector& propagate_down, const vector& bottom) { + int count = diff_.count(); + Dtype* diff_data = diff_.mutable_cpu_data(); + for (int i = 0; i < count; ++i) { + Dtype val = diff_data[i]; + // f'(x) = x if |x| < 1 + // = sign(x) otherwise + if (fabs(val) < 1.) { + diff_data[i] = val; + } else { + diff_data[i] = (Dtype(0) < val) - (val < Dtype(0)); + } + } + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const Dtype sign = (i == 0) ? 1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); + caffe_cpu_axpby( + bottom[i]->count(), // count + alpha, // alpha + diff_.cpu_data(), // a + Dtype(0), // beta + bottom[i]->mutable_cpu_diff()); // b + } + } +} + +INSTANTIATE_CLASS_FB(SmoothL1LossLayer); +REGISTER_LAYER_CLASS(SmoothL1Loss); + +} // namespace caffe diff --git a/src/caffe/layers/smooth_L1_loss_layer.cu b/src/caffe/layers/smooth_L1_loss_layer.cu new file mode 100644 index 00000000000..678e9cc0b07 --- /dev/null +++ b/src/caffe/layers/smooth_L1_loss_layer.cu @@ -0,0 +1,100 @@ +// ------------------------------------------------------------------ +// Fast R-CNN +// copyright (c) 2015 Microsoft +// Licensed under The MIT License [see fast-rcnn/LICENSE for details] +// Written by Ross Girshick +// Modified by Wei Liu +// ------------------------------------------------------------------ + +#include + +#include "caffe/layers/smooth_L1_loss_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +__global__ void SmoothL1Forward(const int n, const Dtype* in, Dtype* out) { + // f(x) = 0.5 * x^2 if |x| < 1 + // |x| - 0.5 otherwise + CUDA_KERNEL_LOOP(index, n) { + Dtype val = in[index]; + Dtype abs_val = abs(val); + if (abs_val < 1) { + out[index] = 
0.5 * val * val; + } else { + out[index] = abs_val - 0.5; + } + } +} + +template +void SmoothL1LossLayer::Forward_gpu(const vector& bottom, + const vector& top) { + int count = bottom[0]->count(); + caffe_gpu_sub( + count, + bottom[0]->gpu_data(), + bottom[1]->gpu_data(), + diff_.mutable_gpu_data()); // d := b0 - b1 + if (has_weights_) { + caffe_gpu_mul( + count, + bottom[2]->gpu_data(), + diff_.gpu_data(), + diff_.mutable_gpu_data()); // d := w * (b0 - b1) + } + cudaStream_t stream = Caffe::thread_stream(); + // NOLINT_NEXT_LINE(whitespace/operators) + SmoothL1Forward<<>>( + count, diff_.gpu_data(), errors_.mutable_gpu_data()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_POST_KERNEL_CHECK; + + Dtype loss; + caffe_gpu_asum(count, errors_.gpu_data(), &loss, 0); + top[0]->mutable_cpu_data()[0] = loss / bottom[0]->num(); +} + +template +__global__ void SmoothL1Backward(const int n, const Dtype* in, Dtype* out) { + // f'(x) = x if |x| < 1 + // = sign(x) otherwise + CUDA_KERNEL_LOOP(index, n) { + Dtype val = in[index]; + Dtype abs_val = abs(val); + if (abs_val < 1) { + out[index] = val; + } else { + out[index] = (Dtype(0) < val) - (val < Dtype(0)); + } + } +} + +template +void SmoothL1LossLayer::Backward_gpu(const vector& top, + const vector& propagate_down, const vector& bottom) { + int count = diff_.count(); + cudaStream_t stream = Caffe::thread_stream(); + // NOLINT_NEXT_LINE(whitespace/operators) + SmoothL1Backward<<>>( + count, diff_.gpu_data(), diff_.mutable_gpu_data()); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const Dtype sign = (i == 0) ? 
1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); + caffe_gpu_axpby( + bottom[i]->count(), // count + alpha, // alpha + diff_.gpu_data(), // x + Dtype(0), // beta + bottom[i]->mutable_gpu_diff()); // y + } + } +} + +INSTANTIATE_LAYER_GPU_FUNCS_FB(SmoothL1LossLayer); + +} // namespace caffe diff --git a/src/caffe/layers/video_data_layer.cpp b/src/caffe/layers/video_data_layer.cpp new file mode 100644 index 00000000000..fd45c397220 --- /dev/null +++ b/src/caffe/layers/video_data_layer.cpp @@ -0,0 +1,157 @@ +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "caffe/data_transformer.hpp" +#include "caffe/layers/video_data_layer.hpp" +#include "caffe/util/benchmark.hpp" + +namespace caffe { + +template +VideoDataLayer::VideoDataLayer(const LayerParameter& param, size_t solver_rank) + : BasePrefetchingDataLayer(param, solver_rank) {} + +template +VideoDataLayer::~VideoDataLayer() { + this->StopInternalThread(); + if (cap_.isOpened()) { + cap_.release(); + } +} + +template +void VideoDataLayer::DataLayerSetUp( + const vector& bottom, const vector& top) { + const int batch_size = this->layer_param_.data_param().batch_size(); + const VideoDataParameter& video_data_param = + this->layer_param_.video_data_param(); + video_type_ = video_data_param.video_type(); + skip_frames_ = video_data_param.skip_frames(); + CHECK_GE(skip_frames_, 0); + TBlob transformed_datum; + // Read an image, and use it to initialize the top blob. 
+ cv::Mat cv_img; + if (video_type_ == VideoDataParameter_VideoType_WEBCAM) { + const int device_id = video_data_param.device_id(); + if (!cap_.open(device_id)) { + LOG(FATAL) << "Failed to open webcam: " << device_id; + } + cap_ >> cv_img; + } else if (video_type_ == VideoDataParameter_VideoType_VIDEO) { + CHECK(video_data_param.has_video_file()) << "Must provide video file!"; + const string& video_file = video_data_param.video_file(); + if (!cap_.open(video_file)) { + LOG(FATAL) << "Failed to open video: " << video_file; + } + total_frames_ = cap_.get(CV_CAP_PROP_FRAME_COUNT); + processed_frames_ = 0; + // Read image to infer shape. + cap_ >> cv_img; + // Set index back to the first frame. + cap_.set(CV_CAP_PROP_POS_FRAMES, 0); + } else { + LOG(FATAL) << "Unknow video type!"; + } + CHECK(cv_img.data) << "Could not load image!"; + // Use data_transformer to infer the expected blob shape from a cv_image. + top_shape_ = this->bdt(0)->InferBlobShape(cv_img); + transformed_datum.Reshape(top_shape_); + top_shape_[0] = batch_size; + top[0]->Reshape(top_shape_); + vector label_shape(1, batch_size); + LOG(INFO) << "output data size: " << top[0]->num() << "," + << top[0]->channels() << "," << top[0]->height() << "," + << top[0]->width(); + // label + if (this->output_labels_) { + top[1]->Reshape(label_shape); + } + this->batch_transformer_->reshape(top_shape_, label_shape, this->is_gpu_transform()); +} + +// This function is called on prefetch thread +template +void VideoDataLayer::load_batch(Batch* batch, int thread_id, size_t queue_id) { + CPUTimer batch_timer; + batch_timer.Start(); + double read_time = 0; + double trans_time = 0; + CPUTimer timer; + CHECK(batch->data_->count()); + TBlob transformed_datum; + + // Reshape according to the first anno_datum of each batch + // on single input batches allows for inputs of varying dimension. 
+ const int batch_size = this->layer_param_.data_param().batch_size(); + top_shape_[0] = 1; + transformed_datum.Reshape(top_shape_); + // Reshape batch according to the batch_size. + top_shape_[0] = batch_size; + batch->data_->Reshape(top_shape_); + + Ftype* top_data = batch->data_->mutable_cpu_data(); + Ftype* top_label = NULL; // suppress warnings about uninitialized variables + if (this->output_labels_) { + top_label = batch->label_->mutable_cpu_data(); + } + + int skip_frames = skip_frames_; + for (int item_id = 0; item_id < batch_size; ++item_id) { + timer.Start(); + cv::Mat cv_img; + if (video_type_ == VideoDataParameter_VideoType_WEBCAM) { + cap_ >> cv_img; + } else if (video_type_ == VideoDataParameter_VideoType_VIDEO) { + if (processed_frames_ >= total_frames_) { + LOG(INFO) << "Finished processing video."; + raise(SIGINT); + } + ++processed_frames_; + cap_ >> cv_img; + } else { + LOG(FATAL) << "Unknown video type."; + } + CHECK(cv_img.data) << "Could not load image!"; + read_time += timer.MicroSeconds(); + if (skip_frames > 0) { + --skip_frames; + --item_id; + } else { + skip_frames = skip_frames_; + timer.Start(); + // Apply transformations (mirror, crop...) to the image + int offset = batch->data_->offset(item_id); + transformed_datum.set_cpu_data(top_data + offset); + this->bdt(0)->Transform(cv_img, &(transformed_datum)); + trans_time += timer.MicroSeconds(); + } + CHECK(cv_img.data) << "Could not load image!"; + read_time += timer.MicroSeconds(); + timer.Start(); + // Apply transformations (mirror, crop...) 
to the image + int offset = batch->data_->offset(item_id); + transformed_datum.set_cpu_data(top_data + offset); + this->bdt(0)->Transform(cv_img, &(transformed_datum)); + trans_time += timer.MicroSeconds(); + if (this->output_labels_) { + top_label[item_id] = 0; + } + } + timer.Stop(); + batch_timer.Stop(); + DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; + DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; + DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; +} + +INSTANTIATE_CLASS_FB(VideoDataLayer); +REGISTER_LAYER_CLASS_R(VideoData); + +} // namespace caffe diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index d99b89b0fdc..8c209cede89 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -24,13 +24,15 @@ constexpr int Net::END_OF_TRAIN; Net::Net(const NetParameter& param, size_t solver_rank, Flag* solver_init_flag, - Flag* solver_iter0_flag, - const Net* root_net) + const Net* root_net, + bool inner_net, + int level, + const vector* stages) : root_net_(root_net), solver_(nullptr), solver_rank_(solver_rank), solver_init_flag_(solver_init_flag), - solver_iter0_flag_(solver_iter0_flag) { + inner_net_(inner_net) { Init(param); } @@ -38,16 +40,25 @@ Net::Net(const string& param_file, Phase phase, size_t solver_rank, Flag* solver_init_flag, - Flag* solver_iter0_flag, - const Net* root_net) + const Net* root_net, + bool inner_net, + int level, + const vector* stages) : root_net_(root_net), solver_(nullptr), solver_rank_(solver_rank), solver_init_flag_(solver_init_flag), - solver_iter0_flag_(solver_iter0_flag) { + inner_net_(inner_net) { NetParameter param; ReadNetParamsFromTextFileOrDie(param_file, ¶m); + // Set phase, stages and level param.mutable_state()->set_phase(phase); + if (stages != NULL) { + for (int i = 0; i < stages->size(); ++i) { + param.mutable_state()->add_stage(stages->at(i)); + } + } + param.mutable_state()->set_level(level); Init(param); } @@ -55,7 +66,7 @@ Net::~Net() { } void Net::Init(const 
NetParameter& in_param) { - CHECK(Caffe::root_solver() || root_net_) + CHECK(inner_net_ || Caffe::root_solver() || root_net_) << "root_net_ needs to be set for all non-root solvers"; // Set phase from the state. phase_ = in_param.state().phase(); @@ -106,12 +117,13 @@ void Net::Init(const NetParameter& in_param) { wgrad_sq_.store(0LL); global_grad_scale_coeff_ = 1.F; + has_global_grad_scale_param_ = in_param.has_global_grad_scale(); global_grad_scale_param_ = in_param.global_grad_scale(); global_grad_scale_adaptive_ = in_param.global_grad_scale_adaptive(); for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) { // For non-root solvers, whether this layer is shared from root_net_. - bool share_from_root = !Caffe::root_solver() + bool share_from_root = !inner_net_ && !Caffe::root_solver() && root_net_->layers_[layer_id]->ShareInParallel(); const LayerParameter& layer_param = param.layer(layer_id); @@ -223,7 +235,6 @@ void Net::Init(const NetParameter& in_param) { layer->bm_by_user(bm_by_user); layers_[layer_id]->set_net_initialized_flag(solver_init_flag_); - layers_[layer_id]->set_net_iteration0_flag(solver_iter0_flag_); Flag* layer_inititialized_flag = layers_[layer_id]->layer_inititialized_flag(); if (layer_inititialized_flag != nullptr) { @@ -242,6 +253,7 @@ void Net::Init(const NetParameter& in_param) { << layer_param.name(); } } else { + layers_[layer_id]->set_parent_net(this); layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]); } LOG_IF(INFO, Caffe::root_solver()) @@ -1318,7 +1330,7 @@ void Net::ShareWeights() { gpu_prm_memory_diff_use_ += params_[i]->gpu_memory_diff_use(); continue; } - DLOG(INFO) << "param " << i << " has owner " << param_owners_[i]; +// DLOG(INFO) << "param " << i << " has owner " << param_owners_[i]; params_[i]->ShareData(*params_[param_owners_[i]]); params_[i]->ShareDiff(*params_[param_owners_[i]]); gpu_shp_memory_data_use_ += params_[i]->gpu_memory_data_use(); diff --git a/src/caffe/parallel.cpp 
b/src/caffe/parallel.cpp index 9dbbfbadb73..8ecc8087adf 100644 --- a/src/caffe/parallel.cpp +++ b/src/caffe/parallel.cpp @@ -96,6 +96,7 @@ P2PSync::P2PSync(P2PManager* mgr, shared_ptr root_solver, #ifndef USE_NCCL LOG(FATAL) << "USE_NCCL := 1 must be specified for multi-GPU"; #endif + CHECK_EQ(target_device_, solver_param_.device_id()); LOG(INFO) << "[" << rank << " - " << this->target_device_ << "] P2pSync adding callback"; } @@ -117,6 +118,7 @@ void P2PSync::InternalThreadEntry() { solver_->set_callback(this); CHECK_EQ(nranks_, Caffe::solver_count()); + CHECK_EQ(target_device_, Caffe::current_device()); #ifdef USE_NCCL ncclUniqueId* nccl_id = reinterpret_cast(this->aux_); @@ -160,14 +162,17 @@ void P2PSync::on_start(const vector>& net) { CHECK_EQ(count, nranks_); for (int i = 0; i < net.size(); ++i) { Blob* param = net[i].get(); + const Type param_type = param->data_type(); + const int type_id = solver_->net()->learnable_types()[0] == param_type ? 0 : 1; + reduce_barrier(type_id); NCCL_CHECK(ncclBcast(param->current_mutable_data_memory(true), even(param->count()), - nccl::nccl_type(param->data_type()), + nccl::nccl_type(param_type), 0, nccl_comm_, - comm_stream(0))); - CUDA_CHECK(cudaStreamSynchronize(comm_stream(0))); - reduce_barrier(0); + comm_stream(type_id))); + CUDA_CHECK(cudaStreamSynchronize(comm_stream(type_id))); + reduce_barrier(type_id); } #endif // USE_NCCL } diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 182021850cc..60384239525 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -9,6 +9,7 @@ enum Type { FLOAT16 = 2; INT = 3; // math not supported UINT = 4; // math not supported + BOOL = 5; //math not supported } enum Packing { @@ -60,6 +61,127 @@ message Datum { optional uint32 record_id = 8 [default = 0]; } +// The label (display) name and label id. +message LabelMapItem { + // Both name and label are required. 
+ optional string name = 1; + optional int32 label = 2; + // display_name is optional. + optional string display_name = 3; +} + +message LabelMap { + repeated LabelMapItem item = 1; +} + +// Sample a bbox in the normalized space [0, 1] with provided constraints. +message Sampler { + // Minimum scale of the sampled bbox. + optional float min_scale = 1 [default = 1.]; + // Maximum scale of the sampled bbox. + optional float max_scale = 2 [default = 1.]; + + // Minimum aspect ratio of the sampled bbox. + optional float min_aspect_ratio = 3 [default = 1.]; + // Maximum aspect ratio of the sampled bbox. + optional float max_aspect_ratio = 4 [default = 1.]; +} + +// Constraints for selecting sampled bbox. +message SampleConstraint { + // Minimum Jaccard overlap between sampled bbox and all bboxes in + // AnnotationGroup. + optional float min_jaccard_overlap = 1; + // Maximum Jaccard overlap between sampled bbox and all bboxes in + // AnnotationGroup. + optional float max_jaccard_overlap = 2; + + // Minimum coverage of sampled bbox by all bboxes in AnnotationGroup. + optional float min_sample_coverage = 3; + // Maximum coverage of sampled bbox by all bboxes in AnnotationGroup. + optional float max_sample_coverage = 4; + + // Minimum coverage of all bboxes in AnnotationGroup by sampled bbox. + optional float min_object_coverage = 5; + // Maximum coverage of all bboxes in AnnotationGroup by sampled bbox. + optional float max_object_coverage = 6; +} + +// Sample a batch of bboxes with provided constraints. +message BatchSampler { + // Use original image as the source for sampling. + optional bool use_original_image = 1 [default = true]; + + // Constraints for sampling bbox. + optional Sampler sampler = 2; + + // Constraints for determining if a sampled bbox is positive or negative. + optional SampleConstraint sample_constraint = 3; + + // If provided, break when found certain number of samples satisfing the + // sample_constraint. 
+ optional uint32 max_sample = 4; + + // Maximum number of trials for sampling to avoid infinite loop. + optional uint32 max_trials = 5 [default = 100]; +} + +// Condition for emitting annotations. +message EmitConstraint { + enum EmitType { + CENTER = 0; + MIN_OVERLAP = 1; + } + optional EmitType emit_type = 1 [default = CENTER]; + // If emit_type is MIN_OVERLAP, provide the emit_overlap. + optional float emit_overlap = 2; +} + +// The normalized bounding box [0, 1] w.r.t. the input image size. +message NormalizedBBox { + optional float xmin = 1; + optional float ymin = 2; + optional float xmax = 3; + optional float ymax = 4; + optional int32 label = 5; + optional bool difficult = 6; + optional float score = 7; + optional float size = 8; +} + +// Annotation for each object instance. +message Annotation { + optional int32 instance_id = 1 [default = 0]; + optional NormalizedBBox bbox = 2; +} + +// Group of annotations for a particular label. +message AnnotationGroup { + optional int32 group_label = 1; + repeated Annotation annotation = 2; +} + +// An extension of Datum which contains "rich" annotations. +message AnnotatedDatum { + enum AnnotationType { + BBOX = 0; + } + optional Datum datum = 1; + // If there are "rich" annotations, specify the type of annotation. + // Currently it only supports bounding box. + // If there are no "rich" annotations, use label in datum instead. + optional AnnotationType type = 2; + // Each group contains annotation for a particular class. + repeated AnnotationGroup annotation_group = 3; + // Unique record index assigned by Reader + optional uint32 record_id = 4 [default = 0]; +} + +enum DatumTypeInfo { + DatumTypeInfo_DATUM = 0; + DatumTypeInfo_ANNOTATED_DATUM = 1; +} + // Caffe 2 datasets support message C2TensorProto { // The dimensions in the tensor. @@ -239,6 +361,17 @@ message SolverParameter { optional NetState train_state = 26; repeated NetState test_state = 27; + // Evaluation type. 
+ optional string eval_type = 241 [default = "classification"]; + // ap_version: different ways of computing Average Precision. + // Check https://sanchom.wordpress.com/tag/average-precision/ for details. + // 11point: the 11-point interpolated average precision. Used in VOC2007. + // MaxIntegral: maximally interpolated AP. Used in VOC2012/ILSVRC. + // Integral: the natural integral of the precision-recall curve. + optional string ap_version = 242 [default = "Integral"]; + // If true, display per class result. + optional bool show_per_class_result = 244 [default = false]; + // The number of iterations for each test net. repeated int32 test_iter = 3; @@ -276,6 +409,8 @@ message SolverParameter { // zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power) // - sigmoid: the effective learning rate follows a sigmod decay // return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) + // - plateau: decreases lr + // if the minimum loss isn't updated for 'plateau_winsize' iters // // where base_lr, max_iter, gamma, step, stepvalue and power are defined // in the solver parameter protocol buffer, and iter is the current iteration. @@ -304,6 +439,8 @@ message SolverParameter { optional int32 stepsize = 13; // the stepsize for learning rate policy "multistep" repeated int32 stepvalue = 34; + // the stepsize for learning rate policy "plateau" + repeated int32 plateau_winsize = 243; // Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm, // whenever their actual L2 norm is larger. @@ -380,6 +517,8 @@ message SolverState { optional string learned_net = 2; // The file that stores the learned net. 
repeated BlobProto history = 3; // The history for sgd solvers optional int32 current_step = 4 [default = 0]; // The current step for learning rate + optional float minimum_loss = 5 [default = 1E38]; // Historical minimum loss + optional int32 iter_last_event = 6 [default = 0]; // The iteration when last lr-update or min_loss-update happend } enum Phase { @@ -439,7 +578,7 @@ message ParamSpec { // NOTE // Update the next available ID when you add a new LayerParameter field. // -// LayerParameter next available layer-specific ID: 151 (last added: cudnn_math_override) +// LayerParameter next available layer-specific ID: 152 (last added: recurrent_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -512,6 +651,7 @@ message LayerParameter { // engine parameter for selecting the implementation. // The default for the engine is set by the ENGINE switch at compile-time. optional AccuracyParameter accuracy_param = 102; + optional AnnotatedDataParameter annotated_data_param = 200; optional ArgMaxParameter argmax_param = 103; optional BatchNormParameter batch_norm_param = 139; optional BiasParameter bias_param = 141; @@ -520,6 +660,8 @@ message LayerParameter { optional ConvolutionParameter convolution_param = 106; optional CropParameter crop_param = 144; optional DataParameter data_param = 107; + optional DetectionEvaluateParameter detection_evaluate_param = 205; + optional DetectionOutputParameter detection_output_param = 204; optional DropoutParameter dropout_param = 108; optional DummyDataParameter dummy_data_param = 109; optional EltwiseParameter eltwise_param = 110; @@ -537,10 +679,14 @@ message LayerParameter { optional LogParameter log_param = 134; optional LRNParameter lrn_param = 118; optional MemoryDataParameter memory_data_param = 119; + optional MultiBoxLossParameter multibox_loss_param = 201; optional MVNParameter mvn_param = 120; + optional NormalizeParameter norm_param = 206; + optional 
PermuteParameter permute_param = 202; optional PoolingParameter pooling_param = 121; optional PowerParameter power_param = 122; optional PReLUParameter prelu_param = 131; + optional PriorBoxParameter prior_box_param = 203; optional PythonParameter python_param = 130; optional ReductionParameter reduction_param = 136; optional ReLUParameter relu_param = 123; @@ -553,7 +699,9 @@ message LayerParameter { optional TanHParameter tanh_param = 127; optional ThresholdParameter threshold_param = 128; optional TileParameter tile_param = 138; + optional VideoDataParameter video_data_param = 207; optional WindowDataParameter window_data_param = 129; + optional RecurrentParameter recurrent_param = 151; // NVIDIA PARAMETERS (Start with 68 because NV is 68 on an old-style phone) optional DetectNetGroundTruthParameter detectnet_groundtruth_param = 6801; @@ -638,6 +786,9 @@ message TransformationParameter { optional bool mirror = 2 [default = false]; // Specify if we would like to randomly crop an image. optional uint32 crop_size = 3 [default = 0]; + optional uint32 crop_h = 211 [default = 0]; + optional uint32 crop_w = 212 [default = 0]; + // mean_file and mean_value cannot be specified at the same time optional string mean_file = 4; // if specified can be repeated once (would substract it from all the channels) @@ -656,6 +807,20 @@ message TransformationParameter { // random number generator would be initialized -- useful for reproducible results. // Otherwise, (and by default) initialize using a seed derived from the system clock. 
optional int64 random_seed = 9 [default = -1]; + + optional bool display = 22 [default = false]; + optional int32 num_labels = 23 [default = 0]; + + // Resize policy + optional ResizeParameter resize_param = 208; + // Noise policy + optional NoiseParameter noise_param = 209; + // Distortion policy + optional DistortionParameter distort_param = 213; + // Expand policy + optional ExpansionParameter expand_param = 214; + // Constraint for emitting the annotation after transformation. + optional EmitConstraint emit_constraint = 210; } // Message that stores parameters used to create gridbox ground truth message DetectNetGroundTruthParameter { @@ -729,6 +894,142 @@ message DetectNetAugmentationParameter { // desaturation augmentation off. optional float desaturation_prob = 12 [default = 0.33]; optional float desaturation_max = 13 [default = 0.5]; + + // Resize policy + optional ResizeParameter resize_param = 208; + // Noise policy + optional NoiseParameter noise_param = 209; + // Distortion policy + optional DistortionParameter distort_param = 213; + // Expand policy + optional ExpansionParameter expand_param = 214; + // Constraint for emitting the annotation after transformation. + optional EmitConstraint emit_constraint = 210; +} + +// Message that stores parameters used by data transformer for resize policy +message ResizeParameter { + //Probability of using this resize policy + optional float prob = 1 [default = 1]; + + enum Resize_mode { + WARP = 1; + FIT_SMALL_SIZE = 2; + FIT_LARGE_SIZE_AND_PAD = 3; + } + optional Resize_mode resize_mode = 2 [default = WARP]; + optional uint32 height = 3 [default = 0]; + optional uint32 width = 4 [default = 0]; + // A parameter used to update bbox in FIT_SMALL_SIZE mode. 
+ optional uint32 height_scale = 8 [default = 0]; + optional uint32 width_scale = 9 [default = 0]; + + enum Pad_mode { + CONSTANT = 1; + MIRRORED = 2; + REPEAT_NEAREST = 3; + } + // Padding mode for BE_SMALL_SIZE_AND_PAD mode and object centering + optional Pad_mode pad_mode = 5 [default = CONSTANT]; + // if specified can be repeated once (would fill all the channels) + // or can be repeated the same number of times as channels + // (would use it them to the corresponding channel) + repeated float pad_value = 6; + + enum Interp_mode { //Same as in OpenCV + LINEAR = 1; + AREA = 2; + NEAREST = 3; + CUBIC = 4; + LANCZOS4 = 5; + } + //interpolation for for resizing + repeated Interp_mode interp_mode = 7; +} + +message SaltPepperParameter { + //Percentage of pixels + optional float fraction = 1 [default = 0]; + repeated float value = 2; +} + +// Message that stores parameters used by data transformer for transformation +// policy +message NoiseParameter { + //Probability of using this resize policy + optional float prob = 1 [default = 0]; + // Histogram equalized + optional bool hist_eq = 2 [default = false]; + // Color inversion + optional bool inverse = 3 [default = false]; + // Grayscale + optional bool decolorize = 4 [default = false]; + // Gaussian blur + optional bool gauss_blur = 5 [default = false]; + + // JPEG compression quality (-1 = no compression) + optional float jpeg = 6 [default = -1]; + + // Posterization + optional bool posterize = 7 [default = false]; + + // Erosion + optional bool erode = 8 [default = false]; + + // Salt-and-pepper noise + optional bool saltpepper = 9 [default = false]; + + optional SaltPepperParameter saltpepper_param = 10; + + // Local histogram equalization + optional bool clahe = 11 [default = false]; + + // Color space conversion + optional bool convert_to_hsv = 12 [default = false]; + + // Color space conversion + optional bool convert_to_lab = 13 [default = false]; +} + +// Message that stores parameters used by data 
transformer for distortion policy +message DistortionParameter { + // The probability of adjusting brightness. + optional float brightness_prob = 1 [default = 0.0]; + // Amount to add to the pixel values within [-delta, delta]. + // The possible value is within [0, 255]. Recommend 32. + optional float brightness_delta = 2 [default = 0.0]; + + // The probability of adjusting contrast. + optional float contrast_prob = 3 [default = 0.0]; + // Lower bound for random contrast factor. Recommend 0.5. + optional float contrast_lower = 4 [default = 0.0]; + // Upper bound for random contrast factor. Recommend 1.5. + optional float contrast_upper = 5 [default = 0.0]; + + // The probability of adjusting hue. + optional float hue_prob = 6 [default = 0.0]; + // Amount to add to the hue channel within [-delta, delta]. + // The possible value is within [0, 180]. Recommend 36. + optional float hue_delta = 7 [default = 0.0]; + + // The probability of adjusting saturation. + optional float saturation_prob = 8 [default = 0.0]; + // Lower bound for the random saturation factor. Recommend 0.5. + optional float saturation_lower = 9 [default = 0.0]; + // Upper bound for the random saturation factor. Recommend 1.5. + optional float saturation_upper = 10 [default = 0.0]; + + // The probability of randomly order the image channels. + optional float random_order_prob = 11 [default = 0.0]; +} + +// Message that stores parameters used by data transformer for expansion policy +message ExpansionParameter { + //Probability of using this expansion policy + optional float prob = 1 [default = 1]; + + // The ratio to expand the image. + optional float max_expand_ratio = 2 [default = 1.]; } // Message that stores parameters shared by loss layers @@ -778,6 +1079,16 @@ message AccuracyParameter { optional int32 ignore_label = 3; } +message AnnotatedDataParameter { + // Define the sampler. + repeated BatchSampler batch_sampler = 1; + // Store label name and label id in LabelMap format. 
+ optional string label_map_file = 2; + // If provided, it will replace the AnnotationType stored in each + // AnnotatedDatum. + optional AnnotatedDatum.AnnotationType anno_type = 3; +} + message ArgMaxParameter { // If true produce pairs (argmax, maxval) optional bool out_max_val = 1 [default = false]; @@ -996,6 +1307,95 @@ message DataParameter { optional bool shuffle = 14 [default = false]; } +// Message that store parameters used by DetectionEvaluateLayer +message DetectionEvaluateParameter { + // Number of classes that are actually predicted. Required! + optional uint32 num_classes = 1; + // Label id for background class. Needed for sanity check so that + // background class is neither in the ground truth nor the detections. + optional uint32 background_label_id = 2 [default = 0]; + // Threshold for deciding true/false positive. + optional float overlap_threshold = 3 [default = 0.5]; + // If true, also consider difficult ground truth for evaluation. + optional bool evaluate_difficult_gt = 4 [default = true]; + // A file which contains a list of names and sizes with same order + // of the input DB. The file is in the following format: + // name height width + // ... + // If provided, we will scale the prediction and ground truth NormalizedBBox + // for evaluation. + optional string name_size_file = 5; + // The resize parameter used in converting NormalizedBBox to original image. + optional ResizeParameter resize_param = 6; +} + +message NonMaximumSuppressionParameter { + // Threshold to be used in nms. + optional float nms_threshold = 1 [default = 0.3]; + // Maximum number of results to be kept. + optional int32 top_k = 2; + // Parameter for adaptive nms. + optional float eta = 3 [default = 1.0]; +} + +message SaveOutputParameter { + // Output directory. If not empty, we will save the results. + optional string output_directory = 1; + // Output name prefix. + optional string output_name_prefix = 2; + // Output format. + // VOC - PASCAL VOC output format. 
+ // COCO - MS COCO output format. + optional string output_format = 3; + // If you want to output results, must also provide the following two files. + // Otherwise, we will ignore saving results. + // label map file. + optional string label_map_file = 4; + // A file which contains a list of names and sizes with same order + // of the input DB. The file is in the following format: + // name height width + // ... + optional string name_size_file = 5; + // Number of test images. It can be less than the lines specified in + // name_size_file. For example, when we only want to evaluate on part + // of the test images. + optional uint32 num_test_image = 6; + // The resize parameter used in saving the data. + optional ResizeParameter resize_param = 7; +} + +// Message that store parameters used by DetectionOutputLayer +message DetectionOutputParameter { + // Number of classes to be predicted. Required! + optional uint32 num_classes = 1; + // If true, bounding box are shared among different classes. + optional bool share_location = 2 [default = true]; + // Background label id. If there is no background class, + // set it as -1. + optional int32 background_label_id = 3 [default = 0]; + // Parameters used for non maximum suppression. + optional NonMaximumSuppressionParameter nms_param = 4; + // Parameters used for saving detection results. + optional SaveOutputParameter save_output_param = 5; + // Type of coding method for bbox. + optional PriorBoxParameter.CodeType code_type = 6 [default = CORNER]; + // If true, variance is encoded in target; otherwise we need to adjust the + // predicted offset accordingly. + optional bool variance_encoded_in_target = 8 [default = false]; + // Number of total bboxes to be kept per image after nms step. + // -1 means keeping all bboxes after nms step. + optional int32 keep_top_k = 7 [default = -1]; + // Only consider detections whose confidences are larger than a threshold. + // If not provided, consider all boxes. 
+ optional float confidence_threshold = 9; + // If true, visualize the detection results. + optional bool visualize = 10 [default = false]; + // The threshold used to visualize the detection results. + optional float visualize_threshold = 11; + // If provided, save outputs to video file. + optional string save_file = 12; +} + message DropoutParameter { optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio enum Engine { @@ -1217,6 +1617,78 @@ message MemoryDataParameter { optional uint32 width = 4; } +// Message that store parameters used by MultiBoxLossLayer +message MultiBoxLossParameter { + // Localization loss type. + enum LocLossType { + L2 = 0; + SMOOTH_L1 = 1; + } + optional LocLossType loc_loss_type = 1 [default = SMOOTH_L1]; + // Confidence loss type. + enum ConfLossType { + SOFTMAX = 0; + LOGISTIC = 1; + } + optional ConfLossType conf_loss_type = 2 [default = SOFTMAX]; + // Weight for localization loss. + optional float loc_weight = 3 [default = 1.0]; + // Number of classes to be predicted. Required! + optional uint32 num_classes = 4; + // If true, bounding box are shared among different classes. + optional bool share_location = 5 [default = true]; + // Matching method during training. + enum MatchType { + BIPARTITE = 0; + PER_PREDICTION = 1; + } + optional MatchType match_type = 6 [default = PER_PREDICTION]; + // If match_type is PER_PREDICTION, use overlap_threshold to + // determine the extra matching bboxes. + optional float overlap_threshold = 7 [default = 0.5]; + // Use prior for matching. + optional bool use_prior_for_matching = 8 [default = true]; + // Background label id. + optional uint32 background_label_id = 9 [default = 0]; + // If true, also consider difficult ground truth. + optional bool use_difficult_gt = 10 [default = true]; + // If true, perform negative mining. + // DEPRECATED: use mining_type instead. + optional bool do_neg_mining = 11; + // The negative/positive ratio. 
+ optional float neg_pos_ratio = 12 [default = 3.0]; + // The negative overlap upperbound for the unmatched predictions. + optional float neg_overlap = 13 [default = 0.5]; + // Type of coding method for bbox. + optional PriorBoxParameter.CodeType code_type = 14 [default = CORNER]; + // If true, encode the variance of prior box in the loc loss target instead of + // in bbox. + optional bool encode_variance_in_target = 16 [default = false]; + // If true, map all object classes to agnostic class. It is useful for learning + // objectness detector. + optional bool map_object_to_agnostic = 17 [default = false]; + // If true, ignore cross boundary bbox during matching. + // Cross boundary bbox is a bbox who is outside of the image region. + optional bool ignore_cross_boundary_bbox = 18 [default = false]; + // If true, only backpropagate on corners which are inside of the image + // region when encode_type is CORNER or CORNER_SIZE. + optional bool bp_inside = 19 [default = false]; + // Mining type during training. + // NONE : use all negatives. + // MAX_NEGATIVE : select negatives based on the score. + // HARD_EXAMPLE : select hard examples based on "Training Region-based Object Detectors with Online Hard Example Mining", Shrivastava et.al. + enum MiningType { + NONE = 0; + MAX_NEGATIVE = 1; + HARD_EXAMPLE = 2; + } + optional MiningType mining_type = 20 [default = MAX_NEGATIVE]; + // Parameters used for non maximum suppression durig hard example mining. 
+ optional NonMaximumSuppressionParameter nms_param = 21; + optional int32 sample_size = 22 [default = 64]; + optional bool use_prior_for_nms = 23 [default = false]; +} + message MVNParameter { // This parameter can be set to false to normalize mean only optional bool normalize_variance = 1 [default = true]; @@ -1228,6 +1700,24 @@ message MVNParameter { optional float eps = 3 [default = 1e-9]; } +// Message that stores parameters used by NormalizeLayer +message NormalizeParameter { + optional bool across_spatial = 1 [default = true]; + // Initial value of scale. Default is 1.0 for all + optional FillerParameter scale_filler = 2; + // Whether or not scale parameters are shared across channels. + optional bool channel_shared = 3 [default = true]; + // Epsilon for not dividing by zero while normalizing variance + optional float eps = 4 [default = 1e-10]; +} + +message PermuteParameter { + // The new orders of the axes of data. Notice it should be with + // in the same range as the input data, and it starts from 0. + // Do not provide repeated order. + repeated uint32 order = 1; +} + message PoolingParameter { enum PoolMethod { MAX = 0; @@ -1269,6 +1759,48 @@ message PowerParameter { optional float shift = 3 [default = 0.0]; } +// Message that store parameters used by PriorBoxLayer +message PriorBoxParameter { + // Encode/decode type. + enum CodeType { + CORNER = 1; + CENTER_SIZE = 2; + CORNER_SIZE = 3; + } + // Minimum box size (in pixels). Required! + repeated float min_size = 1; + // Maximum box size (in pixels). Required! + repeated float max_size = 2; + // Various of aspect ratios. Duplicate ratios will be ignored. + // If none is provided, we use default ratio 1. + repeated float aspect_ratio = 3; + // If true, will flip each aspect ratio. + // For example, if there is aspect ratio "r", + // we will generate aspect ratio "1.0/r" as well. 
+ optional bool flip = 4 [default = true]; + // If true, will clip the prior so that it is within [0, 1] + optional bool clip = 5 [default = false]; + // Variance for adjusting the prior bboxes. + repeated float variance = 6; + // By default, we calculate img_height, img_width, step_x, step_y based on + // bottom[0] (feat) and bottom[1] (img). Unless these values are explicitely + // provided. + // Explicitly provide the img_size. + optional uint32 img_size = 7; + // Either img_size or img_h/img_w should be specified; not both. + optional uint32 img_h = 8; + optional uint32 img_w = 9; + + // Explicitly provide the step size. + optional float step = 10; + // Either step or step_h/step_w should be specified; not both. + optional float step_h = 11; + optional float step_w = 12; + + // Offset to the top left corner of each cell. + optional float offset = 13 [default = 0.5]; +} + message PythonParameter { optional string module = 1; optional string layer = 2; @@ -1283,6 +1815,25 @@ message PythonParameter { optional bool share_in_parallel = 4 [default = false]; } +// Message that stores parameters used by RecurrentLayer +message RecurrentParameter { + // The dimension of the output (and usually hidden state) representation -- + // must be explicitly set to non-zero. + optional uint32 num_output = 1 [default = 0]; + + optional FillerParameter weight_filler = 2; // The filler for the weight + optional FillerParameter bias_filler = 3; // The filler for the bias + + // Whether to enable displaying debug_info in the unrolled recurrent net. + optional bool debug_info = 4 [default = false]; + + // Whether to add as additional inputs (bottoms) the initial hidden state + // blobs, and add as additional outputs (tops) the final timestep hidden state + // blobs. The number of additional bottom/top blobs required depends on the + // recurrent architecture -- e.g., 1 for RNNs, 2 for LSTMs. 
+ optional bool expose_hidden = 5 [default = false]; +} + // Message that stores parameters used by ReductionLayer message ReductionParameter { enum ReductionOp { @@ -1487,6 +2038,18 @@ message ThresholdParameter { optional float threshold = 1 [default = 0]; // Strictly positive values } +message VideoDataParameter{ + enum VideoType { + WEBCAM = 0; + VIDEO = 1; + } + optional VideoType video_type = 1 [default = WEBCAM]; + optional int32 device_id = 2 [default = 0]; + optional string video_file = 3; + // Number of frames to be skipped before processing a frame. + optional uint32 skip_frames = 4 [default = 0]; +} + message WindowDataParameter { // Specify the data source. optional string source = 1; @@ -1729,6 +2292,6 @@ message PReLUParameter { // Initial value of a_i. Default is a_i=0.25 for all i. optional FillerParameter filler = 1; - // Whether or not slope paramters are shared across channels. + // Whether or not slope parameters are shared across channels. optional bool channel_shared = 2 [default = false]; } diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 08c0ab5213c..126b5d3cd4e 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -5,11 +5,9 @@ #include #include "caffe/solver.hpp" -#include "caffe/util/format.hpp" -#include "caffe/util/gpu_memory.hpp" #include "caffe/util/hdf5.hpp" -#include "caffe/util/io.hpp" #include "caffe/util/upgrade_proto.hpp" +#include "caffe/util/bbox_util.hpp" namespace caffe { @@ -97,9 +95,9 @@ void Solver::InitTrainNet() { net_state.MergeFrom(param_.train_state()); net_param.mutable_state()->CopyFrom(net_state); if (Caffe::root_solver()) { - net_.reset(new Net(net_param, rank_, &init_flag_, &iter0_flag_)); + net_.reset(new Net(net_param, rank_, &init_flag_)); } else { - net_.reset(new Net(net_param, rank_, &init_flag_, &iter0_flag_, + net_.reset(new Net(net_param, rank_, &init_flag_, root_solver_->net_.get())); } } @@ -175,9 +173,9 @@ void Solver::InitTestNets() { LOG(INFO) << "Creating test net (#" << i 
<< ") specified by " << sources[i]; if (Caffe::root_solver()) { - test_nets_[i].reset(new Net(net_params[i], rank_, &init_flag_, &iter0_flag_)); + test_nets_[i].reset(new Net(net_params[i], rank_, &init_flag_)); } else { - test_nets_[i].reset(new Net(net_params[i], rank_, &init_flag_, &iter0_flag_, + test_nets_[i].reset(new Net(net_params[i], rank_, &init_flag_, root_solver_->test_nets_[i].get())); } test_nets_[i]->set_debug_info(param_.debug_info()); @@ -197,7 +195,6 @@ void Solver::Step(int iters) { net_->set_solver(this); if (iters <= 0) { - iter0_flag_.set(); init_flag_.set(); return; } @@ -209,6 +206,9 @@ void Solver::Step(int iters) { net_->InitializeLearnableDiffSpace(type_id); } } + for (auto b : net_->learnable_params_mapped()) { + b->current_mutable_data_memory(true); + } if (solver_count > 1) { // we need to sync all threads before starting, otherwise some cuda init, @@ -264,9 +264,13 @@ void Solver::Step(int iters) { // Just started or restored? const bool first_loop = iter_ == 0 || iterations_last_ < 0; if (iter_ == 0) { + LOG_IF(INFO, Caffe::root_solver()) << mgpu_str << "Initial Test started..."; + iteration_timer_->Start(); scores = TestAll(1, use_multi_gpu_testing); callback_soft_barrier(); - LOG_IF(INFO, Caffe::root_solver()) << mgpu_str << "Initial Test completed"; + float lapse = iteration_timer_->Seconds(); + LOG_IF(INFO, Caffe::root_solver()) << mgpu_str << "Initial Test completed in " + << lapse << "s"; } else if (test_and_snapshot || (param_.test_interval() && iter_ % param_.test_interval() == 0 && iterations_last_ >= 0)) { @@ -302,7 +306,6 @@ void Solver::Step(int iters) { loss += net_->ForwardBackward(i + 1 == param_.iter_size()); if (i == 0) { if (first_loop) { - iter0_flag_.set(); net_->wait_layers_init(); } } @@ -416,7 +419,7 @@ void Solver::Reduce(Callback* callback, int device, Caffe::Brew mode, uint64_t r if (mode == Caffe::GPU) { CUDA_CHECK(cudaSetDevice(device)); #ifndef NO_NVML - nvml::setCpuAffinity(); + 
nvml::setCpuAffinity(device); #endif } Caffe::set_mode(mode); @@ -492,7 +495,12 @@ vector Solver::TestAll(const int iters, bool use_multi_gpu) { for (int test_net_id = 0; test_net_id < test_nets_.size() && !requested_early_exit_; ++test_net_id) { - vector scores = Test(test_net_id, iters, use_multi_gpu); + vector scores; + if (param_.eval_type() == "detection") { + scores = TestDetection(test_net_id); + } else { + scores = Test(test_net_id, iters, use_multi_gpu); + } if (scores.size() == 0UL) { return scores; } @@ -600,6 +608,132 @@ vector Solver::Test(const int test_net_id, const int iters, bool use_mult return scores; } +vector Solver::TestDetection(const int test_net_id) { + typedef float Dtype; + LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_ + << ", Testing net (#" << test_net_id << ")"; + if (!test_nets_[test_net_id]->trained_layers_shared()) { + CHECK_NOTNULL(test_nets_[test_net_id].get())->ShareTrainedLayersWith(net_.get()); + } + vector scores; + map > > > all_true_pos; + map > > > all_false_pos; + map > all_num_pos; + const shared_ptr& test_net = test_nets_[test_net_id]; + Dtype loss = 0; + for (int i = 0; i < param_.test_iter(test_net_id); ++i) { + SolverAction::Enum request = GetRequestedAction(); + // Check to see if stoppage of testing/training has been requested. + while (request != SolverAction::NONE) { + if (SolverAction::SNAPSHOT == request) { + Snapshot(); + } else if (SolverAction::STOP == request) { + requested_early_exit_ = true; + } + request = GetRequestedAction(); + } + if (requested_early_exit_) { + // break out of test loop. 
+ break; + } + + Dtype iter_loss; + const vector& result = test_net->Forward(&iter_loss); + if (param_.test_compute_loss()) { + loss += iter_loss; + } + for (int j = 0; j < result.size(); ++j) { + CHECK_EQ(result[j]->width(), 5); + const Dtype* result_vec = result[j]->cpu_data(); + int num_det = result[j]->height(); + for (int k = 0; k < num_det; ++k) { + int item_id = static_cast(result_vec[k * 5]); + int label = static_cast(result_vec[k * 5 + 1]); + if (item_id == -1) { + // Special row of storing number of positives for a label. + if (all_num_pos[j].find(label) == all_num_pos[j].end()) { + all_num_pos[j][label] = static_cast(result_vec[k * 5 + 2]); + } else { + all_num_pos[j][label] += static_cast(result_vec[k * 5 + 2]); + } + } else { + // Normal row storing detection status. + float score = result_vec[k * 5 + 2]; + int tp = static_cast(result_vec[k * 5 + 3]); + int fp = static_cast(result_vec[k * 5 + 4]); + if (tp == 0 && fp == 0) { + // Ignore such case. It happens when a detection bbox is matched to + // a difficult gt bbox and we don't evaluate on difficult gt bbox. 
+ continue; + } + if (scores.size() < MAX_SNAPSHOT_SCORES) { + scores.push_back(score); + } + all_true_pos[j][label].push_back(std::make_pair(score, tp)); + all_false_pos[j][label].push_back(std::make_pair(score, fp)); + } + } + } + } + if (requested_early_exit_) { + LOG(INFO) << "Test interrupted."; + return scores; + } + if (param_.test_compute_loss()) { + loss /= param_.test_iter(test_net_id); + LOG(INFO) << "Test loss: " << loss; + } + for (int i = 0; i < all_true_pos.size(); ++i) { + if (all_true_pos.find(i) == all_true_pos.end()) { + LOG(FATAL) << "Missing output_blob true_pos: " << i; + } + const map > >& true_pos = + all_true_pos.find(i)->second; + if (all_false_pos.find(i) == all_false_pos.end()) { + LOG(FATAL) << "Missing output_blob false_pos: " << i; + } + const map > >& false_pos = + all_false_pos.find(i)->second; + if (all_num_pos.find(i) == all_num_pos.end()) { + LOG(FATAL) << "Missing output_blob num_pos: " << i; + } + const map& num_pos = all_num_pos.find(i)->second; + map APs; + float mAP = 0.; + // Sort true_pos and false_pos with descend scores. 
+ for (map::const_iterator it = num_pos.begin(); + it != num_pos.end(); ++it) { + int label = it->first; + int label_num_pos = it->second; + if (true_pos.find(label) == true_pos.end()) { + LOG(WARNING) << "Missing true_pos for label: " << label; + continue; + } + const vector >& label_true_pos = + true_pos.find(label)->second; + if (false_pos.find(label) == false_pos.end()) { + LOG(WARNING) << "Missing false_pos for label: " << label; + continue; + } + const vector >& label_false_pos = + false_pos.find(label)->second; + vector prec, rec; + ComputeAP(label_true_pos, label_num_pos, label_false_pos, + param_.ap_version(), &prec, &rec, &(APs[label])); + mAP += APs[label]; + if (param_.show_per_class_result()) { + LOG(INFO) << "class AP " << label << ": " << APs[label]; + } + } + mAP /= num_pos.size(); + const int output_blob_index = test_net->output_blob_indices()[i]; + const string& output_name = test_net->blob_names()[output_blob_index]; + LOG(INFO) << "Test net output mAP #" << i << ": " << output_name << " = " + << mAP; + } + return scores; +} + void Solver::SnapshotWithScores(const vector& scores) { CHECK(Caffe::root_solver()); string model_filename; diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 25fff98b0e4..082a45b0fb6 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -243,13 +243,13 @@ float SGDSolver::ComputeUpdateValue(int param_id, void* handle, float rat const string& larc_policy = this->param_.larc_policy(); float local_rate = GetLocalRate(param_id, wgrad_sq); if (larc) { - if (larc_policy == "scale") { - local_rate = rate * local_rate; - } else if (larc_policy == "clip") { - local_rate = std::min(rate, local_rate); - } else { - LOG(FATAL) << "Unknown larc policy: " << larc_policy; - } + if (larc_policy == "scale") { + local_rate = rate * local_rate; + } else if (larc_policy == "clip") { + local_rate = std::min(rate, local_rate); + } else { + LOG(FATAL) << "Unknown larc policy: " 
<< larc_policy; + } } else { local_rate = rate * local_rate; } diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index b7cd6f09f84..9390a442196 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -80,7 +80,8 @@ void SyncedMemory::to_gpu(bool copy_from_cpu, int group) { switch (head_) { case UNINITIALIZED: CUDA_CHECK(cudaGetDevice(&device_)); - GPUMemory::allocate(&gpu_ptr_, size_, device_, group); + pstream_ = Caffe::thread_pstream(group); + GPUMemory::allocate(&gpu_ptr_, size_, device_, pstream_); caffe_gpu_memset(size_, 0, gpu_ptr_, group); head_ = HEAD_AT_GPU; own_gpu_data_ = true; @@ -88,7 +89,8 @@ void SyncedMemory::to_gpu(bool copy_from_cpu, int group) { case HEAD_AT_CPU: if (gpu_ptr_ == NULL) { CUDA_CHECK(cudaGetDevice(&device_)); - GPUMemory::allocate(&gpu_ptr_, size_, device_, group); + pstream_ = Caffe::thread_pstream(group); + GPUMemory::allocate(&gpu_ptr_, size_, device_, pstream_); own_gpu_data_ = true; } if (copy_from_cpu) { diff --git a/src/caffe/tensor.cpp b/src/caffe/tensor.cpp index f79d315c0a4..e5dce74d12d 100644 --- a/src/caffe/tensor.cpp +++ b/src/caffe/tensor.cpp @@ -62,7 +62,7 @@ void Tensor::convert(Type new_type) { if (!new_mem || new_mem->size() != new_cap) { new_mem = make_shared(new_cap); } - const bool data_gpu = Caffe::mode() == Caffe::GPU; + const bool data_gpu = is_gpu_head(); if (current_mem->head() != SyncedMemory::UNINITIALIZED) { copy_helper(data_gpu, count_, data_gpu ? current_mem->gpu_data() : current_mem->cpu_data(), @@ -132,7 +132,7 @@ void Tensor::copy_helper(bool use_gpu, int count, const void* p_src, Type src_ty void Tensor::scale(float scale, void* handle) { shared_ptr& mem = mutable_synced_mem(); - if (Caffe::mode() == Caffe::GPU) { + if (is_gpu_head()) { cublasHandle_t cublas_handle = handle == nullptr ? 
Caffe::cublas_handle(0) : reinterpret_cast(handle); gpu_scal(count_, type_, mem->mutable_gpu_data(), scale, cublas_handle); @@ -143,7 +143,7 @@ void Tensor::scale(float scale, void* handle) { void Tensor::set(float value) { shared_ptr& mem = mutable_synced_mem(); - if (Caffe::mode() == Caffe::GPU) { + if (is_gpu_head()) { void* data = mem->mutable_gpu_data(); if (is_type(type_)) { caffe_gpu_set(count_, value, static_cast(data)); @@ -174,7 +174,7 @@ float Tensor::asum(int group) const { if (!mem || count_ <= 0) { return asum; } - if (Caffe::mode() == Caffe::GPU) { + if (is_gpu_head()) { if (is_type(type_)) { caffe_gpu_asum(count_, static_cast(mem->gpu_data()), &asum, group); } else if (is_type(type_)) { @@ -204,7 +204,7 @@ float Tensor::amax(int group) const { if (!mem || count_ <= 0) { return amax; } - if (Caffe::mode() == Caffe::GPU) { + if (is_gpu_head()) { if (is_type(type_)) { caffe_gpu_amax(count_, static_cast(mem->gpu_data()), &amax, group); } else if (is_type(type_)) { @@ -234,7 +234,7 @@ float Tensor::sumsq(int group) const { if (!mem || count_ <= 0) { return sumsq; } - if (Caffe::mode() == Caffe::GPU) { + if (is_gpu_head()) { if (is_type(type_)) { caffe_gpu_sumsq(count_, static_cast(mem->gpu_data()), &sumsq, group); } else if (is_type(type_)) { diff --git a/src/caffe/test/test_common.cpp b/src/caffe/test/test_common.cpp index c2cafe1ca44..9e07e411539 100644 --- a/src/caffe/test/test_common.cpp +++ b/src/caffe/test/test_common.cpp @@ -52,9 +52,11 @@ TEST_F(CommonTest, TestRandSeedGPU) { Caffe::set_random_seed(1701); CURAND_CHECK(curandGenerate(Caffe::curand_generator(), static_cast(data_a.mutable_gpu_data()), 10)); + CUDA_CHECK(cudaStreamSynchronize(Caffe::curand_stream())); Caffe::set_random_seed(1701); CURAND_CHECK(curandGenerate(Caffe::curand_generator(), static_cast(data_b.mutable_gpu_data()), 10)); + CUDA_CHECK(cudaStreamSynchronize(Caffe::curand_stream())); for (int i = 0; i < 10; ++i) { EXPECT_EQ(((const unsigned int*)(data_a.cpu_data()))[i], ((const 
unsigned int*)(data_b.cpu_data()))[i]); diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp index 905f1b57a5c..356cf83b37a 100644 --- a/src/caffe/test/test_convolution_layer.cpp +++ b/src/caffe/test/test_convolution_layer.cpp @@ -387,14 +387,14 @@ TYPED_TEST(ConvolutionLayerTest, TestSimple3DConvolution) { top_data = this->blob_top_->cpu_data(); ref_top_data = this->ref_blob_top_->cpu_data(); for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], tol(1e-3, 5e-2)); + EXPECT_NEAR(top_data[i], ref_top_data[i], tol(1e-3, 1e-1)); } caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(), this->MakeReferenceTop(this->blob_top_2_)); top_data = this->blob_top_2_->cpu_data(); ref_top_data = this->ref_blob_top_->cpu_data(); for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], tol(1e-3, 5e-2)); + EXPECT_NEAR(top_data[i], ref_top_data[i], tol(1e-3, 1e-1)); } } @@ -1108,7 +1108,7 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestGradientCuDNN) { convolution_param->mutable_weight_filler()->set_type("gaussian"); convolution_param->mutable_bias_filler()->set_type("gaussian"); CuDNNConvolutionLayer layer(layer_param); - GradientChecker checker(tol(5e-2, 1e-1), tol(1e-2, 5e-1)); + GradientChecker checker(tol(3e-2, 1e-1), tol(1e-2, 5e-1)); checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, this->blob_top_vec_); } diff --git a/src/caffe/test/test_data_transformer.cpp b/src/caffe/test/test_data_transformer.cpp index 31e73f2f990..4824e343e64 100644 --- a/src/caffe/test/test_data_transformer.cpp +++ b/src/caffe/test/test_data_transformer.cpp @@ -38,7 +38,7 @@ class DataTransformTest : public ::testing::Test { int NumSequenceMatches(const TransformationParameter transform_param, Datum& datum, Phase phase) { // Get crop sequence with Caffe seed 1701. 
- DataTransformer transformer(transform_param, phase); + DataTransformer transformer(transform_param, phase); const int crop_size = transform_param.crop_size(); Caffe::set_random_seed(seed_); transformer.InitRand(); @@ -85,7 +85,7 @@ TYPED_TEST(DataTransformTest, TestEmptyTransform) { Datum datum; FillDatum(label, channels, height, width, unique_pixels, &datum); TBlob blob(1, channels, height, width); - DataTransformer transformer(transform_param, TEST); + DataTransformer transformer(transform_param, TEST); transformer.InitRand(); transformer.Transform(datum, &blob); EXPECT_EQ(blob.num(), 1); @@ -108,7 +108,7 @@ TYPED_TEST(DataTransformTest, TestEmptyTransformUniquePixels) { Datum datum; FillDatum(label, channels, height, width, unique_pixels, &datum); TBlob blob(1, 3, 4, 5); - DataTransformer transformer(transform_param, TEST); + DataTransformer transformer(transform_param, TEST); transformer.InitRand(); transformer.Transform(datum, &blob); EXPECT_EQ(blob.num(), 1); @@ -132,7 +132,7 @@ TYPED_TEST(DataTransformTest, TestCropSize) { transform_param.set_crop_size(crop_size); Datum datum; FillDatum(label, channels, height, width, unique_pixels, &datum); - DataTransformer transformer(transform_param, TEST); + DataTransformer transformer(transform_param, TEST); transformer.InitRand(); TBlob blob(1, channels, crop_size, crop_size); for (int iter = 0; iter < this->num_iter_; ++iter) { @@ -270,7 +270,7 @@ TYPED_TEST(DataTransformTest, TestMeanValue) { Datum datum; FillDatum(label, channels, height, width, unique_pixels, &datum); TBlob blob(1, channels, height, width); - DataTransformer transformer(transform_param, TEST); + DataTransformer transformer(transform_param, TEST); transformer.InitRand(); transformer.Transform(datum, &blob); for (int j = 0; j < blob.count(); ++j) { @@ -292,7 +292,7 @@ TYPED_TEST(DataTransformTest, TestMeanValues) { Datum datum; FillDatum(label, channels, height, width, unique_pixels, &datum); TBlob blob(1, channels, height, width); - 
DataTransformer transformer(transform_param, TEST); + DataTransformer transformer(transform_param, TEST); transformer.InitRand(); transformer.Transform(datum, &blob); for (int c = 0; c < channels; ++c) { @@ -331,7 +331,7 @@ TYPED_TEST(DataTransformTest, TestMeanFile) { Datum datum; FillDatum(label, channels, height, width, unique_pixels, &datum); TBlob blob(1, channels, height, width); - DataTransformer transformer(transform_param, TEST); + DataTransformer transformer(transform_param, TEST); transformer.InitRand(); transformer.Transform(datum, &blob); for (int j = 0; j < blob.count(); ++j) { @@ -345,6 +345,27 @@ class VarSzTransformsTest : public ::testing::Test { VarSzTransformsTest() : seed_(1701) {} + void VariableSizedTransforms(DataTransformer& transformer, Datum* datum) { + const TransformationParameter& param = transformer.transform_param(); + cv::Mat img1, img2; + const int color_mode = param.force_color() ? 1 : (param.force_gray() ? -1 : 0); + if (datum->encoded()) { + DecodeDatumToCVMat(*datum, color_mode, img1, false); + } else { + DatumToCVMat(*datum, img1, false); + } + + transformer.image_random_resize(img1, img2); + + if (transformer.image_random_crop_enabled()) { + transformer.image_random_crop(param.crop_size(), param.crop_size(), img2); + } + if (transformer.image_center_crop_enabled()) { + transformer.image_center_crop(param.crop_size(), param.crop_size(), img2); + } + CVMatToDatum(img2, *datum); + } + void Run( TransformationParameter transform_param, int expected_height, int expected_width) { @@ -356,12 +377,13 @@ class VarSzTransformsTest : public ::testing::Test { shared_ptr datum = make_shared(); FillDatum(label, channels, height, width, unique_pixels, datum.get()); - DataTransformer transformer(transform_param, TEST); + DataTransformer transformer(transform_param, TEST); Caffe::set_random_seed(seed_); transformer.InitRand(); shared_ptr transformed_datum = make_shared(*datum); - transformer.VariableSizedTransforms(transformed_datum.get()); + 
VariableSizedTransforms(transformer, transformed_datum.get()); + EXPECT_EQ(transformed_datum->channels(), 3); EXPECT_EQ(transformed_datum->height(), expected_height); EXPECT_EQ(transformed_datum->width(), expected_width); @@ -407,7 +429,7 @@ class GPUDataTransformTest : public GPUDeviceTest { int NumSequenceMatches(const TransformationParameter transform_param, Datum& datum, Phase phase) { // Get crop sequence with Caffe seed 1701. - DataTransformer transformer(transform_param, phase); + DataTransformer transformer(transform_param, phase); const int crop_size = transform_param.crop_size(); Caffe::set_random_seed(seed_); transformer.InitRand(); @@ -456,7 +478,7 @@ TYPED_TEST(GPUDataTransformTest, TestCropSize) { transform_param.set_use_gpu_transform(true); Datum datum; FillDatum(label, channels, height, width, unique_pixels, &datum); - DataTransformer transformer(transform_param, TEST); + DataTransformer transformer(transform_param, TEST); transformer.InitRand(); TBlob blob(1, channels, crop_size, crop_size); for (int iter = 0; iter < this->num_iter_; ++iter) { @@ -600,7 +622,7 @@ TYPED_TEST(GPUDataTransformTest, TestMeanValue) { Datum datum; FillDatum(label, channels, height, width, unique_pixels, &datum); TBlob blob(1, channels, height, width); - DataTransformer transformer(transform_param, TEST); + DataTransformer transformer(transform_param, TEST); transformer.InitRand(); transformer.Transform(datum, &blob); for (int j = 0; j < blob.count(); ++j) { @@ -623,7 +645,7 @@ TYPED_TEST(GPUDataTransformTest, TestMeanValues) { Datum datum; FillDatum(label, channels, height, width, unique_pixels, &datum); TBlob blob(1, channels, height, width); - DataTransformer transformer(transform_param, TEST); + DataTransformer transformer(transform_param, TEST); transformer.InitRand(); transformer.Transform(datum, &blob); for (int c = 0; c < channels; ++c) { @@ -662,7 +684,7 @@ TYPED_TEST(GPUDataTransformTest, TestMeanFile) { Datum datum; FillDatum(label, channels, height, width, 
unique_pixels, &datum); TBlob blob(1, channels, height, width); - DataTransformer transformer(transform_param, TEST); + DataTransformer transformer(transform_param, TEST); transformer.InitRand(); transformer.Transform(datum, &blob); for (int j = 0; j < blob.count(); ++j) { diff --git a/src/caffe/test/test_eltwise_layer.cpp b/src/caffe/test/test_eltwise_layer.cpp index c6b187a420d..00fff4abe4b 100644 --- a/src/caffe/test/test_eltwise_layer.cpp +++ b/src/caffe/test/test_eltwise_layer.cpp @@ -94,6 +94,17 @@ TYPED_TEST(EltwiseLayerTest, TestProd) { TYPED_TEST(EltwiseLayerTest, TestSum) { typedef typename TypeParam::Dtype Dtype; + const int count = this->blob_top_->count(); + const Dtype* in_data_a = this->blob_bottom_a_->cpu_data(); + const Dtype* in_data_b = this->blob_bottom_b_->cpu_data(); + const Dtype* in_data_c = this->blob_bottom_c_->cpu_data(); + vector shape(count); + TBlob val(shape); + Dtype* pval = val.mutable_cpu_data(); + for (int i = 0; i < count; ++i) { + pval[i] = in_data_a[i] + in_data_b[i] + in_data_c[i]; + } + LayerParameter layer_param; layer_param.set_forward_type(tp()); layer_param.set_backward_type(tp()); @@ -106,13 +117,8 @@ TYPED_TEST(EltwiseLayerTest, TestSum) { layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); const Dtype* data = this->blob_top_->cpu_data(); - const int count = this->blob_top_->count(); - const Dtype* in_data_a = this->blob_bottom_a_->cpu_data(); - const Dtype* in_data_b = this->blob_bottom_b_->cpu_data(); - const Dtype* in_data_c = this->blob_bottom_c_->cpu_data(); for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data_a[i] + in_data_b[i] + in_data_c[i], - tol(1e-4, 2e-3)); + EXPECT_NEAR(data[i], pval[i], tol(1e-4, 2e-3)); } } diff --git a/src/caffe/test/test_lstm_layer.cpp b/src/caffe/test/test_lstm_layer.cpp new file mode 100644 index 00000000000..a2d8f43aef6 --- /dev/null +++ b/src/caffe/test/test_lstm_layer.cpp @@ -0,0 +1,324 @@ +#include 
+#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layers/lstm_layer.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class LSTMLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + LSTMLayerTest() : num_output_(7) { + blob_bottom_vec_.push_back(&blob_bottom_); + blob_bottom_vec_.push_back(&blob_bottom_cont_); + blob_top_vec_.push_back(&blob_top_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_c_prev_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_x_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_cont_); + unit_blob_top_vec_.push_back(&unit_blob_top_c_); + unit_blob_top_vec_.push_back(&unit_blob_top_h_); + + ReshapeBlobs(1, 3); + + layer_param_.set_forward_type(tp()); + layer_param_.set_backward_type(tp()); + layer_param_.set_forward_math(tp()); + layer_param_.set_backward_math(tp()); + layer_param_.mutable_recurrent_param()->set_num_output(num_output_); + FillerParameter* weight_filler = + layer_param_.mutable_recurrent_param()->mutable_weight_filler(); + weight_filler->set_type("gaussian"); + weight_filler->set_std(0.2); + FillerParameter* bias_filler = + layer_param_.mutable_recurrent_param()->mutable_bias_filler(); + bias_filler->set_type("gaussian"); + bias_filler->set_std(0.1); + + layer_param_.set_phase(TEST); + } + + void ReshapeBlobs(int num_timesteps, int num_instances) { + blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2); + blob_bottom_static_.Reshape(num_instances, 2, 3, 4); + vector shape(2); + shape[0] = num_timesteps; + shape[1] = num_instances; + blob_bottom_cont_.Reshape(shape); + shape.push_back(num_output_); + + shape[0] = 1; shape[1] = num_instances; shape[2] = 4 * num_output_; + unit_blob_bottom_x_.Reshape(shape); + shape[0] = 1; shape[1] = num_instances; shape[2] = num_output_; + 
unit_blob_bottom_c_prev_.Reshape(shape); + shape.resize(2); + shape[0] = 1; shape[1] = num_instances; + unit_blob_bottom_cont_.Reshape(shape); + + FillerParameter filler_param; + filler_param.set_min(-1); + filler_param.set_max(1); + UniformFiller filler(filler_param); + filler.Fill(&blob_bottom_); + filler.Fill(&unit_blob_bottom_c_prev_); + filler.Fill(&unit_blob_bottom_x_); + } + + int num_output_; + LayerParameter layer_param_; + TBlob blob_bottom_; + TBlob blob_bottom_cont_; + TBlob blob_bottom_static_; + TBlob blob_top_; + vector blob_bottom_vec_; + vector blob_top_vec_; + + TBlob unit_blob_bottom_cont_; + TBlob unit_blob_bottom_c_prev_; + TBlob unit_blob_bottom_x_; + TBlob unit_blob_top_c_; + TBlob unit_blob_top_h_; + vector unit_blob_bottom_vec_; + vector unit_blob_top_vec_; +}; + +TYPED_TEST_CASE(LSTMLayerTest, TestDtypesAndDevices); + +TYPED_TEST(LSTMLayerTest, TestSetUp) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + vector expected_top_shape = this->blob_bottom_.shape(); + expected_top_shape.resize(3); + expected_top_shape[2] = this->num_output_; + EXPECT_TRUE(this->blob_top_.shape() == expected_top_shape); +} + +TYPED_TEST(LSTMLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + const int kNumTimesteps = 3; + const int num = this->blob_bottom_.shape(1); + this->ReshapeBlobs(kNumTimesteps, num); + + // Fill the cont blob with <0, 1, 1, ..., 1>, + // indicating a sequence that begins at the first timestep + // then continues for the rest of the sequence. + for (int t = 0; t < kNumTimesteps; ++t) { + for (int n = 0; n < num; ++n) { + this->blob_bottom_cont_.mutable_cpu_data()[t * num + n] = t > 0; + } + } + + // Process the full sequence in a single batch. 
+ FillerParameter filler_param; + filler_param.set_mean(0); + filler_param.set_std(1); + GaussianFiller sequence_filler(filler_param); + Caffe::set_random_seed(1); + sequence_filler.Fill(&this->blob_bottom_); + this->layer_param_.set_forward_type(tp()); + this->layer_param_.set_backward_type(tp()); + this->layer_param_.set_forward_math(tp()); + this->layer_param_.set_backward_math(tp()); + shared_ptr > layer(new LSTMLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + LOG(INFO) << "Calling forward for full sequence LSTM"; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Copy the inputs and outputs to reuse/check them later. + TBlob bottom_copy(this->blob_bottom_.shape()); + bottom_copy.CopyFrom(this->blob_bottom_); + TBlob top_copy(this->blob_top_.shape()); + top_copy.CopyFrom(this->blob_top_); + + // Process the batch one timestep at a time; + // check that we get the same result. + this->ReshapeBlobs(1, num); + layer.reset(new LSTMLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + const int bottom_count = this->blob_bottom_.count(); + const int top_count = this->blob_top_.count(); + const Dtype kEpsilon = 1e-5; + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_cont_.mutable_cpu_data()[n] = t > 0; + } + LOG(INFO) << "Calling forward for LSTM timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + ASSERT_LT(t * top_count + i, top_copy.count()); + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } + } + + // Process the batch one timestep at a time with all cont blobs set to 0. 
+ // Check that we get a different result, except in the first timestep. + Caffe::set_random_seed(1701); + layer.reset(new LSTMLayer(this->layer_param_)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_cont_.mutable_cpu_data()[n] = 0; + } + LOG(INFO) << "Calling forward for LSTM timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + if (t == 0) { + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } else { + EXPECT_NE(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i]) + << "t = " << t << "; i = " << i; + } + } + } +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitSetUp) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.set_forward_type(tp()); + layer_param.set_backward_type(tp()); + layer_param.set_forward_math(tp()); + layer_param.set_backward_math(tp()); + LSTMUnitLayer layer(layer_param); + layer.SetUp(this->unit_blob_bottom_vec_, this->unit_blob_top_vec_); + const int num_axes = this->unit_blob_bottom_c_prev_.num_axes(); + ASSERT_EQ(num_axes, this->unit_blob_top_c_.num_axes()); + ASSERT_EQ(num_axes, this->unit_blob_top_h_.num_axes()); + for (int i = 0; i < num_axes; ++i) { + EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i), + this->unit_blob_top_c_.shape(i)); + EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i), + this->unit_blob_top_h_.shape(i)); + } +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.set_forward_type(tp()); + layer_param.set_backward_type(tp()); + layer_param.set_forward_math(tp()); + layer_param.set_backward_math(tp()); + 
LSTMUnitLayer layer(layer_param); + GradientChecker checker(tol(1e-2, 1e-1), tol(1e-3, 1e-2)); + Dtype* cont_data = this->blob_bottom_cont_.mutable_cpu_data(); + cont_data[0] = 0; + cont_data[1] = 0; + cont_data[2] = 0; + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 1); +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradientNonZeroCont) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.set_forward_type(tp()); + layer_param.set_backward_type(tp()); + layer_param.set_forward_math(tp()); + layer_param.set_backward_math(tp()); + LSTMUnitLayer layer(layer_param); + GradientChecker checker(tol(1e-2, 1e-1), tol(1e-3, 1e-2)); + Dtype* cont_data = this->blob_bottom_cont_.mutable_cpu_data(); + cont_data[0] = 1; + cont_data[1] = 0; + cont_data[2] = 1; + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 1); +} + +TYPED_TEST(LSTMLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + this->layer_param_.set_forward_type(tp()); + this->layer_param_.set_backward_type(tp()); + this->layer_param_.set_forward_math(tp()); + this->layer_param_.set_backward_math(tp()); + LSTMLayer layer(this->layer_param_); + GradientChecker checker(tol(1e-2, 1e-1), tol(1e-3, 1e-2)); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(LSTMLayerTest, TestGradientNonZeroCont) { + typedef typename TypeParam::Dtype Dtype; + this->layer_param_.set_forward_type(tp()); + this->layer_param_.set_backward_type(tp()); + this->layer_param_.set_forward_math(tp()); + this->layer_param_.set_backward_math(tp()); + LSTMLayer layer(this->layer_param_); + GradientChecker checker(tol(1e-2, 1e-1), tol(1e-3, 1e-2)); + 
for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { + this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(LSTMLayerTest, TestGradientNonZeroContBufferSize2) { + typedef typename TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(&this->blob_bottom_); + this->layer_param_.set_forward_type(tp()); + this->layer_param_.set_backward_type(tp()); + this->layer_param_.set_forward_math(tp()); + this->layer_param_.set_backward_math(tp()); + LSTMLayer layer(this->layer_param_); + GradientChecker checker(tol(1e-2, 1e-1), tol(1e-3, 1e-2)); + for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { + this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(LSTMLayerTest, TestGradientNonZeroContBufferSize2WithStaticInput) { + typedef typename TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(&this->blob_bottom_); + filler.Fill(&this->blob_bottom_static_); + this->blob_bottom_vec_.push_back(&this->blob_bottom_static_); + this->layer_param_.set_forward_type(tp()); + this->layer_param_.set_backward_type(tp()); + this->layer_param_.set_forward_math(tp()); + this->layer_param_.set_backward_math(tp()); + LSTMLayer layer(this->layer_param_); + GradientChecker checker(tol(1e-2, 1e-1), tol(1e-3, 1e-2)); + for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { + this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 2); +} + + +} // namespace caffe diff --git a/src/caffe/test/test_net.cpp 
b/src/caffe/test/test_net.cpp index 21f9461ef3d..df9e1f9ed5c 100644 --- a/src/caffe/test/test_net.cpp +++ b/src/caffe/test/test_net.cpp @@ -48,6 +48,16 @@ class NetTest : public MultiDeviceTest { } } + virtual void InitNetFromProtoFileWithState(const string& proto, + Phase phase = caffe::TRAIN, const int level = 0, + const vector* stages = NULL) { + NetParameter param; + CHECK(google::protobuf::TextFormat::ParseFromString(proto, ¶m)); + string param_file = MakeTempFilename(); + WriteProtoToTextFile(param, param_file); + net_.reset(new Net(param_file, phase, 0U, nullptr, nullptr, false, level, stages)); + } + virtual void CopyNetBlobs(const bool copy_diff, vector > >* blobs_copy) { CHECK(net_); @@ -876,10 +886,67 @@ class NetTest : public MultiDeviceTest { InitNetFromProtoString(proto); } + virtual void InitAllInOneNet(Phase phase = caffe::TRAIN, + const int level = 0, const vector* stages = NULL) { + string proto = + "name: 'All-in-one Network'" + "layer { " + " name: 'train-data' " + " type: 'DummyData' " + " top: 'data' " + " top: 'label' " + " dummy_data_param { " + " shape { dim: 1 dim: 10 } " + " shape { dim: 1 dim: 1 } " + " } " + " include { phase: TRAIN stage: 'train' } " + "} " + "layer { " + " name: 'val-data' " + " type: 'DummyData' " + " top: 'data' " + " top: 'label' " + " dummy_data_param { " + " shape { dim: 1 dim: 10 } " + " shape { dim: 1 dim: 1 } " + " } " + " include { phase: TEST stage: 'val' } " + "} " + "layer { " + " name: 'deploy-data' " + " type: 'Input' " + " top: 'data' " + " input_param { " + " shape { dim: 1 dim: 10 } " + " } " + " include { phase: TEST stage: 'deploy' } " + "} " + "layer { " + " name: 'ip' " + " type: 'InnerProduct' " + " bottom: 'data' " + " top: 'ip' " + " inner_product_param { " + " num_output: 2 " + " } " + "} " + "layer { " + " name: 'loss' " + " type: 'SoftmaxWithLoss' " + " bottom: 'ip' " + " bottom: 'label' " + " top: 'loss' " + " include { phase: TRAIN stage: 'train' } " + " include { phase: TEST stage: 'val' 
} " + "} "; + InitNetFromProtoFileWithState(proto, phase, level, stages); + } + int seed_; shared_ptr net_; }; + TYPED_TEST_CASE(NetTest, TestDtypesAndDevices); TYPED_TEST(NetTest, TestHasBlob) { @@ -2624,4 +2691,64 @@ TYPED_TEST(NetTest, TestForcePropagateDown) { } } +TYPED_TEST(NetTest, TestAllInOneNetTrain) { + vector stages; + stages.push_back("train"); + this->InitAllInOneNet(caffe::TRAIN, 0, &stages); + bool found_data = false; + bool found_loss = false; + for (int i = 0; i < this->net_->layers().size(); ++i) { + const string& layer_name = this->net_->layer_names()[i]; + if (layer_name == "train-data") { + found_data = true; + } else if (layer_name == "loss") { + found_loss = true; + } else { + ASSERT_NE(layer_name, "val-data"); + ASSERT_NE(layer_name, "deploy-data"); + } + } + ASSERT_TRUE(found_data); + ASSERT_TRUE(found_loss); +} + +TYPED_TEST(NetTest, TestAllInOneNetVal) { + vector stages; + stages.push_back("val"); + this->InitAllInOneNet(caffe::TEST, 0, &stages); + bool found_data = false; + bool found_loss = false; + for (int i = 0; i < this->net_->layers().size(); ++i) { + const string& layer_name = this->net_->layer_names()[i]; + if (layer_name == "val-data") { + found_data = true; + } else if (layer_name == "loss") { + found_loss = true; + } else { + ASSERT_NE(layer_name, "train-data"); + ASSERT_NE(layer_name, "deploy-data"); + } + } + ASSERT_TRUE(found_data); + ASSERT_TRUE(found_loss); +} + +TYPED_TEST(NetTest, TestAllInOneNetDeploy) { + vector stages; + stages.push_back("deploy"); + this->InitAllInOneNet(caffe::TEST, 0, &stages); + bool found_data = false; + for (int i = 0; i < this->net_->layers().size(); ++i) { + const string& layer_name = this->net_->layer_names()[i]; + if (layer_name == "deploy-data") { + found_data = true; + } else { + ASSERT_NE(layer_name, "train-data"); + ASSERT_NE(layer_name, "val-data"); + ASSERT_NE(layer_name, "loss"); + } + } + ASSERT_TRUE(found_data); +} + } // namespace caffe diff --git 
a/src/caffe/test/test_neuron_layer.cpp b/src/caffe/test/test_neuron_layer.cpp index 2dfe9d61269..f30c9117d20 100644 --- a/src/caffe/test/test_neuron_layer.cpp +++ b/src/caffe/test/test_neuron_layer.cpp @@ -819,7 +819,7 @@ class CuDNNNeuronLayerTest : public GPUDeviceTest { LayerParameter layer_param; // Fill in the given dropout_ratio, unless it's 0.5, in which case we don't // set it explicitly to test that 0.5 is the default. - if (dropout_ratio != 0.5) { + if (dropout_ratio != 0.5F) { layer_param.mutable_dropout_param()->set_dropout_ratio(dropout_ratio); } DropoutLayer layer(layer_param); @@ -829,7 +829,7 @@ class CuDNNNeuronLayerTest : public GPUDeviceTest { // Now, check values const Dtype* bottom_data = this->blob_bottom_->cpu_data(); const Dtype* top_data = this->blob_top_->cpu_data(); - float scale = 1. / (1. - layer_param.dropout_param().dropout_ratio()); + float scale = 1.F / (1.F - layer_param.dropout_param().dropout_ratio()); const int count = this->blob_bottom_->count(); // Initialize num_kept to count the number of inputs NOT dropped out. 
int num_kept = 0; diff --git a/src/caffe/test/test_rnn_layer.cpp b/src/caffe/test/test_rnn_layer.cpp new file mode 100644 index 00000000000..437e4e0a04c --- /dev/null +++ b/src/caffe/test/test_rnn_layer.cpp @@ -0,0 +1,241 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layers/rnn_layer.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class RNNLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + RNNLayerTest() : num_output_(7) { + blob_bottom_vec_.push_back(&blob_bottom_); + blob_bottom_vec_.push_back(&blob_bottom_cont_); + blob_top_vec_.push_back(&blob_top_); + + ReshapeBlobs(1, 3); + + layer_param_.mutable_recurrent_param()->set_num_output(num_output_); + FillerParameter* weight_filler = + layer_param_.mutable_recurrent_param()->mutable_weight_filler(); + weight_filler->set_type("gaussian"); + weight_filler->set_std(0.2); + FillerParameter* bias_filler = + layer_param_.mutable_recurrent_param()->mutable_bias_filler(); + bias_filler->set_type("gaussian"); + bias_filler->set_std(0.1); + + layer_param_.set_phase(TEST); + } + + void ReshapeBlobs(int num_timesteps, int num_instances) { + blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2); + blob_bottom_static_.Reshape(num_instances, 2, 3, 4); + vector shape(2); + shape[0] = num_timesteps; + shape[1] = num_instances; + blob_bottom_cont_.Reshape(shape); + + FillerParameter filler_param; + filler_param.set_min(-1); + filler_param.set_max(1); + UniformFiller filler(filler_param); + filler.Fill(&blob_bottom_); + } + + int num_output_; + LayerParameter layer_param_; + TBlob blob_bottom_; + TBlob blob_bottom_cont_; + TBlob blob_bottom_static_; + TBlob blob_top_; + vector blob_bottom_vec_; + vector blob_top_vec_; +}; + +TYPED_TEST_CASE(RNNLayerTest, TestDtypesGPUOnly); + 
+TYPED_TEST(RNNLayerTest, TestSetUp) { + typedef typename TypeParam::Dtype Dtype; + this->layer_param_.set_forward_type(tp()); + this->layer_param_.set_backward_type(tp()); + this->layer_param_.set_forward_math(tp()); + this->layer_param_.set_backward_math(tp()); + RNNLayer layer(this->layer_param_); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + vector expected_top_shape = this->blob_bottom_.shape(); + expected_top_shape.resize(3); + expected_top_shape[2] = this->num_output_; + EXPECT_TRUE(this->blob_top_.shape() == expected_top_shape); +} + +TYPED_TEST(RNNLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + const int kNumTimesteps = 3; + const int num = this->blob_bottom_.shape(1); + this->ReshapeBlobs(kNumTimesteps, num); + + // Fill the cont blob with <0, 1, 1, ..., 1>, + // indicating a sequence that begins at the first timestep + // then continues for the rest of the sequence. + for (int t = 0; t < kNumTimesteps; ++t) { + for (int n = 0; n < num; ++n) { + this->blob_bottom_cont_.mutable_cpu_data()[t * num + n] = t > 0; + } + } + + // Process the full sequence in a single batch. + FillerParameter filler_param; + filler_param.set_mean(0); + filler_param.set_std(1); + GaussianFiller sequence_filler(filler_param); + sequence_filler.Fill(&this->blob_bottom_); + this->layer_param_.set_forward_type(tp()); + this->layer_param_.set_backward_type(tp()); + this->layer_param_.set_forward_math(tp()); + this->layer_param_.set_backward_math(tp()); + shared_ptr > layer(new RNNLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + LOG(INFO) << "Calling forward for full sequence RNN"; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Copy the inputs and outputs to reuse/check them later. 
+ TBlob bottom_copy(this->blob_bottom_.shape()); + bottom_copy.CopyFrom(this->blob_bottom_); + TBlob top_copy(this->blob_top_.shape()); + top_copy.CopyFrom(this->blob_top_); + + // Process the batch one timestep at a time; + // check that we get the same result. + this->ReshapeBlobs(1, num); + layer.reset(new RNNLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + const int bottom_count = this->blob_bottom_.count(); + const int top_count = this->blob_top_.count(); + const Dtype kEpsilon = 1e-5; + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_cont_.mutable_cpu_data()[n] = t > 0; + } + LOG(INFO) << "Calling forward for RNN timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + ASSERT_LT(t * top_count + i, top_copy.count()); + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } + } + + // Process the batch one timestep at a time with all cont blobs set to 0. + // Check that we get a different result, except in the first timestep. 
+ Caffe::set_random_seed(1701); + layer.reset(new RNNLayer(this->layer_param_)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_cont_.mutable_cpu_data()[n] = 0; + } + LOG(INFO) << "Calling forward for RNN timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + if (t == 0) { + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } else { + EXPECT_NE(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i]) + << "t = " << t << "; i = " << i; + } + } + } +} + +TYPED_TEST(RNNLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + this->layer_param_.set_forward_type(tp()); + this->layer_param_.set_backward_type(tp()); + this->layer_param_.set_forward_math(tp()); + this->layer_param_.set_backward_math(tp()); + RNNLayer layer(this->layer_param_); + GradientChecker checker(tol(1e-2, 1e-1), tol(1e-3, 1e-2)); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(RNNLayerTest, TestGradientNonZeroCont) { + typedef typename TypeParam::Dtype Dtype; + this->layer_param_.set_forward_type(tp()); + this->layer_param_.set_backward_type(tp()); + this->layer_param_.set_forward_math(tp()); + this->layer_param_.set_backward_math(tp()); + RNNLayer layer(this->layer_param_); + GradientChecker checker(tol(1e-2, 1e-1), tol(1e-3, 1e-2)); + for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { + this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(RNNLayerTest, TestGradientNonZeroContBufferSize2) { + typedef typename 
TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + // fill the values + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(&this->blob_bottom_); + this->layer_param_.set_forward_type(tp()); + this->layer_param_.set_backward_type(tp()); + this->layer_param_.set_forward_math(tp()); + this->layer_param_.set_backward_math(tp()); + RNNLayer layer(this->layer_param_); + GradientChecker checker(tol(1e-2, 1e-1), tol(1e-3, 1e-2)); + for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { + this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(RNNLayerTest, TestGradientNonZeroContBufferSize2WithStaticInput) { + typedef typename TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(&this->blob_bottom_); + filler.Fill(&this->blob_bottom_static_); + this->blob_bottom_vec_.push_back(&this->blob_bottom_static_); + this->layer_param_.set_forward_type(tp()); + this->layer_param_.set_backward_type(tp()); + this->layer_param_.set_forward_math(tpmax()); + this->layer_param_.set_backward_math(tpmax()); + RNNLayer layer(this->layer_param_); + GradientChecker checker(tol(1e-2, 1e-1), tol(1e-3, 1e-2)); + for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { + this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 2); +} + +} // namespace caffe diff --git a/src/caffe/test/test_stochastic_pooling.cpp b/src/caffe/test/test_stochastic_pooling.cpp index a1fa436dbed..7990a6c7b10 100644 --- a/src/caffe/test/test_stochastic_pooling.cpp +++ b/src/caffe/test/test_stochastic_pooling.cpp @@ -76,6 +76,10 @@ TYPED_TEST_CASE(GPUStochasticPoolingLayerTest, TestDtypes); 
TYPED_TEST(GPUStochasticPoolingLayerTest, TestStochastic) { LayerParameter layer_param; layer_param.set_phase(TRAIN); + layer_param.set_forward_type(tp()); + layer_param.set_backward_type(tp()); + layer_param.set_forward_math(tp()); + layer_param.set_backward_math(tp()); PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); pooling_param->set_kernel_size(3); pooling_param->set_stride(2); @@ -119,6 +123,10 @@ TYPED_TEST(GPUStochasticPoolingLayerTest, TestStochastic) { TYPED_TEST(GPUStochasticPoolingLayerTest, TestStochasticTestPhase) { LayerParameter layer_param; layer_param.set_phase(TEST); + layer_param.set_forward_type(tp()); + layer_param.set_backward_type(tp()); + layer_param.set_forward_math(tp()); + layer_param.set_backward_math(tp()); PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); pooling_param->set_kernel_size(3); pooling_param->set_stride(2); @@ -154,21 +162,22 @@ TYPED_TEST(GPUStochasticPoolingLayerTest, TestStochasticTestPhase) { } TYPED_TEST(GPUStochasticPoolingLayerTest, TestGradient) { - if (!is_precise()) { - return; - } LayerParameter layer_param; layer_param.set_phase(TRAIN); + layer_param.set_forward_type(tp()); + layer_param.set_backward_type(tp()); + layer_param.set_forward_math(tp()); + layer_param.set_backward_math(tp()); PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); pooling_param->set_kernel_size(3); pooling_param->set_stride(2); pooling_param->set_pool(PoolingParameter_PoolMethod_STOCHASTIC); PoolingLayer layer(layer_param); - GradientChecker checker(tol(1e-4, 3e-3), tol(1e-2, 1e-1)); + GradientChecker checker(tol(1e-4, 3e-3), tol(1e-1, 1e-1), + 1701, 0.F, 1.F); // it is too expensive to call curand multiple times, so we don't do an // exhaustive gradient check. 
- checker.CheckGradient(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); + checker.CheckGradient(&layer, this->blob_bottom_vec_, this->blob_top_vec_); } } // namespace caffe diff --git a/src/caffe/util/bbox_util.cpp b/src/caffe/util/bbox_util.cpp new file mode 100644 index 00000000000..5c9ce3601a8 --- /dev/null +++ b/src/caffe/util/bbox_util.cpp @@ -0,0 +1,2311 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "boost/iterator/counting_iterator.hpp" + +#include "caffe/util/bbox_util.hpp" + +namespace caffe { + +bool SortBBoxAscend(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2) { + return bbox1.score() < bbox2.score(); +} + +bool SortBBoxDescend(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2) { + return bbox1.score() > bbox2.score(); +} + +template +bool SortScorePairAscend(const pair& pair1, + const pair& pair2) { + return pair1.first < pair2.first; +} + +// Explicit initialization. +template bool SortScorePairAscend(const pair& pair1, + const pair& pair2); +template bool SortScorePairAscend(const pair >& pair1, + const pair >& pair2); + +template +bool SortScorePairDescend(const pair& pair1, + const pair& pair2) { + return pair1.first > pair2.first; +} + +// Explicit initialization. 
+template bool SortScorePairDescend(const pair& pair1, + const pair& pair2); +template bool SortScorePairDescend(const pair >& pair1, + const pair >& pair2); + +NormalizedBBox UnitBBox() { + NormalizedBBox unit_bbox; + unit_bbox.set_xmin(0.); + unit_bbox.set_ymin(0.); + unit_bbox.set_xmax(1.); + unit_bbox.set_ymax(1.); + return unit_bbox; +} + +bool IsCrossBoundaryBBox(const NormalizedBBox& bbox) { + return bbox.xmin() < 0 || bbox.xmin() > 1 || + bbox.ymin() < 0 || bbox.ymin() > 1 || + bbox.xmax() < 0 || bbox.xmax() > 1 || + bbox.ymax() < 0 || bbox.ymax() > 1; +} + +void IntersectBBox(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2, + NormalizedBBox* intersect_bbox) { + if (bbox2.xmin() > bbox1.xmax() || bbox2.xmax() < bbox1.xmin() || + bbox2.ymin() > bbox1.ymax() || bbox2.ymax() < bbox1.ymin()) { + // Return [0, 0, 0, 0] if there is no intersection. + intersect_bbox->set_xmin(0); + intersect_bbox->set_ymin(0); + intersect_bbox->set_xmax(0); + intersect_bbox->set_ymax(0); + } else { + intersect_bbox->set_xmin(std::max(bbox1.xmin(), bbox2.xmin())); + intersect_bbox->set_ymin(std::max(bbox1.ymin(), bbox2.ymin())); + intersect_bbox->set_xmax(std::min(bbox1.xmax(), bbox2.xmax())); + intersect_bbox->set_ymax(std::min(bbox1.ymax(), bbox2.ymax())); + } +} + +float BBoxSize(const NormalizedBBox& bbox, const bool normalized) { + if (bbox.xmax() < bbox.xmin() || bbox.ymax() < bbox.ymin()) { + // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0. + return 0; + } else { + if (bbox.has_size()) { + return bbox.size(); + } else { + float width = bbox.xmax() - bbox.xmin(); + float height = bbox.ymax() - bbox.ymin(); + if (normalized) { + return width * height; + } else { + // If bbox is not within range [0, 1]. + return (width + 1) * (height + 1); + } + } + } +} + +template +Dtype BBoxSize(const Dtype* bbox, const bool normalized) { + if (bbox[2] < bbox[0] || bbox[3] < bbox[1]) { + // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0. 
+ return Dtype(0.); + } else { + const Dtype width = bbox[2] - bbox[0]; + const Dtype height = bbox[3] - bbox[1]; + if (normalized) { + return width * height; + } else { + // If bbox is not within range [0, 1]. + return (width + 1) * (height + 1); + } + } +} + +template float BBoxSize(const float* bbox, const bool normalized); +template double BBoxSize(const double* bbox, const bool normalized); + +void ClipBBox(const NormalizedBBox& bbox, NormalizedBBox* clip_bbox) { + clip_bbox->set_xmin(std::max(std::min(bbox.xmin(), 1.f), 0.f)); + clip_bbox->set_ymin(std::max(std::min(bbox.ymin(), 1.f), 0.f)); + clip_bbox->set_xmax(std::max(std::min(bbox.xmax(), 1.f), 0.f)); + clip_bbox->set_ymax(std::max(std::min(bbox.ymax(), 1.f), 0.f)); + clip_bbox->clear_size(); + clip_bbox->set_size(BBoxSize(*clip_bbox)); + clip_bbox->set_difficult(bbox.difficult()); +} + +void ClipBBox(const NormalizedBBox& bbox, const float height, const float width, + NormalizedBBox* clip_bbox) { + clip_bbox->set_xmin(std::max(std::min(bbox.xmin(), width), 0.f)); + clip_bbox->set_ymin(std::max(std::min(bbox.ymin(), height), 0.f)); + clip_bbox->set_xmax(std::max(std::min(bbox.xmax(), width), 0.f)); + clip_bbox->set_ymax(std::max(std::min(bbox.ymax(), height), 0.f)); + clip_bbox->clear_size(); + clip_bbox->set_size(BBoxSize(*clip_bbox)); + clip_bbox->set_difficult(bbox.difficult()); +} + +void ScaleBBox(const NormalizedBBox& bbox, const int height, const int width, + NormalizedBBox* scale_bbox) { + scale_bbox->set_xmin(bbox.xmin() * width); + scale_bbox->set_ymin(bbox.ymin() * height); + scale_bbox->set_xmax(bbox.xmax() * width); + scale_bbox->set_ymax(bbox.ymax() * height); + scale_bbox->clear_size(); + bool normalized = !(width > 1 || height > 1); + scale_bbox->set_size(BBoxSize(*scale_bbox, normalized)); + scale_bbox->set_difficult(bbox.difficult()); +} + +void OutputBBox(const NormalizedBBox& bbox, const pair& img_size, + const bool has_resize, const ResizeParameter& resize_param, + NormalizedBBox* 
out_bbox) { + const int height = img_size.first; + const int width = img_size.second; + NormalizedBBox temp_bbox = bbox; + if (has_resize && resize_param.resize_mode()) { + float resize_height = resize_param.height(); + CHECK_GT(resize_height, 0); + float resize_width = resize_param.width(); + CHECK_GT(resize_width, 0); + float resize_aspect = resize_width / resize_height; + int height_scale = resize_param.height_scale(); + int width_scale = resize_param.width_scale(); + float aspect = static_cast(width) / height; + + float padding; + NormalizedBBox source_bbox; + switch (resize_param.resize_mode()) { + case ResizeParameter_Resize_mode_WARP: + ClipBBox(temp_bbox, &temp_bbox); + ScaleBBox(temp_bbox, height, width, out_bbox); + break; + case ResizeParameter_Resize_mode_FIT_LARGE_SIZE_AND_PAD: + if (aspect > resize_aspect) { + padding = (resize_height - resize_width / aspect) / 2; + source_bbox.set_xmin(0.); + source_bbox.set_ymin(padding / resize_height); + source_bbox.set_xmax(1.); + source_bbox.set_ymax(1. - padding / resize_height); + } else { + padding = (resize_width - resize_height * aspect) / 2; + source_bbox.set_xmin(padding / resize_width); + source_bbox.set_ymin(0.); + source_bbox.set_xmax(1. - padding / resize_width); + source_bbox.set_ymax(1.); + } + ProjectBBox(source_bbox, bbox, &temp_bbox); + ClipBBox(temp_bbox, &temp_bbox); + ScaleBBox(temp_bbox, height, width, out_bbox); + break; + case ResizeParameter_Resize_mode_FIT_SMALL_SIZE: + if (height_scale == 0 || width_scale == 0) { + ClipBBox(temp_bbox, &temp_bbox); + ScaleBBox(temp_bbox, height, width, out_bbox); + } else { + ScaleBBox(temp_bbox, height_scale, width_scale, out_bbox); + ClipBBox(*out_bbox, height, width, out_bbox); + } + break; + default: + LOG(FATAL) << "Unknown resize mode."; + } + } else { + // Clip the normalized bbox first. + ClipBBox(temp_bbox, &temp_bbox); + // Scale the bbox according to the original image size. 
+ ScaleBBox(temp_bbox, height, width, out_bbox); + } +} + +void LocateBBox(const NormalizedBBox& src_bbox, const NormalizedBBox& bbox, + NormalizedBBox* loc_bbox) { + float src_width = src_bbox.xmax() - src_bbox.xmin(); + float src_height = src_bbox.ymax() - src_bbox.ymin(); + loc_bbox->set_xmin(src_bbox.xmin() + bbox.xmin() * src_width); + loc_bbox->set_ymin(src_bbox.ymin() + bbox.ymin() * src_height); + loc_bbox->set_xmax(src_bbox.xmin() + bbox.xmax() * src_width); + loc_bbox->set_ymax(src_bbox.ymin() + bbox.ymax() * src_height); + loc_bbox->set_difficult(bbox.difficult()); +} + +bool ProjectBBox(const NormalizedBBox& src_bbox, const NormalizedBBox& bbox, + NormalizedBBox* proj_bbox) { + if (bbox.xmin() >= src_bbox.xmax() || bbox.xmax() <= src_bbox.xmin() || + bbox.ymin() >= src_bbox.ymax() || bbox.ymax() <= src_bbox.ymin()) { + return false; + } + float src_width = src_bbox.xmax() - src_bbox.xmin(); + float src_height = src_bbox.ymax() - src_bbox.ymin(); + proj_bbox->set_xmin((bbox.xmin() - src_bbox.xmin()) / src_width); + proj_bbox->set_ymin((bbox.ymin() - src_bbox.ymin()) / src_height); + proj_bbox->set_xmax((bbox.xmax() - src_bbox.xmin()) / src_width); + proj_bbox->set_ymax((bbox.ymax() - src_bbox.ymin()) / src_height); + proj_bbox->set_difficult(bbox.difficult()); + ClipBBox(*proj_bbox, proj_bbox); + if (BBoxSize(*proj_bbox) > 0) { + return true; + } else { + return false; + } +} + +void ExtrapolateBBox(const ResizeParameter& param, const int height, + const int width, const NormalizedBBox& crop_bbox, NormalizedBBox* bbox) { + float height_scale = param.height_scale(); + float width_scale = param.width_scale(); + if (height_scale > 0 && width_scale > 0 && + param.resize_mode() == ResizeParameter_Resize_mode_FIT_SMALL_SIZE) { + float orig_aspect = static_cast(width) / height; + float resize_height = param.height(); + float resize_width = param.width(); + float resize_aspect = resize_width / resize_height; + if (orig_aspect < resize_aspect) { + resize_height = 
resize_width / orig_aspect; + } else { + resize_width = resize_height * orig_aspect; + } + float crop_height = resize_height * (crop_bbox.ymax() - crop_bbox.ymin()); + float crop_width = resize_width * (crop_bbox.xmax() - crop_bbox.xmin()); + CHECK_GE(crop_width, width_scale); + CHECK_GE(crop_height, height_scale); + bbox->set_xmin(bbox->xmin() * crop_width / width_scale); + bbox->set_xmax(bbox->xmax() * crop_width / width_scale); + bbox->set_ymin(bbox->ymin() * crop_height / height_scale); + bbox->set_ymax(bbox->ymax() * crop_height / height_scale); + } +} + +float JaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2, + const bool normalized) { + NormalizedBBox intersect_bbox; + IntersectBBox(bbox1, bbox2, &intersect_bbox); + float intersect_width, intersect_height; + if (normalized) { + intersect_width = intersect_bbox.xmax() - intersect_bbox.xmin(); + intersect_height = intersect_bbox.ymax() - intersect_bbox.ymin(); + } else { + intersect_width = intersect_bbox.xmax() - intersect_bbox.xmin() + 1; + intersect_height = intersect_bbox.ymax() - intersect_bbox.ymin() + 1; + } + if (intersect_width > 0 && intersect_height > 0) { + float intersect_size = intersect_width * intersect_height; + float bbox1_size = BBoxSize(bbox1); + float bbox2_size = BBoxSize(bbox2); + return intersect_size / (bbox1_size + bbox2_size - intersect_size); + } else { + return 0.; + } +} + +template +Dtype JaccardOverlap(const Dtype* bbox1, const Dtype* bbox2) { + if (bbox2[0] > bbox1[2] || bbox2[2] < bbox1[0] || + bbox2[1] > bbox1[3] || bbox2[3] < bbox1[1]) { + return Dtype(0.); + } else { + const Dtype inter_xmin = std::max(bbox1[0], bbox2[0]); + const Dtype inter_ymin = std::max(bbox1[1], bbox2[1]); + const Dtype inter_xmax = std::min(bbox1[2], bbox2[2]); + const Dtype inter_ymax = std::min(bbox1[3], bbox2[3]); + + const Dtype inter_width = inter_xmax - inter_xmin; + const Dtype inter_height = inter_ymax - inter_ymin; + const Dtype inter_size = inter_width * inter_height; 
+ + const Dtype bbox1_size = BBoxSize(bbox1); + const Dtype bbox2_size = BBoxSize(bbox2); + + return inter_size / (bbox1_size + bbox2_size - inter_size); + } +} + +template float JaccardOverlap(const float* bbox1, const float* bbox2); +template double JaccardOverlap(const double* bbox1, const double* bbox2); +template float16 JaccardOverlap(const float16* bbox1, const float16* bbox2); + +float BBoxCoverage(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2) { + NormalizedBBox intersect_bbox; + IntersectBBox(bbox1, bbox2, &intersect_bbox); + float intersect_size = BBoxSize(intersect_bbox); + if (intersect_size > 0) { + float bbox1_size = BBoxSize(bbox1); + return intersect_size / bbox1_size; + } else { + return 0.; + } +} + +bool MeetEmitConstraint(const NormalizedBBox& src_bbox, + const NormalizedBBox& bbox, + const EmitConstraint& emit_constraint) { + EmitType emit_type = emit_constraint.emit_type(); + if (emit_type == EmitConstraint_EmitType_CENTER) { + float x_center = (bbox.xmin() + bbox.xmax()) / 2; + float y_center = (bbox.ymin() + bbox.ymax()) / 2; + if (x_center >= src_bbox.xmin() && x_center <= src_bbox.xmax() && + y_center >= src_bbox.ymin() && y_center <= src_bbox.ymax()) { + return true; + } else { + return false; + } + } else if (emit_type == EmitConstraint_EmitType_MIN_OVERLAP) { + float bbox_coverage = BBoxCoverage(bbox, src_bbox); + return bbox_coverage > emit_constraint.emit_overlap(); + } else { + LOG(FATAL) << "Unknown emit type."; + return false; + } +} + +void EncodeBBox( + const NormalizedBBox& prior_bbox, const vector& prior_variance, + const CodeType code_type, const bool encode_variance_in_target, + const NormalizedBBox& bbox, NormalizedBBox* encode_bbox) { + if (code_type == PriorBoxParameter_CodeType_CORNER) { + if (encode_variance_in_target) { + encode_bbox->set_xmin(bbox.xmin() - prior_bbox.xmin()); + encode_bbox->set_ymin(bbox.ymin() - prior_bbox.ymin()); + encode_bbox->set_xmax(bbox.xmax() - prior_bbox.xmax()); + 
encode_bbox->set_ymax(bbox.ymax() - prior_bbox.ymax()); + } else { + // Encode variance in bbox. + CHECK_EQ(prior_variance.size(), 4); + for (int i = 0; i < prior_variance.size(); ++i) { + CHECK_GT(prior_variance[i], 0); + } + encode_bbox->set_xmin( + (bbox.xmin() - prior_bbox.xmin()) / prior_variance[0]); + encode_bbox->set_ymin( + (bbox.ymin() - prior_bbox.ymin()) / prior_variance[1]); + encode_bbox->set_xmax( + (bbox.xmax() - prior_bbox.xmax()) / prior_variance[2]); + encode_bbox->set_ymax( + (bbox.ymax() - prior_bbox.ymax()) / prior_variance[3]); + } + } else if (code_type == PriorBoxParameter_CodeType_CENTER_SIZE) { + float prior_width = prior_bbox.xmax() - prior_bbox.xmin(); + CHECK_GT(prior_width, 0); + float prior_height = prior_bbox.ymax() - prior_bbox.ymin(); + CHECK_GT(prior_height, 0); + float prior_center_x = (prior_bbox.xmin() + prior_bbox.xmax()) / 2.; + float prior_center_y = (prior_bbox.ymin() + prior_bbox.ymax()) / 2.; + + float bbox_width = bbox.xmax() - bbox.xmin(); + CHECK_GT(bbox_width, 0); + float bbox_height = bbox.ymax() - bbox.ymin(); + CHECK_GT(bbox_height, 0); + float bbox_center_x = (bbox.xmin() + bbox.xmax()) / 2.; + float bbox_center_y = (bbox.ymin() + bbox.ymax()) / 2.; + + if (encode_variance_in_target) { + encode_bbox->set_xmin((bbox_center_x - prior_center_x) / prior_width); + encode_bbox->set_ymin((bbox_center_y - prior_center_y) / prior_height); + encode_bbox->set_xmax(log(bbox_width / prior_width)); + encode_bbox->set_ymax(log(bbox_height / prior_height)); + } else { + // Encode variance in bbox. 
+ encode_bbox->set_xmin( + (bbox_center_x - prior_center_x) / prior_width / prior_variance[0]); + encode_bbox->set_ymin( + (bbox_center_y - prior_center_y) / prior_height / prior_variance[1]); + encode_bbox->set_xmax( + log(bbox_width / prior_width) / prior_variance[2]); + encode_bbox->set_ymax( + log(bbox_height / prior_height) / prior_variance[3]); + } + } else if (code_type == PriorBoxParameter_CodeType_CORNER_SIZE) { + float prior_width = prior_bbox.xmax() - prior_bbox.xmin(); + CHECK_GT(prior_width, 0); + float prior_height = prior_bbox.ymax() - prior_bbox.ymin(); + CHECK_GT(prior_height, 0); + if (encode_variance_in_target) { + encode_bbox->set_xmin((bbox.xmin() - prior_bbox.xmin()) / prior_width); + encode_bbox->set_ymin((bbox.ymin() - prior_bbox.ymin()) / prior_height); + encode_bbox->set_xmax((bbox.xmax() - prior_bbox.xmax()) / prior_width); + encode_bbox->set_ymax((bbox.ymax() - prior_bbox.ymax()) / prior_height); + } else { + // Encode variance in bbox. + CHECK_EQ(prior_variance.size(), 4); + for (int i = 0; i < prior_variance.size(); ++i) { + CHECK_GT(prior_variance[i], 0); + } + encode_bbox->set_xmin( + (bbox.xmin() - prior_bbox.xmin()) / prior_width / prior_variance[0]); + encode_bbox->set_ymin( + (bbox.ymin() - prior_bbox.ymin()) / prior_height / prior_variance[1]); + encode_bbox->set_xmax( + (bbox.xmax() - prior_bbox.xmax()) / prior_width / prior_variance[2]); + encode_bbox->set_ymax( + (bbox.ymax() - prior_bbox.ymax()) / prior_height / prior_variance[3]); + } + } else { + LOG(FATAL) << "Unknown LocLossType."; + } +} + +void DecodeBBox( + const NormalizedBBox& prior_bbox, const vector& prior_variance, + const CodeType code_type, const bool variance_encoded_in_target, + const bool clip_bbox, const NormalizedBBox& bbox, + NormalizedBBox* decode_bbox) { + if (code_type == PriorBoxParameter_CodeType_CORNER) { + if (variance_encoded_in_target) { + // variance is encoded in target, we simply need to add the offset + // predictions. 
+ decode_bbox->set_xmin(prior_bbox.xmin() + bbox.xmin()); + decode_bbox->set_ymin(prior_bbox.ymin() + bbox.ymin()); + decode_bbox->set_xmax(prior_bbox.xmax() + bbox.xmax()); + decode_bbox->set_ymax(prior_bbox.ymax() + bbox.ymax()); + } else { + // variance is encoded in bbox, we need to scale the offset accordingly. + decode_bbox->set_xmin( + prior_bbox.xmin() + prior_variance[0] * bbox.xmin()); + decode_bbox->set_ymin( + prior_bbox.ymin() + prior_variance[1] * bbox.ymin()); + decode_bbox->set_xmax( + prior_bbox.xmax() + prior_variance[2] * bbox.xmax()); + decode_bbox->set_ymax( + prior_bbox.ymax() + prior_variance[3] * bbox.ymax()); + } + } else if (code_type == PriorBoxParameter_CodeType_CENTER_SIZE) { + float prior_width = prior_bbox.xmax() - prior_bbox.xmin(); + CHECK_GT(prior_width, 0); + float prior_height = prior_bbox.ymax() - prior_bbox.ymin(); + CHECK_GT(prior_height, 0); + float prior_center_x = (prior_bbox.xmin() + prior_bbox.xmax()) / 2.; + float prior_center_y = (prior_bbox.ymin() + prior_bbox.ymax()) / 2.; + + float decode_bbox_center_x, decode_bbox_center_y; + float decode_bbox_width, decode_bbox_height; + if (variance_encoded_in_target) { + // variance is encoded in target, we simply need to retore the offset + // predictions. + decode_bbox_center_x = bbox.xmin() * prior_width + prior_center_x; + decode_bbox_center_y = bbox.ymin() * prior_height + prior_center_y; + decode_bbox_width = exp(bbox.xmax()) * prior_width; + decode_bbox_height = exp(bbox.ymax()) * prior_height; + } else { + // variance is encoded in bbox, we need to scale the offset accordingly. 
+ decode_bbox_center_x = + prior_variance[0] * bbox.xmin() * prior_width + prior_center_x; + decode_bbox_center_y = + prior_variance[1] * bbox.ymin() * prior_height + prior_center_y; + decode_bbox_width = + exp(prior_variance[2] * bbox.xmax()) * prior_width; + decode_bbox_height = + exp(prior_variance[3] * bbox.ymax()) * prior_height; + } + + decode_bbox->set_xmin(decode_bbox_center_x - decode_bbox_width / 2.); + decode_bbox->set_ymin(decode_bbox_center_y - decode_bbox_height / 2.); + decode_bbox->set_xmax(decode_bbox_center_x + decode_bbox_width / 2.); + decode_bbox->set_ymax(decode_bbox_center_y + decode_bbox_height / 2.); + } else if (code_type == PriorBoxParameter_CodeType_CORNER_SIZE) { + float prior_width = prior_bbox.xmax() - prior_bbox.xmin(); + CHECK_GT(prior_width, 0); + float prior_height = prior_bbox.ymax() - prior_bbox.ymin(); + CHECK_GT(prior_height, 0); + if (variance_encoded_in_target) { + // variance is encoded in target, we simply need to add the offset + // predictions. + decode_bbox->set_xmin(prior_bbox.xmin() + bbox.xmin() * prior_width); + decode_bbox->set_ymin(prior_bbox.ymin() + bbox.ymin() * prior_height); + decode_bbox->set_xmax(prior_bbox.xmax() + bbox.xmax() * prior_width); + decode_bbox->set_ymax(prior_bbox.ymax() + bbox.ymax() * prior_height); + } else { + // variance is encoded in bbox, we need to scale the offset accordingly. 
+ decode_bbox->set_xmin( + prior_bbox.xmin() + prior_variance[0] * bbox.xmin() * prior_width); + decode_bbox->set_ymin( + prior_bbox.ymin() + prior_variance[1] * bbox.ymin() * prior_height); + decode_bbox->set_xmax( + prior_bbox.xmax() + prior_variance[2] * bbox.xmax() * prior_width); + decode_bbox->set_ymax( + prior_bbox.ymax() + prior_variance[3] * bbox.ymax() * prior_height); + } + } else { + LOG(FATAL) << "Unknown LocLossType."; + } + float bbox_size = BBoxSize(*decode_bbox); + decode_bbox->set_size(bbox_size); + if (clip_bbox) { + ClipBBox(*decode_bbox, decode_bbox); + } +} + +void DecodeBBoxes( + const vector& prior_bboxes, + const vector >& prior_variances, + const CodeType code_type, const bool variance_encoded_in_target, + const bool clip_bbox, const vector& bboxes, + vector* decode_bboxes) { + CHECK_EQ(prior_bboxes.size(), prior_variances.size()); + CHECK_EQ(prior_bboxes.size(), bboxes.size()); + int num_bboxes = prior_bboxes.size(); + if (num_bboxes >= 1) { + CHECK_EQ(prior_variances[0].size(), 4); + } + decode_bboxes->clear(); + for (int i = 0; i < num_bboxes; ++i) { + NormalizedBBox decode_bbox; + DecodeBBox(prior_bboxes[i], prior_variances[i], code_type, + variance_encoded_in_target, clip_bbox, bboxes[i], &decode_bbox); + decode_bboxes->push_back(decode_bbox); + } +} + +void DecodeBBoxesAll(const vector& all_loc_preds, + const vector& prior_bboxes, + const vector >& prior_variances, + const int num, const bool share_location, + const int num_loc_classes, const int background_label_id, + const CodeType code_type, const bool variance_encoded_in_target, + const bool clip, vector* all_decode_bboxes) { + CHECK_EQ(all_loc_preds.size(), num); + all_decode_bboxes->clear(); + all_decode_bboxes->resize(num); + for (int i = 0; i < num; ++i) { + // Decode predictions into bboxes. + LabelBBox& decode_bboxes = (*all_decode_bboxes)[i]; + for (int c = 0; c < num_loc_classes; ++c) { + int label = share_location ? 
-1 : c; + if (label == background_label_id) { + // Ignore background class. + continue; + } + if (all_loc_preds[i].find(label) == all_loc_preds[i].end()) { + // Something bad happened if there are no predictions for current label. + LOG(FATAL) << "Could not find location predictions for label " << label; + } + const vector& label_loc_preds = + all_loc_preds[i].find(label)->second; + DecodeBBoxes(prior_bboxes, prior_variances, + code_type, variance_encoded_in_target, clip, + label_loc_preds, &(decode_bboxes[label])); + } + } +} + +void MatchBBox(const vector& gt_bboxes, + const vector& pred_bboxes, const int label, + const MatchType match_type, const float overlap_threshold, + const bool ignore_cross_boundary_bbox, + vector* match_indices, vector* match_overlaps) { + int num_pred = pred_bboxes.size(); + match_indices->clear(); + match_indices->resize(num_pred, -1); + match_overlaps->clear(); + match_overlaps->resize(num_pred, 0.); + + int num_gt = 0; + vector gt_indices; + if (label == -1) { + // label -1 means comparing against all ground truth. + num_gt = gt_bboxes.size(); + for (int i = 0; i < num_gt; ++i) { + gt_indices.push_back(i); + } + } else { + // Count number of ground truth boxes which has the desired label. + for (int i = 0; i < gt_bboxes.size(); ++i) { + if (gt_bboxes[i].label() == label) { + num_gt++; + gt_indices.push_back(i); + } + } + } + if (num_gt == 0) { + return; + } + + // Store the positive overlap between predictions and ground truth. + map > overlaps; + for (int i = 0; i < num_pred; ++i) { + if (ignore_cross_boundary_bbox && IsCrossBoundaryBBox(pred_bboxes[i])) { + (*match_indices)[i] = -2; + continue; + } + for (int j = 0; j < num_gt; ++j) { + float overlap = JaccardOverlap(pred_bboxes[i], gt_bboxes[gt_indices[j]]); + if (overlap > 1e-6) { + (*match_overlaps)[i] = std::max((*match_overlaps)[i], overlap); + overlaps[i][j] = overlap; + } + } + } + + // Bipartite matching. 
+ vector gt_pool; + for (int i = 0; i < num_gt; ++i) { + gt_pool.push_back(i); + } + while (gt_pool.size() > 0) { + // Find the most overlapped gt and cooresponding predictions. + int max_idx = -1; + int max_gt_idx = -1; + float max_overlap = -1; + for (map >::iterator it = overlaps.begin(); + it != overlaps.end(); ++it) { + int i = it->first; + if ((*match_indices)[i] != -1) { + // The prediction already has matched ground truth or is ignored. + continue; + } + for (int p = 0; p < gt_pool.size(); ++p) { + int j = gt_pool[p]; + if (it->second.find(j) == it->second.end()) { + // No overlap between the i-th prediction and j-th ground truth. + continue; + } + // Find the maximum overlapped pair. + if (it->second[j] > max_overlap) { + // If the prediction has not been matched to any ground truth, + // and the overlap is larger than maximum overlap, update. + max_idx = i; + max_gt_idx = j; + max_overlap = it->second[j]; + } + } + } + if (max_idx == -1) { + // Cannot find good match. + break; + } else { + CHECK_EQ((*match_indices)[max_idx], -1); + (*match_indices)[max_idx] = gt_indices[max_gt_idx]; + (*match_overlaps)[max_idx] = max_overlap; + // Erase the ground truth. + gt_pool.erase(std::find(gt_pool.begin(), gt_pool.end(), max_gt_idx)); + } + } + + switch (match_type) { + case MultiBoxLossParameter_MatchType_BIPARTITE: + // Already done. + break; + case MultiBoxLossParameter_MatchType_PER_PREDICTION: + // Get most overlaped for the rest prediction bboxes. + for (map >::iterator it = overlaps.begin(); + it != overlaps.end(); ++it) { + int i = it->first; + if ((*match_indices)[i] != -1) { + // The prediction already has matched ground truth or is ignored. + continue; + } + int max_gt_idx = -1; + float max_overlap = -1; + for (int j = 0; j < num_gt; ++j) { + if (it->second.find(j) == it->second.end()) { + // No overlap between the i-th prediction and j-th ground truth. + continue; + } + // Find the maximum overlapped pair. 
+ float overlap = it->second[j]; + if (overlap >= overlap_threshold && overlap > max_overlap) { + // If the prediction has not been matched to any ground truth, + // and the overlap is larger than maximum overlap, update. + max_gt_idx = j; + max_overlap = overlap; + } + } + if (max_gt_idx != -1) { + // Found a matched ground truth. + CHECK_EQ((*match_indices)[i], -1); + (*match_indices)[i] = gt_indices[max_gt_idx]; + (*match_overlaps)[i] = max_overlap; + } + } + break; + default: + LOG(FATAL) << "Unknown matching type."; + break; + } + + return; +} + +void FindMatches(const vector& all_loc_preds, + const map >& all_gt_bboxes, + const vector& prior_bboxes, + const vector >& prior_variances, + const MultiBoxLossParameter& multibox_loss_param, + vector > >* all_match_overlaps, + vector > >* all_match_indices) { + // all_match_overlaps->clear(); + // all_match_indices->clear(); + // Get parameters. + CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes."; + const int num_classes = multibox_loss_param.num_classes(); + CHECK_GE(num_classes, 1) << "num_classes should not be less than 1."; + const bool share_location = multibox_loss_param.share_location(); + const int loc_classes = share_location ? 1 : num_classes; + const MatchType match_type = multibox_loss_param.match_type(); + const float overlap_threshold = multibox_loss_param.overlap_threshold(); + const bool use_prior_for_matching = + multibox_loss_param.use_prior_for_matching(); + const int background_label_id = multibox_loss_param.background_label_id(); + const CodeType code_type = multibox_loss_param.code_type(); + const bool encode_variance_in_target = + multibox_loss_param.encode_variance_in_target(); + const bool ignore_cross_boundary_bbox = + multibox_loss_param.ignore_cross_boundary_bbox(); + // Find the matches. + int num = all_loc_preds.size(); + for (int i = 0; i < num; ++i) { + map > match_indices; + map > match_overlaps; + // Check if there is ground truth for current image. 
+ if (all_gt_bboxes.find(i) == all_gt_bboxes.end()) { + // There is no gt for current image. All predictions are negative. + all_match_indices->push_back(match_indices); + all_match_overlaps->push_back(match_overlaps); + continue; + } + // Find match between predictions and ground truth. + const vector& gt_bboxes = all_gt_bboxes.find(i)->second; + if (!use_prior_for_matching) { + for (int c = 0; c < loc_classes; ++c) { + int label = share_location ? -1 : c; + if (!share_location && label == background_label_id) { + // Ignore background loc predictions. + continue; + } + // Decode the prediction into bbox first. + vector loc_bboxes; + bool clip_bbox = false; + DecodeBBoxes(prior_bboxes, prior_variances, + code_type, encode_variance_in_target, clip_bbox, + all_loc_preds[i].find(label)->second, &loc_bboxes); + MatchBBox(gt_bboxes, loc_bboxes, label, match_type, + overlap_threshold, ignore_cross_boundary_bbox, + &match_indices[label], &match_overlaps[label]); + } + } else { + // Use prior bboxes to match against all ground truth. + vector temp_match_indices; + vector temp_match_overlaps; + const int label = -1; + MatchBBox(gt_bboxes, prior_bboxes, label, match_type, overlap_threshold, + ignore_cross_boundary_bbox, &temp_match_indices, + &temp_match_overlaps); + if (share_location) { + match_indices[label] = temp_match_indices; + match_overlaps[label] = temp_match_overlaps; + } else { + // Get ground truth label for each ground truth bbox. + vector gt_labels; + for (int g = 0; g < gt_bboxes.size(); ++g) { + gt_labels.push_back(gt_bboxes[g].label()); + } + // Distribute the matching results to different loc_class. + for (int c = 0; c < loc_classes; ++c) { + if (c == background_label_id) { + // Ignore background loc predictions. 
+ continue; + } + match_indices[c].resize(temp_match_indices.size(), -1); + match_overlaps[c] = temp_match_overlaps; + for (int m = 0; m < temp_match_indices.size(); ++m) { + if (temp_match_indices[m] > -1) { + const int gt_idx = temp_match_indices[m]; + CHECK_LT(gt_idx, gt_labels.size()); + if (c == gt_labels[gt_idx]) { + match_indices[c][m] = gt_idx; + } + } + } + } + } + } + all_match_indices->push_back(match_indices); + all_match_overlaps->push_back(match_overlaps); + } +} + +int CountNumMatches(const vector > >& all_match_indices, + const int num) { + int num_matches = 0; + for (int i = 0; i < num; ++i) { + const map >& match_indices = all_match_indices[i]; + for (map >::const_iterator it = match_indices.begin(); + it != match_indices.end(); ++it) { + const vector& match_index = it->second; + for (int m = 0; m < match_index.size(); ++m) { + if (match_index[m] > -1) { + ++num_matches; + } + } + } + } + return num_matches; +} + +inline bool IsEligibleMining(const MiningType mining_type, const int match_idx, + const float match_overlap, const float neg_overlap) { + if (mining_type == MultiBoxLossParameter_MiningType_MAX_NEGATIVE) { + return match_idx == -1 && match_overlap < neg_overlap; + } else if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE) { + return true; + } else { + return false; + } +} + +template +void MineHardExamples(const TBlob& conf_blob, + const vector& all_loc_preds, + const map >& all_gt_bboxes, + const vector& prior_bboxes, + const vector >& prior_variances, + const vector > >& all_match_overlaps, + const MultiBoxLossParameter& multibox_loss_param, + int* num_matches, int* num_negs, + vector > >* all_match_indices, + vector >* all_neg_indices) { + int num = all_loc_preds.size(); + // CHECK_EQ(num, all_match_overlaps.size()); + // CHECK_EQ(num, all_match_indices->size()); + // all_neg_indices->clear(); + *num_matches = CountNumMatches(*all_match_indices, num); + *num_negs = 0; + int num_priors = prior_bboxes.size(); + 
CHECK_EQ(num_priors, prior_variances.size()); + // Get parameters. + CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes."; + const int num_classes = multibox_loss_param.num_classes(); + CHECK_GE(num_classes, 1) << "num_classes should not be less than 1."; + const int background_label_id = multibox_loss_param.background_label_id(); + const bool use_prior_for_nms = multibox_loss_param.use_prior_for_nms(); + const ConfLossType conf_loss_type = multibox_loss_param.conf_loss_type(); + const MiningType mining_type = multibox_loss_param.mining_type(); + if (mining_type == MultiBoxLossParameter_MiningType_NONE) { + return; + } + const LocLossType loc_loss_type = multibox_loss_param.loc_loss_type(); + const float neg_pos_ratio = multibox_loss_param.neg_pos_ratio(); + const float neg_overlap = multibox_loss_param.neg_overlap(); + const CodeType code_type = multibox_loss_param.code_type(); + const bool encode_variance_in_target = + multibox_loss_param.encode_variance_in_target(); + const bool has_nms_param = multibox_loss_param.has_nms_param(); + float nms_threshold = 0; + int top_k = -1; + if (has_nms_param) { + nms_threshold = multibox_loss_param.nms_param().nms_threshold(); + top_k = multibox_loss_param.nms_param().top_k(); + } + const int sample_size = multibox_loss_param.sample_size(); + // Compute confidence losses based on matching results. + vector > all_conf_loss; +// ComputeConfLoss(conf_blob.cpu_data(), num, num_priors, num_classes, +// background_label_id, conf_loss_type, *all_match_indices, all_gt_bboxes, +// &all_conf_loss); + ComputeConfLossGPU(conf_blob, num, num_priors, num_classes, + background_label_id, conf_loss_type, *all_match_indices, all_gt_bboxes, + &all_conf_loss); + vector > all_loc_loss; + if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE) { + // Compute localization losses based on matching results. 
+ TBlob loc_pred, loc_gt; + if (*num_matches != 0) { + vector loc_shape(2, 1); + loc_shape[1] = *num_matches * 4; + loc_pred.Reshape(loc_shape); + loc_gt.Reshape(loc_shape); + Dtype* loc_pred_data = loc_pred.mutable_cpu_data(); + Dtype* loc_gt_data = loc_gt.mutable_cpu_data(); + EncodeLocPrediction(all_loc_preds, all_gt_bboxes, *all_match_indices, + prior_bboxes, prior_variances, multibox_loss_param, + loc_pred_data, loc_gt_data); + } + ComputeLocLoss(loc_pred, loc_gt, *all_match_indices, num, + num_priors, loc_loss_type, &all_loc_loss); + } else { + // No localization loss. + for (int i = 0; i < num; ++i) { + vector loc_loss(num_priors, 0.f); + all_loc_loss.push_back(loc_loss); + } + } + for (int i = 0; i < num; ++i) { + map >& match_indices = (*all_match_indices)[i]; + const map >& match_overlaps = all_match_overlaps[i]; + // loc + conf loss. + const vector& conf_loss = all_conf_loss[i]; + const vector& loc_loss = all_loc_loss[i]; + vector loss; + std::transform(conf_loss.begin(), conf_loss.end(), loc_loss.begin(), + std::back_inserter(loss), std::plus()); + // Pick negatives or hard examples based on loss. + set sel_indices; + vector neg_indices; + for (map >::iterator it = match_indices.begin(); + it != match_indices.end(); ++it) { + const int label = it->first; + int num_sel = 0; + // Get potential indices and loss pairs. 
+ vector > loss_indices; + for (int m = 0; m < match_indices[label].size(); ++m) { + if (IsEligibleMining(mining_type, match_indices[label][m], + match_overlaps.find(label)->second[m], neg_overlap)) { + loss_indices.push_back(std::make_pair(loss[m], m)); + ++num_sel; + } + } + if (mining_type == MultiBoxLossParameter_MiningType_MAX_NEGATIVE) { + int num_pos = 0; + for (int m = 0; m < match_indices[label].size(); ++m) { + if (match_indices[label][m] > -1) { + ++num_pos; + } + } + num_sel = std::min(static_cast(num_pos * neg_pos_ratio), num_sel); + } else if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE) { + CHECK_GT(sample_size, 0); + num_sel = std::min(sample_size, num_sel); + } + // Select samples. + if (has_nms_param && nms_threshold > 0) { + // Do nms before selecting samples. + vector sel_loss; + vector sel_bboxes; + if (use_prior_for_nms) { + for (int m = 0; m < match_indices[label].size(); ++m) { + if (IsEligibleMining(mining_type, match_indices[label][m], + match_overlaps.find(label)->second[m], neg_overlap)) { + sel_loss.push_back(loss[m]); + sel_bboxes.push_back(prior_bboxes[m]); + } + } + } else { + // Decode the prediction into bbox first. + vector loc_bboxes; + bool clip_bbox = false; + DecodeBBoxes(prior_bboxes, prior_variances, + code_type, encode_variance_in_target, clip_bbox, + all_loc_preds[i].find(label)->second, &loc_bboxes); + for (int m = 0; m < match_indices[label].size(); ++m) { + if (IsEligibleMining(mining_type, match_indices[label][m], + match_overlaps.find(label)->second[m], neg_overlap)) { + sel_loss.push_back(loss[m]); + sel_bboxes.push_back(loc_bboxes[m]); + } + } + } + // Do non-maximum suppression based on the loss. + vector nms_indices; + ApplyNMS(sel_bboxes, sel_loss, nms_threshold, top_k, &nms_indices); + if (nms_indices.size() < num_sel) { + LOG(INFO) << "not enough sample after nms: " << nms_indices.size(); + } + // Pick top example indices after nms. 
+ num_sel = std::min(static_cast(nms_indices.size()), num_sel); + for (int n = 0; n < num_sel; ++n) { + sel_indices.insert(loss_indices[nms_indices[n]].second); + } + } else { + // Pick top example indices based on loss. + std::sort(loss_indices.begin(), loss_indices.end(), + SortScorePairDescend); + for (int n = 0; n < num_sel; ++n) { + sel_indices.insert(loss_indices[n].second); + } + } + // Update the match_indices and select neg_indices. + for (int m = 0; m < match_indices[label].size(); ++m) { + if (match_indices[label][m] > -1) { + if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE && + sel_indices.find(m) == sel_indices.end()) { + match_indices[label][m] = -1; + *num_matches -= 1; + } + } else if (match_indices[label][m] == -1) { + if (sel_indices.find(m) != sel_indices.end()) { + neg_indices.push_back(m); + *num_negs += 1; + } + } + } + } + all_neg_indices->push_back(neg_indices); + } +} + +// Explicite initialization. +template void MineHardExamples(const TBlob& conf_blob, + const vector& all_loc_preds, + const map >& all_gt_bboxes, + const vector& prior_bboxes, + const vector >& prior_variances, + const vector > >& all_match_overlaps, + const MultiBoxLossParameter& multibox_loss_param, + int* num_matches, int* num_negs, + vector > >* all_match_indices, + vector >* all_neg_indices); +template void MineHardExamples(const TBlob& conf_blob, + const vector& all_loc_preds, + const map >& all_gt_bboxes, + const vector& prior_bboxes, + const vector >& prior_variances, + const vector > >& all_match_overlaps, + const MultiBoxLossParameter& multibox_loss_param, + int* num_matches, int* num_negs, + vector > >* all_match_indices, + vector >* all_neg_indices); +template void MineHardExamples(const TBlob& conf_blob, + const vector& all_loc_preds, + const map >& all_gt_bboxes, + const vector& prior_bboxes, + const vector >& prior_variances, + const vector > >& all_match_overlaps, + const MultiBoxLossParameter& multibox_loss_param, + int* num_matches, int* 
num_negs, + vector > >* all_match_indices, + vector >* all_neg_indices); +template +void GetGroundTruth(const Dtype* gt_data, const int num_gt, + const int background_label_id, const bool use_difficult_gt, + map >* all_gt_bboxes) { + all_gt_bboxes->clear(); + for (int i = 0; i < num_gt; ++i) { + int start_idx = i * 8; + int item_id = gt_data[start_idx]; + if (item_id == -1) { + continue; + } + int label = gt_data[start_idx + 1]; + if (is_precise()) { // round float16 to int? + CHECK_NE(background_label_id, label) + << "Found background label in the dataset."; + } else if (background_label_id == label) { + continue; + } + bool difficult = static_cast(gt_data[start_idx + 7]); + if (!use_difficult_gt && difficult) { + // Skip reading difficult ground truth. + continue; + } + NormalizedBBox bbox; + bbox.set_label(label); + bbox.set_xmin(gt_data[start_idx + 3]); + bbox.set_ymin(gt_data[start_idx + 4]); + bbox.set_xmax(gt_data[start_idx + 5]); + bbox.set_ymax(gt_data[start_idx + 6]); + bbox.set_difficult(difficult); + float bbox_size = BBoxSize(bbox); + bbox.set_size(bbox_size); + (*all_gt_bboxes)[item_id].push_back(bbox); + } +} + +// Explicit initialization. 
+template void GetGroundTruth(const float* gt_data, const int num_gt, + const int background_label_id, const bool use_difficult_gt, + map >* all_gt_bboxes); +template void GetGroundTruth(const double* gt_data, const int num_gt, + const int background_label_id, const bool use_difficult_gt, + map >* all_gt_bboxes); +template void GetGroundTruth(const float16* gt_data, const int num_gt, + const int background_label_id, const bool use_difficult_gt, + map >* all_gt_bboxes); + +template +void GetGroundTruth(const Dtype* gt_data, const int num_gt, + const int background_label_id, const bool use_difficult_gt, + map* all_gt_bboxes) { + all_gt_bboxes->clear(); + for (int i = 0; i < num_gt; ++i) { + int start_idx = i * 8; + int item_id = gt_data[start_idx]; + if (item_id == -1) { + break; + } + NormalizedBBox bbox; + int label = gt_data[start_idx + 1]; + CHECK_NE(background_label_id, label) + << "Found background label in the dataset."; + bool difficult = static_cast(gt_data[start_idx + 7]); + if (!use_difficult_gt && difficult) { + // Skip reading difficult ground truth. + continue; + } + bbox.set_xmin(gt_data[start_idx + 3]); + bbox.set_ymin(gt_data[start_idx + 4]); + bbox.set_xmax(gt_data[start_idx + 5]); + bbox.set_ymax(gt_data[start_idx + 6]); + bbox.set_difficult(difficult); + float bbox_size = BBoxSize(bbox); + bbox.set_size(bbox_size); + (*all_gt_bboxes)[item_id][label].push_back(bbox); + } +} + +// Explicit initialization. 
+template void GetGroundTruth(const float* gt_data, const int num_gt, + const int background_label_id, const bool use_difficult_gt, + map* all_gt_bboxes); +template void GetGroundTruth(const double* gt_data, const int num_gt, + const int background_label_id, const bool use_difficult_gt, + map* all_gt_bboxes); +template void GetGroundTruth(const float16* gt_data, const int num_gt, + const int background_label_id, const bool use_difficult_gt, + map* all_gt_bboxes); +template +void GetLocPredictions(const Dtype* loc_data, const int num, + const int num_preds_per_class, const int num_loc_classes, + const bool share_location, vector* loc_preds) { + loc_preds->clear(); + if (share_location) { + CHECK_EQ(num_loc_classes, 1); + } + loc_preds->resize(num); + for (int i = 0; i < num; ++i) { + LabelBBox& label_bbox = (*loc_preds)[i]; + for (int p = 0; p < num_preds_per_class; ++p) { + int start_idx = p * num_loc_classes * 4; + for (int c = 0; c < num_loc_classes; ++c) { + int label = share_location ? -1 : c; + if (label_bbox.find(label) == label_bbox.end()) { + label_bbox[label].resize(num_preds_per_class); + } + label_bbox[label][p].set_xmin(loc_data[start_idx + c * 4]); + label_bbox[label][p].set_ymin(loc_data[start_idx + c * 4 + 1]); + label_bbox[label][p].set_xmax(loc_data[start_idx + c * 4 + 2]); + label_bbox[label][p].set_ymax(loc_data[start_idx + c * 4 + 3]); + } + } + loc_data += num_preds_per_class * num_loc_classes * 4; + } +} + +// Explicit initialization. 
+template void GetLocPredictions(const float* loc_data, const int num, + const int num_preds_per_class, const int num_loc_classes, + const bool share_location, vector* loc_preds); +template void GetLocPredictions(const double* loc_data, const int num, + const int num_preds_per_class, const int num_loc_classes, + const bool share_location, vector* loc_preds); +template void GetLocPredictions(const float16* loc_data, const int num, + const int num_preds_per_class, const int num_loc_classes, + const bool share_location, vector* loc_preds); +template +void EncodeLocPrediction(const vector& all_loc_preds, + const map >& all_gt_bboxes, + const vector > >& all_match_indices, + const vector& prior_bboxes, + const vector >& prior_variances, + const MultiBoxLossParameter& multibox_loss_param, + Dtype* loc_pred_data, Dtype* loc_gt_data) { + int num = all_loc_preds.size(); + // CHECK_EQ(num, all_match_indices.size()); + // Get parameters. + const CodeType code_type = multibox_loss_param.code_type(); + const bool encode_variance_in_target = + multibox_loss_param.encode_variance_in_target(); + const bool bp_inside = multibox_loss_param.bp_inside(); + const bool use_prior_for_matching = + multibox_loss_param.use_prior_for_matching(); + int count = 0; + for (int i = 0; i < num; ++i) { + for (map >::const_iterator + it = all_match_indices[i].begin(); + it != all_match_indices[i].end(); ++it) { + const int label = it->first; + const vector& match_index = it->second; + CHECK(all_loc_preds[i].find(label) != all_loc_preds[i].end()); + const vector& loc_pred = + all_loc_preds[i].find(label)->second; + for (int j = 0; j < match_index.size(); ++j) { + if (match_index[j] <= -1) { + continue; + } + // Store encoded ground truth. 
+ const int gt_idx = match_index[j]; + CHECK(all_gt_bboxes.find(i) != all_gt_bboxes.end()); + CHECK_LT(gt_idx, all_gt_bboxes.find(i)->second.size()); + const NormalizedBBox& gt_bbox = all_gt_bboxes.find(i)->second[gt_idx]; + NormalizedBBox gt_encode; + CHECK_LT(j, prior_bboxes.size()); + EncodeBBox(prior_bboxes[j], prior_variances[j], code_type, + encode_variance_in_target, gt_bbox, >_encode); + loc_gt_data[count * 4] = gt_encode.xmin(); + loc_gt_data[count * 4 + 1] = gt_encode.ymin(); + loc_gt_data[count * 4 + 2] = gt_encode.xmax(); + loc_gt_data[count * 4 + 3] = gt_encode.ymax(); + // Store location prediction. + CHECK_LT(j, loc_pred.size()); + if (bp_inside) { + NormalizedBBox match_bbox = prior_bboxes[j]; + if (!use_prior_for_matching) { + const bool clip_bbox = false; + DecodeBBox(prior_bboxes[j], prior_variances[j], code_type, + encode_variance_in_target, clip_bbox, loc_pred[j], + &match_bbox); + } + // When a dimension of match_bbox is outside of image region, use + // gt_encode to simulate zero gradient. + loc_pred_data[count * 4] = + (match_bbox.xmin() < 0 || match_bbox.xmin() > 1) ? + gt_encode.xmin() : loc_pred[j].xmin(); + loc_pred_data[count * 4 + 1] = + (match_bbox.ymin() < 0 || match_bbox.ymin() > 1) ? + gt_encode.ymin() : loc_pred[j].ymin(); + loc_pred_data[count * 4 + 2] = + (match_bbox.xmax() < 0 || match_bbox.xmax() > 1) ? + gt_encode.xmax() : loc_pred[j].xmax(); + loc_pred_data[count * 4 + 3] = + (match_bbox.ymax() < 0 || match_bbox.ymax() > 1) ? 
+ gt_encode.ymax() : loc_pred[j].ymax(); + } else { + loc_pred_data[count * 4] = loc_pred[j].xmin(); + loc_pred_data[count * 4 + 1] = loc_pred[j].ymin(); + loc_pred_data[count * 4 + 2] = loc_pred[j].xmax(); + loc_pred_data[count * 4 + 3] = loc_pred[j].ymax(); + } + if (encode_variance_in_target) { + for (int k = 0; k < 4; ++k) { + CHECK_GT(prior_variances[j][k], 0); + loc_pred_data[count * 4 + k] /= prior_variances[j][k]; + loc_gt_data[count * 4 + k] /= prior_variances[j][k]; + } + } + ++count; + } + } + } +} + +// Explicit initialization. +template void EncodeLocPrediction(const vector& all_loc_preds, + const map >& all_gt_bboxes, + const vector > >& all_match_indices, + const vector& prior_bboxes, + const vector >& prior_variances, + const MultiBoxLossParameter& multibox_loss_param, + float* loc_pred_data, float* loc_gt_data); +template void EncodeLocPrediction(const vector& all_loc_preds, + const map >& all_gt_bboxes, + const vector > >& all_match_indices, + const vector& prior_bboxes, + const vector >& prior_variances, + const MultiBoxLossParameter& multibox_loss_param, + double* loc_pred_data, double* loc_gt_data); +template void EncodeLocPrediction(const vector& all_loc_preds, + const map >& all_gt_bboxes, + const vector > >& all_match_indices, + const vector& prior_bboxes, + const vector >& prior_variances, + const MultiBoxLossParameter& multibox_loss_param, + float16* loc_pred_data, float16* loc_gt_data); +template +void ComputeLocLoss(const TBlob& loc_pred, const TBlob& loc_gt, + const vector > >& all_match_indices, + const int num, const int num_priors, const LocLossType loc_loss_type, + vector >* all_loc_loss) { + int loc_count = loc_pred.count(); + CHECK_EQ(loc_count, loc_gt.count()); + TBlob diff; + const Dtype* diff_data = NULL; + if (loc_count != 0) { + diff.Reshape(loc_pred.shape()); + caffe_sub(loc_count, loc_pred.cpu_data(), loc_gt.cpu_data(), + diff.mutable_cpu_data()); + diff_data = diff.cpu_data(); + } + int count = 0; + for (int i = 0; i < 
num; ++i) { + vector loc_loss(num_priors, 0.f); + for (map >::const_iterator + it = all_match_indices[i].begin(); + it != all_match_indices[i].end(); ++it) { + const vector& match_index = it->second; + CHECK_EQ(num_priors, match_index.size()); + for (int j = 0; j < match_index.size(); ++j) { + if (match_index[j] <= -1) { + continue; + } + Dtype loss = 0; + for (int k = 0; k < 4; ++k) { + Dtype val = diff_data[count * 4 + k]; + if (loc_loss_type == MultiBoxLossParameter_LocLossType_SMOOTH_L1) { + Dtype abs_val = fabs(val); + if (abs_val < 1.) { + loss += 0.5 * val * val; + } else { + loss += abs_val - 0.5; + } + } else if (loc_loss_type == MultiBoxLossParameter_LocLossType_L2) { + loss += 0.5 * val * val; + } else { + LOG(FATAL) << "Unknown loc loss type."; + } + } + loc_loss[j] = loss; + ++count; + } + } + all_loc_loss->push_back(loc_loss); + } +} + +// Explicit initialization. +template void ComputeLocLoss(const TBlob& loc_pred, + const TBlob& loc_gt, + const vector > >& all_match_indices, + const int num, const int num_priors, const LocLossType loc_loss_type, + vector >* all_loc_loss); +template void ComputeLocLoss(const TBlob& loc_pred, + const TBlob& loc_gt, + const vector > >& all_match_indices, + const int num, const int num_priors, const LocLossType loc_loss_type, + vector >* all_loc_loss); +template void ComputeLocLoss(const TBlob& loc_pred, + const TBlob& loc_gt, + const vector > >& all_match_indices, + const int num, const int num_priors, const LocLossType loc_loss_type, + vector >* all_loc_loss); +template +void GetConfidenceScores(const Dtype* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + vector > >* conf_preds) { + conf_preds->clear(); + conf_preds->resize(num); + for (int i = 0; i < num; ++i) { + map >& label_scores = (*conf_preds)[i]; + for (int p = 0; p < num_preds_per_class; ++p) { + int start_idx = p * num_classes; + for (int c = 0; c < num_classes; ++c) { + label_scores[c].push_back(conf_data[start_idx + c]); 
+ } + } + conf_data += num_preds_per_class * num_classes; + } +} + +// Explicit initialization. +template void GetConfidenceScores(const float* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + vector > >* conf_preds); +template void GetConfidenceScores(const double* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + vector > >* conf_preds); +template void GetConfidenceScores(const float16* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + vector > >* conf_preds); +template +void GetConfidenceScores(const Dtype* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const bool class_major, vector > >* conf_preds) { + conf_preds->clear(); + conf_preds->resize(num); + for (int i = 0; i < num; ++i) { + map >& label_scores = (*conf_preds)[i]; + if (class_major) { + for (int c = 0; c < num_classes; ++c) { + label_scores[c].assign(conf_data, conf_data + num_preds_per_class); + conf_data += num_preds_per_class; + } + } else { + for (int p = 0; p < num_preds_per_class; ++p) { + int start_idx = p * num_classes; + for (int c = 0; c < num_classes; ++c) { + label_scores[c].push_back(conf_data[start_idx + c]); + } + } + conf_data += num_preds_per_class * num_classes; + } + } +} + +// Explicit initialization. 
+template void GetConfidenceScores(const float* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const bool class_major, vector > >* conf_preds); +template void GetConfidenceScores(const double* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const bool class_major, vector > >* conf_preds); +template void GetConfidenceScores(const float16* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const bool class_major, vector > >* conf_preds); +template +void ComputeConfLoss(const Dtype* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const int background_label_id, const ConfLossType loss_type, + vector >* all_conf_loss) { + all_conf_loss->clear(); + for (int i = 0; i < num; ++i) { + vector conf_loss; + for (int p = 0; p < num_preds_per_class; ++p) { + int start_idx = p * num_classes; + int label = background_label_id; + Dtype loss = 0; + if (loss_type == MultiBoxLossParameter_ConfLossType_SOFTMAX) { + CHECK_GE(label, 0); + CHECK_LT(label, num_classes); + // Compute softmax probability. + // We need to subtract the max to avoid numerical issues. 
+ Dtype maxval = -FLT_MAX; + for (int c = 0; c < num_classes; ++c) { + maxval = std::max(conf_data[start_idx + c], maxval); + } + Dtype sum = 0.; + for (int c = 0; c < num_classes; ++c) { + sum += std::exp(conf_data[start_idx + c] - maxval); + } + Dtype prob = std::exp(conf_data[start_idx + label] - maxval) / sum; + loss = -log(std::max(prob, Dtype(FLT_MIN))); + } else if (loss_type == MultiBoxLossParameter_ConfLossType_LOGISTIC) { + int target = 0; + for (int c = 0; c < num_classes; ++c) { + if (c == label) { + target = 1; + } else { + target = 0; + } + Dtype input = conf_data[start_idx + c]; + loss -= input * (target - (input >= 0)) - + log(1 + exp(input - 2 * input * (input >= 0))); + } + } else { + LOG(FATAL) << "Unknown conf loss type."; + } + conf_loss.push_back(loss); + } + conf_data += num_preds_per_class * num_classes; + all_conf_loss->push_back(conf_loss); + } +} + +// Explicit initialization. +template void ComputeConfLoss(const float* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const int background_label_id, const ConfLossType loss_type, + vector >* all_conf_loss); +template void ComputeConfLoss(const double* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const int background_label_id, const ConfLossType loss_type, + vector >* all_conf_loss); +template void ComputeConfLoss(const float16* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const int background_label_id, const ConfLossType loss_type, + vector >* all_conf_loss); +template +void ComputeConfLoss(const Dtype* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const int background_label_id, const ConfLossType loss_type, + const vector > >& all_match_indices, + const map >& all_gt_bboxes, + vector >* all_conf_loss) { + CHECK_LT(background_label_id, num_classes); + // CHECK_EQ(num, all_match_indices.size()); + all_conf_loss->clear(); + for (int i = 0; i < 
num; ++i) { + vector conf_loss; + const map >& match_indices = all_match_indices[i]; + for (int p = 0; p < num_preds_per_class; ++p) { + int start_idx = p * num_classes; + // Get the label index. + int label = background_label_id; + for (map >::const_iterator it = + match_indices.begin(); it != match_indices.end(); ++it) { + const vector& match_index = it->second; + CHECK_EQ(match_index.size(), num_preds_per_class); + if (match_index[p] > -1) { + CHECK(all_gt_bboxes.find(i) != all_gt_bboxes.end()); + const vector& gt_bboxes = + all_gt_bboxes.find(i)->second; + CHECK_LT(match_index[p], gt_bboxes.size()); + label = gt_bboxes[match_index[p]].label(); + CHECK_GE(label, 0); + CHECK_NE(label, background_label_id); + CHECK_LT(label, num_classes); + // A prior can only be matched to one gt bbox. + break; + } + } + Dtype loss = 0; + if (loss_type == MultiBoxLossParameter_ConfLossType_SOFTMAX) { + CHECK_GE(label, 0); + CHECK_LT(label, num_classes); + // Compute softmax probability. + // We need to subtract the max to avoid numerical issues. + Dtype maxval = conf_data[start_idx]; + for (int c = 1; c < num_classes; ++c) { + maxval = std::max(conf_data[start_idx + c], maxval); + } + Dtype sum = 0.; + for (int c = 0; c < num_classes; ++c) { + sum += std::exp(conf_data[start_idx + c] - maxval); + } + Dtype prob = std::exp(conf_data[start_idx + label] - maxval) / sum; + loss = -log(std::max(prob, Dtype(FLT_MIN))); + } else if (loss_type == MultiBoxLossParameter_ConfLossType_LOGISTIC) { + int target = 0; + for (int c = 0; c < num_classes; ++c) { + if (c == label) { + target = 1; + } else { + target = 0; + } + Dtype input = conf_data[start_idx + c]; + loss -= input * (target - (input >= 0)) - + log(1 + exp(input - 2 * input * (input >= 0))); + } + } else { + LOG(FATAL) << "Unknown conf loss type."; + } + conf_loss.push_back(loss); + } + conf_data += num_preds_per_class * num_classes; + all_conf_loss->push_back(conf_loss); + } +} + +// Explicit initialization. 
+template void ComputeConfLoss(const float* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const int background_label_id, const ConfLossType loss_type, + const vector > >& all_match_indices, + const map >& all_gt_bboxes, + vector >* all_conf_loss); +template void ComputeConfLoss(const double* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const int background_label_id, const ConfLossType loss_type, + const vector > >& all_match_indices, + const map >& all_gt_bboxes, + vector >* all_conf_loss); +template void ComputeConfLoss(const float16* conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const int background_label_id, const ConfLossType loss_type, + const vector > >& all_match_indices, + const map >& all_gt_bboxes, + vector >* all_conf_loss); +template +void EncodeConfPrediction(const Dtype* conf_data, const int num, + const int num_priors, const MultiBoxLossParameter& multibox_loss_param, + const vector > >& all_match_indices, + const vector >& all_neg_indices, + const map >& all_gt_bboxes, + Dtype* conf_pred_data, Dtype* conf_gt_data) { + // CHECK_EQ(num, all_match_indices.size()); + // CHECK_EQ(num, all_neg_indices.size()); + // Retrieve parameters. 
+ CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes."; + const int num_classes = multibox_loss_param.num_classes(); + CHECK_GE(num_classes, 1) << "num_classes should not be less than 1."; + const int background_label_id = multibox_loss_param.background_label_id(); + const bool map_object_to_agnostic = + multibox_loss_param.map_object_to_agnostic(); + if (map_object_to_agnostic) { + if (background_label_id >= 0) { + CHECK_EQ(num_classes, 2); + } else { + CHECK_EQ(num_classes, 1); + } + } + const MiningType mining_type = multibox_loss_param.mining_type(); + bool do_neg_mining; + if (multibox_loss_param.has_do_neg_mining()) { + LOG(WARNING) << "do_neg_mining is deprecated, use mining_type instead."; + do_neg_mining = multibox_loss_param.do_neg_mining(); + CHECK_EQ(do_neg_mining, + mining_type != MultiBoxLossParameter_MiningType_NONE); + } + do_neg_mining = mining_type != MultiBoxLossParameter_MiningType_NONE; + const ConfLossType conf_loss_type = multibox_loss_param.conf_loss_type(); + int count = 0; + for (int i = 0; i < num; ++i) { + if (all_gt_bboxes.find(i) != all_gt_bboxes.end()) { + // Save matched (positive) bboxes scores and labels. + const map >& match_indices = all_match_indices[i]; + for (map >::const_iterator it = + match_indices.begin(); it != match_indices.end(); ++it) { + const vector& match_index = it->second; + CHECK_EQ(match_index.size(), num_priors); + for (int j = 0; j < num_priors; ++j) { + if (match_index[j] <= -1) { + continue; + } + const int gt_label = map_object_to_agnostic ? + background_label_id + 1 : + all_gt_bboxes.find(i)->second[match_index[j]].label(); + int idx = do_neg_mining ? 
count : j; + switch (conf_loss_type) { + case MultiBoxLossParameter_ConfLossType_SOFTMAX: + conf_gt_data[idx] = gt_label; + break; + case MultiBoxLossParameter_ConfLossType_LOGISTIC: + conf_gt_data[idx * num_classes + gt_label] = 1; + break; + default: + LOG(FATAL) << "Unknown conf loss type."; + } + if (do_neg_mining) { + // Copy scores for matched bboxes. + caffe_copy(num_classes, conf_data + j * num_classes, + conf_pred_data + count * num_classes); + ++count; + } + } + } + // Go to next image. + if (do_neg_mining) { + // Save negative bboxes scores and labels. + for (int n = 0; n < all_neg_indices[i].size(); ++n) { + int j = all_neg_indices[i][n]; + CHECK_LT(j, num_priors); + caffe_copy(num_classes, conf_data + j * num_classes, + conf_pred_data + count * num_classes); + switch (conf_loss_type) { + case MultiBoxLossParameter_ConfLossType_SOFTMAX: + conf_gt_data[count] = background_label_id; + break; + case MultiBoxLossParameter_ConfLossType_LOGISTIC: + if (background_label_id >= 0 && + background_label_id < num_classes) { + conf_gt_data[count * num_classes + background_label_id] = 1; + } + break; + default: + LOG(FATAL) << "Unknown conf loss type."; + } + ++count; + } + } + } + if (do_neg_mining) { + conf_data += num_priors * num_classes; + } else { + conf_gt_data += num_priors; + } + } +} + +// Explicite initialization. 
+template void EncodeConfPrediction(const float* conf_data, const int num, + const int num_priors, const MultiBoxLossParameter& multibox_loss_param, + const vector > >& all_match_indices, + const vector >& all_neg_indices, + const map >& all_gt_bboxes, + float* conf_pred_data, float* conf_gt_data); +template void EncodeConfPrediction(const double* conf_data, const int num, + const int num_priors, const MultiBoxLossParameter& multibox_loss_param, + const vector > >& all_match_indices, + const vector >& all_neg_indices, + const map >& all_gt_bboxes, + double* conf_pred_data, double* conf_gt_data); +template void EncodeConfPrediction(const float16* conf_data, const int num, + const int num_priors, const MultiBoxLossParameter& multibox_loss_param, + const vector > >& all_match_indices, + const vector >& all_neg_indices, + const map >& all_gt_bboxes, + float16* conf_pred_data, float16* conf_gt_data); +template +void GetPriorBBoxes(const Dtype* prior_data, const int num_priors, + vector* prior_bboxes, + vector >* prior_variances) { + prior_bboxes->clear(); + prior_variances->clear(); + for (int i = 0; i < num_priors; ++i) { + int start_idx = i * 4; + NormalizedBBox bbox; + bbox.set_xmin(prior_data[start_idx]); + bbox.set_ymin(prior_data[start_idx + 1]); + bbox.set_xmax(prior_data[start_idx + 2]); + bbox.set_ymax(prior_data[start_idx + 3]); + float bbox_size = BBoxSize(bbox); + bbox.set_size(bbox_size); + prior_bboxes->push_back(bbox); + } + + for (int i = 0; i < num_priors; ++i) { + int start_idx = (num_priors + i) * 4; + vector var; + for (int j = 0; j < 4; ++j) { + var.push_back(prior_data[start_idx + j]); + } + prior_variances->push_back(var); + } +} + +// Explicit initialization. 
+template void GetPriorBBoxes(const float* prior_data, const int num_priors, + vector* prior_bboxes, + vector >* prior_variances); +template void GetPriorBBoxes(const double* prior_data, const int num_priors, + vector* prior_bboxes, + vector >* prior_variances); +template void GetPriorBBoxes(const float16* prior_data, const int num_priors, + vector* prior_bboxes, + vector >* prior_variances); +template +void GetDetectionResults(const Dtype* det_data, const int num_det, + const int background_label_id, + map > >* all_detections) { + all_detections->clear(); + for (int i = 0; i < num_det; ++i) { + int start_idx = i * 7; + int item_id = det_data[start_idx]; + if (item_id == -1) { + continue; + } + int label = det_data[start_idx + 1]; + CHECK_NE(background_label_id, label) + << "Found background label in the detection results."; + NormalizedBBox bbox; + bbox.set_score(det_data[start_idx + 2]); + bbox.set_xmin(det_data[start_idx + 3]); + bbox.set_ymin(det_data[start_idx + 4]); + bbox.set_xmax(det_data[start_idx + 5]); + bbox.set_ymax(det_data[start_idx + 6]); + float bbox_size = BBoxSize(bbox); + bbox.set_size(bbox_size); + (*all_detections)[item_id][label].push_back(bbox); + } +} + +// Explicit initialization. +template void GetDetectionResults(const float* det_data, const int num_det, + const int background_label_id, + map > >* all_detections); +template void GetDetectionResults(const double* det_data, const int num_det, + const int background_label_id, + map > >* all_detections); +template void GetDetectionResults(const float16* det_data, const int num_det, + const int background_label_id, + map > >* all_detections); +void GetTopKScoreIndex(const vector& scores, const vector& indices, + const int top_k, vector >* score_index_vec) { + CHECK_EQ(scores.size(), indices.size()); + + // Generate index score pairs. 
+ for (int i = 0; i < scores.size(); ++i) { + score_index_vec->push_back(std::make_pair(scores[i], indices[i])); + } + + // Sort the score pair according to the scores in descending order + std::stable_sort(score_index_vec->begin(), score_index_vec->end(), + SortScorePairDescend); + + // Keep top_k scores if needed. + if (top_k > -1 && top_k < score_index_vec->size()) { + score_index_vec->resize(top_k); + } +} + +void GetMaxScoreIndex(const vector& scores, const float threshold, + const int top_k, vector >* score_index_vec) { + // Generate index score pairs. + for (int i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + score_index_vec->push_back(std::make_pair(scores[i], i)); + } + } + + // Sort the score pair according to the scores in descending order + std::stable_sort(score_index_vec->begin(), score_index_vec->end(), + SortScorePairDescend); + + // Keep top_k scores if needed. + if (top_k > -1 && top_k < score_index_vec->size()) { + score_index_vec->resize(top_k); + } +} + +template +void GetMaxScoreIndex(const Dtype* scores, const int num, const float threshold, + const int top_k, vector >* score_index_vec) { + // Generate index score pairs. + for (int i = 0; i < num; ++i) { + if (scores[i] > threshold) { + score_index_vec->emplace_back(std::make_pair(scores[i], i)); + } + } + + // Sort the score pair according to the scores in descending order + std::partial_sort( + score_index_vec->begin(), score_index_vec->begin() + + std::min(score_index_vec->size(), (size_t)top_k), + score_index_vec->end(), SortScorePairDescend); + + // Keep top_k scores if needed. 
+ if (top_k > -1 && top_k < score_index_vec->size()) { + score_index_vec->resize(top_k); + } +} + +template +void GetMaxScoreIndex(const float* scores, const int num, const float threshold, + const int top_k, vector >* score_index_vec); +template +void GetMaxScoreIndex(const double* scores, const int num, + const float threshold, const int top_k, + vector >* score_index_vec); +template +void GetMaxScoreIndex(const float16* scores, const int num, + const float threshold, const int top_k, + vector >* score_index_vec); +void ApplyNMS(const vector& bboxes, const vector& scores, + const float threshold, const int top_k, const bool reuse_overlaps, + map >* overlaps, vector* indices) { + // Sanity check. + CHECK_EQ(bboxes.size(), scores.size()) + << "bboxes and scores have different size."; + + // Get top_k scores (with corresponding indices). + vector idx(boost::counting_iterator(0), + boost::counting_iterator(scores.size())); + vector > score_index_vec; + GetTopKScoreIndex(scores, idx, top_k, &score_index_vec); + + // Do nms. + indices->clear(); + while (score_index_vec.size() != 0) { + // Get the current highest score box. + int best_idx = score_index_vec.front().second; + const NormalizedBBox& best_bbox = bboxes[best_idx]; + if (BBoxSize(best_bbox) < 1e-5) { + // Erase small box. + score_index_vec.erase(score_index_vec.begin()); + continue; + } + indices->push_back(best_idx); + // Erase the best box. + score_index_vec.erase(score_index_vec.begin()); + + if (top_k > -1 && indices->size() >= top_k) { + // Stop if finding enough bboxes for nms. + break; + } + + // Compute overlap between best_bbox and other remaining bboxes. + // Remove a bbox if the overlap with best_bbox is larger than nms_threshold. + for (vector >::iterator it = score_index_vec.begin(); + it != score_index_vec.end(); ) { + int cur_idx = it->second; + const NormalizedBBox& cur_bbox = bboxes[cur_idx]; + if (BBoxSize(cur_bbox) < 1e-5) { + // Erase small box. 
+ it = score_index_vec.erase(it); + continue; + } + float cur_overlap = 0.; + if (reuse_overlaps) { + if (overlaps->find(best_idx) != overlaps->end() && + overlaps->find(best_idx)->second.find(cur_idx) != + (*overlaps)[best_idx].end()) { + // Use the computed overlap. + cur_overlap = (*overlaps)[best_idx][cur_idx]; + } else if (overlaps->find(cur_idx) != overlaps->end() && + overlaps->find(cur_idx)->second.find(best_idx) != + (*overlaps)[cur_idx].end()) { + // Use the computed overlap. + cur_overlap = (*overlaps)[cur_idx][best_idx]; + } else { + cur_overlap = JaccardOverlap(best_bbox, cur_bbox); + // Store the overlap for future use. + (*overlaps)[best_idx][cur_idx] = cur_overlap; + } + } else { + cur_overlap = JaccardOverlap(best_bbox, cur_bbox); + } + + // Remove it if necessary + if (cur_overlap > threshold) { + it = score_index_vec.erase(it); + } else { + ++it; + } + } + } +} + +void ApplyNMS(const vector& bboxes, const vector& scores, + const float threshold, const int top_k, vector* indices) { + bool reuse_overlap = false; + map > overlaps; + ApplyNMS(bboxes, scores, threshold, top_k, reuse_overlap, &overlaps, indices); +} + +void ApplyNMS(const bool* overlapped, const int num, vector* indices) { + vector index_vec(boost::counting_iterator(0), + boost::counting_iterator(num)); + // Do nms. + indices->clear(); + while (index_vec.size() != 0) { + // Get the current highest score box. + int best_idx = index_vec.front(); + indices->push_back(best_idx); + // Erase the best box. + index_vec.erase(index_vec.begin()); + + for (vector::iterator it = index_vec.begin(); it != index_vec.end();) { + int cur_idx = *it; + + // Remove it if necessary + if (overlapped[best_idx * num + cur_idx]) { + it = index_vec.erase(it); + } else { + ++it; + } + } + } +} + +inline int clamp(const int v, const int a, const int b) { + return v < a ? a : v > b ? 
b : v; +} + +void ApplyNMSFast(const vector& bboxes, + const vector& scores, const float score_threshold, + const float nms_threshold, const float eta, const int top_k, + vector* indices) { + // Sanity check. + CHECK_EQ(bboxes.size(), scores.size()) + << "bboxes and scores have different size."; + + // Get top_k scores (with corresponding indices). + vector > score_index_vec; + GetMaxScoreIndex(scores, score_threshold, top_k, &score_index_vec); + + // Do nms. + float adaptive_threshold = nms_threshold; + indices->clear(); + while (score_index_vec.size() != 0) { + const int idx = score_index_vec.front().second; + bool keep = true; + for (int k = 0; k < indices->size(); ++k) { + if (keep) { + const int kept_idx = (*indices)[k]; + float overlap = JaccardOverlap(bboxes[idx], bboxes[kept_idx]); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + indices->push_back(idx); + } + score_index_vec.erase(score_index_vec.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } +} + +template +void ApplyNMSFast(const Dtype* bboxes, const Dtype* scores, const int num, + const float score_threshold, const float nms_threshold, + const float eta, const int top_k, vector* indices) { + // Get top_k scores (with corresponding indices). + vector > score_index_vec; + GetMaxScoreIndex(scores, num, score_threshold, top_k, &score_index_vec); + + // Do nms. 
+ float adaptive_threshold = nms_threshold; + indices->clear(); + auto it = score_index_vec.begin(); + while (it != score_index_vec.end()) { + const int idx = it->second; + bool keep = true; + for (int k = 0; k < indices->size(); ++k) { + if (keep) { + const int kept_idx = (*indices)[k]; + float overlap = JaccardOverlap(bboxes + idx * 4, bboxes + kept_idx * 4); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + indices->push_back(idx); + } + ++it; + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } +} + +template +void ApplyNMSFast(const float* bboxes, const float* scores, const int num, + const float score_threshold, const float nms_threshold, + const float eta, const int top_k, vector* indices); +template +void ApplyNMSFast(const double* bboxes, const double* scores, const int num, + const float score_threshold, const float nms_threshold, + const float eta, const int top_k, vector* indices); +template +void ApplyNMSFast(const float16* bboxes, const float16* scores, const int num, + const float score_threshold, const float nms_threshold, + const float eta, const int top_k, vector* indices); +void CumSum(const vector >& pairs, vector* cumsum) { + // Sort the pairs based on first item of the pair. + vector > sort_pairs = pairs; + std::stable_sort(sort_pairs.begin(), sort_pairs.end(), + SortScorePairDescend); + + cumsum->clear(); + for (int i = 0; i < sort_pairs.size(); ++i) { + if (i == 0) { + cumsum->push_back(sort_pairs[i].second); + } else { + cumsum->push_back(cumsum->back() + sort_pairs[i].second); + } + } +} + +void ComputeAP(const vector >& tp, const int num_pos, + const vector >& fp, const string ap_version, + vector* prec, vector* rec, float* ap) { + const float eps = 1e-6; + CHECK_EQ(tp.size(), fp.size()) << "tp must have same size as fp."; + const int num = tp.size(); + // Make sure that tp and fp have complement value. 
+ for (int i = 0; i < num; ++i) { + CHECK_LE(fabs(tp[i].first - fp[i].first), eps); + CHECK_EQ(tp[i].second, 1 - fp[i].second); + } + prec->clear(); + rec->clear(); + *ap = 0; + if (tp.size() == 0 || num_pos == 0) { + return; + } + + // Compute cumsum of tp. + vector tp_cumsum; + CumSum(tp, &tp_cumsum); + CHECK_EQ(tp_cumsum.size(), num); + + // Compute cumsum of fp. + vector fp_cumsum; + CumSum(fp, &fp_cumsum); + CHECK_EQ(fp_cumsum.size(), num); + + // Compute precision. + for (int i = 0; i < num; ++i) { + prec->push_back(static_cast(tp_cumsum[i]) / + (tp_cumsum[i] + fp_cumsum[i])); + } + + // Compute recall. + for (int i = 0; i < num; ++i) { + CHECK_LE(tp_cumsum[i], num_pos); + rec->push_back(static_cast(tp_cumsum[i]) / num_pos); + } + + if (ap_version == "11point") { + // VOC2007 style for computing AP. + vector max_precs(11, 0.); + int start_idx = num - 1; + for (int j = 10; j >= 0; --j) { + for (int i = start_idx; i >= 0 ; --i) { + if ((*rec)[i] < j / 10.) { + start_idx = i; + if (j > 0) { + max_precs[j-1] = max_precs[j]; + } + break; + } else { + if (max_precs[j] < (*prec)[i]) { + max_precs[j] = (*prec)[i]; + } + } + } + } + for (int j = 10; j >= 0; --j) { + *ap += max_precs[j] / 11; + } + } else if (ap_version == "MaxIntegral") { + // VOC2012 or ILSVRC style for computing AP. + float cur_rec = rec->back(); + float cur_prec = prec->back(); + for (int i = num - 2; i >= 0; --i) { + cur_prec = std::max((*prec)[i], cur_prec); + if (fabs(cur_rec - (*rec)[i]) > eps) { + *ap += cur_prec * fabs(cur_rec - (*rec)[i]); + } + cur_rec = (*rec)[i]; + } + *ap += cur_rec * cur_prec; + } else if (ap_version == "Integral") { + // Natural integral. 
+ float prev_rec = 0.; + for (int i = 0; i < num; ++i) { + if (fabs((*rec)[i] - prev_rec) > eps) { + *ap += (*prec)[i] * fabs((*rec)[i] - prev_rec); + } + prev_rec = (*rec)[i]; + } + } else { + LOG(FATAL) << "Unknown ap_version: " << ap_version; + } +} + +cv::Scalar HSV2RGB(const float h, const float s, const float v) { + const int h_i = static_cast(h * 6); + const float f = h * 6 - h_i; + const float p = v * (1 - s); + const float q = v * (1 - f*s); + const float t = v * (1 - (1 - f) * s); + float r, g, b; + switch (h_i) { + case 0: + r = v; g = t; b = p; + break; + case 1: + r = q; g = v; b = p; + break; + case 2: + r = p; g = v; b = t; + break; + case 3: + r = p; g = q; b = v; + break; + case 4: + r = t; g = p; b = v; + break; + case 5: + r = v; g = p; b = q; + break; + default: + r = 1; g = 1; b = 1; + break; + } + return cv::Scalar(r * 255, g * 255, b * 255); +} + +// http://martin.ankerl.com/2009/12/09/how-to-create-random-colors-programmatically +vector GetColors(const int n) { + vector colors; + cv::RNG rng(12345); + const float golden_ratio_conjugate = 0.618033988749895; + const float s = 0.3; + const float v = 0.99; + for (int i = 0; i < n; ++i) { + const float h = std::fmod(rng.uniform(0.f, 1.f) + golden_ratio_conjugate, + 1.f); + colors.push_back(HSV2RGB(h, s, v)); + } + return colors; +} + +static clock_t start_clock = clock(); +static cv::VideoWriter cap_out; + +template +void VisualizeBBox(const vector& images, const Blob* detections, + const float threshold, const vector& colors, + const map& label_to_display_name, + const string& save_file) { + // Retrieve detections. + CHECK_EQ(detections->width(), 7); + const int num_det = detections->height(); + const int num_img = images.size(); + if (num_det == 0 || num_img == 0) { + return; + } + // Compute FPS. 
+ float fps = num_img / (static_cast(clock() - start_clock) / + CLOCKS_PER_SEC); + + const Dtype* detections_data = detections->cpu_data(); + const int width = images[0].cols; + const int height = images[0].rows; + vector all_detections(num_img); + for (int i = 0; i < num_det; ++i) { + const int img_idx = detections_data[i * 7]; + CHECK_LT(img_idx, num_img); + const int label = detections_data[i * 7 + 1]; + const float score = detections_data[i * 7 + 2]; + if (score < threshold) { + continue; + } + NormalizedBBox bbox; + bbox.set_xmin(detections_data[i * 7 + 3] * width); + bbox.set_ymin(detections_data[i * 7 + 4] * height); + bbox.set_xmax(detections_data[i * 7 + 5] * width); + bbox.set_ymax(detections_data[i * 7 + 6] * height); + bbox.set_score(score); + all_detections[img_idx][label].push_back(bbox); + } + + int fontface = cv::FONT_HERSHEY_SIMPLEX; + double scale = 1; + int thickness = 2; + int baseline = 0; + char buffer[50]; + for (int i = 0; i < num_img; ++i) { + cv::Mat image = images[i]; + // Show FPS. + snprintf(buffer, sizeof(buffer), "FPS: %.2f", fps); + cv::Size text = cv::getTextSize(buffer, fontface, scale, thickness, + &baseline); + cv::rectangle(image, cv::Point(0, 0), + cv::Point(text.width, text.height + baseline), + CV_RGB(255, 255, 255), CV_FILLED); + cv::putText(image, buffer, cv::Point(0, text.height + baseline / 2.), + fontface, scale, CV_RGB(0, 0, 0), thickness, 8); + // Draw bboxes. 
+ for (map >::iterator it = + all_detections[i].begin(); it != all_detections[i].end(); ++it) { + int label = it->first; + string label_name = "Unknown"; + if (label_to_display_name.find(label) != label_to_display_name.end()) { + label_name = label_to_display_name.find(label)->second; + } + CHECK_LT(label, colors.size()); + const cv::Scalar& color = colors[label]; + const vector& bboxes = it->second; + for (int j = 0; j < bboxes.size(); ++j) { + cv::Point top_left_pt(bboxes[j].xmin(), bboxes[j].ymin()); + cv::Point bottom_right_pt(bboxes[j].xmax(), bboxes[j].ymax()); + cv::rectangle(image, top_left_pt, bottom_right_pt, color, 4); + cv::Point bottom_left_pt(bboxes[j].xmin(), bboxes[j].ymax()); + snprintf(buffer, sizeof(buffer), "%s: %.2f", label_name.c_str(), + bboxes[j].score()); + cv::Size text = cv::getTextSize(buffer, fontface, scale, thickness, + &baseline); + cv::rectangle( + image, bottom_left_pt + cv::Point(0, 0), + bottom_left_pt + cv::Point(text.width, -text.height-baseline), + color, CV_FILLED); + cv::putText(image, buffer, bottom_left_pt - cv::Point(0, baseline), + fontface, scale, CV_RGB(0, 0, 0), thickness, 8); + } + } + // Save result if required. 
+ if (!save_file.empty()) { + if (!cap_out.isOpened()) { + cv::Size size(image.size().width, image.size().height); + cv::VideoWriter outputVideo(save_file, CV_FOURCC('D', 'I', 'V', 'X'), + 30, size, true); + cap_out = outputVideo; + } + cap_out.write(image); + } + cv::imshow("detections", image); + if (cv::waitKey(1) == 27) { + raise(SIGINT); + } + } + start_clock = clock(); +} + +template +void VisualizeBBox(const vector& images, + const Blob* detections, + const float threshold, const vector& colors, + const map& label_to_display_name, + const string& save_file); +template +void VisualizeBBox(const vector& images, + const Blob* detections, + const float threshold, const vector& colors, + const map& label_to_display_name, + const string& save_file); +template +void VisualizeBBox(const vector& images, + const Blob* detections, + const float threshold, const vector& colors, + const map& label_to_display_name, + const string& save_file); + +} // namespace caffe diff --git a/src/caffe/util/bbox_util.cu b/src/caffe/util/bbox_util.cu new file mode 100644 index 00000000000..5a4a6eb5807 --- /dev/null +++ b/src/caffe/util/bbox_util.cu @@ -0,0 +1,682 @@ +#include +#include +#include +#include + +#include "thrust/functional.h" +#include "thrust/sort.h" + +#include "caffe/common.hpp" +#include "caffe/util/bbox_util.hpp" + +namespace caffe { + +template +__host__ __device__ Dtype BBoxSizeGPU(const Dtype* bbox, + const bool normalized) { + if (bbox[2] < bbox[0] || bbox[3] < bbox[1]) { + // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0. + return Dtype(0.); + } else { + const Dtype width = bbox[2] - bbox[0]; + const Dtype height = bbox[3] - bbox[1]; + if (normalized) { + return width * height; + } else { + // If bbox is not within range [0, 1]. 
+ return (width + 1) * (height + 1); + } + } +} + +template __host__ __device__ float BBoxSizeGPU(const float* bbox, + const bool normalized); +template __host__ __device__ double BBoxSizeGPU(const double* bbox, + const bool normalized); + +template +__host__ __device__ Dtype JaccardOverlapGPU(const Dtype* bbox1, + const Dtype* bbox2) { + if (bbox2[0] > bbox1[2] || bbox2[2] < bbox1[0] || + bbox2[1] > bbox1[3] || bbox2[3] < bbox1[1]) { + return Dtype(0.); + } else { + const Dtype inter_xmin = max(bbox1[0], bbox2[0]); + const Dtype inter_ymin = max(bbox1[1], bbox2[1]); + const Dtype inter_xmax = min(bbox1[2], bbox2[2]); + const Dtype inter_ymax = min(bbox1[3], bbox2[3]); + + const Dtype inter_width = inter_xmax - inter_xmin; + const Dtype inter_height = inter_ymax - inter_ymin; + const Dtype inter_size = inter_width * inter_height; + + const Dtype bbox1_size = BBoxSizeGPU(bbox1); + const Dtype bbox2_size = BBoxSizeGPU(bbox2); + + return inter_size / (bbox1_size + bbox2_size - inter_size); + } +} + +template __host__ __device__ float JaccardOverlapGPU(const float* bbox1, + const float* bbox2); +template __host__ __device__ double JaccardOverlapGPU(const double* bbox1, + const double* bbox2); + +template +__device__ Dtype Min(const Dtype x, const Dtype y) { + return x < y ? x : y; +} + +template +__device__ Dtype Max(const Dtype x, const Dtype y) { + return x > y ? 
x : y; +} + +template +__device__ void ClipBBoxGPU(const Dtype* bbox, Dtype* clip_bbox) { + for (int i = 0; i < 4; ++i) { + clip_bbox[i] = Max(Min(bbox[i], Dtype(1.)), Dtype(0.)); + } +} + +template __device__ void ClipBBoxGPU(const float* bbox, float* clip_bbox); +template __device__ void ClipBBoxGPU(const double* bbox, double* clip_bbox); + +template +__global__ void DecodeBBoxesKernel(const int nthreads, + const Dtype* loc_data, const Dtype* prior_data, + const CodeType code_type, const bool variance_encoded_in_target, + const int num_priors, const bool share_location, + const int num_loc_classes, const int background_label_id, + const bool clip_bbox, Dtype* bbox_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int i = index % 4; + const int c = (index / 4) % num_loc_classes; + const int d = (index / 4 / num_loc_classes) % num_priors; + if (!share_location && c == background_label_id) { + // Ignore background class if not share_location. + return; + } + const int pi = d * 4; + const int vi = pi + num_priors * 4; + if (code_type == PriorBoxParameter_CodeType_CORNER) { + if (variance_encoded_in_target) { + // variance is encoded in target, we simply need to add the offset + // predictions. + bbox_data[index] = prior_data[pi + i] + loc_data[index]; + } else { + // variance is encoded in bbox, we need to scale the offset accordingly. 
+ bbox_data[index] = + prior_data[pi + i] + loc_data[index] * prior_data[vi + i]; + } + } else if (code_type == PriorBoxParameter_CodeType_CENTER_SIZE) { + const Dtype p_xmin = prior_data[pi]; + const Dtype p_ymin = prior_data[pi + 1]; + const Dtype p_xmax = prior_data[pi + 2]; + const Dtype p_ymax = prior_data[pi + 3]; + const Dtype prior_width = p_xmax - p_xmin; + const Dtype prior_height = p_ymax - p_ymin; + const Dtype prior_center_x = (p_xmin + p_xmax) / 2.; + const Dtype prior_center_y = (p_ymin + p_ymax) / 2.; + + const Dtype xmin = loc_data[index - i]; + const Dtype ymin = loc_data[index - i + 1]; + const Dtype xmax = loc_data[index - i + 2]; + const Dtype ymax = loc_data[index - i + 3]; + + Dtype decode_bbox_center_x, decode_bbox_center_y; + Dtype decode_bbox_width, decode_bbox_height; + if (variance_encoded_in_target) { + // variance is encoded in target, we simply need to restore the offset + // predictions. + decode_bbox_center_x = xmin * prior_width + prior_center_x; + decode_bbox_center_y = ymin * prior_height + prior_center_y; + decode_bbox_width = exp(xmax) * prior_width; + decode_bbox_height = exp(ymax) * prior_height; + } else { + // variance is encoded in bbox, we need to scale the offset accordingly. 
+ decode_bbox_center_x = + prior_data[vi] * xmin * prior_width + prior_center_x; + decode_bbox_center_y = + prior_data[vi + 1] * ymin * prior_height + prior_center_y; + decode_bbox_width = + exp(prior_data[vi + 2] * xmax) * prior_width; + decode_bbox_height = + exp(prior_data[vi + 3] * ymax) * prior_height; + } + + switch (i) { + case 0: + bbox_data[index] = decode_bbox_center_x - decode_bbox_width / 2.; + break; + case 1: + bbox_data[index] = decode_bbox_center_y - decode_bbox_height / 2.; + break; + case 2: + bbox_data[index] = decode_bbox_center_x + decode_bbox_width / 2.; + break; + case 3: + bbox_data[index] = decode_bbox_center_y + decode_bbox_height / 2.; + break; + } + } else if (code_type == PriorBoxParameter_CodeType_CORNER_SIZE) { + const Dtype p_xmin = prior_data[pi]; + const Dtype p_ymin = prior_data[pi + 1]; + const Dtype p_xmax = prior_data[pi + 2]; + const Dtype p_ymax = prior_data[pi + 3]; + const Dtype prior_width = p_xmax - p_xmin; + const Dtype prior_height = p_ymax - p_ymin; + Dtype p_size; + if (i == 0 || i == 2) { + p_size = prior_width; + } else { + p_size = prior_height; + } + if (variance_encoded_in_target) { + // variance is encoded in target, we simply need to add the offset + // predictions. + bbox_data[index] = prior_data[pi + i] + loc_data[index] * p_size; + } else { + // variance is encoded in bbox, we need to scale the offset accordingly. + bbox_data[index] = + prior_data[pi + i] + loc_data[index] * prior_data[vi + i] * p_size; + } + } else { + // Unknown code type. 
+ } + if (clip_bbox) { + bbox_data[index] = max(min(bbox_data[index], Dtype(1.)), Dtype(0.)); + } + } +} + +template +void DecodeBBoxesGPU(const int nthreads, + const Dtype* loc_data, const Dtype* prior_data, + const CodeType code_type, const bool variance_encoded_in_target, + const int num_priors, const bool share_location, + const int num_loc_classes, const int background_label_id, + const bool clip_bbox, Dtype* bbox_data) { + cudaStream_t stream = Caffe::thread_stream(); + // NOLINT_NEXT_LINE(whitespace/operators) + DecodeBBoxesKernel<<>> + (nthreads, loc_data, prior_data, code_type, variance_encoded_in_target, num_priors, + share_location, num_loc_classes, background_label_id, clip_bbox, bbox_data); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +template void DecodeBBoxesGPU(const int nthreads, + const float* loc_data, const float* prior_data, + const CodeType code_type, const bool variance_encoded_in_target, + const int num_priors, const bool share_location, + const int num_loc_classes, const int background_label_id, + const bool clip_bbox, float* bbox_data); +template void DecodeBBoxesGPU(const int nthreads, + const double* loc_data, const double* prior_data, + const CodeType code_type, const bool variance_encoded_in_target, + const int num_priors, const bool share_location, + const int num_loc_classes, const int background_label_id, + const bool clip_bbox, double* bbox_data); +template void DecodeBBoxesGPU(const int nthreads, + const float16* loc_data, const float16* prior_data, + const CodeType code_type, const bool variance_encoded_in_target, + const int num_priors, const bool share_location, + const int num_loc_classes, const int background_label_id, + const bool clip_bbox, float16* bbox_data); + +template +__global__ void PermuteDataKernel(const int nthreads, + const Dtype* data, const int num_classes, const int num_data, + const int num_dim, Dtype* new_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int i = index % 
num_dim; + const int c = (index / num_dim) % num_classes; + const int d = (index / num_dim / num_classes) % num_data; + const int n = index / num_dim / num_classes / num_data; + const int new_index = ((n * num_classes + c) * num_data + d) * num_dim + i; + new_data[new_index] = data[index]; + } +} + +template +void PermuteDataGPU(const int nthreads, + const Dtype* data, const int num_classes, const int num_data, + const int num_dim, Dtype* new_data) { + cudaStream_t stream = Caffe::thread_stream(); + // NOLINT_NEXT_LINE(whitespace/operators) + PermuteDataKernel<<>>(nthreads, data, num_classes, num_data, + num_dim, new_data); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +template void PermuteDataGPU(const int nthreads, + const float* data, const int num_classes, const int num_data, + const int num_dim, float* new_data); +template void PermuteDataGPU(const int nthreads, + const double* data, const int num_classes, const int num_data, + const int num_dim, double* new_data); +template void PermuteDataGPU(const int nthreads, + const float16* data, const int num_classes, const int num_data, + const int num_dim, float16* new_data); + +template +__global__ void kernel_channel_max(const int num, const int channels, + const int spatial_dim, const Dtype* data, Dtype* out) { + CUDA_KERNEL_LOOP(index, num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + Dtype maxval = -FLT_MAX; + for (int c = 0; c < channels; ++c) { + maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); + } + out[index] = maxval; + } +} + +template +__global__ void kernel_channel_subtract(const int count, + const int num, const int channels, + const int spatial_dim, const Dtype* channel_data, const Dtype* channel_max, + Dtype* data) { + CUDA_KERNEL_LOOP(index, count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] = channel_data[index] - channel_max[n * spatial_dim + s]; + } +} + +template 
+__global__ void kernel_exp(const int count, const Dtype* data, Dtype* out) { + CUDA_KERNEL_LOOP(index, count) { + out[index] = exp(data[index]); + } +} + +template +__global__ void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, const Dtype* data, Dtype* channel_sum) { + CUDA_KERNEL_LOOP(index, num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + Dtype sum = 0; + for (int c = 0; c < channels; ++c) { + sum += data[(n * channels + c) * spatial_dim + s]; + } + channel_sum[index] = sum; + } +} + +template +__global__ void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, const Dtype* channel_sum, Dtype* data) { + CUDA_KERNEL_LOOP(index, count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] /= channel_sum[n * spatial_dim + s]; + } +} + +template +void SoftMaxGPU(const Dtype* data, const int outer_num, + const int channels, const int inner_num, Dtype* prob) { + vector shape(4, 1); + shape[0] = outer_num; + shape[1] = channels; + shape[2] = inner_num; + TBlob scale(shape); + Dtype* scale_data = scale.mutable_gpu_data(); + int count = outer_num * channels * inner_num; + cudaStream_t stream = Caffe::thread_stream(); + // We need to subtract the max to avoid numerical issues, compute the exp, + // and then normalize. 
+ // compute max + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_max<<>>(outer_num, channels, inner_num, data, + scale_data); + // subtract + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_subtract<<>>(count, outer_num, channels, inner_num, + data, scale_data, prob); + // exponentiate + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_exp<<>>( + count, prob, prob); + // sum after exp + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_sum<<>>(outer_num, channels, inner_num, prob, + scale_data); + // divide + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_div<<>>(count, outer_num, channels, inner_num, + scale_data, prob); + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +template void SoftMaxGPU(const float* data, const int outer_num, + const int channels, const int inner_num, float* prob); +template void SoftMaxGPU(const double* data, const int outer_num, + const int channels, const int inner_num, double* prob); + +template +__global__ void ComputeOverlappedKernel(const int nthreads, + const Dtype* bbox_data, const int num_bboxes, const int num_classes, + const Dtype overlap_threshold, bool* overlapped_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int j = index % num_bboxes; + const int i = (index / num_bboxes) % num_bboxes; + if (i == j) { + // Ignore same bbox. + return; + } + const int c = (index / num_bboxes / num_bboxes) % num_classes; + const int n = index / num_bboxes / num_bboxes / num_classes; + // Compute overlap between i-th bbox and j-th bbox. 
+ const int start_loc_i = ((n * num_bboxes + i) * num_classes + c) * 4; + const int start_loc_j = ((n * num_bboxes + j) * num_classes + c) * 4; + const Dtype overlap = JaccardOverlapGPU(bbox_data + start_loc_i, + bbox_data + start_loc_j); + if (overlap > overlap_threshold) { + overlapped_data[index] = true; + } + } +} + +template +void ComputeOverlappedGPU(const int nthreads, + const Dtype* bbox_data, const int num_bboxes, const int num_classes, + const Dtype overlap_threshold, bool* overlapped_data) { + cudaStream_t stream = Caffe::thread_stream(); + // NOLINT_NEXT_LINE(whitespace/operators) + ComputeOverlappedKernel<<>>(nthreads, bbox_data, num_bboxes, num_classes, + overlap_threshold, overlapped_data); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +template void ComputeOverlappedGPU(const int nthreads, + const float* bbox_data, const int num_bboxes, const int num_classes, + const float overlap_threshold, bool* overlapped_data); +template void ComputeOverlappedGPU(const int nthreads, + const double* bbox_data, const int num_bboxes, const int num_classes, + const double overlap_threshold, bool* overlapped_data); + +template +__global__ void ComputeOverlappedByIdxKernel(const int nthreads, + const Dtype* bbox_data, const Dtype overlap_threshold, + const int* idx, const int num_idx, bool* overlapped_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int j = index % num_idx; + const int i = (index / num_idx); + if (i == j) { + // Ignore same bbox. + return; + } + // Compute overlap between i-th bbox and j-th bbox. 
+ const int start_loc_i = idx[i] * 4; + const int start_loc_j = idx[j] * 4; + const Dtype overlap = JaccardOverlapGPU(bbox_data + start_loc_i, + bbox_data + start_loc_j); + if (overlap > overlap_threshold) { + overlapped_data[index] = true; + } + } +} + +template +void ComputeOverlappedByIdxGPU(const int nthreads, + const Dtype* bbox_data, const Dtype overlap_threshold, + const int* idx, const int num_idx, bool* overlapped_data) { + cudaStream_t stream = Caffe::thread_stream(); + // NOLINT_NEXT_LINE(whitespace/operators) + ComputeOverlappedByIdxKernel<<>>(nthreads, bbox_data, overlap_threshold, + idx, num_idx, overlapped_data); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +template void ComputeOverlappedByIdxGPU(const int nthreads, + const float* bbox_data, const float overlap_threshold, + const int* idx, const int num_idx, bool* overlapped_data); +template void ComputeOverlappedByIdxGPU(const int nthreads, + const double* bbox_data, const double overlap_threshold, + const int* idx, const int num_idx, bool* overlapped_data); + +template +void ApplyNMSGPU(const Dtype* bbox_data, const Dtype* conf_data, + const int num_bboxes, const float confidence_threshold, + const int top_k, const float nms_threshold, vector* indices) { + // Keep part of detections whose scores are higher than confidence threshold. + vector idx; + vector confidences; + for (int i = 0; i < num_bboxes; ++i) { + if (conf_data[i] > confidence_threshold) { + idx.push_back(i); + confidences.push_back(conf_data[i]); + } + } + int num_remain = confidences.size(); + if (num_remain == 0) { + return; + } + // Sort detections based on score. + thrust::sort_by_key(&confidences[0], &confidences[0] + num_remain, &idx[0], + thrust::greater()); + if (top_k > -1 && top_k < num_remain) { + num_remain = top_k; + } + + // Compute overlap between remaining detections. 
+ TBlob idx_blob(1, 1, 1, num_remain); + int* idx_data = idx_blob.mutable_cpu_data(); + std::copy(idx.begin(), idx.begin() + num_remain, idx_data); + + TBlob overlapped(1, 1, num_remain, num_remain); + const int total_bboxes = overlapped.count(); + bool* overlapped_data = overlapped.mutable_gpu_data(); + ComputeOverlappedByIdxGPU(total_bboxes, bbox_data, nms_threshold, + idx_blob.gpu_data(), num_remain, overlapped_data); + + // Do non-maximum suppression based on overlapped results. + const bool* overlapped_results = overlapped.cpu_data(); + vector selected_indices; + ApplyNMS(overlapped_results, num_remain, &selected_indices); + + // Put back the selected information. + for (int i = 0; i < selected_indices.size(); ++i) { + indices->push_back(idx[selected_indices[i]]); + } +} + +template +void ApplyNMSGPU(const float* bbox_data, const float* conf_data, + const int num_bboxes, const float confidence_threshold, + const int top_k, const float nms_threshold, vector* indices); +template +void ApplyNMSGPU(const double* bbox_data, const double* conf_data, + const int num_bboxes, const float confidence_threshold, + const int top_k, const float nms_threshold, vector* indices); + +template +void ApplyNMSGPU(const float16* bbox_data, const float16* conf_data, + const int num_bboxes, const float confidence_threshold, + const int top_k, const float nms_threshold, vector* indices); + +template +__global__ void GetDetectionsKernel(const int nthreads, + const Dtype* bbox_data, const Dtype* conf_data, const int image_id, + const int label, const int* indices, const bool clip_bbox, + Dtype* detection_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int det_idx = indices[index]; + detection_data[index * 7] = image_id; + detection_data[index * 7 + 1] = label; + detection_data[index * 7 + 2] = conf_data[det_idx]; + if (clip_bbox) { + ClipBBoxGPU(&(bbox_data[det_idx * 4]), &(detection_data[index * 7 + 3])); + } else { + for (int i = 0; i < 4; ++i) { + detection_data[index * 7 + 3 + 
i] = bbox_data[det_idx * 4 + i]; + } + } + } +} + +template +void GetDetectionsGPU(const Dtype* bbox_data, const Dtype* conf_data, + const int image_id, const int label, const vector& indices, + const bool clip_bbox, TBlob* detection_blob) { + // Store selected indices in array. + int num_det = indices.size(); + if (num_det == 0) { + return; + } + TBlob idx_blob(1, 1, 1, num_det); + int* idx_data = idx_blob.mutable_cpu_data(); + std::copy(indices.begin(), indices.end(), idx_data); + // Prepare detection_blob. + detection_blob->Reshape(1, 1, num_det, 7); + Dtype* detection_data = detection_blob->mutable_gpu_data(); + cudaStream_t stream = Caffe::thread_stream(); + // NOLINT_NEXT_LINE(whitespace/operators) + GetDetectionsKernel<<>>(num_det, bbox_data, conf_data, image_id, label, + idx_blob.gpu_data(), clip_bbox, detection_data); + CUDA_POST_KERNEL_CHECK; + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +template void GetDetectionsGPU(const float* bbox_data, const float* conf_data, + const int image_id, const int label, const vector& indices, + const bool clip_bbox, TBlob* detection_blob); +template void GetDetectionsGPU(const double* bbox_data, const double* conf_data, + const int image_id, const int label, const vector& indices, + const bool clip_bbox, TBlob* detection_blob); + +template +__global__ void ComputeConfLossKernel(const int nthreads, + const Dtype* conf_data, const int num_preds_per_class, + const int num_classes, const ConfLossType loss_type, + const Dtype* match_data, Dtype* conf_loss_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + int label = match_data[index]; + int num = index / num_preds_per_class; + int p = index % num_preds_per_class; + int start_idx = (num * num_preds_per_class + p) * num_classes; + Dtype loss = 0; + if (loss_type == MultiBoxLossParameter_ConfLossType_SOFTMAX) { + // Compute softmax probability. 
+ Dtype prob = conf_data[start_idx + label]; + loss = -log(Max(prob, Dtype(FLT_MIN))); + } else if (loss_type == MultiBoxLossParameter_ConfLossType_LOGISTIC) { + int target = 0; + for (int c = 0; c < num_classes; ++c) { + if (c == label) { + target = 1; + } else { + target = 0; + } + Dtype input = conf_data[start_idx + c]; + loss -= input * (target - (input >= 0)) - + log(1 + exp(input - 2 * input * (input >= 0))); + } + } + conf_loss_data[index] = loss; + } +} + +template +void ComputeConfLossGPU(const TBlob& conf_blob, const int num, + const int num_preds_per_class, const int num_classes, + const int background_label_id, const ConfLossType loss_type, + const vector > >& all_match_indices, + const map >& all_gt_bboxes, + vector >* all_conf_loss) { + CHECK_LT(background_label_id, num_classes); + TBlob match_blob(num, num_preds_per_class, 1, 1); + Dtype* match_data = match_blob.mutable_cpu_data(); + for (int i = 0; i < num; ++i) { + const map >& match_indices = all_match_indices[i]; + for (int p = 0; p < num_preds_per_class; ++p) { + // Get the label index. + int label = background_label_id; + for (map >::const_iterator it = + match_indices.begin(); it != match_indices.end(); ++it) { + const vector& match_index = it->second; + CHECK_EQ(match_index.size(), num_preds_per_class); + if (match_index[p] > -1) { + CHECK(all_gt_bboxes.find(i) != all_gt_bboxes.end()); + const vector& gt_bboxes = + all_gt_bboxes.find(i)->second; + CHECK_LT(match_index[p], gt_bboxes.size()); + label = gt_bboxes[match_index[p]].label(); + CHECK_GE(label, 0); + CHECK_NE(label, background_label_id); + CHECK_LT(label, num_classes); + // A prior can only be matched to one gt bbox. + break; + } + } + match_data[i * num_preds_per_class + p] = label; + } + } + // Get probability data. 
+ const Dtype* conf_gpu_data = conf_blob.gpu_data(); + TBlob prob_blob; + prob_blob.ReshapeLike(conf_blob); + if (loss_type == MultiBoxLossParameter_ConfLossType_SOFTMAX) { + Dtype* prob_gpu_data = prob_blob.mutable_gpu_data(); + SoftMaxGPU(conf_blob.gpu_data(), num * num_preds_per_class, num_classes, 1, + prob_gpu_data); + conf_gpu_data = prob_blob.gpu_data(); + } + // Compute the loss. + TBlob conf_loss_blob(num, num_preds_per_class, 1, 1); + Dtype* conf_loss_gpu_data = conf_loss_blob.mutable_gpu_data(); + const int num_threads = num * num_preds_per_class; + cudaStream_t stream = Caffe::thread_stream(); + // NOLINT_NEXT_LINE(whitespace/operators) + ComputeConfLossKernel<<>>(num_threads, conf_gpu_data, num_preds_per_class, + num_classes, loss_type, match_blob.gpu_data(), conf_loss_gpu_data); + CUDA_CHECK(cudaStreamSynchronize(stream)); + // Save the loss. + all_conf_loss->clear(); + const Dtype* loss_data = conf_loss_blob.cpu_data(); + for (int i = 0; i < num; ++i) { + vector conf_loss(loss_data, loss_data + num_preds_per_class); + all_conf_loss->push_back(conf_loss); + loss_data += num_preds_per_class; + } +} + +// Explicit initialization. 
+template void ComputeConfLossGPU(const TBlob& conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const int background_label_id, const ConfLossType loss_type, + const vector > >& all_match_indices, + const map >& all_gt_bboxes, + vector >* all_conf_loss); +template void ComputeConfLossGPU(const TBlob& conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const int background_label_id, const ConfLossType loss_type, + const vector > >& all_match_indices, + const map >& all_gt_bboxes, + vector >* all_conf_loss); +template void ComputeConfLossGPU(const TBlob& conf_data, const int num, + const int num_preds_per_class, const int num_classes, + const int background_label_id, const ConfLossType loss_type, + const vector > >& all_match_indices, + const map >& all_gt_bboxes, + vector >* all_conf_loss); + +} // namespace caffe diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp index 04a4d106ce3..daa81345d94 100644 --- a/src/caffe/util/benchmark.cpp +++ b/src/caffe/util/benchmark.cpp @@ -8,7 +8,10 @@ namespace caffe { Timer::Timer() : initted_(false), running_(false), - has_run_at_least_once_(false) { + has_run_at_least_once_(false), + start_gpu_(nullptr), + stop_gpu_(nullptr), + device_(-1) { Init(); } @@ -31,6 +34,7 @@ Timer::~Timer() { void Timer::Start() { if (!running()) { if (Caffe::mode() == Caffe::GPU) { + CHECK_EQ(device_, Caffe::current_device()); CUDA_CHECK(cudaEventRecord(start_gpu_, 0)); } else { start_cpu_ = boost::posix_time::microsec_clock::local_time(); @@ -43,6 +47,7 @@ void Timer::Start() { void Timer::Stop() { if (running()) { if (Caffe::mode() == Caffe::GPU) { + CHECK_EQ(device_, Caffe::current_device()); CUDA_CHECK(cudaEventRecord(stop_gpu_, 0)); CUDA_CHECK(cudaEventSynchronize(stop_gpu_)); } else { @@ -52,7 +57,6 @@ void Timer::Stop() { } } - float Timer::MicroSeconds() { if (!has_run_at_least_once()) { LOG(WARNING) << "Timer has never been run before reading time."; @@ 
-62,6 +66,7 @@ float Timer::MicroSeconds() { Stop(); } if (Caffe::mode() == Caffe::GPU) { + CHECK_EQ(device_, Caffe::current_device()); CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_, stop_gpu_)); // Cuda only measure milliseconds elapsed_microseconds_ = elapsed_milliseconds_ * 1000; @@ -80,6 +85,7 @@ float Timer::MilliSeconds() { Stop(); } if (Caffe::mode() == Caffe::GPU) { + CHECK_EQ(device_, Caffe::current_device()); CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_, stop_gpu_)); } else { elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds(); @@ -94,6 +100,12 @@ float Timer::Seconds() { void Timer::Init() { if (!initted()) { if (Caffe::mode() == Caffe::GPU) { + int current_device = Caffe::current_device(); + if (device_ < 0) { + device_ = current_device; + } else { + CHECK_EQ(device_, current_device); + } CUDA_CHECK(cudaEventCreate(&start_gpu_)); CUDA_CHECK(cudaEventCreate(&stop_gpu_)); } else { diff --git a/src/caffe/util/blocking_queue.cpp b/src/caffe/util/blocking_queue.cpp index 0545b430119..696397b28a0 100644 --- a/src/caffe/util/blocking_queue.cpp +++ b/src/caffe/util/blocking_queue.cpp @@ -94,6 +94,7 @@ bool BlockingQueue::nonblocking_size(size_t* size) const { template class BlockingQueue; template class BlockingQueue>; template class BlockingQueue>; +template class BlockingQueue>; template class BlockingQueue; template class BlockingQueue>>; template class BlockingQueue>>; diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp index 9464bdd0cea..c8c7857f40b 100644 --- a/src/caffe/util/gpu_memory.cpp +++ b/src/caffe/util/gpu_memory.cpp @@ -49,9 +49,9 @@ bool GPUMemory::Workspace::safe_reserve(size_t size, int device) { } size_t gpu_bytes_left, total_memory; GPUMemory::GetInfo(&gpu_bytes_left, &total_memory, true); - if (size > size_ + align_down<7>(gpu_bytes_left)) { + if (size > size_ + align_down<8>(gpu_bytes_left)) { LOG(FATAL) << "Out of memory in safe_reserve: " - << size << " 
> " << size_ << " + " << align_down<7>(gpu_bytes_left) + << size << " > " << size_ << " + " << align_down<8>(gpu_bytes_left) << " on device " << device; return false; } @@ -67,7 +67,8 @@ bool GPUMemory::Workspace::try_reserve(size_t size, int device) { if (device != INVALID_DEVICE) { device_ = device; // switch from default to specific one } - status = mgr_.try_allocate(&ptr_, size, device_); + pstream_ = Caffe::thread_pstream(0); + status = mgr_.try_allocate(&ptr_, size, device_, pstream_); if (status) { CHECK_NOTNULL(ptr_); size_ = size; @@ -132,7 +133,8 @@ void GPUMemory::Manager::lazy_init(int device) { static Scope gpu_memory_scope(gpus); } -bool GPUMemory::Manager::try_allocate(void** ptr, size_t size, int device, int group) { +bool GPUMemory::Manager::try_allocate(void** ptr, size_t size, int device, + const shared_ptr& pstream) { if (!initialized_) { lazy_init(device); } @@ -144,9 +146,16 @@ bool GPUMemory::Manager::try_allocate(void** ptr, size_t size, int device, int g shared_lock lock(GPUMemory::read_write_mutex()); size_t size_allocated = 0; // Clean Cache & Retry logic is inside now - status = cub_allocator_->DeviceAllocate(device, ptr, size, - Caffe::thread_stream(group), size_allocated); + status = cub_allocator_->DeviceAllocate(device, ptr, size, pstream->get(), size_allocated); if (status == cudaSuccess && device > INVALID_DEVICE) { +// if (device == 0) { +// DevInfo dev_info; +// CUDA_CHECK(cudaMemGetInfo(&dev_info.free_, &dev_info.total_)); +// size_t allocated = dev_info.total_ - dev_info.free_; +// size_t pcent = 100UL* allocated / dev_info.total_; +// std::string bar(pcent, '*'); +// std::cout << bar << " " << pcent << "%" << std::endl; +// } if (size_allocated > 0) { if (dev_info_[device].free_ < update_thresholds_[device]) { update_dev_info(device); @@ -257,4 +266,16 @@ void GPUMemory::Manager::GetInfo(size_t* free_mem, size_t* total_mem, bool with_ } } +GPUMemory::PinnedBuffer::PinnedBuffer(size_t size) { + CHECK_GT(size, 0); + shared_lock 
lock(GPUMemory::read_write_mutex()); + CUDA_CHECK(cudaHostAlloc(&hptr_, size, cudaHostAllocMapped)); + CUDA_CHECK(cudaHostGetDevicePointer(&dptr_, hptr_, 0)); +} + +GPUMemory::PinnedBuffer::~PinnedBuffer() { + shared_lock lock(GPUMemory::read_write_mutex()); + CUDA_CHECK(cudaFreeHost(hptr_)); +} + } // namespace caffe diff --git a/src/caffe/util/im_transforms.cpp b/src/caffe/util/im_transforms.cpp new file mode 100644 index 00000000000..2f179bc4ff0 --- /dev/null +++ b/src/caffe/util/im_transforms.cpp @@ -0,0 +1,729 @@ +#include + +#if CV_VERSION_MAJOR == 3 +#include +#define CV_GRAY2BGR cv::COLOR_GRAY2BGR +#define CV_BGR2GRAY cv::COLOR_BGR2GRAY +#define CV_BGR2YCrCb cv::COLOR_BGR2YCrCb +#define CV_YCrCb2BGR cv::COLOR_YCrCb2BGR +#define CV_IMWRITE_JPEG_QUALITY cv::IMWRITE_JPEG_QUALITY +#define CV_LOAD_IMAGE_COLOR cv::IMREAD_COLOR +#define CV_THRESH_BINARY_INV cv::THRESH_BINARY_INV +#define CV_THRESH_OTSU cv::THRESH_OTSU +#endif + +#include +#include +#include + +#include "caffe/util/im_transforms.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +const float prob_eps = 0.01; + +int roll_weighted_die(const vector& probabilities) { + vector cumulative; + std::partial_sum(&probabilities[0], &probabilities[0] + probabilities.size(), + std::back_inserter(cumulative)); + float val; + caffe_rng_uniform(1, static_cast(0), cumulative.back(), &val); + + // Find the position within the sequence and add 1 + return (std::lower_bound(cumulative.begin(), cumulative.end(), val) + - cumulative.begin()); +} + +void UpdateBBoxByResizePolicy(const ResizeParameter& param, + const int old_width, const int old_height, + NormalizedBBox* bbox) { + float new_height = param.height(); + float new_width = param.width(); + float orig_aspect = static_cast(old_width) / old_height; + float new_aspect = new_width / new_height; + + float x_min = bbox->xmin() * old_width; + float y_min = bbox->ymin() * old_height; + float x_max = bbox->xmax() * old_width; + float y_max = 
bbox->ymax() * old_height; + float padding; + switch (param.resize_mode()) { + case ResizeParameter_Resize_mode_WARP: + x_min = std::max(0.f, x_min * new_width / old_width); + x_max = std::min(new_width, x_max * new_width / old_width); + y_min = std::max(0.f, y_min * new_height / old_height); + y_max = std::min(new_height, y_max * new_height / old_height); + break; + case ResizeParameter_Resize_mode_FIT_LARGE_SIZE_AND_PAD: + if (orig_aspect > new_aspect) { + padding = (new_height - new_width / orig_aspect) / 2; + x_min = std::max(0.f, x_min * new_width / old_width); + x_max = std::min(new_width, x_max * new_width / old_width); + y_min = y_min * (new_height - 2 * padding) / old_height; + y_min = padding + std::max(0.f, y_min); + y_max = y_max * (new_height - 2 * padding) / old_height; + y_max = padding + std::min(new_height, y_max); + } else { + padding = (new_width - orig_aspect * new_height) / 2; + x_min = x_min * (new_width - 2 * padding) / old_width; + x_min = padding + std::max(0.f, x_min); + x_max = x_max * (new_width - 2 * padding) / old_width; + x_max = padding + std::min(new_width, x_max); + y_min = std::max(0.f, y_min * new_height / old_height); + y_max = std::min(new_height, y_max * new_height / old_height); + } + break; + case ResizeParameter_Resize_mode_FIT_SMALL_SIZE: + if (orig_aspect < new_aspect) { + new_height = new_width / orig_aspect; + } else { + new_width = orig_aspect * new_height; + } + x_min = std::max(0.f, x_min * new_width / old_width); + x_max = std::min(new_width, x_max * new_width / old_width); + y_min = std::max(0.f, y_min * new_height / old_height); + y_max = std::min(new_height, y_max * new_height / old_height); + break; + default: + LOG(FATAL) << "Unknown resize mode."; + } + bbox->set_xmin(x_min / new_width); + bbox->set_ymin(y_min / new_height); + bbox->set_xmax(x_max / new_width); + bbox->set_ymax(y_max / new_height); +} + +void InferNewSize(const ResizeParameter& resize_param, + const int old_width, const int old_height, + int* 
new_width, int* new_height) { + int height = resize_param.height(); + int width = resize_param.width(); + float orig_aspect = static_cast(old_width) / old_height; + float aspect = static_cast(width) / height; + + switch (resize_param.resize_mode()) { + case ResizeParameter_Resize_mode_WARP: + break; + case ResizeParameter_Resize_mode_FIT_LARGE_SIZE_AND_PAD: + break; + case ResizeParameter_Resize_mode_FIT_SMALL_SIZE: + if (orig_aspect < aspect) { + height = static_cast(width / orig_aspect); + } else { + width = static_cast(orig_aspect * height); + } + break; + default: + LOG(FATAL) << "Unknown resize mode."; + } + *new_height = height; + *new_width = width; +} + +template +bool is_border(const cv::Mat& edge, T color) { + cv::Mat im = edge.clone().reshape(0, 1); + bool res = true; + for (int i = 0; i < im.cols; ++i) { + res &= (color == im.at(0, i)); + } + return res; +} + +template +bool is_border(const cv::Mat& edge, uchar color); + +template +cv::Rect CropMask(const cv::Mat& src, T point, int padding) { + cv::Rect win(0, 0, src.cols, src.rows); + + vector edges; + edges.push_back(cv::Rect(0, 0, src.cols, 1)); + edges.push_back(cv::Rect(src.cols-2, 0, 1, src.rows)); + edges.push_back(cv::Rect(0, src.rows-2, src.cols, 1)); + edges.push_back(cv::Rect(0, 0, 1, src.rows)); + + cv::Mat edge; + int nborder = 0; + T color = src.at(0, 0); + for (int i = 0; i < edges.size(); ++i) { + edge = src(edges[i]); + nborder += is_border(edge, color); + } + + if (nborder < 4) { + return win; + } + + bool next; + do { + edge = src(cv::Rect(win.x, win.height - 2, win.width, 1)); + next = is_border(edge, color); + if (next) { + win.height--; + } + } while (next && (win.height > 0)); + + do { + edge = src(cv::Rect(win.width - 2, win.y, 1, win.height)); + next = is_border(edge, color); + if (next) { + win.width--; + } + } while (next && (win.width > 0)); + + do { + edge = src(cv::Rect(win.x, win.y, win.width, 1)); + next = is_border(edge, color); + if (next) { + win.y++; + win.height--; + 
} + } while (next && (win.y <= src.rows)); + + do { + edge = src(cv::Rect(win.x, win.y, 1, win.height)); + next = is_border(edge, color); + if (next) { + win.x++; + win.width--; + } + } while (next && (win.x <= src.cols)); + + // add padding + if (win.x > padding) { + win.x -= padding; + } + if (win.y > padding) { + win.y -= padding; + } + if ((win.width + win.x + padding) < src.cols) { + win.width += padding; + } + if ((win.height + win.y + padding) < src.rows) { + win.height += padding; + } + + return win; +} + +template +cv::Rect CropMask(const cv::Mat& src, uchar point, int padding); + +cv::Mat colorReduce(const cv::Mat& image, int div) { + cv::Mat out_img; + cv::Mat lookUpTable(1, 256, CV_8U); + uchar* p = lookUpTable.data; + const int div_2 = div / 2; + for ( int i = 0; i < 256; ++i ) { + p[i] = i / div * div + div_2; + } + cv::LUT(image, lookUpTable, out_img); + return out_img; +} + +void fillEdgeImage(const cv::Mat& edgesIn, cv::Mat* filledEdgesOut) { + cv::Mat edgesNeg = edgesIn.clone(); + cv::Scalar val(255, 255, 255); + cv::floodFill(edgesNeg, cv::Point(0, 0), val); + cv::floodFill(edgesNeg, cv::Point(edgesIn.cols - 1, edgesIn.rows - 1), val); + cv::floodFill(edgesNeg, cv::Point(0, edgesIn.rows - 1), val); + cv::floodFill(edgesNeg, cv::Point(edgesIn.cols - 1, 0), val); + cv::bitwise_not(edgesNeg, edgesNeg); + *filledEdgesOut = (edgesNeg | edgesIn); + return; +} + +void CenterObjectAndFillBg(const cv::Mat& in_img, const bool fill_bg, + cv::Mat* out_img) { + cv::Mat mask, crop_mask; + if (in_img.channels() > 1) { + cv::Mat in_img_gray; + cv::cvtColor(in_img, in_img_gray, CV_BGR2GRAY); + cv::threshold(in_img_gray, mask, 0, 255, + CV_THRESH_BINARY_INV | CV_THRESH_OTSU); + } else { + cv::threshold(in_img, mask, 0, 255, + CV_THRESH_BINARY_INV | CV_THRESH_OTSU); + } + cv::Rect crop_rect = CropMask(mask, mask.at(0, 0), 2); + + if (fill_bg) { + cv::Mat temp_img = in_img(crop_rect); + fillEdgeImage(mask, &mask); + crop_mask = mask(crop_rect).clone(); + *out_img = 
cv::Mat::zeros(crop_rect.size(), in_img.type()); + temp_img.copyTo(*out_img, crop_mask); + } else { + *out_img = in_img(crop_rect).clone(); + } +} + +cv::Mat AspectKeepingResizeAndPad(const cv::Mat& in_img, + const int new_width, const int new_height, + const int pad_type, const cv::Scalar pad_val, + const int interp_mode) { + cv::Mat img_resized; + float orig_aspect = static_cast(in_img.cols) / in_img.rows; + float new_aspect = static_cast(new_width) / new_height; + + if (orig_aspect > new_aspect) { + int height = floor(static_cast(new_width) / orig_aspect); + cv::resize(in_img, img_resized, cv::Size(new_width, height), 0, 0, + interp_mode); + cv::Size resSize = img_resized.size(); + int padding = floor((new_height - resSize.height) / 2.0); + cv::copyMakeBorder(img_resized, img_resized, padding, + new_height - resSize.height - padding, 0, 0, + pad_type, pad_val); + } else { + int width = floor(orig_aspect * new_height); + cv::resize(in_img, img_resized, cv::Size(width, new_height), 0, 0, + interp_mode); + cv::Size resSize = img_resized.size(); + int padding = floor((new_width - resSize.width) / 2.0); + cv::copyMakeBorder(img_resized, img_resized, 0, 0, padding, + new_width - resSize.width - padding, + pad_type, pad_val); + } + return img_resized; +} + +cv::Mat AspectKeepingResizeBySmall(const cv::Mat& in_img, + const int new_width, + const int new_height, + const int interp_mode) { + cv::Mat img_resized; + float orig_aspect = static_cast(in_img.cols) / in_img.rows; + float new_aspect = static_cast (new_width) / new_height; + + if (orig_aspect < new_aspect) { + int height = floor(static_cast(new_width) / orig_aspect); + cv::resize(in_img, img_resized, cv::Size(new_width, height), 0, 0, + interp_mode); + } else { + int width = floor(orig_aspect * new_height); + cv::resize(in_img, img_resized, cv::Size(width, new_height), 0, 0, + interp_mode); + } + return img_resized; +} + +void constantNoise(const int n, const vector& val, cv::Mat* image) { + const int cols = 
image->cols; + const int rows = image->rows; + + if (image->channels() == 1) { + for (int k = 0; k < n; ++k) { + const int i = caffe_rng_rand() % cols; + const int j = caffe_rng_rand() % rows; + uchar* ptr = image->ptr(j); + ptr[i]= val[0]; + } + } else if (image->channels() == 3) { // color image + for (int k = 0; k < n; ++k) { + const int i = caffe_rng_rand() % cols; + const int j = caffe_rng_rand() % rows; + cv::Vec3b* ptr = image->ptr(j); + (ptr[i])[0] = val[0]; + (ptr[i])[1] = val[1]; + (ptr[i])[2] = val[2]; + } + } +} + +cv::Mat ApplyResize(const cv::Mat& in_img, const ResizeParameter& param) { + cv::Mat out_img; + + // Reading parameters + const int new_height = param.height(); + const int new_width = param.width(); + + int pad_mode = cv::BORDER_CONSTANT; + switch (param.pad_mode()) { + case ResizeParameter_Pad_mode_CONSTANT: + break; + case ResizeParameter_Pad_mode_MIRRORED: + pad_mode = cv::BORDER_REFLECT101; + break; + case ResizeParameter_Pad_mode_REPEAT_NEAREST: + pad_mode = cv::BORDER_REPLICATE; + break; + default: + LOG(FATAL) << "Unknown pad mode."; + } + + int interp_mode = cv::INTER_LINEAR; + int num_interp_mode = param.interp_mode_size(); + if (num_interp_mode > 0) { + vector probs(num_interp_mode, 1.f / num_interp_mode); + int prob_num = roll_weighted_die(probs); + switch (param.interp_mode(prob_num)) { + case ResizeParameter_Interp_mode_AREA: + interp_mode = cv::INTER_AREA; + break; + case ResizeParameter_Interp_mode_CUBIC: + interp_mode = cv::INTER_CUBIC; + break; + case ResizeParameter_Interp_mode_LINEAR: + interp_mode = cv::INTER_LINEAR; + break; + case ResizeParameter_Interp_mode_NEAREST: + interp_mode = cv::INTER_NEAREST; + break; + case ResizeParameter_Interp_mode_LANCZOS4: + interp_mode = cv::INTER_LANCZOS4; + break; + default: + LOG(FATAL) << "Unknown interp mode."; + } + } + + cv::Scalar pad_val = cv::Scalar(0, 0, 0); + const int img_channels = in_img.channels(); + if (param.pad_value_size() > 0) { + CHECK(param.pad_value_size() == 1 || 
+ param.pad_value_size() == img_channels) << + "Specify either 1 pad_value or as many as channels: " << img_channels; + vector pad_values; + for (int i = 0; i < param.pad_value_size(); ++i) { + pad_values.push_back(param.pad_value(i)); + } + if (img_channels > 1 && param.pad_value_size() == 1) { + // Replicate the pad_value for simplicity + for (int c = 1; c < img_channels; ++c) { + pad_values.push_back(pad_values[0]); + } + } + pad_val = cv::Scalar(pad_values[0], pad_values[1], pad_values[2]); + } + + switch (param.resize_mode()) { + case ResizeParameter_Resize_mode_WARP: + cv::resize(in_img, out_img, cv::Size(new_width, new_height), 0, 0, + interp_mode); + break; + case ResizeParameter_Resize_mode_FIT_LARGE_SIZE_AND_PAD: + out_img = AspectKeepingResizeAndPad(in_img, new_width, new_height, + pad_mode, pad_val, interp_mode); + break; + case ResizeParameter_Resize_mode_FIT_SMALL_SIZE: + out_img = AspectKeepingResizeBySmall(in_img, new_width, new_height, + interp_mode); + break; + default: + LOG(INFO) << "Unknown resize mode."; + } + return out_img; +} + +cv::Mat ApplyNoise(const cv::Mat& in_img, const NoiseParameter& param) { + cv::Mat out_img; + + if (param.decolorize()) { + cv::Mat grayscale_img; + cv::cvtColor(in_img, grayscale_img, CV_BGR2GRAY); + cv::cvtColor(grayscale_img, out_img, CV_GRAY2BGR); + } else { + out_img = in_img; + } + + if (param.gauss_blur()) { + cv::GaussianBlur(out_img, out_img, cv::Size(7, 7), 1.5); + } + + if (param.hist_eq()) { + if (out_img.channels() > 1) { + cv::Mat ycrcb_image; + cv::cvtColor(out_img, ycrcb_image, CV_BGR2YCrCb); + // Extract the L channel + vector ycrcb_planes(3); + cv::split(ycrcb_image, ycrcb_planes); + // now we have the L image in ycrcb_planes[0] + cv::Mat dst; + cv::equalizeHist(ycrcb_planes[0], dst); + ycrcb_planes[0] = dst; + cv::merge(ycrcb_planes, ycrcb_image); + // convert back to RGB + cv::cvtColor(ycrcb_image, out_img, CV_YCrCb2BGR); + } else { + cv::Mat temp_img; + cv::equalizeHist(out_img, temp_img); + 
out_img = temp_img; + } + } + + if (param.clahe()) { + cv::Ptr clahe = cv::createCLAHE(); + clahe->setClipLimit(4); + if (out_img.channels() > 1) { + cv::Mat ycrcb_image; + cv::cvtColor(out_img, ycrcb_image, CV_BGR2YCrCb); + // Extract the L channel + vector ycrcb_planes(3); + cv::split(ycrcb_image, ycrcb_planes); + // now we have the L image in ycrcb_planes[0] + cv::Mat dst; + clahe->apply(ycrcb_planes[0], dst); + ycrcb_planes[0] = dst; + cv::merge(ycrcb_planes, ycrcb_image); + // convert back to RGB + cv::cvtColor(ycrcb_image, out_img, CV_YCrCb2BGR); + } else { + cv::Ptr clahe = cv::createCLAHE(); + clahe->setClipLimit(4); + cv::Mat temp_img; + clahe->apply(out_img, temp_img); + out_img = temp_img; + } + } + + if (param.jpeg() > 0) { + vector buf; + vector params; + params.push_back(CV_IMWRITE_JPEG_QUALITY); + params.push_back(param.jpeg()); + cv::imencode(".jpg", out_img, buf, params); + out_img = cv::imdecode(buf, CV_LOAD_IMAGE_COLOR); + } + + if (param.erode()) { + cv::Mat element = cv::getStructuringElement( + 2, cv::Size(3, 3), cv::Point(1, 1)); + cv::erode(out_img, out_img, element); + } + + if (param.posterize()) { + cv::Mat tmp_img; + tmp_img = colorReduce(out_img); + out_img = tmp_img; + } + + if (param.inverse()) { + cv::Mat tmp_img; + cv::bitwise_not(out_img, tmp_img); + out_img = tmp_img; + } + + vector noise_values; + if (param.saltpepper_param().value_size() > 0) { + CHECK(param.saltpepper_param().value_size() == 1 + || param.saltpepper_param().value_size() == out_img.channels()) + << "Specify either 1 pad_value or as many as channels: " + << out_img.channels(); + + for (int i = 0; i < param.saltpepper_param().value_size(); i++) { + noise_values.push_back(uchar(param.saltpepper_param().value(i))); + } + if (out_img.channels() > 1 + && param.saltpepper_param().value_size() == 1) { + // Replicate the pad_value for simplicity + for (int c = 1; c < out_img.channels(); ++c) { + noise_values.push_back(uchar(noise_values[0])); + } + } + } + if 
(param.saltpepper()) { + const int noise_pixels_num = + floor(param.saltpepper_param().fraction() + * out_img.cols * out_img.rows); + constantNoise(noise_pixels_num, noise_values, &out_img); + } + + if (param.convert_to_hsv()) { + cv::Mat hsv_image; + cv::cvtColor(out_img, hsv_image, CV_BGR2HSV); + out_img = hsv_image; + } + if (param.convert_to_lab()) { + cv::Mat lab_image; + out_img.convertTo(lab_image, CV_32F); + lab_image *= 1.0 / 255; + cv::cvtColor(lab_image, out_img, CV_BGR2Lab); + } + return out_img; +} + +void RandomBrightness(const cv::Mat& in_img, cv::Mat* out_img, + const float brightness_prob, const float brightness_delta) { + float prob; + caffe_rng_uniform(1, 0.f, 1.f, &prob); + if (prob < brightness_prob) { + CHECK_GE(brightness_delta, 0) << "brightness_delta must be non-negative."; + float delta; + caffe_rng_uniform(1, -brightness_delta, brightness_delta, &delta); + AdjustBrightness(in_img, delta, out_img); + } else { + *out_img = in_img; + } +} + +void AdjustBrightness(const cv::Mat& in_img, const float delta, + cv::Mat* out_img) { + if (fabs(delta) > 0) { + in_img.convertTo(*out_img, -1, 1, delta); + } else { + *out_img = in_img; + } +} + +void RandomContrast(const cv::Mat& in_img, cv::Mat* out_img, + const float contrast_prob, const float lower, const float upper) { + float prob; + caffe_rng_uniform(1, 0.f, 1.f, &prob); + if (prob < contrast_prob) { + CHECK_GE(upper, lower) << "contrast upper must be >= lower."; + CHECK_GE(lower, 0) << "contrast lower must be non-negative."; + float delta; + caffe_rng_uniform(1, lower, upper, &delta); + AdjustContrast(in_img, delta, out_img); + } else { + *out_img = in_img; + } +} + +void AdjustContrast(const cv::Mat& in_img, const float delta, + cv::Mat* out_img) { + if (fabs(delta - 1.f) > 1e-3) { + in_img.convertTo(*out_img, -1, delta, 0); + } else { + *out_img = in_img; + } +} + +void RandomSaturation(const cv::Mat& in_img, cv::Mat* out_img, + const float saturation_prob, const float lower, const float 
upper) { + float prob; + caffe_rng_uniform(1, 0.f, 1.f, &prob); + if (prob < saturation_prob) { + CHECK_GE(upper, lower) << "saturation upper must be >= lower."; + CHECK_GE(lower, 0) << "saturation lower must be non-negative."; + float delta; + caffe_rng_uniform(1, lower, upper, &delta); + AdjustSaturation(in_img, delta, out_img); + } else { + *out_img = in_img; + } +} + +void AdjustSaturation(const cv::Mat& in_img, const float delta, + cv::Mat* out_img) { + if (fabs(delta - 1.f) != 1e-3) { + // Convert to HSV colorspae. + cv::cvtColor(in_img, *out_img, CV_BGR2HSV); + + // Split the image to 3 channels. + vector channels; + cv::split(*out_img, channels); + + // Adjust the saturation. + channels[1].convertTo(channels[1], -1, delta, 0); + cv::merge(channels, *out_img); + + // Back to BGR colorspace. + cvtColor(*out_img, *out_img, CV_HSV2BGR); + } else { + *out_img = in_img; + } +} + +void RandomHue(const cv::Mat& in_img, cv::Mat* out_img, + const float hue_prob, const float hue_delta) { + float prob; + caffe_rng_uniform(1, 0.f, 1.f, &prob); + if (prob < hue_prob) { + CHECK_GE(hue_delta, 0) << "hue_delta must be non-negative."; + float delta; + caffe_rng_uniform(1, -hue_delta, hue_delta, &delta); + AdjustHue(in_img, delta, out_img); + } else { + *out_img = in_img; + } +} + +void AdjustHue(const cv::Mat& in_img, const float delta, cv::Mat* out_img) { + if (fabs(delta) > 0) { + // Convert to HSV colorspae. + cv::cvtColor(in_img, *out_img, CV_BGR2HSV); + + // Split the image to 3 channels. + vector channels; + cv::split(*out_img, channels); + + // Adjust the hue. + channels[0].convertTo(channels[0], -1, 1, delta); + cv::merge(channels, *out_img); + + // Back to BGR colorspace. 
+ cvtColor(*out_img, *out_img, CV_HSV2BGR); + } else { + *out_img = in_img; + } +} + +void RandomOrderChannels(const cv::Mat& in_img, cv::Mat* out_img, + const float random_order_prob) { + float prob; + caffe_rng_uniform(1, 0.f, 1.f, &prob); + if (prob < random_order_prob) { + // Split the image to 3 channels. + vector channels; + cv::split(*out_img, channels); + CHECK_EQ(channels.size(), 3); + + // Shuffle the channels. + std::random_shuffle(channels.begin(), channels.end()); + cv::merge(channels, *out_img); + } else { + *out_img = in_img; + } +} + +cv::Mat ApplyDistort(const cv::Mat& in_img, const DistortionParameter& param) { + cv::Mat out_img = in_img; + float prob; + caffe_rng_uniform(1, 0.f, 1.f, &prob); + + if (prob > 0.5) { + // Do random brightness distortion. + RandomBrightness(out_img, &out_img, param.brightness_prob(), + param.brightness_delta()); + + // Do random contrast distortion. + RandomContrast(out_img, &out_img, param.contrast_prob(), + param.contrast_lower(), param.contrast_upper()); + + // Do random saturation distortion. + RandomSaturation(out_img, &out_img, param.saturation_prob(), + param.saturation_lower(), param.saturation_upper()); + + // Do random hue distortion. + RandomHue(out_img, &out_img, param.hue_prob(), param.hue_delta()); + + // Do random reordering of the channels. + RandomOrderChannels(out_img, &out_img, param.random_order_prob()); + } else { + // Do random brightness distortion. + RandomBrightness(out_img, &out_img, param.brightness_prob(), + param.brightness_delta()); + + // Do random saturation distortion. + RandomSaturation(out_img, &out_img, param.saturation_prob(), + param.saturation_lower(), param.saturation_upper()); + + // Do random hue distortion. + RandomHue(out_img, &out_img, param.hue_prob(), param.hue_delta()); + + // Do random contrast distortion. + RandomContrast(out_img, &out_img, param.contrast_prob(), + param.contrast_lower(), param.contrast_upper()); + + // Do random reordering of the channels. 
+ RandomOrderChannels(out_img, &out_img, param.random_order_prob()); + } + + return out_img; +} + +} // namespace caffe diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp index 9ef6c38a108..ebd7dc40932 100644 --- a/src/caffe/util/io.cpp +++ b/src/caffe/util/io.cpp @@ -1,3 +1,10 @@ +#include +#include +#include +#include +#include +#include +#include #include #include #include @@ -6,6 +13,7 @@ #include #include #include // NOLINT(readability/streams) +#include #include #include "caffe/blob.hpp" @@ -15,6 +23,7 @@ const int kProtoReadBytesLimit = INT_MAX; // Max size of 2 GB minus 1 byte. namespace caffe { +using namespace boost::property_tree; // NOLINT(build/namespaces) using google::protobuf::io::FileInputStream; using google::protobuf::io::FileOutputStream; using google::protobuf::io::ZeroCopyInputStream; @@ -209,6 +218,49 @@ cv::Mat ReadImageToCVMat(const string& filename, return cv_img; } +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width, const int min_dim, const int max_dim, + const bool is_color) { + cv::Mat cv_img; + int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : + CV_LOAD_IMAGE_GRAYSCALE); + cv::Mat cv_img_origin = cv::imread(filename, cv_read_flag); + if (!cv_img_origin.data) { + LOG(ERROR) << "Could not open or find file " << filename; + return cv_img_origin; + } + if (min_dim > 0 || max_dim > 0) { + int num_rows = cv_img_origin.rows; + int num_cols = cv_img_origin.cols; + int min_num = std::min(num_rows, num_cols); + int max_num = std::max(num_rows, num_cols); + float scale_factor = 1; + if (min_dim > 0 && min_num < min_dim) { + scale_factor = static_cast(min_dim) / min_num; + } + if (max_dim > 0 && static_cast(scale_factor * max_num) > max_dim) { + // Make sure the maximum dimension is less than max_dim. 
+ scale_factor = static_cast(max_dim) / max_num; + } + if (scale_factor == 1) { + cv_img = cv_img_origin; + } else { + cv::resize(cv_img_origin, cv_img, cv::Size(0, 0), + scale_factor, scale_factor); + } + } else if (height > 0 && width > 0) { + cv::resize(cv_img_origin, cv_img, cv::Size(width, height)); + } else { + cv_img = cv_img_origin; + } + return cv_img; +} + +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width, const int min_dim, const int max_dim) { + return ReadImageToCVMat(filename, height, width, min_dim, max_dim, true); +} + cv::Mat ReadImageToCVMat(const string& filename, const int height, const int width) { return ReadImageToCVMat(filename, height, width, true); @@ -263,7 +315,494 @@ bool ReadImageToDatum(const string& filename, const int label, } } -// tests only, TODO: clean +bool ReadImageToDatum(const string& filename, const int label, + const int height, const int width, const int min_dim, const int max_dim, + const bool is_color, const std::string & encoding, Datum* datum) { + cv::Mat cv_img = ReadImageToCVMat(filename, height, width, min_dim, max_dim, + is_color); + if (cv_img.data) { + if (encoding.size()) { + if ( (cv_img.channels() == 3) == is_color && !height && !width && + !min_dim && !max_dim && matchExt(filename, encoding) ) { + datum->set_channels(cv_img.channels()); + datum->set_height(cv_img.rows); + datum->set_width(cv_img.cols); + return ReadFileToDatum(filename, label, datum); + } + EncodeCVMatToDatum(cv_img, encoding, datum); + datum->set_label(label); + return true; + } + CVMatToDatum(cv_img, *datum); + datum->set_label(label); + return true; + } else { + return false; + } +} + +void GetImageSize(const string& filename, int* height, int* width) { + cv::Mat cv_img = cv::imread(filename); + if (!cv_img.data) { + LOG(ERROR) << "Could not open or find file " << filename; + return; + } + *height = cv_img.rows; + *width = cv_img.cols; +} + +bool ReadRichImageToAnnotatedDatum(const string& filename, + 
const string& labelfile, const int height, const int width, + const int min_dim, const int max_dim, const bool is_color, + const string& encoding, const AnnotatedDatum_AnnotationType type, + const string& labeltype, const std::map& name_to_label, + AnnotatedDatum* anno_datum) { + // Read image to datum. + bool status = ReadImageToDatum(filename, -1, height, width, + min_dim, max_dim, is_color, encoding, + anno_datum->mutable_datum()); + if (status == false) { + return status; + } + anno_datum->clear_annotation_group(); + if (!boost::filesystem::exists(labelfile)) { + return true; + } + switch (type) { + case AnnotatedDatum_AnnotationType_BBOX: + int ori_height, ori_width; + GetImageSize(filename, &ori_height, &ori_width); + if (labeltype == "xml") { + return ReadXMLToAnnotatedDatum(labelfile, ori_height, ori_width, + name_to_label, anno_datum); + } else if (labeltype == "json") { + return ReadJSONToAnnotatedDatum(labelfile, ori_height, ori_width, + name_to_label, anno_datum); + } else if (labeltype == "txt") { + return ReadTxtToAnnotatedDatum(labelfile, ori_height, ori_width, + anno_datum); + } else { + LOG(FATAL) << "Unknown label file type."; + return false; + } + break; + default: + LOG(FATAL) << "Unknown annotation type."; + return false; + } +} + +//bool ReadFileToDatum(const string& filename, const int label, +// Datum* datum) { +// std::streampos size; +// +// fstream file(filename.c_str(), ios::in|ios::binary|ios::ate); +// if (file.is_open()) { +// size = file.tellg(); +// std::string buffer(size, ' '); +// file.seekg(0, ios::beg); +// file.read(&buffer[0], size); +// file.close(); +// datum->set_data(buffer); +// datum->set_label(label); +// datum->set_encoded(true); +// return true; +// } else { +// return false; +// } +//} + +// Parse VOC/ILSVRC detection annotation. 
+bool ReadXMLToAnnotatedDatum(const string& labelfile, const int img_height, + const int img_width, const std::map& name_to_label, + AnnotatedDatum* anno_datum) { + ptree pt; + read_xml(labelfile, pt); + + // Parse annotation. + int width = 0, height = 0; + try { + height = pt.get("annotation.size.height"); + width = pt.get("annotation.size.width"); + } catch (const ptree_error &e) { + LOG(WARNING) << "When parsing " << labelfile << ": " << e.what(); + height = img_height; + width = img_width; + } + LOG_IF(WARNING, height != img_height) << labelfile << + " inconsistent image height."; + LOG_IF(WARNING, width != img_width) << labelfile << + " inconsistent image width."; + CHECK(width != 0 && height != 0) << labelfile << + " no valid image width/height."; + int instance_id = 0; + BOOST_FOREACH(ptree::value_type &v1, pt.get_child("annotation")) { + ptree pt1 = v1.second; + if (v1.first == "object") { + Annotation* anno = NULL; + bool difficult = false; + ptree object = v1.second; + BOOST_FOREACH(ptree::value_type &v2, object.get_child("")) { + ptree pt2 = v2.second; + if (v2.first == "name") { + string name = pt2.data(); + if (name_to_label.find(name) == name_to_label.end()) { + LOG(FATAL) << "Unknown name: " << name; + } + int label = name_to_label.find(name)->second; + bool found_group = false; + for (int g = 0; g < anno_datum->annotation_group_size(); ++g) { + AnnotationGroup* anno_group = + anno_datum->mutable_annotation_group(g); + if (label == anno_group->group_label()) { + if (anno_group->annotation_size() == 0) { + instance_id = 0; + } else { + instance_id = anno_group->annotation( + anno_group->annotation_size() - 1).instance_id() + 1; + } + anno = anno_group->add_annotation(); + found_group = true; + } + } + if (!found_group) { + // If there is no such annotation_group, create a new one. 
+ AnnotationGroup* anno_group = anno_datum->add_annotation_group(); + anno_group->set_group_label(label); + anno = anno_group->add_annotation(); + instance_id = 0; + } + anno->set_instance_id(instance_id++); + } else if (v2.first == "difficult") { + difficult = pt2.data() == "1"; + } else if (v2.first == "bndbox") { + int xmin = pt2.get("xmin", 0); + int ymin = pt2.get("ymin", 0); + int xmax = pt2.get("xmax", 0); + int ymax = pt2.get("ymax", 0); + CHECK_NOTNULL(anno); + LOG_IF(WARNING, xmin > width) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, ymin > height) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, xmax > width) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, ymax > height) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, xmin < 0) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, ymin < 0) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, xmax < 0) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, ymax < 0) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, xmin > xmax) << labelfile << + " bounding box irregular."; + LOG_IF(WARNING, ymin > ymax) << labelfile << + " bounding box irregular."; + // Store the normalized bounding box. + NormalizedBBox* bbox = anno->mutable_bbox(); + bbox->set_xmin(static_cast(xmin) / width); + bbox->set_ymin(static_cast(ymin) / height); + bbox->set_xmax(static_cast(xmax) / width); + bbox->set_ymax(static_cast(ymax) / height); + bbox->set_difficult(difficult); + } + } + } + } + return true; +} + +// Parse MSCOCO detection annotation. +bool ReadJSONToAnnotatedDatum(const string& labelfile, const int img_height, + const int img_width, const std::map& name_to_label, + AnnotatedDatum* anno_datum) { + ptree pt; + read_json(labelfile, pt); + + // Get image info. 
+ int width = 0, height = 0; + try { + height = pt.get("image.height"); + width = pt.get("image.width"); + } catch (const ptree_error &e) { + LOG(WARNING) << "When parsing " << labelfile << ": " << e.what(); + height = img_height; + width = img_width; + } + LOG_IF(WARNING, height != img_height) << labelfile << + " inconsistent image height."; + LOG_IF(WARNING, width != img_width) << labelfile << + " inconsistent image width."; + CHECK(width != 0 && height != 0) << labelfile << + " no valid image width/height."; + + // Get annotation info. + int instance_id = 0; + BOOST_FOREACH(ptree::value_type& v1, pt.get_child("annotation")) { + Annotation* anno = NULL; + bool iscrowd = false; + ptree object = v1.second; + // Get category_id. + string name = object.get("category_id"); + if (name_to_label.find(name) == name_to_label.end()) { + LOG(FATAL) << "Unknown name: " << name; + } + int label = name_to_label.find(name)->second; + bool found_group = false; + for (int g = 0; g < anno_datum->annotation_group_size(); ++g) { + AnnotationGroup* anno_group = + anno_datum->mutable_annotation_group(g); + if (label == anno_group->group_label()) { + if (anno_group->annotation_size() == 0) { + instance_id = 0; + } else { + instance_id = anno_group->annotation( + anno_group->annotation_size() - 1).instance_id() + 1; + } + anno = anno_group->add_annotation(); + found_group = true; + } + } + if (!found_group) { + // If there is no such annotation_group, create a new one. + AnnotationGroup* anno_group = anno_datum->add_annotation_group(); + anno_group->set_group_label(label); + anno = anno_group->add_annotation(); + instance_id = 0; + } + anno->set_instance_id(instance_id++); + + // Get iscrowd. + iscrowd = object.get("iscrowd", 0); + + // Get bbox. 
+ vector bbox_items; + BOOST_FOREACH(ptree::value_type& v2, object.get_child("bbox")) { + bbox_items.push_back(v2.second.get_value()); + } + CHECK_EQ(bbox_items.size(), 4); + float xmin = bbox_items[0]; + float ymin = bbox_items[1]; + float xmax = bbox_items[0] + bbox_items[2]; + float ymax = bbox_items[1] + bbox_items[3]; + CHECK_NOTNULL(anno); + LOG_IF(WARNING, xmin > width) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, ymin > height) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, xmax > width) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, ymax > height) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, xmin < 0) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, ymin < 0) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, xmax < 0) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, ymax < 0) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, xmin > xmax) << labelfile << + " bounding box irregular."; + LOG_IF(WARNING, ymin > ymax) << labelfile << + " bounding box irregular."; + // Store the normalized bounding box. + NormalizedBBox* bbox = anno->mutable_bbox(); + bbox->set_xmin(xmin / width); + bbox->set_ymin(ymin / height); + bbox->set_xmax(xmax / width); + bbox->set_ymax(ymax / height); + bbox->set_difficult(iscrowd); + } + return true; +} + +// Parse plain txt detection annotation: label_id, xmin, ymin, xmax, ymax. 
+bool ReadTxtToAnnotatedDatum(const string& labelfile, const int height, + const int width, AnnotatedDatum* anno_datum) { + std::ifstream infile(labelfile.c_str()); + if (!infile.good()) { + LOG(INFO) << "Cannot open " << labelfile; + return false; + } + int label; + float xmin, ymin, xmax, ymax; + while (infile >> label >> xmin >> ymin >> xmax >> ymax) { + Annotation* anno = NULL; + int instance_id = 0; + bool found_group = false; + for (int g = 0; g < anno_datum->annotation_group_size(); ++g) { + AnnotationGroup* anno_group = anno_datum->mutable_annotation_group(g); + if (label == anno_group->group_label()) { + if (anno_group->annotation_size() == 0) { + instance_id = 0; + } else { + instance_id = anno_group->annotation( + anno_group->annotation_size() - 1).instance_id() + 1; + } + anno = anno_group->add_annotation(); + found_group = true; + } + } + if (!found_group) { + // If there is no such annotation_group, create a new one. + AnnotationGroup* anno_group = anno_datum->add_annotation_group(); + anno_group->set_group_label(label); + anno = anno_group->add_annotation(); + instance_id = 0; + } + anno->set_instance_id(instance_id++); + LOG_IF(WARNING, xmin > width) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, ymin > height) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, xmax > width) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, ymax > height) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, xmin < 0) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, ymin < 0) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, xmax < 0) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, ymax < 0) << labelfile << + " bounding box exceeds image boundary."; + LOG_IF(WARNING, xmin > xmax) << labelfile << + " bounding box irregular."; + LOG_IF(WARNING, ymin > ymax) << labelfile << + " 
bounding box irregular."; + // Store the normalized bounding box. + NormalizedBBox* bbox = anno->mutable_bbox(); + bbox->set_xmin(xmin / width); + bbox->set_ymin(ymin / height); + bbox->set_xmax(xmax / width); + bbox->set_ymax(ymax / height); + bbox->set_difficult(false); + } + return true; +} + +bool ReadLabelFileToLabelMap(const string& filename, bool include_background, + const string& delimiter, LabelMap* map) { + // cleanup + map->Clear(); + + std::ifstream file(filename.c_str()); + string line; + // Every line can have [1, 3] number of fields. + // The delimiter between fields can be one of " :;". + // The order of the fields are: + // name [label] [display_name] + // ... + int field_size = -1; + int label = 0; + LabelMapItem* map_item; + // Add background (none_of_the_above) class. + if (include_background) { + map_item = map->add_item(); + map_item->set_name("none_of_the_above"); + map_item->set_label(label++); + map_item->set_display_name("background"); + } + while (std::getline(file, line)) { + vector fields; + fields.clear(); + boost::split(fields, line, boost::is_any_of(delimiter)); + if (field_size == -1) { + field_size = fields.size(); + } else { + CHECK_EQ(field_size, fields.size()) + << "Inconsistent number of fields per line."; + } + map_item = map->add_item(); + map_item->set_name(fields[0]); + switch (field_size) { + case 1: + map_item->set_label(label++); + map_item->set_display_name(fields[0]); + break; + case 2: + label = std::atoi(fields[1].c_str()); + map_item->set_label(label); + map_item->set_display_name(fields[0]); + break; + case 3: + label = std::atoi(fields[1].c_str()); + map_item->set_label(label); + map_item->set_display_name(fields[2]); + break; + default: + LOG(FATAL) << "The number of fields should be [1, 3]."; + break; + } + } + return true; +} + +bool MapNameToLabel(const LabelMap& map, const bool strict_check, + std::map* name_to_label) { + // cleanup + name_to_label->clear(); + + for (int i = 0; i < map.item_size(); ++i) { + 
const string& name = map.item(i).name(); + const int label = map.item(i).label(); + if (strict_check) { + if (!name_to_label->insert(std::make_pair(name, label)).second) { + LOG(FATAL) << "There are many duplicates of name: " << name; + return false; + } + } else { + (*name_to_label)[name] = label; + } + } + return true; +} + +bool MapLabelToName(const LabelMap& map, const bool strict_check, + std::map* label_to_name) { + // cleanup + label_to_name->clear(); + + for (int i = 0; i < map.item_size(); ++i) { + const string& name = map.item(i).name(); + const int label = map.item(i).label(); + if (strict_check) { + if (!label_to_name->insert(std::make_pair(label, name)).second) { + LOG(FATAL) << "There are many duplicates of label: " << label; + return false; + } + } else { + (*label_to_name)[label] = name; + } + } + return true; +} + +bool MapLabelToDisplayName(const LabelMap& map, const bool strict_check, + std::map* label_to_display_name) { + // cleanup + label_to_display_name->clear(); + + for (int i = 0; i < map.item_size(); ++i) { + const string& display_name = map.item(i).display_name(); + const int label = map.item(i).label(); + if (strict_check) { + if (!label_to_display_name->insert( + std::make_pair(label, display_name)).second) { + LOG(FATAL) << "There are many duplicates of label: " << label; + return false; + } + } else { + (*label_to_display_name)[label] = display_name; + } + } + return true; +} cv::Mat DecodeDatumToCVMatNative(const Datum& datum) { cv::Mat cv_img; DecodeDatumToCVMat(datum, 0, cv_img, false); @@ -301,6 +840,18 @@ bool DecodeDatum(Datum* datum, bool is_color) { } } +void EncodeCVMatToDatum(const cv::Mat& cv_img, const string& encoding, + Datum* datum) { + std::vector buf; + cv::imencode("."+encoding, cv_img, buf); + datum->set_data(std::string(reinterpret_cast(&buf[0]), + buf.size())); + datum->set_channels(cv_img.channels()); + datum->set_height(cv_img.rows); + datum->set_width(cv_img.cols); + datum->set_encoded(true); +} + vector 
DatumToCVMat(const Datum& datum, cv::Mat& img, bool shape_only) { if (datum.encoded()) { LOG(FATAL) << "Datum encoded"; diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu index f46450e59b4..12563b795e7 100644 --- a/src/caffe/util/math_functions.cu +++ b/src/caffe/util/math_functions.cu @@ -2,7 +2,6 @@ #include #include "caffe/util/half.cuh" -//#include "caffe/common.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/util/gpu_math_functions.cuh" #include "caffe/type.hpp" @@ -332,6 +331,7 @@ void gpu_dot_kernel(const int N, const Dtype* x, const Dtype* y, Mtype* out) { Mtype cache[CAFFE_CUDA_NUM_THREADS]; const int tidx = threadIdx.x; cache[tidx] = 0.; + __syncthreads(); for (int i = tidx; i < N; i += blockDim.x) { cache[tidx] += static_cast(x[i]) * static_cast(y[i]); } diff --git a/src/caffe/util/math_functions2.cu b/src/caffe/util/math_functions2.cu index 2140302efaf..d4a84f45c59 100644 --- a/src/caffe/util/math_functions2.cu +++ b/src/caffe/util/math_functions2.cu @@ -357,11 +357,13 @@ void caffe_gpu_convert(const unsigned int n, void caffe_gpu_rng_uniform(const int n, unsigned int* r) { CURAND_CHECK(curandGenerate(Caffe::curand_generator(), r, n)); + CUDA_CHECK(cudaStreamSynchronize(Caffe::curand_stream())); } template<> void caffe_gpu_rng_uniform(const int n, const float a, const float b, float* r) { CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), r, n)); + CUDA_CHECK(cudaStreamSynchronize(Caffe::curand_stream())); const float range = b - a; if (range != static_cast(1)) { caffe_gpu_scal(n, range, r); @@ -375,6 +377,7 @@ template<> void caffe_gpu_rng_uniform(const int n, const double a, const double b, double* r) { CURAND_CHECK(curandGenerateUniformDouble(Caffe::curand_generator(), r, n)); + CUDA_CHECK(cudaStreamSynchronize(Caffe::curand_stream())); const double range = b - a; if (range != static_cast(1)) { caffe_gpu_scal(n, range, r); @@ -390,6 +393,7 @@ void caffe_gpu_rng_uniform(const int n, const float16 
a, GPUMemory::Workspace rf(n * sizeof(float), Caffe::current_device()); float* rfp = static_cast(rf.data()); CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), rfp, n)); + CUDA_CHECK(cudaStreamSynchronize(Caffe::curand_stream())); const float range = b - a; if (range != 1.F) { caffe_gpu_scal(n, range, rfp); @@ -403,11 +407,13 @@ void caffe_gpu_rng_uniform(const int n, const float16 a, template<> void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma, float* r) { CURAND_CHECK(curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma)); + CUDA_CHECK(cudaStreamSynchronize(Caffe::curand_stream())); } template<> void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma, double* r) { CURAND_CHECK(curandGenerateNormalDouble(Caffe::curand_generator(), r, n, mu, sigma)); + CUDA_CHECK(cudaStreamSynchronize(Caffe::curand_stream())); } template<> @@ -415,6 +421,7 @@ void caffe_gpu_rng_gaussian(const int n, const float16 mu, const float16 sigma, GPUMemory::Workspace rf(n * sizeof(float), Caffe::current_device()); float* rfp = static_cast(rf.data()); CURAND_CHECK(curandGenerateNormal(Caffe::curand_generator(), rfp, n, mu, sigma)); + CUDA_CHECK(cudaStreamSynchronize(Caffe::curand_stream())); caffe_gpu_convert(n, rfp, r); } diff --git a/src/caffe/util/sampler.cpp b/src/caffe/util/sampler.cpp new file mode 100644 index 00000000000..f04117d37ac --- /dev/null +++ b/src/caffe/util/sampler.cpp @@ -0,0 +1,167 @@ +#include +#include + +#include "caffe/util/bbox_util.hpp" +#include "caffe/util/sampler.hpp" + +namespace caffe { + +void GroupObjectBBoxes(const AnnotatedDatum& anno_datum, + vector* object_bboxes) { + object_bboxes->clear(); + for (int i = 0; i < anno_datum.annotation_group_size(); ++i) { + const AnnotationGroup& anno_group = anno_datum.annotation_group(i); + for (int j = 0; j < anno_group.annotation_size(); ++j) { + const Annotation& anno = anno_group.annotation(j); + object_bboxes->push_back(anno.bbox()); + } + } +} + 
+bool SatisfySampleConstraint(const NormalizedBBox& sampled_bbox, + const vector& object_bboxes, + const SampleConstraint& sample_constraint) { + bool has_jaccard_overlap = sample_constraint.has_min_jaccard_overlap() || + sample_constraint.has_max_jaccard_overlap(); + bool has_sample_coverage = sample_constraint.has_min_sample_coverage() || + sample_constraint.has_max_sample_coverage(); + bool has_object_coverage = sample_constraint.has_min_object_coverage() || + sample_constraint.has_max_object_coverage(); + bool satisfy = !has_jaccard_overlap && !has_sample_coverage && + !has_object_coverage; + if (satisfy) { + // By default, the sampled_bbox is "positive" if no constraints are defined. + return true; + } + // Check constraints. + bool found = false; + for (int i = 0; i < object_bboxes.size(); ++i) { + const NormalizedBBox& object_bbox = object_bboxes[i]; + // Test jaccard overlap. + if (has_jaccard_overlap) { + const float jaccard_overlap = JaccardOverlap(sampled_bbox, object_bbox); + if (sample_constraint.has_min_jaccard_overlap() && + jaccard_overlap < sample_constraint.min_jaccard_overlap()) { + continue; + } + if (sample_constraint.has_max_jaccard_overlap() && + jaccard_overlap > sample_constraint.max_jaccard_overlap()) { + continue; + } + found = true; + } + // Test sample coverage. + if (has_sample_coverage) { + const float sample_coverage = BBoxCoverage(sampled_bbox, object_bbox); + if (sample_constraint.has_min_sample_coverage() && + sample_coverage < sample_constraint.min_sample_coverage()) { + continue; + } + if (sample_constraint.has_max_sample_coverage() && + sample_coverage > sample_constraint.max_sample_coverage()) { + continue; + } + found = true; + } + // Test object coverage. 
+ if (has_object_coverage) { + const float object_coverage = BBoxCoverage(object_bbox, sampled_bbox); + if (sample_constraint.has_min_object_coverage() && + object_coverage < sample_constraint.min_object_coverage()) { + continue; + } + if (sample_constraint.has_max_object_coverage() && + object_coverage > sample_constraint.max_object_coverage()) { + continue; + } + found = true; + } + if (found) { + return true; + } + } + return found; +} + +void SampleBBox(const Sampler& sampler, NormalizedBBox* sampled_bbox) { + // Get random scale. + CHECK_GE(sampler.max_scale(), sampler.min_scale()); + CHECK_GT(sampler.min_scale(), 0.f); + CHECK_LE(sampler.max_scale(), 1.f); + float scale; + caffe_rng_uniform(1, sampler.min_scale(), sampler.max_scale(), &scale); + + // Get random aspect ratio. + CHECK_GE(sampler.max_aspect_ratio(), sampler.min_aspect_ratio()); + CHECK_GT(sampler.min_aspect_ratio(), 0.f); + CHECK_LT(sampler.max_aspect_ratio(), FLT_MAX); + float aspect_ratio; + caffe_rng_uniform(1, sampler.min_aspect_ratio(), sampler.max_aspect_ratio(), + &aspect_ratio); + + aspect_ratio = std::max(aspect_ratio, std::pow(scale, 2.f)); + aspect_ratio = std::min(aspect_ratio, 1.f / std::pow(scale, 2.f)); + + // Figure out bbox dimension. + float bbox_width = scale * sqrt(aspect_ratio); + float bbox_height = scale / sqrt(aspect_ratio); + + // Figure out top left coordinates. 
+ float w_off = 0.f, h_off = 0.f; + if (bbox_width < 1.f) { + caffe_rng_uniform(1, 0.f, 1.f - bbox_width, &w_off); + } + if (bbox_height < 1.f) { + caffe_rng_uniform(1, 0.f, 1.f - bbox_height, &h_off); + } + + sampled_bbox->set_xmin(w_off); + sampled_bbox->set_ymin(h_off); + sampled_bbox->set_xmax(w_off + bbox_width); + sampled_bbox->set_ymax(h_off + bbox_height); +} + +void GenerateSamples(const NormalizedBBox& source_bbox, + const vector& object_bboxes, + const BatchSampler& batch_sampler, + vector* sampled_bboxes) { + int found = 0; + for (int i = 0; i < batch_sampler.max_trials(); ++i) { + if (batch_sampler.has_max_sample() && + found >= batch_sampler.max_sample()) { + break; + } + // Generate sampled_bbox in the normalized space [0, 1]. + NormalizedBBox sampled_bbox; + SampleBBox(batch_sampler.sampler(), &sampled_bbox); + // Transform the sampled_bbox w.r.t. source_bbox. + LocateBBox(source_bbox, sampled_bbox, &sampled_bbox); + // Determine if the sampled bbox is positive or negative by the constraint. 
+ if (SatisfySampleConstraint(sampled_bbox, object_bboxes, + batch_sampler.sample_constraint())) { + ++found; + sampled_bboxes->push_back(sampled_bbox); + } + } +} + +void GenerateBatchSamples(const AnnotatedDatum& anno_datum, + const vector& batch_samplers, + vector* sampled_bboxes) { + sampled_bboxes->clear(); + vector object_bboxes; + GroupObjectBBoxes(anno_datum, &object_bboxes); + for (int i = 0; i < batch_samplers.size(); ++i) { + if (batch_samplers[i].use_original_image()) { + NormalizedBBox unit_bbox; + unit_bbox.set_xmin(0); + unit_bbox.set_ymin(0); + unit_bbox.set_xmax(1); + unit_bbox.set_ymax(1); + GenerateSamples(unit_bbox, object_bboxes, batch_samplers[i], + sampled_bboxes); + } + } +} + +} // namespace caffe diff --git a/tools/caffe.cpp b/tools/caffe.cpp index 6c0b5d49a63..9ed96043453 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -6,10 +6,12 @@ namespace bp = boost::python; #include #include #include +#include #include #include "caffe/caffe.hpp" #include "caffe/util/signal_handler.h" +#include "caffe/util/bbox_util.hpp" using caffe::TBlob; @@ -32,6 +34,13 @@ DEFINE_string(solver, "", "The solver definition protocol buffer text file."); DEFINE_string(model, "", "The model definition protocol buffer text file."); +DEFINE_string(phase, "", + "Optional; network phase (TRAIN or TEST). 
Only used for 'time'."); +DEFINE_int32(level, 0, + "Optional; network level."); +DEFINE_string(stage, "", + "Optional; network stages (not to be confused with phase), " + "separated by ','."); DEFINE_string(snapshot, "", "Optional; the snapshot solver state to resume training."); DEFINE_string(weights, "", @@ -45,6 +54,10 @@ DEFINE_string(sigint_effect, "stop", DEFINE_string(sighup_effect, "snapshot", "Optional; action to take when a SIGHUP signal is received: " "snapshot, stop or none."); +DEFINE_string(ap_version, "11point", + "Average Precision type for object detection"); +DEFINE_bool(show_per_class_result, true, + "Show per class result for object detection"); // A simple registry for caffe commands. typedef int (*BrewFunction)(); @@ -94,6 +107,25 @@ static void get_gpus(vector* gpus) { } } +// Parse phase from flags +caffe::Phase get_phase_from_flags(caffe::Phase default_value) { + if (FLAGS_phase == "") + return default_value; + if (FLAGS_phase == "TRAIN") + return caffe::TRAIN; + if (FLAGS_phase == "TEST") + return caffe::TEST; + LOG(FATAL) << "phase must be \"TRAIN\" or \"TEST\""; + return caffe::TRAIN; // Avoid warning +} + +// Parse stages from flags +vector get_stages_from_flags() { + vector stages; + boost::split(stages, FLAGS_stage, boost::is_any_of(",")); + return stages; +} + // caffe commands to call by // caffe // @@ -150,25 +182,31 @@ int train() { CHECK(!FLAGS_snapshot.size() || !FLAGS_weights.size()) << "Give a snapshot to resume training or weights to finetune " "but not both."; + vector stages = get_stages_from_flags(); caffe::SolverParameter solver_param = caffe::ReadSolverParamsFromTextFileOrDie(FLAGS_solver); + solver_param.mutable_train_state()->set_level(FLAGS_level); + for (int i = 0; i < stages.size(); i++) { + solver_param.mutable_train_state()->add_stage(stages[i]); + } + // If the gpus flag is not provided, allow the mode and device to be set // in the solver prototxt. 
if (FLAGS_gpu.size() == 0 + && solver_param.has_solver_mode() && solver_param.solver_mode() == caffe::SolverParameter_SolverMode_GPU) { if (solver_param.has_device_id()) { - FLAGS_gpu = "" + - boost::lexical_cast(solver_param.device_id()); + FLAGS_gpu = std::to_string(solver_param.device_id()); } else { // Set default GPU if unspecified - FLAGS_gpu = "" + boost::lexical_cast(0); + FLAGS_gpu = std::to_string(0); } } // Read flags for list of GPUs vector gpus; get_gpus(&gpus); - caffe::GPUMemory::Scope gpu_memory_scope(gpus); + // Set mode and device id[s] if (gpus.size() == 0) { LOG(INFO) << "Use CPU."; @@ -179,19 +217,17 @@ int train() { s << (i ? ", " : "") << gpus[i]; } + caffe::GPUMemory::Scope gpu_memory_scope(gpus); + LOG(INFO) << "Using GPUs " << s.str(); - int dev_id = 0; cudaDeviceProp device_prop; for (int i = 0; i < gpus.size(); ++i) { cudaGetDeviceProperties(&device_prop, gpus[i]); LOG(INFO) << "GPU " << gpus[i] << ": " << device_prop.name; - if (solver_param.has_device_id() && gpus[i] == solver_param.device_id()) { - dev_id = i; - } } - CUDA_CHECK(cudaSetDevice(gpus[dev_id])); - Caffe::SetDevice(gpus[dev_id]); - solver_param.set_device_id(gpus[dev_id]); + CUDA_CHECK(cudaSetDevice(gpus[0])); + Caffe::SetDevice(gpus[0]); + solver_param.set_device_id(gpus[0]); Caffe::set_mode(Caffe::GPU); Caffe::set_gpus(gpus); Caffe::set_solver_count(gpus.size()); @@ -238,6 +274,7 @@ RegisterBrewFunction(train); int test() { CHECK_GT(FLAGS_model.size(), 0) << "Need a model definition to score."; CHECK_GT(FLAGS_weights.size(), 0) << "Need model weights to score."; + vector stages = get_stages_from_flags(); // Read flags for list of GPUs vector gpus; @@ -265,7 +302,7 @@ int test() { } // Instantiate the caffe net. 
- Net caffe_net(FLAGS_model, caffe::TEST, 0U); + Net caffe_net(FLAGS_model, caffe::TEST, 0U, nullptr, nullptr, false, FLAGS_level, &stages); caffe_net.CopyTrainedLayersFrom(FLAGS_weights); LOG(INFO) << "Running for " << FLAGS_iterations << " iterations."; @@ -313,9 +350,186 @@ int test() { } RegisterBrewFunction(test); + +// Test: score a detection model. +int test_detection() { + typedef float Dtype; + CHECK_GT(FLAGS_model.size(), 0) << "Need a model definition to score."; + CHECK_GT(FLAGS_weights.size(), 0) << "Need model weights to score."; + + // Read flags for list of GPUs + vector gpus; + get_gpus(&gpus); + while (gpus.size() > 1) { + // Only use one GPU + LOG(INFO) << "Not using GPU #" << gpus.back() << " for single-GPU function"; + gpus.pop_back(); + } + if (gpus.size() > 0) { + Caffe::SetDevice(gpus[0]); + } + caffe::GPUMemory::Scope gpu_memory_scope(gpus); + + // Set mode and device id + if (gpus.size() != 0) { + LOG(INFO) << "Use GPU with device ID " << gpus[0]; + cudaDeviceProp device_prop; + cudaGetDeviceProperties(&device_prop, gpus[0]); + LOG(INFO) << "GPU device name: " << device_prop.name; + Caffe::set_mode(Caffe::GPU); + } else { + LOG(INFO) << "Use CPU."; + Caffe::set_mode(Caffe::CPU); + } + + // Instantiate the caffe net. 
+ Net caffe_net(FLAGS_model, caffe::TEST, 0U); + caffe_net.CopyTrainedLayersFrom(FLAGS_weights); + LOG(INFO) << "Running for " << FLAGS_iterations << " iterations."; + + std::map > > > all_true_pos; + std::map > > > all_false_pos; + std::map > all_num_pos; + + vector test_score_output_id; + vector test_score; + float loss = 0; + for (int i = 0; i < FLAGS_iterations; ++i) { + float iter_loss; + const vector& result = + caffe_net.Forward(&iter_loss); + loss += iter_loss; + int idx = 0; + for (int j = 0; j < result.size(); ++j) { + const float* result_vec = result[j]->cpu_data(); + for (int k = 0; k < result[j]->count(); ++k, ++idx) { + const float score = result_vec[k]; + if (i == 0) { + test_score.push_back(score); + test_score_output_id.push_back(j); + } else { + test_score[idx] += score; + } + const std::string& output_name = caffe_net.blob_names()[ + caffe_net.output_blob_indices()[j]]; + LOG(INFO) << "Batch " << i << ", " << output_name << " = " << score; + } + } + + //To compute mAP + for (int j = 0; j < result.size(); ++j) { + CHECK_EQ(result[j]->width(), 5); + const Dtype* result_vec = result[j]->cpu_data(); + int num_det = result[j]->height(); + for (int k = 0; k < num_det; ++k) { + int item_id = static_cast(result_vec[k * 5]); + int label = static_cast(result_vec[k * 5 + 1]); + if (item_id == -1) { + // Special row of storing number of positives for a label. + if (all_num_pos[j].find(label) == all_num_pos[j].end()) { + all_num_pos[j][label] = static_cast(result_vec[k * 5 + 2]); + } else { + all_num_pos[j][label] += static_cast(result_vec[k * 5 + 2]); + } + } else { + // Normal row storing detection status. + float score = result_vec[k * 5 + 2]; + int tp = static_cast(result_vec[k * 5 + 3]); + int fp = static_cast(result_vec[k * 5 + 4]); + if (tp == 0 && fp == 0) { + // Ignore such case. It happens when a detection bbox is matched to + // a difficult gt bbox and we don't evaluate on difficult gt bbox. 
+ continue; + } + all_true_pos[j][label].push_back(std::make_pair(score, tp)); + all_false_pos[j][label].push_back(std::make_pair(score, fp)); + } + } + } + } + loss /= FLAGS_iterations; + LOG(INFO) << "Loss: " << loss; + + for (int i = 0; i < test_score.size(); ++i) { + int test_score_output_id_value = test_score_output_id[i]; + const vector& output_blob_indices = caffe_net.output_blob_indices(); + const vector& blob_names = caffe_net.blob_names(); + const vector& blob_loss_weights = caffe_net.blob_loss_weights(); + if (test_score_output_id_value < output_blob_indices.size()) { + int blob_index = output_blob_indices[test_score_output_id_value]; + if (blob_index < blob_names.size() && blob_index < blob_loss_weights.size()) { + const std::string& output_name = blob_names[blob_index]; + const float loss_weight = blob_loss_weights[blob_index]; + std::ostringstream loss_msg_stream; + const float mean_score = test_score[i] / FLAGS_iterations; + if (loss_weight) { + loss_msg_stream << " (* " << loss_weight + << " = " << (loss_weight * mean_score) << " loss)"; + } + LOG(INFO) << output_name << " = " << mean_score << loss_msg_stream.str(); + } + } + } + + //To compute mAP + for (int i = 0; i < all_true_pos.size(); ++i) { + if (all_true_pos.find(i) == all_true_pos.end()) { + LOG(FATAL) << "Missing output_blob true_pos: " << i; + } + const std::map > >& true_pos = + all_true_pos.find(i)->second; + if (all_false_pos.find(i) == all_false_pos.end()) { + LOG(FATAL) << "Missing output_blob false_pos: " << i; + } + const std::map > >& false_pos = + all_false_pos.find(i)->second; + if (all_num_pos.find(i) == all_num_pos.end()) { + LOG(FATAL) << "Missing output_blob num_pos: " << i; + } + const std::map& num_pos = all_num_pos.find(i)->second; + std::map APs; + float mAP = 0.; + // Sort true_pos and false_pos with descend scores. 
+ for (std::map::const_iterator it = num_pos.begin(); + it != num_pos.end(); ++it) { + int label = it->first; + int label_num_pos = it->second; + if (true_pos.find(label) == true_pos.end()) { + LOG(WARNING) << "Missing true_pos for label: " << label; + continue; + } + const vector >& label_true_pos = + true_pos.find(label)->second; + if (false_pos.find(label) == false_pos.end()) { + LOG(WARNING) << "Missing false_pos for label: " << label; + continue; + } + const vector >& label_false_pos = + false_pos.find(label)->second; + vector prec, rec; + caffe::ComputeAP(label_true_pos, label_num_pos, label_false_pos, + FLAGS_ap_version, &prec, &rec, &(APs[label])); + mAP += APs[label]; + if (FLAGS_show_per_class_result) { + LOG(INFO) << "class AP " << label << ": " << APs[label]; + } + } + mAP /= num_pos.size(); + const int output_blob_index = caffe_net.output_blob_indices()[i]; + const string& output_name = caffe_net.blob_names()[output_blob_index]; + LOG(INFO) << "Test net output mAP #" << i << ": " << output_name << " = " << mAP; + } + return 0; +} +RegisterBrewFunction(test_detection); + + // Time: benchmark the execution time of a model. 
int time() { CHECK_GT(FLAGS_model.size(), 0) << "Need a model definition to time."; + caffe::Phase phase = get_phase_from_flags(caffe::TRAIN); + vector stages = get_stages_from_flags(); + vector gpus; // Read flags for list of GPUs get_gpus(&gpus); @@ -350,6 +564,13 @@ int time() { solver_param.set_random_seed(1371LL); solver_param.set_test_interval(FLAGS_iterations + 1); solver_param.set_display(0); + + solver_param.mutable_train_state()->set_level(FLAGS_level); + for (int i = 0; i < stages.size(); i++) { + solver_param.mutable_train_state()->add_stage(stages[i]); + } + solver_param.mutable_net_param()->mutable_state()->set_phase(phase); + shared_ptr solver(caffe::SolverRegistry::CreateSolver(solver_param)); shared_ptr caffe_net = solver->net(); @@ -442,14 +663,19 @@ int main(int argc, char** argv) { " test score a model\n" " device_query show GPU diagnostic information\n" " time benchmark model execution time"); + + std::ostringstream os; + os << std::endl; + for (int n = 0; n < argc; ++n) { + os << "[" << n << "]: " << argv[n] << std::endl; + } // Run tool or show usage. caffe::GlobalInit(&argc, &argv); vector gpus; get_gpus(&gpus); - if (gpus.size() > 0) { - Caffe::SetDevice(gpus[0]); - } + Caffe::SetDevice(gpus.size() > 0 ? 
gpus[0] : 0); + Caffe::set_gpus(gpus); LOG(INFO) << "This is NVCaffe " << Caffe::caffe_version() << " started at " << Caffe::start_time(); @@ -457,6 +683,7 @@ int main(int argc, char** argv) { LOG(INFO) << "CuBLAS version: " << Caffe::cublas_version(); LOG(INFO) << "CUDA version: " << Caffe::cuda_version(); LOG(INFO) << "CUDA driver version: " << Caffe::cuda_driver_version(); + LOG(INFO) << "Arguments: " << os.str(); if (argc == 2) { #ifdef WITH_PYTHON_LAYER @@ -470,7 +697,7 @@ int main(int argc, char** argv) { #endif return GetBrewFunction(caffe::string(argv[1]))(); #ifdef WITH_PYTHON_LAYER - } catch (bp::error_already_set) { + } catch (bp::error_already_set&) { PyErr_Print(); return 1; } diff --git a/tools/convert_annoset.cpp b/tools/convert_annoset.cpp new file mode 100644 index 00000000000..1429dd57b98 --- /dev/null +++ b/tools/convert_annoset.cpp @@ -0,0 +1,203 @@ +// This program converts a set of images and annotations to a lmdb/leveldb by +// storing them as AnnotatedDatum proto buffers. +// Usage: +// convert_annoset [FLAGS] ROOTFOLDER/ LISTFILE DB_NAME +// +// where ROOTFOLDER is the root folder that holds all the images and +// annotations, and LISTFILE should be a list of files as well as their labels +// or label files. +// For classification task, the file should be in the format as +// imgfolder1/img1.JPEG 7 +// .... +// For detection task, the file should be in the format as +// imgfolder1/img1.JPEG annofolder1/anno1.xml +// .... 
+ +#include +#include // NOLINT(readability/streams) +#include +#include +#include +#include + +#include "boost/scoped_ptr.hpp" +#include "boost/variant.hpp" +#include "gflags/gflags.h" +#include "glog/logging.h" + +#include "caffe/proto/caffe.pb.h" +#include "caffe/util/db.hpp" +#include "caffe/util/format.hpp" +#include "caffe/util/io.hpp" +#include "caffe/util/rng.hpp" + +using namespace caffe; // NOLINT(build/namespaces) +using std::pair; +using boost::scoped_ptr; + +DEFINE_bool(gray, false, + "When this option is on, treat images as grayscale ones"); +DEFINE_bool(shuffle, false, + "Randomly shuffle the order of images and their labels"); +DEFINE_string(backend, "lmdb", + "The backend {lmdb, leveldb} for storing the result"); +DEFINE_string(anno_type, "classification", + "The type of annotation {classification, detection}."); +DEFINE_string(label_type, "xml", + "The type of annotation file format."); +DEFINE_string(label_map_file, "", + "A file with LabelMap protobuf message."); +DEFINE_bool(check_label, false, + "When this option is on, check that there is no duplicated name/label."); +DEFINE_int32(min_dim, 0, + "Minimum dimension images are resized to (keep same aspect ratio)"); +DEFINE_int32(max_dim, 0, + "Maximum dimension images are resized to (keep same aspect ratio)"); +DEFINE_int32(resize_width, 0, "Width images are resized to"); +DEFINE_int32(resize_height, 0, "Height images are resized to"); +DEFINE_bool(check_size, false, + "When this option is on, check that all the datum have the same size"); +DEFINE_bool(encoded, false, + "When this option is on, the encoded image will be save in datum"); +DEFINE_string(encode_type, "", + "Optional: What type should we encode the image as ('png','jpg',...)."); + +int main(int argc, char** argv) { + ::google::InitGoogleLogging(argv[0]); + // Print output to stderr (while still logging) + FLAGS_alsologtostderr = 1; + +#ifndef GFLAGS_GFLAGS_H_ + namespace gflags = google; +#endif + + gflags::SetUsageMessage("Convert 
a set of images and annotations to the " + "leveldb/lmdb format used as input for Caffe.\n" + "Usage:\n" + " convert_annoset [FLAGS] ROOTFOLDER/ LISTFILE DB_NAME\n"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + if (argc < 4) { + gflags::ShowUsageWithFlagsRestrict(argv[0], "tools/convert_annoset"); + return 1; + } + + const bool is_color = !FLAGS_gray; + const bool check_size = FLAGS_check_size; + const bool encoded = FLAGS_encoded; + const string encode_type = FLAGS_encode_type; + const string anno_type = FLAGS_anno_type; + AnnotatedDatum_AnnotationType type; + const string label_type = FLAGS_label_type; + const string label_map_file = FLAGS_label_map_file; + const bool check_label = FLAGS_check_label; + std::map name_to_label; + + std::ifstream infile(argv[2]); + std::vector > > lines; + std::string filename; + int label; + std::string labelname; + if (anno_type == "classification") { + while (infile >> filename >> label) { + lines.push_back(std::make_pair(filename, label)); + } + } else if (anno_type == "detection") { + type = AnnotatedDatum_AnnotationType_BBOX; + LabelMap label_map; + CHECK(ReadProtoFromTextFile(label_map_file, &label_map)) + << "Failed to read label map file."; + CHECK(MapNameToLabel(label_map, check_label, &name_to_label)) + << "Failed to convert name to label."; + while (infile >> filename >> labelname) { + lines.push_back(std::make_pair(filename, labelname)); + } + } + if (FLAGS_shuffle) { + // randomly shuffle data + LOG(INFO) << "Shuffling data"; + shuffle(lines.begin(), lines.end()); + } + LOG(INFO) << "A total of " << lines.size() << " images."; + + if (encode_type.size() && !encoded) + LOG(INFO) << "encode_type specified, assuming encoded=true."; + + int min_dim = std::max(0, FLAGS_min_dim); + int max_dim = std::max(0, FLAGS_max_dim); + int resize_height = std::max(0, FLAGS_resize_height); + int resize_width = std::max(0, FLAGS_resize_width); + + // Create new DB + scoped_ptr db(db::GetDB(FLAGS_backend)); + 
db->Open(argv[3], db::NEW); + scoped_ptr txn(db->NewTransaction()); + + // Storing to db + std::string root_folder(argv[1]); + AnnotatedDatum anno_datum; + Datum* datum = anno_datum.mutable_datum(); + int count = 0; + int data_size = 0; + bool data_size_initialized = false; + + for (int line_id = 0; line_id < lines.size(); ++line_id) { + bool status = true; + std::string enc = encode_type; + if (encoded && !enc.size()) { + // Guess the encoding type from the file name + string fn = lines[line_id].first; + size_t p = fn.rfind('.'); + if ( p == fn.npos ) + LOG(WARNING) << "Failed to guess the encoding of '" << fn << "'"; + enc = fn.substr(p); + std::transform(enc.begin(), enc.end(), enc.begin(), ::tolower); + } + filename = root_folder + lines[line_id].first; + if (anno_type == "classification") { + label = boost::get(lines[line_id].second); + status = ReadImageToDatum(filename, label, resize_height, resize_width, + min_dim, max_dim, is_color, enc, datum); + } else if (anno_type == "detection") { + labelname = root_folder + boost::get(lines[line_id].second); + status = ReadRichImageToAnnotatedDatum(filename, labelname, resize_height, + resize_width, min_dim, max_dim, is_color, enc, type, label_type, + name_to_label, &anno_datum); + anno_datum.set_type(AnnotatedDatum_AnnotationType_BBOX); + } + if (status == false) { + LOG(WARNING) << "Failed to read " << lines[line_id].first; + continue; + } + if (check_size) { + if (!data_size_initialized) { + data_size = datum->channels() * datum->height() * datum->width(); + data_size_initialized = true; + } else { + const std::string& data = datum->data(); + CHECK_EQ(data.size(), data_size) << "Incorrect data field size " + << data.size(); + } + } + // sequential + string key_str = caffe::format_int(line_id, 8) + "_" + lines[line_id].first; + + // Put in db + string out; + CHECK(anno_datum.SerializeToString(&out)); + txn->Put(key_str, out); + + if (++count % 1000 == 0) { + // Commit db + txn->Commit(); + 
txn.reset(db->NewTransaction()); + LOG(INFO) << "Processed " << count << " files."; + } + } + // write the last batch + if (count % 1000 != 0) { + txn->Commit(); + LOG(INFO) << "Processed " << count << " files."; + } + return 0; +} diff --git a/tools/create_label_map.cpp b/tools/create_label_map.cpp new file mode 100644 index 00000000000..4e16e6f6aa8 --- /dev/null +++ b/tools/create_label_map.cpp @@ -0,0 +1,65 @@ +// This program reads in pairs label names and optionally ids and display names +// and store them in LabelMap proto buffer. +// Usage: +// create_label_map [FLAGS] MAPFILE OUTFILE +// where MAPFILE is a list of label names and optionally label ids and +// displaynames, and OUTFILE stores the information in LabelMap proto. +// Example: +// ./build/tools/create_label_map --delimiter=" " --include_background=true +// data/VOC2007/map.txt data/VOC2007/labelmap_voc.prototxt +// The format of MAPFILE is like following: +// class1 [1] [someclass1] +// ... +// The format of OUTFILE is like following: +// item { +// name: "class1" +// label: 1 +// display_name: "someclass1" +// } +// ... 
+ +#include // NOLINT(readability/streams) +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" + +#include "caffe/proto/caffe.pb.h" +#include "caffe/util/io.hpp" + +using namespace caffe; // NOLINT(build/namespaces) + +DEFINE_bool(include_background, false, + "When this option is on, include none_of_the_above as class 0."); +DEFINE_string(delimiter, " ", + "The delimiter used to separate fields in label_map_file."); + +int main(int argc, char** argv) { + ::google::InitGoogleLogging(argv[0]); + // Print output to stderr (while still logging) + FLAGS_alsologtostderr = 1; + +#ifndef GFLAGS_GFLAGS_H_ + namespace gflags = google; +#endif + + gflags::SetUsageMessage("Read in pairs label names and optionally ids and " + "display names and store them in LabelMap proto buffer.\n" + "Usage:\n" + " create_label_map [FLAGS] MAPFILE OUTFILE\n"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + if (argc < 3) { + gflags::ShowUsageWithFlagsRestrict(argv[0], "tools/create_label_map"); + return 1; + } + + const bool include_background = FLAGS_include_background; + const string delimiter = FLAGS_delimiter; + + const string& map_file = argv[1]; + LabelMap label_map; + ReadLabelFileToLabelMap(map_file, include_background, delimiter, &label_map); + + WriteProtoToTextFile(label_map, argv[2]); +} diff --git a/tools/get_image_size.cpp b/tools/get_image_size.cpp new file mode 100644 index 00000000000..555d3e38a88 --- /dev/null +++ b/tools/get_image_size.cpp @@ -0,0 +1,109 @@ +// This program retrieves the sizes of a set of images. +// Usage: +// get_image_size [FLAGS] ROOTFOLDER/ LISTFILE OUTFILE +// +// where ROOTFOLDER is the root folder that holds all the images and +// annotations, and LISTFILE should be a list of files as well as their labels +// or label files. +// For classification task, the file should be in the format as +// imgfolder1/img1.JPEG 7 +// .... 
+// For detection task, the file should be in the format as +// imgfolder1/img1.JPEG annofolder1/anno1.xml +// .... + +#include // NOLINT(readability/streams) +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" + +#include "caffe/util/io.hpp" + +using namespace caffe; // NOLINT(build/namespaces) + +DEFINE_string(name_id_file, "", + "A file which maps image_name to image_id."); + +int main(int argc, char** argv) { + ::google::InitGoogleLogging(argv[0]); + // Print output to stderr (while still logging) + FLAGS_alsologtostderr = 1; + +#ifndef GFLAGS_GFLAGS_H_ + namespace gflags = google; +#endif + + gflags::SetUsageMessage("Get sizes of a set of images.\n" + "Usage:\n" + " get_image_size ROOTFOLDER/ LISTFILE OUTFILE\n"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + if (argc < 4) { + gflags::ShowUsageWithFlagsRestrict(argv[0], "tools/get_image_size"); + return 1; + } + + std::ifstream infile(argv[2]); + if (!infile.good()) { + LOG(FATAL) << "Failed to open file: " << argv[2]; + } + std::vector > lines; + std::string filename, label; + while (infile >> filename >> label) { + lines.push_back(std::make_pair(filename, label)); + } + infile.close(); + LOG(INFO) << "A total of " << lines.size() << " images."; + + const string name_id_file = FLAGS_name_id_file; + std::map map_name_id; + if (!name_id_file.empty()) { + std::ifstream nameidfile(name_id_file.c_str()); + if (!nameidfile.good()) { + LOG(FATAL) << "Failed to open name_id_file: " << name_id_file; + } + std::string name; + int id; + while (nameidfile >> name >> id) { + CHECK(map_name_id.find(name) == map_name_id.end()); + map_name_id[name] = id; + } + CHECK_EQ(map_name_id.size(), lines.size()); + } + + // Storing to outfile + boost::filesystem::path root_folder(argv[1]); + std::ofstream outfile(argv[3]); + if (!outfile.good()) { + LOG(FATAL) << "Failed to open file: " << argv[3]; + } + int height, width; + int count = 0; + for (int line_id = 0; line_id < 
lines.size(); ++line_id) { + boost::filesystem::path img_file = root_folder / lines[line_id].first; + GetImageSize(img_file.string(), &height, &width); + std::string img_name = img_file.stem().string(); + if (map_name_id.size() == 0) { + outfile << img_name << " " << height << " " << width << std::endl; + } else { + CHECK(map_name_id.find(img_name) != map_name_id.end()); + int img_id = map_name_id.find(img_name)->second; + outfile << img_id << " " << height << " " << width << std::endl; + } + + if (++count % 1000 == 0) { + LOG(INFO) << "Processed " << count << " files."; + } + } + // write the last batch + if (count % 1000 != 0) { + LOG(INFO) << "Processed " << count << " files."; + } + outfile.flush(); + outfile.close(); + return 0; +}