diff --git a/.gitignore b/.gitignore
index 77bc9368..f0c7a1a5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 *.lo
 *.o
 *.obj
+foo.dat
 
 # Precompiled Headers
 *.gch
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ace4828f..6cc4399a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,12 +13,25 @@ if (NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." FORCE)
 endif()
 
-OPTION(LIBJPEG_AVAILABLE "Allows native loading of jpegs, via manifest file." ON)
-OPTION(DEV_RUN_COG "DeepCL maintainers only, otherwise set to 'OFF'." OFF)
-OPTION(PROTOTYPING "Only for devs." OFF)
-    mark_as_advanced( FORCE PROTOTYPING )
-OPTION(LIBPNGPP_AVAILABLE "Some toys/samples only." OFF)
-    mark_as_advanced(FORCE LIBPNGPP_AVAILABLE)
+#OPTION(BUILD_PYTHON_WRAPPERS "Build Python wrappers. Needs Python." ON)
+OPTION(BUILD_JPEG_SUPPORT "Allows native loading of jpegs, via manifest file." ON)
+OPTION(BUILD_INTERNAL_LUA "If using from Lua, set to 'OFF'" ON)
+OPTION(MAINTAINER_OPTIONS "Show maintainer options" OFF)
+
+if(MAINTAINER_OPTIONS)
+    OPTION(BUILD_PYTHON_WRAPPERS "Build python wrappers. Maintainers only." OFF)
+    OPTION(DEV_RUN_COG "DeepCL maintainers only, otherwise set to 'OFF'." OFF)
+    OPTION(PROTOTYPING "Only for devs." OFF)
+#    mark_as_advanced( FORCE PROTOTYPING )
+    OPTION(LIBPNGPP_AVAILABLE "Some toys/samples only." OFF)
+#    mark_as_advanced(FORCE LIBPNGPP_AVAILABLE)
+else()
+    unset(DEV_RUN_COG CACHE)
+    unset(DEV_RUN_CYTHON CACHE)
+    unset(BUILD_PYTHON_WRAPPERS CACHE)
+    unset(LIBPNGPP_AVAILABLE CACHE)
+    unset(PROTOTYPING CACHE)
+endif()
 
 IF(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
    SET(CMAKE_INSTALL_PREFIX
@@ -27,9 +40,10 @@ IF(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
 ENDIF(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
 
 # remove old flags from the cache:
+unset(LIBJPEG_AVAILABLE CACHE)
+unset(PROVIDE_LUA_ENGINE CACHE)
 unset(BUILD_PYSWIG_WRAPPERS CACHE)
 unset(BUILD_LUA_WRAPPERS CACHE)
-unset(BUILD_PYTHON_WRAPPERS CACHE)
 unset(RUN_COG CACHE)
 unset(RUN_SWIG CACHE)
 unset(DEV_RUN_SWIG CACHE)
@@ -42,7 +56,7 @@ unset(LUA_INCLUDE_DIR CACHE)
 
 # SET(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/dist" CACHE STRING "Installation directory." FORCE)
-SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
+SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
 INCLUDE("${CMAKE_MODULE_PATH}/build_EasyCL.cmake")
 INCLUDE_DIRECTORIES(${EASYCL_INCLUDE_DIRS})
@@ -57,7 +71,6 @@ include_directories( src )
 include_directories( qlearning )
 
 if( ON_LINUX )
-#    target_link_libraries(EasyCL dl)
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x -Wall")
 endif()
 
@@ -65,7 +78,7 @@ if( ON_WINDOWS)
     link_libraries(winmm) # needed for timeGetTime
 endif()
 
-set(dirs activate batch clmath conv dropout fc forcebackprop input layer loaders
+set(dirs clblas activate batch clmath conv dropout fc forcebackprop input layer loaders
     loss net netdef normalize patches pooling trainers util weights qlearning )
 
 foreach(dir ${dirs})
@@ -75,6 +88,7 @@ foreach(dir ${dirs})
         set( deepcl_sources ${deepcl_sources} src/${dir}/${source})
     endforeach()
 endforeach()
+set(deepcl_sources ${deepcl_sources} src/DeepCL.cpp src/CppRuntimeBoundary.cpp)
 
 #message("deepcl_sources ${deepcl_sources}")
 
 #find_package(Lua51)
@@ -141,7 +155,7 @@ if( LIBPNGPP_AVAILABLE)
     target_link_libraries(deepcl_unittests ${PNG_LIBRARY})
 endif(LIBPNGPP_AVAILABLE)
 
-link_libraries(DeepCL)
+#link_libraries(DeepCL)
 
 if(DEV_RUN_COG)
     foreach( dir ${dirs} main)
@@ -149,7 +163,7 @@ if(DEV_RUN_COG)
     endforeach()
     add_custom_target( cog
-        python ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/cogapp/cog.py -q -I ${CMAKE_CURRENT_SOURCE_DIR}/cog-batteries -r ${cog_dirs} ${CMAKE_CURRENT_SOURCE_DIR}/test/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test/*.h ${CMAKE_CURRENT_SOURCE_DIR}/prototyping/*.cpp
+        python ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/cogapp/cog.py -q -I ${CMAKE_CURRENT_SOURCE_DIR}/cog-batteries -r ${cog_dirs} ${CMAKE_CURRENT_SOURCE_DIR}/test/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test/*.h ${CMAKE_CURRENT_SOURCE_DIR}/prototyping/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/*.h
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} )
     add_dependencies( DeepCL cog )
@@ -157,7 +171,9 @@ endif(DEV_RUN_COG)
 
 #ADD_DEPENDENCIES(DeepCL clBLAS)
 
-set( UNITTEST_SOURCES test/testupdateweights.cpp test/testforward.cpp test/testfilehelper.cpp
+set( UNITTEST_SOURCES test/testClBlas.cpp
+    test/testDeepCL.cpp
+    test/testupdateweights.cpp test/testforward.cpp test/testfilehelper.cpp
     test/testsimpleconvolvenet.cpp test/testlogicaloperators.cpp
     test/testbackward.cpp test/testsinglebatch.cpp test/testpoolingforward.cpp
     test/testpoolingbackward.cpp test/testNorbLoader.cpp
@@ -173,6 +189,7 @@ set( UNITTEST_SOURCES test/testupdateweights.cpp test/testforward.cpp test/testf
 if(LIBJPEG_AVAILABLE)
     set(UNITTEST_SOURCES ${UNITTEST_SOURCES} test/testjpeghelper.cpp)
     add_executable( mnist-to-jpegs test/mnist-to-jpegs.cpp src/util/stringhelper.cpp src/loaders/MnistLoader.cpp )
+    target_link_libraries(mnist-to-jpegs DeepCL)
 endif(LIBJPEG_AVAILABLE)
 
 add_executable( deepcl_train src/main/train.cpp src/util/stringhelper.cpp )
@@ -183,6 +200,10 @@ add_executable( prepare-norb test/prepare-norb.cpp src/util/stringhelper.cpp )
 add_executable( mnist-to-floats test/mnist-to-floats.cpp src/util/stringhelper.cpp )
 add_executable( mnist-to-pipe test/mnist-to-pipe.cpp src/util/stringhelper.cpp )
 
+foreach(exe deepcl_train deepcl_predict cifar-to-mat prepare-norb mnist-to-floats mnist-to-pipe)
+    target_link_libraries(${exe} DeepCL)
+endforeach()
+
 #target_link_libraries( cifar-to-mat ${LUA_LIBRARIES} )
 
 if( LIBPNGPP_AVAILABLE)
@@ -190,6 +211,10 @@ if( LIBPNGPP_AVAILABLE)
     add_executable( testpatchextractor test/testPatchExtractor.cpp src/util/stringhelper.cpp
        src/patches/PatchExtractor.cpp )
     add_executable( testtranslator test/testTranslator.cpp src/util/stringhelper.cpp src/patches/Translator.cpp )
     add_executable( testgenericloader test/testGenericLoader.cpp src/util/stringhelper.cpp )
+
+    foreach(exe testpatchextractor testtranslator testgenericloader)
+        target_link_libraries(${exe} DeepCL)
+    endforeach()
 endif(LIBPNGPP_AVAILABLE)
 
 if( ON_LINUX )
@@ -215,9 +240,17 @@ target_link_libraries( deepcl_unittests DeepCL )
 add_executable( testgtestsupp test/testgtestsupp.cpp thirdparty/gtest/gtest_main.cc )
 target_link_libraries( testgtestsupp deepcl_gtest )
 
+if(BUILD_PYTHON_WRAPPERS)
+    add_subdirectory(python)
+#    add_dependencies(PyDeepCL DeepCL)
+#    target_link_libraries(PyDeepCL DeepCL)
+endif()
+
 INSTALL( DIRECTORY src/ DESTINATION include/deepcl FILES_MATCHING PATTERN *.h )
+INSTALL(PROGRAMS src/activate.sh DESTINATION bin)
+INSTALL(PROGRAMS src/activate.bat DESTINATION bin)
 #INSTALL( DIRECTORY EasyCL/ DESTINATION include/easycl FILES_MATCHING PATTERN *.h )
-INSTALL( TARGETS DeepCL deepcl_train deepcl_predict deepcl_unittests
+INSTALL( TARGETS DeepCL deepcl_train deepcl_predict deepcl_unittests deepcl_gtest
     RUNTIME DESTINATION bin
     ARCHIVE DESTINATION lib
     LIBRARY DESTINATION lib )
diff --git a/EasyCL b/EasyCL
index ea5d0a02..9b74d95a 160000
--- a/EasyCL
+++ b/EasyCL
@@ -1 +1 @@
-Subproject commit ea5d0a0216b06e21cef642a2bfdfa2cc1d2ef526
+Subproject commit 9b74d95ace3c2eecbaf3df5773350ea89e2a5192
diff --git a/README.md b/README.md
index 03f0606c..f2926d0e 100644
--- a/README.md
+++ b/README.md
@@ -76,34 +76,57 @@ Multicolumn net also possible, as in [McDnn](http://arxiv.org/pdf/1202.2745.pdf)
 - obtained 99.5% test accuracy on MNIST, using `netdef=rt2-8c5z-relu-mp2-16c5z-relu-mp3-150n-tanh-10n numepochs=20 multinet=6 learningrate=0.002`
 - epoch time 99.8 seconds, using an Amazon GPU instance, ie half an NVidia GRID K520 GPU (since we are learning 6 nets in parallel, so 16.6seconds per epoch per net)
 
-# Releases
+# Installation
 
-* Stable: here
-* Unstable/dev: please see [8.x](https://github.com/hughperkins/DeepCL/tree/8.x)
+## Native library installation
 
-Unstable/dev is working on adding im2col, which promises faster convolution for larger layers, ie layers 1 and 2 in [Soumith's benchmarks](https://github.com/soumith/convnet-benchmarks)
+This section installs the native libraries and the command-line tools. You always need to do this part, even if you will use the Python wrappers.
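+
+For example, on linux, the whole sequence looks something like this (a condensed sketch of the steps detailed below; the tar filename here is the v8.0.0rc4 one mentioned under 'Recent changes', so substitute whichever release you actually download):
+```
+wget http://deepcl.hughperkins.com/Downloads/deepcl-linux64-v8.0.0rc4.tar.bz2
+tar -xf deepcl-linux64-v8.0.0rc4.tar.bz2
+source dist/bin/activate.sh
+deepcl_unittests
+```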
-# To install
+### Windows
 
-## Python
+#### Pre-requisites:
 
-* For python, please use [Python API](python/README.md), or use [pip](https://pypi.python.org/pypi/DeepCL)
+* OpenCL-enabled GPU or APU, along with appropriate OpenCL driver installed
+* Tested using Windows 7
 
-## Commandline tools, and c++ libraries
+#### Procedure:
 
-### Windows
+* Download latest binary zip file from http://deepcl.hughperkins.com/Downloads/
+* unzip it, which creates the `dist` folder
+* To use it:
+  * open a cmd
+  * run `call dist\bin\activate.bat` (adjusting the path appropriately for wherever you downloaded the deepcl binaries to)
+  * now, eg try `deepcl_unittests`
+
+Note that you need to "activate" the installation each time you open a new cmd prompt (or you could add appropriate environment variables permanently, using Control Panel | System | Advanced System Settings | Environment Variables)
+
+### Linux
+
+#### Pre-requisites:
+
+* OpenCL-enabled GPU or APU, along with appropriate OpenCL driver installed (can check by running `clinfo`, which should show your desired GPU device)
+* Tested using Ubuntu 14.04 32-bit/64-bit
+
+#### Procedure:
 
-Pre-built binaries are available for Windows. In order to use them you need:
-* An OpenCL driver for your GPU
-* A recent release with Windows binaries is [v5.5.0](https://github.com/hughperkins/DeepCL/releases/tag/v5.5.0)
+* Download latest tar file from http://deepcl.hughperkins.com/Downloads/
+* untar it, which creates the `dist` sub-folder
+* in a bash prompt, run `source dist/bin/activate.sh` (adjusting the path appropriately for wherever you untarred the binaries tar file to)
+* test by doing, from the same bash prompt, eg `deepcl_unittests`
 
-### linux
+Note that you need to "activate" the installation each time you open a new bash prompt (or you can call activate.sh from your `.bashrc` file)
 
-Pre-build binaries are available for linux. In order to use them you need:
-* An OpenCL driver for your GPU
-* A recent release with linux binaries is [v5.5.0](https://github.com/hughperkins/DeepCL/releases/tag/v5.5.0)
+## Python wrappers
 
-If the binaries dont work on your distribution, please [build from source](doc/Build.md)
+* make sure you have already installed the native library, and "activated" it, by doing `call dist\bin\activate.bat`, or `source dist/bin/activate.sh`
+* run `pip install --pre DeepCL`
+* test by doing `python -c "import PyDeepCL; cl = PyDeepCL.DeepCL()"`
+
+## To build from source
+
+Building from source is only needed if installing from binaries doesn't work for your configuration, or if you want to modify DeepCL.
+
+See [Build.md](doc/Build.md)
 
 ## What if it doesn't run?
@@ -137,11 +160,8 @@ Related projects
 ================
 
 * [kgsgo-dataset-preprocessor](https://github.com/hughperkins/kgsgo-dataset-preprocessor) Dataset based on kgsgo games; 33 million data points
-
-Credits
-=======
-
-* Tambet Matilsen has provided excellent suggestions and feedback on which functionalities to prioritize, and on how to make the website somewhat presentable
+* [cltorch](https://github.com/hughperkins/cltorch)
+* [clnn](https://github.com/hughperkins/clnn)
 
 License
 =======
@@ -151,6 +171,60 @@ License
 
 Recent changes
 ==============
+* Aug 28th:
+  * installation of 8.x from binaries on Windows works now. For example, on 32-bit Windows 7, assuming you have already activated an appropriate python environment, and that 7-zip is installed in its default location (otherwise do the unzip by hand):
+```
+powershell Set-ExecutionPolicy unrestricted
+rem following command is like `wget` in linux:
+powershell.exe -Command (new-object System.Net.WebClient).DownloadFile('http://deepcl.hughperkins.com/Downloads/deepcl-win32-v8.0.0rc8.zip', 'deepcl-win32-v8.0.0rc8.zip')
+rem following command is like `tar -xf` in linux:
+"c:\program files\7-Zip\7z.exe" x deepcl-win32-v8.0.0rc8.zip
+call dist\bin\activate.bat
+pip install --pre DeepCL
+python -c "import PyDeepCL; cl = PyDeepCL.DeepCL()"
+```
+(the last line is just to check it works ok)
+  * merged 8.x branch to master, will release first version of 8.x shortly
+* Aug 26th: installation of 8.x from binaries on linux works now, by doing, eg on 64-bit Ubuntu 14.04:
+```
+mkdir 8.0.0rc4
+cd 8.0.0rc4
+wget http://deepcl.hughperkins.com/Downloads/deepcl-linux64-v8.0.0rc4.tar.bz2
+tar -xf deepcl-linux64-v8.0.0rc4.tar.bz2
+virtualenv env
+source env/bin/activate
+source dist/bin/activate.sh
+pip install --pre DeepCL
+python -c "import PyDeepCL; cl = PyDeepCL.DeepCL()"
+```
+(the last line is just to check it works ok)
+* Aug 21st-24th:
+  * 8.x finally builds again on all supported configurations!
+    * ubuntu 14.04 32-bit Python 2.7
+    * ubuntu 14.04 32-bit Python 3.4
+    * ubuntu 14.04 64-bit Python 2.7
+    * ubuntu 14.04 64-bit Python 3.4
+    * visual studio 2010 32-bit python 2.7
+    * visual studio 2010 32-bit python 3.4
+    * visual studio 2010 64-bit python 2.7
+    * visual studio 2010 64-bit python 3.4
+* Aug 19th-20th:
+  * Python wrappers now built using a very thin setup.py layer, on top of the standard native DeepCL build
+* Aug 18th:
+  * added BackwardIm2Col layer, which uses im2col for backward propagation
+  * added BackpropWeightsIm2Col layer, which uses im2col for weight update
+  * added BackwardAuto layer, which automatically selects the fastest Backward layer
+  * added BackpropWeightsAuto layer, which automatically selects the fastest weight update layer
+  * under the covers:
+    * created ClBlasHelper, to handle Gemm and Gemv
+    * factorized im2col into Im2Col class
+* week up to Aug 17th:
+  * added forward and backward im2col layer
+  * forward im2col automatically used during forward propagation, where appropriate
+  * backwards has yet to be integrated
+  * under the covers:
+    * added clBLAS
+    * migrated the Python build process to use cmake, rather than setup.py (whether this turns out to be good or bad is a bit up in the air for now)
 * June 22nd:
   * removed lua wrappers
   * if you want to use lua with OpenCL, please consider using [cltorch](http://github.com/hughperkins/cltorch) and [clnn](http://github.com/hughperkins/clnn)
@@ -158,5 +232,5 @@ To get in contact
 =================
 
-There is a mailing list at http://lists.hughperkins.com/listinfo.cgi/deepcl-hughperkins.com for discussions, ideas, or just to say 'hi'. You can also just create issues, in github, in the top right of this page.
+Just create an issue, in github, in the top right of this page. Don't worry about whether you think the issue sounds silly or anything. The more feedback the better!
 
diff --git a/cl/BackpropWeights2InOutPairs.cl b/cl/BackpropWeights2InOutPairs.cl
index 6a192883..bbc86e84 100644
--- a/cl/BackpropWeights2InOutPairs.cl
+++ b/cl/BackpropWeights2InOutPairs.cl
@@ -14,8 +14,8 @@
 // eg plane f from each output: 128 * 28 * 28 * 4 = 401KB
 //    plane i from each input: 128 * 28 * 28 * 4 = 401KB
 //    plane i from filter f: 5 * 5 * 4 = 100 bytes...
-//    plane i from all filters: 5 * 5 * 4 * 8 = 800 bytes (ok :-) )
-//    all planes from all filters eg: 5 * 5 * 4 * 8 * 1 = 800 bytes (ok :-) )
+//    plane i from all filters: 5 * 5 * 4 * 8 = 800 bytes (ok :-))
+//    all planes from all filters eg: 5 * 5 * 4 * 8 * 1 = 800 bytes (ok :-))
 //
 // in forward, filter plane i of filter f:
 //    convolves with plane i from each input cube
@@ -33,8 +33,8 @@
 // workgroupId: [outBlockId][inBlockId]
 // localId: [filterId][inputPlane][filterRow][filterCol]
 // per-thread iteration: [n][outputRow][outputCol]
-// local: errorimage: outputImageSize * outputImageSize
-//        imageimage: inputImageSize * inputImageSize
+// local: errorimage: outputSize * outputSize
+//        imageimage: inputSize * inputSize
 void kernel backprop_floats_withscratch_dobias(
         const float learningRateMultiplier, const int batchSize,
         global const float *gradOutput, global const float *images,
@@ -64,44 +64,44 @@ void kernel backprop_floats_withscratch_dobias(
     const int filterCol = localLinearPos % gFilterSize;
 
-    for( int outPlane =
 
     // weights: [outPlane][upstreamPlane][filterRow][filterCol]
     //       aggregate over: [outRow][outCol][n]
     float thiswchange = 0;
 #ifdef BIASED
     float thisbiaschange = 0;
 #endif
-    for( int n = 0; n < batchSize; n++ ) {
-        int upstreamImageGlobalOffset = ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared;
+    for (int n = 0; n < batchSize; n++) {
+        int upstreamImageGlobalOffset = (n * gInputPlanes + upstreamPlane) * gInputSizeSquared;
         // need to fetch the image, but it's bigger than us, so will need to loop...
-        int numLoopsForUpstream = ( gInputImageSizeSquared + workgroupSize - 1 ) / workgroupSize;
+        int numLoopsForUpstream = (gInputSizeSquared + workgroupSize - 1) / workgroupSize;
         barrier(CLK_LOCAL_MEM_FENCE);
-        for( int i = 0; i < numLoopsForUpstream; i++ ) {
+        for (int i = 0; i < numLoopsForUpstream; i++) {
             int thisOffset = i * workgroupSize + localId;
-            if( thisOffset < gInputImageSizeSquared ) {
+            if (thisOffset < gInputSizeSquared) {
                 _imageImage[thisOffset] = images[ upstreamImageGlobalOffset + thisOffset ];
             }
         }
-        int resultImageGlobalOffset = ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared;
-        int numLoopsForOutput = ( gOutputImageSizeSquared + workgroupSize - 1 ) / workgroupSize;
-        for( int i = 0; i < numLoopsForOutput; i++ ) {
+        int resultImageGlobalOffset = (n * gNumFilters + outPlane) * gOutputSizeSquared;
+        int numLoopsForOutput = (gOutputSizeSquared + workgroupSize - 1) / workgroupSize;
+        for (int i = 0; i < numLoopsForOutput; i++) {
             int thisOffset = i * workgroupSize + localId;
-            if( thisOffset < gOutputImageSizeSquared ) {
+            if (thisOffset < gOutputSizeSquared) {
                 _errorImage[thisOffset ] = gradOutput[resultImageGlobalOffset + thisOffset];
             }
         }
         barrier(CLK_LOCAL_MEM_FENCE);
-        if( localId < gFilterSizeSquared ) {
-            for( int outRow = 0; outRow < gOutputImageSize; outRow++ ) {
+        if (localId < gFilterSizeSquared) {
+            for (int outRow = 0; outRow < gOutputSize; outRow++) {
                 int upstreamRow = outRow - gMargin + filterRow;
-                for( int outCol = 0; outCol < gOutputImageSize; outCol++ ) {
+                for (int outCol = 0; outCol < gOutputSize; outCol++) {
                     int upstreamCol = outCol - gMargin + filterCol;
-                    bool proceed = upstreamRow >= 0 && upstreamCol >= 0 && upstreamRow < gInputImageSize
-                        && upstreamCol < gInputImageSize;
-                    if( proceed ) {
-                        int resultIndex = outRow * gOutputImageSize + outCol;
+                    bool proceed = upstreamRow >= 0 && upstreamCol >= 0 && upstreamRow < gInputSize
+                        && upstreamCol < gInputSize;
+                    if (proceed) {
+                        int resultIndex = outRow * gOutputSize + outCol;
                         float error = _errorImage[resultIndex];
-                        int upstreamDataIndex = upstreamRow * gInputImageSize + upstreamCol;
+                        int upstreamDataIndex = upstreamRow * gInputSize + upstreamCol;
                         float upstreamResult = _imageImage[upstreamDataIndex];
                         thiswchange += upstreamResult * error;
 #ifdef BIASED
@@ -112,12 +112,12 @@ void kernel backprop_floats_withscratch_dobias(
             }
         }
     }
-    if( localId < gFilterSizeSquared ) {
+    if (localId < gFilterSizeSquared) {
         weights[ workgroupId * gFilterSizeSquared + localId ] -= learningRateMultiplier * thiswchange;
     }
 #ifdef BIASED
     bool writeBias = upstreamPlane == 0 && localId == 0;
-    if( writeBias ) {
+    if (writeBias) {
         biasWeights[outPlane] -= learningRateMultiplier * thisbiaschange;
     }
 #endif
diff --git a/cl/BackpropWeightsScratch.cl b/cl/BackpropWeightsScratch.cl
index 3414b892..a58deeee 100644
--- a/cl/BackpropWeightsScratch.cl
+++ b/cl/BackpropWeightsScratch.cl
@@ -13,8 +13,8 @@
 // workgroupId: [outputPlane][inputPlane]
 // localId: [filterRow][filterCol]
 // per-thread iteration: [n][outputRow][outputCol]
-// local: errorimage: outputImageSize * outputImageSize
-//        imageimage: inputImageSize * inputImageSize
+// local: errorimage: outputSize * outputSize
+//        imageimage: inputSize * inputSize
 void kernel backprop_floats_withscratch_dobias(
         const float learningRateMultiplier, const int batchSize,
         global const float *gradOutput, global const float *images,
@@ -27,8 +27,8 @@ void kernel backprop_floats_withscratch_dobias(
     const int filterRow = localId / gFilterSize;
     const int filterCol = localId % gFilterSize;
 
-    #define outPlane ( workgroupId / gInputPlanes )
-    #define upstreamPlane ( workgroupId % gInputPlanes )
+    #define outPlane (workgroupId / gInputPlanes)
+    #define upstreamPlane (workgroupId % gInputPlanes)
 
     // gradWeights: [outPlane][upstreamPlane][filterRow][filterCol]
     //       aggregate over: [outRow][outCol][n]
@@ -36,25 +36,25 @@ void kernel backprop_floats_withscratch_dobias(
 #ifdef BIASED
     float thisbiaschange = 0;
 #endif
-    for( int n = 0; n < batchSize; n++ ) {
+    for (int n = 0; n < batchSize; n++) {
         barrier(CLK_LOCAL_MEM_FENCE);
-        copyLocal( _imageImage, images + ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared, gInputImageSizeSquared );
-        copyLocal(_errorImage, gradOutput + ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared, gOutputImageSizeSquared );
+        copyLocal(_imageImage, images + (n * gInputPlanes + upstreamPlane) * gInputSizeSquared, gInputSizeSquared);
+        copyLocal(_errorImage, gradOutput + (n * gNumFilters + outPlane) * gOutputSizeSquared, gOutputSizeSquared);
         barrier(CLK_LOCAL_MEM_FENCE);
-        if( localId < gFilterSizeSquared ) {
-            for( int outRow = 0; outRow < gOutputImageSize; outRow++ ) {
+        if (localId < gFilterSizeSquared) {
+            for (int outRow = 0; outRow < gOutputSize; outRow++) {
                 int upstreamRow = outRow - gMargin + filterRow;
-                for( int outCol = 0; outCol < gOutputImageSize; outCol++ ) {
+                for (int outCol = 0; outCol < gOutputSize; outCol++) {
                     const int upstreamCol = outCol - gMargin + filterCol;
-                    #define proceed ( upstreamRow >= 0 && upstreamCol >= 0 && upstreamRow < gInputImageSize && upstreamCol < gInputImageSize )
-                    if( proceed ) {
+                    #define proceed (upstreamRow >= 0 && upstreamCol >= 0 && upstreamRow < gInputSize && upstreamCol < gInputSize)
+                    if (proceed) {
                         // these defines reduce register pressure, compared to const
                         // giving a 40% speedup on nvidia :-)
-                        #define resultIndex ( outRow * gOutputImageSize + outCol )
-                        #define error ( _errorImage[resultIndex] )
+                        #define resultIndex (outRow * gOutputSize + outCol)
+                        #define error (_errorImage[resultIndex])
                         //const float error = _errorImage[resultIndex];
-                        #define upstreamDataIndex ( upstreamRow * gInputImageSize + upstreamCol )
-                        #define upstreamResult ( _imageImage[upstreamDataIndex] )
+                        #define upstreamDataIndex (upstreamRow * gInputSize + upstreamCol)
+                        #define upstreamResult (_imageImage[upstreamDataIndex])
                         thiswchange += upstreamResult * error;
 #ifdef BIASED
                         thisbiaschange += error;
@@ -64,12 +64,12 @@ void kernel backprop_floats_withscratch_dobias(
             }
         }
     }
-    if( localId < gFilterSizeSquared ) {
+    if (localId < gFilterSizeSquared) {
         gradWeights[ workgroupId * gFilterSizeSquared + localId ] = learningRateMultiplier * thiswchange;
     }
 #ifdef BIASED
-    #define writeBias ( upstreamPlane == 0 && filterRow == gMargin && filterCol == gMargin )
-    if( writeBias ) {
+    #define writeBias (upstreamPlane == 0 && filterRow == gMargin && filterCol == gMargin)
+    if (writeBias) {
         gradBiasWeights[outPlane] = learningRateMultiplier * thisbiaschange;
     }
 #endif
diff --git a/cl/BackpropWeightsScratchLarge.cl b/cl/BackpropWeightsScratchLarge.cl
index 3ecce4f3..f178cf1f 100644
--- a/cl/BackpropWeightsScratchLarge.cl
+++ b/cl/BackpropWeightsScratchLarge.cl
@@ -10,8 +10,8 @@
 // workgroupId: [outputPlane][inputPlane]
 // localId: [filterRow][filterCol]
 // per-thread iteration: [n][outputRow][outputCol]
-// local: errorimage: outputImageSize * outputImageSize
-//        imageimage: inputImageSize * inputImageSize
+// local: errorimage: outputSize * outputSize
+//        imageimage: inputSize * inputSize
 // specific characteristic: load one stripe of each image at a time,
 // so we dont run out of memory
 // number of stripes set in: gNumStripes
@@ -32,15 +32,15 @@ void kernel backprop_floats_withscratch_dobias_striped(
         local float *_errorStripe, local float *_imageStripe
     ) {
     // gHalfFilterSize
-    // gInputImageSize
+    // gInputSize
     //
     // gInputStripeMarginRows => basically equal to gHalfFilterSize
-    // gInputStripeInnerNumRows = gInputImageSize / gNumStripes
+    // gInputStripeInnerNumRows = gInputSize / gNumStripes
     // gInputStripeOuterNumRows = gInputStripeInnerNumRows + 2 * gHalfFilterSize  (note: one row less than
     //                            if we just added gFilterSize)
-    // gInputStripeInnerSize = gInputStripeInnerNumRows * gInputImageSize
-    // gInputStripeOuterSize = gInputStripeOuterNumRows * gInputImageSize
-    // gInputStripeMarginSize = gInputStripeMarginRows * gInputImageSize
+    // gInputStripeInnerSize = gInputStripeInnerNumRows * gInputSize
+    // gInputStripeOuterSize = gInputStripeOuterNumRows * gInputSize
+    // gInputStripeMarginSize = gInputStripeMarginRows * gInputSize
     //
     // gOutputStripeNumRows
    // gOutputStripeSize
@@ -62,62 +62,62 @@ void kernel backprop_floats_withscratch_dobias_striped(
 #ifdef BIASED
     float thisbiaschange = 0;
 #endif
-    const int numLoopsForImageStripe = ( gInputStripeOuterSize + workgroupSize - 1 ) / workgroupSize;
-    const int numLoopsForErrorStripe = ( gOutputImageSizeSquared + workgroupSize - 1 ) / workgroupSize;
+    const int numLoopsForImageStripe = (gInputStripeOuterSize + workgroupSize - 1) / workgroupSize;
+    const int numLoopsForErrorStripe = (gOutputSizeSquared + workgroupSize - 1) / workgroupSize;
-    for( int n = 0; n < batchSize; n++ ) {
-        const int imageImageGlobalOffset = ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared;
-        const int imageImageGlobalOffsetAfter = imageImageGlobalOffset + gInputImageSizeSquared;
-        const int errorImageGlobalOffset = ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared;
-        const int errorImageGlobalOffsetAfter = errorImageGlobalOffset + gOutputImageSizeSquared;
-        for( int stripe = 0; stripe < gNumStripes; stripe++ ) {
+    for (int n = 0; n < batchSize; n++) {
+        const int imageImageGlobalOffset = (n * gInputPlanes + upstreamPlane) * gInputSizeSquared;
+        const int imageImageGlobalOffsetAfter = imageImageGlobalOffset + gInputSizeSquared;
+        const int errorImageGlobalOffset = (n * gNumFilters + outPlane) * gOutputSizeSquared;
+        const int errorImageGlobalOffsetAfter = errorImageGlobalOffset + gOutputSizeSquared;
+        for (int stripe = 0; stripe < gNumStripes; stripe++) {
             const int imageStripeInnerOffset = imageImageGlobalOffset + stripe * gInputStripeInnerSize;
             const int imageStripeOuterOffset = imageStripeInnerOffset - gInputStripeMarginSize;
             // need to fetch the image, but it's bigger than us, so will need to loop...
             barrier(CLK_LOCAL_MEM_FENCE);
-            for( int i = 0; i < numLoopsForImageStripe; i++ ) {
+            for (int i = 0; i < numLoopsForImageStripe; i++) {
                 int thisOffset = i * workgroupSize + localId;
                 int thisGlobalImagesOffset = imageStripeOuterOffset + thisOffset;
                 bool process = thisOffset < gInputStripeOuterSize
                     && thisGlobalImagesOffset >= imageImageGlobalOffset
                     && thisGlobalImagesOffset < imageImageGlobalOffsetAfter;
-                if( process ) {
+                if (process) {
                     _imageStripe[thisOffset] = images[ thisGlobalImagesOffset ];
                 }
             }
             int errorStripeOffset = errorImageGlobalOffset + stripe * gOutputStripeSize;
-            for( int i = 0; i < numLoopsForErrorStripe; i++ ) {
+            for (int i = 0; i < numLoopsForErrorStripe; i++) {
                 int thisOffset = i * workgroupSize + localId;
                 int globalErrorsOffset = errorStripeOffset + thisOffset;
                 bool process = thisOffset < gOutputStripeSize
                     && globalErrorsOffset < errorImageGlobalOffsetAfter;
-                if( process ) {
+                if (process) {
                     _errorStripe[thisOffset ] = gradOutput[globalErrorsOffset];
                 }
             }
             const int stripeOutRowStart = stripe * gOutputStripeNumRows;
             const int stripeOutRowEndExcl = stripeOutRowStart + gOutputStripeNumRows;
             barrier(CLK_LOCAL_MEM_FENCE);
-//            if( localId == 13 ) {
-//                for( int i = 0; i < 12; i++ ) {
-//                    gradWeights[100 + stripe * 12 + i ] = _errorStripe[i * gOutputImageSize];
+//            if (localId == 13) {
+//                for (int i = 0; i < 12; i++) {
+//                    gradWeights[100 + stripe * 12 + i ] = _errorStripe[i * gOutputSize];
 //                }
-//                for( int i = 0; i < 20; i++ ) {
-//                    gradWeights[200 + stripe * 20 + i ] = _imageStripe[i * gInputImageSize];
+//                for (int i = 0; i < 20; i++) {
+//                    gradWeights[200 + stripe * 20 + i ] = _imageStripe[i * gInputSize];
 //                }
 //            }
-            if( localId < gFilterSizeSquared ) {
-                for( int outRow = stripeOutRowStart; outRow < stripeOutRowEndExcl; outRow++ ) {
+            if (localId < gFilterSizeSquared) {
+                for (int outRow = stripeOutRowStart; outRow < stripeOutRowEndExcl; outRow++) {
                     int upstreamRow = outRow - gMargin + filterRow;
-                    for( int outCol = 0; outCol < gOutputImageSize; outCol++ ) {
+                    for (int outCol = 0; outCol < gOutputSize; outCol++) {
                         int upstreamCol = outCol - gMargin + filterCol;
                         bool proceed = upstreamRow >= 0 && upstreamCol >= 0
-                            && upstreamRow < gInputImageSize && upstreamCol < gInputImageSize
-                            && outRow < gOutputImageSize;
-                        if( proceed ) {
-                            int resultIndex = outRow * gOutputImageSize + outCol;
+                            && upstreamRow < gInputSize && upstreamCol < gInputSize
+                            && outRow < gOutputSize;
+                        if (proceed) {
+                            int resultIndex = outRow * gOutputSize + outCol;
                             float error = _errorStripe[resultIndex - stripe * gOutputStripeSize];
-                            int upstreamDataIndex = upstreamRow * gInputImageSize + upstreamCol;
+                            int upstreamDataIndex = upstreamRow * gInputSize + upstreamCol;
                             float upstreamResult = _imageStripe[upstreamDataIndex +
                                 gInputStripeMarginSize
                                 - stripe * gInputStripeInnerSize ];
                             thiswchange += upstreamResult * error;
@@ -130,13 +130,13 @@ void kernel backprop_floats_withscratch_dobias_striped(
             }
         }
     }
-    if( localId < gFilterSizeSquared ) {
+    if (localId < gFilterSizeSquared) {
         gradWeights[ workgroupId * gFilterSizeSquared + localId ] = learningRateMultiplier * thiswchange;
 //        weightChanges[ workgroupId * gFilterSizeSquared + localId ] = workgroupId;
     }
 #ifdef BIASED
     bool writeBias = upstreamPlane == 0 && filterRow == gMargin && filterCol == gMargin;
-    if( writeBias ) {
+    if (writeBias) {
         gradBiasWeights[outPlane] = learningRateMultiplier * thisbiaschange;
     }
 #endif
diff --git a/cl/ForwardIm2Col.cl b/cl/ForwardIm2Col.cl
new file mode 100644
index 00000000..44d47e11
--- /dev/null
+++ b/cl/ForwardIm2Col.cl
@@ -0,0 +1,73 @@
+// from SpatialConvolutionMM.cu:
+
+// CL: grid stride looping
+#define CL_KERNEL_LOOP(i, n) \
+  for (int i = get_group_id(0) * get_local_size(0) + get_local_id(0); \
+      i < (n); \
+      i += get_local_size(0) * get_num_groups(0))
+
+//#define gPadding {{padding}}
+//#define gStride {{stride}}
+//#define gColSize {{colSize}}
+//#define gFilterSize {{filterSize}}
+//#define gSize {{size}}
+
+// Kernel for fast unfold+copy
+// (adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
+kernel void im2col(
+    const int n,
+    global float const * im_data, int im_offset,
+    global float* data_col) {
+  global const float *data_im = im_data + im_offset;
+
+  CL_KERNEL_LOOP(index, n) {
+    int w_out = index % {{colSize}};
+    index /= {{colSize}};
+    int h_out = index % {{colSize}};
+    int channel_in = index / {{colSize}};
+    int channel_out = channel_in * {{filterSize}} * {{filterSize}};
+    int h_in = h_out * {{stride}} - {{padding}};
+    int w_in = w_out * {{stride}} - {{padding}};
+    data_col += (channel_out * {{colSize}} + h_out) * {{colSize}} + w_out;
+    data_im += (channel_in * {{size}} + h_in) * {{size}} + w_in;
+    for (int i = 0; i < {{filterSize}}; ++i) {
+      for (int j = 0; j < {{filterSize}}; ++j) {
+        int h = h_in + i;
+        int w = w_in + j;
+        *data_col = (h >= 0 && w >= 0 && h < {{size}} && w < {{size}}) ?
+          data_im[i * {{size}} + j] : 0;
+        data_col += {{colSize}} * {{colSize}};
+      }
+    }
+  }
+}
+
+kernel void col2im(
+    const int n,
+    global float const *data_col,
+    global float* im_data, int im_offset) {
+  global float *data_im = im_data + im_offset;
+
+  for (int index = get_group_id(0) * get_local_size(0) + get_local_id(0); index < (n); index += get_local_size(0) * get_num_groups(0)) {
+    float val = 0;
+    int w = index % {{size}} + {{padding}};
+    int h = (index / {{size}}) % {{size}} + {{padding}};
+    int c = index / ({{size}} * {{size}});
+    // compute the start and end of the output
+    int w_col_start = (w < {{filterSize}}) ? 0 : (w - {{filterSize}}) / {{stride}} + 1;
+    int w_col_end = min(w / {{stride}} + 1, {{colSize}});
+    int h_col_start = (h < {{filterSize}}) ? 0 : (h - {{filterSize}}) / {{stride}} + 1;
+    int h_col_end = min(h / {{stride}} + 1, {{colSize}});
+
+    int offset = (c * {{filterSize}} * {{filterSize}} + h * {{filterSize}} + w) * {{colSize}} * {{colSize}};
+    int coeff_h_col = (1 - {{stride}} * {{filterSize}} * {{colSize}}) * {{colSize}};
+    int coeff_w_col = (1 - {{stride}} * {{colSize}} * {{colSize}});
+    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+      }
+    }
+    data_im[index] = val;
+  }
+}
+
diff --git a/cl/PoolingBackwardGpuNaive.cl b/cl/PoolingBackwardGpuNaive.cl
index 8c9c823b..baa0acaf 100644
--- a/cl/PoolingBackwardGpuNaive.cl
+++ b/cl/PoolingBackwardGpuNaive.cl
@@ -11,37 +11,37 @@
 // wont use workgroups (since 'naive')
 // one thread per: [n][plane][outrow][outcol]
 // globalId: [n][plane][outrow][outcol]
-kernel void backward( const int batchSize,
-    global const float *gradOutput, global const int *selectors, global float *gradInput ) {
+kernel void backward(const int batchSize,
+    global const float *gradOutput, global const int *selectors, global float *gradInput) {
 
     #define globalId get_global_id(0)
-    #define nPlaneCombo ( globalId / gOutputImageSizeSquared )
-    #define outputPosCombo ( globalId % gOutputImageSizeSquared )
+    #define nPlaneCombo (globalId / gOutputSizeSquared)
+    #define outputPosCombo (globalId % gOutputSizeSquared)
 
     const int n = nPlaneCombo / gNumPlanes;
     const int plane = nPlaneCombo % gNumPlanes;
-    const int outputRow = outputPosCombo / gOutputImageSize;
-    const int outputCol = outputPosCombo % gOutputImageSize;
+    const int outputRow = outputPosCombo / gOutputSize;
+    const int outputCol = outputPosCombo % gOutputSize;
 
-    if( n >= batchSize ) {
+    if (n >= batchSize) {
         return;
     }
 
-    int resultIndex = ( ( n
-        * gNumPlanes + plane )
-        * gOutputImageSize + outputRow )
-        * gOutputImageSize + outputCol;
-    #define error ( gradOutput[resultIndex] )
-    int selector = ( selectors[resultIndex] );
-    #define drow ( selector / gPoolingSize )
-    #define dcol ( selector % gPoolingSize )
-    #define inputRow ( outputRow * gPoolingSize + drow )
-    #define inputCol ( outputCol * gPoolingSize + dcol )
-    int inputIndex = ( ( n
-        * gNumPlanes + plane )
-        * gInputImageSize + inputRow )
-        * gInputImageSize + inputCol;
-//    if( n < batchSize ) {
+    int resultIndex = (( n
+        * gNumPlanes + plane)
+        * gOutputSize + outputRow)
+        * gOutputSize + outputCol;
+    #define error (gradOutput[resultIndex])
+    int selector = (selectors[resultIndex]);
+    #define drow (selector / gPoolingSize)
+    #define dcol (selector % gPoolingSize)
+    #define inputRow (outputRow * gPoolingSize + drow)
+    #define inputCol (outputCol * gPoolingSize + dcol)
+    int inputIndex = (( n
+        * gNumPlanes + plane)
+        * gInputSize + inputRow)
+        * gInputSize + inputCol;
+//    if (n < batchSize) {
         gradInput[ inputIndex ] = error;
 //    }
 }
diff --git a/cl/SGD.cl b/cl/SGD.cl
index c3a707e8..71bdc3c3 100644
--- a/cl/SGD.cl
+++ b/cl/SGD.cl
@@ -13,7 +13,7 @@ kernel void updateWeights(
         global float *weights
     ) {
     const int globalId = get_global_id(0);
-    if( globalId >= N ) {
+    if (globalId >= N) {
         return;
     }
     // first update the update
diff --git a/cl/activate.cl b/cl/activate.cl
index 9f88db83..f8a72ffb 100644
--- a/cl/activate.cl
+++ b/cl/activate.cl
@@ -10,7 +10,7 @@
 #ifdef TANH
     #define ACTIVATION_FUNCTION(output) (tanh(output))
 #elif defined SCALEDTANH
-    #define ACTIVATION_FUNCTION(output) ( 1.7159f * tanh( 0.66667f * output))
+    #define ACTIVATION_FUNCTION(output) (1.7159f * tanh(0.66667f * output))
 #elif SIGMOID
     #define ACTIVATION_FUNCTION(output) (1.0f / (1 + exp(-output)))
 #elif defined RELU
@@ -20,22 +20,22 @@
 #endif
 
 #ifdef ACTIVATION_FUNCTION // protect against not defined
-kernel void activate( const int N, global float *inout ) {
+kernel void activate(const int N, global float *inout) {
     const int globalId = get_global_id(0);
-    if( globalId >= N ) {
+    if (globalId >= N) {
         return;
     }
-    inout[globalId] = ACTIVATION_FUNCTION( inout[globalId] );
+    inout[globalId] = ACTIVATION_FUNCTION(inout[globalId]);
 }
 #endif
 
 #ifdef ACTIVATION_FUNCTION // protect against not defined
-kernel void forwardNaive( const int N, global float *out, global const float *in ) {
+kernel void forwardNaive(const int N, global float *out, global const float *in) {
     const int globalId = get_global_id(0);
-    if( globalId >= N ) {
+    if (globalId >= N) {
         return;
     }
-    out[globalId] = ACTIVATION_FUNCTION( in[globalId] );
+    out[globalId] = ACTIVATION_FUNCTION(in[globalId]);
 }
 #endif
diff --git a/cl/addscalar.cl b/cl/addscalar.cl
index 446d76fc..b9152000 100644
--- a/cl/addscalar.cl
+++ b/cl/addscalar.cl
@@ -7,9 +7,9 @@
 kernel void add_scalar(
     const int N,
     const float scalar,
-    global float *data ) {
+    global float *data) {
     const int globalId = get_global_id(0);
-    if( globalId >= N ) {
+    if (globalId >= N) {
         return;
     }
     data[globalId] += scalar;
diff --git a/cl/applyActivationDeriv.cl b/cl/applyActivationDeriv.cl
index 851b03bd..c08a85e3 100644
--- a/cl/applyActivationDeriv.cl
+++ b/cl/applyActivationDeriv.cl
@@ -10,9 +10,9 @@
 #ifdef TANH
     #define ACTIVATION_DERIV(output) (1 - output * output)
 #elif defined SCALEDTANH
-    #define ACTIVATION_DERIV(output) ( 0.66667f * ( 1.7159f - 1 / 1.7159f * output * output ) )
+    #define ACTIVATION_DERIV(output) (0.66667f * (1.7159f - 1 / 1.7159f * output * output))
 #elif defined SIGMOID
-    #define ACTIVATION_DERIV(output) (output * ( 1 - output ) )
+    #define ACTIVATION_DERIV(output) (output * (1 - output))
 #elif defined RELU
     #define ACTIVATION_DERIV(output) (output > 0 ? 1 : 0)
 #elif defined LINEAR
@@ -22,19 +22,22 @@
 
 //#ifdef ACTIVATION_DERIV
 //void kernel applyActivationDeriv(
 //        const int N,
-//        global float *inout ) {
+//        global float *inout) {
 //    int globalId = get_global_id(0);
-//    inout[globalId] = ACTIVATION_DERIV( inout[globalId] );
+//    inout[globalId] = ACTIVATION_DERIV(inout[globalId]);
 //}
 //#endif
 
 #ifdef ACTIVATION_DERIV
 void kernel applyActivationDeriv(
         const int N,
-        global float *target, global const float *source ) {
+        global float *target, global const float *source) {
     int globalId = get_global_id(0);
-    if( globalId < N ) {
-        target[globalId] *= ACTIVATION_DERIV( source[globalId] );
+    if (globalId < N) {
+        target[globalId] *= ACTIVATION_DERIV(source[globalId]);
     }
 //    target[globalId] *= source[globalId];
 }
@@ -45,10 +45,10 @@ void kernel backward(
         const int N,
         global const float *inputs,
         global const float *gradOutput,
-        global float *gradInput ) {
+        global float *gradInput) {
     int globalId = get_global_id(0);
-    if( globalId < N ) {
-        gradInput[globalId] = ACTIVATION_DERIV( inputs[globalId] ) * gradOutput[globalId];
+    if (globalId < N) {
+        gradInput[globalId] = ACTIVATION_DERIV(inputs[globalId]) * gradOutput[globalId];
 // probably not ideal to have the output and input separate?
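+        // ie chain rule: dL/dinput = ACTIVATION_DERIV(input) * dL/doutput, one element per work-item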
     }
 //    target[globalId] *= source[globalId];
diff --git a/cl/backpropweights.cl b/cl/backpropweights.cl
index 80f4e39c..f7020b0e 100644
--- a/cl/backpropweights.cl
+++ b/cl/backpropweights.cl
@@ -9,7 +9,7 @@
 // globalId: [outPlane][inputPlane][filterRow][filterCol]
 // per-thread iteration: [n][outputRow][outputCol]
-void kernel backprop_floats( const float learningRateMultiplier,
+void kernel backprop_floats(const float learningRateMultiplier,
         const int batchSize,
         global const float *gradOutput, global const float *images,
         global float *gradWeights
@@ -18,7 +18,7 @@ void kernel backprop_floats( const float learningRateMultiplier,
         #endif
 ) {
     int globalId = get_global_id(0);
-    if( globalId >= gNumFilters * gInputPlanes * gFilterSize * gFilterSize ) {
+    if (globalId >= gNumFilters * gInputPlanes * gFilterSize * gFilterSize) {
         return;
     }
 
@@ -36,22 +36,22 @@ void kernel backprop_floats( const float learningRateMultiplier,
 #ifdef BIASED
     float thisbiaschange = 0;
 #endif
-    for( int n = 0; n < batchSize; n++ ) {
-        for( int outRow = 0; outRow < gOutputImageSize; outRow++ ) {
+    for (int n = 0; n < batchSize; n++) {
+        for (int outRow = 0; outRow < gOutputSize; outRow++) {
             int upstreamRow = outRow - gMargin + filterRow;
-            for( int outCol = 0; outCol < gOutputImageSize; outCol++ ) {
+            for (int outCol = 0; outCol < gOutputSize; outCol++) {
                 int upstreamCol = outCol - gMargin + filterCol;
-                bool proceed = upstreamRow >= 0 && upstreamCol >= 0 && upstreamRow < gInputImageSize
-                    && upstreamCol < gInputImageSize;
-                if( proceed ) {
-                    int resultIndex = ( ( n * gNumFilters
-                              + outPlane ) * gOutputImageSize
-                              + outRow ) * gOutputImageSize
+                bool proceed = upstreamRow >= 0 && upstreamCol >= 0 && upstreamRow < gInputSize
+                    && upstreamCol < gInputSize;
+                if (proceed) {
+                    int resultIndex = (( n * gNumFilters
+                              + outPlane) * gOutputSize
+                              + outRow) * gOutputSize
                               + outCol;
                     float error = gradOutput[resultIndex];
-                    int upstreamDataIndex = ( ( n * gInputPlanes
-                                  + upstreamPlane ) * gInputImageSize
-                                  + upstreamRow ) * gInputImageSize
+                    int upstreamDataIndex = (( n * gInputPlanes
+                                  + upstreamPlane) * gInputSize
+                                  + upstreamRow) * gInputSize
                                   + upstreamCol;
                     float upstreamResult = images[upstreamDataIndex];
                     float thisimagethiswchange = upstreamResult * error;
@@ -68,7 +68,7 @@ void kernel backprop_floats( const float learningRateMultiplier,
     gradWeights[ globalId ] = learningRateMultiplier * thiswchange;
 #ifdef BIASED
     bool writeBias = upstreamPlane == 0 && filterRow == gMargin && filterCol == gMargin;
-    if( writeBias ) {
+    if (writeBias) {
         gradBiasWeights[outPlane] = learningRateMultiplier * thisbiaschange;
     }
 #endif
diff --git a/cl/backpropweights_blocked.cl b/cl/backpropweights_blocked.cl
index bd13ff48..8b8a276d 100644
--- a/cl/backpropweights_blocked.cl
+++ b/cl/backpropweights_blocked.cl
@@ -22,16 +22,16 @@
 #include "cl/copyLocal.cl"
 #include "cl/copyBlock.cl"
 
-//#define posToRow( pos ) ( ( pos >> 10 ) & (2^11-1) )
-//#define posToCol( pos ) ( ( pos ) & (2^11-1) )
-//#define rowColToPos( row, col ) ( ( row << 10 ) | col )
-//#define linearIdToPos( linearId, base ) ( rowColToPos( ( linearId / base ), ( linearId % base ) ) )
+//#define posToRow(pos) (( pos >> 10) & (2^11-1))
+//#define posToCol(pos) (( pos) & (2^11-1))
+//#define rowColToPos(row, col) (( row << 10) | col)
+//#define linearIdToPos(linearId, base) (rowColToPos(( linearId / base), (linearId % base) ))
 
 // workgroupId: [outputPlane][inputPlane][blockRow][blockCol]
 // localId: [filterRow][filterCol]
 // per-thread iteration: [n][outputRow][outputCol]
 // local: errorimage: blockSize * blockSize
-//        imageimage: inputImageSize * inputImageSize
+//        imageimage: inputSize * inputSize
 void kernel backprop_floats_withscratch_dobias(
         const float learningRateMultiplier, const int batchSize,
         global const float *gradOutput, global const float *images,
@@ -41,18 +41,18 @@ void kernel backprop_floats_withscratch_dobias(
         #endif
         local float *_errorImage, local float *_imageImage
 ) {
-    #define globalId ( get_global_id(0) )
-    #define localId ( get_local_id(0) )
-    #define workgroupId ( get_group_id(0) )
-    #define workgroupSize ( get_local_size(0) )
+    #define globalId (get_global_id(0))
+    #define localId (get_local_id(0))
+    #define workgroupId (get_group_id(0))
+    #define workgroupSize (get_local_size(0))
 
 //    const int filterRow = localId / gFilterSize;
 //    const int filterCol = localId % gFilterSize;
-    const int filterPos = linearIdToPos( localId, gFilterSize )
-    const int inOutPlane = linearIdToPos( workgroupId, gInputPlanes )
+    const int filterPos = linearIdToPos(localId, gFilterSize);
+    const int inOutPlane = linearIdToPos(workgroupId, gInputPlanes);
 
-//    #define outPlane ( workgroupId / gInputPlanes )
-//    #define upstreamPlane ( workgroupId % gInputPlanes )
+//    #define outPlane (workgroupId / gInputPlanes)
+//    #define upstreamPlane (workgroupId % gInputPlanes)
 
     // gradWeights: [outPlane][upstreamPlane][filterRow][filterCol]
     //       aggregate over: [outRow][outCol][n]
@@ -60,27 +60,27 @@ void kernel backprop_floats_withscratch_dobias(
 #ifdef BIASED
     float thisbiaschange = 0;
 #endif
-    for( int n = 0; n < batchSize; n++ ) {
+    for (int n = 0; n < batchSize; n++) {
         barrier(CLK_LOCAL_MEM_FENCE);
-        copyLocal( _imageImage, images + ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared,
-            gInputImageSizeSquared );
-        copyLocal( _errorImage, gradOutput + ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared,
-            gOutputImageSizeSquared );
+        copyLocal(_imageImage, images + (n * gInputPlanes + upstreamPlane) * gInputSizeSquared,
+            gInputSizeSquared);
+        copyLocal(_errorImage, gradOutput + (n * gNumFilters + outPlane) * gOutputSizeSquared,
+            gOutputSizeSquared);
         barrier(CLK_LOCAL_MEM_FENCE);
-        if( localId < gFilterSizeSquared ) {
-            for( int outRow = 0; outRow < gOutputImageSize; outRow++ ) {
+        if (localId < gFilterSizeSquared) {
+            for (int outRow = 0; outRow < gOutputSize; outRow++) {
                 int upstreamRow = outRow - gMargin + filterRow;
-                for( int outCol = 0; outCol < gOutputImageSize; outCol++ ) {
+                for (int outCol = 0; outCol < gOutputSize; outCol++) {
                     const int upstreamCol = outCol - gMargin + filterCol;
-                    #define proceed ( upstreamRow >= 0 && upstreamCol >= 0 && upstreamRow < gInputImageSize && upstreamCol < gInputImageSize )
-                    if( proceed ) {
+                    #define proceed (upstreamRow >= 0 && upstreamCol >= 0 && upstreamRow < gInputSize && upstreamCol < gInputSize)
+                    if (proceed) {
                         // these defines reduce register pressure, compared to const
                         // giving a 40% speedup on nvidia :-)
-                        #define resultIndex ( outRow * gOutputImageSize + outCol )
-                        #define error ( _errorImage[resultIndex] )
+                        #define resultIndex (outRow * gOutputSize + outCol)
+                        #define error (_errorImage[resultIndex])
                         //const float error = _errorImage[resultIndex];
-                        #define upstreamDataIndex ( upstreamRow * gInputImageSize + upstreamCol )
-                        #define upstreamResult ( _imageImage[upstreamDataIndex] )
+                        #define upstreamDataIndex (upstreamRow * gInputSize + upstreamCol)
+                        #define upstreamResult (_imageImage[upstreamDataIndex])
                         thiswchange += upstreamResult * error;
 #ifdef BIASED
                         thisbiaschange += error;
@@ -90,12 +90,12 @@ void kernel backprop_floats_withscratch_dobias(
             }
         }
     }
-    if( localId < gFilterSizeSquared ) {
+    if (localId < gFilterSizeSquared) {
         gradWeights[ workgroupId * gFilterSizeSquared + localId ] = learningRateMultiplier * thiswchange;
     }
 #ifdef BIASED
-    #define writeBias ( upstreamPlane == 0 && localId == 0 )
-    if( writeBias ) {
+    #define writeBias (upstreamPlane == 0 && localId == 0)
+    if (writeBias) {
         gradBiasWeights[outPlane] = learningRateMultiplier * thisbiaschange;
     }
 #endif
diff --git a/cl/backpropweights_byrow.cl b/cl/backpropweights_byrow.cl
index 96d63666..4fa246b3 100644
--- a/cl/backpropweights_byrow.cl
+++ b/cl/backpropweights_byrow.cl
@@ -23,20 +23,20 @@
 // localid: [filterRow][filterCol]
 // weightChanges1: [outputPlane][inputPlane][filterRow][filterCol][outputRow]
 // gradBiasWeights1: [outputPlane][outputRow]
-kernel void backprop_weights( const float learningRateMultiplier, const int batchSize,
+kernel void backprop_weights(const float learningRateMultiplier, const int batchSize,
     global float const *gradOutput, global float const *input, global float *restrict gradWeights1,
     #ifdef BIASED
         global float *restrict gradBiasWeights1,
     #endif
-    local float *restrict _errorRow, local float *restrict _inputRow ) {
-    #define globalId ( get_global_id(0) )
-    #define workgroupId ( get_group_id(0) )
-    #define localId ( get_local_id(0) )
+    local float *restrict _errorRow, local float *restrict _inputRow) {
+    #define globalId (get_global_id(0))
+    #define workgroupId (get_group_id(0))
+    #define localId (get_local_id(0))
 
     const int filterRow = localId / gFilterSize;
     const int filterCol = localId % gFilterSize;
-    const int outputRow = workgroupId % gOutputImageSize;
-    #define outInCombo ( workgroupId / gOutputImageSize )
+    const int outputRow = workgroupId % gOutputSize;
+    #define outInCombo (workgroupId / gOutputSize)
     const int outputPlane = outInCombo / gNumInputPlanes;
     const int inputPlane = outInCombo % gNumInputPlanes;
 
@@ -46,35 +46,35 @@ kernel void backprop_weights( const float learningRateMultiplier, const int batc
 #ifdef BIASED
     float thisbiaschange = 0.0f;
 #endif
-    for( int n = 0; n < batchSize; n++ ) {
+    for (int n = 0; n < batchSize; n++) {
         barrier(CLK_LOCAL_MEM_FENCE);
         // copy down the gradOutput row...
         {
             global float const*gradOutputRow = gradOutput +
-                ( ( n
-                    * gNumOutputPlanes + outputPlane )
-                    * gOutputImageSize + outputRow )
-                    * gOutputImageSize;
-            if( localId < gOutputImageSize ) { // assume we have enough threads for now... should fix later
+                (( n
+                    * gNumOutputPlanes + outputPlane)
+                    * gOutputSize + outputRow)
+                    * gOutputSize;
+            if (localId < gOutputSize) { // assume we have enough threads for now... should fix later
                 _errorRow[ localId ] = gradOutputRow[ localId ];
             }
         }
         // copy down the input row
         {
             global float const*inputRowData = input +
-                ( ( n
-                    * gNumInputPlanes + inputPlane )
-                    * gInputImageSize + thisInputRow )
-                    * gInputImageSize;
-            if( localId < gInputImageSize ) { // assume we have enough threads for now... should fix later
+                (( n
+                    * gNumInputPlanes + inputPlane)
+                    * gInputSize + thisInputRow)
+                    * gInputSize;
+            if (localId < gInputSize) { // assume we have enough threads for now... should fix later
                 _inputRow[ localId ] = inputRowData[ localId ];
             }
         }
         barrier(CLK_LOCAL_MEM_FENCE);
-        for( int outputCol = 0; outputCol < gOutputImageSize; outputCol++ ) {
+        for (int outputCol = 0; outputCol < gOutputSize; outputCol++) {
             const int inputCol = outputCol - gMargin + filterCol;
-            if( inputRow >= 0 && inputRow < gInputImageSize && inputCol >= 0 && inputCol < gInputImageSize ) {
-                if( localId < gFilterSizeSquared ) {
+            if (inputRow >= 0 && inputRow < gInputSize && inputCol >= 0 && inputCol < gInputSize) {
+                if (localId < gFilterSizeSquared) {
                     thiswchange += _inputRow[ inputCol ] * _errorRow[ outputCol ];
 #ifdef BIASED
                     thisbiaschange += _errorRow[ outputCol ];
@@ -84,21 +84,21 @@ kernel void backprop_weights( const float learningRateMultiplier, const int batc
             }
         }
     }
 
-    if( workgroupId == 0 && localId == 0 ) {
+    if (workgroupId == 0 && localId == 0) {
         gradWeights1[0] = _inputRow[0];
         gradWeights1[1] = _inputRow[1];
     }
 
-    if( localId < gFilterSizeSquared ) {
-        #define weightsIndex ( ( ( outInCombo \
-            * gFilterSizeSquared ) + localId \
-            * gOutputImageSize ) + outputRow )
+    if (localId < gFilterSizeSquared) {
+        #define weightsIndex (( (outInCombo \
+            * gFilterSizeSquared) + localId \
+            * gOutputSize) + outputRow)
         //gradWeights1[ weightsIndex ] -= learningRateMultiplier * thiswchange;
         //gradWeights1[weightsIndex] = 123.0f;
     }
 #ifdef BIASED
-    if( inputPlane == 0 && localId == 0 ) {
-        gradBiasWeights1[outputPlane * gOutputImageSize + outputRow ] = learningRateMultiplier * thisbiaschange;
+    if (inputPlane == 0 && localId == 0) {
+        gradBiasWeights1[outputPlane * gOutputSize + outputRow ] = learningRateMultiplier * thisbiaschange;
     }
 #endif
 }
diff --git a/cl/backward.cl b/cl/backward.cl
index 3cd55ee8..4588e133 100644
--- a/cl/backward.cl
+++ b/cl/backward.cl
@@ -13,42 +13,42 @@
 // weights: [filterId][inputPlane][filterRow][filterCol] 32 * 32 * 5 * 5 * 4 = 409KB
 void kernel calcGradInput(
         const int batchSize,
-        global const float *gradOutput, global float *weights, global float *gradInput ) {
+        global const float *gradOutput, global float *weights, global float *gradInput) {
     int globalId = get_global_id(0);
 
-    const int upstreamImage2dId = globalId / gInputImageSizeSquared;
+    const int upstreamImage2dId = globalId / gInputSizeSquared;
 
-    const int intraImageOffset = globalId % gInputImageSizeSquared;
-    const int upstreamRow = intraImageOffset / gInputImageSize;
-    const int upstreamCol = intraImageOffset % gInputImageSize;
+    const int intraImageOffset = globalId % gInputSizeSquared;
+    const int upstreamRow = intraImageOffset / gInputSize;
+    const int upstreamCol = intraImageOffset % gInputSize;
 
     const int upstreamPlane = upstreamImage2dId % gInputPlanes;
     const int n = upstreamImage2dId / gInputPlanes;
 
-    if( n >= batchSize ) {
+    if (n >= batchSize) {
         return;
     }
 
-    const int minFilterRow = max( 0, upstreamRow + gMargin - (gOutputImageSize - 1) );
-    const int maxFilterRow = min( gFilterSize - 1, upstreamRow + gMargin );
-    const int minFilterCol = max( 0, upstreamCol + gMargin - (gOutputImageSize -1) );
-    const int maxFilterCol = min( gFilterSize - 1, upstreamCol + gMargin );
+    const int minFilterRow = max(0, upstreamRow + gMargin - (gOutputSize - 1));
+    const int maxFilterRow = min(gFilterSize - 1, upstreamRow + gMargin);
+    const int minFilterCol = max(0, upstreamCol + gMargin - (gOutputSize -1));
+    const int maxFilterCol = min(gFilterSize - 1, upstreamCol + gMargin);
 
     float sumWeightTimesOutError = 0;
     // aggregate over [outPlane][outRow][outCol]
-    for( int outPlane = 0; outPlane < gNumFilters; outPlane++ ) {
-        for( int filterRow = minFilterRow; filterRow <= maxFilterRow; filterRow++ ) {
+    for (int outPlane = 0; outPlane < gNumFilters; outPlane++) {
+        for (int filterRow = minFilterRow; filterRow <= maxFilterRow; filterRow++) {
             int outRow = upstreamRow + gMargin - filterRow;
-            for( int filterCol = minFilterCol; filterCol <= maxFilterCol; filterCol++ ) {
+            for (int filterCol = minFilterCol; filterCol <= maxFilterCol; filterCol++) {
                 int outCol = upstreamCol + gMargin - filterCol;
-                int resultIndex = ( ( n * gNumFilters
-                          + outPlane ) * gOutputImageSize
-                          + outRow ) * gOutputImageSize
+                int resultIndex = (( n * gNumFilters
+                          + outPlane) * gOutputSize
+                          + outRow) * gOutputSize
                           + outCol;
                 float thisError = gradOutput[resultIndex];
-                int thisWeightIndex = ( ( outPlane * gInputPlanes
-                            + upstreamPlane ) * gFilterSize
-                            + filterRow ) * gFilterSize
+                int thisWeightIndex = (( outPlane * gInputPlanes
+                            + upstreamPlane) * gFilterSize
+                            + filterRow) * gFilterSize
                             + filterCol;
                 float thisWeight = weights[thisWeightIndex];
                 float thisWeightTimesError = thisWeight * thisError;
diff --git a/cl/backward_cached.cl b/cl/backward_cached.cl
index 374af60d..e33351a0 100644
--- a/cl/backward_cached.cl
+++ b/cl/backward_cached.cl
@@ -4,11 +4,11 @@
 // v. 2.0. If a copy of the MPL was not distributed with this file, You can
 // obtain one at http://mozilla.org/MPL/2.0/.
 
-void copyLocal( local float *target, global float const *source, int N ) {
-    int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);
-    for( int loop = 0; loop < numLoops; loop++ ) {
+void copyLocal(local float *target, global float const *source, int N) {
+    int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);
+    for (int loop = 0; loop < numLoops; loop++) {
         int offset = loop * get_local_size(0) + get_local_id(0);
-        if( offset < N ) {
+        if (offset < N) {
             target[offset] = source[offset];
         }
     }
@@ -20,7 +20,7 @@ void copyLocal( local float *target, global float const *source, int N ) {
 // localid: [upstreamrow][upstreamcol]
 // per-thread aggregation: [outPlane][filterRow][filterCol]
 // need to store locally:
-// - _gradOutputPlane. size = outputImageSizeSquared
+// - _gradOutputPlane. size = outputSizeSquared
 // - _filterPlane. size = filtersizesquared
 // note: currently doesnt use bias as input. thats probably an error?
 // inputs: gradOutput :convolve: filters => gradInput
@@ -38,7 +38,7 @@ void kernel calcGradInputCached(
         global const float *filtersGlobal,
         global float *gradInput,
         local float *_gradOutputPlane,
-        local float *_filterPlane ) {
+        local float *_filterPlane) {
 
     #define globalId get_global_id(0)
     #define localId get_local_id(0)
@@ -48,30 +48,30 @@ void kernel calcGradInputCached(
     const int n = workgroupId / gInputPlanes;
     const int upstreamPlane = workgroupId % gInputPlanes;
 
-    const int upstreamRow = localId / gInputImageSize;
-    const int upstreamCol = localId % gInputImageSize;
+    const int upstreamRow = localId / gInputSize;
+    const int upstreamCol = localId % gInputSize;
 
     float sumWeightTimesOutError = 0;
-    for( int outPlane = 0; outPlane < gNumFilters; outPlane++ ) {
+    for (int outPlane = 0; outPlane < gNumFilters; outPlane++) {
         barrier(CLK_LOCAL_MEM_FENCE);
-        copyLocal( _filterPlane, filtersGlobal + ( outPlane * gInputPlanes + upstreamPlane ) * gFilterSizeSquared, gFilterSizeSquared );
-        copyLocal( _gradOutputPlane, gradOutputGlobal + ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared, gOutputImageSizeSquared );
+        copyLocal(_filterPlane, filtersGlobal + (outPlane * gInputPlanes + upstreamPlane) * gFilterSizeSquared, gFilterSizeSquared);
+        copyLocal(_gradOutputPlane, gradOutputGlobal + (n * gNumFilters + outPlane) * gOutputSizeSquared, gOutputSizeSquared);
         barrier(CLK_LOCAL_MEM_FENCE);
-        for( int filterRow = 0; filterRow < gFilterSize; filterRow++ ) {
+        for (int filterRow = 0; filterRow < gFilterSize; filterRow++) {
             int outRow = upstreamRow + gMargin - filterRow;
-            for( int filterCol = 0; filterCol < gFilterSize; filterCol++ ) {
+            for (int filterCol = 0; filterCol < gFilterSize; filterCol++) {
                 int outCol = upstreamCol + gMargin - filterCol;
-                if( outCol >= 0 && outCol < gOutputImageSize && outRow >= 0 && outRow < gOutputImageSize ) {
+                if (outCol >= 0 && outCol < gOutputSize && outRow >= 0 && outRow < gOutputSize) {
                     float thisWeightTimesError =
-                        _gradOutputPlane[outRow * gOutputImageSize + outCol] *
+                        _gradOutputPlane[outRow * gOutputSize + outCol] *
                         _filterPlane[filterRow * gFilterSize + filterCol];
                     sumWeightTimesOutError += thisWeightTimesError;
                 }
             }
         }
     }
-    const int upstreamImageGlobalOffset = ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared;
-    if( localId < gInputImageSizeSquared ) {
+    const int upstreamImageGlobalOffset = (n * gInputPlanes + upstreamPlane) * gInputSizeSquared;
+    if (localId < gInputSizeSquared) {
         gradInput[upstreamImageGlobalOffset + localId] = sumWeightTimesOutError;
     }
 }
diff --git a/cl/bw_rowperwg.cl b/cl/bw_rowperwg.cl
index a2ffbacb..8b032753 100644
--- a/cl/bw_rowperwg.cl
+++ b/cl/bw_rowperwg.cl
@@ -10,8 +10,8 @@
 // workgroupId: [outputPlane][inputPlane][inputRow]
 // localId: [filterRow][filterCol]
 // per-thread iteration: [n][outputCol]
-// local: errorimage: outputImageSize
-//        imageimage: inputImageSize
+// local: errorimage: outputSize
+//        imageimage: inputSize
 // output weight changes: [outputPlane][inputPlane][filterRow][filterCol][outRow]
 void kernel backprop_weights(
         const float learningRateMultiplier, const int batchSize,
@@ -30,9 +30,9 @@ void kernel backprop_weights(
     const int filterRow = localId / gFilterSize;
     const int filterCol = localId % gFilterSize;
 
-    const int inputRow = workgroupId % gInputImageSize;
-    const int outputPlane = ( workgroupId / gInputImageSize ) / gInputPlanes;
-    const int inputPlane = ( workgroupId / gInputImageSize ) % gInputPlanes;
+    const int inputRow = workgroupId % gInputSize;
+    const int outputPlane = (workgroupId / gInputSize) / gInputPlanes;
(workgroupId / gInputSize) / gInputPlanes; + const int inputPlane = (workgroupId / gInputSize) % gInputPlanes; // weightchanges: [outputPlane][inputPlane][filterRow][filterCol][outRow] // aggregate over: [outCol][n] @@ -40,37 +40,37 @@ void kernel backprop_weights( #ifdef BIASED float thisbiaschange = 0; #endif - for( int n = 0; n < batchSize; n++ ) { - int upstreamImageGlobalOffset = ( n * gInputPlanes + inputPlane ) * gInputImageSizeSquared; + for (int n = 0; n < batchSize; n++) { + int upstreamImageGlobalOffset = (n * gInputPlanes + inputPlane) * gInputSizeSquared; // need to fetch the image, but it's bigger than us, so will need to loop... - const int numLoopsForUpstream = ( gInputImageSizeSquared + workgroupSize - 1 ) / workgroupSize; + const int numLoopsForUpstream = (gInputSizeSquared + workgroupSize - 1) / workgroupSize; barrier(CLK_LOCAL_MEM_FENCE); - for( int i = 0; i < numLoopsForUpstream; i++ ) { + for (int i = 0; i < numLoopsForUpstream; i++) { int thisOffset = i * workgroupSize + localId; - if( thisOffset < gInputImageSizeSquared ) { + if (thisOffset < gInputSizeSquared) { _imageImage[thisOffset] = images[ upstreamImageGlobalOffset + thisOffset ]; } } - int resultImageGlobalOffset = ( n * gNumFilters + outputPlane ) * gOutputImageSizeSquared; - int numLoopsForOutput = ( gOutputImageSizeSquared + workgroupSize - 1 ) / workgroupSize; - for( int i = 0; i < numLoopsForOutput; i++ ) { + int resultImageGlobalOffset = (n * gNumFilters + outputPlane) * gOutputSizeSquared; + int numLoopsForOutput = (gOutputSizeSquared + workgroupSize - 1) / workgroupSize; + for (int i = 0; i < numLoopsForOutput; i++) { int thisOffset = i * workgroupSize + localId; - if( thisOffset < gOutputImageSizeSquared ) { + if (thisOffset < gOutputSizeSquared) { _errorImage[thisOffset ] = gradOutput[resultImageGlobalOffset + thisOffset]; } } barrier(CLK_LOCAL_MEM_FENCE); - if( localId < gFilterSizeSquared ) { - for( int outRow = 0; outRow < gOutputImageSize; outRow++ ) { + if (localId < gFilterSizeSquared) { + for (int outRow = 0; outRow < gOutputSize; outRow++) { int inputRow = outRow - gMargin + filterRow; - for( int outCol = 0; outCol < gOutputImageSize; outCol++ ) { + for (int outCol = 0; outCol < gOutputSize; outCol++) { int inputCol = outCol - gMargin + filterCol; - bool proceed = inputRow >= 0 && inputCol >= 0 && inputRow < gInputImageSize - && inputCol < gInputImageSize; - if( proceed ) { - int resultIndex = outRow * gOutputImageSize + outCol; + bool proceed = inputRow >= 0 && inputCol >= 0 && inputRow < gInputSize + && inputCol < gInputSize; + if (proceed) { + int resultIndex = outRow * gOutputSize + outCol; float error = _errorImage[resultIndex]; - int upstreamDataIndex = inputRow * gInputImageSize + inputCol; + int upstreamDataIndex = inputRow * gInputSize + inputCol; float upstreamResult = _imageImage[upstreamDataIndex]; thiswchange += upstreamResult * error; #ifdef BIASED @@ -81,12 +81,12 @@ void kernel backprop_weights( } } } - if( localId < gFilterSizeSquared ) { + if (localId < gFilterSizeSquared) { weights[ workgroupId * gFilterSizeSquared + localId ] -= learningRateMultiplier * thiswchange; } #ifdef BIASED bool writeBias = inputPlane == 0 && localId == 0; - if( writeBias ) { + if (writeBias) { biasWeights[outputPlane] -= learningRateMultiplier * thisbiaschange; } #endif diff --git a/cl/copy.cl b/cl/copy.cl index 60b5361c..84939adc 100644 --- a/cl/copy.cl +++ b/cl/copy.cl @@ -10,9 +10,9 @@ kernel void copy( const int N, global const float *in, - global float *out ) { + global float *out) { const 
int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } out[globalId] = in[globalId]; @@ -23,9 +23,9 @@ kernel void copy_with_offset( global const float *in, const int inoffset, global float *out, - const int outoffset ) { + const int outoffset) { const int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } out[globalId + outoffset] = in[globalId + inoffset]; @@ -35,9 +35,9 @@ kernel void multiplyConstant( const int N, const float multiplier, global const float *in, - global float *out ) { + global float *out) { const int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } out[globalId] = multiplier * in[globalId]; @@ -46,9 +46,9 @@ kernel void multiplyConstant( kernel void multiplyInplace( const int N, const float multiplier, - global float *data ) { + global float *data) { const int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } data[globalId] *= multiplier; diff --git a/cl/copyBlock.cl b/cl/copyBlock.cl index be366717..1697a89e 100644 --- a/cl/copyBlock.cl +++ b/cl/copyBlock.cl @@ -4,35 +4,35 @@ // v. 2.0. If a copy of the MPL was not distributed with this file, You can // obtain one at http://mozilla.org/MPL/2.0/. -int posToRow( int pos ) { - return ( pos >> 10 ) & ( (1<<10)-1); +int posToRow(int pos) { + return (pos >> 10) & ((1<<10)-1); // return 53 } -int posToCol( int pos ) { +int posToCol(int pos) { return pos & ((1<<10)-1); // return 67; //return ((1<<11)-1); } -int rowColToPos( int row, int col ) { - return ( row << 10 ) | col; +int rowColToPos(int row, int col) { + return (row << 10) | col; } -int linearIdToPos( int linearId, int base ) { - return rowColToPos( ( linearId / base ), ( linearId % base ) ); +int linearIdToPos(int linearId, int base) { + return rowColToPos(( linearId / base), (linearId % base) ); } -int posToOffset( int pos, int rowLength ) { +int posToOffset(int pos, int rowLength) { return posToRow(pos) * rowLength + posToCol(pos); } // assumes that the block will fit exactly into the target -void copyBlock( local float *target, global float const *source, - const int sourceSize, const int blockStart, const int blockSize ) { - const int totalLinearSize = posToRow( blockSize ) * posToCol( blockSize ); - const int numLoops = ( totalLinearSize + get_local_size(0) - 1 ) / get_local_size(0); - for( int loop = 0; loop < numLoops; loop++ ) { +void copyBlock(local float *target, global float const *source, + const int sourceSize, const int blockStart, const int blockSize) { + const int totalLinearSize = posToRow(blockSize) * posToCol(blockSize); + const int numLoops = (totalLinearSize + get_local_size(0) - 1) / get_local_size(0); + for (int loop = 0; loop < numLoops; loop++) { const int offset = get_local_id(0) + loop * get_local_size(0); - if( offset < totalLinearSize ) { - const int offsetAsPos = linearIdToPos( offset, posToCol( blockSize ) ); - target[ offset ] = source[ posToOffset( blockStart + offsetAsPos, posToCol( sourceSize ) ) ]; + if (offset < totalLinearSize) { + const int offsetAsPos = linearIdToPos(offset, posToCol(blockSize) ); + target[ offset ] = source[ posToOffset(blockStart + offsetAsPos, posToCol(sourceSize) ) ]; } } } diff --git a/cl/copyLocal.cl b/cl/copyLocal.cl index 897d82fe..900dd33d 100644 --- a/cl/copyLocal.cl +++ b/cl/copyLocal.cl @@ -4,21 +4,21 @@ // v. 2.0. If a copy of the MPL was not distributed with this file, You can // obtain one at http://mozilla.org/MPL/2.0/. 
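copyBlock.cl above packs a (row, col) pair into a single int, 10 bits per field, so a 2-D block position can travel as one scalar kernel argument. A standalone C check of the round trip (names mirror the kernel helpers; the 10-bit width means both fields must stay below 1024):

    #include <assert.h>

    #define FIELD_BITS 10
    #define FIELD_MASK ((1 << FIELD_BITS) - 1)

    static int rowColToPos(int row, int col) { return (row << FIELD_BITS) | col; }
    static int posToRow(int pos) { return (pos >> FIELD_BITS) & FIELD_MASK; }
    static int posToCol(int pos) { return pos & FIELD_MASK; }

    int main(void) {
        int pos = rowColToPos(53, 67);
        assert(posToRow(pos) == 53 && posToCol(pos) == 67);
        return 0;
    }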
diff --git a/cl/copyLocal.cl b/cl/copyLocal.cl
index 897d82fe..900dd33d 100644
--- a/cl/copyLocal.cl
+++ b/cl/copyLocal.cl
@@ -4,21 +4,21 @@
 // v. 2.0. If a copy of the MPL was not distributed with this file, You can
 // obtain one at http://mozilla.org/MPL/2.0/.
 
-void copyLocal( local float *target, global float const *source, int N ) {
-    int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);
-    for( int loop = 0; loop < numLoops; loop++ ) {
+void copyLocal(local float *target, global float const *source, int N) {
+    int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);
+    for (int loop = 0; loop < numLoops; loop++) {
         int offset = loop * get_local_size(0) + get_local_id(0);
-        if( offset < N ) {
+        if (offset < N) {
             target[offset] = source[offset];
         }
     }
 }
 
-void copyGlobal( global float *target, local float const *source, int N ) {
-    int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);
-    for( int loop = 0; loop < numLoops; loop++ ) {
+void copyGlobal(global float *target, local float const *source, int N) {
+    int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);
+    for (int loop = 0; loop < numLoops; loop++) {
         int offset = loop * get_local_size(0) + get_local_id(0);
-        if( offset < N ) {
+        if (offset < N) {
             target[offset] = source[offset];
         }
     }
diff --git a/cl/dropout.cl b/cl/dropout.cl
index bf99a7b8..6c7a3447 100644
--- a/cl/dropout.cl
+++ b/cl/dropout.cl
@@ -8,9 +8,9 @@
 kernel void forwardNaive(
     const int N,
     global const unsigned char *mask,
     global const float *input,
-    global float *output ) {
+    global float *output) {
     const int globalId = get_global_id(0);
-    if( globalId >= N ) {
+    if (globalId >= N) {
         return;
     }
     output[globalId] = mask[globalId] == 1 ? input[globalId] : 0.0f;
@@ -22,7 +22,7 @@ kernel void backpropNaive(
     global const float *gradOutput,
     global float *output) {
     const int globalId = get_global_id(0);
-    if( globalId >= N ) {
+    if (globalId >= N) {
         return;
     }
     output[globalId] = mask[globalId] == 1 ? gradOutput[globalId] : 0.0f;
diff --git a/cl/forward.cl b/cl/forward.cl
index 7c6f64a1..464bc091 100644
--- a/cl/forward.cl
+++ b/cl/forward.cl
@@ -11,7 +11,7 @@
 #ifdef TANH
     #define ACTIVATION_FUNCTION(output) (tanh(output))
 #elif defined SCALEDTANH
-    #define ACTIVATION_FUNCTION(output) ( 1.7159f * tanh( 0.66667f * output))
+    #define ACTIVATION_FUNCTION(output) (1.7159f * tanh(0.66667f * output))
 #elif SIGMOID
     #define ACTIVATION_FUNCTION(output) (1.0f / (1 + exp(-output)))
 #elif defined RELU
@@ -20,28 +20,28 @@
     #define ACTIVATION_FUNCTION(output) (output)
 #endif
 
-void kernel convolve_ints( global const int *p_imageSize, global const int *p_filterSize,
-    global const int *image, global const int *filter, global int *result ) {
+void kernel convolve_ints(global const int *p_imageSize, global const int *p_filterSize,
+    global const int *image, global const int *filter, global int *result) {
     int id = get_global_id(0);
     int imageSize = p_imageSize[0];
     int filterSize = p_filterSize[0];
-    int imageOffset = id / (imageSize * imageSize ) * (imageSize * imageSize );
-    int localid = id % (imageSize * imageSize );
+    int imageOffset = id / (imageSize * imageSize) * (imageSize * imageSize);
+    int localid = id % (imageSize * imageSize);
     int row = localid / imageSize;
     int col = localid % imageSize;
     int halfFilterSize = filterSize >> 1;
     int sum = 0;
-    int minm = max( -halfFilterSize, -row );
-    int maxm = min( halfFilterSize, imageSize - 1 - row );
-    int minn = max( -halfFilterSize, -col );
-    int maxn = min( halfFilterSize, imageSize - 1 - col );
+    int minm = max(-halfFilterSize, -row);
+    int maxm = min(halfFilterSize, imageSize - 1 - row);
+    int minn = max(-halfFilterSize, -col);
+    int maxn = min(halfFilterSize, imageSize - 1 - col);
     int m = minm;
-    while( m <= maxm ) {
-        int x = ( row + m );
+    while(m <= maxm) {
+        int x = (row + m);
         int ximage = imageOffset + x * imageSize;
         int filterrowoffset = (m+halfFilterSize) * filterSize + halfFilterSize;
         int n = minn;
-        while( n <= maxn ) {
+        while(n <= maxn) {
             int y = col + n;
             sum += image[ ximage + y] * filter[ filterrowoffset + n ];
             n++;
@@ -51,28 +51,28 @@ void kernel convolve_ints( global const int *p_imageSize, global const int *p_fi
     result[id] = sum;
 }
 
-void kernel convolve_floats( global const int *p_imageSize, global const int *p_filterSize,
-    global const float *image, global const float *filter, global float *result ) {
+void kernel convolve_floats(global const int *p_imageSize, global const int *p_filterSize,
+    global const float *image, global const float *filter, global float *result) {
     int id = get_global_id(0);
     int imageSize = p_imageSize[0];
     int filterSize = p_filterSize[0];
-    int imageOffset = id / (imageSize * imageSize ) * (imageSize * imageSize );
-    int localid = id % (imageSize * imageSize );
+    int imageOffset = id / (imageSize * imageSize) * (imageSize * imageSize);
+    int localid = id % (imageSize * imageSize);
     int row = localid / imageSize;
     int col = localid % imageSize;
     int halfFilterSize = filterSize >> 1;
     float sum = 0;
-    int minm = max( -halfFilterSize, -row );
-    int maxm = min( halfFilterSize, imageSize - 1 - row );
-    int minn = max( -halfFilterSize, -col );
-    int maxn = min( halfFilterSize, imageSize - 1 - col );
+    int minm = max(-halfFilterSize, -row);
+    int maxm = min(halfFilterSize, imageSize - 1 - row);
+    int minn = max(-halfFilterSize, -col);
+    int maxn = min(halfFilterSize, imageSize - 1 - col);
     int m = minm;
-    while( m <= maxm ) {
-        int x = ( row + m );
+    while(m <= maxm) {
+        int x = (row + m);
         int ximage = imageOffset + x * imageSize;
         int filterrowoffset = (m+halfFilterSize) * filterSize + halfFilterSize;
         int n = minn;
-        while( n <= maxn ) {
+        while(n <= maxn) {
             int y = col + n;
             sum += image[ ximage + y] * filter[ filterrowoffset + n ];
             n++;
@@ -82,9 +82,9 @@ void kernel convolve_floats( global const int *p_imageSize, global const int *p_
     result[id] = sum;
 }
 
-void kernel convolve_imagecubes_int( global const int *p_numInputPlanes, global const int *p_numFilters,
+void kernel convolve_imagecubes_int(global const int *p_numInputPlanes, global const int *p_numFilters,
     global const int *p_imageSize, global const int *p_filterSize,
-    global const int *images, global const int *filters, global int *output ) {
+    global const int *images, global const int *filters, global int *output) {
     int globalId = get_global_id(0);
 
     int numInputPlanes = p_numInputPlanes[0];
@@ -107,21 +107,21 @@ void kernel convolve_imagecubes_int( global const int *p_numInputPlanes, global
     int halfFilterSize = filterSize >> 1;
     int sum = 0;
-    int minm = max( -halfFilterSize, -row );
-    int maxm = min( halfFilterSize, imageSize - 1 - row );
-    int minn = max( -halfFilterSize, -col );
-    int maxn = min( halfFilterSize, imageSize - 1 - col );
+    int minm = max(-halfFilterSize, -row);
+    int maxm = min(halfFilterSize, imageSize - 1 - row);
+    int minn = max(-halfFilterSize, -col);
+    int maxn = min(halfFilterSize, imageSize - 1 - col);
     int plane = 0;
-    while( plane < numInputPlanes ) {
+    while(plane < numInputPlanes) {
         int inputImageOffset = inputImage3Offset + plane * imageSizeSquared;
         int filterPlaneOffset = filterOffset + plane * filterSize * filterSize;
         int m = minm;
-        while( m <= maxm ) {
+        while(m <= maxm) {
             int y = row + m;
             int inputimagerowoffset = inputImageOffset + y * imageSize;
             int filterrowoffset = filterPlaneOffset + (m+halfFilterSize) * filterSize + halfFilterSize;
             int n = minn;
-            while( n <= maxn ) {
+            while(n <= maxn) {
                 int x = col + n;
                 sum += images[ inputimagerowoffset + x] * filters[ filterrowoffset + n ];
                 n++;
@@ -159,7 +159,7 @@ void kernel convolve_imagecubes_int( global const int *p_numInputPlanes, global
 void kernel convolve_imagecubes_float(
     const int numInputPlanes, const int numFilters,
     const int imageSize, const int filterSize,
-    global const float *images, global const float *filters, global float *output ) {
+    global const float *images, global const float *filters, global float *output) {
     int globalId = get_global_id(0);
 
     int imageSizeSquared = imageSize * imageSize;
@@ -180,20 +180,20 @@ void kernel convolve_imagecubes_float(
     float sum = 0;
     // m should vary from -halfFilterSize through 0 to halfFilterSize
     // n too...
-    int minm = max( -halfFilterSize, -row );
-    int maxm = min( halfFilterSize, imageSize - 1 - row );
-    int minn = max( -halfFilterSize, -col );
-    int maxn = min( halfFilterSize, imageSize - 1 - col );
+    int minm = max(-halfFilterSize, -row);
+    int maxm = min(halfFilterSize, imageSize - 1 - row);
+    int minn = max(-halfFilterSize, -col);
+    int maxn = min(halfFilterSize, imageSize - 1 - col);
     int inputPlane = 0;
-    while( inputPlane < numInputPlanes ) {
+    while(inputPlane < numInputPlanes) {
         int inputImageOffset = inputImage3Offset + inputPlane * imageSizeSquared;
         int m = minm;
-        while( m <= maxm ) {
+        while(m <= maxm) {
             int y = row + m;
             int inputimagerowoffset = inputImageOffset + y * imageSize;
             int filterrowoffset = filterOffset + (m+halfFilterSize) * filterSize + halfFilterSize;
             int n = minn;
-            while( n <= maxn ) {
+            while(n <= maxn) {
                 int x = col + n;
                 sum += images[ inputimagerowoffset + x] * filters[ filterrowoffset + n ];
                 n++;
@@ -208,25 +208,25 @@ void kernel convolve_imagecubes_float(
 
 void kernel convolve_imagecubes_float_nopadzeros(
     const int numInputPlanes, const int numFilters,
-    const int inputImageSize, const int filterSize,
-    global const float *images, global const float *filters, global float *output ) {
+    const int inputSize, const int filterSize,
+    global const float *images, global const float *filters, global float *output) {
     int globalId = get_global_id(0);
 
-    int inputImageSizeSquared = inputImageSize * inputImageSize;
-    int outputImageSize = inputImageSize - filterSize + 1;
-    int outputImageSizeSquared = outputImageSize * outputImageSize;
+    int inputSizeSquared = inputSize * inputSize;
+    int outputSize = inputSize - filterSize + 1;
+    int outputSizeSquared = outputSize * outputSize;
 
-    int outputImage2Id = globalId / outputImageSizeSquared;
+    int outputImage2Id = globalId / outputSizeSquared;
     int filterId = outputImage2Id % numFilters;
     int inputImage3Id = outputImage2Id / numFilters;
 
     int filterOffset = filterId * filterSize * filterSize;
-    int inputImage3Offset = inputImage3Id * numInputPlanes * inputImageSizeSquared;
+    int inputImage3Offset = inputImage3Id * numInputPlanes * inputSizeSquared;
 
     // intraimage coords
-    int localid = globalId % outputImageSizeSquared;
-    int outputRow = localid / outputImageSize;
-    int outputCol = localid % outputImageSize;
+    int localid = globalId % outputSizeSquared;
+    int outputRow = localid / outputSize;
+    int outputCol = localid % outputSize;
 
     int halfFilterSize = filterSize >> 1;
     float sum = 0;
@@ -235,15 +235,15 @@ void kernel convolve_imagecubes_float_nopadzeros(
     int minn = -halfFilterSize;
     int maxn = halfFilterSize;
     int inputPlane = 0;
-    while( inputPlane < numInputPlanes ) {
-        int inputImageOffset = inputImage3Offset + inputPlane * inputImageSizeSquared;
+    while(inputPlane < numInputPlanes) {
+        int inputImageOffset = inputImage3Offset + inputPlane * inputSizeSquared;
         int m = minm;
-        while( m <= maxm ) {
+        while(m <= maxm) {
             int inputRow = outputRow + m + halfFilterSize;
-            int inputimagerowoffset = inputImageOffset + inputRow * inputImageSize;
+            int inputimagerowoffset = inputImageOffset + inputRow * inputSize;
             int filterrowoffset = filterOffset + (m+halfFilterSize) * filterSize + halfFilterSize;
             int n = minn;
-            while( n <= maxn ) {
+            while(n <= maxn) {
                 int inputCol = outputCol + n + halfFilterSize;
                 sum += images[ inputimagerowoffset + inputCol] * filters[ filterrowoffset + n ];
                 n++;
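The convolve kernels above clamp the filter offset range up front (minm..maxm, minn..maxn) instead of bounds-testing every tap: m runs over max(-half, -row) .. min(half, imageSize - 1 - row), which is exactly the set of offsets for which row + m stays inside the image. A standalone C check of that invariant (values are illustrative):

    #include <assert.h>

    static int max_i(int a, int b) { return a > b ? a : b; }
    static int min_i(int a, int b) { return a < b ? a : b; }

    int main(void) {
        int imageSize = 19, halfFilterSize = 2;
        for (int row = 0; row < imageSize; row++) {
            int minm = max_i(-halfFilterSize, -row);
            int maxm = min_i(halfFilterSize, imageSize - 1 - row);
            for (int m = minm; m <= maxm; m++) {
                assert(row + m >= 0 && row + m <= imageSize - 1);
            }
        }
        return 0;
    }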
diff --git a/cl/forward1.cl b/cl/forward1.cl
index 0fc856c3..1e80bbfc 100644
--- a/cl/forward1.cl
+++ b/cl/forward1.cl
@@ -63,45 +63,45 @@ void kernel convolve_imagecubes_float2(
     const int numExamples,
     global const float *inputs,
     global const float *filters,
-    global float *output ) {
+    global float *output) {
     int globalId = get_global_id(0);
 
-    int outputImage2Id = globalId / gOutputImageSizeSquared;
+    int outputImage2Id = globalId / gOutputSizeSquared;
     int exampleId = outputImage2Id / gNumFilters;
     int filterId = outputImage2Id % gNumFilters;
 
     // intraimage coords
-    int localid = globalId % gOutputImageSizeSquared;
-    int outputRow = localid / gOutputImageSize;
-    int outputCol = localid % gOutputImageSize;
+    int localid = globalId % gOutputSizeSquared;
+    int outputRow = localid / gOutputSize;
+    int outputCol = localid % gOutputSize;
 
-    global float const*inputCube = inputs + exampleId * gNumInputPlanes * gInputImageSizeSquared;
+    global float const*inputCube = inputs + exampleId * gNumInputPlanes * gInputSizeSquared;
     global float const*filterCube = filters + filterId * gNumInputPlanes * gFilterSizeSquared;
 
     float sum = 0;
-    if( exampleId < numExamples ) {
-        for( int inputPlaneIdx = 0; inputPlaneIdx < gNumInputPlanes; inputPlaneIdx++ ) {
-            global float const*inputPlane = inputCube + inputPlaneIdx * gInputImageSizeSquared;
+    if (exampleId < numExamples) {
+        for (int inputPlaneIdx = 0; inputPlaneIdx < gNumInputPlanes; inputPlaneIdx++) {
+            global float const*inputPlane = inputCube + inputPlaneIdx * gInputSizeSquared;
             global float const*filterPlane = filterCube + inputPlaneIdx * gFilterSizeSquared;
-            for( int u = -gHalfFilterSize; u <= gHalfFilterSize - gEven; u++ ) {
+            for (int u = -gHalfFilterSize; u <= gHalfFilterSize - gEven; u++) {
                 // trying to reduce register pressure...
 #if gPadZeros == 1
-                #define inputRowIdx ( outputRow + u )
+                #define inputRowIdx (outputRow + u)
 #else
-                #define inputRowIdx ( outputRow + u + gHalfFilterSize )
+                #define inputRowIdx (outputRow + u + gHalfFilterSize)
 #endif
-                global float const *inputRow = inputPlane + inputRowIdx * gInputImageSize;
+                global float const *inputRow = inputPlane + inputRowIdx * gInputSize;
                 global float const *filterRow = filterPlane + (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;
-                bool rowOk = inputRowIdx >= 0 && inputRowIdx < gInputImageSize;
+                bool rowOk = inputRowIdx >= 0 && inputRowIdx < gInputSize;
                 #pragma unroll
-                for( int v = -gHalfFilterSize; v <= gHalfFilterSize - gEven; v++ ) {
+                for (int v = -gHalfFilterSize; v <= gHalfFilterSize - gEven; v++) {
 #if gPadZeros == 1
-                    #define inputColIdx ( outputCol + v )
+                    #define inputColIdx (outputCol + v)
 #else
-                    #define inputColIdx ( outputCol + v + gHalfFilterSize )
+                    #define inputColIdx (outputCol + v + gHalfFilterSize)
 #endif
-                    bool process = rowOk && inputColIdx >= 0 && inputColIdx < gInputImageSize;
-                    if( process ) {
+                    bool process = rowOk && inputColIdx >= 0 && inputColIdx < gInputSize;
+                    if (process) {
                         sum += inputRow[inputColIdx] * filterRow[v];
                     }
                 }
@@ -109,7 +109,7 @@ void kernel convolve_imagecubes_float2(
         }
     }
 
-    if( exampleId < numExamples ) {
+    if (exampleId < numExamples) {
         output[globalId] = sum;
     }
 }
diff --git a/cl/forward2.cl b/cl/forward2.cl
index ee7837e1..fb33315d 100644
--- a/cl/forward2.cl
+++ b/cl/forward2.cl
@@ -4,17 +4,17 @@
 // v. 2.0. If a copy of the MPL was not distributed with this file, You can
 // obtain one at http://mozilla.org/MPL/2.0/.
 
-void copyLocal( local float *target, global float const *source, const int N ) {
-    int numLoops = ( N + gWorkgroupSize - 1 ) / gWorkgroupSize;
-    for( int loop = 0; loop < numLoops; loop++ ) {
+void copyLocal(local float *target, global float const *source, const int N) {
+    int numLoops = (N + gWorkgroupSize - 1) / gWorkgroupSize;
+    for (int loop = 0; loop < numLoops; loop++) {
         int offset = loop * gWorkgroupSize + get_local_id(0);
-        if( offset < N ) {
+        if (offset < N) {
             target[offset] = source[offset];
         }
     }
 }
 
-#ifdef gOutputImageSize // for previous tests that dont define it
+#ifdef gOutputSize // for previous tests that dont define it
 // workgroup id organized like: [outplane]
 // local id organized like: [outrow][outcol]
 // each thread iterates over: [imageid][upstreamplane][filterrow][filtercol]
@@ -30,7 +30,7 @@ void kernel forward_2_by_outplane(
     const int batchSize,
     global const float *images, global const float *filters,
     global float *output,
-    local float *_inputPlane, local float *_filterCube ) {
+    local float *_inputPlane, local float *_filterCube) {
     const int globalId = get_global_id(0);
 
     const int workgroupId = get_group_id(0);
@@ -38,14 +38,14 @@ void kernel forward_2_by_outplane(
     const int outPlane = workgroupId;
 
     const int localId = get_local_id(0);
-    const int outputRow = localId / gOutputImageSize;
-    const int outputCol = localId % gOutputImageSize;
+    const int outputRow = localId / gOutputSize;
+    const int outputCol = localId % gOutputSize;
 
 #if gPadZeros == 1
-    const int minu = max( -gHalfFilterSize, -outputRow );
-    const int maxu = min( gHalfFilterSize, gOutputImageSize - 1 - outputRow ) - gEven;
-    const int minv = max( -gHalfFilterSize, -outputCol );
-    const int maxv = min( gHalfFilterSize, gOutputImageSize - 1 - outputCol ) - gEven;
+    const int minu = max(-gHalfFilterSize, -outputRow);
+    const int maxu = min(gHalfFilterSize, gOutputSize - 1 - outputRow) - gEven;
+    const int minv = max(-gHalfFilterSize, -outputCol);
+    const int maxv = min(gHalfFilterSize, gOutputSize - 1 - outputCol) - gEven;
 #else
     const int minu = -gHalfFilterSize;
     const int maxu = gHalfFilterSize - gEven;
@@ -55,30 +55,30 @@ void kernel forward_2_by_outplane(
 
     {
         const int filterCubeLength = gInputPlanes * gFilterSizeSquared;
-        copyLocal( _filterCube,
+        copyLocal(_filterCube,
             filters + outPlane * filterCubeLength,
-            filterCubeLength );
+            filterCubeLength);
     }
     // dont need a barrier, since we'll just run behind the barrier from the upstream image download
 
-    for( int n = 0; n < batchSize; n++ ) {
+    for (int n = 0; n < batchSize; n++) {
         float sum = 0;
-        for( int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++ ) {
+        for (int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++) {
             barrier(CLK_LOCAL_MEM_FENCE);
-            copyLocal( _inputPlane,
-                images + ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared,
-                gInputImageSizeSquared );
+            copyLocal(_inputPlane,
+                images + (n * gInputPlanes + upstreamPlane) * gInputSizeSquared,
+                gInputSizeSquared);
             barrier(CLK_LOCAL_MEM_FENCE);
             int filterImageOffset = upstreamPlane * gFilterSizeSquared;
-            if( localId < gOutputImageSizeSquared ) {
-                for( int u = minu; u <= maxu; u++ ) {
+            if (localId < gOutputSizeSquared) {
+                for (int u = minu; u <= maxu; u++) {
                     int inputRow = outputRow + u;
 #if gPadZeros == 0
                     inputRow += gHalfFilterSize;
 #endif
-                    int inputimagerowoffset = inputRow * gInputImageSize;
+                    int inputimagerowoffset = inputRow * gInputSize;
                     int filterrowoffset = filterImageOffset + (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;
-                    for( int v = minv; v <= maxv; v++ ) {
+                    for (int v = minv; v <= maxv; v++) {
                         int inputCol = outputCol + v;
 #if gPadZeros == 0
                         inputCol += gHalfFilterSize;
@@ -89,8 +89,8 @@ void kernel forward_2_by_outplane(
             }
         }
         // output are organized like [imageid][filterid][row][col]
-        int resultIndex = ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared + localId;
-        if( localId < gOutputImageSizeSquared ) {
+        int resultIndex = (n * gNumFilters + outPlane) * gOutputSizeSquared + localId;
+        if (localId < gOutputSizeSquared) {
             output[resultIndex ] = sum;
         }
     }
diff --git a/cl/forward3.cl b/cl/forward3.cl
index 9af4acc8..d855d74a 100644
--- a/cl/forward3.cl
+++ b/cl/forward3.cl
@@ -15,10 +15,10 @@
 // one filter cube (corresponding to one outplane) = 5*5 * 32 * 4 = 3.2KB (ok)
 // all filter cubes = 3.2KB * 32 = 102KB (too big)
 // output are organized like [imageid][filterid][row][col]
-void kernel forward_3_by_n_outplane( const int batchSize,
+void kernel forward_3_by_n_outplane(const int batchSize,
     global const float *images, global const float *filters,
     global float *output,
-    local float *_upstreamImage, local float *_filterCube ) {
+    local float *_upstreamImage, local float *_filterCube) {
     const int globalId = get_global_id(0);
 
     const int workgroupId = get_group_id(0);
@@ -27,52 +27,52 @@
     const int outPlane = workgroupId % gNumFilters;
 
     const int localId = get_local_id(0);
-    const int outputRow = localId / gOutputImageSize;
-    const int outputCol = localId % gOutputImageSize;
+    const int outputRow = localId / gOutputSize;
+    const int outputCol = localId % gOutputSize;
 
-    const int minu = gPadZeros ? max( -gHalfFilterSize, -outputRow ) : -gHalfFilterSize;
-    const int maxu = gPadZeros ? min( gHalfFilterSize - gEven, gOutputImageSize - 1 - outputRow - gEven) : gHalfFilterSize - gEven;
-    const int minv = gPadZeros ? max( -gHalfFilterSize, -outputCol ) : - gHalfFilterSize;
-    const int maxv = gPadZeros ? min( gHalfFilterSize - gEven, gOutputImageSize - 1 - outputCol - gEven) : gHalfFilterSize - gEven;
+    const int minu = gPadZeros ? max(-gHalfFilterSize, -outputRow) : -gHalfFilterSize;
+    const int maxu = gPadZeros ? min(gHalfFilterSize - gEven, gOutputSize - 1 - outputRow - gEven) : gHalfFilterSize - gEven;
+    const int minv = gPadZeros ? max(-gHalfFilterSize, -outputCol) : -gHalfFilterSize;
+    const int maxv = gPadZeros ? min(gHalfFilterSize - gEven, gOutputSize - 1 - outputCol - gEven) : gHalfFilterSize - gEven;
 
-    const int numUpstreamsPerThread = ( gInputImageSizeSquared + workgroupSize - 1 ) / workgroupSize;
+    const int numUpstreamsPerThread = (gInputSizeSquared + workgroupSize - 1) / workgroupSize;
 
     const int filterCubeLength = gInputPlanes * gFilterSizeSquared;
     const int filterCubeGlobalOffset = outPlane * filterCubeLength;
-    const int numPixelsPerThread = ( filterCubeLength + workgroupSize - 1 ) / workgroupSize;
-    for( int i = 0; i < numPixelsPerThread; i++ ) {
+    const int numPixelsPerThread = (filterCubeLength + workgroupSize - 1) / workgroupSize;
+    for (int i = 0; i < numPixelsPerThread; i++) {
         int thisOffset = localId + i * workgroupSize;
-        if( thisOffset < filterCubeLength ) {
+        if (thisOffset < filterCubeLength) {
            _filterCube[thisOffset] = filters[filterCubeGlobalOffset + thisOffset];
        }
    }
     // dont need a barrier, since we'll just run behind the barrier from the upstream image download
 
     float sum = 0;
-    for( int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++ ) {
-        int thisUpstreamImageOffset = ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared;
+    for (int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++) {
+        int thisUpstreamImageOffset = (n * gInputPlanes + upstreamPlane) * gInputSizeSquared;
         barrier(CLK_LOCAL_MEM_FENCE);
-        for( int i = 0; i < numUpstreamsPerThread; i++ ) {
+        for (int i = 0; i < numUpstreamsPerThread; i++) {
             int thisOffset = workgroupSize * i + localId;
-            if( thisOffset < gInputImageSizeSquared ) {
+            if (thisOffset < gInputSizeSquared) {
                 _upstreamImage[ thisOffset ] = images[ thisUpstreamImageOffset + thisOffset ];
             }
         }
         barrier(CLK_LOCAL_MEM_FENCE);
         int filterImageOffset = upstreamPlane * gFilterSizeSquared;
-        for( int u = minu; u <= maxu; u++ ) {
+        for (int u = minu; u <= maxu; u++) {
             int inputRow = outputRow + u;
 #if gPadZeros == 0
             inputRow += gHalfFilterSize;
 #endif
-            int inputimagerowoffset = inputRow * gInputImageSize;
+            int inputimagerowoffset = inputRow * gInputSize;
             int filterrowoffset = filterImageOffset + (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;
-            for( int v = minv; v <= maxv; v++ ) {
+            for (int v = minv; v <= maxv; v++) {
                 int inputCol = outputCol + v;
 #if gPadZeros == 0
                 inputCol += gHalfFilterSize;
 #endif
-                if( localId < gOutputImageSizeSquared ) {
+                if (localId < gOutputSizeSquared) {
                     sum += _upstreamImage[ inputimagerowoffset + inputCol] * _filterCube[ filterrowoffset + v ];
                 }
             }
@@ -80,8 +80,8 @@ void kernel forward_3_by_n_outplane( const int batchSize,
     }
     // output are organized like [imageid][filterid][row][col]
-    int resultIndex = ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared + localId;
-    if( localId < gOutputImageSizeSquared ) {
+    int resultIndex = (n * gNumFilters + outPlane) * gOutputSizeSquared + localId;
+    if (localId < gOutputSizeSquared) {
         output[resultIndex ] = sum;
     }
 }
diff --git a/cl/forward4.cl b/cl/forward4.cl
index dd54bce4..4619d6bb 100644
--- a/cl/forward4.cl
+++ b/cl/forward4.cl
@@ -4,17 +4,17 @@
 // v. 2.0. If a copy of the MPL was not distributed with this file, You can
 // obtain one at http://mozilla.org/MPL/2.0/.
 
-void copyLocal( local float *target, global float const *source, int N ) {
-    int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);
-    for( int loop = 0; loop < numLoops; loop++ ) {
+void copyLocal(local float *target, global float const *source, int N) {
+    int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);
+    for (int loop = 0; loop < numLoops; loop++) {
         int offset = loop * get_local_size(0) + get_local_id(0);
-        if( offset < N ) {
+        if (offset < N) {
             target[offset] = source[offset];
         }
     }
 }
 
-#ifdef gOutputImageSize // for previous tests that dont define it
+#ifdef gOutputSize // for previous tests that dont define it
 // workgroup id organized like: [n][filterid]
 // local id organized like: [outrow][outcol]
 // each thread iterates over: [upstreamplane][filterrow][filtercol]
@@ -43,21 +43,21 @@ void copyLocal( local float *target, global float const *source, int N ) {
 // basically, it's a hack, so larger images actually run, without
 // crashing, and we can probably improve it a lot :-)
 //
-// So, when outputImageSize * outputImageSize > workgroupSize, then
+// So, when outputSize * outputSize > workgroupSize, then
 // multiple workgroups will be created for each output plane
 // the number of such workgroups is given by: `gPixelsPerThread`
 // the id of our workgroup within such a set of workgroups is calculated
 // as `pixel`
 // effectiveLocalId is our local id if we had one enormous workgroup
 // containing the whole output image plane
-void kernel forward_4_by_n_outplane_smallercache( const int batchSize,
+void kernel forward_4_by_n_outplane_smallercache(const int batchSize,
     global const float *images, global const float *filters,
     global float *output,
-    local float *_inputPlane, local float *_filterPlane ) {
-    #define globalId ( get_global_id(0) )
+    local float *_inputPlane, local float *_filterPlane) {
+    #define globalId (get_global_id(0))
 
-    #define localId ( get_local_id(0) )
-    #define workgroupId ( get_group_id(0) )
+    #define localId (get_local_id(0))
+    #define workgroupId (get_group_id(0))
 //    const int workgroupSize = get_local_size(0);
     const int effectiveWorkgroupId = workgroupId / gPixelsPerThread;
     const int pixel = workgroupId % gPixelsPerThread;
@@ -65,35 +65,35 @@ void kernel forward_4_by_n_outplane_smallercache( const int batchSize,
     const int n = effectiveWorkgroupId / gNumFilters;
     const int outPlane = effectiveWorkgroupId % gNumFilters;
 
-    const int outputRow = effectiveLocalId / gOutputImageSize;
-    const int outputCol = effectiveLocalId % gOutputImageSize;
+    const int outputRow = effectiveLocalId / gOutputSize;
+    const int outputCol = effectiveLocalId % gOutputSize;
 
     float sum = 0;
-    for( int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++ ) {
+    for (int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++) {
         barrier(CLK_LOCAL_MEM_FENCE);
-        copyLocal( _inputPlane, images + ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared, gInputImageSizeSquared );
-        copyLocal( _filterPlane, filters + ( outPlane * gInputPlanes + upstreamPlane ) * gFilterSizeSquared, gFilterSizeSquared );
+        copyLocal(_inputPlane, images + (n * gInputPlanes + upstreamPlane) * gInputSizeSquared, gInputSizeSquared);
+        copyLocal(_filterPlane, filters + (outPlane * gInputPlanes + upstreamPlane) * gFilterSizeSquared, gFilterSizeSquared);
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        if( effectiveLocalId < gOutputImageSizeSquared ) {
-            for( int u = -gHalfFilterSize; u <= gHalfFilterSize - gEven; u++ ) {
+        if (effectiveLocalId < gOutputSizeSquared) {
+            for (int u = -gHalfFilterSize; u <= gHalfFilterSize - gEven; u++) {
                 // trying to reduce register pressure...
 #if gPadZeros == 1
-                #define inputRow ( outputRow + u )
+                #define inputRow (outputRow + u)
 #else
-                #define inputRow ( outputRow + u + gHalfFilterSize )
+                #define inputRow (outputRow + u + gHalfFilterSize)
 #endif
-                int inputimagerowoffset = inputRow * gInputImageSize;
+                int inputimagerowoffset = inputRow * gInputSize;
                 int filterrowoffset = (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;
-                bool rowOk = inputRow >= 0 && inputRow < gInputImageSize;
-                for( int v = -gHalfFilterSize; v <= gHalfFilterSize - gEven; v++ ) {
+                bool rowOk = inputRow >= 0 && inputRow < gInputSize;
+                for (int v = -gHalfFilterSize; v <= gHalfFilterSize - gEven; v++) {
 #if gPadZeros == 1
-                    #define inputCol ( outputCol + v )
+                    #define inputCol (outputCol + v)
 #else
-                    #define inputCol ( outputCol + v + gHalfFilterSize )
+                    #define inputCol (outputCol + v + gHalfFilterSize)
 #endif
-                    bool process = rowOk && inputCol >= 0 && inputCol < gInputImageSize;
-                    if( process ) {
+                    bool process = rowOk && inputCol >= 0 && inputCol < gInputSize;
+                    if (process) {
                         sum += _inputPlane[ inputimagerowoffset + inputCol] * _filterPlane[ filterrowoffset + v ];
                     }
                 }
@@ -101,8 +101,8 @@ void kernel forward_4_by_n_outplane_smallercache( const int batchSize,
         }
     }
     // output are organized like [imageid][filterid][row][col]
-    #define resultIndex ( ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared + effectiveLocalId )
-    if( effectiveLocalId < gOutputImageSizeSquared ) {
+    #define resultIndex (( n * gNumFilters + outPlane) * gOutputSizeSquared + effectiveLocalId)
+    if (effectiveLocalId < gOutputSizeSquared) {
         output[resultIndex ] = sum;
     }
 }
diff --git a/cl/forward_byinputplane.cl b/cl/forward_byinputplane.cl
index 379c9b64..715ed262 100644
--- a/cl/forward_byinputplane.cl
+++ b/cl/forward_byinputplane.cl
@@ -16,10 +16,10 @@
 // iterate over: [n][outCol]
 // output: [n][filterId][outRow][outCol][inputPlane]
 // need to later reduce output over: [inputPlane]
-void kernel forward_byinputplane( const int batchSize,
+void kernel forward_byinputplane(const int batchSize,
     global const float *images, global const float *filters,
     global float *output,
-    local float *_inputPlane, local float *_filterPlanes ) {
+    local float *_inputPlane, local float *_filterPlanes) {
     //  const int evenPadding = gFilterSize % 2 == 0 ? 1 : 0;
 
     const int globalId = get_global_id(0);
@@ -28,71 +28,71 @@
     const int localId = get_local_id(0);
 
     const int inputPlaneId = workgroupId;
-    const int numLoops = ( gNumFilters * gOutputImageSize + workgroupSize - 1 ) / workgroupSize;
-    const int numFilterCopyLoops = ( gFilterSizeSquared + gOutputImageSize - 1 ) / gOutputImageSize;
-    const int numImageCopyLoops = ( gInputImageSizeSquared + workgroupSize - 1 ) / workgroupSize;
-    for( int loop = 0; loop < numLoops; loop++ ) {
+    const int numLoops = (gNumFilters * gOutputSize + workgroupSize - 1) / workgroupSize;
+    const int numFilterCopyLoops = (gFilterSizeSquared + gOutputSize - 1) / gOutputSize;
+    const int numImageCopyLoops = (gInputSizeSquared + workgroupSize - 1) / workgroupSize;
+    for (int loop = 0; loop < numLoops; loop++) {
         const int loopLocalId = localId + loop * workgroupSize;
-        const int filterId = loopLocalId / gOutputImageSize;
-        const int outRow = loopLocalId % gOutputImageSize;
+        const int filterId = loopLocalId / gOutputSize;
+        const int outRow = loopLocalId % gOutputSize;
 
-        // copy down our filter, we have gOutputImageSize threads to do this
+        // copy down our filter, we have gOutputSize threads to do this
         global float const *globalFilterPlane = filters +
-            ( filterId * gNumInputPlanes + inputPlaneId ) * gFilterSizeSquared;
+            (filterId * gNumInputPlanes + inputPlaneId) * gFilterSizeSquared;
         local float *_localFilterPlane = _filterPlanes + filterId * gFilterSizeSquared;
         barrier(CLK_LOCAL_MEM_FENCE);
-        for( int i = 0; i < numFilterCopyLoops; i++ ) {
-            const int offset = i * gOutputImageSize + outRow;
+        for (int i = 0; i < numFilterCopyLoops; i++) {
+            const int offset = i * gOutputSize + outRow;
             bool process = filterId < gNumFilters && offset < gFilterSizeSquared;
-            if( process ) {
+            if (process) {
                 _localFilterPlane[ offset ] = globalFilterPlane[ offset ];
             }
         }
         // loop over n ...
-        for( int n = 0; n < batchSize; n++ ) {
+        for (int n = 0; n < batchSize; n++) {
             // copy down our imageplane, we have workgroupSize threads to do this
             barrier(CLK_LOCAL_MEM_FENCE);
             global float const *globalImagePlane = images +
-                ( n * gNumInputPlanes + inputPlaneId ) * gInputImageSizeSquared;
-            for( int i = 0; i< numImageCopyLoops; i++ ) {
+                (n * gNumInputPlanes + inputPlaneId) * gInputSizeSquared;
+            for (int i = 0; i< numImageCopyLoops; i++) {
                 const int offset = i * workgroupSize + localId;
-                if( offset < gInputImageSizeSquared ) {
+                if (offset < gInputSizeSquared) {
                     _inputPlane[ offset ] = globalImagePlane[ offset ];
                 }
             }
             barrier(CLK_LOCAL_MEM_FENCE);
             // calc output for each [outrow][outcol]
             bool filterPlaneOk = filterId < gNumFilters;
-            for( int outCol = 0; outCol < gOutputImageSize; outCol++ ) {
+            for (int outCol = 0; outCol < gOutputSize; outCol++) {
                 float sum = 0;
-                for( int filterRow = 0; filterRow < gFilterSize; filterRow++ ) {
+                for (int filterRow = 0; filterRow < gFilterSize; filterRow++) {
                     int inRow = outRow + filterRow;
 #if gPadZeros == 1
                     inRow -= gHalfFilterSize;
 #endif
-                    bool rowOk = filterPlaneOk && inRow >= 0 && inRow < gInputImageSize;
-                    for( int filterCol = 0; filterCol < gFilterSize; filterCol++ ) {
+                    bool rowOk = filterPlaneOk && inRow >= 0 && inRow < gInputSize;
+                    for (int filterCol = 0; filterCol < gFilterSize; filterCol++) {
                         int inCol = outCol + filterCol;
 #if gPadZeros == 1
                         inCol -= gHalfFilterSize;
 #endif
-                        bool process = rowOk && inCol >= 0 && inCol < gInputImageSize;
-                        if( process ) {
-                            float imageValue = _inputPlane[ inRow * gInputImageSize + inCol ];
+                        bool process = rowOk && inCol >= 0 && inCol < gInputSize;
+                        if (process) {
+                            float imageValue = _inputPlane[ inRow * gInputSize + inCol ];
                             float filterValue = _localFilterPlane[ filterRow * gFilterSize + filterCol ];
                             sum += imageValue * filterValue;
                         }
                     }
                 }
-                if( filterId < gNumFilters ) {
+                if (filterId < gNumFilters) {
                     // [n][filterId][outRow][outCol][inputPlane]
-                    int resultIndex = ( ( ( n
-                        * gNumFilters + filterId )
-                        * gOutputImageSize + outRow )
-                        * gOutputImageSize + outCol )
+                    int resultIndex = (( (n
+                        * gNumFilters + filterId)
+                        * gOutputSize + outRow)
+                        * gOutputSize + outCol)
                         * gNumInputPlanes + inputPlaneId;
                     output[resultIndex] = sum;
-                    //if( globalId == 2 ) output[0] = resultIndex;
+                    //if (globalId == 2) output[0] = resultIndex;
                     // output[resultIndex] = outRow;
                 }
                 // output[localId] = _localFilterPlane[localId];
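forward_byinputplane above leaves one partial sum per input plane in its [n][filterId][outRow][outCol][inputPlane] output, so a second pass still has to sum over the innermost dimension. A plain-C sketch of that reduction under the layout stated in the kernel comment (the function is illustrative, not DeepCL's actual reducer):

    #include <assert.h>

    /* sum the innermost [inputPlane] dimension of a partial-results buffer;
       outerCount is the flattened [n][filter][row][col] extent */
    static void reduce_input_planes(const float *partial, float *out,
                                    int outerCount, int numInputPlanes) {
        for (int i = 0; i < outerCount; i++) {
            float sum = 0;
            for (int p = 0; p < numInputPlanes; p++) {
                sum += partial[i * numInputPlanes + p];
            }
            out[i] = sum;
        }
    }

    int main(void) {
        float partial[2 * 3] = { 1, 2, 3, 4, 5, 6 };  /* 2 output pixels x 3 input planes */
        float out[2];
        reduce_input_planes(partial, out, 2, 3);
        assert(out[0] == 6.0f && out[1] == 15.0f);
        return 0;
    }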
diff --git a/cl/forward_fc_wgperrow.cl b/cl/forward_fc_wgperrow.cl
index fbe07ac7..f4f914aa 100644
--- a/cl/forward_fc_wgperrow.cl
+++ b/cl/forward_fc_wgperrow.cl
@@ -4,11 +4,11 @@
 // v. 2.0. If a copy of the MPL was not distributed with this file, You can
 // obtain one at http://mozilla.org/MPL/2.0/.
 
-void copyLocal( local float *restrict target, global float const *restrict source, int N ) {
-    int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);
-    for( int loop = 0; loop < numLoops; loop++ ) {
+void copyLocal(local float *restrict target, global float const *restrict source, int N) {
+    int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);
+    for (int loop = 0; loop < numLoops; loop++) {
         int offset = loop * get_local_size(0) + get_local_id(0);
-        if( offset < N ) {
+        if (offset < N) {
             target[offset] = source[offset];
         }
     }
@@ -41,15 +41,15 @@ void copyLocal( local float *restrict target, global float const *restrict sourc
 // filtersize == inputimagesize (mandatory)
 // inputimagesize == 19
 // filtersize == 19
-// outputImageSize == 1
+// outputSize == 1
 // lots of outplanes/filters, hundreds, but less than max work groupsize, eg 350, 500, 361
 // lots of inplanes, eg 32-128
 // inputimagesize around 19, not too small
-#if (gFilterSize == gInputImageSize) && (gPadZeros == 0)
-void kernel forward_fc_workgroup_perrow( const int batchSize,
+#if (gFilterSize == gInputSize) && (gPadZeros == 0)
+void kernel forward_fc_workgroup_perrow(const int batchSize,
     global const float *images, global const float *filters,
     global float *output1,
-    local float *_imageRow, local float *_filterRows ) {
+    local float *_imageRow, local float *_filterRows) {
     const int globalId = get_global_id(0);
 
     const int workgroupId = get_group_id(0);
@@ -67,32 +67,32 @@ void kernel forward_fc_workgroup_perrow( const int batchSize,
         + inputPlaneId * gFilterSizeSquared
         + filterRowId * gFilterSize;
     local float *_threadFilterRow = _filterRows + localId * gFilterSize;
-    if( localId < gNumFilters ) {
-        for( int i = 0; i < gFilterSize; i++ ) {
+    if (localId < gNumFilters) {
+        for (int i = 0; i < gFilterSize; i++) {
             _threadFilterRow[i] = filterRow[i];
         }
     }
-    const int loopsPerExample = ( gInputImageSize + workgroupSize - 1 ) / workgroupSize;
+    const int loopsPerExample = (gInputSize + workgroupSize - 1) / workgroupSize;
     // now loop over examples...
-    for( int n = 0; n < batchSize; n++ ) {
+    for (int n = 0; n < batchSize; n++) {
         // copy down example row, which is global to all threads in workgroup
         // hopefully should be enough threads....
         // but we should check anyway really, since depends on number of filters configured,
         // not on relative size of filter and input image
         barrier(CLK_LOCAL_MEM_FENCE);
-        copyLocal( _imageRow, images
-            + ( ( n
-                * gNumInputPlanes + inputPlaneId )
-                * gInputImageSize + filterRowId )
-                * gInputImageSize,
-            gInputImageSize );
+        copyLocal(_imageRow, images
+            + (( n
+                * gNumInputPlanes + inputPlaneId)
+                * gInputSize + filterRowId)
+                * gInputSize,
+            gInputSize);
         barrier(CLK_LOCAL_MEM_FENCE);
         // add up the values in our row...
         // note: dont activate yet, since need to reduce again
         // output structured as: [n][filter][inputplane][filterrow], need to reduce again after
-        if( localId < gNumFilters ) {
+        if (localId < gNumFilters) {
             float sum = 0;
-            for( int filterCol = 0; filterCol < gFilterSize; filterCol++ ) {
+            for (int filterCol = 0; filterCol < gFilterSize; filterCol++) {
                 sum += _imageRow[ filterCol ] * _threadFilterRow[ filterCol ];
             }
             output1[ n * gNumInputPlanes * gNumFilters * gFilterSize
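ids.cl, diffed next, wraps the OpenCL work-item ids in macros. For a 1-D NDRange with no global offset they relate as global id = group id * local size + local id, which is why kernels in this patch can switch freely between globalId and the (workgroupId, localId) pair. A plain-C illustration of that identity (illustrative only; a nonzero global offset would shift globalId):

    #include <assert.h>

    int main(void) {
        int workgroupSize = 64;
        for (int workgroupId = 0; workgroupId < 4; workgroupId++) {
            for (int localId = 0; localId < workgroupSize; localId++) {
                int globalId = workgroupId * workgroupSize + localId;
                assert(globalId / workgroupSize == workgroupId);
                assert(globalId % workgroupSize == localId);
            }
        }
        return 0;
    }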
diff --git a/cl/ids.cl b/cl/ids.cl
index 3891df82..8c03c865 100644
--- a/cl/ids.cl
+++ b/cl/ids.cl
@@ -4,9 +4,9 @@
 // v. 2.0. If a copy of the MPL was not distributed with this file, You can
 // obtain one at http://mozilla.org/MPL/2.0/.
 
-#define globalId ( get_global_id(0) )
-#define localId ( get_local_id(0) )
-#define workgroupId ( get_group_id(0) )
-#define workgroupSize ( get_local_size(0) )
+#define globalId (get_global_id(0))
+#define localId (get_local_id(0))
+#define workgroupId (get_group_id(0))
+#define workgroupSize (get_local_size(0))
diff --git a/cl/inv.cl b/cl/inv.cl
index 2ece5cf0..4f508e09 100644
--- a/cl/inv.cl
+++ b/cl/inv.cl
@@ -9,9 +9,9 @@
 kernel void array_inv(
     const int N,
-    global float *data ) {
+    global float *data) {
     const int globalId = get_global_id(0);
-    if( globalId >= N ) {
+    if (globalId >= N) {
         return;
     }
     data[globalId] = 1.0f / data[globalId];
diff --git a/cl/memset.cl b/cl/memset.cl
index 69643b95..db6dd003 100644
--- a/cl/memset.cl
+++ b/cl/memset.cl
@@ -4,9 +4,9 @@
 // v. 2.0. If a copy of the MPL was not distributed with this file, You can
 // obtain one at http://mozilla.org/MPL/2.0/.
 
-kernel void memset( global float *target, const float value, const int N ) {
+kernel void memset(global float *target, const float value, const int N) {
 #define globalId get_global_id(0)
-    if( globalId < N ) {
+    if (globalId < N) {
         target[globalId] = value;
     }
 }
diff --git a/cl/old/backproperrors.cl b/cl/old/backproperrors.cl
index 68815014..3705c16c 100644
--- a/cl/old/backproperrors.cl
+++ b/cl/old/backproperrors.cl
@@ -7,8 +7,8 @@
 // expected defines:
 // BIASED (or not)
 
-#define getFilterImageOffset( filter, inputPlane ) ( ( filter * gInputPlanes + inputPlane ) * gFilterSizeSquared )
-#define getResultImageOffset( n, filter ) ( ( n * gNumFilters + filter ) * gOutputImageSizeSquared )
+#define getFilterImageOffset(filter, inputPlane) (( filter * gInputPlanes + inputPlane) * gFilterSizeSquared)
+#define getResultImageOffset(n, filter) (( n * gNumFilters + filter) * gOutputSizeSquared)
 
 // handle lower layer...
 // gradOutput for upstream look like [n][inPlane][inRow][inCol]
@@ -27,7 +27,7 @@ void kernel calcGradInput(
     const int upstreamNumPlanes, const int upstreamImageSize, const int filterSize,
     const int outNumPlanes, const int outImageSize, const int padZeros,
-    global const float *weights, global const float *gradOutput, global float *gradInput ) {
+    global const float *weights, global const float *gradOutput, global float *gradInput) {
     int globalId = get_global_id(0);
     const int halfFilterSize = filterSize >> 1;
     const int margin = padZeros ? halfFilterSize : 0;
@@ -42,26 +42,26 @@
     const int upstreamPlane = upstreamImage2dId % upstreamNumPlanes;
     const int n = upstreamImage2dId / upstreamNumPlanes;
 
-    const int minFilterRow = max( 0, upstreamRow + margin - (outImageSize - 1) );
-    const int maxFilterRow = min( filterSize - 1, upstreamRow + margin );
-    const int minFilterCol = max( 0, upstreamCol + margin - (outImageSize -1) );
-    const int maxFilterCol = min( filterSize - 1, upstreamCol + margin );
+    const int minFilterRow = max(0, upstreamRow + margin - (outImageSize - 1));
+    const int maxFilterRow = min(filterSize - 1, upstreamRow + margin);
+    const int minFilterCol = max(0, upstreamCol + margin - (outImageSize -1));
+    const int maxFilterCol = min(filterSize - 1, upstreamCol + margin);
 
     float sumWeightTimesOutError = 0;
     // aggregate over [outPlane][outRow][outCol]
-    for( int outPlane = 0; outPlane < outNumPlanes; outPlane++ ) {
-        for( int filterRow = minFilterRow; filterRow <= maxFilterRow; filterRow++ ) {
+    for (int outPlane = 0; outPlane < outNumPlanes; outPlane++) {
+        for (int filterRow = minFilterRow; filterRow <= maxFilterRow; filterRow++) {
             int outRow = upstreamRow + margin - filterRow;
-            for( int filterCol = minFilterCol; filterCol <= maxFilterCol; filterCol++ ) {
+            for (int filterCol = minFilterCol; filterCol <= maxFilterCol; filterCol++) {
                 int outCol = upstreamCol + margin - filterCol;
-                int resultIndex = ( ( n * outNumPlanes
-                    + outPlane ) * outImageSize
-                    + outRow ) * outImageSize
+                int resultIndex = (( n * outNumPlanes
+                    + outPlane) * outImageSize
+                    + outRow) * outImageSize
                     + outCol;
                 float thisError = gradOutput[resultIndex];
-                int thisWeightIndex = ( ( outPlane * upstreamNumPlanes
-                    + upstreamPlane ) * filterSize
-                    + filterRow ) * filterSize
+                int thisWeightIndex = (( outPlane * upstreamNumPlanes
+                    + upstreamPlane) * filterSize
+                    + filterRow) * filterSize
                     + filterCol;
                 float thisWeight = weights[thisWeightIndex];
                 float thisWeightTimesError = thisWeight * thisError;
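In calcGradInput above, outRow = upstreamRow + margin - filterRow, so clamping filterRow to max(0, upstreamRow + margin - (outImageSize - 1)) .. min(filterSize - 1, upstreamRow + margin) is what keeps both outRow and filterRow in range without per-tap checks. A standalone C check of that invariant (illustrative values; margin = halfFilterSize is the padZeros case):

    #include <assert.h>

    static int max_i(int a, int b) { return a > b ? a : b; }
    static int min_i(int a, int b) { return a < b ? a : b; }

    int main(void) {
        int filterSize = 5, outImageSize = 19, upstreamImageSize = 19;
        int margin = filterSize >> 1;
        for (int upstreamRow = 0; upstreamRow < upstreamImageSize; upstreamRow++) {
            int minFilterRow = max_i(0, upstreamRow + margin - (outImageSize - 1));
            int maxFilterRow = min_i(filterSize - 1, upstreamRow + margin);
            for (int filterRow = minFilterRow; filterRow <= maxFilterRow; filterRow++) {
                int outRow = upstreamRow + margin - filterRow;
                assert(outRow >= 0 && outRow < outImageSize);
            }
        }
        return 0;
    }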
@@ -78,7 +78,7 @@
 // localid: [upstreamrow][upstreamcol]
 // per-thread aggregation: [outPlane][filterRow][filterCol]
 // need to store locally:
-// - _errorImage. size = outputImageSizeSquared
+// - _errorImage. size = outputSizeSquared
 // - _filterImage. size = filtersizesquared
 // note: currently doesnt use bias as input. thats probably an error?
 // inputs: gradOutput :convolve: filters => gradInput
@@ -86,14 +86,14 @@ void kernel calcGradInput(
 // per workgroup:
 // gradOutput: [outPlane][outRow][outCol] 32 * 19 * 19 * 4 = 46KB
 // weights: [filterId][filterRow][filterCol] 32 * 5 * 5 * 4 = 3.2KB
-#ifdef gOutputImageSize // for previous tests that dont define it
+#ifdef gOutputSize // for previous tests that dont define it
 void kernel calcGradInputCached(
     const int batchSize,
     global const float *gradOutputGlobal,
     global const float *filtersGlobal,
     global float *gradInput,
     local float *_errorImage,
-    local float *_filterImage ) {
+    local float *_filterImage) {
 
     const int globalId = get_global_id(0);
     const int localId = get_local_id(0);
@@ -103,43 +103,43 @@
     const int n = workgroupId / gInputPlanes;
     const int upstreamPlane = workgroupId % gInputPlanes;
 
-    const int upstreamRow = localId / gInputImageSize;
-    const int upstreamCol = localId % gInputImageSize;
+    const int upstreamRow = localId / gInputSize;
+    const int upstreamCol = localId % gInputSize;
 
-    const int minFilterRow = max( 0, upstreamRow + gMargin - (gOutputImageSize - 1) );
-    const int maxFilterRow = min( gFilterSize - 1, upstreamRow + gMargin );
-    const int minFilterCol = max( 0, upstreamCol + gMargin - (gOutputImageSize -1) );
-    const int maxFilterCol = min( gFilterSize - 1, upstreamCol + gMargin );
+    const int minFilterRow = max(0, upstreamRow + gMargin - (gOutputSize - 1));
+    const int maxFilterRow = min(gFilterSize - 1, upstreamRow + gMargin);
+    const int minFilterCol = max(0, upstreamCol + gMargin - (gOutputSize -1));
+    const int maxFilterCol = min(gFilterSize - 1, upstreamCol + gMargin);
 
-    const int filterPixelCopiesPerThread = ( gFilterSizeSquared + workgroupSize - 1 ) / workgroupSize;
-    const int errorPixelCopiesPerThread = ( gOutputImageSizeSquared + workgroupSize - 1 ) / workgroupSize;
-    const int pixelCopiesPerThread = max( filterPixelCopiesPerThread, errorPixelCopiesPerThread );
+    const int filterPixelCopiesPerThread = (gFilterSizeSquared + workgroupSize - 1) / workgroupSize;
+    const int errorPixelCopiesPerThread = (gOutputSizeSquared + workgroupSize - 1) / workgroupSize;
+    const int pixelCopiesPerThread = max(filterPixelCopiesPerThread, errorPixelCopiesPerThread);
 
     float sumWeightTimesOutError = 0;
-    for( int outPlane = 0; outPlane < gNumFilters; outPlane++ ) {
-        const int filterImageGlobalOffset =( outPlane * gInputPlanes + upstreamPlane ) * gFilterSizeSquared;
-        const int errorImageGlobalOffset = ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared;
+    for (int outPlane = 0; outPlane < gNumFilters; outPlane++) {
+        const int filterImageGlobalOffset =(outPlane * gInputPlanes + upstreamPlane) * gFilterSizeSquared;
+        const int errorImageGlobalOffset = (n * gNumFilters + outPlane) * gOutputSizeSquared;
         barrier(CLK_LOCAL_MEM_FENCE);
-        for( int i = 0; i < pixelCopiesPerThread; i++ ) {
+        for (int i = 0; i < pixelCopiesPerThread; i++) {
             int thisOffset = workgroupSize * i + localId;
-            if( thisOffset < gFilterSizeSquared ) {
+            if (thisOffset < gFilterSizeSquared) {
                 _filterImage[ thisOffset ] = filtersGlobal[ filterImageGlobalOffset + thisOffset ];
             }
-            if( thisOffset < gOutputImageSizeSquared ) {
+            if (thisOffset < gOutputSizeSquared) {
                 _errorImage[ thisOffset ] = gradOutputGlobal[ errorImageGlobalOffset + thisOffset ];
             }
         }
         barrier(CLK_LOCAL_MEM_FENCE);
-//        if( globalId == 0 ) {
-//            for( int i = 0; i < gFilterSizeSquared; i++ ) {
+//        if (globalId == 0) {
+//            for (int i = 0; i < gFilterSizeSquared; i++) {
 //                gradInput[ (outPlane+1)*100 + i ] = _filterImage[i];
 //            }
 //        }
-        for( int filterRow = minFilterRow; filterRow <= maxFilterRow; filterRow++ ) {
+        for (int filterRow = minFilterRow; filterRow <= maxFilterRow; filterRow++) {
             int outRow = upstreamRow + gMargin - filterRow;
-            for( int filterCol = minFilterCol; filterCol <= maxFilterCol; filterCol++ ) {
+            for (int filterCol = minFilterCol; filterCol <= maxFilterCol; filterCol++) {
                 int outCol = upstreamCol + gMargin - filterCol;
-                int resultIndex = outRow * gOutputImageSize + outCol;
+                int resultIndex = outRow * gOutputSize + outCol;
                 float thisError = _errorImage[resultIndex];
                 int thisWeightIndex = filterRow * gFilterSize + filterCol;
                 float thisWeight = _filterImage[thisWeightIndex];
@@ -148,8 +148,8 @@
             }
         }
     }
-    const int upstreamImageGlobalOffset = ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared;
-    if( localId < gInputImageSizeSquared ) {
+    const int upstreamImageGlobalOffset = (n * gInputPlanes + upstreamPlane) * gInputSizeSquared;
+    if (localId < gInputSizeSquared) {
         gradInput[upstreamImageGlobalOffset + localId] = sumWeightTimesOutError;
     }
 }
@@ -161,45 +161,45 @@ void kernel calcGradInputCached(
 // so, workgroupId is [upstreamPlane]
 // localId is [upstreamRow][upstreamCol]
 // we iterate over [n]
-#ifdef gOutputImageSize // for previous tests that dont define it
+#ifdef gOutputSize // for previous tests that dont define it
 /*
 void kernel calcGradInput2(
     const int batchSize,
     global const float *weightsGlobal,
     global const float *gradOutputGlobal,
     global float *gradInputGlobal,
-    local float *_weightImage, local float *_errorImage ) {
+    local float *_weightImage, local float *_errorImage) {
     const int globalId = get_global_id(0);
     const int workgroupId = get_group_id(0);
     const int localId = get_local_id(0);
     const int workgroupSize = get_local_size(0);
 
     const int upstreamPlane = workgroupId;
 
-    const int upstreamRow = localId / gInputImageSize;
-    const int upstreamCol = localId % gInputImageSize;
+    const int upstreamRow = localId / gInputSize;
+    const int upstreamCol = localId % gInputSize;
 
     const int
 
-    if( localId < filterSizeSquared ) {
+    if (localId < filterSizeSquared) {
         _weightImage[localId] = weightsGlobal[localId];
     }
-    for( int n = 0; n < batchSize; n++ ) {
+    for (int n = 0; n < batchSize; n++) {
         float sumWeightTimesOutError = 0;
         // aggregate over [outPlane][outRow][outCol]
-        for( int outPlane = 0; outPlane < outNumPlanes; outPlane++ ) {
-            for( int outRow = 0; outRow < outImageSize; outRow++ ) {
+        for (int outPlane = 0; outPlane < outNumPlanes; outPlane++) {
+            for (int outRow = 0; outRow < outImageSize; outRow++) {
                 // need to derive filterRow and filterCol, given outRow and outCol
                 int filterRow = upstreamRow + margin - outRow;
-                for( int outCol = 0; outCol < outImageSize; outCol++ ) {
+                for (int outCol = 0; outCol < outImageSize; outCol++) {
                     // need to derive filterRow and filterCol, given outRow and outCol
                     int filterCol = upstreamCol + margin - outCol;
-                    int resultIndex = ( ( n * outNumPlanes
-                        + outPlane ) * outImageSize
-                        + outRow ) * outImageSize
+                    int resultIndex = (( n * outNumPlanes
+                        + outPlane) * outImageSize
+                        + outRow) * outImageSize
                         + outCol;
                     float thisError = gradOutput[resultIndex];
-                    int thisWeightIndex = ( ( outPlane * upstreamNumPlanes
-                        + upstreamPlane ) * filterSize
-                        + filterRow ) * filterSize
+                    int thisWeightIndex = (( outPlane * upstreamNumPlanes
+                        + upstreamPlane) * filterSize
+                        + filterRow) * filterSize
                         + filterCol;
                     float thisWeight = weights[thisWeightIndex];
                     float thisWeightTimesError = thisWeight * thisError;
@@ -231,47 +231,47 @@ void kernel calcGradInput2(
 // filters are organized like [filterid][inplane][filterrow][filtercol]
 // (so we will swap filterid and inplane around when referencing filters, kindof)
 // globalid will be organized like upstreamoutput, ie [imageid][upstreamplane][upstreamrow][upstreamcol]
-#ifdef gOutputImageSize // for previous tests that dont define it
+#ifdef gOutputSize // for previous tests that dont define it
 void kernel convolve_errorcubes_float(
     const int batchSize,
     global const float *errorcubes, global const float *filters,
-    global float *upstreamErrors ) {
+    global float *upstreamErrors) {
     int globalId = get_global_id(0);
 
-    int upstreamImage2Id = globalId / gInputImageSizeSquared;
+    int upstreamImage2Id = globalId / gInputSizeSquared;
     int exampleId = upstreamImage2Id / gInputPlanes;
     int filterId = upstreamImage2Id % gInputPlanes;
-    if( exampleId >= batchSize ) {
+    if (exampleId >= batchSize) {
         return;
     }
 /*
-    int errorCubeOffset = exampleId * gOutPlanes * gOutputImageSizeSquared;
+    int errorCubeOffset = exampleId * gOutPlanes * gOutputSizeSquared;
     int filterCubeOffset = filterId * gNumInputPlanes * gFilterSizeSquared;
 
     int localid = globalId % upstreamImageSizeSquared;
-    int upstreamRow = localid / gInputImageSize;
-    int upstreamCol = localid % gInputImageSize;
+    int upstreamRow = localid / gInputSize;
+    int upstreamCol = localid % gInputSize;
 
     float sum = 0;
     // ====in progress
-    int minm = padZeros ? max( -halfFilterSize, -outputRow ) : -halfFilterSize;
+    int minm = padZeros ? max(-halfFilterSize, -outputRow) : -halfFilterSize;
     // ====to do
-    int maxm = padZeros ? min( halfFilterSize, outputImageSize - 1 - outputRow ) : halfFilterSize;
-    int minn = padZeros ? max( -halfFilterSize, -outputCol ) : - halfFilterSize;
-    int maxn = padZeros ? min( halfFilterSize, outputImageSize - 1 - outputCol ) : halfFilterSize;
+    int maxm = padZeros ? min(halfFilterSize, outputSize - 1 - outputRow) : halfFilterSize;
+    int minn = padZeros ? max(-halfFilterSize, -outputCol) : -halfFilterSize;
+    int maxn = padZeros ? min(halfFilterSize, outputSize - 1 - outputCol) : halfFilterSize;
     int inputPlane = 0;
-    while( inputPlane < numInputPlanes ) {
-        int inputImageOffset = inputCubeOffset + inputPlane * inputImageSizeSquared;
+    while(inputPlane < numInputPlanes) {
+        int inputImageOffset = inputCubeOffset + inputPlane * inputSizeSquared;
         int filterImageOffset = filterCubeOffset + inputPlane * filterSizeSquared;
         int m = minm;
-        while( m <= maxm ) {
-            int inputRow = outputRow + m + ( padZeros ? 0 : halfFilterSize );
-            int inputimagerowoffset = inputImageOffset + inputRow * inputImageSize;
+        while(m <= maxm) {
+            int inputRow = outputRow + m + (padZeros ? 0 : halfFilterSize);
+            int inputimagerowoffset = inputImageOffset + inputRow * inputSize;
             int filterrowoffset = filterImageOffset + (m+halfFilterSize) * filterSize + halfFilterSize;
             int n = minn;
-            while( n <= maxn ) {
-                int inputCol = outputCol + n + ( padZeros ? 0 : halfFilterSize );
+            while(n <= maxn) {
+                int inputCol = outputCol + n + (padZeros ? 0 : halfFilterSize);
                 sum += images[ inputimagerowoffset + inputCol] * filters[ filterrowoffset + n ];
                 n++;
             }
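The per-element kernels in the next file broadcast a small source buffer across a larger target; repeated_add indexes source with (globalId / repeatSize) % sourceSize, so each source value is repeated repeatSize times and the whole source then tiles the target. A plain-C model of that indexing (sizes are illustrative):

    #include <assert.h>

    int main(void) {
        enum { sourceSize = 3, repeatSize = 2, N = 12 };
        const float source[sourceSize] = { 10.0f, 20.0f, 30.0f };
        float target[N] = { 0 };
        for (int globalId = 0; globalId < N; globalId++) {
            target[globalId] += source[(globalId / repeatSize) % sourceSize];
        }
        /* target is 10,10,20,20,30,30,10,10,20,20,30,30 */
        assert(target[0] == 10.0f && target[2] == 20.0f && target[6] == 10.0f);
        return 0;
    }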
-kernel void per_element_add( const int N, global float *target, global const float *source ) { +kernel void per_element_add(const int N, global float *target, global const float *source) { const int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } target[globalId] += source[globalId]; @@ -14,19 +14,19 @@ kernel void per_element_add( const int N, global float *target, global const flo // adds source to target // tiles source as necessary, according to tilingSize -kernel void per_element_tiled_add( const int N, const int tilingSize, global float *target, global const float *source ) { +kernel void per_element_tiled_add(const int N, const int tilingSize, global float *target, global const float *source) { const int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } target[globalId] += source[globalId % tilingSize]; } -kernel void repeated_add( const int N, const int sourceSize, const int repeatSize, global float *target, global const float *source ) { +kernel void repeated_add(const int N, const int sourceSize, const int repeatSize, global float *target, global const float *source) { const int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } - target[globalId] += source[ ( globalId / repeatSize ) % sourceSize ]; + target[globalId] += source[ (globalId / repeatSize) % sourceSize ]; } diff --git a/cl/per_element_mult.cl b/cl/per_element_mult.cl index e7f72432..72ff2f28 100644 --- a/cl/per_element_mult.cl +++ b/cl/per_element_mult.cl @@ -4,9 +4,9 @@ // v. 2.0. If a copy of the MPL was not distributed with this file, You can // obtain one at http://mozilla.org/MPL/2.0/. -kernel void per_element_mult_inplace( const int N, global float *target, global const float *source ) { +kernel void per_element_mult_inplace(const int N, global float *target, global const float *source) { const int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } target[globalId] *= source[globalId]; diff --git a/cl/per_element_op1.cl b/cl/per_element_op1.cl index ebf586a4..159039e3 100644 --- a/cl/per_element_op1.cl +++ b/cl/per_element_op1.cl @@ -4,23 +4,23 @@ // v. 2.0. If a copy of the MPL was not distributed with this file, You can // obtain one at http://mozilla.org/MPL/2.0/. -float operation( float val_one ) { +float operation(float val_one) { return {{operation}}; } -kernel void per_element_op1_inplace( const int N, global float *target ) { +kernel void per_element_op1_inplace(const int N, global float *target) { const int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } - target[globalId] = operation( target[globalId] ); + target[globalId] = operation(target[globalId]); } -kernel void per_element_op1_outofplace( const int N, global float *target, global float *one ) { +kernel void per_element_op1_outofplace(const int N, global float *target, global float *one) { const int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } - target[globalId] = operation( one[globalId] ); + target[globalId] = operation(one[globalId]); } diff --git a/cl/per_element_op2.cl b/cl/per_element_op2.cl index 840f2b7d..03bed67d 100644 --- a/cl/per_element_op2.cl +++ b/cl/per_element_op2.cl @@ -4,23 +4,23 @@ // v. 2.0. If a copy of the MPL was not distributed with this file, You can // obtain one at http://mozilla.org/MPL/2.0/. 
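(Aside on the per-element template kernels in this patch — per_element_op1.cl above and per_element_op2.cl just below: `{{operation}}` is not OpenCL syntax; it is a placeholder that the host code substitutes before the kernel source is compiled. The sketch below is a hypothetical C++ illustration of that idea, not DeepCL's actual templating helper, which this patch does not show; `substituteToken` and the chosen expression are invented for the example.)

```cpp
// Hypothetical sketch: instantiating a "{{operation}}" template kernel host-side.
#include <iostream>
#include <string>

// Replace every occurrence of `token` in `source` with `value`.
std::string substituteToken(std::string source, const std::string &token,
                            const std::string &value) {
    for (std::string::size_type pos = source.find(token);
            pos != std::string::npos;
            pos = source.find(token, pos + value.size())) {
        source.replace(pos, token.size(), value);
    }
    return source;
}

int main() {
    // Template fragment, as in cl/per_element_op2.cl below:
    std::string templateSource =
        "float operation(float val_one, float val_two) {\n"
        "    return {{operation}};\n"
        "}\n";
    // Substituting "val_one * val_two" yields a per-element multiply;
    // the resulting string is what would be handed to the OpenCL compiler.
    std::cout << substituteToken(templateSource, "{{operation}}",
                                 "val_one * val_two");
    return 0;
}
```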
-float operation( float val_one, float val_two ) { +float operation(float val_one, float val_two) { return {{operation}}; } -kernel void per_element_op2_inplace( const int N, global float *target, global const float *source ) { +kernel void per_element_op2_inplace(const int N, global float *target, global const float *source) { const int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } - target[globalId] = operation( target[globalId], source[globalId] ); + target[globalId] = operation(target[globalId], source[globalId]); } -kernel void per_element_op2_outofplace( const int N, global float *target, global float *one, global const float *two ) { +kernel void per_element_op2_outofplace(const int N, global float *target, global float *one, global const float *two) { const int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } - target[globalId] = operation( one[globalId], two[globalId] ); + target[globalId] = operation(one[globalId], two[globalId]); } diff --git a/cl/per_element_op2_scalar.cl b/cl/per_element_op2_scalar.cl index 7a7b8ffb..0dfd253d 100644 --- a/cl/per_element_op2_scalar.cl +++ b/cl/per_element_op2_scalar.cl @@ -4,23 +4,23 @@ // v. 2.0. If a copy of the MPL was not distributed with this file, You can // obtain one at http://mozilla.org/MPL/2.0/. -float operation( float val_one, float val_two ) { +float operation(float val_one, float val_two) { return {{operation}}; } -kernel void per_element_op2_inplace( const int N, global float *target, const float scalar ) { +kernel void per_element_op2_inplace(const int N, global float *target, const float scalar) { const int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } - target[globalId] = operation( target[globalId], scalar ); + target[globalId] = operation(target[globalId], scalar); } -kernel void per_element_op2_outofplace( const int N, global float *target, global float *source, const float scalar ) { +kernel void per_element_op2_outofplace(const int N, global float *target, global float *source, const float scalar) { const int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } - target[globalId] = operation( source[globalId], scalar ); + target[globalId] = operation(source[globalId], scalar); } diff --git a/cl/pooling.cl b/cl/pooling.cl index ce384de3..b2c2f1b8 100644 --- a/cl/pooling.cl +++ b/cl/pooling.cl @@ -7,33 +7,33 @@ // every plane is independent // every example is independent // so, globalid can be: [n][plane][outputRow][outputCol] -kernel void forwardNaive( const int batchSize, global const float *input, global int *selectors, global float *output ) { +kernel void forwardNaive(const int batchSize, global const float *input, global int *selectors, global float *output) { const int globalId = get_global_id(0); - const int intraImageOffset = globalId % gOutputImageSizeSquared; - const int outputRow = intraImageOffset / gOutputImageSize; - const int outputCol = intraImageOffset % gOutputImageSize; + const int intraImageOffset = globalId % gOutputSizeSquared; + const int outputRow = intraImageOffset / gOutputSize; + const int outputCol = intraImageOffset % gOutputSize; - const int image2dIdx = globalId / gOutputImageSizeSquared; + const int image2dIdx = globalId / gOutputSizeSquared; const int plane = image2dIdx % gNumPlanes; const int n = image2dIdx / gNumPlanes; - if( n >= batchSize ) { + if (n >= batchSize) { return; } const int inputRow = outputRow * gPoolingSize; const int 
inputCol = outputCol * gPoolingSize; - const int inputImageOffset = ( n * gNumPlanes + plane ) * gInputImageSizeSquared; + const int inputImageOffset = (n * gNumPlanes + plane) * gInputSizeSquared; int selector = 0; - int poolInputOffset = inputImageOffset + inputRow * gInputImageSize + inputCol; + int poolInputOffset = inputImageOffset + inputRow * gInputSize + inputCol; float maxValue = input[ poolInputOffset ]; - for( int dRow = 0; dRow < gPoolingSize; dRow++ ) { - for( int dCol = 0; dCol < gPoolingSize; dCol++ ) { - bool process = ( inputRow + dRow < gInputImageSize ) && ( inputCol + dCol < gInputImageSize ); - if( process ) { - float thisValue = input[ poolInputOffset + dRow * gInputImageSize + dCol ]; - if( thisValue > maxValue ) { + for (int dRow = 0; dRow < gPoolingSize; dRow++) { + for (int dCol = 0; dCol < gPoolingSize; dCol++) { + bool process = (inputRow + dRow < gInputSize) && (inputCol + dCol < gInputSize); + if (process) { + float thisValue = input[ poolInputOffset + dRow * gInputSize + dCol ]; + if (thisValue > maxValue) { maxValue = thisValue; selector = dRow * gPoolingSize + dCol; } diff --git a/cl/reduce_segments.cl b/cl/reduce_segments.cl index a4008e1b..f20aec3f 100644 --- a/cl/reduce_segments.cl +++ b/cl/reduce_segments.cl @@ -4,18 +4,18 @@ // v. 2.0. If a copy of the MPL was not distributed with this file, You can // obtain one at http://mozilla.org/MPL/2.0/. -kernel void reduce_segments( const int numSegments, const int segmentLength, - global float const *in, global float* out ) { +kernel void reduce_segments(const int numSegments, const int segmentLength, + global float const *in, global float* out) { const int globalId = get_global_id(0); const int segmentId = globalId; - if( segmentId >= numSegments ) { + if (segmentId >= numSegments) { return; } float sum = 0; global const float *segment = in + segmentId * segmentLength; - for( int i = 0; i < segmentLength; i++ ) { + for (int i = 0; i < segmentLength; i++) { sum += segment[i]; } out[segmentId] = sum; diff --git a/cl/sqrt.cl b/cl/sqrt.cl index 937990de..998145a3 100644 --- a/cl/sqrt.cl +++ b/cl/sqrt.cl @@ -6,11 +6,11 @@ kernel void array_sqrt( const int N, - global float *data ) { + global float *data) { const int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } - data[globalId] = native_sqrt( data[globalId] ); + data[globalId] = native_sqrt(data[globalId]); } diff --git a/cl/squared.cl b/cl/squared.cl index b328c15b..52860512 100644 --- a/cl/squared.cl +++ b/cl/squared.cl @@ -6,9 +6,9 @@ kernel void array_squared( const int N, - global float *data ) { + global float *data) { const int globalId = get_global_id(0); - if( globalId >= N ) { + if (globalId >= N) { return; } data[globalId] = data[globalId] * data[globalId]; diff --git a/cl/unused/forward_fc.cl b/cl/unused/forward_fc.cl index 4bae0eae..e4eae2f9 100644 --- a/cl/unused/forward_fc.cl +++ b/cl/unused/forward_fc.cl @@ -11,7 +11,7 @@ #ifdef TANH #define ACTIVATION_FUNCTION(output) (tanh(output)) #elif defined SCALEDTANH - #define ACTIVATION_FUNCTION(output) ( 1.7159f * tanh( 0.66667f * output)) + #define ACTIVATION_FUNCTION(output) (1.7159f * tanh(0.66667f * output)) #elif SIGMOID #define ACTIVATION_FUNCTION(output) (1.0f / (1 + exp(-output))) #elif defined RELU @@ -25,16 +25,16 @@ // output1: [n][inputplane][filter][filterrow] // output2: [n][inputplane][filter] #ifdef ACTIVATION_FUNCTION // protect against not defined -kernel void reduce_rows( const int batchSize, global float const *output1, global float*output2 ) { 
+kernel void reduce_rows(const int batchSize, global float const *output1, global float*output2) { const int globalId = get_global_id(0); const int n = globalId / gNumInputPlanes / gNumFilters; - if( n >= batchSize ) { + if (n >= batchSize) { return; } const int filterId = globalId % gNumFilters; float sum = 0; global const float *output1Col = output1 + globalId * gFilterSize; - for( int filterRow = 0; filterRow < gFilterSize; filterRow++ ) { + for (int filterRow = 0; filterRow < gFilterSize; filterRow++) { sum += output1Col[filterRow]; } output2[globalId] = sum; @@ -45,16 +45,16 @@ kernel void reduce_rows( const int batchSize, global float const *output1, globa // output2: [n][inputplane][filter] // output: [n][filter] #ifdef ACTIVATION_FUNCTION // protect against not defined -kernel void reduce_inputplanes( const int batchSize, global float const *output2, global float*output ) { +kernel void reduce_inputplanes(const int batchSize, global float const *output2, global float*output) { const int globalId = get_global_id(0); const int n = globalId / gNumFilters; - if( n >= batchSize ) { + if (n >= batchSize) { return; } const int filterId = globalId % gNumFilters; float sum = 0; global const float *output2Col = output2 + globalId * gNumInputPlanes; - for( int inputPlane = 0; inputPlane < gNumInputPlanes; inputPlane++ ) { + for (int inputPlane = 0; inputPlane < gNumInputPlanes; inputPlane++) { sum += output2Col[inputPlane]; } // activate... @@ -70,18 +70,18 @@ kernel void reduce_inputplanes( const int batchSize, global float const *output2 // this kernel assumes: // padzeros == 0 (mandatory) // filtersize == inputimagesize (mandatory) -// outputImageSize == 1 +// outputSize == 1 // lots of outplanes, hundreds, but less than max work groupsize, eg 350, 500, 361 // lots of inplanes, eg 32 // inputimagesize around 19, not too small -#if gFilterSize == gInputImageSize && gPadZeros == 0 -void kernel forward_filter_matches_inimage( const int batchSize, +#if gFilterSize == gInputSize && gPadZeros == 0 +void kernel forward_filter_matches_inimage(const int batchSize, global const float *images, global const float *filters, #ifdef BIASED global const float*biases, #endif global float *output, - local float *_upstreamImage, local float *_filterImage ) { + local float *_upstreamImage, local float *_filterImage) { const int globalId = get_global_id(0); const int workgroupId = get_group_id(0); @@ -94,30 +94,30 @@ void kernel forward_filter_matches_inimage( const int batchSize, const int filterCol = localId % gFilterSize; float sum = 0; - for( int upstreamPlane = 0; upstreamPlane < gUpstreamNumPlanes; upstreamPlane++ ) { - int thisUpstreamImageOffset = ( n * gUpstreamNumPlanes + upstreamPlane ) * gUpstreamImageSizeSquared; + for (int upstreamPlane = 0; upstreamPlane < gUpstreamNumPlanes; upstreamPlane++) { + int thisUpstreamImageOffset = (n * gUpstreamNumPlanes + upstreamPlane) * gUpstreamImageSizeSquared; barrier(CLK_LOCAL_MEM_FENCE); - for( int i = 0; i < numUpstreamsPerThread; i++ ) { + for (int i = 0; i < numUpstreamsPerThread; i++) { int thisOffset = workgroupSize * i + localId; - if( thisOffset < gUpstreamImageSizeSquared ) { + if (thisOffset < gUpstreamImageSizeSquared) { _upstreamImage[ thisOffset ] = images[ thisUpstreamImageOffset + thisOffset ]; } } - const int filterGlobalOffset = ( outPlane * gUpstreamNumPlanes + upstreamPlane ) * gFilterSizeSquared; - for( int i = 0; i < numFilterPixelsPerThread; i++ ) { + const int filterGlobalOffset = (outPlane * gUpstreamNumPlanes + upstreamPlane) * 
gFilterSizeSquared; + for (int i = 0; i < numFilterPixelsPerThread; i++) { int thisOffset = workgroupSize * i + localId; - if( thisOffset < gFilterSizeSquared ) { + if (thisOffset < gFilterSizeSquared) { _filterCube[thisOffset] = filters[filterGlobalOffset + thisOffset]; } } barrier(CLK_LOCAL_MEM_FENCE); - if( localId < gOutImageSizeSquared ) { - for( int u = minu; u <= maxu; u++ ) { - int inputRow = outputRow + u + ( gPadZeros ? 0 : gHalfFilterSize ); + if (localId < gOutImageSizeSquared) { + for (int u = minu; u <= maxu; u++) { + int inputRow = outputRow + u + (gPadZeros ? 0 : gHalfFilterSize); int inputimagerowoffset = inputRow * gUpstreamImageSize; int filterrowoffset = (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize; - for( int v = minv; v <= maxv; v++ ) { - int inputCol = outputCol + v + ( gPadZeros ? 0 : gHalfFilterSize ); + for (int v = minv; v <= maxv; v++) { + int inputCol = outputCol + v + (gPadZeros ? 0 : gHalfFilterSize); sum += _upstreamImage[ inputimagerowoffset + inputCol] * _filterCube[ filterrowoffset + v ]; } } @@ -127,8 +127,8 @@ void kernel forward_filter_matches_inimage( const int batchSize, sum += biases[outPlane]; #endif // output are organized like [imageid][filterid][row][col] - int resultIndex = ( n * gNumOutPlanes + outPlane ) * gOutImageSizeSquared + localId; - if( localId < gOutImageSizeSquared ) { + int resultIndex = (n * gNumOutPlanes + outPlane) * gOutImageSizeSquared + localId; + if (localId < gOutImageSizeSquared) { output[resultIndex ] = ACTIVATION_FUNCTION(sum); // output[resultIndex ] = 123; } diff --git a/cl/unused/forwardfc_workgroupperfilterplane.cl b/cl/unused/forwardfc_workgroupperfilterplane.cl index 2159b4e2..0eb0e0f4 100644 --- a/cl/unused/forwardfc_workgroupperfilterplane.cl +++ b/cl/unused/forwardfc_workgroupperfilterplane.cl @@ -11,7 +11,7 @@ #ifdef TANH #define ACTIVATION_FUNCTION(output) (tanh(output)) #elif defined SCALEDTANH - #define ACTIVATION_FUNCTION(output) ( 1.7159f * tanh( 0.66667f * output)) + #define ACTIVATION_FUNCTION(output) (1.7159f * tanh(0.66667f * output)) #elif SIGMOID #define ACTIVATION_FUNCTION(output) (1.0f / (1 + exp(-output))) #elif defined RELU @@ -38,7 +38,7 @@ // localid as [filterRow][filterCol] // output as [n][filterId][inputPlane] #if gFilterSize == gInputImagesize && gPadZeros == 0 -kernel void kernel1( const int batchSize, +kernel void kernel1(const int batchSize, global float const * images, global float const * filters, global float *output1, @@ -48,7 +48,7 @@ kernel void kernel1( const int batchSize, const int workgroupId = get_group_id(0); const int localId = get_local_id(0); - if( localId >= gFilterSizeSquared ) { + if (localId >= gFilterSizeSquared) { return; } @@ -58,10 +58,10 @@ kernel void kernel1( const int batchSize, // first copy down our filter plane, assume we have exactly one thread per // filter plane pixel global float *filterPlane = filters - + ( filterId * gNumInputPlanes + inputPlane ) * gFilterSizeSquared; + + (filterId * gNumInputPlanes + inputPlane) * gFilterSizeSquared; _filterPlane[localId] = filterPlane[localId]; barrier(CLK_LOCAL_MEM_FENCE); - for( int n = 0; n < batchSize; n++ ) { + for (int n = 0; n < batchSize; n++) { // copy down the example plane // oh, problem with this is, no sharing of this example across multiple filters.... 
} diff --git a/CMakeModules/build_EasyCL.cmake b/cmake/build_EasyCL.cmake similarity index 73% rename from CMakeModules/build_EasyCL.cmake rename to cmake/build_EasyCL.cmake index 54e1f8df..07d93763 100644 --- a/CMakeModules/build_EasyCL.cmake +++ b/cmake/build_EasyCL.cmake @@ -1,6 +1,5 @@ INCLUDE(ExternalProject) -message("CMAKE_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX}") ExternalProject_Add( EasyCL-external SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/EasyCL @@ -10,6 +9,7 @@ ExternalProject_Add( -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_INSTALL_PREFIX} -DBUILD_SHARED_LIBRARY:BOOL=ON -DBUILD_TESTS:BOOL=OFF + -DPROVIDE_LUA_ENGINE:BOOL=${BUILD_INTERNAL_LUA} -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo ) @@ -17,7 +17,7 @@ ADD_LIBRARY(EasyCL SHARED IMPORTED) SET_PROPERTY(TARGET EasyCL PROPERTY IMPORTED_LOCATION "${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}EasyCL${CMAKE_SHARED_LIBRARY_SUFFIX}") SET_PROPERTY(TARGET EasyCL PROPERTY IMPORTED_IMPLIB "${CMAKE_INSTALL_PREFIX}/lib/EasyCL.lib") ADD_DEPENDENCIES(EasyCL EasyCL-external) -SET(EASYCL_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/EasyCL) +SET(EASYCL_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/EasyCL ${CMAKE_CURRENT_SOURCE_DIR}/EasyCL/thirdparty/clew/include ${CMAKE_CURRENT_SOURCE_DIR}/EasyCL/thirdparty/clew/include/proxy-opencl) #message("lib path ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}EasyCL${CMAKE_SHARED_LIBRARY_SUFFIX}") if(WIN32) SET(EASYCL_LIBRARIES ${CMAKE_INSTALL_PREFIX}/lib/EasyCL.lib ${CMAKE_INSTALL_PREFIX}/lib/clew.lib) @@ -26,7 +26,8 @@ else() endif() SET(EASYCL_FOUND ON) -#add_custom_target(EasyCL_delete_stamp EasyCL-external clBLAS-external -# ${CMAKE_COMMAND} -E remove_directory "${CMAKE_BINARY_DIR}/EasyCL/stamp" -#) +add_custom_target(easycl_delete_stamp ALL + COMMAND ${CMAKE_COMMAND} -E remove_directory "${CMAKE_BINARY_DIR}/EasyCL/stamp" +) +add_dependencies(EasyCL-external easycl_delete_stamp) diff --git a/CMakeModules/build_clBLAS.cmake b/cmake/build_clBLAS.cmake similarity index 94% rename from CMakeModules/build_clBLAS.cmake rename to cmake/build_clBLAS.cmake index bb496470..938e126d 100644 --- a/CMakeModules/build_clBLAS.cmake +++ b/cmake/build_clBLAS.cmake @@ -6,7 +6,6 @@ else() set(CLBLAS_CLEW_LIB ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}clew${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() -message("CMAKE_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX}") ExternalProject_Add( clBLAS-external SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/clMathLibraries/clBLAS/src @@ -31,7 +30,7 @@ SET_PROPERTY(TARGET clBLAS PROPERTY IMPORTED_LOCATION "${CMAKE_INSTALL_PREFIX}/l SET_PROPERTY(TARGET clBLAS PROPERTY IMPORTED_IMPLIB "${CMAKE_INSTALL_PREFIX}/lib/import/clBLAS.lib") #SET_TARGET_PROPERTIES(clBLAS PROPERTIES IMPORTED_LOCATION ${clBLAS_location}) ADD_DEPENDENCIES(clBLAS clBLAS-external) -SET(CLBLAS_INCLUDE_DIRS ${CMAKE_INSTALL_PREFIX}/include) +SET(CLBLAS_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/clMathLibraries/clBLAS/src) SET(CLBLAS_LIBRARIES ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}clBLAS${CMAKE_SHARED_LIBRARY_SUFFIX}) SET(CLBLAS_FOUND ON) diff --git a/cog-batteries/cog_addheaders.py b/cog-batteries/cog_addheaders.py index 90b3ed3e..7c3cfe2b 100644 --- a/cog-batteries/cog_addheaders.py +++ b/cog-batteries/cog_addheaders.py @@ -145,6 +145,8 @@ def addv2(classname = '', default_access='private'): fnheader = fnheader.strip().replace(';const', 'const;') fnheader = fnheader.strip().replace('; const', ' const;') # cog.outl(fnheader); + if thisdec != '' and not got_all_header: + thisdec += ' ' thisdec += fnheader + 
'\n'
if got_all_header:
decs_by_acc[thisaccess].append(thisdec)
diff --git a/cog-batteries/cog_optionswriter.py b/cog-batteries/cog_optionswriter.py
index 9b552928..c1277156 100644
--- a/cog-batteries/cog_optionswriter.py
+++ b/cog-batteries/cog_optionswriter.py
@@ -11,5 +11,5 @@ def write_options( optionsList ):
for option in optionsList:
optionTcase = option[0].upper() + option[1:]
gOption = 'g' + optionTcase
- cog.outl( 'options += " -D' + gOption + '=" + toString( ' + option + ' );' )
+ cog.outl( 'options += " -D' + gOption + '=" + toString(' + option + ');' )
diff --git a/doc/Build.md b/doc/Build.md
index 5b762a3e..b13ea741 100644
--- a/doc/Build.md
+++ b/doc/Build.md
@@ -15,23 +15,25 @@ #To build
-Do you want to use DeepCL via python, or from the commandline, or from C++?
-* to use from Python, please see [python/README.md](../python/README.md)
-* To use from the commandline, or from C++, please continue reading this page :-)
+## Build options
+
+* If you want to be able to read training/testing data from jpeg files, then please set `BUILD_JPEG_SUPPORT` to `ON`. You will need to provide the turbojpeg library and headers, or a compatible library. Otherwise, set it to `OFF`.
## linux
### Pre-requisites
+*Required:*
- git
+- make
- cmake
+- cmake-curses-gui
+- gfortran
- g++ (should support c++0x; eg 4.4 or better)
- An OpenCL-compatible driver installed, and OpenCL-compatible GPU
 - tested using beignet, which provides OpenCL 1.2; and on CUDA 6.5 driver
-- opencl-headers
-- make
-- (new) libjpeg62 or compatible, eg `sudo apt-get install libjpeg-turbo8-dev` (libjpeg-turbo is faster than original libjpeg6.2, by around 2-4 times, because it uses SIMD extensions)
-- (new) lua5.1
+
+*Optional:*
+- libjpeg62 or compatible, eg `sudo apt-get install libjpeg-turbo8-dev` (libjpeg-turbo is faster than original libjpeg6.2, by around 2-4 times, because it uses SIMD extensions)
### Procedure
@@ -40,10 +42,17 @@
git clone --recursive https://github.com/hughperkins/DeepCL.git
cd DeepCL
mkdir build
cd build
-cmake ..
-make
+ccmake ..
+# in ccmake:
+# - press 'c' / configure
+# - choose the options you want
+# - press 'c' / configure again
+# - press 'g' / generate, then 'q' / quit
+make -j 4 install
```
+The outputs will appear in subdirectories of `../dist`
+
Note:
* be sure to add `--recursive` when you clone, else when you build it will complain about OpenCLHelper missing (or clew missing)
* if you do forget, you can experiment with running `git submodule init --recursive`, and then `git submodule update --recursive`
@@ -51,19 +60,46 @@ Note:
* note: recently, moved EasyCL/thirdparty/clew from submodule to simply copying in the files
* hopefully this makes new clones easier, but for now, if you already have a clone, when you next update, you might need to first remove the EasyCL/thirdparty/clew directory
+### To activate: set up the environment
+
+Open a bash prompt, and run:
+```
+source /path/to/DeepCL/dist/bin/activate.sh
+```
+(where you need to modify `/path/to/DeepCL` appropriately)
+
+Keep the bash prompt open, and go to the next section
+
+### To check all is working
+
+Unit-tests:
+```
+deepcl_unittests
+```
+Most tests should pass, but one or two might fail. Please do feel free to raise an issue for failing tests, even if they fail intermittently.
+
+Commandline training:
+```
+deepcl_train numtest=-1 numtrain=10000 datadir=/data/mnist
+```
+(change path to wherever the mnist data files are downloaded)
+
## Windows
### Pre-requisites
+*Required:*
- git
- cmake
- Visual Studio (current 'standard' build system is: Visual Studio 2010 Express, but should also work on Visual Studio 2008 for Python 2.7, and Visual Studio Express 2013)
- An OpenCL-compatible driver installed, and OpenCL-compatible GPU
+
+*Optional:*
- (new) libjpeg62, or compatible, eg [libjpeg-turbo](http://www.libjpeg-turbo.org/Documentation/OfficialBinaries) (libjpeg-turbo is faster than original libjpeg6.2, by around 2-4 times, because it uses SIMD extensions)
 - if you want, I made a fresh build of libjpeg-turbo 1.4.0:
 - dynamic library (doesn't work for me): [libjpeg-turbo-1.4.0-win32.zip](http://deepcl.hughperkins.com/Downloads/turbojpeg-1.4.0-win32.zip) and [libjpeg-turbo-1.4.0-win64.zip](http://deepcl.hughperkins.com/Downloads/turbojpeg-1.4.0-win64.zip)
 - static library (works ok for me): [libjpeg-turbo-1.4.0-win32.zip](http://deepcl.hughperkins.com/Downloads/turbojpeg-1.4.0-win32-static.zip) and [libjpeg-turbo-1.4.0-win64.zip](http://deepcl.hughperkins.com/Downloads/turbojpeg-1.4.0-win64-static.zip)
-- (new) lua5.1
+- Python 2.7 or Python 3.4 (needs python, and also the development library and include files)
### Procedure
@@ -72,14 +108,47 @@ Note:
- hopefully this makes new clones easier, but for now, if you already have a clone, when you next update, you might need to first remove the EasyCL/thirdparty/clew directory
- create a subdirectory `build` in the git cloned `DeepCL` directory
- open cmake, point at the `DeepCL` directory, and set to build in the `build` subdirectory
- - `configure` then `generate`
+ - `configure`, select 'visual studio 2010' (or as appropriate)
+- choose the options you want, eg turn python on/off, jpeg on/off
+- click `generate`
- open visual studio, and load any of the projects in the `build` directory
- change release type to `Release`
- choose `build` from the `build` menu
+- select 'INSTALL' project, right-click and 'Build'
+
+The outputs will appear in the subdirectory 'dist'
+
+### To activate: set up the environment
+
+Open a cmd prompt, and run:
+```
+call \path\to\DeepCL\dist\bin\activate.bat
+```
+(where you need to modify `\path\to\DeepCL` appropriately)
+
+Keep the cmd prompt open, and go to the next section
+
+### To check all is working
+
+First open a cmd prompt, and activate, as above, then:
+
+Unit-tests:
+```
+deepcl_unittests
+```
+Most tests should pass, but one or two might fail. Please do feel free to raise an issue for failing tests, even if they fail intermittently.
+
+Commandline training:
+```
+deepcl_train numtest=-1 numtrain=10000 datadir=c:\data\mnist
+```
+(change path to wherever the mnist data files are downloaded)
## Linking
If you want to use the DeepCL library from C++, you will need to link with the following libraries:
- libDeepCL.so (or DeepCL.dll, on Windows)
- libEasyCL.so (or EasyCL.dll, on Windows)
+- libclew.so / clew.dll
+- libclBLAS.so / clBLAS.dll
diff --git a/doc/CodingStandards.md b/doc/CodingStandards.md
index acfa9899..c4c650a7 100644
--- a/doc/CodingStandards.md
+++ b/doc/CodingStandards.md
@@ -31,4 +31,10 @@ classes that need to be overridden by the client script/code
build the sources, but if you do have python installed, and you flip the `PYTHON_AVAILABLE`(note: might have changed names recently :-P ) switch in the cmake configuration, then a lot of manual editing will no longer be necessary :-)
+## Formatting
+
+* whitespace:
+ * I've been using eg `foo( somearg, argtwo )`, but will probably change to `foo(somearg, argtwo)`, since that's the same as pep8, and torch also uses this standard
+ * no whitespace in between functions in .cpp files (to reduce scrolling)
+* braces: `{` at end of previous line, like `if(foo) {`
diff --git a/doc/Development.md b/doc/Development.md
index 0e9331b4..46872b9d 100644
--- a/doc/Development.md
+++ b/doc/Development.md
@@ -3,6 +3,7 @@
* [Nomenclature](Nomenclature.md)
* [Doxygen docs](http://deepcl.hughperkins.com/4.x.x/html/annotated.html)
* [Cog](Cog.md)
+* [Python wrapper dev](PythonWrapperDev.md)
* [Branches](Branches.md)
* [Architecture](Architecture.md)
* [Coding guidelines](CodingStandards.md)
diff --git a/doc/PythonWrapperDev.md b/doc/PythonWrapperDev.md
new file mode 100644
index 00000000..2cb6c275
--- /dev/null
+++ b/doc/PythonWrapperDev.md
@@ -0,0 +1,30 @@
+# Python Wrapper Development
+
+## Notes on how the wrapper works
+
+* [cDeepCL.pxd](https://github.com/hughperkins/DeepCL/blob/master/python/cDeepCL.pxd) contains the declarations of the underlying DeepCL C++ classes
+* [PyDeepCL.pyx](https://github.com/hughperkins/DeepCL/blob/master/python/PyDeepCL.pyx) contains Cython wrapper classes around the underlying C++ classes
+* [setup.py](https://github.com/hughperkins/DeepCL/blob/master/python/setup.py) is a setup file for compiling the `PyDeepCL.pyx` Cython file
+
+## Maintainer/development information
+
+If you want to modify the Python wrappers, you'll need to re-run Cython. This is no longer handled by `setup.py`, but by the cmake build. So, to run Cython you'll need to:
+- install Cython, eg `pip install cython`
+- follow the instructions for the native build, [Build.md](https://github.com/hughperkins/DeepCL/blob/8.x/doc/Build.md)
+- when you open `ccmake`:
+ - enable option `Maintainer options`, then press `c`/`configure`
+ - enable `BUILD_PYTHON_WRAPPERS`, then `c`/`configure`
+ - enable `DEV_RUN_CYTHON`, then `c`/`configure`
+- => and now `g`/`generate`, and build
+
+* If you want to update this readme, you need to re-generate the README.rst, so you'll need pypandoc:
+```
+pip install pypandoc
+```
+ * (note that pypandoc depends on the native pandoc library)
+
+And then to regenerate README.rst:
+```
+python setup.py sdist
+```
+
diff --git a/jenkins/linux-cpp.sh b/jenkins/linux-cpp.sh
new file mode 100644
index 00000000..49d38f98
--- /dev/null
+++ b/jenkins/linux-cpp.sh
@@ -0,0 +1,9 @@
+pwd
+version=$(cat jenkins/version.txt)
+rm -Rf build dist
+mkdir -p build
+cd build
+cmake .. || exit 1
+make -j 4 install || exit 1
+cd ..
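(A note relating to the `## Linking` section added to doc/Build.md above: a minimal C++ client might look like the sketch below. The header name `DeepCL.h` and the link line in the trailing comment are assumptions for illustration only; the factory and cleanup calls mirror the declarations used by python/DeepCL.pyx later in this patch.)

```cpp
// Hedged sketch of a minimal client of libDeepCL (header name is assumed).
#include <iostream>
#include "DeepCL.h"

int main() {
    // The factory and deleteMe() are the calls the Python wrapper uses;
    // deleteMe() frees the object inside the library itself.
    DeepCL *cl = DeepCL::createForFirstGpuOtherwiseCpu();
    std::cout << "compute units: " << cl->getComputeUnits() << std::endl;
    cl->deleteMe();
    return 0;
}
// A possible link line, following the library list above (illustrative only):
//   g++ -std=c++0x main.cpp -I/path/to/DeepCL/dist/include \
//       -L/path/to/DeepCL/dist/lib -lDeepCL -lEasyCL -lclBLAS -lclew
```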
+ diff --git a/jenkins/linux32-cpp.sh b/jenkins/linux32-cpp.sh index a3b0457d..4a022e7b 100644 --- a/jenkins/linux32-cpp.sh +++ b/jenkins/linux32-cpp.sh @@ -1,9 +1,7 @@ -pwd version=$(cat jenkins/version.txt) -rm -Rf build -mkdir -p build -cd build -schroot -c trusty_i386 -- cmake -D BUILD_PYSWIG_WRAPPERS:BOOL=OFF -D BUILD_LUA_WRAPPERS:BOOL=OFF .. || exit 1 -schroot -c trusty_i386 -- make || exit 1 -tar -cjf deepcl-linux32-${version}.tar.bz2 --exclude=CMake* --exclude=CMakeFiles --exclude=cmake* --exclude=Makefile --exclude=*.png --exclude=*.dat * +echo version ${version} +schroot -c trusty_i386 bash jenkins/linux-cpp.sh || exit 1 +echo version ${version} +tar -cjf deepcl-linux32-${version}.tar.bz2 dist +tar -tf deepcl-linux32-${version}.tar.bz2 diff --git a/jenkins/linux32-py.sh b/jenkins/linux32-py.sh index 83407813..cd4d4f7c 100644 --- a/jenkins/linux32-py.sh +++ b/jenkins/linux32-py.sh @@ -5,15 +5,16 @@ echo pyenv: $pyenv pwd cp jenkins/version.txt python -cd python -pwd -rm -Rf dist mysrc build PyDeepCL.cpp -ls -schroot -c trusty_i386 -- $HOME/${pyenv}/bin/pip install cython pypandoc || exit 1 -schroot -c trusty_i386 -- $HOME/${pyenv}/bin/python setup.py build_ext -i || exit 1 -schroot -c trusty_i386 -- $HOME/${pyenv}/bin/python setup.py bdist_egg || exit 1 +#cd python +#pwd +#rm -Rf dist mysrc build PyDeepCL.cpp +#ls +#schroot -c trusty_i386 -- $HOME/${pyenv}/bin/pip install cython pypandoc || exit 1 +#schroot -c trusty_i386 -- $HOME/${pyenv}/bin/python setup.py build_ext -i || exit 1 +#schroot -c trusty_i386 -- $HOME/${pyenv}/bin/python setup.py bdist_egg || exit 1 -# just ignore the error on next line for now (if already uploaded this version) -schroot -c trusty_i386 -- $HOME/${pyenv}/bin/python setup.py bdist_egg upload +## just ignore the error on next line for now (if already uploaded this version) +#schroot -c trusty_i386 -- $HOME/${pyenv}/bin/python setup.py bdist_egg upload +schroot -c trusty_i386 -- bash jenkins/linux64-py.sh $1 || exit 1 exit 0 diff --git a/jenkins/linux64-cpp.sh b/jenkins/linux64-cpp.sh index 8a99910a..12ea4389 100644 --- a/jenkins/linux64-cpp.sh +++ b/jenkins/linux64-cpp.sh @@ -1,9 +1,7 @@ -pwd version=$(cat jenkins/version.txt) -rm -Rf build -mkdir -p build -cd build -cmake -D BUILD_PYSWIG_WRAPPERS:BOOL=OFF -D BUILD_LUA_WRAPPERS:BOOL=OFF .. || exit 1 -make || exit 1 -tar -cjf deepcl-linux64-${version}.tar.bz2 --exclude=CMake* --exclude=CMakeFiles --exclude=cmake* --exclude=Makefile --exclude=*.png --exclude=*.dat * +echo version ${version} +bash jenkins/linux-cpp.sh +echo version ${version} +tar -cjf deepcl-linux64-${version}.tar.bz2 dist +tar -tf deepcl-linux64-${version}.tar.bz2 diff --git a/jenkins/linux64-py.sh b/jenkins/linux64-py.sh index 25cababe..1b7b02a5 100644 --- a/jenkins/linux64-py.sh +++ b/jenkins/linux64-py.sh @@ -1,17 +1,31 @@ +set -x + echo args: $1 pyenv=$1 echo pyenv: $pyenv +. $HOME/${pyenv}/bin/activate +pip install cython pypandoc || exit 1 + +pwd +rm -Rf build dist +mkdir build +cd build +cmake .. || exit 1 +make -j 4 install || exit 1 +cd .. +pwd +ls +. dist/bin/activate.sh + pwd cp jenkins/version.txt python cd python pwd -rm -Rf dist mysrc build PyDeepCL.cpp +rm -Rf dist build DeepCL.egg-info ls -. 
$HOME/${pyenv}/bin/activate
pwd
-pip install cython pypandoc || exit 1
python setup.py build_ext -i || exit 1
python setup.py bdist_egg || exit 1
diff --git a/jenkins/win-cpp.bat b/jenkins/win-cpp.bat
new file mode 100644
index 00000000..ab870782
--- /dev/null
+++ b/jenkins/win-cpp.bat
@@ -0,0 +1,58 @@
+set WINBITS=%1
+echo WINBITS: %WINBITS%
+
+cd %~dp0.
+for /f "" %%i in (version.txt) do (
+ set version=%%i
+)
+
+cd %~dp0..
+if not exist turbojpeg-win%WINBITS%.zip powershell.exe -Command (new-object System.Net.WebClient).DownloadFile('http://deepcl.hughperkins.com/Downloads/turbojpeg-1.4.0-win%WINBITS%-static.zip', 'turbojpeg-win%WINBITS%.zip')
+if errorlevel 1 exit /B 1
+rmdir /s /q turbojpeg-win%WINBITS%
+mkdir turbojpeg-win%WINBITS%
+cd turbojpeg-win%WINBITS%
+"c:\program files\7-Zip\7z.exe" x ..\turbojpeg-win%WINBITS%.zip
+if errorlevel 1 exit /B 1
+
+cd %~dp0..
+dir
+rmdir /s /q build
+mkdir build
+cd %~dp0..\build
+dir
+set "VS100COMNTOOLS=c:\Program Files (x86)\Microsoft Visual Studio 10.0\Common7\Tools\"
+set "VS110COMNTOOLS=C:\Program Files (x86)\Microsoft Visual Studio 11.0\Common7\Tools\"
+set "VS120COMNTOOLS=C:\Program Files (x86)\Microsoft Visual Studio 12.0\Common7\Tools\"
+echo get_filename_component(SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)>initcache.cmake
+echo set(JPEG_INCLUDE_DIR "${SOURCE_DIR}/turbojpeg-win%WINBITS%" CACHE PATH "JPEG_INCLUDE_DIR")>>initcache.cmake
+echo set(JPEG_LIBRARY "${SOURCE_DIR}/turbojpeg-win%WINBITS%/turbojpeg-static.lib" CACHE PATH "JPEG_LIBRARY")>>initcache.cmake
+if exist "c:\program files\cmake\bin\cmake.exe" set "CMAKEEXE=c:\program files\cmake\bin\cmake.exe"
+if exist "c:\program files (x86)\cmake\bin\cmake.exe" set "CMAKEEXE=c:\program files (x86)\cmake\bin\cmake.exe"
+set "generatorpostfix="
+if x%WINBITS%==x64 set "generatorpostfix= Win64"
+"%CMAKEEXE%" -G "Visual Studio 10 2010%generatorpostfix%" -C initcache.cmake ..
+C:\WINDOWS\Microsoft.NET\Framework\v4.0.30319\MSBuild.exe ALL_BUILD.vcxproj /p:Configuration=Release
+if errorlevel 1 exit /B 1
+C:\WINDOWS\Microsoft.NET\Framework\v4.0.30319\MSBuild.exe INSTALL.vcxproj /p:Configuration=Release
+if errorlevel 1 exit /B 1
+
+rem copy down the redistributables (maybe they're on the server somewhere?)
+cd %~dp0..
+powershell Set-ExecutionPolicy unrestricted
+if not exist vc2010redist.zip powershell.exe -Command (new-object System.Net.WebClient).DownloadFile('http://deepcl.hughperkins.com/Downloads/vc2010redist.zip', 'vc2010redist.zip')
+if errorlevel 1 exit /B 1
+
+rmdir /s /q vc2010redist
+"c:\program files\7-Zip\7z.exe" x vc2010redist.zip
+if errorlevel 1 exit /B 1
+
+copy vc2010redist\win%WINBITS%\* dist\bin
+
+cd %~dp0..
+"c:\program files\7-Zip\7z.exe" a deepcl-win%WINBITS%-%version%.zip dist
+if errorlevel 1 exit /B 1
+
+cd %~dp0..
+echo %version%>latestUnstable.txt + diff --git a/jenkins/win-py.bat b/jenkins/win-py.bat index 756d639d..d511acd7 100644 --- a/jenkins/win-py.bat +++ b/jenkins/win-py.bat @@ -1,12 +1,33 @@ -echo args: %1 +echo args: %1 %2 +set bitness=%1 +set pyversion=%2 +echo bitness: %bitness% +echo pyversion: %pyversion% -call \%1\scripts\activate +call \env-%pyversion%-%bitness%\scripts\activate python -c "from __future__ import print_function; import platform; print( platform.uname() )" python -c "from __future__ import print_function; import platform; print( platform.architecture() )" -copy /y jenkins\version.txt python +cd +rmdir /s /q build +rmdir /s /q dist +dir +mkdir build +cd build +set "generatorpostfix=" +if %bitness%==64 set "generatorpostfix= Win64" +"c:\program files (x86)\cmake\bin\cmake" -G "Visual Studio 10 2010%generatorpostfix%" .. +C:\WINDOWS\Microsoft.NET\Framework\v4.0.30319\MSBuild.exe ALL_BUILD.vcxproj /p:Configuration=Release +if errorlevel 1 exit /B 1 +C:\WINDOWS\Microsoft.NET\Framework\v4.0.30319\MSBuild.exe INSTALL.vcxproj /p:Configuration=Release +if errorlevel 1 exit /B 1 +cd .. +cd +dir +call dist\bin\activate.bat +copy /y jenkins\version.txt python cd python rmdir /s /q dist @@ -19,6 +40,7 @@ if exist build goto :error if exist mysrc goto :error if exist src goto :error +set python setup.py build_ext -i if errorlevel 1 goto :error diff --git a/jenkins/win32-cpp.bat b/jenkins/win32-cpp.bat index 94240137..f8dd0107 100644 --- a/jenkins/win32-cpp.bat +++ b/jenkins/win32-cpp.bat @@ -1,47 +1,2 @@ -cd jenkins -for /f "" %%i in (version.txt) do ( - set version=%%i -) -cd .. - -powershell.exe -Command (new-object System.Net.WebClient).DownloadFile('http://deepcl.hughperkins.com/Downloads/turbojpeg-1.4.0-win32-static.zip', 'turbojpeg-win32.zip') -if errorlevel 1 exit /B 1 -mkdir turbojpeg-win32 -cd turbojpeg-win32 -"c:\program files\7-Zip\7z.exe" x ..\turbojpeg-win32.zip -if errorlevel 1 exit /B 1 -cd .. - -dir -rmdir /s /q build -mkdir build -cd build -dir -set "VS100COMNTOOLS=c:\Program Files (x86)\Microsoft Visual Studio 10.0\Common7\Tools\" -set "VS110COMNTOOLS=C:\Program Files (x86)\Microsoft Visual Studio 11.0\Common7\Tools\" -set "VS120COMNTOOLS=C:\Program Files (x86)\Microsoft Visual Studio 12.0\Common7\Tools\" -echo set(BUILD_LUA_WRAPPERS "OFF" CACHE BOOL "BUILD_LUA_WRAPPERS")>initcache.cmake -echo get_filename_component(SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)>>initcache.cmake -echo set(JPEG_INCLUDE_DIR "${SOURCE_DIR}/turbojpeg-win32" CACHE PATH "JPEG_INCLUDE_DIR")>>initcache.cmake -echo set(JPEG_LIBRARY "${SOURCE_DIR}/turbojpeg-win32/turbojpeg-static.lib" CACHE PATH "JPEG_LIBRARY")>>initcache.cmake -"c:\program files (x86)\cmake\bin\cmake" -G "Visual Studio 10 2010" -C initcache.cmake .. -C:\WINDOWS\Microsoft.NET\Framework\v4.0.30319\MSBuild.exe ALL_BUILD.vcxproj /p:Configuration=Release -if errorlevel 1 exit /B 1 - -rem copy down the redistributables (maybe they're on the server somewhere?) -powershell Set-ExecutionPolicy unrestricted -powershell.exe -Command (new-object System.Net.WebClient).DownloadFile('http://deepcl.hughperkins.com/Downloads/vc2010redist.zip', 'vc2010redist.zip') -if errorlevel 1 exit /B 1 - -"c:\program files\7-Zip\7z.exe" x vc2010redist.zip -if errorlevel 1 exit /B 1 - -copy vc2010redist\win32\* Release - -cd Release -"c:\program files\7-Zip\7z.exe" a deepcl-win32-%version%.zip * -if errorlevel 1 exit /B 1 - -cd .. 
-echo %version%>latestUnstable.txt +call %~dp0win-cpp.bat 32 diff --git a/jenkins/win32-py27.bat b/jenkins/win32-py27.bat index 6c99b99a..06973518 100644 --- a/jenkins/win32-py27.bat +++ b/jenkins/win32-py27.bat @@ -1,4 +1,4 @@ -call %~dp0win-py.bat env-27-32 +call %~dp0win-py.bat 32 27 if errorlevel 1 goto :fail goto :eof diff --git a/jenkins/win32-py34.bat b/jenkins/win32-py34.bat index 12432d5e..5a8866b6 100644 --- a/jenkins/win32-py34.bat +++ b/jenkins/win32-py34.bat @@ -1,4 +1,4 @@ -call %~dp0win-py.bat env-34-32 +call %~dp0win-py.bat 32 34 if errorlevel 1 goto :fail goto :eof diff --git a/jenkins/win64-cpp.bat b/jenkins/win64-cpp.bat index 5d23115c..99591100 100644 --- a/jenkins/win64-cpp.bat +++ b/jenkins/win64-cpp.bat @@ -1,43 +1,2 @@ -cd jenkins -for /f "" %%i in (version.txt) do ( - set version=%%i -) -cd .. - -powershell.exe -Command (new-object System.Net.WebClient).DownloadFile('http://deepcl.hughperkins.com/Downloads/turbojpeg-1.4.0-win64-static.zip', 'turbojpeg-win64.zip') -if errorlevel 1 exit /B 1 -mkdir turbojpeg-win64 -cd turbojpeg-win64 -"c:\program files\7-Zip\7z.exe" x ..\turbojpeg-win64.zip -if errorlevel 1 exit /B 1 -cd .. - -dir -rmdir /s /q build -mkdir build -cd build -dir -set "VS100COMNTOOLS=c:\Program Files (x86)\Microsoft Visual Studio 10.0\Common7\Tools\" -set "VS110COMNTOOLS=C:\Program Files (x86)\Microsoft Visual Studio 11.0\Common7\Tools\" -set "VS120COMNTOOLS=C:\Program Files (x86)\Microsoft Visual Studio 12.0\Common7\Tools\" -"c:\program files (x86)\cmake\bin\cmake" -G "Visual Studio 10 2010 Win64" -D BUILD_PYSWIG_WRAPPERS:BOOL=OFF -D BUILD_LUA_WRAPPERS:BOOL=OFF -D JPEG_INCLUDE_DIR=%CD%\..\turbojpeg-win64 -D JPEG_LIBRARY=%CD%\..\turbojpeg-win64\turbojpeg-static.lib .. -C:\WINDOWS\Microsoft.NET\Framework\v4.0.30319\MSBuild.exe ALL_BUILD.vcxproj /p:Configuration=Release -if errorlevel 1 exit /B 1 - -rem copy down the redistributables (maybe they're on the server somewhere?) -powershell Set-ExecutionPolicy unrestricted -powershell.exe -Command (new-object System.Net.WebClient).DownloadFile('http://deepcl.hughperkins.com/Downloads/vc2010redist.zip', 'vc2010redist.zip') -if errorlevel 1 exit /B 1 - -"c:\program files\7-Zip\7z.exe" x vc2010redist.zip -if errorlevel 1 exit /B 1 - -copy vc2010redist\win64\* Release - -cd Release -"c:\program files\7-Zip\7z.exe" a deepcl-win64-%version%.zip * -if errorlevel 1 exit /B 1 - -cd .. 
-echo %version%>latestUnstable.txt +call %~dp0win-cpp.bat 64 diff --git a/jenkins/win64-py27.bat b/jenkins/win64-py27.bat index f6db3990..b4c62d94 100644 --- a/jenkins/win64-py27.bat +++ b/jenkins/win64-py27.bat @@ -1,4 +1,4 @@ -call %~dp0win-py.bat env-27-64 +call %~dp0win-py.bat 64 27 if errorlevel 1 goto :fail goto :eof diff --git a/jenkins/win64-py34.bat b/jenkins/win64-py34.bat index 46a7a259..0f303ac9 100644 --- a/jenkins/win64-py34.bat +++ b/jenkins/win64-py34.bat @@ -1,4 +1,4 @@ -call %~dp0win-py.bat env-34-64 +call %~dp0win-py.bat 64 34 if errorlevel 1 goto :fail goto :eof diff --git a/python/Adadelta.pyx b/python/Adadelta.pyx index 8fdf23d0..c2d57ddd 100644 --- a/python/Adadelta.pyx +++ b/python/Adadelta.pyx @@ -1,8 +1,8 @@ cdef class Adadelta: cdef cDeepCL.Adadelta *thisptr - def __cinit__( self, EasyCL cl, rho=0.9 ): + def __cinit__( self, DeepCL cl, rho=0.9 ): self.thisptr = new cDeepCL.Adadelta(cl.thisptr, rho) - def __dealloc(self): + def __dealloc__(self): del self.thisptr def train(self, NeuralNet net, TrainingContext context, float[:] inputdata, float[:] expectedOutput ): diff --git a/python/Adagrad.pyx b/python/Adagrad.pyx index e7307af1..dc7b8daa 100644 --- a/python/Adagrad.pyx +++ b/python/Adagrad.pyx @@ -1,9 +1,9 @@ cdef class Adagrad: cdef cDeepCL.Adagrad *thisptr - def __cinit__( self, EasyCL cl, learningRate, momentum=0.0 ): + def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ): self.thisptr = new cDeepCL.Adagrad(cl.thisptr) self.thisptr.setLearningRate(learningRate) - def __dealloc(self): + def __dealloc__(self): del self.thisptr def setLearningRate(self, float learningRate): self.thisptr.setLearningRate(learningRate) diff --git a/python/Annealer.pyx b/python/Annealer.pyx index 4c0dc950..b9a2851e 100644 --- a/python/Annealer.pyx +++ b/python/Annealer.pyx @@ -1,10 +1,10 @@ cdef class Annealer: cdef cDeepCL.Annealer *thisptr - def __cinit__( self, EasyCL cl, learningRate, anneal=1.0 ): + def __cinit__( self, DeepCL cl, learningRate, anneal=1.0 ): self.thisptr = new cDeepCL.Annealer(cl.thisptr) self.thisptr.setLearningRate(learningRate) self.thisptr.setAnneal(anneal) - def __dealloc(self): + def __dealloc__(self): del self.thisptr def setLearningRate(self, float learningRate): self.thisptr.setLearningRate(learningRate) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt new file mode 100644 index 00000000..32a2bda9 --- /dev/null +++ b/python/CMakeLists.txt @@ -0,0 +1,62 @@ +CMAKE_MINIMUM_REQUIRED(VERSION 2.8.7) + +OPTION(DEV_RUN_CYTHON "DeepCL maintainers only, otherwise set to 'OFF'." OFF) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake) + +IF(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + SET(CMAKE_INSTALL_PREFIX + "${CMAKE_CURRENT_SOURCE_DIR}/../dist" CACHE PATH "Installation prefix, default 'dist'" FORCE + ) +ENDIF(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + +mark_as_advanced(CLEAR PYTHON_LIBRARY) +mark_as_advanced(CLEAR PYTHON_INCLUDE_DIR) + +include_directories(.) 
+include_directories(../EasyCL) +include_directories(../EasyCL/thirdparty/clew/include) +include_directories(../src) + +if(UNIX) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x -Wall") +endif() + +if(WIN32) + link_libraries(winmm) # needed for timeGetTime +endif() + +if(DEV_RUN_CYTHON) + include(UseCython) + set_source_files_properties(PyDeepCL.pyx PROPERTIES CYTHON_IS_CXX TRUE) + cython_add_module(PyDeepCL PyDeepCL.pyx CyWrappers.cpp) +else() + find_package( PythonLibs REQUIRED ) + include_directories(${PYTHON_INCLUDE_DIRS}) + python_add_module(PyDeepCL PyDeepCL.cxx CyWrappers.cpp) + target_link_libraries(PyDeepCL ${PYTHON_LIBRARY}) +endif() + +if(WIN32) +# SET(DEEPCL_LIBRARIES ${CMAKE_INSTALL_PREFIX}/lib/DeepCL.lib ${CMAKE_INSTALL_PREFIX}/lib/EasyCL.lib) +else() +# SET(DEEPCL_LIBRARIES ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}DeepCL${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif() + +#target_link_libraries(PyDeepCL ${DEEPCL_LIBRARIES}) +target_link_libraries(PyDeepCL DeepCL) + +#install(PROGRAMS test_deepcl.py test_lowlevel.py test_qlearning.py +# DESTINATION bin) +if(WIN32) + install(TARGETS PyDeepCL + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + LIBRARY DESTINATION bin ) +else() + install(TARGETS PyDeepCL + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib ) +endif() + diff --git a/python/CppRuntimeBoundary.pxd b/python/CppRuntimeBoundary.pxd new file mode 100644 index 00000000..cb8b77f7 --- /dev/null +++ b/python/CppRuntimeBoundary.pxd @@ -0,0 +1,3 @@ + +cdef extern from "CppRuntimeBoundary.h": + cdef void deepcl_deleteCharStar(const char *charStar) diff --git a/python/CyWrappers.cpp b/python/CyWrappers.cpp index 5fc46218..fcae1a60 100644 --- a/python/CyWrappers.cpp +++ b/python/CyWrappers.cpp @@ -3,15 +3,15 @@ int exceptionRaised = 0; std::string exceptionMessage = ""; -void checkException( int *wasRaised, std::string *message ) { - *wasRaised = exceptionRaised; - *message = exceptionMessage; -} +//void checkException( int *wasRaised, std::string *message ) { +// *wasRaised = exceptionRaised; +// *message = exceptionMessage; +//} -void raiseException( std::string message ) { - exceptionRaised = 1; - exceptionMessage = message; -} +//void raiseException( std::string message ) { +// exceptionRaised = 1; +// exceptionMessage = message; +//} diff --git a/python/CyWrappers.h b/python/CyWrappers.h index 5ae70fc3..10f01177 100644 --- a/python/CyWrappers.h +++ b/python/CyWrappers.h @@ -6,8 +6,8 @@ extern int exceptionRaised; extern std::string exceptionMessage; -void raiseException( std::string message ); -void checkException( int *wasRaised, std::string *message ); +//void raiseException( std::string message ); +//void checkException( int *wasRaised, std::string *message ); #include "batch/NetLearner.h" #include "trainers/SGD.h" diff --git a/python/DeepCL.pyx b/python/DeepCL.pyx new file mode 100644 index 00000000..eb12bc17 --- /dev/null +++ b/python/DeepCL.pyx @@ -0,0 +1,34 @@ +cdef class DeepCL: + cdef cDeepCL.DeepCL *thisptr + + def __cinit__(self, gpuindex=None ): +# print( '__cinit__(planes,size)') + if gpuindex is None: + self.thisptr = cDeepCL.DeepCL.createForFirstGpuOtherwiseCpu() + else: + self.thisptr = cDeepCL.DeepCL.createForIndexedGpu(gpuindex) + + def __dealloc__(self): + self.thisptr.deleteMe() + + def setProfiling(self, profiling): + self.thisptr.setProfiling(profiling) + + def dumpProfiling(self): + self.thisptr.dumpProfiling() + + def getComputeUnits(self): + return self.thisptr.getComputeUnits() + + def 
getLocalMemorySize(self): + return self.thisptr.getLocalMemorySize() + + def getLocalMemorySizeKB(self): + return self.thisptr.getLocalMemorySizeKB() + + def getMaxWorkgroupSize(self): + return self.thisptr.getMaxWorkgroupSize() + + def getMaxAllocSizeMB(self): + return self.thisptr.getMaxAllocSizeMB() + diff --git a/python/EasyCL.pyx b/python/EasyCL.pyx index a03bf09a..eb782827 100644 --- a/python/EasyCL.pyx +++ b/python/EasyCL.pyx @@ -1,13 +1,13 @@ -cdef class EasyCL: - cdef cDeepCL.EasyCL *thisptr +#cdef class DeepCL: +# cdef cDeepCL.DeepCL *thisptr - def __cinit__(self, gpuindex=None ): -# print( '__cinit__(planes,size)') - if gpuindex is None: - self.thisptr = cDeepCL.EasyCL.createForFirstGpuOtherwiseCpu() - else: - self.thisptr = cDeepCL.EasyCL.createForIndexedGpu(gpuindex) +# def __cinit__(self, gpuindex=None ): +## print( '__cinit__(planes,size)') +# if gpuindex is None: +# self.thisptr = cDeepCL.DeepCL.createForFirstGpuOtherwiseCpu() +# else: +# self.thisptr = cDeepCL.DeepCL.createForIndexedGpu(gpuindex) - def __dealloc(self): - del self.thisptr +# def __dealloc__(self): +# del self.thisptr diff --git a/python/GenericLoader.pyx b/python/GenericLoader.pyx index 41b66bfa..fe1c4806 100644 --- a/python/GenericLoader.pyx +++ b/python/GenericLoader.pyx @@ -1,13 +1,15 @@ cdef class GenericLoader: @staticmethod - def getDimensions( trainFilePath ): + def getDimensions( trainFilepath ): + print 'GenericLoader.py getDimensions ', trainFilepath cdef int N cdef int planes cdef int size - cDeepCL.GenericLoader.getDimensions( toCppString( trainFilePath ), &N, &planes, &size ) + cdef const char *trainFilepath_charstar = trainFilepath + cDeepCL.GenericLoader.getDimensions(trainFilepath_charstar, &N, &planes, &size) + print 'finished calling' return (N,planes,size) @staticmethod def load( trainFilepath, float[:] images, int[:] labels, startN, numExamples ): - cDeepCL.GenericLoader.load( toCppString(trainFilepath), &images[0], &labels[0], startN , numExamples ) - - + cdef const char *trainFilepath_charstar = trainFilepath + cDeepCL.GenericLoader.load(trainFilepath_charstar, &images[0], &labels[0], startN , numExamples) diff --git a/python/LICENSE b/python/LICENSE deleted file mode 100644 index e87a115e..00000000 --- a/python/LICENSE +++ /dev/null @@ -1,363 +0,0 @@ -Mozilla Public License, version 2.0 - -1. Definitions - -1.1. "Contributor" - - means each individual or legal entity that creates, contributes to the - creation of, or owns Covered Software. - -1.2. "Contributor Version" - - means the combination of the Contributions of others (if any) used by a - Contributor and that particular Contributor's Contribution. - -1.3. "Contribution" - - means Covered Software of a particular Contributor. - -1.4. "Covered Software" - - means Source Code Form to which the initial Contributor has attached the - notice in Exhibit A, the Executable Form of such Source Code Form, and - Modifications of such Source Code Form, in each case including portions - thereof. - -1.5. "Incompatible With Secondary Licenses" - means - - a. that the initial Contributor has attached the notice described in - Exhibit B to the Covered Software; or - - b. that the Covered Software was made available under the terms of - version 1.1 or earlier of the License, but not also under the terms of - a Secondary License. - -1.6. "Executable Form" - - means any form of the work other than Source Code Form. - -1.7. 
"Larger Work" - - means a work that combines Covered Software with other material, in a - separate file or files, that is not Covered Software. - -1.8. "License" - - means this document. - -1.9. "Licensable" - - means having the right to grant, to the maximum extent possible, whether - at the time of the initial grant or subsequently, any and all of the - rights conveyed by this License. - -1.10. "Modifications" - - means any of the following: - - a. any file in Source Code Form that results from an addition to, - deletion from, or modification of the contents of Covered Software; or - - b. any new file in Source Code Form that contains any Covered Software. - -1.11. "Patent Claims" of a Contributor - - means any patent claim(s), including without limitation, method, - process, and apparatus claims, in any patent Licensable by such - Contributor that would be infringed, but for the grant of the License, - by the making, using, selling, offering for sale, having made, import, - or transfer of either its Contributions or its Contributor Version. - -1.12. "Secondary License" - - means either the GNU General Public License, Version 2.0, the GNU Lesser - General Public License, Version 2.1, the GNU Affero General Public - License, Version 3.0, or any later versions of those licenses. - -1.13. "Source Code Form" - - means the form of the work preferred for making modifications. - -1.14. "You" (or "Your") - - means an individual or a legal entity exercising rights under this - License. For legal entities, "You" includes any entity that controls, is - controlled by, or is under common control with You. For purposes of this - definition, "control" means (a) the power, direct or indirect, to cause - the direction or management of such entity, whether by contract or - otherwise, or (b) ownership of more than fifty percent (50%) of the - outstanding shares or beneficial ownership of such entity. - - -2. License Grants and Conditions - -2.1. Grants - - Each Contributor hereby grants You a world-wide, royalty-free, - non-exclusive license: - - a. under intellectual property rights (other than patent or trademark) - Licensable by such Contributor to use, reproduce, make available, - modify, display, perform, distribute, and otherwise exploit its - Contributions, either on an unmodified basis, with Modifications, or - as part of a Larger Work; and - - b. under Patent Claims of such Contributor to make, use, sell, offer for - sale, have made, import, and otherwise transfer either its - Contributions or its Contributor Version. - -2.2. Effective Date - - The licenses granted in Section 2.1 with respect to any Contribution - become effective for each Contribution on the date the Contributor first - distributes such Contribution. - -2.3. Limitations on Grant Scope - - The licenses granted in this Section 2 are the only rights granted under - this License. No additional rights or licenses will be implied from the - distribution or licensing of Covered Software under this License. - Notwithstanding Section 2.1(b) above, no patent license is granted by a - Contributor: - - a. for any code that a Contributor has removed from Covered Software; or - - b. for infringements caused by: (i) Your and any other third party's - modifications of Covered Software, or (ii) the combination of its - Contributions with other software (except as part of its Contributor - Version); or - - c. under Patent Claims infringed by Covered Software in the absence of - its Contributions. 
- - This License does not grant any rights in the trademarks, service marks, - or logos of any Contributor (except as may be necessary to comply with - the notice requirements in Section 3.4). - -2.4. Subsequent Licenses - - No Contributor makes additional grants as a result of Your choice to - distribute the Covered Software under a subsequent version of this - License (see Section 10.2) or under the terms of a Secondary License (if - permitted under the terms of Section 3.3). - -2.5. Representation - - Each Contributor represents that the Contributor believes its - Contributions are its original creation(s) or it has sufficient rights to - grant the rights to its Contributions conveyed by this License. - -2.6. Fair Use - - This License is not intended to limit any rights You have under - applicable copyright doctrines of fair use, fair dealing, or other - equivalents. - -2.7. Conditions - - Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in - Section 2.1. - - -3. Responsibilities - -3.1. Distribution of Source Form - - All distribution of Covered Software in Source Code Form, including any - Modifications that You create or to which You contribute, must be under - the terms of this License. You must inform recipients that the Source - Code Form of the Covered Software is governed by the terms of this - License, and how they can obtain a copy of this License. You may not - attempt to alter or restrict the recipients' rights in the Source Code - Form. - -3.2. Distribution of Executable Form - - If You distribute Covered Software in Executable Form then: - - a. such Covered Software must also be made available in Source Code Form, - as described in Section 3.1, and You must inform recipients of the - Executable Form how they can obtain a copy of such Source Code Form by - reasonable means in a timely manner, at a charge no more than the cost - of distribution to the recipient; and - - b. You may distribute such Executable Form under the terms of this - License, or sublicense it under different terms, provided that the - license for the Executable Form does not attempt to limit or alter the - recipients' rights in the Source Code Form under this License. - -3.3. Distribution of a Larger Work - - You may create and distribute a Larger Work under terms of Your choice, - provided that You also comply with the requirements of this License for - the Covered Software. If the Larger Work is a combination of Covered - Software with a work governed by one or more Secondary Licenses, and the - Covered Software is not Incompatible With Secondary Licenses, this - License permits You to additionally distribute such Covered Software - under the terms of such Secondary License(s), so that the recipient of - the Larger Work may, at their option, further distribute the Covered - Software under the terms of either this License or such Secondary - License(s). - -3.4. Notices - - You may not remove or alter the substance of any license notices - (including copyright notices, patent notices, disclaimers of warranty, or - limitations of liability) contained within the Source Code Form of the - Covered Software, except that You may alter any license notices to the - extent required to remedy known factual inaccuracies. - -3.5. Application of Additional Terms - - You may choose to offer, and to charge a fee for, warranty, support, - indemnity or liability obligations to one or more recipients of Covered - Software. 
However, You may do so only on Your own behalf, and not on - behalf of any Contributor. You must make it absolutely clear that any - such warranty, support, indemnity, or liability obligation is offered by - You alone, and You hereby agree to indemnify every Contributor for any - liability incurred by such Contributor as a result of warranty, support, - indemnity or liability terms You offer. You may include additional - disclaimers of warranty and limitations of liability specific to any - jurisdiction. - -4. Inability to Comply Due to Statute or Regulation - - If it is impossible for You to comply with any of the terms of this License - with respect to some or all of the Covered Software due to statute, - judicial order, or regulation then You must: (a) comply with the terms of - this License to the maximum extent possible; and (b) describe the - limitations and the code they affect. Such description must be placed in a - text file included with all distributions of the Covered Software under - this License. Except to the extent prohibited by statute or regulation, - such description must be sufficiently detailed for a recipient of ordinary - skill to be able to understand it. - -5. Termination - -5.1. The rights granted under this License will terminate automatically if You - fail to comply with any of its terms. However, if You become compliant, - then the rights granted under this License from a particular Contributor - are reinstated (a) provisionally, unless and until such Contributor - explicitly and finally terminates Your grants, and (b) on an ongoing - basis, if such Contributor fails to notify You of the non-compliance by - some reasonable means prior to 60 days after You have come back into - compliance. Moreover, Your grants from a particular Contributor are - reinstated on an ongoing basis if such Contributor notifies You of the - non-compliance by some reasonable means, this is the first time You have - received notice of non-compliance with this License from such - Contributor, and You become compliant prior to 30 days after Your receipt - of the notice. - -5.2. If You initiate litigation against any entity by asserting a patent - infringement claim (excluding declaratory judgment actions, - counter-claims, and cross-claims) alleging that a Contributor Version - directly or indirectly infringes any patent, then the rights granted to - You by any and all Contributors for the Covered Software under Section - 2.1 of this License shall terminate. - -5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user - license agreements (excluding distributors and resellers) which have been - validly granted by You or Your distributors under this License prior to - termination shall survive termination. - -6. Disclaimer of Warranty - - Covered Software is provided under this License on an "as is" basis, - without warranty of any kind, either expressed, implied, or statutory, - including, without limitation, warranties that the Covered Software is free - of defects, merchantable, fit for a particular purpose or non-infringing. - The entire risk as to the quality and performance of the Covered Software - is with You. Should any Covered Software prove defective in any respect, - You (not any Contributor) assume the cost of any necessary servicing, - repair, or correction. This disclaimer of warranty constitutes an essential - part of this License. No use of any Covered Software is authorized under - this License except under this disclaimer. - -7. 
Limitation of Liability - - Under no circumstances and under no legal theory, whether tort (including - negligence), contract, or otherwise, shall any Contributor, or anyone who - distributes Covered Software as permitted above, be liable to You for any - direct, indirect, special, incidental, or consequential damages of any - character including, without limitation, damages for lost profits, loss of - goodwill, work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses, even if such party shall have been - informed of the possibility of such damages. This limitation of liability - shall not apply to liability for death or personal injury resulting from - such party's negligence to the extent applicable law prohibits such - limitation. Some jurisdictions do not allow the exclusion or limitation of - incidental or consequential damages, so this exclusion and limitation may - not apply to You. - -8. Litigation - - Any litigation relating to this License may be brought only in the courts - of a jurisdiction where the defendant maintains its principal place of - business and such litigation shall be governed by laws of that - jurisdiction, without reference to its conflict-of-law provisions. Nothing - in this Section shall prevent a party's ability to bring cross-claims or - counter-claims. - -9. Miscellaneous - - This License represents the complete agreement concerning the subject - matter hereof. If any provision of this License is held to be - unenforceable, such provision shall be reformed only to the extent - necessary to make it enforceable. Any law or regulation which provides that - the language of a contract shall be construed against the drafter shall not - be used to construe this License against a Contributor. - - -10. Versions of the License - -10.1. New Versions - - Mozilla Foundation is the license steward. Except as provided in Section - 10.3, no one other than the license steward has the right to modify or - publish new versions of this License. Each version will be given a - distinguishing version number. - -10.2. Effect of New Versions - - You may distribute the Covered Software under the terms of the version - of the License under which You originally received the Covered Software, - or under the terms of any subsequent version published by the license - steward. - -10.3. Modified Versions - - If you create software not governed by this License, and you want to - create a new license for such software, you may create and use a - modified version of this License if you rename the license and remove - any references to the name of the license steward (except to note that - such modified license differs from this License). - -10.4. Distributing Source Code Form that is Incompatible With Secondary - Licenses If You choose to distribute Source Code Form that is - Incompatible With Secondary Licenses under the terms of this version of - the License, the notice described in Exhibit B of this License must be - attached. - -Exhibit A - Source Code Form License Notice - - This Source Code Form is subject to the - terms of the Mozilla Public License, v. - 2.0. If a copy of the MPL was not - distributed with this file, You can - obtain one at - http://mozilla.org/MPL/2.0/. - -If it is not possible or desirable to put the notice in a particular file, -then You may include the notice in a location (such as a LICENSE file in a -relevant directory) where a recipient would be likely to look for such a -notice. 
- -You may add additional accurate notices of copyright ownership. - -Exhibit B - "Incompatible With Secondary Licenses" Notice - - This Source Code Form is "Incompatible - With Secondary Licenses", as defined by - the Mozilla Public License, v. 2.0. - diff --git a/python/Layer.pyx b/python/Layer.pyx index 36efbc14..bd20eb01 100644 --- a/python/Layer.pyx +++ b/python/Layer.pyx @@ -17,16 +17,16 @@ cdef class Layer: return self.thisptr.getOutputCubeSize() def getOutputPlanes(self): return self.thisptr.getOutputPlanes() - def getOutputImageSize(self): - return self.thisptr.getOutputImageSize() + def getOutputSize(self): + return self.thisptr.getOutputSize() def getOutput(self): # the underlying c++ method returns a pointer # to a block of memory that we dont own # we should probably copy it I suppose cdef float *output = self.thisptr.getOutput() - cdef int outputSize = self.thisptr.getOutputSize() - cdef c_array.array outputArray = array('f', [0] * outputSize ) - for i in range(outputSize): + cdef int outputNumElements = self.thisptr.getOutputNumElements() + cdef c_array.array outputArray = array('f', [0] * outputNumElements ) + for i in range(outputNumElements): outputArray[i] = output[i] # cdef float[:] outputMv = output # cdef float[:] outputArrayMv = outputArray @@ -53,8 +53,10 @@ cdef class Layer: weightsArray.fromlist( weightsList ) self.setWeights( weightsArray ) def asString(self): - return self.thisptr.asString() + cdef const char *res_charstar = self.thisptr.asNewCharStar() + cdef str res = str(res_charstar.decode('UTF-8')) + CppRuntimeBoundary.deepcl_deleteCharStar(res_charstar) + return res def getClassName(self): - return self.thisptr.getClassName() - + return self.thisptr.getClassNameAsCharStar() diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 277f3cf3..a8d96248 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -1,9 +1,3 @@ -include cDeepCL.pxd -include Cy*.cpp -include Cy*.h -include mysrc/* -include *.pxd -include *.pyx +include *.h include version.txt -recursive-include mysrc *.txt *.cpp *.h *.cl *.c diff --git a/python/Nesterov.pyx b/python/Nesterov.pyx index a8e9402a..86abe5ad 100644 --- a/python/Nesterov.pyx +++ b/python/Nesterov.pyx @@ -1,10 +1,10 @@ cdef class Nesterov: cdef cDeepCL.Nesterov *thisptr - def __cinit__( self, EasyCL cl, learningRate, momentum=0.0 ): + def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ): self.thisptr = new cDeepCL.Nesterov(cl.thisptr) self.thisptr.setLearningRate(learningRate) self.thisptr.setMomentum(momentum) - def __dealloc(self): + def __dealloc__(self): del self.thisptr def setLearningRate(self, float learningRate): self.thisptr.setLearningRate(learningRate) diff --git a/python/NetDefToNet.pyx b/python/NetDefToNet.pyx index 7e2c9d07..8b07fb1f 100644 --- a/python/NetDefToNet.pyx +++ b/python/NetDefToNet.pyx @@ -1,6 +1,5 @@ cdef class NetdefToNet: @staticmethod def createNetFromNetdef( NeuralNet neuralnet, netdef ): - return cDeepCL.NetdefToNet.createNetFromNetdef( neuralnet.thisptr, toCppString( netdef ) ) - - + cdef const char *netdef_charstar = netdef + return cDeepCL.NetdefToNet.createNetFromNetdefCharStar(neuralnet.thisptr, netdef_charstar) diff --git a/python/NetLearner.pyx b/python/NetLearner.pyx index eb55016e..77d7ad3c 100644 --- a/python/NetLearner.pyx +++ b/python/NetLearner.pyx @@ -9,7 +9,7 @@ cdef class NetLearner: Ntrain, &trainData[0], &trainLabels[0], Ntest, &testData[0], &testLabels[0], batchSize ) - def __dealloc(self): + def __dealloc__(self): del self.thisptr # def setTrainingData( self, 
Ntrain, float[:] trainData, int[:] trainLabels ): # self.thisptr.setTrainingData( Ntrain, &trainData[0], &trainLabels[0] ) @@ -28,6 +28,6 @@ cdef class NetLearner: interruptableCall( self._run, [] ) ## with nogil: ## thisptr._learn( learningRate ) - checkException() + # checkException() diff --git a/python/NeuralNet.pyx b/python/NeuralNet.pyx index e9b64659..b66d0872 100644 --- a/python/NeuralNet.pyx +++ b/python/NeuralNet.pyx @@ -1,18 +1,23 @@ cdef class NeuralNet: cdef cDeepCL.NeuralNet *thisptr - def __cinit__(self, EasyCL cl, planes = None, size = None): + def __cinit__(self, DeepCL cl, planes = None, size = None): # print( '__cinit__(planes,size)') if planes == None and size == None: - self.thisptr = new cDeepCL.NeuralNet(cl.thisptr) + self.thisptr = cDeepCL.NeuralNet.instance(cl.thisptr) else: - self.thisptr = new cDeepCL.NeuralNet(cl.thisptr, planes, size) + self.thisptr = cDeepCL.NeuralNet.instance3(cl.thisptr, planes, size) - def __dealloc(self): - del self.thisptr + def __dealloc__(self): + self.thisptr.deleteMe() def asString(self): - return self.thisptr.asString() + print('about to call asnewcharstar') + cdef const char *result_charstar = self.thisptr.asNewCharStar() + print('got char *result') + cdef str result = str(result_charstar.decode('UTF-8')) + CppRuntimeBoundary.deepcl_deleteCharStar(result_charstar) + return result # def myprint(self): # self.thisptr.print() @@ -45,13 +50,12 @@ cdef class NeuralNet: return self.thisptr.getNumLayers() def getOutput(self): cdef const float *output = self.thisptr.getOutput() - cdef int outputSize = self.thisptr.getOutputSize() - cdef c_array.array outputArray = array('f', [0] * outputSize ) - for i in range(outputSize): + cdef int outputNumElements = self.thisptr.getOutputNumElements() + cdef c_array.array outputArray = array('f', [0] * outputNumElements ) + for i in range(outputNumElements): outputArray[i] = output[i] return outputArray def setTraining(self, training): # 1 is, we are training net, 0 is we are not # used for example by randomtranslations layer (for now, # used only by randomtranslations layer) self.thisptr.setTraining( training ) - diff --git a/python/PyDeepCL.cpp b/python/PyDeepCL.cxx similarity index 90% rename from python/PyDeepCL.cpp rename to python/PyDeepCL.cxx index c123f444..2398d8d3 100644 --- a/python/PyDeepCL.cpp +++ b/python/PyDeepCL.cxx @@ -1,60 +1,5 @@ /* Generated by Cython 0.22 */ -/* BEGIN: Cython Metadata -{ - "distutils": { - "language": "c++", - "define_macros": [ - [ - "DeepCL_EXPORTS", - 1 - ], - [ - "EasyCL_EXPORTS", - 1 - ] - ], - "runtime_library_dirs": [ - "." 
- ], - "depends": [ - "mysrc/qlearning/QLearner.h", - "mysrc/trainers/Adagrad.h", - "mysrc/activate/ActivationMaker.h", - "mysrc/trainers/SGD.h", - "mysrc/netdef/NetdefToNet.h", - "mysrc/trainers/Rmsprop.h", - "mysrc/input/InputLayerMaker.h", - "mysrc/EasyCL.h", - "mysrc/forcebackprop/ForceBackpropLayerMaker.h", - "mysrc/fc/FullyConnectedMaker.h", - "CyWrappers.h", - "mysrc/layer/Layer.h", - "CyScenario.h", - "mysrc/trainers/Adadelta.h", - "mysrc/loaders/GenericLoader.h", - "mysrc/layer/LayerMaker.h", - "mysrc/trainers/TrainingContext.h", - "mysrc/trainers/Annealer.h", - "mysrc/conv/ConvolutionalMaker.h", - "mysrc/normalize/NormalizationLayerMaker.h", - "mysrc/trainers/Trainer.h", - "mysrc/net/NeuralNet.h", - "mysrc/pooling/PoolingMaker.h", - "mysrc/dropout/DropoutMaker.h", - "mysrc/trainers/Nesterov.h" - ], - "extra_compile_args": [ - "-std=c++0x", - "-g" - ], - "include_dirs": [ - "mysrc" - ] - } -} -END: Cython Metadata */ - #define PY_SSIZE_T_CLEAN #ifndef CYTHON_USE_PYLONG_INTERNALS #ifdef PYLONG_BITS_IN_DIGIT @@ -302,14 +247,14 @@ class __Pyx_FakeReference { #include "string.h" #include "stdio.h" #include "pythread.h" -#include +#include "CppRuntimeBoundary.h" +#include "DeepCL.h" +#include "layer/LayerMaker.h" +#include "input/InputLayerMaker.h" #include "ios" #include "new" #include "stdexcept" #include "typeinfo" -#include "EasyCL.h" -#include "layer/LayerMaker.h" -#include "input/InputLayerMaker.h" #include "dropout/DropoutMaker.h" #include "activate/ActivationMaker.h" #include "normalize/NormalizationLayerMaker.h" @@ -518,8 +463,7 @@ static const char *__pyx_filename; static const char *__pyx_f[] = { "LayerMaker.pyx", "NeuralNet.pyx", - "PyDeepCL.pyx", - "EasyCL.pyx", + "DeepCL.pyx", "SGD.pyx", "Annealer.pyx", "Nesterov.pyx", @@ -531,6 +475,7 @@ static const char *__pyx_f[] = { "NetLearner.pyx", "NetDefToNet.pyx", "QLearning.pyx", + "PyDeepCL.pyx", "array.pxd", "stringsource", "type.pxd", @@ -634,7 +579,7 @@ typedef volatile __pyx_atomic_int_type __pyx_atomic_int; struct arrayobject; typedef struct arrayobject arrayobject; #endif -struct __pyx_obj_8PyDeepCL_EasyCL; +struct __pyx_obj_8PyDeepCL_DeepCL; struct __pyx_obj_8PyDeepCL_TrainingContext; struct __pyx_obj_8PyDeepCL_SGD; struct __pyx_obj_8PyDeepCL_Annealer; @@ -665,14 +610,14 @@ struct __pyx_MemviewEnum_obj; struct __pyx_memoryview_obj; struct __pyx_memoryviewslice_obj; -/* "EasyCL.pyx":1 - * cdef class EasyCL: # <<<<<<<<<<<<<< - * cdef cDeepCL.EasyCL *thisptr +/* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":1 + * cdef class DeepCL: # <<<<<<<<<<<<<< + * cdef cDeepCL.DeepCL *thisptr * */ -struct __pyx_obj_8PyDeepCL_EasyCL { +struct __pyx_obj_8PyDeepCL_DeepCL { PyObject_HEAD - EasyCL *thisptr; + DeepCL *thisptr; }; struct __pyx_obj_8PyDeepCL_TrainingContext { @@ -681,12 +626,12 @@ struct __pyx_obj_8PyDeepCL_TrainingContext { }; -/* "SGD.pyx":8 +/* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":8 * del self.thisptr * * cdef class SGD: # <<<<<<<<<<<<<< * cdef cDeepCL.SGD *thisptr - * def __cinit__( self, EasyCL cl, learningRate, momentum=0.0 ): + * def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ): */ struct __pyx_obj_8PyDeepCL_SGD { PyObject_HEAD @@ -694,10 +639,10 @@ struct __pyx_obj_8PyDeepCL_SGD { }; -/* "Annealer.pyx":1 +/* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":1 * cdef class Annealer: # <<<<<<<<<<<<<< * cdef cDeepCL.Annealer *thisptr - * def __cinit__( self, EasyCL cl, learningRate, anneal=1.0 ): + * def __cinit__( self, DeepCL cl, learningRate, anneal=1.0 ): */ 
struct __pyx_obj_8PyDeepCL_Annealer { PyObject_HEAD @@ -741,7 +686,7 @@ struct __pyx_obj_8PyDeepCL_LayerMaker2 { }; -/* "LayerMaker.pyx":4 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":4 * cdef cDeepCL.LayerMaker2 *baseptr * * cdef class NormalizationLayerMaker(LayerMaker2): # <<<<<<<<<<<<<< @@ -754,7 +699,7 @@ struct __pyx_obj_8PyDeepCL_NormalizationLayerMaker { }; -/* "LayerMaker.pyx":21 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":21 * return NormalizationLayerMaker() * * cdef class FullyConnectedMaker(LayerMaker2): # <<<<<<<<<<<<<< @@ -767,7 +712,7 @@ struct __pyx_obj_8PyDeepCL_FullyConnectedMaker { }; -/* "LayerMaker.pyx":44 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":44 * return FullyConnectedMaker() * * cdef class ConvolutionalMaker(LayerMaker2): # <<<<<<<<<<<<<< @@ -780,7 +725,7 @@ struct __pyx_obj_8PyDeepCL_ConvolutionalMaker { }; -/* "LayerMaker.pyx":73 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":73 * return ConvolutionalMaker() * * cdef class PoolingMaker(LayerMaker2): # <<<<<<<<<<<<<< @@ -793,7 +738,7 @@ struct __pyx_obj_8PyDeepCL_PoolingMaker { }; -/* "LayerMaker.pyx":87 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":87 * return PoolingMaker() * * cdef class DropoutMaker(LayerMaker2): # <<<<<<<<<<<<<< @@ -806,7 +751,7 @@ struct __pyx_obj_8PyDeepCL_DropoutMaker { }; -/* "LayerMaker.pyx":99 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":99 * return ActivationMaker() * * cdef class ActivationMaker(LayerMaker2): # <<<<<<<<<<<<<< @@ -819,7 +764,7 @@ struct __pyx_obj_8PyDeepCL_ActivationMaker { }; -/* "LayerMaker.pyx":120 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":120 * return ActivationMaker() * * cdef class ForceBackpropMaker(LayerMaker2): # <<<<<<<<<<<<<< @@ -832,7 +777,7 @@ struct __pyx_obj_8PyDeepCL_ForceBackpropMaker { }; -/* "LayerMaker.pyx":131 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":131 * return ForceBackpropMaker() * * cdef class SquareLossMaker(LayerMaker2): # <<<<<<<<<<<<<< @@ -845,7 +790,7 @@ struct __pyx_obj_8PyDeepCL_SquareLossMaker { }; -/* "LayerMaker.pyx":142 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":142 * return SquareLossMaker() * * cdef class SoftMaxMaker(LayerMaker2): # <<<<<<<<<<<<<< @@ -858,7 +803,7 @@ struct __pyx_obj_8PyDeepCL_SoftMaxMaker { }; -/* "LayerMaker.pyx":153 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":153 * return SoftMaxMaker() * * cdef class InputLayerMaker(LayerMaker2): # <<<<<<<<<<<<<< @@ -871,10 +816,10 @@ struct __pyx_obj_8PyDeepCL_InputLayerMaker { }; -/* "GenericLoader.pyx":1 +/* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":1 * cdef class GenericLoader: # <<<<<<<<<<<<<< * @staticmethod - * def getDimensions( trainFilePath ): + * def getDimensions( trainFilepath ): */ struct __pyx_obj_8PyDeepCL_GenericLoader { PyObject_HEAD @@ -895,7 +840,7 @@ struct __pyx_obj_8PyDeepCL_QLearner { }; -/* "QLearning.pyx":61 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":61 * return (pyObject).hasFinished() * * cdef class Scenario: # <<<<<<<<<<<<<< @@ -985,7 +930,7 @@ struct __pyx_memoryviewslice_obj { -/* "Layer.pyx":1 +/* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":1 * cdef class Layer: # <<<<<<<<<<<<<< * cdef cDeepCL.Layer *thisptr * @@ -1153,7 +1098,12 @@ static CYTHON_INLINE int __pyx_sub_acquisition_count_locked( static CYTHON_INLINE void __Pyx_INC_MEMVIEW(__Pyx_memviewslice 
*, int, int); static CYTHON_INLINE void __Pyx_XDEC_MEMVIEW(__Pyx_memviewslice *, int, int); -static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name); +#include + +static CYTHON_INLINE PyObject* __Pyx_decode_c_string( + const char* cstring, Py_ssize_t start, Py_ssize_t stop, + const char* encoding, const char* errors, + PyObject* (*decode_func)(const char *s, Py_ssize_t size, const char *errors)); #if CYTHON_COMPILING_IN_CPYTHON static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw); @@ -1161,6 +1111,8 @@ static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg #define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw) #endif +static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name); + static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type); static CYTHON_INLINE void __Pyx_ErrRestore(PyObject *type, PyObject *value, PyObject *tb); @@ -1185,12 +1137,6 @@ static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg); -#if CYTHON_COMPILING_IN_CPYTHON -static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func); -#else -#define __Pyx_PyObject_CallNoArg(func) __Pyx_PyObject_Call(func, __pyx_empty_tuple, NULL) -#endif - #if CYTHON_COMPILING_IN_CPYTHON #define __Pyx_PyObject_DelAttrStr(o,n) __Pyx_PyObject_SetAttrStr(o,n,NULL) static CYTHON_INLINE int __Pyx_PyObject_SetAttrStr(PyObject* obj, PyObject* attr_name, PyObject* value) { @@ -1208,6 +1154,12 @@ static CYTHON_INLINE int __Pyx_PyObject_SetAttrStr(PyObject* obj, PyObject* attr #define __Pyx_PyObject_SetAttrStr(o,n,v) PyObject_SetAttr(o,n,v) #endif +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func); +#else +#define __Pyx_PyObject_CallNoArg(func) __Pyx_PyObject_Call(func, __pyx_empty_tuple, NULL) +#endif + #define __Pyx_GetItemInt(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck) \ (__Pyx_fits_Py_ssize_t(i, type, is_signed) ? 
\ __Pyx_GetItemInt_Fast(o, (Py_ssize_t)i, is_list, wraparound, boundscheck) : \ @@ -1233,8 +1185,6 @@ static void __Pyx_WriteUnraisable(const char *name, int clineno, int lineno, const char *filename, int full_traceback); -#include - static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals); static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* s2, int equals); @@ -1257,11 +1207,6 @@ static CYTHON_UNUSED int __pyx_array_getbuffer(PyObject *__pyx_v_self, Py_buffer static PyObject *get_memview(PyObject *__pyx_v_self); /*proto*/ static CYTHON_INLINE PyObject *__Pyx_GetAttr(PyObject *, PyObject *); -static CYTHON_INLINE PyObject* __Pyx_decode_c_string( - const char* cstring, Py_ssize_t start, Py_ssize_t stop, - const char* encoding, const char* errors, - PyObject* (*decode_func)(const char *s, Py_ssize_t size, const char *errors)); - static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected); static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index); @@ -1512,6 +1457,8 @@ static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlice_ds_flo static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlice_ds_int(PyObject *); +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value); + #ifndef __Pyx_CppExn2PyErr #include #include @@ -1551,7 +1498,13 @@ static void __Pyx_CppExn2PyErr() { } #endif -static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value); +static int __Pyx_Print(PyObject*, PyObject *, int); +#if CYTHON_COMPILING_IN_PYPY || PY_MAJOR_VERSION >= 3 +static PyObject* __pyx_print = 0; +static PyObject* __pyx_print_kwargs = 0; +#endif + +static int __Pyx_PrintOne(PyObject* stream, PyObject *o); static PyObject *__pyx_memview_get_float(const char *itemp); static int __pyx_memview_set_float(const char *itemp, PyObject *obj); @@ -1697,14 +1650,14 @@ static PyTypeObject *__pyx_ptype_7cpython_7complex_complex = 0; static PyTypeObject *__pyx_ptype_7cpython_5array_array = 0; static CYTHON_INLINE int __pyx_f_7cpython_5array_extend_buffer(arrayobject *, char *, Py_ssize_t); /*proto*/ -/* Module declarations from 'libcpp.string' */ - /* Module declarations from 'libcpp' */ +/* Module declarations from 'CppRuntimeBoundary' */ + /* Module declarations from 'cDeepCL' */ /* Module declarations from 'PyDeepCL' */ -static PyTypeObject *__pyx_ptype_8PyDeepCL_EasyCL = 0; +static PyTypeObject *__pyx_ptype_8PyDeepCL_DeepCL = 0; static PyTypeObject *__pyx_ptype_8PyDeepCL_TrainingContext = 0; static PyTypeObject *__pyx_ptype_8PyDeepCL_SGD = 0; static PyTypeObject *__pyx_ptype_8PyDeepCL_Annealer = 0; @@ -1778,12 +1731,6 @@ static void __pyx_memoryview_refcount_objects_in_slice_with_gil(char *, Py_ssize static void __pyx_memoryview_refcount_objects_in_slice(char *, Py_ssize_t *, Py_ssize_t *, int, int); /*proto*/ static void __pyx_memoryview_slice_assign_scalar(__Pyx_memviewslice *, int, size_t, void *, int); /*proto*/ static void __pyx_memoryview__slice_assign_scalar(char *, Py_ssize_t *, Py_ssize_t *, int, size_t, void *); /*proto*/ -static CYTHON_INLINE PyObject *__pyx_convert_PyObject_string_to_py_std__in_string(std::string const &); /*proto*/ -static CYTHON_INLINE PyObject *__pyx_convert_PyUnicode_string_to_py_std__in_string(std::string const &); /*proto*/ -static CYTHON_INLINE PyObject *__pyx_convert_PyStr_string_to_py_std__in_string(std::string const &); /*proto*/ -static CYTHON_INLINE PyObject *__pyx_convert_PyBytes_string_to_py_std__in_string(std::string const &); /*proto*/ -static 
CYTHON_INLINE PyObject *__pyx_convert_PyByteArray_string_to_py_std__in_string(std::string const &); /*proto*/ -static std::string __pyx_convert_string_from_py_std__in_string(PyObject *); /*proto*/ static __Pyx_TypeInfo __Pyx_TypeInfo_float = { "float", NULL, sizeof(float), { 0 }, 0, 'R', 0, 0 }; static __Pyx_TypeInfo __Pyx_TypeInfo_int = { "int", NULL, sizeof(int), { 0 }, 0, IS_UNSIGNED(int) ? 'U' : 'I', IS_UNSIGNED(int), 0 }; #define __Pyx_MODULE_NAME "PyDeepCL" @@ -1793,7 +1740,6 @@ int __pyx_module_is_main_PyDeepCL = 0; static PyObject *__pyx_builtin_staticmethod; static PyObject *__pyx_builtin_Exception; static PyObject *__pyx_builtin_range; -static PyObject *__pyx_builtin_RuntimeError; static PyObject *__pyx_builtin_MemoryError; static PyObject *__pyx_builtin_ValueError; static PyObject *__pyx_builtin_enumerate; @@ -1801,45 +1747,52 @@ static PyObject *__pyx_builtin_Ellipsis; static PyObject *__pyx_builtin_TypeError; static PyObject *__pyx_builtin_id; static PyObject *__pyx_builtin_IndexError; -static int __pyx_pf_8PyDeepCL_6EasyCL___cinit__(struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_self, PyObject *__pyx_v_gpuindex); /* proto */ -static PyObject *__pyx_pf_8PyDeepCL_6EasyCL_2__dealloc(struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_self); /* proto */ +static int __pyx_pf_8PyDeepCL_6DeepCL___cinit__(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self, PyObject *__pyx_v_gpuindex); /* proto */ +static void __pyx_pf_8PyDeepCL_6DeepCL_2__dealloc__(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_8PyDeepCL_6DeepCL_4setProfiling(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self, PyObject *__pyx_v_profiling); /* proto */ +static PyObject *__pyx_pf_8PyDeepCL_6DeepCL_6dumpProfiling(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_8PyDeepCL_6DeepCL_8getComputeUnits(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_8PyDeepCL_6DeepCL_10getLocalMemorySize(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_8PyDeepCL_6DeepCL_12getLocalMemorySizeKB(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_8PyDeepCL_6DeepCL_14getMaxWorkgroupSize(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_8PyDeepCL_6DeepCL_16getMaxAllocSizeMB(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self); /* proto */ static int __pyx_pf_8PyDeepCL_15TrainingContext___cinit__(struct __pyx_obj_8PyDeepCL_TrainingContext *__pyx_v_self, int __pyx_v_epoch, int __pyx_v_batch); /* proto */ static void __pyx_pf_8PyDeepCL_15TrainingContext_2__dealloc__(struct __pyx_obj_8PyDeepCL_TrainingContext *__pyx_v_self); /* proto */ -static int __pyx_pf_8PyDeepCL_3SGD___cinit__(struct __pyx_obj_8PyDeepCL_SGD *__pyx_v_self, struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_cl, PyObject *__pyx_v_learningRate, PyObject *__pyx_v_momentum); /* proto */ -static PyObject *__pyx_pf_8PyDeepCL_3SGD_2__dealloc(struct __pyx_obj_8PyDeepCL_SGD *__pyx_v_self); /* proto */ +static int __pyx_pf_8PyDeepCL_3SGD___cinit__(struct __pyx_obj_8PyDeepCL_SGD *__pyx_v_self, struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_cl, PyObject *__pyx_v_learningRate, PyObject *__pyx_v_momentum); /* proto */ +static void __pyx_pf_8PyDeepCL_3SGD_2__dealloc__(struct __pyx_obj_8PyDeepCL_SGD *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_3SGD_4setLearningRate(struct __pyx_obj_8PyDeepCL_SGD *__pyx_v_self, float __pyx_v_learningRate); /* proto */ static PyObject 
*__pyx_pf_8PyDeepCL_3SGD_6setMomentum(struct __pyx_obj_8PyDeepCL_SGD *__pyx_v_self, float __pyx_v_momentum); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_3SGD_8setWeightDecay(struct __pyx_obj_8PyDeepCL_SGD *__pyx_v_self, float __pyx_v_weightDecay); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_3SGD_10train(struct __pyx_obj_8PyDeepCL_SGD *__pyx_v_self, struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_net, struct __pyx_obj_8PyDeepCL_TrainingContext *__pyx_v_context, __Pyx_memviewslice __pyx_v_inputdata, __Pyx_memviewslice __pyx_v_expectedOutput); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_3SGD_12trainFromLabels(struct __pyx_obj_8PyDeepCL_SGD *__pyx_v_self, struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_net, struct __pyx_obj_8PyDeepCL_TrainingContext *__pyx_v_context, __Pyx_memviewslice __pyx_v_inputdata, __Pyx_memviewslice __pyx_v_labels); /* proto */ -static int __pyx_pf_8PyDeepCL_8Annealer___cinit__(struct __pyx_obj_8PyDeepCL_Annealer *__pyx_v_self, struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_cl, PyObject *__pyx_v_learningRate, PyObject *__pyx_v_anneal); /* proto */ -static PyObject *__pyx_pf_8PyDeepCL_8Annealer_2__dealloc(struct __pyx_obj_8PyDeepCL_Annealer *__pyx_v_self); /* proto */ +static int __pyx_pf_8PyDeepCL_8Annealer___cinit__(struct __pyx_obj_8PyDeepCL_Annealer *__pyx_v_self, struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_cl, PyObject *__pyx_v_learningRate, PyObject *__pyx_v_anneal); /* proto */ +static void __pyx_pf_8PyDeepCL_8Annealer_2__dealloc__(struct __pyx_obj_8PyDeepCL_Annealer *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_4setLearningRate(struct __pyx_obj_8PyDeepCL_Annealer *__pyx_v_self, float __pyx_v_learningRate); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_6setAnneal(struct __pyx_obj_8PyDeepCL_Annealer *__pyx_v_self, float __pyx_v_anneal); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_8train(struct __pyx_obj_8PyDeepCL_Annealer *__pyx_v_self, struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_net, struct __pyx_obj_8PyDeepCL_TrainingContext *__pyx_v_context, __Pyx_memviewslice __pyx_v_inputdata, __Pyx_memviewslice __pyx_v_expectedOutput); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_10trainFromLabels(struct __pyx_obj_8PyDeepCL_Annealer *__pyx_v_self, struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_net, struct __pyx_obj_8PyDeepCL_TrainingContext *__pyx_v_context, __Pyx_memviewslice __pyx_v_inputdata, __Pyx_memviewslice __pyx_v_labels); /* proto */ -static int __pyx_pf_8PyDeepCL_8Nesterov___cinit__(struct __pyx_obj_8PyDeepCL_Nesterov *__pyx_v_self, struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_cl, PyObject *__pyx_v_learningRate, PyObject *__pyx_v_momentum); /* proto */ -static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_2__dealloc(struct __pyx_obj_8PyDeepCL_Nesterov *__pyx_v_self); /* proto */ +static int __pyx_pf_8PyDeepCL_8Nesterov___cinit__(struct __pyx_obj_8PyDeepCL_Nesterov *__pyx_v_self, struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_cl, PyObject *__pyx_v_learningRate, PyObject *__pyx_v_momentum); /* proto */ +static void __pyx_pf_8PyDeepCL_8Nesterov_2__dealloc__(struct __pyx_obj_8PyDeepCL_Nesterov *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_4setLearningRate(struct __pyx_obj_8PyDeepCL_Nesterov *__pyx_v_self, float __pyx_v_learningRate); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_6setMomentum(struct __pyx_obj_8PyDeepCL_Nesterov *__pyx_v_self, float __pyx_v_momentum); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_8train(struct __pyx_obj_8PyDeepCL_Nesterov 
*__pyx_v_self, struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_net, struct __pyx_obj_8PyDeepCL_TrainingContext *__pyx_v_context, __Pyx_memviewslice __pyx_v_inputdata, __Pyx_memviewslice __pyx_v_expectedOutput); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_10trainFromLabels(struct __pyx_obj_8PyDeepCL_Nesterov *__pyx_v_self, struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_net, struct __pyx_obj_8PyDeepCL_TrainingContext *__pyx_v_context, __Pyx_memviewslice __pyx_v_inputdata, __Pyx_memviewslice __pyx_v_labels); /* proto */ -static int __pyx_pf_8PyDeepCL_7Adagrad___cinit__(struct __pyx_obj_8PyDeepCL_Adagrad *__pyx_v_self, struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_cl, PyObject *__pyx_v_learningRate, CYTHON_UNUSED PyObject *__pyx_v_momentum); /* proto */ -static PyObject *__pyx_pf_8PyDeepCL_7Adagrad_2__dealloc(struct __pyx_obj_8PyDeepCL_Adagrad *__pyx_v_self); /* proto */ +static int __pyx_pf_8PyDeepCL_7Adagrad___cinit__(struct __pyx_obj_8PyDeepCL_Adagrad *__pyx_v_self, struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_cl, PyObject *__pyx_v_learningRate, CYTHON_UNUSED PyObject *__pyx_v_momentum); /* proto */ +static void __pyx_pf_8PyDeepCL_7Adagrad_2__dealloc__(struct __pyx_obj_8PyDeepCL_Adagrad *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_7Adagrad_4setLearningRate(struct __pyx_obj_8PyDeepCL_Adagrad *__pyx_v_self, float __pyx_v_learningRate); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_7Adagrad_6train(struct __pyx_obj_8PyDeepCL_Adagrad *__pyx_v_self, struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_net, struct __pyx_obj_8PyDeepCL_TrainingContext *__pyx_v_context, __Pyx_memviewslice __pyx_v_inputdata, __Pyx_memviewslice __pyx_v_expectedOutput); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_7Adagrad_8trainFromLabels(struct __pyx_obj_8PyDeepCL_Adagrad *__pyx_v_self, struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_net, struct __pyx_obj_8PyDeepCL_TrainingContext *__pyx_v_context, __Pyx_memviewslice __pyx_v_inputdata, __Pyx_memviewslice __pyx_v_labels); /* proto */ -static int __pyx_pf_8PyDeepCL_7Rmsprop___cinit__(struct __pyx_obj_8PyDeepCL_Rmsprop *__pyx_v_self, struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_cl, PyObject *__pyx_v_learningRate, CYTHON_UNUSED PyObject *__pyx_v_momentum); /* proto */ -static PyObject *__pyx_pf_8PyDeepCL_7Rmsprop_2__dealloc(struct __pyx_obj_8PyDeepCL_Rmsprop *__pyx_v_self); /* proto */ +static int __pyx_pf_8PyDeepCL_7Rmsprop___cinit__(struct __pyx_obj_8PyDeepCL_Rmsprop *__pyx_v_self, struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_cl, PyObject *__pyx_v_learningRate, CYTHON_UNUSED PyObject *__pyx_v_momentum); /* proto */ +static void __pyx_pf_8PyDeepCL_7Rmsprop_2__dealloc__(struct __pyx_obj_8PyDeepCL_Rmsprop *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_7Rmsprop_4setLearningRate(struct __pyx_obj_8PyDeepCL_Rmsprop *__pyx_v_self, float __pyx_v_learningRate); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_7Rmsprop_6train(struct __pyx_obj_8PyDeepCL_Rmsprop *__pyx_v_self, struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_net, struct __pyx_obj_8PyDeepCL_TrainingContext *__pyx_v_context, __Pyx_memviewslice __pyx_v_inputdata, __Pyx_memviewslice __pyx_v_expectedOutput); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_7Rmsprop_8trainFromLabels(struct __pyx_obj_8PyDeepCL_Rmsprop *__pyx_v_self, struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_net, struct __pyx_obj_8PyDeepCL_TrainingContext *__pyx_v_context, __Pyx_memviewslice __pyx_v_inputdata, __Pyx_memviewslice __pyx_v_labels); /* proto */ -static int __pyx_pf_8PyDeepCL_8Adadelta___cinit__(struct 
__pyx_obj_8PyDeepCL_Adadelta *__pyx_v_self, struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_cl, PyObject *__pyx_v_rho); /* proto */ -static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_2__dealloc(struct __pyx_obj_8PyDeepCL_Adadelta *__pyx_v_self); /* proto */ +static int __pyx_pf_8PyDeepCL_8Adadelta___cinit__(struct __pyx_obj_8PyDeepCL_Adadelta *__pyx_v_self, struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_cl, PyObject *__pyx_v_rho); /* proto */ +static void __pyx_pf_8PyDeepCL_8Adadelta_2__dealloc__(struct __pyx_obj_8PyDeepCL_Adadelta *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_4train(struct __pyx_obj_8PyDeepCL_Adadelta *__pyx_v_self, struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_net, struct __pyx_obj_8PyDeepCL_TrainingContext *__pyx_v_context, __Pyx_memviewslice __pyx_v_inputdata, __Pyx_memviewslice __pyx_v_expectedOutput); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_6trainFromLabels(struct __pyx_obj_8PyDeepCL_Adadelta *__pyx_v_self, struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_net, struct __pyx_obj_8PyDeepCL_TrainingContext *__pyx_v_context, __Pyx_memviewslice __pyx_v_inputdata, __Pyx_memviewslice __pyx_v_labels); /* proto */ -static int __pyx_pf_8PyDeepCL_9NeuralNet___cinit__(struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_self, struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_cl, PyObject *__pyx_v_planes, PyObject *__pyx_v_size); /* proto */ -static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_2__dealloc(struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_self); /* proto */ +static int __pyx_pf_8PyDeepCL_9NeuralNet___cinit__(struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_self, struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_cl, PyObject *__pyx_v_planes, PyObject *__pyx_v_size); /* proto */ +static void __pyx_pf_8PyDeepCL_9NeuralNet_2__dealloc__(struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_4asString(struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_6setBatchSize(struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_self, int __pyx_v_batchSize); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_8forward(struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_self, __Pyx_memviewslice __pyx_v_images); /* proto */ @@ -1858,7 +1811,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_4backward(struct __pyx_obj_8PyDeepCL_ static PyObject *__pyx_pf_8PyDeepCL_5Layer_6needsBackProp(struct __pyx_obj_8PyDeepCL_Layer *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_5Layer_8getOutputCubeSize(struct __pyx_obj_8PyDeepCL_Layer *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_5Layer_10getOutputPlanes(struct __pyx_obj_8PyDeepCL_Layer *__pyx_v_self); /* proto */ -static PyObject *__pyx_pf_8PyDeepCL_5Layer_12getOutputImageSize(struct __pyx_obj_8PyDeepCL_Layer *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_8PyDeepCL_5Layer_12getOutputSize(struct __pyx_obj_8PyDeepCL_Layer *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_5Layer_14getOutput(struct __pyx_obj_8PyDeepCL_Layer *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_5Layer_16getWeights(struct __pyx_obj_8PyDeepCL_Layer *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_5Layer_18setWeights(struct __pyx_obj_8PyDeepCL_Layer *__pyx_v_self, __Pyx_memviewslice __pyx_v_weights); /* proto */ @@ -1905,10 +1858,10 @@ static int __pyx_pf_8PyDeepCL_15InputLayerMaker___cinit__(struct __pyx_obj_8PyDe static PyObject *__pyx_pf_8PyDeepCL_15InputLayerMaker_2numPlanes(struct 
__pyx_obj_8PyDeepCL_InputLayerMaker *__pyx_v_self, int __pyx_v__numPlanes); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_15InputLayerMaker_4imageSize(struct __pyx_obj_8PyDeepCL_InputLayerMaker *__pyx_v_self, int __pyx_v__imageSize); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_15InputLayerMaker_6instance(); /* proto */ -static PyObject *__pyx_pf_8PyDeepCL_13GenericLoader_getDimensions(PyObject *__pyx_v_trainFilePath); /* proto */ +static PyObject *__pyx_pf_8PyDeepCL_13GenericLoader_getDimensions(PyObject *__pyx_v_trainFilepath); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_13GenericLoader_2load(PyObject *__pyx_v_trainFilepath, __Pyx_memviewslice __pyx_v_images, __Pyx_memviewslice __pyx_v_labels, PyObject *__pyx_v_startN, PyObject *__pyx_v_numExamples); /* proto */ static int __pyx_pf_8PyDeepCL_10NetLearner___cinit__(struct __pyx_obj_8PyDeepCL_NetLearner *__pyx_v_self, struct __pyx_obj_8PyDeepCL_SGD *__pyx_v_sgd, struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_neuralnet, PyObject *__pyx_v_Ntrain, __Pyx_memviewslice __pyx_v_trainData, __Pyx_memviewslice __pyx_v_trainLabels, PyObject *__pyx_v_Ntest, __Pyx_memviewslice __pyx_v_testData, __Pyx_memviewslice __pyx_v_testLabels, PyObject *__pyx_v_batchSize); /* proto */ -static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_2__dealloc(struct __pyx_obj_8PyDeepCL_NetLearner *__pyx_v_self); /* proto */ +static void __pyx_pf_8PyDeepCL_10NetLearner_2__dealloc__(struct __pyx_obj_8PyDeepCL_NetLearner *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_4setSchedule(struct __pyx_obj_8PyDeepCL_NetLearner *__pyx_v_self, PyObject *__pyx_v_numEpochs); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_6setDumpTimings(struct __pyx_obj_8PyDeepCL_NetLearner *__pyx_v_self, int __pyx_v_dumpTimings); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_8_run(struct __pyx_obj_8PyDeepCL_NetLearner *__pyx_v_self); /* proto */ @@ -1929,9 +1882,8 @@ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_8getNumActions(CYTHON_UNUSED struc static PyObject *__pyx_pf_8PyDeepCL_8Scenario_10act(CYTHON_UNUSED struct __pyx_obj_8PyDeepCL_Scenario *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v_index); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_12hasFinished(CYTHON_UNUSED struct __pyx_obj_8PyDeepCL_Scenario *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_14getPerception(CYTHON_UNUSED struct __pyx_obj_8PyDeepCL_Scenario *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v_perception); /* proto */ -static PyObject *__pyx_pf_8PyDeepCL_checkException(CYTHON_UNUSED PyObject *__pyx_self); /* proto */ -static PyObject *__pyx_pf_8PyDeepCL_2interruptableCall(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_function, PyObject *__pyx_v_args); /* proto */ -static PyObject *__pyx_pf_8PyDeepCL_4toCppString(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_pyString); /* proto */ +static PyObject *__pyx_pf_8PyDeepCL_interruptableCall(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_function, PyObject *__pyx_v_args); /* proto */ +static PyObject *__pyx_pf_8PyDeepCL_2toCppString(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_pyString); /* proto */ static int __pyx_pf_7cpython_5array_5array___getbuffer__(arrayobject *__pyx_v_self, Py_buffer *__pyx_v_info, CYTHON_UNUSED int __pyx_v_flags); /* proto */ static void __pyx_pf_7cpython_5array_5array_2__releasebuffer__(CYTHON_UNUSED arrayobject *__pyx_v_self, Py_buffer *__pyx_v_info); /* proto */ static int 
__pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_shape, Py_ssize_t __pyx_v_itemsize, PyObject *__pyx_v_format, PyObject *__pyx_v_mode, int __pyx_v_allocate_buffer); /* proto */ @@ -1966,7 +1918,7 @@ static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_20 static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_22copy_fortran(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */ static void __pyx_memoryviewslice___pyx_pf_15View_dot_MemoryView_16_memoryviewslice___dealloc__(struct __pyx_memoryviewslice_obj *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_15View_dot_MemoryView_16_memoryviewslice_4base___get__(struct __pyx_memoryviewslice_obj *__pyx_v_self); /* proto */ -static PyObject *__pyx_tp_new_8PyDeepCL_EasyCL(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ +static PyObject *__pyx_tp_new_8PyDeepCL_DeepCL(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ static PyObject *__pyx_tp_new_8PyDeepCL_TrainingContext(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ static PyObject *__pyx_tp_new_8PyDeepCL_SGD(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ static PyObject *__pyx_tp_new_8PyDeepCL_Annealer(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ @@ -2000,10 +1952,10 @@ static char __pyx_k_N[] = "N"; static char __pyx_k_O[] = "O"; static char __pyx_k_c[] = "c"; static char __pyx_k_f[] = "f"; -static char __pyx_k__9[] = ""; static char __pyx_k_cl[] = "cl"; static char __pyx_k_id[] = "id"; static char __pyx_k_act[] = "act"; +static char __pyx_k_end[] = "end"; static char __pyx_k_net[] = "net"; static char __pyx_k_obj[] = "obj"; static char __pyx_k_rho[] = "rho"; @@ -2011,6 +1963,7 @@ static char __pyx_k_run[] = "_run"; static char __pyx_k_sgd[] = "sgd"; static char __pyx_k_args[] = "args"; static char __pyx_k_base[] = "base"; +static char __pyx_k_file[] = "file"; static char __pyx_k_join[] = "join"; static char __pyx_k_load[] = "load"; static char __pyx_k_main[] = "__main__"; @@ -2031,6 +1984,7 @@ static char __pyx_k_epoch[] = "epoch"; static char __pyx_k_error[] = "error"; static char __pyx_k_flags[] = "flags"; static char __pyx_k_layer[] = "layer "; +static char __pyx_k_print[] = "print"; static char __pyx_k_range[] = "range"; static char __pyx_k_reset[] = "reset"; static char __pyx_k_shape[] = "shape"; @@ -2055,7 +2009,6 @@ static char __pyx_k_context[] = "context"; static char __pyx_k_fortran[] = "fortran"; static char __pyx_k_isAlive[] = "isAlive"; static char __pyx_k_memview[] = "memview"; -static char __pyx_k_message[] = "message"; static char __pyx_k_Ellipsis[] = "Ellipsis"; static char __pyx_k_PyDeepCL[] = "PyDeepCL"; static char __pyx_k_fromlist[] = "fromlist"; @@ -2087,20 +2040,19 @@ static char __pyx_k_hasFinished[] = "hasFinished"; static char __pyx_k_numExamples[] = "numExamples"; static char __pyx_k_toCppString[] = "toCppString"; static char __pyx_k_trainLabels[] = "trainLabels"; -static char __pyx_k_RuntimeError[] = "RuntimeError"; static char __pyx_k_learningRate[] = "learningRate"; static char __pyx_k_staticmethod[] = "staticmethod"; static char __pyx_k_getDimensions[] = "getDimensions"; static char __pyx_k_getNumActions[] = "getNumActions"; static char __pyx_k_getPerception[] = "getPerception"; static char __pyx_k_pyx_getbuffer[] = "__pyx_getbuffer"; -static char __pyx_k_trainFilePath[] = "trainFilePath"; static char __pyx_k_trainFilepath[] = "trainFilepath"; -static char __pyx_k_checkException[] = "checkException"; static char 
__pyx_k_expectedOutput[] = "expectedOutput"; -static char __pyx_k_threwException[] = "threwException"; static char __pyx_k_allocate_buffer[] = "allocate_buffer"; static char __pyx_k_dtype_is_object[] = "dtype_is_object"; +static char __pyx_k_got_char_result[] = "got char *result"; +static char __pyx_k_netdef_charstar[] = "netdef_charstar"; +static char __pyx_k_finished_calling[] = "finished calling"; static char __pyx_k_getPerceptionSize[] = "getPerceptionSize"; static char __pyx_k_interruptableCall[] = "interruptableCall"; static char __pyx_k_strided_and_direct[] = ""; @@ -2109,6 +2061,7 @@ static char __pyx_k_getPerceptionPlanes[] = "getPerceptionPlanes"; static char __pyx_k_strided_and_indirect[] = ""; static char __pyx_k_contiguous_and_direct[] = ""; static char __pyx_k_MemoryView_of_r_object[] = ""; +static char __pyx_k_trainFilepath_charstar[] = "trainFilepath_charstar"; static char __pyx_k_MemoryView_of_r_at_0x_x[] = ""; static char __pyx_k_contiguous_and_indirect[] = ""; static char __pyx_k_Cannot_index_with_type_s[] = "Cannot index with type '%s'"; @@ -2117,13 +2070,15 @@ static char __pyx_k_Dimension_d_is_not_direct[] = "Dimension %d is not direct"; static char __pyx_k_Invalid_shape_in_axis_d_d[] = "Invalid shape in axis %d: %d."; static char __pyx_k_Index_out_of_bounds_axis_d[] = "Index out of bounds (axis %d)"; static char __pyx_k_Step_may_not_be_zero_axis_d[] = "Step may not be zero (axis %d)"; +static char __pyx_k_about_to_call_asnewcharstar[] = "about to call asnewcharstar"; static char __pyx_k_itemsize_0_for_cython_array[] = "itemsize <= 0 for cython.array"; static char __pyx_k_unable_to_allocate_array_data[] = "unable to allocate array data."; +static char __pyx_k_GenericLoader_py_getDimensions[] = "GenericLoader.py getDimensions "; static char __pyx_k_strided_and_direct_or_indirect[] = ""; -static char __pyx_k_data_norep_git_DeepCL_python_Ge[] = "/data/norep/git/DeepCL/python/GenericLoader.pyx"; -static char __pyx_k_data_norep_git_DeepCL_python_La[] = "/data/norep/git/DeepCL/python/LayerMaker.pyx"; -static char __pyx_k_data_norep_git_DeepCL_python_Ne[] = "/data/norep/git/DeepCL/python/NetDefToNet.pyx"; -static char __pyx_k_data_norep_git_DeepCL_python_Py[] = "/data/norep/git/DeepCL/python/PyDeepCL.pyx"; +static char __pyx_k_home_user_git_DeepCL_python_Gen[] = "/home/user/git/DeepCL/python/GenericLoader.pyx"; +static char __pyx_k_home_user_git_DeepCL_python_Lay[] = "/home/user/git/DeepCL/python/LayerMaker.pyx"; +static char __pyx_k_home_user_git_DeepCL_python_Net[] = "/home/user/git/DeepCL/python/NetDefToNet.pyx"; +static char __pyx_k_home_user_git_DeepCL_python_PyD[] = "/home/user/git/DeepCL/python/PyDeepCL.pyx"; static char __pyx_k_All_dimensions_preceding_dimensi[] = "All dimensions preceding dimension %d must be indexed and not sliced"; static char __pyx_k_Buffer_view_does_not_expose_stri[] = "Buffer view does not expose strides"; static char __pyx_k_Can_only_create_a_buffer_that_is[] = "Can only create a buffer that is contiguous in memory."; @@ -2148,6 +2103,7 @@ static PyObject *__pyx_kp_s_Cannot_index_with_type_s; static PyObject *__pyx_n_s_Ellipsis; static PyObject *__pyx_kp_s_Empty_shape_tuple_for_cython_arr; static PyObject *__pyx_n_s_Exception; +static PyObject *__pyx_kp_s_GenericLoader_py_getDimensions; static PyObject *__pyx_n_s_IndexError; static PyObject *__pyx_kp_s_Indirect_dimensions_not_supporte; static PyObject *__pyx_kp_s_Invalid_mode_expected_c_or_fortr; @@ -2168,12 +2124,11 @@ static PyObject *__pyx_n_s_Ntrain; static PyObject *__pyx_n_b_O; static 
PyObject *__pyx_kp_s_Out_of_bounds_on_buffer_access_a; static PyObject *__pyx_n_s_PyDeepCL; -static PyObject *__pyx_n_s_RuntimeError; static PyObject *__pyx_n_s_Thread; static PyObject *__pyx_n_s_TypeError; static PyObject *__pyx_kp_s_Unable_to_convert_item_to_object; static PyObject *__pyx_n_s_ValueError; -static PyObject *__pyx_kp_b__9; +static PyObject *__pyx_kp_s_about_to_call_asnewcharstar; static PyObject *__pyx_n_s_act; static PyObject *__pyx_n_s_allocate_buffer; static PyObject *__pyx_n_s_anneal; @@ -2184,7 +2139,6 @@ static PyObject *__pyx_n_s_batch; static PyObject *__pyx_n_s_batchSize; static PyObject *__pyx_n_s_c; static PyObject *__pyx_n_u_c; -static PyObject *__pyx_n_s_checkException; static PyObject *__pyx_n_s_cl; static PyObject *__pyx_n_s_class; static PyObject *__pyx_n_s_context; @@ -2192,17 +2146,16 @@ static PyObject *__pyx_kp_s_contiguous_and_direct; static PyObject *__pyx_kp_s_contiguous_and_indirect; static PyObject *__pyx_n_s_createNetFromNetdef; static PyObject *__pyx_n_s_daemon; -static PyObject *__pyx_kp_s_data_norep_git_DeepCL_python_Ge; -static PyObject *__pyx_kp_s_data_norep_git_DeepCL_python_La; -static PyObject *__pyx_kp_s_data_norep_git_DeepCL_python_Ne; -static PyObject *__pyx_kp_s_data_norep_git_DeepCL_python_Py; static PyObject *__pyx_n_s_dtype_is_object; static PyObject *__pyx_n_s_encode; +static PyObject *__pyx_n_s_end; static PyObject *__pyx_n_s_enumerate; static PyObject *__pyx_n_s_epoch; static PyObject *__pyx_n_s_error; static PyObject *__pyx_n_s_expectedOutput; static PyObject *__pyx_n_s_f; +static PyObject *__pyx_n_s_file; +static PyObject *__pyx_kp_s_finished_calling; static PyObject *__pyx_n_s_flags; static PyObject *__pyx_n_s_format; static PyObject *__pyx_n_s_fortran; @@ -2214,9 +2167,14 @@ static PyObject *__pyx_n_s_getNumActions; static PyObject *__pyx_n_s_getPerception; static PyObject *__pyx_n_s_getPerceptionPlanes; static PyObject *__pyx_n_s_getPerceptionSize; +static PyObject *__pyx_kp_s_got_char_result; static PyObject *__pyx_kp_s_got_differing_extents_in_dimensi; static PyObject *__pyx_n_s_gpuindex; static PyObject *__pyx_n_s_hasFinished; +static PyObject *__pyx_kp_s_home_user_git_DeepCL_python_Gen; +static PyObject *__pyx_kp_s_home_user_git_DeepCL_python_Lay; +static PyObject *__pyx_kp_s_home_user_git_DeepCL_python_Net; +static PyObject *__pyx_kp_s_home_user_git_DeepCL_python_PyD; static PyObject *__pyx_n_s_id; static PyObject *__pyx_n_s_images; static PyObject *__pyx_n_s_import; @@ -2233,7 +2191,6 @@ static PyObject *__pyx_n_s_learningRate; static PyObject *__pyx_n_s_load; static PyObject *__pyx_n_s_main; static PyObject *__pyx_n_s_memview; -static PyObject *__pyx_n_s_message; static PyObject *__pyx_n_s_mode; static PyObject *__pyx_n_s_momentum; static PyObject *__pyx_n_s_mythread; @@ -2242,12 +2199,14 @@ static PyObject *__pyx_n_s_name_2; static PyObject *__pyx_n_s_ndim; static PyObject *__pyx_n_s_net; static PyObject *__pyx_n_s_netdef; +static PyObject *__pyx_n_s_netdef_charstar; static PyObject *__pyx_n_s_neuralnet; static PyObject *__pyx_kp_s_not_found; static PyObject *__pyx_n_s_numExamples; static PyObject *__pyx_n_s_obj; static PyObject *__pyx_n_s_pack; static PyObject *__pyx_n_s_planes; +static PyObject *__pyx_n_s_print; static PyObject *__pyx_n_s_pyString; static PyObject *__pyx_n_s_pyx_getbuffer; static PyObject *__pyx_n_s_pyx_vtable; @@ -2274,11 +2233,10 @@ static PyObject *__pyx_n_s_test; static PyObject *__pyx_n_s_testData; static PyObject *__pyx_n_s_testLabels; static PyObject *__pyx_n_s_threading; -static PyObject 
*__pyx_n_s_threwException; static PyObject *__pyx_n_s_toCppString; static PyObject *__pyx_n_s_trainData; -static PyObject *__pyx_n_s_trainFilePath; static PyObject *__pyx_n_s_trainFilepath; +static PyObject *__pyx_n_s_trainFilepath_charstar; static PyObject *__pyx_n_s_trainLabels; static PyObject *__pyx_kp_s_unable_to_allocate_array_data; static PyObject *__pyx_kp_s_unable_to_allocate_shape_and_str; @@ -2299,9 +2257,10 @@ static PyObject *__pyx_tuple__5; static PyObject *__pyx_tuple__6; static PyObject *__pyx_tuple__7; static PyObject *__pyx_tuple__8; +static PyObject *__pyx_tuple__9; +static PyObject *__pyx_slice__19; static PyObject *__pyx_slice__20; static PyObject *__pyx_slice__21; -static PyObject *__pyx_slice__22; static PyObject *__pyx_tuple__10; static PyObject *__pyx_tuple__11; static PyObject *__pyx_tuple__12; @@ -2311,19 +2270,18 @@ static PyObject *__pyx_tuple__15; static PyObject *__pyx_tuple__16; static PyObject *__pyx_tuple__17; static PyObject *__pyx_tuple__18; -static PyObject *__pyx_tuple__19; -static PyObject *__pyx_tuple__23; -static PyObject *__pyx_tuple__34; -static PyObject *__pyx_tuple__36; -static PyObject *__pyx_tuple__38; -static PyObject *__pyx_tuple__40; -static PyObject *__pyx_tuple__42; +static PyObject *__pyx_tuple__22; +static PyObject *__pyx_tuple__33; +static PyObject *__pyx_tuple__35; +static PyObject *__pyx_tuple__37; +static PyObject *__pyx_tuple__39; +static PyObject *__pyx_tuple__41; +static PyObject *__pyx_tuple__43; static PyObject *__pyx_tuple__44; +static PyObject *__pyx_tuple__45; static PyObject *__pyx_tuple__46; static PyObject *__pyx_tuple__47; -static PyObject *__pyx_tuple__48; -static PyObject *__pyx_tuple__49; -static PyObject *__pyx_tuple__50; +static PyObject *__pyx_codeobj__23; static PyObject *__pyx_codeobj__24; static PyObject *__pyx_codeobj__25; static PyObject *__pyx_codeobj__26; @@ -2333,16 +2291,14 @@ static PyObject *__pyx_codeobj__29; static PyObject *__pyx_codeobj__30; static PyObject *__pyx_codeobj__31; static PyObject *__pyx_codeobj__32; -static PyObject *__pyx_codeobj__33; -static PyObject *__pyx_codeobj__35; -static PyObject *__pyx_codeobj__37; -static PyObject *__pyx_codeobj__39; -static PyObject *__pyx_codeobj__41; -static PyObject *__pyx_codeobj__43; -static PyObject *__pyx_codeobj__45; +static PyObject *__pyx_codeobj__34; +static PyObject *__pyx_codeobj__36; +static PyObject *__pyx_codeobj__38; +static PyObject *__pyx_codeobj__40; +static PyObject *__pyx_codeobj__42; -/* "EasyCL.pyx":4 - * cdef cDeepCL.EasyCL *thisptr +/* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":4 + * cdef cDeepCL.DeepCL *thisptr * * def __cinit__(self, gpuindex=None ): # <<<<<<<<<<<<<< * # print( '__cinit__(planes,size)') @@ -2350,8 +2306,8 @@ static PyObject *__pyx_codeobj__45; */ /* Python wrapper */ -static int __pyx_pw_8PyDeepCL_6EasyCL_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ -static int __pyx_pw_8PyDeepCL_6EasyCL_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { +static int __pyx_pw_8PyDeepCL_6DeepCL_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static int __pyx_pw_8PyDeepCL_6DeepCL_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_v_gpuindex = 0; int __pyx_lineno = 0; const char *__pyx_filename = NULL; @@ -2380,7 +2336,7 @@ static int __pyx_pw_8PyDeepCL_6EasyCL_1__cinit__(PyObject *__pyx_v_self, PyObjec } } if (unlikely(kw_args > 0)) { - if 
(unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else { switch (PyTuple_GET_SIZE(__pyx_args)) { @@ -2393,20 +2349,20 @@ static int __pyx_pw_8PyDeepCL_6EasyCL_1__cinit__(PyObject *__pyx_v_self, PyObjec } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 0, 1, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[3]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 0, 1, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[2]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; - __Pyx_AddTraceback("PyDeepCL.EasyCL.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_AddTraceback("PyDeepCL.DeepCL.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return -1; __pyx_L4_argument_unpacking_done:; - __pyx_r = __pyx_pf_8PyDeepCL_6EasyCL___cinit__(((struct __pyx_obj_8PyDeepCL_EasyCL *)__pyx_v_self), __pyx_v_gpuindex); + __pyx_r = __pyx_pf_8PyDeepCL_6DeepCL___cinit__(((struct __pyx_obj_8PyDeepCL_DeepCL *)__pyx_v_self), __pyx_v_gpuindex); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } -static int __pyx_pf_8PyDeepCL_6EasyCL___cinit__(struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_self, PyObject *__pyx_v_gpuindex) { +static int __pyx_pf_8PyDeepCL_6DeepCL___cinit__(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self, PyObject *__pyx_v_gpuindex) { int __pyx_r; __Pyx_RefNannyDeclarations int __pyx_t_1; @@ -2417,43 +2373,43 @@ static int __pyx_pf_8PyDeepCL_6EasyCL___cinit__(struct __pyx_obj_8PyDeepCL_EasyC int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "EasyCL.pyx":6 + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":6 * def __cinit__(self, gpuindex=None ): * # print( '__cinit__(planes,size)') * if gpuindex is None: # <<<<<<<<<<<<<< - * self.thisptr = cDeepCL.EasyCL.createForFirstGpuOtherwiseCpu() + * self.thisptr = cDeepCL.DeepCL.createForFirstGpuOtherwiseCpu() * else: */ __pyx_t_1 = (__pyx_v_gpuindex == Py_None); __pyx_t_2 = (__pyx_t_1 != 0); if (__pyx_t_2) { - /* "EasyCL.pyx":7 + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":7 * # print( '__cinit__(planes,size)') * if gpuindex is None: - * self.thisptr = cDeepCL.EasyCL.createForFirstGpuOtherwiseCpu() # <<<<<<<<<<<<<< + * self.thisptr = cDeepCL.DeepCL.createForFirstGpuOtherwiseCpu() # <<<<<<<<<<<<<< * else: - * self.thisptr = cDeepCL.EasyCL.createForIndexedGpu(gpuindex) + * self.thisptr = cDeepCL.DeepCL.createForIndexedGpu(gpuindex) */ - __pyx_v_self->thisptr = EasyCL::createForFirstGpuOtherwiseCpu(); + __pyx_v_self->thisptr = DeepCL::createForFirstGpuOtherwiseCpu(); goto __pyx_L3; } /*else*/ { - /* "EasyCL.pyx":9 - * self.thisptr = cDeepCL.EasyCL.createForFirstGpuOtherwiseCpu() + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":9 + * self.thisptr = cDeepCL.DeepCL.createForFirstGpuOtherwiseCpu() * else: - * self.thisptr = cDeepCL.EasyCL.createForIndexedGpu(gpuindex) # <<<<<<<<<<<<<< + * self.thisptr = cDeepCL.DeepCL.createForIndexedGpu(gpuindex) # <<<<<<<<<<<<<< * - * def __dealloc(self): + * def 
__dealloc__(self): */ - __pyx_t_3 = __Pyx_PyInt_As_int(__pyx_v_gpuindex); if (unlikely((__pyx_t_3 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_v_self->thisptr = EasyCL::createForIndexedGpu(__pyx_t_3); + __pyx_t_3 = __Pyx_PyInt_As_int(__pyx_v_gpuindex); if (unlikely((__pyx_t_3 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_self->thisptr = DeepCL::createForIndexedGpu(__pyx_t_3); } __pyx_L3:; - /* "EasyCL.pyx":4 - * cdef cDeepCL.EasyCL *thisptr + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":4 + * cdef cDeepCL.DeepCL *thisptr * * def __cinit__(self, gpuindex=None ): # <<<<<<<<<<<<<< * # print( '__cinit__(planes,size)') @@ -2464,63 +2420,482 @@ static int __pyx_pf_8PyDeepCL_6EasyCL___cinit__(struct __pyx_obj_8PyDeepCL_EasyC __pyx_r = 0; goto __pyx_L0; __pyx_L1_error:; - __Pyx_AddTraceback("PyDeepCL.EasyCL.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_AddTraceback("PyDeepCL.DeepCL.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = -1; __pyx_L0:; __Pyx_RefNannyFinishContext(); return __pyx_r; } -/* "EasyCL.pyx":11 - * self.thisptr = cDeepCL.EasyCL.createForIndexedGpu(gpuindex) +/* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":11 + * self.thisptr = cDeepCL.DeepCL.createForIndexedGpu(gpuindex) * - * def __dealloc(self): # <<<<<<<<<<<<<< - * del self.thisptr + * def __dealloc__(self): # <<<<<<<<<<<<<< + * self.thisptr.deleteMe() + * + */ + +/* Python wrapper */ +static void __pyx_pw_8PyDeepCL_6DeepCL_3__dealloc__(PyObject *__pyx_v_self); /*proto*/ +static void __pyx_pw_8PyDeepCL_6DeepCL_3__dealloc__(PyObject *__pyx_v_self) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0); + __pyx_pf_8PyDeepCL_6DeepCL_2__dealloc__(((struct __pyx_obj_8PyDeepCL_DeepCL *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +static void __pyx_pf_8PyDeepCL_6DeepCL_2__dealloc__(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__dealloc__", 0); + + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":12 + * + * def __dealloc__(self): + * self.thisptr.deleteMe() # <<<<<<<<<<<<<< + * + * def setProfiling(self, profiling): + */ + __pyx_v_self->thisptr->deleteMe(); + + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":11 + * self.thisptr = cDeepCL.DeepCL.createForIndexedGpu(gpuindex) + * + * def __dealloc__(self): # <<<<<<<<<<<<<< + * self.thisptr.deleteMe() + * + */ + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +/* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":14 + * self.thisptr.deleteMe() + * + * def setProfiling(self, profiling): # <<<<<<<<<<<<<< + * self.thisptr.setProfiling(profiling) * */ /* Python wrapper */ -static PyObject *__pyx_pw_8PyDeepCL_6EasyCL_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ -static PyObject *__pyx_pw_8PyDeepCL_6EasyCL_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { +static PyObject *__pyx_pw_8PyDeepCL_6DeepCL_5setProfiling(PyObject *__pyx_v_self, PyObject *__pyx_v_profiling); /*proto*/ +static PyObject *__pyx_pw_8PyDeepCL_6DeepCL_5setProfiling(PyObject *__pyx_v_self, PyObject *__pyx_v_profiling) { PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("__dealloc (wrapper)", 0); - __pyx_r = 
__pyx_pf_8PyDeepCL_6EasyCL_2__dealloc(((struct __pyx_obj_8PyDeepCL_EasyCL *)__pyx_v_self)); + __Pyx_RefNannySetupContext("setProfiling (wrapper)", 0); + __pyx_r = __pyx_pf_8PyDeepCL_6DeepCL_4setProfiling(((struct __pyx_obj_8PyDeepCL_DeepCL *)__pyx_v_self), ((PyObject *)__pyx_v_profiling)); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } -static PyObject *__pyx_pf_8PyDeepCL_6EasyCL_2__dealloc(struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_self) { +static PyObject *__pyx_pf_8PyDeepCL_6DeepCL_4setProfiling(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self, PyObject *__pyx_v_profiling) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("__dealloc", 0); + bool __pyx_t_1; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("setProfiling", 0); - /* "EasyCL.pyx":12 + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":15 * - * def __dealloc(self): - * del self.thisptr # <<<<<<<<<<<<<< + * def setProfiling(self, profiling): + * self.thisptr.setProfiling(profiling) # <<<<<<<<<<<<<< * + * def dumpProfiling(self): */ - delete __pyx_v_self->thisptr; + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_profiling); if (unlikely((__pyx_t_1 == (bool)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_self->thisptr->setProfiling(__pyx_t_1); - /* "EasyCL.pyx":11 - * self.thisptr = cDeepCL.EasyCL.createForIndexedGpu(gpuindex) + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":14 + * self.thisptr.deleteMe() * - * def __dealloc(self): # <<<<<<<<<<<<<< - * del self.thisptr + * def setProfiling(self, profiling): # <<<<<<<<<<<<<< + * self.thisptr.setProfiling(profiling) * */ /* function exit code */ __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_AddTraceback("PyDeepCL.DeepCL.setProfiling", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; } -/* "SGD.pyx":3 +/* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":17 + * self.thisptr.setProfiling(profiling) + * + * def dumpProfiling(self): # <<<<<<<<<<<<<< + * self.thisptr.dumpProfiling() + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_8PyDeepCL_6DeepCL_7dumpProfiling(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_8PyDeepCL_6DeepCL_7dumpProfiling(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("dumpProfiling (wrapper)", 0); + __pyx_r = __pyx_pf_8PyDeepCL_6DeepCL_6dumpProfiling(((struct __pyx_obj_8PyDeepCL_DeepCL *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_8PyDeepCL_6DeepCL_6dumpProfiling(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("dumpProfiling", 0); + + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":18 + * + * def dumpProfiling(self): + * self.thisptr.dumpProfiling() # <<<<<<<<<<<<<< + * + * def getComputeUnits(self): + */ + __pyx_v_self->thisptr->dumpProfiling(); + + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":17 + * self.thisptr.setProfiling(profiling) + * + * def dumpProfiling(self): # <<<<<<<<<<<<<< + * self.thisptr.dumpProfiling() + * + */ + + /* 
function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":20 + * self.thisptr.dumpProfiling() + * + * def getComputeUnits(self): # <<<<<<<<<<<<<< + * return self.thisptr.getComputeUnits() + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_8PyDeepCL_6DeepCL_9getComputeUnits(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_8PyDeepCL_6DeepCL_9getComputeUnits(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("getComputeUnits (wrapper)", 0); + __pyx_r = __pyx_pf_8PyDeepCL_6DeepCL_8getComputeUnits(((struct __pyx_obj_8PyDeepCL_DeepCL *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_8PyDeepCL_6DeepCL_8getComputeUnits(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("getComputeUnits", 0); + + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":21 + * + * def getComputeUnits(self): + * return self.thisptr.getComputeUnits() # <<<<<<<<<<<<<< + * + * def getLocalMemorySize(self): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->thisptr->getComputeUnits()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":20 + * self.thisptr.dumpProfiling() + * + * def getComputeUnits(self): # <<<<<<<<<<<<<< + * return self.thisptr.getComputeUnits() + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("PyDeepCL.DeepCL.getComputeUnits", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":23 + * return self.thisptr.getComputeUnits() + * + * def getLocalMemorySize(self): # <<<<<<<<<<<<<< + * return self.thisptr.getLocalMemorySize() + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_8PyDeepCL_6DeepCL_11getLocalMemorySize(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_8PyDeepCL_6DeepCL_11getLocalMemorySize(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("getLocalMemorySize (wrapper)", 0); + __pyx_r = __pyx_pf_8PyDeepCL_6DeepCL_10getLocalMemorySize(((struct __pyx_obj_8PyDeepCL_DeepCL *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_8PyDeepCL_6DeepCL_10getLocalMemorySize(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("getLocalMemorySize", 0); + + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":24 + * + * def getLocalMemorySize(self): 
+ * return self.thisptr.getLocalMemorySize() # <<<<<<<<<<<<<< + * + * def getLocalMemorySizeKB(self): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->thisptr->getLocalMemorySize()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":23 + * return self.thisptr.getComputeUnits() + * + * def getLocalMemorySize(self): # <<<<<<<<<<<<<< + * return self.thisptr.getLocalMemorySize() + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("PyDeepCL.DeepCL.getLocalMemorySize", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":26 + * return self.thisptr.getLocalMemorySize() + * + * def getLocalMemorySizeKB(self): # <<<<<<<<<<<<<< + * return self.thisptr.getLocalMemorySizeKB() + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_8PyDeepCL_6DeepCL_13getLocalMemorySizeKB(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_8PyDeepCL_6DeepCL_13getLocalMemorySizeKB(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("getLocalMemorySizeKB (wrapper)", 0); + __pyx_r = __pyx_pf_8PyDeepCL_6DeepCL_12getLocalMemorySizeKB(((struct __pyx_obj_8PyDeepCL_DeepCL *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_8PyDeepCL_6DeepCL_12getLocalMemorySizeKB(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("getLocalMemorySizeKB", 0); + + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":27 + * + * def getLocalMemorySizeKB(self): + * return self.thisptr.getLocalMemorySizeKB() # <<<<<<<<<<<<<< + * + * def getMaxWorkgroupSize(self): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->thisptr->getLocalMemorySizeKB()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":26 + * return self.thisptr.getLocalMemorySize() + * + * def getLocalMemorySizeKB(self): # <<<<<<<<<<<<<< + * return self.thisptr.getLocalMemorySizeKB() + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("PyDeepCL.DeepCL.getLocalMemorySizeKB", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":29 + * return self.thisptr.getLocalMemorySizeKB() + * + * def getMaxWorkgroupSize(self): # <<<<<<<<<<<<<< + * return self.thisptr.getMaxWorkgroupSize() + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_8PyDeepCL_6DeepCL_15getMaxWorkgroupSize(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ 
+static PyObject *__pyx_pw_8PyDeepCL_6DeepCL_15getMaxWorkgroupSize(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("getMaxWorkgroupSize (wrapper)", 0); + __pyx_r = __pyx_pf_8PyDeepCL_6DeepCL_14getMaxWorkgroupSize(((struct __pyx_obj_8PyDeepCL_DeepCL *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_8PyDeepCL_6DeepCL_14getMaxWorkgroupSize(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("getMaxWorkgroupSize", 0); + + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":30 + * + * def getMaxWorkgroupSize(self): + * return self.thisptr.getMaxWorkgroupSize() # <<<<<<<<<<<<<< + * + * def getMaxAllocSizeMB(self): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->thisptr->getMaxWorkgroupSize()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":29 + * return self.thisptr.getLocalMemorySizeKB() + * + * def getMaxWorkgroupSize(self): # <<<<<<<<<<<<<< + * return self.thisptr.getMaxWorkgroupSize() + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("PyDeepCL.DeepCL.getMaxWorkgroupSize", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":32 + * return self.thisptr.getMaxWorkgroupSize() + * + * def getMaxAllocSizeMB(self): # <<<<<<<<<<<<<< + * return self.thisptr.getMaxAllocSizeMB() + * + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_8PyDeepCL_6DeepCL_17getMaxAllocSizeMB(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_8PyDeepCL_6DeepCL_17getMaxAllocSizeMB(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("getMaxAllocSizeMB (wrapper)", 0); + __pyx_r = __pyx_pf_8PyDeepCL_6DeepCL_16getMaxAllocSizeMB(((struct __pyx_obj_8PyDeepCL_DeepCL *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_8PyDeepCL_6DeepCL_16getMaxAllocSizeMB(struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("getMaxAllocSizeMB", 0); + + /* "../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":33 + * + * def getMaxAllocSizeMB(self): + * return self.thisptr.getMaxAllocSizeMB() # <<<<<<<<<<<<<< + * + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->thisptr->getMaxAllocSizeMB()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* 
"../../../../../../home/user/git/DeepCL/python/DeepCL.pyx":32 + * return self.thisptr.getMaxWorkgroupSize() + * + * def getMaxAllocSizeMB(self): # <<<<<<<<<<<<<< + * return self.thisptr.getMaxAllocSizeMB() + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("PyDeepCL.DeepCL.getMaxAllocSizeMB", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":3 * cdef class TrainingContext: * cdef cDeepCL.TrainingContext *thisptr * def __cinit__(self, int epoch, int batch): # <<<<<<<<<<<<<< @@ -2559,11 +2934,11 @@ static int __pyx_pw_8PyDeepCL_15TrainingContext_1__cinit__(PyObject *__pyx_v_sel case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_batch)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 2, 2, 1); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 2, 2, 1); {__pyx_filename = __pyx_f[3]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 2) { goto __pyx_L5_argtuple_error; @@ -2571,12 +2946,12 @@ static int __pyx_pw_8PyDeepCL_15TrainingContext_1__cinit__(PyObject *__pyx_v_sel values[0] = PyTuple_GET_ITEM(__pyx_args, 0); values[1] = PyTuple_GET_ITEM(__pyx_args, 1); } - __pyx_v_epoch = __Pyx_PyInt_As_int(values[0]); if (unlikely((__pyx_v_epoch == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_batch = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_batch == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_epoch = __Pyx_PyInt_As_int(values[0]); if (unlikely((__pyx_v_epoch == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_batch = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_batch == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 2, 2, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 2, 2, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[3]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.TrainingContext.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); @@ -2594,7 +2969,7 @@ static int __pyx_pf_8PyDeepCL_15TrainingContext___cinit__(struct __pyx_obj_8PyDe __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__cinit__", 0); - /* "SGD.pyx":4 + /* 
"../../../../../../home/user/git/DeepCL/python/SGD.pyx":4 * cdef cDeepCL.TrainingContext *thisptr * def __cinit__(self, int epoch, int batch): * self.thisptr = new cDeepCL.TrainingContext(epoch, batch) # <<<<<<<<<<<<<< @@ -2603,7 +2978,7 @@ static int __pyx_pf_8PyDeepCL_15TrainingContext___cinit__(struct __pyx_obj_8PyDe */ __pyx_v_self->thisptr = new TrainingContext(__pyx_v_epoch, __pyx_v_batch); - /* "SGD.pyx":3 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":3 * cdef class TrainingContext: * cdef cDeepCL.TrainingContext *thisptr * def __cinit__(self, int epoch, int batch): # <<<<<<<<<<<<<< @@ -2617,7 +2992,7 @@ static int __pyx_pf_8PyDeepCL_15TrainingContext___cinit__(struct __pyx_obj_8PyDe return __pyx_r; } -/* "SGD.pyx":5 +/* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":5 * def __cinit__(self, int epoch, int batch): * self.thisptr = new cDeepCL.TrainingContext(epoch, batch) * def __dealloc__(self): # <<<<<<<<<<<<<< @@ -2640,7 +3015,7 @@ static void __pyx_pf_8PyDeepCL_15TrainingContext_2__dealloc__(struct __pyx_obj_8 __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__dealloc__", 0); - /* "SGD.pyx":6 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":6 * self.thisptr = new cDeepCL.TrainingContext(epoch, batch) * def __dealloc__(self): * del self.thisptr # <<<<<<<<<<<<<< @@ -2649,7 +3024,7 @@ static void __pyx_pf_8PyDeepCL_15TrainingContext_2__dealloc__(struct __pyx_obj_8 */ delete __pyx_v_self->thisptr; - /* "SGD.pyx":5 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":5 * def __cinit__(self, int epoch, int batch): * self.thisptr = new cDeepCL.TrainingContext(epoch, batch) * def __dealloc__(self): # <<<<<<<<<<<<<< @@ -2661,10 +3036,10 @@ static void __pyx_pf_8PyDeepCL_15TrainingContext_2__dealloc__(struct __pyx_obj_8 __Pyx_RefNannyFinishContext(); } -/* "SGD.pyx":10 +/* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":10 * cdef class SGD: * cdef cDeepCL.SGD *thisptr - * def __cinit__( self, EasyCL cl, learningRate, momentum=0.0 ): # <<<<<<<<<<<<<< + * def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ): # <<<<<<<<<<<<<< * self.thisptr = new cDeepCL.SGD(cl.thisptr) * self.thisptr.setLearningRate(learningRate) */ @@ -2672,7 +3047,7 @@ static void __pyx_pf_8PyDeepCL_15TrainingContext_2__dealloc__(struct __pyx_obj_8 /* Python wrapper */ static int __pyx_pw_8PyDeepCL_3SGD_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static int __pyx_pw_8PyDeepCL_3SGD_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { - struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_cl = 0; + struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_cl = 0; PyObject *__pyx_v_learningRate = 0; PyObject *__pyx_v_momentum = 0; int __pyx_lineno = 0; @@ -2703,7 +3078,7 @@ static int __pyx_pw_8PyDeepCL_3SGD_1__cinit__(PyObject *__pyx_v_self, PyObject * case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_learningRate)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, 1); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, 1); {__pyx_filename = __pyx_f[3]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (kw_args > 0) { @@ -2712,7 +3087,7 @@ static int __pyx_pw_8PyDeepCL_3SGD_1__cinit__(PyObject *__pyx_v_self, PyObject * } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, 
"__cinit__") < 0)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else { switch (PyTuple_GET_SIZE(__pyx_args)) { @@ -2723,19 +3098,19 @@ static int __pyx_pw_8PyDeepCL_3SGD_1__cinit__(PyObject *__pyx_v_self, PyObject * default: goto __pyx_L5_argtuple_error; } } - __pyx_v_cl = ((struct __pyx_obj_8PyDeepCL_EasyCL *)values[0]); + __pyx_v_cl = ((struct __pyx_obj_8PyDeepCL_DeepCL *)values[0]); __pyx_v_learningRate = values[1]; __pyx_v_momentum = values[2]; } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[3]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.SGD.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return -1; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_cl), __pyx_ptype_8PyDeepCL_EasyCL, 1, "cl", 0))) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_cl), __pyx_ptype_8PyDeepCL_DeepCL, 1, "cl", 0))) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = __pyx_pf_8PyDeepCL_3SGD___cinit__(((struct __pyx_obj_8PyDeepCL_SGD *)__pyx_v_self), __pyx_v_cl, __pyx_v_learningRate, __pyx_v_momentum); /* function exit code */ @@ -2747,7 +3122,7 @@ static int __pyx_pw_8PyDeepCL_3SGD_1__cinit__(PyObject *__pyx_v_self, PyObject * return __pyx_r; } -static int __pyx_pf_8PyDeepCL_3SGD___cinit__(struct __pyx_obj_8PyDeepCL_SGD *__pyx_v_self, struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_cl, PyObject *__pyx_v_learningRate, PyObject *__pyx_v_momentum) { +static int __pyx_pf_8PyDeepCL_3SGD___cinit__(struct __pyx_obj_8PyDeepCL_SGD *__pyx_v_self, struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_cl, PyObject *__pyx_v_learningRate, PyObject *__pyx_v_momentum) { int __pyx_r; __Pyx_RefNannyDeclarations SGD *__pyx_t_1; @@ -2757,9 +3132,9 @@ static int __pyx_pf_8PyDeepCL_3SGD___cinit__(struct __pyx_obj_8PyDeepCL_SGD *__p int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "SGD.pyx":11 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":11 * cdef cDeepCL.SGD *thisptr - * def __cinit__( self, EasyCL cl, learningRate, momentum=0.0 ): + * def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ): * self.thisptr = new cDeepCL.SGD(cl.thisptr) # <<<<<<<<<<<<<< * self.thisptr.setLearningRate(learningRate) * self.thisptr.setMomentum(momentum) @@ -2768,34 +3143,34 @@ static int __pyx_pf_8PyDeepCL_3SGD___cinit__(struct __pyx_obj_8PyDeepCL_SGD *__p __pyx_t_1 = new SGD(__pyx_v_cl->thisptr); } catch(...) 
{ __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[4]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[3]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_v_self->thisptr = __pyx_t_1; - /* "SGD.pyx":12 - * def __cinit__( self, EasyCL cl, learningRate, momentum=0.0 ): + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":12 + * def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ): * self.thisptr = new cDeepCL.SGD(cl.thisptr) * self.thisptr.setLearningRate(learningRate) # <<<<<<<<<<<<<< * self.thisptr.setMomentum(momentum) - * def __dealloc(self): + * def __dealloc__(self): */ - __pyx_t_2 = __pyx_PyFloat_AsFloat(__pyx_v_learningRate); if (unlikely((__pyx_t_2 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __pyx_PyFloat_AsFloat(__pyx_v_learningRate); if (unlikely((__pyx_t_2 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_self->thisptr->setLearningRate(__pyx_t_2); - /* "SGD.pyx":13 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":13 * self.thisptr = new cDeepCL.SGD(cl.thisptr) * self.thisptr.setLearningRate(learningRate) * self.thisptr.setMomentum(momentum) # <<<<<<<<<<<<<< - * def __dealloc(self): + * def __dealloc__(self): * del self.thisptr */ - __pyx_t_2 = __pyx_PyFloat_AsFloat(__pyx_v_momentum); if (unlikely((__pyx_t_2 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __pyx_PyFloat_AsFloat(__pyx_v_momentum); if (unlikely((__pyx_t_2 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_self->thisptr->setMomentum(__pyx_t_2); - /* "SGD.pyx":10 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":10 * cdef class SGD: * cdef cDeepCL.SGD *thisptr - * def __cinit__( self, EasyCL cl, learningRate, momentum=0.0 ): # <<<<<<<<<<<<<< + * def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ): # <<<<<<<<<<<<<< * self.thisptr = new cDeepCL.SGD(cl.thisptr) * self.thisptr.setLearningRate(learningRate) */ @@ -2811,58 +3186,52 @@ static int __pyx_pf_8PyDeepCL_3SGD___cinit__(struct __pyx_obj_8PyDeepCL_SGD *__p return __pyx_r; } -/* "SGD.pyx":14 +/* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":14 * self.thisptr.setLearningRate(learningRate) * self.thisptr.setMomentum(momentum) - * def __dealloc(self): # <<<<<<<<<<<<<< + * def __dealloc__(self): # <<<<<<<<<<<<<< * del self.thisptr * def setLearningRate(self, float learningRate): */ /* Python wrapper */ -static PyObject *__pyx_pw_8PyDeepCL_3SGD_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ -static PyObject *__pyx_pw_8PyDeepCL_3SGD_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { - PyObject *__pyx_r = 0; +static void __pyx_pw_8PyDeepCL_3SGD_3__dealloc__(PyObject *__pyx_v_self); /*proto*/ +static void __pyx_pw_8PyDeepCL_3SGD_3__dealloc__(PyObject *__pyx_v_self) { __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("__dealloc (wrapper)", 0); - __pyx_r = __pyx_pf_8PyDeepCL_3SGD_2__dealloc(((struct __pyx_obj_8PyDeepCL_SGD *)__pyx_v_self)); + __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0); + __pyx_pf_8PyDeepCL_3SGD_2__dealloc__(((struct __pyx_obj_8PyDeepCL_SGD *)__pyx_v_self)); /* function exit code */ 
__Pyx_RefNannyFinishContext(); - return __pyx_r; } -static PyObject *__pyx_pf_8PyDeepCL_3SGD_2__dealloc(struct __pyx_obj_8PyDeepCL_SGD *__pyx_v_self) { - PyObject *__pyx_r = NULL; +static void __pyx_pf_8PyDeepCL_3SGD_2__dealloc__(struct __pyx_obj_8PyDeepCL_SGD *__pyx_v_self) { __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("__dealloc", 0); + __Pyx_RefNannySetupContext("__dealloc__", 0); - /* "SGD.pyx":15 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":15 * self.thisptr.setMomentum(momentum) - * def __dealloc(self): + * def __dealloc__(self): * del self.thisptr # <<<<<<<<<<<<<< * def setLearningRate(self, float learningRate): * self.thisptr.setLearningRate(learningRate) */ delete __pyx_v_self->thisptr; - /* "SGD.pyx":14 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":14 * self.thisptr.setLearningRate(learningRate) * self.thisptr.setMomentum(momentum) - * def __dealloc(self): # <<<<<<<<<<<<<< + * def __dealloc__(self): # <<<<<<<<<<<<<< * del self.thisptr * def setLearningRate(self, float learningRate): */ /* function exit code */ - __pyx_r = Py_None; __Pyx_INCREF(Py_None); - __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); - return __pyx_r; } -/* "SGD.pyx":16 - * def __dealloc(self): +/* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":16 + * def __dealloc__(self): * del self.thisptr * def setLearningRate(self, float learningRate): # <<<<<<<<<<<<<< * self.thisptr.setLearningRate(learningRate) @@ -2880,7 +3249,7 @@ static PyObject *__pyx_pw_8PyDeepCL_3SGD_5setLearningRate(PyObject *__pyx_v_self __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setLearningRate (wrapper)", 0); assert(__pyx_arg_learningRate); { - __pyx_v_learningRate = __pyx_PyFloat_AsFloat(__pyx_arg_learningRate); if (unlikely((__pyx_v_learningRate == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_learningRate = __pyx_PyFloat_AsFloat(__pyx_arg_learningRate); if (unlikely((__pyx_v_learningRate == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -2900,7 +3269,7 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_4setLearningRate(struct __pyx_obj_8PyDe __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setLearningRate", 0); - /* "SGD.pyx":17 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":17 * del self.thisptr * def setLearningRate(self, float learningRate): * self.thisptr.setLearningRate(learningRate) # <<<<<<<<<<<<<< @@ -2909,8 +3278,8 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_4setLearningRate(struct __pyx_obj_8PyDe */ __pyx_v_self->thisptr->setLearningRate(__pyx_v_learningRate); - /* "SGD.pyx":16 - * def __dealloc(self): + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":16 + * def __dealloc__(self): * del self.thisptr * def setLearningRate(self, float learningRate): # <<<<<<<<<<<<<< * self.thisptr.setLearningRate(learningRate) @@ -2924,7 +3293,7 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_4setLearningRate(struct __pyx_obj_8PyDe return __pyx_r; } -/* "SGD.pyx":18 +/* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":18 * def setLearningRate(self, float learningRate): * self.thisptr.setLearningRate(learningRate) * def setMomentum(self, float momentum): # <<<<<<<<<<<<<< @@ -2943,7 +3312,7 @@ static PyObject *__pyx_pw_8PyDeepCL_3SGD_7setMomentum(PyObject *__pyx_v_self, Py __Pyx_RefNannyDeclarations 
__Pyx_RefNannySetupContext("setMomentum (wrapper)", 0); assert(__pyx_arg_momentum); { - __pyx_v_momentum = __pyx_PyFloat_AsFloat(__pyx_arg_momentum); if (unlikely((__pyx_v_momentum == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_momentum = __pyx_PyFloat_AsFloat(__pyx_arg_momentum); if (unlikely((__pyx_v_momentum == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -2963,7 +3332,7 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_6setMomentum(struct __pyx_obj_8PyDeepCL __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setMomentum", 0); - /* "SGD.pyx":19 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":19 * self.thisptr.setLearningRate(learningRate) * def setMomentum(self, float momentum): * self.thisptr.setMomentum(momentum) # <<<<<<<<<<<<<< @@ -2972,7 +3341,7 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_6setMomentum(struct __pyx_obj_8PyDeepCL */ __pyx_v_self->thisptr->setMomentum(__pyx_v_momentum); - /* "SGD.pyx":18 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":18 * def setLearningRate(self, float learningRate): * self.thisptr.setLearningRate(learningRate) * def setMomentum(self, float momentum): # <<<<<<<<<<<<<< @@ -2987,7 +3356,7 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_6setMomentum(struct __pyx_obj_8PyDeepCL return __pyx_r; } -/* "SGD.pyx":20 +/* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":20 * def setMomentum(self, float momentum): * self.thisptr.setMomentum(momentum) * def setWeightDecay(self, float weightDecay): # <<<<<<<<<<<<<< @@ -3006,7 +3375,7 @@ static PyObject *__pyx_pw_8PyDeepCL_3SGD_9setWeightDecay(PyObject *__pyx_v_self, __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setWeightDecay (wrapper)", 0); assert(__pyx_arg_weightDecay); { - __pyx_v_weightDecay = __pyx_PyFloat_AsFloat(__pyx_arg_weightDecay); if (unlikely((__pyx_v_weightDecay == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_weightDecay = __pyx_PyFloat_AsFloat(__pyx_arg_weightDecay); if (unlikely((__pyx_v_weightDecay == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -3026,7 +3395,7 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_8setWeightDecay(struct __pyx_obj_8PyDee __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setWeightDecay", 0); - /* "SGD.pyx":21 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":21 * self.thisptr.setMomentum(momentum) * def setWeightDecay(self, float weightDecay): * self.thisptr.setWeightDecay(weightDecay) # <<<<<<<<<<<<<< @@ -3035,7 +3404,7 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_8setWeightDecay(struct __pyx_obj_8PyDee */ __pyx_v_self->thisptr->setWeightDecay(__pyx_v_weightDecay); - /* "SGD.pyx":20 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":20 * def setMomentum(self, float momentum): * self.thisptr.setMomentum(momentum) * def setWeightDecay(self, float weightDecay): # <<<<<<<<<<<<<< @@ -3050,7 +3419,7 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_8setWeightDecay(struct __pyx_obj_8PyDee return __pyx_r; } -/* "SGD.pyx":22 +/* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":22 * def setWeightDecay(self, float weightDecay): * 
self.thisptr.setWeightDecay(weightDecay) * def train(self, NeuralNet net, TrainingContext context, # <<<<<<<<<<<<<< @@ -3093,21 +3462,21 @@ static PyObject *__pyx_pw_8PyDeepCL_3SGD_11train(PyObject *__pyx_v_self, PyObjec case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_context)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 1); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 1); {__pyx_filename = __pyx_f[3]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_inputdata)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 2); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 2); {__pyx_filename = __pyx_f[3]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 3: if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_expectedOutput)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 3); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 3); {__pyx_filename = __pyx_f[3]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train") < 0)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train") < 0)) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 4) { goto __pyx_L5_argtuple_error; @@ -3119,19 +3488,19 @@ static PyObject *__pyx_pw_8PyDeepCL_3SGD_11train(PyObject *__pyx_v_self, PyObjec } __pyx_v_net = ((struct __pyx_obj_8PyDeepCL_NeuralNet *)values[0]); __pyx_v_context = ((struct __pyx_obj_8PyDeepCL_TrainingContext *)values[1]); - __pyx_v_inputdata = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[2]); if (unlikely(!__pyx_v_inputdata.memview)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_expectedOutput = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[3]); if (unlikely(!__pyx_v_expectedOutput.memview)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_inputdata = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[2]); if (unlikely(!__pyx_v_inputdata.memview)) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_expectedOutput = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[3]); if (unlikely(!__pyx_v_expectedOutput.memview)) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[3]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; 
__Pyx_AddTraceback("PyDeepCL.SGD.train", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_context), __pyx_ptype_8PyDeepCL_TrainingContext, 1, "context", 0))) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_context), __pyx_ptype_8PyDeepCL_TrainingContext, 1, "context", 0))) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = __pyx_pf_8PyDeepCL_3SGD_10train(((struct __pyx_obj_8PyDeepCL_SGD *)__pyx_v_self), __pyx_v_net, __pyx_v_context, __pyx_v_inputdata, __pyx_v_expectedOutput); /* function exit code */ @@ -3156,7 +3525,7 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_10train(struct __pyx_obj_8PyDeepCL_SGD int __pyx_clineno = 0; __Pyx_RefNannySetupContext("train", 0); - /* "SGD.pyx":25 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":25 * float[:] inputdata, float[:] expectedOutput ): * cdef cDeepCL.BatchResult result = self.thisptr.train( * net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0]) # <<<<<<<<<<<<<< @@ -3171,7 +3540,7 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_10train(struct __pyx_obj_8PyDeepCL_SGD } else if (unlikely(__pyx_t_1 >= __pyx_v_inputdata.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[4]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[3]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_t_3 = 0; __pyx_t_2 = -1; @@ -3181,10 +3550,10 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_10train(struct __pyx_obj_8PyDeepCL_SGD } else if (unlikely(__pyx_t_3 >= __pyx_v_expectedOutput.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[4]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[3]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "SGD.pyx":24 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":24 * def train(self, NeuralNet net, TrainingContext context, * float[:] inputdata, float[:] expectedOutput ): * cdef cDeepCL.BatchResult result = self.thisptr.train( # <<<<<<<<<<<<<< @@ -3193,7 +3562,7 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_10train(struct __pyx_obj_8PyDeepCL_SGD */ __pyx_v_result = __pyx_v_self->thisptr->train(__pyx_v_net->thisptr, __pyx_v_context->thisptr, (&(*((float *) ( /* dim=0 */ (__pyx_v_inputdata.data + __pyx_t_1 * __pyx_v_inputdata.strides[0]) )))), (&(*((float *) ( /* dim=0 */ (__pyx_v_expectedOutput.data + __pyx_t_3 * __pyx_v_expectedOutput.strides[0]) ))))); - /* "SGD.pyx":26 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":26 * cdef cDeepCL.BatchResult result = self.thisptr.train( * net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0]) * return result.getLoss() # <<<<<<<<<<<<<< @@ -3201,13 +3570,13 @@ static PyObject 
*__pyx_pf_8PyDeepCL_3SGD_10train(struct __pyx_obj_8PyDeepCL_SGD * float[:] inputdata, int[:] labels): */ __Pyx_XDECREF(__pyx_r); - __pyx_t_4 = PyFloat_FromDouble(__pyx_v_result.getLoss()); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 26; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PyFloat_FromDouble(__pyx_v_result.getLoss()); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 26; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); __pyx_r = __pyx_t_4; __pyx_t_4 = 0; goto __pyx_L0; - /* "SGD.pyx":22 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":22 * def setWeightDecay(self, float weightDecay): * self.thisptr.setWeightDecay(weightDecay) * def train(self, NeuralNet net, TrainingContext context, # <<<<<<<<<<<<<< @@ -3228,7 +3597,7 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_10train(struct __pyx_obj_8PyDeepCL_SGD return __pyx_r; } -/* "SGD.pyx":27 +/* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":27 * net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0]) * return result.getLoss() * def trainFromLabels(self, NeuralNet net, TrainingContext context, # <<<<<<<<<<<<<< @@ -3271,21 +3640,21 @@ static PyObject *__pyx_pw_8PyDeepCL_3SGD_13trainFromLabels(PyObject *__pyx_v_sel case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_context)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 1); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 1); {__pyx_filename = __pyx_f[3]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_inputdata)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 2); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 2); {__pyx_filename = __pyx_f[3]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 3: if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_labels)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 3); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 3); {__pyx_filename = __pyx_f[3]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "trainFromLabels") < 0)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "trainFromLabels") < 0)) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 4) { goto __pyx_L5_argtuple_error; @@ -3297,19 +3666,19 @@ static PyObject *__pyx_pw_8PyDeepCL_3SGD_13trainFromLabels(PyObject *__pyx_v_sel } __pyx_v_net = ((struct __pyx_obj_8PyDeepCL_NeuralNet *)values[0]); __pyx_v_context = ((struct __pyx_obj_8PyDeepCL_TrainingContext *)values[1]); - __pyx_v_inputdata = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[2]); if (unlikely(!__pyx_v_inputdata.memview)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 28; __pyx_clineno = 
__LINE__; goto __pyx_L3_error;} - __pyx_v_labels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(values[3]); if (unlikely(!__pyx_v_labels.memview)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_inputdata = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[2]); if (unlikely(!__pyx_v_inputdata.memview)) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_labels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(values[3]); if (unlikely(!__pyx_v_labels.memview)) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[3]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.SGD.trainFromLabels", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_context), __pyx_ptype_8PyDeepCL_TrainingContext, 1, "context", 0))) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_context), __pyx_ptype_8PyDeepCL_TrainingContext, 1, "context", 0))) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = __pyx_pf_8PyDeepCL_3SGD_12trainFromLabels(((struct __pyx_obj_8PyDeepCL_SGD *)__pyx_v_self), __pyx_v_net, __pyx_v_context, __pyx_v_inputdata, __pyx_v_labels); /* function exit code */ @@ -3336,12 +3705,11 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_12trainFromLabels(struct __pyx_obj_8PyD int __pyx_clineno = 0; __Pyx_RefNannySetupContext("trainFromLabels", 0); - /* "SGD.pyx":30 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":30 * float[:] inputdata, int[:] labels): * cdef cDeepCL.BatchResult result = self.thisptr.trainFromLabels( * net.thisptr, context.thisptr, &inputdata[0], &labels[0]) # <<<<<<<<<<<<<< * return ( result.getLoss(), result.getNumRight() ) - * */ __pyx_t_1 = 0; __pyx_t_2 = -1; @@ -3351,7 +3719,7 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_12trainFromLabels(struct __pyx_obj_8PyD } else if (unlikely(__pyx_t_1 >= __pyx_v_inputdata.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[4]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[3]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_t_3 = 0; __pyx_t_2 = -1; @@ -3361,10 +3729,10 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_12trainFromLabels(struct __pyx_obj_8PyD } else if (unlikely(__pyx_t_3 >= __pyx_v_labels.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 
!= -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[4]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[3]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "SGD.pyx":29 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":29 * def trainFromLabels(self, NeuralNet net, TrainingContext context, * float[:] inputdata, int[:] labels): * cdef cDeepCL.BatchResult result = self.thisptr.trainFromLabels( # <<<<<<<<<<<<<< @@ -3373,18 +3741,17 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_12trainFromLabels(struct __pyx_obj_8PyD */ __pyx_v_result = __pyx_v_self->thisptr->trainFromLabels(__pyx_v_net->thisptr, __pyx_v_context->thisptr, (&(*((float *) ( /* dim=0 */ (__pyx_v_inputdata.data + __pyx_t_1 * __pyx_v_inputdata.strides[0]) )))), (&(*((int *) ( /* dim=0 */ (__pyx_v_labels.data + __pyx_t_3 * __pyx_v_labels.strides[0]) ))))); - /* "SGD.pyx":31 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":31 * cdef cDeepCL.BatchResult result = self.thisptr.trainFromLabels( * net.thisptr, context.thisptr, &inputdata[0], &labels[0]) * return ( result.getLoss(), result.getNumRight() ) # <<<<<<<<<<<<<< - * */ __Pyx_XDECREF(__pyx_r); - __pyx_t_4 = PyFloat_FromDouble(__pyx_v_result.getLoss()); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PyFloat_FromDouble(__pyx_v_result.getLoss()); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); - __pyx_t_5 = __Pyx_PyInt_From_int(__pyx_v_result.getNumRight()); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_5 = __Pyx_PyInt_From_int(__pyx_v_result.getNumRight()); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_5); - __pyx_t_6 = PyTuple_New(2); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = PyTuple_New(2); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_4); __Pyx_GIVEREF(__pyx_t_4); @@ -3396,7 +3763,7 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_12trainFromLabels(struct __pyx_obj_8PyD __pyx_t_6 = 0; goto __pyx_L0; - /* "SGD.pyx":27 + /* "../../../../../../home/user/git/DeepCL/python/SGD.pyx":27 * net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0]) * return result.getLoss() * def trainFromLabels(self, NeuralNet net, TrainingContext context, # <<<<<<<<<<<<<< @@ -3419,10 +3786,10 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_12trainFromLabels(struct __pyx_obj_8PyD return __pyx_r; } -/* "Annealer.pyx":3 +/* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":3 * cdef class Annealer: * cdef cDeepCL.Annealer *thisptr - * def __cinit__( self, EasyCL cl, learningRate, anneal=1.0 ): # <<<<<<<<<<<<<< + * def __cinit__( self, DeepCL cl, learningRate, anneal=1.0 ): # <<<<<<<<<<<<<< * self.thisptr = new cDeepCL.Annealer(cl.thisptr) * self.thisptr.setLearningRate(learningRate) */ @@ -3430,7 +3797,7 @@ static PyObject *__pyx_pf_8PyDeepCL_3SGD_12trainFromLabels(struct __pyx_obj_8PyD /* Python wrapper */ static int __pyx_pw_8PyDeepCL_8Annealer_1__cinit__(PyObject *__pyx_v_self, 
PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static int __pyx_pw_8PyDeepCL_8Annealer_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { - struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_cl = 0; + struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_cl = 0; PyObject *__pyx_v_learningRate = 0; PyObject *__pyx_v_anneal = 0; int __pyx_lineno = 0; @@ -3461,7 +3828,7 @@ static int __pyx_pw_8PyDeepCL_8Annealer_1__cinit__(PyObject *__pyx_v_self, PyObj case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_learningRate)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, 1); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, 1); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (kw_args > 0) { @@ -3470,7 +3837,7 @@ static int __pyx_pw_8PyDeepCL_8Annealer_1__cinit__(PyObject *__pyx_v_self, PyObj } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else { switch (PyTuple_GET_SIZE(__pyx_args)) { @@ -3481,19 +3848,19 @@ static int __pyx_pw_8PyDeepCL_8Annealer_1__cinit__(PyObject *__pyx_v_self, PyObj default: goto __pyx_L5_argtuple_error; } } - __pyx_v_cl = ((struct __pyx_obj_8PyDeepCL_EasyCL *)values[0]); + __pyx_v_cl = ((struct __pyx_obj_8PyDeepCL_DeepCL *)values[0]); __pyx_v_learningRate = values[1]; __pyx_v_anneal = values[2]; } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.Annealer.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return -1; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_cl), __pyx_ptype_8PyDeepCL_EasyCL, 1, "cl", 0))) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_cl), __pyx_ptype_8PyDeepCL_DeepCL, 1, "cl", 0))) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = __pyx_pf_8PyDeepCL_8Annealer___cinit__(((struct __pyx_obj_8PyDeepCL_Annealer *)__pyx_v_self), __pyx_v_cl, __pyx_v_learningRate, __pyx_v_anneal); /* function exit code */ @@ -3505,7 +3872,7 @@ static int __pyx_pw_8PyDeepCL_8Annealer_1__cinit__(PyObject *__pyx_v_self, PyObj return __pyx_r; } -static int __pyx_pf_8PyDeepCL_8Annealer___cinit__(struct __pyx_obj_8PyDeepCL_Annealer *__pyx_v_self, struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_cl, PyObject *__pyx_v_learningRate, PyObject *__pyx_v_anneal) { +static int __pyx_pf_8PyDeepCL_8Annealer___cinit__(struct __pyx_obj_8PyDeepCL_Annealer *__pyx_v_self, struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_cl, PyObject 
*__pyx_v_learningRate, PyObject *__pyx_v_anneal) { int __pyx_r; __Pyx_RefNannyDeclarations Annealer *__pyx_t_1; @@ -3515,9 +3882,9 @@ static int __pyx_pf_8PyDeepCL_8Annealer___cinit__(struct __pyx_obj_8PyDeepCL_Ann int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "Annealer.pyx":4 + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":4 * cdef cDeepCL.Annealer *thisptr - * def __cinit__( self, EasyCL cl, learningRate, anneal=1.0 ): + * def __cinit__( self, DeepCL cl, learningRate, anneal=1.0 ): * self.thisptr = new cDeepCL.Annealer(cl.thisptr) # <<<<<<<<<<<<<< * self.thisptr.setLearningRate(learningRate) * self.thisptr.setAnneal(anneal) @@ -3526,34 +3893,34 @@ static int __pyx_pf_8PyDeepCL_8Annealer___cinit__(struct __pyx_obj_8PyDeepCL_Ann __pyx_t_1 = new Annealer(__pyx_v_cl->thisptr); } catch(...) { __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[5]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[4]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_v_self->thisptr = __pyx_t_1; - /* "Annealer.pyx":5 - * def __cinit__( self, EasyCL cl, learningRate, anneal=1.0 ): + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":5 + * def __cinit__( self, DeepCL cl, learningRate, anneal=1.0 ): * self.thisptr = new cDeepCL.Annealer(cl.thisptr) * self.thisptr.setLearningRate(learningRate) # <<<<<<<<<<<<<< * self.thisptr.setAnneal(anneal) - * def __dealloc(self): + * def __dealloc__(self): */ - __pyx_t_2 = __pyx_PyFloat_AsFloat(__pyx_v_learningRate); if (unlikely((__pyx_t_2 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 5; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __pyx_PyFloat_AsFloat(__pyx_v_learningRate); if (unlikely((__pyx_t_2 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 5; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_self->thisptr->setLearningRate(__pyx_t_2); - /* "Annealer.pyx":6 + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":6 * self.thisptr = new cDeepCL.Annealer(cl.thisptr) * self.thisptr.setLearningRate(learningRate) * self.thisptr.setAnneal(anneal) # <<<<<<<<<<<<<< - * def __dealloc(self): + * def __dealloc__(self): * del self.thisptr */ - __pyx_t_2 = __pyx_PyFloat_AsFloat(__pyx_v_anneal); if (unlikely((__pyx_t_2 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 6; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __pyx_PyFloat_AsFloat(__pyx_v_anneal); if (unlikely((__pyx_t_2 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 6; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_self->thisptr->setAnneal(__pyx_t_2); - /* "Annealer.pyx":3 + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":3 * cdef class Annealer: * cdef cDeepCL.Annealer *thisptr - * def __cinit__( self, EasyCL cl, learningRate, anneal=1.0 ): # <<<<<<<<<<<<<< + * def __cinit__( self, DeepCL cl, learningRate, anneal=1.0 ): # <<<<<<<<<<<<<< * self.thisptr = new cDeepCL.Annealer(cl.thisptr) * self.thisptr.setLearningRate(learningRate) */ @@ -3569,58 +3936,52 @@ static int __pyx_pf_8PyDeepCL_8Annealer___cinit__(struct __pyx_obj_8PyDeepCL_Ann return __pyx_r; } -/* "Annealer.pyx":7 +/* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":7 * self.thisptr.setLearningRate(learningRate) * self.thisptr.setAnneal(anneal) - * def __dealloc(self): # <<<<<<<<<<<<<< + * def __dealloc__(self): # <<<<<<<<<<<<<< * 
del self.thisptr * def setLearningRate(self, float learningRate): */ /* Python wrapper */ -static PyObject *__pyx_pw_8PyDeepCL_8Annealer_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ -static PyObject *__pyx_pw_8PyDeepCL_8Annealer_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { - PyObject *__pyx_r = 0; +static void __pyx_pw_8PyDeepCL_8Annealer_3__dealloc__(PyObject *__pyx_v_self); /*proto*/ +static void __pyx_pw_8PyDeepCL_8Annealer_3__dealloc__(PyObject *__pyx_v_self) { __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("__dealloc (wrapper)", 0); - __pyx_r = __pyx_pf_8PyDeepCL_8Annealer_2__dealloc(((struct __pyx_obj_8PyDeepCL_Annealer *)__pyx_v_self)); + __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0); + __pyx_pf_8PyDeepCL_8Annealer_2__dealloc__(((struct __pyx_obj_8PyDeepCL_Annealer *)__pyx_v_self)); /* function exit code */ __Pyx_RefNannyFinishContext(); - return __pyx_r; } -static PyObject *__pyx_pf_8PyDeepCL_8Annealer_2__dealloc(struct __pyx_obj_8PyDeepCL_Annealer *__pyx_v_self) { - PyObject *__pyx_r = NULL; +static void __pyx_pf_8PyDeepCL_8Annealer_2__dealloc__(struct __pyx_obj_8PyDeepCL_Annealer *__pyx_v_self) { __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("__dealloc", 0); + __Pyx_RefNannySetupContext("__dealloc__", 0); - /* "Annealer.pyx":8 + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":8 * self.thisptr.setAnneal(anneal) - * def __dealloc(self): + * def __dealloc__(self): * del self.thisptr # <<<<<<<<<<<<<< * def setLearningRate(self, float learningRate): * self.thisptr.setLearningRate(learningRate) */ delete __pyx_v_self->thisptr; - /* "Annealer.pyx":7 + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":7 * self.thisptr.setLearningRate(learningRate) * self.thisptr.setAnneal(anneal) - * def __dealloc(self): # <<<<<<<<<<<<<< + * def __dealloc__(self): # <<<<<<<<<<<<<< * del self.thisptr * def setLearningRate(self, float learningRate): */ /* function exit code */ - __pyx_r = Py_None; __Pyx_INCREF(Py_None); - __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); - return __pyx_r; } -/* "Annealer.pyx":9 - * def __dealloc(self): +/* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":9 + * def __dealloc__(self): * del self.thisptr * def setLearningRate(self, float learningRate): # <<<<<<<<<<<<<< * self.thisptr.setLearningRate(learningRate) @@ -3638,7 +3999,7 @@ static PyObject *__pyx_pw_8PyDeepCL_8Annealer_5setLearningRate(PyObject *__pyx_v __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setLearningRate (wrapper)", 0); assert(__pyx_arg_learningRate); { - __pyx_v_learningRate = __pyx_PyFloat_AsFloat(__pyx_arg_learningRate); if (unlikely((__pyx_v_learningRate == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_learningRate = __pyx_PyFloat_AsFloat(__pyx_arg_learningRate); if (unlikely((__pyx_v_learningRate == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -3658,7 +4019,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_4setLearningRate(struct __pyx_obj_ __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setLearningRate", 0); - /* "Annealer.pyx":10 + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":10 * del self.thisptr * def setLearningRate(self, float learningRate): * self.thisptr.setLearningRate(learningRate) # 
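The __dealloc → __dealloc__ rename in these hunks is a real bug fix, not cosmetics. In Cython only the exact name __dealloc__ is wired into the extension type's tp_dealloc slot; a method spelled __dealloc is an ordinary Python method that nothing ever calls, so `del self.thisptr` never ran and every Annealer leaked its C++ object. The generated code confirms the change of kind: the old wrapper was a normal PyObject-returning method, the new one is a `static void ...__dealloc__(PyObject *)` slot function. Reassembled from the source lines quoted in the generated comments, the corrected Annealer.pyx begins:

    cdef class Annealer:
        cdef cDeepCL.Annealer *thisptr
        def __cinit__(self, DeepCL cl, learningRate, anneal=1.0):
            self.thisptr = new cDeepCL.Annealer(cl.thisptr)
            self.thisptr.setLearningRate(learningRate)
            self.thisptr.setAnneal(anneal)
        def __dealloc__(self):   # must be __dealloc__; a plain __dealloc is never invoked
            del self.thisptr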
<<<<<<<<<<<<<< @@ -3667,8 +4028,8 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_4setLearningRate(struct __pyx_obj_ */ __pyx_v_self->thisptr->setLearningRate(__pyx_v_learningRate); - /* "Annealer.pyx":9 - * def __dealloc(self): + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":9 + * def __dealloc__(self): * del self.thisptr * def setLearningRate(self, float learningRate): # <<<<<<<<<<<<<< * self.thisptr.setLearningRate(learningRate) @@ -3682,7 +4043,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_4setLearningRate(struct __pyx_obj_ return __pyx_r; } -/* "Annealer.pyx":11 +/* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":11 * def setLearningRate(self, float learningRate): * self.thisptr.setLearningRate(learningRate) * def setAnneal(self, float anneal): # <<<<<<<<<<<<<< @@ -3701,7 +4062,7 @@ static PyObject *__pyx_pw_8PyDeepCL_8Annealer_7setAnneal(PyObject *__pyx_v_self, __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setAnneal (wrapper)", 0); assert(__pyx_arg_anneal); { - __pyx_v_anneal = __pyx_PyFloat_AsFloat(__pyx_arg_anneal); if (unlikely((__pyx_v_anneal == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_anneal = __pyx_PyFloat_AsFloat(__pyx_arg_anneal); if (unlikely((__pyx_v_anneal == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -3721,7 +4082,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_6setAnneal(struct __pyx_obj_8PyDee __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setAnneal", 0); - /* "Annealer.pyx":12 + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":12 * self.thisptr.setLearningRate(learningRate) * def setAnneal(self, float anneal): * self.thisptr.setAnneal(anneal) # <<<<<<<<<<<<<< @@ -3730,7 +4091,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_6setAnneal(struct __pyx_obj_8PyDee */ __pyx_v_self->thisptr->setAnneal(__pyx_v_anneal); - /* "Annealer.pyx":11 + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":11 * def setLearningRate(self, float learningRate): * self.thisptr.setLearningRate(learningRate) * def setAnneal(self, float anneal): # <<<<<<<<<<<<<< @@ -3745,7 +4106,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_6setAnneal(struct __pyx_obj_8PyDee return __pyx_r; } -/* "Annealer.pyx":13 +/* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":13 * def setAnneal(self, float anneal): * self.thisptr.setAnneal(anneal) * def train(self, NeuralNet net, TrainingContext context, # <<<<<<<<<<<<<< @@ -3788,21 +4149,21 @@ static PyObject *__pyx_pw_8PyDeepCL_8Annealer_9train(PyObject *__pyx_v_self, PyO case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_context)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 1); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 1); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_inputdata)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 2); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 2); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 13; __pyx_clineno = 
__LINE__; goto __pyx_L3_error;} } case 3: if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_expectedOutput)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 3); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 3); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train") < 0)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train") < 0)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 4) { goto __pyx_L5_argtuple_error; @@ -3814,19 +4175,19 @@ static PyObject *__pyx_pw_8PyDeepCL_8Annealer_9train(PyObject *__pyx_v_self, PyO } __pyx_v_net = ((struct __pyx_obj_8PyDeepCL_NeuralNet *)values[0]); __pyx_v_context = ((struct __pyx_obj_8PyDeepCL_TrainingContext *)values[1]); - __pyx_v_inputdata = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[2]); if (unlikely(!__pyx_v_inputdata.memview)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_expectedOutput = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[3]); if (unlikely(!__pyx_v_expectedOutput.memview)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_inputdata = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[2]); if (unlikely(!__pyx_v_inputdata.memview)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_expectedOutput = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[3]); if (unlikely(!__pyx_v_expectedOutput.memview)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.Annealer.train", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_context), __pyx_ptype_8PyDeepCL_TrainingContext, 1, "context", 0))) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_context), __pyx_ptype_8PyDeepCL_TrainingContext, 1, "context", 0))) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto 
__pyx_L1_error;} __pyx_r = __pyx_pf_8PyDeepCL_8Annealer_8train(((struct __pyx_obj_8PyDeepCL_Annealer *)__pyx_v_self), __pyx_v_net, __pyx_v_context, __pyx_v_inputdata, __pyx_v_expectedOutput); /* function exit code */ @@ -3851,7 +4212,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_8train(struct __pyx_obj_8PyDeepCL_ int __pyx_clineno = 0; __Pyx_RefNannySetupContext("train", 0); - /* "Annealer.pyx":16 + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":16 * float[:] inputdata, float[:] expectedOutput ): * cdef cDeepCL.BatchResult result = self.thisptr.train( * net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0]) # <<<<<<<<<<<<<< @@ -3866,7 +4227,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_8train(struct __pyx_obj_8PyDeepCL_ } else if (unlikely(__pyx_t_1 >= __pyx_v_inputdata.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[5]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[4]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_t_3 = 0; __pyx_t_2 = -1; @@ -3876,10 +4237,10 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_8train(struct __pyx_obj_8PyDeepCL_ } else if (unlikely(__pyx_t_3 >= __pyx_v_expectedOutput.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[5]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[4]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "Annealer.pyx":15 + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":15 * def train(self, NeuralNet net, TrainingContext context, * float[:] inputdata, float[:] expectedOutput ): * cdef cDeepCL.BatchResult result = self.thisptr.train( # <<<<<<<<<<<<<< @@ -3888,7 +4249,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_8train(struct __pyx_obj_8PyDeepCL_ */ __pyx_v_result = __pyx_v_self->thisptr->train(__pyx_v_net->thisptr, __pyx_v_context->thisptr, (&(*((float *) ( /* dim=0 */ (__pyx_v_inputdata.data + __pyx_t_1 * __pyx_v_inputdata.strides[0]) )))), (&(*((float *) ( /* dim=0 */ (__pyx_v_expectedOutput.data + __pyx_t_3 * __pyx_v_expectedOutput.strides[0]) ))))); - /* "Annealer.pyx":17 + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":17 * cdef cDeepCL.BatchResult result = self.thisptr.train( * net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0]) * return result.getLoss() # <<<<<<<<<<<<<< @@ -3896,13 +4257,13 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_8train(struct __pyx_obj_8PyDeepCL_ * float[:] inputdata, int[:] labels): */ __Pyx_XDECREF(__pyx_r); - __pyx_t_4 = PyFloat_FromDouble(__pyx_v_result.getLoss()); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PyFloat_FromDouble(__pyx_v_result.getLoss()); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); __pyx_r = __pyx_t_4; __pyx_t_4 = 0; goto __pyx_L0; - /* "Annealer.pyx":13 + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":13 * def setAnneal(self, float anneal): * self.thisptr.setAnneal(anneal) * def train(self, NeuralNet net, TrainingContext context, # <<<<<<<<<<<<<< @@ -3923,7 +4284,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_8train(struct __pyx_obj_8PyDeepCL_ return __pyx_r; } -/* "Annealer.pyx":18 +/* 
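For readers skimming the pointer arithmetic above: the quoted .pyx passes &inputdata[0] and &expectedOutput[0] to the C++ trainer, and the surrounding generated code is Cython's bounds check on those index-0 accesses (raising a BufferIndexError for an empty slice) followed by the data + index*stride address computation. The whole block corresponds to just these lines of Annealer.pyx, as quoted in the generated comments:

    def train(self, NeuralNet net, TrainingContext context,
              float[:] inputdata, float[:] expectedOutput):
        cdef cDeepCL.BatchResult result = self.thisptr.train(
            net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0])
        return result.getLoss()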
"../../../../../../home/user/git/DeepCL/python/Annealer.pyx":18 * net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0]) * return result.getLoss() * def trainFromLabels(self, NeuralNet net, TrainingContext context, # <<<<<<<<<<<<<< @@ -3966,21 +4327,21 @@ static PyObject *__pyx_pw_8PyDeepCL_8Annealer_11trainFromLabels(PyObject *__pyx_ case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_context)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 1); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 1); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_inputdata)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 2); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 2); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 3: if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_labels)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 3); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 3); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "trainFromLabels") < 0)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "trainFromLabels") < 0)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 4) { goto __pyx_L5_argtuple_error; @@ -3992,19 +4353,19 @@ static PyObject *__pyx_pw_8PyDeepCL_8Annealer_11trainFromLabels(PyObject *__pyx_ } __pyx_v_net = ((struct __pyx_obj_8PyDeepCL_NeuralNet *)values[0]); __pyx_v_context = ((struct __pyx_obj_8PyDeepCL_TrainingContext *)values[1]); - __pyx_v_inputdata = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[2]); if (unlikely(!__pyx_v_inputdata.memview)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_labels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(values[3]); if (unlikely(!__pyx_v_labels.memview)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_inputdata = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[2]); if (unlikely(!__pyx_v_inputdata.memview)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_labels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(values[3]); if (unlikely(!__pyx_v_labels.memview)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + 
__Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[4]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.Annealer.trainFromLabels", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_context), __pyx_ptype_8PyDeepCL_TrainingContext, 1, "context", 0))) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_context), __pyx_ptype_8PyDeepCL_TrainingContext, 1, "context", 0))) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = __pyx_pf_8PyDeepCL_8Annealer_10trainFromLabels(((struct __pyx_obj_8PyDeepCL_Annealer *)__pyx_v_self), __pyx_v_net, __pyx_v_context, __pyx_v_inputdata, __pyx_v_labels); /* function exit code */ @@ -4031,7 +4392,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_10trainFromLabels(struct __pyx_obj int __pyx_clineno = 0; __Pyx_RefNannySetupContext("trainFromLabels", 0); - /* "Annealer.pyx":21 + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":21 * float[:] inputdata, int[:] labels): * cdef cDeepCL.BatchResult result = self.thisptr.trainFromLabels( * net.thisptr, context.thisptr, &inputdata[0], &labels[0]) # <<<<<<<<<<<<<< @@ -4046,7 +4407,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_10trainFromLabels(struct __pyx_obj } else if (unlikely(__pyx_t_1 >= __pyx_v_inputdata.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[5]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[4]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_t_3 = 0; __pyx_t_2 = -1; @@ -4056,10 +4417,10 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_10trainFromLabels(struct __pyx_obj } else if (unlikely(__pyx_t_3 >= __pyx_v_labels.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[5]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[4]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "Annealer.pyx":20 + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":20 * def trainFromLabels(self, NeuralNet net, TrainingContext context, * float[:] inputdata, int[:] labels): * cdef cDeepCL.BatchResult result = self.thisptr.trainFromLabels( # <<<<<<<<<<<<<< @@ -4068,18 +4429,18 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_10trainFromLabels(struct __pyx_obj */ __pyx_v_result = __pyx_v_self->thisptr->trainFromLabels(__pyx_v_net->thisptr, __pyx_v_context->thisptr, (&(*((float *) ( /* dim=0 */ (__pyx_v_inputdata.data + __pyx_t_1 * __pyx_v_inputdata.strides[0]) )))), (&(*((int *) ( /* dim=0 */ (__pyx_v_labels.data + __pyx_t_3 * __pyx_v_labels.strides[0]) ))))); - /* "Annealer.pyx":22 + /* 
"../../../../../../home/user/git/DeepCL/python/Annealer.pyx":22 * cdef cDeepCL.BatchResult result = self.thisptr.trainFromLabels( * net.thisptr, context.thisptr, &inputdata[0], &labels[0]) * return ( result.getLoss(), result.getNumRight() ) # <<<<<<<<<<<<<< * */ __Pyx_XDECREF(__pyx_r); - __pyx_t_4 = PyFloat_FromDouble(__pyx_v_result.getLoss()); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PyFloat_FromDouble(__pyx_v_result.getLoss()); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); - __pyx_t_5 = __Pyx_PyInt_From_int(__pyx_v_result.getNumRight()); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_5 = __Pyx_PyInt_From_int(__pyx_v_result.getNumRight()); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_5); - __pyx_t_6 = PyTuple_New(2); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = PyTuple_New(2); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_4); __Pyx_GIVEREF(__pyx_t_4); @@ -4091,7 +4452,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_10trainFromLabels(struct __pyx_obj __pyx_t_6 = 0; goto __pyx_L0; - /* "Annealer.pyx":18 + /* "../../../../../../home/user/git/DeepCL/python/Annealer.pyx":18 * net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0]) * return result.getLoss() * def trainFromLabels(self, NeuralNet net, TrainingContext context, # <<<<<<<<<<<<<< @@ -4114,10 +4475,10 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_10trainFromLabels(struct __pyx_obj return __pyx_r; } -/* "Nesterov.pyx":3 +/* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":3 * cdef class Nesterov: * cdef cDeepCL.Nesterov *thisptr - * def __cinit__( self, EasyCL cl, learningRate, momentum=0.0 ): # <<<<<<<<<<<<<< + * def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ): # <<<<<<<<<<<<<< * self.thisptr = new cDeepCL.Nesterov(cl.thisptr) * self.thisptr.setLearningRate(learningRate) */ @@ -4125,7 +4486,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Annealer_10trainFromLabels(struct __pyx_obj /* Python wrapper */ static int __pyx_pw_8PyDeepCL_8Nesterov_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static int __pyx_pw_8PyDeepCL_8Nesterov_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { - struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_cl = 0; + struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_cl = 0; PyObject *__pyx_v_learningRate = 0; PyObject *__pyx_v_momentum = 0; int __pyx_lineno = 0; @@ -4156,7 +4517,7 @@ static int __pyx_pw_8PyDeepCL_8Nesterov_1__cinit__(PyObject *__pyx_v_self, PyObj case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_learningRate)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, 1); {__pyx_filename = __pyx_f[6]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, 1); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (kw_args > 0) { @@ -4165,7 +4526,7 @@ 
static int __pyx_pw_8PyDeepCL_8Nesterov_1__cinit__(PyObject *__pyx_v_self, PyObj } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) {__pyx_filename = __pyx_f[6]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else { switch (PyTuple_GET_SIZE(__pyx_args)) { @@ -4176,19 +4537,19 @@ static int __pyx_pw_8PyDeepCL_8Nesterov_1__cinit__(PyObject *__pyx_v_self, PyObj default: goto __pyx_L5_argtuple_error; } } - __pyx_v_cl = ((struct __pyx_obj_8PyDeepCL_EasyCL *)values[0]); + __pyx_v_cl = ((struct __pyx_obj_8PyDeepCL_DeepCL *)values[0]); __pyx_v_learningRate = values[1]; __pyx_v_momentum = values[2]; } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[6]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.Nesterov.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return -1; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_cl), __pyx_ptype_8PyDeepCL_EasyCL, 1, "cl", 0))) {__pyx_filename = __pyx_f[6]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_cl), __pyx_ptype_8PyDeepCL_DeepCL, 1, "cl", 0))) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = __pyx_pf_8PyDeepCL_8Nesterov___cinit__(((struct __pyx_obj_8PyDeepCL_Nesterov *)__pyx_v_self), __pyx_v_cl, __pyx_v_learningRate, __pyx_v_momentum); /* function exit code */ @@ -4200,7 +4561,7 @@ static int __pyx_pw_8PyDeepCL_8Nesterov_1__cinit__(PyObject *__pyx_v_self, PyObj return __pyx_r; } -static int __pyx_pf_8PyDeepCL_8Nesterov___cinit__(struct __pyx_obj_8PyDeepCL_Nesterov *__pyx_v_self, struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_cl, PyObject *__pyx_v_learningRate, PyObject *__pyx_v_momentum) { +static int __pyx_pf_8PyDeepCL_8Nesterov___cinit__(struct __pyx_obj_8PyDeepCL_Nesterov *__pyx_v_self, struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_cl, PyObject *__pyx_v_learningRate, PyObject *__pyx_v_momentum) { int __pyx_r; __Pyx_RefNannyDeclarations Nesterov *__pyx_t_1; @@ -4210,9 +4571,9 @@ static int __pyx_pf_8PyDeepCL_8Nesterov___cinit__(struct __pyx_obj_8PyDeepCL_Nes int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "Nesterov.pyx":4 + /* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":4 * cdef cDeepCL.Nesterov *thisptr - * def __cinit__( self, EasyCL cl, learningRate, momentum=0.0 ): + * def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ): * self.thisptr = new cDeepCL.Nesterov(cl.thisptr) # <<<<<<<<<<<<<< * self.thisptr.setLearningRate(learningRate) * self.thisptr.setMomentum(momentum) @@ -4221,34 +4582,34 @@ static int __pyx_pf_8PyDeepCL_8Nesterov___cinit__(struct __pyx_obj_8PyDeepCL_Nes __pyx_t_1 = new Nesterov(__pyx_v_cl->thisptr); } catch(...) 
{ __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[6]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[5]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_v_self->thisptr = __pyx_t_1; - /* "Nesterov.pyx":5 - * def __cinit__( self, EasyCL cl, learningRate, momentum=0.0 ): + /* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":5 + * def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ): * self.thisptr = new cDeepCL.Nesterov(cl.thisptr) * self.thisptr.setLearningRate(learningRate) # <<<<<<<<<<<<<< * self.thisptr.setMomentum(momentum) - * def __dealloc(self): + * def __dealloc__(self): */ - __pyx_t_2 = __pyx_PyFloat_AsFloat(__pyx_v_learningRate); if (unlikely((__pyx_t_2 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[6]; __pyx_lineno = 5; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __pyx_PyFloat_AsFloat(__pyx_v_learningRate); if (unlikely((__pyx_t_2 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 5; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_self->thisptr->setLearningRate(__pyx_t_2); - /* "Nesterov.pyx":6 + /* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":6 * self.thisptr = new cDeepCL.Nesterov(cl.thisptr) * self.thisptr.setLearningRate(learningRate) * self.thisptr.setMomentum(momentum) # <<<<<<<<<<<<<< - * def __dealloc(self): + * def __dealloc__(self): * del self.thisptr */ - __pyx_t_2 = __pyx_PyFloat_AsFloat(__pyx_v_momentum); if (unlikely((__pyx_t_2 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[6]; __pyx_lineno = 6; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __pyx_PyFloat_AsFloat(__pyx_v_momentum); if (unlikely((__pyx_t_2 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 6; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_self->thisptr->setMomentum(__pyx_t_2); - /* "Nesterov.pyx":3 + /* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":3 * cdef class Nesterov: * cdef cDeepCL.Nesterov *thisptr - * def __cinit__( self, EasyCL cl, learningRate, momentum=0.0 ): # <<<<<<<<<<<<<< + * def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ): # <<<<<<<<<<<<<< * self.thisptr = new cDeepCL.Nesterov(cl.thisptr) * self.thisptr.setLearningRate(learningRate) */ @@ -4264,58 +4625,52 @@ static int __pyx_pf_8PyDeepCL_8Nesterov___cinit__(struct __pyx_obj_8PyDeepCL_Nes return __pyx_r; } -/* "Nesterov.pyx":7 +/* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":7 * self.thisptr.setLearningRate(learningRate) * self.thisptr.setMomentum(momentum) - * def __dealloc(self): # <<<<<<<<<<<<<< + * def __dealloc__(self): # <<<<<<<<<<<<<< * del self.thisptr * def setLearningRate(self, float learningRate): */ /* Python wrapper */ -static PyObject *__pyx_pw_8PyDeepCL_8Nesterov_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ -static PyObject *__pyx_pw_8PyDeepCL_8Nesterov_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { - PyObject *__pyx_r = 0; +static void __pyx_pw_8PyDeepCL_8Nesterov_3__dealloc__(PyObject *__pyx_v_self); /*proto*/ +static void __pyx_pw_8PyDeepCL_8Nesterov_3__dealloc__(PyObject *__pyx_v_self) { __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("__dealloc (wrapper)", 0); - __pyx_r = __pyx_pf_8PyDeepCL_8Nesterov_2__dealloc(((struct __pyx_obj_8PyDeepCL_Nesterov *)__pyx_v_self)); + __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0); + 
__pyx_pf_8PyDeepCL_8Nesterov_2__dealloc__(((struct __pyx_obj_8PyDeepCL_Nesterov *)__pyx_v_self)); /* function exit code */ __Pyx_RefNannyFinishContext(); - return __pyx_r; } -static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_2__dealloc(struct __pyx_obj_8PyDeepCL_Nesterov *__pyx_v_self) { - PyObject *__pyx_r = NULL; +static void __pyx_pf_8PyDeepCL_8Nesterov_2__dealloc__(struct __pyx_obj_8PyDeepCL_Nesterov *__pyx_v_self) { __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("__dealloc", 0); + __Pyx_RefNannySetupContext("__dealloc__", 0); - /* "Nesterov.pyx":8 + /* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":8 * self.thisptr.setMomentum(momentum) - * def __dealloc(self): + * def __dealloc__(self): * del self.thisptr # <<<<<<<<<<<<<< * def setLearningRate(self, float learningRate): * self.thisptr.setLearningRate(learningRate) */ delete __pyx_v_self->thisptr; - /* "Nesterov.pyx":7 + /* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":7 * self.thisptr.setLearningRate(learningRate) * self.thisptr.setMomentum(momentum) - * def __dealloc(self): # <<<<<<<<<<<<<< + * def __dealloc__(self): # <<<<<<<<<<<<<< * del self.thisptr * def setLearningRate(self, float learningRate): */ /* function exit code */ - __pyx_r = Py_None; __Pyx_INCREF(Py_None); - __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); - return __pyx_r; } -/* "Nesterov.pyx":9 - * def __dealloc(self): +/* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":9 + * def __dealloc__(self): * del self.thisptr * def setLearningRate(self, float learningRate): # <<<<<<<<<<<<<< * self.thisptr.setLearningRate(learningRate) @@ -4333,7 +4688,7 @@ static PyObject *__pyx_pw_8PyDeepCL_8Nesterov_5setLearningRate(PyObject *__pyx_v __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setLearningRate (wrapper)", 0); assert(__pyx_arg_learningRate); { - __pyx_v_learningRate = __pyx_PyFloat_AsFloat(__pyx_arg_learningRate); if (unlikely((__pyx_v_learningRate == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[6]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_learningRate = __pyx_PyFloat_AsFloat(__pyx_arg_learningRate); if (unlikely((__pyx_v_learningRate == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -4353,7 +4708,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_4setLearningRate(struct __pyx_obj_ __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setLearningRate", 0); - /* "Nesterov.pyx":10 + /* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":10 * del self.thisptr * def setLearningRate(self, float learningRate): * self.thisptr.setLearningRate(learningRate) # <<<<<<<<<<<<<< @@ -4362,8 +4717,8 @@ static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_4setLearningRate(struct __pyx_obj_ */ __pyx_v_self->thisptr->setLearningRate(__pyx_v_learningRate); - /* "Nesterov.pyx":9 - * def __dealloc(self): + /* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":9 + * def __dealloc__(self): * del self.thisptr * def setLearningRate(self, float learningRate): # <<<<<<<<<<<<<< * self.thisptr.setLearningRate(learningRate) @@ -4377,7 +4732,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_4setLearningRate(struct __pyx_obj_ return __pyx_r; } -/* "Nesterov.pyx":11 +/* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":11 * def setLearningRate(self, float learningRate): * self.thisptr.setLearningRate(learningRate) * def 
setMomentum(self, float momentum): # <<<<<<<<<<<<<< @@ -4396,7 +4751,7 @@ static PyObject *__pyx_pw_8PyDeepCL_8Nesterov_7setMomentum(PyObject *__pyx_v_sel __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setMomentum (wrapper)", 0); assert(__pyx_arg_momentum); { - __pyx_v_momentum = __pyx_PyFloat_AsFloat(__pyx_arg_momentum); if (unlikely((__pyx_v_momentum == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[6]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_momentum = __pyx_PyFloat_AsFloat(__pyx_arg_momentum); if (unlikely((__pyx_v_momentum == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -4416,7 +4771,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_6setMomentum(struct __pyx_obj_8PyD __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setMomentum", 0); - /* "Nesterov.pyx":12 + /* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":12 * self.thisptr.setLearningRate(learningRate) * def setMomentum(self, float momentum): * self.thisptr.setMomentum(momentum) # <<<<<<<<<<<<<< @@ -4425,7 +4780,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_6setMomentum(struct __pyx_obj_8PyD */ __pyx_v_self->thisptr->setMomentum(__pyx_v_momentum); - /* "Nesterov.pyx":11 + /* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":11 * def setLearningRate(self, float learningRate): * self.thisptr.setLearningRate(learningRate) * def setMomentum(self, float momentum): # <<<<<<<<<<<<<< @@ -4440,7 +4795,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_6setMomentum(struct __pyx_obj_8PyD return __pyx_r; } -/* "Nesterov.pyx":13 +/* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":13 * def setMomentum(self, float momentum): * self.thisptr.setMomentum(momentum) * def train(self, NeuralNet net, TrainingContext context, # <<<<<<<<<<<<<< @@ -4483,21 +4838,21 @@ static PyObject *__pyx_pw_8PyDeepCL_8Nesterov_9train(PyObject *__pyx_v_self, PyO case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_context)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 1); {__pyx_filename = __pyx_f[6]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 1); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_inputdata)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 2); {__pyx_filename = __pyx_f[6]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 2); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 3: if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_expectedOutput)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 3); {__pyx_filename = __pyx_f[6]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 3); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train") < 0)) {__pyx_filename = __pyx_f[6]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if 
(unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train") < 0)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 4) { goto __pyx_L5_argtuple_error; @@ -4509,19 +4864,19 @@ static PyObject *__pyx_pw_8PyDeepCL_8Nesterov_9train(PyObject *__pyx_v_self, PyO } __pyx_v_net = ((struct __pyx_obj_8PyDeepCL_NeuralNet *)values[0]); __pyx_v_context = ((struct __pyx_obj_8PyDeepCL_TrainingContext *)values[1]); - __pyx_v_inputdata = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[2]); if (unlikely(!__pyx_v_inputdata.memview)) {__pyx_filename = __pyx_f[6]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_expectedOutput = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[3]); if (unlikely(!__pyx_v_expectedOutput.memview)) {__pyx_filename = __pyx_f[6]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_inputdata = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[2]); if (unlikely(!__pyx_v_inputdata.memview)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_expectedOutput = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[3]); if (unlikely(!__pyx_v_expectedOutput.memview)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[6]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[5]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.Nesterov.train", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[6]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_context), __pyx_ptype_8PyDeepCL_TrainingContext, 1, "context", 0))) {__pyx_filename = __pyx_f[6]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_context), __pyx_ptype_8PyDeepCL_TrainingContext, 1, "context", 0))) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = __pyx_pf_8PyDeepCL_8Nesterov_8train(((struct __pyx_obj_8PyDeepCL_Nesterov *)__pyx_v_self), __pyx_v_net, __pyx_v_context, __pyx_v_inputdata, __pyx_v_expectedOutput); /* function exit code */ @@ -4546,7 +4901,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_8train(struct __pyx_obj_8PyDeepCL_ int __pyx_clineno = 0; __Pyx_RefNannySetupContext("train", 0); - /* "Nesterov.pyx":16 + /* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":16 * float[:] inputdata, float[:] expectedOutput ): * cdef cDeepCL.BatchResult result = self.thisptr.train( * net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0]) # <<<<<<<<<<<<<< @@ -4561,7 +4916,7 @@ static 
PyObject *__pyx_pf_8PyDeepCL_8Nesterov_8train(struct __pyx_obj_8PyDeepCL_ } else if (unlikely(__pyx_t_1 >= __pyx_v_inputdata.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[6]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[5]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_t_3 = 0; __pyx_t_2 = -1; @@ -4571,10 +4926,10 @@ static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_8train(struct __pyx_obj_8PyDeepCL_ } else if (unlikely(__pyx_t_3 >= __pyx_v_expectedOutput.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[6]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[5]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "Nesterov.pyx":15 + /* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":15 * def train(self, NeuralNet net, TrainingContext context, * float[:] inputdata, float[:] expectedOutput ): * cdef cDeepCL.BatchResult result = self.thisptr.train( # <<<<<<<<<<<<<< @@ -4583,7 +4938,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_8train(struct __pyx_obj_8PyDeepCL_ */ __pyx_v_result = __pyx_v_self->thisptr->train(__pyx_v_net->thisptr, __pyx_v_context->thisptr, (&(*((float *) ( /* dim=0 */ (__pyx_v_inputdata.data + __pyx_t_1 * __pyx_v_inputdata.strides[0]) )))), (&(*((float *) ( /* dim=0 */ (__pyx_v_expectedOutput.data + __pyx_t_3 * __pyx_v_expectedOutput.strides[0]) ))))); - /* "Nesterov.pyx":17 + /* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":17 * cdef cDeepCL.BatchResult result = self.thisptr.train( * net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0]) * return result.getLoss() # <<<<<<<<<<<<<< @@ -4591,13 +4946,13 @@ static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_8train(struct __pyx_obj_8PyDeepCL_ * float[:] inputdata, int[:] labels): */ __Pyx_XDECREF(__pyx_r); - __pyx_t_4 = PyFloat_FromDouble(__pyx_v_result.getLoss()); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[6]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PyFloat_FromDouble(__pyx_v_result.getLoss()); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); __pyx_r = __pyx_t_4; __pyx_t_4 = 0; goto __pyx_L0; - /* "Nesterov.pyx":13 + /* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":13 * def setMomentum(self, float momentum): * self.thisptr.setMomentum(momentum) * def train(self, NeuralNet net, TrainingContext context, # <<<<<<<<<<<<<< @@ -4618,7 +4973,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Nesterov_8train(struct __pyx_obj_8PyDeepCL_ return __pyx_r; } -/* "Nesterov.pyx":18 +/* "../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":18 * net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0]) * return result.getLoss() * def trainFromLabels(self, NeuralNet net, TrainingContext context, # <<<<<<<<<<<<<< @@ -4661,21 +5016,21 @@ static PyObject *__pyx_pw_8PyDeepCL_8Nesterov_11trainFromLabels(PyObject *__pyx_ case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_context)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 1); {__pyx_filename = __pyx_f[6]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 1); {__pyx_filename = 
__pyx_f[5]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L3_error;} }
[Condensed: the remainder of this span, hunks @@ -4687,19 +5042,19 @@ through @@ -6175,58 +6518,52 @@, is regenerated Cython output for the PyDeepCL trainer wrappers (Nesterov, Adagrad, Rmsprop, Adadelta). Four mechanical changes repeat throughout:
 1. Source-path comments switch from bare file names ("Nesterov.pyx":21) to the absolute paths the regeneration ran from ("../../../../../../home/user/git/DeepCL/python/Nesterov.pyx":21).
 2. Every __pyx_f[N] filename-table index drops by one (__pyx_f[6] -> __pyx_f[5] for Nesterov.pyx, [7] -> [6] for Adagrad.pyx, [8] -> [7] for Rmsprop.pyx, [9] -> [8] for Adadelta.pyx), matching one fewer entry in the module's source-file table.
 3. The wrapped OpenCL-handle class is renamed EasyCL -> DeepCL: struct __pyx_obj_8PyDeepCL_EasyCL becomes struct __pyx_obj_8PyDeepCL_DeepCL, __pyx_ptype_8PyDeepCL_EasyCL becomes __pyx_ptype_8PyDeepCL_DeepCL, and the quoted .pyx signatures change to match, e.g. def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ).
 4. The misnamed def __dealloc(self) methods become def __dealloc__(self), so Cython now generates real destructors instead of ordinary methods that were never called; a representative hunk is reconstructed below.
The Nesterov hunks finish first: the trainFromLabels keyword parsing, memoryview conversions (__Pyx_PyObject_to_MemoryviewSlice_ds_float for inputdata, _ds_int for labels), __Pyx_ArgTypeTest checks and buffer-index error paths all move from __pyx_f[6] to __pyx_f[5], and the function still returns the tuple ( result.getLoss(), result.getNumRight() ); a usage sketch follows.]
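[All four trainer classes in this span share that calling convention. A hypothetical driver snippet in Cython/Python syntax; trainer, batchSize and the array variables are assumed names, not taken from this patch:

    # trainFromLabels returns a (loss, numRight) pair, as the generated
    # PyTuple_New(2) code above shows; train returns just the loss.
    loss, numRight = trainer.trainFromLabels(net, context, inputdata, labels)
    print("loss %.4f, batch accuracy %.3f" % (loss, float(numRight) / batchSize))
]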
[The Adagrad hunks repeat the same four changes across __cinit__ (which accepts a momentum argument but leaves it CYTHON_UNUSED), setLearningRate, train and trainFromLabels, with error paths moving from __pyx_f[7] to __pyx_f[6]. The one behavioural fix is the destructor; representative hunk, line structure restored:

 /* Python wrapper */
-static PyObject *__pyx_pw_8PyDeepCL_7Adagrad_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/
-static PyObject *__pyx_pw_8PyDeepCL_7Adagrad_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) {
-  PyObject *__pyx_r = 0;
+static void __pyx_pw_8PyDeepCL_7Adagrad_3__dealloc__(PyObject *__pyx_v_self); /*proto*/
+static void __pyx_pw_8PyDeepCL_7Adagrad_3__dealloc__(PyObject *__pyx_v_self) {
   __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("__dealloc (wrapper)", 0);
-  __pyx_r = __pyx_pf_8PyDeepCL_7Adagrad_2__dealloc(((struct __pyx_obj_8PyDeepCL_Adagrad *)__pyx_v_self));
+  __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0);
+  __pyx_pf_8PyDeepCL_7Adagrad_2__dealloc__(((struct __pyx_obj_8PyDeepCL_Adagrad *)__pyx_v_self));

   /* function exit code */
   __Pyx_RefNannyFinishContext();
-  return __pyx_r;
 }

The matching __pyx_pf_ implementation changes the same way: it becomes void, keeps delete __pyx_v_self->thisptr; as its only statement, and drops the __pyx_r = Py_None; __Pyx_INCREF(Py_None); return boilerplate. The reassembled Adagrad.pyx source follows below.]
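[For orientation, the .pyx source these hunks correspond to can be reassembled from the quoted context lines. A sketch; the cimport line is assumed, everything else is quoted in the hunks above:

    cimport cDeepCL

    cdef class Adagrad:
        cdef cDeepCL.Adagrad *thisptr
        def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ):
            # momentum is accepted for signature compatibility but unused
            self.thisptr = new cDeepCL.Adagrad(cl.thisptr)
            self.thisptr.setLearningRate(learningRate)
        def __dealloc__(self):
            # formerly "__dealloc": an ordinary method nothing called, so the
            # C++ trainer leaked; as a real destructor this now runs automatically
            del self.thisptr
        def setLearningRate(self, float learningRate):
            self.thisptr.setLearningRate(learningRate)
        def train(self, NeuralNet net, TrainingContext context,
                float[:] inputdata, float[:] expectedOutput ):
            cdef cDeepCL.BatchResult result = self.thisptr.train(
                net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0])
            return result.getLoss()
        def trainFromLabels(self, NeuralNet net, TrainingContext context,
                float[:] inputdata, int[:] labels):
            cdef cDeepCL.BatchResult result = self.thisptr.trainFromLabels(
                net.thisptr, context.thisptr, &inputdata[0], &labels[0])
            return ( result.getLoss(), result.getNumRight() )
]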
[The Rmsprop hunks are textually parallel to Adagrad's: def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ) with the EasyCL -> DeepCL type change, the same __dealloc -> __dealloc__ destructor rework, setLearningRate, train returning result.getLoss(), and trainFromLabels returning ( result.getLoss(), result.getNumRight() ), with every error path moving from __pyx_f[8] to __pyx_f[7]. The Adadelta hunks then begin: its constructor is def __cinit__( self, DeepCL cl, rho=0.9 ), and the generated __cinit__ converts rho with __pyx_PyFloat_AsFloat, then constructs the C++ object as new Adadelta(__pyx_v_cl->thisptr, __pyx_t_1) inside a try/catch that routes C++ exceptions through __Pyx_CppExn2PyErr(), with error paths moving from __pyx_f[9] to __pyx_f[8]. A minimal illustration of the destructor bug follows.]
{ __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[9]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[8]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_v_self->thisptr = __pyx_t_2; - /* "Adadelta.pyx":3 + /* "../../../../../../home/user/git/DeepCL/python/Adadelta.pyx":3 * cdef class Adadelta: * cdef cDeepCL.Adadelta *thisptr - * def __cinit__( self, EasyCL cl, rho=0.9 ): # <<<<<<<<<<<<<< + * def __cinit__( self, DeepCL cl, rho=0.9 ): # <<<<<<<<<<<<<< * self.thisptr = new cDeepCL.Adadelta(cl.thisptr, rho) - * def __dealloc(self): + * def __dealloc__(self): */ /* function exit code */ @@ -6175,58 +6518,52 @@ static int __pyx_pf_8PyDeepCL_8Adadelta___cinit__(struct __pyx_obj_8PyDeepCL_Ada return __pyx_r; } -/* "Adadelta.pyx":5 - * def __cinit__( self, EasyCL cl, rho=0.9 ): +/* "../../../../../../home/user/git/DeepCL/python/Adadelta.pyx":5 + * def __cinit__( self, DeepCL cl, rho=0.9 ): * self.thisptr = new cDeepCL.Adadelta(cl.thisptr, rho) - * def __dealloc(self): # <<<<<<<<<<<<<< + * def __dealloc__(self): # <<<<<<<<<<<<<< * del self.thisptr * def train(self, NeuralNet net, TrainingContext context, */ /* Python wrapper */ -static PyObject *__pyx_pw_8PyDeepCL_8Adadelta_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ -static PyObject *__pyx_pw_8PyDeepCL_8Adadelta_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { - PyObject *__pyx_r = 0; +static void __pyx_pw_8PyDeepCL_8Adadelta_3__dealloc__(PyObject *__pyx_v_self); /*proto*/ +static void __pyx_pw_8PyDeepCL_8Adadelta_3__dealloc__(PyObject *__pyx_v_self) { __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("__dealloc (wrapper)", 0); - __pyx_r = __pyx_pf_8PyDeepCL_8Adadelta_2__dealloc(((struct __pyx_obj_8PyDeepCL_Adadelta *)__pyx_v_self)); + __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0); + __pyx_pf_8PyDeepCL_8Adadelta_2__dealloc__(((struct __pyx_obj_8PyDeepCL_Adadelta *)__pyx_v_self)); /* function exit code */ __Pyx_RefNannyFinishContext(); - return __pyx_r; } -static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_2__dealloc(struct __pyx_obj_8PyDeepCL_Adadelta *__pyx_v_self) { - PyObject *__pyx_r = NULL; +static void __pyx_pf_8PyDeepCL_8Adadelta_2__dealloc__(struct __pyx_obj_8PyDeepCL_Adadelta *__pyx_v_self) { __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("__dealloc", 0); + __Pyx_RefNannySetupContext("__dealloc__", 0); - /* "Adadelta.pyx":6 + /* "../../../../../../home/user/git/DeepCL/python/Adadelta.pyx":6 * self.thisptr = new cDeepCL.Adadelta(cl.thisptr, rho) - * def __dealloc(self): + * def __dealloc__(self): * del self.thisptr # <<<<<<<<<<<<<< * def train(self, NeuralNet net, TrainingContext context, * float[:] inputdata, float[:] expectedOutput ): */ delete __pyx_v_self->thisptr; - /* "Adadelta.pyx":5 - * def __cinit__( self, EasyCL cl, rho=0.9 ): + /* "../../../../../../home/user/git/DeepCL/python/Adadelta.pyx":5 + * def __cinit__( self, DeepCL cl, rho=0.9 ): * self.thisptr = new cDeepCL.Adadelta(cl.thisptr, rho) - * def __dealloc(self): # <<<<<<<<<<<<<< + * def __dealloc__(self): # <<<<<<<<<<<<<< * del self.thisptr * def train(self, NeuralNet net, TrainingContext context, */ /* function exit code */ - __pyx_r = Py_None; __Pyx_INCREF(Py_None); - __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); - return __pyx_r; } -/* "Adadelta.pyx":7 - * def __dealloc(self): +/* "../../../../../../home/user/git/DeepCL/python/Adadelta.pyx":7 + * def __dealloc__(self): * del self.thisptr * def train(self, NeuralNet 
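/* Editor's note (hedged): two distinct changes run through the Adadelta hunks
   above. First, the wrapped handle type is renamed from EasyCL to DeepCL
   (hence the __pyx_obj_8PyDeepCL_EasyCL -> __pyx_obj_8PyDeepCL_DeepCL churn,
   and, most likely, the __pyx_f[9] -> __pyx_f[8] renumbering of Cython's
   source-file table after regeneration). Second, the misspelled `__dealloc`
   becomes `__dealloc__`: only the double-underscore form is wired into the
   type's tp_dealloc slot, so the old spelling defined an ordinary method that
   was never called and leaked the C++ trainer. The corrected Cython pattern,
   as quoted in the generated comments above:

       cdef class Adadelta:
           cdef cDeepCL.Adadelta *thisptr
           def __cinit__(self, DeepCL cl, rho=0.9):
               self.thisptr = new cDeepCL.Adadelta(cl.thisptr, rho)
           def __dealloc__(self):   # double underscores: called from tp_dealloc
               del self.thisptr
*/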
net, TrainingContext context, # <<<<<<<<<<<<<< * float[:] inputdata, float[:] expectedOutput ): @@ -6268,21 +6605,21 @@ static PyObject *__pyx_pw_8PyDeepCL_8Adadelta_5train(PyObject *__pyx_v_self, PyO case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_context)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 1); {__pyx_filename = __pyx_f[9]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 1); {__pyx_filename = __pyx_f[8]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_inputdata)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 2); {__pyx_filename = __pyx_f[9]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 2); {__pyx_filename = __pyx_f[8]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 3: if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_expectedOutput)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 3); {__pyx_filename = __pyx_f[9]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, 3); {__pyx_filename = __pyx_f[8]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train") < 0)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train") < 0)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 4) { goto __pyx_L5_argtuple_error; @@ -6294,19 +6631,19 @@ static PyObject *__pyx_pw_8PyDeepCL_8Adadelta_5train(PyObject *__pyx_v_self, PyO } __pyx_v_net = ((struct __pyx_obj_8PyDeepCL_NeuralNet *)values[0]); __pyx_v_context = ((struct __pyx_obj_8PyDeepCL_TrainingContext *)values[1]); - __pyx_v_inputdata = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[2]); if (unlikely(!__pyx_v_inputdata.memview)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_expectedOutput = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[3]); if (unlikely(!__pyx_v_expectedOutput.memview)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_inputdata = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[2]); if (unlikely(!__pyx_v_inputdata.memview)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_expectedOutput = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[3]); if (unlikely(!__pyx_v_expectedOutput.memview)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[9]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("train", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[8]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.Adadelta.train", __pyx_clineno, 
__pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_context), __pyx_ptype_8PyDeepCL_TrainingContext, 1, "context", 0))) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_context), __pyx_ptype_8PyDeepCL_TrainingContext, 1, "context", 0))) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = __pyx_pf_8PyDeepCL_8Adadelta_4train(((struct __pyx_obj_8PyDeepCL_Adadelta *)__pyx_v_self), __pyx_v_net, __pyx_v_context, __pyx_v_inputdata, __pyx_v_expectedOutput); /* function exit code */ @@ -6331,7 +6668,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_4train(struct __pyx_obj_8PyDeepCL_ int __pyx_clineno = 0; __Pyx_RefNannySetupContext("train", 0); - /* "Adadelta.pyx":10 + /* "../../../../../../home/user/git/DeepCL/python/Adadelta.pyx":10 * float[:] inputdata, float[:] expectedOutput ): * cdef cDeepCL.BatchResult result = self.thisptr.train( * net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0]) # <<<<<<<<<<<<<< @@ -6346,7 +6683,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_4train(struct __pyx_obj_8PyDeepCL_ } else if (unlikely(__pyx_t_1 >= __pyx_v_inputdata.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[9]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[8]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_t_3 = 0; __pyx_t_2 = -1; @@ -6356,10 +6693,10 @@ static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_4train(struct __pyx_obj_8PyDeepCL_ } else if (unlikely(__pyx_t_3 >= __pyx_v_expectedOutput.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[9]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[8]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "Adadelta.pyx":9 + /* "../../../../../../home/user/git/DeepCL/python/Adadelta.pyx":9 * def train(self, NeuralNet net, TrainingContext context, * float[:] inputdata, float[:] expectedOutput ): * cdef cDeepCL.BatchResult result = self.thisptr.train( # <<<<<<<<<<<<<< @@ -6368,7 +6705,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_4train(struct __pyx_obj_8PyDeepCL_ */ __pyx_v_result = __pyx_v_self->thisptr->train(__pyx_v_net->thisptr, __pyx_v_context->thisptr, (&(*((float *) ( /* dim=0 */ (__pyx_v_inputdata.data + __pyx_t_1 * __pyx_v_inputdata.strides[0]) )))), (&(*((float *) ( /* dim=0 */ (__pyx_v_expectedOutput.data + __pyx_t_3 * __pyx_v_expectedOutput.strides[0]) ))))); - /* "Adadelta.pyx":11 + /* "../../../../../../home/user/git/DeepCL/python/Adadelta.pyx":11 * cdef cDeepCL.BatchResult result = self.thisptr.train( * net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0]) * return result.getLoss() # <<<<<<<<<<<<<< @@ -6376,14 +6713,14 @@ static PyObject 
*__pyx_pf_8PyDeepCL_8Adadelta_4train(struct __pyx_obj_8PyDeepCL_ * float[:] inputdata, int[:] labels): */ __Pyx_XDECREF(__pyx_r); - __pyx_t_4 = PyFloat_FromDouble(__pyx_v_result.getLoss()); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PyFloat_FromDouble(__pyx_v_result.getLoss()); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); __pyx_r = __pyx_t_4; __pyx_t_4 = 0; goto __pyx_L0; - /* "Adadelta.pyx":7 - * def __dealloc(self): + /* "../../../../../../home/user/git/DeepCL/python/Adadelta.pyx":7 + * def __dealloc__(self): * del self.thisptr * def train(self, NeuralNet net, TrainingContext context, # <<<<<<<<<<<<<< * float[:] inputdata, float[:] expectedOutput ): @@ -6403,7 +6740,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_4train(struct __pyx_obj_8PyDeepCL_ return __pyx_r; } -/* "Adadelta.pyx":12 +/* "../../../../../../home/user/git/DeepCL/python/Adadelta.pyx":12 * net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0]) * return result.getLoss() * def trainFromLabels(self, NeuralNet net, TrainingContext context, # <<<<<<<<<<<<<< @@ -6446,21 +6783,21 @@ static PyObject *__pyx_pw_8PyDeepCL_8Adadelta_7trainFromLabels(PyObject *__pyx_v case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_context)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 1); {__pyx_filename = __pyx_f[9]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 1); {__pyx_filename = __pyx_f[8]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_inputdata)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 2); {__pyx_filename = __pyx_f[9]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 2); {__pyx_filename = __pyx_f[8]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 3: if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_labels)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 3); {__pyx_filename = __pyx_f[9]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, 3); {__pyx_filename = __pyx_f[8]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "trainFromLabels") < 0)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "trainFromLabels") < 0)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 4) { goto __pyx_L5_argtuple_error; @@ -6472,19 +6809,19 @@ static PyObject *__pyx_pw_8PyDeepCL_8Adadelta_7trainFromLabels(PyObject *__pyx_v } __pyx_v_net = ((struct __pyx_obj_8PyDeepCL_NeuralNet *)values[0]); __pyx_v_context = ((struct __pyx_obj_8PyDeepCL_TrainingContext *)values[1]); - __pyx_v_inputdata = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[2]); if (unlikely(!__pyx_v_inputdata.memview)) {__pyx_filename = 
__pyx_f[9]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_labels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(values[3]); if (unlikely(!__pyx_v_labels.memview)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_inputdata = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[2]); if (unlikely(!__pyx_v_inputdata.memview)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_labels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(values[3]); if (unlikely(!__pyx_v_labels.memview)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[9]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("trainFromLabels", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[8]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.Adadelta.trainFromLabels", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_context), __pyx_ptype_8PyDeepCL_TrainingContext, 1, "context", 0))) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_context), __pyx_ptype_8PyDeepCL_TrainingContext, 1, "context", 0))) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = __pyx_pf_8PyDeepCL_8Adadelta_6trainFromLabels(((struct __pyx_obj_8PyDeepCL_Adadelta *)__pyx_v_self), __pyx_v_net, __pyx_v_context, __pyx_v_inputdata, __pyx_v_labels); /* function exit code */ @@ -6511,7 +6848,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_6trainFromLabels(struct __pyx_obj_ int __pyx_clineno = 0; __Pyx_RefNannySetupContext("trainFromLabels", 0); - /* "Adadelta.pyx":15 + /* "../../../../../../home/user/git/DeepCL/python/Adadelta.pyx":15 * float[:] inputdata, int[:] labels): * cdef cDeepCL.BatchResult result = self.thisptr.trainFromLabels( * net.thisptr, context.thisptr, &inputdata[0], &labels[0]) # <<<<<<<<<<<<<< @@ -6526,7 +6863,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_6trainFromLabels(struct __pyx_obj_ } else if (unlikely(__pyx_t_1 >= __pyx_v_inputdata.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[9]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[8]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_t_3 = 0; __pyx_t_2 = -1; @@ -6536,10 +6873,10 @@ static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_6trainFromLabels(struct __pyx_obj_ } else if (unlikely(__pyx_t_3 >= __pyx_v_labels.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { 
__Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[9]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[8]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "Adadelta.pyx":14 + /* "../../../../../../home/user/git/DeepCL/python/Adadelta.pyx":14 * def trainFromLabels(self, NeuralNet net, TrainingContext context, * float[:] inputdata, int[:] labels): * cdef cDeepCL.BatchResult result = self.thisptr.trainFromLabels( # <<<<<<<<<<<<<< @@ -6548,18 +6885,18 @@ static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_6trainFromLabels(struct __pyx_obj_ */ __pyx_v_result = __pyx_v_self->thisptr->trainFromLabels(__pyx_v_net->thisptr, __pyx_v_context->thisptr, (&(*((float *) ( /* dim=0 */ (__pyx_v_inputdata.data + __pyx_t_1 * __pyx_v_inputdata.strides[0]) )))), (&(*((int *) ( /* dim=0 */ (__pyx_v_labels.data + __pyx_t_3 * __pyx_v_labels.strides[0]) ))))); - /* "Adadelta.pyx":16 + /* "../../../../../../home/user/git/DeepCL/python/Adadelta.pyx":16 * cdef cDeepCL.BatchResult result = self.thisptr.trainFromLabels( * net.thisptr, context.thisptr, &inputdata[0], &labels[0]) * return ( result.getLoss(), result.getNumRight() ) # <<<<<<<<<<<<<< * */ __Pyx_XDECREF(__pyx_r); - __pyx_t_4 = PyFloat_FromDouble(__pyx_v_result.getLoss()); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PyFloat_FromDouble(__pyx_v_result.getLoss()); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); - __pyx_t_5 = __Pyx_PyInt_From_int(__pyx_v_result.getNumRight()); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_5 = __Pyx_PyInt_From_int(__pyx_v_result.getNumRight()); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_5); - __pyx_t_6 = PyTuple_New(2); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = PyTuple_New(2); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_4); __Pyx_GIVEREF(__pyx_t_4); @@ -6571,7 +6908,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_6trainFromLabels(struct __pyx_obj_ __pyx_t_6 = 0; goto __pyx_L0; - /* "Adadelta.pyx":12 + /* "../../../../../../home/user/git/DeepCL/python/Adadelta.pyx":12 * net.thisptr, context.thisptr, &inputdata[0], &expectedOutput[0]) * return result.getLoss() * def trainFromLabels(self, NeuralNet net, TrainingContext context, # <<<<<<<<<<<<<< @@ -6594,10 +6931,10 @@ static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_6trainFromLabels(struct __pyx_obj_ return __pyx_r; } -/* "NeuralNet.pyx":4 +/* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":4 * cdef cDeepCL.NeuralNet *thisptr * - * def __cinit__(self, EasyCL cl, planes = None, size = None): # <<<<<<<<<<<<<< + * def __cinit__(self, DeepCL cl, planes = None, size = None): # <<<<<<<<<<<<<< * # print( '__cinit__(planes,size)') * if planes == None and size == None: */ @@ -6605,7 +6942,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Adadelta_6trainFromLabels(struct __pyx_obj_ /* Python wrapper */ static int __pyx_pw_8PyDeepCL_9NeuralNet_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, 
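/* Editor's note (hedged): a minimal usage sketch of the trainer API generated
   above, assuming the renamed DeepCL handle. The DeepCL() and
   TrainingContext() constructor arguments here are hypothetical; only the
   trainFromLabels signature and its (loss, numRight) return tuple are taken
   from this diff:

       import array
       import PyDeepCL

       cl = PyDeepCL.DeepCL()                    # assumed default constructor
       net = PyDeepCL.NeuralNet(cl, 1, 28)       # planes=1, size=28
       trainer = PyDeepCL.Adadelta(cl, rho=0.9)
       context = PyDeepCL.TrainingContext(0, 0)  # hypothetical (epoch, batch) args

       images = array.array('f', [0.0] * (1 * 28 * 28))  # buffers satisfy float[:] / int[:]
       labels = array.array('i', [3])
       net.setBatchSize(1)
       loss, numRight = trainer.trainFromLabels(net, context, images, labels)
*/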
PyObject *__pyx_kwds); /*proto*/ static int __pyx_pw_8PyDeepCL_9NeuralNet_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { - struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_cl = 0; + struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_cl = 0; PyObject *__pyx_v_planes = 0; PyObject *__pyx_v_size = 0; int __pyx_lineno = 0; @@ -6657,7 +6994,7 @@ static int __pyx_pw_8PyDeepCL_9NeuralNet_1__cinit__(PyObject *__pyx_v_self, PyOb default: goto __pyx_L5_argtuple_error; } } - __pyx_v_cl = ((struct __pyx_obj_8PyDeepCL_EasyCL *)values[0]); + __pyx_v_cl = ((struct __pyx_obj_8PyDeepCL_DeepCL *)values[0]); __pyx_v_planes = values[1]; __pyx_v_size = values[2]; } @@ -6669,7 +7006,7 @@ static int __pyx_pw_8PyDeepCL_9NeuralNet_1__cinit__(PyObject *__pyx_v_self, PyOb __Pyx_RefNannyFinishContext(); return -1; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_cl), __pyx_ptype_8PyDeepCL_EasyCL, 1, "cl", 0))) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_cl), __pyx_ptype_8PyDeepCL_DeepCL, 1, "cl", 0))) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = __pyx_pf_8PyDeepCL_9NeuralNet___cinit__(((struct __pyx_obj_8PyDeepCL_NeuralNet *)__pyx_v_self), __pyx_v_cl, __pyx_v_planes, __pyx_v_size); /* function exit code */ @@ -6681,7 +7018,7 @@ static int __pyx_pw_8PyDeepCL_9NeuralNet_1__cinit__(PyObject *__pyx_v_self, PyOb return __pyx_r; } -static int __pyx_pf_8PyDeepCL_9NeuralNet___cinit__(struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_self, struct __pyx_obj_8PyDeepCL_EasyCL *__pyx_v_cl, PyObject *__pyx_v_planes, PyObject *__pyx_v_size) { +static int __pyx_pf_8PyDeepCL_9NeuralNet___cinit__(struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_self, struct __pyx_obj_8PyDeepCL_DeepCL *__pyx_v_cl, PyObject *__pyx_v_planes, PyObject *__pyx_v_size) { int __pyx_r; __Pyx_RefNannyDeclarations int __pyx_t_1; @@ -6695,11 +7032,11 @@ static int __pyx_pf_8PyDeepCL_9NeuralNet___cinit__(struct __pyx_obj_8PyDeepCL_Ne int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "NeuralNet.pyx":6 - * def __cinit__(self, EasyCL cl, planes = None, size = None): + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":6 + * def __cinit__(self, DeepCL cl, planes = None, size = None): * # print( '__cinit__(planes,size)') * if planes == None and size == None: # <<<<<<<<<<<<<< - * self.thisptr = new cDeepCL.NeuralNet(cl.thisptr) + * self.thisptr = cDeepCL.NeuralNet.instance(cl.thisptr) * else: */ __pyx_t_2 = PyObject_RichCompare(__pyx_v_planes, Py_None, Py_EQ); __Pyx_XGOTREF(__pyx_t_2); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 6; __pyx_clineno = __LINE__; goto __pyx_L1_error;} @@ -6717,15 +7054,15 @@ static int __pyx_pf_8PyDeepCL_9NeuralNet___cinit__(struct __pyx_obj_8PyDeepCL_Ne __pyx_L4_bool_binop_done:; if (__pyx_t_1) { - /* "NeuralNet.pyx":7 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":7 * # print( '__cinit__(planes,size)') * if planes == None and size == None: - * self.thisptr = new cDeepCL.NeuralNet(cl.thisptr) # <<<<<<<<<<<<<< + * self.thisptr = cDeepCL.NeuralNet.instance(cl.thisptr) # <<<<<<<<<<<<<< * else: - * self.thisptr = new cDeepCL.NeuralNet(cl.thisptr, planes, size) + * self.thisptr = cDeepCL.NeuralNet.instance3(cl.thisptr, planes, size) */ try { - __pyx_t_4 = new NeuralNet(__pyx_v_cl->thisptr); + __pyx_t_4 = NeuralNet::instance(__pyx_v_cl->thisptr); } 
catch(...) { __Pyx_CppExn2PyErr(); {__pyx_filename = __pyx_f[1]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L1_error;} @@ -6735,17 +7072,17 @@ static int __pyx_pf_8PyDeepCL_9NeuralNet___cinit__(struct __pyx_obj_8PyDeepCL_Ne } /*else*/ { - /* "NeuralNet.pyx":9 - * self.thisptr = new cDeepCL.NeuralNet(cl.thisptr) + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":9 + * self.thisptr = cDeepCL.NeuralNet.instance(cl.thisptr) * else: - * self.thisptr = new cDeepCL.NeuralNet(cl.thisptr, planes, size) # <<<<<<<<<<<<<< + * self.thisptr = cDeepCL.NeuralNet.instance3(cl.thisptr, planes, size) # <<<<<<<<<<<<<< * - * def __dealloc(self): + * def __dealloc__(self): */ __pyx_t_5 = __Pyx_PyInt_As_int(__pyx_v_planes); if (unlikely((__pyx_t_5 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_t_6 = __Pyx_PyInt_As_int(__pyx_v_size); if (unlikely((__pyx_t_6 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} try { - __pyx_t_4 = new NeuralNet(__pyx_v_cl->thisptr, __pyx_t_5, __pyx_t_6); + __pyx_t_4 = NeuralNet::instance3(__pyx_v_cl->thisptr, __pyx_t_5, __pyx_t_6); } catch(...) { __Pyx_CppExn2PyErr(); {__pyx_filename = __pyx_f[1]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} @@ -6754,10 +7091,10 @@ static int __pyx_pf_8PyDeepCL_9NeuralNet___cinit__(struct __pyx_obj_8PyDeepCL_Ne } __pyx_L3:; - /* "NeuralNet.pyx":4 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":4 * cdef cDeepCL.NeuralNet *thisptr * - * def __cinit__(self, EasyCL cl, planes = None, size = None): # <<<<<<<<<<<<<< + * def __cinit__(self, DeepCL cl, planes = None, size = None): # <<<<<<<<<<<<<< * # print( '__cinit__(planes,size)') * if planes == None and size == None: */ @@ -6774,62 +7111,56 @@ static int __pyx_pf_8PyDeepCL_9NeuralNet___cinit__(struct __pyx_obj_8PyDeepCL_Ne return __pyx_r; } -/* "NeuralNet.pyx":11 - * self.thisptr = new cDeepCL.NeuralNet(cl.thisptr, planes, size) +/* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":11 + * self.thisptr = cDeepCL.NeuralNet.instance3(cl.thisptr, planes, size) * - * def __dealloc(self): # <<<<<<<<<<<<<< - * del self.thisptr + * def __dealloc__(self): # <<<<<<<<<<<<<< + * self.thisptr.deleteMe() * */ /* Python wrapper */ -static PyObject *__pyx_pw_8PyDeepCL_9NeuralNet_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ -static PyObject *__pyx_pw_8PyDeepCL_9NeuralNet_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { - PyObject *__pyx_r = 0; +static void __pyx_pw_8PyDeepCL_9NeuralNet_3__dealloc__(PyObject *__pyx_v_self); /*proto*/ +static void __pyx_pw_8PyDeepCL_9NeuralNet_3__dealloc__(PyObject *__pyx_v_self) { __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("__dealloc (wrapper)", 0); - __pyx_r = __pyx_pf_8PyDeepCL_9NeuralNet_2__dealloc(((struct __pyx_obj_8PyDeepCL_NeuralNet *)__pyx_v_self)); + __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0); + __pyx_pf_8PyDeepCL_9NeuralNet_2__dealloc__(((struct __pyx_obj_8PyDeepCL_NeuralNet *)__pyx_v_self)); /* function exit code */ __Pyx_RefNannyFinishContext(); - return __pyx_r; } -static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_2__dealloc(struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_self) { - PyObject *__pyx_r = NULL; +static void __pyx_pf_8PyDeepCL_9NeuralNet_2__dealloc__(struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_self) { __Pyx_RefNannyDeclarations - 
__Pyx_RefNannySetupContext("__dealloc", 0); + __Pyx_RefNannySetupContext("__dealloc__", 0); - /* "NeuralNet.pyx":12 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":12 * - * def __dealloc(self): - * del self.thisptr # <<<<<<<<<<<<<< + * def __dealloc__(self): + * self.thisptr.deleteMe() # <<<<<<<<<<<<<< * * def asString(self): */ - delete __pyx_v_self->thisptr; + __pyx_v_self->thisptr->deleteMe(); - /* "NeuralNet.pyx":11 - * self.thisptr = new cDeepCL.NeuralNet(cl.thisptr, planes, size) + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":11 + * self.thisptr = cDeepCL.NeuralNet.instance3(cl.thisptr, planes, size) * - * def __dealloc(self): # <<<<<<<<<<<<<< - * del self.thisptr + * def __dealloc__(self): # <<<<<<<<<<<<<< + * self.thisptr.deleteMe() * */ /* function exit code */ - __pyx_r = Py_None; __Pyx_INCREF(Py_None); - __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); - return __pyx_r; } -/* "NeuralNet.pyx":14 - * del self.thisptr +/* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":14 + * self.thisptr.deleteMe() * * def asString(self): # <<<<<<<<<<<<<< - * return self.thisptr.asString() - * + * print('about to call asnewcharstar') + * cdef const char *result_charstar = self.thisptr.asNewCharStar() */ /* Python wrapper */ @@ -6846,55 +7177,115 @@ static PyObject *__pyx_pw_8PyDeepCL_9NeuralNet_5asString(PyObject *__pyx_v_self, } static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_4asString(struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_self) { + char const *__pyx_v_result_charstar; + PyObject *__pyx_v_result = 0; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations - std::string __pyx_t_1; + char const *__pyx_t_1; PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; __Pyx_RefNannySetupContext("asString", 0); - /* "NeuralNet.pyx":15 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":15 * * def asString(self): - * return self.thisptr.asString() # <<<<<<<<<<<<<< - * - * # def myprint(self): + * print('about to call asnewcharstar') # <<<<<<<<<<<<<< + * cdef const char *result_charstar = self.thisptr.asNewCharStar() + * print('got char *result') + */ + if (__Pyx_PrintOne(0, __pyx_kp_s_about_to_call_asnewcharstar) < 0) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":16 + * def asString(self): + * print('about to call asnewcharstar') + * cdef const char *result_charstar = self.thisptr.asNewCharStar() # <<<<<<<<<<<<<< + * print('got char *result') + * cdef str result = str(result_charstar.decode('UTF-8')) */ - __Pyx_XDECREF(__pyx_r); try { - __pyx_t_1 = __pyx_v_self->thisptr->asString(); + __pyx_t_1 = __pyx_v_self->thisptr->asNewCharStar(); } catch(...) 
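/* Editor's note (hedged): the NeuralNet hunks above replace direct
   `new cDeepCL.NeuralNet(...)` and `del self.thisptr` with the factory and
   self-destruction calls quoted in the regenerated comments, so the network
   is both allocated and freed inside the DeepCL library rather than in the
   extension module -- presumably to keep new/delete within a single runtime,
   in the spirit of the CppRuntimeBoundary helpers used elsewhere in this
   diff. The quoted Cython:

       cdef class NeuralNet:
           cdef cDeepCL.NeuralNet *thisptr
           def __cinit__(self, DeepCL cl, planes=None, size=None):
               if planes == None and size == None:
                   self.thisptr = cDeepCL.NeuralNet.instance(cl.thisptr)
               else:
                   self.thisptr = cDeepCL.NeuralNet.instance3(cl.thisptr, planes, size)
           def __dealloc__(self):
               self.thisptr.deleteMe()   # freed by the library, not by `del`
*/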
{ __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[1]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[1]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_2 = __pyx_convert_PyBytes_string_to_py_std__in_string(__pyx_t_1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_result_charstar = __pyx_t_1; + + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":17 + * print('about to call asnewcharstar') + * cdef const char *result_charstar = self.thisptr.asNewCharStar() + * print('got char *result') # <<<<<<<<<<<<<< + * cdef str result = str(result_charstar.decode('UTF-8')) + * CppRuntimeBoundary.deepcl_deleteCharStar(result_charstar) + */ + if (__Pyx_PrintOne(0, __pyx_kp_s_got_char_result) < 0) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":18 + * cdef const char *result_charstar = self.thisptr.asNewCharStar() + * print('got char *result') + * cdef str result = str(result_charstar.decode('UTF-8')) # <<<<<<<<<<<<<< + * CppRuntimeBoundary.deepcl_deleteCharStar(result_charstar) + * return result + */ + __pyx_t_2 = __Pyx_decode_c_string(__pyx_v_result_charstar, 0, strlen(__pyx_v_result_charstar), NULL, NULL, PyUnicode_DecodeUTF8); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - __pyx_r = __pyx_t_2; + __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_2); + __Pyx_GIVEREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)((PyObject*)(&PyString_Type))), __pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (!(likely(PyString_CheckExact(__pyx_t_2))||((__pyx_t_2) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "str", Py_TYPE(__pyx_t_2)->tp_name), 0))) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_result = ((PyObject*)__pyx_t_2); + __pyx_t_2 = 0; + + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":19 + * print('got char *result') + * cdef str result = str(result_charstar.decode('UTF-8')) + * CppRuntimeBoundary.deepcl_deleteCharStar(result_charstar) # <<<<<<<<<<<<<< + * return result + * + */ + deepcl_deleteCharStar(__pyx_v_result_charstar); + + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":20 + * cdef str result = str(result_charstar.decode('UTF-8')) + * CppRuntimeBoundary.deepcl_deleteCharStar(result_charstar) + * return result # <<<<<<<<<<<<<< + * + * # def myprint(self): + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_result); + __pyx_r = __pyx_v_result; goto __pyx_L0; - /* "NeuralNet.pyx":14 - * del self.thisptr + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":14 + * self.thisptr.deleteMe() * * def asString(self): # <<<<<<<<<<<<<< - * return self.thisptr.asString() - * + * print('about to call asnewcharstar') + * cdef const char *result_charstar = self.thisptr.asNewCharStar() */ /* function exit code */ __pyx_L1_error:; 
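/* Editor's note (hedged): asString() changes from returning a std::string by
   value to the char* handshake shown above: the C++ side allocates the string
   with asNewCharStar(), Python decodes a copy, and the pointer is handed back
   to the library's own deleter -- presumably so allocation and free both
   happen on the DeepCL side of the boundary. Stripped of its debug prints,
   the quoted Cython reads:

       def asString(self):
           cdef const char *result_charstar = self.thisptr.asNewCharStar()
           cdef str result = str(result_charstar.decode('UTF-8'))
           CppRuntimeBoundary.deepcl_deleteCharStar(result_charstar)
           return result
*/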
__Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); __Pyx_AddTraceback("PyDeepCL.NeuralNet.asString", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; + __Pyx_XDECREF(__pyx_v_result); __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; } -/* "NeuralNet.pyx":20 +/* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":25 * # self.thisptr.print() * * def setBatchSize( self, int batchSize ): # <<<<<<<<<<<<<< @@ -6913,7 +7304,7 @@ static PyObject *__pyx_pw_8PyDeepCL_9NeuralNet_7setBatchSize(PyObject *__pyx_v_s __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setBatchSize (wrapper)", 0); assert(__pyx_arg_batchSize); { - __pyx_v_batchSize = __Pyx_PyInt_As_int(__pyx_arg_batchSize); if (unlikely((__pyx_v_batchSize == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_batchSize = __Pyx_PyInt_As_int(__pyx_arg_batchSize); if (unlikely((__pyx_v_batchSize == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -6936,7 +7327,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_6setBatchSize(struct __pyx_obj_8P int __pyx_clineno = 0; __Pyx_RefNannySetupContext("setBatchSize", 0); - /* "NeuralNet.pyx":21 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":26 * * def setBatchSize( self, int batchSize ): * self.thisptr.setBatchSize( batchSize ) # <<<<<<<<<<<<<< @@ -6947,10 +7338,10 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_6setBatchSize(struct __pyx_obj_8P __pyx_v_self->thisptr->setBatchSize(__pyx_v_batchSize); } catch(...) { __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[1]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[1]; __pyx_lineno = 26; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "NeuralNet.pyx":20 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":25 * # self.thisptr.print() * * def setBatchSize( self, int batchSize ): # <<<<<<<<<<<<<< @@ -6970,7 +7361,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_6setBatchSize(struct __pyx_obj_8P return __pyx_r; } -/* "NeuralNet.pyx":22 +/* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":27 * def setBatchSize( self, int batchSize ): * self.thisptr.setBatchSize( batchSize ) * def forward( self, const float[:] images): # <<<<<<<<<<<<<< @@ -6989,7 +7380,7 @@ static PyObject *__pyx_pw_8PyDeepCL_9NeuralNet_9forward(PyObject *__pyx_v_self, __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("forward (wrapper)", 0); assert(__pyx_arg_images); { - __pyx_v_images = __Pyx_PyObject_to_MemoryviewSlice_ds_float(__pyx_arg_images); if (unlikely(!__pyx_v_images.memview)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_images = __Pyx_PyObject_to_MemoryviewSlice_ds_float(__pyx_arg_images); if (unlikely(!__pyx_v_images.memview)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -7014,7 +7405,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_8forward(struct __pyx_obj_8PyDeep int __pyx_clineno = 0; __Pyx_RefNannySetupContext("forward", 0); - /* "NeuralNet.pyx":23 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":28 * self.thisptr.setBatchSize( batchSize ) * def forward( self, const float[:] images): * 
self.thisptr.forward( &images[0] ) # <<<<<<<<<<<<<< @@ -7029,16 +7420,16 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_8forward(struct __pyx_obj_8PyDeep } else if (unlikely(__pyx_t_1 >= __pyx_v_images.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[1]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[1]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } try { __pyx_v_self->thisptr->forward((&(*((float *) ( /* dim=0 */ (__pyx_v_images.data + __pyx_t_1 * __pyx_v_images.strides[0]) ))))); } catch(...) { __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[1]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[1]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "NeuralNet.pyx":22 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":27 * def setBatchSize( self, int batchSize ): * self.thisptr.setBatchSize( batchSize ) * def forward( self, const float[:] images): # <<<<<<<<<<<<<< @@ -7059,7 +7450,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_8forward(struct __pyx_obj_8PyDeep return __pyx_r; } -/* "NeuralNet.pyx":24 +/* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":29 * def forward( self, const float[:] images): * self.thisptr.forward( &images[0] ) * def forwardList( self, imagesList): # <<<<<<<<<<<<<< @@ -7097,14 +7488,14 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_10forwardList(struct __pyx_obj_8P int __pyx_clineno = 0; __Pyx_RefNannySetupContext("forwardList", 0); - /* "NeuralNet.pyx":25 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":30 * self.thisptr.forward( &images[0] ) * def forwardList( self, imagesList): * cdef c_array.array imagesArray = array('f', imagesList ) # <<<<<<<<<<<<<< * cdef float[:] imagesArray_view = imagesArray * self.thisptr.forward( &imagesArray_view[0] ) */ - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_array); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_array); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = NULL; __pyx_t_4 = 0; @@ -7118,7 +7509,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_10forwardList(struct __pyx_obj_8P __pyx_t_4 = 1; } } - __pyx_t_5 = PyTuple_New(2+__pyx_t_4); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_5 = PyTuple_New(2+__pyx_t_4); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_5); if (__pyx_t_3) { PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_3); __Pyx_GIVEREF(__pyx_t_3); __pyx_t_3 = NULL; @@ -7129,15 +7520,15 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_10forwardList(struct __pyx_obj_8P __Pyx_INCREF(__pyx_v_imagesList); PyTuple_SET_ITEM(__pyx_t_5, 1+__pyx_t_4, __pyx_v_imagesList); __Pyx_GIVEREF(__pyx_v_imagesList); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_5, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_5, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 30; __pyx_clineno = 
__LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_7cpython_5array_array))))) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_7cpython_5array_array))))) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_imagesArray = ((arrayobject *)__pyx_t_1); __pyx_t_1 = 0; - /* "NeuralNet.pyx":26 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":31 * def forwardList( self, imagesList): * cdef c_array.array imagesArray = array('f', imagesList ) * cdef float[:] imagesArray_view = imagesArray # <<<<<<<<<<<<<< @@ -7145,12 +7536,12 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_10forwardList(struct __pyx_obj_8P * def backwardFromLabels( self, int[:] labels): */ __pyx_t_6 = __Pyx_PyObject_to_MemoryviewSlice_ds_float(((PyObject *)__pyx_v_imagesArray)); - if (unlikely(!__pyx_t_6.memview)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 26; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__pyx_t_6.memview)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_imagesArray_view = __pyx_t_6; __pyx_t_6.memview = NULL; __pyx_t_6.data = NULL; - /* "NeuralNet.pyx":27 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":32 * cdef c_array.array imagesArray = array('f', imagesList ) * cdef float[:] imagesArray_view = imagesArray * self.thisptr.forward( &imagesArray_view[0] ) # <<<<<<<<<<<<<< @@ -7165,16 +7556,16 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_10forwardList(struct __pyx_obj_8P } else if (unlikely(__pyx_t_4 >= __pyx_v_imagesArray_view.shape[0])) __pyx_t_7 = 0; if (unlikely(__pyx_t_7 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_7); - {__pyx_filename = __pyx_f[1]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[1]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } try { __pyx_v_self->thisptr->forward((&(*((float *) ( /* dim=0 */ (__pyx_v_imagesArray_view.data + __pyx_t_4 * __pyx_v_imagesArray_view.strides[0]) ))))); } catch(...) 
{ __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[1]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[1]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "NeuralNet.pyx":24 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":29 * def forward( self, const float[:] images): * self.thisptr.forward( &images[0] ) * def forwardList( self, imagesList): # <<<<<<<<<<<<<< @@ -7201,7 +7592,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_10forwardList(struct __pyx_obj_8P return __pyx_r; } -/* "NeuralNet.pyx":28 +/* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":33 * cdef float[:] imagesArray_view = imagesArray * self.thisptr.forward( &imagesArray_view[0] ) * def backwardFromLabels( self, int[:] labels): # <<<<<<<<<<<<<< @@ -7220,7 +7611,7 @@ static PyObject *__pyx_pw_8PyDeepCL_9NeuralNet_13backwardFromLabels(PyObject *__ __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("backwardFromLabels (wrapper)", 0); assert(__pyx_arg_labels); { - __pyx_v_labels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(__pyx_arg_labels); if (unlikely(!__pyx_v_labels.memview)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_labels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(__pyx_arg_labels); if (unlikely(!__pyx_v_labels.memview)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -7246,7 +7637,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_12backwardFromLabels(struct __pyx int __pyx_clineno = 0; __Pyx_RefNannySetupContext("backwardFromLabels", 0); - /* "NeuralNet.pyx":29 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":34 * self.thisptr.forward( &imagesArray_view[0] ) * def backwardFromLabels( self, int[:] labels): * return self.thisptr.backwardFromLabels( &labels[0] ) # <<<<<<<<<<<<<< @@ -7262,21 +7653,21 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_12backwardFromLabels(struct __pyx } else if (unlikely(__pyx_t_1 >= __pyx_v_labels.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[1]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[1]; __pyx_lineno = 34; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } try { __pyx_v_self->thisptr->backwardFromLabels((&(*((int *) ( /* dim=0 */ (__pyx_v_labels.data + __pyx_t_1 * __pyx_v_labels.strides[0]) ))))); } catch(...) 
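/* Editor's note (hedged): forwardList, unchanged here apart from the shifted
   source line numbers, is the convenience path: it copies an arbitrary Python
   sequence into a float array, takes a typed memoryview of it, and passes the
   address of element 0 to the C++ forward() -- exactly the
   data + index * strides[0] arithmetic the generated buffer checks above
   guard. From the quoted source, with the module-level imports it presumably
   relies on:

       from cpython cimport array as c_array   # presumed cimport
       from array import array                 # presumed stdlib import

       def forwardList(self, imagesList):
           cdef c_array.array imagesArray = array('f', imagesList)
           cdef float[:] imagesArray_view = imagesArray
           self.thisptr.forward(&imagesArray_view[0])
*/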
{ __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[1]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[1]; __pyx_lineno = 34; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_3 = __Pyx_void_to_None(NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_void_to_None(NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 34; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); __pyx_r = __pyx_t_3; __pyx_t_3 = 0; goto __pyx_L0; - /* "NeuralNet.pyx":28 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":33 * cdef float[:] imagesArray_view = imagesArray * self.thisptr.forward( &imagesArray_view[0] ) * def backwardFromLabels( self, int[:] labels): # <<<<<<<<<<<<<< @@ -7296,7 +7687,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_12backwardFromLabels(struct __pyx return __pyx_r; } -/* "NeuralNet.pyx":30 +/* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":35 * def backwardFromLabels( self, int[:] labels): * return self.thisptr.backwardFromLabels( &labels[0] ) * def backward( self, float[:] expectedOutput): # <<<<<<<<<<<<<< @@ -7315,7 +7706,7 @@ static PyObject *__pyx_pw_8PyDeepCL_9NeuralNet_15backward(PyObject *__pyx_v_self __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("backward (wrapper)", 0); assert(__pyx_arg_expectedOutput); { - __pyx_v_expectedOutput = __Pyx_PyObject_to_MemoryviewSlice_ds_float(__pyx_arg_expectedOutput); if (unlikely(!__pyx_v_expectedOutput.memview)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_expectedOutput = __Pyx_PyObject_to_MemoryviewSlice_ds_float(__pyx_arg_expectedOutput); if (unlikely(!__pyx_v_expectedOutput.memview)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 35; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -7341,7 +7732,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_14backward(struct __pyx_obj_8PyDe int __pyx_clineno = 0; __Pyx_RefNannySetupContext("backward", 0); - /* "NeuralNet.pyx":31 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":36 * return self.thisptr.backwardFromLabels( &labels[0] ) * def backward( self, float[:] expectedOutput): * return self.thisptr.backward( &expectedOutput[0] ) # <<<<<<<<<<<<<< @@ -7357,21 +7748,21 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_14backward(struct __pyx_obj_8PyDe } else if (unlikely(__pyx_t_1 >= __pyx_v_expectedOutput.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[1]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[1]; __pyx_lineno = 36; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } try { __pyx_v_self->thisptr->backward((&(*((float *) ( /* dim=0 */ (__pyx_v_expectedOutput.data + __pyx_t_1 * __pyx_v_expectedOutput.strides[0]) ))))); } catch(...) 
{ __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[1]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[1]; __pyx_lineno = 36; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_3 = __Pyx_void_to_None(NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_void_to_None(NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 36; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); __pyx_r = __pyx_t_3; __pyx_t_3 = 0; goto __pyx_L0; - /* "NeuralNet.pyx":30 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":35 * def backwardFromLabels( self, int[:] labels): * return self.thisptr.backwardFromLabels( &labels[0] ) * def backward( self, float[:] expectedOutput): # <<<<<<<<<<<<<< @@ -7391,7 +7782,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_14backward(struct __pyx_obj_8PyDe return __pyx_r; } -/* "NeuralNet.pyx":32 +/* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":37 * def backward( self, float[:] expectedOutput): * return self.thisptr.backward( &expectedOutput[0] ) * def calcNumRight( self, int[:] labels ): # <<<<<<<<<<<<<< @@ -7410,7 +7801,7 @@ static PyObject *__pyx_pw_8PyDeepCL_9NeuralNet_17calcNumRight(PyObject *__pyx_v_ __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("calcNumRight (wrapper)", 0); assert(__pyx_arg_labels); { - __pyx_v_labels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(__pyx_arg_labels); if (unlikely(!__pyx_v_labels.memview)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_labels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(__pyx_arg_labels); if (unlikely(!__pyx_v_labels.memview)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 37; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -7436,7 +7827,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_16calcNumRight(struct __pyx_obj_8 int __pyx_clineno = 0; __Pyx_RefNannySetupContext("calcNumRight", 0); - /* "NeuralNet.pyx":33 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":38 * return self.thisptr.backward( &expectedOutput[0] ) * def calcNumRight( self, int[:] labels ): * return self.thisptr.calcNumRight( &labels[0] ) # <<<<<<<<<<<<<< @@ -7452,21 +7843,21 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_16calcNumRight(struct __pyx_obj_8 } else if (unlikely(__pyx_t_1 >= __pyx_v_labels.shape[0])) __pyx_t_2 = 0; if (unlikely(__pyx_t_2 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_2); - {__pyx_filename = __pyx_f[1]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[1]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } try { __pyx_t_2 = __pyx_v_self->thisptr->calcNumRight((&(*((int *) ( /* dim=0 */ (__pyx_v_labels.data + __pyx_t_1 * __pyx_v_labels.strides[0]) ))))); } catch(...) 
{ __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[1]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[1]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_t_2); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_t_2); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); __pyx_r = __pyx_t_3; __pyx_t_3 = 0; goto __pyx_L0; - /* "NeuralNet.pyx":32 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":37 * def backward( self, float[:] expectedOutput): * return self.thisptr.backward( &expectedOutput[0] ) * def calcNumRight( self, int[:] labels ): # <<<<<<<<<<<<<< @@ -7486,7 +7877,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_16calcNumRight(struct __pyx_obj_8 return __pyx_r; } -/* "NeuralNet.pyx":34 +/* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":39 * def calcNumRight( self, int[:] labels ): * return self.thisptr.calcNumRight( &labels[0] ) * def addLayer( self, LayerMaker2 layerMaker ): # <<<<<<<<<<<<<< @@ -7503,7 +7894,7 @@ static PyObject *__pyx_pw_8PyDeepCL_9NeuralNet_19addLayer(PyObject *__pyx_v_self PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("addLayer (wrapper)", 0); - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_layerMaker), __pyx_ptype_8PyDeepCL_LayerMaker2, 1, "layerMaker", 0))) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 34; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_layerMaker), __pyx_ptype_8PyDeepCL_LayerMaker2, 1, "layerMaker", 0))) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = __pyx_pf_8PyDeepCL_9NeuralNet_18addLayer(((struct __pyx_obj_8PyDeepCL_NeuralNet *)__pyx_v_self), ((struct __pyx_obj_8PyDeepCL_LayerMaker2 *)__pyx_v_layerMaker)); /* function exit code */ @@ -7523,7 +7914,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_18addLayer(struct __pyx_obj_8PyDe int __pyx_clineno = 0; __Pyx_RefNannySetupContext("addLayer", 0); - /* "NeuralNet.pyx":35 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":40 * return self.thisptr.calcNumRight( &labels[0] ) * def addLayer( self, LayerMaker2 layerMaker ): * self.thisptr.addLayer( layerMaker.baseptr ) # <<<<<<<<<<<<<< @@ -7534,10 +7925,10 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_18addLayer(struct __pyx_obj_8PyDe __pyx_v_self->thisptr->addLayer(__pyx_v_layerMaker->baseptr); } catch(...) 
{ __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[1]; __pyx_lineno = 35; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[1]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "NeuralNet.pyx":34 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":39 * def calcNumRight( self, int[:] labels ): * return self.thisptr.calcNumRight( &labels[0] ) * def addLayer( self, LayerMaker2 layerMaker ): # <<<<<<<<<<<<<< @@ -7557,7 +7948,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_18addLayer(struct __pyx_obj_8PyDe return __pyx_r; } -/* "NeuralNet.pyx":36 +/* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":41 * def addLayer( self, LayerMaker2 layerMaker ): * self.thisptr.addLayer( layerMaker.baseptr ) * def getLayer( self, int index ): # <<<<<<<<<<<<<< @@ -7576,7 +7967,7 @@ static PyObject *__pyx_pw_8PyDeepCL_9NeuralNet_21getLayer(PyObject *__pyx_v_self __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("getLayer (wrapper)", 0); assert(__pyx_arg_index); { - __pyx_v_index = __Pyx_PyInt_As_int(__pyx_arg_index); if (unlikely((__pyx_v_index == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 36; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_index = __Pyx_PyInt_As_int(__pyx_arg_index); if (unlikely((__pyx_v_index == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -7604,7 +7995,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_20getLayer(struct __pyx_obj_8PyDe int __pyx_clineno = 0; __Pyx_RefNannySetupContext("getLayer", 0); - /* "NeuralNet.pyx":37 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":42 * self.thisptr.addLayer( layerMaker.baseptr ) * def getLayer( self, int index ): * cdef cDeepCL.Layer *cLayer = self.thisptr.getLayer( index ) # <<<<<<<<<<<<<< @@ -7613,7 +8004,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_20getLayer(struct __pyx_obj_8PyDe */ __pyx_v_cLayer = __pyx_v_self->thisptr->getLayer(__pyx_v_index); - /* "NeuralNet.pyx":38 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":43 * def getLayer( self, int index ): * cdef cDeepCL.Layer *cLayer = self.thisptr.getLayer( index ) * if cLayer == NULL: # <<<<<<<<<<<<<< @@ -7623,66 +8014,66 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_20getLayer(struct __pyx_obj_8PyDe __pyx_t_1 = ((__pyx_v_cLayer == NULL) != 0); if (__pyx_t_1) { - /* "NeuralNet.pyx":39 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":44 * cdef cDeepCL.Layer *cLayer = self.thisptr.getLayer( index ) * if cLayer == NULL: * raise Exception('layer ' + str(index) + ' not found') # <<<<<<<<<<<<<< * layer = Layer() * layer.set_thisptr( cLayer ) # note: once neuralnet out of scope, these */ - __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_index); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_index); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} 
__Pyx_GOTREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_2); __Pyx_GIVEREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)((PyObject*)(&PyString_Type))), __pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)((PyObject*)(&PyString_Type))), __pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = PyNumber_Add(__pyx_kp_s_layer, __pyx_t_2); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = PyNumber_Add(__pyx_kp_s_layer, __pyx_t_2); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = PyNumber_Add(__pyx_t_3, __pyx_kp_s_not_found); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = PyNumber_Add(__pyx_t_3, __pyx_kp_s_not_found); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_2); __Pyx_GIVEREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_Raise(__pyx_t_2, 0, 0, 0); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - {__pyx_filename = __pyx_f[1]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[1]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "NeuralNet.pyx":40 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":45 * if cLayer == NULL: * raise Exception('layer ' + str(index) + ' not found') * layer = Layer() # <<<<<<<<<<<<<< * layer.set_thisptr( cLayer ) # note: once neuralnet out of scope, these * # are no longer valid */ - __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)((PyObject*)__pyx_ptype_8PyDeepCL_Layer)), __pyx_empty_tuple, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)((PyObject*)__pyx_ptype_8PyDeepCL_Layer)), __pyx_empty_tuple, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 45; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __pyx_v_layer = ((struct __pyx_obj_8PyDeepCL_Layer *)__pyx_t_2); __pyx_t_2 = 0; - /* "NeuralNet.pyx":41 + /* 
"../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":46 * raise Exception('layer ' + str(index) + ' not found') * layer = Layer() * layer.set_thisptr( cLayer ) # note: once neuralnet out of scope, these # <<<<<<<<<<<<<< * # are no longer valid * return layer */ - __pyx_t_2 = ((struct __pyx_vtabstruct_8PyDeepCL_Layer *)__pyx_v_layer->__pyx_vtab)->set_thisptr(__pyx_v_layer, __pyx_v_cLayer); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = ((struct __pyx_vtabstruct_8PyDeepCL_Layer *)__pyx_v_layer->__pyx_vtab)->set_thisptr(__pyx_v_layer, __pyx_v_cLayer); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 46; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "NeuralNet.pyx":43 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":48 * layer.set_thisptr( cLayer ) # note: once neuralnet out of scope, these * # are no longer valid * return layer # <<<<<<<<<<<<<< @@ -7694,7 +8085,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_20getLayer(struct __pyx_obj_8PyDe __pyx_r = ((PyObject *)__pyx_v_layer); goto __pyx_L0; - /* "NeuralNet.pyx":36 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":41 * def addLayer( self, LayerMaker2 layerMaker ): * self.thisptr.addLayer( layerMaker.baseptr ) * def getLayer( self, int index ): # <<<<<<<<<<<<<< @@ -7715,7 +8106,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_20getLayer(struct __pyx_obj_8PyDe return __pyx_r; } -/* "NeuralNet.pyx":44 +/* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":49 * # are no longer valid * return layer * def getNumLayers( self ): # <<<<<<<<<<<<<< @@ -7745,7 +8136,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_22getNumLayers(struct __pyx_obj_8 int __pyx_clineno = 0; __Pyx_RefNannySetupContext("getNumLayers", 0); - /* "NeuralNet.pyx":45 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":50 * return layer * def getNumLayers( self ): * return self.thisptr.getNumLayers() # <<<<<<<<<<<<<< @@ -7753,13 +8144,13 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_22getNumLayers(struct __pyx_obj_8 * cdef const float *output = self.thisptr.getOutput() */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->thisptr->getNumLayers()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 45; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->thisptr->getNumLayers()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; goto __pyx_L0; - /* "NeuralNet.pyx":44 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":49 * # are no longer valid * return layer * def getNumLayers( self ): # <<<<<<<<<<<<<< @@ -7778,12 +8169,12 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_22getNumLayers(struct __pyx_obj_8 return __pyx_r; } -/* "NeuralNet.pyx":46 +/* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":51 * def getNumLayers( self ): * return self.thisptr.getNumLayers() * def getOutput(self): # <<<<<<<<<<<<<< * cdef const float *output = self.thisptr.getOutput() - * cdef int outputSize = self.thisptr.getOutputSize() + * cdef int outputNumElements = self.thisptr.getOutputNumElements() */ /* Python wrapper */ @@ -7801,7 +8192,7 @@ static PyObject 
*__pyx_pw_8PyDeepCL_9NeuralNet_25getOutput(PyObject *__pyx_v_sel static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_24getOutput(struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_self) { float const *__pyx_v_output; - int __pyx_v_outputSize; + int __pyx_v_outputNumElements; arrayobject *__pyx_v_outputArray = 0; int __pyx_v_i; PyObject *__pyx_r = NULL; @@ -7819,37 +8210,37 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_24getOutput(struct __pyx_obj_8PyD int __pyx_clineno = 0; __Pyx_RefNannySetupContext("getOutput", 0); - /* "NeuralNet.pyx":47 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":52 * return self.thisptr.getNumLayers() * def getOutput(self): * cdef const float *output = self.thisptr.getOutput() # <<<<<<<<<<<<<< - * cdef int outputSize = self.thisptr.getOutputSize() - * cdef c_array.array outputArray = array('f', [0] * outputSize ) + * cdef int outputNumElements = self.thisptr.getOutputNumElements() + * cdef c_array.array outputArray = array('f', [0] * outputNumElements ) */ __pyx_v_output = __pyx_v_self->thisptr->getOutput(); - /* "NeuralNet.pyx":48 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":53 * def getOutput(self): * cdef const float *output = self.thisptr.getOutput() - * cdef int outputSize = self.thisptr.getOutputSize() # <<<<<<<<<<<<<< - * cdef c_array.array outputArray = array('f', [0] * outputSize ) - * for i in range(outputSize): + * cdef int outputNumElements = self.thisptr.getOutputNumElements() # <<<<<<<<<<<<<< + * cdef c_array.array outputArray = array('f', [0] * outputNumElements ) + * for i in range(outputNumElements): */ - __pyx_v_outputSize = __pyx_v_self->thisptr->getOutputSize(); + __pyx_v_outputNumElements = __pyx_v_self->thisptr->getOutputNumElements(); - /* "NeuralNet.pyx":49 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":54 * cdef const float *output = self.thisptr.getOutput() - * cdef int outputSize = self.thisptr.getOutputSize() - * cdef c_array.array outputArray = array('f', [0] * outputSize ) # <<<<<<<<<<<<<< - * for i in range(outputSize): + * cdef int outputNumElements = self.thisptr.getOutputNumElements() + * cdef c_array.array outputArray = array('f', [0] * outputNumElements ) # <<<<<<<<<<<<<< + * for i in range(outputNumElements): * outputArray[i] = output[i] */ - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_array); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 49; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_array); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 54; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = PyList_New(1 * ((__pyx_v_outputSize<0) ? 0:__pyx_v_outputSize)); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 49; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = PyList_New(1 * ((__pyx_v_outputNumElements<0) ? 
0:__pyx_v_outputNumElements)); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 54; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); { Py_ssize_t __pyx_temp; - for (__pyx_temp=0; __pyx_temp < __pyx_v_outputSize; __pyx_temp++) { + for (__pyx_temp=0; __pyx_temp < __pyx_v_outputNumElements; __pyx_temp++) { __Pyx_INCREF(__pyx_int_0); PyList_SET_ITEM(__pyx_t_3, __pyx_temp, __pyx_int_0); __Pyx_GIVEREF(__pyx_int_0); @@ -7867,7 +8258,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_24getOutput(struct __pyx_obj_8PyD __pyx_t_5 = 1; } } - __pyx_t_6 = PyTuple_New(2+__pyx_t_5); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 49; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = PyTuple_New(2+__pyx_t_5); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 54; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); if (__pyx_t_4) { PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_4); __Pyx_GIVEREF(__pyx_t_4); __pyx_t_4 = NULL; @@ -7878,40 +8269,40 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_24getOutput(struct __pyx_obj_8PyD PyTuple_SET_ITEM(__pyx_t_6, 1+__pyx_t_5, __pyx_t_3); __Pyx_GIVEREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 49; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 54; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_7cpython_5array_array))))) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 49; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_7cpython_5array_array))))) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 54; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_outputArray = ((arrayobject *)__pyx_t_1); __pyx_t_1 = 0; - /* "NeuralNet.pyx":50 - * cdef int outputSize = self.thisptr.getOutputSize() - * cdef c_array.array outputArray = array('f', [0] * outputSize ) - * for i in range(outputSize): # <<<<<<<<<<<<<< + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":55 + * cdef int outputNumElements = self.thisptr.getOutputNumElements() + * cdef c_array.array outputArray = array('f', [0] * outputNumElements ) + * for i in range(outputNumElements): # <<<<<<<<<<<<<< * outputArray[i] = output[i] * return outputArray */ - __pyx_t_7 = __pyx_v_outputSize; + __pyx_t_7 = __pyx_v_outputNumElements; for (__pyx_t_8 = 0; __pyx_t_8 < __pyx_t_7; __pyx_t_8+=1) { __pyx_v_i = __pyx_t_8; - /* "NeuralNet.pyx":51 - * cdef c_array.array outputArray = array('f', [0] * outputSize ) - * for i in range(outputSize): + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":56 + * cdef c_array.array outputArray = array('f', [0] * outputNumElements ) + * for i in range(outputNumElements): * outputArray[i] = output[i] # <<<<<<<<<<<<<< * return outputArray * def setTraining(self, training): # 1 is, we are training net, 0 is we are not */ - __pyx_t_1 = PyFloat_FromDouble((__pyx_v_output[__pyx_v_i])); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 51; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = 
PyFloat_FromDouble((__pyx_v_output[__pyx_v_i])); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); - if (unlikely(__Pyx_SetItemInt(((PyObject *)__pyx_v_outputArray), __pyx_v_i, __pyx_t_1, int, 1, __Pyx_PyInt_From_int, 0, 1, 1) < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 51; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(__Pyx_SetItemInt(((PyObject *)__pyx_v_outputArray), __pyx_v_i, __pyx_t_1, int, 1, __Pyx_PyInt_From_int, 0, 1, 1) < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; } - /* "NeuralNet.pyx":52 - * for i in range(outputSize): + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":57 + * for i in range(outputNumElements): * outputArray[i] = output[i] * return outputArray # <<<<<<<<<<<<<< * def setTraining(self, training): # 1 is, we are training net, 0 is we are not @@ -7922,12 +8313,12 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_24getOutput(struct __pyx_obj_8PyD __pyx_r = ((PyObject *)__pyx_v_outputArray); goto __pyx_L0; - /* "NeuralNet.pyx":46 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":51 * def getNumLayers( self ): * return self.thisptr.getNumLayers() * def getOutput(self): # <<<<<<<<<<<<<< * cdef const float *output = self.thisptr.getOutput() - * cdef int outputSize = self.thisptr.getOutputSize() + * cdef int outputNumElements = self.thisptr.getOutputNumElements() */ /* function exit code */ @@ -7946,7 +8337,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_24getOutput(struct __pyx_obj_8PyD return __pyx_r; } -/* "NeuralNet.pyx":53 +/* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":58 * outputArray[i] = output[i] * return outputArray * def setTraining(self, training): # 1 is, we are training net, 0 is we are not # <<<<<<<<<<<<<< @@ -7976,16 +8367,15 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_26setTraining(struct __pyx_obj_8P int __pyx_clineno = 0; __Pyx_RefNannySetupContext("setTraining", 0); - /* "NeuralNet.pyx":56 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":61 * # used for example by randomtranslations layer (for now, * # used only by randomtranslations layer) * self.thisptr.setTraining( training ) # <<<<<<<<<<<<<< - * */ - __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_training); if (unlikely((__pyx_t_1 == (bool)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_training); if (unlikely((__pyx_t_1 == (bool)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 61; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_self->thisptr->setTraining(__pyx_t_1); - /* "NeuralNet.pyx":53 + /* "../../../../../../home/user/git/DeepCL/python/NeuralNet.pyx":58 * outputArray[i] = output[i] * return outputArray * def setTraining(self, training): # 1 is, we are training net, 0 is we are not # <<<<<<<<<<<<<< @@ -8005,7 +8395,7 @@ static PyObject *__pyx_pf_8PyDeepCL_9NeuralNet_26setTraining(struct __pyx_obj_8P return __pyx_r; } -/* "Layer.pyx":4 +/* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":4 * cdef cDeepCL.Layer *thisptr * * def __cinit__(self): # <<<<<<<<<<<<<< @@ -8040,7 +8430,7 @@ static int __pyx_pf_8PyDeepCL_5Layer___cinit__(CYTHON_UNUSED struct __pyx_obj_8P return __pyx_r; } -/* "Layer.pyx":6 +/* 
"../../../../../../home/user/git/DeepCL/python/Layer.pyx":6 * def __cinit__(self): * pass * cdef set_thisptr(self, cDeepCL.Layer *thisptr): # <<<<<<<<<<<<<< @@ -8053,7 +8443,7 @@ static PyObject *__pyx_f_8PyDeepCL_5Layer_set_thisptr(struct __pyx_obj_8PyDeepCL __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("set_thisptr", 0); - /* "Layer.pyx":7 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":7 * pass * cdef set_thisptr(self, cDeepCL.Layer *thisptr): * self.thisptr = thisptr # <<<<<<<<<<<<<< @@ -8062,7 +8452,7 @@ static PyObject *__pyx_f_8PyDeepCL_5Layer_set_thisptr(struct __pyx_obj_8PyDeepCL */ __pyx_v_self->thisptr = __pyx_v_thisptr; - /* "Layer.pyx":6 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":6 * def __cinit__(self): * pass * cdef set_thisptr(self, cDeepCL.Layer *thisptr): # <<<<<<<<<<<<<< @@ -8077,7 +8467,7 @@ static PyObject *__pyx_f_8PyDeepCL_5Layer_set_thisptr(struct __pyx_obj_8PyDeepCL return __pyx_r; } -/* "Layer.pyx":8 +/* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":8 * cdef set_thisptr(self, cDeepCL.Layer *thisptr): * self.thisptr = thisptr * def forward(self): # <<<<<<<<<<<<<< @@ -8103,7 +8493,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_2forward(struct __pyx_obj_8PyDeepCL_L __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("forward", 0); - /* "Layer.pyx":9 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":9 * self.thisptr = thisptr * def forward(self): * self.thisptr.forward() # <<<<<<<<<<<<<< @@ -8112,7 +8502,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_2forward(struct __pyx_obj_8PyDeepCL_L */ __pyx_v_self->thisptr->forward(); - /* "Layer.pyx":8 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":8 * cdef set_thisptr(self, cDeepCL.Layer *thisptr): * self.thisptr = thisptr * def forward(self): # <<<<<<<<<<<<<< @@ -8127,7 +8517,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_2forward(struct __pyx_obj_8PyDeepCL_L return __pyx_r; } -/* "Layer.pyx":10 +/* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":10 * def forward(self): * self.thisptr.forward() * def backward(self): # <<<<<<<<<<<<<< @@ -8153,7 +8543,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_4backward(struct __pyx_obj_8PyDeepCL_ __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("backward", 0); - /* "Layer.pyx":11 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":11 * self.thisptr.forward() * def backward(self): * self.thisptr.backward() # <<<<<<<<<<<<<< @@ -8162,7 +8552,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_4backward(struct __pyx_obj_8PyDeepCL_ */ __pyx_v_self->thisptr->backward(); - /* "Layer.pyx":10 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":10 * def forward(self): * self.thisptr.forward() * def backward(self): # <<<<<<<<<<<<<< @@ -8177,7 +8567,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_4backward(struct __pyx_obj_8PyDeepCL_ return __pyx_r; } -/* "Layer.pyx":12 +/* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":12 * def backward(self): * self.thisptr.backward() * def needsBackProp(self): # <<<<<<<<<<<<<< @@ -8207,7 +8597,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_6needsBackProp(struct __pyx_obj_8PyDe int __pyx_clineno = 0; __Pyx_RefNannySetupContext("needsBackProp", 0); - /* "Layer.pyx":13 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":13 * self.thisptr.backward() * def needsBackProp(self): * return self.thisptr.needsBackProp() # <<<<<<<<<<<<<< @@ -8215,13 +8605,13 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_6needsBackProp(struct 
__pyx_obj_8PyDe * # return self.thisptr.getBiased() */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyBool_FromLong(__pyx_v_self->thisptr->needsBackProp()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyBool_FromLong(__pyx_v_self->thisptr->needsBackProp()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; goto __pyx_L0; - /* "Layer.pyx":12 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":12 * def backward(self): * self.thisptr.backward() * def needsBackProp(self): # <<<<<<<<<<<<<< @@ -8240,7 +8630,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_6needsBackProp(struct __pyx_obj_8PyDe return __pyx_r; } -/* "Layer.pyx":16 +/* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":16 * # def getBiased( self ): * # return self.thisptr.getBiased() * def getOutputCubeSize(self): # <<<<<<<<<<<<<< @@ -8270,7 +8660,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_8getOutputCubeSize(struct __pyx_obj_8 int __pyx_clineno = 0; __Pyx_RefNannySetupContext("getOutputCubeSize", 0); - /* "Layer.pyx":17 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":17 * # return self.thisptr.getBiased() * def getOutputCubeSize(self): * return self.thisptr.getOutputCubeSize() # <<<<<<<<<<<<<< @@ -8278,13 +8668,13 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_8getOutputCubeSize(struct __pyx_obj_8 * return self.thisptr.getOutputPlanes() */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->thisptr->getOutputCubeSize()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->thisptr->getOutputCubeSize()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; goto __pyx_L0; - /* "Layer.pyx":16 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":16 * # def getBiased( self ): * # return self.thisptr.getBiased() * def getOutputCubeSize(self): # <<<<<<<<<<<<<< @@ -8303,12 +8693,12 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_8getOutputCubeSize(struct __pyx_obj_8 return __pyx_r; } -/* "Layer.pyx":18 +/* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":18 * def getOutputCubeSize(self): * return self.thisptr.getOutputCubeSize() * def getOutputPlanes(self): # <<<<<<<<<<<<<< * return self.thisptr.getOutputPlanes() - * def getOutputImageSize(self): + * def getOutputSize(self): */ /* Python wrapper */ @@ -8333,26 +8723,26 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_10getOutputPlanes(struct __pyx_obj_8P int __pyx_clineno = 0; __Pyx_RefNannySetupContext("getOutputPlanes", 0); - /* "Layer.pyx":19 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":19 * return self.thisptr.getOutputCubeSize() * def getOutputPlanes(self): * return self.thisptr.getOutputPlanes() # <<<<<<<<<<<<<< - * def getOutputImageSize(self): - * return self.thisptr.getOutputImageSize() + * def getOutputSize(self): + * return self.thisptr.getOutputSize() */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->thisptr->getOutputPlanes()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = 
__Pyx_PyInt_From_int(__pyx_v_self->thisptr->getOutputPlanes()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; goto __pyx_L0; - /* "Layer.pyx":18 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":18 * def getOutputCubeSize(self): * return self.thisptr.getOutputCubeSize() * def getOutputPlanes(self): # <<<<<<<<<<<<<< * return self.thisptr.getOutputPlanes() - * def getOutputImageSize(self): + * def getOutputSize(self): */ /* function exit code */ @@ -8366,62 +8756,62 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_10getOutputPlanes(struct __pyx_obj_8P return __pyx_r; } -/* "Layer.pyx":20 +/* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":20 * def getOutputPlanes(self): * return self.thisptr.getOutputPlanes() - * def getOutputImageSize(self): # <<<<<<<<<<<<<< - * return self.thisptr.getOutputImageSize() + * def getOutputSize(self): # <<<<<<<<<<<<<< + * return self.thisptr.getOutputSize() * def getOutput(self): */ /* Python wrapper */ -static PyObject *__pyx_pw_8PyDeepCL_5Layer_13getOutputImageSize(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ -static PyObject *__pyx_pw_8PyDeepCL_5Layer_13getOutputImageSize(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { +static PyObject *__pyx_pw_8PyDeepCL_5Layer_13getOutputSize(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_8PyDeepCL_5Layer_13getOutputSize(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("getOutputImageSize (wrapper)", 0); - __pyx_r = __pyx_pf_8PyDeepCL_5Layer_12getOutputImageSize(((struct __pyx_obj_8PyDeepCL_Layer *)__pyx_v_self)); + __Pyx_RefNannySetupContext("getOutputSize (wrapper)", 0); + __pyx_r = __pyx_pf_8PyDeepCL_5Layer_12getOutputSize(((struct __pyx_obj_8PyDeepCL_Layer *)__pyx_v_self)); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } -static PyObject *__pyx_pf_8PyDeepCL_5Layer_12getOutputImageSize(struct __pyx_obj_8PyDeepCL_Layer *__pyx_v_self) { +static PyObject *__pyx_pf_8PyDeepCL_5Layer_12getOutputSize(struct __pyx_obj_8PyDeepCL_Layer *__pyx_v_self) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations PyObject *__pyx_t_1 = NULL; int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; - __Pyx_RefNannySetupContext("getOutputImageSize", 0); + __Pyx_RefNannySetupContext("getOutputSize", 0); - /* "Layer.pyx":21 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":21 * return self.thisptr.getOutputPlanes() - * def getOutputImageSize(self): - * return self.thisptr.getOutputImageSize() # <<<<<<<<<<<<<< + * def getOutputSize(self): + * return self.thisptr.getOutputSize() # <<<<<<<<<<<<<< * def getOutput(self): * # the underlying c++ method returns a pointer */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->thisptr->getOutputImageSize()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_self->thisptr->getOutputSize()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; goto __pyx_L0; - /* "Layer.pyx":20 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":20 * def 
getOutputPlanes(self): * return self.thisptr.getOutputPlanes() - * def getOutputImageSize(self): # <<<<<<<<<<<<<< - * return self.thisptr.getOutputImageSize() + * def getOutputSize(self): # <<<<<<<<<<<<<< + * return self.thisptr.getOutputSize() * def getOutput(self): */ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); - __Pyx_AddTraceback("PyDeepCL.Layer.getOutputImageSize", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_AddTraceback("PyDeepCL.Layer.getOutputSize", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; __Pyx_XGIVEREF(__pyx_r); @@ -8429,9 +8819,9 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_12getOutputImageSize(struct __pyx_obj return __pyx_r; } -/* "Layer.pyx":22 - * def getOutputImageSize(self): - * return self.thisptr.getOutputImageSize() +/* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":22 + * def getOutputSize(self): + * return self.thisptr.getOutputSize() * def getOutput(self): # <<<<<<<<<<<<<< * # the underlying c++ method returns a pointer * # to a block of memory that we dont own @@ -8452,7 +8842,7 @@ static PyObject *__pyx_pw_8PyDeepCL_5Layer_15getOutput(PyObject *__pyx_v_self, C static PyObject *__pyx_pf_8PyDeepCL_5Layer_14getOutput(struct __pyx_obj_8PyDeepCL_Layer *__pyx_v_self) { float *__pyx_v_output; - int __pyx_v_outputSize; + int __pyx_v_outputNumElements; arrayobject *__pyx_v_outputArray = 0; int __pyx_v_i; PyObject *__pyx_r = NULL; @@ -8470,37 +8860,37 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_14getOutput(struct __pyx_obj_8PyDeepC int __pyx_clineno = 0; __Pyx_RefNannySetupContext("getOutput", 0); - /* "Layer.pyx":26 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":26 * # to a block of memory that we dont own * # we should probably copy it I suppose * cdef float *output = self.thisptr.getOutput() # <<<<<<<<<<<<<< - * cdef int outputSize = self.thisptr.getOutputSize() - * cdef c_array.array outputArray = array('f', [0] * outputSize ) + * cdef int outputNumElements = self.thisptr.getOutputNumElements() + * cdef c_array.array outputArray = array('f', [0] * outputNumElements ) */ __pyx_v_output = __pyx_v_self->thisptr->getOutput(); - /* "Layer.pyx":27 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":27 * # we should probably copy it I suppose * cdef float *output = self.thisptr.getOutput() - * cdef int outputSize = self.thisptr.getOutputSize() # <<<<<<<<<<<<<< - * cdef c_array.array outputArray = array('f', [0] * outputSize ) - * for i in range(outputSize): + * cdef int outputNumElements = self.thisptr.getOutputNumElements() # <<<<<<<<<<<<<< + * cdef c_array.array outputArray = array('f', [0] * outputNumElements ) + * for i in range(outputNumElements): */ - __pyx_v_outputSize = __pyx_v_self->thisptr->getOutputSize(); + __pyx_v_outputNumElements = __pyx_v_self->thisptr->getOutputNumElements(); - /* "Layer.pyx":28 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":28 * cdef float *output = self.thisptr.getOutput() - * cdef int outputSize = self.thisptr.getOutputSize() - * cdef c_array.array outputArray = array('f', [0] * outputSize ) # <<<<<<<<<<<<<< - * for i in range(outputSize): + * cdef int outputNumElements = self.thisptr.getOutputNumElements() + * cdef c_array.array outputArray = array('f', [0] * outputNumElements ) # <<<<<<<<<<<<<< + * for i in range(outputNumElements): * outputArray[i] = output[i] */ - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_array); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 28; __pyx_clineno = 
__LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_array); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = PyList_New(1 * ((__pyx_v_outputSize<0) ? 0:__pyx_v_outputSize)); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = PyList_New(1 * ((__pyx_v_outputNumElements<0) ? 0:__pyx_v_outputNumElements)); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); { Py_ssize_t __pyx_temp; - for (__pyx_temp=0; __pyx_temp < __pyx_v_outputSize; __pyx_temp++) { + for (__pyx_temp=0; __pyx_temp < __pyx_v_outputNumElements; __pyx_temp++) { __Pyx_INCREF(__pyx_int_0); PyList_SET_ITEM(__pyx_t_3, __pyx_temp, __pyx_int_0); __Pyx_GIVEREF(__pyx_int_0); @@ -8518,7 +8908,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_14getOutput(struct __pyx_obj_8PyDeepC __pyx_t_5 = 1; } } - __pyx_t_6 = PyTuple_New(2+__pyx_t_5); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = PyTuple_New(2+__pyx_t_5); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); if (__pyx_t_4) { PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_4); __Pyx_GIVEREF(__pyx_t_4); __pyx_t_4 = NULL; @@ -8529,39 +8919,39 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_14getOutput(struct __pyx_obj_8PyDeepC PyTuple_SET_ITEM(__pyx_t_6, 1+__pyx_t_5, __pyx_t_3); __Pyx_GIVEREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_7cpython_5array_array))))) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_7cpython_5array_array))))) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_outputArray = ((arrayobject *)__pyx_t_1); __pyx_t_1 = 0; - /* "Layer.pyx":29 - * cdef int outputSize = self.thisptr.getOutputSize() - * cdef c_array.array outputArray = array('f', [0] * outputSize ) - * for i in range(outputSize): # <<<<<<<<<<<<<< + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":29 + * cdef int outputNumElements = self.thisptr.getOutputNumElements() + * cdef c_array.array outputArray = array('f', [0] * outputNumElements ) + * for i in range(outputNumElements): # <<<<<<<<<<<<<< * outputArray[i] = output[i] * # cdef float[:] outputMv = output */ - __pyx_t_7 = __pyx_v_outputSize; + __pyx_t_7 = __pyx_v_outputNumElements; for (__pyx_t_8 = 0; __pyx_t_8 < __pyx_t_7; __pyx_t_8+=1) { __pyx_v_i = __pyx_t_8; - /* "Layer.pyx":30 - * cdef c_array.array outputArray = array('f', [0] * outputSize ) - * for i in range(outputSize): + /* 
"../../../../../../home/user/git/DeepCL/python/Layer.pyx":30 + * cdef c_array.array outputArray = array('f', [0] * outputNumElements ) + * for i in range(outputNumElements): * outputArray[i] = output[i] # <<<<<<<<<<<<<< * # cdef float[:] outputMv = output * # cdef float[:] outputArrayMv = outputArray */ - __pyx_t_1 = PyFloat_FromDouble((__pyx_v_output[__pyx_v_i])); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = PyFloat_FromDouble((__pyx_v_output[__pyx_v_i])); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); - if (unlikely(__Pyx_SetItemInt(((PyObject *)__pyx_v_outputArray), __pyx_v_i, __pyx_t_1, int, 1, __Pyx_PyInt_From_int, 0, 1, 1) < 0)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(__Pyx_SetItemInt(((PyObject *)__pyx_v_outputArray), __pyx_v_i, __pyx_t_1, int, 1, __Pyx_PyInt_From_int, 0, 1, 1) < 0)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; } - /* "Layer.pyx":35 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":35 * # outputArrayMv[:] = outputMv * # outputArrayMv = self.thisptr.getOutput() * return outputArray # <<<<<<<<<<<<<< @@ -8573,9 +8963,9 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_14getOutput(struct __pyx_obj_8PyDeepC __pyx_r = ((PyObject *)__pyx_v_outputArray); goto __pyx_L0; - /* "Layer.pyx":22 - * def getOutputImageSize(self): - * return self.thisptr.getOutputImageSize() + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":22 + * def getOutputSize(self): + * return self.thisptr.getOutputSize() * def getOutput(self): # <<<<<<<<<<<<<< * # the underlying c++ method returns a pointer * # to a block of memory that we dont own @@ -8597,7 +8987,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_14getOutput(struct __pyx_obj_8PyDeepC return __pyx_r; } -/* "Layer.pyx":36 +/* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":36 * # outputArrayMv = self.thisptr.getOutput() * return outputArray * def getWeights(self): # <<<<<<<<<<<<<< @@ -8637,7 +9027,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_16getWeights(struct __pyx_obj_8PyDeep int __pyx_clineno = 0; __Pyx_RefNannySetupContext("getWeights", 0); - /* "Layer.pyx":37 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":37 * return outputArray * def getWeights(self): * cdef int weightsSize = self.thisptr.getPersistSize() # <<<<<<<<<<<<<< @@ -8646,16 +9036,16 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_16getWeights(struct __pyx_obj_8PyDeep */ __pyx_v_weightsSize = __pyx_v_self->thisptr->getPersistSize(); - /* "Layer.pyx":38 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":38 * def getWeights(self): * cdef int weightsSize = self.thisptr.getPersistSize() * cdef c_array.array weightsArray = array('f', [0] * weightsSize ) # <<<<<<<<<<<<<< * cdef float[:] weightsArray_view = weightsArray * self.thisptr.persistToArray( &weightsArray_view[0] ) */ - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_array); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_array); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = 
PyList_New(1 * ((__pyx_v_weightsSize<0) ? 0:__pyx_v_weightsSize)); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = PyList_New(1 * ((__pyx_v_weightsSize<0) ? 0:__pyx_v_weightsSize)); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); { Py_ssize_t __pyx_temp; for (__pyx_temp=0; __pyx_temp < __pyx_v_weightsSize; __pyx_temp++) { @@ -8676,7 +9066,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_16getWeights(struct __pyx_obj_8PyDeep __pyx_t_5 = 1; } } - __pyx_t_6 = PyTuple_New(2+__pyx_t_5); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = PyTuple_New(2+__pyx_t_5); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); if (__pyx_t_4) { PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_4); __Pyx_GIVEREF(__pyx_t_4); __pyx_t_4 = NULL; @@ -8687,15 +9077,15 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_16getWeights(struct __pyx_obj_8PyDeep PyTuple_SET_ITEM(__pyx_t_6, 1+__pyx_t_5, __pyx_t_3); __Pyx_GIVEREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_7cpython_5array_array))))) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_7cpython_5array_array))))) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_weightsArray = ((arrayobject *)__pyx_t_1); __pyx_t_1 = 0; - /* "Layer.pyx":39 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":39 * cdef int weightsSize = self.thisptr.getPersistSize() * cdef c_array.array weightsArray = array('f', [0] * weightsSize ) * cdef float[:] weightsArray_view = weightsArray # <<<<<<<<<<<<<< @@ -8703,12 +9093,12 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_16getWeights(struct __pyx_obj_8PyDeep * return weightsArray */ __pyx_t_7 = __Pyx_PyObject_to_MemoryviewSlice_ds_float(((PyObject *)__pyx_v_weightsArray)); - if (unlikely(!__pyx_t_7.memview)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__pyx_t_7.memview)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_weightsArray_view = __pyx_t_7; __pyx_t_7.memview = NULL; __pyx_t_7.data = NULL; - /* "Layer.pyx":40 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":40 * cdef c_array.array weightsArray = array('f', [0] * weightsSize ) * cdef float[:] weightsArray_view = weightsArray * self.thisptr.persistToArray( &weightsArray_view[0] ) # <<<<<<<<<<<<<< @@ -8723,11 +9113,11 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_16getWeights(struct __pyx_obj_8PyDeep } else if (unlikely(__pyx_t_5 >= 
__pyx_v_weightsArray_view.shape[0])) __pyx_t_8 = 0; if (unlikely(__pyx_t_8 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_8); - {__pyx_filename = __pyx_f[10]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[9]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_v_self->thisptr->persistToArray((&(*((float *) ( /* dim=0 */ (__pyx_v_weightsArray_view.data + __pyx_t_5 * __pyx_v_weightsArray_view.strides[0]) ))))); - /* "Layer.pyx":41 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":41 * cdef float[:] weightsArray_view = weightsArray * self.thisptr.persistToArray( &weightsArray_view[0] ) * return weightsArray # <<<<<<<<<<<<<< @@ -8739,7 +9129,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_16getWeights(struct __pyx_obj_8PyDeep __pyx_r = ((PyObject *)__pyx_v_weightsArray); goto __pyx_L0; - /* "Layer.pyx":36 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":36 * # outputArrayMv = self.thisptr.getOutput() * return outputArray * def getWeights(self): # <<<<<<<<<<<<<< @@ -8765,7 +9155,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_16getWeights(struct __pyx_obj_8PyDeep return __pyx_r; } -/* "Layer.pyx":42 +/* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":42 * self.thisptr.persistToArray( &weightsArray_view[0] ) * return weightsArray * def setWeights(self, float[:] weights): # <<<<<<<<<<<<<< @@ -8784,7 +9174,7 @@ static PyObject *__pyx_pw_8PyDeepCL_5Layer_19setWeights(PyObject *__pyx_v_self, __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setWeights (wrapper)", 0); assert(__pyx_arg_weights); { - __pyx_v_weights = __Pyx_PyObject_to_MemoryviewSlice_ds_float(__pyx_arg_weights); if (unlikely(!__pyx_v_weights.memview)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_weights = __Pyx_PyObject_to_MemoryviewSlice_ds_float(__pyx_arg_weights); if (unlikely(!__pyx_v_weights.memview)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -8811,7 +9201,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_18setWeights(struct __pyx_obj_8PyDeep int __pyx_clineno = 0; __Pyx_RefNannySetupContext("setWeights", 0); - /* "Layer.pyx":43 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":43 * return weightsArray * def setWeights(self, float[:] weights): * cdef int weightsSize = self.thisptr.getPersistSize() # <<<<<<<<<<<<<< @@ -8820,7 +9210,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_18setWeights(struct __pyx_obj_8PyDeep */ __pyx_v_weightsSize = __pyx_v_self->thisptr->getPersistSize(); - /* "Layer.pyx":44 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":44 * def setWeights(self, float[:] weights): * cdef int weightsSize = self.thisptr.getPersistSize() * assert weightsSize == len(weights) # <<<<<<<<<<<<<< @@ -8829,18 +9219,18 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_18setWeights(struct __pyx_obj_8PyDeep */ #ifndef CYTHON_WITHOUT_ASSERTIONS if (unlikely(!Py_OptimizeFlag)) { - __pyx_t_1 = __pyx_memoryview_fromslice(__pyx_v_weights, 1, (PyObject *(*)(char *)) __pyx_memview_get_float, (int (*)(char *, PyObject *)) __pyx_memview_set_float, 0);; if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __pyx_memoryview_fromslice(__pyx_v_weights, 1, (PyObject *(*)(char *)) __pyx_memview_get_float, (int (*)(char *, PyObject *)) 
__pyx_memview_set_float, 0);; if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = PyObject_Length(__pyx_t_1); if (unlikely(__pyx_t_2 == -1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = PyObject_Length(__pyx_t_1); if (unlikely(__pyx_t_2 == -1)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; if (unlikely(!((__pyx_v_weightsSize == __pyx_t_2) != 0))) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[10]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[9]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "Layer.pyx":46 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":46 * assert weightsSize == len(weights) * # cdef c_array.array weightsArray = array('f', [0] * weightsSize ) * self.thisptr.unpersistFromArray( &weights[0] ) # <<<<<<<<<<<<<< @@ -8855,11 +9245,11 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_18setWeights(struct __pyx_obj_8PyDeep } else if (unlikely(__pyx_t_2 >= __pyx_v_weights.shape[0])) __pyx_t_3 = 0; if (unlikely(__pyx_t_3 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_3); - {__pyx_filename = __pyx_f[10]; __pyx_lineno = 46; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[9]; __pyx_lineno = 46; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_v_self->thisptr->unpersistFromArray((&(*((float *) ( /* dim=0 */ (__pyx_v_weights.data + __pyx_t_2 * __pyx_v_weights.strides[0]) ))))); - /* "Layer.pyx":42 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":42 * self.thisptr.persistToArray( &weightsArray_view[0] ) * return weightsArray * def setWeights(self, float[:] weights): # <<<<<<<<<<<<<< @@ -8881,7 +9271,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_18setWeights(struct __pyx_obj_8PyDeep return __pyx_r; } -/* "Layer.pyx":51 +/* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":51 * # void persistToArray(float *array) * # void unpersistFromArray(const float *array) * def setWeightsList(self, weightsList): # <<<<<<<<<<<<<< @@ -8915,30 +9305,30 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_20setWeightsList(struct __pyx_obj_8Py int __pyx_clineno = 0; __Pyx_RefNannySetupContext("setWeightsList", 0); - /* "Layer.pyx":52 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":52 * # void unpersistFromArray(const float *array) * def setWeightsList(self, weightsList): * cdef c_array.array weightsArray = array('f') # <<<<<<<<<<<<<< * weightsArray.fromlist( weightsList ) * self.setWeights( weightsArray ) */ - __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_array); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_array); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} 
__Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - if (!(likely(((__pyx_t_2) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_2, __pyx_ptype_7cpython_5array_array))))) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (!(likely(((__pyx_t_2) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_2, __pyx_ptype_7cpython_5array_array))))) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_weightsArray = ((arrayobject *)__pyx_t_2); __pyx_t_2 = 0; - /* "Layer.pyx":53 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":53 * def setWeightsList(self, weightsList): * cdef c_array.array weightsArray = array('f') * weightsArray.fromlist( weightsList ) # <<<<<<<<<<<<<< * self.setWeights( weightsArray ) * def asString(self): */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_weightsArray), __pyx_n_s_fromlist); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_weightsArray), __pyx_n_s_fromlist); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __pyx_t_3 = NULL; if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_1))) { @@ -8951,30 +9341,30 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_20setWeightsList(struct __pyx_obj_8Py } } if (!__pyx_t_3) { - __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_v_weightsList); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_v_weightsList); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); } else { - __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __Pyx_GIVEREF(__pyx_t_3); __pyx_t_3 = NULL; __Pyx_INCREF(__pyx_v_weightsList); PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_weightsList); __Pyx_GIVEREF(__pyx_v_weightsList); - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_4, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_4, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "Layer.pyx":54 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":54 * cdef c_array.array weightsArray = array('f') * weightsArray.fromlist( weightsList ) * self.setWeights( weightsArray ) # <<<<<<<<<<<<<< * def asString(self): - * return self.thisptr.asString() + * cdef const char *res_charstar = self.thisptr.asNewCharStar() */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_setWeights); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 54; __pyx_clineno = 
__LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_setWeights); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 54; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __pyx_t_4 = NULL; if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_1))) { @@ -8987,23 +9377,23 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_20setWeightsList(struct __pyx_obj_8Py } } if (!__pyx_t_4) { - __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_1, ((PyObject *)__pyx_v_weightsArray)); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 54; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_1, ((PyObject *)__pyx_v_weightsArray)); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 54; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); } else { - __pyx_t_3 = PyTuple_New(1+1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 54; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = PyTuple_New(1+1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 54; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_4); __Pyx_GIVEREF(__pyx_t_4); __pyx_t_4 = NULL; __Pyx_INCREF(((PyObject *)__pyx_v_weightsArray)); PyTuple_SET_ITEM(__pyx_t_3, 0+1, ((PyObject *)__pyx_v_weightsArray)); __Pyx_GIVEREF(((PyObject *)__pyx_v_weightsArray)); - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 54; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 54; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "Layer.pyx":51 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":51 * # void persistToArray(float *array) * # void unpersistFromArray(const float *array) * def setWeightsList(self, weightsList): # <<<<<<<<<<<<<< @@ -9028,12 +9418,12 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_20setWeightsList(struct __pyx_obj_8Py return __pyx_r; } -/* "Layer.pyx":55 +/* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":55 * weightsArray.fromlist( weightsList ) * self.setWeights( weightsArray ) * def asString(self): # <<<<<<<<<<<<<< - * return self.thisptr.asString() - * def getClassName(self): + * cdef const char *res_charstar = self.thisptr.asNewCharStar() + * cdef str res = str(res_charstar.decode('UTF-8')) */ /* Python wrapper */ @@ -9050,52 +9440,94 @@ static PyObject *__pyx_pw_8PyDeepCL_5Layer_23asString(PyObject *__pyx_v_self, CY } static PyObject *__pyx_pf_8PyDeepCL_5Layer_22asString(struct __pyx_obj_8PyDeepCL_Layer *__pyx_v_self) { + char const *__pyx_v_res_charstar; + PyObject *__pyx_v_res = 0; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; __Pyx_RefNannySetupContext("asString", 0); - /* "Layer.pyx":56 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":56 * self.setWeights( weightsArray ) * def asString(self): - * return self.thisptr.asString() # <<<<<<<<<<<<<< - * def getClassName(self): - * 
return self.thisptr.getClassName() + * cdef const char *res_charstar = self.thisptr.asNewCharStar() # <<<<<<<<<<<<<< + * cdef str res = str(res_charstar.decode('UTF-8')) + * CppRuntimeBoundary.deepcl_deleteCharStar(res_charstar) */ - __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __pyx_convert_PyBytes_string_to_py_std__in_string(__pyx_v_self->thisptr->asString()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_res_charstar = __pyx_v_self->thisptr->asNewCharStar(); + + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":57 + * def asString(self): + * cdef const char *res_charstar = self.thisptr.asNewCharStar() + * cdef str res = str(res_charstar.decode('UTF-8')) # <<<<<<<<<<<<<< + * CppRuntimeBoundary.deepcl_deleteCharStar(res_charstar) + * return res + */ + __pyx_t_1 = __Pyx_decode_c_string(__pyx_v_res_charstar, 0, strlen(__pyx_v_res_charstar), NULL, NULL, PyUnicode_DecodeUTF8); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 57; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); - __pyx_r = __pyx_t_1; + __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 57; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_1); + __Pyx_GIVEREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyObject_Call(((PyObject *)((PyObject*)(&PyString_Type))), __pyx_t_2, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 57; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (!(likely(PyString_CheckExact(__pyx_t_1))||((__pyx_t_1) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "str", Py_TYPE(__pyx_t_1)->tp_name), 0))) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 57; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_res = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; + + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":58 + * cdef const char *res_charstar = self.thisptr.asNewCharStar() + * cdef str res = str(res_charstar.decode('UTF-8')) + * CppRuntimeBoundary.deepcl_deleteCharStar(res_charstar) # <<<<<<<<<<<<<< + * return res + * def getClassName(self): + */ + deepcl_deleteCharStar(__pyx_v_res_charstar); + + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":59 + * cdef str res = str(res_charstar.decode('UTF-8')) + * CppRuntimeBoundary.deepcl_deleteCharStar(res_charstar) + * return res # <<<<<<<<<<<<<< + * def getClassName(self): + * return self.thisptr.getClassNameAsCharStar() + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_res); + __pyx_r = __pyx_v_res; goto __pyx_L0; - /* "Layer.pyx":55 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":55 * weightsArray.fromlist( weightsList ) * self.setWeights( weightsArray ) * def asString(self): # <<<<<<<<<<<<<< - * return self.thisptr.asString() - * def getClassName(self): + * cdef const char *res_charstar = self.thisptr.asNewCharStar() + * cdef str res = str(res_charstar.decode('UTF-8')) */ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); __Pyx_AddTraceback("PyDeepCL.Layer.asString", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; + __Pyx_XDECREF(__pyx_v_res); __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; } -/* "Layer.pyx":57 - * def asString(self): - * return 
self.thisptr.asString() +/* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":60 + * CppRuntimeBoundary.deepcl_deleteCharStar(res_charstar) + * return res * def getClassName(self): # <<<<<<<<<<<<<< - * return self.thisptr.getClassName() + * return self.thisptr.getClassNameAsCharStar() * */ @@ -9121,25 +9553,24 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_24getClassName(struct __pyx_obj_8PyDe int __pyx_clineno = 0; __Pyx_RefNannySetupContext("getClassName", 0); - /* "Layer.pyx":58 - * return self.thisptr.asString() + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":61 + * return res * def getClassName(self): - * return self.thisptr.getClassName() # <<<<<<<<<<<<<< - * + * return self.thisptr.getClassNameAsCharStar() # <<<<<<<<<<<<<< * */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __pyx_convert_PyBytes_string_to_py_std__in_string(__pyx_v_self->thisptr->getClassName()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 58; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyBytes_FromString(__pyx_v_self->thisptr->getClassNameAsCharStar()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 61; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; goto __pyx_L0; - /* "Layer.pyx":57 - * def asString(self): - * return self.thisptr.asString() + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":60 + * CppRuntimeBoundary.deepcl_deleteCharStar(res_charstar) + * return res * def getClassName(self): # <<<<<<<<<<<<<< - * return self.thisptr.getClassName() + * return self.thisptr.getClassNameAsCharStar() * */ @@ -9154,7 +9585,7 @@ static PyObject *__pyx_pf_8PyDeepCL_5Layer_24getClassName(struct __pyx_obj_8PyDe return __pyx_r; } -/* "LayerMaker.pyx":6 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":6 * cdef class NormalizationLayerMaker(LayerMaker2): * cdef cDeepCL.NormalizationLayerMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -9187,7 +9618,7 @@ static int __pyx_pf_8PyDeepCL_23NormalizationLayerMaker___cinit__(struct __pyx_o int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "LayerMaker.pyx":7 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":7 * cdef cDeepCL.NormalizationLayerMaker *thisptr * def __cinit__( self ): * self.thisptr = new cDeepCL.NormalizationLayerMaker() # <<<<<<<<<<<<<< @@ -9202,7 +9633,7 @@ static int __pyx_pf_8PyDeepCL_23NormalizationLayerMaker___cinit__(struct __pyx_o } __pyx_v_self->thisptr = __pyx_t_1; - /* "LayerMaker.pyx":8 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":8 * def __cinit__( self ): * self.thisptr = new cDeepCL.NormalizationLayerMaker() * self.baseptr = self.thisptr # <<<<<<<<<<<<<< @@ -9212,7 +9643,7 @@ static int __pyx_pf_8PyDeepCL_23NormalizationLayerMaker___cinit__(struct __pyx_o __pyx_t_1 = __pyx_v_self->thisptr; __pyx_v_self->__pyx_base.baseptr = __pyx_t_1; - /* "LayerMaker.pyx":6 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":6 * cdef class NormalizationLayerMaker(LayerMaker2): * cdef cDeepCL.NormalizationLayerMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -9231,7 +9662,7 @@ static int __pyx_pf_8PyDeepCL_23NormalizationLayerMaker___cinit__(struct __pyx_o return __pyx_r; } -/* "LayerMaker.pyx":11 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":11 * # def __dealloc__(self): * # del self.thisptr * def translate( self, float _translate ): # <<<<<<<<<<<<<< @@ -9273,7 +9704,7 @@ 
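The asString and getClassName hunks above are the substantive change in this stretch: the wrapped C++ Layer no longer returns a std::string across the Python/C++ boundary. asString is regenerated against a new asNewCharStar() method, and the wrapper decodes the returned buffer into a Python str before releasing it through deepcl_deleteCharStar, so the memory is both allocated and freed inside the DeepCL library rather than in the extension module — the usual way to avoid allocator and C++-runtime mismatches across a shared-library boundary. The regenerated C corresponds to this .pyx source (visible in the comment blocks):

    def asString(self):
        cdef const char *res_charstar = self.thisptr.asNewCharStar()
        cdef str res = str(res_charstar.decode('UTF-8'))
        CppRuntimeBoundary.deepcl_deleteCharStar(res_charstar)
        return res

Note the ownership convention the naming implies: "AsNewCharStar" hands the buffer to the caller, hence the explicit delete, while getClassName's new getClassNameAsCharStar() is wrapped with a plain __Pyx_PyBytes_FromString and no delete call, suggesting it returns a pointer the library still owns — presumably a static class-name string.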
static PyObject *__pyx_pf_8PyDeepCL_23NormalizationLayerMaker_2translate(struct int __pyx_clineno = 0; __Pyx_RefNannySetupContext("translate", 0); - /* "LayerMaker.pyx":12 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":12 * # del self.thisptr * def translate( self, float _translate ): * self.thisptr.translate( _translate ) # <<<<<<<<<<<<<< @@ -9287,7 +9718,7 @@ static PyObject *__pyx_pf_8PyDeepCL_23NormalizationLayerMaker_2translate(struct {__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":13 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":13 * def translate( self, float _translate ): * self.thisptr.translate( _translate ) * return self # <<<<<<<<<<<<<< @@ -9299,7 +9730,7 @@ static PyObject *__pyx_pf_8PyDeepCL_23NormalizationLayerMaker_2translate(struct __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":11 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":11 * # def __dealloc__(self): * # del self.thisptr * def translate( self, float _translate ): # <<<<<<<<<<<<<< @@ -9317,7 +9748,7 @@ static PyObject *__pyx_pf_8PyDeepCL_23NormalizationLayerMaker_2translate(struct return __pyx_r; } -/* "LayerMaker.pyx":14 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":14 * self.thisptr.translate( _translate ) * return self * def scale( self, float _scale ): # <<<<<<<<<<<<<< @@ -9359,7 +9790,7 @@ static PyObject *__pyx_pf_8PyDeepCL_23NormalizationLayerMaker_4scale(struct __py int __pyx_clineno = 0; __Pyx_RefNannySetupContext("scale", 0); - /* "LayerMaker.pyx":15 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":15 * return self * def scale( self, float _scale ): * self.thisptr.scale( _scale ) # <<<<<<<<<<<<<< @@ -9373,7 +9804,7 @@ static PyObject *__pyx_pf_8PyDeepCL_23NormalizationLayerMaker_4scale(struct __py {__pyx_filename = __pyx_f[0]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":16 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":16 * def scale( self, float _scale ): * self.thisptr.scale( _scale ) * return self # <<<<<<<<<<<<<< @@ -9385,7 +9816,7 @@ static PyObject *__pyx_pf_8PyDeepCL_23NormalizationLayerMaker_4scale(struct __py __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":14 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":14 * self.thisptr.translate( _translate ) * return self * def scale( self, float _scale ): # <<<<<<<<<<<<<< @@ -9403,7 +9834,7 @@ static PyObject *__pyx_pf_8PyDeepCL_23NormalizationLayerMaker_4scale(struct __py return __pyx_r; } -/* "LayerMaker.pyx":18 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":18 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -9437,7 +9868,7 @@ static PyObject *__pyx_pf_8PyDeepCL_23NormalizationLayerMaker_6instance() { int __pyx_clineno = 0; __Pyx_RefNannySetupContext("instance", 0); - /* "LayerMaker.pyx":19 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":19 * @staticmethod * def instance(): * return NormalizationLayerMaker() # <<<<<<<<<<<<<< @@ -9451,7 +9882,7 @@ static PyObject *__pyx_pf_8PyDeepCL_23NormalizationLayerMaker_6instance() { __pyx_t_1 = 0; goto __pyx_L0; - /* "LayerMaker.pyx":18 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":18 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -9470,7 +9901,7 @@ static PyObject 
*__pyx_pf_8PyDeepCL_23NormalizationLayerMaker_6instance() { return __pyx_r; } -/* "LayerMaker.pyx":23 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":23 * cdef class FullyConnectedMaker(LayerMaker2): * cdef cDeepCL.FullyConnectedMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -9503,7 +9934,7 @@ static int __pyx_pf_8PyDeepCL_19FullyConnectedMaker___cinit__(struct __pyx_obj_8 int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "LayerMaker.pyx":24 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":24 * cdef cDeepCL.FullyConnectedMaker *thisptr * def __cinit__( self ): * self.thisptr = new cDeepCL.FullyConnectedMaker() # <<<<<<<<<<<<<< @@ -9518,7 +9949,7 @@ static int __pyx_pf_8PyDeepCL_19FullyConnectedMaker___cinit__(struct __pyx_obj_8 } __pyx_v_self->thisptr = __pyx_t_1; - /* "LayerMaker.pyx":25 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":25 * def __cinit__( self ): * self.thisptr = new cDeepCL.FullyConnectedMaker() * self.baseptr = self.thisptr # <<<<<<<<<<<<<< @@ -9528,7 +9959,7 @@ static int __pyx_pf_8PyDeepCL_19FullyConnectedMaker___cinit__(struct __pyx_obj_8 __pyx_t_1 = __pyx_v_self->thisptr; __pyx_v_self->__pyx_base.baseptr = __pyx_t_1; - /* "LayerMaker.pyx":23 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":23 * cdef class FullyConnectedMaker(LayerMaker2): * cdef cDeepCL.FullyConnectedMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -9547,7 +9978,7 @@ static int __pyx_pf_8PyDeepCL_19FullyConnectedMaker___cinit__(struct __pyx_obj_8 return __pyx_r; } -/* "LayerMaker.pyx":28 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":28 * # def __dealloc__(self): * # del self.thisptr * def numPlanes( self, int _numPlanes ): # <<<<<<<<<<<<<< @@ -9589,7 +10020,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_2numPlanes(struct __py int __pyx_clineno = 0; __Pyx_RefNannySetupContext("numPlanes", 0); - /* "LayerMaker.pyx":29 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":29 * # del self.thisptr * def numPlanes( self, int _numPlanes ): * self.thisptr.numPlanes( _numPlanes ) # <<<<<<<<<<<<<< @@ -9603,7 +10034,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_2numPlanes(struct __py {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":30 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":30 * def numPlanes( self, int _numPlanes ): * self.thisptr.numPlanes( _numPlanes ) * return self # <<<<<<<<<<<<<< @@ -9615,7 +10046,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_2numPlanes(struct __py __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":28 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":28 * # def __dealloc__(self): * # del self.thisptr * def numPlanes( self, int _numPlanes ): # <<<<<<<<<<<<<< @@ -9633,7 +10064,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_2numPlanes(struct __py return __pyx_r; } -/* "LayerMaker.pyx":31 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":31 * self.thisptr.numPlanes( _numPlanes ) * return self * def imageSize( self, int _imageSize ): # <<<<<<<<<<<<<< @@ -9675,7 +10106,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_4imageSize(struct __py int __pyx_clineno = 0; __Pyx_RefNannySetupContext("imageSize", 0); - /* "LayerMaker.pyx":32 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":32 * 
return self * def imageSize( self, int _imageSize ): * self.thisptr.imageSize( _imageSize ) # <<<<<<<<<<<<<< @@ -9689,7 +10120,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_4imageSize(struct __py {__pyx_filename = __pyx_f[0]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":33 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":33 * def imageSize( self, int _imageSize ): * self.thisptr.imageSize( _imageSize ) * return self # <<<<<<<<<<<<<< @@ -9701,7 +10132,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_4imageSize(struct __py __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":31 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":31 * self.thisptr.numPlanes( _numPlanes ) * return self * def imageSize( self, int _imageSize ): # <<<<<<<<<<<<<< @@ -9719,7 +10150,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_4imageSize(struct __py return __pyx_r; } -/* "LayerMaker.pyx":34 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":34 * self.thisptr.imageSize( _imageSize ) * return self * def biased(self): # <<<<<<<<<<<<<< @@ -9748,7 +10179,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_6biased(struct __pyx_o int __pyx_clineno = 0; __Pyx_RefNannySetupContext("biased", 0); - /* "LayerMaker.pyx":35 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":35 * return self * def biased(self): * self.thisptr.biased() # <<<<<<<<<<<<<< @@ -9762,7 +10193,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_6biased(struct __pyx_o {__pyx_filename = __pyx_f[0]; __pyx_lineno = 35; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":36 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":36 * def biased(self): * self.thisptr.biased() * return self # <<<<<<<<<<<<<< @@ -9774,7 +10205,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_6biased(struct __pyx_o __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":34 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":34 * self.thisptr.imageSize( _imageSize ) * return self * def biased(self): # <<<<<<<<<<<<<< @@ -9792,7 +10223,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_6biased(struct __pyx_o return __pyx_r; } -/* "LayerMaker.pyx":37 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":37 * self.thisptr.biased() * return self * def biased(self, int _biased): # <<<<<<<<<<<<<< @@ -9834,7 +10265,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_8biased(struct __pyx_o int __pyx_clineno = 0; __Pyx_RefNannySetupContext("biased", 0); - /* "LayerMaker.pyx":38 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":38 * return self * def biased(self, int _biased): * self.thisptr.biased( _biased ) # <<<<<<<<<<<<<< @@ -9848,7 +10279,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_8biased(struct __pyx_o {__pyx_filename = __pyx_f[0]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":39 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":39 * def biased(self, int _biased): * self.thisptr.biased( _biased ) * return self # <<<<<<<<<<<<<< @@ -9860,7 +10291,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_8biased(struct __pyx_o __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":37 + /* 
"../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":37 * self.thisptr.biased() * return self * def biased(self, int _biased): # <<<<<<<<<<<<<< @@ -9878,7 +10309,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_8biased(struct __pyx_o return __pyx_r; } -/* "LayerMaker.pyx":41 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":41 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -9912,7 +10343,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_10instance() { int __pyx_clineno = 0; __Pyx_RefNannySetupContext("instance", 0); - /* "LayerMaker.pyx":42 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":42 * @staticmethod * def instance(): * return FullyConnectedMaker() # <<<<<<<<<<<<<< @@ -9926,7 +10357,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_10instance() { __pyx_t_1 = 0; goto __pyx_L0; - /* "LayerMaker.pyx":41 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":41 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -9945,7 +10376,7 @@ static PyObject *__pyx_pf_8PyDeepCL_19FullyConnectedMaker_10instance() { return __pyx_r; } -/* "LayerMaker.pyx":46 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":46 * cdef class ConvolutionalMaker(LayerMaker2): * cdef cDeepCL.ConvolutionalMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -9978,7 +10409,7 @@ static int __pyx_pf_8PyDeepCL_18ConvolutionalMaker___cinit__(struct __pyx_obj_8P int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "LayerMaker.pyx":47 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":47 * cdef cDeepCL.ConvolutionalMaker *thisptr * def __cinit__( self ): * self.thisptr = new cDeepCL.ConvolutionalMaker() # <<<<<<<<<<<<<< @@ -9993,7 +10424,7 @@ static int __pyx_pf_8PyDeepCL_18ConvolutionalMaker___cinit__(struct __pyx_obj_8P } __pyx_v_self->thisptr = __pyx_t_1; - /* "LayerMaker.pyx":48 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":48 * def __cinit__( self ): * self.thisptr = new cDeepCL.ConvolutionalMaker() * self.baseptr = self.thisptr # <<<<<<<<<<<<<< @@ -10003,7 +10434,7 @@ static int __pyx_pf_8PyDeepCL_18ConvolutionalMaker___cinit__(struct __pyx_obj_8P __pyx_t_1 = __pyx_v_self->thisptr; __pyx_v_self->__pyx_base.baseptr = __pyx_t_1; - /* "LayerMaker.pyx":46 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":46 * cdef class ConvolutionalMaker(LayerMaker2): * cdef cDeepCL.ConvolutionalMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -10022,7 +10453,7 @@ static int __pyx_pf_8PyDeepCL_18ConvolutionalMaker___cinit__(struct __pyx_obj_8P return __pyx_r; } -/* "LayerMaker.pyx":51 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":51 * # def __dealloc__(self): * #del self.thisptr * def numFilters( self, int _numFilters ): # <<<<<<<<<<<<<< @@ -10064,7 +10495,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_2numFilters(struct __py int __pyx_clineno = 0; __Pyx_RefNannySetupContext("numFilters", 0); - /* "LayerMaker.pyx":52 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":52 * #del self.thisptr * def numFilters( self, int _numFilters ): * self.thisptr.numFilters( _numFilters ) # <<<<<<<<<<<<<< @@ -10078,7 +10509,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_2numFilters(struct __py {__pyx_filename = __pyx_f[0]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":53 + /* 
"../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":53 * def numFilters( self, int _numFilters ): * self.thisptr.numFilters( _numFilters ) * return self # <<<<<<<<<<<<<< @@ -10090,7 +10521,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_2numFilters(struct __py __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":51 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":51 * # def __dealloc__(self): * #del self.thisptr * def numFilters( self, int _numFilters ): # <<<<<<<<<<<<<< @@ -10108,7 +10539,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_2numFilters(struct __py return __pyx_r; } -/* "LayerMaker.pyx":54 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":54 * self.thisptr.numFilters( _numFilters ) * return self * def filterSize( self, int _filterSize ): # <<<<<<<<<<<<<< @@ -10150,7 +10581,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_4filterSize(struct __py int __pyx_clineno = 0; __Pyx_RefNannySetupContext("filterSize", 0); - /* "LayerMaker.pyx":55 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":55 * return self * def filterSize( self, int _filterSize ): * self.thisptr.filterSize( _filterSize ) # <<<<<<<<<<<<<< @@ -10164,7 +10595,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_4filterSize(struct __py {__pyx_filename = __pyx_f[0]; __pyx_lineno = 55; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":56 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":56 * def filterSize( self, int _filterSize ): * self.thisptr.filterSize( _filterSize ) * return self # <<<<<<<<<<<<<< @@ -10176,7 +10607,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_4filterSize(struct __py __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":54 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":54 * self.thisptr.numFilters( _numFilters ) * return self * def filterSize( self, int _filterSize ): # <<<<<<<<<<<<<< @@ -10194,7 +10625,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_4filterSize(struct __py return __pyx_r; } -/* "LayerMaker.pyx":57 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":57 * self.thisptr.filterSize( _filterSize ) * return self * def padZeros(self): # <<<<<<<<<<<<<< @@ -10223,7 +10654,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_6padZeros(struct __pyx_ int __pyx_clineno = 0; __Pyx_RefNannySetupContext("padZeros", 0); - /* "LayerMaker.pyx":58 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":58 * return self * def padZeros(self): * self.thisptr.padZeros() # <<<<<<<<<<<<<< @@ -10237,7 +10668,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_6padZeros(struct __pyx_ {__pyx_filename = __pyx_f[0]; __pyx_lineno = 58; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":59 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":59 * def padZeros(self): * self.thisptr.padZeros() * return self # <<<<<<<<<<<<<< @@ -10249,7 +10680,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_6padZeros(struct __pyx_ __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":57 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":57 * self.thisptr.filterSize( _filterSize ) * return self * def padZeros(self): # <<<<<<<<<<<<<< @@ -10267,7 +10698,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_6padZeros(struct __pyx_ return __pyx_r; } -/* 
"LayerMaker.pyx":60 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":60 * self.thisptr.padZeros() * return self * def padZeros(self, bint _padZeros): # <<<<<<<<<<<<<< @@ -10309,7 +10740,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_8padZeros(struct __pyx_ int __pyx_clineno = 0; __Pyx_RefNannySetupContext("padZeros", 0); - /* "LayerMaker.pyx":61 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":61 * return self * def padZeros(self, bint _padZeros): * self.thisptr.padZeros( _padZeros ) # <<<<<<<<<<<<<< @@ -10323,7 +10754,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_8padZeros(struct __pyx_ {__pyx_filename = __pyx_f[0]; __pyx_lineno = 61; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":62 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":62 * def padZeros(self, bint _padZeros): * self.thisptr.padZeros( _padZeros ) * return self # <<<<<<<<<<<<<< @@ -10335,7 +10766,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_8padZeros(struct __pyx_ __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":60 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":60 * self.thisptr.padZeros() * return self * def padZeros(self, bint _padZeros): # <<<<<<<<<<<<<< @@ -10353,7 +10784,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_8padZeros(struct __pyx_ return __pyx_r; } -/* "LayerMaker.pyx":63 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":63 * self.thisptr.padZeros( _padZeros ) * return self * def biased(self): # <<<<<<<<<<<<<< @@ -10382,7 +10813,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_10biased(struct __pyx_o int __pyx_clineno = 0; __Pyx_RefNannySetupContext("biased", 0); - /* "LayerMaker.pyx":64 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":64 * return self * def biased(self): * self.thisptr.biased() # <<<<<<<<<<<<<< @@ -10396,7 +10827,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_10biased(struct __pyx_o {__pyx_filename = __pyx_f[0]; __pyx_lineno = 64; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":65 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":65 * def biased(self): * self.thisptr.biased() * return self # <<<<<<<<<<<<<< @@ -10408,7 +10839,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_10biased(struct __pyx_o __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":63 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":63 * self.thisptr.padZeros( _padZeros ) * return self * def biased(self): # <<<<<<<<<<<<<< @@ -10426,7 +10857,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_10biased(struct __pyx_o return __pyx_r; } -/* "LayerMaker.pyx":66 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":66 * self.thisptr.biased() * return self * def biased(self, bint _biased): # <<<<<<<<<<<<<< @@ -10468,7 +10899,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_12biased(struct __pyx_o int __pyx_clineno = 0; __Pyx_RefNannySetupContext("biased", 0); - /* "LayerMaker.pyx":67 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":67 * return self * def biased(self, bint _biased): * self.thisptr.biased( _biased ) # <<<<<<<<<<<<<< @@ -10482,7 +10913,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_12biased(struct __pyx_o {__pyx_filename = __pyx_f[0]; __pyx_lineno = 67; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* 
"LayerMaker.pyx":68 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":68 * def biased(self, bint _biased): * self.thisptr.biased( _biased ) * return self # <<<<<<<<<<<<<< @@ -10494,7 +10925,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_12biased(struct __pyx_o __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":66 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":66 * self.thisptr.biased() * return self * def biased(self, bint _biased): # <<<<<<<<<<<<<< @@ -10512,7 +10943,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_12biased(struct __pyx_o return __pyx_r; } -/* "LayerMaker.pyx":70 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":70 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -10546,7 +10977,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_14instance() { int __pyx_clineno = 0; __Pyx_RefNannySetupContext("instance", 0); - /* "LayerMaker.pyx":71 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":71 * @staticmethod * def instance(): * return ConvolutionalMaker() # <<<<<<<<<<<<<< @@ -10560,7 +10991,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_14instance() { __pyx_t_1 = 0; goto __pyx_L0; - /* "LayerMaker.pyx":70 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":70 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -10579,7 +11010,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ConvolutionalMaker_14instance() { return __pyx_r; } -/* "LayerMaker.pyx":75 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":75 * cdef class PoolingMaker(LayerMaker2): * cdef cDeepCL.PoolingMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -10612,7 +11043,7 @@ static int __pyx_pf_8PyDeepCL_12PoolingMaker___cinit__(struct __pyx_obj_8PyDeepC int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "LayerMaker.pyx":76 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":76 * cdef cDeepCL.PoolingMaker *thisptr * def __cinit__( self ): * self.thisptr = new cDeepCL.PoolingMaker() # <<<<<<<<<<<<<< @@ -10627,7 +11058,7 @@ static int __pyx_pf_8PyDeepCL_12PoolingMaker___cinit__(struct __pyx_obj_8PyDeepC } __pyx_v_self->thisptr = __pyx_t_1; - /* "LayerMaker.pyx":77 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":77 * def __cinit__( self ): * self.thisptr = new cDeepCL.PoolingMaker() * self.baseptr = self.thisptr # <<<<<<<<<<<<<< @@ -10637,7 +11068,7 @@ static int __pyx_pf_8PyDeepCL_12PoolingMaker___cinit__(struct __pyx_obj_8PyDeepC __pyx_t_1 = __pyx_v_self->thisptr; __pyx_v_self->__pyx_base.baseptr = __pyx_t_1; - /* "LayerMaker.pyx":75 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":75 * cdef class PoolingMaker(LayerMaker2): * cdef cDeepCL.PoolingMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -10656,7 +11087,7 @@ static int __pyx_pf_8PyDeepCL_12PoolingMaker___cinit__(struct __pyx_obj_8PyDeepC return __pyx_r; } -/* "LayerMaker.pyx":80 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":80 * # def __dealloc__(self): * # del self.thisptr * def poolingSize( self, int _poolingSize ): # <<<<<<<<<<<<<< @@ -10698,7 +11129,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12PoolingMaker_2poolingSize(struct __pyx_obj int __pyx_clineno = 0; __Pyx_RefNannySetupContext("poolingSize", 0); - /* "LayerMaker.pyx":81 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":81 * # del self.thisptr * def poolingSize( 
self, int _poolingSize ): * self.thisptr.poolingSize( _poolingSize ) # <<<<<<<<<<<<<< @@ -10712,7 +11143,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12PoolingMaker_2poolingSize(struct __pyx_obj {__pyx_filename = __pyx_f[0]; __pyx_lineno = 81; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":82 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":82 * def poolingSize( self, int _poolingSize ): * self.thisptr.poolingSize( _poolingSize ) * return self # <<<<<<<<<<<<<< @@ -10724,7 +11155,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12PoolingMaker_2poolingSize(struct __pyx_obj __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":80 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":80 * # def __dealloc__(self): * # del self.thisptr * def poolingSize( self, int _poolingSize ): # <<<<<<<<<<<<<< @@ -10742,7 +11173,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12PoolingMaker_2poolingSize(struct __pyx_obj return __pyx_r; } -/* "LayerMaker.pyx":84 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":84 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -10776,7 +11207,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12PoolingMaker_4instance() { int __pyx_clineno = 0; __Pyx_RefNannySetupContext("instance", 0); - /* "LayerMaker.pyx":85 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":85 * @staticmethod * def instance(): * return PoolingMaker() # <<<<<<<<<<<<<< @@ -10790,7 +11221,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12PoolingMaker_4instance() { __pyx_t_1 = 0; goto __pyx_L0; - /* "LayerMaker.pyx":84 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":84 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -10809,7 +11240,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12PoolingMaker_4instance() { return __pyx_r; } -/* "LayerMaker.pyx":89 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":89 * cdef class DropoutMaker(LayerMaker2): * cdef cDeepCL.DropoutMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -10842,7 +11273,7 @@ static int __pyx_pf_8PyDeepCL_12DropoutMaker___cinit__(struct __pyx_obj_8PyDeepC int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "LayerMaker.pyx":90 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":90 * cdef cDeepCL.DropoutMaker *thisptr * def __cinit__( self ): * self.thisptr = new cDeepCL.DropoutMaker() # <<<<<<<<<<<<<< @@ -10857,7 +11288,7 @@ static int __pyx_pf_8PyDeepCL_12DropoutMaker___cinit__(struct __pyx_obj_8PyDeepC } __pyx_v_self->thisptr = __pyx_t_1; - /* "LayerMaker.pyx":91 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":91 * def __cinit__( self ): * self.thisptr = new cDeepCL.DropoutMaker() * self.baseptr = self.thisptr # <<<<<<<<<<<<<< @@ -10867,7 +11298,7 @@ static int __pyx_pf_8PyDeepCL_12DropoutMaker___cinit__(struct __pyx_obj_8PyDeepC __pyx_t_1 = __pyx_v_self->thisptr; __pyx_v_self->__pyx_base.baseptr = __pyx_t_1; - /* "LayerMaker.pyx":89 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":89 * cdef class DropoutMaker(LayerMaker2): * cdef cDeepCL.DropoutMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -10886,7 +11317,7 @@ static int __pyx_pf_8PyDeepCL_12DropoutMaker___cinit__(struct __pyx_obj_8PyDeepC return __pyx_r; } -/* "LayerMaker.pyx":92 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":92 * self.thisptr = new cDeepCL.DropoutMaker() * self.baseptr = self.thisptr * def dropRatio(self, float _dropRatio): 
# <<<<<<<<<<<<<< @@ -10928,7 +11359,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12DropoutMaker_2dropRatio(struct __pyx_obj_8 int __pyx_clineno = 0; __Pyx_RefNannySetupContext("dropRatio", 0); - /* "LayerMaker.pyx":93 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":93 * self.baseptr = self.thisptr * def dropRatio(self, float _dropRatio): * self.thisptr.dropRatio(_dropRatio) # <<<<<<<<<<<<<< @@ -10942,7 +11373,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12DropoutMaker_2dropRatio(struct __pyx_obj_8 {__pyx_filename = __pyx_f[0]; __pyx_lineno = 93; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":94 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":94 * def dropRatio(self, float _dropRatio): * self.thisptr.dropRatio(_dropRatio) * return self # <<<<<<<<<<<<<< @@ -10954,7 +11385,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12DropoutMaker_2dropRatio(struct __pyx_obj_8 __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":92 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":92 * self.thisptr = new cDeepCL.DropoutMaker() * self.baseptr = self.thisptr * def dropRatio(self, float _dropRatio): # <<<<<<<<<<<<<< @@ -10972,7 +11403,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12DropoutMaker_2dropRatio(struct __pyx_obj_8 return __pyx_r; } -/* "LayerMaker.pyx":96 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":96 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -11006,7 +11437,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12DropoutMaker_4instance() { int __pyx_clineno = 0; __Pyx_RefNannySetupContext("instance", 0); - /* "LayerMaker.pyx":97 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":97 * @staticmethod * def instance(): * return ActivationMaker() # <<<<<<<<<<<<<< @@ -11020,7 +11451,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12DropoutMaker_4instance() { __pyx_t_1 = 0; goto __pyx_L0; - /* "LayerMaker.pyx":96 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":96 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -11039,7 +11470,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12DropoutMaker_4instance() { return __pyx_r; } -/* "LayerMaker.pyx":101 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":101 * cdef class ActivationMaker(LayerMaker2): * cdef cDeepCL.ActivationMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -11072,7 +11503,7 @@ static int __pyx_pf_8PyDeepCL_15ActivationMaker___cinit__(struct __pyx_obj_8PyDe int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "LayerMaker.pyx":102 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":102 * cdef cDeepCL.ActivationMaker *thisptr * def __cinit__( self ): * self.thisptr = new cDeepCL.ActivationMaker() # <<<<<<<<<<<<<< @@ -11087,7 +11518,7 @@ static int __pyx_pf_8PyDeepCL_15ActivationMaker___cinit__(struct __pyx_obj_8PyDe } __pyx_v_self->thisptr = __pyx_t_1; - /* "LayerMaker.pyx":103 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":103 * def __cinit__( self ): * self.thisptr = new cDeepCL.ActivationMaker() * self.baseptr = self.thisptr # <<<<<<<<<<<<<< @@ -11097,7 +11528,7 @@ static int __pyx_pf_8PyDeepCL_15ActivationMaker___cinit__(struct __pyx_obj_8PyDe __pyx_t_1 = __pyx_v_self->thisptr; __pyx_v_self->__pyx_base.baseptr = __pyx_t_1; - /* "LayerMaker.pyx":101 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":101 * cdef class ActivationMaker(LayerMaker2): * cdef cDeepCL.ActivationMaker *thisptr * def 
__cinit__( self ): # <<<<<<<<<<<<<< @@ -11116,7 +11547,7 @@ static int __pyx_pf_8PyDeepCL_15ActivationMaker___cinit__(struct __pyx_obj_8PyDe return __pyx_r; } -/* "LayerMaker.pyx":104 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":104 * self.thisptr = new cDeepCL.ActivationMaker() * self.baseptr = self.thisptr * def relu(self): # <<<<<<<<<<<<<< @@ -11145,7 +11576,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_2relu(struct __pyx_obj_8Py int __pyx_clineno = 0; __Pyx_RefNannySetupContext("relu", 0); - /* "LayerMaker.pyx":105 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":105 * self.baseptr = self.thisptr * def relu(self): * self.thisptr.relu() # <<<<<<<<<<<<<< @@ -11159,7 +11590,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_2relu(struct __pyx_obj_8Py {__pyx_filename = __pyx_f[0]; __pyx_lineno = 105; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":106 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":106 * def relu(self): * self.thisptr.relu() * return self # <<<<<<<<<<<<<< @@ -11171,7 +11602,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_2relu(struct __pyx_obj_8Py __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":104 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":104 * self.thisptr = new cDeepCL.ActivationMaker() * self.baseptr = self.thisptr * def relu(self): # <<<<<<<<<<<<<< @@ -11189,7 +11620,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_2relu(struct __pyx_obj_8Py return __pyx_r; } -/* "LayerMaker.pyx":107 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":107 * self.thisptr.relu() * return self * def sigmoid(self): # <<<<<<<<<<<<<< @@ -11218,7 +11649,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_4sigmoid(struct __pyx_obj_ int __pyx_clineno = 0; __Pyx_RefNannySetupContext("sigmoid", 0); - /* "LayerMaker.pyx":108 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":108 * return self * def sigmoid(self): * self.thisptr.sigmoid() # <<<<<<<<<<<<<< @@ -11232,7 +11663,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_4sigmoid(struct __pyx_obj_ {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":109 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":109 * def sigmoid(self): * self.thisptr.sigmoid() * return self # <<<<<<<<<<<<<< @@ -11244,7 +11675,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_4sigmoid(struct __pyx_obj_ __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":107 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":107 * self.thisptr.relu() * return self * def sigmoid(self): # <<<<<<<<<<<<<< @@ -11262,7 +11693,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_4sigmoid(struct __pyx_obj_ return __pyx_r; } -/* "LayerMaker.pyx":110 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":110 * self.thisptr.sigmoid() * return self * def tanh(self): # <<<<<<<<<<<<<< @@ -11291,7 +11722,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_6tanh(struct __pyx_obj_8Py int __pyx_clineno = 0; __Pyx_RefNannySetupContext("tanh", 0); - /* "LayerMaker.pyx":111 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":111 * return self * def tanh(self): * self.thisptr.tanh() # <<<<<<<<<<<<<< @@ -11305,7 +11736,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_6tanh(struct __pyx_obj_8Py 
{__pyx_filename = __pyx_f[0]; __pyx_lineno = 111; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":112 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":112 * def tanh(self): * self.thisptr.tanh() * return self # <<<<<<<<<<<<<< @@ -11317,7 +11748,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_6tanh(struct __pyx_obj_8Py __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":110 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":110 * self.thisptr.sigmoid() * return self * def tanh(self): # <<<<<<<<<<<<<< @@ -11335,7 +11766,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_6tanh(struct __pyx_obj_8Py return __pyx_r; } -/* "LayerMaker.pyx":113 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":113 * self.thisptr.tanh() * return self * def linear(self): # <<<<<<<<<<<<<< @@ -11364,7 +11795,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_8linear(struct __pyx_obj_8 int __pyx_clineno = 0; __Pyx_RefNannySetupContext("linear", 0); - /* "LayerMaker.pyx":114 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":114 * return self * def linear(self): * self.thisptr.linear() # <<<<<<<<<<<<<< @@ -11378,7 +11809,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_8linear(struct __pyx_obj_8 {__pyx_filename = __pyx_f[0]; __pyx_lineno = 114; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":115 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":115 * def linear(self): * self.thisptr.linear() * return self # <<<<<<<<<<<<<< @@ -11390,7 +11821,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_8linear(struct __pyx_obj_8 __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":113 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":113 * self.thisptr.tanh() * return self * def linear(self): # <<<<<<<<<<<<<< @@ -11408,7 +11839,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_8linear(struct __pyx_obj_8 return __pyx_r; } -/* "LayerMaker.pyx":117 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":117 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -11442,7 +11873,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_10instance() { int __pyx_clineno = 0; __Pyx_RefNannySetupContext("instance", 0); - /* "LayerMaker.pyx":118 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":118 * @staticmethod * def instance(): * return ActivationMaker() # <<<<<<<<<<<<<< @@ -11456,7 +11887,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_10instance() { __pyx_t_1 = 0; goto __pyx_L0; - /* "LayerMaker.pyx":117 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":117 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -11475,7 +11906,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15ActivationMaker_10instance() { return __pyx_r; } -/* "LayerMaker.pyx":122 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":122 * cdef class ForceBackpropMaker(LayerMaker2): * cdef cDeepCL.ForceBackpropLayerMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -11508,7 +11939,7 @@ static int __pyx_pf_8PyDeepCL_18ForceBackpropMaker___cinit__(struct __pyx_obj_8P int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "LayerMaker.pyx":123 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":123 * cdef cDeepCL.ForceBackpropLayerMaker *thisptr * def __cinit__( self ): * self.thisptr = new 
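Every one of these maker setters returns self (the "return self # <<<" markers above), so the makers are chainable builders, and each class also exposes a staticmethod instance() constructor. A minimal usage sketch, using only the classes and methods visible in these hunks (passing the finished maker to a network, e.g. via NeuralNet.addLayer, is the usual PyDeepCL pattern but is not part of this diff):

    import PyDeepCL
    # builder chaining: each setter returns self
    conv = (PyDeepCL.ConvolutionalMaker.instance()
            .numFilters(8)
            .filterSize(5)
            .padZeros(True)
            .biased(1))
    pool = PyDeepCL.PoolingMaker.instance().poolingSize(2)
    act = PyDeepCL.ActivationMaker.instance().relu()

Also visible in the unchanged context: DropoutMaker.instance() returns ActivationMaker() (LayerMaker.pyx:97), which looks like a copy-paste slip that this diff leaves as-is.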
cDeepCL.ForceBackpropLayerMaker() # <<<<<<<<<<<<<< @@ -11523,7 +11954,7 @@ static int __pyx_pf_8PyDeepCL_18ForceBackpropMaker___cinit__(struct __pyx_obj_8P } __pyx_v_self->thisptr = __pyx_t_1; - /* "LayerMaker.pyx":124 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":124 * def __cinit__( self ): * self.thisptr = new cDeepCL.ForceBackpropLayerMaker() * self.baseptr = self.thisptr # <<<<<<<<<<<<<< @@ -11533,7 +11964,7 @@ static int __pyx_pf_8PyDeepCL_18ForceBackpropMaker___cinit__(struct __pyx_obj_8P __pyx_t_1 = __pyx_v_self->thisptr; __pyx_v_self->__pyx_base.baseptr = __pyx_t_1; - /* "LayerMaker.pyx":122 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":122 * cdef class ForceBackpropMaker(LayerMaker2): * cdef cDeepCL.ForceBackpropLayerMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -11552,7 +11983,7 @@ static int __pyx_pf_8PyDeepCL_18ForceBackpropMaker___cinit__(struct __pyx_obj_8P return __pyx_r; } -/* "LayerMaker.pyx":128 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":128 * # del self.thisptr * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -11586,7 +12017,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ForceBackpropMaker_2instance() { int __pyx_clineno = 0; __Pyx_RefNannySetupContext("instance", 0); - /* "LayerMaker.pyx":129 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":129 * @staticmethod * def instance(): * return ForceBackpropMaker() # <<<<<<<<<<<<<< @@ -11600,7 +12031,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ForceBackpropMaker_2instance() { __pyx_t_1 = 0; goto __pyx_L0; - /* "LayerMaker.pyx":128 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":128 * # del self.thisptr * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -11619,7 +12050,7 @@ static PyObject *__pyx_pf_8PyDeepCL_18ForceBackpropMaker_2instance() { return __pyx_r; } -/* "LayerMaker.pyx":133 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":133 * cdef class SquareLossMaker(LayerMaker2): * cdef cDeepCL.SquareLossMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -11652,7 +12083,7 @@ static int __pyx_pf_8PyDeepCL_15SquareLossMaker___cinit__(struct __pyx_obj_8PyDe int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "LayerMaker.pyx":134 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":134 * cdef cDeepCL.SquareLossMaker *thisptr * def __cinit__( self ): * self.thisptr = new cDeepCL.SquareLossMaker() # <<<<<<<<<<<<<< @@ -11667,7 +12098,7 @@ static int __pyx_pf_8PyDeepCL_15SquareLossMaker___cinit__(struct __pyx_obj_8PyDe } __pyx_v_self->thisptr = __pyx_t_1; - /* "LayerMaker.pyx":135 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":135 * def __cinit__( self ): * self.thisptr = new cDeepCL.SquareLossMaker() * self.baseptr = self.thisptr # <<<<<<<<<<<<<< @@ -11677,7 +12108,7 @@ static int __pyx_pf_8PyDeepCL_15SquareLossMaker___cinit__(struct __pyx_obj_8PyDe __pyx_t_1 = __pyx_v_self->thisptr; __pyx_v_self->__pyx_base.baseptr = __pyx_t_1; - /* "LayerMaker.pyx":133 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":133 * cdef class SquareLossMaker(LayerMaker2): * cdef cDeepCL.SquareLossMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -11696,7 +12127,7 @@ static int __pyx_pf_8PyDeepCL_15SquareLossMaker___cinit__(struct __pyx_obj_8PyDe return __pyx_r; } -/* "LayerMaker.pyx":139 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":139 * # del self.thisptr * @staticmethod * def instance(): # 
<<<<<<<<<<<<<< @@ -11730,7 +12161,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15SquareLossMaker_2instance() { int __pyx_clineno = 0; __Pyx_RefNannySetupContext("instance", 0); - /* "LayerMaker.pyx":140 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":140 * @staticmethod * def instance(): * return SquareLossMaker() # <<<<<<<<<<<<<< @@ -11744,7 +12175,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15SquareLossMaker_2instance() { __pyx_t_1 = 0; goto __pyx_L0; - /* "LayerMaker.pyx":139 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":139 * # del self.thisptr * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -11763,7 +12194,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15SquareLossMaker_2instance() { return __pyx_r; } -/* "LayerMaker.pyx":144 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":144 * cdef class SoftMaxMaker(LayerMaker2): * cdef cDeepCL.SoftMaxMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -11796,7 +12227,7 @@ static int __pyx_pf_8PyDeepCL_12SoftMaxMaker___cinit__(struct __pyx_obj_8PyDeepC int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "LayerMaker.pyx":145 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":145 * cdef cDeepCL.SoftMaxMaker *thisptr * def __cinit__( self ): * self.thisptr = new cDeepCL.SoftMaxMaker() # <<<<<<<<<<<<<< @@ -11811,7 +12242,7 @@ static int __pyx_pf_8PyDeepCL_12SoftMaxMaker___cinit__(struct __pyx_obj_8PyDeepC } __pyx_v_self->thisptr = __pyx_t_1; - /* "LayerMaker.pyx":146 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":146 * def __cinit__( self ): * self.thisptr = new cDeepCL.SoftMaxMaker() * self.baseptr = self.thisptr # <<<<<<<<<<<<<< @@ -11821,7 +12252,7 @@ static int __pyx_pf_8PyDeepCL_12SoftMaxMaker___cinit__(struct __pyx_obj_8PyDeepC __pyx_t_1 = __pyx_v_self->thisptr; __pyx_v_self->__pyx_base.baseptr = __pyx_t_1; - /* "LayerMaker.pyx":144 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":144 * cdef class SoftMaxMaker(LayerMaker2): * cdef cDeepCL.SoftMaxMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -11840,7 +12271,7 @@ static int __pyx_pf_8PyDeepCL_12SoftMaxMaker___cinit__(struct __pyx_obj_8PyDeepC return __pyx_r; } -/* "LayerMaker.pyx":150 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":150 * # del self.thisptr * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -11874,7 +12305,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12SoftMaxMaker_2instance() { int __pyx_clineno = 0; __Pyx_RefNannySetupContext("instance", 0); - /* "LayerMaker.pyx":151 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":151 * @staticmethod * def instance(): * return SoftMaxMaker() # <<<<<<<<<<<<<< @@ -11888,7 +12319,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12SoftMaxMaker_2instance() { __pyx_t_1 = 0; goto __pyx_L0; - /* "LayerMaker.pyx":150 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":150 * # del self.thisptr * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -11907,7 +12338,7 @@ static PyObject *__pyx_pf_8PyDeepCL_12SoftMaxMaker_2instance() { return __pyx_r; } -/* "LayerMaker.pyx":155 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":155 * cdef class InputLayerMaker(LayerMaker2): * cdef cDeepCL.InputLayerMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -11940,7 +12371,7 @@ static int __pyx_pf_8PyDeepCL_15InputLayerMaker___cinit__(struct __pyx_obj_8PyDe int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "LayerMaker.pyx":156 + 
/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":156 * cdef cDeepCL.InputLayerMaker *thisptr * def __cinit__( self ): * self.thisptr = new cDeepCL.InputLayerMaker() # <<<<<<<<<<<<<< @@ -11955,7 +12386,7 @@ static int __pyx_pf_8PyDeepCL_15InputLayerMaker___cinit__(struct __pyx_obj_8PyDe } __pyx_v_self->thisptr = __pyx_t_1; - /* "LayerMaker.pyx":157 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":157 * def __cinit__( self ): * self.thisptr = new cDeepCL.InputLayerMaker() * self.baseptr = self.thisptr # <<<<<<<<<<<<<< @@ -11965,7 +12396,7 @@ static int __pyx_pf_8PyDeepCL_15InputLayerMaker___cinit__(struct __pyx_obj_8PyDe __pyx_t_1 = __pyx_v_self->thisptr; __pyx_v_self->__pyx_base.baseptr = __pyx_t_1; - /* "LayerMaker.pyx":155 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":155 * cdef class InputLayerMaker(LayerMaker2): * cdef cDeepCL.InputLayerMaker *thisptr * def __cinit__( self ): # <<<<<<<<<<<<<< @@ -11984,7 +12415,7 @@ static int __pyx_pf_8PyDeepCL_15InputLayerMaker___cinit__(struct __pyx_obj_8PyDe return __pyx_r; } -/* "LayerMaker.pyx":160 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":160 * # def __dealloc__(self): * # del self.thisptr * def numPlanes( self, int _numPlanes ): # <<<<<<<<<<<<<< @@ -12026,7 +12457,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15InputLayerMaker_2numPlanes(struct __pyx_ob int __pyx_clineno = 0; __Pyx_RefNannySetupContext("numPlanes", 0); - /* "LayerMaker.pyx":161 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":161 * # del self.thisptr * def numPlanes( self, int _numPlanes ): * self.thisptr.numPlanes( _numPlanes ) # <<<<<<<<<<<<<< @@ -12040,7 +12471,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15InputLayerMaker_2numPlanes(struct __pyx_ob {__pyx_filename = __pyx_f[0]; __pyx_lineno = 161; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":162 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":162 * def numPlanes( self, int _numPlanes ): * self.thisptr.numPlanes( _numPlanes ) * return self # <<<<<<<<<<<<<< @@ -12052,7 +12483,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15InputLayerMaker_2numPlanes(struct __pyx_ob __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":160 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":160 * # def __dealloc__(self): * # del self.thisptr * def numPlanes( self, int _numPlanes ): # <<<<<<<<<<<<<< @@ -12070,7 +12501,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15InputLayerMaker_2numPlanes(struct __pyx_ob return __pyx_r; } -/* "LayerMaker.pyx":163 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":163 * self.thisptr.numPlanes( _numPlanes ) * return self * def imageSize( self, int _imageSize ): # <<<<<<<<<<<<<< @@ -12112,7 +12543,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15InputLayerMaker_4imageSize(struct __pyx_ob int __pyx_clineno = 0; __Pyx_RefNannySetupContext("imageSize", 0); - /* "LayerMaker.pyx":164 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":164 * return self * def imageSize( self, int _imageSize ): * self.thisptr.imageSize( _imageSize ) # <<<<<<<<<<<<<< @@ -12126,7 +12557,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15InputLayerMaker_4imageSize(struct __pyx_ob {__pyx_filename = __pyx_f[0]; __pyx_lineno = 164; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "LayerMaker.pyx":165 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":165 * def imageSize( self, int _imageSize ): * self.thisptr.imageSize( _imageSize ) * 
return self # <<<<<<<<<<<<<< @@ -12138,7 +12569,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15InputLayerMaker_4imageSize(struct __pyx_ob __pyx_r = ((PyObject *)__pyx_v_self); goto __pyx_L0; - /* "LayerMaker.pyx":163 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":163 * self.thisptr.numPlanes( _numPlanes ) * return self * def imageSize( self, int _imageSize ): # <<<<<<<<<<<<<< @@ -12156,7 +12587,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15InputLayerMaker_4imageSize(struct __pyx_ob return __pyx_r; } -/* "LayerMaker.pyx":167 +/* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":167 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -12190,7 +12621,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15InputLayerMaker_6instance() { int __pyx_clineno = 0; __Pyx_RefNannySetupContext("instance", 0); - /* "LayerMaker.pyx":168 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":168 * @staticmethod * def instance(): * return InputLayerMaker() # <<<<<<<<<<<<<< @@ -12204,7 +12635,7 @@ static PyObject *__pyx_pf_8PyDeepCL_15InputLayerMaker_6instance() { __pyx_t_1 = 0; goto __pyx_L0; - /* "LayerMaker.pyx":167 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":167 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -12223,19 +12654,19 @@ static PyObject *__pyx_pf_8PyDeepCL_15InputLayerMaker_6instance() { return __pyx_r; } -/* "GenericLoader.pyx":3 +/* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":3 * cdef class GenericLoader: * @staticmethod - * def getDimensions( trainFilePath ): # <<<<<<<<<<<<<< + * def getDimensions( trainFilepath ): # <<<<<<<<<<<<<< + * print 'GenericLoader.py getDimensions ', trainFilepath * cdef int N - * cdef int planes */ /* Python wrapper */ static PyObject *__pyx_pw_8PyDeepCL_13GenericLoader_1getDimensions(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static PyMethodDef __pyx_mdef_8PyDeepCL_13GenericLoader_1getDimensions = {"getDimensions", (PyCFunction)__pyx_pw_8PyDeepCL_13GenericLoader_1getDimensions, METH_VARARGS|METH_KEYWORDS, 0}; static PyObject *__pyx_pw_8PyDeepCL_13GenericLoader_1getDimensions(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { - PyObject *__pyx_v_trainFilePath = 0; + PyObject *__pyx_v_trainFilepath = 0; int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; @@ -12243,7 +12674,7 @@ static PyObject *__pyx_pw_8PyDeepCL_13GenericLoader_1getDimensions(CYTHON_UNUSED __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("getDimensions (wrapper)", 0); { - static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_trainFilePath,0}; + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_trainFilepath,0}; PyObject* values[1] = {0}; if (unlikely(__pyx_kwds)) { Py_ssize_t kw_args; @@ -12256,136 +12687,145 @@ static PyObject *__pyx_pw_8PyDeepCL_13GenericLoader_1getDimensions(CYTHON_UNUSED kw_args = PyDict_Size(__pyx_kwds); switch (pos_args) { case 0: - if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_trainFilePath)) != 0)) kw_args--; + if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_trainFilepath)) != 0)) kw_args--; else goto __pyx_L5_argtuple_error; } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "getDimensions") < 0)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, 
__pyx_pyargnames, 0, values, pos_args, "getDimensions") < 0)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 1) { goto __pyx_L5_argtuple_error; } else { values[0] = PyTuple_GET_ITEM(__pyx_args, 0); } - __pyx_v_trainFilePath = values[0]; + __pyx_v_trainFilepath = values[0]; } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("getDimensions", 1, 1, 1, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("getDimensions", 1, 1, 1, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[10]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.GenericLoader.getDimensions", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - __pyx_r = __pyx_pf_8PyDeepCL_13GenericLoader_getDimensions(__pyx_v_trainFilePath); + __pyx_r = __pyx_pf_8PyDeepCL_13GenericLoader_getDimensions(__pyx_v_trainFilepath); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } -static PyObject *__pyx_pf_8PyDeepCL_13GenericLoader_getDimensions(PyObject *__pyx_v_trainFilePath) { +static PyObject *__pyx_pf_8PyDeepCL_13GenericLoader_getDimensions(PyObject *__pyx_v_trainFilepath) { int __pyx_v_N; int __pyx_v_planes; int __pyx_v_size; + char const *__pyx_v_trainFilepath_charstar; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations PyObject *__pyx_t_1 = NULL; - PyObject *__pyx_t_2 = NULL; + char const *__pyx_t_2; PyObject *__pyx_t_3 = NULL; PyObject *__pyx_t_4 = NULL; - std::string __pyx_t_5; + PyObject *__pyx_t_5 = NULL; int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; __Pyx_RefNannySetupContext("getDimensions", 0); - /* "GenericLoader.pyx":7 + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":4 + * @staticmethod + * def getDimensions( trainFilepath ): + * print 'GenericLoader.py getDimensions ', trainFilepath # <<<<<<<<<<<<<< + * cdef int N + * cdef int planes + */ + __pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __Pyx_INCREF(__pyx_kp_s_GenericLoader_py_getDimensions); + PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_kp_s_GenericLoader_py_getDimensions); + __Pyx_GIVEREF(__pyx_kp_s_GenericLoader_py_getDimensions); + __Pyx_INCREF(__pyx_v_trainFilepath); + PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_v_trainFilepath); + __Pyx_GIVEREF(__pyx_v_trainFilepath); + if (__Pyx_Print(0, __pyx_t_1, 1) < 0) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":8 * cdef int planes * cdef int size - * cDeepCL.GenericLoader.getDimensions( toCppString( trainFilePath ), &N, &planes, &size ) # <<<<<<<<<<<<<< + * cdef const char *trainFilepath_charstar = trainFilepath # <<<<<<<<<<<<<< + * cDeepCL.GenericLoader.getDimensions(trainFilepath_charstar, &N, &planes, &size) + * print 'finished calling' + */ + __pyx_t_2 = __Pyx_PyObject_AsString(__pyx_v_trainFilepath); if (unlikely((!__pyx_t_2) && PyErr_Occurred())) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_trainFilepath_charstar = 
__pyx_t_2; + + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":9 + * cdef int size + * cdef const char *trainFilepath_charstar = trainFilepath + * cDeepCL.GenericLoader.getDimensions(trainFilepath_charstar, &N, &planes, &size) # <<<<<<<<<<<<<< + * print 'finished calling' * return (N,planes,size) - * @staticmethod */ - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_toCppString); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = NULL; - if (CYTHON_COMPILING_IN_CPYTHON && unlikely(PyMethod_Check(__pyx_t_2))) { - __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); - if (likely(__pyx_t_3)) { - PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); - __Pyx_INCREF(__pyx_t_3); - __Pyx_INCREF(function); - __Pyx_DECREF_SET(__pyx_t_2, function); - } - } - if (!__pyx_t_3) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_v_trainFilePath); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_1); - } else { - __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_4); - PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __Pyx_GIVEREF(__pyx_t_3); __pyx_t_3 = NULL; - __Pyx_INCREF(__pyx_v_trainFilePath); - PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_trainFilePath); - __Pyx_GIVEREF(__pyx_v_trainFilePath); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_1); - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - } - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_5 = __pyx_convert_string_from_py_std__in_string(__pyx_t_1); if (unlikely(PyErr_Occurred())) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; try { - GenericLoader::getDimensions(__pyx_t_5, (&__pyx_v_N), (&__pyx_v_planes), (&__pyx_v_size)); + GenericLoader::getDimensions(__pyx_v_trainFilepath_charstar, (&__pyx_v_N), (&__pyx_v_planes), (&__pyx_v_size)); } catch(...) 
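
The hunk above tracks a source-level rewrite of GenericLoader.getDimensions() in GenericLoader.pyx: the old code built a std::string via the toCppString() helper before crossing into C++, while the new code hands the C++ side a borrowed `const char *` and adds two temporary debug prints. The shift from `__pyx_f[11]` to `__pyx_f[10]` throughout is Cython renumbering its internal source-file table, apparently after a file was removed from the generated module; it has no behavioral effect. A sketch of the resulting .pyx method, reconstructed from the quoted source comments (the surrounding module context, including the `cDeepCL` cimport, is assumed):

    # GenericLoader.pyx after this change -- a reconstruction, not authoritative
    cdef class GenericLoader:
        @staticmethod
        def getDimensions(trainFilepath):
            print 'GenericLoader.py getDimensions ', trainFilepath   # temporary debug output (Python 2-style print, as in the source)
            cdef int N
            cdef int planes
            cdef int size
            # borrow the Python string's buffer directly; no std::string copy, no toCppString()
            cdef const char *trainFilepath_charstar = trainFilepath
            cDeepCL.GenericLoader.getDimensions(trainFilepath_charstar, &N, &planes, &size)
            print 'finished calling'                                 # temporary debug output
            return (N, planes, size)
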
{ __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[11]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[10]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "GenericLoader.pyx":8 - * cdef int size - * cDeepCL.GenericLoader.getDimensions( toCppString( trainFilePath ), &N, &planes, &size ) + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":10 + * cdef const char *trainFilepath_charstar = trainFilepath + * cDeepCL.GenericLoader.getDimensions(trainFilepath_charstar, &N, &planes, &size) + * print 'finished calling' # <<<<<<<<<<<<<< + * return (N,planes,size) + * @staticmethod + */ + if (__Pyx_PrintOne(0, __pyx_kp_s_finished_calling) < 0) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":11 + * cDeepCL.GenericLoader.getDimensions(trainFilepath_charstar, &N, &planes, &size) + * print 'finished calling' * return (N,planes,size) # <<<<<<<<<<<<<< * @staticmethod * def load( trainFilepath, float[:] images, int[:] labels, startN, numExamples ): */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_N); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_N); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_planes); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_2); - __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_size); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_4); - __pyx_t_3 = PyTuple_New(3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_v_planes); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); - PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_1); + __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_size); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = PyTuple_New(3); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_1); __Pyx_GIVEREF(__pyx_t_1); - PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_2); - __Pyx_GIVEREF(__pyx_t_2); - PyTuple_SET_ITEM(__pyx_t_3, 2, __pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_3); + __Pyx_GIVEREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_5, 2, __pyx_t_4); __Pyx_GIVEREF(__pyx_t_4); __pyx_t_1 = 0; - __pyx_t_2 = 0; - __pyx_t_4 = 0; - __pyx_r = __pyx_t_3; __pyx_t_3 = 0; + __pyx_t_4 = 0; + __pyx_r = __pyx_t_5; + __pyx_t_5 = 0; goto __pyx_L0; - /* "GenericLoader.pyx":3 + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":3 * cdef class GenericLoader: * @staticmethod - * def getDimensions( trainFilePath ): # <<<<<<<<<<<<<< + * def getDimensions( trainFilepath ): # <<<<<<<<<<<<<< + * print 'GenericLoader.py getDimensions ', trainFilepath * cdef 
int N - * cdef int planes */ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); - __Pyx_XDECREF(__pyx_t_2); __Pyx_XDECREF(__pyx_t_3); __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); __Pyx_AddTraceback("PyDeepCL.GenericLoader.getDimensions", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; @@ -12394,12 +12834,12 @@ static PyObject *__pyx_pf_8PyDeepCL_13GenericLoader_getDimensions(PyObject *__py return __pyx_r; } -/* "GenericLoader.pyx":10 +/* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":13 * return (N,planes,size) * @staticmethod * def load( trainFilepath, float[:] images, int[:] labels, startN, numExamples ): # <<<<<<<<<<<<<< - * cDeepCL.GenericLoader.load( toCppString(trainFilepath), &images[0], &labels[0], startN , numExamples ) - * + * cdef const char *trainFilepath_charstar = trainFilepath + * cDeepCL.GenericLoader.load(trainFilepath_charstar, &images[0], &labels[0], startN , numExamples) */ /* Python wrapper */ @@ -12440,26 +12880,26 @@ static PyObject *__pyx_pw_8PyDeepCL_13GenericLoader_3load(CYTHON_UNUSED PyObject case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_images)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("load", 1, 5, 5, 1); {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("load", 1, 5, 5, 1); {__pyx_filename = __pyx_f[10]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_labels)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("load", 1, 5, 5, 2); {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("load", 1, 5, 5, 2); {__pyx_filename = __pyx_f[10]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 3: if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_startN)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("load", 1, 5, 5, 3); {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("load", 1, 5, 5, 3); {__pyx_filename = __pyx_f[10]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 4: if (likely((values[4] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_numExamples)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("load", 1, 5, 5, 4); {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("load", 1, 5, 5, 4); {__pyx_filename = __pyx_f[10]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "load") < 0)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "load") < 0)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 5) { goto __pyx_L5_argtuple_error; @@ -12471,14 +12911,14 @@ static PyObject *__pyx_pw_8PyDeepCL_13GenericLoader_3load(CYTHON_UNUSED PyObject values[4] = PyTuple_GET_ITEM(__pyx_args, 4); } __pyx_v_trainFilepath = values[0]; - __pyx_v_images = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[1]); if (unlikely(!__pyx_v_images.memview)) {__pyx_filename 
= __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_labels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(values[2]); if (unlikely(!__pyx_v_labels.memview)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_images = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[1]); if (unlikely(!__pyx_v_images.memview)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_labels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(values[2]); if (unlikely(!__pyx_v_labels.memview)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_v_startN = values[3]; __pyx_v_numExamples = values[4]; } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("load", 1, 5, 5, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("load", 1, 5, 5, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[10]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.GenericLoader.load", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); @@ -12492,103 +12932,74 @@ static PyObject *__pyx_pw_8PyDeepCL_13GenericLoader_3load(CYTHON_UNUSED PyObject } static PyObject *__pyx_pf_8PyDeepCL_13GenericLoader_2load(PyObject *__pyx_v_trainFilepath, __Pyx_memviewslice __pyx_v_images, __Pyx_memviewslice __pyx_v_labels, PyObject *__pyx_v_startN, PyObject *__pyx_v_numExamples) { + char const *__pyx_v_trainFilepath_charstar; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations - PyObject *__pyx_t_1 = NULL; - PyObject *__pyx_t_2 = NULL; - PyObject *__pyx_t_3 = NULL; - PyObject *__pyx_t_4 = NULL; - std::string __pyx_t_5; - Py_ssize_t __pyx_t_6; - int __pyx_t_7; - Py_ssize_t __pyx_t_8; - int __pyx_t_9; + char const *__pyx_t_1; + Py_ssize_t __pyx_t_2; + int __pyx_t_3; + Py_ssize_t __pyx_t_4; + int __pyx_t_5; int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; __Pyx_RefNannySetupContext("load", 0); - /* "GenericLoader.pyx":11 + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":14 * @staticmethod * def load( trainFilepath, float[:] images, int[:] labels, startN, numExamples ): - * cDeepCL.GenericLoader.load( toCppString(trainFilepath), &images[0], &labels[0], startN , numExamples ) # <<<<<<<<<<<<<< - * - * + * cdef const char *trainFilepath_charstar = trainFilepath # <<<<<<<<<<<<<< + * cDeepCL.GenericLoader.load(trainFilepath_charstar, &images[0], &labels[0], startN , numExamples) */ - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_toCppString); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = NULL; - if (CYTHON_COMPILING_IN_CPYTHON && unlikely(PyMethod_Check(__pyx_t_2))) { - __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); - if (likely(__pyx_t_3)) { - PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); - __Pyx_INCREF(__pyx_t_3); - __Pyx_INCREF(function); - __Pyx_DECREF_SET(__pyx_t_2, function); - } - } - if (!__pyx_t_3) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_v_trainFilepath); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_1); - } else { - __pyx_t_4 = 
PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_4); - PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __Pyx_GIVEREF(__pyx_t_3); __pyx_t_3 = NULL; - __Pyx_INCREF(__pyx_v_trainFilepath); - PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_trainFilepath); - __Pyx_GIVEREF(__pyx_v_trainFilepath); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_1); - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - } - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_5 = __pyx_convert_string_from_py_std__in_string(__pyx_t_1); if (unlikely(PyErr_Occurred())) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_6 = 0; - __pyx_t_7 = -1; - if (__pyx_t_6 < 0) { - __pyx_t_6 += __pyx_v_images.shape[0]; - if (unlikely(__pyx_t_6 < 0)) __pyx_t_7 = 0; - } else if (unlikely(__pyx_t_6 >= __pyx_v_images.shape[0])) __pyx_t_7 = 0; - if (unlikely(__pyx_t_7 != -1)) { - __Pyx_RaiseBufferIndexError(__pyx_t_7); - {__pyx_filename = __pyx_f[11]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_AsString(__pyx_v_trainFilepath); if (unlikely((!__pyx_t_1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_trainFilepath_charstar = __pyx_t_1; + + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":15 + * def load( trainFilepath, float[:] images, int[:] labels, startN, numExamples ): + * cdef const char *trainFilepath_charstar = trainFilepath + * cDeepCL.GenericLoader.load(trainFilepath_charstar, &images[0], &labels[0], startN , numExamples) # <<<<<<<<<<<<<< + */ + __pyx_t_2 = 0; + __pyx_t_3 = -1; + if (__pyx_t_2 < 0) { + __pyx_t_2 += __pyx_v_images.shape[0]; + if (unlikely(__pyx_t_2 < 0)) __pyx_t_3 = 0; + } else if (unlikely(__pyx_t_2 >= __pyx_v_images.shape[0])) __pyx_t_3 = 0; + if (unlikely(__pyx_t_3 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_3); + {__pyx_filename = __pyx_f[10]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_8 = 0; - __pyx_t_7 = -1; - if (__pyx_t_8 < 0) { - __pyx_t_8 += __pyx_v_labels.shape[0]; - if (unlikely(__pyx_t_8 < 0)) __pyx_t_7 = 0; - } else if (unlikely(__pyx_t_8 >= __pyx_v_labels.shape[0])) __pyx_t_7 = 0; - if (unlikely(__pyx_t_7 != -1)) { - __Pyx_RaiseBufferIndexError(__pyx_t_7); - {__pyx_filename = __pyx_f[11]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = 0; + __pyx_t_3 = -1; + if (__pyx_t_4 < 0) { + __pyx_t_4 += __pyx_v_labels.shape[0]; + if (unlikely(__pyx_t_4 < 0)) __pyx_t_3 = 0; + } else if (unlikely(__pyx_t_4 >= __pyx_v_labels.shape[0])) __pyx_t_3 = 0; + if (unlikely(__pyx_t_3 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_3); + {__pyx_filename = __pyx_f[10]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_7 = __Pyx_PyInt_As_int(__pyx_v_startN); if (unlikely((__pyx_t_7 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_9 = __Pyx_PyInt_As_int(__pyx_v_numExamples); if (unlikely((__pyx_t_9 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + 
__pyx_t_3 = __Pyx_PyInt_As_int(__pyx_v_startN); if (unlikely((__pyx_t_3 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_5 = __Pyx_PyInt_As_int(__pyx_v_numExamples); if (unlikely((__pyx_t_5 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} try { - GenericLoader::load(__pyx_t_5, (&(*((float *) ( /* dim=0 */ (__pyx_v_images.data + __pyx_t_6 * __pyx_v_images.strides[0]) )))), (&(*((int *) ( /* dim=0 */ (__pyx_v_labels.data + __pyx_t_8 * __pyx_v_labels.strides[0]) )))), __pyx_t_7, __pyx_t_9); + GenericLoader::load(__pyx_v_trainFilepath_charstar, (&(*((float *) ( /* dim=0 */ (__pyx_v_images.data + __pyx_t_2 * __pyx_v_images.strides[0]) )))), (&(*((int *) ( /* dim=0 */ (__pyx_v_labels.data + __pyx_t_4 * __pyx_v_labels.strides[0]) )))), __pyx_t_3, __pyx_t_5); } catch(...) { __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[11]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[10]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "GenericLoader.pyx":10 + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":13 * return (N,planes,size) * @staticmethod * def load( trainFilepath, float[:] images, int[:] labels, startN, numExamples ): # <<<<<<<<<<<<<< - * cDeepCL.GenericLoader.load( toCppString(trainFilepath), &images[0], &labels[0], startN , numExamples ) - * + * cdef const char *trainFilepath_charstar = trainFilepath + * cDeepCL.GenericLoader.load(trainFilepath_charstar, &images[0], &labels[0], startN , numExamples) */ /* function exit code */ __pyx_r = Py_None; __Pyx_INCREF(Py_None); goto __pyx_L0; __pyx_L1_error:; - __Pyx_XDECREF(__pyx_t_1); - __Pyx_XDECREF(__pyx_t_2); - __Pyx_XDECREF(__pyx_t_3); - __Pyx_XDECREF(__pyx_t_4); __Pyx_AddTraceback("PyDeepCL.GenericLoader.load", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; @@ -12599,7 +13010,7 @@ static PyObject *__pyx_pf_8PyDeepCL_13GenericLoader_2load(PyObject *__pyx_v_trai return __pyx_r; } -/* "NetLearner.pyx":3 +/* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":3 * cdef class NetLearner: * cdef cDeepCL.CyNetLearner *thisptr * def __cinit__( self, SGD sgd, NeuralNet neuralnet, # <<<<<<<<<<<<<< @@ -12652,46 +13063,46 @@ static int __pyx_pw_8PyDeepCL_10NetLearner_1__cinit__(PyObject *__pyx_v_self, Py case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_neuralnet)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 1); {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 1); {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_Ntrain)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 2); {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 2); {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 3: if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_trainData)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 3); {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto 
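
GenericLoader.load() gets the same treatment: the whole toCppString() call sequence, including the PyMethod unpacking and the `std::string` temporary (`__pyx_t_5`), disappears from the generated code in favor of a single `__Pyx_PyObject_AsString()` conversion. The corresponding .pyx, again reconstructed from the quoted comments:

    @staticmethod
    def load(trainFilepath, float[:] images, int[:] labels, startN, numExamples):
        # same const-char* pattern as getDimensions() above
        cdef const char *trainFilepath_charstar = trainFilepath
        cDeepCL.GenericLoader.load(trainFilepath_charstar, &images[0], &labels[0], startN, numExamples)
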
__pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 3); {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 4: if (likely((values[4] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_trainLabels)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 4); {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 4); {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 5: if (likely((values[5] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_Ntest)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 5); {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 5); {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 6: if (likely((values[6] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_testData)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 6); {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 6); {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 7: if (likely((values[7] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_testLabels)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 7); {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 7); {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 8: if (likely((values[8] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_batchSize)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 8); {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, 8); {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 9) { goto __pyx_L5_argtuple_error; @@ -12709,23 +13120,23 @@ static int __pyx_pw_8PyDeepCL_10NetLearner_1__cinit__(PyObject *__pyx_v_self, Py __pyx_v_sgd = ((struct __pyx_obj_8PyDeepCL_SGD *)values[0]); __pyx_v_neuralnet = ((struct __pyx_obj_8PyDeepCL_NeuralNet *)values[1]); __pyx_v_Ntrain = values[2]; - __pyx_v_trainData = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[3]); if (unlikely(!__pyx_v_trainData.memview)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_trainLabels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(values[4]); if (unlikely(!__pyx_v_trainLabels.memview)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_trainData = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[3]); if 
(unlikely(!__pyx_v_trainData.memview)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_trainLabels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(values[4]); if (unlikely(!__pyx_v_trainLabels.memview)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_v_Ntest = values[5]; - __pyx_v_testData = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[6]); if (unlikely(!__pyx_v_testData.memview)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 5; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_testLabels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(values[7]); if (unlikely(!__pyx_v_testLabels.memview)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 5; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_testData = __Pyx_PyObject_to_MemoryviewSlice_ds_float(values[6]); if (unlikely(!__pyx_v_testData.memview)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 5; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_testLabels = __Pyx_PyObject_to_MemoryviewSlice_ds_int(values[7]); if (unlikely(!__pyx_v_testLabels.memview)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 5; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_v_batchSize = values[8]; } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 9, 9, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.NetLearner.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return -1; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_sgd), __pyx_ptype_8PyDeepCL_SGD, 1, "sgd", 0))) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_neuralnet), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "neuralnet", 0))) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_sgd), __pyx_ptype_8PyDeepCL_SGD, 1, "sgd", 0))) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_neuralnet), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "neuralnet", 0))) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = __pyx_pf_8PyDeepCL_10NetLearner___cinit__(((struct __pyx_obj_8PyDeepCL_NetLearner *)__pyx_v_self), __pyx_v_sgd, __pyx_v_neuralnet, __pyx_v_Ntrain, __pyx_v_trainData, __pyx_v_trainLabels, __pyx_v_Ntest, __pyx_v_testData, __pyx_v_testLabels, __pyx_v_batchSize); /* function exit code */ @@ -12753,14 +13164,14 @@ static int __pyx_pf_8PyDeepCL_10NetLearner___cinit__(struct __pyx_obj_8PyDeepCL_ int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "NetLearner.pyx":9 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":9 * self.thisptr = new cDeepCL.CyNetLearner( * sgd.thisptr, neuralnet.thisptr, * Ntrain, &trainData[0], &trainLabels[0], # <<<<<<<<<<<<<< * Ntest, &testData[0], &testLabels[0], * batchSize ) */ - __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v_Ntrain); if (unlikely((__pyx_t_1 
== (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v_Ntrain); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_t_2 = 0; __pyx_t_3 = -1; if (__pyx_t_2 < 0) { @@ -12769,7 +13180,7 @@ static int __pyx_pf_8PyDeepCL_10NetLearner___cinit__(struct __pyx_obj_8PyDeepCL_ } else if (unlikely(__pyx_t_2 >= __pyx_v_trainData.shape[0])) __pyx_t_3 = 0; if (unlikely(__pyx_t_3 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_3); - {__pyx_filename = __pyx_f[12]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[11]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_t_4 = 0; __pyx_t_3 = -1; @@ -12779,17 +13190,17 @@ static int __pyx_pf_8PyDeepCL_10NetLearner___cinit__(struct __pyx_obj_8PyDeepCL_ } else if (unlikely(__pyx_t_4 >= __pyx_v_trainLabels.shape[0])) __pyx_t_3 = 0; if (unlikely(__pyx_t_3 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_3); - {__pyx_filename = __pyx_f[12]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[11]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "NetLearner.pyx":10 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":10 * sgd.thisptr, neuralnet.thisptr, * Ntrain, &trainData[0], &trainLabels[0], * Ntest, &testData[0], &testLabels[0], # <<<<<<<<<<<<<< * batchSize ) - * def __dealloc(self): + * def __dealloc__(self): */ - __pyx_t_3 = __Pyx_PyInt_As_int(__pyx_v_Ntest); if (unlikely((__pyx_t_3 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyInt_As_int(__pyx_v_Ntest); if (unlikely((__pyx_t_3 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_t_5 = 0; __pyx_t_6 = -1; if (__pyx_t_5 < 0) { @@ -12798,7 +13209,7 @@ static int __pyx_pf_8PyDeepCL_10NetLearner___cinit__(struct __pyx_obj_8PyDeepCL_ } else if (unlikely(__pyx_t_5 >= __pyx_v_testData.shape[0])) __pyx_t_6 = 0; if (unlikely(__pyx_t_6 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_6); - {__pyx_filename = __pyx_f[12]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_t_7 = 0; __pyx_t_6 = -1; @@ -12808,19 +13219,19 @@ static int __pyx_pf_8PyDeepCL_10NetLearner___cinit__(struct __pyx_obj_8PyDeepCL_ } else if (unlikely(__pyx_t_7 >= __pyx_v_testLabels.shape[0])) __pyx_t_6 = 0; if (unlikely(__pyx_t_6 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_6); - {__pyx_filename = __pyx_f[12]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "NetLearner.pyx":11 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":11 * Ntrain, &trainData[0], &trainLabels[0], * Ntest, &testData[0], &testLabels[0], * batchSize ) # <<<<<<<<<<<<<< - * def __dealloc(self): + * def __dealloc__(self): * del self.thisptr */ - __pyx_t_6 = __Pyx_PyInt_As_int(__pyx_v_batchSize); if (unlikely((__pyx_t_6 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = 
__Pyx_PyInt_As_int(__pyx_v_batchSize); if (unlikely((__pyx_t_6 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "NetLearner.pyx":7 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":7 * Ntest, float[:] testData, int[:] testLabels, * batchSize ): * self.thisptr = new cDeepCL.CyNetLearner( # <<<<<<<<<<<<<< @@ -12831,11 +13242,11 @@ static int __pyx_pf_8PyDeepCL_10NetLearner___cinit__(struct __pyx_obj_8PyDeepCL_ __pyx_t_8 = new CyNetLearner(__pyx_v_sgd->thisptr, __pyx_v_neuralnet->thisptr, __pyx_t_1, (&(*((float *) ( /* dim=0 */ (__pyx_v_trainData.data + __pyx_t_2 * __pyx_v_trainData.strides[0]) )))), (&(*((int *) ( /* dim=0 */ (__pyx_v_trainLabels.data + __pyx_t_4 * __pyx_v_trainLabels.strides[0]) )))), __pyx_t_3, (&(*((float *) ( /* dim=0 */ (__pyx_v_testData.data + __pyx_t_5 * __pyx_v_testData.strides[0]) )))), (&(*((int *) ( /* dim=0 */ (__pyx_v_testLabels.data + __pyx_t_7 * __pyx_v_testLabels.strides[0]) )))), __pyx_t_6); } catch(...) { __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[12]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[11]; __pyx_lineno = 7; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_v_self->thisptr = __pyx_t_8; - /* "NetLearner.pyx":3 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":3 * cdef class NetLearner: * cdef cDeepCL.CyNetLearner *thisptr * def __cinit__( self, SGD sgd, NeuralNet neuralnet, # <<<<<<<<<<<<<< @@ -12858,57 +13269,51 @@ static int __pyx_pf_8PyDeepCL_10NetLearner___cinit__(struct __pyx_obj_8PyDeepCL_ return __pyx_r; } -/* "NetLearner.pyx":12 +/* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":12 * Ntest, &testData[0], &testLabels[0], * batchSize ) - * def __dealloc(self): # <<<<<<<<<<<<<< + * def __dealloc__(self): # <<<<<<<<<<<<<< * del self.thisptr * # def setTrainingData( self, Ntrain, float[:] trainData, int[:] trainLabels ): */ /* Python wrapper */ -static PyObject *__pyx_pw_8PyDeepCL_10NetLearner_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ -static PyObject *__pyx_pw_8PyDeepCL_10NetLearner_3__dealloc(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { - PyObject *__pyx_r = 0; +static void __pyx_pw_8PyDeepCL_10NetLearner_3__dealloc__(PyObject *__pyx_v_self); /*proto*/ +static void __pyx_pw_8PyDeepCL_10NetLearner_3__dealloc__(PyObject *__pyx_v_self) { __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("__dealloc (wrapper)", 0); - __pyx_r = __pyx_pf_8PyDeepCL_10NetLearner_2__dealloc(((struct __pyx_obj_8PyDeepCL_NetLearner *)__pyx_v_self)); + __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0); + __pyx_pf_8PyDeepCL_10NetLearner_2__dealloc__(((struct __pyx_obj_8PyDeepCL_NetLearner *)__pyx_v_self)); /* function exit code */ __Pyx_RefNannyFinishContext(); - return __pyx_r; } -static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_2__dealloc(struct __pyx_obj_8PyDeepCL_NetLearner *__pyx_v_self) { - PyObject *__pyx_r = NULL; +static void __pyx_pf_8PyDeepCL_10NetLearner_2__dealloc__(struct __pyx_obj_8PyDeepCL_NetLearner *__pyx_v_self) { __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("__dealloc", 0); + __Pyx_RefNannySetupContext("__dealloc__", 0); - /* "NetLearner.pyx":13 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":13 * batchSize ) - * def __dealloc(self): + * def __dealloc__(self): * del self.thisptr # <<<<<<<<<<<<<< * # def setTrainingData( self, Ntrain, float[:] trainData, int[:] 
trainLabels ): * # self.thisptr.setTrainingData( Ntrain, &trainData[0], &trainLabels[0] ) */ delete __pyx_v_self->thisptr; - /* "NetLearner.pyx":12 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":12 * Ntest, &testData[0], &testLabels[0], * batchSize ) - * def __dealloc(self): # <<<<<<<<<<<<<< + * def __dealloc__(self): # <<<<<<<<<<<<<< * del self.thisptr * # def setTrainingData( self, Ntrain, float[:] trainData, int[:] trainLabels ): */ /* function exit code */ - __pyx_r = Py_None; __Pyx_INCREF(Py_None); - __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); - return __pyx_r; } -/* "NetLearner.pyx":18 +/* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":18 * # def setTestingData( self, Ntest, float[:] testData, int[:] testLabels ): * # self.thisptr.setTestingData( Ntest, &testData[0], &testLabels[0] ) * def setSchedule( self, numEpochs ): # <<<<<<<<<<<<<< @@ -12938,22 +13343,22 @@ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_4setSchedule(struct __pyx_obj_8 int __pyx_clineno = 0; __Pyx_RefNannySetupContext("setSchedule", 0); - /* "NetLearner.pyx":19 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":19 * # self.thisptr.setTestingData( Ntest, &testData[0], &testLabels[0] ) * def setSchedule( self, numEpochs ): * self.thisptr.setSchedule( numEpochs ) # <<<<<<<<<<<<<< * def setDumpTimings( self, bint dumpTimings ): * self.thisptr.setDumpTimings( dumpTimings ) */ - __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v_numEpochs); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyInt_As_int(__pyx_v_numEpochs); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;} try { __pyx_v_self->thisptr->setSchedule(__pyx_t_1); } catch(...) 
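
The NetLearner hunks above fix what looks like a genuine leak: the deallocator had been spelled `__dealloc(self)`, which Cython treats as an ordinary Python method rather than the special `__dealloc__` hook, so `del self.thisptr` never ran automatically and the underlying C++ CyNetLearner outlived its wrapper. With the rename, the generated wrapper changes from a `PyObject *`-returning method into a `void` function invoked from tp_dealloc, exactly as the diff shows. The fixed source, per the quoted comments:

    def __dealloc__(self):
        # now runs automatically when the Python object is collected;
        # previously this was a plain method named __dealloc that nothing ever called
        del self.thisptr
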
{ __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[12]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[11]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "NetLearner.pyx":18 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":18 * # def setTestingData( self, Ntest, float[:] testData, int[:] testLabels ): * # self.thisptr.setTestingData( Ntest, &testData[0], &testLabels[0] ) * def setSchedule( self, numEpochs ): # <<<<<<<<<<<<<< @@ -12973,7 +13378,7 @@ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_4setSchedule(struct __pyx_obj_8 return __pyx_r; } -/* "NetLearner.pyx":20 +/* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":20 * def setSchedule( self, numEpochs ): * self.thisptr.setSchedule( numEpochs ) * def setDumpTimings( self, bint dumpTimings ): # <<<<<<<<<<<<<< @@ -12992,7 +13397,7 @@ static PyObject *__pyx_pw_8PyDeepCL_10NetLearner_7setDumpTimings(PyObject *__pyx __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setDumpTimings (wrapper)", 0); assert(__pyx_arg_dumpTimings); { - __pyx_v_dumpTimings = __Pyx_PyObject_IsTrue(__pyx_arg_dumpTimings); if (unlikely((__pyx_v_dumpTimings == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_dumpTimings = __Pyx_PyObject_IsTrue(__pyx_arg_dumpTimings); if (unlikely((__pyx_v_dumpTimings == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -13015,7 +13420,7 @@ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_6setDumpTimings(struct __pyx_ob int __pyx_clineno = 0; __Pyx_RefNannySetupContext("setDumpTimings", 0); - /* "NetLearner.pyx":21 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":21 * self.thisptr.setSchedule( numEpochs ) * def setDumpTimings( self, bint dumpTimings ): * self.thisptr.setDumpTimings( dumpTimings ) # <<<<<<<<<<<<<< @@ -13026,10 +13431,10 @@ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_6setDumpTimings(struct __pyx_ob __pyx_v_self->thisptr->setDumpTimings(__pyx_v_dumpTimings); } catch(...) 
{ __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[12]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[11]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "NetLearner.pyx":20 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":20 * def setSchedule( self, numEpochs ): * self.thisptr.setSchedule( numEpochs ) * def setDumpTimings( self, bint dumpTimings ): # <<<<<<<<<<<<<< @@ -13049,7 +13454,7 @@ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_6setDumpTimings(struct __pyx_ob return __pyx_r; } -/* "NetLearner.pyx":24 +/* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":24 * # def setBatchSize( self, batchSize ): * # self.thisptr.setBatchSize( batchSize ) * def _run(self): # <<<<<<<<<<<<<< @@ -13075,7 +13480,7 @@ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_8_run(struct __pyx_obj_8PyDeepC __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("_run", 0); - /* "NetLearner.pyx":25 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":25 * # self.thisptr.setBatchSize( batchSize ) * def _run(self): * with nogil: # <<<<<<<<<<<<<< @@ -13089,7 +13494,7 @@ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_8_run(struct __pyx_obj_8PyDeepC #endif /*try:*/ { - /* "NetLearner.pyx":26 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":26 * def _run(self): * with nogil: * self.thisptr.run() # <<<<<<<<<<<<<< @@ -13099,7 +13504,7 @@ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_8_run(struct __pyx_obj_8PyDeepC __pyx_v_self->thisptr->run(); } - /* "NetLearner.pyx":25 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":25 * # self.thisptr.setBatchSize( batchSize ) * def _run(self): * with nogil: # <<<<<<<<<<<<<< @@ -13117,7 +13522,7 @@ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_8_run(struct __pyx_obj_8PyDeepC } } - /* "NetLearner.pyx":24 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":24 * # def setBatchSize( self, batchSize ): * # self.thisptr.setBatchSize( batchSize ) * def _run(self): # <<<<<<<<<<<<<< @@ -13132,7 +13537,7 @@ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_8_run(struct __pyx_obj_8PyDeepC return __pyx_r; } -/* "NetLearner.pyx":27 +/* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":27 * with nogil: * self.thisptr.run() * def run(self): # <<<<<<<<<<<<<< @@ -13168,18 +13573,18 @@ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_10run(struct __pyx_obj_8PyDeepC int __pyx_clineno = 0; __Pyx_RefNannySetupContext("run", 0); - /* "NetLearner.pyx":28 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":28 * self.thisptr.run() * def run(self): * interruptableCall( self._run, [] ) # <<<<<<<<<<<<<< * ## with nogil: * ## thisptr._learn( learningRate ) */ - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_interruptableCall); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_interruptableCall); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_run); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_run); if (unlikely(!__pyx_t_3)) {__pyx_filename = 
__pyx_f[11]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); - __pyx_t_4 = PyList_New(0); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PyList_New(0); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); __pyx_t_5 = NULL; __pyx_t_6 = 0; @@ -13193,7 +13598,7 @@ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_10run(struct __pyx_obj_8PyDeepC __pyx_t_6 = 1; } } - __pyx_t_7 = PyTuple_New(2+__pyx_t_6); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_7 = PyTuple_New(2+__pyx_t_6); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_7); if (__pyx_t_5) { PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_5); __Pyx_GIVEREF(__pyx_t_5); __pyx_t_5 = NULL; @@ -13204,42 +13609,13 @@ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_10run(struct __pyx_obj_8PyDeepC __Pyx_GIVEREF(__pyx_t_4); __pyx_t_3 = 0; __pyx_t_4 = 0; - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_7, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_7, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "NetLearner.pyx":31 - * ## with nogil: - * ## thisptr._learn( learningRate ) - * checkException() # <<<<<<<<<<<<<< - * - * - */ - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_checkException); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_2); - __pyx_t_7 = NULL; - if (CYTHON_COMPILING_IN_CPYTHON && unlikely(PyMethod_Check(__pyx_t_2))) { - __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_2); - if (likely(__pyx_t_7)) { - PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); - __Pyx_INCREF(__pyx_t_7); - __Pyx_INCREF(function); - __Pyx_DECREF_SET(__pyx_t_2, function); - } - } - if (__pyx_t_7) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_7); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; - } else { - __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - } - __Pyx_GOTREF(__pyx_t_1); - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - - /* "NetLearner.pyx":27 + /* "../../../../../../home/user/git/DeepCL/python/NetLearner.pyx":27 * with nogil: * self.thisptr.run() * def run(self): # <<<<<<<<<<<<<< @@ -13265,12 +13641,12 @@ static PyObject *__pyx_pf_8PyDeepCL_10NetLearner_10run(struct __pyx_obj_8PyDeepC return __pyx_r; } -/* "NetDefToNet.pyx":3 +/* "../../../../../../home/user/git/DeepCL/python/NetDefToNet.pyx":3 * cdef class NetdefToNet: * @staticmethod * def createNetFromNetdef( NeuralNet neuralnet, netdef ): # <<<<<<<<<<<<<< - * return cDeepCL.NetdefToNet.createNetFromNetdef( neuralnet.thisptr, toCppString( netdef ) 
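
In NetLearner.run(), the block that looked up and called the module-level checkException() helper after interruptableCall() is deleted wholesale from the generated code, matching the removal of that line from NetLearner.pyx; presumably error signalling now happens across the C++ boundary itself rather than via a polled flag. What run() reduces to, per the quoted comments:

    def run(self):
        interruptableCall(self._run, [])
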
) - * + * cdef const char *netdef_charstar = netdef + * return cDeepCL.NetdefToNet.createNetFromNetdefCharStar(neuralnet.thisptr, netdef_charstar) */ /* Python wrapper */ @@ -13305,11 +13681,11 @@ static PyObject *__pyx_pw_8PyDeepCL_11NetdefToNet_1createNetFromNetdef(CYTHON_UN case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_netdef)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("createNetFromNetdef", 1, 2, 2, 1); {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("createNetFromNetdef", 1, 2, 2, 1); {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "createNetFromNetdef") < 0)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "createNetFromNetdef") < 0)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 2) { goto __pyx_L5_argtuple_error; @@ -13322,13 +13698,13 @@ static PyObject *__pyx_pw_8PyDeepCL_11NetdefToNet_1createNetFromNetdef(CYTHON_UN } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("createNetFromNetdef", 1, 2, 2, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("createNetFromNetdef", 1, 2, 2, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.NetdefToNet.createNetFromNetdef", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_neuralnet), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "neuralnet", 0))) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_neuralnet), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "neuralnet", 0))) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = __pyx_pf_8PyDeepCL_11NetdefToNet_createNetFromNetdef(__pyx_v_neuralnet, __pyx_v_netdef); /* function exit code */ @@ -13341,82 +13717,55 @@ static PyObject *__pyx_pw_8PyDeepCL_11NetdefToNet_1createNetFromNetdef(CYTHON_UN } static PyObject *__pyx_pf_8PyDeepCL_11NetdefToNet_createNetFromNetdef(struct __pyx_obj_8PyDeepCL_NeuralNet *__pyx_v_neuralnet, PyObject *__pyx_v_netdef) { + char const *__pyx_v_netdef_charstar; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations - PyObject *__pyx_t_1 = NULL; - PyObject *__pyx_t_2 = NULL; + char const *__pyx_t_1; + bool __pyx_t_2; PyObject *__pyx_t_3 = NULL; - PyObject *__pyx_t_4 = NULL; - std::string __pyx_t_5; - bool __pyx_t_6; int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; __Pyx_RefNannySetupContext("createNetFromNetdef", 0); - /* "NetDefToNet.pyx":4 + /* "../../../../../../home/user/git/DeepCL/python/NetDefToNet.pyx":4 * @staticmethod * def createNetFromNetdef( NeuralNet neuralnet, netdef ): - * return cDeepCL.NetdefToNet.createNetFromNetdef( neuralnet.thisptr, toCppString( netdef ) ) # <<<<<<<<<<<<<< - * 
- * + * cdef const char *netdef_charstar = netdef # <<<<<<<<<<<<<< + * return cDeepCL.NetdefToNet.createNetFromNetdefCharStar(neuralnet.thisptr, netdef_charstar) + */ + __pyx_t_1 = __Pyx_PyObject_AsString(__pyx_v_netdef); if (unlikely((!__pyx_t_1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_netdef_charstar = __pyx_t_1; + + /* "../../../../../../home/user/git/DeepCL/python/NetDefToNet.pyx":5 + * def createNetFromNetdef( NeuralNet neuralnet, netdef ): + * cdef const char *netdef_charstar = netdef + * return cDeepCL.NetdefToNet.createNetFromNetdefCharStar(neuralnet.thisptr, netdef_charstar) # <<<<<<<<<<<<<< */ __Pyx_XDECREF(__pyx_r); - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_toCppString); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = NULL; - if (CYTHON_COMPILING_IN_CPYTHON && unlikely(PyMethod_Check(__pyx_t_2))) { - __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); - if (likely(__pyx_t_3)) { - PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); - __Pyx_INCREF(__pyx_t_3); - __Pyx_INCREF(function); - __Pyx_DECREF_SET(__pyx_t_2, function); - } - } - if (!__pyx_t_3) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_v_netdef); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_1); - } else { - __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_4); - PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __Pyx_GIVEREF(__pyx_t_3); __pyx_t_3 = NULL; - __Pyx_INCREF(__pyx_v_netdef); - PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_netdef); - __Pyx_GIVEREF(__pyx_v_netdef); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_1); - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - } - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_5 = __pyx_convert_string_from_py_std__in_string(__pyx_t_1); if (unlikely(PyErr_Occurred())) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; try { - __pyx_t_6 = NetdefToNet::createNetFromNetdef(__pyx_v_neuralnet->thisptr, __pyx_t_5); + __pyx_t_2 = NetdefToNet::createNetFromNetdefCharStar(__pyx_v_neuralnet->thisptr, __pyx_v_netdef_charstar); } catch(...) 
{ __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[13]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[12]; __pyx_lineno = 5; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_1 = __Pyx_PyBool_FromLong(__pyx_t_6); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_1); - __pyx_r = __pyx_t_1; - __pyx_t_1 = 0; + __pyx_t_3 = __Pyx_PyBool_FromLong(__pyx_t_2); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 5; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_3); + __pyx_r = __pyx_t_3; + __pyx_t_3 = 0; goto __pyx_L0; - /* "NetDefToNet.pyx":3 + /* "../../../../../../home/user/git/DeepCL/python/NetDefToNet.pyx":3 * cdef class NetdefToNet: * @staticmethod * def createNetFromNetdef( NeuralNet neuralnet, netdef ): # <<<<<<<<<<<<<< - * return cDeepCL.NetdefToNet.createNetFromNetdef( neuralnet.thisptr, toCppString( netdef ) ) - * + * cdef const char *netdef_charstar = netdef + * return cDeepCL.NetdefToNet.createNetFromNetdefCharStar(neuralnet.thisptr, netdef_charstar) */ /* function exit code */ __pyx_L1_error:; - __Pyx_XDECREF(__pyx_t_1); - __Pyx_XDECREF(__pyx_t_2); __Pyx_XDECREF(__pyx_t_3); - __Pyx_XDECREF(__pyx_t_4); __Pyx_AddTraceback("PyDeepCL.NetdefToNet.createNetFromNetdef", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; @@ -13458,16 +13807,16 @@ static int __pyx_pw_8PyDeepCL_8QLearner_1__cinit__(PyObject *__pyx_v_self, PyObj case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_scenario)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 3, 3, 1); {__pyx_filename = __pyx_f[14]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 3, 3, 1); {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_net)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 3, 3, 2); {__pyx_filename = __pyx_f[14]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 3, 3, 2); {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { goto __pyx_L5_argtuple_error; @@ -13482,15 +13831,15 @@ static int __pyx_pw_8PyDeepCL_8QLearner_1__cinit__(PyObject *__pyx_v_self, PyObj } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[14]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.QLearner.__cinit__", __pyx_clineno, __pyx_lineno, 
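
NetdefToNet.createNetFromNetdef() above follows the same const-char* pattern, but rather than reusing an overload it targets a separately named C++ entry point, createNetFromNetdefCharStar. Reconstructed .pyx, same caveats as the sketches above:

    cdef class NetdefToNet:
        @staticmethod
        def createNetFromNetdef(NeuralNet neuralnet, netdef):
            cdef const char *netdef_charstar = netdef
            return cDeepCL.NetdefToNet.createNetFromNetdefCharStar(neuralnet.thisptr, netdef_charstar)
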
__pyx_filename); __Pyx_RefNannyFinishContext(); return -1; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_sgd), __pyx_ptype_8PyDeepCL_SGD, 1, "sgd", 0))) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_scenario), __pyx_ptype_8PyDeepCL_Scenario, 1, "scenario", 0))) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_sgd), __pyx_ptype_8PyDeepCL_SGD, 1, "sgd", 0))) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_scenario), __pyx_ptype_8PyDeepCL_Scenario, 1, "scenario", 0))) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_net), __pyx_ptype_8PyDeepCL_NeuralNet, 1, "net", 0))) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = __pyx_pf_8PyDeepCL_8QLearner___cinit__(((struct __pyx_obj_8PyDeepCL_QLearner *)__pyx_v_self), __pyx_v_sgd, __pyx_v_scenario, __pyx_v_net); /* function exit code */ @@ -13511,16 +13860,16 @@ static int __pyx_pf_8PyDeepCL_8QLearner___cinit__(struct __pyx_obj_8PyDeepCL_QLe int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__cinit__", 0); - /* "QLearning.pyx":4 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":4 * cdef cDeepCL.QLearner *thisptr * def __cinit__(self,SGD sgd, Scenario scenario,NeuralNet net): * scenario.net = net # <<<<<<<<<<<<<< * self.thisptr = new cDeepCL.QLearner( * sgd.thisptr, scenario.thisptr, net.thisptr) */ - if (__Pyx_PyObject_SetAttrStr(((PyObject *)__pyx_v_scenario), __pyx_n_s_net, ((PyObject *)__pyx_v_net)) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (__Pyx_PyObject_SetAttrStr(((PyObject *)__pyx_v_scenario), __pyx_n_s_net, ((PyObject *)__pyx_v_net)) < 0) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "QLearning.pyx":5 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":5 * def __cinit__(self,SGD sgd, Scenario scenario,NeuralNet net): * scenario.net = net * self.thisptr = new cDeepCL.QLearner( # <<<<<<<<<<<<<< @@ -13531,11 +13880,11 @@ static int __pyx_pf_8PyDeepCL_8QLearner___cinit__(struct __pyx_obj_8PyDeepCL_QLe __pyx_t_1 = new QLearner(__pyx_v_sgd->thisptr, __pyx_v_scenario->thisptr, __pyx_v_net->thisptr); } catch(...) 
{ __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[14]; __pyx_lineno = 5; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[13]; __pyx_lineno = 5; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_v_self->thisptr = __pyx_t_1; - /* "QLearning.pyx":3 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":3 * cdef class QLearner: * cdef cDeepCL.QLearner *thisptr * def __cinit__(self,SGD sgd, Scenario scenario,NeuralNet net): # <<<<<<<<<<<<<< @@ -13554,7 +13903,7 @@ static int __pyx_pf_8PyDeepCL_8QLearner___cinit__(struct __pyx_obj_8PyDeepCL_QLe return __pyx_r; } -/* "QLearning.pyx":7 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":7 * self.thisptr = new cDeepCL.QLearner( * sgd.thisptr, scenario.thisptr, net.thisptr) * def __dealloc__(self): # <<<<<<<<<<<<<< @@ -13577,7 +13926,7 @@ static void __pyx_pf_8PyDeepCL_8QLearner_2__dealloc__(struct __pyx_obj_8PyDeepCL __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__dealloc__", 0); - /* "QLearning.pyx":8 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":8 * sgd.thisptr, scenario.thisptr, net.thisptr) * def __dealloc__(self): * del self.thisptr # <<<<<<<<<<<<<< @@ -13586,7 +13935,7 @@ static void __pyx_pf_8PyDeepCL_8QLearner_2__dealloc__(struct __pyx_obj_8PyDeepCL */ delete __pyx_v_self->thisptr; - /* "QLearning.pyx":7 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":7 * self.thisptr = new cDeepCL.QLearner( * sgd.thisptr, scenario.thisptr, net.thisptr) * def __dealloc__(self): # <<<<<<<<<<<<<< @@ -13598,7 +13947,7 @@ static void __pyx_pf_8PyDeepCL_8QLearner_2__dealloc__(struct __pyx_obj_8PyDeepCL __Pyx_RefNannyFinishContext(); } -/* "QLearning.pyx":9 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":9 * def __dealloc__(self): * del self.thisptr * def _run( self ): # <<<<<<<<<<<<<< @@ -13627,7 +13976,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_4_run(struct __pyx_obj_8PyDeepCL_Q int __pyx_clineno = 0; __Pyx_RefNannySetupContext("_run", 0); - /* "QLearning.pyx":10 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":10 * del self.thisptr * def _run( self ): * self.thisptr.run() # <<<<<<<<<<<<<< @@ -13638,10 +13987,10 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_4_run(struct __pyx_obj_8PyDeepCL_Q __pyx_v_self->thisptr->run(); } catch(...) 
{ __Pyx_CppExn2PyErr(); - {__pyx_filename = __pyx_f[14]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[13]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "QLearning.pyx":9 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":9 * def __dealloc__(self): * del self.thisptr * def _run( self ): # <<<<<<<<<<<<<< @@ -13661,7 +14010,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_4_run(struct __pyx_obj_8PyDeepCL_Q return __pyx_r; } -/* "QLearning.pyx":11 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":11 * def _run( self ): * self.thisptr.run() * def run( self ): # <<<<<<<<<<<<<< @@ -13697,18 +14046,18 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_6run(struct __pyx_obj_8PyDeepCL_QL int __pyx_clineno = 0; __Pyx_RefNannySetupContext("run", 0); - /* "QLearning.pyx":12 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":12 * self.thisptr.run() * def run( self ): * interruptableCall( self._run, [] ) # <<<<<<<<<<<<<< * def setLambda( self, float thislambda ): * self.thisptr.setLambda( thislambda ) */ - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_interruptableCall); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_interruptableCall); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_run); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_run); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); - __pyx_t_4 = PyList_New(0); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PyList_New(0); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); __pyx_t_5 = NULL; __pyx_t_6 = 0; @@ -13722,7 +14071,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_6run(struct __pyx_obj_8PyDeepCL_QL __pyx_t_6 = 1; } } - __pyx_t_7 = PyTuple_New(2+__pyx_t_6); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_7 = PyTuple_New(2+__pyx_t_6); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_7); if (__pyx_t_5) { PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_5); __Pyx_GIVEREF(__pyx_t_5); __pyx_t_5 = NULL; @@ -13733,13 +14082,13 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_6run(struct __pyx_obj_8PyDeepCL_QL __Pyx_GIVEREF(__pyx_t_4); __pyx_t_3 = 0; __pyx_t_4 = 0; - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_7, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_7, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; 
__Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "QLearning.pyx":11 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":11 * def _run( self ): * self.thisptr.run() * def run( self ): # <<<<<<<<<<<<<< @@ -13765,7 +14114,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_6run(struct __pyx_obj_8PyDeepCL_QL return __pyx_r; } -/* "QLearning.pyx":13 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":13 * def run( self ): * interruptableCall( self._run, [] ) * def setLambda( self, float thislambda ): # <<<<<<<<<<<<<< @@ -13784,7 +14133,7 @@ static PyObject *__pyx_pw_8PyDeepCL_8QLearner_9setLambda(PyObject *__pyx_v_self, __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setLambda (wrapper)", 0); assert(__pyx_arg_thislambda); { - __pyx_v_thislambda = __pyx_PyFloat_AsFloat(__pyx_arg_thislambda); if (unlikely((__pyx_v_thislambda == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_thislambda = __pyx_PyFloat_AsFloat(__pyx_arg_thislambda); if (unlikely((__pyx_v_thislambda == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -13804,7 +14153,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_8setLambda(struct __pyx_obj_8PyDee __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setLambda", 0); - /* "QLearning.pyx":14 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":14 * interruptableCall( self._run, [] ) * def setLambda( self, float thislambda ): * self.thisptr.setLambda( thislambda ) # <<<<<<<<<<<<<< @@ -13813,7 +14162,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_8setLambda(struct __pyx_obj_8PyDee */ __pyx_v_self->thisptr->setLambda(__pyx_v_thislambda); - /* "QLearning.pyx":13 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":13 * def run( self ): * interruptableCall( self._run, [] ) * def setLambda( self, float thislambda ): # <<<<<<<<<<<<<< @@ -13828,7 +14177,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_8setLambda(struct __pyx_obj_8PyDee return __pyx_r; } -/* "QLearning.pyx":15 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":15 * def setLambda( self, float thislambda ): * self.thisptr.setLambda( thislambda ) * def setMaxSamples( self, int maxSamples ): # <<<<<<<<<<<<<< @@ -13847,7 +14196,7 @@ static PyObject *__pyx_pw_8PyDeepCL_8QLearner_11setMaxSamples(PyObject *__pyx_v_ __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setMaxSamples (wrapper)", 0); assert(__pyx_arg_maxSamples); { - __pyx_v_maxSamples = __Pyx_PyInt_As_int(__pyx_arg_maxSamples); if (unlikely((__pyx_v_maxSamples == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_maxSamples = __Pyx_PyInt_As_int(__pyx_arg_maxSamples); if (unlikely((__pyx_v_maxSamples == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -13867,7 +14216,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_10setMaxSamples(struct __pyx_obj_8 __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setMaxSamples", 0); - /* "QLearning.pyx":16 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":16 * self.thisptr.setLambda( thislambda ) * def setMaxSamples( self, int 
maxSamples ): * self.thisptr.setMaxSamples( maxSamples ) # <<<<<<<<<<<<<< @@ -13876,7 +14225,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_10setMaxSamples(struct __pyx_obj_8 */ __pyx_v_self->thisptr->setMaxSamples(__pyx_v_maxSamples); - /* "QLearning.pyx":15 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":15 * def setLambda( self, float thislambda ): * self.thisptr.setLambda( thislambda ) * def setMaxSamples( self, int maxSamples ): # <<<<<<<<<<<<<< @@ -13891,7 +14240,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_10setMaxSamples(struct __pyx_obj_8 return __pyx_r; } -/* "QLearning.pyx":17 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":17 * def setMaxSamples( self, int maxSamples ): * self.thisptr.setMaxSamples( maxSamples ) * def setEpsilon( self, float epsilon ): # <<<<<<<<<<<<<< @@ -13910,7 +14259,7 @@ static PyObject *__pyx_pw_8PyDeepCL_8QLearner_13setEpsilon(PyObject *__pyx_v_sel __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setEpsilon (wrapper)", 0); assert(__pyx_arg_epsilon); { - __pyx_v_epsilon = __pyx_PyFloat_AsFloat(__pyx_arg_epsilon); if (unlikely((__pyx_v_epsilon == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_epsilon = __pyx_PyFloat_AsFloat(__pyx_arg_epsilon); if (unlikely((__pyx_v_epsilon == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -13930,7 +14279,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_12setEpsilon(struct __pyx_obj_8PyD __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("setEpsilon", 0); - /* "QLearning.pyx":18 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":18 * self.thisptr.setMaxSamples( maxSamples ) * def setEpsilon( self, float epsilon ): * self.thisptr.setEpsilon( epsilon ) # <<<<<<<<<<<<<< @@ -13939,7 +14288,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_12setEpsilon(struct __pyx_obj_8PyD */ __pyx_v_self->thisptr->setEpsilon(__pyx_v_epsilon); - /* "QLearning.pyx":17 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":17 * def setMaxSamples( self, int maxSamples ): * self.thisptr.setMaxSamples( maxSamples ) * def setEpsilon( self, float epsilon ): # <<<<<<<<<<<<<< @@ -13954,7 +14303,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8QLearner_12setEpsilon(struct __pyx_obj_8PyD return __pyx_r; } -/* "QLearning.pyx":31 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":31 * # scenario.showQ(scenario.net) * * cdef void Scenario_getPerception( float *perception, void *pyObject ): # <<<<<<<<<<<<<< @@ -13977,14 +14326,14 @@ static void __pyx_f_8PyDeepCL_Scenario_getPerception(float *__pyx_v_perception, int __pyx_clineno = 0; __Pyx_RefNannySetupContext("Scenario_getPerception", 0); - /* "QLearning.pyx":32 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":32 * * cdef void Scenario_getPerception( float *perception, void *pyObject ): * pyPerception = (pyObject).getPerception() # <<<<<<<<<<<<<< * for i in range(len(pyPerception)): * perception[i] = pyPerception[i] */ - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_pyObject), __pyx_n_s_getPerception); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_pyObject), __pyx_n_s_getPerception); if (unlikely(!__pyx_t_2)) 
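Review note: the QLearner wrapper regenerated above is unchanged at the .pyx level apart from the banner paths and the file-table index (__pyx_f[14] becomes __pyx_f[13]). From the embedded source: __cinit__(self, SGD sgd, Scenario scenario, NeuralNet net) first stores scenario.net = net, so the scenario can inspect the net later (compare the commented-out scenario.showQ(scenario.net) context), then constructs the C++ QLearner from the three wrapped pointers; run() deliberately goes through interruptableCall(self._run, []) rather than calling into C++ directly, so the training loop runs on a daemon thread and keyboard interrupts still arrive. A hedged driving sketch; sgd, scenario and net are assumed to be already-constructed PyDeepCL objects and the numeric values are placeholders:

    qlearner = PyDeepCL.QLearner(sgd, scenario, net)
    qlearner.setLambda(0.9)      # float, per the wrapper signature
    qlearner.setMaxSamples(32)   # int
    qlearner.setEpsilon(0.1)     # float
    qlearner.run()               # dispatched via interruptableCall on a daemon thread
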
{__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = NULL; if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_2))) { @@ -13997,42 +14346,42 @@ static void __pyx_f_8PyDeepCL_Scenario_getPerception(float *__pyx_v_perception, } } if (__pyx_t_3) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; } else { - __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_v_pyPerception = __pyx_t_1; __pyx_t_1 = 0; - /* "QLearning.pyx":33 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":33 * cdef void Scenario_getPerception( float *perception, void *pyObject ): * pyPerception = (pyObject).getPerception() * for i in range(len(pyPerception)): # <<<<<<<<<<<<<< * perception[i] = pyPerception[i] * */ - __pyx_t_4 = PyObject_Length(__pyx_v_pyPerception); if (unlikely(__pyx_t_4 == -1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = PyObject_Length(__pyx_v_pyPerception); if (unlikely(__pyx_t_4 == -1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} for (__pyx_t_5 = 0; __pyx_t_5 < __pyx_t_4; __pyx_t_5+=1) { __pyx_v_i = __pyx_t_5; - /* "QLearning.pyx":34 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":34 * pyPerception = (pyObject).getPerception() * for i in range(len(pyPerception)): * perception[i] = pyPerception[i] # <<<<<<<<<<<<<< * * #[[[cog */ - __pyx_t_1 = __Pyx_GetItemInt(__pyx_v_pyPerception, __pyx_v_i, Py_ssize_t, 1, PyInt_FromSsize_t, 0, 1, 1); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 34; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_1 = __Pyx_GetItemInt(__pyx_v_pyPerception, __pyx_v_i, Py_ssize_t, 1, PyInt_FromSsize_t, 0, 1, 1); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 34; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; __Pyx_GOTREF(__pyx_t_1); - __pyx_t_6 = __pyx_PyFloat_AsFloat(__pyx_t_1); if (unlikely((__pyx_t_6 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 34; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __pyx_PyFloat_AsFloat(__pyx_t_1); if (unlikely((__pyx_t_6 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 34; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; (__pyx_v_perception[__pyx_v_i]) = __pyx_t_6; } - /* "QLearning.pyx":31 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":31 * # scenario.showQ(scenario.net) * * cdef void Scenario_getPerception( float *perception, void *pyObject ): # <<<<<<<<<<<<<< @@ -14052,7 +14401,7 @@ static void __pyx_f_8PyDeepCL_Scenario_getPerception(float *__pyx_v_perception, 
__Pyx_RefNannyFinishContext(); } -/* "QLearning.pyx":43 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":43 * #]]] * # generated using cog (as far as the [[end]] bit: * cdef int Scenario_getPerceptionSize( void *pyObject ): # <<<<<<<<<<<<<< @@ -14072,14 +14421,14 @@ static int __pyx_f_8PyDeepCL_Scenario_getPerceptionSize(void *__pyx_v_pyObject) int __pyx_clineno = 0; __Pyx_RefNannySetupContext("Scenario_getPerceptionSize", 0); - /* "QLearning.pyx":44 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":44 * # generated using cog (as far as the [[end]] bit: * cdef int Scenario_getPerceptionSize( void *pyObject ): * return (pyObject).getPerceptionSize() # <<<<<<<<<<<<<< * * cdef int Scenario_getPerceptionPlanes( void *pyObject ): */ - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_pyObject), __pyx_n_s_getPerceptionSize); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_pyObject), __pyx_n_s_getPerceptionSize); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = NULL; if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_2))) { @@ -14092,19 +14441,19 @@ static int __pyx_f_8PyDeepCL_Scenario_getPerceptionSize(void *__pyx_v_pyObject) } } if (__pyx_t_3) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; } else { - __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_r = __pyx_t_4; goto __pyx_L0; - /* "QLearning.pyx":43 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":43 * #]]] * # generated using cog (as far as the [[end]] bit: * cdef int Scenario_getPerceptionSize( void *pyObject ): # <<<<<<<<<<<<<< @@ -14124,7 +14473,7 @@ static int __pyx_f_8PyDeepCL_Scenario_getPerceptionSize(void *__pyx_v_pyObject) return __pyx_r; } -/* "QLearning.pyx":46 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":46 * return (pyObject).getPerceptionSize() * * cdef int Scenario_getPerceptionPlanes( void *pyObject ): # <<<<<<<<<<<<<< @@ -14144,14 +14493,14 @@ static int __pyx_f_8PyDeepCL_Scenario_getPerceptionPlanes(void *__pyx_v_pyObject int __pyx_clineno = 0; __Pyx_RefNannySetupContext("Scenario_getPerceptionPlanes", 0); - /* 
"QLearning.pyx":47 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":47 * * cdef int Scenario_getPerceptionPlanes( void *pyObject ): * return (pyObject).getPerceptionPlanes() # <<<<<<<<<<<<<< * * cdef void Scenario_reset( void *pyObject ): */ - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_pyObject), __pyx_n_s_getPerceptionPlanes); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 47; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_pyObject), __pyx_n_s_getPerceptionPlanes); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 47; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = NULL; if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_2))) { @@ -14164,19 +14513,19 @@ static int __pyx_f_8PyDeepCL_Scenario_getPerceptionPlanes(void *__pyx_v_pyObject } } if (__pyx_t_3) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 47; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 47; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; } else { - __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 47; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 47; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 47; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 47; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_r = __pyx_t_4; goto __pyx_L0; - /* "QLearning.pyx":46 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":46 * return (pyObject).getPerceptionSize() * * cdef int Scenario_getPerceptionPlanes( void *pyObject ): # <<<<<<<<<<<<<< @@ -14196,7 +14545,7 @@ static int __pyx_f_8PyDeepCL_Scenario_getPerceptionPlanes(void *__pyx_v_pyObject return __pyx_r; } -/* "QLearning.pyx":49 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":49 * return (pyObject).getPerceptionPlanes() * * cdef void Scenario_reset( void *pyObject ): # <<<<<<<<<<<<<< @@ -14214,14 +14563,14 @@ static void __pyx_f_8PyDeepCL_Scenario_reset(void *__pyx_v_pyObject) { int __pyx_clineno = 0; __Pyx_RefNannySetupContext("Scenario_reset", 0); - /* "QLearning.pyx":50 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":50 * * cdef void Scenario_reset( void *pyObject ): * (pyObject).reset() # <<<<<<<<<<<<<< * * cdef int Scenario_getNumActions( void *pyObject ): */ - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_pyObject), __pyx_n_s_reset); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_pyObject), __pyx_n_s_reset); if (unlikely(!__pyx_t_2)) 
{__pyx_filename = __pyx_f[13]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = NULL; if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_2))) { @@ -14234,16 +14583,16 @@ static void __pyx_f_8PyDeepCL_Scenario_reset(void *__pyx_v_pyObject) { } } if (__pyx_t_3) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; } else { - __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "QLearning.pyx":49 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":49 * return (pyObject).getPerceptionPlanes() * * cdef void Scenario_reset( void *pyObject ): # <<<<<<<<<<<<<< @@ -14262,7 +14611,7 @@ static void __pyx_f_8PyDeepCL_Scenario_reset(void *__pyx_v_pyObject) { __Pyx_RefNannyFinishContext(); } -/* "QLearning.pyx":52 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":52 * (pyObject).reset() * * cdef int Scenario_getNumActions( void *pyObject ): # <<<<<<<<<<<<<< @@ -14282,14 +14631,14 @@ static int __pyx_f_8PyDeepCL_Scenario_getNumActions(void *__pyx_v_pyObject) { int __pyx_clineno = 0; __Pyx_RefNannySetupContext("Scenario_getNumActions", 0); - /* "QLearning.pyx":53 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":53 * * cdef int Scenario_getNumActions( void *pyObject ): * return (pyObject).getNumActions() # <<<<<<<<<<<<<< * * cdef float Scenario_act( int index, void *pyObject ): */ - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_pyObject), __pyx_n_s_getNumActions); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_pyObject), __pyx_n_s_getNumActions); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = NULL; if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_2))) { @@ -14302,19 +14651,19 @@ static int __pyx_f_8PyDeepCL_Scenario_getNumActions(void *__pyx_v_pyObject) { } } if (__pyx_t_3) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; } else { - __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) {__pyx_filename = 
__pyx_f[13]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_r = __pyx_t_4; goto __pyx_L0; - /* "QLearning.pyx":52 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":52 * (pyObject).reset() * * cdef int Scenario_getNumActions( void *pyObject ): # <<<<<<<<<<<<<< @@ -14334,7 +14683,7 @@ static int __pyx_f_8PyDeepCL_Scenario_getNumActions(void *__pyx_v_pyObject) { return __pyx_r; } -/* "QLearning.pyx":55 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":55 * return (pyObject).getNumActions() * * cdef float Scenario_act( int index, void *pyObject ): # <<<<<<<<<<<<<< @@ -14356,16 +14705,16 @@ static float __pyx_f_8PyDeepCL_Scenario_act(int __pyx_v_index, void *__pyx_v_pyO int __pyx_clineno = 0; __Pyx_RefNannySetupContext("Scenario_act", 0); - /* "QLearning.pyx":56 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":56 * * cdef float Scenario_act( int index, void *pyObject ): * return (pyObject).act(index) # <<<<<<<<<<<<<< * * cdef bool Scenario_hasFinished( void *pyObject ): */ - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_pyObject), __pyx_n_s_act); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_pyObject), __pyx_n_s_act); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_v_index); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_v_index); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); __pyx_t_4 = NULL; if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_2))) { @@ -14378,27 +14727,27 @@ static float __pyx_f_8PyDeepCL_Scenario_act(int __pyx_v_index, void *__pyx_v_pyO } } if (!__pyx_t_4) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_GOTREF(__pyx_t_1); } else { - __pyx_t_5 = PyTuple_New(1+1); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_5 = PyTuple_New(1+1); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_5); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_4); __Pyx_GIVEREF(__pyx_t_4); __pyx_t_4 = NULL; PyTuple_SET_ITEM(__pyx_t_5, 0+1, __pyx_t_3); __Pyx_GIVEREF(__pyx_t_3); 
__pyx_t_3 = 0; - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_5, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_5, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; } __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_6 = __pyx_PyFloat_AsFloat(__pyx_t_1); if (unlikely((__pyx_t_6 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __pyx_PyFloat_AsFloat(__pyx_t_1); if (unlikely((__pyx_t_6 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_r = __pyx_t_6; goto __pyx_L0; - /* "QLearning.pyx":55 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":55 * return (pyObject).getNumActions() * * cdef float Scenario_act( int index, void *pyObject ): # <<<<<<<<<<<<<< @@ -14420,7 +14769,7 @@ static float __pyx_f_8PyDeepCL_Scenario_act(int __pyx_v_index, void *__pyx_v_pyO return __pyx_r; } -/* "QLearning.pyx":58 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":58 * return (pyObject).act(index) * * cdef bool Scenario_hasFinished( void *pyObject ): # <<<<<<<<<<<<<< @@ -14440,14 +14789,14 @@ static bool __pyx_f_8PyDeepCL_Scenario_hasFinished(void *__pyx_v_pyObject) { int __pyx_clineno = 0; __Pyx_RefNannySetupContext("Scenario_hasFinished", 0); - /* "QLearning.pyx":59 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":59 * * cdef bool Scenario_hasFinished( void *pyObject ): * return (pyObject).hasFinished() # <<<<<<<<<<<<<< * * cdef class Scenario: */ - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_pyObject), __pyx_n_s_hasFinished); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 59; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_pyObject), __pyx_n_s_hasFinished); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 59; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = NULL; if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_2))) { @@ -14460,19 +14809,19 @@ static bool __pyx_f_8PyDeepCL_Scenario_hasFinished(void *__pyx_v_pyObject) { } } if (__pyx_t_3) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 59; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 59; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; } else { - __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 59; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 59; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_4 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely((__pyx_t_4 == (bool)-1) && PyErr_Occurred())) 
{__pyx_filename = __pyx_f[14]; __pyx_lineno = 59; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely((__pyx_t_4 == (bool)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 59; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_r = __pyx_t_4; goto __pyx_L0; - /* "QLearning.pyx":58 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":58 * return (pyObject).act(index) * * cdef bool Scenario_hasFinished( void *pyObject ): # <<<<<<<<<<<<<< @@ -14492,7 +14841,7 @@ static bool __pyx_f_8PyDeepCL_Scenario_hasFinished(void *__pyx_v_pyObject) { return __pyx_r; } -/* "QLearning.pyx":63 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":63 * cdef class Scenario: * cdef cDeepCL.CyScenario *thisptr * def __cinit__(self): # <<<<<<<<<<<<<< @@ -14521,7 +14870,7 @@ static int __pyx_pf_8PyDeepCL_8Scenario___cinit__(struct __pyx_obj_8PyDeepCL_Sce __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__cinit__", 0); - /* "QLearning.pyx":64 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":64 * cdef cDeepCL.CyScenario *thisptr * def __cinit__(self): * self.thisptr = new cDeepCL.CyScenario(self ) # <<<<<<<<<<<<<< @@ -14530,7 +14879,7 @@ static int __pyx_pf_8PyDeepCL_8Scenario___cinit__(struct __pyx_obj_8PyDeepCL_Sce */ __pyx_v_self->thisptr = new CyScenario(((void *)__pyx_v_self)); - /* "QLearning.pyx":66 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":66 * self.thisptr = new cDeepCL.CyScenario(self ) * * self.thisptr.setGetPerceptionSize( Scenario_getPerceptionSize ) # <<<<<<<<<<<<<< @@ -14539,7 +14888,7 @@ static int __pyx_pf_8PyDeepCL_8Scenario___cinit__(struct __pyx_obj_8PyDeepCL_Sce */ __pyx_v_self->thisptr->setGetPerceptionSize(__pyx_f_8PyDeepCL_Scenario_getPerceptionSize); - /* "QLearning.pyx":67 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":67 * * self.thisptr.setGetPerceptionSize( Scenario_getPerceptionSize ) * self.thisptr.setGetPerceptionPlanes( Scenario_getPerceptionPlanes ) # <<<<<<<<<<<<<< @@ -14548,7 +14897,7 @@ static int __pyx_pf_8PyDeepCL_8Scenario___cinit__(struct __pyx_obj_8PyDeepCL_Sce */ __pyx_v_self->thisptr->setGetPerceptionPlanes(__pyx_f_8PyDeepCL_Scenario_getPerceptionPlanes); - /* "QLearning.pyx":68 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":68 * self.thisptr.setGetPerceptionSize( Scenario_getPerceptionSize ) * self.thisptr.setGetPerceptionPlanes( Scenario_getPerceptionPlanes ) * self.thisptr.setGetPerception( Scenario_getPerception ) # <<<<<<<<<<<<<< @@ -14557,7 +14906,7 @@ static int __pyx_pf_8PyDeepCL_8Scenario___cinit__(struct __pyx_obj_8PyDeepCL_Sce */ __pyx_v_self->thisptr->setGetPerception(__pyx_f_8PyDeepCL_Scenario_getPerception); - /* "QLearning.pyx":69 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":69 * self.thisptr.setGetPerceptionPlanes( Scenario_getPerceptionPlanes ) * self.thisptr.setGetPerception( Scenario_getPerception ) * self.thisptr.setReset( Scenario_reset ) # <<<<<<<<<<<<<< @@ -14566,7 +14915,7 @@ static int __pyx_pf_8PyDeepCL_8Scenario___cinit__(struct __pyx_obj_8PyDeepCL_Sce */ __pyx_v_self->thisptr->setReset(__pyx_f_8PyDeepCL_Scenario_reset); - /* "QLearning.pyx":70 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":70 * self.thisptr.setGetPerception( Scenario_getPerception ) * self.thisptr.setReset( Scenario_reset ) * self.thisptr.setGetNumActions( Scenario_getNumActions ) # <<<<<<<<<<<<<< @@ 
-14575,7 +14924,7 @@ static int __pyx_pf_8PyDeepCL_8Scenario___cinit__(struct __pyx_obj_8PyDeepCL_Sce */ __pyx_v_self->thisptr->setGetNumActions(__pyx_f_8PyDeepCL_Scenario_getNumActions); - /* "QLearning.pyx":71 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":71 * self.thisptr.setReset( Scenario_reset ) * self.thisptr.setGetNumActions( Scenario_getNumActions ) * self.thisptr.setAct( Scenario_act ) # <<<<<<<<<<<<<< @@ -14584,7 +14933,7 @@ static int __pyx_pf_8PyDeepCL_8Scenario___cinit__(struct __pyx_obj_8PyDeepCL_Sce */ __pyx_v_self->thisptr->setAct(__pyx_f_8PyDeepCL_Scenario_act); - /* "QLearning.pyx":72 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":72 * self.thisptr.setGetNumActions( Scenario_getNumActions ) * self.thisptr.setAct( Scenario_act ) * self.thisptr.setHasFinished( Scenario_hasFinished ) # <<<<<<<<<<<<<< @@ -14593,7 +14942,7 @@ static int __pyx_pf_8PyDeepCL_8Scenario___cinit__(struct __pyx_obj_8PyDeepCL_Sce */ __pyx_v_self->thisptr->setHasFinished(__pyx_f_8PyDeepCL_Scenario_hasFinished); - /* "QLearning.pyx":63 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":63 * cdef class Scenario: * cdef cDeepCL.CyScenario *thisptr * def __cinit__(self): # <<<<<<<<<<<<<< @@ -14607,7 +14956,7 @@ static int __pyx_pf_8PyDeepCL_8Scenario___cinit__(struct __pyx_obj_8PyDeepCL_Sce return __pyx_r; } -/* "QLearning.pyx":74 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":74 * self.thisptr.setHasFinished( Scenario_hasFinished ) * * def getPerceptionSize(self): # <<<<<<<<<<<<<< @@ -14637,20 +14986,20 @@ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_2getPerceptionSize(CYTHON_UNUSED s int __pyx_clineno = 0; __Pyx_RefNannySetupContext("getPerceptionSize", 0); - /* "QLearning.pyx":75 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":75 * * def getPerceptionSize(self): * raise Exception("Method needs to be overridden: Scenario.getPerceptionSize()") # <<<<<<<<<<<<<< * * def getPerceptionPlanes(self): */ - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_tuple__2, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 75; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_tuple__2, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 75; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_Raise(__pyx_t_1, 0, 0, 0); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - {__pyx_filename = __pyx_f[14]; __pyx_lineno = 75; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[13]; __pyx_lineno = 75; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "QLearning.pyx":74 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":74 * self.thisptr.setHasFinished( Scenario_hasFinished ) * * def getPerceptionSize(self): # <<<<<<<<<<<<<< @@ -14668,7 +15017,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_2getPerceptionSize(CYTHON_UNUSED s return __pyx_r; } -/* "QLearning.pyx":77 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":77 * raise Exception("Method needs to be overridden: Scenario.getPerceptionSize()") * * def getPerceptionPlanes(self): # <<<<<<<<<<<<<< @@ -14698,20 +15047,20 @@ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_4getPerceptionPlanes(CYTHON_UNUSED int __pyx_clineno = 0; __Pyx_RefNannySetupContext("getPerceptionPlanes", 0); - /* "QLearning.pyx":78 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":78 * * def 
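Review note: Scenario.__cinit__ allocates a CyScenario(self) and registers seven C trampolines on it (Scenario_getPerceptionSize, Scenario_getPerceptionPlanes, Scenario_getPerception, Scenario_reset, Scenario_getNumActions, Scenario_act, Scenario_hasFinished). Each trampoline casts the void *pyObject back to the Python instance and forwards to the method of the same name; the base-class methods in the hunks that follow simply raise "Method needs to be overridden", so users drive Q-learning by subclassing Scenario. One wrinkle worth noting: the Scenario_getPerception trampoline calls getPerception() with no arguments and copies the returned floats into its C buffer, even though the base-class stub declares a perception parameter. A hedged subclass sketch with illustrative values:

    class MyScenario(PyDeepCL.Scenario):
        def getPerceptionSize(self):    # side length of the perception image
            return 2
        def getPerceptionPlanes(self):  # number of image planes
            return 1
        def getPerception(self):
            # called by the C trampoline with no arguments; the trampoline
            # copies the returned floats into the perception buffer itself
            return [0.0] * (self.getPerceptionPlanes() * self.getPerceptionSize() ** 2)
        def getNumActions(self):
            return 4
        def act(self, index):
            return -0.1                 # converted to a float and used as the reward
        def hasFinished(self):
            return False
        def reset(self):
            pass
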
getPerceptionPlanes(self): * raise Exception("Method needs to be overridden: Scenario.getPerceptionPlanes()") # <<<<<<<<<<<<<< * * def reset(self): */ - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_tuple__3, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 78; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_tuple__3, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 78; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_Raise(__pyx_t_1, 0, 0, 0); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - {__pyx_filename = __pyx_f[14]; __pyx_lineno = 78; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[13]; __pyx_lineno = 78; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "QLearning.pyx":77 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":77 * raise Exception("Method needs to be overridden: Scenario.getPerceptionSize()") * * def getPerceptionPlanes(self): # <<<<<<<<<<<<<< @@ -14729,7 +15078,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_4getPerceptionPlanes(CYTHON_UNUSED return __pyx_r; } -/* "QLearning.pyx":80 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":80 * raise Exception("Method needs to be overridden: Scenario.getPerceptionPlanes()") * * def reset(self): # <<<<<<<<<<<<<< @@ -14759,20 +15108,20 @@ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_6reset(CYTHON_UNUSED struct __pyx_ int __pyx_clineno = 0; __Pyx_RefNannySetupContext("reset", 0); - /* "QLearning.pyx":81 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":81 * * def reset(self): * raise Exception("Method needs to be overridden: Scenario.reset()") # <<<<<<<<<<<<<< * * def getNumActions(self): */ - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_tuple__4, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 81; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_tuple__4, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 81; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_Raise(__pyx_t_1, 0, 0, 0); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - {__pyx_filename = __pyx_f[14]; __pyx_lineno = 81; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[13]; __pyx_lineno = 81; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "QLearning.pyx":80 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":80 * raise Exception("Method needs to be overridden: Scenario.getPerceptionPlanes()") * * def reset(self): # <<<<<<<<<<<<<< @@ -14790,7 +15139,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_6reset(CYTHON_UNUSED struct __pyx_ return __pyx_r; } -/* "QLearning.pyx":83 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":83 * raise Exception("Method needs to be overridden: Scenario.reset()") * * def getNumActions(self): # <<<<<<<<<<<<<< @@ -14820,20 +15169,20 @@ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_8getNumActions(CYTHON_UNUSED struc int __pyx_clineno = 0; __Pyx_RefNannySetupContext("getNumActions", 0); - /* "QLearning.pyx":84 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":84 * * def getNumActions(self): * raise Exception("Method needs to be overridden: Scenario.getNumActions()") # <<<<<<<<<<<<<< * * def act(self, index): */ - __pyx_t_1 = 
__Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_tuple__5, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_tuple__5, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_Raise(__pyx_t_1, 0, 0, 0); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - {__pyx_filename = __pyx_f[14]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[13]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "QLearning.pyx":83 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":83 * raise Exception("Method needs to be overridden: Scenario.reset()") * * def getNumActions(self): # <<<<<<<<<<<<<< @@ -14851,7 +15200,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_8getNumActions(CYTHON_UNUSED struc return __pyx_r; } -/* "QLearning.pyx":86 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":86 * raise Exception("Method needs to be overridden: Scenario.getNumActions()") * * def act(self, index): # <<<<<<<<<<<<<< @@ -14881,20 +15230,20 @@ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_10act(CYTHON_UNUSED struct __pyx_o int __pyx_clineno = 0; __Pyx_RefNannySetupContext("act", 0); - /* "QLearning.pyx":87 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":87 * * def act(self, index): * raise Exception("Method needs to be overridden: Scenario.act()") # <<<<<<<<<<<<<< * * def hasFinished(self): */ - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_tuple__6, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 87; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_tuple__6, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 87; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_Raise(__pyx_t_1, 0, 0, 0); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - {__pyx_filename = __pyx_f[14]; __pyx_lineno = 87; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[13]; __pyx_lineno = 87; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "QLearning.pyx":86 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":86 * raise Exception("Method needs to be overridden: Scenario.getNumActions()") * * def act(self, index): # <<<<<<<<<<<<<< @@ -14912,7 +15261,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_10act(CYTHON_UNUSED struct __pyx_o return __pyx_r; } -/* "QLearning.pyx":89 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":89 * raise Exception("Method needs to be overridden: Scenario.act()") * * def hasFinished(self): # <<<<<<<<<<<<<< @@ -14942,20 +15291,20 @@ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_12hasFinished(CYTHON_UNUSED struct int __pyx_clineno = 0; __Pyx_RefNannySetupContext("hasFinished", 0); - /* "QLearning.pyx":90 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":90 * * def hasFinished(self): * raise Exception("Method needs to be overridden: Scenario.hasFinished()") # <<<<<<<<<<<<<< * * #[[[end]]] */ - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_tuple__7, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = 
__Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_tuple__7, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_Raise(__pyx_t_1, 0, 0, 0); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - {__pyx_filename = __pyx_f[14]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[13]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "QLearning.pyx":89 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":89 * raise Exception("Method needs to be overridden: Scenario.act()") * * def hasFinished(self): # <<<<<<<<<<<<<< @@ -14973,49 +15322,7 @@ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_12hasFinished(CYTHON_UNUSED struct return __pyx_r; } -/* "QLearning.pyx":100 - * # r aise Exception("Method needs to be overridden: Scenario.showQ()") - * - * def getPerception(self, perception): # <<<<<<<<<<<<<< - * raise Exception("Method needs to be overridden: Scenario.getPerception()") - * - */ - -/* Python wrapper */ -static PyObject *__pyx_pw_8PyDeepCL_8Scenario_15getPerception(PyObject *__pyx_v_self, PyObject *__pyx_v_perception); /*proto*/ -static PyObject *__pyx_pw_8PyDeepCL_8Scenario_15getPerception(PyObject *__pyx_v_self, PyObject *__pyx_v_perception) { - PyObject *__pyx_r = 0; - __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("getPerception (wrapper)", 0); - __pyx_r = __pyx_pf_8PyDeepCL_8Scenario_14getPerception(((struct __pyx_obj_8PyDeepCL_Scenario *)__pyx_v_self), ((PyObject *)__pyx_v_perception)); - - /* function exit code */ - __Pyx_RefNannyFinishContext(); - return __pyx_r; -} - -static PyObject *__pyx_pf_8PyDeepCL_8Scenario_14getPerception(CYTHON_UNUSED struct __pyx_obj_8PyDeepCL_Scenario *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v_perception) { - PyObject *__pyx_r = NULL; - __Pyx_RefNannyDeclarations - PyObject *__pyx_t_1 = NULL; - int __pyx_lineno = 0; - const char *__pyx_filename = NULL; - int __pyx_clineno = 0; - __Pyx_RefNannySetupContext("getPerception", 0); - - /* "QLearning.pyx":101 - * - * def getPerception(self, perception): - * raise Exception("Method needs to be overridden: Scenario.getPerception()") # <<<<<<<<<<<<<< - * - */ - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_tuple__8, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 101; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_1); - __Pyx_Raise(__pyx_t_1, 0, 0, 0); - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - {__pyx_filename = __pyx_f[14]; __pyx_lineno = 101; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - - /* "QLearning.pyx":100 +/* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":100 * # r aise Exception("Method needs to be overridden: Scenario.showQ()") * * def getPerception(self, perception): # <<<<<<<<<<<<<< @@ -15023,136 +15330,60 @@ static PyObject *__pyx_pf_8PyDeepCL_8Scenario_14getPerception(CYTHON_UNUSED stru * */ - /* function exit code */ - __pyx_L1_error:; - __Pyx_XDECREF(__pyx_t_1); - __Pyx_AddTraceback("PyDeepCL.Scenario.getPerception", __pyx_clineno, __pyx_lineno, __pyx_filename); - __pyx_r = NULL; - __Pyx_XGIVEREF(__pyx_r); - __Pyx_RefNannyFinishContext(); - return __pyx_r; -} - -/* "PyDeepCL.pyx":31 - * include "QLearning.pyx" - * - * def checkException(): # <<<<<<<<<<<<<< - * cdef int threwException = 0 - * cdef string message = "" - */ - /* Python wrapper */ -static PyObject *__pyx_pw_8PyDeepCL_1checkException(PyObject *__pyx_self, 
CYTHON_UNUSED PyObject *unused); /*proto*/ -static PyMethodDef __pyx_mdef_8PyDeepCL_1checkException = {"checkException", (PyCFunction)__pyx_pw_8PyDeepCL_1checkException, METH_NOARGS, 0}; -static PyObject *__pyx_pw_8PyDeepCL_1checkException(PyObject *__pyx_self, CYTHON_UNUSED PyObject *unused) { +static PyObject *__pyx_pw_8PyDeepCL_8Scenario_15getPerception(PyObject *__pyx_v_self, PyObject *__pyx_v_perception); /*proto*/ +static PyObject *__pyx_pw_8PyDeepCL_8Scenario_15getPerception(PyObject *__pyx_v_self, PyObject *__pyx_v_perception) { PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations - __Pyx_RefNannySetupContext("checkException (wrapper)", 0); - __pyx_r = __pyx_pf_8PyDeepCL_checkException(__pyx_self); + __Pyx_RefNannySetupContext("getPerception (wrapper)", 0); + __pyx_r = __pyx_pf_8PyDeepCL_8Scenario_14getPerception(((struct __pyx_obj_8PyDeepCL_Scenario *)__pyx_v_self), ((PyObject *)__pyx_v_perception)); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } -static PyObject *__pyx_pf_8PyDeepCL_checkException(CYTHON_UNUSED PyObject *__pyx_self) { - int __pyx_v_threwException; - std::string __pyx_v_message; +static PyObject *__pyx_pf_8PyDeepCL_8Scenario_14getPerception(CYTHON_UNUSED struct __pyx_obj_8PyDeepCL_Scenario *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v_perception) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations - std::string __pyx_t_1; - int __pyx_t_2; - PyObject *__pyx_t_3 = NULL; - PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_1 = NULL; int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; - __Pyx_RefNannySetupContext("checkException", 0); + __Pyx_RefNannySetupContext("getPerception", 0); - /* "PyDeepCL.pyx":32 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":101 * - * def checkException(): - * cdef int threwException = 0 # <<<<<<<<<<<<<< - * cdef string message = "" - * cDeepCL.checkException( &threwException, &message) - */ - __pyx_v_threwException = 0; - - /* "PyDeepCL.pyx":33 - * def checkException(): - * cdef int threwException = 0 - * cdef string message = "" # <<<<<<<<<<<<<< - * cDeepCL.checkException( &threwException, &message) - * # print('threwException: ' + str(threwException) + ' ' + message ) - */ - __pyx_t_1 = __pyx_convert_string_from_py_std__in_string(__pyx_kp_b__9); if (unlikely(PyErr_Occurred())) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_v_message = __pyx_t_1; - - /* "PyDeepCL.pyx":34 - * cdef int threwException = 0 - * cdef string message = "" - * cDeepCL.checkException( &threwException, &message) # <<<<<<<<<<<<<< - * # print('threwException: ' + str(threwException) + ' ' + message ) - * if threwException: - */ - checkException((&__pyx_v_threwException), (&__pyx_v_message)); - - /* "PyDeepCL.pyx":36 - * cDeepCL.checkException( &threwException, &message) - * # print('threwException: ' + str(threwException) + ' ' + message ) - * if threwException: # <<<<<<<<<<<<<< - * raise RuntimeError(message) + * def getPerception(self, perception): + * raise Exception("Method needs to be overridden: Scenario.getPerception()") # <<<<<<<<<<<<<< * */ - __pyx_t_2 = (__pyx_v_threwException != 0); - if (__pyx_t_2) { + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_Exception, __pyx_tuple__8, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 101; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __Pyx_Raise(__pyx_t_1, 0, 0, 0); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + {__pyx_filename = 
__pyx_f[13]; __pyx_lineno = 101; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "PyDeepCL.pyx":37 - * # print('threwException: ' + str(threwException) + ' ' + message ) - * if threwException: - * raise RuntimeError(message) # <<<<<<<<<<<<<< + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":100 + * # r aise Exception("Method needs to be overridden: Scenario.showQ()") * - * def interruptableCall( function, args ): - */ - __pyx_t_3 = __pyx_convert_PyBytes_string_to_py_std__in_string(__pyx_v_message); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 37; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_3); - __pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 37; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_4); - PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); - __Pyx_GIVEREF(__pyx_t_3); - __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_t_4, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 37; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_3); - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - __Pyx_Raise(__pyx_t_3, 0, 0, 0); - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - {__pyx_filename = __pyx_f[2]; __pyx_lineno = 37; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - } - - /* "PyDeepCL.pyx":31 - * include "QLearning.pyx" + * def getPerception(self, perception): # <<<<<<<<<<<<<< + * raise Exception("Method needs to be overridden: Scenario.getPerception()") * - * def checkException(): # <<<<<<<<<<<<<< - * cdef int threwException = 0 - * cdef string message = "" */ /* function exit code */ - __pyx_r = Py_None; __Pyx_INCREF(Py_None); - goto __pyx_L0; __pyx_L1_error:; - __Pyx_XDECREF(__pyx_t_3); - __Pyx_XDECREF(__pyx_t_4); - __Pyx_AddTraceback("PyDeepCL.checkException", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("PyDeepCL.Scenario.getPerception", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; - __pyx_L0:; __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; } -/* "PyDeepCL.pyx":39 - * raise RuntimeError(message) +/* "PyDeepCL.pyx":41 + * # raise RuntimeError(message) * * def interruptableCall( function, args ): # <<<<<<<<<<<<<< * mythread = threading.Thread( target=function, args = args ) @@ -15160,9 +15391,9 @@ static PyObject *__pyx_pf_8PyDeepCL_checkException(CYTHON_UNUSED PyObject *__pyx */ /* Python wrapper */ -static PyObject *__pyx_pw_8PyDeepCL_3interruptableCall(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ -static PyMethodDef __pyx_mdef_8PyDeepCL_3interruptableCall = {"interruptableCall", (PyCFunction)__pyx_pw_8PyDeepCL_3interruptableCall, METH_VARARGS|METH_KEYWORDS, 0}; -static PyObject *__pyx_pw_8PyDeepCL_3interruptableCall(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { +static PyObject *__pyx_pw_8PyDeepCL_1interruptableCall(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_8PyDeepCL_1interruptableCall = {"interruptableCall", (PyCFunction)__pyx_pw_8PyDeepCL_1interruptableCall, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_8PyDeepCL_1interruptableCall(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_v_function = 0; PyObject *__pyx_v_args = 0; int __pyx_lineno = 0; @@ -15191,11 +15422,11 @@ static PyObject 
*__pyx_pw_8PyDeepCL_3interruptableCall(PyObject *__pyx_self, PyO case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_args)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("interruptableCall", 1, 2, 2, 1); {__pyx_filename = __pyx_f[2]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("interruptableCall", 1, 2, 2, 1); {__pyx_filename = __pyx_f[14]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "interruptableCall") < 0)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "interruptableCall") < 0)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 2) { goto __pyx_L5_argtuple_error; @@ -15208,20 +15439,20 @@ static PyObject *__pyx_pw_8PyDeepCL_3interruptableCall(PyObject *__pyx_self, PyO } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("interruptableCall", 1, 2, 2, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[2]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("interruptableCall", 1, 2, 2, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[14]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("PyDeepCL.interruptableCall", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - __pyx_r = __pyx_pf_8PyDeepCL_2interruptableCall(__pyx_self, __pyx_v_function, __pyx_v_args); + __pyx_r = __pyx_pf_8PyDeepCL_interruptableCall(__pyx_self, __pyx_v_function, __pyx_v_args); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } -static PyObject *__pyx_pf_8PyDeepCL_2interruptableCall(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_function, PyObject *__pyx_v_args) { +static PyObject *__pyx_pf_8PyDeepCL_interruptableCall(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_function, PyObject *__pyx_v_args) { PyObject *__pyx_v_mythread = NULL; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations @@ -15234,46 +15465,46 @@ static PyObject *__pyx_pf_8PyDeepCL_2interruptableCall(CYTHON_UNUSED PyObject *_ int __pyx_clineno = 0; __Pyx_RefNannySetupContext("interruptableCall", 0); - /* "PyDeepCL.pyx":40 + /* "PyDeepCL.pyx":42 * * def interruptableCall( function, args ): * mythread = threading.Thread( target=function, args = args ) # <<<<<<<<<<<<<< * mythread.daemon = True * mythread.start() */ - __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_threading); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_threading); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_Thread); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_Thread); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 42; 
__pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); - if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_target, __pyx_v_function) < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_args, __pyx_v_args) < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_empty_tuple, __pyx_t_1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 40; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_target, __pyx_v_function) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_args, __pyx_v_args) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_empty_tuple, __pyx_t_1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_mythread = __pyx_t_3; __pyx_t_3 = 0; - /* "PyDeepCL.pyx":41 + /* "PyDeepCL.pyx":43 * def interruptableCall( function, args ): * mythread = threading.Thread( target=function, args = args ) * mythread.daemon = True # <<<<<<<<<<<<<< * mythread.start() * while mythread.isAlive(): */ - if (__Pyx_PyObject_SetAttrStr(__pyx_v_mythread, __pyx_n_s_daemon, Py_True) < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (__Pyx_PyObject_SetAttrStr(__pyx_v_mythread, __pyx_n_s_daemon, Py_True) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 43; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "PyDeepCL.pyx":42 + /* "PyDeepCL.pyx":44 * mythread = threading.Thread( target=function, args = args ) * mythread.daemon = True * mythread.start() # <<<<<<<<<<<<<< * while mythread.isAlive(): * mythread.join(0.1) */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_mythread, __pyx_n_s_start); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_mythread, __pyx_n_s_start); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __pyx_t_2 = NULL; if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_1))) { @@ -15286,16 +15517,16 @@ static PyObject *__pyx_pf_8PyDeepCL_2interruptableCall(CYTHON_UNUSED PyObject *_ } } if (__pyx_t_2) { - __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_t_2); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_t_2); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; } else { - 
__pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "PyDeepCL.pyx":43 + /* "PyDeepCL.pyx":45 * mythread.daemon = True * mythread.start() * while mythread.isAlive(): # <<<<<<<<<<<<<< @@ -15303,7 +15534,7 @@ static PyObject *__pyx_pf_8PyDeepCL_2interruptableCall(CYTHON_UNUSED PyObject *_ * #print('join timed out') */ while (1) { - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_mythread, __pyx_n_s_isAlive); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 43; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_mythread, __pyx_n_s_isAlive); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 45; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __pyx_t_2 = NULL; if (CYTHON_COMPILING_IN_CPYTHON && likely(PyMethod_Check(__pyx_t_1))) { @@ -15316,34 +15547,34 @@ static PyObject *__pyx_pf_8PyDeepCL_2interruptableCall(CYTHON_UNUSED PyObject *_ } } if (__pyx_t_2) { - __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_t_2); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 43; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_t_2); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 45; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; } else { - __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 43; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 45; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_4 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_4 < 0)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 43; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_4 < 0)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 45; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; if (!__pyx_t_4) break; - /* "PyDeepCL.pyx":44 + /* "PyDeepCL.pyx":46 * mythread.start() * while mythread.isAlive(): * mythread.join(0.1) # <<<<<<<<<<<<<< * #print('join timed out') * */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_mythread, __pyx_n_s_join); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_mythread, __pyx_n_s_join); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 46; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_tuple__10, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_tuple__9, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 46; 
__pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; } - /* "PyDeepCL.pyx":39 - * raise RuntimeError(message) + /* "PyDeepCL.pyx":41 + * # raise RuntimeError(message) * * def interruptableCall( function, args ): # <<<<<<<<<<<<<< * mythread = threading.Thread( target=function, args = args ) @@ -15366,7 +15597,7 @@ static PyObject *__pyx_pf_8PyDeepCL_2interruptableCall(CYTHON_UNUSED PyObject *_ return __pyx_r; } -/* "PyDeepCL.pyx":47 +/* "PyDeepCL.pyx":49 * #print('join timed out') * * def toCppString( pyString ): # <<<<<<<<<<<<<< @@ -15375,20 +15606,20 @@ static PyObject *__pyx_pf_8PyDeepCL_2interruptableCall(CYTHON_UNUSED PyObject *_ */ /* Python wrapper */ -static PyObject *__pyx_pw_8PyDeepCL_5toCppString(PyObject *__pyx_self, PyObject *__pyx_v_pyString); /*proto*/ -static PyMethodDef __pyx_mdef_8PyDeepCL_5toCppString = {"toCppString", (PyCFunction)__pyx_pw_8PyDeepCL_5toCppString, METH_O, 0}; -static PyObject *__pyx_pw_8PyDeepCL_5toCppString(PyObject *__pyx_self, PyObject *__pyx_v_pyString) { +static PyObject *__pyx_pw_8PyDeepCL_3toCppString(PyObject *__pyx_self, PyObject *__pyx_v_pyString); /*proto*/ +static PyMethodDef __pyx_mdef_8PyDeepCL_3toCppString = {"toCppString", (PyCFunction)__pyx_pw_8PyDeepCL_3toCppString, METH_O, 0}; +static PyObject *__pyx_pw_8PyDeepCL_3toCppString(PyObject *__pyx_self, PyObject *__pyx_v_pyString) { PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("toCppString (wrapper)", 0); - __pyx_r = __pyx_pf_8PyDeepCL_4toCppString(__pyx_self, ((PyObject *)__pyx_v_pyString)); + __pyx_r = __pyx_pf_8PyDeepCL_2toCppString(__pyx_self, ((PyObject *)__pyx_v_pyString)); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } -static PyObject *__pyx_pf_8PyDeepCL_4toCppString(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_pyString) { +static PyObject *__pyx_pf_8PyDeepCL_2toCppString(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_pyString) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations int __pyx_t_1; @@ -15400,7 +15631,7 @@ static PyObject *__pyx_pf_8PyDeepCL_4toCppString(CYTHON_UNUSED PyObject *__pyx_s int __pyx_clineno = 0; __Pyx_RefNannySetupContext("toCppString", 0); - /* "PyDeepCL.pyx":48 + /* "PyDeepCL.pyx":50 * * def toCppString( pyString ): * if isinstance( pyString, unicode ): # <<<<<<<<<<<<<< @@ -15411,7 +15642,7 @@ static PyObject *__pyx_pf_8PyDeepCL_4toCppString(CYTHON_UNUSED PyObject *__pyx_s __pyx_t_2 = (__pyx_t_1 != 0); if (__pyx_t_2) { - /* "PyDeepCL.pyx":49 + /* "PyDeepCL.pyx":51 * def toCppString( pyString ): * if isinstance( pyString, unicode ): * return pyString.encode('utf8') # <<<<<<<<<<<<<< @@ -15419,9 +15650,9 @@ static PyObject *__pyx_pf_8PyDeepCL_4toCppString(CYTHON_UNUSED PyObject *__pyx_s * */ __Pyx_XDECREF(__pyx_r); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_pyString, __pyx_n_s_encode); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 49; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_pyString, __pyx_n_s_encode); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 51; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); - __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_tuple__11, NULL); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 49; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_tuple__10, 
NULL); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 51; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_r = __pyx_t_4; @@ -15429,7 +15660,7 @@ static PyObject *__pyx_pf_8PyDeepCL_4toCppString(CYTHON_UNUSED PyObject *__pyx_s goto __pyx_L0; } - /* "PyDeepCL.pyx":50 + /* "PyDeepCL.pyx":52 * if isinstance( pyString, unicode ): * return pyString.encode('utf8') * return pyString # <<<<<<<<<<<<<< @@ -15441,7 +15672,7 @@ static PyObject *__pyx_pf_8PyDeepCL_4toCppString(CYTHON_UNUSED PyObject *__pyx_s __pyx_r = __pyx_v_pyString; goto __pyx_L0; - /* "PyDeepCL.pyx":47 + /* "PyDeepCL.pyx":49 * #print('join timed out') * * def toCppString( pyString ): # <<<<<<<<<<<<<< @@ -16281,7 +16512,7 @@ static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(struct __ * * if itemsize <= 0: */ - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__12, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 127; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__11, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 127; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -16305,7 +16536,7 @@ static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(struct __ * * if isinstance(format, unicode): */ - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__13, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 130; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__12, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 130; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -16403,7 +16634,7 @@ static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(struct __ * * */ - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_MemoryError, __pyx_tuple__14, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 142; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_MemoryError, __pyx_tuple__13, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 142; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -16654,7 +16885,7 @@ static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(struct __ * * if self.dtype_is_object: */ - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_MemoryError, __pyx_tuple__15, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 170; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_MemoryError, __pyx_tuple__14, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 170; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -16872,7 +17103,7 @@ static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_2__getbuffer__(stru * info.buf = self.data * info.len = self.len */ - __pyx_t_3 = 
__Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__16, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 186; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__15, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 186; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; @@ -19465,7 +19696,7 @@ static PyObject *__pyx_memoryview_convert_item_to_object(struct __pyx_memoryview * else: * if len(self.view.format) == 1: */ - __pyx_t_6 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__17, NULL); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 445; __pyx_clineno = __LINE__; goto __pyx_L5_except_error;} + __pyx_t_6 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__16, NULL); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 445; __pyx_clineno = __LINE__; goto __pyx_L5_except_error;} __Pyx_GOTREF(__pyx_t_6); __Pyx_Raise(__pyx_t_6, 0, 0, 0); __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; @@ -20247,7 +20478,7 @@ static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_7strides___get__(st * * return tuple([stride for stride in self.view.strides[:self.view.ndim]]) */ - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__18, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 521; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__17, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 521; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_Raise(__pyx_t_2, 0, 0, 0); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; @@ -20356,7 +20587,7 @@ static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_10suboffsets___get_ __Pyx_XDECREF(__pyx_r); __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_self->view.ndim); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 529; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = PyNumber_Multiply(__pyx_tuple__19, __pyx_t_2); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 529; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = PyNumber_Multiply(__pyx_tuple__18, __pyx_t_2); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 529; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_r = __pyx_t_3; @@ -21664,9 +21895,9 @@ static PyObject *_unellipsify(PyObject *__pyx_v_index, int __pyx_v_ndim) { __Pyx_GOTREF(__pyx_t_7); { Py_ssize_t __pyx_temp; for (__pyx_temp=0; __pyx_temp < ((__pyx_v_ndim - __pyx_t_8) + 1); __pyx_temp++) { - __Pyx_INCREF(__pyx_slice__20); - PyList_SET_ITEM(__pyx_t_7, __pyx_temp, __pyx_slice__20); - __Pyx_GIVEREF(__pyx_slice__20); + __Pyx_INCREF(__pyx_slice__19); + PyList_SET_ITEM(__pyx_t_7, __pyx_temp, __pyx_slice__19); + __Pyx_GIVEREF(__pyx_slice__19); } } __pyx_t_9 = __Pyx_PyList_Extend(__pyx_v_result, __pyx_t_7); if (unlikely(__pyx_t_9 == -1)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 638; __pyx_clineno = __LINE__; goto __pyx_L1_error;} @@ -21691,7 +21922,7 @@ static PyObject *_unellipsify(PyObject *__pyx_v_index, int __pyx_v_ndim) { * have_slices = True * else: */ - __pyx_t_9 = __Pyx_PyList_Append(__pyx_v_result, __pyx_slice__21); 
if (unlikely(__pyx_t_9 == -1)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 641; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_9 = __Pyx_PyList_Append(__pyx_v_result, __pyx_slice__20); if (unlikely(__pyx_t_9 == -1)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 641; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } __pyx_L7:; @@ -21820,9 +22051,9 @@ static PyObject *_unellipsify(PyObject *__pyx_v_index, int __pyx_v_ndim) { __Pyx_GOTREF(__pyx_t_3); { Py_ssize_t __pyx_temp; for (__pyx_temp=0; __pyx_temp < __pyx_v_nslices; __pyx_temp++) { - __Pyx_INCREF(__pyx_slice__22); - PyList_SET_ITEM(__pyx_t_3, __pyx_temp, __pyx_slice__22); - __Pyx_GIVEREF(__pyx_slice__22); + __Pyx_INCREF(__pyx_slice__21); + PyList_SET_ITEM(__pyx_t_3, __pyx_temp, __pyx_slice__21); + __Pyx_GIVEREF(__pyx_slice__21); } } __pyx_t_9 = __Pyx_PyList_Extend(__pyx_v_result, __pyx_t_3); if (unlikely(__pyx_t_9 == -1)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 652; __pyx_clineno = __LINE__; goto __pyx_L1_error;} @@ -21943,7 +22174,7 @@ static PyObject *assert_direct_dimensions(Py_ssize_t *__pyx_v_suboffsets, int __ * * */ - __pyx_t_5 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__23, NULL); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 659; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__22, NULL); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 659; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_5); __Pyx_Raise(__pyx_t_5, 0, 0, 0); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; @@ -26721,311 +26952,7 @@ static void __pyx_memoryview__slice_assign_scalar(char *__pyx_v_data, Py_ssize_t /* function exit code */ } -/* "string.to_py":31 - * - * @cname("__pyx_convert_PyObject_string_to_py_std__in_string") - * cdef inline object __pyx_convert_PyObject_string_to_py_std__in_string(const string& s): # <<<<<<<<<<<<<< - * return __Pyx_PyObject_FromStringAndSize(s.data(), s.size()) - * cdef extern from *: - */ - -static CYTHON_INLINE PyObject *__pyx_convert_PyObject_string_to_py_std__in_string(std::string const &__pyx_v_s) { - PyObject *__pyx_r = NULL; - __Pyx_RefNannyDeclarations - PyObject *__pyx_t_1 = NULL; - int __pyx_lineno = 0; - const char *__pyx_filename = NULL; - int __pyx_clineno = 0; - __Pyx_RefNannySetupContext("__pyx_convert_PyObject_string_to_py_std__in_string", 0); - - /* "string.to_py":32 - * @cname("__pyx_convert_PyObject_string_to_py_std__in_string") - * cdef inline object __pyx_convert_PyObject_string_to_py_std__in_string(const string& s): - * return __Pyx_PyObject_FromStringAndSize(s.data(), s.size()) # <<<<<<<<<<<<<< - * cdef extern from *: - * cdef object __Pyx_PyUnicode_FromStringAndSize(char*, size_t) - */ - __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyObject_FromStringAndSize(__pyx_v_s.data(), __pyx_v_s.size()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_1); - __pyx_r = __pyx_t_1; - __pyx_t_1 = 0; - goto __pyx_L0; - - /* "string.to_py":31 - * - * @cname("__pyx_convert_PyObject_string_to_py_std__in_string") - * cdef inline object __pyx_convert_PyObject_string_to_py_std__in_string(const string& s): # <<<<<<<<<<<<<< - * return __Pyx_PyObject_FromStringAndSize(s.data(), s.size()) - * cdef extern from *: - */ - - /* function exit code */ - __pyx_L1_error:; - __Pyx_XDECREF(__pyx_t_1); - 
__Pyx_AddTraceback("string.to_py.__pyx_convert_PyObject_string_to_py_std__in_string", __pyx_clineno, __pyx_lineno, __pyx_filename); - __pyx_r = 0; - __pyx_L0:; - __Pyx_XGIVEREF(__pyx_r); - __Pyx_RefNannyFinishContext(); - return __pyx_r; -} - -/* "string.to_py":37 - * - * @cname("__pyx_convert_PyUnicode_string_to_py_std__in_string") - * cdef inline object __pyx_convert_PyUnicode_string_to_py_std__in_string(const string& s): # <<<<<<<<<<<<<< - * return __Pyx_PyUnicode_FromStringAndSize(s.data(), s.size()) - * cdef extern from *: - */ - -static CYTHON_INLINE PyObject *__pyx_convert_PyUnicode_string_to_py_std__in_string(std::string const &__pyx_v_s) { - PyObject *__pyx_r = NULL; - __Pyx_RefNannyDeclarations - PyObject *__pyx_t_1 = NULL; - int __pyx_lineno = 0; - const char *__pyx_filename = NULL; - int __pyx_clineno = 0; - __Pyx_RefNannySetupContext("__pyx_convert_PyUnicode_string_to_py_std__in_string", 0); - - /* "string.to_py":38 - * @cname("__pyx_convert_PyUnicode_string_to_py_std__in_string") - * cdef inline object __pyx_convert_PyUnicode_string_to_py_std__in_string(const string& s): - * return __Pyx_PyUnicode_FromStringAndSize(s.data(), s.size()) # <<<<<<<<<<<<<< - * cdef extern from *: - * cdef object __Pyx_PyStr_FromStringAndSize(char*, size_t) - */ - __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyUnicode_FromStringAndSize(__pyx_v_s.data(), __pyx_v_s.size()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_1); - __pyx_r = __pyx_t_1; - __pyx_t_1 = 0; - goto __pyx_L0; - - /* "string.to_py":37 - * - * @cname("__pyx_convert_PyUnicode_string_to_py_std__in_string") - * cdef inline object __pyx_convert_PyUnicode_string_to_py_std__in_string(const string& s): # <<<<<<<<<<<<<< - * return __Pyx_PyUnicode_FromStringAndSize(s.data(), s.size()) - * cdef extern from *: - */ - - /* function exit code */ - __pyx_L1_error:; - __Pyx_XDECREF(__pyx_t_1); - __Pyx_AddTraceback("string.to_py.__pyx_convert_PyUnicode_string_to_py_std__in_string", __pyx_clineno, __pyx_lineno, __pyx_filename); - __pyx_r = 0; - __pyx_L0:; - __Pyx_XGIVEREF(__pyx_r); - __Pyx_RefNannyFinishContext(); - return __pyx_r; -} - -/* "string.to_py":43 - * - * @cname("__pyx_convert_PyStr_string_to_py_std__in_string") - * cdef inline object __pyx_convert_PyStr_string_to_py_std__in_string(const string& s): # <<<<<<<<<<<<<< - * return __Pyx_PyStr_FromStringAndSize(s.data(), s.size()) - * cdef extern from *: - */ - -static CYTHON_INLINE PyObject *__pyx_convert_PyStr_string_to_py_std__in_string(std::string const &__pyx_v_s) { - PyObject *__pyx_r = NULL; - __Pyx_RefNannyDeclarations - PyObject *__pyx_t_1 = NULL; - int __pyx_lineno = 0; - const char *__pyx_filename = NULL; - int __pyx_clineno = 0; - __Pyx_RefNannySetupContext("__pyx_convert_PyStr_string_to_py_std__in_string", 0); - - /* "string.to_py":44 - * @cname("__pyx_convert_PyStr_string_to_py_std__in_string") - * cdef inline object __pyx_convert_PyStr_string_to_py_std__in_string(const string& s): - * return __Pyx_PyStr_FromStringAndSize(s.data(), s.size()) # <<<<<<<<<<<<<< - * cdef extern from *: - * cdef object __Pyx_PyBytes_FromStringAndSize(char*, size_t) - */ - __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyStr_FromStringAndSize(__pyx_v_s.data(), __pyx_v_s.size()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_1); - __pyx_r = __pyx_t_1; - __pyx_t_1 = 0; - goto __pyx_L0; - - /* "string.to_py":43 
- * - * @cname("__pyx_convert_PyStr_string_to_py_std__in_string") - * cdef inline object __pyx_convert_PyStr_string_to_py_std__in_string(const string& s): # <<<<<<<<<<<<<< - * return __Pyx_PyStr_FromStringAndSize(s.data(), s.size()) - * cdef extern from *: - */ - - /* function exit code */ - __pyx_L1_error:; - __Pyx_XDECREF(__pyx_t_1); - __Pyx_AddTraceback("string.to_py.__pyx_convert_PyStr_string_to_py_std__in_string", __pyx_clineno, __pyx_lineno, __pyx_filename); - __pyx_r = 0; - __pyx_L0:; - __Pyx_XGIVEREF(__pyx_r); - __Pyx_RefNannyFinishContext(); - return __pyx_r; -} - -/* "string.to_py":49 - * - * @cname("__pyx_convert_PyBytes_string_to_py_std__in_string") - * cdef inline object __pyx_convert_PyBytes_string_to_py_std__in_string(const string& s): # <<<<<<<<<<<<<< - * return __Pyx_PyBytes_FromStringAndSize(s.data(), s.size()) - * cdef extern from *: - */ - -static CYTHON_INLINE PyObject *__pyx_convert_PyBytes_string_to_py_std__in_string(std::string const &__pyx_v_s) { - PyObject *__pyx_r = NULL; - __Pyx_RefNannyDeclarations - PyObject *__pyx_t_1 = NULL; - int __pyx_lineno = 0; - const char *__pyx_filename = NULL; - int __pyx_clineno = 0; - __Pyx_RefNannySetupContext("__pyx_convert_PyBytes_string_to_py_std__in_string", 0); - - /* "string.to_py":50 - * @cname("__pyx_convert_PyBytes_string_to_py_std__in_string") - * cdef inline object __pyx_convert_PyBytes_string_to_py_std__in_string(const string& s): - * return __Pyx_PyBytes_FromStringAndSize(s.data(), s.size()) # <<<<<<<<<<<<<< - * cdef extern from *: - * cdef object __Pyx_PyByteArray_FromStringAndSize(char*, size_t) - */ - __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyBytes_FromStringAndSize(__pyx_v_s.data(), __pyx_v_s.size()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_1); - __pyx_r = __pyx_t_1; - __pyx_t_1 = 0; - goto __pyx_L0; - - /* "string.to_py":49 - * - * @cname("__pyx_convert_PyBytes_string_to_py_std__in_string") - * cdef inline object __pyx_convert_PyBytes_string_to_py_std__in_string(const string& s): # <<<<<<<<<<<<<< - * return __Pyx_PyBytes_FromStringAndSize(s.data(), s.size()) - * cdef extern from *: - */ - - /* function exit code */ - __pyx_L1_error:; - __Pyx_XDECREF(__pyx_t_1); - __Pyx_AddTraceback("string.to_py.__pyx_convert_PyBytes_string_to_py_std__in_string", __pyx_clineno, __pyx_lineno, __pyx_filename); - __pyx_r = 0; - __pyx_L0:; - __Pyx_XGIVEREF(__pyx_r); - __Pyx_RefNannyFinishContext(); - return __pyx_r; -} - -/* "string.to_py":55 - * - * @cname("__pyx_convert_PyByteArray_string_to_py_std__in_string") - * cdef inline object __pyx_convert_PyByteArray_string_to_py_std__in_string(const string& s): # <<<<<<<<<<<<<< - * return __Pyx_PyByteArray_FromStringAndSize(s.data(), s.size()) - * - */ - -static CYTHON_INLINE PyObject *__pyx_convert_PyByteArray_string_to_py_std__in_string(std::string const &__pyx_v_s) { - PyObject *__pyx_r = NULL; - __Pyx_RefNannyDeclarations - PyObject *__pyx_t_1 = NULL; - int __pyx_lineno = 0; - const char *__pyx_filename = NULL; - int __pyx_clineno = 0; - __Pyx_RefNannySetupContext("__pyx_convert_PyByteArray_string_to_py_std__in_string", 0); - - /* "string.to_py":56 - * @cname("__pyx_convert_PyByteArray_string_to_py_std__in_string") - * cdef inline object __pyx_convert_PyByteArray_string_to_py_std__in_string(const string& s): - * return __Pyx_PyByteArray_FromStringAndSize(s.data(), s.size()) # <<<<<<<<<<<<<< - * - */ - __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = 
__Pyx_PyByteArray_FromStringAndSize(__pyx_v_s.data(), __pyx_v_s.size()); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_1); - __pyx_r = __pyx_t_1; - __pyx_t_1 = 0; - goto __pyx_L0; - - /* "string.to_py":55 - * - * @cname("__pyx_convert_PyByteArray_string_to_py_std__in_string") - * cdef inline object __pyx_convert_PyByteArray_string_to_py_std__in_string(const string& s): # <<<<<<<<<<<<<< - * return __Pyx_PyByteArray_FromStringAndSize(s.data(), s.size()) - * - */ - - /* function exit code */ - __pyx_L1_error:; - __Pyx_XDECREF(__pyx_t_1); - __Pyx_AddTraceback("string.to_py.__pyx_convert_PyByteArray_string_to_py_std__in_string", __pyx_clineno, __pyx_lineno, __pyx_filename); - __pyx_r = 0; - __pyx_L0:; - __Pyx_XGIVEREF(__pyx_r); - __Pyx_RefNannyFinishContext(); - return __pyx_r; -} - -/* "string.from_py":13 - * - * @cname("__pyx_convert_string_from_py_std__in_string") - * cdef string __pyx_convert_string_from_py_std__in_string(object o) except *: # <<<<<<<<<<<<<< - * cdef Py_ssize_t length - * cdef char* data = __Pyx_PyObject_AsStringAndSize(o, &length) - */ - -static std::string __pyx_convert_string_from_py_std__in_string(PyObject *__pyx_v_o) { - Py_ssize_t __pyx_v_length; - char *__pyx_v_data; - std::string __pyx_r; - __Pyx_RefNannyDeclarations - char *__pyx_t_1; - int __pyx_lineno = 0; - const char *__pyx_filename = NULL; - int __pyx_clineno = 0; - __Pyx_RefNannySetupContext("__pyx_convert_string_from_py_std__in_string", 0); - - /* "string.from_py":15 - * cdef string __pyx_convert_string_from_py_std__in_string(object o) except *: - * cdef Py_ssize_t length - * cdef char* data = __Pyx_PyObject_AsStringAndSize(o, &length) # <<<<<<<<<<<<<< - * return string(data, length) - * - */ - __pyx_t_1 = __Pyx_PyObject_AsStringAndSize(__pyx_v_o, (&__pyx_v_length)); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_v_data = __pyx_t_1; - - /* "string.from_py":16 - * cdef Py_ssize_t length - * cdef char* data = __Pyx_PyObject_AsStringAndSize(o, &length) - * return string(data, length) # <<<<<<<<<<<<<< - * - * - */ - __pyx_r = std::string(__pyx_v_data, __pyx_v_length); - goto __pyx_L0; - - /* "string.from_py":13 - * - * @cname("__pyx_convert_string_from_py_std__in_string") - * cdef string __pyx_convert_string_from_py_std__in_string(object o) except *: # <<<<<<<<<<<<<< - * cdef Py_ssize_t length - * cdef char* data = __Pyx_PyObject_AsStringAndSize(o, &length) - */ - - /* function exit code */ - __pyx_L1_error:; - __Pyx_AddTraceback("string.from_py.__pyx_convert_string_from_py_std__in_string", __pyx_clineno, __pyx_lineno, __pyx_filename); - __pyx_L0:; - __Pyx_RefNannyFinishContext(); - return __pyx_r; -} - -static PyObject *__pyx_tp_new_8PyDeepCL_EasyCL(PyTypeObject *t, PyObject *a, PyObject *k) { +static PyObject *__pyx_tp_new_8PyDeepCL_DeepCL(PyTypeObject *t, PyObject *a, PyObject *k) { PyObject *o; if (likely((t->tp_flags & Py_TPFLAGS_IS_ABSTRACT) == 0)) { o = (*t->tp_alloc)(t, 0); @@ -27033,32 +26960,46 @@ static PyObject *__pyx_tp_new_8PyDeepCL_EasyCL(PyTypeObject *t, PyObject *a, PyO o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0); } if (unlikely(!o)) return 0; - if (unlikely(__pyx_pw_8PyDeepCL_6EasyCL_1__cinit__(o, a, k) < 0)) { + if (unlikely(__pyx_pw_8PyDeepCL_6DeepCL_1__cinit__(o, a, k) < 0)) { Py_DECREF(o); o = 0; } return o; } -static void __pyx_tp_dealloc_8PyDeepCL_EasyCL(PyObject *o) { +static 
void __pyx_tp_dealloc_8PyDeepCL_DeepCL(PyObject *o) { #if PY_VERSION_HEX >= 0x030400a1 if (unlikely(Py_TYPE(o)->tp_finalize) && (!PyType_IS_GC(Py_TYPE(o)) || !_PyGC_FINALIZED(o))) { if (PyObject_CallFinalizerFromDealloc(o)) return; } #endif + { + PyObject *etype, *eval, *etb; + PyErr_Fetch(&etype, &eval, &etb); + ++Py_REFCNT(o); + __pyx_pw_8PyDeepCL_6DeepCL_3__dealloc__(o); + --Py_REFCNT(o); + PyErr_Restore(etype, eval, etb); + } (*Py_TYPE(o)->tp_free)(o); } -static PyMethodDef __pyx_methods_8PyDeepCL_EasyCL[] = { - {"__dealloc", (PyCFunction)__pyx_pw_8PyDeepCL_6EasyCL_3__dealloc, METH_NOARGS, 0}, +static PyMethodDef __pyx_methods_8PyDeepCL_DeepCL[] = { + {"setProfiling", (PyCFunction)__pyx_pw_8PyDeepCL_6DeepCL_5setProfiling, METH_O, 0}, + {"dumpProfiling", (PyCFunction)__pyx_pw_8PyDeepCL_6DeepCL_7dumpProfiling, METH_NOARGS, 0}, + {"getComputeUnits", (PyCFunction)__pyx_pw_8PyDeepCL_6DeepCL_9getComputeUnits, METH_NOARGS, 0}, + {"getLocalMemorySize", (PyCFunction)__pyx_pw_8PyDeepCL_6DeepCL_11getLocalMemorySize, METH_NOARGS, 0}, + {"getLocalMemorySizeKB", (PyCFunction)__pyx_pw_8PyDeepCL_6DeepCL_13getLocalMemorySizeKB, METH_NOARGS, 0}, + {"getMaxWorkgroupSize", (PyCFunction)__pyx_pw_8PyDeepCL_6DeepCL_15getMaxWorkgroupSize, METH_NOARGS, 0}, + {"getMaxAllocSizeMB", (PyCFunction)__pyx_pw_8PyDeepCL_6DeepCL_17getMaxAllocSizeMB, METH_NOARGS, 0}, {0, 0, 0, 0} }; -static PyTypeObject __pyx_type_8PyDeepCL_EasyCL = { +static PyTypeObject __pyx_type_8PyDeepCL_DeepCL = { PyVarObject_HEAD_INIT(0, 0) - "PyDeepCL.EasyCL", /*tp_name*/ - sizeof(struct __pyx_obj_8PyDeepCL_EasyCL), /*tp_basicsize*/ + "PyDeepCL.DeepCL", /*tp_name*/ + sizeof(struct __pyx_obj_8PyDeepCL_DeepCL), /*tp_basicsize*/ 0, /*tp_itemsize*/ - __pyx_tp_dealloc_8PyDeepCL_EasyCL, /*tp_dealloc*/ + __pyx_tp_dealloc_8PyDeepCL_DeepCL, /*tp_dealloc*/ 0, /*tp_print*/ 0, /*tp_getattr*/ 0, /*tp_setattr*/ @@ -27085,7 +27026,7 @@ static PyTypeObject __pyx_type_8PyDeepCL_EasyCL = { 0, /*tp_weaklistoffset*/ 0, /*tp_iter*/ 0, /*tp_iternext*/ - __pyx_methods_8PyDeepCL_EasyCL, /*tp_methods*/ + __pyx_methods_8PyDeepCL_DeepCL, /*tp_methods*/ 0, /*tp_members*/ 0, /*tp_getset*/ 0, /*tp_base*/ @@ -27095,7 +27036,7 @@ static PyTypeObject __pyx_type_8PyDeepCL_EasyCL = { 0, /*tp_dictoffset*/ 0, /*tp_init*/ 0, /*tp_alloc*/ - __pyx_tp_new_8PyDeepCL_EasyCL, /*tp_new*/ + __pyx_tp_new_8PyDeepCL_DeepCL, /*tp_new*/ 0, /*tp_free*/ 0, /*tp_is_gc*/ 0, /*tp_bases*/ @@ -27222,11 +27163,18 @@ static void __pyx_tp_dealloc_8PyDeepCL_SGD(PyObject *o) { if (PyObject_CallFinalizerFromDealloc(o)) return; } #endif + { + PyObject *etype, *eval, *etb; + PyErr_Fetch(&etype, &eval, &etb); + ++Py_REFCNT(o); + __pyx_pw_8PyDeepCL_3SGD_3__dealloc__(o); + --Py_REFCNT(o); + PyErr_Restore(etype, eval, etb); + } (*Py_TYPE(o)->tp_free)(o); } static PyMethodDef __pyx_methods_8PyDeepCL_SGD[] = { - {"__dealloc", (PyCFunction)__pyx_pw_8PyDeepCL_3SGD_3__dealloc, METH_NOARGS, 0}, {"setLearningRate", (PyCFunction)__pyx_pw_8PyDeepCL_3SGD_5setLearningRate, METH_O, 0}, {"setMomentum", (PyCFunction)__pyx_pw_8PyDeepCL_3SGD_7setMomentum, METH_O, 0}, {"setWeightDecay", (PyCFunction)__pyx_pw_8PyDeepCL_3SGD_9setWeightDecay, METH_O, 0}, @@ -27311,12 +27259,19 @@ static void __pyx_tp_dealloc_8PyDeepCL_Annealer(PyObject *o) { if (unlikely(Py_TYPE(o)->tp_finalize) && (!PyType_IS_GC(Py_TYPE(o)) || !_PyGC_FINALIZED(o))) { if (PyObject_CallFinalizerFromDealloc(o)) return; } - #endif + #endif + { + PyObject *etype, *eval, *etb; + PyErr_Fetch(&etype, &eval, &etb); + ++Py_REFCNT(o); + 
__pyx_pw_8PyDeepCL_8Annealer_3__dealloc__(o); + --Py_REFCNT(o); + PyErr_Restore(etype, eval, etb); + } (*Py_TYPE(o)->tp_free)(o); } static PyMethodDef __pyx_methods_8PyDeepCL_Annealer[] = { - {"__dealloc", (PyCFunction)__pyx_pw_8PyDeepCL_8Annealer_3__dealloc, METH_NOARGS, 0}, {"setLearningRate", (PyCFunction)__pyx_pw_8PyDeepCL_8Annealer_5setLearningRate, METH_O, 0}, {"setAnneal", (PyCFunction)__pyx_pw_8PyDeepCL_8Annealer_7setAnneal, METH_O, 0}, {"train", (PyCFunction)__pyx_pw_8PyDeepCL_8Annealer_9train, METH_VARARGS|METH_KEYWORDS, 0}, @@ -27401,11 +27356,18 @@ static void __pyx_tp_dealloc_8PyDeepCL_Nesterov(PyObject *o) { if (PyObject_CallFinalizerFromDealloc(o)) return; } #endif + { + PyObject *etype, *eval, *etb; + PyErr_Fetch(&etype, &eval, &etb); + ++Py_REFCNT(o); + __pyx_pw_8PyDeepCL_8Nesterov_3__dealloc__(o); + --Py_REFCNT(o); + PyErr_Restore(etype, eval, etb); + } (*Py_TYPE(o)->tp_free)(o); } static PyMethodDef __pyx_methods_8PyDeepCL_Nesterov[] = { - {"__dealloc", (PyCFunction)__pyx_pw_8PyDeepCL_8Nesterov_3__dealloc, METH_NOARGS, 0}, {"setLearningRate", (PyCFunction)__pyx_pw_8PyDeepCL_8Nesterov_5setLearningRate, METH_O, 0}, {"setMomentum", (PyCFunction)__pyx_pw_8PyDeepCL_8Nesterov_7setMomentum, METH_O, 0}, {"train", (PyCFunction)__pyx_pw_8PyDeepCL_8Nesterov_9train, METH_VARARGS|METH_KEYWORDS, 0}, @@ -27490,11 +27452,18 @@ static void __pyx_tp_dealloc_8PyDeepCL_Adagrad(PyObject *o) { if (PyObject_CallFinalizerFromDealloc(o)) return; } #endif + { + PyObject *etype, *eval, *etb; + PyErr_Fetch(&etype, &eval, &etb); + ++Py_REFCNT(o); + __pyx_pw_8PyDeepCL_7Adagrad_3__dealloc__(o); + --Py_REFCNT(o); + PyErr_Restore(etype, eval, etb); + } (*Py_TYPE(o)->tp_free)(o); } static PyMethodDef __pyx_methods_8PyDeepCL_Adagrad[] = { - {"__dealloc", (PyCFunction)__pyx_pw_8PyDeepCL_7Adagrad_3__dealloc, METH_NOARGS, 0}, {"setLearningRate", (PyCFunction)__pyx_pw_8PyDeepCL_7Adagrad_5setLearningRate, METH_O, 0}, {"train", (PyCFunction)__pyx_pw_8PyDeepCL_7Adagrad_7train, METH_VARARGS|METH_KEYWORDS, 0}, {"trainFromLabels", (PyCFunction)__pyx_pw_8PyDeepCL_7Adagrad_9trainFromLabels, METH_VARARGS|METH_KEYWORDS, 0}, @@ -27578,11 +27547,18 @@ static void __pyx_tp_dealloc_8PyDeepCL_Rmsprop(PyObject *o) { if (PyObject_CallFinalizerFromDealloc(o)) return; } #endif + { + PyObject *etype, *eval, *etb; + PyErr_Fetch(&etype, &eval, &etb); + ++Py_REFCNT(o); + __pyx_pw_8PyDeepCL_7Rmsprop_3__dealloc__(o); + --Py_REFCNT(o); + PyErr_Restore(etype, eval, etb); + } (*Py_TYPE(o)->tp_free)(o); } static PyMethodDef __pyx_methods_8PyDeepCL_Rmsprop[] = { - {"__dealloc", (PyCFunction)__pyx_pw_8PyDeepCL_7Rmsprop_3__dealloc, METH_NOARGS, 0}, {"setLearningRate", (PyCFunction)__pyx_pw_8PyDeepCL_7Rmsprop_5setLearningRate, METH_O, 0}, {"train", (PyCFunction)__pyx_pw_8PyDeepCL_7Rmsprop_7train, METH_VARARGS|METH_KEYWORDS, 0}, {"trainFromLabels", (PyCFunction)__pyx_pw_8PyDeepCL_7Rmsprop_9trainFromLabels, METH_VARARGS|METH_KEYWORDS, 0}, @@ -27666,11 +27642,18 @@ static void __pyx_tp_dealloc_8PyDeepCL_Adadelta(PyObject *o) { if (PyObject_CallFinalizerFromDealloc(o)) return; } #endif + { + PyObject *etype, *eval, *etb; + PyErr_Fetch(&etype, &eval, &etb); + ++Py_REFCNT(o); + __pyx_pw_8PyDeepCL_8Adadelta_3__dealloc__(o); + --Py_REFCNT(o); + PyErr_Restore(etype, eval, etb); + } (*Py_TYPE(o)->tp_free)(o); } static PyMethodDef __pyx_methods_8PyDeepCL_Adadelta[] = { - {"__dealloc", (PyCFunction)__pyx_pw_8PyDeepCL_8Adadelta_3__dealloc, METH_NOARGS, 0}, {"train", (PyCFunction)__pyx_pw_8PyDeepCL_8Adadelta_5train, METH_VARARGS|METH_KEYWORDS, 0}, 
{"trainFromLabels", (PyCFunction)__pyx_pw_8PyDeepCL_8Adadelta_7trainFromLabels, METH_VARARGS|METH_KEYWORDS, 0}, {0, 0, 0, 0} @@ -27753,11 +27736,18 @@ static void __pyx_tp_dealloc_8PyDeepCL_NeuralNet(PyObject *o) { if (PyObject_CallFinalizerFromDealloc(o)) return; } #endif + { + PyObject *etype, *eval, *etb; + PyErr_Fetch(&etype, &eval, &etb); + ++Py_REFCNT(o); + __pyx_pw_8PyDeepCL_9NeuralNet_3__dealloc__(o); + --Py_REFCNT(o); + PyErr_Restore(etype, eval, etb); + } (*Py_TYPE(o)->tp_free)(o); } static PyMethodDef __pyx_methods_8PyDeepCL_NeuralNet[] = { - {"__dealloc", (PyCFunction)__pyx_pw_8PyDeepCL_9NeuralNet_3__dealloc, METH_NOARGS, 0}, {"asString", (PyCFunction)__pyx_pw_8PyDeepCL_9NeuralNet_5asString, METH_NOARGS, 0}, {"setBatchSize", (PyCFunction)__pyx_pw_8PyDeepCL_9NeuralNet_7setBatchSize, METH_O, 0}, {"forward", (PyCFunction)__pyx_pw_8PyDeepCL_9NeuralNet_9forward, METH_O, 0}, @@ -27863,7 +27853,7 @@ static PyMethodDef __pyx_methods_8PyDeepCL_Layer[] = { {"needsBackProp", (PyCFunction)__pyx_pw_8PyDeepCL_5Layer_7needsBackProp, METH_NOARGS, 0}, {"getOutputCubeSize", (PyCFunction)__pyx_pw_8PyDeepCL_5Layer_9getOutputCubeSize, METH_NOARGS, 0}, {"getOutputPlanes", (PyCFunction)__pyx_pw_8PyDeepCL_5Layer_11getOutputPlanes, METH_NOARGS, 0}, - {"getOutputImageSize", (PyCFunction)__pyx_pw_8PyDeepCL_5Layer_13getOutputImageSize, METH_NOARGS, 0}, + {"getOutputSize", (PyCFunction)__pyx_pw_8PyDeepCL_5Layer_13getOutputSize, METH_NOARGS, 0}, {"getOutput", (PyCFunction)__pyx_pw_8PyDeepCL_5Layer_15getOutput, METH_NOARGS, 0}, {"getWeights", (PyCFunction)__pyx_pw_8PyDeepCL_5Layer_17getWeights, METH_NOARGS, 0}, {"setWeights", (PyCFunction)__pyx_pw_8PyDeepCL_5Layer_19setWeights, METH_O, 0}, @@ -28840,11 +28830,18 @@ static void __pyx_tp_dealloc_8PyDeepCL_NetLearner(PyObject *o) { if (PyObject_CallFinalizerFromDealloc(o)) return; } #endif + { + PyObject *etype, *eval, *etb; + PyErr_Fetch(&etype, &eval, &etb); + ++Py_REFCNT(o); + __pyx_pw_8PyDeepCL_10NetLearner_3__dealloc__(o); + --Py_REFCNT(o); + PyErr_Restore(etype, eval, etb); + } (*Py_TYPE(o)->tp_free)(o); } static PyMethodDef __pyx_methods_8PyDeepCL_NetLearner[] = { - {"__dealloc", (PyCFunction)__pyx_pw_8PyDeepCL_10NetLearner_3__dealloc, METH_NOARGS, 0}, {"setSchedule", (PyCFunction)__pyx_pw_8PyDeepCL_10NetLearner_5setSchedule, METH_O, 0}, {"setDumpTimings", (PyCFunction)__pyx_pw_8PyDeepCL_10NetLearner_7setDumpTimings, METH_O, 0}, {"_run", (PyCFunction)__pyx_pw_8PyDeepCL_10NetLearner_9_run, METH_NOARGS, 0}, @@ -29863,6 +29860,7 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_n_s_Ellipsis, __pyx_k_Ellipsis, sizeof(__pyx_k_Ellipsis), 0, 0, 1, 1}, {&__pyx_kp_s_Empty_shape_tuple_for_cython_arr, __pyx_k_Empty_shape_tuple_for_cython_arr, sizeof(__pyx_k_Empty_shape_tuple_for_cython_arr), 0, 0, 1, 0}, {&__pyx_n_s_Exception, __pyx_k_Exception, sizeof(__pyx_k_Exception), 0, 0, 1, 1}, + {&__pyx_kp_s_GenericLoader_py_getDimensions, __pyx_k_GenericLoader_py_getDimensions, sizeof(__pyx_k_GenericLoader_py_getDimensions), 0, 0, 1, 0}, {&__pyx_n_s_IndexError, __pyx_k_IndexError, sizeof(__pyx_k_IndexError), 0, 0, 1, 1}, {&__pyx_kp_s_Indirect_dimensions_not_supporte, __pyx_k_Indirect_dimensions_not_supporte, sizeof(__pyx_k_Indirect_dimensions_not_supporte), 0, 0, 1, 0}, {&__pyx_kp_s_Invalid_mode_expected_c_or_fortr, __pyx_k_Invalid_mode_expected_c_or_fortr, sizeof(__pyx_k_Invalid_mode_expected_c_or_fortr), 0, 0, 1, 0}, @@ -29883,12 +29881,11 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_n_b_O, __pyx_k_O, sizeof(__pyx_k_O), 0, 0, 0, 1}, 
{&__pyx_kp_s_Out_of_bounds_on_buffer_access_a, __pyx_k_Out_of_bounds_on_buffer_access_a, sizeof(__pyx_k_Out_of_bounds_on_buffer_access_a), 0, 0, 1, 0}, {&__pyx_n_s_PyDeepCL, __pyx_k_PyDeepCL, sizeof(__pyx_k_PyDeepCL), 0, 0, 1, 1}, - {&__pyx_n_s_RuntimeError, __pyx_k_RuntimeError, sizeof(__pyx_k_RuntimeError), 0, 0, 1, 1}, {&__pyx_n_s_Thread, __pyx_k_Thread, sizeof(__pyx_k_Thread), 0, 0, 1, 1}, {&__pyx_n_s_TypeError, __pyx_k_TypeError, sizeof(__pyx_k_TypeError), 0, 0, 1, 1}, {&__pyx_kp_s_Unable_to_convert_item_to_object, __pyx_k_Unable_to_convert_item_to_object, sizeof(__pyx_k_Unable_to_convert_item_to_object), 0, 0, 1, 0}, {&__pyx_n_s_ValueError, __pyx_k_ValueError, sizeof(__pyx_k_ValueError), 0, 0, 1, 1}, - {&__pyx_kp_b__9, __pyx_k__9, sizeof(__pyx_k__9), 0, 0, 0, 0}, + {&__pyx_kp_s_about_to_call_asnewcharstar, __pyx_k_about_to_call_asnewcharstar, sizeof(__pyx_k_about_to_call_asnewcharstar), 0, 0, 1, 0}, {&__pyx_n_s_act, __pyx_k_act, sizeof(__pyx_k_act), 0, 0, 1, 1}, {&__pyx_n_s_allocate_buffer, __pyx_k_allocate_buffer, sizeof(__pyx_k_allocate_buffer), 0, 0, 1, 1}, {&__pyx_n_s_anneal, __pyx_k_anneal, sizeof(__pyx_k_anneal), 0, 0, 1, 1}, @@ -29899,7 +29896,6 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_n_s_batchSize, __pyx_k_batchSize, sizeof(__pyx_k_batchSize), 0, 0, 1, 1}, {&__pyx_n_s_c, __pyx_k_c, sizeof(__pyx_k_c), 0, 0, 1, 1}, {&__pyx_n_u_c, __pyx_k_c, sizeof(__pyx_k_c), 0, 1, 0, 1}, - {&__pyx_n_s_checkException, __pyx_k_checkException, sizeof(__pyx_k_checkException), 0, 0, 1, 1}, {&__pyx_n_s_cl, __pyx_k_cl, sizeof(__pyx_k_cl), 0, 0, 1, 1}, {&__pyx_n_s_class, __pyx_k_class, sizeof(__pyx_k_class), 0, 0, 1, 1}, {&__pyx_n_s_context, __pyx_k_context, sizeof(__pyx_k_context), 0, 0, 1, 1}, @@ -29907,17 +29903,16 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_kp_s_contiguous_and_indirect, __pyx_k_contiguous_and_indirect, sizeof(__pyx_k_contiguous_and_indirect), 0, 0, 1, 0}, {&__pyx_n_s_createNetFromNetdef, __pyx_k_createNetFromNetdef, sizeof(__pyx_k_createNetFromNetdef), 0, 0, 1, 1}, {&__pyx_n_s_daemon, __pyx_k_daemon, sizeof(__pyx_k_daemon), 0, 0, 1, 1}, - {&__pyx_kp_s_data_norep_git_DeepCL_python_Ge, __pyx_k_data_norep_git_DeepCL_python_Ge, sizeof(__pyx_k_data_norep_git_DeepCL_python_Ge), 0, 0, 1, 0}, - {&__pyx_kp_s_data_norep_git_DeepCL_python_La, __pyx_k_data_norep_git_DeepCL_python_La, sizeof(__pyx_k_data_norep_git_DeepCL_python_La), 0, 0, 1, 0}, - {&__pyx_kp_s_data_norep_git_DeepCL_python_Ne, __pyx_k_data_norep_git_DeepCL_python_Ne, sizeof(__pyx_k_data_norep_git_DeepCL_python_Ne), 0, 0, 1, 0}, - {&__pyx_kp_s_data_norep_git_DeepCL_python_Py, __pyx_k_data_norep_git_DeepCL_python_Py, sizeof(__pyx_k_data_norep_git_DeepCL_python_Py), 0, 0, 1, 0}, {&__pyx_n_s_dtype_is_object, __pyx_k_dtype_is_object, sizeof(__pyx_k_dtype_is_object), 0, 0, 1, 1}, {&__pyx_n_s_encode, __pyx_k_encode, sizeof(__pyx_k_encode), 0, 0, 1, 1}, + {&__pyx_n_s_end, __pyx_k_end, sizeof(__pyx_k_end), 0, 0, 1, 1}, {&__pyx_n_s_enumerate, __pyx_k_enumerate, sizeof(__pyx_k_enumerate), 0, 0, 1, 1}, {&__pyx_n_s_epoch, __pyx_k_epoch, sizeof(__pyx_k_epoch), 0, 0, 1, 1}, {&__pyx_n_s_error, __pyx_k_error, sizeof(__pyx_k_error), 0, 0, 1, 1}, {&__pyx_n_s_expectedOutput, __pyx_k_expectedOutput, sizeof(__pyx_k_expectedOutput), 0, 0, 1, 1}, {&__pyx_n_s_f, __pyx_k_f, sizeof(__pyx_k_f), 0, 0, 1, 1}, + {&__pyx_n_s_file, __pyx_k_file, sizeof(__pyx_k_file), 0, 0, 1, 1}, + {&__pyx_kp_s_finished_calling, __pyx_k_finished_calling, sizeof(__pyx_k_finished_calling), 0, 0, 1, 0}, {&__pyx_n_s_flags, __pyx_k_flags, 
sizeof(__pyx_k_flags), 0, 0, 1, 1}, {&__pyx_n_s_format, __pyx_k_format, sizeof(__pyx_k_format), 0, 0, 1, 1}, {&__pyx_n_s_fortran, __pyx_k_fortran, sizeof(__pyx_k_fortran), 0, 0, 1, 1}, @@ -29929,9 +29924,14 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_n_s_getPerception, __pyx_k_getPerception, sizeof(__pyx_k_getPerception), 0, 0, 1, 1}, {&__pyx_n_s_getPerceptionPlanes, __pyx_k_getPerceptionPlanes, sizeof(__pyx_k_getPerceptionPlanes), 0, 0, 1, 1}, {&__pyx_n_s_getPerceptionSize, __pyx_k_getPerceptionSize, sizeof(__pyx_k_getPerceptionSize), 0, 0, 1, 1}, + {&__pyx_kp_s_got_char_result, __pyx_k_got_char_result, sizeof(__pyx_k_got_char_result), 0, 0, 1, 0}, {&__pyx_kp_s_got_differing_extents_in_dimensi, __pyx_k_got_differing_extents_in_dimensi, sizeof(__pyx_k_got_differing_extents_in_dimensi), 0, 0, 1, 0}, {&__pyx_n_s_gpuindex, __pyx_k_gpuindex, sizeof(__pyx_k_gpuindex), 0, 0, 1, 1}, {&__pyx_n_s_hasFinished, __pyx_k_hasFinished, sizeof(__pyx_k_hasFinished), 0, 0, 1, 1}, + {&__pyx_kp_s_home_user_git_DeepCL_python_Gen, __pyx_k_home_user_git_DeepCL_python_Gen, sizeof(__pyx_k_home_user_git_DeepCL_python_Gen), 0, 0, 1, 0}, + {&__pyx_kp_s_home_user_git_DeepCL_python_Lay, __pyx_k_home_user_git_DeepCL_python_Lay, sizeof(__pyx_k_home_user_git_DeepCL_python_Lay), 0, 0, 1, 0}, + {&__pyx_kp_s_home_user_git_DeepCL_python_Net, __pyx_k_home_user_git_DeepCL_python_Net, sizeof(__pyx_k_home_user_git_DeepCL_python_Net), 0, 0, 1, 0}, + {&__pyx_kp_s_home_user_git_DeepCL_python_PyD, __pyx_k_home_user_git_DeepCL_python_PyD, sizeof(__pyx_k_home_user_git_DeepCL_python_PyD), 0, 0, 1, 0}, {&__pyx_n_s_id, __pyx_k_id, sizeof(__pyx_k_id), 0, 0, 1, 1}, {&__pyx_n_s_images, __pyx_k_images, sizeof(__pyx_k_images), 0, 0, 1, 1}, {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1}, @@ -29948,7 +29948,6 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_n_s_load, __pyx_k_load, sizeof(__pyx_k_load), 0, 0, 1, 1}, {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1}, {&__pyx_n_s_memview, __pyx_k_memview, sizeof(__pyx_k_memview), 0, 0, 1, 1}, - {&__pyx_n_s_message, __pyx_k_message, sizeof(__pyx_k_message), 0, 0, 1, 1}, {&__pyx_n_s_mode, __pyx_k_mode, sizeof(__pyx_k_mode), 0, 0, 1, 1}, {&__pyx_n_s_momentum, __pyx_k_momentum, sizeof(__pyx_k_momentum), 0, 0, 1, 1}, {&__pyx_n_s_mythread, __pyx_k_mythread, sizeof(__pyx_k_mythread), 0, 0, 1, 1}, @@ -29957,12 +29956,14 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_n_s_ndim, __pyx_k_ndim, sizeof(__pyx_k_ndim), 0, 0, 1, 1}, {&__pyx_n_s_net, __pyx_k_net, sizeof(__pyx_k_net), 0, 0, 1, 1}, {&__pyx_n_s_netdef, __pyx_k_netdef, sizeof(__pyx_k_netdef), 0, 0, 1, 1}, + {&__pyx_n_s_netdef_charstar, __pyx_k_netdef_charstar, sizeof(__pyx_k_netdef_charstar), 0, 0, 1, 1}, {&__pyx_n_s_neuralnet, __pyx_k_neuralnet, sizeof(__pyx_k_neuralnet), 0, 0, 1, 1}, {&__pyx_kp_s_not_found, __pyx_k_not_found, sizeof(__pyx_k_not_found), 0, 0, 1, 0}, {&__pyx_n_s_numExamples, __pyx_k_numExamples, sizeof(__pyx_k_numExamples), 0, 0, 1, 1}, {&__pyx_n_s_obj, __pyx_k_obj, sizeof(__pyx_k_obj), 0, 0, 1, 1}, {&__pyx_n_s_pack, __pyx_k_pack, sizeof(__pyx_k_pack), 0, 0, 1, 1}, {&__pyx_n_s_planes, __pyx_k_planes, sizeof(__pyx_k_planes), 0, 0, 1, 1}, + {&__pyx_n_s_print, __pyx_k_print, sizeof(__pyx_k_print), 0, 0, 1, 1}, {&__pyx_n_s_pyString, __pyx_k_pyString, sizeof(__pyx_k_pyString), 0, 0, 1, 1}, {&__pyx_n_s_pyx_getbuffer, __pyx_k_pyx_getbuffer, sizeof(__pyx_k_pyx_getbuffer), 0, 0, 1, 1}, {&__pyx_n_s_pyx_vtable, __pyx_k_pyx_vtable, sizeof(__pyx_k_pyx_vtable), 0, 
0, 1, 1}, @@ -29989,11 +29990,10 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_n_s_testData, __pyx_k_testData, sizeof(__pyx_k_testData), 0, 0, 1, 1}, {&__pyx_n_s_testLabels, __pyx_k_testLabels, sizeof(__pyx_k_testLabels), 0, 0, 1, 1}, {&__pyx_n_s_threading, __pyx_k_threading, sizeof(__pyx_k_threading), 0, 0, 1, 1}, - {&__pyx_n_s_threwException, __pyx_k_threwException, sizeof(__pyx_k_threwException), 0, 0, 1, 1}, {&__pyx_n_s_toCppString, __pyx_k_toCppString, sizeof(__pyx_k_toCppString), 0, 0, 1, 1}, {&__pyx_n_s_trainData, __pyx_k_trainData, sizeof(__pyx_k_trainData), 0, 0, 1, 1}, - {&__pyx_n_s_trainFilePath, __pyx_k_trainFilePath, sizeof(__pyx_k_trainFilePath), 0, 0, 1, 1}, {&__pyx_n_s_trainFilepath, __pyx_k_trainFilepath, sizeof(__pyx_k_trainFilepath), 0, 0, 1, 1}, + {&__pyx_n_s_trainFilepath_charstar, __pyx_k_trainFilepath_charstar, sizeof(__pyx_k_trainFilepath_charstar), 0, 0, 1, 1}, {&__pyx_n_s_trainLabels, __pyx_k_trainLabels, sizeof(__pyx_k_trainLabels), 0, 0, 1, 1}, {&__pyx_kp_s_unable_to_allocate_array_data, __pyx_k_unable_to_allocate_array_data, sizeof(__pyx_k_unable_to_allocate_array_data), 0, 0, 1, 0}, {&__pyx_kp_s_unable_to_allocate_shape_and_str, __pyx_k_unable_to_allocate_shape_and_str, sizeof(__pyx_k_unable_to_allocate_shape_and_str), 0, 0, 1, 0}, @@ -30003,9 +30003,8 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { }; static int __Pyx_InitCachedBuiltins(void) { __pyx_builtin_staticmethod = __Pyx_GetBuiltinName(__pyx_n_s_staticmethod); if (!__pyx_builtin_staticmethod) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_builtin_Exception = __Pyx_GetBuiltinName(__pyx_n_s_Exception); if (!__pyx_builtin_Exception) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_builtin_RuntimeError = __Pyx_GetBuiltinName(__pyx_n_s_RuntimeError); if (!__pyx_builtin_RuntimeError) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 37; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_builtin_Exception = __Pyx_GetBuiltinName(__pyx_n_s_Exception); if (!__pyx_builtin_Exception) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 55; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_builtin_MemoryError = __Pyx_GetBuiltinName(__pyx_n_s_MemoryError); if (!__pyx_builtin_MemoryError) {__pyx_filename = __pyx_f[15]; __pyx_lineno = 107; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 127; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 145; __pyx_clineno = __LINE__; goto __pyx_L1_error;} @@ -30022,114 +30021,114 @@ static int __Pyx_InitCachedConstants(void) { __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); - /* "Layer.pyx":52 + /* "../../../../../../home/user/git/DeepCL/python/Layer.pyx":52 * # void unpersistFromArray(const float *array) * def setWeightsList(self, weightsList): * cdef 
c_array.array weightsArray = array('f') # <<<<<<<<<<<<<< * weightsArray.fromlist( weightsList ) * self.setWeights( weightsArray ) */ - __pyx_tuple_ = PyTuple_Pack(1, __pyx_n_s_f); if (unlikely(!__pyx_tuple_)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_tuple_ = PyTuple_Pack(1, __pyx_n_s_f); if (unlikely(!__pyx_tuple_)) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_tuple_); __Pyx_GIVEREF(__pyx_tuple_); - /* "QLearning.pyx":75 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":75 * * def getPerceptionSize(self): * raise Exception("Method needs to be overridden: Scenario.getPerceptionSize()") # <<<<<<<<<<<<<< * * def getPerceptionPlanes(self): */ - __pyx_tuple__2 = PyTuple_Pack(1, __pyx_kp_s_Method_needs_to_be_overridden_Sc); if (unlikely(!__pyx_tuple__2)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 75; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_tuple__2 = PyTuple_Pack(1, __pyx_kp_s_Method_needs_to_be_overridden_Sc); if (unlikely(!__pyx_tuple__2)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 75; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_tuple__2); __Pyx_GIVEREF(__pyx_tuple__2); - /* "QLearning.pyx":78 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":78 * * def getPerceptionPlanes(self): * raise Exception("Method needs to be overridden: Scenario.getPerceptionPlanes()") # <<<<<<<<<<<<<< * * def reset(self): */ - __pyx_tuple__3 = PyTuple_Pack(1, __pyx_kp_s_Method_needs_to_be_overridden_Sc_2); if (unlikely(!__pyx_tuple__3)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 78; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_tuple__3 = PyTuple_Pack(1, __pyx_kp_s_Method_needs_to_be_overridden_Sc_2); if (unlikely(!__pyx_tuple__3)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 78; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_tuple__3); __Pyx_GIVEREF(__pyx_tuple__3); - /* "QLearning.pyx":81 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":81 * * def reset(self): * raise Exception("Method needs to be overridden: Scenario.reset()") # <<<<<<<<<<<<<< * * def getNumActions(self): */ - __pyx_tuple__4 = PyTuple_Pack(1, __pyx_kp_s_Method_needs_to_be_overridden_Sc_3); if (unlikely(!__pyx_tuple__4)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 81; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_tuple__4 = PyTuple_Pack(1, __pyx_kp_s_Method_needs_to_be_overridden_Sc_3); if (unlikely(!__pyx_tuple__4)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 81; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_tuple__4); __Pyx_GIVEREF(__pyx_tuple__4); - /* "QLearning.pyx":84 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":84 * * def getNumActions(self): * raise Exception("Method needs to be overridden: Scenario.getNumActions()") # <<<<<<<<<<<<<< * * def act(self, index): */ - __pyx_tuple__5 = PyTuple_Pack(1, __pyx_kp_s_Method_needs_to_be_overridden_Sc_4); if (unlikely(!__pyx_tuple__5)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_tuple__5 = PyTuple_Pack(1, __pyx_kp_s_Method_needs_to_be_overridden_Sc_4); if (unlikely(!__pyx_tuple__5)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_tuple__5); __Pyx_GIVEREF(__pyx_tuple__5); - /* "QLearning.pyx":87 + /* 
"../../../../../../home/user/git/DeepCL/python/QLearning.pyx":87 * * def act(self, index): * raise Exception("Method needs to be overridden: Scenario.act()") # <<<<<<<<<<<<<< * * def hasFinished(self): */ - __pyx_tuple__6 = PyTuple_Pack(1, __pyx_kp_s_Method_needs_to_be_overridden_Sc_5); if (unlikely(!__pyx_tuple__6)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 87; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_tuple__6 = PyTuple_Pack(1, __pyx_kp_s_Method_needs_to_be_overridden_Sc_5); if (unlikely(!__pyx_tuple__6)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 87; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_tuple__6); __Pyx_GIVEREF(__pyx_tuple__6); - /* "QLearning.pyx":90 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":90 * * def hasFinished(self): * raise Exception("Method needs to be overridden: Scenario.hasFinished()") # <<<<<<<<<<<<<< * * #[[[end]]] */ - __pyx_tuple__7 = PyTuple_Pack(1, __pyx_kp_s_Method_needs_to_be_overridden_Sc_6); if (unlikely(!__pyx_tuple__7)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_tuple__7 = PyTuple_Pack(1, __pyx_kp_s_Method_needs_to_be_overridden_Sc_6); if (unlikely(!__pyx_tuple__7)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_tuple__7); __Pyx_GIVEREF(__pyx_tuple__7); - /* "QLearning.pyx":101 + /* "../../../../../../home/user/git/DeepCL/python/QLearning.pyx":101 * * def getPerception(self, perception): * raise Exception("Method needs to be overridden: Scenario.getPerception()") # <<<<<<<<<<<<<< * */ - __pyx_tuple__8 = PyTuple_Pack(1, __pyx_kp_s_Method_needs_to_be_overridden_Sc_7); if (unlikely(!__pyx_tuple__8)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 101; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_tuple__8 = PyTuple_Pack(1, __pyx_kp_s_Method_needs_to_be_overridden_Sc_7); if (unlikely(!__pyx_tuple__8)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 101; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_tuple__8); __Pyx_GIVEREF(__pyx_tuple__8); - /* "PyDeepCL.pyx":44 + /* "PyDeepCL.pyx":46 * mythread.start() * while mythread.isAlive(): * mythread.join(0.1) # <<<<<<<<<<<<<< * #print('join timed out') * */ - __pyx_tuple__10 = PyTuple_Pack(1, __pyx_float_0_1); if (unlikely(!__pyx_tuple__10)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__10); - __Pyx_GIVEREF(__pyx_tuple__10); + __pyx_tuple__9 = PyTuple_Pack(1, __pyx_float_0_1); if (unlikely(!__pyx_tuple__9)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 46; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__9); + __Pyx_GIVEREF(__pyx_tuple__9); - /* "PyDeepCL.pyx":49 + /* "PyDeepCL.pyx":51 * def toCppString( pyString ): * if isinstance( pyString, unicode ): * return pyString.encode('utf8') # <<<<<<<<<<<<<< * return pyString * */ - __pyx_tuple__11 = PyTuple_Pack(1, __pyx_n_s_utf8); if (unlikely(!__pyx_tuple__11)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 49; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__11); - __Pyx_GIVEREF(__pyx_tuple__11); + __pyx_tuple__10 = PyTuple_Pack(1, __pyx_n_s_utf8); if (unlikely(!__pyx_tuple__10)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 51; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__10); + __Pyx_GIVEREF(__pyx_tuple__10); /* "View.MemoryView":127 * @@ -30138,9 +30137,9 @@ static int 
__Pyx_InitCachedConstants(void) { * * if itemsize <= 0: */ - __pyx_tuple__12 = PyTuple_Pack(1, __pyx_kp_s_Empty_shape_tuple_for_cython_arr); if (unlikely(!__pyx_tuple__12)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 127; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__12); - __Pyx_GIVEREF(__pyx_tuple__12); + __pyx_tuple__11 = PyTuple_Pack(1, __pyx_kp_s_Empty_shape_tuple_for_cython_arr); if (unlikely(!__pyx_tuple__11)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 127; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__11); + __Pyx_GIVEREF(__pyx_tuple__11); /* "View.MemoryView":130 * @@ -30149,9 +30148,9 @@ static int __Pyx_InitCachedConstants(void) { * * if isinstance(format, unicode): */ - __pyx_tuple__13 = PyTuple_Pack(1, __pyx_kp_s_itemsize_0_for_cython_array); if (unlikely(!__pyx_tuple__13)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 130; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__13); - __Pyx_GIVEREF(__pyx_tuple__13); + __pyx_tuple__12 = PyTuple_Pack(1, __pyx_kp_s_itemsize_0_for_cython_array); if (unlikely(!__pyx_tuple__12)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 130; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__12); + __Pyx_GIVEREF(__pyx_tuple__12); /* "View.MemoryView":142 * @@ -30160,9 +30159,9 @@ static int __Pyx_InitCachedConstants(void) { * * */ - __pyx_tuple__14 = PyTuple_Pack(1, __pyx_kp_s_unable_to_allocate_shape_and_str); if (unlikely(!__pyx_tuple__14)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 142; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__14); - __Pyx_GIVEREF(__pyx_tuple__14); + __pyx_tuple__13 = PyTuple_Pack(1, __pyx_kp_s_unable_to_allocate_shape_and_str); if (unlikely(!__pyx_tuple__13)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 142; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__13); + __Pyx_GIVEREF(__pyx_tuple__13); /* "View.MemoryView":170 * self.data = malloc(self.len) @@ -30171,9 +30170,9 @@ static int __Pyx_InitCachedConstants(void) { * * if self.dtype_is_object: */ - __pyx_tuple__15 = PyTuple_Pack(1, __pyx_kp_s_unable_to_allocate_array_data); if (unlikely(!__pyx_tuple__15)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 170; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__15); - __Pyx_GIVEREF(__pyx_tuple__15); + __pyx_tuple__14 = PyTuple_Pack(1, __pyx_kp_s_unable_to_allocate_array_data); if (unlikely(!__pyx_tuple__14)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 170; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__14); + __Pyx_GIVEREF(__pyx_tuple__14); /* "View.MemoryView":186 * bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS @@ -30182,9 +30181,9 @@ static int __Pyx_InitCachedConstants(void) { * info.buf = self.data * info.len = self.len */ - __pyx_tuple__16 = PyTuple_Pack(1, __pyx_kp_s_Can_only_create_a_buffer_that_is); if (unlikely(!__pyx_tuple__16)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 186; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__16); - __Pyx_GIVEREF(__pyx_tuple__16); + __pyx_tuple__15 = PyTuple_Pack(1, __pyx_kp_s_Can_only_create_a_buffer_that_is); if (unlikely(!__pyx_tuple__15)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 186; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__15); + __Pyx_GIVEREF(__pyx_tuple__15); /* "View.MemoryView":445 * result = struct.unpack(self.view.format, bytesitem) @@ -30193,9 +30192,9 @@ static int 
__Pyx_InitCachedConstants(void) { * else: * if len(self.view.format) == 1: */ - __pyx_tuple__17 = PyTuple_Pack(1, __pyx_kp_s_Unable_to_convert_item_to_object); if (unlikely(!__pyx_tuple__17)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 445; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__17); - __Pyx_GIVEREF(__pyx_tuple__17); + __pyx_tuple__16 = PyTuple_Pack(1, __pyx_kp_s_Unable_to_convert_item_to_object); if (unlikely(!__pyx_tuple__16)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 445; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__16); + __Pyx_GIVEREF(__pyx_tuple__16); /* "View.MemoryView":521 * if self.view.strides == NULL: @@ -30204,9 +30203,9 @@ static int __Pyx_InitCachedConstants(void) { * * return tuple([stride for stride in self.view.strides[:self.view.ndim]]) */ - __pyx_tuple__18 = PyTuple_Pack(1, __pyx_kp_s_Buffer_view_does_not_expose_stri); if (unlikely(!__pyx_tuple__18)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 521; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__18); - __Pyx_GIVEREF(__pyx_tuple__18); + __pyx_tuple__17 = PyTuple_Pack(1, __pyx_kp_s_Buffer_view_does_not_expose_stri); if (unlikely(!__pyx_tuple__17)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 521; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__17); + __Pyx_GIVEREF(__pyx_tuple__17); /* "View.MemoryView":529 * def __get__(self): @@ -30215,12 +30214,12 @@ static int __Pyx_InitCachedConstants(void) { * * return tuple([suboffset for suboffset in self.view.suboffsets[:self.view.ndim]]) */ - __pyx_tuple__19 = PyTuple_New(1); if (unlikely(!__pyx_tuple__19)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 529; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__19); + __pyx_tuple__18 = PyTuple_New(1); if (unlikely(!__pyx_tuple__18)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 529; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__18); __Pyx_INCREF(__pyx_int_neg_1); - PyTuple_SET_ITEM(__pyx_tuple__19, 0, __pyx_int_neg_1); + PyTuple_SET_ITEM(__pyx_tuple__18, 0, __pyx_int_neg_1); __Pyx_GIVEREF(__pyx_int_neg_1); - __Pyx_GIVEREF(__pyx_tuple__19); + __Pyx_GIVEREF(__pyx_tuple__18); /* "View.MemoryView":638 * if item is Ellipsis: @@ -30229,9 +30228,9 @@ static int __Pyx_InitCachedConstants(void) { * seen_ellipsis = True * else: */ - __pyx_slice__20 = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice__20)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 638; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_slice__20); - __Pyx_GIVEREF(__pyx_slice__20); + __pyx_slice__19 = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice__19)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 638; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_slice__19); + __Pyx_GIVEREF(__pyx_slice__19); /* "View.MemoryView":641 * seen_ellipsis = True @@ -30240,9 +30239,9 @@ static int __Pyx_InitCachedConstants(void) { * have_slices = True * else: */ - __pyx_slice__21 = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice__21)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 641; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_slice__21); - __Pyx_GIVEREF(__pyx_slice__21); + __pyx_slice__20 = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice__20)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 641; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_slice__20); + 
__Pyx_GIVEREF(__pyx_slice__20); /* "View.MemoryView":652 * nslices = ndim - len(result) @@ -30251,9 +30250,9 @@ static int __Pyx_InitCachedConstants(void) { * * return have_slices or nslices, tuple(result) */ - __pyx_slice__22 = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice__22)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 652; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_slice__22); - __Pyx_GIVEREF(__pyx_slice__22); + __pyx_slice__21 = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice__21)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 652; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_slice__21); + __Pyx_GIVEREF(__pyx_slice__21); /* "View.MemoryView":659 * for suboffset in suboffsets[:ndim]: @@ -30262,171 +30261,159 @@ static int __Pyx_InitCachedConstants(void) { * * */ - __pyx_tuple__23 = PyTuple_Pack(1, __pyx_kp_s_Indirect_dimensions_not_supporte); if (unlikely(!__pyx_tuple__23)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 659; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__23); - __Pyx_GIVEREF(__pyx_tuple__23); + __pyx_tuple__22 = PyTuple_Pack(1, __pyx_kp_s_Indirect_dimensions_not_supporte); if (unlikely(!__pyx_tuple__22)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 659; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__22); + __Pyx_GIVEREF(__pyx_tuple__22); - /* "LayerMaker.pyx":18 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":18 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< * return NormalizationLayerMaker() * */ - __pyx_codeobj__24 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_La, __pyx_n_s_instance, 18, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__24)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_codeobj__23 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_user_git_DeepCL_python_Lay, __pyx_n_s_instance, 18, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__23)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "LayerMaker.pyx":41 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":41 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< * return FullyConnectedMaker() * */ - __pyx_codeobj__25 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_La, __pyx_n_s_instance, 41, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__25)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_codeobj__24 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_user_git_DeepCL_python_Lay, __pyx_n_s_instance, 41, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__24)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "LayerMaker.pyx":70 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":70 * return self * @staticmethod * def 
instance(): # <<<<<<<<<<<<<< * return ConvolutionalMaker() * */ - __pyx_codeobj__26 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_La, __pyx_n_s_instance, 70, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__26)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_codeobj__25 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_user_git_DeepCL_python_Lay, __pyx_n_s_instance, 70, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__25)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "LayerMaker.pyx":84 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":84 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< * return PoolingMaker() * */ - __pyx_codeobj__27 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_La, __pyx_n_s_instance, 84, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__27)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_codeobj__26 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_user_git_DeepCL_python_Lay, __pyx_n_s_instance, 84, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__26)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "LayerMaker.pyx":96 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":96 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< * return ActivationMaker() * */ - __pyx_codeobj__28 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_La, __pyx_n_s_instance, 96, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__28)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 96; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_codeobj__27 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_user_git_DeepCL_python_Lay, __pyx_n_s_instance, 96, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__27)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 96; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "LayerMaker.pyx":117 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":117 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< * return ActivationMaker() * */ - __pyx_codeobj__29 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_La, __pyx_n_s_instance, 117, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__29)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 117; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_codeobj__28 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, 
__pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_user_git_DeepCL_python_Lay, __pyx_n_s_instance, 117, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__28)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 117; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "LayerMaker.pyx":128 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":128 * # del self.thisptr * @staticmethod * def instance(): # <<<<<<<<<<<<<< * return ForceBackpropMaker() * */ - __pyx_codeobj__30 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_La, __pyx_n_s_instance, 128, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__30)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 128; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_codeobj__29 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_user_git_DeepCL_python_Lay, __pyx_n_s_instance, 128, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__29)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 128; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "LayerMaker.pyx":139 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":139 * # del self.thisptr * @staticmethod * def instance(): # <<<<<<<<<<<<<< * return SquareLossMaker() * */ - __pyx_codeobj__31 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_La, __pyx_n_s_instance, 139, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__31)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 139; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_codeobj__30 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_user_git_DeepCL_python_Lay, __pyx_n_s_instance, 139, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__30)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 139; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "LayerMaker.pyx":150 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":150 * # del self.thisptr * @staticmethod * def instance(): # <<<<<<<<<<<<<< * return SoftMaxMaker() * */ - __pyx_codeobj__32 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_La, __pyx_n_s_instance, 150, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__32)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 150; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_codeobj__31 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_user_git_DeepCL_python_Lay, __pyx_n_s_instance, 150, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__31)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 150; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "LayerMaker.pyx":167 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":167 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< * return InputLayerMaker() * */ - __pyx_codeobj__33 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, 
__pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_La, __pyx_n_s_instance, 167, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__33)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 167; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_codeobj__32 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_user_git_DeepCL_python_Lay, __pyx_n_s_instance, 167, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__32)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 167; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "GenericLoader.pyx":3 + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":3 * cdef class GenericLoader: * @staticmethod - * def getDimensions( trainFilePath ): # <<<<<<<<<<<<<< + * def getDimensions( trainFilepath ): # <<<<<<<<<<<<<< + * print 'GenericLoader.py getDimensions ', trainFilepath * cdef int N - * cdef int planes */ - __pyx_tuple__34 = PyTuple_Pack(4, __pyx_n_s_trainFilePath, __pyx_n_s_N, __pyx_n_s_planes, __pyx_n_s_size); if (unlikely(!__pyx_tuple__34)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__34); - __Pyx_GIVEREF(__pyx_tuple__34); - __pyx_codeobj__35 = (PyObject*)__Pyx_PyCode_New(1, 0, 4, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__34, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_Ge, __pyx_n_s_getDimensions, 3, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__35)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_tuple__33 = PyTuple_Pack(5, __pyx_n_s_trainFilepath, __pyx_n_s_N, __pyx_n_s_planes, __pyx_n_s_size, __pyx_n_s_trainFilepath_charstar); if (unlikely(!__pyx_tuple__33)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__33); + __Pyx_GIVEREF(__pyx_tuple__33); + __pyx_codeobj__34 = (PyObject*)__Pyx_PyCode_New(1, 0, 5, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__33, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_user_git_DeepCL_python_Gen, __pyx_n_s_getDimensions, 3, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__34)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "GenericLoader.pyx":10 + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":13 * return (N,planes,size) * @staticmethod * def load( trainFilepath, float[:] images, int[:] labels, startN, numExamples ): # <<<<<<<<<<<<<< - * cDeepCL.GenericLoader.load( toCppString(trainFilepath), &images[0], &labels[0], startN , numExamples ) - * + * cdef const char *trainFilepath_charstar = trainFilepath + * cDeepCL.GenericLoader.load(trainFilepath_charstar, &images[0], &labels[0], startN , numExamples) */ - __pyx_tuple__36 = PyTuple_Pack(5, __pyx_n_s_trainFilepath, __pyx_n_s_images, __pyx_n_s_labels, __pyx_n_s_startN, __pyx_n_s_numExamples); if (unlikely(!__pyx_tuple__36)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__36); - __Pyx_GIVEREF(__pyx_tuple__36); - __pyx_codeobj__37 = (PyObject*)__Pyx_PyCode_New(5, 0, 5, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__36, __pyx_empty_tuple, 
__pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_Ge, __pyx_n_s_load, 10, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__37)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_tuple__35 = PyTuple_Pack(6, __pyx_n_s_trainFilepath, __pyx_n_s_images, __pyx_n_s_labels, __pyx_n_s_startN, __pyx_n_s_numExamples, __pyx_n_s_trainFilepath_charstar); if (unlikely(!__pyx_tuple__35)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__35); + __Pyx_GIVEREF(__pyx_tuple__35); + __pyx_codeobj__36 = (PyObject*)__Pyx_PyCode_New(5, 0, 6, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__35, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_user_git_DeepCL_python_Gen, __pyx_n_s_load, 13, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__36)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "NetDefToNet.pyx":3 + /* "../../../../../../home/user/git/DeepCL/python/NetDefToNet.pyx":3 * cdef class NetdefToNet: * @staticmethod * def createNetFromNetdef( NeuralNet neuralnet, netdef ): # <<<<<<<<<<<<<< - * return cDeepCL.NetdefToNet.createNetFromNetdef( neuralnet.thisptr, toCppString( netdef ) ) - * - */ - __pyx_tuple__38 = PyTuple_Pack(2, __pyx_n_s_neuralnet, __pyx_n_s_netdef); if (unlikely(!__pyx_tuple__38)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__38); - __Pyx_GIVEREF(__pyx_tuple__38); - __pyx_codeobj__39 = (PyObject*)__Pyx_PyCode_New(2, 0, 2, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__38, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_Ne, __pyx_n_s_createNetFromNetdef, 3, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__39)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - - /* "PyDeepCL.pyx":31 - * include "QLearning.pyx" - * - * def checkException(): # <<<<<<<<<<<<<< - * cdef int threwException = 0 - * cdef string message = "" + * cdef const char *netdef_charstar = netdef + * return cDeepCL.NetdefToNet.createNetFromNetdefCharStar(neuralnet.thisptr, netdef_charstar) */ - __pyx_tuple__40 = PyTuple_Pack(2, __pyx_n_s_threwException, __pyx_n_s_message); if (unlikely(!__pyx_tuple__40)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__40); - __Pyx_GIVEREF(__pyx_tuple__40); - __pyx_codeobj__41 = (PyObject*)__Pyx_PyCode_New(0, 0, 2, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__40, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_Py, __pyx_n_s_checkException, 31, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__41)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_tuple__37 = PyTuple_Pack(3, __pyx_n_s_neuralnet, __pyx_n_s_netdef, __pyx_n_s_netdef_charstar); if (unlikely(!__pyx_tuple__37)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__37); + __Pyx_GIVEREF(__pyx_tuple__37); + __pyx_codeobj__38 = (PyObject*)__Pyx_PyCode_New(2, 0, 3, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__37, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_user_git_DeepCL_python_Net, __pyx_n_s_createNetFromNetdef, 3, __pyx_empty_bytes); if 
(unlikely(!__pyx_codeobj__38)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "PyDeepCL.pyx":39 - * raise RuntimeError(message) + /* "PyDeepCL.pyx":41 + * # raise RuntimeError(message) * * def interruptableCall( function, args ): # <<<<<<<<<<<<<< * mythread = threading.Thread( target=function, args = args ) * mythread.daemon = True */ - __pyx_tuple__42 = PyTuple_Pack(3, __pyx_n_s_function, __pyx_n_s_args, __pyx_n_s_mythread); if (unlikely(!__pyx_tuple__42)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__42); - __Pyx_GIVEREF(__pyx_tuple__42); - __pyx_codeobj__43 = (PyObject*)__Pyx_PyCode_New(2, 0, 3, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__42, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_Py, __pyx_n_s_interruptableCall, 39, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__43)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_tuple__39 = PyTuple_Pack(3, __pyx_n_s_function, __pyx_n_s_args, __pyx_n_s_mythread); if (unlikely(!__pyx_tuple__39)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__39); + __Pyx_GIVEREF(__pyx_tuple__39); + __pyx_codeobj__40 = (PyObject*)__Pyx_PyCode_New(2, 0, 3, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__39, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_user_git_DeepCL_python_PyD, __pyx_n_s_interruptableCall, 41, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__40)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - /* "PyDeepCL.pyx":47 + /* "PyDeepCL.pyx":49 * #print('join timed out') * * def toCppString( pyString ): # <<<<<<<<<<<<<< * if isinstance( pyString, unicode ): * return pyString.encode('utf8') */ - __pyx_tuple__44 = PyTuple_Pack(1, __pyx_n_s_pyString); if (unlikely(!__pyx_tuple__44)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 47; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__44); - __Pyx_GIVEREF(__pyx_tuple__44); - __pyx_codeobj__45 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__44, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_data_norep_git_DeepCL_python_Py, __pyx_n_s_toCppString, 47, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__45)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 47; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_tuple__41 = PyTuple_Pack(1, __pyx_n_s_pyString); if (unlikely(!__pyx_tuple__41)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 49; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__41); + __Pyx_GIVEREF(__pyx_tuple__41); + __pyx_codeobj__42 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__41, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_user_git_DeepCL_python_PyD, __pyx_n_s_toCppString, 49, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__42)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 49; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* "View.MemoryView":276 * return self.name @@ -30435,9 +30422,9 @@ static int __Pyx_InitCachedConstants(void) { * cdef strided = Enum("") # default * cdef indirect = Enum("") */ - __pyx_tuple__46 = PyTuple_Pack(1, __pyx_kp_s_strided_and_direct_or_indirect); if 
(unlikely(!__pyx_tuple__46)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 276; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__46); - __Pyx_GIVEREF(__pyx_tuple__46); + __pyx_tuple__43 = PyTuple_Pack(1, __pyx_kp_s_strided_and_direct_or_indirect); if (unlikely(!__pyx_tuple__43)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 276; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__43); + __Pyx_GIVEREF(__pyx_tuple__43); /* "View.MemoryView":277 * @@ -30446,9 +30433,9 @@ static int __Pyx_InitCachedConstants(void) { * cdef indirect = Enum("") * */ - __pyx_tuple__47 = PyTuple_Pack(1, __pyx_kp_s_strided_and_direct); if (unlikely(!__pyx_tuple__47)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 277; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__47); - __Pyx_GIVEREF(__pyx_tuple__47); + __pyx_tuple__44 = PyTuple_Pack(1, __pyx_kp_s_strided_and_direct); if (unlikely(!__pyx_tuple__44)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 277; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__44); + __Pyx_GIVEREF(__pyx_tuple__44); /* "View.MemoryView":278 * cdef generic = Enum("") @@ -30457,9 +30444,9 @@ static int __Pyx_InitCachedConstants(void) { * * */ - __pyx_tuple__48 = PyTuple_Pack(1, __pyx_kp_s_strided_and_indirect); if (unlikely(!__pyx_tuple__48)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 278; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__48); - __Pyx_GIVEREF(__pyx_tuple__48); + __pyx_tuple__45 = PyTuple_Pack(1, __pyx_kp_s_strided_and_indirect); if (unlikely(!__pyx_tuple__45)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 278; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__45); + __Pyx_GIVEREF(__pyx_tuple__45); /* "View.MemoryView":281 * @@ -30468,9 +30455,9 @@ static int __Pyx_InitCachedConstants(void) { * cdef indirect_contiguous = Enum("") * */ - __pyx_tuple__49 = PyTuple_Pack(1, __pyx_kp_s_contiguous_and_direct); if (unlikely(!__pyx_tuple__49)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 281; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__49); - __Pyx_GIVEREF(__pyx_tuple__49); + __pyx_tuple__46 = PyTuple_Pack(1, __pyx_kp_s_contiguous_and_direct); if (unlikely(!__pyx_tuple__46)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 281; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__46); + __Pyx_GIVEREF(__pyx_tuple__46); /* "View.MemoryView":282 * @@ -30479,9 +30466,9 @@ static int __Pyx_InitCachedConstants(void) { * * */ - __pyx_tuple__50 = PyTuple_Pack(1, __pyx_kp_s_contiguous_and_indirect); if (unlikely(!__pyx_tuple__50)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 282; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_tuple__50); - __Pyx_GIVEREF(__pyx_tuple__50); + __pyx_tuple__47 = PyTuple_Pack(1, __pyx_kp_s_contiguous_and_indirect); if (unlikely(!__pyx_tuple__47)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 282; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__47); + __Pyx_GIVEREF(__pyx_tuple__47); __Pyx_RefNannyFinishContext(); return 0; __pyx_L1_error:; @@ -30490,14 +30477,14 @@ static int __Pyx_InitCachedConstants(void) { } static int __Pyx_InitGlobals(void) { - if (__Pyx_InitStrings(__pyx_string_tab) < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; - __pyx_float_0_0 = PyFloat_FromDouble(0.0); if (unlikely(!__pyx_float_0_0)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; 
__pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_float_0_1 = PyFloat_FromDouble(0.1); if (unlikely(!__pyx_float_0_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_float_0_9 = PyFloat_FromDouble(0.9); if (unlikely(!__pyx_float_0_9)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_float_1_0 = PyFloat_FromDouble(1.0); if (unlikely(!__pyx_float_1_0)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_int_0 = PyInt_FromLong(0); if (unlikely(!__pyx_int_0)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_int_1 = PyInt_FromLong(1); if (unlikely(!__pyx_int_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_int_neg_1 = PyInt_FromLong(-1); if (unlikely(!__pyx_int_neg_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (__Pyx_InitStrings(__pyx_string_tab) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_float_0_0 = PyFloat_FromDouble(0.0); if (unlikely(!__pyx_float_0_0)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_float_0_1 = PyFloat_FromDouble(0.1); if (unlikely(!__pyx_float_0_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_float_0_9 = PyFloat_FromDouble(0.9); if (unlikely(!__pyx_float_0_9)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_float_1_0 = PyFloat_FromDouble(1.0); if (unlikely(!__pyx_float_1_0)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_int_0 = PyInt_FromLong(0); if (unlikely(!__pyx_int_0)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_int_1 = PyInt_FromLong(1); if (unlikely(!__pyx_int_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_int_neg_1 = PyInt_FromLong(-1); if (unlikely(!__pyx_int_neg_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} return 0; __pyx_L1_error:; return -1; @@ -30527,17 +30514,17 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) } #endif __Pyx_RefNannySetupContext("PyMODINIT_FUNC PyInit_PyDeepCL(void)", 0); - if ( __Pyx_check_binary_version() < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if ( __Pyx_check_binary_version() < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} 
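/*
 * Readable summary of the wrapper change driving the generated-code churn in this file:
 * the banner comments in the hunks above quote the hand-written .pyx sources, and reassembled
 * they show strings now crossing the wrapper boundary as plain `const char *` locals (the new
 * *_charstar entries in the string table) instead of routing through toCppString()/std::string,
 * with the old checkException()/threwException/message plumbing removed. A minimal sketch of
 * the new pattern, assembled only from the .pyx fragments quoted in this diff; the surrounding
 * class bodies and the cDeepCL cimport are assumed, not shown in these hunks:
 *
 *     # GenericLoader.pyx, as quoted in the banner comments above
 *     @staticmethod
 *     def load( trainFilepath, float[:] images, int[:] labels, startN, numExamples ):
 *         cdef const char *trainFilepath_charstar = trainFilepath
 *         cDeepCL.GenericLoader.load(trainFilepath_charstar, &images[0], &labels[0], startN , numExamples)
 *
 *     # NetDefToNet.pyx, as quoted in the banner comments above
 *     @staticmethod
 *     def createNetFromNetdef( NeuralNet neuralnet, netdef ):
 *         cdef const char *netdef_charstar = netdef
 *         return cDeepCL.NetdefToNet.createNetFromNetdefCharStar(neuralnet.thisptr, netdef_charstar)
 *
 * The widened PyTuple_Pack calls (4->5, 5->6, 2->3 slots) and the new
 * __pyx_n_s_netdef_charstar / __pyx_n_s_trainFilepath_charstar string-table entries are simply
 * Cython's generated bookkeeping for these two new local variables.
 */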
#ifdef __Pyx_CyFunction_USED
- if (__Pyx_CyFunction_init() < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ if (__Pyx_CyFunction_init() < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
 #endif
 #ifdef __Pyx_FusedFunction_USED
- if (__pyx_FusedFunction_init() < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ if (__pyx_FusedFunction_init() < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
 #endif
 #ifdef __Pyx_Generator_USED
- if (__pyx_Generator_init() < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ if (__pyx_Generator_init() < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
 #endif
 /*--- Library function declarations ---*/
 /*--- Threads initialization code ---*/
@@ -30552,34 +30539,34 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void)
 #else
 __pyx_m = PyModule_Create(&__pyx_moduledef);
 #endif
- if (unlikely(!__pyx_m)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
- __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ if (unlikely(!__pyx_m)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
 Py_INCREF(__pyx_d);
- __pyx_b = PyImport_AddModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_b)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __pyx_b = PyImport_AddModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_b)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
 #if CYTHON_COMPILING_IN_PYPY
 Py_INCREF(__pyx_b);
 #endif
- if (PyObject_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+ if (PyObject_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
 /*--- Initialize various global constants etc.
---*/ - if (unlikely(__Pyx_InitGlobals() < 0)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(__Pyx_InitGlobals() < 0)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT) - if (__Pyx_init_sys_getdefaultencoding_params() < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (__Pyx_init_sys_getdefaultencoding_params() < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} #endif if (__pyx_module_is_main_PyDeepCL) { - if (PyObject_SetAttrString(__pyx_m, "__name__", __pyx_n_s_main) < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + if (PyObject_SetAttrString(__pyx_m, "__name__", __pyx_n_s_main) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; } #if PY_MAJOR_VERSION >= 3 { - PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} if (!PyDict_GetItemString(modules, "PyDeepCL")) { - if (unlikely(PyDict_SetItemString(modules, "PyDeepCL", __pyx_m) < 0)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(PyDict_SetItemString(modules, "PyDeepCL", __pyx_m) < 0)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif /*--- Builtin init code ---*/ - if (unlikely(__Pyx_InitCachedBuiltins() < 0)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(__Pyx_InitCachedBuiltins() < 0)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /*--- Constants init code ---*/ - if (unlikely(__Pyx_InitCachedConstants() < 0)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(__Pyx_InitCachedConstants() < 0)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /*--- Global init code ---*/ generic = Py_None; Py_INCREF(Py_None); strided = Py_None; Py_INCREF(Py_None); @@ -30589,37 +30576,37 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) /*--- Variable export code ---*/ /*--- Function export code ---*/ /*--- Type init code ---*/ - if (PyType_Ready(&__pyx_type_8PyDeepCL_EasyCL) < 0) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_type_8PyDeepCL_EasyCL.tp_print = 0; - if (PyObject_SetAttrString(__pyx_m, "EasyCL", (PyObject *)&__pyx_type_8PyDeepCL_EasyCL) < 0) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_ptype_8PyDeepCL_EasyCL = &__pyx_type_8PyDeepCL_EasyCL; - if (PyType_Ready(&__pyx_type_8PyDeepCL_TrainingContext) < 0) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyType_Ready(&__pyx_type_8PyDeepCL_DeepCL) < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_type_8PyDeepCL_DeepCL.tp_print = 0; + if 
(PyObject_SetAttrString(__pyx_m, "DeepCL", (PyObject *)&__pyx_type_8PyDeepCL_DeepCL) < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_ptype_8PyDeepCL_DeepCL = &__pyx_type_8PyDeepCL_DeepCL; + if (PyType_Ready(&__pyx_type_8PyDeepCL_TrainingContext) < 0) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type_8PyDeepCL_TrainingContext.tp_print = 0; - if (PyObject_SetAttrString(__pyx_m, "TrainingContext", (PyObject *)&__pyx_type_8PyDeepCL_TrainingContext) < 0) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyObject_SetAttrString(__pyx_m, "TrainingContext", (PyObject *)&__pyx_type_8PyDeepCL_TrainingContext) < 0) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_ptype_8PyDeepCL_TrainingContext = &__pyx_type_8PyDeepCL_TrainingContext; - if (PyType_Ready(&__pyx_type_8PyDeepCL_SGD) < 0) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyType_Ready(&__pyx_type_8PyDeepCL_SGD) < 0) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type_8PyDeepCL_SGD.tp_print = 0; - if (PyObject_SetAttrString(__pyx_m, "SGD", (PyObject *)&__pyx_type_8PyDeepCL_SGD) < 0) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyObject_SetAttrString(__pyx_m, "SGD", (PyObject *)&__pyx_type_8PyDeepCL_SGD) < 0) {__pyx_filename = __pyx_f[3]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_ptype_8PyDeepCL_SGD = &__pyx_type_8PyDeepCL_SGD; - if (PyType_Ready(&__pyx_type_8PyDeepCL_Annealer) < 0) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyType_Ready(&__pyx_type_8PyDeepCL_Annealer) < 0) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type_8PyDeepCL_Annealer.tp_print = 0; - if (PyObject_SetAttrString(__pyx_m, "Annealer", (PyObject *)&__pyx_type_8PyDeepCL_Annealer) < 0) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyObject_SetAttrString(__pyx_m, "Annealer", (PyObject *)&__pyx_type_8PyDeepCL_Annealer) < 0) {__pyx_filename = __pyx_f[4]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_ptype_8PyDeepCL_Annealer = &__pyx_type_8PyDeepCL_Annealer; - if (PyType_Ready(&__pyx_type_8PyDeepCL_Nesterov) < 0) {__pyx_filename = __pyx_f[6]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyType_Ready(&__pyx_type_8PyDeepCL_Nesterov) < 0) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type_8PyDeepCL_Nesterov.tp_print = 0; - if (PyObject_SetAttrString(__pyx_m, "Nesterov", (PyObject *)&__pyx_type_8PyDeepCL_Nesterov) < 0) {__pyx_filename = __pyx_f[6]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyObject_SetAttrString(__pyx_m, "Nesterov", (PyObject *)&__pyx_type_8PyDeepCL_Nesterov) < 0) {__pyx_filename = __pyx_f[5]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_ptype_8PyDeepCL_Nesterov = &__pyx_type_8PyDeepCL_Nesterov; - if (PyType_Ready(&__pyx_type_8PyDeepCL_Adagrad) < 0) {__pyx_filename = __pyx_f[7]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyType_Ready(&__pyx_type_8PyDeepCL_Adagrad) < 0) {__pyx_filename = 
__pyx_f[6]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type_8PyDeepCL_Adagrad.tp_print = 0; - if (PyObject_SetAttrString(__pyx_m, "Adagrad", (PyObject *)&__pyx_type_8PyDeepCL_Adagrad) < 0) {__pyx_filename = __pyx_f[7]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyObject_SetAttrString(__pyx_m, "Adagrad", (PyObject *)&__pyx_type_8PyDeepCL_Adagrad) < 0) {__pyx_filename = __pyx_f[6]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_ptype_8PyDeepCL_Adagrad = &__pyx_type_8PyDeepCL_Adagrad; - if (PyType_Ready(&__pyx_type_8PyDeepCL_Rmsprop) < 0) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyType_Ready(&__pyx_type_8PyDeepCL_Rmsprop) < 0) {__pyx_filename = __pyx_f[7]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type_8PyDeepCL_Rmsprop.tp_print = 0; - if (PyObject_SetAttrString(__pyx_m, "Rmsprop", (PyObject *)&__pyx_type_8PyDeepCL_Rmsprop) < 0) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyObject_SetAttrString(__pyx_m, "Rmsprop", (PyObject *)&__pyx_type_8PyDeepCL_Rmsprop) < 0) {__pyx_filename = __pyx_f[7]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_ptype_8PyDeepCL_Rmsprop = &__pyx_type_8PyDeepCL_Rmsprop; - if (PyType_Ready(&__pyx_type_8PyDeepCL_Adadelta) < 0) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyType_Ready(&__pyx_type_8PyDeepCL_Adadelta) < 0) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type_8PyDeepCL_Adadelta.tp_print = 0; - if (PyObject_SetAttrString(__pyx_m, "Adadelta", (PyObject *)&__pyx_type_8PyDeepCL_Adadelta) < 0) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyObject_SetAttrString(__pyx_m, "Adadelta", (PyObject *)&__pyx_type_8PyDeepCL_Adadelta) < 0) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_ptype_8PyDeepCL_Adadelta = &__pyx_type_8PyDeepCL_Adadelta; if (PyType_Ready(&__pyx_type_8PyDeepCL_NeuralNet) < 0) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type_8PyDeepCL_NeuralNet.tp_print = 0; @@ -30627,10 +30614,10 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_ptype_8PyDeepCL_NeuralNet = &__pyx_type_8PyDeepCL_NeuralNet; __pyx_vtabptr_8PyDeepCL_Layer = &__pyx_vtable_8PyDeepCL_Layer; __pyx_vtable_8PyDeepCL_Layer.set_thisptr = (PyObject *(*)(struct __pyx_obj_8PyDeepCL_Layer *, Layer *))__pyx_f_8PyDeepCL_5Layer_set_thisptr; - if (PyType_Ready(&__pyx_type_8PyDeepCL_Layer) < 0) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyType_Ready(&__pyx_type_8PyDeepCL_Layer) < 0) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type_8PyDeepCL_Layer.tp_print = 0; - if (__Pyx_SetVtable(__pyx_type_8PyDeepCL_Layer.tp_dict, __pyx_vtabptr_8PyDeepCL_Layer) < 0) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - if (PyObject_SetAttrString(__pyx_m, "Layer", (PyObject *)&__pyx_type_8PyDeepCL_Layer) < 0) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (__Pyx_SetVtable(__pyx_type_8PyDeepCL_Layer.tp_dict, __pyx_vtabptr_8PyDeepCL_Layer) < 0) {__pyx_filename = __pyx_f[9]; __pyx_lineno 
= 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyObject_SetAttrString(__pyx_m, "Layer", (PyObject *)&__pyx_type_8PyDeepCL_Layer) < 0) {__pyx_filename = __pyx_f[9]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_ptype_8PyDeepCL_Layer = &__pyx_type_8PyDeepCL_Layer; if (PyType_Ready(&__pyx_type_8PyDeepCL_LayerMaker2) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type_8PyDeepCL_LayerMaker2.tp_print = 0; @@ -30686,25 +30673,25 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_type_8PyDeepCL_InputLayerMaker.tp_print = 0; if (PyObject_SetAttrString(__pyx_m, "InputLayerMaker", (PyObject *)&__pyx_type_8PyDeepCL_InputLayerMaker) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 153; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_ptype_8PyDeepCL_InputLayerMaker = &__pyx_type_8PyDeepCL_InputLayerMaker; - if (PyType_Ready(&__pyx_type_8PyDeepCL_GenericLoader) < 0) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyType_Ready(&__pyx_type_8PyDeepCL_GenericLoader) < 0) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type_8PyDeepCL_GenericLoader.tp_print = 0; - if (PyObject_SetAttrString(__pyx_m, "GenericLoader", (PyObject *)&__pyx_type_8PyDeepCL_GenericLoader) < 0) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyObject_SetAttrString(__pyx_m, "GenericLoader", (PyObject *)&__pyx_type_8PyDeepCL_GenericLoader) < 0) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_ptype_8PyDeepCL_GenericLoader = &__pyx_type_8PyDeepCL_GenericLoader; - if (PyType_Ready(&__pyx_type_8PyDeepCL_NetLearner) < 0) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyType_Ready(&__pyx_type_8PyDeepCL_NetLearner) < 0) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type_8PyDeepCL_NetLearner.tp_print = 0; - if (PyObject_SetAttrString(__pyx_m, "NetLearner", (PyObject *)&__pyx_type_8PyDeepCL_NetLearner) < 0) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyObject_SetAttrString(__pyx_m, "NetLearner", (PyObject *)&__pyx_type_8PyDeepCL_NetLearner) < 0) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_ptype_8PyDeepCL_NetLearner = &__pyx_type_8PyDeepCL_NetLearner; - if (PyType_Ready(&__pyx_type_8PyDeepCL_NetdefToNet) < 0) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyType_Ready(&__pyx_type_8PyDeepCL_NetdefToNet) < 0) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type_8PyDeepCL_NetdefToNet.tp_print = 0; - if (PyObject_SetAttrString(__pyx_m, "NetdefToNet", (PyObject *)&__pyx_type_8PyDeepCL_NetdefToNet) < 0) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyObject_SetAttrString(__pyx_m, "NetdefToNet", (PyObject *)&__pyx_type_8PyDeepCL_NetdefToNet) < 0) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_ptype_8PyDeepCL_NetdefToNet = &__pyx_type_8PyDeepCL_NetdefToNet; - if (PyType_Ready(&__pyx_type_8PyDeepCL_QLearner) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto 
__pyx_L1_error;} + if (PyType_Ready(&__pyx_type_8PyDeepCL_QLearner) < 0) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type_8PyDeepCL_QLearner.tp_print = 0; - if (PyObject_SetAttrString(__pyx_m, "QLearner", (PyObject *)&__pyx_type_8PyDeepCL_QLearner) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyObject_SetAttrString(__pyx_m, "QLearner", (PyObject *)&__pyx_type_8PyDeepCL_QLearner) < 0) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_ptype_8PyDeepCL_QLearner = &__pyx_type_8PyDeepCL_QLearner; - if (PyType_Ready(&__pyx_type_8PyDeepCL_Scenario) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 61; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyType_Ready(&__pyx_type_8PyDeepCL_Scenario) < 0) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 61; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type_8PyDeepCL_Scenario.tp_print = 0; - if (PyObject_SetAttrString(__pyx_m, "Scenario", (PyObject *)&__pyx_type_8PyDeepCL_Scenario) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 61; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyObject_SetAttrString(__pyx_m, "Scenario", (PyObject *)&__pyx_type_8PyDeepCL_Scenario) < 0) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 61; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_ptype_8PyDeepCL_Scenario = &__pyx_type_8PyDeepCL_Scenario; if (PyType_Ready(&__pyx_type___pyx_array) < 0) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 99; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_type___pyx_array.tp_print = 0; @@ -30753,19 +30740,19 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) * from cpython cimport array as c_array * from array import array # <<<<<<<<<<<<<< * import threading - * from libcpp.string cimport string + * from libcpp cimport bool */ - __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_INCREF(__pyx_n_s_array); PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_array); __Pyx_GIVEREF(__pyx_n_s_array); - __pyx_t_2 = __Pyx_Import(__pyx_n_s_array, __pyx_t_1, -1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_Import(__pyx_n_s_array, __pyx_t_1, -1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_array); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_array); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_array, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_d, __pyx_n_s_array, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 
0; @@ -30773,15 +30760,15 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) * from cpython cimport array as c_array * from array import array * import threading # <<<<<<<<<<<<<< - * from libcpp.string cimport string * from libcpp cimport bool + * */ - __pyx_t_2 = __Pyx_Import(__pyx_n_s_threading, 0, -1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_Import(__pyx_n_s_threading, 0, -1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_threading, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_d, __pyx_n_s_threading, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "LayerMaker.pyx":18 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":18 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -30791,7 +30778,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_23NormalizationLayerMaker_7instance, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":17 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":17 * self.thisptr.scale( _scale ) * return self * @staticmethod # <<<<<<<<<<<<<< @@ -30810,7 +30797,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_NormalizationLayerMaker); - /* "LayerMaker.pyx":18 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":18 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -30820,7 +30807,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = __Pyx_GetNameInClass((PyObject *)__pyx_ptype_8PyDeepCL_NormalizationLayerMaker, __pyx_n_s_instance); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":17 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":17 * self.thisptr.scale( _scale ) * return self * @staticmethod # <<<<<<<<<<<<<< @@ -30839,7 +30826,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_NormalizationLayerMaker); - /* "LayerMaker.pyx":41 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":41 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -30849,7 +30836,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_19FullyConnectedMaker_11instance, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":40 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":40 * self.thisptr.biased( _biased ) * return self * @staticmethod # <<<<<<<<<<<<<< @@ -30868,7 +30855,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_FullyConnectedMaker); - /* "LayerMaker.pyx":41 + /* 
"../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":41 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -30878,7 +30865,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = __Pyx_GetNameInClass((PyObject *)__pyx_ptype_8PyDeepCL_FullyConnectedMaker, __pyx_n_s_instance); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":40 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":40 * self.thisptr.biased( _biased ) * return self * @staticmethod # <<<<<<<<<<<<<< @@ -30897,7 +30884,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_FullyConnectedMaker); - /* "LayerMaker.pyx":70 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":70 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -30907,7 +30894,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_18ConvolutionalMaker_15instance, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":69 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":69 * self.thisptr.biased( _biased ) * return self * @staticmethod # <<<<<<<<<<<<<< @@ -30926,7 +30913,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_ConvolutionalMaker); - /* "LayerMaker.pyx":70 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":70 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -30936,7 +30923,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = __Pyx_GetNameInClass((PyObject *)__pyx_ptype_8PyDeepCL_ConvolutionalMaker, __pyx_n_s_instance); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":69 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":69 * self.thisptr.biased( _biased ) * return self * @staticmethod # <<<<<<<<<<<<<< @@ -30955,7 +30942,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_ConvolutionalMaker); - /* "LayerMaker.pyx":84 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":84 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -30965,7 +30952,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_12PoolingMaker_5instance, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":83 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":83 * self.thisptr.poolingSize( _poolingSize ) * return self * @staticmethod # <<<<<<<<<<<<<< @@ -30984,7 +30971,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_PoolingMaker); - /* "LayerMaker.pyx":84 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":84 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -30994,7 +30981,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = __Pyx_GetNameInClass((PyObject *)__pyx_ptype_8PyDeepCL_PoolingMaker, 
__pyx_n_s_instance); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":83 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":83 * self.thisptr.poolingSize( _poolingSize ) * return self * @staticmethod # <<<<<<<<<<<<<< @@ -31013,7 +31000,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_PoolingMaker); - /* "LayerMaker.pyx":96 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":96 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -31023,7 +31010,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_12DropoutMaker_5instance, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 96; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":95 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":95 * self.thisptr.dropRatio(_dropRatio) * return self * @staticmethod # <<<<<<<<<<<<<< @@ -31042,7 +31029,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_DropoutMaker); - /* "LayerMaker.pyx":96 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":96 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -31052,7 +31039,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = __Pyx_GetNameInClass((PyObject *)__pyx_ptype_8PyDeepCL_DropoutMaker, __pyx_n_s_instance); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 96; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":95 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":95 * self.thisptr.dropRatio(_dropRatio) * return self * @staticmethod # <<<<<<<<<<<<<< @@ -31071,7 +31058,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_DropoutMaker); - /* "LayerMaker.pyx":117 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":117 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -31081,7 +31068,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_15ActivationMaker_11instance, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 117; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":116 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":116 * self.thisptr.linear() * return self * @staticmethod # <<<<<<<<<<<<<< @@ -31100,7 +31087,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_ActivationMaker); - /* "LayerMaker.pyx":117 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":117 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -31110,7 +31097,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = __Pyx_GetNameInClass((PyObject *)__pyx_ptype_8PyDeepCL_ActivationMaker, __pyx_n_s_instance); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 117; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":116 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":116 * self.thisptr.linear() * return self * 
@staticmethod # <<<<<<<<<<<<<< @@ -31129,7 +31116,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_ActivationMaker); - /* "LayerMaker.pyx":128 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":128 * # del self.thisptr * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -31139,7 +31126,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_18ForceBackpropMaker_3instance, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 128; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":127 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":127 * # def __dealloc__(self): * # del self.thisptr * @staticmethod # <<<<<<<<<<<<<< @@ -31158,7 +31145,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_ForceBackpropMaker); - /* "LayerMaker.pyx":128 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":128 * # del self.thisptr * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -31168,7 +31155,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = __Pyx_GetNameInClass((PyObject *)__pyx_ptype_8PyDeepCL_ForceBackpropMaker, __pyx_n_s_instance); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 128; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":127 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":127 * # def __dealloc__(self): * # del self.thisptr * @staticmethod # <<<<<<<<<<<<<< @@ -31187,7 +31174,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_ForceBackpropMaker); - /* "LayerMaker.pyx":139 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":139 * # del self.thisptr * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -31197,7 +31184,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_15SquareLossMaker_3instance, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 139; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":138 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":138 * # def __dealloc__(self): * # del self.thisptr * @staticmethod # <<<<<<<<<<<<<< @@ -31216,7 +31203,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_SquareLossMaker); - /* "LayerMaker.pyx":139 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":139 * # del self.thisptr * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -31226,7 +31213,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = __Pyx_GetNameInClass((PyObject *)__pyx_ptype_8PyDeepCL_SquareLossMaker, __pyx_n_s_instance); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 139; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":138 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":138 * # def __dealloc__(self): * # del self.thisptr * @staticmethod # <<<<<<<<<<<<<< @@ -31245,7 +31232,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_SquareLossMaker); - /* "LayerMaker.pyx":150 + /* 
"../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":150 * # del self.thisptr * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -31255,7 +31242,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_12SoftMaxMaker_3instance, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 150; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":149 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":149 * # def __dealloc__(self): * # del self.thisptr * @staticmethod # <<<<<<<<<<<<<< @@ -31274,7 +31261,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_SoftMaxMaker); - /* "LayerMaker.pyx":150 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":150 * # del self.thisptr * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -31284,7 +31271,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = __Pyx_GetNameInClass((PyObject *)__pyx_ptype_8PyDeepCL_SoftMaxMaker, __pyx_n_s_instance); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 150; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":149 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":149 * # def __dealloc__(self): * # del self.thisptr * @staticmethod # <<<<<<<<<<<<<< @@ -31303,7 +31290,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_SoftMaxMaker); - /* "LayerMaker.pyx":167 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":167 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -31313,7 +31300,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_15InputLayerMaker_7instance, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 167; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":166 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":166 * self.thisptr.imageSize( _imageSize ) * return self * @staticmethod # <<<<<<<<<<<<<< @@ -31332,7 +31319,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_InputLayerMaker); - /* "LayerMaker.pyx":167 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":167 * return self * @staticmethod * def instance(): # <<<<<<<<<<<<<< @@ -31342,7 +31329,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __pyx_t_2 = __Pyx_GetNameInClass((PyObject *)__pyx_ptype_8PyDeepCL_InputLayerMaker, __pyx_n_s_instance); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 167; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "LayerMaker.pyx":166 + /* "../../../../../../home/user/git/DeepCL/python/LayerMaker.pyx":166 * self.thisptr.imageSize( _imageSize ) * return self * @staticmethod # <<<<<<<<<<<<<< @@ -31361,210 +31348,198 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_InputLayerMaker); - /* "GenericLoader.pyx":3 + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":3 * cdef class GenericLoader: * @staticmethod - * def getDimensions( trainFilePath ): # <<<<<<<<<<<<<< + * def getDimensions( trainFilepath ): # <<<<<<<<<<<<<< + * print 'GenericLoader.py getDimensions ', 
trainFilepath * cdef int N - * cdef int planes */ - __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_13GenericLoader_1getDimensions, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_13GenericLoader_1getDimensions, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "GenericLoader.pyx":2 + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":2 * cdef class GenericLoader: * @staticmethod # <<<<<<<<<<<<<< - * def getDimensions( trainFilePath ): - * cdef int N + * def getDimensions( trainFilepath ): + * print 'GenericLoader.py getDimensions ', trainFilepath */ - __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_2); __Pyx_GIVEREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_staticmethod, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_staticmethod, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - if (PyDict_SetItem((PyObject *)__pyx_ptype_8PyDeepCL_GenericLoader->tp_dict, __pyx_n_s_getDimensions, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem((PyObject *)__pyx_ptype_8PyDeepCL_GenericLoader->tp_dict, __pyx_n_s_getDimensions, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_GenericLoader); - /* "GenericLoader.pyx":3 + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":3 * cdef class GenericLoader: * @staticmethod - * def getDimensions( trainFilePath ): # <<<<<<<<<<<<<< + * def getDimensions( trainFilepath ): # <<<<<<<<<<<<<< + * print 'GenericLoader.py getDimensions ', trainFilepath * cdef int N - * cdef int planes */ - __pyx_t_2 = __Pyx_GetNameInClass((PyObject *)__pyx_ptype_8PyDeepCL_GenericLoader, __pyx_n_s_getDimensions); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_GetNameInClass((PyObject *)__pyx_ptype_8PyDeepCL_GenericLoader, __pyx_n_s_getDimensions); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "GenericLoader.pyx":2 + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":2 * cdef class GenericLoader: * @staticmethod # <<<<<<<<<<<<<< - * def getDimensions( trainFilePath ): - * cdef int N + * def getDimensions( trainFilepath ): + * print 'GenericLoader.py getDimensions ', trainFilepath */ - __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 2; 
__pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_2); __Pyx_GIVEREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_staticmethod, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_staticmethod, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - if (PyDict_SetItem((PyObject *)__pyx_ptype_8PyDeepCL_GenericLoader->tp_dict, __pyx_n_s_getDimensions, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem((PyObject *)__pyx_ptype_8PyDeepCL_GenericLoader->tp_dict, __pyx_n_s_getDimensions, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_GenericLoader); - /* "GenericLoader.pyx":10 + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":13 * return (N,planes,size) * @staticmethod * def load( trainFilepath, float[:] images, int[:] labels, startN, numExamples ): # <<<<<<<<<<<<<< - * cDeepCL.GenericLoader.load( toCppString(trainFilepath), &images[0], &labels[0], startN , numExamples ) - * + * cdef const char *trainFilepath_charstar = trainFilepath + * cDeepCL.GenericLoader.load(trainFilepath_charstar, &images[0], &labels[0], startN , numExamples) */ - __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_13GenericLoader_3load, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_13GenericLoader_3load, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "GenericLoader.pyx":9 - * cDeepCL.GenericLoader.getDimensions( toCppString( trainFilePath ), &N, &planes, &size ) + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":12 + * print 'finished calling' * return (N,planes,size) * @staticmethod # <<<<<<<<<<<<<< * def load( trainFilepath, float[:] images, int[:] labels, startN, numExamples ): - * cDeepCL.GenericLoader.load( toCppString(trainFilepath), &images[0], &labels[0], startN , numExamples ) + * cdef const char *trainFilepath_charstar = trainFilepath */ - __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_2); __Pyx_GIVEREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_staticmethod, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_staticmethod, __pyx_t_1, NULL); if 
(unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - if (PyDict_SetItem((PyObject *)__pyx_ptype_8PyDeepCL_GenericLoader->tp_dict, __pyx_n_s_load, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem((PyObject *)__pyx_ptype_8PyDeepCL_GenericLoader->tp_dict, __pyx_n_s_load, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_GenericLoader); - /* "GenericLoader.pyx":10 + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":13 * return (N,planes,size) * @staticmethod * def load( trainFilepath, float[:] images, int[:] labels, startN, numExamples ): # <<<<<<<<<<<<<< - * cDeepCL.GenericLoader.load( toCppString(trainFilepath), &images[0], &labels[0], startN , numExamples ) - * + * cdef const char *trainFilepath_charstar = trainFilepath + * cDeepCL.GenericLoader.load(trainFilepath_charstar, &images[0], &labels[0], startN , numExamples) */ - __pyx_t_2 = __Pyx_GetNameInClass((PyObject *)__pyx_ptype_8PyDeepCL_GenericLoader, __pyx_n_s_load); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_GetNameInClass((PyObject *)__pyx_ptype_8PyDeepCL_GenericLoader, __pyx_n_s_load); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "GenericLoader.pyx":9 - * cDeepCL.GenericLoader.getDimensions( toCppString( trainFilePath ), &N, &planes, &size ) + /* "../../../../../../home/user/git/DeepCL/python/GenericLoader.pyx":12 + * print 'finished calling' * return (N,planes,size) * @staticmethod # <<<<<<<<<<<<<< * def load( trainFilepath, float[:] images, int[:] labels, startN, numExamples ): - * cDeepCL.GenericLoader.load( toCppString(trainFilepath), &images[0], &labels[0], startN , numExamples ) + * cdef const char *trainFilepath_charstar = trainFilepath */ - __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_2); __Pyx_GIVEREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_staticmethod, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_staticmethod, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - if (PyDict_SetItem((PyObject *)__pyx_ptype_8PyDeepCL_GenericLoader->tp_dict, __pyx_n_s_load, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[11]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem((PyObject *)__pyx_ptype_8PyDeepCL_GenericLoader->tp_dict, __pyx_n_s_load, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[10]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); 
__pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_GenericLoader); - /* "NetDefToNet.pyx":3 + /* "../../../../../../home/user/git/DeepCL/python/NetDefToNet.pyx":3 * cdef class NetdefToNet: * @staticmethod * def createNetFromNetdef( NeuralNet neuralnet, netdef ): # <<<<<<<<<<<<<< - * return cDeepCL.NetdefToNet.createNetFromNetdef( neuralnet.thisptr, toCppString( netdef ) ) - * + * cdef const char *netdef_charstar = netdef + * return cDeepCL.NetdefToNet.createNetFromNetdefCharStar(neuralnet.thisptr, netdef_charstar) */ - __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_11NetdefToNet_1createNetFromNetdef, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_11NetdefToNet_1createNetFromNetdef, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "NetDefToNet.pyx":2 + /* "../../../../../../home/user/git/DeepCL/python/NetDefToNet.pyx":2 * cdef class NetdefToNet: * @staticmethod # <<<<<<<<<<<<<< * def createNetFromNetdef( NeuralNet neuralnet, netdef ): - * return cDeepCL.NetdefToNet.createNetFromNetdef( neuralnet.thisptr, toCppString( netdef ) ) + * cdef const char *netdef_charstar = netdef */ - __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_2); __Pyx_GIVEREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_staticmethod, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_staticmethod, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - if (PyDict_SetItem((PyObject *)__pyx_ptype_8PyDeepCL_NetdefToNet->tp_dict, __pyx_n_s_createNetFromNetdef, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem((PyObject *)__pyx_ptype_8PyDeepCL_NetdefToNet->tp_dict, __pyx_n_s_createNetFromNetdef, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_NetdefToNet); - /* "NetDefToNet.pyx":3 + /* "../../../../../../home/user/git/DeepCL/python/NetDefToNet.pyx":3 * cdef class NetdefToNet: * @staticmethod * def createNetFromNetdef( NeuralNet neuralnet, netdef ): # <<<<<<<<<<<<<< - * return cDeepCL.NetdefToNet.createNetFromNetdef( neuralnet.thisptr, toCppString( netdef ) ) - * + * cdef const char *netdef_charstar = netdef + * return cDeepCL.NetdefToNet.createNetFromNetdefCharStar(neuralnet.thisptr, netdef_charstar) */ - __pyx_t_2 = __Pyx_GetNameInClass((PyObject *)__pyx_ptype_8PyDeepCL_NetdefToNet, __pyx_n_s_createNetFromNetdef); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_GetNameInClass((PyObject 
*)__pyx_ptype_8PyDeepCL_NetdefToNet, __pyx_n_s_createNetFromNetdef); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - /* "NetDefToNet.pyx":2 + /* "../../../../../../home/user/git/DeepCL/python/NetDefToNet.pyx":2 * cdef class NetdefToNet: * @staticmethod # <<<<<<<<<<<<<< * def createNetFromNetdef( NeuralNet neuralnet, netdef ): - * return cDeepCL.NetdefToNet.createNetFromNetdef( neuralnet.thisptr, toCppString( netdef ) ) + * cdef const char *netdef_charstar = netdef */ - __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_2); __Pyx_GIVEREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_staticmethod, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_staticmethod, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - if (PyDict_SetItem((PyObject *)__pyx_ptype_8PyDeepCL_NetdefToNet->tp_dict, __pyx_n_s_createNetFromNetdef, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem((PyObject *)__pyx_ptype_8PyDeepCL_NetdefToNet->tp_dict, __pyx_n_s_createNetFromNetdef, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[12]; __pyx_lineno = 3; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_ptype_8PyDeepCL_NetdefToNet); - /* "PyDeepCL.pyx":31 - * include "QLearning.pyx" - * - * def checkException(): # <<<<<<<<<<<<<< - * cdef int threwException = 0 - * cdef string message = "" - */ - __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_1checkException, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_2); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_checkException, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - - /* "PyDeepCL.pyx":39 - * raise RuntimeError(message) + /* "PyDeepCL.pyx":41 + * # raise RuntimeError(message) * * def interruptableCall( function, args ): # <<<<<<<<<<<<<< * mythread = threading.Thread( target=function, args = args ) * mythread.daemon = True */ - __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_3interruptableCall, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_1interruptableCall, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_interruptableCall, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if 
(PyDict_SetItem(__pyx_d, __pyx_n_s_interruptableCall, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "PyDeepCL.pyx":47 + /* "PyDeepCL.pyx":49 * #print('join timed out') * * def toCppString( pyString ): # <<<<<<<<<<<<<< * if isinstance( pyString, unicode ): * return pyString.encode('utf8') */ - __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_5toCppString, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 47; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_8PyDeepCL_3toCppString, NULL, __pyx_n_s_PyDeepCL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 49; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_toCppString, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 47; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_d, __pyx_n_s_toCppString, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 49; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* "PyDeepCL.pyx":1 @@ -31572,9 +31547,9 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) * # * # This Source Code Form is subject to the terms of the Mozilla Public License, */ - __pyx_t_2 = PyDict_New(); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = PyDict_New(); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[14]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* "View.MemoryView":203 @@ -31597,7 +31572,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) * cdef strided = Enum("") # default * cdef indirect = Enum("") */ - __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)((PyObject *)__pyx_MemviewEnum_type)), __pyx_tuple__46, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 276; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)((PyObject *)__pyx_MemviewEnum_type)), __pyx_tuple__43, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 276; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_XGOTREF(generic); __Pyx_DECREF_SET(generic, __pyx_t_2); @@ -31611,7 +31586,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) * cdef indirect = Enum("") * */ - __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)((PyObject *)__pyx_MemviewEnum_type)), __pyx_tuple__47, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 277; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)((PyObject *)__pyx_MemviewEnum_type)), __pyx_tuple__44, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 277; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_XGOTREF(strided); __Pyx_DECREF_SET(strided, __pyx_t_2); @@ -31625,7 +31600,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) * * */ - __pyx_t_2 = 
__Pyx_PyObject_Call(((PyObject *)((PyObject *)__pyx_MemviewEnum_type)), __pyx_tuple__48, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 278; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)((PyObject *)__pyx_MemviewEnum_type)), __pyx_tuple__45, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 278; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_XGOTREF(indirect); __Pyx_DECREF_SET(indirect, __pyx_t_2); @@ -31639,7 +31614,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) * cdef indirect_contiguous = Enum("") * */ - __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)((PyObject *)__pyx_MemviewEnum_type)), __pyx_tuple__49, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 281; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)((PyObject *)__pyx_MemviewEnum_type)), __pyx_tuple__46, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 281; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_XGOTREF(contiguous); __Pyx_DECREF_SET(contiguous, __pyx_t_2); @@ -31653,7 +31628,7 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) * * */ - __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)((PyObject *)__pyx_MemviewEnum_type)), __pyx_tuple__50, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 282; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)((PyObject *)__pyx_MemviewEnum_type)), __pyx_tuple__47, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[16]; __pyx_lineno = 282; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_XGOTREF(indirect_contiguous); __Pyx_DECREF_SET(indirect_contiguous, __pyx_t_2); @@ -31686,12 +31661,12 @@ PyMODINIT_FUNC PyInit_PyDeepCL(void) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; PyType_Modified(__pyx_memoryviewslice_type); - /* "string.from_py":13 + /* "View.MemoryView":1362 * - * @cname("__pyx_convert_string_from_py_std__in_string") - * cdef string __pyx_convert_string_from_py_std__in_string(object o) except *: # <<<<<<<<<<<<<< - * cdef Py_ssize_t length - * cdef char* data = __Pyx_PyObject_AsStringAndSize(o, &length) + * @cname('__pyx_memoryview__slice_assign_scalar') + * cdef void _slice_assign_scalar(char *data, Py_ssize_t *shape, # <<<<<<<<<<<<<< + * Py_ssize_t *strides, int ndim, + * size_t itemsize, void *item) nogil: */ /*--- Wrapped vars code ---*/ @@ -32601,21 +32576,30 @@ static CYTHON_INLINE void __Pyx_XDEC_MEMVIEW(__Pyx_memviewslice *memslice, } } -static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name) { - PyObject *result; -#if CYTHON_COMPILING_IN_CPYTHON - result = PyDict_GetItem(__pyx_d, name); - if (likely(result)) { - Py_INCREF(result); +static CYTHON_INLINE PyObject* __Pyx_decode_c_string( + const char* cstring, Py_ssize_t start, Py_ssize_t stop, + const char* encoding, const char* errors, + PyObject* (*decode_func)(const char *s, Py_ssize_t size, const char *errors)) { + Py_ssize_t length; + if (unlikely((start < 0) | (stop < 0))) { + length = strlen(cstring); + if (start < 0) { + start += length; + if (start < 0) + start = 0; + } + if (stop < 0) + stop += length; + } + length = stop - start; + if (unlikely(length <= 0)) + return PyUnicode_FromUnicode(NULL, 0); + cstring += start; + if (decode_func) { + return decode_func(cstring, length, errors); } else { -#else - result = PyObject_GetItem(__pyx_d, 
name); - if (!result) { - PyErr_Clear(); -#endif - result = __Pyx_GetBuiltinName(name); + return PyUnicode_Decode(cstring, length, encoding, errors); } - return result; } #if CYTHON_COMPILING_IN_CPYTHON @@ -32637,6 +32621,23 @@ static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg } #endif +static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name) { + PyObject *result; +#if CYTHON_COMPILING_IN_CPYTHON + result = PyDict_GetItem(__pyx_d, name); + if (likely(result)) { + Py_INCREF(result); + } else { +#else + result = PyObject_GetItem(__pyx_d, name); + if (!result) { + PyErr_Clear(); +#endif + result = __Pyx_GetBuiltinName(name); + } + return result; +} + static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type) { if (unlikely(!type)) { PyErr_SetString(PyExc_SystemError, "Missing type object"); @@ -33232,32 +33233,6 @@ static CYTHON_INLINE PyObject *__Pyx_GetAttr(PyObject *o, PyObject *n) { return PyObject_GetAttr(o, n); } -static CYTHON_INLINE PyObject* __Pyx_decode_c_string( - const char* cstring, Py_ssize_t start, Py_ssize_t stop, - const char* encoding, const char* errors, - PyObject* (*decode_func)(const char *s, Py_ssize_t size, const char *errors)) { - Py_ssize_t length; - if (unlikely((start < 0) | (stop < 0))) { - length = strlen(cstring); - if (start < 0) { - start += length; - if (start < 0) - start = 0; - } - if (stop < 0) - stop += length; - } - length = stop - start; - if (unlikely(length <= 0)) - return PyUnicode_FromUnicode(NULL, 0); - cstring += start; - if (decode_func) { - return decode_func(cstring, length, errors); - } else { - return PyUnicode_Decode(cstring, length, encoding, errors); - } -} - static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected) { PyErr_Format(PyExc_ValueError, "too many values to unpack (expected %" CYTHON_FORMAT_SSIZE_T "d)", expected); @@ -34091,6 +34066,147 @@ static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value) { } } +#if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION < 3 +static PyObject *__Pyx_GetStdout(void) { + PyObject *f = PySys_GetObject((char *)"stdout"); + if (!f) { + PyErr_SetString(PyExc_RuntimeError, "lost sys.stdout"); + } + return f; +} +static int __Pyx_Print(PyObject* f, PyObject *arg_tuple, int newline) { + int i; + if (!f) { + if (!(f = __Pyx_GetStdout())) + return -1; + } + Py_INCREF(f); + for (i=0; i < PyTuple_GET_SIZE(arg_tuple); i++) { + PyObject* v; + if (PyFile_SoftSpace(f, 1)) { + if (PyFile_WriteString(" ", f) < 0) + goto error; + } + v = PyTuple_GET_ITEM(arg_tuple, i); + if (PyFile_WriteObject(v, f, Py_PRINT_RAW) < 0) + goto error; + if (PyString_Check(v)) { + char *s = PyString_AsString(v); + Py_ssize_t len = PyString_Size(v); + if (len > 0) { + switch (s[len-1]) { + case ' ': break; + case '\f': case '\r': case '\n': case '\t': case '\v': + PyFile_SoftSpace(f, 0); + break; + default: break; + } + } + } + } + if (newline) { + if (PyFile_WriteString("\n", f) < 0) + goto error; + PyFile_SoftSpace(f, 0); + } + Py_DECREF(f); + return 0; +error: + Py_DECREF(f); + return -1; +} +#else +static int __Pyx_Print(PyObject* stream, PyObject *arg_tuple, int newline) { + PyObject* kwargs = 0; + PyObject* result = 0; + PyObject* end_string; + if (unlikely(!__pyx_print)) { + __pyx_print = PyObject_GetAttr(__pyx_b, __pyx_n_s_print); + if (!__pyx_print) + return -1; + } + if (stream) { + kwargs = PyDict_New(); + if (unlikely(!kwargs)) + return -1; + if (unlikely(PyDict_SetItem(kwargs, __pyx_n_s_file, stream) < 0)) + goto bad; + if (!newline) { + 
+                end_string = PyUnicode_FromStringAndSize(" ", 1);
+                if (unlikely(!end_string))
+                    goto bad;
+                if (PyDict_SetItem(kwargs, __pyx_n_s_end, end_string) < 0) {
+                    Py_DECREF(end_string);
+                    goto bad;
+                }
+                Py_DECREF(end_string);
+            }
+        } else if (!newline) {
+            if (unlikely(!__pyx_print_kwargs)) {
+                __pyx_print_kwargs = PyDict_New();
+                if (unlikely(!__pyx_print_kwargs))
+                    return -1;
+                end_string = PyUnicode_FromStringAndSize(" ", 1);
+                if (unlikely(!end_string))
+                    return -1;
+                if (PyDict_SetItem(__pyx_print_kwargs, __pyx_n_s_end, end_string) < 0) {
+                    Py_DECREF(end_string);
+                    return -1;
+                }
+                Py_DECREF(end_string);
+            }
+            kwargs = __pyx_print_kwargs;
+        }
+        result = PyObject_Call(__pyx_print, arg_tuple, kwargs);
+        if (unlikely(kwargs) && (kwargs != __pyx_print_kwargs))
+            Py_DECREF(kwargs);
+        if (!result)
+            return -1;
+        Py_DECREF(result);
+        return 0;
+bad:
+        if (kwargs != __pyx_print_kwargs)
+            Py_XDECREF(kwargs);
+        return -1;
+}
+#endif
+
+#if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION < 3
+static int __Pyx_PrintOne(PyObject* f, PyObject *o) {
+    if (!f) {
+        if (!(f = __Pyx_GetStdout()))
+            return -1;
+    }
+    Py_INCREF(f);
+    if (PyFile_SoftSpace(f, 0)) {
+        if (PyFile_WriteString(" ", f) < 0)
+            goto error;
+    }
+    if (PyFile_WriteObject(o, f, Py_PRINT_RAW) < 0)
+        goto error;
+    if (PyFile_WriteString("\n", f) < 0)
+        goto error;
+    Py_DECREF(f);
+    return 0;
+error:
+    Py_DECREF(f);
+    return -1;
+    /* the line below is just to avoid C compiler
+     * warnings about unused functions */
+    return __Pyx_Print(f, NULL, 0);
+}
+#else
+static int __Pyx_PrintOne(PyObject* stream, PyObject *o) {
+    int res;
+    PyObject* arg_tuple = PyTuple_Pack(1, o);
+    if (unlikely(!arg_tuple))
+        return -1;
+    res = __Pyx_Print(stream, arg_tuple, 1);
+    Py_DECREF(arg_tuple);
+    return res;
+}
+#endif
+
 static PyObject *__pyx_memview_get_float(const char *itemp) {
     return (PyObject *) PyFloat_FromDouble(*(float *) itemp);
 }
diff --git a/python/PyDeepCL.pyx b/python/PyDeepCL.pyx
index 0f179375..3897b481 100644
--- a/python/PyDeepCL.pyx
+++ b/python/PyDeepCL.pyx
@@ -8,12 +8,14 @@ from cython cimport view
 from cpython cimport array as c_array
 from array import array
 import threading
-from libcpp.string cimport string
 from libcpp cimport bool
 
+cimport CppRuntimeBoundary
+
 cimport cDeepCL
 
-include "EasyCL.pyx"
+include "DeepCL.pyx"
+#include "DeepCL.pyx"
 include "SGD.pyx"
 include "Annealer.pyx"
 include "Nesterov.pyx"
@@ -28,13 +30,13 @@ include "NetLearner.pyx"
 include "NetDefToNet.pyx"
 include "QLearning.pyx"
 
-def checkException():
-    cdef int threwException = 0
-    cdef string message = ""
-    cDeepCL.checkException( &threwException, &message)
-    # print('threwException: ' + str(threwException) + ' ' + message )
-    if threwException:
-        raise RuntimeError(message)
+#def checkException():
+#    cdef int threwException = 0
+#    cdef string message = ""
+#    cDeepCL.checkException( &threwException, &message)
+#    # print('threwException: ' + str(threwException) + ' ' + message )
+#    if threwException:
+#        raise RuntimeError(message)
 
 def interruptableCall( function, args ):
     mythread = threading.Thread( target=function, args = args )
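The `interruptableCall` helper retained by the hunk above is the wrapper's mechanism for keeping Ctrl-C working during long native calls: the blocking call runs on a daemon thread while the main thread poll-joins it. A minimal pure-Python sketch of the same pattern; the poll interval here is an assumption for illustration, not a value taken from this diff:

```python
import threading

def interruptableCall(function, args):
    # Run the blocking call on a daemon thread, as PyDeepCL.pyx does,
    # and poll-join so KeyboardInterrupt still reaches the main thread.
    mythread = threading.Thread(target=function, args=args)
    mythread.daemon = True
    mythread.start()
    while mythread.is_alive():
        mythread.join(0.1)  # assumed poll interval; short enough to stay responsive

if __name__ == '__main__':
    interruptableCall(print, ('long-running training step goes here',))
```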
diff --git a/python/README.md b/python/README.md
index a9cd68b1..e6deb9e3 100644
--- a/python/README.md
+++ b/python/README.md
@@ -2,15 +2,19 @@
 
 Python wrapper for [DeepCL](https://github.com/hughperkins/DeepCL)
 
-# To install from pip
+## Pre-requisites
+
+You must have first installed and activated DeepCL native libraries, see [Build.md](https://github.com/hughperkins/DeepCL/blob/8.x/doc/Build.md)
+
+## To install from pip
 
 ```bash
-pip install DeepCL
+pip install --pre --upgrade DeepCL
 ```
 
 * related pypi page: [https://pypi.python.org/pypi/DeepCL](https://pypi.python.org/pypi/DeepCL)
 
-# How to use
+## How to use
 
 See [test_deepcl.py](https://github.com/hughperkins/DeepCL/blob/master/python/test_deepcl.py) for an example of:
@@ -25,9 +29,9 @@ For examples of using lower-level entrypoints, see [test_lowlevel.py](https://gi
 
 For example of using q-learning, see [test_qlearning.py](https://github.com/hughperkins/DeepCL/blob/master/python/test_qlearning.py).
 
-# To build from source
+## To install from source
 
-## Pre-requisites:
+### Pre-requisites:
 
 * on Windows:
   * Python 2.7 or Python 3.4
@@ -38,48 +42,10 @@ For example of using q-learning, see [test_qlearning.py](https://github.com/hugh
   * Python 2.7 or Python 3.4
   * g++, supporting c++0x, eg 4.4 or higher
 
-## To build:
-
-```bash
-cd python
-python setup.py build_ext -i
-```
-
-Then, you can run from this directory, by making sure to add it to the path, eg:
-```
-PYTHONPATH=. python test_lowlevel.py /my/mnist/data/dir
-```
-
-## To install:
+### To install:
 
 ```bash
 cd python
 python setup.py install
-## Notes on how the wrapper works
-
-* [cDeepCL.pxd](https://github.com/hughperkins/DeepCL/blob/master/python/cDeepCL.pxd) contains the definitions of the underlying DeepCL c++ libraries classes
-* [PyDeepCL.pyx](https://github.com/hughperkins/DeepCL/blob/master/python/PyDeepCL.pyx) contains Cython wrapper classes around the underlying c++ classes
-* [setup.py](https://github.com/hughperkins/DeepCL/blob/master/python/setup.py) is a setup file for compiling the `PyDeepCL.pyx` Cython file
-
-## to run unit-tests
-
-From the python directory:
-
-```bash
-nosetests -sv
-```
-
-## Development builds
-
-* If you want to modify the sourcecode, you'll need to re-run cython, so you'll need cython:
-```
-pip install cython
-```
-* If you want to update this readme, you might want to re-generate the README.rst, so you'll need pypandoc:
-```
-pip install pypandoc
-```
-  * (note that pypandoc depends on pandoc)
-
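Since the README now defers to test_deepcl.py rather than showing inline code, a hedged sketch of the high-level flow after this change may help reviewers. The constructor rename is taken from the benchmark edits later in this diff; the netdef string and layer sizes are purely illustrative, and on Python 3 the netdef may need to be passed as bytes, given the char* conversion introduced here:

```python
import PyDeepCL

cl = PyDeepCL.DeepCL()               # was PyDeepCL.EasyCL() before this change
net = PyDeepCL.NeuralNet(cl, 1, 28)  # planes=1, imageSize=28: illustrative MNIST-like values
PyDeepCL.NetdefToNet.createNetFromNetdef(net, '8c5z-relu-mp2-150n-tanh-10n')  # netdef is illustrative
sgd = PyDeepCL.SGD(cl, 0.002, 0.0)   # learningRate, momentum, per the SGD wrapper below
print(net.asString())
```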
diff --git a/python/README.rst b/python/README.rst
index 74afad0c..8663010b 100644
--- a/python/README.rst
+++ b/python/README.rst
@@ -3,17 +3,23 @@ DeepCL Python wrappers
 
 Python wrapper for `DeepCL <https://github.com/hughperkins/DeepCL>`__
 
+Pre-requisites
+--------------
+
+You must have first installed and activated DeepCL native libraries, see
+`Build.md <https://github.com/hughperkins/DeepCL/blob/8.x/doc/Build.md>`__
+
 To install from pip
-===================
+-------------------
 
 .. code:: bash
 
-    pip install DeepCL
+    pip install --pre --upgrade DeepCL
 
 - related pypi page: https://pypi.python.org/pypi/DeepCL
 
 How to use
-==========
+----------
 
 See `test\_deepcl.py <https://github.com/hughperkins/DeepCL/blob/master/python/test_deepcl.py>`__
@@ -32,11 +38,11 @@ For examples of using lower-level entrypoints, see
 
 For example of using q-learning, see
 `test\_qlearning.py <https://github.com/hughperkins/DeepCL/blob/master/python/test_qlearning.py>`__.
 
-To build from source
-====================
+To install from source
+----------------------
 
 Pre-requisites:
----------------
+~~~~~~~~~~~~~~~
 
 -  on Windows:
    -  Python 2.7 or Python 3.4
@@ -52,66 +58,11 @@ Pre-requisites:
    -  Python 2.7 or Python 3.4
    -  g++, supporting c++0x, eg 4.4 or higher
 
-To build:
----------
-
-.. code:: bash
-
-    cd python
-    python setup.py build_ext -i
-
-Then, you can run from this directory, by making sure to add it to the
-path, eg:
-
-::
-
-    PYTHONPATH=. python test_lowlevel.py /my/mnist/data/dir
-
 To install:
------------
+~~~~~~~~~~~
 
 .. code:: bash
 
     cd python
     python setup.py install
 
-Notes on how the wrapper works
-------------------------------
-
--  `cDeepCL.pxd <https://github.com/hughperkins/DeepCL/blob/master/python/cDeepCL.pxd>`__
-   contains the definitions of the underlying DeepCL c++ libraries
-   classes
--  `PyDeepCL.pyx <https://github.com/hughperkins/DeepCL/blob/master/python/PyDeepCL.pyx>`__
-   contains Cython wrapper classes around the underlying c++ classes
--  `setup.py <https://github.com/hughperkins/DeepCL/blob/master/python/setup.py>`__
-   is a setup file for compiling the ``PyDeepCL.pyx`` Cython file
-
-to run unit-tests
------------------
-
-From the python directory:
-
-.. code:: bash
-
-    nosetests -sv
-
-Development builds
-------------------
-
--  If you want to modify the sourcecode, you'll need to re-run cython,
-   so you'll need cython:
-
-   ::
-
-       pip install cython
-
--  If you want to update this readme, you might want to re-generate the
-   README.rst, so you'll need pypandoc:
-
-   ::
-
-       pip install pypandoc
-
--  (note that pypandoc depends on pandoc)
-
-
diff --git a/python/Rmsprop.pyx b/python/Rmsprop.pyx
index 01f61de2..08262878 100644
--- a/python/Rmsprop.pyx
+++ b/python/Rmsprop.pyx
@@ -1,9 +1,9 @@
 cdef class Rmsprop:
     cdef cDeepCL.Rmsprop *thisptr
-    def __cinit__( self, EasyCL cl, learningRate, momentum=0.0 ):
+    def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ):
         self.thisptr = new cDeepCL.Rmsprop(cl.thisptr)
         self.thisptr.setLearningRate(learningRate)
-    def __dealloc(self):
+    def __dealloc__(self):
         del self.thisptr
     def setLearningRate(self, float learningRate):
         self.thisptr.setLearningRate(learningRate)
diff --git a/python/SGD.pyx b/python/SGD.pyx
index d9116624..8bef02b0 100644
--- a/python/SGD.pyx
+++ b/python/SGD.pyx
@@ -7,11 +7,11 @@ cdef class TrainingContext:
 
 cdef class SGD:
     cdef cDeepCL.SGD *thisptr
-    def __cinit__( self, EasyCL cl, learningRate, momentum=0.0 ):
+    def __cinit__( self, DeepCL cl, learningRate, momentum=0.0 ):
         self.thisptr = new cDeepCL.SGD(cl.thisptr)
         self.thisptr.setLearningRate(learningRate)
         self.thisptr.setMomentum(momentum)
-    def __dealloc(self):
+    def __dealloc__(self):
         del self.thisptr
     def setLearningRate(self, float learningRate):
         self.thisptr.setLearningRate(learningRate)
@@ -29,4 +29,3 @@ cdef class SGD:
         cdef cDeepCL.BatchResult result = self.thisptr.trainFromLabels(
             net.thisptr, context.thisptr,
             &inputdata[0], &labels[0])
         return ( result.getLoss(), result.getNumRight() )
-
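The `__dealloc` to `__dealloc__` renames in the two trainer wrappers above fix a silent leak: with the trailing underscores missing, Cython treats the method as an ordinary function that nothing ever calls, so `del self.thisptr` never ran. A plain-Python analogue of the failure mode, using `__del__` since pure Python has no `__dealloc__`:

```python
class Broken:
    def __del(self):    # misspelled special method: just a normal attribute, never invoked
        print('never printed')

class Fixed:
    def __del__(self):  # correctly named: invoked when the object is reclaimed
        print('cleaned up')

Broken()  # prints nothing when the temporary is collected
Fixed()   # prints 'cleaned up' once the temporary is reclaimed
```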
+defs.append(('act', 'float', [('int', 'index')])) +defs.append(('hasFinished', 'bool', [])) +# defs.append(('print', 'void', [])) +# defs.append(('printQRepresentation', 'void', [('NeuralNet *','net')])) diff --git a/python/benchmarking/deepcl_benchmark.py b/python/benchmarking/deepcl_benchmark.py index 78309f95..b82718b2 100755 --- a/python/benchmarking/deepcl_benchmark.py +++ b/python/benchmarking/deepcl_benchmark.py @@ -59,7 +59,7 @@ def writeResults( resultsLine ): def time_layer( numEpochs, batchSize, inputPlanes, inputSize, outputPlanes, filterSize ): print('building network...') - cl = PyDeepCL.EasyCL() + cl = PyDeepCL.DeepCL() net = PyDeepCL.NeuralNet(cl, inputPlanes, inputSize ) # net.addLayer( PyDeepCL.ConvolutionalMaker().numFilters(inputPlanes) # .filterSize(1).padZeros().biased().linear() ) # this is just to make sure that gradient needs to be @@ -91,10 +91,10 @@ def time_layer( numEpochs, batchSize, inputPlanes, inputSize, outputPlanes, filt now = time.time() print(' warm up forward all-layer time', now - last ) last = now - net.backwardFromLabels(labels) - now = time.time() - print(' warm up backward all-layer time', now - last ) - last = now + net.backwardFromLabels(labels) + now = time.time() + print(' warm up backward all-layer time', now - last ) + last = now layer = net.getLayer(2) print('running forward prop timings:') diff --git a/python/benchmarking/deepcl_benchmark2.py b/python/benchmarking/deepcl_benchmark2.py index 1dd527bb..e87e9b27 100755 --- a/python/benchmarking/deepcl_benchmark2.py +++ b/python/benchmarking/deepcl_benchmark2.py @@ -63,7 +63,7 @@ def time_layer(num_epochs, label, batch_size, net_string): print('building network...') input_string, layer_string = net_string.split('-') input_planes, input_size = map(lambda x: int(x), input_string.split('i')) - cl = PyDeepCL.EasyCL() + cl = PyDeepCL.DeepCL() net = PyDeepCL.NeuralNet( cl, input_planes, input_size ) net.addLayer( PyDeepCL.ForceBackpropMaker() ) # this forces the next layer to backward gradients to # this layer @@ -96,16 +96,16 @@ def time_layer(num_epochs, label, batch_size, net_string): net.setBatchSize(batch_size) # warm up forward - for i in range(8): + for i in range(9): last = time.time() net.forward( images ) now = time.time() print(' warm up forward all-layer time', ( now - last ) * 1000, 'ms' ) last = now - net.backwardFromLabels(labels) - now = time.time() - print(' warm up backward all-layer time', (now - last) * 1000, 'ms' ) - last = now + net.backwardFromLabels(labels) + now = time.time() + print(' warm up backward all-layer time', (now - last) * 1000, 'ms' ) + last = now layer = net.getLayer(2) print('running forward prop timings:') @@ -142,7 +142,7 @@ def time_fullnet(num_epochs, label, batch_size, net_string): input_string = split_net_string[0] netdef = '-'.join(split_net_string[1:]) input_planes, input_size = map(lambda x: int(x), input_string.split('i')) - cl = PyDeepCL.EasyCL() + cl = PyDeepCL.DeepCL() net = PyDeepCL.NeuralNet(cl, input_planes, input_size ) PyDeepCL.NetdefToNet.createNetFromNetdef(net, netdef) print( net.asString() ) diff --git a/python/cAdadelta.pxd b/python/cAdadelta.pxd index 619df3bf..81beb14e 100644 --- a/python/cAdadelta.pxd +++ b/python/cAdadelta.pxd @@ -1,6 +1,6 @@ cdef extern from "trainers/Adadelta.h": cdef cppclass Adadelta: - Adadelta( EasyCL *cl, float rho ) except + + Adadelta( DeepCL *cl, float rho ) except + BatchResult train( NeuralNet *net, TrainingContext *context, const float *input, const float *expectedOutput ) BatchResult trainFromLabels( 
NeuralNet *net, TrainingContext *context, diff --git a/python/cAdagrad.pxd b/python/cAdagrad.pxd index 670b901e..3b569a6a 100644 --- a/python/cAdagrad.pxd +++ b/python/cAdagrad.pxd @@ -1,6 +1,6 @@ cdef extern from "trainers/Adagrad.h": cdef cppclass Adagrad: - Adagrad( EasyCL *cl ) except + + Adagrad( DeepCL *cl ) except + void setLearningRate( float learningRate ) BatchResult train( NeuralNet *net, TrainingContext *context, const float *input, const float *expectedOutput ) diff --git a/python/cAnnealer.pxd b/python/cAnnealer.pxd index b8b70e14..e887a6c7 100644 --- a/python/cAnnealer.pxd +++ b/python/cAnnealer.pxd @@ -1,6 +1,6 @@ cdef extern from "trainers/Annealer.h": cdef cppclass Annealer: - Annealer( EasyCL *cl ) except + + Annealer( DeepCL *cl ) except + void setLearningRate( float learningRate ) void setAnneal( float anneal ) BatchResult train( NeuralNet *net, TrainingContext *context, diff --git a/python/cDeepCL.pxd b/python/cDeepCL.pxd index 31fc70ae..4a6eae1e 100644 --- a/python/cDeepCL.pxd +++ b/python/cDeepCL.pxd @@ -4,10 +4,26 @@ # v. 2.0. If a copy of the MPL was not distributed with this file, You can # obtain one at http://mozilla.org/MPL/2.0/. -from libcpp.string cimport string from libcpp cimport bool -include "cEasyCL.pxd" +cdef extern from "DeepCL.h": + cdef cppclass DeepCL: + @staticmethod + DeepCL *createForFirstGpuOtherwiseCpu() + @staticmethod + DeepCL *createForIndexedGpu( int gpu ) + + void deleteMe() + + void setProfiling(bool profiling) + void dumpProfiling(); + + int getComputeUnits() + int getLocalMemorySize() + int getLocalMemorySizeKB() + int getMaxWorkgroupSize() + int getMaxAllocSizeMB() + include "cLayerMaker.pxd" include "cNeuralNet.pxd" include "cSGD.pxd" @@ -22,7 +38,6 @@ include "cNetLearner.pxd" include "cLayer.pxd" include "cQLearning.pxd" -cdef extern from "CyWrappers.h": - cdef void checkException( int *wasRaised, string *message ) - +# cdef extern from "CyWrappers.h": +# cdef void checkException( int *wasRaised, string *message ) diff --git a/python/cEasyCL.pxd b/python/cEasyCL.pxd deleted file mode 100644 index 73dcade6..00000000 --- a/python/cEasyCL.pxd +++ /dev/null @@ -1,7 +0,0 @@ -cdef extern from "EasyCL.h": - cdef cppclass EasyCL: - @staticmethod - EasyCL *createForFirstGpuOtherwiseCpu() - @staticmethod - EasyCL *createForIndexedGpu( int gpu ) - diff --git a/python/cGenericLoader.pxd b/python/cGenericLoader.pxd index f155c8c2..67c2b9f5 100644 --- a/python/cGenericLoader.pxd +++ b/python/cGenericLoader.pxd @@ -1,8 +1,6 @@ cdef extern from "loaders/GenericLoader.h": cdef cppclass GenericLoader: @staticmethod - void getDimensions( string trainFilepath, int *p_numExamples, int *p_numPlanes, int *p_imageSize ) except + + void getDimensions( const char * trainFilepath, int *p_numExamples, int *p_numPlanes, int *p_imageSize ) except + @staticmethod - void load( string trainFilepath, float *images, int *labels, int startN, int numExamples ) except + - - + void load( const char * trainFilepath, float *images, int *labels, int startN, int numExamples ) except + diff --git a/python/cLayer.pxd b/python/cLayer.pxd index a8443e32..3d04b981 100644 --- a/python/cLayer.pxd +++ b/python/cLayer.pxd @@ -6,13 +6,12 @@ cdef extern from "layer/Layer.h": bool getBiased() int getOutputCubeSize() int getOutputPlanes() - int getOutputImageSize() - float * getOutput() int getOutputSize() + float * getOutput() + int getOutputNumElements() int getPersistSize() void persistToArray(float *array) void unpersistFromArray(const float *array) - string asString() - string 
getClassName() - + const char *asNewCharStar() + const char *getClassNameAsCharStar() diff --git a/python/cNesterov.pxd b/python/cNesterov.pxd index 4250e7cf..b4c457a0 100644 --- a/python/cNesterov.pxd +++ b/python/cNesterov.pxd @@ -1,6 +1,6 @@ cdef extern from "trainers/Nesterov.h": cdef cppclass Nesterov: - Nesterov( EasyCL *cl ) except + + Nesterov( DeepCL *cl ) except + void setLearningRate( float learningRate ) void setMomentum( float momentum ) BatchResult train( NeuralNet *net, TrainingContext *context, diff --git a/python/cNetDefToNet.pxd b/python/cNetDefToNet.pxd index e35143d2..51316fbb 100644 --- a/python/cNetDefToNet.pxd +++ b/python/cNetDefToNet.pxd @@ -1,6 +1,6 @@ cdef extern from "netdef/NetdefToNet.h": cdef cppclass NetdefToNet: @staticmethod - bool createNetFromNetdef( NeuralNet *net, string netdef ) except + + bool createNetFromNetdefCharStar( NeuralNet *net, const char * netdef ) except + diff --git a/python/cNeuralNet.pxd b/python/cNeuralNet.pxd index e5a7e2b3..7171147c 100644 --- a/python/cNeuralNet.pxd +++ b/python/cNeuralNet.pxd @@ -1,10 +1,10 @@ cdef extern from "net/NeuralNet.h": cdef cppclass NeuralNet: - #pass - NeuralNet( EasyCL *cl ) except + - #void print() - NeuralNet( EasyCL *cl, int numPlanes, int size ) except + - string asString() except + + @staticmethod + NeuralNet *instance(DeepCL *cl) except + + @staticmethod + NeuralNet *instance3(DeepCL *cl, int numPlanes, int size) except + + const char *asNewCharStar() except + void setBatchSize( int batchSize ) except + void forward( const float *images) except + void backwardFromLabels( const int *labels) except + @@ -14,7 +14,6 @@ cdef extern from "net/NeuralNet.h": Layer *getLayer( int index ) int getNumLayers() const float *getOutput() - int getOutputSize() + int getOutputNumElements() void setTraining( bool training ) - - + void deleteMe() diff --git a/python/cRmsprop.pxd b/python/cRmsprop.pxd index b6f1cd46..0b3a5376 100644 --- a/python/cRmsprop.pxd +++ b/python/cRmsprop.pxd @@ -1,6 +1,6 @@ cdef extern from "trainers/Rmsprop.h": cdef cppclass Rmsprop: - Rmsprop( EasyCL *cl ) except + + Rmsprop( DeepCL *cl ) except + void setLearningRate( float learningRate ) BatchResult train( NeuralNet *net, TrainingContext *context, const float *input, const float *expectedOutput ) diff --git a/python/cSGD.pxd b/python/cSGD.pxd index b149e53f..e10b62b3 100644 --- a/python/cSGD.pxd +++ b/python/cSGD.pxd @@ -9,7 +9,7 @@ cdef extern from "trainers/TrainingContext.h": cdef extern from "trainers/SGD.h": cdef cppclass SGD: - SGD( EasyCL *cl ) except + + SGD( DeepCL *cl ) except + void setLearningRate( float learningRate ) void setMomentum( float momentum ) void setWeightDecay( float weightDecay ) @@ -17,4 +17,3 @@ cdef extern from "trainers/SGD.h": const float *input, const float *expectedOutput ) BatchResult trainFromLabels( NeuralNet *net, TrainingContext *context, const float *input, const int *labels ) - diff --git a/python/clean.sh b/python/clean.sh deleted file mode 100755 index dc984dba..00000000 --- a/python/clean.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -# bash script to purge everything, so we can rebuild from scratch -# not supported on Windows, clearly :-) - - -rm -Rf build dist DeepCL.egg-info mysrc *.pyc PyDeepCL.cpp PyDeepCL.pyd *.so - diff --git a/python/cmake/FindCython.cmake b/python/cmake/FindCython.cmake new file mode 100644 index 00000000..f44f1f70 --- /dev/null +++ b/python/cmake/FindCython.cmake @@ -0,0 +1,45 @@ +# Find the Cython compiler. 
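+#
+# A typical invocation, assuming this python/cmake directory has been added
+# to CMAKE_MODULE_PATH:
+#
+#   find_package( Cython REQUIRED )
+#   message( STATUS "Found Cython: ${CYTHON_EXECUTABLE}" )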
+#
+# This code sets the following variables:
+#
+# CYTHON_EXECUTABLE
+#
+# See also UseCython.cmake
+
+#=============================================================================
+# Copyright 2011 Kitware, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+# Use the Cython executable that lives next to the Python executable
+# if it is a local installation.
+find_package( PythonInterp )
+if( PYTHONINTERP_FOUND )
+  get_filename_component( _python_path ${PYTHON_EXECUTABLE} PATH )
+  find_program( CYTHON_EXECUTABLE
+    NAMES cython cython.bat cython3
+    HINTS ${_python_path}
+    )
+else()
+  find_program( CYTHON_EXECUTABLE
+    NAMES cython cython.bat cython3
+    )
+endif()
+
+
+include( FindPackageHandleStandardArgs )
+FIND_PACKAGE_HANDLE_STANDARD_ARGS( Cython REQUIRED_VARS CYTHON_EXECUTABLE )
+
+mark_as_advanced( CYTHON_EXECUTABLE )
+
diff --git a/python/cmake/ReplicatePythonSourceTree.cmake b/python/cmake/ReplicatePythonSourceTree.cmake
new file mode 100644
index 00000000..d308cce7
--- /dev/null
+++ b/python/cmake/ReplicatePythonSourceTree.cmake
@@ -0,0 +1,4 @@
+# Note: when executed in the build dir, then CMAKE_CURRENT_SOURCE_DIR is the
+# build dir.
+file( COPY setup.py src test bin DESTINATION "${CMAKE_ARGV3}"
+  FILES_MATCHING PATTERN "*.py" )
diff --git a/python/cmake/UseCython.cmake b/python/cmake/UseCython.cmake
new file mode 100644
index 00000000..68345301
--- /dev/null
+++ b/python/cmake/UseCython.cmake
@@ -0,0 +1,296 @@
+# Define a function to create Cython modules.
+#
+# For more information on the Cython project, see http://cython.org/.
+# "Cython is a language that makes writing C extensions for the Python language
+# as easy as Python itself."
+#
+# This file defines a CMake function to build a Cython Python module.
+# To use it, first include this file.
+#
+#   include( UseCython )
+#
+# Then call cython_add_module to create a module.
+#
+#   cython_add_module( <module_name> <src1> <src2> ... <srcN> )
+#
+# To create a standalone executable, use the function
+#
+#   cython_add_standalone_executable( <executable_name> [MAIN_MODULE src1] <src2> <src3> ... <srcN> )
+#
+# To avoid dependence on Python, set the PYTHON_LIBRARY cache variable to point
+# to a static library. If a MAIN_MODULE source is specified,
+# the "if __name__ == '__main__':" from that module is used as the C main() method
+# for the executable. If no MAIN_MODULE is specified, the source with the same basename as
+# <executable_name> is assumed to be the MAIN_MODULE.
+#
+# Where <module_name> is the name of the resulting Python module and
+# <src1> <src2> ... are source files to be compiled into the module, e.g. *.pyx,
+# *.py, *.c, *.cxx, etc. A CMake target is created with name <module_name>. This can
+# be used for target_link_libraries(), etc.
+#
+# The same paths set with the CMake include_directories() command will be used
+# for include directories to search for *.pxd when running the Cython compiler.
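+#
+# For illustration, a hypothetical invocation using this repository's file
+# names, compiling PyDeepCL.pyx as C++ and linking the resulting module
+# against the DeepCL library:
+#
+#   set_source_files_properties( PyDeepCL.pyx PROPERTIES CYTHON_IS_CXX TRUE )
+#   cython_add_module( PyDeepCL PyDeepCL.pyx CyWrappers.cpp )
+#   target_link_libraries( PyDeepCL DeepCL )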
+#
+# Cache variables that affect the behavior include:
+#
+# CYTHON_ANNOTATE
+# CYTHON_NO_DOCSTRINGS
+# CYTHON_FLAGS
+#
+# Source file properties that affect the build process are
+#
+# CYTHON_IS_CXX
+#
+# If this is set on a *.pyx file with the CMake set_source_files_properties()
+# command, the file will be compiled as a C++ file.
+#
+# See also FindCython.cmake

+#=============================================================================
+# Copyright 2011 Kitware, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+# Configuration options.
+set( CYTHON_ANNOTATE OFF
+  CACHE BOOL "Create an annotated .html file when compiling *.pyx." )
+set( CYTHON_NO_DOCSTRINGS OFF
+  CACHE BOOL "Strip docstrings from the compiled module." )
+set( CYTHON_FLAGS "" CACHE STRING
+  "Extra flags to the cython compiler." )
+mark_as_advanced( CYTHON_ANNOTATE CYTHON_NO_DOCSTRINGS CYTHON_FLAGS )
+mark_as_advanced( CMAKE_CXX_COMPILER CMAKE_C_COMPILER )
+
+find_package( Cython REQUIRED )
+find_package( PythonLibs REQUIRED )
+
+set( CYTHON_CXX_EXTENSION "cxx" )
+set( CYTHON_C_EXTENSION "c" )
+
+# Create a *.c or *.cxx file from a *.pyx file.
+# Pass in the generated file basename. The generated file will be put into the
+# variable named by the "generated_file" argument. Finally, pass all the *.py
+# and *.pyx files.
+function( compile_pyx _name generated_file )
+  # Default to assuming all files are C.
+  set( cxx_arg "" )
+  set( extension ${CYTHON_C_EXTENSION} )
+  set( pyx_lang "C" )
+  set( comment "Compiling Cython C source for ${_name}..." )
+
+  set( cython_include_directories "" )
+  set( pxd_dependencies "" )
+  set( c_header_dependencies "" )
+  set( pyx_locations "" )
+
+  foreach( pyx_file ${ARGN} )
+    get_filename_component( pyx_file_basename "${pyx_file}" NAME_WE )
+
+    # Determine if it is a C or C++ file.
+    get_source_file_property( property_is_cxx ${pyx_file} CYTHON_IS_CXX )
+    if( ${property_is_cxx} )
+      set( cxx_arg "--cplus" )
+      set( extension ${CYTHON_CXX_EXTENSION} )
+      set( pyx_lang "CXX" )
+      set( comment "Compiling Cython CXX source for ${_name}..." )
+    endif()
+
+    # Get the include directories.
+    get_source_file_property( pyx_location ${pyx_file} LOCATION )
+    get_filename_component( pyx_path ${pyx_location} PATH )
+    get_directory_property( cmake_include_directories DIRECTORY ${pyx_path} INCLUDE_DIRECTORIES )
+    list( APPEND cython_include_directories ${cmake_include_directories} )
+    list( APPEND pyx_locations "${pyx_location}" )
+
+    # Determine dependencies.
+    # Add the pxd file with the same name as the given pyx file.
+    unset( corresponding_pxd_file CACHE )
+    find_file( corresponding_pxd_file ${pyx_file_basename}.pxd
+      PATHS "${pyx_path}" ${cmake_include_directories}
+      NO_DEFAULT_PATH )
+    if( corresponding_pxd_file )
+      list( APPEND pxd_dependencies "${corresponding_pxd_file}" )
+    endif()
+
+    # pxd files to check for additional dependencies.
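+    # The while loop below walks the cimport graph transitively: each pxd
+    # file that turns up is itself scanned for 'cdef extern from "..."'
+    # statements (recorded as C header dependencies) and for further
+    # cimports (recorded as additional pxd dependencies), until no
+    # unchecked pxd files remain. A change to any transitive dependency
+    # therefore re-triggers cythonization.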
+ set( pxds_to_check "${pyx_file}" "${pxd_dependencies}" ) + set( pxds_checked "" ) + set( number_pxds_to_check 1 ) + while( ${number_pxds_to_check} GREATER 0 ) + foreach( pxd ${pxds_to_check} ) + list( APPEND pxds_checked "${pxd}" ) + list( REMOVE_ITEM pxds_to_check "${pxd}" ) + + # check for C header dependencies + file( STRINGS "${pxd}" extern_from_statements + REGEX "cdef[ ]+extern[ ]+from.*$" ) + foreach( statement ${extern_from_statements} ) + # Had trouble getting the quote in the regex + string( REGEX REPLACE "cdef[ ]+extern[ ]+from[ ]+[\"]([^\"]+)[\"].*" "\\1" header "${statement}" ) + unset( header_location CACHE ) + find_file( header_location ${header} PATHS ${cmake_include_directories} ) + if( header_location ) + list( FIND c_header_dependencies "${header_location}" header_idx ) + if( ${header_idx} LESS 0 ) + list( APPEND c_header_dependencies "${header_location}" ) + endif() + endif() + endforeach() + + # check for pxd dependencies + + # Look for cimport statements. + set( module_dependencies "" ) + file( STRINGS "${pxd}" cimport_statements REGEX cimport ) + foreach( statement ${cimport_statements} ) + if( ${statement} MATCHES from ) + string( REGEX REPLACE "from[ ]+([^ ]+).*" "\\1" module "${statement}" ) + else() + string( REGEX REPLACE "cimport[ ]+([^ ]+).*" "\\1" module "${statement}" ) + endif() + list( APPEND module_dependencies ${module} ) + endforeach() + list( REMOVE_DUPLICATES module_dependencies ) + # Add the module to the files to check, if appropriate. + foreach( module ${module_dependencies} ) + unset( pxd_location CACHE ) + find_file( pxd_location ${module}.pxd + PATHS "${pyx_path}" ${cmake_include_directories} NO_DEFAULT_PATH ) + if( pxd_location ) + list( FIND pxds_checked ${pxd_location} pxd_idx ) + if( ${pxd_idx} LESS 0 ) + list( FIND pxds_to_check ${pxd_location} pxd_idx ) + if( ${pxd_idx} LESS 0 ) + list( APPEND pxds_to_check ${pxd_location} ) + list( APPEND pxd_dependencies ${pxd_location} ) + endif() # if it is not already going to be checked + endif() # if it has not already been checked + endif() # if pxd file can be found + endforeach() # for each module dependency discovered + endforeach() # for each pxd file to check + list( LENGTH pxds_to_check number_pxds_to_check ) + endwhile() + endforeach() # pyx_file + + # Set additional flags. + if( CYTHON_ANNOTATE ) + set( annotate_arg "--annotate" ) + endif() + + if( CYTHON_NO_DOCSTRINGS ) + set( no_docstrings_arg "--no-docstrings" ) + endif() + + if( "${CMAKE_BUILD_TYPE}" STREQUAL "Debug" OR + "${CMAKE_BUILD_TYPE}" STREQUAL "RelWithDebInfo" ) + set( cython_debug_arg "--gdb" ) + endif() + + if( "${PYTHONLIBS_VERSION_STRING}" MATCHES "^2." ) + set( version_arg "-2" ) + elseif( "${PYTHONLIBS_VERSION_STRING}" MATCHES "^3." ) + set( version_arg "-3" ) + else() + set( version_arg ) + endif() + + # Include directory arguments. + list( REMOVE_DUPLICATES cython_include_directories ) + set( include_directory_arg "" ) + foreach( _include_dir ${cython_include_directories} ) + set( include_directory_arg ${include_directory_arg} "-I" "${_include_dir}" ) + endforeach() + + # Determining generated file name. + set( _generated_file "${CMAKE_CURRENT_SOURCE_DIR}/${_name}.${extension}" ) + set_source_files_properties( ${_generated_file} PROPERTIES GENERATED TRUE ) + set( ${generated_file} ${_generated_file} PARENT_SCOPE ) + + list( REMOVE_DUPLICATES pxd_dependencies ) + list( REMOVE_DUPLICATES c_header_dependencies ) + + # Add the command to run the compiler. 
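+  # The generated file is the OUTPUT of the custom command, so it is rebuilt
+  # whenever the pyx sources or any discovered pxd dependency changes, and
+  # IMPLICIT_DEPENDS additionally tracks the C/C++ headers. The assembled
+  # command is roughly equivalent to running, with illustrative file names:
+  #
+  #   cython --cplus -2 -I <include dir> --output-file PyDeepCL.cxx PyDeepCL.pyx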
+ add_custom_command( OUTPUT ${_generated_file} + COMMAND ${CYTHON_EXECUTABLE} + ARGS ${cxx_arg} ${include_directory_arg} ${version_arg} + ${annotate_arg} ${no_docstrings_arg} ${cython_debug_arg} ${CYTHON_FLAGS} + --output-file ${_generated_file} ${pyx_locations} + DEPENDS ${pyx_locations} ${pxd_dependencies} + IMPLICIT_DEPENDS ${pyx_lang} ${c_header_dependencies} + COMMENT ${comment} + ) + + # Remove their visibility to the user. + set( corresponding_pxd_file "" CACHE INTERNAL "" ) + set( header_location "" CACHE INTERNAL "" ) + set( pxd_location "" CACHE INTERNAL "" ) +endfunction() + +# cython_add_module( src1 src2 ... srcN ) +# Build the Cython Python module. +function( cython_add_module _name ) + set( pyx_module_sources "" ) + set( other_module_sources "" ) + foreach( _file ${ARGN} ) + if( ${_file} MATCHES ".*\\.py[x]?$" ) + list( APPEND pyx_module_sources ${_file} ) + else() + list( APPEND other_module_sources ${_file} ) + endif() + endforeach() + compile_pyx( ${_name} generated_file ${pyx_module_sources} ) + include_directories( ${PYTHON_INCLUDE_DIRS} ) + python_add_module( ${_name} ${generated_file} ${other_module_sources} ) + if( APPLE ) + set_target_properties( ${_name} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup" ) + else() + target_link_libraries( ${_name} ${PYTHON_LIBRARIES} ) + endif() +endfunction() + +include( CMakeParseArguments ) +# cython_add_standalone_executable( _name [MAIN_MODULE src3.py] src1 src2 ... srcN ) +# Creates a standalone executable the given sources. +function( cython_add_standalone_executable _name ) + set( pyx_module_sources "" ) + set( other_module_sources "" ) + set( main_module "" ) + cmake_parse_arguments( cython_arguments "" "MAIN_MODULE" "" ${ARGN} ) + include_directories( ${PYTHON_INCLUDE_DIRS} ) + foreach( _file ${cython_arguments_UNPARSED_ARGUMENTS} ) + if( ${_file} MATCHES ".*\\.py[x]?$" ) + get_filename_component( _file_we ${_file} NAME_WE ) + if( "${_file_we}" STREQUAL "${_name}" ) + set( main_module "${_file}" ) + elseif( NOT "${_file}" STREQUAL "${cython_arguments_MAIN_MODULE}" ) + set( PYTHON_MODULE_${_file_we}_static_BUILD_SHARED OFF ) + compile_pyx( "${_file_we}_static" generated_file "${_file}" ) + list( APPEND pyx_module_sources "${generated_file}" ) + endif() + else() + list( APPEND other_module_sources ${_file} ) + endif() + endforeach() + + if( cython_arguments_MAIN_MODULE ) + set( main_module ${cython_arguments_MAIN_MODULE} ) + endif() + if( NOT main_module ) + message( FATAL_ERROR "main module not found." ) + endif() + get_filename_component( main_module_we "${main_module}" NAME_WE ) + set( CYTHON_FLAGS ${CYTHON_FLAGS} --embed ) + compile_pyx( "${main_module_we}_static" generated_file ${main_module} ) + add_executable( ${_name} ${generated_file} ${pyx_module_sources} ${other_module_sources} ) + target_link_libraries( ${_name} ${PYTHON_LIBRARIES} ${pyx_module_libs} ) +endfunction() diff --git a/python/cog_cython.py b/python/cog_cython.py index be9dd76c..fde72a77 100644 --- a/python/cog_cython.py +++ b/python/cog_cython.py @@ -1,160 +1,187 @@ # Copyright Hugh Perkins 2015 hughperkins at gmail # -# This Source Code Form is subject to the terms of the Mozilla Public License, -# v. 2.0. If a copy of the MPL was not distributed with this file, You can +# This Source Code Form is subject to the terms of the Mozilla Public License, +# v. 2.0. If a copy of the MPL was not distributed with this file, You can # obtain one at http://mozilla.org/MPL/2.0/. 
-# functions to help wrap C++ callback classes in Cython, and more -# There are three parts to wrapping C++ callback classes: -# -# - in C++, you need to override the C++-side abstract class -# => cpp_write_proxy_class -# -# - in the pxd, you need to declare the C++ proxy class -# => pxd_write_proxy_class -# -# - in the .pyx, you need to write a wrapper class, that can be -# overridden in the python files -# => pyx_write_overrideable_class -# -# in all cases, you need to provide a 'defs' file, which is a python -# file with a list of method definitions, provided as tuples like: -# defs.append( ( 'act', 'float', [('int','index')] ) ) -# here: - act is the name of the method -# - float is the return type -# - there is one parameter 'index', of type 'int' +""" +functions to help wrap C++ callback classes in Cython, and more +There are three parts to wrapping C++ callback classes: + +- in C++, you need to override the C++-side abstract class + => cpp_write_proxy_class + +- in the pxd, you need to declare the C++ proxy class + => pxd_write_proxy_class + +- in the .pyx, you need to write a wrapper class, that can be + overridden in the python files + => pyx_write_overrideable_class + +in all cases, you need to provide a 'defs' file, which is a python +file with a list of method definitions, provided as tuples like: +defs.append(('act', 'float', [('int','index')])) +here: - act is the name of the method + - float is the return type + - there is one parameter 'index', of type 'int' +""" import cog -def upperFirst( word ): - """helper method to capitalize the first letter of word""" + +def upperFirst(word): + """ + helper method to capitalize the first letter of word + """ word = word[0].upper() + word[1:] return word -def cpp_write_proxy_class( proxy_name, parent_name, defs ): - """use to create a c++ class that inherits from a (possibly abstract) c++ class + +def cpp_write_proxy_class(proxy_name, parent_name, defs): + """ + use to create a c++ class that inherits from a (possibly abstract) c++ class and handles the c++ side of receiving callback functions into cython, - and calling these appropriately""" + and calling these appropriately + """ cog.outl('// generated using cog (as far as the [[end]] bit:') - cog.outl( 'class ' + proxy_name + ' : public ' + parent_name + ' {' ) - cog.outl( 'public:') - cog.outl( ' void *pyObject;') - cog.outl( '') - cog.outl( ' ' + proxy_name + '(void *pyObject) :') - cog.outl( ' pyObject(pyObject) {') - cog.outl( ' }') - cog.outl( '') + cog.outl('class ' + proxy_name + ' : public ' + parent_name + ' {') + cog.outl('public:') + cog.outl(' void *pyObject;') + cog.outl('') + cog.outl(' ' + proxy_name + '(void *pyObject) :') + cog.outl(' pyObject(pyObject) {') + cog.outl(' }') + cog.outl('') for thisdef in defs: - ( name, returnType, parameters ) = thisdef + (name, returnType, parameters) = thisdef cog.out(' typedef ' + returnType + '(*' + name + 'Def)(') for parameter in parameters: - (ptype,pname) = parameter - cog.out( ptype + ' ' + pname + ',') - cog.outl( ' void *pyObject);') + (ptype, pname) = parameter + cog.out(ptype + ' ' + pname + ',') + cog.outl(' void *pyObject);') cog.outl('') for thisdef in defs: - ( name, returnType, parameters ) = thisdef - cog.outl( ' ' + name + 'Def c' + upperFirst( name ) + ';' ) - cog.outl('') + (name, returnType, parameters) = thisdef + cog.outl(' ' + name + 'Def c' + upperFirst(name) + ';') + cog.outl('') for thisdef in defs: - ( name, returnType, parameters ) = thisdef - cog.outl( ' void set' + upperFirst( name ) + ' ( ' + name 
+ 'Def c' + upperFirst( name ) + ' ) {' ) - cog.outl( ' this->c' + upperFirst( name ) + ' = c' + upperFirst( name ) + ';' ) - cog.outl( ' }') - cog.outl('') + (name, returnType, parameters) = thisdef + cog.outl( + ' void set' + upperFirst(name) + ' (' + + name + 'Def c' + upperFirst(name) + ') {') + cog.outl( + ' this->c' + upperFirst(name) + + ' = c' + upperFirst(name) + ';') + cog.outl(' }') + cog.outl('') for thisdef in defs: - ( name, returnType, parameters ) = thisdef - cog.out( ' virtual ' + returnType + ' ' + name + '(' ) + (name, returnType, parameters) = thisdef + cog.out(' virtual ' + returnType + ' ' + name + '(') isFirstParam = True for param in parameters: - (ptype,pname) = param + (ptype, pname) = param if not isFirstParam: cog.out(', ') - cog.out( ptype + ' ' + pname ) + cog.out(ptype + ' ' + pname) isFirstParam = False cog.outl(') {') - # cog.outl(' std::cout << "CyScenario.' + name + '()" << std::endl;') cog.out(' ') if returnType != 'void': cog.out('return ') - cog.out('c' + upperFirst( name ) + '(') + cog.out('c' + upperFirst(name) + '(') for param in parameters: - (ptype,pname) = param - cog.out( pname + ', ' ) - cog.outl( 'pyObject );' ) + (ptype, pname) = param + cog.out(pname + ', ') + cog.outl('pyObject);') cog.outl(' }') - cog.outl( '};' ) + cog.outl('};') + -def pxd_write_proxy_class( proxy_name, defs ): - """writes the pxd declaration of the same class that was created using +def pxd_write_proxy_class(proxy_name, defs): + """ + writes the pxd declaration of the same class that was created using 'cpp_write_proxy_class' for C++ above. - This should be used inside 'cdef extern from "somefile.h":' section""" + This should be used inside 'cdef extern from "somefile.h":' section + """ cog.outl('# generated using cog (as far as the [[end]] bit:') for thisdef in defs: - ( name, returnType, parameters ) = thisdef - cog.out('ctypedef ' + returnType + '(*' + proxy_name + '_' + name + 'Def)(') + (name, returnType, parameters) = thisdef + cog.out( + 'ctypedef ' + returnType + + '(*' + proxy_name + '_' + name + 'Def)(') for parameter in parameters: - (ptype,pname) = parameter - cog.out( ptype + ' ' + pname + ',') - cog.outl( ' void *pyObject)') + (ptype, pname) = parameter + cog.out(ptype + ' ' + pname + ',') + cog.outl(' void *pyObject)') - cog.outl( 'cdef cppclass ' + proxy_name + ':') - cog.outl( ' ' + proxy_name + '(void *pyObject)') - cog.outl( '' ) + cog.outl('cdef cppclass ' + proxy_name + ':') + cog.outl(' ' + proxy_name + '(void *pyObject)') + cog.outl('') for thisdef in defs: - ( name, returnType, parameters ) = thisdef - cog.outl( ' void set' + upperFirst( name ) + ' ( ' + proxy_name + '_' + name + 'Def c' + upperFirst( name ) + ' )') + (name, returnType, parameters) = thisdef + cog.outl( + ' void set' + upperFirst(name) + + ' (' + proxy_name + '_' + name + 'Def c' + upperFirst(name) + ')') + -def pyx_write_overrideable_class( pxd_module, pxd_class, pyx_class, defs, skip_names ): - """writes the python class in the pyx file that the .py modules +def pyx_write_overrideable_class( + pxd_module, pxd_class, pyx_class, defs, skip_names): + """ + writes the python class in the pyx file that the .py modules can override, and receives callbacks from any method names in skip_names will be skipped, and you can write them - manually before/after the cog block""" + manually before/after the cog block + """ cog.outl('# generated using cog (as far as the [[end]] bit:') for thisdef in defs: - ( name, returnType, parameters ) = thisdef - if not name in skip_names: - cog.out('cdef ' 
+ returnType + ' ' + pyx_class + '_' + name + '( ') - for (ptype,pname) in parameters: - cog.out( ptype + ' ' + pname + ', ' ) + (name, returnType, parameters) = thisdef + if name not in skip_names: + cog.out('cdef ' + returnType + ' ' + pyx_class + '_' + name + '(') + for (ptype, pname) in parameters: + cog.out(ptype + ' ' + pname + ', ') isFirst = False - cog.outl( ' void *pyObject ):') - cog.out( ' ') + cog.outl(' void *pyObject):') + cog.out(' ') if returnType != 'void': - cog.out( 'return ') - cog.out( '(pyObject).' + name + '(') + cog.out('return ') + cog.out('(pyObject).' + name + '(') isFirst = True - for (ptype,pname) in parameters: + for (ptype, pname) in parameters: if not isFirst: cog.out(', ') - cog.out( pname ) + cog.out(pname) isFirst = False - cog.outl( ')' ) - cog.outl( '' ) - cog.outl( 'cdef class ' + pyx_class + ':') - cog.outl( ' cdef ' + pxd_module + '.' + pxd_class + ' *thisptr') - cog.outl( ' def __cinit__(self):') - cog.outl( ' self.thisptr = new ' + pxd_module + '.' + pxd_class + '(self )') - cog.outl( '' ) + cog.outl(')') + cog.outl('') + cog.outl('cdef class ' + pyx_class + ':') + cog.outl(' cdef ' + pxd_module + '.' + pxd_class + ' *thisptr') + cog.outl(' def __cinit__(self):') + cog.outl( + ' self.thisptr = new ' + + pxd_module + '.' + pxd_class + '(self)') + cog.outl('') for thisdef in defs: - ( name, returnType, parameters ) = thisdef - cog.outl(' self.thisptr.set' + upperFirst( name ) + '( ' + pyx_class + '_' + name + ' )' ) - cog.outl( '' ) + (name, returnType, parameters) = thisdef + cog.outl( + ' self.thisptr.set' + upperFirst(name) + + '(' + pyx_class + '_' + name + ')') + cog.outl('') for thisdef in defs: - ( name, returnType, parameters ) = thisdef + (name, returnType, parameters) = thisdef if name in skip_names: continue - cog.out( ' def ' + name + '(self') - for (ptype,pname) in parameters: - cog.out( ', ' + pname ) - cog.outl( '):') - cog.outl(' raise Exception("Method needs to be overridden: ' + pyx_class + '.' + name + '()")') + cog.out(' def ' + name + '(self') + for (ptype, pname) in parameters: + cog.out(', ' + pname) + cog.outl('):') + cog.outl( + ' raise Exception("Method needs to be overridden: ' + + pyx_class + '.' + name + '()")') cog.outl('') - diff --git a/python/dev-build.py b/python/dev-build.py deleted file mode 100644 index 4c567042..00000000 --- a/python/dev-build.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/python -# Copyright Hugh Perkins 2015 hughperkins at gmail -# -# This Source Code Form is subject to the terms of the Mozilla Public License, -# v. 2.0. If a copy of the MPL was not distributed with this file, You can -# obtain one at http://mozilla.org/MPL/2.0/. 
- -# **************************************************************************** -# * * -# * IMPORTANT: * -# * This script is for python wrapper development * -# * If you want to build and use the wrapper, you probably want * -# * to use: 'setup.py' * -# * * -# **************************************************************************** -# -# This script uses the binary built by the c++ cmake build, which can be built -# using multiple threads etc, so is fast to build during development :-) -# If you just want to build and use the wrapper, you should probably use -# 'setup.py', which builds slower, but more likely to be reliable and multi -# platform -# -# Bearing in mind these caveats, if you do want to use this script: -# -# - first, build DeepCL shared object (.so or .dll) into the ../build directory -# -# - then, simply run this script as for setup.py, ie: -# -# python dev-build.py build_ext -i -# -# ... and then you can simply run the test python scripts as before, eg: -# -# ./test_lowlevel.py /mydata/mnist -# -# The following command might be useful for running the C++ build, on linux: -# -# ( cd ..; mkdir -p build; cd build; cmake ..; make -j 4 ) -# - -import os -import os.path -import sysconfig -import sys -import glob -import platform -from setuptools import setup -from setuptools import Extension -from Cython.Build import cythonize -import pypandoc -import cogapp - -for arg in sys.argv: - if arg == 'upload' or arg == 'register' or arg == 'testarg': - print('This setup is not designed to be uploaded or registered :-)') - sys.exit(-1) - -def read(fname): - return open(os.path.join(os.path.dirname(__file__), fname)).read() - -def get_so_suffix(): - if sysconfig.get_config_var('SOABI') != None: - return "." + sysconfig.get_config_var('SOABI') - return "" - -pypandoc.convert('README.md', 'rst', outputfile = 'README.rst' ) - -cog = cogapp.cogapp.Cog() -cog.callableMain(['','--verbosity=1','-r','CyScenario.h','PyDeepCL.pyx','cDeepCL.pxd']) - -# from http://stackoverflow.com/questions/14320220/testing-python-c-libraries-get-build-path -def distutils_dir_name(dname): - """Returns the name of a distutils build directory""" - f = "{dirname}.{platform}-{version[0]}.{version[1]}" - return f.format(dirname=dname, - platform=sysconfig.get_platform(), - version=sys.version_info) - -def lib_build_dir(): - return os.path.join('build', distutils_dir_name('lib')) - -compile_options = [] -osfamily = platform.uname()[0] -if osfamily == 'Windows': - compile_options.append('/EHsc') -elif osfamily == 'Linux': - compile_options.append('-std=c++0x') - compile_options.append('-g') -else: - pass - # put other options etc here if necessary - -runtime_library_dirs = [] -libraries = [] -if osfamily == 'Linux': - runtime_library_dirs= ['../build'] - -if osfamily == 'Windows': - libraries = ['winmm'] - -libraries.append('DeepCL') - -ext_modules = [ - Extension("PyDeepCL", - sources=["PyDeepCL.pyx", 'CyWrappers.cpp'], - include_dirs = ['../src','../EasyCL','../qlearning'], - libraries= libraries, - extra_compile_args=compile_options, -# extra_objects=['cDeepCL.pxd'], - library_dirs = runtime_library_dirs, - runtime_library_dirs=runtime_library_dirs, - language="c++" - ) -] - -setup( - name = 'DeepCL', - # version = "1.0.2", - author = "Hugh Perkins", - author_email = "hughperkins@gmail.com", - description = 'python wrapper for DeepCL deep convolutional neural network library for OpenCL', - license = 'MPL', - url = 'https://github.com/hughperkins/DeepCL', - long_description = read('README.rst'), - classifiers = [ 
- 'Development Status :: 4 - Beta', - 'Topic :: Scientific/Engineering :: Artificial Intelligence', - 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)', - ], - install_requires = ['Cython>=0.22','cogapp>=2.4','future>=0.14.3'], - tests_require = ['nose>=1.3.4'], - scripts = ['test_deepcl.py','test_lowlevel.py'], - # modules = libraries, -# libraries = libraries, - ext_modules = cythonize( ext_modules), -) - - diff --git a/python/publish.sh b/python/publish.sh deleted file mode 100755 index 47200054..00000000 --- a/python/publish.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -~/envs/bin/python setup.py bdist_egg upload -~/env-34/bin/python setup.py bdist_egg upload -~/env-34/bin/python setup.py sdist upload - diff --git a/python/setup.py b/python/setup.py index 900932a3..9339c03d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,305 +1,106 @@ # Copyright Hugh Perkins 2015 hughperkins at gmail # -# This Source Code Form is subject to the terms of the Mozilla Public License, -# v. 2.0. If a copy of the MPL was not distributed with this file, You can +# This Source Code Form is subject to the terms of the Mozilla Public License, +# v. 2.0. If a copy of the MPL was not distributed with this file, You can # obtain one at http://mozilla.org/MPL/2.0/. -#from distutils.core import setup +from __future__ import print_function import os import os.path -import sysconfig import sys -import glob import platform from setuptools import setup -#from distutils.extension import Extension from setuptools import Extension -import distutils.dir_util -import distutils.file_util cython_present = False -try: - from Cython.Build import cythonize - cython_present = True -except ImportError: - pass - -pypandoc_present = False -try: - import pypandoc - pypandoc_present = True -except ImportError: - pass -#print ( sys.argv ) -# if any of sys.argv is bdist or sdist or bdist_egg, then lets copy everything to -# a subfolder of us called 'mysrc', since '..' paths dont work well.. -# otherwise, let's just assume this folder already contains our source :-) -docopy = False +building_dist = False for arg in sys.argv: - if arg in ('sdist','bdist','bdist_egg','build_ext'): - docopy = True + if arg in ('sdist', 'bdist', 'bdist_egg', 'build_ext'): + building_dist = True + break -srcdirs = ['lua', 'activate','batch','clmath','conv','dropout','fc','forcebackprop', - 'input','layer','loaders','loss','net','netdef','normalize','patches', - 'pooling','trainers','util','weights', 'qlearning' ] +if building_dist: + try: + import pypandoc + pypandoc.convert('README.md', 'rst', outputfile='README.rst') + except: + print('WARNING: pypandoc not installed, cannot update README.rst') -if docopy: - if not os.path.isdir('mysrc'): - os.makedirs('mysrc') - if not os.path.isdir('mysrc/util'): - os.makedirs('mysrc/util') - if not os.path.isdir('mysrc/templates'): - os.makedirs('mysrc/templates') - if not os.path.isdir('mysrc/lua'): - os.makedirs('mysrc/lua') - for thisdir in ['../src','../EasyCL', - '../EasyCL/thirdparty/clew/src', - '../EasyCL/thirdparty/clew/include']: # copy everything.. 
- for thisfile in os.listdir(thisdir): - #print(thisfile) - thisfilepath = thisdir +'/' + thisfile - if os.path.isfile(thisfilepath): - distutils.file_util.copy_file( thisfilepath, 'mysrc/' + thisfile ) - for thisdir in ['../EasyCL/util']: - for thisfile in os.listdir(thisdir): - #print(thisfile) - thisfilepath = thisdir +'/' + thisfile - if os.path.isfile(thisfilepath): - distutils.file_util.copy_file( thisfilepath, 'mysrc/util/' + thisfile ) - for thisdir in ['../EasyCL/thirdparty/lua-5.1.5/src']: - for thisfile in os.listdir(thisdir): - #print(thisfile) - thisfilepath = thisdir +'/' + thisfile - if os.path.isfile(thisfilepath): - distutils.file_util.copy_file( thisfilepath, 'mysrc/lua/' + thisfile ) - distutils.file_util.copy_file('../EasyCL/thirdparty/lua-5.1.5/files.txt', 'mysrc/lua/files.txt') - for thisdir in ['../EasyCL/templates']: - for thisfile in os.listdir(thisdir): - #print(thisfile) - thisfilepath = thisdir +'/' + thisfile - if os.path.isfile(thisfilepath): - distutils.file_util.copy_file( thisfilepath, 'mysrc/templates/' + thisfile ) - distutils.file_util.copy_file( '../jenkins/version.txt', 'version.txt' ) - for srcdir in srcdirs: - if srcdir == 'lua': - continue - if not os.path.isdir('mysrc/' + srcdir): - os.makedirs('mysrc/' + srcdir) - for thisfile in os.listdir('../src/' + srcdir): - thisfilepath = '../src/' + srcdir +'/' + thisfile - if os.path.isfile(thisfilepath): - distutils.file_util.copy_file( thisfilepath, 'mysrc/' + srcdir + '/' + thisfile ) - -# distutils.dir_util.copy_tree( thisdir, 'mysrc' ) def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -def get_so_suffix(): - if sysconfig.get_config_var('SOABI') != None: - return "." + sysconfig.get_config_var('SOABI') - return "" - -if pypandoc_present: - pypandoc.convert('README.md', 'rst', outputfile = 'README.rst' ) - -def my_cythonize(extensions, **_ignore): - #newextensions = [] - for extension in extensions: - print(extension.sources) - should_cythonize = False - sources = [] - for sfile in extension.sources: - path, ext = os.path.splitext(sfile) - if ext in ('.pyx', '.py'): - should_cythonize = True - if not cython_present: - # if extension.language == 'c++': - ext = '.cpp' - #else: - # ext = '.c' - if sfile == 'PyDeepCL.c': - ext = '.cpp' # hack for now... not sure how to fix this cleanly - # yet - sfile = path + ext - if sfile.startswith('..'): - # use mysrc instead - basename = os.path.basename(sfile) - sfile = 'mysrc/' + basename - sources.append(sfile) - #print(should_cythonize) - if should_cythonize and cython_present: - print('cythonizing...') - cythonize(extension) - extension.sources[:] = sources - #newextensions.append( extension ) - return extensions - -def no_cythonize(extensions, **_ignore): - for extension in extensions: - sources = [] - for sfile in extension.sources: - path, ext = os.path.splitext(sfile) - #print('path,ext',path,ext) - if ext in ('.pyx', '.py'): - #if extension.language == 'c++': - ext = '.cpp' - #else: - # ext = '.c' - if sfile == 'PyDeepCL.c': - ext = '.cpp' # hack for now... 
not sure how to fix this cleanly - # yet - sfile = path + ext - if sfile.startswith('..'): - # use mysrc instead - basename = os.path.basename(sfile) - sfile = 'mysrc/' + basename - sources.append(sfile) - print('appending source ', sfile ) - extension.sources[:] = sources - return extensions - -# from http://stackoverflow.com/questions/14320220/testing-python-c-libraries-get-build-path -def distutils_dir_name(dname): - """Returns the name of a distutils build directory""" - f = "{dirname}.{platform}-{version[0]}.{version[1]}" - return f.format(dirname=dname, - platform=sysconfig.get_platform(), - version=sys.version_info) - -def lib_build_dir(): - return os.path.join('build', distutils_dir_name('lib')) - -deepcl_sources = [] -for srcdir in srcdirs: - filespath = 'mysrc/' + srcdir + '/files.txt' - fileslist = [] - with open( filespath, 'r' ) as f: - lines = f.readlines() - for line in lines: - if line.strip() != "": - fileslist.append( 'mysrc/' + srcdir + '/' + line.strip() ) -# print('fileslist: ', fileslist) - deepcl_sources = deepcl_sources + fileslist -print('deeplcl_sources', deepcl_sources) -#for source in deepcl_sources_all: -# deepcl_sources.append(source) - -easyclsources = list(map( lambda name : 'mysrc/' + name, [ - 'EasyCL.cpp', - 'deviceinfo_helper.cpp', 'platforminfo_helper.cpp', - 'templates/LuaTemplater.cpp', - 'util/easycl_stringhelper.cpp', 'templates/TemplatedKernel.cpp', -# 'EasyCL/speedtemplates/SpeedTemplates.cpp', - 'CLWrapper.cpp', - 'CLKernel.cpp', 'clew.c' ] )) -print(easyclsources) -print(isinstance( easyclsources, list) ) - compile_options = [] osfamily = platform.uname()[0] if osfamily == 'Windows': - compile_options.append('/EHsc') + compile_options.append('/EHsc') elif osfamily == 'Linux': - compile_options.append('-std=c++0x') - compile_options.append('-g') + compile_options.append('-std=c++0x') + compile_options.append('-g') else: - pass - # put other options etc here if necessary + pass + # put other options etc here if necessary runtime_library_dirs = [] libraries = [] +libraries.append('clBLAS') +libraries.append('EasyCL') +libraries.append('DeepCL') + +library_dirs = [] +library_dirs.append('../dist/lib') +library_dirs.append('../dist/lib/import') + if osfamily == 'Linux': - runtime_library_dirs= ['.'] + runtime_library_dirs = ['.'] if osfamily == 'Windows': - libraries = ['winmm'] + libraries.append('winmm') +sources = ["PyDeepCL.cxx", 'CyWrappers.cpp'] if cython_present: - my_cythonize = cythonize -else: - my_cythonize = no_cythonize - -#libraries = [ -# ("EasyCL", { -# 'sources': easyclsources + ['dummy_easycl.cpp'], -# 'include_dirs': ['DeepCL/EasyCL'], -# 'extra_compile_args': compile_options, -## define_macros = [('EasyCL_EXPORTS',1)], -## libraries = [] -## language='c++' -# } -# ) -#] - + sources = ["PyDeepCL.pyx", 'CyWrappers.cpp'] ext_modules = [ -# Extension("_EasyCL", -# sources = easyclsources + ['dummy_easycl.cpp'], -# include_dirs = ['DeepCL/EasyCL'], -# extra_compile_args=compile_options, -# define_macros = [('EasyCL_EXPORTS',1),('MS_WIN32',1)], -## libraries = [] -## language='c++' -# ) -# Extension("libDeepCL", -# list(map( lambda name : 'DeepCL/src/' + name, deepcl_sources)), # + -## glob.glob('DeepCL/src/*.h'), -# include_dirs = ['DeepCL/src','DeepCL/EasyCL'], -# extra_compile_args = compile_options, -# library_dirs = [ lib_build_dir() ], -# libraries = [ "EasyCL" + get_so_suffix() ], -# define_macros = [('DeepCL_EXPORTS',1)], -# runtime_library_dirs=runtime_library_dirs -## language='c++' -# ), Extension("PyDeepCL", - 
sources=["PyDeepCL.pyx", 'CyWrappers.cpp'] - + easyclsources - + deepcl_sources, -# glob.glob('DeepCL/EasyCL/*.h'), - include_dirs = ['mysrc', 'mysrc/lua'], - libraries= libraries, + sources=sources, + include_dirs=['mysrc', 'mysrc/lua'], + library_dirs=library_dirs, + libraries=libraries, extra_compile_args=compile_options, - define_macros = [('DeepCL_EXPORTS',1),('EasyCL_EXPORTS',1)], -# extra_objects=['cDeepCL.pxd'], -# library_dirs = [lib_build_dir()], runtime_library_dirs=runtime_library_dirs, - language="c++" - ) -] + language="c++")] + def read_if_exists(filename): filepath = os.path.join(os.path.dirname(__file__), filename) if os.path.isfile(filepath): return open(filepath).read() else: - "" + return "" version = read_if_exists('version.txt').strip().replace('v', '') -print('version: ', version ) +if building_dist and version == '': + raise Exception('version cannot be empty string when building dist') +print('version: ', version) setup( - name = 'DeepCL', - version = version, -# version = "3.4.0rc1", # synchronize to deepcl main version - author = "Hugh Perkins", - author_email = "hughperkins@gmail.com", - description = 'python wrapper for DeepCL deep convolutional neural network library for OpenCL', - license = 'MPL', - url = 'https://github.com/hughperkins/DeepCL', - long_description = read('README.rst'), - classifiers = [ - 'Development Status :: 4 - Beta', - 'Topic :: Scientific/Engineering :: Artificial Intelligence', - 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)', - ], - install_requires = [], - # install_requires = [], - tests_require = ['nose>=1.3.4','Cython>=0.22','cogapp>=2.4','future>=0.14.3'], - scripts = ['test_deepcl.py','test_lowlevel.py'], - # modules = libraries, -# lib raries = libraries, - ext_modules = my_cythonize( ext_modules), + name='DeepCL', + version=version, + author="Hugh Perkins", + author_email="hughperkins@gmail.com", + description=( + 'python wrapper for DeepCL deep convolutional ' + 'neural network library for OpenCL'), + license='MPL', + url='https://github.com/hughperkins/DeepCL', + long_description=read('README.rst'), + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)', + ], + install_requires=[], + scripts=['test_deepcl.py', 'test_lowlevel.py'], + ext_modules=ext_modules, ) - diff --git a/python/test/test_basic.py b/python/test/test_basic.py index 92f18717..62373abf 100644 --- a/python/test/test_basic.py +++ b/python/test/test_basic.py @@ -41,11 +41,11 @@ def test_buildnet(): exceptionCalled = True assert exceptionCalled - assert 28 == net.getLayer(0).getOutputImageSize() - assert 28 == net.getLayer(1).getOutputImageSize() - assert 28 == net.getLayer(2).getOutputImageSize() - assert 14 == net.getLayer(3).getOutputImageSize() - assert 1 == net.getLayer(8).getOutputImageSize() + assert 28 == net.getLayer(0).getOutputSize() + assert 28 == net.getLayer(1).getOutputSize() + assert 28 == net.getLayer(2).getOutputSize() + assert 14 == net.getLayer(3).getOutputSize() + assert 1 == net.getLayer(8).getOutputSize() # assert not net.getLayer(0).getBiased() # assert net.getLayer(2).getBiased() diff --git a/python/test_deepcl.py b/python/test_deepcl.py index f1ac13a8..984e95a7 100755 --- a/python/test_deepcl.py +++ b/python/test_deepcl.py @@ -1,45 +1,55 @@ #!/usr/bin/python from __future__ import print_function - -#from array import array import array import PyDeepCL import sys print('imports 
done') if len(sys.argv) != 2: - print('usage: python ' + sys.argv[0] + ' [mnist data directory (containing the .mat files)]') + print( + 'usage: python ' + sys.argv[0] + + ' [mnist data directory (containing the .mat files)]') sys.exit(-1) -mnistFilePath = sys.argv[1] + '/t10k-images-idx3-ubyte' +mnistFilePath = sys.argv[1] + '/t10k-images-idx3-ubyte' + +cl = PyDeepCL.DeepCL() + +print('compute units:', cl.getComputeUnits()) +print('local memory size, bytes:', cl.getLocalMemorySize()) +print('local memory size, KB:', cl.getLocalMemorySizeKB()) +print('max workgroup size:', cl.getMaxWorkgroupSize()) +print('max alloc size MB:', cl.getMaxAllocSizeMB()) -cl = PyDeepCL.EasyCL() -net = PyDeepCL.NeuralNet(cl, 1,28) +net = PyDeepCL.NeuralNet(cl, 1, 28) print('created net') -print( net.asString() ) +print(net.asString()) print('printed net') -net.addLayer( PyDeepCL.NormalizationLayerMaker().translate(-0.5).scale(1/255.0) ) +net.addLayer(PyDeepCL.NormalizationLayerMaker().translate(-0.5).scale(1/255.0)) print('added layer ') -PyDeepCL.NetdefToNet.createNetFromNetdef( net, "rt2-8c5z-relu-mp2-16c5z-relu-mp3-150n-tanh-10n" ) -print( net.asString() ) - -(N,planes,size) = PyDeepCL.GenericLoader.getDimensions(mnistFilePath) -print( (N,planes,size) ) +PyDeepCL.NetdefToNet.createNetFromNetdef( + net, "rt2-8c5z-relu-mp2-16c5z-relu-mp3-150n-tanh-10n") +print(net.asString()) + +(N, planes, size) = PyDeepCL.GenericLoader.getDimensions(mnistFilePath) +print((N, planes, size)) N = 1280 -images = array.array( 'f', [0] * (N*planes*size*size) ) -labels = array.array('i',[0] * N ) -PyDeepCL.GenericLoader.load(mnistFilePath, images, labels, 0, N ) +images = array.array('f', [0] * (N * planes * size * size)) +labels = array.array('i', [0] * N) +PyDeepCL.GenericLoader.load(mnistFilePath, images, labels, 0, N) +print('loaded data') -sgd = PyDeepCL.SGD(cl, 0.002, 0.0 ) +sgd = PyDeepCL.SGD(cl, 0.002, 0.0) +print('created SGD') sgd.setWeightDecay(0.0001) netLearner = PyDeepCL.NetLearner( sgd, net, N, images, labels, N, images, labels, - 128 ) -netLearner.setSchedule( 12 ) + 128) +print('created netLearner') +netLearner.setSchedule(12) netLearner.run() - - +print('done, cleaning up...') diff --git a/python/test_lowlevel.py b/python/test_lowlevel.py index f6e973b3..01ba6aab 100755 --- a/python/test_lowlevel.py +++ b/python/test_lowlevel.py @@ -1,59 +1,64 @@ #!/usr/bin/python from __future__ import print_function - -#from array import array import sys import array import PyDeepCL if len(sys.argv) != 2: - print('usage: python ' + sys.argv[0] + ' [mnist data directory (containing the .mat files)]') + print( + 'usage: python ' + sys.argv[0] + + ' [mnist data directory (containing the .mat files)]') sys.exit(-1) -mnistFilePath = sys.argv[1] + '/t10k-images-idx3-ubyte' +mnistFilePath = sys.argv[1] + '/t10k-images-idx3-ubyte' -cl = PyDeepCL.EasyCL() +cl = PyDeepCL.DeepCL() net = PyDeepCL.NeuralNet(cl) -sgd = PyDeepCL.SGD( cl, 0.002, 0 ) -sgd.setMomentum( 0.0001 ) -net.addLayer( PyDeepCL.InputLayerMaker().numPlanes(1).imageSize(28) ) -net.addLayer( PyDeepCL.NormalizationLayerMaker().translate(-0.5).scale(1/255.0) ) -net.addLayer( PyDeepCL.ConvolutionalMaker().numFilters(8).filterSize(5).padZeros().biased() ) -net.addLayer( PyDeepCL.ActivationMaker().relu() ) -net.addLayer( PyDeepCL.PoolingMaker().poolingSize(2) ) -net.addLayer( PyDeepCL.ConvolutionalMaker().numFilters(8).filterSize(5).padZeros().biased() ) -net.addLayer( PyDeepCL.ActivationMaker().relu() ) -net.addLayer( PyDeepCL.PoolingMaker().poolingSize(3) ) -net.addLayer( 
PyDeepCL.FullyConnectedMaker().numPlanes(150).imageSize(1).biased() ) -net.addLayer( PyDeepCL.ActivationMaker().tanh() ) -net.addLayer( PyDeepCL.FullyConnectedMaker().numPlanes(10).imageSize(1).biased() ) -#net.addLayer( PyDeepCL.SquareLossMaker() ) -net.addLayer( PyDeepCL.SoftMaxMaker() ) -print( net.asString() ) - -(N,planes,size) = PyDeepCL.GenericLoader.getDimensions(mnistFilePath) -print( (N,planes,size) ) +sgd = PyDeepCL.SGD(cl, 0.002, 0) +sgd.setMomentum(0.0001) +net.addLayer(PyDeepCL.InputLayerMaker().numPlanes(1).imageSize(28)) +net.addLayer( + PyDeepCL.NormalizationLayerMaker().translate(-0.5).scale(1/255.0)) +net.addLayer( + PyDeepCL.ConvolutionalMaker() + .numFilters(8).filterSize(5).padZeros().biased()) +net.addLayer(PyDeepCL.ActivationMaker().relu()) +net.addLayer(PyDeepCL.PoolingMaker().poolingSize(2)) +net.addLayer( + PyDeepCL.ConvolutionalMaker() + .numFilters(8).filterSize(5).padZeros().biased()) +net.addLayer(PyDeepCL.ActivationMaker().relu()) +net.addLayer(PyDeepCL.PoolingMaker().poolingSize(3)) +net.addLayer( + PyDeepCL.FullyConnectedMaker().numPlanes(150).imageSize(1).biased()) +net.addLayer(PyDeepCL.ActivationMaker().tanh()) +net.addLayer( + PyDeepCL.FullyConnectedMaker().numPlanes(10).imageSize(1).biased()) +net.addLayer(PyDeepCL.SoftMaxMaker()) +print(net.asString()) + +(N, planes, size) = PyDeepCL.GenericLoader.getDimensions(mnistFilePath) +print((N, planes, size)) N = 1280 batchSize = 128 numEpochs = 30 -images = array.array( 'f', [0] * (N*planes*size*size) ) -labels = array.array('i',[0] * N ) -PyDeepCL.GenericLoader.load(mnistFilePath, images, labels, 0, N ) +images = array.array('f', [0] * (N*planes*size*size)) +labels = array.array('i', [0] * N) +PyDeepCL.GenericLoader.load(mnistFilePath, images, labels, 0, N) net.setBatchSize(batchSize) -for epoch in range(numEpochs): +for epoch in range(numEpochs): numRight = 0 context = PyDeepCL.TrainingContext(epoch, 0) - for batch in range( N // batchSize ): - sgd.trainFromLabels( net, context, images[batch * batchSize * planes * size * size:], labels[batch * batchSize:] ) -# net.forward( images[batch * batchSize * planes * size * size:] ) -# net.backwardFromLabels( labels[batch * batchSize:] ) - net.forward( images[batch * batchSize * planes * size * size:] ) - numRight += net.calcNumRight( labels[batch * batchSize:] ) - # print( 'numright ' + str( net.calcNumRight( labels ) ) ) -# print( 'loss ' + str( loss ) ) - print( 'num right: ' + str(numRight) ) - + for batch in range(N // batchSize): + sgd.trainFromLabels( + net, + context, + images[batch * batchSize * planes * size * size:], + labels[batch * batchSize:]) + net.forward(images[batch * batchSize * planes * size * size:]) + numRight += net.calcNumRight(labels[batch * batchSize:]) + print('num right: ' + str(numRight)) diff --git a/python/test_qlearning.py b/python/test_qlearning.py index 8dadb6e0..895c5794 100644 --- a/python/test_qlearning.py +++ b/python/test_qlearning.py @@ -1,46 +1,65 @@ #!/usr/bin/python from __future__ import print_function - -#from array import array -import sys import array import random import PyDeepCL -# This is an example scenario. It overrides the PyDeepCL.Scenario class -# The Q-learning module will call into this object each time it makes a move -# This class can therefore represent any world you want to expose to the -# q-learning module + class ScenarioImage(PyDeepCL.Scenario): + """ + This is an example scenario. 
It overrides the PyDeepCL.Scenario class + The Q-learning module will call into this object each time it makes a move + This class can therefore represent any world you want to expose to the + q-learning module + """ def __init__(self, size, apple_moves): - """Standard constructor. Do whatever you need to set up the world""" - super(ScenarioImage,self).__init__() + """ + Standard constructor. Do whatever you need to set up the world + """ + super(ScenarioImage, self).__init__() self.size = size self.appleMoves = apple_moves self.finished = False self.game = 0 self.reset() + def getPerceptionSize(self): - """Assumes perception is square. This is the length of one edge""" + """ + Assumes perception is square. This is the length of one edge + """ return self.size + def getNumActions(self): - """How many possible virtual 'buttons' can the computer push?""" + """ + How many possible virtual 'buttons' can the computer push? + """ return 4 + def getPerceptionPlanes(self): - """We can feed one or more planes to the qleaning module""" + """ + We can feed one or more planes to the qlearning module + """ return 2 + def getPerception(self): - """Need to provide the current perception to the qlearning module, - which should be of size numPlanes * size * size""" + """ + Need to provide the current perception to the qlearning module, + which should be of size numPlanes * size * size + """ perception = [0] * 2 * self.size * self.size - perception[self.appleY * self.size + self.appleX] = 1; - perception[self.size * self.size + self.posY * self.size + self.posX] = 1; + perception[self.appleY * self.size + self.appleX] = 1 + perception[ + self.size * self.size + + self.posY * self.size + self.posX] = 1 return perception - def act(self,index): - """The computer chooses one of the numActions available actions + + def act(self, index): + """ + The computer chooses one of the numActions available actions this method needs to update the world, and return the reward - (positive or negative)""" + (positive or negative) + """ dx = 0 dy = 0 if index == 0: @@ -51,8 +70,8 @@ dy = 1 elif index == 3: dy = -1 - newX = self.posX + dx; - newY = self.posY + dy; + newX = self.posX + dx + newY = self.posY + dy if newX < 0 or newX >= self.size or newY < 0 or newY >= self.size: return -0.5 if newX == self.appleX and newY == self.appleY: @@ -64,23 +83,31 @@ self.posX = newX self.posY = newY return -0.1 + def hasFinished(self): - """If the last action ended this particular game/world-instance + """ + If the last action ended this particular game/world-instance then this should return True. After 'reset' has been called - it should return False again""" - #print('scenarioimage.hasFinished()') + it should return False again + """ return self.finished + def setNet(self, net): - """This doesnt override anything from the base class, we're simply using + """ + This doesn't override anything from the base class, we're simply using it, because then we can use it to print a q representation, eg at the - end of each game""" + end of each game + """ self.net = net
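
A note on the perception layout used by getPerception() above: the scenario packs two size x size planes into one flat buffer, plane 0 holding the apple position and plane 1 the player position, so cell (plane, y, x) lives at index plane*size*size + y*size + x. A self-contained C++ sketch of that packing (illustrative only, not DeepCL API; C++ is used for the examples in this document to match the library code later in the patch):

    #include <cstdio>
    #include <vector>

    // Pack a two-plane perception the way ScenarioImage.getPerception() does:
    // plane 0 marks the apple, plane 1 marks the player.
    int main() {
        const int size = 5;
        int appleX = 2, appleY = 3, posX = 0, posY = 1;
        std::vector<float> perception(2 * size * size, 0.0f);
        perception[appleY * size + appleX] = 1;            // plane 0: apple
        perception[size * size + posY * size + posX] = 1;  // plane 1: player
        // index of (plane=1, y=1, x=0) is 1*25 + 1*5 + 0 = 30
        printf("player bit at index %d\n", size * size + posY * size + posX);
        return 0;
    }
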
+ def _show(self): - """can do nothing, or it can print the world somehow. This provides no information to the qlearning module: it's simply an opportunity for you to see how the world looks - occasionally""" - print('pos',self.posX,self.posY,'apple',self.appleX,self.appleY) + occasionally + """ + print('pos', self.posX, self.posY, 'apple', self.appleX, self.appleY) for y in range(self.size): line = '' for x in range(self.size): @@ -91,28 +118,28 @@ def _show(self): else: line += "." print(line) + def _showQ(self): - """can do nothing, or it can print the current q + """ + can do nothing, or it can print the current q values somehow. This provides no information to the qlearning module: it's simply an opportunity for you to see how the q value look - occasionally""" -# print('showQ()') -# print('net num layers: ' + str(net.getNumLayers() ) ) # proves we do have a copy of the network :-) - scenario = self + occasionally + """ net = self.net - print( "q directions:" ) + print("q directions:") size = self.size - netinput = array.array( 'f', [0] * (2*size*size) ) - netinput[ self.appleY * size + self .appleX ] = 1 + netinput = array.array('f', [0] * (2*size*size)) + netinput[self.appleY * size + self.appleX] = 1 for y in range(size): thisLine = '' for x in range(size): highestQ = 0 bestAction = 0 - netinput[ size * size + y * size + x ] = 1 - net.forward( netinput ) - netinput[ size * size + y * size + x ] = 0 + netinput[size * size + y * size + x] = 1 + net.forward(netinput) + netinput[size * size + y * size + x] = 0 output = net.getOutput() for action in range(4): thisQ = output[action] @@ -128,18 +155,21 @@ def _showQ(self): else: thisLine += "^" print(thisLine) + def reset(self): - """starts a new game / world-instance""" - # first, lets print the final world and q-state: - # this used to be called by the qlearning module - # but seems to make more sense - and be more - # flexible :-) - to call it from here, ourselves - # we can then call it ourselves from 'act' etc - # too, if we wish + """ + starts a new game / world-instance + first, let's print the final world and q-state: + this used to be called by the qlearning module + but seems to make more sense - and be more + flexible :-) - to call it from here, ourselves + we can then call it ourselves from 'act' etc + too, if we wish + """ if self.game >= 1: self._show() self._showQ() - print('scenarioimage.reset()') + print('scenarioimage.reset()') if self.appleMoves: self.appleX = random.randint(0, self.size-1) self.appleY = random.randint(0, self.size-1) @@ -147,49 +177,60 @@ self.appleX = self.appleY = self.size // 2 self.finished = False sampledOnce = False - while not sampledOnce or ( self.posX == self.appleX and self.posY == self.appleY ): + while not sampledOnce or ( + self.posX == self.appleX and self.posY == self.appleY): self.posX = random.randint(0, self.size-1) - self.posY =random.randint(0, self.size-1) + self.posY = random.randint(0, self.size-1) sampledOnce = True self.game += 1 + def go(): - """creates a net, instantiates the scenario, and calls into the qlearning - module, to start learning""" + """ + creates a net, instantiates the scenario, and calls into the qlearning + module, to start learning + """ - scenario = ScenarioImage(5,True) + scenario = ScenarioImage(5, True) - size = scenario.getPerceptionSize(); - planes = scenario.getPerceptionPlanes(); - numActions = scenario.getNumActions(); - #size = 5 - #planes = 2 - print('size',size,'planes',planes,'numActions',numActions) + size = scenario.getPerceptionSize() + planes = scenario.getPerceptionPlanes() + numActions = scenario.getNumActions()
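
The PyDeepCL.DeepCL() call a few lines down is a thin wrapper over the C++ DeepCL facade that this patch introduces in src/DeepCL.h and src/DeepCL.cpp. A minimal C++ sketch of the equivalent setup, assuming the headers and libraries installed by this patch; error handling omitted:

    #include <iostream>
    #include "DeepCL.h"

    int main() {
        // Factory from src/DeepCL.cpp below: prefer a GPU, else fall back
        // to indexed device 0.
        DeepCL *cl = DeepCL::createForFirstGpuOtherwiseCpu();
        std::cout << "max workgroup size: "
                  << cl->getMaxWorkgroupSize() << std::endl;
        // ... build a NeuralNet against cl, attach an SGD trainer, etc ...
        cl->deleteMe();  // added in this patch; deletes inside the library's
                         // own runtime, see CppRuntimeBoundary below
        return 0;
    }
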
+ print('size', size, 'planes', planes, 'numActions', numActions) - cl = PyDeepCL.EasyCL() + cl = PyDeepCL.DeepCL() net = PyDeepCL.NeuralNet(cl) sgd = PyDeepCL.SGD(cl, 0.1, 0.0) - net.addLayer( PyDeepCL.InputLayerMaker().numPlanes(planes).imageSize(size) ) - net.addLayer( PyDeepCL.ConvolutionalMaker().numFilters(8).filterSize(5).padZeros().biased() ) - net.addLayer( PyDeepCL.ActivationMaker().relu() ) - net.addLayer( PyDeepCL.ConvolutionalMaker().numFilters(8).filterSize(5).padZeros().biased() ) - net.addLayer( PyDeepCL.ActivationMaker().relu() ) - net.addLayer( PyDeepCL.FullyConnectedMaker().numPlanes(100).imageSize(1).biased() ) - net.addLayer( PyDeepCL.ActivationMaker().tanh() ) - net.addLayer( PyDeepCL.FullyConnectedMaker().numPlanes(numActions).imageSize(1).biased() ) - net.addLayer( PyDeepCL.SquareLossMaker() ) - print( net.asString() ) + net.addLayer(PyDeepCL.InputLayerMaker().numPlanes(planes).imageSize(size)) + net.addLayer( + PyDeepCL.ConvolutionalMaker() + .numFilters(8).filterSize(5).padZeros().biased()) + net.addLayer(PyDeepCL.ActivationMaker().relu()) + net.addLayer( + PyDeepCL.ConvolutionalMaker() + .numFilters(8).filterSize(5).padZeros().biased()) + net.addLayer(PyDeepCL.ActivationMaker().relu()) + net.addLayer( + PyDeepCL.FullyConnectedMaker().numPlanes(100).imageSize(1).biased()) + net.addLayer(PyDeepCL.ActivationMaker().tanh()) + net.addLayer( + PyDeepCL.FullyConnectedMaker() + .numPlanes(numActions).imageSize(1).biased()) + net.addLayer(PyDeepCL.SquareLossMaker()) + print(net.asString()) scenario.setNet(net) - qlearner = PyDeepCL.QLearner( sgd, scenario, net ) - # qlearner.setLambda(0.9) # sets decay of the eligibility trace decay rate - # qlearner.setMaxSamples(32) # how many samples to learn from after each move - # qlearner.setEpsilon(0.1) # probability of exploring, instead of exploiting - # qlearner.setLearningRate(0.1) # learning rate of the neural net + qlearner = PyDeepCL.QLearner(sgd, scenario, net) + # sets the decay rate of the eligibility trace + # qlearner.setLambda(0.9) + # how many samples to learn from after each move + # qlearner.setMaxSamples(32) + # probability of exploring, instead of exploiting + # qlearner.setEpsilon(0.1) + # learning rate of the neural net + # qlearner.setLearningRate(0.1) qlearner.run() if __name__ == '__main__': go() - - diff --git a/src/CppRuntimeBoundary.cpp b/src/CppRuntimeBoundary.cpp new file mode 100644 index 00000000..cc307d55 --- /dev/null +++ b/src/CppRuntimeBoundary.cpp @@ -0,0 +1,15 @@ +#include "CppRuntimeBoundary.h" + +#include <string> +#include <cstdio> + +const char *deepcl_stringToCharStar(std::string astring) { + int len = astring.size(); + char *charStar = new char[len + 1]; + sprintf(charStar, "%s", astring.c_str()); + return charStar; +} +void deepcl_deleteCharStar(const char *charStar) { + delete[] charStar; +} + diff --git a/src/CppRuntimeBoundary.h b/src/CppRuntimeBoundary.h new file mode 100644 index 00000000..fa2ec2ea --- /dev/null +++ b/src/CppRuntimeBoundary.h @@ -0,0 +1,12 @@ +#pragma once + +#include "DeepCLDllExport.h" + +#include <string> + +// handles helping to call across cpp runtime boundaries + +// allocates new string, returns it. 
MUST call deleteCharStar to delete it +DeepCL_EXPORT const char *deepcl_stringToCharStar(std::string astring); +DeepCL_EXPORT void deepcl_deleteCharStar(const char *charStar); + diff --git a/src/DeepCL.cpp b/src/DeepCL.cpp new file mode 100644 index 00000000..5c441330 --- /dev/null +++ b/src/DeepCL.cpp @@ -0,0 +1,58 @@ +#include "DeepCL.h" +#include "DevicesInfo.h" + +#undef STATIC +#define STATIC +#define PUBLIC + +using namespace easycl; + +//DeepCL::DeepCL() : +// EasyCL() { +//} +//DeepCL::DeepCL(int gpu) : +// EasyCL(gpu) { +//} +PUBLIC DeepCL::DeepCL(cl_platform_id platformId, cl_device_id deviceId) : + EasyCL(platformId, deviceId) { +} +PUBLIC DeepCL::~DeepCL() { +} +PUBLIC void DeepCL::deleteMe() { + delete this; +} +PUBLIC STATIC DeepCL *DeepCL::createForFirstGpu() { + cl_platform_id platformId; + cl_device_id deviceId; + DevicesInfo::getIdForIndexedGpu(0, &platformId, &deviceId); + return new DeepCL(platformId, deviceId); +} +PUBLIC STATIC DeepCL *DeepCL::createForFirstGpuOtherwiseCpu() { + if(DevicesInfo::getNumGpus() >= 1) { + return createForFirstGpu(); + } else { + return createForIndexedDevice(0); + } +} +PUBLIC STATIC DeepCL *DeepCL::createForIndexedDevice(int device) { + cl_platform_id platformId; + cl_device_id deviceId; + DevicesInfo::getIdForIndexedDevice(device, &platformId, &deviceId); + return new DeepCL(platformId, deviceId); +} +PUBLIC STATIC DeepCL *DeepCL::createForIndexedGpu(int gpu) { + cl_platform_id platformId; + cl_device_id deviceId; + DevicesInfo::getIdForIndexedGpu(gpu, &platformId, &deviceId); + return new DeepCL(platformId, deviceId); +} +PUBLIC STATIC DeepCL *DeepCL::createForPlatformDeviceIndexes(int platformIndex, int deviceIndex) { + cl_platform_id platformId; + cl_device_id deviceId; + DevicesInfo::getIdForIndexedPlatformDevice(platformIndex, deviceIndex, CL_DEVICE_TYPE_ALL, &platformId, &deviceId); + return new DeepCL(platformId, deviceId); +} +PUBLIC STATIC DeepCL *DeepCL::createForPlatformDeviceIds(cl_platform_id platformId, cl_device_id deviceId) { + return new DeepCL(platformId, deviceId); +} + diff --git a/src/DeepCL.h b/src/DeepCL.h index 9531b9eb..b9b03254 100644 --- a/src/DeepCL.h +++ b/src/DeepCL.h @@ -37,3 +37,35 @@ #include "loaders/GenericLoader.h" #include "loaders/GenericLoaderv2.h" +#include "clblas/ClBlasInstance.h" + +#include "DeepCLDllExport.h" + +#define STATIC static +#define VIRTUAL virtual + +class DeepCL_EXPORT DeepCL : public EasyCL { +public: +// EasyCL *cl; + ClBlasInstance clBlasInstance; + + // [[[cog + // import cog_addheaders + // cog_addheaders.addv2() + // ]]] + // generated, using cog: + + public: + DeepCL(cl_platform_id platformId, cl_device_id deviceId); + ~DeepCL(); + void deleteMe(); + STATIC DeepCL *createForFirstGpu(); + STATIC DeepCL *createForFirstGpuOtherwiseCpu(); + STATIC DeepCL *createForIndexedDevice(int device); + STATIC DeepCL *createForIndexedGpu(int gpu); + STATIC DeepCL *createForPlatformDeviceIndexes(int platformIndex, int deviceIndex); + STATIC DeepCL *createForPlatformDeviceIds(cl_platform_id platformId, cl_device_id deviceId); + + // [[[end]]] +}; + diff --git a/src/DeepCLDllExport.h b/src/DeepCLDllExport.h index fe749ed4..f0752963 100644 --- a/src/DeepCLDllExport.h +++ b/src/DeepCLDllExport.h @@ -20,3 +20,6 @@ typedef unsigned char uchar; +typedef long long int64; +typedef int int32; + diff --git a/src/activate.bat b/src/activate.bat new file mode 100644 index 00000000..d147a619 --- /dev/null +++ b/src/activate.bat @@ -0,0 +1,3 @@ +set "PATH=%~dp0.;%PATH%" +set "CL=%CL% /I%~dp0..\include 
/I%~dp0..\include\deepcl /I%~dp0..\include\easycl" +set "LIB=%~dp0..\lib;%~dp0..\lib\import %LIB%" diff --git a/src/activate.sh b/src/activate.sh new file mode 100644 index 00000000..48edbee5 --- /dev/null +++ b/src/activate.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +bin_dir=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) +dist_dir=$(dirname ${bin_dir}) + +export PATH=${bin_dir}:$PATH +export LD_LIBRARY_PATH=${dist_dir}/lib:${LD_LIBRARY_PATH} +#export PYTHONPATH=${dist_dir}/lib:${PYTHONPATH} +export CPATH=${dist_dir}/include:${dist_dir}/include/easycl:${dist_dir}/include/deepcl:${CPATH} +export LDFLAGS="-L${dist_dir}/lib ${LDFLAGS}" + diff --git a/src/activate/ActivationBackward.cpp b/src/activate/ActivationBackward.cpp index 68b42211..3a823ca8 100644 --- a/src/activate/ActivationBackward.cpp +++ b/src/activate/ActivationBackward.cpp @@ -23,55 +23,55 @@ using namespace std; #undef STATIC #define STATIC -STATIC ActivationBackward *ActivationBackward::instance( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const *fn ) { - return new ActivationBackwardGpuNaive( cl, numPlanes, inputImageSize, fn ); +STATIC ActivationBackward *ActivationBackward::instance(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const *fn) { + return new ActivationBackwardGpuNaive(cl, numPlanes, inputSize, fn); } -STATIC ActivationBackward *ActivationBackward::instanceForTest( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const *fn) { - return new ActivationBackwardCpu( cl, numPlanes, inputImageSize, fn ); +STATIC ActivationBackward *ActivationBackward::instanceForTest(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const *fn) { + return new ActivationBackwardCpu(cl, numPlanes, inputSize, fn); } -STATIC ActivationBackward *ActivationBackward::instanceSpecific( int idx, EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const *fn ) { - if( idx == 0 ) { - return new ActivationBackwardCpu( cl, numPlanes, inputImageSize, fn ); +STATIC ActivationBackward *ActivationBackward::instanceSpecific(int idx, EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const *fn) { + if(idx == 0) { + return new ActivationBackwardCpu(cl, numPlanes, inputSize, fn); } - if( idx == 1 ) { - return new ActivationBackwardGpuNaive( cl, numPlanes, inputImageSize, fn ); + if(idx == 1) { + return new ActivationBackwardGpuNaive(cl, numPlanes, inputSize, fn); } - throw runtime_error("ActivationBackward::instanceSpecific, idx not known: " + toString( idx ) ); + throw runtime_error("ActivationBackward::instanceSpecific, idx not known: " + toString(idx) ); } -ActivationBackward::ActivationBackward( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const *fn ) : - cl( cl ), - numPlanes( numPlanes ), - inputImageSize( inputImageSize ), - fn( fn ), - outputImageSize( inputImageSize ) { +ActivationBackward::ActivationBackward(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const *fn) : + cl(cl), + numPlanes(numPlanes), + inputSize(inputSize), + fn(fn), + outputSize(inputSize) { } -VIRTUAL int ActivationBackward::getInputSize( int batchSize ) { - return batchSize * numPlanes * inputImageSize * inputImageSize; +VIRTUAL int ActivationBackward::getInputNumElements(int batchSize) { + return batchSize * numPlanes * inputSize * inputSize; } -VIRTUAL int ActivationBackward::getOutputSize(int batchSize) { - return batchSize * numPlanes * outputImageSize * outputImageSize; +VIRTUAL int ActivationBackward::getOutputNumElements(int batchSize) { + return batchSize 
* numPlanes * outputSize * outputSize; } -VIRTUAL void ActivationBackward::backward( int batchSize, float *inputs, float *gradOutput, float *gradInput ) { -// cout << "ActivationBackward::backward( float * )" << endl; - StatefulTimer::instance()->timeCheck("ActivationBackward::backward float->wrapper start" ); +VIRTUAL void ActivationBackward::backward(int batchSize, float *inputs, float *gradOutput, float *gradInput) { +// cout << "ActivationBackward::backward(float *)" << endl; + StatefulTimer::instance()->timeCheck("ActivationBackward::backward float->wrapper start"); - CLWrapper *inputsWrapper = cl->wrap( getInputSize(batchSize), inputs ); - CLWrapper *gradOutputWrapper = cl->wrap( getOutputSize(batchSize), gradOutput ); - CLWrapper *gradInputWrapper = cl->wrap( getInputSize(batchSize), gradInput ); + CLWrapper *inputsWrapper = cl->wrap(getInputNumElements(batchSize), inputs); + CLWrapper *gradOutputWrapper = cl->wrap(getOutputNumElements(batchSize), gradOutput); + CLWrapper *gradInputWrapper = cl->wrap(getInputNumElements(batchSize), gradInput); inputsWrapper->copyToDevice(); gradOutputWrapper->copyToDevice(); - backward( batchSize, inputsWrapper, gradOutputWrapper, gradInputWrapper ); + backward(batchSize, inputsWrapper, gradOutputWrapper, gradInputWrapper); gradInputWrapper->copyToHost(); delete inputsWrapper; delete gradOutputWrapper; delete gradInputWrapper; - StatefulTimer::instance()->timeCheck("ActivationBackward::backward float->wrapper end" ); + StatefulTimer::instance()->timeCheck("ActivationBackward::backward float->wrapper end"); } -VIRTUAL void ActivationBackward::backward( int batchSize, CLWrapper *inputsWrapper, CLWrapper *gradOutputWrapper, CLWrapper *gradInputWrapper ) { - throw runtime_error("ActivationBackward::backward wrappers not implemented" ); +VIRTUAL void ActivationBackward::backward(int batchSize, CLWrapper *inputsWrapper, CLWrapper *gradOutputWrapper, CLWrapper *gradInputWrapper) { + throw runtime_error("ActivationBackward::backward wrappers not implemented"); } diff --git a/src/activate/ActivationBackward.h b/src/activate/ActivationBackward.h index 0bd7cf29..db87aa1c 100644 --- a/src/activate/ActivationBackward.h +++ b/src/activate/ActivationBackward.h @@ -20,23 +20,23 @@ class DeepCL_EXPORT ActivationBackward { EasyCL *cl; const int numPlanes; - const int inputImageSize; + const int inputSize; ActivationFunction const *fn; - const int outputImageSize; + const int outputSize; virtual ~ActivationBackward() {} - inline int getInputIndex( int n, int plane, int row, int col ) { - return ( ( n - * numPlanes + plane ) - * inputImageSize + row ) - * inputImageSize + col; + inline int getInputIndex(int n, int plane, int row, int col) { + return (( n + * numPlanes + plane) + * inputSize + row) + * inputSize + col; } - inline int getResultIndex( int n, int plane, int row, int col ) { - return ( ( n - * numPlanes + plane ) - * outputImageSize + row ) - * outputImageSize + col; + inline int getResultIndex(int n, int plane, int row, int col) { + return (( n + * numPlanes + plane) + * outputSize + row) + * outputSize + col; } // [[[cog @@ -44,14 +44,14 @@ class DeepCL_EXPORT ActivationBackward { // cog_addheaders.add() // ]]] // generated, using cog: - STATIC ActivationBackward *instance( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const *fn ); - STATIC ActivationBackward *instanceForTest( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const *fn); - STATIC ActivationBackward *instanceSpecific( int idx, EasyCL *cl, int numPlanes, 
int inputImageSize, ActivationFunction const *fn ); - ActivationBackward( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const *fn ); - VIRTUAL int getInputSize( int batchSize ); - VIRTUAL int getOutputSize(int batchSize); - VIRTUAL void backward( int batchSize, float *inputs, float *gradOutput, float *gradInput ); - VIRTUAL void backward( int batchSize, CLWrapper *inputsWrapper, CLWrapper *gradOutputWrapper, CLWrapper *gradInputWrapper ); + STATIC ActivationBackward *instance(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const *fn); + STATIC ActivationBackward *instanceForTest(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const *fn); + STATIC ActivationBackward *instanceSpecific(int idx, EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const *fn); + ActivationBackward(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const *fn); + VIRTUAL int getInputNumElements(int batchSize); + VIRTUAL int getOutputNumElements(int batchSize); + VIRTUAL void backward(int batchSize, float *inputs, float *gradOutput, float *gradInput); + VIRTUAL void backward(int batchSize, CLWrapper *inputsWrapper, CLWrapper *gradOutputWrapper, CLWrapper *gradInputWrapper); // [[[end]]] }; diff --git a/src/activate/ActivationBackwardCpu.cpp b/src/activate/ActivationBackwardCpu.cpp index addbbd5a..83592def 100644 --- a/src/activate/ActivationBackwardCpu.cpp +++ b/src/activate/ActivationBackwardCpu.cpp @@ -22,46 +22,46 @@ using namespace std; #undef STATIC #define STATIC -ActivationBackwardCpu::ActivationBackwardCpu( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const *fn ) : - ActivationBackward( cl, numPlanes, inputImageSize, fn ) { +ActivationBackwardCpu::ActivationBackwardCpu(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const *fn) : + ActivationBackward(cl, numPlanes, inputSize, fn) { } -VIRTUAL void ActivationBackwardCpu::backward( int batchSize, float *outputs, float *gradOutput, float *gradInput ) { - int totalLinearSize = batchSize * numPlanes * inputImageSize * inputImageSize; - for( int i = 0; i < totalLinearSize; i++ ) { -// cout << "input=" << inputs[i] << " deriv=" << fn->calcDerivative( inputs[i] ) +VIRTUAL void ActivationBackwardCpu::backward(int batchSize, float *outputs, float *gradOutput, float *gradInput) { + int totalLinearSize = batchSize * numPlanes * inputSize * inputSize; + for(int i = 0; i < totalLinearSize; i++) { +// cout << "input=" << inputs[i] << " deriv=" << fn->calcDerivative(inputs[i]) // << " error=" << errors[i]; - gradInput[i] = fn->calcDerivative( outputs[i] ) * gradOutput[i]; + gradInput[i] = fn->calcDerivative(outputs[i]) * gradOutput[i]; cout << " gradInput=" << gradInput[i] << endl; } } -VIRTUAL void ActivationBackwardCpu::backward( int batchSize, +VIRTUAL void ActivationBackwardCpu::backward(int batchSize, CLWrapper *outputWrapper, CLWrapper *gradOutputWrapper, - CLWrapper *gradInputWrapper ) { - StatefulTimer::instance()->timeCheck("ActivationBackwardCpu::backward start" ); + CLWrapper *gradInputWrapper) { + StatefulTimer::instance()->timeCheck("ActivationBackwardCpu::backward start"); outputWrapper->copyToHost(); gradOutputWrapper->copyToHost(); - float *outputs = reinterpret_cast<float *>( outputWrapper->getHostArray() ); - float *gradOutput = reinterpret_cast<float *>( gradOutputWrapper->getHostArray() ); - float *gradInput = new float[ getInputSize( batchSize ) ]; - for( int i = 0; i < 4; i++ ) { + float *outputs = reinterpret_cast<float *>(outputWrapper->getHostArray()); + float *gradOutput = reinterpret_cast<float *>(gradOutputWrapper->getHostArray()); + float *gradInput = new float[ getInputNumElements(batchSize) ]; + for(int i = 0; i < 4; i++) { cout << "i=" << i << " outputs=" << outputs[i] << " gradOutput=" << gradOutput[i] << endl; } - backward( batchSize, outputs, gradOutput, gradInput ); + backward(batchSize, outputs, gradOutput, gradInput); - float *gradInputHostArray = reinterpret_cast<float *>( gradInputWrapper->getHostArray() ); - memcpy( gradInputHostArray, gradInput, sizeof(float) * getInputSize( batchSize ) ); + float *gradInputHostArray = reinterpret_cast<float *>(gradInputWrapper->getHostArray()); + memcpy(gradInputHostArray, gradInput, sizeof(float) * getInputNumElements(batchSize) ); gradInputWrapper->copyToDevice(); - for( int i = 0; i < 4; i++ ) { + for(int i = 0; i < 4; i++) { cout << "i=" << i << " gradInput=" << gradInput[i] << endl; } delete[] gradInput; - StatefulTimer::instance()->timeCheck("ActivationBackwardCpu::backward end" ); + StatefulTimer::instance()->timeCheck("ActivationBackwardCpu::backward end"); } diff --git a/src/activate/ActivationBackwardCpu.h b/src/activate/ActivationBackwardCpu.h index 84c30591..452335f4 100644 --- a/src/activate/ActivationBackwardCpu.h +++ b/src/activate/ActivationBackwardCpu.h @@ -19,12 +19,12 @@ class ActivationBackwardCpu : public ActivationBackward { // cog_addheaders.add() // ]]] // generated, using cog: - ActivationBackwardCpu( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const *fn ); - VIRTUAL void backward( int batchSize, float *outputs, float *gradOutput, float *gradInput ); - VIRTUAL void backward( int batchSize, + ActivationBackwardCpu(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const *fn); + VIRTUAL void backward(int batchSize, float *outputs, float *gradOutput, float *gradInput); + VIRTUAL void backward(int batchSize, CLWrapper *outputWrapper, CLWrapper *gradOutputWrapper, - CLWrapper *gradInputWrapper ); + CLWrapper *gradInputWrapper); // [[[end]]] }; diff --git a/src/activate/ActivationBackwardGpuNaive.cpp b/src/activate/ActivationBackwardGpuNaive.cpp index c64d269b..e9f0ac01 100644 --- a/src/activate/ActivationBackwardGpuNaive.cpp +++ b/src/activate/ActivationBackwardGpuNaive.cpp @@ -27,41 +27,41 @@ VIRTUAL ActivationBackwardGpuNaive::~ActivationBackwardGpuNaive() { delete kernel; // delete kMemset; } -VIRTUAL void ActivationBackwardGpuNaive::backward( int batchSize, CLWrapper *inputWrapper, +VIRTUAL void ActivationBackwardGpuNaive::backward(int batchSize, CLWrapper *inputWrapper, CLWrapper *gradOutputWrapper, - CLWrapper *gradInputWrapper ) { + CLWrapper *gradInputWrapper) { - StatefulTimer::instance()->timeCheck("ActivationBackwardGpuNaive::backward start" ); + StatefulTimer::instance()->timeCheck("ActivationBackwardGpuNaive::backward start"); - int globalSize = batchSize * numPlanes * inputImageSize * inputImageSize; + int globalSize = batchSize * numPlanes * inputSize * inputSize; int workgroupSize = 64; - int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kernel->in( batchSize * numPlanes * inputImageSize * inputImageSize ) - ->in( inputWrapper ) - ->in( gradOutputWrapper ) - ->out( gradInputWrapper ); - globalSize = batchSize * numPlanes * outputImageSize * outputImageSize; + int numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; + kernel->in(batchSize * numPlanes * inputSize * inputSize) + ->in(inputWrapper) + ->in(gradOutputWrapper) + ->out(gradInputWrapper); + globalSize = batchSize * numPlanes * outputSize * outputSize; workgroupSize = 64; - 
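
The sizing arithmetic in this hunk is the standard round-up-to-workgroup idiom: numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize, after which numWorkgroups * workgroupSize >= globalSize, and the surplus work-items are masked inside the kernel by its bounds guard. A standalone sketch of the arithmetic (illustrative only):

    #include <cstdio>

    // Round a global work size up to the next multiple of the workgroup size.
    // Extra work-items exit early via `if (globalId >= N) return;` in the kernel.
    int roundUp(int globalSize, int workgroupSize) {
        return ((globalSize + workgroupSize - 1) / workgroupSize) * workgroupSize;
    }

    int main() {
        printf("%d\n", roundUp(1280 * 8, 64));  // 10240: already a multiple
        printf("%d\n", roundUp(100, 64));       // 128: 28 padding work-items
        return 0;
    }
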
numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); + numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; + kernel->run_1d(numWorkgroups * workgroupSize, workgroupSize); cl->finish(); - StatefulTimer::instance()->timeCheck("ActivationBackwardGpuNaive::backward end" ); + StatefulTimer::instance()->timeCheck("ActivationBackwardGpuNaive::backward end"); } -ActivationBackwardGpuNaive::ActivationBackwardGpuNaive( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const*fn ) : - ActivationBackward( cl, numPlanes, inputImageSize, fn ) { +ActivationBackwardGpuNaive::ActivationBackwardGpuNaive(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const*fn) : + ActivationBackward(cl, numPlanes, inputSize, fn) { // std::string options = "-D " + fn->getDefineName(); string options = ""; - options += " -D gNumPlanes=" + toString( numPlanes ); - options += " -D gInputImageSize=" + toString( inputImageSize ); - options += " -D gInputImageSizeSquared=" + toString( inputImageSize * inputImageSize ); - options += " -D gOutputImageSize=" + toString( outputImageSize ); - options += " -D gOutputImageSizeSquared=" + toString( outputImageSize * outputImageSize ); + options += " -D gNumPlanes=" + toString(numPlanes); + options += " -D gInputSize=" + toString(inputSize); + options += " -D gInputSizeSquared=" + toString(inputSize * inputSize); + options += " -D gOutputSize=" + toString(outputSize); + options += " -D gOutputSizeSquared=" + toString(outputSize * outputSize); options += " -D " + fn->getDefineName(); // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/applyActivationDeriv.cl", "backward", 'options' ) + // stringify.write_kernel2("kernel", "cl/applyActivationDeriv.cl", "backward", 'options') // ]]] // generated using cog, from cl/applyActivationDeriv.cl: const char * kernelSource = @@ -77,9 +77,9 @@ ActivationBackwardGpuNaive::ActivationBackwardGpuNaive( EasyCL *cl, int numPlane "#ifdef TANH\n" " #define ACTIVATION_DERIV(output) (1 - output * output)\n" "#elif defined SCALEDTANH\n" - " #define ACTIVATION_DERIV(output) ( 0.66667f * ( 1.7159f - 1 / 1.7159f * output * output ) )\n" + " #define ACTIVATION_DERIV(output) (0.66667f * (1.7159f - 1 / 1.7159f * output * output) )\n" "#elif defined SIGMOID\n" - " #define ACTIVATION_DERIV(output) (output * ( 1 - output ) )\n" + " #define ACTIVATION_DERIV(output) (output * (1 - output) )\n" "#elif defined RELU\n" " #define ACTIVATION_DERIV(output) (output > 0 ? 
1 : 0)\n" "#elif defined LINEAR\n" @@ -89,19 +89,19 @@ ActivationBackwardGpuNaive::ActivationBackwardGpuNaive( EasyCL *cl, int numPlane "//#ifdef ACTIVATION_DERIV\n" "//void kernel applyActivationDeriv(\n" "// const int N,\n" - "// global float *inout ) {\n" + "// global float *inout) {\n" "// int globalId = get_global_id(0);\n" - "// inout[globalId] = ACTIVATION_DERIV( inout[globalId] );\n" + "// inout[globalId] = ACTIVATION_DERIV(inout[globalId]);\n" "//}\n" "//#endif\n" "\n" "#ifdef ACTIVATION_DERIV\n" "void kernel applyActivationDeriv(\n" " const int N,\n" - " global float *target, global const float *source ) {\n" + " global float *target, global const float *source) {\n" " int globalId = get_global_id(0);\n" - " if( globalId < N ) {\n" - " target[globalId] *= ACTIVATION_DERIV( source[globalId] );\n" + " if (globalId < N) {\n" + " target[globalId] *= ACTIVATION_DERIV(source[globalId]);\n" " }\n" " // target[globalId] *= source[globalId];\n" "}\n" @@ -112,10 +112,10 @@ ActivationBackwardGpuNaive::ActivationBackwardGpuNaive( EasyCL *cl, int numPlane " const int N,\n" " global const float *inputs,\n" " global const float *gradOutput,\n" - " global float *gradInput ) {\n" + " global float *gradInput) {\n" " int globalId = get_global_id(0);\n" - " if( globalId < N ) {\n" - " gradInput[globalId] = ACTIVATION_DERIV( inputs[globalId] ) * gradOutput[globalId];\n" + " if (globalId < N) {\n" + " gradInput[globalId] = ACTIVATION_DERIV(inputs[globalId]) * gradOutput[globalId];\n" " // probably not ideal to have the output and input separate?\n" " }\n" " // target[globalId] *= source[globalId];\n" diff --git a/src/activate/ActivationBackwardGpuNaive.h b/src/activate/ActivationBackwardGpuNaive.h index 7f2ef985..ee226d79 100644 --- a/src/activate/ActivationBackwardGpuNaive.h +++ b/src/activate/ActivationBackwardGpuNaive.h @@ -21,10 +21,10 @@ class ActivationBackwardGpuNaive : public ActivationBackward { // ]]] // generated, using cog: VIRTUAL ~ActivationBackwardGpuNaive(); - VIRTUAL void backward( int batchSize, CLWrapper *inputWrapper, + VIRTUAL void backward(int batchSize, CLWrapper *inputWrapper, CLWrapper *gradOutputWrapper, - CLWrapper *gradInputWrapper ); - ActivationBackwardGpuNaive( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const*fn ); + CLWrapper *gradInputWrapper); + ActivationBackwardGpuNaive(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const*fn); // [[[end]]] }; diff --git a/src/activate/ActivationForward.cpp b/src/activate/ActivationForward.cpp index 6d9e0a9e..02ee0ec2 100644 --- a/src/activate/ActivationForward.cpp +++ b/src/activate/ActivationForward.cpp @@ -20,50 +20,50 @@ using namespace std; #undef STATIC #define STATIC -ActivationForward::ActivationForward( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const*fn ) : - cl( cl ), - numPlanes( numPlanes ), - inputImageSize( inputImageSize ), - outputImageSize( inputImageSize ), +ActivationForward::ActivationForward(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const*fn) : + cl(cl), + numPlanes(numPlanes), + inputSize(inputSize), + outputSize(inputSize), fn(fn) { } -STATIC ActivationForward *ActivationForward::instance( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const*fn ) { - return new ActivationForwardGpuNaive( cl, numPlanes, inputImageSize, fn ); -// return new ActivationForwardCpu( cl, numPlanes, inputImageSize ); +STATIC ActivationForward *ActivationForward::instance(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction 
const*fn) { + return new ActivationForwardGpuNaive(cl, numPlanes, inputSize, fn); +// return new ActivationForwardCpu(cl, numPlanes, inputSize); } -STATIC ActivationForward *ActivationForward::instanceForTest( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const*fn ) { - return new ActivationForwardCpu( cl, numPlanes, inputImageSize, fn ); +STATIC ActivationForward *ActivationForward::instanceForTest(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const*fn) { + return new ActivationForwardCpu(cl, numPlanes, inputSize, fn); } -STATIC ActivationForward *ActivationForward::instanceSpecific( int idx, EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const*fn ) { - if( idx == 0 ) { - return new ActivationForwardCpu( cl, numPlanes, inputImageSize, fn ); +STATIC ActivationForward *ActivationForward::instanceSpecific(int idx, EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const*fn) { + if(idx == 0) { + return new ActivationForwardCpu(cl, numPlanes, inputSize, fn); } - if( idx == 1 ) { - return new ActivationForwardGpuNaive( cl, numPlanes, inputImageSize, fn ); + if(idx == 1) { + return new ActivationForwardGpuNaive(cl, numPlanes, inputSize, fn); } cout << "idx " << idx << " not known" << endl; - throw runtime_error("ActivationForward::instanceSpecific idx not known: " + toString( idx ) ); + throw runtime_error("ActivationForward::instanceSpecific idx not known: " + toString(idx) ); } -VIRTUAL void ActivationForward::forward( int batchSize, CLWrapper *inputData, CLWrapper *outputData ) { +VIRTUAL void ActivationForward::forward(int batchSize, CLWrapper *inputData, CLWrapper *outputData) { throw runtime_error("forward not implemented for this child type"); } -VIRTUAL void ActivationForward::forward( int batchSize, float *input, float *output ) { -// cout << "ActivationForward::forward( float * )" << endl; - CLWrapper *inputWrapper = cl->wrap( getInputSize( batchSize ), input ); - CLWrapper *outputWrapper = cl->wrap( getOutputSize( batchSize ), output ); +VIRTUAL void ActivationForward::forward(int batchSize, float *input, float *output) { +// cout << "ActivationForward::forward(float *)" << endl; + CLWrapper *inputWrapper = cl->wrap(getInputNumElements(batchSize), input); + CLWrapper *outputWrapper = cl->wrap(getOutputNumElements(batchSize), output); inputWrapper->copyToDevice(); outputWrapper->createOnDevice(); - forward( batchSize, inputWrapper, outputWrapper ); + forward(batchSize, inputWrapper, outputWrapper); outputWrapper->copyToHost(); delete outputWrapper; delete inputWrapper; } -VIRTUAL int ActivationForward::getInputSize( int batchSize ) { - return batchSize * numPlanes * inputImageSize * inputImageSize; +VIRTUAL int ActivationForward::getInputNumElements(int batchSize) { + return batchSize * numPlanes * inputSize * inputSize; } -VIRTUAL int ActivationForward::getOutputSize(int batchSize) { - return batchSize * numPlanes * outputImageSize * outputImageSize; +VIRTUAL int ActivationForward::getOutputNumElements(int batchSize) { + return batchSize * numPlanes * outputSize * outputSize; } diff --git a/src/activate/ActivationForward.h b/src/activate/ActivationForward.h index c8b80ae1..40722b77 100644 --- a/src/activate/ActivationForward.h +++ b/src/activate/ActivationForward.h @@ -20,24 +20,24 @@ class DeepCL_EXPORT ActivationForward { EasyCL *cl; const int numPlanes; - const int inputImageSize; + const int inputSize; - const int outputImageSize; + const int outputSize; ActivationFunction const*fn; virtual ~ActivationForward() {} - 
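
The getInputIndex/getResultIndex helpers just below flatten an (n, plane, row, col) coordinate into the packed [n][plane][row][col] layout DeepCL uses: index = ((n * numPlanes + plane) * size + row) * size + col. A worked example (illustrative, not DeepCL API):

    #include <cstdio>

    // Flatten (n, plane, row, col) for images stored as [n][plane][row][col].
    int flatIndex(int n, int plane, int row, int col, int numPlanes, int size) {
        return ((n * numPlanes + plane) * size + row) * size + col;
    }

    int main() {
        // batch item 1, plane 0, pixel (2, 3) in a 2-plane, 28x28 layout:
        // ((1*2 + 0)*28 + 2)*28 + 3 = 58*28 + 3 = 1627
        printf("%d\n", flatIndex(1, 0, 2, 3, 2, 28));  // prints 1627
        return 0;
    }
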
inline int getInputIndex( int n, int plane, int row, int col ) { - return ( ( n - * numPlanes + plane ) - * inputImageSize + row ) - * inputImageSize + col; + inline int getInputIndex(int n, int plane, int row, int col) { + return (( n + * numPlanes + plane) + * inputSize + row) + * inputSize + col; } - inline int getResultIndex( int n, int plane, int row, int col ) { - return ( ( n - * numPlanes + plane ) - * outputImageSize + row ) - * outputImageSize + col; + inline int getResultIndex(int n, int plane, int row, int col) { + return (( n + * numPlanes + plane) + * outputSize + row) + * outputSize + col; } // [[[cog @@ -45,14 +45,14 @@ class DeepCL_EXPORT ActivationForward { // cog_addheaders.add() // ]]] // generated, using cog: - ActivationForward( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const*fn ); - STATIC ActivationForward *instance( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const*fn ); - STATIC ActivationForward *instanceForTest( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const*fn ); - STATIC ActivationForward *instanceSpecific( int idx, EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const*fn ); - VIRTUAL void forward( int batchSize, CLWrapper *inputData, CLWrapper *outputData ); - VIRTUAL void forward( int batchSize, float *input, float *output ); - VIRTUAL int getInputSize( int batchSize ); - VIRTUAL int getOutputSize(int batchSize); + ActivationForward(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const*fn); + STATIC ActivationForward *instance(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const*fn); + STATIC ActivationForward *instanceForTest(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const*fn); + STATIC ActivationForward *instanceSpecific(int idx, EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const*fn); + VIRTUAL void forward(int batchSize, CLWrapper *inputData, CLWrapper *outputData); + VIRTUAL void forward(int batchSize, float *input, float *output); + VIRTUAL int getInputNumElements(int batchSize); + VIRTUAL int getOutputNumElements(int batchSize); // [[[end]]] }; diff --git a/src/activate/ActivationForwardCpu.cpp b/src/activate/ActivationForwardCpu.cpp index 3403381c..5915e257 100644 --- a/src/activate/ActivationForwardCpu.cpp +++ b/src/activate/ActivationForwardCpu.cpp @@ -20,35 +20,35 @@ using namespace std; #undef STATIC #define STATIC -ActivationForwardCpu::ActivationForwardCpu( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const*fn ) : - ActivationForward( cl, numPlanes, inputImageSize, fn ) { +ActivationForwardCpu::ActivationForwardCpu(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const*fn) : + ActivationForward(cl, numPlanes, inputSize, fn) { } -VIRTUAL void ActivationForwardCpu::forward( int batchSize, CLWrapper *inputWrapper, CLWrapper *outputWrapper ) { -// cout << "ActivationForwardCpu::forward( CLWrapper * )" << endl; +VIRTUAL void ActivationForwardCpu::forward(int batchSize, CLWrapper *inputWrapper, CLWrapper *outputWrapper) { +// cout << "ActivationForwardCpu::forward(CLWrapper *)" << endl; inputWrapper->copyToHost(); - float *input = reinterpret_cast<float *>( inputWrapper->getHostArray() ); - float *output = new float[ getOutputSize( batchSize ) ]; + float *input = reinterpret_cast<float *>(inputWrapper->getHostArray()); + float *output = new float[ getOutputNumElements(batchSize) ]; - forward( batchSize, input, output ); + forward(batchSize, input, output); - float *outputHostArray = reinterpret_cast<float *>( outputWrapper->getHostArray() ); - memcpy( outputHostArray, output, sizeof(float) * getOutputSize( batchSize ) ); + float *outputHostArray = reinterpret_cast<float *>(outputWrapper->getHostArray()); + memcpy(outputHostArray, output, sizeof(float) * getOutputNumElements(batchSize) ); outputWrapper->copyToDevice(); delete[] output; } -VIRTUAL void ActivationForwardCpu::forward( int batchSize, float *input, float *output ) { -// float *output = new float[ getOutputSize( batchSize ) ]; -// cout << "ActivationForwardCpu::forward( float * )" << endl; - StatefulTimer::instance()->timeCheck("ActivationForwardCpu::forward start" ); - int totalLinearSize = batchSize * numPlanes * inputImageSize * inputImageSize; - for( int i = 0; i < totalLinearSize; i++ ) { - output[i] = fn->calc( input[i] ); +VIRTUAL void ActivationForwardCpu::forward(int batchSize, float *input, float *output) { +// float *output = new float[ getOutputNumElements(batchSize) ]; +// cout << "ActivationForwardCpu::forward(float *)" << endl; + StatefulTimer::instance()->timeCheck("ActivationForwardCpu::forward start"); + int totalLinearSize = batchSize * numPlanes * inputSize * inputSize; + for(int i = 0; i < totalLinearSize; i++) { + output[i] = fn->calc(input[i]); } - StatefulTimer::instance()->timeCheck("ActivationForwardCpu::forward end" ); + StatefulTimer::instance()->timeCheck("ActivationForwardCpu::forward end"); // return output; } diff --git a/src/activate/ActivationForwardCpu.h b/src/activate/ActivationForwardCpu.h index 099f2bdf..812ee1df 100644 --- a/src/activate/ActivationForwardCpu.h +++ b/src/activate/ActivationForwardCpu.h @@ -19,9 +19,9 @@ class ActivationForwardCpu : public ActivationForward { // cog_addheaders.add() // ]]] // generated, using cog: - ActivationForwardCpu( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const*fn ); - VIRTUAL void forward( int batchSize, CLWrapper *inputWrapper, CLWrapper *outputWrapper ); - VIRTUAL void forward( int batchSize, float *input, float *output ); + ActivationForwardCpu(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const*fn); + VIRTUAL void forward(int batchSize, CLWrapper *inputWrapper, CLWrapper *outputWrapper); + VIRTUAL void forward(int batchSize, float *input, float *output); // [[[end]]] }; diff --git a/src/activate/ActivationForwardGpuNaive.cpp b/src/activate/ActivationForwardGpuNaive.cpp index 6670e794..e761a40a 100644 --- a/src/activate/ActivationForwardGpuNaive.cpp +++ b/src/activate/ActivationForwardGpuNaive.cpp @@ -27,39 +27,39 @@ using namespace std; VIRTUAL ActivationForwardGpuNaive::~ActivationForwardGpuNaive() { delete kernel; } -VIRTUAL void ActivationForwardGpuNaive::forward( int batchSize, CLWrapper *inputWrapper, CLWrapper *outputWrapper ) { -// cout << StatefulTimer::instance()->prefix << "ActivationForwardGpuNaive::forward( CLWrapper * )" << endl; - StatefulTimer::instance()->timeCheck("ActivationForwardGpuNaive::forward start" ); +VIRTUAL void ActivationForwardGpuNaive::forward(int batchSize, CLWrapper *inputWrapper, CLWrapper *outputWrapper) { +// cout << StatefulTimer::instance()->prefix << "ActivationForwardGpuNaive::forward(CLWrapper *)" << endl; + StatefulTimer::instance()->timeCheck("ActivationForwardGpuNaive::forward start"); - kernel->input( batchSize * numPlanes * outputImageSize * outputImageSize ); - kernel->output( outputWrapper )->input( inputWrapper ); -// kernel->input( batchSize )->input( inputWrapper )->output( outputWrapper ); + kernel->input(batchSize * numPlanes * outputSize * outputSize); 
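
The forwardNaive kernel driven by this code is a pure element-wise map: each work-item computes out[globalId] = ACTIVATION_FUNCTION(in[globalId]), with the activation chosen at kernel compile time via the -D define assembled from fn->getDefineName() below. The same map on the host, using ReLU as a stand-in for fn->calc (illustrative only):

    #include <algorithm>
    #include <cstdio>

    int main() {
        const int batchSize = 1, numPlanes = 1, inputSize = 2;
        // the buffers hold batchSize * numPlanes * inputSize * inputSize floats
        const int numElements = batchSize * numPlanes * inputSize * inputSize;
        float input[] = {-0.5f, 0.0f, 0.25f, 2.0f};
        float output[4];
        for (int i = 0; i < numElements; i++) {
            output[i] = std::max(0.0f, input[i]);  // ACTIVATION_FUNCTION for RELU
        }
        for (int i = 0; i < numElements; i++) {
            printf("%g ", output[i]);  // prints: 0 0 0.25 2
        }
        printf("\n");
        return 0;
    }
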
+ kernel->output(outputWrapper)->input(inputWrapper); +// kernel->input(batchSize)->input(inputWrapper)->output(outputWrapper); - int globalSize = batchSize * numPlanes * outputImageSize * outputImageSize; + int globalSize = batchSize * numPlanes * outputSize * outputSize; int workgroupsize = cl->getMaxWorkgroupSize(); - globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize; + globalSize = (( globalSize + workgroupsize - 1) / workgroupsize) * workgroupsize; // cout << "ActivationForwardGpuNaive::forward batchsize=" << batchSize << " g=" << globalSize << " w=" << workgroupsize << endl; kernel->run_1d(globalSize, workgroupsize); cl->finish(); // cout << "ActivationForwardGpuNaive::forward selectorswrapper:" << endl; -// PrintBuffer::printInts( cl, selectorsWrapper, outputImageSize, outputImageSize ); +// PrintBuffer::printInts(cl, selectorsWrapper, outputSize, outputSize); - StatefulTimer::instance()->timeCheck("ActivationForwardGpuNaive::forward end" ); + StatefulTimer::instance()->timeCheck("ActivationForwardGpuNaive::forward end"); } -ActivationForwardGpuNaive::ActivationForwardGpuNaive( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const*fn ) : - ActivationForward( cl, numPlanes, inputImageSize, fn ) { +ActivationForwardGpuNaive::ActivationForwardGpuNaive(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const*fn) : + ActivationForward(cl, numPlanes, inputSize, fn) { string options = ""; - options += " -DgOutputImageSize=" + toString( outputImageSize ); - options += " -DgOutputImageSizeSquared=" + toString( outputImageSize * outputImageSize ); - options += " -DgInputImageSize=" + toString( inputImageSize ); - options += " -DgInputImageSizeSquared=" + toString( inputImageSize * inputImageSize ); - options += " -DgNumPlanes=" + toString( numPlanes ); + options += " -DgOutputSize=" + toString(outputSize); + options += " -DgOutputSizeSquared=" + toString(outputSize * outputSize); + options += " -DgInputSize=" + toString(inputSize); + options += " -DgInputSizeSquared=" + toString(inputSize * inputSize); + options += " -DgNumPlanes=" + toString(numPlanes); options += " -D" + fn->getDefineName(); // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/activate.cl", "forwardNaive", 'options' ) + // stringify.write_kernel2("kernel", "cl/activate.cl", "forwardNaive", 'options') // ]]] // generated using cog, from cl/activate.cl: const char * kernelSource = @@ -75,7 +75,7 @@ ActivationForwardGpuNaive::ActivationForwardGpuNaive( EasyCL *cl, int numPlanes, "#ifdef TANH\n" " #define ACTIVATION_FUNCTION(output) (tanh(output))\n" "#elif defined SCALEDTANH\n" - " #define ACTIVATION_FUNCTION(output) ( 1.7159f * tanh( 0.66667f * output))\n" + " #define ACTIVATION_FUNCTION(output) (1.7159f * tanh(0.66667f * output))\n" "#elif SIGMOID\n" " #define ACTIVATION_FUNCTION(output) (1.0f / (1 + exp(-output)))\n" "#elif defined RELU\n" @@ -85,22 +85,22 @@ ActivationForwardGpuNaive::ActivationForwardGpuNaive( EasyCL *cl, int numPlanes, "#endif\n" "\n" "#ifdef ACTIVATION_FUNCTION // protect against not defined\n" - "kernel void activate( const int N, global float *inout ) {\n" + "kernel void activate(const int N, global float *inout) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" - " inout[globalId] = ACTIVATION_FUNCTION( inout[globalId] );\n" + " inout[globalId] = ACTIVATION_FUNCTION(inout[globalId]);\n" "}\n" "#endif\n" "\n" "#ifdef ACTIVATION_FUNCTION // 
protect against not defined\n" - "kernel void forwardNaive( const int N, global float *out, global const float *in ) {\n" + "kernel void forwardNaive(const int N, global float *out, global const float *in) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" - " out[globalId] = ACTIVATION_FUNCTION( in[globalId] );\n" + " out[globalId] = ACTIVATION_FUNCTION(in[globalId]);\n" "}\n" "#endif\n" "\n" diff --git a/src/activate/ActivationForwardGpuNaive.h b/src/activate/ActivationForwardGpuNaive.h index ea68ef8f..29bd4b61 100644 --- a/src/activate/ActivationForwardGpuNaive.h +++ b/src/activate/ActivationForwardGpuNaive.h @@ -23,8 +23,8 @@ class ActivationForwardGpuNaive : public ActivationForward { // ]]] // generated, using cog: VIRTUAL ~ActivationForwardGpuNaive(); - VIRTUAL void forward( int batchSize, CLWrapper *inputWrapper, CLWrapper *outputWrapper ); - ActivationForwardGpuNaive( EasyCL *cl, int numPlanes, int inputImageSize, ActivationFunction const*fn ); + VIRTUAL void forward(int batchSize, CLWrapper *inputWrapper, CLWrapper *outputWrapper); + ActivationForwardGpuNaive(EasyCL *cl, int numPlanes, int inputSize, ActivationFunction const*fn); // [[[end]]] }; diff --git a/src/activate/ActivationFunction.cpp b/src/activate/ActivationFunction.cpp index fd5c0f8e..79b39d93 100644 --- a/src/activate/ActivationFunction.cpp +++ b/src/activate/ActivationFunction.cpp @@ -4,43 +4,43 @@ using namespace std; -ActivationFunction *ActivationFunction::fromName( std::string name ) { - if( name == "tanh" ) { +ActivationFunction *ActivationFunction::fromName(std::string name) { + if(name == "tanh") { return new TanhActivation(); - } else if( name == "scaledtanh" ) { + } else if(name == "scaledtanh") { return new ScaledTanhActivation(); - } else if( name == "sigmoid" ) { + } else if(name == "sigmoid") { return new SigmoidActivation(); - } else if( name == "linear" ) { + } else if(name == "linear") { return new LinearActivation(); - } else if( name == "relu" ) { + } else if(name == "relu") { return new ReluActivation(); } else { throw std::runtime_error("activation " + name + " not known"); } } -ostream &operator<<( ostream &os, LinearActivation const&act ) { +ostream &operator<<(ostream &os, LinearActivation const&act) { os << "LinearActivation{}"; return os; } -ostream &operator<<( ostream &os, TanhActivation const&act ) { +ostream &operator<<(ostream &os, TanhActivation const&act) { os << "TanhActivation{}"; return os; } -ostream &operator<<( ostream &os, ScaledTanhActivation const&act ) { +ostream &operator<<(ostream &os, ScaledTanhActivation const&act) { os << "ScaledTanhActivation{}"; return os; } -ostream &operator<<( ostream &os, ReluActivation const&act ) { +ostream &operator<<(ostream &os, ReluActivation const&act) { os << "ReluActivation{}"; return os; } -ostream &operator<<( ostream &os, SigmoidActivation const&act ) { +ostream &operator<<(ostream &os, SigmoidActivation const&act) { os << "SigmoidActivation{}"; return os; } diff --git a/src/activate/ActivationFunction.h b/src/activate/ActivationFunction.h index e9b45c05..517aeea0 100644 --- a/src/activate/ActivationFunction.h +++ b/src/activate/ActivationFunction.h @@ -15,20 +15,20 @@ class DeepCL_EXPORT ActivationFunction { public: virtual ~ActivationFunction() {} - virtual float calc( float value ) const { throw std::runtime_error("calc not implemented"); }; - virtual float calcDerivative( float output ) const { throw std::runtime_error("calcDerivative not implemented"); }; + 
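
The ActivationFunction hierarchy in this hunk pairs calc() for the forward pass with calcDerivative() for the backward pass, where the derivative is expressed in terms of the layer *output* rather than its input (for tanh, 1 - output*output), so backpropagation only needs the stored outputs. A hypothetical subclass sketch, assuming the headers installed by this patch; DeepCL does not ship this class, and the OpenCL kernels would also need a matching LEAKYRELU branch in cl/activate.cl and cl/applyActivationDeriv.cl:

    #include "activate/ActivationFunction.h"

    // Hypothetical example only: a leaky ReLU activation.
    class LeakyReluActivation : public ActivationFunction {
    public:
        virtual float calc(float value) const {
            return value > 0 ? value : 0.01f * value;
        }
        virtual float calcDerivative(float output) const {
            // derivative from the output: output > 0 exactly when input > 0
            return output > 0 ? 1.0f : 0.01f;
        }
        virtual float getTrue() const { return 0.5f; }    // assumed targets,
        virtual float getFalse() const { return -0.5f; }  // chosen for illustration
        virtual std::string getDefineName() const { return "LEAKYRELU"; }
    };
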
virtual float calc(float value) const { throw std::runtime_error("calc not implemented"); }; + virtual float calcDerivative(float output) const { throw std::runtime_error("calcDerivative not implemented"); }; virtual float getFalse() const { throw std::runtime_error("getFalse not implemented"); } virtual float getTrue() const { throw std::runtime_error("getTrue not implemented"); } virtual std::string getDefineName() const { throw std::runtime_error("getDefineName not implemented"); } - static ActivationFunction *fromName( std::string name ); + static ActivationFunction *fromName(std::string name); }; class TanhActivation : public ActivationFunction { public: - virtual float calc( float value ) const { - return tanh( value ); + virtual float calc(float value) const { + return tanh(value); } - virtual float calcDerivative( float output ) const { + virtual float calcDerivative(float output) const { return 1 - output * output; } virtual float getTrue() const { @@ -44,11 +44,11 @@ class TanhActivation : public ActivationFunction { class ScaledTanhActivation : public ActivationFunction { public: - virtual float calc( float value ) const { - return 1.7159f * tanh( value * 0.66667f ); + virtual float calc(float value) const { + return 1.7159f * tanh(value * 0.66667f); } - virtual float calcDerivative( float output ) const { - return 0.66667f * ( 1.7159f - 1 / 1.7159f * output * output ); + virtual float calcDerivative(float output) const { + return 0.66667f * (1.7159f - 1 / 1.7159f * output * output); } virtual float getTrue() const { return 1.0f; @@ -63,11 +63,11 @@ class ScaledTanhActivation : public ActivationFunction { class SigmoidActivation : public ActivationFunction { public: - virtual float calc( float value ) const { - return 1.0f / ( 1.0f + exp( - value ) ); + virtual float calc(float value) const { + return 1.0f / (1.0f + exp(- value) ); } - virtual float calcDerivative( float output ) const { - return output * ( 1 - output ); + virtual float calcDerivative(float output) const { + return output * (1 - output); } virtual float getTrue() const { return 0.8f; @@ -82,10 +82,10 @@ class SigmoidActivation : public ActivationFunction { class LinearActivation : public ActivationFunction { public: - virtual float calc( float value ) const { + virtual float calc(float value) const { return value; } - virtual float calcDerivative( float output ) const { + virtual float calcDerivative(float output) const { return 1; } virtual float getTrue() const { @@ -101,10 +101,10 @@ class LinearActivation : public ActivationFunction { class ReluActivation : public ActivationFunction { public: - virtual float calc( float value ) const { + virtual float calc(float value) const { return value > 0 ? value : 0; } - virtual float calcDerivative( float output ) const { + virtual float calcDerivative(float output) const { return output > 0 ? 
1.0f : 0.0f; } virtual float getTrue() const { diff --git a/src/activate/ActivationLayer.cpp b/src/activate/ActivationLayer.cpp index f87051de..8dc1795d 100644 --- a/src/activate/ActivationLayer.cpp +++ b/src/activate/ActivationLayer.cpp @@ -19,13 +19,13 @@ using namespace std; #undef STATIC #define STATIC -ActivationLayer::ActivationLayer( EasyCL *cl, Layer *previousLayer, ActivationMaker *maker ) : - Layer( previousLayer, maker ), - numPlanes ( previousLayer->getOutputPlanes() ), - inputImageSize( previousLayer->getOutputImageSize() ), - outputImageSize( previousLayer->getOutputImageSize() ), - fn( maker->_activationFunction ), - cl( cl ), +ActivationLayer::ActivationLayer(EasyCL *cl, Layer *previousLayer, ActivationMaker *maker) : + Layer(previousLayer, maker), + numPlanes (previousLayer->getOutputPlanes()), + inputSize(previousLayer->getOutputSize()), + outputSize(previousLayer->getOutputSize()), + fn(maker->_activationFunction), + cl(cl), output(0), gradInput(0), outputWrapper(0), @@ -34,104 +34,104 @@ ActivationLayer::ActivationLayer( EasyCL *cl, Layer *previousLayer, ActivationMa // gradInputCopiedToHost(false), batchSize(0), allocatedSize(0) { - if( inputImageSize == 0 ){ + if(inputSize == 0){ // maker->net->print(); - throw runtime_error("Error: Activation layer " + toString( layerIndex ) + ": input image size is 0" ); + throw runtime_error("Error: Activation layer " + toString(layerIndex) + ": input image size is 0"); } - if( outputImageSize == 0 ){ + if(outputSize == 0){ // maker->net->print(); - throw runtime_error("Error: Activation layer " + toString( layerIndex ) + ": output image size is 0" ); + throw runtime_error("Error: Activation layer " + toString(layerIndex) + ": output image size is 0"); } - activationForwardImpl = ActivationForward::instance( cl, numPlanes, inputImageSize, fn ); - activationBackpropImpl = ActivationBackward::instance( cl, numPlanes, inputImageSize, fn ); + activationForwardImpl = ActivationForward::instance(cl, numPlanes, inputSize, fn); + activationBackpropImpl = ActivationBackward::instance(cl, numPlanes, inputSize, fn); } VIRTUAL ActivationLayer::~ActivationLayer() { delete activationForwardImpl; delete activationBackpropImpl; - if( outputWrapper != 0 ) { + if(outputWrapper != 0) { delete outputWrapper; } - if( output != 0 ) { + if(output != 0) { delete[] output; } - if( gradInputWrapper != 0 ) { + if(gradInputWrapper != 0) { delete gradInputWrapper; } - if( gradInput != 0 ) { + if(gradInput != 0) { delete[] gradInput; } } VIRTUAL std::string ActivationLayer::getClassName() const { return "ActivationLayer"; } -VIRTUAL float ActivationLayer::getOutput( int n, int plane, int row, int col ) { - int index = ( ( n - * numPlanes + plane ) - * outputImageSize + row ) - * outputImageSize + col; +VIRTUAL float ActivationLayer::getOutput(int n, int plane, int row, int col) { + int index = (( n + * numPlanes + plane) + * outputSize + row) + * outputSize + col; return output[ index ]; } VIRTUAL void ActivationLayer::printOutput() { // float const*output = getOutput(); // int outPlanes = getOutputPlanes(); -// int outputSize = getOutputImageSize(); +// int outputNumElements = getOutputSize(); std::cout << " outputs: " << std::endl; getOutput(); // output are organized like [imageid][filterid][row][col] - for( int n = 0; n < std::min( 5, batchSize ); n++ ) { + for(int n = 0; n < std::min(5, batchSize); n++) { std::cout << " n: " << n << std::endl; - for( int plane = 0; plane < std::min(5, numPlanes ); plane++ ) { - if( numPlanes > 1 ) std::cout << " plane " << 
plane << std::endl; - if( outputImageSize == 1 ) { - std::cout << " " << getOutput(n, plane, 0, 0 ) << std::endl; + for(int plane = 0; plane < std::min(5, numPlanes); plane++) { + if(numPlanes > 1) std::cout << " plane " << plane << std::endl; + if(outputSize == 1) { + std::cout << " " << getOutput(n, plane, 0, 0) << std::endl; } else { - for( int i = 0; i < std::min(5, outputImageSize); i++ ) { + for(int i = 0; i < std::min(5, outputSize); i++) { std::cout << " "; - for( int j = 0; j < std::min(5, outputImageSize); j++ ) { - std::cout << getOutput( n, plane, i, j ) << " "; + for(int j = 0; j < std::min(5, outputSize); j++) { + std::cout << getOutput(n, plane, i, j) << " "; } - if( outputImageSize > 5 ) std::cout << " ... "; + if(outputSize > 5) std::cout << " ... "; std::cout << std::endl; } - if( outputImageSize > 5 ) std::cout << " ... " << std::endl; + if(outputSize > 5) std::cout << " ... " << std::endl; } - if( numPlanes > 5 ) std::cout << " ... other planes ... " << std::endl; + if(numPlanes > 5) std::cout << " ... other planes ... " << std::endl; } - if( batchSize > 5 ) std::cout << " ... other n ... " << std::endl; + if(batchSize > 5) std::cout << " ... other n ... " << std::endl; } } -VIRTUAL void ActivationLayer::setBatchSize( int batchSize ) { +VIRTUAL void ActivationLayer::setBatchSize(int batchSize) { // cout << "ActivationLayer::setBatchSize" << endl; - if( batchSize <= allocatedSize ) { + if(batchSize <= allocatedSize) { this->batchSize = batchSize; return; } - if( outputWrapper != 0 ) { + if(outputWrapper != 0) { delete outputWrapper; } - if( output != 0 ) { + if(output != 0) { delete[] output; } - if( gradInputWrapper != 0 ) { + if(gradInputWrapper != 0) { delete gradInputWrapper; } - if( gradInput != 0 ) { + if(gradInput != 0) { delete[] gradInput; } this->batchSize = batchSize; this->allocatedSize = batchSize; - output = new float[ getOutputSize() ]; - outputWrapper = cl->wrap( getOutputSize(), output ); + output = new float[ getOutputNumElements() ]; + outputWrapper = cl->wrap(getOutputNumElements(), output); outputWrapper->createOnDevice(); - gradInput = new float[ previousLayer->getOutputSize() ]; - gradInputWrapper = cl->wrap( previousLayer->getOutputSize(), gradInput ); + gradInput = new float[ previousLayer->getOutputNumElements() ]; + gradInputWrapper = cl->wrap(previousLayer->getOutputNumElements(), gradInput); gradInputWrapper->createOnDevice(); } -VIRTUAL int ActivationLayer::getOutputSize() { - return batchSize * numPlanes * outputImageSize * outputImageSize; +VIRTUAL int ActivationLayer::getOutputNumElements() { + return batchSize * numPlanes * outputSize * outputSize; } VIRTUAL float *ActivationLayer::getOutput() { - if( outputWrapper->isDeviceDirty() ) { + if(outputWrapper->isDeviceDirty()) { outputWrapper->copyToHost(); // outputCopiedToHost = true; } @@ -141,15 +141,15 @@ VIRTUAL float *ActivationLayer::getOutput() { VIRTUAL bool ActivationLayer::needsBackProp() { return previousLayer->needsBackProp(); } -VIRTUAL int ActivationLayer::getOutputSize() const { -// int outputImageSize = inputImageSize / poolingSize; - return batchSize * numPlanes * outputImageSize * outputImageSize; +VIRTUAL int ActivationLayer::getOutputNumElements() const { +// int outputSize = inputSize / poolingSize; + return batchSize * numPlanes * outputSize * outputSize; } VIRTUAL int ActivationLayer::getOutputCubeSize() const { - return numPlanes * outputImageSize * outputImageSize; + return numPlanes * outputSize * outputSize; } -VIRTUAL int ActivationLayer::getOutputImageSize() 
const { - return outputImageSize; +VIRTUAL int ActivationLayer::getOutputSize() const { + return outputSize; } VIRTUAL int ActivationLayer::getOutputPlanes() const { return numPlanes; @@ -173,7 +173,7 @@ VIRTUAL int ActivationLayer::getBiasSize() const { return 0; } VIRTUAL float *ActivationLayer::getGradInput() { - if( gradInputWrapper->isDeviceDirty() ) { + if(gradInputWrapper->isDeviceDirty()) { gradInputWrapper->copyToHost(); // gradInputCopiedToHost = true; } @@ -184,16 +184,16 @@ VIRTUAL ActivationFunction const *ActivationLayer::getActivationFunction() { } VIRTUAL void ActivationLayer::forward() { CLWrapper *inputWrapper = 0; - if( previousLayer->hasOutputWrapper() ) { + if(previousLayer->hasOutputWrapper()) { inputWrapper = previousLayer->getOutputWrapper(); } else { float *input = previousLayer->getOutput(); - inputWrapper = cl->wrap( previousLayer->getOutputSize(), input ); + inputWrapper = cl->wrap(previousLayer->getOutputNumElements(), input); inputWrapper->copyToDevice(); } - activationForwardImpl->forward( batchSize, inputWrapper, outputWrapper ); + activationForwardImpl->forward(batchSize, inputWrapper, outputWrapper); // outputCopiedToHost = false; - if( !previousLayer->hasOutputWrapper() ) { + if(!previousLayer->hasOutputWrapper()) { delete inputWrapper; } } @@ -201,37 +201,37 @@ VIRTUAL void ActivationLayer::backward() { // have no weights to backprop to, just need to backprop the errors // CLWrapper *imagesWrapper = 0; -// if( previousLayer->hasOutputWrapper() ) { +// if(previousLayer->hasOutputWrapper()) { // imagesWrapper = previousLayer->getOutputWrapper(); // } else { -// imagesWrapper = cl->wrap( previousLayer->getOutputSize(), previousLayer->getOutput() ); +// imagesWrapper = cl->wrap(previousLayer->getOutputNumElements(), previousLayer->getOutput()); // imagesWrapper->copyToDevice(); // } CLWrapper *gradOutputWrapper = 0; bool weOwnGradOutputWrapper = false; - if( nextLayer->providesGradInputWrapper() ) { + if(nextLayer->providesGradInputWrapper()) { gradOutputWrapper = nextLayer->getGradInputWrapper(); } else { - gradOutputWrapper = cl->wrap( getOutputSize(), nextLayer->getGradInput() ); + gradOutputWrapper = cl->wrap(getOutputNumElements(), nextLayer->getGradInput()); gradOutputWrapper->copyToDevice(); weOwnGradOutputWrapper = true; } - activationBackpropImpl->backward( batchSize, outputWrapper, gradOutputWrapper, gradInputWrapper ); + activationBackpropImpl->backward(batchSize, outputWrapper, gradOutputWrapper, gradInputWrapper); // gradInputCopiedToHost = false; -// if( !previousLayer->hasOutputWrapper() ) { +// if(!previousLayer->hasOutputWrapper()) { // delete imagesWrapper; // } - if( weOwnGradOutputWrapper ) { + if(weOwnGradOutputWrapper) { delete gradOutputWrapper; } } VIRTUAL std::string ActivationLayer::asString() const { return "ActivationLayer{ " + fn->getDefineName() + " }"; } -VIRTUAL int ActivationLayer::getPersistSize( int version ) const { +VIRTUAL int ActivationLayer::getPersistSize(int version) const { // no weights, so: return 0; } diff --git a/src/activate/ActivationLayer.h b/src/activate/ActivationLayer.h index d6b2e45b..383f4594 100644 --- a/src/activate/ActivationLayer.h +++ b/src/activate/ActivationLayer.h @@ -23,9 +23,9 @@ class ActivationMaker; class ActivationLayer : public Layer { public: const int numPlanes; - const int inputImageSize; + const int inputSize; - const int outputImageSize; + const int outputSize; ActivationFunction const *fn; @@ -52,18 +52,18 @@ class ActivationLayer : public Layer { // cog_addheaders.add() // ]]] // 
generated, using cog: - ActivationLayer( EasyCL *cl, Layer *previousLayer, ActivationMaker *maker ); + ActivationLayer(EasyCL *cl, Layer *previousLayer, ActivationMaker *maker); VIRTUAL ~ActivationLayer(); VIRTUAL std::string getClassName() const; - VIRTUAL float getOutput( int n, int plane, int row, int col ); + VIRTUAL float getOutput(int n, int plane, int row, int col); VIRTUAL void printOutput(); - VIRTUAL void setBatchSize( int batchSize ); - VIRTUAL int getOutputSize(); + VIRTUAL void setBatchSize(int batchSize); + VIRTUAL int getOutputNumElements(); VIRTUAL float *getOutput(); VIRTUAL bool needsBackProp(); - VIRTUAL int getOutputSize() const; + VIRTUAL int getOutputNumElements() const; VIRTUAL int getOutputCubeSize() const; - VIRTUAL int getOutputImageSize() const; + VIRTUAL int getOutputSize() const; VIRTUAL int getOutputPlanes() const; VIRTUAL bool providesGradInputWrapper() const; VIRTUAL CLWrapper *getGradInputWrapper(); @@ -76,7 +76,7 @@ class ActivationLayer : public Layer { VIRTUAL void forward(); VIRTUAL void backward(); VIRTUAL std::string asString() const; - VIRTUAL int getPersistSize( int version ) const; + VIRTUAL int getPersistSize(int version) const; // [[[end]]] }; diff --git a/src/activate/ActivationMaker.cpp b/src/activate/ActivationMaker.cpp index 3413524d..c9814caf 100644 --- a/src/activate/ActivationMaker.cpp +++ b/src/activate/ActivationMaker.cpp @@ -14,8 +14,8 @@ using namespace std; #undef STATIC #define STATIC -Layer *ActivationMaker::createLayer( Layer *previousLayer ) { - Layer *layer = new ActivationLayer( cl, previousLayer, this ); +Layer *ActivationMaker::createLayer(Layer *previousLayer) { + Layer *layer = new ActivationLayer(cl, previousLayer, this); return layer; } diff --git a/src/activate/ActivationMaker.h b/src/activate/ActivationMaker.h index 22a79a6d..48627ea5 100644 --- a/src/activate/ActivationMaker.h +++ b/src/activate/ActivationMaker.h @@ -19,7 +19,7 @@ class DeepCL_EXPORT ActivationMaker : public LayerMaker2 { public: ActivationFunction const *_activationFunction; ActivationMaker() : - _activationFunction( new ReluActivation() ) { + _activationFunction(new ReluActivation()) { } static ActivationMaker *instance() { return new ActivationMaker(); @@ -50,9 +50,9 @@ class DeepCL_EXPORT ActivationMaker : public LayerMaker2 { } virtual ActivationMaker *clone() const { ActivationMaker *thisClone = new ActivationMaker(); - memcpy( thisClone, this, sizeof( ActivationMaker ) ); // this will copy the activationfunction pointer too + memcpy(thisClone, this, sizeof(ActivationMaker) ); // this will copy the activationfunction pointer too return thisClone; } - virtual Layer *createLayer( Layer *previousLayer ); + virtual Layer *createLayer(Layer *previousLayer); }; diff --git a/src/batch/BatchData.cpp b/src/batch/BatchData.cpp index 824b02d3..706e91b0 100644 --- a/src/batch/BatchData.cpp +++ b/src/batch/BatchData.cpp @@ -9,25 +9,25 @@ using namespace std; -InputData *InputData::instance( Trainable *net, float const*inputs ) { +InputData *InputData::instance(Trainable *net, float const*inputs) { int inputCubeSize = net->getInputCubeSize(); - return new InputData( inputCubeSize, inputs ); + return new InputData(inputCubeSize, inputs); } -ExpectedData *ExpectedData::instance( Trainable *net, float const*expectedOutputs ) { +ExpectedData *ExpectedData::instance(Trainable *net, float const*expectedOutputs) { int outputCubeSize = net->getOutputCubeSize(); - return new ExpectedData( outputCubeSize, expectedOutputs ); + return new ExpectedData(outputCubeSize, 
expectedOutputs); } -LabeledData *LabeledData::instance( Trainable *net, int const*labels ) { // net not used +LabeledData *LabeledData::instance(Trainable *net, int const*labels) { // net not used // but means dont have to keep remembering whether to add in parameters or not - return new LabeledData( labels ); + return new LabeledData(labels); } -ExpectedData::ExpectedData( Trainable *net, float const*expected ) { +ExpectedData::ExpectedData(Trainable *net, float const*expected) { this->outputCubeSize = net->getOutputCubeSize(); this->expected = expected; } -LabeledData::LabeledData( Trainable *net, int const*labels ) { // net not used +LabeledData::LabeledData(Trainable *net, int const*labels) { // net not used // but means dont have to keep remembering whether to add in parameters or not this->labels = labels; } diff --git a/src/batch/BatchData.h b/src/batch/BatchData.h index 8ce51169..254f5070 100644 --- a/src/batch/BatchData.h +++ b/src/batch/BatchData.h @@ -14,24 +14,24 @@ class OutputData { } virtual ~OutputData() { } - virtual OutputData *slice( int start ) = 0; -// static LabeledData *fromLabels( int *labels ) { -// return new LabeledData( labels ); + virtual OutputData *slice(int start) = 0; +// static LabeledData *fromLabels(int *labels) { +// return new LabeledData(labels); // } -// static ExpectedData *fromExpected( int outputCubeSize, float *expected ) { -// return new ExpectedData( outputCubeSize, expected ); +// static ExpectedData *fromExpected(int outputCubeSize, float *expected) { +// return new ExpectedData(outputCubeSize, expected); // } }; class LabeledData : public OutputData { public: int const*labels; // NOT owned by us, dont delete - LabeledData( int const*labels ) { + LabeledData(int const*labels) { this->labels = labels; } - LabeledData( Trainable *net, int const*labels ); - static LabeledData *instance( Trainable *net, int const*labels ); - LabeledData *slice( int start ) { - LabeledData *child = new LabeledData( labels + start ); + LabeledData(Trainable *net, int const*labels); + static LabeledData *instance(Trainable *net, int const*labels); + LabeledData *slice(int start) { + LabeledData *child = new LabeledData(labels + start); return child; } }; @@ -40,14 +40,14 @@ class ExpectedData : public OutputData { int outputCubeSize; float const*expected; // NOT owned by us, dont delete - ExpectedData( int outputCubeSize, float const*expected ) { + ExpectedData(int outputCubeSize, float const*expected) { this->outputCubeSize = outputCubeSize; this->expected = expected; } - ExpectedData( Trainable *net, float const*expected ); - static ExpectedData *instance( Trainable *net, float const*expected ); - ExpectedData *slice( int start ) { - ExpectedData *child = new ExpectedData( outputCubeSize, expected + start * outputCubeSize ); + ExpectedData(Trainable *net, float const*expected); + static ExpectedData *instance(Trainable *net, float const*expected); + ExpectedData *slice(int start) { + ExpectedData *child = new ExpectedData(outputCubeSize, expected + start * outputCubeSize); return child; } }; @@ -55,13 +55,13 @@ class InputData { public: int inputCubeSize; float const*inputs; // NOT owned by us, dont delete - InputData( int inputCubeSize, float const*inputs ) { + InputData(int inputCubeSize, float const*inputs) { this->inputCubeSize = inputCubeSize; this->inputs = inputs; } - static InputData *instance( Trainable *net, float const*inputs ); - InputData *slice( int start ) { - InputData *child = new InputData( inputCubeSize, inputs + start * inputCubeSize ); + 
static InputData *instance(Trainable *net, float const*inputs); + InputData *slice(int start) { + InputData *child = new InputData(inputCubeSize, inputs + start * inputCubeSize); return child; } }; diff --git a/src/batch/BatchLearnerOnDemand.cpp b/src/batch/BatchLearnerOnDemand.cpp index 72011ee2..17533d91 100644 --- a/src/batch/BatchLearnerOnDemand.cpp +++ b/src/batch/BatchLearnerOnDemand.cpp @@ -20,28 +20,28 @@ using namespace std; #define STATIC #define VIRTUAL -//BatchLearnerOnDemand::BatchLearnerOnDemand( Trainable *net ) : -// net( net ) { +//BatchLearnerOnDemand::BatchLearnerOnDemand(Trainable *net) : +// net(net) { //} -//EpochResult BatchLearnerOnDemand::runBatchedNetAction( std::string filepath, int fileReadBatches, int batchSize, int N, NetAction *netAction ) { -// OnDemandBatcher onDemandBatcher(net, netAction, filepath, N, fileReadBatches, batchSize ); +//EpochResult BatchLearnerOnDemand::runBatchedNetAction(std::string filepath, int fileReadBatches, int batchSize, int N, NetAction *netAction) { +// OnDemandBatcher onDemandBatcher(net, netAction, filepath, N, fileReadBatches, batchSize); // return onDemandBatcher.run(); //} -//int BatchLearnerOnDemand::test( std::string filepath, int fileReadBatches, int batchSize, int Ntest ) { -// net->setTraining( false ); +//int BatchLearnerOnDemand::test(std::string filepath, int fileReadBatches, int batchSize, int Ntest) { +// net->setTraining(false); // NetAction *action = new NetForwardAction(); -// int numRight = runBatchedNetAction( filepath, fileReadBatches, batchSize, Ntest, action ).numRight; +// int numRight = runBatchedNetAction(filepath, fileReadBatches, batchSize, Ntest, action).numRight; // delete action; // return numRight; //} -//EpochResult BatchLearnerOnDemand::runEpochFromLabels( Trainer *trainer, TrainingContext *context, -// std::string filepath, int fileReadBatches, int batchSize, int Ntrain ) { -// net->setTraining( true ); -// NetAction *action = new NetLearnLabeledAction( trainer, context ); -// EpochResult epochResult = runBatchedNetAction( filepath, fileReadBatches, batchSize, Ntrain, action ); +//EpochResult BatchLearnerOnDemand::runEpochFromLabels(Trainer *trainer, TrainingContext *context, +// std::string filepath, int fileReadBatches, int batchSize, int Ntrain) { +// net->setTraining(true); +// NetAction *action = new NetLearnLabeledAction(trainer, context); +// EpochResult epochResult = runBatchedNetAction(filepath, fileReadBatches, batchSize, Ntrain, action); // delete action; // return epochResult; //} diff --git a/src/batch/BatchProcess.cpp b/src/batch/BatchProcess.cpp index edcd2993..65b6f283 100644 --- a/src/batch/BatchProcess.cpp +++ b/src/batch/BatchProcess.cpp @@ -18,34 +18,34 @@ using namespace std; void BatchProcess::run(std::string filepath, int startN, int batchSize, int totalN, int cubeSize, BatchAction *batchAction) { - int numBatches = ( totalN + batchSize - 1 ) / batchSize; + int numBatches = (totalN + batchSize - 1) / batchSize; int thisBatchSize = batchSize; // cout << "batchProcess::run batchsize " << batchSize << " startN " << startN << " totalN " << totalN << " numBatches " << numBatches << endl; - for( int batch = 0; batch < numBatches; batch++ ) { + for(int batch = 0; batch < numBatches; batch++) { int batchStart = batch * batchSize; - if( batch == numBatches - 1 ) { + if(batch == numBatches - 1) { thisBatchSize = totalN - batchStart; // cout << "size of last batch: " << thisBatchSize << endl; } // cout << " batchStart " << batchStart << " thisBatchSize " << thisBatchSize << endl; - 
GenericLoader::load( filepath, batchAction->data, batchAction->labels, batchStart, thisBatchSize ); - batchAction->processBatch( thisBatchSize, cubeSize ); + GenericLoader::load(filepath.c_str(), batchAction->data, batchAction->labels, batchStart, thisBatchSize); + batchAction->processBatch(thisBatchSize, cubeSize); } } -void BatchProcessv2::run( GenericLoaderv2 *loader, int startN, int batchSize, int totalN, int cubeSize, BatchAction *batchAction) { - int numBatches = ( totalN + batchSize - 1 ) / batchSize; +void BatchProcessv2::run(GenericLoaderv2 *loader, int startN, int batchSize, int totalN, int cubeSize, BatchAction *batchAction) { + int numBatches = (totalN + batchSize - 1) / batchSize; int thisBatchSize = batchSize; // cout << "batchProcess::run batchsize " << batchSize << " startN " << startN << " totalN " << totalN << " numBatches " << numBatches << endl; - for( int batch = 0; batch < numBatches; batch++ ) { + for(int batch = 0; batch < numBatches; batch++) { int batchStart = batch * batchSize; - if( batch == numBatches - 1 ) { + if(batch == numBatches - 1) { thisBatchSize = totalN - batchStart; // cout << "size of last batch: " << thisBatchSize << endl; } // cout << " batchStart " << batchStart << " thisBatchSize " << thisBatchSize << endl; - loader->load( batchAction->data, batchAction->labels, batchStart, thisBatchSize ); - batchAction->processBatch( thisBatchSize, cubeSize ); + loader->load(batchAction->data, batchAction->labels, batchStart, thisBatchSize); + batchAction->processBatch(thisBatchSize, cubeSize); } } diff --git a/src/batch/BatchProcess.h b/src/batch/BatchProcess.h index 78d92415..dc479c01 100644 --- a/src/batch/BatchProcess.h +++ b/src/batch/BatchProcess.h @@ -20,16 +20,16 @@ class DeepCL_EXPORT BatchAction { public: float *data; int *labels; - BatchAction( float *data, int *labels ) : + BatchAction(float *data, int *labels) : data(data), labels(labels) { // have to provide appropriate buffers for this } - virtual void processBatch( int batchSize, int cubeSize ) = 0; + virtual void processBatch(int batchSize, int cubeSize) = 0; }; class DeepCL_EXPORT BatchProcessv2 { public: - static void run( GenericLoaderv2*loader, int startN, int batchSize, int totalN, int cubeSize, BatchAction *batchAction); + static void run(GenericLoaderv2*loader, int startN, int batchSize, int totalN, int cubeSize, BatchAction *batchAction); }; class DeepCL_EXPORT BatchProcess { @@ -40,14 +40,14 @@ class DeepCL_EXPORT BatchProcess { class DeepCL_EXPORT NormalizeGetStdDev : public BatchAction { public: Statistics statistics; - NormalizeGetStdDev( float *data, int *labels ) : - BatchAction( data, labels ) { + NormalizeGetStdDev(float *data, int *labels) : + BatchAction(data, labels) { } - virtual void processBatch( int batchSize, int cubeSize ) { - NormalizationHelper::updateStatistics( this->data, batchSize, cubeSize, &statistics ); + virtual void processBatch(int batchSize, int cubeSize) { + NormalizationHelper::updateStatistics(this->data, batchSize, cubeSize, &statistics); } - void calcMeanStdDev( float *p_mean, float *p_stdDev ) { - NormalizationHelper::calcMeanAndStdDev( &statistics, p_mean, p_stdDev ); + void calcMeanStdDev(float *p_mean, float *p_stdDev) { + NormalizationHelper::calcMeanAndStdDev(&statistics, p_mean, p_stdDev); } }; @@ -55,17 +55,17 @@ class DeepCL_EXPORT NormalizeGetStdDev : public BatchAction { class DeepCL_EXPORT NormalizeGetMinMax : public BatchAction { public: Statistics statistics; - NormalizeGetMinMax( float *data, int *labels ) : - BatchAction( data, labels 
) { + NormalizeGetMinMax(float *data, int *labels) : + BatchAction(data, labels) { } - virtual void processBatch( int batchSize, int cubeSize ) { - NormalizationHelper::updateStatistics( this->data, batchSize, cubeSize, &statistics ); + virtual void processBatch(int batchSize, int cubeSize) { + NormalizationHelper::updateStatistics(this->data, batchSize, cubeSize, &statistics); } - void calcMinMaxTransform( float *p_translate, float *p_scale ) { + void calcMinMaxTransform(float *p_translate, float *p_scale) { // add this to our values to center - *p_translate = - ( statistics.maxY - statistics.minY ) / 2.0f; + *p_translate = - (statistics.maxY - statistics.minY) / 2.0f; // multiply our values by this to scale to -1 / +1 range - *p_scale = 1.0f / ( statistics.maxY - statistics.minY ); + *p_scale = 1.0f / (statistics.maxY - statistics.minY); } }; diff --git a/src/batch/Batcher.cpp b/src/batch/Batcher.cpp index 1a5994ec..006ebba9 100644 --- a/src/batch/Batcher.cpp +++ b/src/batch/Batcher.cpp @@ -22,7 +22,7 @@ using namespace std; //#define PUBLICAPI /// \brief constructor: pass in data to process, along with labels, network, ... -PUBLICAPI Batcher::Batcher(Trainable *net, int batchSize, int N, float *data, int const*labels ) : +PUBLICAPI Batcher::Batcher(Trainable *net, int batchSize, int N, float *data, int const*labels) : net(net), batchSize(batchSize), N(N), @@ -30,7 +30,7 @@ PUBLICAPI Batcher::Batcher(Trainable *net, int batchSize, int N, float *data, in labels(labels) { inputCubeSize = net->getInputCubeSize(); - numBatches = ( N + batchSize - 1 ) / batchSize; + numBatches = (N + batchSize - 1) / batchSize; reset(); } VIRTUAL Batcher::~Batcher() { @@ -44,7 +44,7 @@ PUBLICAPI void Batcher::reset() { } /// \brief what is the index of the next batch to process? 
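[Editor's sketch — the Batcher constructor above sizes an epoch with ceiling division, and tick() later shrinks the final batch so N need not be a multiple of batchSize. A minimal standalone program showing just that arithmetic; it is not DeepCL source, and the N/batchSize values are illustrative:]

    #include <algorithm>
    #include <iostream>

    int main() {
        const int N = 1000;         // total examples (illustrative)
        const int batchSize = 128;  // examples per batch (illustrative)
        // ceiling division, as in the constructor: (N + batchSize - 1) / batchSize
        const int numBatches = (N + batchSize - 1) / batchSize;
        for(int batch = 0; batch < numBatches; batch++) {
            const int batchStart = batch * batchSize;
            // the last batch may be short, as in tick(): thisBatchSize = N - batchStart
            const int thisBatchSize = std::min(batchSize, N - batchStart);
            std::cout << "batch " << batch << ": start=" << batchStart
                      << " size=" << thisBatchSize << "\n";
        }
        return 0;
    }

[BatchProcess::run, BatchProcessv2::run, and Batcher2 in this patch all repeat the same pattern.]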
PUBLICAPI int Batcher::getNextBatch() { - if( epochDone ) { + if(epochDone) { return 0; } else { return nextBatch; @@ -66,14 +66,14 @@ PUBLICAPI VIRTUAL int Batcher::getN() { PUBLICAPI VIRTUAL bool Batcher::getEpochDone() { return epochDone; } -VIRTUAL void Batcher::setBatchState( int nextBatch, int numRight, float loss ) { +VIRTUAL void Batcher::setBatchState(int nextBatch, int numRight, float loss) { this->nextBatch = nextBatch; this->numRight = numRight; this->loss = loss; } -VIRTUAL void Batcher::setN( int N ) { +VIRTUAL void Batcher::setN(int N) { this->N = N; - this->numBatches = (N + batchSize - 1 ) / batchSize; + this->numBatches = (N + batchSize - 1) / batchSize; } /// \brief processes one single batch of data /// @@ -81,32 +81,32 @@ VIRTUAL void Batcher::setN( int N ) { /// /// if most recent epoch has finished, then resets, and starts a new /// set of learning -PUBLICAPI bool Batcher::tick( int epoch ) { +PUBLICAPI bool Batcher::tick(int epoch) { // cout << "Batcher::tick epochDone=" << epochDone << " batch=" << nextBatch << endl; // updateVars(); - if( epochDone ) { + if(epochDone) { reset(); } int batch = nextBatch; // std::cout << "BatchLearner.tick() batch=" << batch << std::endl; int batchStart = batch * batchSize; int thisBatchSize = batchSize; - if( batch == numBatches - 1 ) { + if(batch == numBatches - 1) { thisBatchSize = N - batchStart; } // std::cout << "batchSize=" << batchSize << " thisBatchSize=" << thisBatchSize << " batch=" << batch << // " batchStart=" << batchStart << " data=" << (void *)data << " labels=" << labels << // std::endl; - net->setBatchSize( thisBatchSize ); - internalTick( epoch, &(data[ batchStart * inputCubeSize ]), &(labels[batchStart]) ); -// netAction->run( net, &(data[ batchStart * inputCubeSize ]), &(labels[batchStart]) ); - float thisLoss = net->calcLossFromLabels( &(labels[batchStart]) ); - int thisNumRight = net->calcNumRight( &(labels[batchStart]) ); + net->setBatchSize(thisBatchSize); + internalTick(epoch, &(data[ batchStart * inputCubeSize ]), &(labels[batchStart])); +// netAction->run(net, &(data[ batchStart * inputCubeSize ]), &(labels[batchStart])); + float thisLoss = net->calcLossFromLabels(&(labels[batchStart])); + int thisNumRight = net->calcNumRight(&(labels[batchStart])); // std::cout << "thisloss " << thisLoss << " thisnumright " << thisNumRight << std::endl; loss += thisLoss; numRight += thisNumRight; nextBatch++; - if( nextBatch == numBatches ) { + if(nextBatch == numBatches) { epochDone = true; } return !epochDone; @@ -115,45 +115,45 @@ PUBLICAPI bool Batcher::tick( int epoch ) { /// /// could be one batch of learning, or one batch of forward propagation /// (for test/prediction), for example -PUBLICAPI EpochResult Batcher::run( int epoch ) { - if( data == 0 ) { +PUBLICAPI EpochResult Batcher::run(int epoch) { + if(data == 0) { throw runtime_error("Batcher: no data set"); } - if( labels == 0 ) { + if(labels == 0) { throw runtime_error("Batcher: no labels set"); } - if( epochDone ) { + if(epochDone) { reset(); } - while( !epochDone ) { - tick( epoch ); + while(!epochDone) { + tick(epoch); } - EpochResult epochResult( loss, numRight ); + EpochResult epochResult(loss, numRight); return epochResult; } -LearnBatcher::LearnBatcher( Trainer *trainer, Trainable *net, - int batchSize, int N, float *data, int const*labels ) : - Batcher( net, batchSize, N, data, labels ), - trainer( trainer ) { +LearnBatcher::LearnBatcher(Trainer *trainer, Trainable *net, + int batchSize, int N, float *data, int const*labels) : + Batcher(net, 
batchSize, N, data, labels), + trainer(trainer) { } -VIRTUAL void LearnBatcher::internalTick( int epoch, float const*batchData, int const*batchLabels) { +VIRTUAL void LearnBatcher::internalTick(int epoch, float const*batchData, int const*batchLabels) { // cout << "LearnBatcher learningRate=" << learningRate << " batchdata=" << (void *)batchData << // " batchLabels=" << batchLabels << endl; - TrainingContext context( epoch, nextBatch ); - trainer->trainFromLabels( net, &context, batchData, batchLabels ); + TrainingContext context(epoch, nextBatch); + trainer->trainFromLabels(net, &context, batchData, batchLabels); } NetActionBatcher::NetActionBatcher(Trainable *net, int batchSize, int N, float *data, int const*labels, NetAction *netAction) : - Batcher( net, batchSize, N, data, labels ), - netAction( netAction ) { + Batcher(net, batchSize, N, data, labels), + netAction(netAction) { } -void NetActionBatcher::internalTick( int epoch, float const*batchData, int const*batchLabels ) { - netAction->run( this->net, epoch, nextBatch, batchData, batchLabels ); +void NetActionBatcher::internalTick(int epoch, float const*batchData, int const*batchLabels) { + netAction->run(this->net, epoch, nextBatch, batchData, batchLabels); } -ForwardBatcher::ForwardBatcher(Trainable *net, int batchSize, int N, float *data, int const*labels ) : - Batcher( net, batchSize, N, data, labels ) { +ForwardBatcher::ForwardBatcher(Trainable *net, int batchSize, int N, float *data, int const*labels) : + Batcher(net, batchSize, N, data, labels) { } -void ForwardBatcher::internalTick( int epoch, float const*batchData, int const*batchLabels) { - this->net->forward( batchData ); +void ForwardBatcher::internalTick(int epoch, float const*batchData, int const*batchLabels) { + this->net->forward(batchData); } diff --git a/src/batch/Batcher.h b/src/batch/Batcher.h index 6d9dd5c2..84270b53 100644 --- a/src/batch/Batcher.h +++ b/src/batch/Batcher.h @@ -44,14 +44,14 @@ class DeepCL_EXPORT Batcher { float loss; public: - virtual void internalTick( int epoch, float const*batchData, int const*batchLabels) = 0; + virtual void internalTick(int epoch, float const*batchData, int const*batchLabels) = 0; // [[[cog // import cog_addheaders // cog_addheaders.add() // ]]] // generated, using cog: - PUBLICAPI Batcher(Trainable *net, int batchSize, int N, float *data, int const*labels ); + PUBLICAPI Batcher(Trainable *net, int batchSize, int N, float *data, int const*labels); VIRTUAL ~Batcher(); PUBLICAPI void reset(); PUBLICAPI int getNextBatch(); @@ -59,10 +59,10 @@ class DeepCL_EXPORT Batcher { PUBLICAPI VIRTUAL int getNumRight(); PUBLICAPI VIRTUAL int getN(); PUBLICAPI VIRTUAL bool getEpochDone(); - VIRTUAL void setBatchState( int nextBatch, int numRight, float loss ); - VIRTUAL void setN( int N ); - PUBLICAPI bool tick( int epoch ); - PUBLICAPI EpochResult run( int epoch ); + VIRTUAL void setBatchState(int nextBatch, int numRight, float loss); + VIRTUAL void setN(int N); + PUBLICAPI bool tick(int epoch); + PUBLICAPI EpochResult run(int epoch); // [[[end]]] }; @@ -72,9 +72,9 @@ class DeepCL_EXPORT LearnBatcher : public Batcher { Trainer *trainer; // NOT delete TrainingContext *context; // NOT delete - LearnBatcher( Trainer *trainer, - Trainable *net, int batchSize, int N, float *data, int const*labels ); - virtual void internalTick( int epoch, float const*batchData, int const*batchLabels); + LearnBatcher(Trainer *trainer, + Trainable *net, int batchSize, int N, float *data, int const*labels); + virtual void internalTick(int epoch, float 
const*batchData, int const*batchLabels); }; //class DeepCL_EXPORT LearnFromExpectedBatcher : public Batcher { @@ -82,23 +82,23 @@ class DeepCL_EXPORT LearnBatcher : public Batcher { // Trainer *trainer; // NOT delete // TrainingContext *context; // NOT delete -// LearnFromExpectedBatcher( Trainer *trainer, -// Trainable *net, int batchSize, int N, float *data, float *expectedOutputs ); -// virtual void internalTick( int epoch, float const*batchData, float *expectedOutputs ); +// LearnFromExpectedBatcher(Trainer *trainer, +// Trainable *net, int batchSize, int N, float *data, float *expectedOutputs); +// virtual void internalTick(int epoch, float const*batchData, float *expectedOutputs); //}; class DeepCL_EXPORT NetActionBatcher : public Batcher { public: NetAction * netAction; NetActionBatcher(Trainable *net, int batchSize, int N, float *data, int const*labels, NetAction * netAction); - virtual void internalTick( int epoch, float const*batchData, int const*batchLabels ); + virtual void internalTick(int epoch, float const*batchData, int const*batchLabels); }; class DeepCL_EXPORT ForwardBatcher : public Batcher { public: ForwardBatcher(Trainable *net, int batchSize, int N, float *data, int const*labels); - virtual void internalTick( int epoch, float const*batchData, int const*batchLabels); + virtual void internalTick(int epoch, float const*batchData, int const*batchLabels); }; diff --git a/src/batch/Batcher2.cpp b/src/batch/Batcher2.cpp index 2a1baa56..00fd4f91 100644 --- a/src/batch/Batcher2.cpp +++ b/src/batch/Batcher2.cpp @@ -21,7 +21,7 @@ using namespace std; /// \brief constructor: pass in data to process, along with labels, network, ... Batcher2::Batcher2(Trainable *net, NetAction2 *action, int batchSize, int N, - InputData *inputData, OutputData *outputData ) : + InputData *inputData, OutputData *outputData) : net(net), action(action), batchSize(batchSize), @@ -30,7 +30,7 @@ Batcher2::Batcher2(Trainable *net, NetAction2 *action, outputData(outputData) { // inputCubeSize = net->getInputCubeSize(); - numBatches = ( N + batchSize - 1 ) / batchSize; + numBatches = (N + batchSize - 1) / batchSize; reset(); } VIRTUAL Batcher2::~Batcher2() { @@ -44,7 +44,7 @@ void Batcher2::reset() { } /// \brief what is the index of the next batch to process? 
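[Editor's sketch — Batcher2 below repeats the reset/tick/run protocol already shown for Batcher: run() drives tick() until epochDone, and a finished epoch is implicitly reset on the next call. A compilable sketch of just that protocol, with the per-batch work stubbed out; MiniBatcher is a hypothetical name, not a DeepCL class:]

    #include <iostream>

    struct MiniBatcher {
        int N, batchSize, numBatches, nextBatch;
        bool epochDone;
        MiniBatcher(int N, int batchSize) :
            N(N), batchSize(batchSize),
            numBatches((N + batchSize - 1) / batchSize) {
            reset();
        }
        void reset() { nextBatch = 0; epochDone = false; }
        bool tick() {                  // process one batch; false once epoch done
            if(epochDone) reset();
            // ... internalTick(...) would run the net on batch `nextBatch` here ...
            nextBatch++;
            if(nextBatch == numBatches) epochDone = true;
            return !epochDone;
        }
        void run() {                   // drive one whole epoch
            if(epochDone) reset();
            while(!epochDone) tick();
        }
    };

    int main() {
        MiniBatcher batcher(1000, 128);
        batcher.run();                 // ticks all 8 batches of the epoch
        std::cout << "epochDone=" << batcher.epochDone << "\n";
        return 0;
    }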
int Batcher2::getNextBatch() { - if( epochDone ) { + if(epochDone) { return 0; } else { return nextBatch; @@ -66,9 +66,9 @@ VIRTUAL int Batcher2::getN() { VIRTUAL bool Batcher2::getEpochDone() { return epochDone; } -VIRTUAL void Batcher2::setN( int N ) { +VIRTUAL void Batcher2::setN(int N) { this->N = N; - this->numBatches = (N + batchSize - 1 ) / batchSize; + this->numBatches = (N + batchSize - 1) / batchSize; } /// \brief processes one single batch of data @@ -77,57 +77,57 @@ VIRTUAL void Batcher2::setN( int N ) { /// /// if most recent epoch has finished, then resets, and starts a new /// set of learning -bool Batcher2::tick( int epoch ) { +bool Batcher2::tick(int epoch) { // cout << "Batcher2::tick epochDone=" << epochDone << " batch=" << nextBatch << endl; // updateVars(); - if( epochDone ) { + if(epochDone) { reset(); } int batch = nextBatch; // std::cout << "BatchLearner.tick() batch=" << batch << std::endl; int batchStart = batch * batchSize; int thisBatchSize = batchSize; - if( batch == numBatches - 1 ) { + if(batch == numBatches - 1) { thisBatchSize = N - batchStart; } // std::cout << "batchSize=" << batchSize << " thisBatchSize=" << thisBatchSize << " batch=" << batch << // " batchStart=" << batchStart << " data=" << (void *)data << " labels=" << labels << // std::endl; - net->setBatchSize( thisBatchSize ); - internalTick( epoch, inputData->slice( batchStart ), outputData->slice( batchStart ) ); + net->setBatchSize(thisBatchSize); + internalTick(epoch, inputData->slice(batchStart), outputData->slice(batchStart) ); -// float thisLoss = net->calcLossFromLabels( &(labels[batchStart]) ); -// int thisNumRight = net->calcNumRight( &(labels[batchStart]) ); +// float thisLoss = net->calcLossFromLabels(&(labels[batchStart])); +// int thisNumRight = net->calcNumRight(&(labels[batchStart])); // std::cout << "thisloss " << thisLoss << " thisnumright " << thisNumRight << std::endl; // loss += thisLoss; // numRight += thisNumRight; nextBatch++; - if( nextBatch == numBatches ) { + if(nextBatch == numBatches) { epochDone = true; } return !epochDone; } -VIRTUAL void Batcher2::internalTick( int epoch, InputData *inputData, OutputData *outputData ) { - action->run( net, epoch, nextBatch, inputData, outputData ); +VIRTUAL void Batcher2::internalTick(int epoch, InputData *inputData, OutputData *outputData) { + action->run(net, epoch, nextBatch, inputData, outputData); } /// \brief runs batch once, for currently loaded data /// /// could be one batch of learning, or one batch of forward propagation /// (for test/prediction), for example -void Batcher2::run( int epoch ) { -// if( data == 0 ) { +void Batcher2::run(int epoch) { +// if(data == 0) { // throw runtime_error("Batcher2: no data set"); // } -// if( labels == 0 ) { +// if(labels == 0) { // throw runtime_error("Batcher2: no labels set"); // } - if( epochDone ) { + if(epochDone) { reset(); } - while( !epochDone ) { - tick( epoch ); + while(!epochDone) { + tick(epoch); } } diff --git a/src/batch/Batcher2.h b/src/batch/Batcher2.h index 9815fb84..8d418a73 100644 --- a/src/batch/Batcher2.h +++ b/src/batch/Batcher2.h @@ -50,16 +50,16 @@ class DeepCL_EXPORT Batcher2 { // generated, using cog: Batcher2(Trainable *net, NetAction2 *action, int batchSize, int N, - InputData *inputData, OutputData *outputData ); + InputData *inputData, OutputData *outputData); VIRTUAL ~Batcher2(); void reset(); int getNextBatch(); VIRTUAL int getN(); VIRTUAL bool getEpochDone(); - VIRTUAL void setN( int N ); - bool tick( int epoch ); - VIRTUAL void internalTick( int epoch, 
InputData *inputData, OutputData *outputData ); - void run( int epoch ); + VIRTUAL void setN(int N); + bool tick(int epoch); + VIRTUAL void internalTick(int epoch, InputData *inputData, OutputData *outputData); + void run(int epoch); // [[[end]]] }; @@ -73,10 +73,10 @@ class DeepCL_EXPORT LearnBatcher2 : public Batcher2 { LearnBatcher2(Trainable *net, Trainer *trainer, int batchSize, int N, InputData *inputData, OutputData *outputData) : - Batcher2( net, &action, batchSize, N, inputData, outputData ), - action( trainer ) { + Batcher2(net, &action, batchSize, N, inputData, outputData), + action(trainer) { } - void setBatchState( int nextBatch, int numRight, float loss ) { + void setBatchState(int nextBatch, int numRight, float loss) { this->nextBatch = nextBatch; this->epochNumRight = numRight; this->epochLoss = loss; diff --git a/src/batch/EpochMaker.cpp b/src/batch/EpochMaker.cpp index 02ce6841..543492db 100644 --- a/src/batch/EpochMaker.cpp +++ b/src/batch/EpochMaker.cpp @@ -15,41 +15,41 @@ using namespace std; -float EpochMaker::run( int epoch ) { - if( _labels != 0 ) { +float EpochMaker::run(int epoch) { + if(_labels != 0) { throw runtime_error("should not provide labels if using Epoch::run"); } - if( _expectedOutputs == 0 ) { + if(_expectedOutputs == 0) { throw runtime_error("must provide expectedOutputs if using runWithCalcTrainingAccuracy"); } - InputData input( net->getInputCubeSize(), _inputData ); - ExpectedData output( net->getOutputCubeSize(), _expectedOutputs ); - LearnBatcher2 learnBatcher( net, trainer, _batchSize, _numExamples, - &input, &output ); - learnBatcher.run( epoch ); + InputData input(net->getInputCubeSize(), _inputData); + ExpectedData output(net->getOutputCubeSize(), _expectedOutputs); + LearnBatcher2 learnBatcher(net, trainer, _batchSize, _numExamples, + &input, &output); + learnBatcher.run(epoch); return learnBatcher.getEpochLoss(); } -//float EpochMaker::runWithCalcTrainingAccuracy( int *p_numRight ) { -// if( _expectedOutputs == 0 ) { +//float EpochMaker::runWithCalcTrainingAccuracy(int *p_numRight) { +// if(_expectedOutputs == 0) { // throw runtime_error("must provide expectedOutputs if using Epoch::runWithCalcTrainingAccuracy"); // } -// if( _expectedOutputs == 0 ) { +// if(_expectedOutputs == 0) { // throw runtime_error("must provide labels if using Epoch::runWithCalcTrainingAccuracy"); // } -// return net->doEpochWithCalcTrainingAccuracy( _learningRate, _batchSize, _numExamples, _inputData, _expectedOutputs, _labels, p_numRight ); +// return net->doEpochWithCalcTrainingAccuracy(_learningRate, _batchSize, _numExamples, _inputData, _expectedOutputs, _labels, p_numRight); //} -//float EpochMaker::runFromLabels( int *p_numRight ) { -// if( _expectedOutputs != 0 ) { +//float EpochMaker::runFromLabels(int *p_numRight) { +// if(_expectedOutputs != 0) { // throw runtime_error("should not provide expectedOutputs if using Epoch::runFromLabels"); // } -// if( _labels == 0 ) { +// if(_labels == 0) { // throw runtime_error("must provide labels if using Epoch::runFromLabels"); // } -// BatchLearner batchLearner( net ); -// EpochResult epochResult = batchLearner.runEpochFromLabels( _learningRate, _batchSize, _numExamples, _inputData, _labels ); +// BatchLearner batchLearner(net); +// EpochResult epochResult = batchLearner.runEpochFromLabels(_learningRate, _batchSize, _numExamples, _inputData, _labels); // *p_numRight = epochResult.numRight; // return epochResult.loss; //} diff --git a/src/batch/EpochMaker.h b/src/batch/EpochMaker.h index 1bb3caa9..cae16664 100644 --- 
a/src/batch/EpochMaker.h +++ b/src/batch/EpochMaker.h @@ -19,8 +19,8 @@ class DeepCL_EXPORT EpochMaker { float *_expectedOutputs; int const*_labels; public: - EpochMaker( NeuralNet *net, Trainer *trainer ) { - memset( this, 0, sizeof(EpochMaker) ); + EpochMaker(NeuralNet *net, Trainer *trainer) { + memset(this, 0, sizeof(EpochMaker)); _expectedOutputs = 0; _labels = 0; this->net = net; @@ -51,8 +51,8 @@ class DeepCL_EXPORT EpochMaker { this->_labels = labels; return this; } - float run( int epoch ); - float runWithCalcTrainingAccuracy( int epoch, int *p_numRight); - float runFromLabels( int epoch, int *p_numRight); + float run(int epoch); + float runWithCalcTrainingAccuracy(int epoch, int *p_numRight); + float runFromLabels(int epoch, int *p_numRight); }; diff --git a/src/batch/NetAction.cpp b/src/batch/NetAction.cpp index 120b78e9..4ab9d56b 100644 --- a/src/batch/NetAction.cpp +++ b/src/batch/NetAction.cpp @@ -16,21 +16,21 @@ using namespace std; #define STATIC #define VIRTUAL -void NetLearnLabeledAction::run( Trainable *net, int epoch, int batch, float const*const batchData, int const*const batchLabels ) { +void NetLearnLabeledAction::run(Trainable *net, int epoch, int batch, float const*const batchData, int const*const batchLabels) { // cout << "NetLearnLabeledBatch learningrate=" << learningRate << endl; - TrainingContext context( epoch, batch ); - trainer->trainFromLabels( net, &context, batchData, batchLabels ); + TrainingContext context(epoch, batch); + trainer->trainFromLabels(net, &context, batchData, batchLabels); } -void NetForwardAction::run( Trainable *net, int epoch, int batch, float const*const batchData, int const*const batchLabels ) { +void NetForwardAction::run(Trainable *net, int epoch, int batch, float const*const batchData, int const*const batchLabels) { // cout << "NetForwardBatch" << endl; - net->forward( batchData ); -// trainer->train( net, batchData, batchLabels ); + net->forward(batchData); +// trainer->train(net, batchData, batchLabels); } -//void NetBackpropAction::run( Trainable *net, float const*const batchData, int const*const batchLabels ) { +//void NetBackpropAction::run(Trainable *net, float const*const batchData, int const*const batchLabels) { //// cout << "NetBackpropBatch learningrate=" << learningRate << endl; -// net->backwardFromLabels( learningRate, batchLabels ); +// net->backwardFromLabels(learningRate, batchLabels); //} diff --git a/src/batch/NetAction.h b/src/batch/NetAction.h index a5b8b153..cb98ef3b 100644 --- a/src/batch/NetAction.h +++ b/src/batch/NetAction.h @@ -19,16 +19,16 @@ class DeepCL_EXPORT EpochResult { public: float loss; int numRight; - EpochResult( float loss, int numRight ) : - loss( loss ), - numRight( numRight ) { + EpochResult(float loss, int numRight) : + loss(loss), + numRight(numRight) { } }; class DeepCL_EXPORT NetAction { public: virtual ~NetAction() {} - virtual void run( Trainable *net, int epoch, int batch, float const*const batchData, int const*const batchLabels ) = 0; + virtual void run(Trainable *net, int epoch, int batch, float const*const batchData, int const*const batchLabels) = 0; }; @@ -39,10 +39,10 @@ class DeepCL_EXPORT NetLearnLabeledAction : public NetAction { // return learningRate; // } Trainer *trainer; - NetLearnLabeledAction( Trainer *trainer ) : - trainer( trainer ) { + NetLearnLabeledAction(Trainer *trainer) : + trainer(trainer) { } - virtual void run( Trainable *net, int epoch, int batch, float const*const batchData, int const*const batchLabels ); + virtual void run(Trainable *net, int epoch, int 
batch, float const*const batchData, int const*const batchLabels); }; @@ -50,7 +50,7 @@ class DeepCL_EXPORT NetForwardAction : public NetAction { public: NetForwardAction() { } - virtual void run( Trainable *net, int epoch, int batch, float const*const batchData, int const*const batchLabels ); + virtual void run(Trainable *net, int epoch, int batch, float const*const batchData, int const*const batchLabels); }; @@ -60,10 +60,10 @@ class DeepCL_EXPORT NetForwardAction : public NetAction { // float getLearningRate() { // return learningRate; // } -// NetBackpropAction( float learningRate ) : -// learningRate( learningRate ) { +// NetBackpropAction(float learningRate) : +// learningRate(learningRate) { // } -// virtual void run( Trainable *net, float const*const batchData, int const*const batchLabels ); +// virtual void run(Trainable *net, float const*const batchData, int const*const batchLabels); //}; diff --git a/src/batch/NetAction2.cpp b/src/batch/NetAction2.cpp index 853729a1..c999b85d 100644 --- a/src/batch/NetAction2.cpp +++ b/src/batch/NetAction2.cpp @@ -18,30 +18,30 @@ using namespace std; #define STATIC #define VIRTUAL -void NetLearnAction2::run( Trainable *net, int epoch, int batch, InputData *inputData, OutputData *outputData ) { +void NetLearnAction2::run(Trainable *net, int epoch, int batch, InputData *inputData, OutputData *outputData) { // cout << "NetLearnLabeledBatch learningrate=" << learningRate << endl; - TrainingContext context( epoch, batch ); - ExpectedData *expected = dynamic_cast< ExpectedData * >( outputData ); - LabeledData *labeled = dynamic_cast< LabeledData * >( outputData ); + TrainingContext context(epoch, batch); + ExpectedData *expected = dynamic_cast< ExpectedData * >(outputData); + LabeledData *labeled = dynamic_cast< LabeledData * >(outputData); BatchResult batchResult; - if( expected != 0 ) { - batchResult = trainer->train( net, &context, inputData->inputs, expected->expected ); - } else if( labeled != 0 ) { - batchResult = trainer->trainFromLabels( net, &context, inputData->inputs, labeled->labels ); + if(expected != 0) { + batchResult = trainer->train(net, &context, inputData->inputs, expected->expected); + } else if(labeled != 0) { + batchResult = trainer->trainFromLabels(net, &context, inputData->inputs, labeled->labels); } epochLoss += batchResult.loss; epochNumRight += batchResult.numRight; } -void NetForwardAction2::run( Trainable *net, int epoch, int batch, InputData *inputData, OutputData *outputData ) { +void NetForwardAction2::run(Trainable *net, int epoch, int batch, InputData *inputData, OutputData *outputData) { // cout << "NetForwardBatch" << endl; - net->forward( inputData->inputs ); -// trainer->train( net, batchData, batchLabels ); + net->forward(inputData->inputs); +// trainer->train(net, batchData, batchLabels); } -//void NetBackpropAction::run( Trainable *net, InputData *inputData, OutputData *outputData ) { +//void NetBackpropAction::run(Trainable *net, InputData *inputData, OutputData *outputData) { //// cout << "NetBackpropBatch learningrate=" << learningRate << endl; -// net->backwardFromLabels( learningRate, batchLabels ); +// net->backwardFromLabels(learningRate, batchLabels); //} diff --git a/src/batch/NetAction2.h b/src/batch/NetAction2.h index b39acf60..30d9e03d 100644 --- a/src/batch/NetAction2.h +++ b/src/batch/NetAction2.h @@ -26,7 +26,7 @@ class Trainer; class DeepCL_EXPORT NetAction2 { public: virtual ~NetAction2() {} - virtual void run( Trainable *net, int epoch, int batch, InputData *inputData, OutputData *outputData ) 
= 0; + virtual void run(Trainable *net, int epoch, int batch, InputData *inputData, OutputData *outputData) = 0; }; class DeepCL_EXPORT NetLearnAction2 : public NetAction2 { @@ -34,12 +34,12 @@ class DeepCL_EXPORT NetLearnAction2 : public NetAction2 { Trainer *trainer; float epochLoss; int epochNumRight; - NetLearnAction2( Trainer *trainer ) : - trainer( trainer ) { + NetLearnAction2(Trainer *trainer) : + trainer(trainer) { epochLoss = 0; epochNumRight = 0; } - virtual void run( Trainable *net, int epoch, int batch, InputData *inputData, OutputData *outputData ); + virtual void run(Trainable *net, int epoch, int batch, InputData *inputData, OutputData *outputData); float getEpochLoss() { return epochLoss; } @@ -52,6 +52,6 @@ class DeepCL_EXPORT NetForwardAction2 : public NetAction2 { public: NetForwardAction2() { } - virtual void run( Trainable *net, int epoch, int batch, InputData *inputData, OutputData *outputData ); + virtual void run(Trainable *net, int epoch, int batch, InputData *inputData, OutputData *outputData); }; diff --git a/src/batch/NetLearner.cpp b/src/batch/NetLearner.cpp index e82f3baa..92014f7d 100644 --- a/src/batch/NetLearner.cpp +++ b/src/batch/NetLearner.cpp @@ -22,11 +22,11 @@ using namespace std; #define STATIC #define VIRTUAL -PUBLICAPI NetLearner::NetLearner( Trainer *trainer, Trainable *net, +PUBLICAPI NetLearner::NetLearner(Trainer *trainer, Trainable *net, int Ntrain, float *trainData, int *trainLabels, int Ntest, float *testData, int *testLabels, - int batchSize ) : - net( net ) + int batchSize) : + net(net) { // annealLearningRate = 1.0f; numEpochs = 12; @@ -34,20 +34,20 @@ PUBLICAPI NetLearner::NetLearner( Trainer *trainer, Trainable *net, dumpTimings = false; learningDone = false; - trainBatcher = new LearnBatcher( trainer, net, batchSize, Ntrain, trainData, trainLabels ); - testBatcher = new ForwardBatcher( net, batchSize, Ntest, testData, testLabels ); + trainBatcher = new LearnBatcher(trainer, net, batchSize, Ntrain, trainData, trainLabels); + testBatcher = new ForwardBatcher(net, batchSize, Ntest, testData, testLabels); } VIRTUAL NetLearner::~NetLearner() { delete trainBatcher; delete testBatcher; } -VIRTUAL void NetLearner::setSchedule( int numEpochs ) { - setSchedule( numEpochs, 0 ); +VIRTUAL void NetLearner::setSchedule(int numEpochs) { + setSchedule(numEpochs, 0); } -VIRTUAL void NetLearner::setDumpTimings( bool dumpTimings ) { +VIRTUAL void NetLearner::setDumpTimings(bool dumpTimings) { this->dumpTimings = dumpTimings; } -VIRTUAL void NetLearner::setSchedule( int numEpochs, int nextEpoch ) { +VIRTUAL void NetLearner::setSchedule(int numEpochs, int nextEpoch) { this->numEpochs = numEpochs; this->nextEpoch = nextEpoch; } @@ -55,38 +55,38 @@ PUBLICAPI VIRTUAL void NetLearner::reset() { // cout << "NetLearner::reset()" << endl; learningDone = false; nextEpoch = 0; -// net->setTraining( true ); +// net->setTraining(true); trainBatcher->reset(); testBatcher->reset(); timer.lap(); } VIRTUAL void NetLearner::postEpochTesting() { - if( dumpTimings ) { + if(dumpTimings) { StatefulTimer::dump(true); } // cout << "-----------------------" << endl; cout << endl; - timer.timeCheck("after epoch " + toString(nextEpoch+1) ); + timer.timeCheck("after epoch " + toString(nextEpoch+1)); // cout << "annealed learning rate: " << trainBatcher->getLearningRate() << cout << " training loss: " << trainBatcher->getLoss() << endl; cout << " train accuracy: " << trainBatcher->getNumRight() << "/" << trainBatcher->getN() << " " << (trainBatcher->getNumRight() * 100.0f/ 
trainBatcher->getN()) << "%" << std::endl; - net->setTraining( false ); + net->setTraining(false); testBatcher->run(nextEpoch); cout << "test accuracy: " << testBatcher->getNumRight() << "/" << testBatcher->getN() << " " << - (testBatcher->getNumRight() * 100.0f / testBatcher->getN() ) << "%" << endl; + (testBatcher->getNumRight() * 100.0f / testBatcher->getN()) << "%" << endl; timer.timeCheck("after tests"); } PUBLICAPI VIRTUAL bool NetLearner::tickBatch() { // just tick one learn batch, once all done, then run testing etc // int epoch = nextEpoch; -// trainBatcher->setLearningRate( learningRate * pow( annealLearningRate, epoch ) ); - net->setTraining( true ); +// trainBatcher->setLearningRate(learningRate * pow(annealLearningRate, epoch) ); + net->setTraining(true); trainBatcher->tick(nextEpoch); // returns false once all learning done (all epochs) - if( trainBatcher->getEpochDone() ) { + if(trainBatcher->getEpochDone()) { postEpochTesting(); nextEpoch++; } // cout << "check learningDone nextEpoch=" << nextEpoch << " numEpochs=" << numEpochs << endl; - if( nextEpoch == numEpochs ) { + if(nextEpoch == numEpochs) { // cout << "setting learningdone to true" << endl; learningDone = true; } @@ -110,8 +110,8 @@ PUBLICAPI VIRTUAL int NetLearner::getBatchNumRight() { PUBLICAPI VIRTUAL float NetLearner::getBatchLoss() { return trainBatcher->getLoss(); } -VIRTUAL void NetLearner::setBatchState( int nextBatch, int numRight, float loss ) { - trainBatcher->setBatchState( nextBatch, numRight, loss ); +VIRTUAL void NetLearner::setBatchState(int nextBatch, int numRight, float loss) { + trainBatcher->setBatchState(nextBatch, numRight, loss); // trainBatcher->numRight = numRight; // trainBatcher->loss = loss; } @@ -119,40 +119,40 @@ PUBLICAPI VIRTUAL bool NetLearner::tickEpoch() { // int epoch = nextEpoch; // cout << "NetLearner.tickEpoch epoch=" << epoch << " learningDone=" << learningDone << " epochDone=" << trainBatcher->getEpochDone() << endl; // cout << "numEpochs=" << numEpochs << endl; - if( trainBatcher->getEpochDone() ) { + if(trainBatcher->getEpochDone()) { trainBatcher->reset(); } - while(!trainBatcher->getEpochDone() ) { + while(!trainBatcher->getEpochDone()) { tickBatch(); } return !learningDone; } PUBLICAPI VIRTUAL void NetLearner::run() { - if( learningDone ) { + if(learningDone) { reset(); } - while( !learningDone ) { + while(!learningDone) { tickEpoch(); } } PUBLICAPI VIRTUAL bool NetLearner::isLearningDone() { return learningDone; } -//PUBLICAPI VIRTUAL void NetLearner::setLearningRate( float learningRate ) { -// this->setLearningRate( learningRate, 1.0f ); +//PUBLICAPI VIRTUAL void NetLearner::setLearningRate(float learningRate) { +// this->setLearningRate(learningRate, 1.0f); //} -//VIRTUAL void NetLearner::setLearningRate( float learningRate, float annealLearningRate ) { +//VIRTUAL void NetLearner::setLearningRate(float learningRate, float annealLearningRate) { // this->learningRate = learningRate; // this->annealLearningRate = annealLearningRate; //} -//PUBLICAPI VIRTUAL void NetLearner::learn( float learningRate ) { -// learn( learningRate, 1.0f ); +//PUBLICAPI VIRTUAL void NetLearner::learn(float learningRate) { +// learn(learningRate, 1.0f); //} -//VIRTUAL void NetLearner::learn( float learningRate, float annealLearningRate ) { -// setLearningRate( learningRate, annealLearningRate ); +//VIRTUAL void NetLearner::learn(float learningRate, float annealLearningRate) { +// setLearningRate(learningRate, annealLearningRate); // run(); //} -//VIRTUAL void NetLearner::setTrainer( Trainer 
*trainer ) { +//VIRTUAL void NetLearner::setTrainer(Trainer *trainer) { // this->trainer = trainer; //} diff --git a/src/batch/NetLearner.h b/src/batch/NetLearner.h index 8cdce40a..45fe18e0 100644 --- a/src/batch/NetLearner.h +++ b/src/batch/NetLearner.h @@ -57,14 +57,14 @@ class DeepCL_EXPORT NetLearner : public NetLearnerBase { // cog_addheaders.add() // ]]] // generated, using cog: - PUBLICAPI NetLearner( Trainer *trainer, Trainable *net, + PUBLICAPI NetLearner(Trainer *trainer, Trainable *net, int Ntrain, float *trainData, int *trainLabels, int Ntest, float *testData, int *testLabels, - int batchSize ); + int batchSize); VIRTUAL ~NetLearner(); - VIRTUAL void setSchedule( int numEpochs ); - VIRTUAL void setDumpTimings( bool dumpTimings ); - VIRTUAL void setSchedule( int numEpochs, int nextEpoch ); + VIRTUAL void setSchedule(int numEpochs); + VIRTUAL void setDumpTimings(bool dumpTimings); + VIRTUAL void setSchedule(int numEpochs, int nextEpoch); PUBLICAPI VIRTUAL void reset(); VIRTUAL void postEpochTesting(); PUBLICAPI VIRTUAL bool tickBatch(); // just tick one learn batch, once all done, then run testing etc @@ -74,7 +74,7 @@ class DeepCL_EXPORT NetLearner : public NetLearnerBase { PUBLICAPI VIRTUAL int getNTrain(); PUBLICAPI VIRTUAL int getBatchNumRight(); PUBLICAPI VIRTUAL float getBatchLoss(); - VIRTUAL void setBatchState( int nextBatch, int numRight, float loss ); + VIRTUAL void setBatchState(int nextBatch, int numRight, float loss); PUBLICAPI VIRTUAL bool tickEpoch(); PUBLICAPI VIRTUAL void run(); PUBLICAPI VIRTUAL bool isLearningDone(); diff --git a/src/batch/NetLearnerBase.h b/src/batch/NetLearnerBase.h index 8081f753..b493c4e5 100644 --- a/src/batch/NetLearnerBase.h +++ b/src/batch/NetLearnerBase.h @@ -16,13 +16,13 @@ class DeepCL_EXPORT NetLearnerBase { public: virtual ~NetLearnerBase() {} virtual bool isLearningDone() = 0; - virtual void setSchedule( int numEpochs ) = 0; - virtual void setDumpTimings( bool dumpTimings ) = 0; - virtual void setSchedule( int numEpochs, int startEpoch ) = 0; -// virtual void setLearningRate( float learningRate ) = 0; -// virtual void setLearningRate( float learningRate, float annealLearningRate ) = 0; -// virtual void learn( float learningRate ) = 0; -// virtual void learn( float learningRate, float annealLearningRate ) = 0; + virtual void setSchedule(int numEpochs) = 0; + virtual void setDumpTimings(bool dumpTimings) = 0; + virtual void setSchedule(int numEpochs, int startEpoch) = 0; +// virtual void setLearningRate(float learningRate) = 0; +// virtual void setLearningRate(float learningRate, float annealLearningRate) = 0; +// virtual void learn(float learningRate) = 0; +// virtual void learn(float learningRate, float annealLearningRate) = 0; virtual void reset() = 0; virtual bool tickEpoch() = 0; virtual bool tickBatch() = 0; @@ -32,8 +32,8 @@ class DeepCL_EXPORT NetLearnerBase { virtual int getBatchNumRight() = 0; virtual float getBatchLoss() = 0; virtual int getNTrain() = 0; - virtual void setBatchState( int batch, int numRight, float loss ) = 0; + virtual void setBatchState(int batch, int numRight, float loss) = 0; virtual void run() = 0; -// virtual void setTrainer( Trainer *trainer ) = 0; +// virtual void setTrainer(Trainer *trainer) = 0; }; diff --git a/src/batch/NetLearnerOnDemand.cpp b/src/batch/NetLearnerOnDemand.cpp index 025b5b99..c73db646 100644 --- a/src/batch/NetLearnerOnDemand.cpp +++ b/src/batch/NetLearnerOnDemand.cpp @@ -21,19 +21,19 @@ using namespace std; #define STATIC #define VIRTUAL -PUBLICAPI 
NetLearnerOnDemand::NetLearnerOnDemand( Trainer *trainer, Trainable *net, +PUBLICAPI NetLearnerOnDemand::NetLearnerOnDemand(Trainer *trainer, Trainable *net, std::string trainFilepath, int Ntrain, std::string testFilepath, int Ntest, - int fileReadBatches, int batchSize ) : - net( net ), + int fileReadBatches, int batchSize) : + net(net), learnBatcher(0), testBatcher(0) // batchSize = 128; { - learnAction = new NetLearnLabeledAction( trainer ); + learnAction = new NetLearnLabeledAction(trainer); testAction = new NetForwardAction(); - learnBatcher = new OnDemandBatcher( net, learnAction, trainFilepath, Ntrain, fileReadBatches, batchSize ); - testBatcher = new OnDemandBatcher( net, testAction, testFilepath, Ntest, fileReadBatches, batchSize ); + learnBatcher = new OnDemandBatcher(net, learnAction, trainFilepath, Ntrain, fileReadBatches, batchSize); + testBatcher = new OnDemandBatcher(net, testAction, testFilepath, Ntest, fileReadBatches, batchSize); // annealLearningRate = 1.0f; numEpochs = 12; nextEpoch = 0; @@ -41,22 +41,22 @@ PUBLICAPI NetLearnerOnDemand::NetLearnerOnDemand( Trainer *trainer, Trainable *n dumpTimings = false; } VIRTUAL NetLearnerOnDemand::~NetLearnerOnDemand() { - if( learnBatcher != 0 ) { + if(learnBatcher != 0) { delete learnBatcher; } - if( testBatcher != 0 ) { + if(testBatcher != 0) { delete testBatcher; } delete testAction; delete learnAction; } -VIRTUAL void NetLearnerOnDemand::setSchedule( int numEpochs ) { - setSchedule( numEpochs, 1 ); +VIRTUAL void NetLearnerOnDemand::setSchedule(int numEpochs) { + setSchedule(numEpochs, 1); } -VIRTUAL void NetLearnerOnDemand::setDumpTimings( bool dumpTimings ) { +VIRTUAL void NetLearnerOnDemand::setDumpTimings(bool dumpTimings) { this->dumpTimings = dumpTimings; } -VIRTUAL void NetLearnerOnDemand::setSchedule( int numEpochs, int nextEpoch ) { +VIRTUAL void NetLearnerOnDemand::setSchedule(int numEpochs, int nextEpoch) { this->numEpochs = numEpochs; this->nextEpoch = nextEpoch; } @@ -66,10 +66,10 @@ PUBLICAPI VIRTUAL bool NetLearnerOnDemand::getEpochDone() { PUBLICAPI VIRTUAL int NetLearnerOnDemand::getNextEpoch() { return nextEpoch; } -//VIRTUAL void NetLearnerOnDemand::setLearningRate( float learningRate ) { -// this->setLearningRate( learningRate, 1.0f ); +//VIRTUAL void NetLearnerOnDemand::setLearningRate(float learningRate) { +// this->setLearningRate(learningRate, 1.0f); //} -//VIRTUAL void NetLearnerOnDemand::setLearningRate( float learningRate, float annealLearningRate ) { +//VIRTUAL void NetLearnerOnDemand::setLearningRate(float learningRate, float annealLearningRate) { // this->learningRate = learningRate; // this->annealLearningRate = annealLearningRate; //} @@ -85,8 +85,8 @@ PUBLICAPI VIRTUAL int NetLearnerOnDemand::getBatchNumRight() { PUBLICAPI VIRTUAL float NetLearnerOnDemand::getBatchLoss() { return learnBatcher->getLoss(); } -VIRTUAL void NetLearnerOnDemand::setBatchState( int nextBatch, int numRight, float loss ) { - learnBatcher->setBatchState( nextBatch, numRight, loss ); +VIRTUAL void NetLearnerOnDemand::setBatchState(int nextBatch, int numRight, float loss) { + learnBatcher->setBatchState(nextBatch, numRight, loss); } PUBLICAPI VIRTUAL void NetLearnerOnDemand::reset() { timer.lap(); @@ -97,31 +97,31 @@ PUBLICAPI VIRTUAL void NetLearnerOnDemand::reset() { } VIRTUAL void NetLearnerOnDemand::postEpochTesting() { cout << "dumpTimings " << dumpTimings << endl; - if( dumpTimings ) { + if(dumpTimings) { StatefulTimer::dump(true); } // cout << "-----------------------" << endl; cout << endl; - timer.timeCheck("after 
epoch " + toString(nextEpoch + 1 ) ); + timer.timeCheck("after epoch " + toString(nextEpoch + 1) ); // cout << "annealed learning rate: " << learnAction->getLearningRate() cout << " training loss: " << learnBatcher->getLoss() << endl; cout << " train accuracy: " << learnBatcher->getNumRight() << "/" << learnBatcher->getN() << " " << (learnBatcher->getNumRight() * 100.0f/ learnBatcher->getN()) << "%" << std::endl; - testBatcher->run( nextEpoch ); -// int testNumRight = batchLearnerOnDemand.test( testFilepath, fileReadBatches, batchSize, Ntest ); - cout << "test accuracy: " << testBatcher->getNumRight() << "/" << testBatcher->getN() << " " << (testBatcher->getNumRight() * 100.0f / testBatcher->getN() ) << "%" << endl; + testBatcher->run(nextEpoch); +// int testNumRight = batchLearnerOnDemand.test(testFilepath, fileReadBatches, batchSize, Ntest); + cout << "test accuracy: " << testBatcher->getNumRight() << "/" << testBatcher->getN() << " " << (testBatcher->getNumRight() * 100.0f / testBatcher->getN()) << "%" << endl; timer.timeCheck("after tests"); } PUBLICAPI VIRTUAL bool NetLearnerOnDemand::tickBatch() { // means: filebatch, not low-level batch // probalby good enough for now? // int epoch = nextEpoch; -// learnAction->learningRate = learningRate * pow( annealLearningRate, epoch ); - learnBatcher->tick( nextEpoch ); // returns false once all learning done (all epochs) - if( learnBatcher->getEpochDone() ) { +// learnAction->learningRate = learningRate * pow(annealLearningRate, epoch); + learnBatcher->tick(nextEpoch); // returns false once all learning done (all epochs) + if(learnBatcher->getEpochDone()) { postEpochTesting(); nextEpoch++; } // cout << "check learningDone nextEpoch=" << nextEpoch << " numEpochs=" << numEpochs << endl; - if( nextEpoch == numEpochs ) { + if(nextEpoch == numEpochs) { // cout << "setting learningdone to true" << endl; learningDone = true; } @@ -132,33 +132,33 @@ PUBLICAPI VIRTUAL bool NetLearnerOnDemand::tickEpoch() { // int epoch = nextEpoch; // cout << "NetLearnerOnDemand.tickEpoch epoch=" << epoch << " learningDone=" << learningDone << " epochDone=" << learnBatcher->getEpochDone() << endl; // cout << "numEpochs=" << numEpochs << endl; - if( learnBatcher->getEpochDone() ) { + if(learnBatcher->getEpochDone()) { learnBatcher->reset(); } - while(!learnBatcher->getEpochDone() ) { + while(!learnBatcher->getEpochDone()) { tickBatch(); } return !learningDone; } PUBLICAPI VIRTUAL void NetLearnerOnDemand::run() { - if( learningDone ) { + if(learningDone) { reset(); } - while( !learningDone ) { + while(!learningDone) { tickEpoch(); } } PUBLICAPI VIRTUAL bool NetLearnerOnDemand::isLearningDone() { return learningDone; } -//PUBLICAPI VIRTUAL void NetLearnerOnDemand::learn( float learningRate ) { -// learn( learningRate, 1.0f ); +//PUBLICAPI VIRTUAL void NetLearnerOnDemand::learn(float learningRate) { +// learn(learningRate, 1.0f); //} -//VIRTUAL void NetLearnerOnDemand::learn( float learningRate, float annealLearningRate ) { -// setLearningRate( learningRate, annealLearningRate ); +//VIRTUAL void NetLearnerOnDemand::learn(float learningRate, float annealLearningRate) { +// setLearningRate(learningRate, annealLearningRate); // run(); //} -//VIRTUAL void NetLearnerOnDemand::setTrainer( Trainer *trainer ) { +//VIRTUAL void NetLearnerOnDemand::setTrainer(Trainer *trainer) { // this->trainer = trainer; //} diff --git a/src/batch/NetLearnerOnDemand.h b/src/batch/NetLearnerOnDemand.h index 08c51344..d751551e 100644 --- a/src/batch/NetLearnerOnDemand.h +++ 
b/src/batch/NetLearnerOnDemand.h @@ -61,21 +61,21 @@ class DeepCL_EXPORT NetLearnerOnDemand : public NetLearnerBase { // cog_addheaders.add() // ]]] // generated, using cog: - PUBLICAPI NetLearnerOnDemand( Trainer *trainer, Trainable *net, + PUBLICAPI NetLearnerOnDemand(Trainer *trainer, Trainable *net, std::string trainFilepath, int Ntrain, std::string testFilepath, int Ntest, - int fileReadBatches, int batchSize ); + int fileReadBatches, int batchSize); VIRTUAL ~NetLearnerOnDemand(); - VIRTUAL void setSchedule( int numEpochs ); - VIRTUAL void setDumpTimings( bool dumpTimings ); - VIRTUAL void setSchedule( int numEpochs, int nextEpoch ); + VIRTUAL void setSchedule(int numEpochs); + VIRTUAL void setDumpTimings(bool dumpTimings); + VIRTUAL void setSchedule(int numEpochs, int nextEpoch); PUBLICAPI VIRTUAL bool getEpochDone(); PUBLICAPI VIRTUAL int getNextEpoch(); PUBLICAPI VIRTUAL int getNextBatch(); PUBLICAPI VIRTUAL int getNTrain(); PUBLICAPI VIRTUAL int getBatchNumRight(); PUBLICAPI VIRTUAL float getBatchLoss(); - VIRTUAL void setBatchState( int nextBatch, int numRight, float loss ); + VIRTUAL void setBatchState(int nextBatch, int numRight, float loss); PUBLICAPI VIRTUAL void reset(); VIRTUAL void postEpochTesting(); PUBLICAPI VIRTUAL bool tickBatch(); // means: filebatch, not low-level batch diff --git a/src/batch/NetLearnerOnDemandv2.cpp b/src/batch/NetLearnerOnDemandv2.cpp index 3f881a88..42c35226 100644 --- a/src/batch/NetLearnerOnDemandv2.cpp +++ b/src/batch/NetLearnerOnDemandv2.cpp @@ -22,19 +22,19 @@ using namespace std; #define STATIC #define VIRTUAL -PUBLICAPI NetLearnerOnDemandv2::NetLearnerOnDemandv2( Trainer *trainer, Trainable *net, +PUBLICAPI NetLearnerOnDemandv2::NetLearnerOnDemandv2(Trainer *trainer, Trainable *net, GenericLoaderv2 *trainLoader, int Ntrain, GenericLoaderv2 *validateLoader, int Ntest, - int fileReadBatches, int batchSize ) : - net( net ), + int fileReadBatches, int batchSize) : + net(net), learnBatcher(0), testBatcher(0) // batchSize = 128; { - learnAction = new NetLearnLabeledAction( trainer ); + learnAction = new NetLearnLabeledAction(trainer); testAction = new NetForwardAction(); - learnBatcher = new OnDemandBatcherv2( net, learnAction, trainLoader, Ntrain, fileReadBatches, batchSize ); - testBatcher = new OnDemandBatcherv2( net, testAction, validateLoader, Ntest, fileReadBatches, batchSize ); + learnBatcher = new OnDemandBatcherv2(net, learnAction, trainLoader, Ntrain, fileReadBatches, batchSize); + testBatcher = new OnDemandBatcherv2(net, testAction, validateLoader, Ntest, fileReadBatches, batchSize); // annealLearningRate = 1.0f; numEpochs = 12; nextEpoch = 0; @@ -42,22 +42,22 @@ PUBLICAPI NetLearnerOnDemandv2::NetLearnerOnDemandv2( Trainer *trainer, Trainabl dumpTimings = false; } VIRTUAL NetLearnerOnDemandv2::~NetLearnerOnDemandv2() { - if( learnBatcher != 0 ) { + if(learnBatcher != 0) { delete learnBatcher; } - if( testBatcher != 0 ) { + if(testBatcher != 0) { delete testBatcher; } delete testAction; delete learnAction; } -VIRTUAL void NetLearnerOnDemandv2::setSchedule( int numEpochs ) { - setSchedule( numEpochs, 1 ); +VIRTUAL void NetLearnerOnDemandv2::setSchedule(int numEpochs) { + setSchedule(numEpochs, 1); } -VIRTUAL void NetLearnerOnDemandv2::setDumpTimings( bool dumpTimings ) { +VIRTUAL void NetLearnerOnDemandv2::setDumpTimings(bool dumpTimings) { this->dumpTimings = dumpTimings; } -VIRTUAL void NetLearnerOnDemandv2::setSchedule( int numEpochs, int nextEpoch ) { +VIRTUAL void NetLearnerOnDemandv2::setSchedule(int numEpochs, int nextEpoch) { 
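// Editor's note, illustrative only (not part of the upstream patch): this
// two-argument overload lets a partially completed run be resumed, e.g.
// setSchedule(12, 5) runs epochs 5..11 of a 12-epoch schedule; the
// one-argument overload above simply delegates to setSchedule(numEpochs, 1).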
this->numEpochs = numEpochs; this->nextEpoch = nextEpoch; } @@ -67,10 +67,10 @@ PUBLICAPI VIRTUAL bool NetLearnerOnDemandv2::getEpochDone() { PUBLICAPI VIRTUAL int NetLearnerOnDemandv2::getNextEpoch() { return nextEpoch; } -//VIRTUAL void NetLearnerOnDemandv2::setLearningRate( float learningRate ) { -// this->setLearningRate( learningRate, 1.0f ); +//VIRTUAL void NetLearnerOnDemandv2::setLearningRate(float learningRate) { +// this->setLearningRate(learningRate, 1.0f); //} -//VIRTUAL void NetLearnerOnDemandv2::setLearningRate( float learningRate, float annealLearningRate ) { +//VIRTUAL void NetLearnerOnDemandv2::setLearningRate(float learningRate, float annealLearningRate) { // this->learningRate = learningRate; // this->annealLearningRate = annealLearningRate; //} @@ -86,8 +86,8 @@ PUBLICAPI VIRTUAL int NetLearnerOnDemandv2::getBatchNumRight() { PUBLICAPI VIRTUAL float NetLearnerOnDemandv2::getBatchLoss() { return learnBatcher->getLoss(); } -VIRTUAL void NetLearnerOnDemandv2::setBatchState( int nextBatch, int numRight, float loss ) { - learnBatcher->setBatchState( nextBatch, numRight, loss ); +VIRTUAL void NetLearnerOnDemandv2::setBatchState(int nextBatch, int numRight, float loss) { + learnBatcher->setBatchState(nextBatch, numRight, loss); } PUBLICAPI VIRTUAL void NetLearnerOnDemandv2::reset() { timer.lap(); @@ -98,31 +98,31 @@ PUBLICAPI VIRTUAL void NetLearnerOnDemandv2::reset() { } VIRTUAL void NetLearnerOnDemandv2::postEpochTesting() { cout << "dumpTimings " << dumpTimings << endl; - if( dumpTimings ) { + if(dumpTimings) { StatefulTimer::dump(true); } // cout << "-----------------------" << endl; cout << endl; - timer.timeCheck("after epoch " + toString(nextEpoch + 1 ) ); + timer.timeCheck("after epoch " + toString(nextEpoch + 1) ); // cout << "annealed learning rate: " << learnAction->getLearningRate() cout << " training loss: " << learnBatcher->getLoss() << endl; cout << " train accuracy: " << learnBatcher->getNumRight() << "/" << learnBatcher->getN() << " " << (learnBatcher->getNumRight() * 100.0f/ learnBatcher->getN()) << "%" << std::endl; - testBatcher->run( nextEpoch ); -// int testNumRight = batchLearnerOnDemand.test( testFilepath, fileReadBatches, batchSize, Ntest ); - cout << "test accuracy: " << testBatcher->getNumRight() << "/" << testBatcher->getN() << " " << (testBatcher->getNumRight() * 100.0f / testBatcher->getN() ) << "%" << endl; + testBatcher->run(nextEpoch); +// int testNumRight = batchLearnerOnDemand.test(testFilepath, fileReadBatches, batchSize, Ntest); + cout << "test accuracy: " << testBatcher->getNumRight() << "/" << testBatcher->getN() << " " << (testBatcher->getNumRight() * 100.0f / testBatcher->getN()) << "%" << endl; timer.timeCheck("after tests"); } PUBLICAPI VIRTUAL bool NetLearnerOnDemandv2::tickBatch() { // means: filebatch, not low-level batch // probalby good enough for now? 
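// Illustrative sketch (editor's addition, not part of the upstream patch):
// tickBatch() lets a caller interleave its own work with training instead
// of calling run(), e.g.:
//
//     learner.reset();
//     while(!learner.isLearningDone()) {
//         learner.tickBatch();    // one file-batch; post-epoch testing runs at epoch end
//         cout << "batch loss so far: " << learner.getBatchLoss() << endl;
//     }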
// int epoch = nextEpoch; -// learnAction->learningRate = learningRate * pow( annealLearningRate, epoch ); - learnBatcher->tick( nextEpoch ); // returns false once all learning done (all epochs) - if( learnBatcher->getEpochDone() ) { +// learnAction->learningRate = learningRate * pow(annealLearningRate, epoch); + learnBatcher->tick(nextEpoch); // returns false once all learning done (all epochs) + if(learnBatcher->getEpochDone()) { postEpochTesting(); nextEpoch++; } // cout << "check learningDone nextEpoch=" << nextEpoch << " numEpochs=" << numEpochs << endl; - if( nextEpoch == numEpochs ) { + if(nextEpoch == numEpochs) { // cout << "setting learningdone to true" << endl; learningDone = true; } @@ -133,33 +133,33 @@ PUBLICAPI VIRTUAL bool NetLearnerOnDemandv2::tickEpoch() { // int epoch = nextEpoch; // cout << "NetLearnerOnDemandv2.tickEpoch epoch=" << epoch << " learningDone=" << learningDone << " epochDone=" << learnBatcher->getEpochDone() << endl; // cout << "numEpochs=" << numEpochs << endl; - if( learnBatcher->getEpochDone() ) { + if(learnBatcher->getEpochDone()) { learnBatcher->reset(); } - while(!learnBatcher->getEpochDone() ) { + while(!learnBatcher->getEpochDone()) { tickBatch(); } return !learningDone; } PUBLICAPI VIRTUAL void NetLearnerOnDemandv2::run() { - if( learningDone ) { + if(learningDone) { reset(); } - while( !learningDone ) { + while(!learningDone) { tickEpoch(); } } PUBLICAPI VIRTUAL bool NetLearnerOnDemandv2::isLearningDone() { return learningDone; } -//PUBLICAPI VIRTUAL void NetLearnerOnDemandv2::learn( float learningRate ) { -// learn( learningRate, 1.0f ); +//PUBLICAPI VIRTUAL void NetLearnerOnDemandv2::learn(float learningRate) { +// learn(learningRate, 1.0f); //} -//VIRTUAL void NetLearnerOnDemandv2::learn( float learningRate, float annealLearningRate ) { -// setLearningRate( learningRate, annealLearningRate ); +//VIRTUAL void NetLearnerOnDemandv2::learn(float learningRate, float annealLearningRate) { +// setLearningRate(learningRate, annealLearningRate); // run(); //} -//VIRTUAL void NetLearnerOnDemandv2::setTrainer( Trainer *trainer ) { +//VIRTUAL void NetLearnerOnDemandv2::setTrainer(Trainer *trainer) { // this->trainer = trainer; //} diff --git a/src/batch/NetLearnerOnDemandv2.h b/src/batch/NetLearnerOnDemandv2.h index 496992c5..aa53769c 100644 --- a/src/batch/NetLearnerOnDemandv2.h +++ b/src/batch/NetLearnerOnDemandv2.h @@ -64,21 +64,21 @@ class DeepCL_EXPORT NetLearnerOnDemandv2 : public NetLearnerBase { // cog_addheaders.add() // ]]] // generated, using cog: - PUBLICAPI NetLearnerOnDemandv2( Trainer *trainer, Trainable *net, + PUBLICAPI NetLearnerOnDemandv2(Trainer *trainer, Trainable *net, GenericLoaderv2 *trainLoader, int Ntrain, GenericLoaderv2 *validateLoader, int Ntest, - int fileReadBatches, int batchSize ); + int fileReadBatches, int batchSize); VIRTUAL ~NetLearnerOnDemandv2(); - VIRTUAL void setSchedule( int numEpochs ); - VIRTUAL void setDumpTimings( bool dumpTimings ); - VIRTUAL void setSchedule( int numEpochs, int nextEpoch ); + VIRTUAL void setSchedule(int numEpochs); + VIRTUAL void setDumpTimings(bool dumpTimings); + VIRTUAL void setSchedule(int numEpochs, int nextEpoch); PUBLICAPI VIRTUAL bool getEpochDone(); PUBLICAPI VIRTUAL int getNextEpoch(); PUBLICAPI VIRTUAL int getNextBatch(); PUBLICAPI VIRTUAL int getNTrain(); PUBLICAPI VIRTUAL int getBatchNumRight(); PUBLICAPI VIRTUAL float getBatchLoss(); - VIRTUAL void setBatchState( int nextBatch, int numRight, float loss ); + VIRTUAL void setBatchState(int nextBatch, int numRight, float loss); 
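// Illustrative usage (editor's addition, not part of the upstream patch;
// trainer, net, trainLoader and validateLoader are assumed to be already
// constructed):
//
//     NetLearnerOnDemandv2 learner(trainer, net,
//         trainLoader, Ntrain, validateLoader, Ntest,
//         fileReadBatches, batchSize);
//     learner.setSchedule(12);    // 12 epochs, starting from epoch 1
//     learner.run();              // trains each epoch, then tests on the validate set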
PUBLICAPI VIRTUAL void reset(); VIRTUAL void postEpochTesting(); PUBLICAPI VIRTUAL bool tickBatch(); // means: filebatch, not low-level batch diff --git a/src/batch/OnDemandBatcher.cpp b/src/batch/OnDemandBatcher.cpp index 4530081d..c8a3b060 100644 --- a/src/batch/OnDemandBatcher.cpp +++ b/src/batch/OnDemandBatcher.cpp @@ -18,22 +18,22 @@ using namespace std; #define STATIC #define VIRTUAL -PUBLICAPI OnDemandBatcher::OnDemandBatcher( Trainable *net, NetAction *netAction, - std::string filepath, int N, int fileReadBatches, int batchSize ) : - net( net ), - netAction( netAction ), - netActionBatcher( 0 ), - filepath( filepath ), - N( N ), - fileReadBatches( fileReadBatches ), - batchSize( batchSize ), - fileBatchSize( batchSize * fileReadBatches ), - inputCubeSize( net->getInputCubeSize() ) +PUBLICAPI OnDemandBatcher::OnDemandBatcher(Trainable *net, NetAction *netAction, + std::string filepath, int N, int fileReadBatches, int batchSize) : + net(net), + netAction(netAction), + netActionBatcher(0), + filepath(filepath), + N(N), + fileReadBatches(fileReadBatches), + batchSize(batchSize), + fileBatchSize(batchSize * fileReadBatches), + inputCubeSize(net->getInputCubeSize()) { - numFileBatches = ( N + fileBatchSize - 1 ) / fileBatchSize; + numFileBatches = (N + fileBatchSize - 1) / fileBatchSize; dataBuffer = new float[ fileBatchSize * inputCubeSize ]; labelsBuffer = new int[ fileBatchSize ]; - netActionBatcher = new NetActionBatcher( net, batchSize, fileBatchSize, dataBuffer, labelsBuffer, netAction ); + netActionBatcher = new NetActionBatcher(net, batchSize, fileBatchSize, dataBuffer, labelsBuffer, netAction); reset(); } VIRTUAL OnDemandBatcher::~OnDemandBatcher() { @@ -41,7 +41,7 @@ VIRTUAL OnDemandBatcher::~OnDemandBatcher() { delete[] dataBuffer; delete[] labelsBuffer; } -VIRTUAL void OnDemandBatcher::setBatchState( int nextBatch, int numRight, float loss ) { +VIRTUAL void OnDemandBatcher::setBatchState(int nextBatch, int numRight, float loss) { this->nextFileBatch = nextBatch / fileReadBatches; this->numRight = numRight; this->loss = loss; @@ -68,11 +68,11 @@ PUBLICAPI VIRTUAL bool OnDemandBatcher::getEpochDone() { PUBLICAPI VIRTUAL int OnDemandBatcher::getN() { return N; } -//VIRTUAL void OnDemandBatcher::setLearningRate( float learningRate ) { +//VIRTUAL void OnDemandBatcher::setLearningRate(float learningRate) { // this->learningRate = learningRate; //} -//VIRTUAL void OnDemandBatcher::setBatchSize( int batchSize ) { -// if( batchSize != this->batchSize ) { +//VIRTUAL void OnDemandBatcher::setBatchSize(int batchSize) { +// if(batchSize != this->batchSize) { // this->batchSize = batchSize; //// updateBuffers(); // } @@ -88,37 +88,37 @@ PUBLICAPI bool OnDemandBatcher::tick(int epoch) { // cout << "OnDemandBatcher::tick nextFileBatch=" << nextFileBatch << " numRight=" << numRight << // " loss=" << loss << " epochDone=" << epochDone << endl; // updateBuffers(); - if( epochDone ) { + if(epochDone) { reset(); } int fileBatch = nextFileBatch; int fileBatchStart = fileBatch * fileBatchSize; int thisFileBatchSize = fileBatchSize; - if( fileBatch == numFileBatches - 1 ) { + if(fileBatch == numFileBatches - 1) { thisFileBatchSize = N - fileBatchStart; } - netActionBatcher->setN( thisFileBatchSize ); + netActionBatcher->setN(thisFileBatchSize); // cout << "batchlearnerondemand, read data... 
filebatchstart=" << fileBatchStart << " filebatchsize=" << thisFileBatchSize << endl; - GenericLoader::load( filepath, dataBuffer, labelsBuffer, fileBatchStart, thisFileBatchSize ); - EpochResult epochResult = netActionBatcher->run( epoch ); + GenericLoader::load(filepath.c_str(), dataBuffer, labelsBuffer, fileBatchStart, thisFileBatchSize); + EpochResult epochResult = netActionBatcher->run(epoch); loss += epochResult.loss; numRight += epochResult.numRight; nextFileBatch++; - if( nextFileBatch == numFileBatches ) { + if(nextFileBatch == numFileBatches) { epochDone = true; } return !epochDone; } -PUBLICAPI EpochResult OnDemandBatcher::run( int epoch ) { +PUBLICAPI EpochResult OnDemandBatcher::run(int epoch) { // cout << "OnDemandBatcher::run() epochDone=" << epochDone << endl; - if( epochDone ) { + if(epochDone) { reset(); } - while( !epochDone ) { - tick( epoch ); + while(!epochDone) { + tick(epoch); } - EpochResult epochResult( loss, numRight ); + EpochResult epochResult(loss, numRight); return epochResult; } diff --git a/src/batch/OnDemandBatcher.h b/src/batch/OnDemandBatcher.h index 898aa30d..c838b810 100644 --- a/src/batch/OnDemandBatcher.h +++ b/src/batch/OnDemandBatcher.h @@ -59,10 +59,10 @@ class OnDemandBatcher { // cog_addheaders.add() // ]]] // generated, using cog: - PUBLICAPI OnDemandBatcher( Trainable *net, NetAction *netAction, - std::string filepath, int N, int fileReadBatches, int batchSize ); + PUBLICAPI OnDemandBatcher(Trainable *net, NetAction *netAction, + std::string filepath, int N, int fileReadBatches, int batchSize); VIRTUAL ~OnDemandBatcher(); - VIRTUAL void setBatchState( int nextBatch, int numRight, float loss ); + VIRTUAL void setBatchState(int nextBatch, int numRight, float loss); VIRTUAL int getBatchSize(); PUBLICAPI VIRTUAL int getNextFileBatch(); PUBLICAPI VIRTUAL int getNextBatch(); @@ -72,7 +72,7 @@ class OnDemandBatcher { PUBLICAPI VIRTUAL int getN(); PUBLICAPI void reset(); PUBLICAPI bool tick(int epoch); - PUBLICAPI EpochResult run( int epoch ); + PUBLICAPI EpochResult run(int epoch); // [[[end]]] }; diff --git a/src/batch/OnDemandBatcherv2.cpp b/src/batch/OnDemandBatcherv2.cpp index 0ffbd486..5b70ef51 100644 --- a/src/batch/OnDemandBatcherv2.cpp +++ b/src/batch/OnDemandBatcherv2.cpp @@ -18,22 +18,22 @@ using namespace std; #define STATIC #define VIRTUAL -PUBLICAPI OnDemandBatcherv2::OnDemandBatcherv2( Trainable *net, NetAction *netAction, - GenericLoaderv2 *loader, int N, int fileReadBatches, int batchSize ) : - net( net ), - netAction( netAction ), - netActionBatcher( 0 ), - loader( loader ), - N( N ), - fileReadBatches( fileReadBatches ), - batchSize( batchSize ), - fileBatchSize( batchSize * fileReadBatches ), - inputCubeSize( net->getInputCubeSize() ) +PUBLICAPI OnDemandBatcherv2::OnDemandBatcherv2(Trainable *net, NetAction *netAction, + GenericLoaderv2 *loader, int N, int fileReadBatches, int batchSize) : + net(net), + netAction(netAction), + netActionBatcher(0), + loader(loader), + N(N), + fileReadBatches(fileReadBatches), + batchSize(batchSize), + fileBatchSize(batchSize * fileReadBatches), + inputCubeSize(net->getInputCubeSize()) { - numFileBatches = ( N + fileBatchSize - 1 ) / fileBatchSize; + numFileBatches = (N + fileBatchSize - 1) / fileBatchSize; dataBuffer = new float[ fileBatchSize * inputCubeSize ]; labelsBuffer = new int[ fileBatchSize ]; - netActionBatcher = new NetActionBatcher( net, batchSize, fileBatchSize, dataBuffer, labelsBuffer, netAction ); + netActionBatcher = new NetActionBatcher(net, batchSize, fileBatchSize, dataBuffer, 
labelsBuffer, netAction); reset(); } VIRTUAL OnDemandBatcherv2::~OnDemandBatcherv2() { @@ -41,7 +41,7 @@ VIRTUAL OnDemandBatcherv2::~OnDemandBatcherv2() { delete[] dataBuffer; delete[] labelsBuffer; } -VIRTUAL void OnDemandBatcherv2::setBatchState( int nextBatch, int numRight, float loss ) { +VIRTUAL void OnDemandBatcherv2::setBatchState(int nextBatch, int numRight, float loss) { this->nextFileBatch = nextBatch / fileReadBatches; this->numRight = numRight; this->loss = loss; @@ -68,11 +68,11 @@ PUBLICAPI VIRTUAL bool OnDemandBatcherv2::getEpochDone() { PUBLICAPI VIRTUAL int OnDemandBatcherv2::getN() { return N; } -//VIRTUAL void OnDemandBatcherv2::setLearningRate( float learningRate ) { +//VIRTUAL void OnDemandBatcherv2::setLearningRate(float learningRate) { // this->learningRate = learningRate; //} -//VIRTUAL void OnDemandBatcherv2::setBatchSize( int batchSize ) { -// if( batchSize != this->batchSize ) { +//VIRTUAL void OnDemandBatcherv2::setBatchSize(int batchSize) { +// if(batchSize != this->batchSize) { // this->batchSize = batchSize; //// updateBuffers(); // } @@ -88,37 +88,37 @@ PUBLICAPI bool OnDemandBatcherv2::tick(int epoch) { // cout << "OnDemandBatcherv2::tick nextFileBatch=" << nextFileBatch << " numRight=" << numRight << // " loss=" << loss << " epochDone=" << epochDone << endl; // updateBuffers(); - if( epochDone ) { + if(epochDone) { reset(); } int fileBatch = nextFileBatch; int fileBatchStart = fileBatch * fileBatchSize; int thisFileBatchSize = fileBatchSize; - if( fileBatch == numFileBatches - 1 ) { + if(fileBatch == numFileBatches - 1) { thisFileBatchSize = N - fileBatchStart; } - netActionBatcher->setN( thisFileBatchSize ); + netActionBatcher->setN(thisFileBatchSize); // cout << "batchlearnerondemand, read data... filebatchstart=" << fileBatchStart << " filebatchsize=" << thisFileBatchSize << endl; - loader->load( dataBuffer, labelsBuffer, fileBatchStart, thisFileBatchSize ); - EpochResult epochResult = netActionBatcher->run( epoch ); + loader->load(dataBuffer, labelsBuffer, fileBatchStart, thisFileBatchSize); + EpochResult epochResult = netActionBatcher->run(epoch); loss += epochResult.loss; numRight += epochResult.numRight; nextFileBatch++; - if( nextFileBatch == numFileBatches ) { + if(nextFileBatch == numFileBatches) { epochDone = true; } return !epochDone; } -PUBLICAPI EpochResult OnDemandBatcherv2::run( int epoch ) { +PUBLICAPI EpochResult OnDemandBatcherv2::run(int epoch) { // cout << "OnDemandBatcherv2::run() epochDone=" << epochDone << endl; - if( epochDone ) { + if(epochDone) { reset(); } - while( !epochDone ) { - tick( epoch ); + while(!epochDone) { + tick(epoch); } - EpochResult epochResult( loss, numRight ); + EpochResult epochResult(loss, numRight); return epochResult; } diff --git a/src/batch/OnDemandBatcherv2.h b/src/batch/OnDemandBatcherv2.h index 790ba63b..8cfe165f 100644 --- a/src/batch/OnDemandBatcherv2.h +++ b/src/batch/OnDemandBatcherv2.h @@ -64,10 +64,10 @@ class OnDemandBatcherv2 { // cog_addheaders.add() // ]]] // generated, using cog: - PUBLICAPI OnDemandBatcherv2( Trainable *net, NetAction *netAction, - GenericLoaderv2 *loader, int N, int fileReadBatches, int batchSize ); + PUBLICAPI OnDemandBatcherv2(Trainable *net, NetAction *netAction, + GenericLoaderv2 *loader, int N, int fileReadBatches, int batchSize); VIRTUAL ~OnDemandBatcherv2(); - VIRTUAL void setBatchState( int nextBatch, int numRight, float loss ); + VIRTUAL void setBatchState(int nextBatch, int numRight, float loss); VIRTUAL int getBatchSize(); PUBLICAPI VIRTUAL int 
getNextFileBatch();
    PUBLICAPI VIRTUAL int getNextBatch();
@@ -77,7 +77,7 @@ class OnDemandBatcherv2 {
    PUBLICAPI VIRTUAL int getN();
    PUBLICAPI void reset();
    PUBLICAPI bool tick(int epoch);
-    PUBLICAPI EpochResult run( int epoch );
+    PUBLICAPI EpochResult run(int epoch);
    // [[[end]]]
};
diff --git a/src/clblas/ClBlasHelper.cpp b/src/clblas/ClBlasHelper.cpp
new file mode 100644
index 00000000..809fa3d3
--- /dev/null
+++ b/src/clblas/ClBlasHelper.cpp
@@ -0,0 +1,108 @@
+#include "util/stringhelper.h"
+#include "ClBlasHelper.h"
+
+#include "EasyCL.h"
+
+#include <iostream>
+using namespace std;
+
+#undef STATIC
+#undef VIRTUAL
+#define PUBLIC
+#define STATIC
+#define VIRTUAL
+
+#ifndef _WIN32
+    extern int clblasInitialized;
+#endif
+
+class ClblasNotInitializedException {
+};
+
+PUBLIC ClBlasHelper::ClBlasHelper() {
+}
+
+PUBLIC STATIC void ClBlasHelper::Gemm(
+    EasyCL *cl,
+    clblasOrder order, clblasTranspose aTrans, clblasTranspose bTrans,
+    int64 m, int64 k, int64 n,
+    float alpha,
+    CLWrapper *AWrapper, int64 aOffset,
+    CLWrapper *BWrapper, int64 bOffset,
+    float beta,
+    CLWrapper *CWrapper, int64 cOffset
+    ) {
+    #ifndef _WIN32 // not sure how to check this on Windows, but this is mostly to detect bugs during
+                   // development/testing anyway. We can fix any initialization bugs on Linux, and
+                   // then it should work OK on Windows too
+    if(!clblasInitialized) {
+        cout << "Didn't initialize clBLAS" << endl;
+        throw ClblasNotInitializedException();
+    }
+    #endif
+    if(!CWrapper->isOnDevice()) {
+        if(beta == 0) {
+            CWrapper->createOnDevice();
+        } else {
+            CWrapper->copyToDevice();
+        }
+    }
+    int64 lda = ((order == clblasRowMajor) != (aTrans == clblasTrans)) ? k : m;
+    int64 ldb = ((order == clblasRowMajor) != (bTrans == clblasTrans)) ? n : k;
+    int64 ldc = order == clblasRowMajor ? n : m;
+    cl_int err = clblasSgemm(
+        order,
+        aTrans, bTrans,
+        (size_t)m, (size_t)n, (size_t)k,
+        alpha,
+        AWrapper->getBuffer(), (size_t)aOffset, (size_t)lda,
+        BWrapper->getBuffer(), (size_t)bOffset, (size_t)ldb,
+        beta,
+        CWrapper->getBuffer(), (size_t)cOffset, (size_t)ldc,
+        1, cl->queue, 0, NULL, 0
+    );
+    if (err != CL_SUCCESS) {
+        throw runtime_error("clblasSgemm() failed with " + toString(err));
+    }
+}
+
+PUBLIC STATIC void ClBlasHelper::Gemv(
+    EasyCL *cl,
+    clblasOrder order, clblasTranspose trans,
+    int64 m, int64 n,
+    float alpha,
+    CLWrapper *AWrapper, int64 aOffset,
+    CLWrapper *BWrapper, int64 bOffset,
+    float beta,
+    CLWrapper *CWrapper, int64 cOffset
+    ) {
+    #ifndef _WIN32
+    if(!clblasInitialized) {
+        cout << "Didn't initialize clBLAS" << endl;
+        throw ClblasNotInitializedException();
+    }
+    #endif
+    if(!CWrapper->isOnDevice()) {
+        if(beta == 0) {
+            CWrapper->createOnDevice();
+        } else {
+            CWrapper->copyToDevice();
+        }
+    }
+    int64 lda = order == clblasRowMajor ? n : m;
+    cl_int err = clblasSgemv(
+        order,
+        trans,
+        (size_t)m, (size_t)n,
+        alpha,
+        AWrapper->getBuffer(), (size_t)aOffset, (size_t)lda,
+        BWrapper->getBuffer(), (size_t)bOffset, 1,
+        beta,
+        CWrapper->getBuffer(), (size_t)cOffset, 1,
+        1, cl->queue, 0, NULL, 0
+    );
+    if (err != CL_SUCCESS) {
+        throw runtime_error("clblasSgemv() failed with " + toString(err));
+    }
+}
+
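// Illustrative usage sketch (editor's addition, not part of the patch; cl,
// aWrap, bWrap, cWrap and the sizes m, k, n are hypothetical). Computes
// C = alpha*A*B + beta*C on the device, with clBLAS kept initialized for
// the lifetime of a ClBlasInstance (defined below):
//
//     ClBlasInstance clblasInstance;    // constructor calls clblasSetup()
//     ClBlasHelper::Gemm(cl, clblasRowMajor, clblasNoTrans, clblasNoTrans,
//         m, k, n,
//         1.0f, aWrap, 0, bWrap, 0,
//         0.0f, cWrap, 0);              // beta == 0, so C is created on the device
//     // clblasTeardown() runs when clblasInstance goes out of scope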
diff --git a/src/clblas/ClBlasHelper.h b/src/clblas/ClBlasHelper.h
new file mode 100644
index 00000000..4971470b
--- /dev/null
+++ b/src/clblas/ClBlasHelper.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#define STATIC static
+#define VIRTUAL virtual
+
+#include "clBLAS.h"
+#include "DeepCLDllExport.h"
+
+class EasyCL;
+class CLWrapper;
+
+class DeepCL_EXPORT ClBlasHelper {
+    public:
+
+    // [[[cog
+    // import cog_addheaders
+    // cog_addheaders.addv2()
+    // ]]]
+    // generated, using cog:
+
+    public:
+    ClBlasHelper();
+    STATIC void Gemm(
+        EasyCL *cl,
+        clblasOrder order, clblasTranspose aTrans, clblasTranspose bTrans,
+        int64 m, int64 k, int64 n,
+        float alpha,
+        CLWrapper *AWrapper, int64 aOffset,
+        CLWrapper *BWrapper, int64 bOffset,
+        float beta,
+        CLWrapper *CWrapper, int64 cOffset
+    );
+    STATIC void Gemv(
+        EasyCL *cl,
+        clblasOrder order, clblasTranspose trans,
+        int64 m, int64 n,
+        float alpha,
+        CLWrapper *AWrapper, int64 aOffset,
+        CLWrapper *BWrapper, int64 bOffset,
+        float beta,
+        CLWrapper *CWrapper, int64 cOffset
+    );
+
+    // [[[end]]]
+};
+
diff --git a/src/clblas/ClBlasInstance.cpp b/src/clblas/ClBlasInstance.cpp
new file mode 100644
index 00000000..bd3e1f2d
--- /dev/null
+++ b/src/clblas/ClBlasInstance.cpp
@@ -0,0 +1,30 @@
+#include "clBLAS.h"
+
+#include "ClBlasInstance.h"
+
+#include <iostream>
+using namespace std;
+
+#define PUBLIC
+
+PUBLIC ClBlasInstance::ClBlasInstance() {
+    cout << "initializing clblas" << endl;
+    clblasSetup();
+}
+
+PUBLIC ClBlasInstance::~ClBlasInstance() {
+    cout << "clblas teardown" << endl;
+    clblasTeardown();
+}
+
+//bool ClBlasInstance::initialized = false;
+
+// assume single-threaded, at least for now
+//void ClBlasInstance::initializeIfNecessary() {
+//    if(!initialized) {
+//        cout << "initializing clblas" << endl;
+//        clblasSetup();
+//        initialized = true;
+//    }
+//}
+
diff --git a/src/clblas/ClBlasInstance.h b/src/clblas/ClBlasInstance.h
new file mode 100644
index 00000000..d7cc768d
--- /dev/null
+++ b/src/clblas/ClBlasInstance.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "DeepCLDllExport.h"
+
+class DeepCL_EXPORT ClBlasInstance {
+//    static bool initialized;
+
+public:
+//    static void initializeIfNecessary();
+
+    // [[[cog
+    // import cog_addheaders
+    // cog_addheaders.addv2()
+    // ]]]
+    // generated, using cog:
+
+    public:
+    ClBlasInstance();
+    ~ClBlasInstance();
+
+    // [[[end]]]
+};
+
diff --git a/src/clblas/files.txt b/src/clblas/files.txt
new file mode 100644
index 00000000..123df489
--- /dev/null
+++ b/src/clblas/files.txt
@@ -0,0 +1,3 @@
+ClBlasHelper.cpp
+ClBlasInstance.cpp
+
diff --git a/src/clmath/CLMathWrapper.cpp b/src/clmath/CLMathWrapper.cpp
index 572a398f..4d4b3089 100644
--- a/src/clmath/CLMathWrapper.cpp
+++ b/src/clmath/CLMathWrapper.cpp
@@ -22,78 +22,84 @@ using namespace std;
 VIRTUAL CLMathWrapper::~CLMathWrapper() {
     delete gpuOp;
 }
-VIRTUAL CLMathWrapper &CLMathWrapper::operator*=( const float scalar ) {
+VIRTUAL CLMathWrapper &CLMathWrapper::operator=(const float scalar) {
+//    cout << "CLMathWrapper.operator=(scalar)" << endl;
+    Op2Equal op;
+    gpuOp->apply2_inplace(N, wrapper, scalar, &op);
+    return *this;
+}
+VIRTUAL CLMathWrapper &CLMathWrapper::operator*=(const float
scalar) { // cout << "CLMathWrapper.operator*=(scalar)" << endl; Op2Mul op; - gpuOp->apply2_inplace( N, wrapper, scalar, &op ); + gpuOp->apply2_inplace(N, wrapper, scalar, &op); return *this; } -VIRTUAL CLMathWrapper &CLMathWrapper::operator+=( const float scalar ) { +VIRTUAL CLMathWrapper &CLMathWrapper::operator+=(const float scalar) { // cout << "CLMathWrapper.operator*=(scalar)" << endl; Op2Add op; - gpuOp->apply2_inplace( N, wrapper, scalar, &op ); + gpuOp->apply2_inplace(N, wrapper, scalar, &op); return *this; } -VIRTUAL CLMathWrapper &CLMathWrapper::operator*=( const CLMathWrapper &two ) { +VIRTUAL CLMathWrapper &CLMathWrapper::operator*=(const CLMathWrapper &two) { // cout << "CLMathWrapper.operator*=(scalar)" << endl; - if( two.N != N ) { - throw runtime_error("CLMathWrapper::operator+, array size mismatch, cannot assign " + toString( two.N ) + - " vs " + toString( N ) ); + if(two.N != N) { + throw runtime_error("CLMathWrapper::operator+, array size mismatch, cannot assign " + toString(two.N) + + " vs " + toString(N) ); } Op2Mul op; - gpuOp->apply2_inplace( N, wrapper, ((CLMathWrapper &)two).wrapper, &op ); + gpuOp->apply2_inplace(N, wrapper, ((CLMathWrapper &)two).wrapper, &op); return *this; } -VIRTUAL CLMathWrapper &CLMathWrapper::operator+=( const CLMathWrapper &two ) { +VIRTUAL CLMathWrapper &CLMathWrapper::operator+=(const CLMathWrapper &two) { // cout << "CLMathWrapper.operator+=()" << endl; - if( two.N != N ) { - throw runtime_error("CLMathWrapper::operator+, array size mismatch, cannot assign " + toString( two.N ) + - " vs " + toString( N ) ); + if(two.N != N) { + throw runtime_error("CLMathWrapper::operator+, array size mismatch, cannot assign " + toString(two.N) + + " vs " + toString(N) ); } Op2Add op; - gpuOp->apply2_inplace( N, wrapper, ((CLMathWrapper &)two).wrapper, &op ); + gpuOp->apply2_inplace(N, wrapper, ((CLMathWrapper &)two).wrapper, &op); return *this; } -VIRTUAL CLMathWrapper &CLMathWrapper::operator=( const CLMathWrapper &rhs ) { +VIRTUAL CLMathWrapper &CLMathWrapper::operator=(const CLMathWrapper &rhs) { // cout << "CLMathWrapper.operator=()" << endl; - if( rhs.N != N ) { - throw runtime_error("CLMathWrapper::operator= array size mismatch, cannot assign " + toString( rhs.N ) + - " vs " + toString( N ) ); + if(rhs.N != N) { + throw runtime_error("CLMathWrapper::operator= array size mismatch, cannot assign " + toString(rhs.N) + + " vs " + toString(N) ); } - Op1Equal op; - gpuOp->apply1_outofplace( N, wrapper, ((CLMathWrapper &)rhs).wrapper, &op ); + Op2Equal op; + gpuOp->apply2_inplace(N, wrapper, ((CLMathWrapper &)rhs).wrapper, &op); return *this; } VIRTUAL CLMathWrapper &CLMathWrapper::sqrt() { Op1Sqrt op; - gpuOp->apply1_inplace( N, wrapper, &op ); + gpuOp->apply1_inplace(N, wrapper, &op); return *this; } VIRTUAL CLMathWrapper &CLMathWrapper::inv() { Op1Inv op; - gpuOp->apply1_inplace( N, wrapper, &op ); + gpuOp->apply1_inplace(N, wrapper, &op); return *this; } VIRTUAL CLMathWrapper &CLMathWrapper::squared() { Op1Squared op; - gpuOp->apply1_inplace( N, wrapper, &op ); + gpuOp->apply1_inplace(N, wrapper, &op); return *this; } -VIRTUAL void CLMathWrapper::runKernel( CLKernel *kernel ) { +VIRTUAL void CLMathWrapper::runKernel(CLKernel *kernel) { int globalSize = N; int workgroupSize = 64; - int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); + int numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; + kernel->run_1d(numWorkgroups * workgroupSize, 
workgroupSize); cl->finish(); } -CLMathWrapper::CLMathWrapper( CLWrapper *wrapper ) { - CLFloatWrapper *floatWrapper = dynamic_cast< CLFloatWrapper * >( wrapper ); - if( floatWrapper == 0 ) { - throw runtime_error( "CLMathWrapper only works on CLFloatWrapper objects"); +CLMathWrapper::CLMathWrapper(CLWrapper *wrapper) { + CLFloatWrapper *floatWrapper = dynamic_cast< CLFloatWrapper * >(wrapper); + if(floatWrapper == 0) { + throw runtime_error("CLMathWrapper only works on CLFloatWrapper objects"); } this->cl = floatWrapper->getCl(); this->wrapper = floatWrapper; this->N = floatWrapper->size(); - this->gpuOp = new GpuOp( cl ); + this->gpuOp = new GpuOp(cl); } diff --git a/src/clmath/CLMathWrapper.h b/src/clmath/CLMathWrapper.h index 11503839..f1bcd752 100644 --- a/src/clmath/CLMathWrapper.h +++ b/src/clmath/CLMathWrapper.h @@ -40,16 +40,17 @@ class DeepCL_EXPORT CLMathWrapper { // ]]] // generated, using cog: VIRTUAL ~CLMathWrapper(); - VIRTUAL CLMathWrapper &operator*=( const float scalar ); - VIRTUAL CLMathWrapper &operator+=( const float scalar ); - VIRTUAL CLMathWrapper &operator*=( const CLMathWrapper &two ); - VIRTUAL CLMathWrapper &operator+=( const CLMathWrapper &two ); - VIRTUAL CLMathWrapper &operator=( const CLMathWrapper &rhs ); + VIRTUAL CLMathWrapper &operator=(const float scalar); + VIRTUAL CLMathWrapper &operator*=(const float scalar); + VIRTUAL CLMathWrapper &operator+=(const float scalar); + VIRTUAL CLMathWrapper &operator*=(const CLMathWrapper &two); + VIRTUAL CLMathWrapper &operator+=(const CLMathWrapper &two); + VIRTUAL CLMathWrapper &operator=(const CLMathWrapper &rhs); VIRTUAL CLMathWrapper &sqrt(); VIRTUAL CLMathWrapper &inv(); VIRTUAL CLMathWrapper &squared(); - VIRTUAL void runKernel( CLKernel *kernel ); - CLMathWrapper( CLWrapper *wrapper ); + VIRTUAL void runKernel(CLKernel *kernel); + CLMathWrapper(CLWrapper *wrapper); // [[[end]]] }; diff --git a/src/clmath/CopyBuffer.cpp b/src/clmath/CopyBuffer.cpp index 367f5a9e..bb97057b 100644 --- a/src/clmath/CopyBuffer.cpp +++ b/src/clmath/CopyBuffer.cpp @@ -17,32 +17,32 @@ using namespace std; #define STATIC #define VIRTUAL -VIRTUAL void CopyBuffer::copy( int N, CLWrapper *in, CLWrapper *out ) { - kernel ->in( N ) - ->in( in ) -// ->in( inoffset ) - ->out( out ); -// ->in( outoffset ); +VIRTUAL void CopyBuffer::copy(int N, CLWrapper *in, CLWrapper *out) { + kernel ->in(N) + ->in(in) +// ->in(inoffset) + ->out(out); +// ->in(outoffset); int globalSize = N; int workgroupSize = 64; - int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); + int numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; + kernel->run_1d(numWorkgroups * workgroupSize, workgroupSize); cl->finish(); - StatefulTimer::instance()->timeCheck("CopyBuffer::copy end" ); + StatefulTimer::instance()->timeCheck("CopyBuffer::copy end"); } VIRTUAL CopyBuffer::~CopyBuffer() { // delete kernel; } -CopyBuffer::CopyBuffer( EasyCL *cl ) : - cl( cl ) { +CopyBuffer::CopyBuffer(EasyCL *cl) : + cl(cl) { std::string kernelName = "copy.copy"; - if( cl->kernelExists( kernelName ) ) { - this->kernel = cl->getKernel( kernelName ); + if(cl->kernelExists(kernelName) ) { + this->kernel = cl->getKernel(kernelName); // cout << "CopyBuffer kernel already built => reusing" << endl; return; } @@ -52,7 +52,7 @@ CopyBuffer::CopyBuffer( EasyCL *cl ) : // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/copy.cl", "copy", 'options' ) + // stringify.write_kernel2("kernel", 
"cl/copy.cl", "copy", 'options') // ]]] // generated using cog, from cl/copy.cl: const char * kernelSource = @@ -68,9 +68,9 @@ CopyBuffer::CopyBuffer( EasyCL *cl ) : "kernel void copy(\n" " const int N,\n" " global const float *in,\n" - " global float *out ) {\n" + " global float *out) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " out[globalId] = in[globalId];\n" @@ -81,9 +81,9 @@ CopyBuffer::CopyBuffer( EasyCL *cl ) : " global const float *in,\n" " const int inoffset,\n" " global float *out,\n" - " const int outoffset ) {\n" + " const int outoffset) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " out[globalId + outoffset] = in[globalId + inoffset];\n" @@ -93,9 +93,9 @@ CopyBuffer::CopyBuffer( EasyCL *cl ) : " const int N,\n" " const float multiplier,\n" " global const float *in,\n" - " global float *out ) {\n" + " global float *out) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " out[globalId] = multiplier * in[globalId];\n" @@ -104,9 +104,9 @@ CopyBuffer::CopyBuffer( EasyCL *cl ) : "kernel void multiplyInplace(\n" " const int N,\n" " const float multiplier,\n" - " global float *data ) {\n" + " global float *data) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " data[globalId] *= multiplier;\n" @@ -115,7 +115,7 @@ CopyBuffer::CopyBuffer( EasyCL *cl ) : ""; kernel = cl->buildKernelFromString( kernelSource, "copy", options, "cl/copy.cl" ); // [[[end]]] - cl->storeKernel( kernelName, kernel, true ); + cl->storeKernel(kernelName, kernel, true); this->kernel = kernel; } diff --git a/src/clmath/CopyBuffer.h b/src/clmath/CopyBuffer.h index d74e1f82..6623b556 100644 --- a/src/clmath/CopyBuffer.h +++ b/src/clmath/CopyBuffer.h @@ -30,9 +30,9 @@ class CopyBuffer { // cog_addheaders.add() // ]]] // generated, using cog: - VIRTUAL void copy( int N, CLWrapper *in, CLWrapper *out ); + VIRTUAL void copy(int N, CLWrapper *in, CLWrapper *out); VIRTUAL ~CopyBuffer(); - CopyBuffer( EasyCL *cl ); + CopyBuffer(EasyCL *cl); // [[[end]]] }; diff --git a/src/clmath/GpuAdd.cpp b/src/clmath/GpuAdd.cpp index c68160db..645d0301 100644 --- a/src/clmath/GpuAdd.cpp +++ b/src/clmath/GpuAdd.cpp @@ -18,27 +18,27 @@ using namespace std; #define VIRTUAL /// \brief calculates destinationWrapper += deltaWrapper -VIRTUAL void GpuAdd::add( int N, CLWrapper*destinationWrapper, CLWrapper *deltaWrapper ) { - StatefulTimer::instance()->timeCheck("GpuAdd::add start" ); +VIRTUAL void GpuAdd::add(int N, CLWrapper*destinationWrapper, CLWrapper *deltaWrapper) { + StatefulTimer::instance()->timeCheck("GpuAdd::add start"); - kernel->in( N ); - kernel->inout( destinationWrapper ); - kernel->in( deltaWrapper ); + kernel->in(N); + kernel->inout(destinationWrapper); + kernel->in(deltaWrapper); int globalSize = N; int workgroupSize = 64; - int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); + int numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; + kernel->run_1d(numWorkgroups * workgroupSize, workgroupSize); cl->finish(); - StatefulTimer::instance()->timeCheck("GpuAdd::add end" ); + StatefulTimer::instance()->timeCheck("GpuAdd::add end"); } VIRTUAL GpuAdd::~GpuAdd() { } -GpuAdd::GpuAdd( EasyCL *cl ) : - cl( cl ) { 
+GpuAdd::GpuAdd(EasyCL *cl) : + cl(cl) { std::string kernelName = "per_element_add.per_element_add"; - if( cl->kernelExists( kernelName ) ) { - this->kernel = cl->getKernel( kernelName ); + if(cl->kernelExists(kernelName) ) { + this->kernel = cl->getKernel(kernelName); // cout << "GpuAdd kernel already built => reusing" << endl; return; } @@ -48,7 +48,7 @@ GpuAdd::GpuAdd( EasyCL *cl ) : // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/per_element_add.cl", "per_element_add", 'options' ) + // stringify.write_kernel2("kernel", "cl/per_element_add.cl", "per_element_add", 'options') // ]]] // generated using cog, from cl/per_element_add.cl: const char * kernelSource = @@ -58,9 +58,9 @@ GpuAdd::GpuAdd( EasyCL *cl ) : "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "kernel void per_element_add( const int N, global float *target, global const float *source ) {\n" + "kernel void per_element_add(const int N, global float *target, global const float *source) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " target[globalId] += source[globalId];\n" @@ -68,26 +68,26 @@ GpuAdd::GpuAdd( EasyCL *cl ) : "\n" "// adds source to target\n" "// tiles source as necessary, according to tilingSize\n" - "kernel void per_element_tiled_add( const int N, const int tilingSize, global float *target, global const float *source ) {\n" + "kernel void per_element_tiled_add(const int N, const int tilingSize, global float *target, global const float *source) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " target[globalId] += source[globalId % tilingSize];\n" "}\n" "\n" - "kernel void repeated_add( const int N, const int sourceSize, const int repeatSize, global float *target, global const float *source ) {\n" + "kernel void repeated_add(const int N, const int sourceSize, const int repeatSize, global float *target, global const float *source) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" - " target[globalId] += source[ ( globalId / repeatSize ) % sourceSize ];\n" + " target[globalId] += source[ (globalId / repeatSize) % sourceSize ];\n" "}\n" "\n" ""; kernel = cl->buildKernelFromString( kernelSource, "per_element_add", options, "cl/per_element_add.cl" ); // [[[end]]] - cl->storeKernel( kernelName, kernel, true ); + cl->storeKernel(kernelName, kernel, true); this->kernel = kernel; } diff --git a/src/clmath/GpuAdd.h b/src/clmath/GpuAdd.h index 4043acf2..d17ab9ea 100644 --- a/src/clmath/GpuAdd.h +++ b/src/clmath/GpuAdd.h @@ -30,9 +30,9 @@ class GpuAdd { // cog_addheaders.add() // ]]] // generated, using cog: - VIRTUAL void add( int N, CLWrapper*destinationWrapper, CLWrapper *deltaWrapper ); + VIRTUAL void add(int N, CLWrapper*destinationWrapper, CLWrapper *deltaWrapper); VIRTUAL ~GpuAdd(); - GpuAdd( EasyCL *cl ); + GpuAdd(EasyCL *cl); // [[[end]]] }; diff --git a/src/clmath/GpuOp.cpp b/src/clmath/GpuOp.cpp index bb5258ef..a0160874 100644 --- a/src/clmath/GpuOp.cpp +++ b/src/clmath/GpuOp.cpp @@ -19,116 +19,116 @@ using namespace std; #define VIRTUAL /// \brief calculates destinationWrapper += deltaWrapper -VIRTUAL void GpuOp::apply2_inplace( int N, CLWrapper*destinationWrapper, float scalar, Op2 *op ) { - StatefulTimer::instance()->timeCheck("GpuOp::apply inplace start" ); +VIRTUAL 
void GpuOp::apply2_inplace(int N, CLWrapper*destinationWrapper, float scalar, Op2 *op) { + StatefulTimer::instance()->timeCheck("GpuOp::apply inplace start"); string kernelName = "GpuOp::" + op->getName() + "_inplace_scalar"; - if( !cl->kernelExists( kernelName ) ) { - buildKernelScalar( kernelName, op, true ); + if(!cl->kernelExists(kernelName) ) { + buildKernelScalar(kernelName, op, true); } - CLKernel *kernel = cl->getKernel( kernelName ); + CLKernel *kernel = cl->getKernel(kernelName); - kernel->in( N ); - kernel->inout( destinationWrapper ); - kernel->in( scalar ); + kernel->in(N); + kernel->inout(destinationWrapper); + kernel->in(scalar); int globalSize = N; int workgroupSize = 64; - int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); + int numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; + kernel->run_1d(numWorkgroups * workgroupSize, workgroupSize); cl->finish(); - StatefulTimer::instance()->timeCheck("GpuOp::apply inplace end" ); + StatefulTimer::instance()->timeCheck("GpuOp::apply inplace end"); } -VIRTUAL void GpuOp::apply2_inplace( int N, CLWrapper*destinationWrapper, CLWrapper *deltaWrapper, Op2 *op ) { - StatefulTimer::instance()->timeCheck("GpuOp::apply inplace start" ); +VIRTUAL void GpuOp::apply2_inplace(int N, CLWrapper*destinationWrapper, CLWrapper *deltaWrapper, Op2 *op) { + StatefulTimer::instance()->timeCheck("GpuOp::apply inplace start"); string kernelName = "GpuOp::" + op->getName() + "_inplace"; - if( !cl->kernelExists( kernelName ) ) { - buildKernel( kernelName, op, true ); + if(!cl->kernelExists(kernelName) ) { + buildKernel(kernelName, op, true); } - CLKernel *kernel = cl->getKernel( kernelName ); + CLKernel *kernel = cl->getKernel(kernelName); - kernel->in( N ); - kernel->inout( destinationWrapper ); - kernel->in( deltaWrapper ); + kernel->in(N); + kernel->inout(destinationWrapper); + kernel->in(deltaWrapper); int globalSize = N; int workgroupSize = 64; - int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); + int numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; + kernel->run_1d(numWorkgroups * workgroupSize, workgroupSize); cl->finish(); - StatefulTimer::instance()->timeCheck("GpuOp::apply inplace end" ); + StatefulTimer::instance()->timeCheck("GpuOp::apply inplace end"); } -VIRTUAL void GpuOp::apply2_outofplace( int N, CLWrapper*destinationWrapper, CLWrapper*one, CLWrapper *two, Op2 *op ) { - StatefulTimer::instance()->timeCheck("GpuOp::apply inplace start" ); +VIRTUAL void GpuOp::apply2_outofplace(int N, CLWrapper*destinationWrapper, CLWrapper*one, CLWrapper *two, Op2 *op) { + StatefulTimer::instance()->timeCheck("GpuOp::apply inplace start"); string kernelName = "GpuOp::" + op->getName() + "_outofplace"; - if( !cl->kernelExists( kernelName ) ) { - buildKernel( kernelName, op, false ); + if(!cl->kernelExists(kernelName) ) { + buildKernel(kernelName, op, false); } - CLKernel *kernel = cl->getKernel( kernelName ); + CLKernel *kernel = cl->getKernel(kernelName); - kernel->in( N ); - kernel->inout( destinationWrapper ); - kernel->in( one ); - kernel->in( two ); + kernel->in(N); + kernel->inout(destinationWrapper); + kernel->in(one); + kernel->in(two); int globalSize = N; int workgroupSize = 64; - int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); + int numWorkgroups = 
(globalSize + workgroupSize - 1) / workgroupSize; + kernel->run_1d(numWorkgroups * workgroupSize, workgroupSize); cl->finish(); - StatefulTimer::instance()->timeCheck("GpuOp::apply inplace end" ); + StatefulTimer::instance()->timeCheck("GpuOp::apply inplace end"); } -VIRTUAL void GpuOp::apply1_inplace( int N, CLWrapper*destinationWrapper, Op1 *op ) { - StatefulTimer::instance()->timeCheck("GpuOp::apply inplace start" ); +VIRTUAL void GpuOp::apply1_inplace(int N, CLWrapper*destinationWrapper, Op1 *op) { + StatefulTimer::instance()->timeCheck("GpuOp::apply inplace start"); string kernelName = "GpuOp::" + op->getName() + "_inplace"; - if( !cl->kernelExists( kernelName ) ) { - buildKernel( kernelName, op, true ); + if(!cl->kernelExists(kernelName) ) { + buildKernel(kernelName, op, true); } - CLKernel *kernel = cl->getKernel( kernelName ); + CLKernel *kernel = cl->getKernel(kernelName); - kernel->in( N ); - kernel->inout( destinationWrapper ); + kernel->in(N); + kernel->inout(destinationWrapper); int globalSize = N; int workgroupSize = 64; - int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); + int numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; + kernel->run_1d(numWorkgroups * workgroupSize, workgroupSize); cl->finish(); - StatefulTimer::instance()->timeCheck("GpuOp::apply inplace end" ); + StatefulTimer::instance()->timeCheck("GpuOp::apply inplace end"); } -VIRTUAL void GpuOp::apply1_outofplace( int N, CLWrapper*destinationWrapper, CLWrapper*one, Op1 *op ) { - StatefulTimer::instance()->timeCheck("GpuOp::apply inplace start" ); +VIRTUAL void GpuOp::apply1_outofplace(int N, CLWrapper*destinationWrapper, CLWrapper*one, Op1 *op) { + StatefulTimer::instance()->timeCheck("GpuOp::apply inplace start"); string kernelName = "GpuOp::" + op->getName() + "_outofplace"; - if( !cl->kernelExists( kernelName ) ) { - buildKernel( kernelName, op, false ); + if(!cl->kernelExists(kernelName) ) { + buildKernel(kernelName, op, false); } - CLKernel *kernel = cl->getKernel( kernelName ); + CLKernel *kernel = cl->getKernel(kernelName); - kernel->in( N ); - kernel->inout( destinationWrapper ); - kernel->in( one ); + kernel->in(N); + kernel->inout(destinationWrapper); + kernel->in(one); int globalSize = N; int workgroupSize = 64; - int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); + int numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; + kernel->run_1d(numWorkgroups * workgroupSize, workgroupSize); cl->finish(); - StatefulTimer::instance()->timeCheck("GpuOp::apply inplace end" ); + StatefulTimer::instance()->timeCheck("GpuOp::apply inplace end"); } VIRTUAL GpuOp::~GpuOp() { } -GpuOp::GpuOp( EasyCL *cl ) : - cl( cl ) { +GpuOp::GpuOp(EasyCL *cl) : + cl(cl) { } -void GpuOp::buildKernel( std::string name, Op2 *op, bool inPlace ) { +void GpuOp::buildKernel(std::string name, Op2 *op, bool inPlace) { // [[[cog // import stringify - // stringify.write_kernel( "kernel", "cl/per_element_op2.cl" ) + // stringify.write_kernel("kernel", "cl/per_element_op2.cl") // ]]] // generated using cog, from cl/per_element_op2.cl: const char * kernelSource = @@ -138,46 +138,46 @@ void GpuOp::buildKernel( std::string name, Op2 *op, bool inPlace ) { "// v. 2.0. 
If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "float operation( float val_one, float val_two ) {\n" + "float operation(float val_one, float val_two) {\n" " return {{operation}};\n" "}\n" "\n" - "kernel void per_element_op2_inplace( const int N, global float *target, global const float *source ) {\n" + "kernel void per_element_op2_inplace(const int N, global float *target, global const float *source) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" - " target[globalId] = operation( target[globalId], source[globalId] );\n" + " target[globalId] = operation(target[globalId], source[globalId]);\n" "}\n" "\n" - "kernel void per_element_op2_outofplace( const int N, global float *target, global float *one, global const float *two ) {\n" + "kernel void per_element_op2_outofplace(const int N, global float *target, global float *one, global const float *two) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" - " target[globalId] = operation( one[globalId], two[globalId] );\n" + " target[globalId] = operation(one[globalId], two[globalId]);\n" "}\n" "\n" ""; // [[[end]]] LuaTemplater templater; - templater.set( "operation", op->getOperationString() ); + templater.set("operation", op->getOperationString()); string renderedKernel = templater.render(kernelSource); // cout << "renderedKernel:" << endl; // cout << renderedKernel << endl; string clKernelName = "per_element_op2_outofplace"; - if( inPlace ) { + if(inPlace) { clKernelName = "per_element_op2_inplace"; } - kernel = cl->buildKernelFromString( renderedKernel, clKernelName, "", "cl/per_element_op2.cl" ); - cl->storeKernel( name, kernel, true ); + kernel = cl->buildKernelFromString(renderedKernel, clKernelName, "", "cl/per_element_op2.cl"); + cl->storeKernel(name, kernel, true); } -void GpuOp::buildKernel( std::string name, Op1 *op, bool inPlace ) { +void GpuOp::buildKernel(std::string name, Op1 *op, bool inPlace) { // [[[cog // import stringify - // stringify.write_kernel( "kernel", "cl/per_element_op1.cl" ) + // stringify.write_kernel("kernel", "cl/per_element_op1.cl") // ]]] // generated using cog, from cl/per_element_op1.cl: const char * kernelSource = @@ -187,46 +187,46 @@ void GpuOp::buildKernel( std::string name, Op1 *op, bool inPlace ) { "// v. 2.0. 
If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "float operation( float val_one ) {\n" + "float operation(float val_one) {\n" " return {{operation}};\n" "}\n" "\n" - "kernel void per_element_op1_inplace( const int N, global float *target ) {\n" + "kernel void per_element_op1_inplace(const int N, global float *target) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" - " target[globalId] = operation( target[globalId] );\n" + " target[globalId] = operation(target[globalId]);\n" "}\n" "\n" - "kernel void per_element_op1_outofplace( const int N, global float *target, global float *one ) {\n" + "kernel void per_element_op1_outofplace(const int N, global float *target, global float *one) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" - " target[globalId] = operation( one[globalId] );\n" + " target[globalId] = operation(one[globalId]);\n" "}\n" "\n" ""; // [[[end]]] LuaTemplater templater; - templater.set( "operation", op->getOperationString() ); + templater.set("operation", op->getOperationString()); string renderedKernel = templater.render(kernelSource); // cout << "renderedKernel:" << endl; // cout << renderedKernel << endl; string clKernelName = "per_element_op1_outofplace"; - if( inPlace ) { + if(inPlace) { clKernelName = "per_element_op1_inplace"; } - kernel = cl->buildKernelFromString( renderedKernel, clKernelName, "", "cl/per_element_op1.cl" ); - cl->storeKernel( name, kernel, true ); + kernel = cl->buildKernelFromString(renderedKernel, clKernelName, "", "cl/per_element_op1.cl"); + cl->storeKernel(name, kernel, true); } -void GpuOp::buildKernelScalar( std::string name, Op2 *op, bool inPlace ) { +void GpuOp::buildKernelScalar(std::string name, Op2 *op, bool inPlace) { // [[[cog // import stringify - // stringify.write_kernel( "kernel", "cl/per_element_op2_scalar.cl" ) + // stringify.write_kernel("kernel", "cl/per_element_op2_scalar.cl") // ]]] // generated using cog, from cl/per_element_op2_scalar.cl: const char * kernelSource = @@ -236,39 +236,39 @@ void GpuOp::buildKernelScalar( std::string name, Op2 *op, bool inPlace ) { "// v. 2.0. 
If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "float operation( float val_one, float val_two ) {\n" + "float operation(float val_one, float val_two) {\n" " return {{operation}};\n" "}\n" "\n" - "kernel void per_element_op2_inplace( const int N, global float *target, const float scalar ) {\n" + "kernel void per_element_op2_inplace(const int N, global float *target, const float scalar) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" - " target[globalId] = operation( target[globalId], scalar );\n" + " target[globalId] = operation(target[globalId], scalar);\n" "}\n" "\n" - "kernel void per_element_op2_outofplace( const int N, global float *target, global float *source, const float scalar ) {\n" + "kernel void per_element_op2_outofplace(const int N, global float *target, global float *source, const float scalar) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" - " target[globalId] = operation( source[globalId], scalar );\n" + " target[globalId] = operation(source[globalId], scalar);\n" "}\n" "\n" ""; // [[[end]]] LuaTemplater templater; - templater.set( "operation", op->getOperationString() ); + templater.set("operation", op->getOperationString()); string renderedKernel = templater.render(kernelSource); // cout << "renderedKernel:" << endl; // cout << renderedKernel << endl; string clKernelName = "per_element_op2_outofplace"; - if( inPlace ) { + if(inPlace) { clKernelName = "per_element_op2_inplace"; } - kernel = cl->buildKernelFromString( renderedKernel, clKernelName, "", "cl/per_element_op2_scalar.cl" ); - cl->storeKernel( name, kernel, true ); + kernel = cl->buildKernelFromString(renderedKernel, clKernelName, "", "cl/per_element_op2_scalar.cl"); + cl->storeKernel(name, kernel, true); } diff --git a/src/clmath/GpuOp.h b/src/clmath/GpuOp.h index cbec0c4c..2fa7b0bb 100644 --- a/src/clmath/GpuOp.h +++ b/src/clmath/GpuOp.h @@ -25,12 +25,6 @@ class DeepCL_EXPORT Op1 { virtual std::string getOperationString() = 0; virtual std::string getName() = 0; }; -class DeepCL_EXPORT Op1Equal : public Op1 { - std::string getOperationString() { - return "val_one"; - } - std::string getName(){ return "Op1_Equal"; } -}; class DeepCL_EXPORT Op1Inv : public Op1 { std::string getOperationString() { return "1.0f / val_one"; @@ -39,7 +33,7 @@ class DeepCL_EXPORT Op1Inv : public Op1 { }; class DeepCL_EXPORT Op1Sqrt : public Op1 { std::string getOperationString() { - return "native_sqrt( val_one )"; + return "native_sqrt(val_one)"; } std::string getName(){ return "Op1_Sqrt"; } }; @@ -55,6 +49,12 @@ class DeepCL_EXPORT Op2 { virtual std::string getOperationString() = 0; virtual std::string getName() = 0; }; +class DeepCL_EXPORT Op2Equal : public Op2 { + std::string getOperationString() { + return "val_two"; + } + std::string getName(){ return "Op2_Equal"; } +}; class DeepCL_EXPORT Op2Add : public Op2 { public: std::string getOperationString() { @@ -97,16 +97,16 @@ class DeepCL_EXPORT GpuOp { // cog_addheaders.add() // ]]] // generated, using cog: - VIRTUAL void apply2_inplace( int N, CLWrapper*destinationWrapper, float scalar, Op2 *op ); - VIRTUAL void apply2_inplace( int N, CLWrapper*destinationWrapper, CLWrapper *deltaWrapper, Op2 *op ); - VIRTUAL void apply2_outofplace( int N, CLWrapper*destinationWrapper, CLWrapper*one, CLWrapper *two, Op2 *op ); - VIRTUAL void apply1_inplace( int N, 
CLWrapper*destinationWrapper, Op1 *op ); - VIRTUAL void apply1_outofplace( int N, CLWrapper*destinationWrapper, CLWrapper*one, Op1 *op ); + VIRTUAL void apply2_inplace(int N, CLWrapper*destinationWrapper, float scalar, Op2 *op); + VIRTUAL void apply2_inplace(int N, CLWrapper*destinationWrapper, CLWrapper *deltaWrapper, Op2 *op); + VIRTUAL void apply2_outofplace(int N, CLWrapper*destinationWrapper, CLWrapper*one, CLWrapper *two, Op2 *op); + VIRTUAL void apply1_inplace(int N, CLWrapper*destinationWrapper, Op1 *op); + VIRTUAL void apply1_outofplace(int N, CLWrapper*destinationWrapper, CLWrapper*one, Op1 *op); VIRTUAL ~GpuOp(); - GpuOp( EasyCL *cl ); - void buildKernel( std::string name, Op2 *op, bool inPlace ); - void buildKernel( std::string name, Op1 *op, bool inPlace ); - void buildKernelScalar( std::string name, Op2 *op, bool inPlace ); + GpuOp(EasyCL *cl); + void buildKernel(std::string name, Op2 *op, bool inPlace); + void buildKernel(std::string name, Op1 *op, bool inPlace); + void buildKernelScalar(std::string name, Op2 *op, bool inPlace); // [[[end]]] }; diff --git a/src/clmath/MultiplyBuffer.cpp b/src/clmath/MultiplyBuffer.cpp index 72adeeeb..2e9108b3 100644 --- a/src/clmath/MultiplyBuffer.cpp +++ b/src/clmath/MultiplyBuffer.cpp @@ -18,52 +18,52 @@ using namespace std; #define STATIC #define VIRTUAL -VIRTUAL void MultiplyBuffer::multiply( int N, float multiplier, CLWrapper *in, CLWrapper *out ) { - StatefulTimer::instance()->timeCheck("MultiplyBuffer::multiply start" ); +VIRTUAL void MultiplyBuffer::multiply(int N, float multiplier, CLWrapper *in, CLWrapper *out) { + StatefulTimer::instance()->timeCheck("MultiplyBuffer::multiply start"); - kernel ->in( N ) - ->in( multiplier ) - ->in( in ) - ->out( out ); + kernel ->in(N) + ->in(multiplier) + ->in(in) + ->out(out); int globalSize = N; int workgroupSize = 64; - int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); + int numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; + kernel->run_1d(numWorkgroups * workgroupSize, workgroupSize); cl->finish(); - StatefulTimer::instance()->timeCheck("MultiplyBuffer::multiply end" ); + StatefulTimer::instance()->timeCheck("MultiplyBuffer::multiply end"); } VIRTUAL MultiplyBuffer::~MultiplyBuffer() { // delete kernel; } -//VIRTUAL std::string MultiplyBuffer::floatToFloatString( float value ) { -// string floatString = toString( value ); -// if( floatString.find( "." 
) == string::npos ) { +//VIRTUAL std::string MultiplyBuffer::floatToFloatString(float value) { +// string floatString = toString(value); +// if(floatString.find(".") == string::npos) { // floatString += ".0"; // } // floatString += "f"; // return floatString; //} -MultiplyBuffer::MultiplyBuffer( EasyCL *cl ) : - cl( cl ) { +MultiplyBuffer::MultiplyBuffer(EasyCL *cl) : + cl(cl) { // std::string options = "-D " + fn->getDefineName(); string options = ""; -// options += " -DgN=" + toString( N ); -// options += " -DgMultiplier=" + floatToFloatString( multiplier ); +// options += " -DgN=" + toString(N); +// options += " -DgMultiplier=" + floatToFloatString(multiplier); std::string kernelName = "multiplyConstant"; - if( cl->kernelExists( kernelName ) ) { - this->kernel = cl->getKernel( kernelName ); + if(cl->kernelExists(kernelName) ) { + this->kernel = cl->getKernel(kernelName); return; } // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/copy.cl", "multiplyConstant", 'options' ) + // stringify.write_kernel2("kernel", "cl/copy.cl", "multiplyConstant", 'options') // ]]] // generated using cog, from cl/copy.cl: const char * kernelSource = @@ -79,9 +79,9 @@ MultiplyBuffer::MultiplyBuffer( EasyCL *cl ) : "kernel void copy(\n" " const int N,\n" " global const float *in,\n" - " global float *out ) {\n" + " global float *out) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " out[globalId] = in[globalId];\n" @@ -92,9 +92,9 @@ MultiplyBuffer::MultiplyBuffer( EasyCL *cl ) : " global const float *in,\n" " const int inoffset,\n" " global float *out,\n" - " const int outoffset ) {\n" + " const int outoffset) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " out[globalId + outoffset] = in[globalId + inoffset];\n" @@ -104,9 +104,9 @@ MultiplyBuffer::MultiplyBuffer( EasyCL *cl ) : " const int N,\n" " const float multiplier,\n" " global const float *in,\n" - " global float *out ) {\n" + " global float *out) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " out[globalId] = multiplier * in[globalId];\n" @@ -115,9 +115,9 @@ MultiplyBuffer::MultiplyBuffer( EasyCL *cl ) : "kernel void multiplyInplace(\n" " const int N,\n" " const float multiplier,\n" - " global float *data ) {\n" + " global float *data) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " data[globalId] *= multiplier;\n" @@ -126,7 +126,7 @@ MultiplyBuffer::MultiplyBuffer( EasyCL *cl ) : ""; kernel = cl->buildKernelFromString( kernelSource, "multiplyConstant", options, "cl/copy.cl" ); // [[[end]]] - cl->storeKernel( kernelName, kernel, true ); + cl->storeKernel(kernelName, kernel, true); this->kernel = kernel; } diff --git a/src/clmath/MultiplyBuffer.h b/src/clmath/MultiplyBuffer.h index 28f45f4c..d1e8f7b8 100644 --- a/src/clmath/MultiplyBuffer.h +++ b/src/clmath/MultiplyBuffer.h @@ -30,9 +30,9 @@ class MultiplyBuffer { // cog_addheaders.add() // ]]] // generated, using cog: - VIRTUAL void multiply( int N, float multiplier, CLWrapper *in, CLWrapper *out ); + VIRTUAL void multiply(int N, float multiplier, CLWrapper *in, CLWrapper *out); VIRTUAL ~MultiplyBuffer(); - MultiplyBuffer( EasyCL *cl ); + MultiplyBuffer(EasyCL *cl); // [[[end]]] }; diff --git a/src/clmath/MultiplyInPlace.cpp b/src/clmath/MultiplyInPlace.cpp index 
53f47ffb..bd406ded 100644 --- a/src/clmath/MultiplyInPlace.cpp +++ b/src/clmath/MultiplyInPlace.cpp @@ -19,31 +19,31 @@ using namespace std; #define VIRTUAL /// \brief calculates data *= multiplier -VIRTUAL void MultiplyInPlace::multiply( int N, float multiplier, CLWrapper *data ) { - StatefulTimer::instance()->timeCheck("MultiplyInPlace::multiply start" ); +VIRTUAL void MultiplyInPlace::multiply(int N, float multiplier, CLWrapper *data) { + StatefulTimer::instance()->timeCheck("MultiplyInPlace::multiply start"); - kernel ->in( N ) - ->in( multiplier ) - ->inout( data ); + kernel ->in(N) + ->in(multiplier) + ->inout(data); int globalSize = N; int workgroupSize = 64; - int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); + int numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; + kernel->run_1d(numWorkgroups * workgroupSize, workgroupSize); cl->finish(); - StatefulTimer::instance()->timeCheck("MultiplyInPlace::multiply end" ); + StatefulTimer::instance()->timeCheck("MultiplyInPlace::multiply end"); } VIRTUAL MultiplyInPlace::~MultiplyInPlace() { // delete kernel; } -MultiplyInPlace::MultiplyInPlace( EasyCL *cl ) : - cl( cl ) { +MultiplyInPlace::MultiplyInPlace(EasyCL *cl) : + cl(cl) { string options = ""; std::string kernelName = "copy.multiplyInplace"; - if( cl->kernelExists( kernelName ) ) { - this->kernel = cl->getKernel( kernelName ); + if(cl->kernelExists(kernelName) ) { + this->kernel = cl->getKernel(kernelName); // cout << "MultiplyInPlace kernel already built => reusing" << endl; return; } @@ -51,7 +51,7 @@ MultiplyInPlace::MultiplyInPlace( EasyCL *cl ) : // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/copy.cl", "multiplyInplace", 'options' ) + // stringify.write_kernel2("kernel", "cl/copy.cl", "multiplyInplace", 'options') // ]]] // generated using cog, from cl/copy.cl: const char * kernelSource = @@ -67,9 +67,9 @@ MultiplyInPlace::MultiplyInPlace( EasyCL *cl ) : "kernel void copy(\n" " const int N,\n" " global const float *in,\n" - " global float *out ) {\n" + " global float *out) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " out[globalId] = in[globalId];\n" @@ -80,9 +80,9 @@ MultiplyInPlace::MultiplyInPlace( EasyCL *cl ) : " global const float *in,\n" " const int inoffset,\n" " global float *out,\n" - " const int outoffset ) {\n" + " const int outoffset) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " out[globalId + outoffset] = in[globalId + inoffset];\n" @@ -92,9 +92,9 @@ MultiplyInPlace::MultiplyInPlace( EasyCL *cl ) : " const int N,\n" " const float multiplier,\n" " global const float *in,\n" - " global float *out ) {\n" + " global float *out) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " out[globalId] = multiplier * in[globalId];\n" @@ -103,9 +103,9 @@ MultiplyInPlace::MultiplyInPlace( EasyCL *cl ) : "kernel void multiplyInplace(\n" " const int N,\n" " const float multiplier,\n" - " global float *data ) {\n" + " global float *data) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " data[globalId] *= multiplier;\n" @@ -114,7 +114,7 @@ MultiplyInPlace::MultiplyInPlace( EasyCL *cl ) : ""; kernel = cl->buildKernelFromString( kernelSource, 
"multiplyInplace", options, "cl/copy.cl" ); // [[[end]]] - cl->storeKernel( kernelName, kernel, true ); + cl->storeKernel(kernelName, kernel, true); this->kernel = kernel; } // Copyright Hugh Perkins 2015 hughperkins at gmail diff --git a/src/clmath/MultiplyInPlace.h b/src/clmath/MultiplyInPlace.h index 57ca70c3..f282ce32 100644 --- a/src/clmath/MultiplyInPlace.h +++ b/src/clmath/MultiplyInPlace.h @@ -30,9 +30,9 @@ class MultiplyInPlace { // cog_addheaders.add() // ]]] // generated, using cog: - VIRTUAL void multiply( int N, float multiplier, CLWrapper *data ); + VIRTUAL void multiply(int N, float multiplier, CLWrapper *data); VIRTUAL ~MultiplyInPlace(); - MultiplyInPlace( EasyCL *cl ); + MultiplyInPlace(EasyCL *cl); // [[[end]]] }; diff --git a/src/conv/AddBias.cpp b/src/conv/AddBias.cpp index 4807b717..fa590570 100644 --- a/src/conv/AddBias.cpp +++ b/src/conv/AddBias.cpp @@ -19,30 +19,30 @@ using namespace std; VIRTUAL AddBias::~AddBias() { } VIRTUAL void AddBias::forward( - int batchSize, int numFilters, int outputImageSize, + int batchSize, int numFilters, int outputSize, CLWrapper *outputWrapper, CLWrapper *biasWrapper ) { StatefulTimer::timeCheck("AddBias::forward begin"); - kernel->in( batchSize * numFilters * outputImageSize * outputImageSize ) - ->in( numFilters ) - ->in( outputImageSize * outputImageSize ) - ->inout( outputWrapper )->in( biasWrapper ); - int globalSize = batchSize * numFilters * outputImageSize * outputImageSize; + kernel->in(batchSize * numFilters * outputSize * outputSize) + ->in(numFilters) + ->in(outputSize * outputSize) + ->inout(outputWrapper)->in(biasWrapper); + int globalSize = batchSize * numFilters * outputSize * outputSize; int workgroupSize = 64; - int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); + int numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; + kernel->run_1d(numWorkgroups * workgroupSize, workgroupSize); cl->finish(); StatefulTimer::timeCheck("AddBias::forward after repeatedAdd"); } -AddBias::AddBias( EasyCL *cl ) : - cl( cl ) +AddBias::AddBias(EasyCL *cl) : + cl(cl) { string kernelName = "AddBias.per_element_add"; - if( cl->kernelExists( kernelName ) ) { - this->kernel = cl->getKernel( kernelName ); + if(cl->kernelExists(kernelName) ) { + this->kernel = cl->getKernel(kernelName); return; } @@ -50,7 +50,7 @@ AddBias::AddBias( EasyCL *cl ) : // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/per_element_add.cl", "repeated_add", 'options' ) + // stringify.write_kernel2("kernel", "cl/per_element_add.cl", "repeated_add", 'options') // ]]] // generated using cog, from cl/per_element_add.cl: const char * kernelSource = @@ -60,9 +60,9 @@ AddBias::AddBias( EasyCL *cl ) : "// v. 2.0. 
If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "kernel void per_element_add( const int N, global float *target, global const float *source ) {\n" + "kernel void per_element_add(const int N, global float *target, global const float *source) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " target[globalId] += source[globalId];\n" @@ -70,26 +70,26 @@ AddBias::AddBias( EasyCL *cl ) : "\n" "// adds source to target\n" "// tiles source as necessary, according to tilingSize\n" - "kernel void per_element_tiled_add( const int N, const int tilingSize, global float *target, global const float *source ) {\n" + "kernel void per_element_tiled_add(const int N, const int tilingSize, global float *target, global const float *source) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " target[globalId] += source[globalId % tilingSize];\n" "}\n" "\n" - "kernel void repeated_add( const int N, const int sourceSize, const int repeatSize, global float *target, global const float *source ) {\n" + "kernel void repeated_add(const int N, const int sourceSize, const int repeatSize, global float *target, global const float *source) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" - " target[globalId] += source[ ( globalId / repeatSize ) % sourceSize ];\n" + " target[globalId] += source[ (globalId / repeatSize) % sourceSize ];\n" "}\n" "\n" ""; kernel = cl->buildKernelFromString( kernelSource, "repeated_add", options, "cl/per_element_add.cl" ); // [[[end]]] - cl->storeKernel( kernelName, kernel, true ); + cl->storeKernel(kernelName, kernel, true); } diff --git a/src/conv/AddBias.h b/src/conv/AddBias.h index 6e6cfabd..89d8028f 100644 --- a/src/conv/AddBias.h +++ b/src/conv/AddBias.h @@ -30,11 +30,11 @@ class AddBias { // generated, using cog: VIRTUAL ~AddBias(); VIRTUAL void forward( - int batchSize, int numFilters, int outputImageSize, + int batchSize, int numFilters, int outputSize, CLWrapper *outputWrapper, CLWrapper *biasWrapper ); - AddBias( EasyCL *cl ); + AddBias(EasyCL *cl); // [[[end]]] }; diff --git a/src/conv/BackpropWeights.cpp b/src/conv/BackpropWeights.cpp index 1bceb3ec..691a133f 100644 --- a/src/conv/BackpropWeights.cpp +++ b/src/conv/BackpropWeights.cpp @@ -15,6 +15,8 @@ #include "BackpropWeightsNaive.h" #include "BackpropWeightsScratch.h" #include "BackpropWeightsScratchLarge.h" +#include "BackpropWeightsIm2Col.h" +#include "BackpropWeightsAuto.h" using namespace std; @@ -24,72 +26,91 @@ using namespace std; #undef VIRTUAL #define VIRTUAL -BackpropWeights::BackpropWeights( EasyCL *cl, LayerDimensions layerDimensions ) : - cl( cl ), - dim( layerDimensions ), - debug( false ) { +BackpropWeights::BackpropWeights(EasyCL *cl, LayerDimensions layerDimensions) : + cl(cl), + dim(layerDimensions), + debug(false) { } -STATIC BackpropWeights *BackpropWeights::instance(EasyCL *cl, LayerDimensions dim ) { - if( dim.inputImageSize - dim.filterSize < 4 ) { - return new BackpropWeightsNaive( cl, dim ); +STATIC BackpropWeights *BackpropWeights::instance(EasyCL *cl, LayerDimensions dim) { + return new BackpropWeightsAuto(cl, dim); +// if(dim.inputSize - dim.filterSize < 4) { +// return new BackpropWeightsNaive(cl, dim); +// } +// if(square(dim.filterSize) <= cl->getMaxWorkgroupSize() +// && dim.inputSize <= 32) { // 
if inputimagesize too big, we run out of local memory +// return new BackpropWeightsScratch(cl, dim); +// } else if(square(dim.filterSize) <= cl->getMaxWorkgroupSize()) { +// return new BackpropWeightsScratchLarge(cl, dim); +// } else { +// return new BackpropWeightsNaive(cl, dim); +// } +} +STATIC int BackpropWeights::getNumImplementations() { + return 5; +} +STATIC bool BackpropWeights::plausiblyOptimal(int index, int batchSize, LayerDimensions dim) { + if(index == 0) { + return false; } - if( square( dim.filterSize ) <= cl->getMaxWorkgroupSize() - && dim.inputImageSize <= 32 ) { // if inputimagesize too big, we run out of local memory - return new BackpropWeightsScratch( cl, dim ); - } else if( square( dim.filterSize ) <= cl->getMaxWorkgroupSize() ) { - return new BackpropWeightsScratchLarge( cl, dim ); - } else { - return new BackpropWeightsNaive( cl, dim ); + if(index >= 5) { + return false; } + return true; } -STATIC BackpropWeights *BackpropWeights::instanceForTest(EasyCL *cl, LayerDimensions layerDimensions ) { - return new BackpropWeightsScratchLarge( cl, layerDimensions ); +STATIC BackpropWeights *BackpropWeights::instanceForTest(EasyCL *cl, LayerDimensions layerDimensions) { + return new BackpropWeightsScratchLarge(cl, layerDimensions); } -STATIC BackpropWeights *BackpropWeights::instanceSpecific( int idx, EasyCL *cl, LayerDimensions layerDimensions ) { - if( idx == 0 ) { - return new BackpropWeightsCpu( cl, layerDimensions ); +STATIC BackpropWeights *BackpropWeights::instanceSpecific(int idx, EasyCL *cl, LayerDimensions layerDimensions) { + if(idx == -1) { + return new BackpropWeightsAuto(cl, layerDimensions); + } + if(idx == 0) { + return new BackpropWeightsCpu(cl, layerDimensions); + } + if(idx == 1) { + return new BackpropWeightsNaive(cl, layerDimensions); } - if( idx == 1 ) { - return new BackpropWeightsNaive( cl, layerDimensions ); + if(idx == 2) { + return new BackpropWeightsScratch(cl, layerDimensions); } - if( idx == 2 ) { - return new BackpropWeightsScratch( cl, layerDimensions ); + if(idx == 3) { + return new BackpropWeightsScratchLarge(cl, layerDimensions); } - if( idx == 3 ) { - return new BackpropWeightsScratchLarge( cl, layerDimensions ); + if(idx == 4) { + return new BackpropWeightsIm2Col(cl, layerDimensions); } - throw std::runtime_error("BackpropWeights::instanceSpecific doesnt handle idx " + toString(idx) ); + throw std::runtime_error("BackpropWeights::instanceSpecific doesnt handle idx " + toString(idx)); } -VIRTUAL void BackpropWeights::calcGradWeights( int batchSize, float *gradOutput, float *inputs, float *gradWeights, float *gradBias ) { +VIRTUAL void BackpropWeights::calcGradWeights(int batchSize, float *gradOutput, float *inputs, float *gradWeights, float *gradBias) { StatefulTimer::timeCheck("BackpropWeights::backprop begin"); -// const float learningMultiplier = learningRate / batchSize / sqrt( dim.outputImageSize * dim.outputImageSize ); +// const float learningMultiplier = learningRate / batchSize / sqrt(dim.outputSize * dim.outputSize); - int outputSize = batchSize * dim.outputCubeSize; - CLWrapper *gradOutputWrapper = cl->wrap( outputSize, gradOutput ); + int outputNumElements = batchSize * dim.outputCubeSize; + CLWrapper *gradOutputWrapper = cl->wrap(outputNumElements, gradOutput); gradOutputWrapper->copyToDevice(); - int inputSize = batchSize * dim.inputCubeSize; - CLWrapper *inputDataWrapper = cl->wrap( inputSize, inputs ); + int inputNumElements = batchSize * dim.inputCubeSize; + CLWrapper *inputDataWrapper = cl->wrap(inputNumElements, 
inputs); inputDataWrapper->copyToDevice(); CLWrapper *gradWeightsWrapper = 0; - int gradWeightsSize = debug ? std::max(10000, dim.filtersSize ) : dim.filtersSize; - gradWeightsWrapper = cl->wrap( gradWeightsSize, gradWeights ); + int gradWeightsSize = debug ? std::max(10000, dim.filtersSize) : dim.filtersSize; + gradWeightsWrapper = cl->wrap(gradWeightsSize, gradWeights); gradWeightsWrapper->copyToDevice(); CLWrapper *gradBiasWrapper = 0; - if( dim.biased ) { - gradBiasWrapper = cl->wrap( dim.numFilters, gradBias ); + if(dim.biased) { + gradBiasWrapper = cl->wrap(dim.numFilters, gradBias); gradBiasWrapper->copyToDevice(); } StatefulTimer::timeCheck("BackpropWeights::backprop after copied to device"); - calcGradWeights( batchSize, gradOutputWrapper, inputDataWrapper, gradWeightsWrapper, gradBiasWrapper ); + calcGradWeights(batchSize, gradOutputWrapper, inputDataWrapper, gradWeightsWrapper, gradBiasWrapper); StatefulTimer::timeCheck("BackpropWeights::backprop after call backprop"); gradWeightsWrapper->copyToHost(); - if( dim.biased ) { + if(dim.biased) { gradBiasWrapper->copyToHost(); } StatefulTimer::timeCheck("BackpropWeights::backprop after copytohost"); @@ -97,13 +118,13 @@ VIRTUAL void BackpropWeights::calcGradWeights( int batchSize, float *gradOutput, delete gradOutputWrapper; delete inputDataWrapper; delete gradWeightsWrapper; - if( dim.biased ) { + if(dim.biased) { delete gradBiasWrapper; } } -float BackpropWeights::learningRateToMultiplier( int batchSize ) { -// float multiplier = rate / batchSize / sqrt( dim.outputImageSize ); +float BackpropWeights::learningRateToMultiplier(int batchSize) { +// float multiplier = rate / batchSize / sqrt(dim.outputSize); // float multiplier = rate; // std::cout << "rate " << rate << " multiplier " << multiplier << std::endl; return 1.0f; diff --git a/src/conv/BackpropWeights.h b/src/conv/BackpropWeights.h index e4691ad0..233a3cf3 100644 --- a/src/conv/BackpropWeights.h +++ b/src/conv/BackpropWeights.h @@ -24,19 +24,21 @@ class DeepCL_EXPORT BackpropWeights { bool debug; // = false; virtual ~BackpropWeights() {} - virtual void calcGradWeights( int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *inputsWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper ) = 0; + virtual void calcGradWeights(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *inputsWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper) = 0; // [[[cog // import cog_addheaders // cog_addheaders.add() // ]]] // generated, using cog: - BackpropWeights( EasyCL *cl, LayerDimensions layerDimensions ); - STATIC BackpropWeights *instance(EasyCL *cl, LayerDimensions dim ); - STATIC BackpropWeights *instanceForTest(EasyCL *cl, LayerDimensions layerDimensions ); - STATIC BackpropWeights *instanceSpecific( int idx, EasyCL *cl, LayerDimensions layerDimensions ); - VIRTUAL void calcGradWeights( int batchSize, float *gradOutput, float *inputs, float *gradWeights, float *gradBias ); - float learningRateToMultiplier( int batchSize ); + BackpropWeights(EasyCL *cl, LayerDimensions layerDimensions); + STATIC BackpropWeights *instance(EasyCL *cl, LayerDimensions dim); + STATIC int getNumImplementations(); + STATIC bool plausiblyOptimal(int index, int batchSize, LayerDimensions dim); + STATIC BackpropWeights *instanceForTest(EasyCL *cl, LayerDimensions layerDimensions); + STATIC BackpropWeights *instanceSpecific(int idx, EasyCL *cl, LayerDimensions layerDimensions); + VIRTUAL void calcGradWeights(int batchSize, float *gradOutput, float *inputs, float *gradWeights, float 
*gradBias);
+    float learningRateToMultiplier(int batchSize);
     // [[[end]]]
 };
diff --git a/src/conv/BackpropWeightsAuto.cpp b/src/conv/BackpropWeightsAuto.cpp
new file mode 100644
index 00000000..3935c389
--- /dev/null
+++ b/src/conv/BackpropWeightsAuto.cpp
@@ -0,0 +1,117 @@
+// Copyright Hugh Perkins 2014,2015 hughperkins at gmail
+//
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file, You can
+// obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <iostream>
+#include <string>
+#include <stdexcept>
+
+#include "conv/BackpropWeightsAuto.h"
+#include "util/stringhelper.h"
+#include "util/StatefulTimer.h"
+#include "util/Timer.h"
+
+using namespace std;
+
+#undef STATIC
+#define STATIC
+
+#undef VIRTUAL
+#define VIRTUAL
+
+BackpropWeightsAuto::BackpropWeightsAuto(EasyCL *cl, LayerDimensions dim) :
+        BackpropWeights(cl, dim),
+        milliseconds(0),
+        valid(0),
+        chosenIndex(-1),
+        instances(0)
+    {
+    num = BackpropWeights::getNumImplementations();
+    milliseconds = new int[num];
+    valid = new bool[num];
+    instances = new BackpropWeights *[num];
+    for(int i = 0; i < num; i++) {
+        instances[i] = 0;
+        valid[i] = false;
+        milliseconds[i] = -1;
+    }
+    nextIndex = 0;
+}
+VIRTUAL BackpropWeightsAuto::~BackpropWeightsAuto() {
+    for(int i = 0; i < num; i++) {
+        if(instances[i] != 0) {
+            delete instances[i];
+        }
+    }
+}
+VIRTUAL void BackpropWeightsAuto::calcGradWeights(
+        int batchSize, CLWrapper *inputDataWrapper, CLWrapper *gradOutput, CLWrapper *weightsWrapper,
+        CLWrapper *gradInput) {
+    while(chosenIndex == -1 && nextIndex < num) {
+        int thisIndex = nextIndex;
+        nextIndex++;
+        cout << "calcGradWeights try kernel " << thisIndex << endl;
+        if(BackpropWeights::plausiblyOptimal(thisIndex, batchSize, dim)) {
+            BackpropWeights *candidate = 0;
+            try {
+                candidate = BackpropWeights::instanceSpecific(thisIndex, cl, dim);
+                instances[thisIndex] = candidate;
+                valid[thisIndex] = true;
+                cout << " ... seems valid" << endl;
+            } catch(runtime_error &e) {
+                cout << StatefulTimer::instance()->prefix << "BackpropWeightsAuto: kernel " << thisIndex << ": this instance cant be used: " << e.what() << endl;
+                valid[thisIndex] = false;
+            }
+            if(valid[thisIndex]) {
+                Timer timer;
+                try {
+                    candidate->calcGradWeights(batchSize, inputDataWrapper, gradOutput, weightsWrapper, gradInput);
+                    milliseconds[thisIndex] = (int)timer.lap();
+                    cout << StatefulTimer::instance()->prefix << "BackpropWeightsAuto: kernel " << thisIndex << " " << milliseconds[thisIndex] << "ms" << endl;
+                    return;
+                } catch(runtime_error &e) {
+                    cout << StatefulTimer::instance()->prefix << "BackpropWeightsAuto: kernel " << thisIndex << " this instance cant be used: " << e.what() << endl;
+                    valid[thisIndex] = false;
+                    delete instances[thisIndex];
+                    instances[thisIndex] = 0;
+                }
+            } else {
+                cout << " ... not valid" << endl;
+            }
+        } else {
+            cout << " ... 
not plausibly optimal, skipping" << endl;
+        }
+    }
+    if(chosenIndex == -1) {
+//        cout << StatefulTimer::instance()->prefix + "BackpropWeightsAuto::calcGradWeights choosing best instance:" << endl;
+        int bestIndex = -1;
+        int bestTime = 0;
+        for(int i = 0; i < num; i++) {
+            if(!valid[i]) {
+                cout << "   calcGradWeights kernel " << i << ": cannot be used" << endl;
+                continue;
+            }
+            cout << "   calcGradWeights kernel " << i << " time: " << milliseconds[i] << "ms" << endl;
+            if(bestIndex == -1) {
+                bestIndex = i;
+                bestTime = milliseconds[i];
+                continue;
+            }
+            if(milliseconds[i] < bestTime) {
+                bestTime = milliseconds[i];
+                bestIndex = i;
+            }
+        }
+        if(bestIndex != -1) {
+            cout << "   calcGradWeights layer selected kernel " << bestIndex << endl;
+            this->chosenIndex = bestIndex;
+        } else {
+            throw runtime_error(StatefulTimer::instance()->prefix + "No valid calcGradWeights implementations found");
+        }
+    }
+//    cout << "BackpropWeightsAuto::calcGradWeights using instance index: " << chosenIndex << endl;
+    instances[chosenIndex]->calcGradWeights(batchSize, inputDataWrapper, gradOutput, weightsWrapper, gradInput);
+}
+
diff --git a/src/conv/BackpropWeightsAuto.h b/src/conv/BackpropWeightsAuto.h
new file mode 100644
index 00000000..0c6ddc1f
--- /dev/null
+++ b/src/conv/BackpropWeightsAuto.h
@@ -0,0 +1,57 @@
+// Copyright Hugh Perkins 2014 hughperkins at gmail
+//
+// This Source Code Form is subject to the terms of the Mozilla Public License,
+// v. 2.0. If a copy of the MPL was not distributed with this file, You can
+// obtain one at http://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "EasyCL.h"
+#include "activate/ActivationFunction.h"
+#include "conv/BackpropWeights.h"
+#include "conv/LayerDimensions.h"
+#include "DeepCLDllExport.h"
+
+using namespace std;
+
+//inline float square(float value) {
+//    return value * value;
+//}
+
+#define STATIC static
+#define VIRTUAL virtual
+
+class DeepCL_EXPORT BackpropWeightsAuto : public BackpropWeights {
+public:
+//    EasyCL *cl;
+//    LayerDimensions dim;
+//    ActivationFunction const*fn;
+
+    int num;
+    int *milliseconds;
+    bool *valid;
+    int chosenIndex;
+    BackpropWeights **instances;
+    int nextIndex;
+
+    // [[[cog
+    // import cog_addheaders
+    // cog_addheaders.add()
+    // ]]]
+    // generated, using cog:
+    BackpropWeightsAuto(EasyCL *cl, LayerDimensions dim);
+    VIRTUAL ~BackpropWeightsAuto();
+    VIRTUAL void calcGradWeights(
+        int batchSize, CLWrapper *inputDataWrapper, CLWrapper *gradOutput, CLWrapper *weightsWrapper,
+        CLWrapper *gradInput);
+
+    // [[[end]]]
+
+};
+
+
+
diff --git a/src/conv/BackpropWeightsByRow.cpp b/src/conv/BackpropWeightsByRow.cpp
index 4a7f4ae3..6b18c980 100644
--- a/src/conv/BackpropWeightsByRow.cpp
+++ b/src/conv/BackpropWeightsByRow.cpp
@@ -25,104 +25,104 @@ VIRTUAL BackpropWeightsByRow::~BackpropWeightsByRow() {
     delete reduce;
     delete perElementAdd;
 }
-VIRTUAL void BackpropWeightsByRow::backpropWeights( int batchSize, float learningRate, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper ) {
-    StatefulTimer::instance()->timeCheck("BackpropWeightsByRow start" );
+VIRTUAL void BackpropWeightsByRow::backpropWeights(int batchSize, float learningRate, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper) {
+    StatefulTimer::instance()->timeCheck("BackpropWeightsByRow start");
     cout << "input buffer:" << endl;
-    PrintBuffer::printFloats( cl, imagesWrapper, batchSize * dim.inputImageSize, dim.inputImageSize ); 
+ PrintBuffer::printFloats(cl, imagesWrapper, batchSize * dim.inputSize, dim.inputSize); cout << endl; cout << "errors buffer:" << endl; - PrintBuffer::printFloats( cl, gradOutputWrapper, batchSize * dim.outputImageSize, dim.outputImageSize ); + PrintBuffer::printFloats(cl, gradOutputWrapper, batchSize * dim.outputSize, dim.outputSize); cout << endl; int globalSize = workgroupSize * numWorkgroups; - globalSize = ( ( globalSize + workgroupSize - 1 ) / workgroupSize ) * workgroupSize; + globalSize = (( globalSize + workgroupSize - 1) / workgroupSize) * workgroupSize; - int localMemRequiredKB = ( dim.outputImageSize * 4 + dim.inputImageSize * 4 ) / 1024 + 1; - if( localMemRequiredKB >= cl->getLocalMemorySizeKB() ) { - throw runtime_error( "local memory too small to use this kernel on this device. Need: " + - toString( localMemRequiredKB ) + "KB, but only have: " + - toString( cl->getLocalMemorySizeKB() ) + "KB local memory" ); + int localMemRequiredKB = (dim.outputSize * 4 + dim.inputSize * 4) / 1024 + 1; + if(localMemRequiredKB >= cl->getLocalMemorySizeKB()) { + throw runtime_error("local memory too small to use this kernel on this device. Need: " + + toString(localMemRequiredKB) + "KB, but only have: " + + toString(cl->getLocalMemorySizeKB()) + "KB local memory"); } - const float learningMultiplier = learningRateToMultiplier( batchSize, learningRate ); + const float learningMultiplier = learningRateToMultiplier(batchSize, learningRate); - const int weights1Size = dim.filtersSize * dim.outputImageSize; + const int weights1Size = dim.filtersSize * dim.outputSize; float *weights1 = new float[ weights1Size ]; - CLWrapper *weights1Wrapper = cl->wrap( weights1Size, weights1 ); + CLWrapper *weights1Wrapper = cl->wrap(weights1Size, weights1); weights1Wrapper->createOnDevice(); float *bias1 = 0; CLWrapper *bias1Wrapper = 0; - if( dim.biased ) { - const int bias1Size = dim.numFilters * dim.outputImageSize; + if(dim.biased) { + const int bias1Size = dim.numFilters * dim.outputSize; bias1 = new float[ bias1Size ]; - bias1Wrapper = cl->wrap( bias1Size, bias1 ); + bias1Wrapper = cl->wrap(bias1Size, bias1); bias1Wrapper->createOnDevice(); } float *weights2 = new float[ dim.filtersSize ]; - CLWrapper *weights2Wrapper = cl->wrap( dim.filtersSize, weights2 ); + CLWrapper *weights2Wrapper = cl->wrap(dim.filtersSize, weights2); weights2Wrapper->createOnDevice(); float *bias2 = 0; CLWrapper *bias2Wrapper = 0; - if( dim.biased ) { + if(dim.biased) { bias2 = new float[ dim.numFilters ]; - bias2Wrapper = cl->wrap( dim.numFilters, bias2 ); + bias2Wrapper = cl->wrap(dim.numFilters, bias2); bias2Wrapper->createOnDevice(); } - StatefulTimer::instance()->timeCheck("BackpropWeightsByRow allocated buffers and wrappers" ); + StatefulTimer::instance()->timeCheck("BackpropWeightsByRow allocated buffers and wrappers"); kernel ->in(learningMultiplier) - ->in( batchSize ) - ->in( gradOutputWrapper ) - ->in( imagesWrapper ) - ->out( weights1Wrapper ); - if( dim.biased ) { - kernel->out( bias1Wrapper ); + ->in(batchSize) + ->in(gradOutputWrapper) + ->in(imagesWrapper) + ->out(weights1Wrapper); + if(dim.biased) { + kernel->out(bias1Wrapper); } kernel - ->localFloats( dim.outputImageSize ) - ->localFloats( dim.inputImageSize ); + ->localFloats(dim.outputSize) + ->localFloats(dim.inputSize); kernel->run_1d(globalSize, workgroupSize); cl->finish(); cout << "weights1wrapper after first kernel:" << endl; - PrintBuffer::printFloats( cl, weights1Wrapper, dim.filterSize * dim.outputImageSize, dim.filterSize ); + 
PrintBuffer::printFloats(cl, weights1Wrapper, dim.filterSize * dim.outputSize, dim.filterSize); cout << endl; - reduce->in( dim.filtersSize )->in( dim.outputImageSize ) - ->in( weights1Wrapper )->out( weights2Wrapper ); - reduce->run_1d( ( dim.filtersSize + 64 - 1 ) / 64 * 64, 64 ); - if( dim.biased ) { - reduce->in( dim.numFilters )->in( dim.outputImageSize ) - ->in( bias1Wrapper )->out( bias2Wrapper ); - reduce->run_1d( ( dim.numFilters + 64 - 1 ) / 64 * 64, 64 ); + reduce->in(dim.filtersSize)->in(dim.outputSize) + ->in(weights1Wrapper)->out(weights2Wrapper); + reduce->run_1d(( dim.filtersSize + 64 - 1) / 64 * 64, 64); + if(dim.biased) { + reduce->in(dim.numFilters)->in(dim.outputSize) + ->in(bias1Wrapper)->out(bias2Wrapper); + reduce->run_1d(( dim.numFilters + 64 - 1) / 64 * 64, 64); } cl->finish(); - PrintBuffer::printFloats( cl, weights2Wrapper, dim.filterSize, dim.filterSize ); + PrintBuffer::printFloats(cl, weights2Wrapper, dim.filterSize, dim.filterSize); - PrintBuffer::printFloats( cl, weightsWrapper, dim.filterSize, dim.filterSize ); + PrintBuffer::printFloats(cl, weightsWrapper, dim.filterSize, dim.filterSize); - perElementAdd->in( dim.filtersSize )->inout( weightsWrapper )->in( weights2Wrapper ); - perElementAdd->run_1d( ( dim.filtersSize + 64 - 1 ) / 64 * 64, 64 ); + perElementAdd->in(dim.filtersSize)->inout(weightsWrapper)->in(weights2Wrapper); + perElementAdd->run_1d(( dim.filtersSize + 64 - 1) / 64 * 64, 64); - if( dim.biased ) { - perElementAdd->in( dim.numFilters )->inout( biasWrapper )->in( bias2Wrapper ); - perElementAdd->run_1d( ( dim.numFilters + 64 - 1 ) / 64 * 64, 64 ); + if(dim.biased) { + perElementAdd->in(dim.numFilters)->inout(biasWrapper)->in(bias2Wrapper); + perElementAdd->run_1d(( dim.numFilters + 64 - 1) / 64 * 64, 64); } cl->finish(); - PrintBuffer::printFloats( cl, weightsWrapper, dim.filterSize, dim.filterSize ); + PrintBuffer::printFloats(cl, weightsWrapper, dim.filterSize, dim.filterSize); - if( dim.biased ) { + if(dim.biased) { delete bias2Wrapper; delete bias1Wrapper; delete[] bias2; @@ -133,15 +133,15 @@ VIRTUAL void BackpropWeightsByRow::backpropWeights( int batchSize, float learnin delete[] weights2; delete[] weights1; - StatefulTimer::instance()->timeCheck("BackpropWeightsByRow end" ); + StatefulTimer::instance()->timeCheck("BackpropWeightsByRow end"); } -BackpropWeightsByRow::BackpropWeightsByRow( EasyCL *cl, LayerDimensions dim ) : - BackpropWeights( cl, dim ) +BackpropWeightsByRow::BackpropWeightsByRow(EasyCL *cl, LayerDimensions dim) : + BackpropWeights(cl, dim) { - workgroupSize = std::max( 32, dim.filterSize ); // no point in wasting cores... - numWorkgroups = dim.inputPlanes * dim.numFilters * dim.outputImageSize; + workgroupSize = std::max(32, dim.filterSize); // no point in wasting cores... 
+ numWorkgroups = dim.inputPlanes * dim.numFilters * dim.outputSize; cout << "numWorkgroups " << numWorkgroups << " workgropuSize=" << workgroupSize << endl; - if( workgroupSize > cl->getMaxWorkgroupSize() ) { + if(workgroupSize > cl->getMaxWorkgroupSize()) { throw runtime_error("filtersize larger than maxworkgroupsize, so cannot use BackpropWeightsByRow kernel"); } @@ -149,9 +149,9 @@ BackpropWeightsByRow::BackpropWeightsByRow( EasyCL *cl, LayerDimensions dim ) : // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/backpropweights_byrow.cl", "backprop_weights", 'options' ) - // stringify.write_kernel2( "reduce", "cl/reduce_segments.cl", "reduce_segments", '""' ) - // stringify.write_kernel2( "perElementAdd", "cl/per_element_add.cl", "per_element_add", '""' ) + // stringify.write_kernel2("kernel", "cl/backpropweights_byrow.cl", "backprop_weights", 'options') + // stringify.write_kernel2("reduce", "cl/reduce_segments.cl", "reduce_segments", '""') + // stringify.write_kernel2("perElementAdd", "cl/per_element_add.cl", "per_element_add", '""') // ]]] // generated using cog, from cl/backpropweights_byrow.cl: const char * kernelSource = @@ -180,20 +180,20 @@ BackpropWeightsByRow::BackpropWeightsByRow( EasyCL *cl, LayerDimensions dim ) : "// localid: [filterRow][filterCol]\n" "// weightChanges1: [outputPlane][inputPlane][filterRow][filterCol][outputRow]\n" "// gradBiasWeights1: [outputPlane][outputRow]\n" - "kernel void backprop_weights( const float learningRateMultiplier, const int batchSize,\n" + "kernel void backprop_weights(const float learningRateMultiplier, const int batchSize,\n" " global float const *gradOutput, global float const *input, global float *restrict gradWeights1,\n" " #ifdef BIASED\n" " global float *restrict gradBiasWeights1,\n" " #endif\n" - " local float *restrict _errorRow, local float *restrict _inputRow ) {\n" - " #define globalId ( get_global_id(0) )\n" - " #define workgroupId ( get_group_id(0) )\n" - " #define localId ( get_local_id(0) )\n" + " local float *restrict _errorRow, local float *restrict _inputRow) {\n" + " #define globalId (get_global_id(0))\n" + " #define workgroupId (get_group_id(0))\n" + " #define localId (get_local_id(0))\n" "\n" " const int filterRow = localId / gFilterSize;\n" " const int filterCol = localId % gFilterSize;\n" - " const int outputRow = workgroupId % gOutputImageSize;\n" - " #define outInCombo ( workgroupId / gOutputImageSize )\n" + " const int outputRow = workgroupId % gOutputSize;\n" + " #define outInCombo (workgroupId / gOutputSize)\n" " const int outputPlane = outInCombo / gNumInputPlanes;\n" " const int inputPlane = outInCombo % gNumInputPlanes;\n" "\n" @@ -203,35 +203,35 @@ BackpropWeightsByRow::BackpropWeightsByRow( EasyCL *cl, LayerDimensions dim ) : " #ifdef BIASED\n" " float thisbiaschange = 0.0f;\n" " #endif\n" - " for( int n = 0; n < batchSize; n++ ) {\n" + " for (int n = 0; n < batchSize; n++) {\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " // copy down the gradOutput row...\n" " {\n" " global float const*gradOutputRow = gradOutput +\n" - " ( ( n\n" - " * gNumOutputPlanes + outputPlane )\n" - " * gOutputImageSize + outputRow )\n" - " * gOutputImageSize;\n" - " if( localId < gOutputImageSize ) { // assume we have enough threads for now... should fix later\n" + " (( n\n" + " * gNumOutputPlanes + outputPlane)\n" + " * gOutputSize + outputRow)\n" + " * gOutputSize;\n" + " if (localId < gOutputSize) { // assume we have enough threads for now... 
should fix later\n" " _errorRow[ localId ] = gradOutputRow[ localId ];\n" " }\n" " }\n" " // copy down the input row\n" " {\n" " global float const*inputRowData = input +\n" - " ( ( n\n" - " * gNumInputPlanes + inputPlane )\n" - " * gInputImageSize + thisInputRow )\n" - " * gInputImageSize;\n" - " if( localId < gInputImageSize ) { // assume we have enough threads for now... should fix later\n" + " (( n\n" + " * gNumInputPlanes + inputPlane)\n" + " * gInputSize + thisInputRow)\n" + " * gInputSize;\n" + " if (localId < gInputSize) { // assume we have enough threads for now... should fix later\n" " _inputRow[ localId ] = inputRowData[ localId ];\n" " }\n" " }\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - " for( int outputCol = 0; outputCol < gOutputImageSize; outputCol++ ) {\n" + " for (int outputCol = 0; outputCol < gOutputSize; outputCol++) {\n" " const int inputCol = outputCol - gMargin + filterCol;\n" - " if( inputRow >= 0 && inputRow < gInputImageSize && inputCol >= 0 && inputCol < gInputImageSize ) {\n" - " if( localId < gFilterSizeSquared ) {\n" + " if (inputRow >= 0 && inputRow < gInputSize && inputCol >= 0 && inputCol < gInputSize) {\n" + " if (localId < gFilterSizeSquared) {\n" " thiswchange += _inputRow[ inputCol ] * _errorRow[ outputCol ];\n" " #ifdef BIASED\n" " thisbiaschange += _errorRow[ outputCol ];\n" @@ -241,21 +241,21 @@ BackpropWeightsByRow::BackpropWeightsByRow( EasyCL *cl, LayerDimensions dim ) : " }\n" " }\n" "\n" - " if( workgroupId == 0 && localId == 0 ) {\n" + " if (workgroupId == 0 && localId == 0) {\n" " gradWeights1[0] = _inputRow[0];\n" " gradWeights1[1] = _inputRow[1];\n" " }\n" "\n" - " if( localId < gFilterSizeSquared ) {\n" - " #define weightsIndex ( ( ( outInCombo \\\n" - " * gFilterSizeSquared ) + localId \\\n" - " * gOutputImageSize ) + outputRow )\n" + " if (localId < gFilterSizeSquared) {\n" + " #define weightsIndex (( (outInCombo \\\n" + " * gFilterSizeSquared) + localId \\\n" + " * gOutputSize) + outputRow)\n" " //gradWeights1[ weightsIndex ] -= learningRateMultiplier * thiswchange;\n" " //gradWeights1[weightsIndex] = 123.0f;\n" " }\n" " #ifdef BIASED\n" - " if( inputPlane == 0 && localId == 0 ) {\n" - " gradBiasWeights1[outputPlane * gOutputImageSize + outputRow ] = learningRateMultiplier * thisbiaschange;\n" + " if (inputPlane == 0 && localId == 0) {\n" + " gradBiasWeights1[outputPlane * gOutputSize + outputRow ] = learningRateMultiplier * thisbiaschange;\n" " }\n" " #endif\n" "}\n" @@ -270,18 +270,18 @@ BackpropWeightsByRow::BackpropWeightsByRow( EasyCL *cl, LayerDimensions dim ) : "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "kernel void reduce_segments( const int numSegments, const int segmentLength,\n" - " global float const *in, global float* out ) {\n" + "kernel void reduce_segments(const int numSegments, const int segmentLength,\n" + " global float const *in, global float* out) {\n" " const int globalId = get_global_id(0);\n" " const int segmentId = globalId;\n" "\n" - " if( segmentId >= numSegments ) {\n" + " if (segmentId >= numSegments) {\n" " return;\n" " }\n" "\n" " float sum = 0;\n" " global const float *segment = in + segmentId * segmentLength;\n" - " for( int i = 0; i < segmentLength; i++ ) {\n" + " for (int i = 0; i < segmentLength; i++) {\n" " sum += segment[i];\n" " }\n" " out[segmentId] = sum;\n" @@ -298,9 +298,9 @@ BackpropWeightsByRow::BackpropWeightsByRow( EasyCL *cl, LayerDimensions dim ) : "// v. 2.0. 
If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "kernel void per_element_add( const int N, global float *target, global const float *source ) {\n" + "kernel void per_element_add(const int N, global float *target, global const float *source) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " target[globalId] += source[globalId];\n" @@ -308,20 +308,20 @@ BackpropWeightsByRow::BackpropWeightsByRow( EasyCL *cl, LayerDimensions dim ) : "\n" "// adds source to target\n" "// tiles source as necessary, according to tilingSize\n" - "kernel void per_element_tiled_add( const int N, const int tilingSize, global float *target, global const float *source ) {\n" + "kernel void per_element_tiled_add(const int N, const int tilingSize, global float *target, global const float *source) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " target[globalId] += source[globalId % tilingSize];\n" "}\n" "\n" - "kernel void repeated_add( const int N, const int sourceSize, const int repeatSize, global float *target, global const float *source ) {\n" + "kernel void repeated_add(const int N, const int sourceSize, const int repeatSize, global float *target, global const float *source) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" - " target[globalId] += source[ ( globalId / repeatSize ) % sourceSize ];\n" + " target[globalId] += source[ (globalId / repeatSize) % sourceSize ];\n" "}\n" "\n" ""; diff --git a/src/conv/BackpropWeightsByRow.h b/src/conv/BackpropWeightsByRow.h index e13c6910..72043811 100644 --- a/src/conv/BackpropWeightsByRow.h +++ b/src/conv/BackpropWeightsByRow.h @@ -20,8 +20,8 @@ class BackpropWeightsByRow : public BackpropWeights { // ]]] // generated, using cog: VIRTUAL ~BackpropWeightsByRow(); - VIRTUAL void backpropWeights( int batchSize, float learningRate, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper ); - BackpropWeightsByRow( EasyCL *cl, LayerDimensions dim ); + VIRTUAL void backpropWeights(int batchSize, float learningRate, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper); + BackpropWeightsByRow(EasyCL *cl, LayerDimensions dim); // [[[end]]] }; diff --git a/src/conv/BackpropWeightsCpu.cpp b/src/conv/BackpropWeightsCpu.cpp index 833fe4b7..d8b2a318 100644 --- a/src/conv/BackpropWeightsCpu.cpp +++ b/src/conv/BackpropWeightsCpu.cpp @@ -16,68 +16,68 @@ using namespace std; #undef VIRTUAL #define VIRTUAL -BackpropWeightsCpu::BackpropWeightsCpu( EasyCL *cl, LayerDimensions dim ) : - BackpropWeights( cl, dim ) +BackpropWeightsCpu::BackpropWeightsCpu(EasyCL *cl, LayerDimensions dim) : + BackpropWeights(cl, dim) { } VIRTUAL BackpropWeightsCpu::~BackpropWeightsCpu() { } -VIRTUAL void BackpropWeightsCpu::calcGradWeights( int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper ) { +VIRTUAL void BackpropWeightsCpu::calcGradWeights(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper) { gradOutputWrapper->copyToHost(); imagesWrapper->copyToHost(); float *gradBias = 0; - if( dim.biased ) { + if(dim.biased) { gradBiasWrapper->copyToHost(); gradBias 
= (float *)gradBiasWrapper->getHostArray(); } - calcGradWeights( batchSize, (float *)gradOutputWrapper->getHostArray(), (float *)imagesWrapper->getHostArray(), - (float *)gradWeightsWrapper->getHostArray(), gradBias ); + calcGradWeights(batchSize, (float *)gradOutputWrapper->getHostArray(), (float *)imagesWrapper->getHostArray(), + (float *)gradWeightsWrapper->getHostArray(), gradBias); gradWeightsWrapper->copyToDevice(); - if( dim.biased ) { + if(dim.biased) { gradBiasWrapper->copyToDevice(); } } -VIRTUAL void BackpropWeightsCpu::calcGradWeights( int batchSize, float *gradOutput, - float *inputs, float *gradWeights, float *gradBias ) { +VIRTUAL void BackpropWeightsCpu::calcGradWeights(int batchSize, float *gradOutput, + float *inputs, float *gradWeights, float *gradBias) { - StatefulTimer::instance()->timeCheck(" calcGradWeightsCpu start" ); + StatefulTimer::instance()->timeCheck(" calcGradWeightsCpu start"); - const float learningMultiplier = learningRateToMultiplier( batchSize ); + const float learningMultiplier = learningRateToMultiplier(batchSize); const int halfFilterSize = dim.filterSize >> 1; const int margin = dim.padZeros ? halfFilterSize : 0; - for( int outPlane = 0; outPlane < dim.numFilters; outPlane++ ) { - for( int inputPlane = 0; inputPlane < dim.inputPlanes; inputPlane++ ) { - for( int filterRow = 0; filterRow < dim.filterSize; filterRow++ ) { - for( int filterCol = 0; filterCol dim.inputImageSize - 1 ) { + if(inputRow < 0 || inputRow > dim.inputSize - 1) { continue; } - for( int outCol = 0; outCol < dim.outputImageSize; outCol++ ) { + for(int outCol = 0; outCol < dim.outputSize; outCol++) { int inputCol = outCol - margin + filterCol; - if( inputCol < 0 || inputCol > dim.inputImageSize - 1 ) { + if(inputCol < 0 || inputCol > dim.inputSize - 1) { continue; } - for( int n = 0; n < batchSize; n++ ) { - int outputIndex = ( ( n - * dim.numFilters + outPlane ) - * dim.outputImageSize + outRow ) - * dim.outputImageSize + outCol; + for(int n = 0; n < batchSize; n++) { + int outputIndex = (( n + * dim.numFilters + outPlane) + * dim.outputSize + outRow) + * dim.outputSize + outCol; float gradOutputValue = gradOutput[outputIndex]; - int inputIndex = ( ( n - * dim.inputPlanes + inputPlane ) - * dim.inputImageSize + inputRow ) - * dim.inputImageSize + inputCol; + int inputIndex = (( n + * dim.inputPlanes + inputPlane) + * dim.inputSize + inputRow) + * dim.inputSize + inputCol; float inputValue = inputs[ inputIndex ]; thiswchange += gradOutputValue * inputValue; thisBiasChange += gradOutputValue; // fairly sure this is right. 
Fairly :-P @@ -86,8 +86,8 @@ VIRTUAL void BackpropWeightsCpu::calcGradWeights( int batchSize, float *gradOutp } // cout << "weight change " << weightIndex << " " << learningMultiplier * thiswchange << endl; gradWeights[ weightIndex ] = thiswchange * learningMultiplier; - if( dim.biased ) { - if( filterRow == margin && filterCol == margin && inputPlane == 0 ) { + if(dim.biased) { + if(filterRow == margin && filterCol == margin && inputPlane == 0) { gradBias[ outPlane ] = learningMultiplier * thisBiasChange; } } @@ -95,6 +95,6 @@ VIRTUAL void BackpropWeightsCpu::calcGradWeights( int batchSize, float *gradOutp } } } - StatefulTimer::instance()->timeCheck(" calcGradWeightsCpu end" ); + StatefulTimer::instance()->timeCheck(" calcGradWeightsCpu end"); } diff --git a/src/conv/BackpropWeightsCpu.h b/src/conv/BackpropWeightsCpu.h index 67f787c9..5e961737 100644 --- a/src/conv/BackpropWeightsCpu.h +++ b/src/conv/BackpropWeightsCpu.h @@ -19,11 +19,11 @@ class BackpropWeightsCpu : public BackpropWeights { // cog_addheaders.add() // ]]] // generated, using cog: - BackpropWeightsCpu( EasyCL *cl, LayerDimensions dim ); + BackpropWeightsCpu(EasyCL *cl, LayerDimensions dim); VIRTUAL ~BackpropWeightsCpu(); - VIRTUAL void calcGradWeights( int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper ); - VIRTUAL void calcGradWeights( int batchSize, float *gradOutput, - float *inputs, float *gradWeights, float *gradBias ); + VIRTUAL void calcGradWeights(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper); + VIRTUAL void calcGradWeights(int batchSize, float *gradOutput, + float *inputs, float *gradWeights, float *gradBias); // [[[end]]] }; diff --git a/src/conv/BackpropWeightsIm2Col.cpp b/src/conv/BackpropWeightsIm2Col.cpp new file mode 100644 index 00000000..6814af7d --- /dev/null +++ b/src/conv/BackpropWeightsIm2Col.cpp @@ -0,0 +1,115 @@ +#include "EasyCL.h" +#include "util/stringhelper.h" +#include "util/StatefulTimer.h" + +#include +#include +#include + +//#include "clblas/ClBlasInstance.h" +#include "clblas/ClBlasHelper.h" +#include "BackpropWeightsIm2Col.h" +#include "conv/Im2Col.h" +#include "clmath/CLMathWrapper.h" + +using namespace std; + +#undef STATIC +#define STATIC + +#undef VIRTUAL +#define VIRTUAL + +#define PUBLIC + +PUBLIC BackpropWeightsIm2Col::BackpropWeightsIm2Col(EasyCL *cl, LayerDimensions dim) : + BackpropWeights(cl, dim) + { +// ClBlasInstance::initializeIfNecessary(); + +// addBias = new AddBias(cl); + + this->im2Col = new Im2Col(cl, dim); +} +PUBLIC VIRTUAL BackpropWeightsIm2Col::~BackpropWeightsIm2Col() { + delete im2Col; +// delete addBias; +} +//int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper +PUBLIC VIRTUAL void BackpropWeightsIm2Col::calcGradWeights(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *inputWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper) { + StatefulTimer::timeCheck("BackpropWeightsIm2Col::calcGradWeights START"); + + int columnsSize = dim.inputPlanes * dim.filterSizeSquared * dim.outputSizeSquared; + float *columns = new float[columnsSize]; + CLWrapper *columnsWrapper = cl->wrap(columnsSize, columns); + columnsWrapper->createOnDevice(); + + int onesSize = dim.outputSizeSquared; + float *ones = new float[onesSize]; + CLWrapper *onesWrapper = cl->wrap(onesSize, ones); + onesWrapper->createOnDevice(); + CLMathWrapper 
ones_(onesWrapper); + ones_ = 1.0f; + +// cout << "gradColumnsSize: " << gradColumnsSize << endl; +// cout << "weightsize: " << weightsWrapper->size() << endl; + + StatefulTimer::timeCheck("BackpropWeightsIm2Col::calcGradWeights after alloc"); + + CLMathWrapper gradWeights_(gradWeightsWrapper); + gradWeights_ = 0.0f; + if(dim.biased) { + CLMathWrapper gradBias_(gradBiasWrapper); + gradBias_ = 0.0f; + } + for (int b = 0; b < batchSize; b ++) { +// cout << "b=" << b << " numkernels=" << numKernels << endl; + + im2Col->im2Col( + inputWrapper, b * dim.inputCubeSize, + columnsWrapper + ); + + int64 m = dim.inputPlanes * dim.filterSizeSquared; + int64 n = dim.numFilters; + int64 k = dim.outputSizeSquared; + + ClBlasHelper::Gemm( + cl, + clblasColumnMajor, + clblasTrans, clblasNoTrans, + m, k, n, + 1, + columnsWrapper, 0, + gradOutputWrapper, b * dim.outputCubeSize, + 1, + gradWeightsWrapper, 0 + ); + if(dim.biased) { + int64 m_ = dim.outputSizeSquared; + int64 n_ = dim.numFilters; + ClBlasHelper::Gemv( + cl, + clblasColumnMajor, + clblasTrans, + m_, n_, + 1, + gradOutputWrapper, b * dim.outputCubeSize, + onesWrapper, 0, + 1, + gradBiasWrapper, 0 + ); + } + } + + delete onesWrapper; + delete[] ones; + + delete columnsWrapper; + delete[] columns; + + StatefulTimer::timeCheck("BackpropWeightsIm2Col::calcGradWeights after call calcGradWeights"); + + StatefulTimer::timeCheck("BackpropWeightsIm2Col::calcGradWeights END"); +} + diff --git a/src/conv/BackpropWeightsIm2Col.h b/src/conv/BackpropWeightsIm2Col.h new file mode 100644 index 00000000..b5f0d9e7 --- /dev/null +++ b/src/conv/BackpropWeightsIm2Col.h @@ -0,0 +1,38 @@ +#pragma once + +#include "BackpropWeights.h" +//#include "EasyCL.h" + +#include "DeepCLDllExport.h" + +class Im2Col; +class CLWrapper; +class EasyCL; +class CLKernel; + +#define STATIC static +#define VIRTUAL virtual + +class DeepCL_EXPORT BackpropWeightsIm2Col : public BackpropWeights { + private: +// CLKernel *kernelIm2Col; + Im2Col *im2Col; + + float *columns; + CLWrapper *columnsWrapper; + int numKernels; + + // [[[cog + // import cog_addheaders + // cog_addheaders.addv2() + // ]]] + // generated, using cog: + + public: + BackpropWeightsIm2Col(EasyCL *cl, LayerDimensions dim); + VIRTUAL ~BackpropWeightsIm2Col(); + VIRTUAL void calcGradWeights(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *inputWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper); + + // [[[end]]] +}; +
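(The BackpropWeightsIm2Col path above recasts the weight gradient as matrix algebra: Im2Col unrolls each input cube into a matrix of patches with one column per output position, so the accumulation over all output positions collapses into one clBLAS GEMM per batch item, and the bias gradient reduces to a GEMV against the vector of ones. A minimal CPU sketch of the same contraction follows; it is illustrative only, uses hypothetical names, and ignores the column-major storage details the real Gemm call handles.)

// Sketch: gradWeights(f, k) += sum_s gradOut(f, s) * columns(k, s) for one batch item,
// where K = inputPlanes * filterSize^2, S = outputSize^2, F = numFilters.
// columns and gradOut are taken as row-major here for clarity.
void gradWeightsViaIm2col(int K, int S, int F,
        const float *columns, const float *gradOut, float *gradWeights) {
    for (int f = 0; f < F; f++) {
        for (int k = 0; k < K; k++) {
            float sum = 0;
            for (int s = 0; s < S; s++) {
                sum += gradOut[f * S + s] * columns[k * S + s];
            }
            // beta=1 in the Gemm call: gradients accumulate across the batch
            gradWeights[f * K + k] += sum;
        }
    }
}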
diff --git a/src/conv/BackpropWeightsNaive.cpp b/src/conv/BackpropWeightsNaive.cpp index df6e180e..7cafce5c 100644 --- a/src/conv/BackpropWeightsNaive.cpp +++ b/src/conv/BackpropWeightsNaive.cpp @@ -20,38 +20,38 @@ VIRTUAL BackpropWeightsNaive::~BackpropWeightsNaive() { // cout << "~backpropgradWeights2naive: deleting kernel" << endl; delete kernel; } -VIRTUAL void BackpropWeightsNaive::calcGradWeights( int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper ) { - StatefulTimer::instance()->timeCheck("BackpropWeightsNaive start" ); +VIRTUAL void BackpropWeightsNaive::calcGradWeights(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper) { + StatefulTimer::instance()->timeCheck("BackpropWeightsNaive start"); - const float learningMultiplier = learningRateToMultiplier( batchSize ); + const float learningMultiplier = learningRateToMultiplier(batchSize); kernel ->in(learningMultiplier) - ->in( batchSize ) - ->in( gradOutputWrapper ) - ->in( imagesWrapper ) - ->inout( gradWeightsWrapper ); - if( dim.biased ) { - kernel->inout( gradBiasWrapper ); + ->in(batchSize) + ->in(gradOutputWrapper) + ->in(imagesWrapper) + ->inout(gradWeightsWrapper); + if(dim.biased) { + kernel->inout(gradBiasWrapper); } int globalSize = dim.filtersSize; int workgroupsize = cl->getMaxWorkgroupSize(); - globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize; + globalSize = ((globalSize + workgroupsize - 1) / workgroupsize) * workgroupsize; kernel->run_1d(globalSize, workgroupsize); cl->finish(); - StatefulTimer::instance()->timeCheck("BackpropWeightsNaive end" ); + StatefulTimer::instance()->timeCheck("BackpropWeightsNaive end"); } -BackpropWeightsNaive::BackpropWeightsNaive( EasyCL *cl, LayerDimensions dim ) : - BackpropWeights( cl, dim ) +BackpropWeightsNaive::BackpropWeightsNaive(EasyCL *cl, LayerDimensions dim) : + BackpropWeights(cl, dim) { std::string options = dim.buildOptionsString(); // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/backpropweights.cl", "backprop_floats", 'options' ) + // stringify.write_kernel2("kernel", "cl/backpropweights.cl", "backprop_floats", 'options') // ]]] // generated using cog, from cl/backpropweights.cl: const char * kernelSource = @@ -66,7 +66,7 @@ BackpropWeightsNaive::BackpropWeightsNaive( EasyCL *cl, LayerDimensions dim ) : "\n" "// globalId: [outPlane][inputPlane][filterRow][filterCol]\n" "// per-thread iteration: [n][outputRow][outputCol]\n" - "void kernel backprop_floats( const float learningRateMultiplier,\n" + "void kernel backprop_floats(const float learningRateMultiplier,\n" " const int batchSize,\n" " global const float *gradOutput, global const float *images,\n" " global float *gradWeights\n" @@ -75,7 +75,7 @@ BackpropWeightsNaive::BackpropWeightsNaive( EasyCL *cl, LayerDimensions dim ) : " #endif\n" " ) {\n" " int globalId = get_global_id(0);\n" - " if( globalId >= gNumFilters * gInputPlanes * gFilterSize * gFilterSize ) {\n" + " if (globalId >= gNumFilters * gInputPlanes * gFilterSize * gFilterSize) {\n" " return;\n" " }\n" "\n" @@ -93,22 +93,22 @@ BackpropWeightsNaive::BackpropWeightsNaive( EasyCL *cl, LayerDimensions dim ) : "#ifdef BIASED\n" " float thisbiaschange = 0;\n" "#endif\n" - " for( int n = 0; n < batchSize; n++ ) {\n" - " for( int outRow = 0; outRow < gOutputImageSize; outRow++ ) {\n" + " for (int n = 0; n < batchSize; n++) {\n" + " for (int outRow = 0; outRow < gOutputSize; outRow++) {\n" " int upstreamRow = outRow - gMargin + filterRow;\n" - " for( int outCol = 0; outCol < gOutputImageSize; outCol++ ) {\n" + " for (int outCol = 0; outCol < gOutputSize; outCol++) {\n" " int upstreamCol = outCol - gMargin + filterCol;\n" - " bool proceed = upstreamRow >= 0 && upstreamCol >= 0 && upstreamRow < gInputImageSize\n" - " && upstreamCol < gInputImageSize;\n" - " if( proceed ) {\n" - " int resultIndex = ( ( n * gNumFilters\n" - " + outPlane ) * gOutputImageSize\n" - " + outRow ) * gOutputImageSize\n" + " bool proceed = upstreamRow >= 0 && upstreamCol >= 0 && upstreamRow < gInputSize\n" + " && upstreamCol < gInputSize;\n" + " if (proceed) {\n" + " int resultIndex = (( n * gNumFilters\n" + " + outPlane) * gOutputSize\n" + " + outRow) * gOutputSize\n" " + outCol;\n" " float error = gradOutput[resultIndex];\n" - " int upstreamDataIndex = ( ( n * gInputPlanes\n" - " + upstreamPlane ) * gInputImageSize\n" - " + upstreamRow ) * gInputImageSize\n" + " int upstreamDataIndex = (( n * gInputPlanes\n" + " + upstreamPlane) * gInputSize\n" + " + upstreamRow) 
* gInputSize\n" " + upstreamCol;\n" " float upstreamResult = images[upstreamDataIndex];\n" " float thisimagethiswchange = upstreamResult * error;\n" @@ -125,7 +125,7 @@ BackpropWeightsNaive::BackpropWeightsNaive( EasyCL *cl, LayerDimensions dim ) : " gradWeights[ globalId ] = learningRateMultiplier * thiswchange;\n" "#ifdef BIASED\n" " bool writeBias = upstreamPlane == 0 && filterRow == gMargin && filterCol == gMargin;\n" - " if( writeBias ) {\n" + " if (writeBias) {\n" " gradBiasWeights[outPlane] = learningRateMultiplier * thisbiaschange;\n" " }\n" "#endif\n" diff --git a/src/conv/BackpropWeightsNaive.h b/src/conv/BackpropWeightsNaive.h index 96a10a40..dfa2a224 100644 --- a/src/conv/BackpropWeightsNaive.h +++ b/src/conv/BackpropWeightsNaive.h @@ -21,8 +21,8 @@ class BackpropWeightsNaive : public BackpropWeights { // ]]] // generated, using cog: VIRTUAL ~BackpropWeightsNaive(); - VIRTUAL void calcGradWeights( int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper ); - BackpropWeightsNaive( EasyCL *cl, LayerDimensions dim ); + VIRTUAL void calcGradWeights(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper); + BackpropWeightsNaive(EasyCL *cl, LayerDimensions dim); // [[[end]]] }; diff --git a/src/conv/BackpropWeightsScratch.cpp b/src/conv/BackpropWeightsScratch.cpp index 6b046f5a..9a902bf0 100644 --- a/src/conv/BackpropWeightsScratch.cpp +++ b/src/conv/BackpropWeightsScratch.cpp @@ -21,49 +21,49 @@ using namespace std; VIRTUAL BackpropWeightsScratch::~BackpropWeightsScratch() { delete kernel; } -VIRTUAL void BackpropWeightsScratch::calcGradWeights( int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper ) { - StatefulTimer::instance()->timeCheck("BackpropWeightsScratch start" ); +VIRTUAL void BackpropWeightsScratch::calcGradWeights(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper) { + StatefulTimer::instance()->timeCheck("BackpropWeightsScratch start"); - int workgroupsize = std::max( 32, square( dim.filterSize ) ); // no point in wasting cores... + int workgroupsize = std::max(32, square(dim.filterSize) ); // no point in wasting cores... int numWorkgroups = dim.inputPlanes * dim.numFilters; int globalSize = workgroupsize * numWorkgroups; - globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize; + globalSize = (( globalSize + workgroupsize - 1) / workgroupsize) * workgroupsize; - int localMemRequiredKB = ( square( dim.outputImageSize ) * 4 + square( dim.inputImageSize ) * 4 ) / 1024 + 1; - if( localMemRequiredKB >= cl->getLocalMemorySizeKB() ) { - throw runtime_error( "local memory too small to use this kernel on this device. Need: " + - toString( localMemRequiredKB ) + "KB, but only have: " + - toString( cl->getLocalMemorySizeKB() ) + "KB local memory" ); + int localMemRequiredKB = (square(dim.outputSize) * 4 + square(dim.inputSize) * 4) / 1024 + 1; + if(localMemRequiredKB >= cl->getLocalMemorySizeKB()) { + throw runtime_error("local memory too small to use this kernel on this device. 
Need: " + + toString(localMemRequiredKB) + "KB, but only have: " + + toString(cl->getLocalMemorySizeKB()) + "KB local memory"); } - const float learningMultiplier = learningRateToMultiplier( batchSize ); + const float learningMultiplier = learningRateToMultiplier(batchSize); kernel ->in(learningMultiplier) - ->in( batchSize ) - ->in( gradOutputWrapper ) - ->in( imagesWrapper ) - ->inout( gradWeightsWrapper ); - if( dim.biased ) { - kernel->inout( gradBiasWrapper ); + ->in(batchSize) + ->in(gradOutputWrapper) + ->in(imagesWrapper) + ->inout(gradWeightsWrapper); + if(dim.biased) { + kernel->inout(gradBiasWrapper); } kernel - ->localFloats( square( dim.outputImageSize ) ) - ->localFloats( square( dim.inputImageSize ) ); + ->localFloats(square(dim.outputSize) ) + ->localFloats(square(dim.inputSize) ); kernel->run_1d(globalSize, workgroupsize); cl->finish(); - StatefulTimer::instance()->timeCheck("BackpropWeightsScratch end" ); + StatefulTimer::instance()->timeCheck("BackpropWeightsScratch end"); } -BackpropWeightsScratch::BackpropWeightsScratch( EasyCL *cl, LayerDimensions dim ) : - BackpropWeights( cl, dim ) +BackpropWeightsScratch::BackpropWeightsScratch(EasyCL *cl, LayerDimensions dim) : + BackpropWeights(cl, dim) { std::string options = dim.buildOptionsString(); // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/BackpropWeightsScratch.cl", "backprop_floats_withscratch_dobias", 'options' ) + // stringify.write_kernel2("kernel", "cl/BackpropWeightsScratch.cl", "backprop_floats_withscratch_dobias", 'options') // ]]] // generated using cog, from cl/BackpropWeightsScratch.cl: const char * kernelSource = @@ -83,21 +83,21 @@ BackpropWeightsScratch::BackpropWeightsScratch( EasyCL *cl, LayerDimensions dim "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "void copyLocal( local float *target, global float const *source, int N ) {\n" - " int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyLocal(local float *target, global float const *source, int N) {\n" + " int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " int offset = loop * get_local_size(0) + get_local_id(0);\n" - " if( offset < N ) {\n" + " if (offset < N) {\n" " target[offset] = source[offset];\n" " }\n" " }\n" "}\n" "\n" - "void copyGlobal( global float *target, local float const *source, int N ) {\n" - " int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyGlobal(global float *target, local float const *source, int N) {\n" + " int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " int offset = loop * get_local_size(0) + get_local_id(0);\n" - " if( offset < N ) {\n" + " if (offset < N) {\n" " target[offset] = source[offset];\n" " }\n" " }\n" @@ -111,10 +111,10 @@ BackpropWeightsScratch::BackpropWeightsScratch( EasyCL *cl, LayerDimensions dim "// v. 2.0. 
If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "#define globalId ( get_global_id(0) )\n" - "#define localId ( get_local_id(0) )\n" - "#define workgroupId ( get_group_id(0) )\n" - "#define workgroupSize ( get_local_size(0) )\n" + "#define globalId (get_global_id(0))\n" + "#define localId (get_local_id(0) )\n" + "#define workgroupId (get_group_id(0))\n" + "#define workgroupSize (get_local_size(0))\n" "\n" "\n" "\n" @@ -122,8 +122,8 @@ BackpropWeightsScratch::BackpropWeightsScratch( EasyCL *cl, LayerDimensions dim "// workgroupId: [outputPlane][inputPlane]\n" "// localId: [filterRow][filterCol]\n" "// per-thread iteration: [n][outputRow][outputCol]\n" - "// local: errorimage: outputImageSize * outputImageSize\n" - "// imageimage: inputImageSize * inputImageSize\n" + "// local: errorimage: outputSize * outputSize\n" + "// imageimage: inputSize * inputSize\n" "void kernel backprop_floats_withscratch_dobias(\n" " const float learningRateMultiplier, const int batchSize,\n" " global const float *gradOutput, global const float *images,\n" @@ -136,8 +136,8 @@ BackpropWeightsScratch::BackpropWeightsScratch( EasyCL *cl, LayerDimensions dim " const int filterRow = localId / gFilterSize;\n" " const int filterCol = localId % gFilterSize;\n" "\n" - " #define outPlane ( workgroupId / gInputPlanes )\n" - " #define upstreamPlane ( workgroupId % gInputPlanes )\n" + " #define outPlane (workgroupId / gInputPlanes)\n" + " #define upstreamPlane (workgroupId % gInputPlanes)\n" "\n" " // gradWeights: [outPlane][upstreamPlane][filterRow][filterCol]\n" " // aggregate over: [outRow][outCol][n]\n" @@ -145,25 +145,25 @@ BackpropWeightsScratch::BackpropWeightsScratch( EasyCL *cl, LayerDimensions dim "#ifdef BIASED\n" " float thisbiaschange = 0;\n" "#endif\n" - " for( int n = 0; n < batchSize; n++ ) {\n" + " for (int n = 0; n < batchSize; n++) {\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - " copyLocal( _imageImage, images + ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared, gInputImageSizeSquared );\n" - " copyLocal(_errorImage, gradOutput + ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared, gOutputImageSizeSquared );\n" + " copyLocal(_imageImage, images + (n * gInputPlanes + upstreamPlane) * gInputSizeSquared, gInputSizeSquared);\n" + " copyLocal(_errorImage, gradOutput + (n * gNumFilters + outPlane) * gOutputSizeSquared, gOutputSizeSquared);\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - " if( localId < gFilterSizeSquared ) {\n" - " for( int outRow = 0; outRow < gOutputImageSize; outRow++ ) {\n" + " if (localId < gFilterSizeSquared) {\n" + " for (int outRow = 0; outRow < gOutputSize; outRow++) {\n" " int upstreamRow = outRow - gMargin + filterRow;\n" - " for( int outCol = 0; outCol < gOutputImageSize; outCol++ ) {\n" + " for (int outCol = 0; outCol < gOutputSize; outCol++) {\n" " const int upstreamCol = outCol - gMargin + filterCol;\n" - " #define proceed ( upstreamRow >= 0 && upstreamCol >= 0 && upstreamRow < gInputImageSize && upstreamCol < gInputImageSize )\n" - " if( proceed ) {\n" + " #define proceed (upstreamRow >= 0 && upstreamCol >= 0 && upstreamRow < gInputSize && upstreamCol < gInputSize)\n" + " if (proceed) {\n" " // these defines reduce register pressure, compared to const\n" " // giving a 40% speedup on nvidia :-)\n" - " #define resultIndex ( outRow * gOutputImageSize + outCol )\n" - " #define error ( _errorImage[resultIndex] )\n" + " #define resultIndex (outRow * gOutputSize + outCol)\n" + " #define error 
(_errorImage[resultIndex])\n" " //const float error = _errorImage[resultIndex];\n" - " #define upstreamDataIndex ( upstreamRow * gInputImageSize + upstreamCol )\n" - " #define upstreamResult ( _imageImage[upstreamDataIndex] )\n" + " #define upstreamDataIndex (upstreamRow * gInputSize + upstreamCol)\n" + " #define upstreamResult (_imageImage[upstreamDataIndex])\n" " thiswchange += upstreamResult * error;\n" " #ifdef BIASED\n" " thisbiaschange += error;\n" @@ -173,12 +173,12 @@ BackpropWeightsScratch::BackpropWeightsScratch( EasyCL *cl, LayerDimensions dim " }\n" " }\n" " }\n" - " if( localId < gFilterSizeSquared ) {\n" + " if (localId < gFilterSizeSquared) {\n" " gradWeights[ workgroupId * gFilterSizeSquared + localId ] = learningRateMultiplier * thiswchange;\n" " }\n" "#ifdef BIASED\n" - " #define writeBias ( upstreamPlane == 0 && filterRow == gMargin && filterCol == gMargin )\n" - " if( writeBias ) {\n" + " #define writeBias (upstreamPlane == 0 && filterRow == gMargin && filterCol == gMargin)\n" + " if (writeBias) {\n" " gradBiasWeights[outPlane] = learningRateMultiplier * thisbiaschange;\n" " }\n" "#endif\n" @@ -189,7 +189,7 @@ BackpropWeightsScratch::BackpropWeightsScratch( EasyCL *cl, LayerDimensions dim ""; kernel = cl->buildKernelFromString( kernelSource, "backprop_floats_withscratch_dobias", options, "cl/BackpropWeightsScratch.cl" ); // [[[end]]] -// kernel = cl->buildKernel( "backpropgradWeights2.cl", "backprop_floats_withscratch_dobias", options ); -// kernel = cl->buildKernelFromString( kernelSource, "calcGradInput", options ); +// kernel = cl->buildKernel("backpropgradWeights2.cl", "backprop_floats_withscratch_dobias", options); +// kernel = cl->buildKernelFromString(kernelSource, "calcGradInput", options); } diff --git a/src/conv/BackpropWeightsScratch.h b/src/conv/BackpropWeightsScratch.h index bd0a272c..fe75b64c 100644 --- a/src/conv/BackpropWeightsScratch.h +++ b/src/conv/BackpropWeightsScratch.h @@ -15,8 +15,8 @@ class BackpropWeightsScratch : public BackpropWeights { // ]]] // generated, using cog: VIRTUAL ~BackpropWeightsScratch(); - VIRTUAL void calcGradWeights( int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper ); - BackpropWeightsScratch( EasyCL *cl, LayerDimensions dim ); + VIRTUAL void calcGradWeights(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper); + BackpropWeightsScratch(EasyCL *cl, LayerDimensions dim); // [[[end]]] }; diff --git a/src/conv/BackpropWeightsScratchLarge.cpp b/src/conv/BackpropWeightsScratchLarge.cpp index 51757693..41cc527b 100644 --- a/src/conv/BackpropWeightsScratchLarge.cpp +++ b/src/conv/BackpropWeightsScratchLarge.cpp @@ -21,94 +21,94 @@ using namespace std; VIRTUAL BackpropWeightsScratchLarge::~BackpropWeightsScratchLarge() { delete kernel; } -VIRTUAL void BackpropWeightsScratchLarge::calcGradWeights( int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper ) { - StatefulTimer::instance()->timeCheck("BackpropWeightsScratchLarge start" ); +VIRTUAL void BackpropWeightsScratchLarge::calcGradWeights(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper) { + StatefulTimer::instance()->timeCheck("BackpropWeightsScratchLarge start"); - int workgroupSize = 32 * ( ( square(dim.filterSize) + 32 - 1 ) / 32 ); // quantize to nearest 32 -// int 
workgroupsize = std::max( 32, square( dim.filterSize ) ); // no point in wasting cores... + int workgroupSize = 32 * (( square(dim.filterSize) + 32 - 1) / 32); // quantize to nearest 32 +// int workgroupsize = std::max(32, square(dim.filterSize) ); // no point in wasting cores... int numWorkgroups = dim.inputPlanes * dim.numFilters; int globalSize = workgroupSize * numWorkgroups; -// globalSize = ( ( globalSize + workgroupSize - 1 ) / workgroupSize ) * workgroupSize; +// globalSize = (( globalSize + workgroupSize - 1) / workgroupSize) * workgroupSize; // cout << "workgroupsize " << workgroupSize << " numworkgroups " << numWorkgroups << " globalsize " << globalSize << endl; - const float learningMultiplier = learningRateToMultiplier( batchSize ); + const float learningMultiplier = learningRateToMultiplier(batchSize); kernel ->in(learningMultiplier) - ->in( batchSize ) - ->in( gradOutputWrapper ) - ->in( imagesWrapper ) - ->inout( gradWeightsWrapper ); - if( dim.biased ) { - kernel->inout( gradBiasWrapper ); + ->in(batchSize) + ->in(gradOutputWrapper) + ->in(imagesWrapper) + ->inout(gradWeightsWrapper); + if(dim.biased) { + kernel->inout(gradBiasWrapper); } kernel - ->localFloats( outputStripeSize ) - ->localFloats( inputStripeOuterSize ); + ->localFloats(outputStripeSize) + ->localFloats(inputStripeOuterSize); kernel->run_1d(globalSize, workgroupSize); cl->finish(); - StatefulTimer::instance()->timeCheck("BackpropWeightsScratchLarge end" ); + StatefulTimer::instance()->timeCheck("BackpropWeightsScratchLarge end"); } -BackpropWeightsScratchLarge::BackpropWeightsScratchLarge( EasyCL *cl, LayerDimensions dim ) : - BackpropWeights( cl, dim ) +BackpropWeightsScratchLarge::BackpropWeightsScratchLarge(EasyCL *cl, LayerDimensions dim) : + BackpropWeights(cl, dim) { // [[[cog // import stringify - // # stringify.write_kernel( "kernelSource", "ClConvolve.cl") + // # stringify.write_kernel("kernelSource", "ClConvolve.cl") // ]]] // [[[end]]] // cout << "dim: " << dim << endl; std::string options = dim.buildOptionsString(); - int localMemoryRequirementsFullImage = dim.inputImageSize * dim.inputImageSize * 4 + dim.outputImageSize * dim.outputImageSize * 4; + int localMemoryRequirementsFullImage = dim.inputSize * dim.inputSize * 4 + dim.outputSize * dim.outputSize * 4; int availableLocal = cl->getLocalMemorySize(); // cout << "localmemoryrequirementsfullimage: " << localMemoryRequirementsFullImage << endl; // cout << "availablelocal: " << availableLocal << endl; // make the local memory used about one quarter of what is available? half of what is available? 
// let's try one quarter :-) int localWeCanUse = availableLocal / 4; - numStripes = ( localMemoryRequirementsFullImage + localWeCanUse - 1 ) / localWeCanUse; + numStripes = (localMemoryRequirementsFullImage + localWeCanUse - 1) / localWeCanUse; // cout << "numStripes: " << numStripes << endl; // make it a power of 2 - numStripes = EasyCL::getNextPower2( numStripes ); + numStripes = EasyCL::getNextPower2(numStripes); // cout << "numStripes: " << numStripes << endl; int inputStripeMarginRows = dim.filterSize - 1; - int inputStripeInnerNumRows = dim.inputImageSize / numStripes; + int inputStripeInnerNumRows = dim.inputSize / numStripes; int inputStripeOuterNumRows = inputStripeInnerNumRows + 2 * inputStripeMarginRows; - int inputStripeInnerSize = inputStripeInnerNumRows * dim.inputImageSize; - inputStripeOuterSize = inputStripeOuterNumRows * dim.inputImageSize; - int inputStripeMarginSize = inputStripeMarginRows * dim.inputImageSize; + int inputStripeInnerSize = inputStripeInnerNumRows * dim.inputSize; + inputStripeOuterSize = inputStripeOuterNumRows * dim.inputSize; + int inputStripeMarginSize = inputStripeMarginRows * dim.inputSize; - int outputStripeNumRows = ( dim.outputImageSize + numStripes - 1 ) / numStripes; - outputStripeSize = outputStripeNumRows * dim.outputImageSize; + int outputStripeNumRows = (dim.outputSize + numStripes - 1) / numStripes; + outputStripeSize = outputStripeNumRows * dim.outputSize; // [[[cog // import cog_optionswriter - // cog_optionswriter.write_options( ['numStripes','inputStripeMarginRows','inputStripeInnerNumRows', + // cog_optionswriter.write_options(['numStripes','inputStripeMarginRows','inputStripeInnerNumRows', // 'inputStripeOuterNumRows', 'inputStripeInnerSize', 'inputStripeOuterSize', 'inputStripeMarginSize', - // 'outputStripeNumRows', 'outputStripeSize' ] ) + // 'outputStripeNumRows', 'outputStripeSize' ]) // ]]] // generated, using cog: - options += " -DgNumStripes=" + toString( numStripes ); - options += " -DgInputStripeMarginRows=" + toString( inputStripeMarginRows ); - options += " -DgInputStripeInnerNumRows=" + toString( inputStripeInnerNumRows ); - options += " -DgInputStripeOuterNumRows=" + toString( inputStripeOuterNumRows ); - options += " -DgInputStripeInnerSize=" + toString( inputStripeInnerSize ); - options += " -DgInputStripeOuterSize=" + toString( inputStripeOuterSize ); - options += " -DgInputStripeMarginSize=" + toString( inputStripeMarginSize ); - options += " -DgOutputStripeNumRows=" + toString( outputStripeNumRows ); - options += " -DgOutputStripeSize=" + toString( outputStripeSize ); + options += " -DgNumStripes=" + toString(numStripes); + options += " -DgInputStripeMarginRows=" + toString(inputStripeMarginRows); + options += " -DgInputStripeInnerNumRows=" + toString(inputStripeInnerNumRows); + options += " -DgInputStripeOuterNumRows=" + toString(inputStripeOuterNumRows); + options += " -DgInputStripeInnerSize=" + toString(inputStripeInnerSize); + options += " -DgInputStripeOuterSize=" + toString(inputStripeOuterSize); + options += " -DgInputStripeMarginSize=" + toString(inputStripeMarginSize); + options += " -DgOutputStripeNumRows=" + toString(outputStripeNumRows); + options += " -DgOutputStripeSize=" + toString(outputStripeSize); // [[[end]]] cout << "options: " << options << endl; // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/BackpropWeightsScratchLarge.cl", "backprop_floats_withscratch_dobias_striped", 'options' ) + // stringify.write_kernel2("kernel", "cl/BackpropWeightsScratchLarge.cl", 
"backprop_floats_withscratch_dobias_striped", 'options') // ]]] // generated using cog, from cl/BackpropWeightsScratchLarge.cl: const char * kernelSource = @@ -124,8 +124,8 @@ BackpropWeightsScratchLarge::BackpropWeightsScratchLarge( EasyCL *cl, LayerDimen "// workgroupId: [outputPlane][inputPlane]\n" "// localId: [filterRow][filterCol]\n" "// per-thread iteration: [n][outputRow][outputCol]\n" - "// local: errorimage: outputImageSize * outputImageSize\n" - "// imageimage: inputImageSize * inputImageSize\n" + "// local: errorimage: outputSize * outputSize\n" + "// imageimage: inputSize * inputSize\n" "// specific characteristic: load one stripe of each image at a time,\n" "// so we dont run out of memory\n" "// number of stripes set in: gNumStripes\n" @@ -146,15 +146,15 @@ BackpropWeightsScratchLarge::BackpropWeightsScratchLarge( EasyCL *cl, LayerDimen " local float *_errorStripe, local float *_imageStripe\n" " ) {\n" " // gHalfFilterSize\n" - " // gInputImageSize\n" + " // gInputSize\n" " //\n" " // gInputStripeMarginRows => basically equal to gHalfFilterSize\n" - " // gInputStripeInnerNumRows = gInputImageSize / gNumStripes\n" + " // gInputStripeInnerNumRows = gInputSize / gNumStripes\n" " // gInputStripeOuterNumRows = gInputStripeInnerNumRows + 2 * gHalfFilterSize (note: one row less than\n" " // if we just added gFilterSize)\n" - " // gInputStripeInnerSize = gInputStripeInnerNumRows * gInputImageSize\n" - " // gInputStripeOuterSize = gInputStripeOuterNumRows * gInputImageSize\n" - " // gInputStripeMarginSize = gInputStripeMarginRows * gInputImageSize\n" + " // gInputStripeInnerSize = gInputStripeInnerNumRows * gInputSize\n" + " // gInputStripeOuterSize = gInputStripeOuterNumRows * gInputSize\n" + " // gInputStripeMarginSize = gInputStripeMarginRows * gInputSize\n" " //\n" " // gOutputStripeNumRows\n" " // gOutputStripeSize\n" @@ -176,62 +176,62 @@ BackpropWeightsScratchLarge::BackpropWeightsScratchLarge( EasyCL *cl, LayerDimen "#ifdef BIASED\n" " float thisbiaschange = 0;\n" "#endif\n" - " const int numLoopsForImageStripe = ( gInputStripeOuterSize + workgroupSize - 1 ) / workgroupSize;\n" - " const int numLoopsForErrorStripe = ( gOutputImageSizeSquared + workgroupSize - 1 ) / workgroupSize;\n" - " for( int n = 0; n < batchSize; n++ ) {\n" - " const int imageImageGlobalOffset = ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared;\n" - " const int imageImageGlobalOffsetAfter = imageImageGlobalOffset + gInputImageSizeSquared;\n" - " const int errorImageGlobalOffset = ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared;\n" - " const int errorImageGlobalOffsetAfter = errorImageGlobalOffset + gOutputImageSizeSquared;\n" - " for( int stripe = 0; stripe < gNumStripes; stripe++ ) {\n" + " const int numLoopsForImageStripe = (gInputStripeOuterSize + workgroupSize - 1) / workgroupSize;\n" + " const int numLoopsForErrorStripe = (gOutputSizeSquared + workgroupSize - 1) / workgroupSize;\n" + " for (int n = 0; n < batchSize; n++) {\n" + " const int imageImageGlobalOffset = (n * gInputPlanes + upstreamPlane) * gInputSizeSquared;\n" + " const int imageImageGlobalOffsetAfter = imageImageGlobalOffset + gInputSizeSquared;\n" + " const int errorImageGlobalOffset = (n * gNumFilters + outPlane) * gOutputSizeSquared;\n" + " const int errorImageGlobalOffsetAfter = errorImageGlobalOffset + gOutputSizeSquared;\n" + " for (int stripe = 0; stripe < gNumStripes; stripe++) {\n" " const int imageStripeInnerOffset = imageImageGlobalOffset + stripe * gInputStripeInnerSize;\n" " const int 
imageStripeOuterOffset = imageStripeInnerOffset - gInputStripeMarginSize;\n" " // need to fetch the image, but it's bigger than us, so will need to loop...\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - " for( int i = 0; i < numLoopsForImageStripe; i++ ) {\n" + " for (int i = 0; i < numLoopsForImageStripe; i++) {\n" " int thisOffset = i * workgroupSize + localId;\n" " int thisGlobalImagesOffset = imageStripeOuterOffset + thisOffset;\n" " bool process = thisOffset < gInputStripeOuterSize\n" " && thisGlobalImagesOffset >= imageImageGlobalOffset\n" " && thisGlobalImagesOffset < imageImageGlobalOffsetAfter;\n" - " if( process ) {\n" + " if (process) {\n" " _imageStripe[thisOffset] = images[ thisGlobalImagesOffset ];\n" " }\n" " }\n" " int errorStripeOffset = errorImageGlobalOffset + stripe * gOutputStripeSize;\n" - " for( int i = 0; i < numLoopsForErrorStripe; i++ ) {\n" + " for (int i = 0; i < numLoopsForErrorStripe; i++) {\n" " int thisOffset = i * workgroupSize + localId;\n" " int globalErrorsOffset = errorStripeOffset + thisOffset;\n" " bool process = thisOffset < gOutputStripeSize\n" " && globalErrorsOffset < errorImageGlobalOffsetAfter;\n" - " if( process ) {\n" + " if (process) {\n" " _errorStripe[thisOffset ] = gradOutput[globalErrorsOffset];\n" " }\n" " }\n" " const int stripeOutRowStart = stripe * gOutputStripeNumRows;\n" " const int stripeOutRowEndExcl = stripeOutRowStart + gOutputStripeNumRows;\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - "// if( localId == 13 ) {\n" - "// for( int i = 0; i < 12; i++ ) {\n" - "// gradWeights[100 + stripe * 12 + i ] = _errorStripe[i * gOutputImageSize];\n" + "// if (localId == 13) {\n" + "// for (int i = 0; i < 12; i++) {\n" + "// gradWeights[100 + stripe * 12 + i ] = _errorStripe[i * gOutputSize];\n" "// }\n" - "// for( int i = 0; i < 20; i++ ) {\n" - "// gradWeights[200 + stripe * 20 + i ] = _imageStripe[i * gInputImageSize];\n" + "// for (int i = 0; i < 20; i++) {\n" + "// gradWeights[200 + stripe * 20 + i ] = _imageStripe[i * gInputSize];\n" "// }\n" "// }\n" - " if( localId < gFilterSizeSquared ) {\n" - " for( int outRow = stripeOutRowStart; outRow < stripeOutRowEndExcl; outRow++ ) {\n" + " if (localId < gFilterSizeSquared) {\n" + " for (int outRow = stripeOutRowStart; outRow < stripeOutRowEndExcl; outRow++) {\n" " int upstreamRow = outRow - gMargin + filterRow;\n" - " for( int outCol = 0; outCol < gOutputImageSize; outCol++ ) {\n" + " for (int outCol = 0; outCol < gOutputSize; outCol++) {\n" " int upstreamCol = outCol - gMargin + filterCol;\n" " bool proceed =\n" " upstreamRow >= 0 && upstreamCol >= 0\n" - " && upstreamRow < gInputImageSize && upstreamCol < gInputImageSize\n" - " && outRow < gOutputImageSize;\n" - " if( proceed ) {\n" - " int resultIndex = outRow * gOutputImageSize + outCol;\n" + " && upstreamRow < gInputSize && upstreamCol < gInputSize\n" + " && outRow < gOutputSize;\n" + " if (proceed) {\n" + " int resultIndex = outRow * gOutputSize + outCol;\n" " float error = _errorStripe[resultIndex - stripe * gOutputStripeSize];\n" - " int upstreamDataIndex = upstreamRow * gInputImageSize + upstreamCol;\n" + " int upstreamDataIndex = upstreamRow * gInputSize + upstreamCol;\n" " float upstreamResult = _imageStripe[upstreamDataIndex + gInputStripeMarginSize\n" " - stripe * gInputStripeInnerSize ];\n" " thiswchange += upstreamResult * error;\n" @@ -244,13 +244,13 @@ BackpropWeightsScratchLarge::BackpropWeightsScratchLarge( EasyCL *cl, LayerDimen " }\n" " }\n" " }\n" - " if( localId < gFilterSizeSquared ) {\n" + " if (localId < gFilterSizeSquared) {\n" 
" gradWeights[ workgroupId * gFilterSizeSquared + localId ] = learningRateMultiplier * thiswchange;\n" "// weightChanges[ workgroupId * gFilterSizeSquared + localId ] = workgroupId;\n" " }\n" "#ifdef BIASED\n" " bool writeBias = upstreamPlane == 0 && filterRow == gMargin && filterCol == gMargin;\n" - " if( writeBias ) {\n" + " if (writeBias) {\n" " gradBiasWeights[outPlane] = learningRateMultiplier * thisbiaschange;\n" " }\n" "#endif\n" diff --git a/src/conv/BackpropWeightsScratchLarge.h b/src/conv/BackpropWeightsScratchLarge.h index 3fc26a95..3df9cc14 100644 --- a/src/conv/BackpropWeightsScratchLarge.h +++ b/src/conv/BackpropWeightsScratchLarge.h @@ -18,8 +18,8 @@ class BackpropWeightsScratchLarge : public BackpropWeights { // ]]] // generated, using cog: VIRTUAL ~BackpropWeightsScratchLarge(); - VIRTUAL void calcGradWeights( int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper ); - BackpropWeightsScratchLarge( EasyCL *cl, LayerDimensions dim ); + VIRTUAL void calcGradWeights(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *imagesWrapper, CLWrapper *gradWeightsWrapper, CLWrapper *gradBiasWrapper); + BackpropWeightsScratchLarge(EasyCL *cl, LayerDimensions dim); // [[[end]]] }; diff --git a/src/conv/Backward.cpp b/src/conv/Backward.cpp index 9993de4a..42c91daa 100644 --- a/src/conv/Backward.cpp +++ b/src/conv/Backward.cpp @@ -9,9 +9,11 @@ #include "util/StatefulTimer.h" #include "util/stringhelper.h" +#include "BackwardAuto.h" #include "BackwardCpu.h" #include "BackwardGpuNaive.h" #include "BackwardGpuCached.h" +#include "BackwardIm2Col.h" #include "Backward.h" @@ -23,53 +25,72 @@ using namespace std; #undef VIRTUAL #define VIRTUAL -STATIC Backward *Backward::instance(EasyCL *cl, LayerDimensions dim ) { - if( ( dim.inputImageSize - dim.filterSize > 6 ) && square( dim.inputImageSize ) <= cl->getMaxWorkgroupSize() ) { - return new BackwardGpuCached( cl, dim ); - } else { - return new BackwardGpuNaive( cl, dim ); - } +STATIC Backward *Backward::instance(EasyCL *cl, LayerDimensions dim) { + return new BackwardAuto(cl, dim); +// if((dim.inputSize - dim.filterSize > 6) && square(dim.inputSize) <= cl->getMaxWorkgroupSize()) { +// return new BackwardGpuCached(cl, dim); +// } else { +// return new BackwardGpuNaive(cl, dim); +// } } -STATIC Backward *Backward::instanceForTest(EasyCL *cl, LayerDimensions layerDimensions ) { - return new BackwardGpuNaive( cl, layerDimensions ); +STATIC Backward *Backward::instanceForTest(EasyCL *cl, LayerDimensions layerDimensions) { + return new BackwardGpuNaive(cl, layerDimensions); } -STATIC Backward *Backward::instanceSpecific( int idx, EasyCL *cl, LayerDimensions layerDimensions ) { - if( idx == 0 ) { - return new BackwardCpu( cl, layerDimensions ); +STATIC Backward *Backward::instanceSpecific(int idx, EasyCL *cl, LayerDimensions layerDimensions) { + if(idx == -1) { + return new BackwardAuto(cl, layerDimensions); + } + if(idx == 0) { + return new BackwardCpu(cl, layerDimensions); } - if( idx == 1 ) { - return new BackwardGpuNaive( cl, layerDimensions ); + if(idx == 1) { + return new BackwardGpuNaive(cl, layerDimensions); } - if( idx == 2 ) { - return new BackwardGpuCached( cl, layerDimensions ); + if(idx == 2) { + return new BackwardGpuCached(cl, layerDimensions); } - throw std::runtime_error("backproperrorsv2::isntancespecifc, index not known: " + toString( idx ) ); + if(idx == 3) { + return new BackwardIm2Col(cl, layerDimensions); + } + throw 
std::runtime_error("backproperrorsv2::isntancespecifc, index not known: " + toString(idx)); +} +Backward::Backward(EasyCL *cl, LayerDimensions layerDimensions) : + cl(cl), + dim(layerDimensions) { } -Backward::Backward( EasyCL *cl, LayerDimensions layerDimensions ) : - cl( cl ), - dim( layerDimensions ) { +STATIC int Backward::getNumImplementations() { + return 4; +} +STATIC bool Backward::plausiblyOptimal(int index, int batchSize, LayerDimensions dim) { + if(index == 0) { + return false; + } + if(index >= 4) { + return false; + } + return true; } -VIRTUAL float * Backward::backward( int batchSize, float *input, float *gradOutput, float *filters ) { +VIRTUAL float * Backward::backward(int batchSize, float *input, float *gradOutput, float *filters) { StatefulTimer::timeCheck("Backward::backprop begin"); - CLWrapper *inputWrapper = cl->wrap( batchSize * dim.inputCubeSize, input ); + CLWrapper *inputWrapper = cl->wrap(batchSize * dim.inputCubeSize, input); inputWrapper->copyToDevice(); - CLWrapper *gradOutputWrapper = cl->wrap( batchSize * dim.outputCubeSize, gradOutput ); + CLWrapper *gradOutputWrapper = cl->wrap(batchSize * dim.outputCubeSize, gradOutput); gradOutputWrapper->copyToDevice(); int weightsSize = dim.filtersSize; - CLWrapper *weightsWrapper = cl->wrap( weightsSize, filters ); + CLWrapper *weightsWrapper = cl->wrap(weightsSize, filters); weightsWrapper->copyToDevice(); int outputDataSize = batchSize * dim.inputCubeSize; // cout << " batchsize " << batchSize << " " << dim << endl; - int allocatedOutputSize = std::max(5000, outputDataSize ); - float *gradInput = new float[allocatedOutputSize]; - CLWrapper *gradInputWrapper = cl->wrap( allocatedOutputSize, gradInput ); + int allocatedOutputNumElements = std::max(5000, outputDataSize); + float *gradInput = new float[allocatedOutputNumElements]; + CLWrapper *gradInputWrapper = cl->wrap(allocatedOutputNumElements, gradInput); StatefulTimer::timeCheck("Backward::backprop after copied to device"); - backward( batchSize, inputWrapper, gradOutputWrapper, weightsWrapper, gradInputWrapper ); + backward(batchSize, inputWrapper, gradOutputWrapper, weightsWrapper, gradInputWrapper); StatefulTimer::timeCheck("Backward::backprop after call backprop"); gradInputWrapper->copyToHost(); StatefulTimer::timeCheck("Backward::backprop after copytohost"); diff --git a/src/conv/Backward.h b/src/conv/Backward.h index e4113883..1e954699 100644 --- a/src/conv/Backward.h +++ b/src/conv/Backward.h @@ -25,20 +25,22 @@ class DeepCL_EXPORT Backward { // ActivationFunction const *upstreamFn; virtual ~Backward() {} - virtual void backward( int batchSize, + virtual void backward(int batchSize, CLWrapper *inputDataWrapper, CLWrapper *gradOutput, CLWrapper *weightsWrapper, - CLWrapper *gradInput ) = 0; + CLWrapper *gradInput) = 0; // [[[cog // import cog_addheaders // cog_addheaders.add() // ]]] // generated, using cog: - STATIC Backward *instance(EasyCL *cl, LayerDimensions dim ); - STATIC Backward *instanceForTest(EasyCL *cl, LayerDimensions layerDimensions ); - STATIC Backward *instanceSpecific( int idx, EasyCL *cl, LayerDimensions layerDimensions ); - Backward( EasyCL *cl, LayerDimensions layerDimensions ); - VIRTUAL float * backward( int batchSize, float *input, float *gradOutput, float *filters ); + STATIC Backward *instance(EasyCL *cl, LayerDimensions dim); + STATIC Backward *instanceForTest(EasyCL *cl, LayerDimensions layerDimensions); + STATIC Backward *instanceSpecific(int idx, EasyCL *cl, LayerDimensions layerDimensions); + Backward(EasyCL *cl, 
LayerDimensions layerDimensions); + STATIC int getNumImplementations(); + STATIC bool plausiblyOptimal(int index, int batchSize, LayerDimensions dim); + VIRTUAL float * backward(int batchSize, float *input, float *gradOutput, float *filters); // [[[end]]] }; diff --git a/src/conv/BackwardAuto.cpp b/src/conv/BackwardAuto.cpp new file mode 100644 index 00000000..fe526c8e --- /dev/null +++ b/src/conv/BackwardAuto.cpp @@ -0,0 +1,117 @@ +// Copyright Hugh Perkins 2014,2015 hughperkins at gmail +// +// This Source Code Form is subject to the terms of the Mozilla Public License, +// v. 2.0. If a copy of the MPL was not distributed with this file, You can +// obtain one at http://mozilla.org/MPL/2.0/. + +#include +#include +#include + +#include "conv/BackwardAuto.h" +#include "util/stringhelper.h" +#include "util/StatefulTimer.h" +#include "util/Timer.h" + +using namespace std; + +#undef STATIC +#define STATIC + +#undef VIRTUAL +#define VIRTUAL + +BackwardAuto::BackwardAuto(EasyCL *cl, LayerDimensions dim) : + Backward(cl, dim), + milliseconds(0), + valid(0), + chosenIndex(-1), + instances(0) + { + num = Backward::getNumImplementations(); + milliseconds = new int[ num]; + valid = new bool[ num ]; + instances = new Backward *[ num ]; + for(int i = 0; i < num; i++) { + instances[i] = 0; + valid[i] = false; + milliseconds[i] = -1; + } + nextIndex = 0; +} +VIRTUAL BackwardAuto::~BackwardAuto() { + for(int i = 0; i < num; i++) { + if(instances[i] != 0) { + delete instances[i]; + } + } +} +VIRTUAL void BackwardAuto::backward( + int batchSize, CLWrapper *inputDataWrapper, CLWrapper *gradOutput, CLWrapper *weightsWrapper, + CLWrapper *gradInput) { + while(chosenIndex == -1 && nextIndex < num) { + int thisIndex = nextIndex; + nextIndex++; + cout << "backward try kernel " << thisIndex << endl; + if(Backward::plausiblyOptimal(thisIndex, batchSize, dim)) { + Backward *candidate = 0; + try { + candidate = Backward::instanceSpecific(thisIndex, cl, dim); + instances[thisIndex] = candidate; + valid[thisIndex] = true; + cout << " ... seems valid" << endl; + } catch(runtime_error &e) { + cout << StatefulTimer::instance()->prefix << "BackwardAuto: kernel " << thisIndex << ": this instance cant be used: " << e.what() << endl; + valid[thisIndex] = false; + } + if(valid[thisIndex]) { + Timer timer; + try { + candidate->backward(batchSize, inputDataWrapper, gradOutput, weightsWrapper, gradInput); + milliseconds[thisIndex] = (int)timer.lap(); + cout << StatefulTimer::instance()->prefix << "BackwardAuto: kernel " << thisIndex << " " << milliseconds[thisIndex] << "ms" << endl; + return; + } catch(runtime_error &e) { + cout << StatefulTimer::instance()->prefix << "BackwardAuto: kernel " << thisIndex << " this instance cant be used: " << e.what() << endl; + valid[thisIndex] = false; + delete instances[thisIndex]; + instances[thisIndex] = 0; + } + } else { + cout << " ... not valid" << endl; + } + } else { + cout << " ... 
not plausibly optimal, skipping" << endl; + } + } + if(chosenIndex == -1) { +// cout << StatefulTimer::instance()->prefix + "BackwardAuto::backward choosing best instance:" << endl; + int bestIndex = -1; + int bestTime = 0; + for(int i = 0; i < num; i++) { + if(!valid[i]) { + cout << " backward kernel " << i << ": cannot be used" << endl; + continue; + } + cout << " backward kernel " << i << " time: " << milliseconds[i] << "ms" << endl; + if(bestIndex == -1) { + bestIndex = i; + bestTime = milliseconds[i]; + continue; + } + if(milliseconds[i] < bestTime) { + bestTime = milliseconds[i]; + bestIndex = i; + } + } + if(bestIndex != -1) { + cout << " backward layer selected kernel " << bestIndex << endl; + this->chosenIndex = bestIndex; + } else { + throw runtime_error(StatefulTimer::instance()->prefix + "No valid backward implementations found"); + } + } +// cout << "BackwardAuto::backward using instance index: " << chosenIndex << endl; + instances[chosenIndex]->backward(batchSize, inputDataWrapper, gradOutput, weightsWrapper, gradInput); +} + diff --git a/src/conv/BackwardAuto.h b/src/conv/BackwardAuto.h new file mode 100644 index 00000000..ac177cbd --- /dev/null +++ b/src/conv/BackwardAuto.h @@ -0,0 +1,57 @@ +// Copyright Hugh Perkins 2014 hughperkins at gmail +// +// This Source Code Form is subject to the terms of the Mozilla Public License, +// v. 2.0. If a copy of the MPL was not distributed with this file, You can +// obtain one at http://mozilla.org/MPL/2.0/. + +#pragma once + +#include +#include +#include + +#include "EasyCL.h" +#include "activate/ActivationFunction.h" +#include "conv/Backward.h" +#include "conv/LayerDimensions.h" +#include "DeepCLDllExport.h" + +using namespace std; + +//inline float square(float value) { +// return value * value; +//} + +#define STATIC static +#define VIRTUAL virtual + +class DeepCL_EXPORT BackwardAuto : public Backward { +public: +// EasyCL *cl; +// LayerDimensions dim; +// ActivationFunction const*fn; + + int num; + int *milliseconds; + bool *valid; + int chosenIndex; + Backward **instances; + int nextIndex; + + // [[[cog + // import cog_addheaders + // cog_addheaders.add() + // ]]] + // generated, using cog: + BackwardAuto(EasyCL *cl, LayerDimensions dim); + VIRTUAL ~BackwardAuto(); + VIRTUAL void backward( + int batchSize, CLWrapper *inputDataWrapper, CLWrapper *gradOutput, CLWrapper *weightsWrapper, + CLWrapper *gradInput); + + // [[[end]]] + +}; + + + diff --git a/src/conv/BackwardCpu.cpp b/src/conv/BackwardCpu.cpp index f991fcac..4b03dbe5 100644 --- a/src/conv/BackwardCpu.cpp +++ b/src/conv/BackwardCpu.cpp @@ -18,18 +18,18 @@ using namespace std; #undef VIRTUAL #define VIRTUAL -BackwardCpu::BackwardCpu( EasyCL *cl, LayerDimensions dim ) : - Backward( cl, dim ) +BackwardCpu::BackwardCpu(EasyCL *cl, LayerDimensions dim) : + Backward(cl, dim) { } VIRTUAL BackwardCpu::~BackwardCpu() { } -VIRTUAL float *BackwardCpu::backward( int batchSize, float *inputs, - float *gradOutput, float *weights ) { +VIRTUAL float *BackwardCpu::backward(int batchSize, float *inputs, + float *gradOutput, float *weights) { float *gradInput = new float[ batchSize * dim.inputCubeSize ]; // Timer timer; - StatefulTimer::instance()->timeCheck("BackwardCpu start" ); + StatefulTimer::instance()->timeCheck("BackwardCpu start"); const int halfFilterSize = dim.filterSize >> 1; const int margin = dim.padZeros ? halfFilterSize : 0; // handle lower layer... 
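(BackwardAuto, added above, is a try-then-commit auto-tuner: each of the first few backward calls runs one untried candidate implementation and times it, candidates that throw are marked invalid, and once every plausible candidate has been measured the fastest valid one serves all subsequent calls. A condensed, self-contained sketch of that strategy, with hypothetical names; the real class times actual OpenCL kernels and throws when nothing is valid:)

#include <functional>
#include <vector>

// Each candidate runs the real work and reports its elapsed milliseconds.
class AutoChooser {
public:
    explicit AutoChooser(std::vector<std::function<int()>> candidates)
        : candidates(std::move(candidates)), ms(this->candidates.size(), -1) {}
    void run() {
        if (chosen == -1 && next < (int)candidates.size()) {
            ms[next] = candidates[next]();  // benchmark one new candidate per call
            next++;
            return;
        }
        if (chosen == -1) {  // everything measured: commit to the fastest valid one
            for (int i = 0; i < (int)ms.size(); i++) {
                if (ms[i] >= 0 && (chosen == -1 || ms[i] < ms[chosen])) {
                    chosen = i;
                }
            }
        }
        candidates[chosen]();  // steady state: always dispatch to the winner
    }
private:
    std::vector<std::function<int()>> candidates;
    std::vector<int> ms;  // -1 marks "not yet measured" or invalid
    int next = 0;
    int chosen = -1;
};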
@@ -43,66 +43,66 @@ VIRTUAL float *BackwardCpu::backward( int batchSize, float *inputs, // [outPlane][inPlane][filterRow][filtercol] // aggregating over: [n][outRow][outCol] // errors are provider per [n][inPlane][inRow][inCol] - for( int n = 0; n < batchSize; n++ ) { - for( int upstreamPlane = 0; upstreamPlane < dim.inputPlanes; upstreamPlane++ ) { - for( int upstreamRow = 0; upstreamRow < dim.inputImageSize; upstreamRow++ ) { - int minFilterRow = std::max( 0, upstreamRow + margin - (dim.outputImageSize - 1) ); - int maxFilterRow = std::min( dim.filterSize - 1, upstreamRow + margin ); - for( int upstreamCol = 0; upstreamCol < dim.inputImageSize; upstreamCol++ ) { + for(int n = 0; n < batchSize; n++) { + for(int upstreamPlane = 0; upstreamPlane < dim.inputPlanes; upstreamPlane++) { + for(int upstreamRow = 0; upstreamRow < dim.inputSize; upstreamRow++) { + int minFilterRow = std::max(0, upstreamRow + margin - (dim.outputSize - 1)); + int maxFilterRow = std::min(dim.filterSize - 1, upstreamRow + margin); + for(int upstreamCol = 0; upstreamCol < dim.inputSize; upstreamCol++) { float sumWeightTimesGradOutput = 0; // aggregate over [outPlane][outRow][outCol] - int minFilterCol = std::max( 0, upstreamCol + margin - (dim.outputImageSize -1) ); - int maxFilterCol = std::min( dim.filterSize - 1, upstreamCol + margin ); - for( int outPlane = 0; outPlane < dim.numFilters; outPlane++ ) { - for( int filterRow = minFilterRow; filterRow <= maxFilterRow; filterRow++ ) { + int minFilterCol = std::max(0, upstreamCol + margin - (dim.outputSize -1)); + int maxFilterCol = std::min(dim.filterSize - 1, upstreamCol + margin); + for(int outPlane = 0; outPlane < dim.numFilters; outPlane++) { + for(int filterRow = minFilterRow; filterRow <= maxFilterRow; filterRow++) { int outRow = upstreamRow + margin - filterRow; - for( int filterCol = minFilterCol; filterCol <= maxFilterCol; filterCol++ ) { + for(int filterCol = minFilterCol; filterCol <= maxFilterCol; filterCol++) { int outCol = upstreamCol + margin - filterCol; - int resultIndex = ( ( n - * dim.numFilters + outPlane ) - * dim.outputImageSize + outRow ) - * dim.outputImageSize + outCol; + int resultIndex = (( n + * dim.numFilters + outPlane) + * dim.outputSize + outRow) + * dim.outputSize + outCol; float thisGradOutput = gradOutput[resultIndex]; - int thisWeightIndex = ( ( outPlane - * dim.inputPlanes + upstreamPlane ) - * dim.filterSize + filterRow ) + int thisWeightIndex = (( outPlane + * dim.inputPlanes + upstreamPlane) + * dim.filterSize + filterRow) * dim.filterSize + filterCol; float thisWeight = weights[thisWeightIndex]; sumWeightTimesGradOutput += thisWeight * thisGradOutput; } } } - int inputIndex = ( ( n - * dim.inputPlanes + upstreamPlane ) - * dim.inputImageSize + upstreamRow ) - * dim.inputImageSize + upstreamCol; + int inputIndex = (( n + * dim.inputPlanes + upstreamPlane) + * dim.inputSize + upstreamRow) + * dim.inputSize + upstreamCol; gradInput[inputIndex] = sumWeightTimesGradOutput; // * activationDerivativeUpstream; } } } } // timer.timeCheck("calced errors for upstream"); - StatefulTimer::instance()->timeCheck("BackwardCpu end" ); + StatefulTimer::instance()->timeCheck("BackwardCpu end"); return gradInput; } -VIRTUAL void BackwardCpu::backward( int batchSize, +VIRTUAL void BackwardCpu::backward(int batchSize, CLWrapper *inputDataWrapper, CLWrapper *gradOutputWrapper, CLWrapper *weightsWrapper, - CLWrapper *gradInputWrapper ) { + CLWrapper *gradInputWrapper) { inputDataWrapper->copyToHost(); gradOutputWrapper->copyToHost(); 
weightsWrapper->copyToHost(); // float *bias = 0; -// if( dim.biased ) { +// if(dim.biased) { // biasWrapper->copyToHost(); // bias = (float *)biasWrapper->getHostArray(); // } - float *gradInput = backward( batchSize, (float *)inputDataWrapper->getHostArray(), - (float *)gradOutputWrapper->getHostArray(), (float *)weightsWrapper->getHostArray() ); + float *gradInput = backward(batchSize, (float *)inputDataWrapper->getHostArray(), + (float *)gradOutputWrapper->getHostArray(), (float *)weightsWrapper->getHostArray()); float *gradInputHostArray = (float*)gradInputWrapper->getHostArray(); const int gradInputWrapperSize = gradInputWrapper->size(); - for( int i = 0; i < gradInputWrapperSize; i++ ) { + for(int i = 0; i < gradInputWrapperSize; i++) { gradInputHostArray[i] = gradInput[i]; } gradInputWrapper->copyToDevice(); diff --git a/src/conv/BackwardCpu.h b/src/conv/BackwardCpu.h index 6d344181..623a6ee2 100644 --- a/src/conv/BackwardCpu.h +++ b/src/conv/BackwardCpu.h @@ -18,13 +18,13 @@ class BackwardCpu : public Backward { // cog_addheaders.add() // ]]] // generated, using cog: - BackwardCpu( EasyCL *cl, LayerDimensions dim ); + BackwardCpu(EasyCL *cl, LayerDimensions dim); VIRTUAL ~BackwardCpu(); - VIRTUAL float *backward( int batchSize, float *inputs, - float *gradOutput, float *weights ); - VIRTUAL void backward( int batchSize, + VIRTUAL float *backward(int batchSize, float *inputs, + float *gradOutput, float *weights); + VIRTUAL void backward(int batchSize, CLWrapper *inputDataWrapper, CLWrapper *gradOutputWrapper, CLWrapper *weightsWrapper, - CLWrapper *gradInputWrapper ); + CLWrapper *gradInputWrapper); // [[[end]]] }; diff --git a/src/conv/BackwardGpuCached.cpp b/src/conv/BackwardGpuCached.cpp index 094ad368..f4f8ace4 100644 --- a/src/conv/BackwardGpuCached.cpp +++ b/src/conv/BackwardGpuCached.cpp @@ -14,69 +14,69 @@ VIRTUAL BackwardGpuCached::~BackwardGpuCached() { delete kernel; // delete applyActivationDeriv; } -VIRTUAL void BackwardGpuCached::backward( int batchSize, +VIRTUAL void BackwardGpuCached::backward(int batchSize, CLWrapper *inputDataWrapper, CLWrapper *gradOutputWrapper, CLWrapper *weightsWrapper, - CLWrapper *gradInputWrapper ) { - StatefulTimer::instance()->timeCheck("BackwardGpuCached start" ); + CLWrapper *gradInputWrapper) { + StatefulTimer::instance()->timeCheck("BackwardGpuCached start"); // const int batchSize, // global const float *gradOutputGlobal, // global const float *filtersGlobal, // global float *gradInput, // local float *_errorImage, -// local float *_filterImage ) { +// local float *_filterImage) { kernel - ->in( batchSize ) - ->in( gradOutputWrapper ) - ->in( weightsWrapper ) - ->out( gradInputWrapper ) - ->localFloats( square( dim.outputImageSize ) ) - ->localFloats( square( dim.filterSize ) ); + ->in(batchSize) + ->in(gradOutputWrapper) + ->in(weightsWrapper) + ->out(gradInputWrapper) + ->localFloats(square(dim.outputSize) ) + ->localFloats(square(dim.filterSize) ); int numWorkgroups = batchSize * dim.inputPlanes; - int workgroupSize = square( dim.inputImageSize ); - workgroupSize = std::max( 32, workgroupSize ); // no point in wasting cores... + int workgroupSize = square(dim.inputSize); + workgroupSize = std::max(32, workgroupSize); // no point in wasting cores... 
int globalSize = numWorkgroups * workgroupSize; // int globalSize = batchSize * dim.inputCubeSize; // int workgroupsize = cl->getMaxWorkgroupSize(); -// globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize; +// globalSize = (( globalSize + workgroupsize - 1) / workgroupsize) * workgroupsize; // kernel->run_1d(globalSize, workgroupsize); // float const*gradInput = (float *)gradInputWrapper->getHostArray(); kernel->run_1d(globalSize, workgroupSize); cl->finish(); // gradInputWrapper->copyToHost(); - StatefulTimer::instance()->timeCheck("BackwardGpuCached after first kernel" ); -// for( int i = 0; i < min( 40, batchSize * dim.inputCubeSize ); i++ ) { + StatefulTimer::instance()->timeCheck("BackwardGpuCached after first kernel"); +// for(int i = 0; i < min(40, batchSize * dim.inputCubeSize); i++) { // cout << "efu[" << i << "]=" << gradInput[i] << endl; // } -// applyActivationDeriv->in( batchSize * dim.inputCubeSize )->in( gradInputWrapper )->in( inputDataWrapper ); +// applyActivationDeriv->in(batchSize * dim.inputCubeSize)->in(gradInputWrapper)->in(inputDataWrapper); // applyActivationDeriv->run_1d(globalSize, workgroupSize); -// applyActivationDeriv->in( batchSize * dim.inputCubeSize )->inout( gradInputWrapper )->in( inputDataWrapper ); +// applyActivationDeriv->in(batchSize * dim.inputCubeSize)->inout(gradInputWrapper)->in(inputDataWrapper); // applyActivationDeriv->run_1d(globalSize, workgroupSize); // cl->finish(); -// StatefulTimer::instance()->timeCheck("BackwardGpuCached after applyActivationDeriv" ); +// StatefulTimer::instance()->timeCheck("BackwardGpuCached after applyActivationDeriv"); // gradInputWrapper->copyToHost(); -// for( int i = 0; i < min( 40, batchSize * dim.inputCubeSize ); i++ ) { +// for(int i = 0; i < min(40, batchSize * dim.inputCubeSize); i++) { // cout << "efu2[" << i << "]=" << gradInput[i] << endl; // } - StatefulTimer::instance()->timeCheck("BackwardGpuCached end" ); + StatefulTimer::instance()->timeCheck("BackwardGpuCached end"); } -BackwardGpuCached::BackwardGpuCached( EasyCL *cl, LayerDimensions dim ) : - Backward( cl, dim ) +BackwardGpuCached::BackwardGpuCached(EasyCL *cl, LayerDimensions dim) : + Backward(cl, dim) { std::string options = dim.buildOptionsString(); options += ""; // " -D " + upstreamFn->getDefineName(); // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/backward_cached.cl", "calcGradInputCached", 'options' ) - // # stringify.write_kernel2( "broadcastMultiply", "cl/backproperrorsv2.cl", "broadcast_multiply", 'options' ) - // # stringify.write_kernel2( "applyActivationDeriv", "cl/applyActivationDeriv.cl", "applyActivationDeriv", 'options' ) - // # stringify.write_kernel( "kernelSource", "ClConvolve.cl") + // stringify.write_kernel2("kernel", "cl/backward_cached.cl", "calcGradInputCached", 'options') + // # stringify.write_kernel2("broadcastMultiply", "cl/backproperrorsv2.cl", "broadcast_multiply", 'options') + // # stringify.write_kernel2("applyActivationDeriv", "cl/applyActivationDeriv.cl", "applyActivationDeriv", 'options') + // # stringify.write_kernel("kernelSource", "ClConvolve.cl") // ]]] // generated using cog, from cl/backward_cached.cl: const char * kernelSource = @@ -86,11 +86,11 @@ BackwardGpuCached::BackwardGpuCached( EasyCL *cl, LayerDimensions dim ) : "// v. 2.0. 
If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "void copyLocal( local float *target, global float const *source, int N ) {\n" - " int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyLocal(local float *target, global float const *source, int N) {\n" + " int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " int offset = loop * get_local_size(0) + get_local_id(0);\n" - " if( offset < N ) {\n" + " if (offset < N) {\n" " target[offset] = source[offset];\n" " }\n" " }\n" @@ -102,7 +102,7 @@ BackwardGpuCached::BackwardGpuCached( EasyCL *cl, LayerDimensions dim ) : "// localid: [upstreamrow][upstreamcol]\n" "// per-thread aggregation: [outPlane][filterRow][filterCol]\n" "// need to store locally:\n" - "// - _gradOutputPlane. size = outputImageSizeSquared\n" + "// - _gradOutputPlane. size = outputSizeSquared\n" "// - _filterPlane. size = filtersizesquared\n" "// note: currently doesnt use bias as input. thats probably an error?\n" "// inputs: gradOutput :convolve: filters => gradInput\n" @@ -120,7 +120,7 @@ BackwardGpuCached::BackwardGpuCached( EasyCL *cl, LayerDimensions dim ) : " global const float *filtersGlobal,\n" " global float *gradInput,\n" " local float *_gradOutputPlane,\n" - " local float *_filterPlane ) {\n" + " local float *_filterPlane) {\n" "\n" " #define globalId get_global_id(0)\n" " #define localId get_local_id(0)\n" @@ -130,30 +130,30 @@ BackwardGpuCached::BackwardGpuCached( EasyCL *cl, LayerDimensions dim ) : " const int n = workgroupId / gInputPlanes;\n" " const int upstreamPlane = workgroupId % gInputPlanes;\n" "\n" - " const int upstreamRow = localId / gInputImageSize;\n" - " const int upstreamCol = localId % gInputImageSize;\n" + " const int upstreamRow = localId / gInputSize;\n" + " const int upstreamCol = localId % gInputSize;\n" "\n" " float sumWeightTimesOutError = 0;\n" - " for( int outPlane = 0; outPlane < gNumFilters; outPlane++ ) {\n" + " for (int outPlane = 0; outPlane < gNumFilters; outPlane++) {\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - " copyLocal( _filterPlane, filtersGlobal + ( outPlane * gInputPlanes + upstreamPlane ) * gFilterSizeSquared, gFilterSizeSquared );\n" - " copyLocal( _gradOutputPlane, gradOutputGlobal + ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared, gOutputImageSizeSquared );\n" + " copyLocal(_filterPlane, filtersGlobal + (outPlane * gInputPlanes + upstreamPlane) * gFilterSizeSquared, gFilterSizeSquared);\n" + " copyLocal(_gradOutputPlane, gradOutputGlobal + (n * gNumFilters + outPlane) * gOutputSizeSquared, gOutputSizeSquared);\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - " for( int filterRow = 0; filterRow < gFilterSize; filterRow++ ) {\n" + " for (int filterRow = 0; filterRow < gFilterSize; filterRow++) {\n" " int outRow = upstreamRow + gMargin - filterRow;\n" - " for( int filterCol = 0; filterCol < gFilterSize; filterCol++ ) {\n" + " for (int filterCol = 0; filterCol < gFilterSize; filterCol++) {\n" " int outCol = upstreamCol + gMargin - filterCol;\n" - " if( outCol >= 0 && outCol < gOutputImageSize && outRow >= 0 && outRow < gOutputImageSize ) {\n" + " if (outCol >= 0 && outCol < gOutputSize && outRow >= 0 && outRow < gOutputSize) {\n" " float thisWeightTimesError =\n" - " _gradOutputPlane[outRow * gOutputImageSize + outCol] *\n" + " _gradOutputPlane[outRow * gOutputSize + outCol] *\n" " _filterPlane[filterRow * 
gFilterSize + filterCol];\n" " sumWeightTimesOutError += thisWeightTimesError;\n" " }\n" " }\n" " }\n" " }\n" - " const int upstreamImageGlobalOffset = ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared;\n" - " if( localId < gInputImageSizeSquared ) {\n" + " const int upstreamImageGlobalOffset = (n * gInputPlanes + upstreamPlane) * gInputSizeSquared;\n" + " if (localId < gInputSizeSquared) {\n" " gradInput[upstreamImageGlobalOffset + localId] = sumWeightTimesOutError;\n" " }\n" "}\n" @@ -161,7 +161,7 @@ BackwardGpuCached::BackwardGpuCached( EasyCL *cl, LayerDimensions dim ) : ""; kernel = cl->buildKernelFromString( kernelSource, "calcGradInputCached", options, "cl/backward_cached.cl" ); // [[[end]]] -// kernel = cl->buildKernel( "backproperrorsv2.cl", "calcGradInput", options ); -// kernel = cl->buildKernelFromString( kernelSource, "calcGradInput", options ); +// kernel = cl->buildKernel("backproperrorsv2.cl", "calcGradInput", options); +// kernel = cl->buildKernelFromString(kernelSource, "calcGradInput", options); } diff --git a/src/conv/BackwardGpuCached.h b/src/conv/BackwardGpuCached.h index 661ea032..9ff2f750 100644 --- a/src/conv/BackwardGpuCached.h +++ b/src/conv/BackwardGpuCached.h @@ -17,10 +17,10 @@ class BackwardGpuCached : public Backward { // ]]] // generated, using cog: VIRTUAL ~BackwardGpuCached(); - VIRTUAL void backward( int batchSize, + VIRTUAL void backward(int batchSize, CLWrapper *inputDataWrapper, CLWrapper *gradOutputWrapper, CLWrapper *weightsWrapper, - CLWrapper *gradInputWrapper ); - BackwardGpuCached( EasyCL *cl, LayerDimensions dim ); + CLWrapper *gradInputWrapper); + BackwardGpuCached(EasyCL *cl, LayerDimensions dim); // [[[end]]] }; diff --git a/src/conv/BackwardGpuNaive.cpp b/src/conv/BackwardGpuNaive.cpp index ea587299..b2a04438 100644 --- a/src/conv/BackwardGpuNaive.cpp +++ b/src/conv/BackwardGpuNaive.cpp @@ -15,43 +15,43 @@ VIRTUAL BackwardGpuNaive::~BackwardGpuNaive() { // delete broadcastMultiply; // delete applyActivationDeriv; } -VIRTUAL void BackwardGpuNaive::backward( int batchSize, +VIRTUAL void BackwardGpuNaive::backward(int batchSize, CLWrapper *inputDataWrapper, CLWrapper *gradOutputWrapper, CLWrapper *weightsWrapper, - CLWrapper *gradInputWrapper ) { - StatefulTimer::instance()->timeCheck("BackwardGpuNaive start" ); + CLWrapper *gradInputWrapper) { + StatefulTimer::instance()->timeCheck("BackwardGpuNaive start"); kernel - ->in( batchSize ) - ->in( gradOutputWrapper ) - ->in( weightsWrapper ) - ->out( gradInputWrapper ); + ->in(batchSize) + ->in(gradOutputWrapper) + ->in(weightsWrapper) + ->out(gradInputWrapper); int globalSize = batchSize * dim.inputCubeSize; int workgroupsize = cl->getMaxWorkgroupSize(); - globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize; + globalSize = (( globalSize + workgroupsize - 1) / workgroupsize) * workgroupsize; kernel->run_1d(globalSize, workgroupsize); cl->finish(); - StatefulTimer::instance()->timeCheck("BackwardGpuNaive after first kernel" ); + StatefulTimer::instance()->timeCheck("BackwardGpuNaive after first kernel"); -// applyActivationDeriv->in( batchSize * dim.inputCubeSize )->in( gradInputWrapper )->in( inputDataWrapper ); +// applyActivationDeriv->in(batchSize * dim.inputCubeSize)->in(gradInputWrapper)->in(inputDataWrapper); // applyActivationDeriv->run_1d(globalSize, workgroupsize); // cl->finish(); -// StatefulTimer::instance()->timeCheck("BackwardGpuNaive after applyActivationDeriv" ); +// StatefulTimer::instance()->timeCheck("BackwardGpuNaive after 
applyActivationDeriv"); - StatefulTimer::instance()->timeCheck("BackwardGpuNaive end" ); + StatefulTimer::instance()->timeCheck("BackwardGpuNaive end"); } -BackwardGpuNaive::BackwardGpuNaive( EasyCL *cl, LayerDimensions dim ) : - Backward( cl, dim ) +BackwardGpuNaive::BackwardGpuNaive(EasyCL *cl, LayerDimensions dim) : + Backward(cl, dim) { std::string options = dim.buildOptionsString(); options += ""; // " -D " + upstreamFn->getDefineName(); // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/backward.cl", "calcGradInput", 'options' ) - // # stringify.write_kernel2( "broadcastMultiply", "cl/backproperrorsv2.cl", "broadcast_multiply", 'options' ) - // # stringify.write_kernel2( "applyActivationDeriv", "cl/applyActivationDeriv.cl", "applyActivationDeriv", 'options' ) - // # stringify.write_kernel( "kernelSource", "ClConvolve.cl") + // stringify.write_kernel2("kernel", "cl/backward.cl", "calcGradInput", 'options') + // # stringify.write_kernel2("broadcastMultiply", "cl/backproperrorsv2.cl", "broadcast_multiply", 'options') + // # stringify.write_kernel2("applyActivationDeriv", "cl/applyActivationDeriv.cl", "applyActivationDeriv", 'options') + // # stringify.write_kernel("kernelSource", "ClConvolve.cl") // ]]] // generated using cog, from cl/backward.cl: const char * kernelSource = @@ -70,42 +70,42 @@ BackwardGpuNaive::BackwardGpuNaive( EasyCL *cl, LayerDimensions dim ) : "// weights: [filterId][inputPlane][filterRow][filterCol] 32 * 32 * 5 * 5 * 4 = 409KB\n" "void kernel calcGradInput(\n" " const int batchSize,\n" - " global const float *gradOutput, global float *weights, global float *gradInput ) {\n" + " global const float *gradOutput, global float *weights, global float *gradInput) {\n" " int globalId = get_global_id(0);\n" "\n" - " const int upstreamImage2dId = globalId / gInputImageSizeSquared;\n" + " const int upstreamImage2dId = globalId / gInputSizeSquared;\n" "\n" - " const int intraImageOffset = globalId % gInputImageSizeSquared;\n" - " const int upstreamRow = intraImageOffset / gInputImageSize;\n" - " const int upstreamCol = intraImageOffset % gInputImageSize;\n" + " const int intraImageOffset = globalId % gInputSizeSquared;\n" + " const int upstreamRow = intraImageOffset / gInputSize;\n" + " const int upstreamCol = intraImageOffset % gInputSize;\n" "\n" " const int upstreamPlane = upstreamImage2dId % gInputPlanes;\n" " const int n = upstreamImage2dId / gInputPlanes;\n" "\n" - " if( n >= batchSize ) {\n" + " if (n >= batchSize) {\n" " return;\n" " }\n" "\n" - " const int minFilterRow = max( 0, upstreamRow + gMargin - (gOutputImageSize - 1) );\n" - " const int maxFilterRow = min( gFilterSize - 1, upstreamRow + gMargin );\n" - " const int minFilterCol = max( 0, upstreamCol + gMargin - (gOutputImageSize -1) );\n" - " const int maxFilterCol = min( gFilterSize - 1, upstreamCol + gMargin );\n" + " const int minFilterRow = max(0, upstreamRow + gMargin - (gOutputSize - 1));\n" + " const int maxFilterRow = min(gFilterSize - 1, upstreamRow + gMargin);\n" + " const int minFilterCol = max(0, upstreamCol + gMargin - (gOutputSize -1));\n" + " const int maxFilterCol = min(gFilterSize - 1, upstreamCol + gMargin);\n" "\n" " float sumWeightTimesOutError = 0;\n" " // aggregate over [outPlane][outRow][outCol]\n" - " for( int outPlane = 0; outPlane < gNumFilters; outPlane++ ) {\n" - " for( int filterRow = minFilterRow; filterRow <= maxFilterRow; filterRow++ ) {\n" + " for (int outPlane = 0; outPlane < gNumFilters; outPlane++) {\n" + " for (int filterRow = minFilterRow; filterRow 
<= maxFilterRow; filterRow++) {\n" " int outRow = upstreamRow + gMargin - filterRow;\n" - " for( int filterCol = minFilterCol; filterCol <= maxFilterCol; filterCol++ ) {\n" + " for (int filterCol = minFilterCol; filterCol <= maxFilterCol; filterCol++) {\n" " int outCol = upstreamCol + gMargin - filterCol;\n" - " int resultIndex = ( ( n * gNumFilters\n" - " + outPlane ) * gOutputImageSize\n" - " + outRow ) * gOutputImageSize\n" + " int resultIndex = (( n * gNumFilters\n" + " + outPlane) * gOutputSize\n" + " + outRow) * gOutputSize\n" " + outCol;\n" " float thisError = gradOutput[resultIndex];\n" - " int thisWeightIndex = ( ( outPlane * gInputPlanes\n" - " + upstreamPlane ) * gFilterSize\n" - " + filterRow ) * gFilterSize\n" + " int thisWeightIndex = (( outPlane * gInputPlanes\n" + " + upstreamPlane) * gFilterSize\n" + " + filterRow) * gFilterSize\n" " + filterCol;\n" " float thisWeight = weights[thisWeightIndex];\n" " float thisWeightTimesError = thisWeight * thisError;\n" @@ -119,7 +119,7 @@ BackwardGpuNaive::BackwardGpuNaive( EasyCL *cl, LayerDimensions dim ) : ""; kernel = cl->buildKernelFromString( kernelSource, "calcGradInput", options, "cl/backward.cl" ); // [[[end]]] -// kernel = cl->buildKernel( "backproperrorsv2.cl", "calcGradInput", options ); -// kernel = cl->buildKernelFromString( kernelSource, "calcGradInput", options ); +// kernel = cl->buildKernel("backproperrorsv2.cl", "calcGradInput", options); +// kernel = cl->buildKernelFromString(kernelSource, "calcGradInput", options); } diff --git a/src/conv/BackwardGpuNaive.h b/src/conv/BackwardGpuNaive.h index a5538434..a1710819 100644 --- a/src/conv/BackwardGpuNaive.h +++ b/src/conv/BackwardGpuNaive.h @@ -18,10 +18,10 @@ class BackwardGpuNaive : public Backward { // ]]] // generated, using cog: VIRTUAL ~BackwardGpuNaive(); - VIRTUAL void backward( int batchSize, + VIRTUAL void backward(int batchSize, CLWrapper *inputDataWrapper, CLWrapper *gradOutputWrapper, CLWrapper *weightsWrapper, - CLWrapper *gradInputWrapper ); - BackwardGpuNaive( EasyCL *cl, LayerDimensions dim ); + CLWrapper *gradInputWrapper); + BackwardGpuNaive(EasyCL *cl, LayerDimensions dim); // [[[end]]] }; diff --git a/src/conv/BackwardIm2Col.cpp b/src/conv/BackwardIm2Col.cpp new file mode 100644 index 00000000..1c00f093 --- /dev/null +++ b/src/conv/BackwardIm2Col.cpp @@ -0,0 +1,76 @@ +#include "util/stringhelper.h" +#include "util/StatefulTimer.h" + +#include +#include +#include + +//#include "clblas/ClBlasInstance.h" +#include "clblas/ClBlasHelper.h" +#include "conv/Im2Col.h" +#include "BackwardIm2Col.h" + +using namespace std; + +#undef STATIC +#define STATIC + +#undef VIRTUAL +#define VIRTUAL + +#define PUBLIC + +PUBLIC BackwardIm2Col::BackwardIm2Col(EasyCL *cl, LayerDimensions dim) : + Backward(cl, dim) + { +// ClBlasInstance::initializeIfNecessary(); + im2Col = new Im2Col(cl, dim); +} +PUBLIC VIRTUAL BackwardIm2Col::~BackwardIm2Col() { + delete im2Col; +} +PUBLIC VIRTUAL void BackwardIm2Col::backward(int batchSize, + CLWrapper *inputDataWrapper, CLWrapper *gradOutputWrapper, CLWrapper *weightsWrapper, + CLWrapper *gradInputWrapper) { + StatefulTimer::timeCheck("BackwardIm2Col::backward START"); + + int gradColumnsSize = dim.inputPlanes * dim.filterSizeSquared * dim.outputSizeSquared; + float *gradColumns = new float[gradColumnsSize]; + CLWrapper *gradColumnsWrapper = cl->wrap(gradColumnsSize, gradColumns); + gradColumnsWrapper->createOnDevice(); +// cout << "gradColumnsSize: " << gradColumnsSize << endl; +// cout << "weightsize: " << weightsWrapper->size() << 
endl; + + StatefulTimer::timeCheck("BackwardIm2Col::backward after alloc"); + + if(!gradInputWrapper->isOnDevice()) { + gradInputWrapper->createOnDevice(); + } + for (int b = 0; b < batchSize; b ++) { +// cout << "b=" << b << " numkernels=" << numKernels << endl; + long m = dim.outputSizeSquared; + long n = dim.inputPlanes * dim.filterSizeSquared; + long k = dim.numFilters; +// cout << "m=" << m << " k=" << k << " n=" << n << endl; + + ClBlasHelper::Gemm( + cl, clblasColumnMajor, clblasNoTrans, clblasTrans, + m, k, n, + 1, + gradOutputWrapper, b * dim.outputCubeSize, + weightsWrapper, 0, + 0, + gradColumnsWrapper, 0 + ); + + im2Col->col2Im(gradColumnsWrapper, gradInputWrapper, b * dim.inputCubeSize); + } + + delete gradColumnsWrapper; + delete[] gradColumns; + + StatefulTimer::timeCheck("BackwardIm2Col::backward after call backward"); + + StatefulTimer::timeCheck("BackwardIm2Col::backward END"); +} + diff --git a/src/conv/BackwardIm2Col.h b/src/conv/BackwardIm2Col.h new file mode 100644 index 00000000..ce5b7da7 --- /dev/null +++ b/src/conv/BackwardIm2Col.h @@ -0,0 +1,38 @@ +#pragma once + +#include "Backward.h" +#include "EasyCL.h" + +#include "DeepCLDllExport.h" + +class Im2Col; + +#define STATIC static +#define VIRTUAL virtual + +class DeepCL_EXPORT BackwardIm2Col : public Backward { + private: + Im2Col *im2Col; +// CLKernel *kernelCol2Im; +// AddBias *addBias; + + float *columns; + CLWrapper *columnsWrapper; + int numKernels; + + // [[[cog + // import cog_addheaders + // cog_addheaders.addv2() + // ]]] + // generated, using cog: + + public: + BackwardIm2Col(EasyCL *cl, LayerDimensions dim); + VIRTUAL ~BackwardIm2Col(); + VIRTUAL void backward(int batchSize, + CLWrapper *inputDataWrapper, CLWrapper *gradOutputWrapper, CLWrapper *weightsWrapper, + CLWrapper *gradInputWrapper); + + // [[[end]]] +}; + diff --git a/src/conv/ConvolutionalLayer.cpp b/src/conv/ConvolutionalLayer.cpp index 563b5c9f..64b62b0d 100644 --- a/src/conv/ConvolutionalLayer.cpp +++ b/src/conv/ConvolutionalLayer.cpp @@ -24,84 +24,84 @@ using namespace std; #undef VIRTUAL #define VIRTUAL -ConvolutionalLayer::ConvolutionalLayer( EasyCL *cl, Layer *previousLayer, ConvolutionalMaker *maker ) : - Layer( previousLayer, maker ), -// filterSize( maker->_filterSize ), -// filterSizeSquared( filterSize * filterSize ), -// padZeros( maker->_padZeros ), - cl( cl ), - trainerState( 0 ), - biasTrainerState( 0 ), +ConvolutionalLayer::ConvolutionalLayer(EasyCL *cl, Layer *previousLayer, ConvolutionalMaker *maker) : + Layer(previousLayer, maker), +// filterSize(maker->_filterSize), +// filterSizeSquared(filterSize * filterSize), +// padZeros(maker->_padZeros), + cl(cl), + trainerState(0), + biasTrainerState(0), forwardImpl(0), backwardImpl(0), weights(0), bias(0), output(0), - gradInput( 0 ), - gradWeights( 0 ), - gradBias( 0 ), + gradInput(0), + gradWeights(0), + gradBias(0), - weightsWrapper( 0 ), - biasWrapper( 0 ), - outputWrapper( 0 ), - gradInputWrapper( 0 ), - gradWeightsWrapper( 0 ), - gradBiasWrapper( 0 ), + weightsWrapper(0), + biasWrapper(0), + outputWrapper(0), + gradInputWrapper(0), + gradWeightsWrapper(0), + gradBiasWrapper(0), - batchSize( 0 ), - allocatedSpaceNumExamples( 0 ) + batchSize(0), + allocatedSpaceNumExamples(0) { - dim.setInputPlanes( previousLayer->getOutputPlanes() ) - .setInputImageSize( previousLayer->getOutputImageSize() ) - .setNumFilters( maker->_numFilters ) - .setFilterSize( maker->_filterSize ) - .setBiased( maker->_biased ) - .setPadZeros( maker->_padZeros ); - if( dim.padZeros && dim.filterSize % 2 
== 0 ) { + dim.setInputPlanes(previousLayer->getOutputPlanes()) + .setInputSize(previousLayer->getOutputSize()) + .setNumFilters(maker->_numFilters) + .setFilterSize(maker->_filterSize) + .setBiased(maker->_biased) + .setPadZeros(maker->_padZeros); + if(dim.padZeros && dim.filterSize % 2 == 0) { throw std::runtime_error("filter size must be an odd number, if padZeros is true, so either turn off padZeros, or choose a different filtersize :-)"); } -// weightsTrainer = new SGD( cl, getWeightsSize() ); // so it doesnt crash... -// biasTrainer = new SGD( cl, getBiasSize() ); +// weightsTrainer = new SGD(cl, getWeightsSize()); // so it doesnt crash... +// biasTrainer = new SGD(cl, getBiasSize()); -// dim = LayerDimensions( upstreamNumPlanes, upstreamImageSize, -// numPlanes, filterSize, padZeros, biased ); - forwardImpl = Forward::instance( cl, dim ); - backpropWeightsImpl = BackpropWeights::instance( cl, dim ); - if( previousLayer->needsBackProp() ) { - backwardImpl = Backward::instance( cl, dim ); +// dim = LayerDimensions(upstreamNumPlanes, upstreamImageSize, +// numPlanes, filterSize, padZeros, biased); + forwardImpl = Forward::instance(cl, dim); + backpropWeightsImpl = BackpropWeights::instance(cl, dim); + if(previousLayer->needsBackProp()) { + backwardImpl = Backward::instance(cl, dim); } - if( dim.filterSize > dim.inputImageSize ) { - throw std::runtime_error("filter size cannot be larger than upstream image size: " + toString( dim.filterSize) + - " > " + toString(dim.inputImageSize) ); + if(dim.filterSize > dim.inputSize) { + throw std::runtime_error("filter size cannot be larger than upstream image size: " + toString(dim.filterSize) + + " > " + toString(dim.inputSize)); } weights = new float[ getWeightsSize() ]; - if( dim.biased ) { + if(dim.biased) { bias = new float[ getBiasSize() ]; } - randomizeWeights( maker->_weightsInitializer ); + randomizeWeights(maker->_weightsInitializer); - weightsWrapper = cl->wrap( getWeightsSize(), weights ); + weightsWrapper = cl->wrap(getWeightsSize(), weights); weightsWrapper->copyToDevice(); - if( dim.biased ) { - biasWrapper = cl->wrap( getBiasSize(), bias ); + if(dim.biased) { + biasWrapper = cl->wrap(getBiasSize(), bias); biasWrapper->copyToDevice(); } gradWeights = new float[ getWeightsSize() ]; - gradWeightsWrapper = cl->wrap( getWeightsSize(), gradWeights ); + gradWeightsWrapper = cl->wrap(getWeightsSize(), gradWeights); gradWeightsWrapper->createOnDevice(); - if( dim.biased ) { + if(dim.biased) { gradBias = new float[ getBiasSize() ]; - gradBiasWrapper = cl->wrap( getBiasSize(), gradBias ); + gradBiasWrapper = cl->wrap(getBiasSize(), gradBias); gradBiasWrapper->createOnDevice(); } - gpuAdd = new GpuAdd( cl ); - copyBuffer = new CopyBuffer( cl ); + gpuAdd = new GpuAdd(cl); + copyBuffer = new CopyBuffer(cl); } VIRTUAL ConvolutionalLayer::~ConvolutionalLayer() { delete gpuAdd; @@ -134,21 +134,21 @@ VIRTUAL std::string ConvolutionalLayer::getClassName() const { // return activationFunction; //} VIRTUAL float *ConvolutionalLayer::getGradInput() { - if( gradInputWrapper->isDeviceDirty() ) { + if(gradInputWrapper->isDeviceDirty()) { // std::cout << "copying gradInput to host, from GPU" << std::endl; gradInputWrapper->copyToHost(); } return gradInput; } VIRTUAL float *ConvolutionalLayer::getGradWeights() { - if( gradWeightsWrapper->isDeviceDirty() ) { + if(gradWeightsWrapper->isDeviceDirty()) { // std::cout << "copying gradWeights to host, from GPU" << std::endl; gradWeightsWrapper->copyToHost(); } return gradWeights; } VIRTUAL float 
*ConvolutionalLayer::getGradBias() { - if( gradBiasWrapper->isDeviceDirty() ) { + if(gradBiasWrapper->isDeviceDirty()) { // std::cout << "copying gradBias to host, from GPU" << std::endl; gradBiasWrapper->copyToHost(); } @@ -181,32 +181,32 @@ VIRTUAL CLWrapper *ConvolutionalLayer::getOutputWrapper() { VIRTUAL bool ConvolutionalLayer::needsBackProp() { return true; } -VIRTUAL int ConvolutionalLayer::getOutputSize() const { +VIRTUAL int ConvolutionalLayer::getOutputNumElements() const { return batchSize * dim.outputCubeSize; } VIRTUAL int ConvolutionalLayer::getOutputPlanes() const { return dim.numFilters; } -VIRTUAL int ConvolutionalLayer::getOutputImageSize() const { - return dim.outputImageSize; +VIRTUAL int ConvolutionalLayer::getOutputSize() const { + return dim.outputSize; } // filters are organized like [filterid][plane][row][col] -void ConvolutionalLayer::randomizeWeights( WeightsInitializer *weightsInitializer ) { +void ConvolutionalLayer::randomizeWeights(WeightsInitializer *weightsInitializer) { // std::cout << "convolutional layer randomzing weights" << std::endl; int fanin = dim.inputPlanes * dim.filterSize * dim.filterSize; - if( dim.biased ) { + if(dim.biased) { fanin++; } const int numThisLayerWeights = getWeightsSize(); - weightsInitializer->initializeWeights( numThisLayerWeights, weights, fanin ); - if( dim.biased ) { - weightsInitializer->initializeWeights( dim.numFilters, bias, fanin ); + weightsInitializer->initializeWeights(numThisLayerWeights, weights, fanin); + if(dim.biased) { + weightsInitializer->initializeWeights(dim.numFilters, bias, fanin); } } VIRTUAL void ConvolutionalLayer::print() { std::cout << "ConvolutionalLayer " << dim << std::endl; printWeights(); - if( output != 0 ) { + if(output != 0) { printOutput(); } } @@ -214,62 +214,62 @@ VIRTUAL void ConvolutionalLayer::printWeights() { std::cout << " weights: " << std::endl; getWeights(); // filters are organized like [filterid][plane][row][col] - for( int filter = 0; filter < std::min( 5, dim.numFilters ); filter++ ) { + for(int filter = 0; filter < std::min(5, dim.numFilters); filter++) { std::cout << " filter " << filter << std::endl; - if( dim.biased ) { + if(dim.biased) { std::cout << " bias=" << bias[filter] << std::endl; } - for( int plane = 0; plane < std::min(5, dim.inputPlanes); plane++ ) { - if( dim.inputPlanes > 1 ) std::cout << " inplane " << plane << std::endl; - for( int i = 0; i < std::min(5, dim.filterSize); i++ ) { + for(int plane = 0; plane < std::min(5, dim.inputPlanes); plane++) { + if(dim.inputPlanes > 1) std::cout << " inplane " << plane << std::endl; + for(int i = 0; i < std::min(5, dim.filterSize); i++) { std::cout << " "; - for( int j = 0; j < std::min(5, dim.filterSize); j++ ) { - std::cout << getWeight( filter, plane, i, j ) << " "; + for(int j = 0; j < std::min(5, dim.filterSize); j++) { + std::cout << getWeight(filter, plane, i, j) << " "; } - if( dim.filterSize > 5 ) { + if(dim.filterSize > 5) { std::cout << " ..."; } std::cout << std::endl; } - if( dim.filterSize > 5 ) { + if(dim.filterSize > 5) { std::cout << " ..." << std::endl; } } - if( dim.inputPlanes > 5 ) std::cout << " ... other inplanes ... " << std::endl; + if(dim.inputPlanes > 5) std::cout << " ... other inplanes ... " << std::endl; } - if( dim.numFilters > 5 ) std::cout << " ... other filters ... " << std::endl; + if(dim.numFilters > 5) std::cout << " ... other filters ... 
" << std::endl; } VIRTUAL void ConvolutionalLayer::printOutput() { - if( output == 0 ) { + if(output == 0) { return; } // getOutput(); std::cout << " outputs: " << std::endl; // output are organized like [imageid][filterid][row][col] - for( int n = 0; n < std::min( 5, batchSize ); n++ ) { + for(int n = 0; n < std::min(5, batchSize); n++) { std::cout << " n: " << n << std::endl; - for( int plane = 0; plane < std::min(5, dim.numFilters ); plane++ ) { - if( dim.numFilters > 1 ) std::cout << " plane " << plane << std::endl; - if( dim.outputImageSize == 1 ) { - std::cout << " " << getOutput(n, plane, 0, 0 ) << std::endl; + for(int plane = 0; plane < std::min(5, dim.numFilters); plane++) { + if(dim.numFilters > 1) std::cout << " plane " << plane << std::endl; + if(dim.outputSize == 1) { + std::cout << " " << getOutput(n, plane, 0, 0) << std::endl; } else { - for( int i = 0; i < std::min(5, dim.outputImageSize); i++ ) { + for(int i = 0; i < std::min(5, dim.outputSize); i++) { std::cout << " "; - for( int j = 0; j < std::min(5, dim.outputImageSize); j++ ) { - std::cout << getOutput( n, plane, i, j ) << " "; + for(int j = 0; j < std::min(5, dim.outputSize); j++) { + std::cout << getOutput(n, plane, i, j) << " "; } - if( dim.outputImageSize > 5 ) std::cout << " ... "; + if(dim.outputSize > 5) std::cout << " ... "; std::cout << std::endl; } - if( dim.outputImageSize > 5 ) std::cout << " ... " << std::endl; + if(dim.outputSize > 5) std::cout << " ... " << std::endl; } - if( dim.numFilters > 5 ) std::cout << " ... other planes ... " << std::endl; + if(dim.numFilters > 5) std::cout << " ... other planes ... " << std::endl; } - if( batchSize > 5 ) std::cout << " ... other n ... " << std::endl; + if(batchSize > 5) std::cout << " ... other n ... " << std::endl; } } -VIRTUAL void ConvolutionalLayer::setBatchSize( int batchSize ) { - if( batchSize <= allocatedSpaceNumExamples ) { +VIRTUAL void ConvolutionalLayer::setBatchSize(int batchSize) { + if(batchSize <= allocatedSpaceNumExamples) { this->batchSize = batchSize; return; } @@ -283,76 +283,76 @@ VIRTUAL void ConvolutionalLayer::setBatchSize( int batchSize ) { delete gradInputWrapper; delete[] gradInput; - output = new float[getOutputSize()]; - outputWrapper = cl->wrap( getOutputSize(), output ); + output = new float[getOutputNumElements()]; + outputWrapper = cl->wrap(getOutputNumElements(), output); - if( layerIndex > 1 ) { - gradInput = new float[ previousLayer->getOutputSize() ]; - gradInputWrapper = cl->wrap( previousLayer->getOutputSize(), gradInput ); + if(layerIndex > 1) { + gradInput = new float[ previousLayer->getOutputNumElements() ]; + gradInputWrapper = cl->wrap(previousLayer->getOutputNumElements(), gradInput); } } -VIRTUAL void ConvolutionalLayer::setWeights( float *weights, float *bias ) { +VIRTUAL void ConvolutionalLayer::setWeights(float *weights, float *bias) { // cout << "setweights" << endl; - initWeights( weights ); - if( dim.biased ) { - initBias( bias ); + initWeights(weights); + if(dim.biased) { + initBias(bias); } } VIRTUAL int ConvolutionalLayer::getOutputCubeSize() const { return dim.outputCubeSize; } -VIRTUAL int ConvolutionalLayer::getPersistSize( int version ) const { - if( dim.biased ) { +VIRTUAL int ConvolutionalLayer::getPersistSize(int version) const { + if(dim.biased) { return getWeightsSize() + getBiasSize(); } else { return getWeightsSize(); } } -VIRTUAL void ConvolutionalLayer::persistToArray( int version, float *array ) { +VIRTUAL void ConvolutionalLayer::persistToArray(int version, float *array) { float const*weights 
= getWeights(); - memcpy( array, weights, sizeof(float) * getWeightsSize() ); - if( dim.biased ) { + memcpy(array, weights, sizeof(float) * getWeightsSize()); + if(dim.biased) { float const *bias = getBias(); - memcpy( array + getWeightsSize(), bias, sizeof(float) * getBiasSize() ); + memcpy(array + getWeightsSize(), bias, sizeof(float) * getBiasSize()); } } -VIRTUAL void ConvolutionalLayer::unpersistFromArray( int version, float const*array ) { +VIRTUAL void ConvolutionalLayer::unpersistFromArray(int version, float const*array) { float const*newweights = array; - initWeights( newweights ); - if( dim.biased ) { + initWeights(newweights); + if(dim.biased) { float const*newbias = array + getWeightsSize(); - initBias( newbias ); + initBias(newbias); } } -VIRTUAL void ConvolutionalLayer::initWeights( float const*weights ) { +VIRTUAL void ConvolutionalLayer::initWeights(float const*weights) { // cout << "initweights()" << endl; int weightsSize = getWeightsSize(); - memcpy( this->weights, weights, sizeof(float) * weightsSize ); + memcpy(this->weights, weights, sizeof(float) * weightsSize); weightsWrapper->copyToDevice(); } -VIRTUAL void ConvolutionalLayer::initBias( float const*bias ) { +VIRTUAL void ConvolutionalLayer::initBias(float const*bias) { int biasSize = dim.numFilters; - memcpy( this->bias, bias, sizeof(float) * biasSize ); + memcpy(this->bias, bias, sizeof(float) * biasSize); biasWrapper->copyToDevice(); } VIRTUAL int ConvolutionalLayer::getWeightsSize() const { return dim.numFilters * dim.inputPlanes * dim.filterSize * dim.filterSize; } VIRTUAL int ConvolutionalLayer::getBiasSize() const { - if( dim.biased ) { + if(dim.biased) { return dim.numFilters; } else { return 0; } } VIRTUAL float const *ConvolutionalLayer::getWeights() const { - if( weightsWrapper->isDeviceDirty() ) { + if(weightsWrapper->isDeviceDirty()) { throw std::runtime_error("weights not copied to host, and htis is const object, so cannot copy"); } return weights; } VIRTUAL float *ConvolutionalLayer::getWeights() { - if( weightsWrapper->isDeviceDirty() ) { + if(weightsWrapper->isDeviceDirty()) { // cout << "copying weights to host" << endl; cl->finish(); weightsWrapper->copyToHost(); @@ -360,108 +360,108 @@ VIRTUAL float *ConvolutionalLayer::getWeights() { return weights; } VIRTUAL float *ConvolutionalLayer::getBias() { - if( biasWrapper->isDeviceDirty() ) { + if(biasWrapper->isDeviceDirty()) { cl->finish(); biasWrapper->copyToHost(); } return bias; } VIRTUAL float const*ConvolutionalLayer::getBias() const { - if( biasWrapper->isDeviceDirty() ) { + if(biasWrapper->isDeviceDirty()) { throw std::runtime_error("bias not copied to host, and htis is const object, so cannot copy"); } return bias; } VIRTUAL float * ConvolutionalLayer::getOutput() { - if( outputWrapper->isDeviceDirty() ) { + if(outputWrapper->isDeviceDirty()) { outputWrapper->copyToHost(); // outputCopiedToHost = true; } return output; }; VIRTUAL void ConvolutionalLayer::forward() { - if( batchSize == 0 ) { + if(batchSize == 0) { throw runtime_error("Need to call setBatchSize(size) before calling forward etc"); } - StatefulTimer::instance()->timeCheck(" forward layer " + toString( layerIndex ) + ", START"); + StatefulTimer::instance()->timeCheck(" forward layer " + toString(layerIndex) + ", START"); CLWrapper *upstreamWrapper = 0; - if( previousLayer->hasOutputWrapper() ) { + if(previousLayer->hasOutputWrapper()) { // std::cout << "layer " << previousLayer->layerIndex << " has outputWrapper" << std::endl; upstreamWrapper = previousLayer->getOutputWrapper(); } 
else { // std::cout << "layer " << previousLayer->layerIndex << " has no outputWrapper" << std::endl; - upstreamWrapper = cl->wrap( previousLayer->getOutputSize(), (float *)previousLayer->getOutput() ); + upstreamWrapper = cl->wrap(previousLayer->getOutputNumElements(), (float *)previousLayer->getOutput()); upstreamWrapper->copyToDevice(); } - StatefulTimer::instance()->timeCheck(" forward layer " + toString( layerIndex ) + ", copied to device"); - forwardImpl->forward( batchSize, upstreamWrapper, weightsWrapper, biasWrapper, outputWrapper ); - StatefulTimer::instance()->timeCheck(" forward layer " + toString( layerIndex ) + ", after clFinish"); + StatefulTimer::instance()->timeCheck(" forward layer " + toString(layerIndex) + ", copied to device"); + forwardImpl->forward(batchSize, upstreamWrapper, weightsWrapper, biasWrapper, outputWrapper); + StatefulTimer::instance()->timeCheck(" forward layer " + toString(layerIndex) + ", after clFinish"); - if( !previousLayer->hasOutputWrapper() ) { + if(!previousLayer->hasOutputWrapper()) { delete upstreamWrapper; } // outputCopiedToHost = false; } VIRTUAL void ConvolutionalLayer::backward() { - StatefulTimer::instance()->timeCheck("backprop(): start, layer " + toString( layerIndex ) ); + StatefulTimer::instance()->timeCheck("backprop(): start, layer " + toString(layerIndex) ); CLWrapper *inputWrapper = 0; - if( previousLayer->hasOutputWrapper() ) { + if(previousLayer->hasOutputWrapper()) { inputWrapper = previousLayer->getOutputWrapper(); } else { - inputWrapper = cl->wrap( previousLayer->getOutputSize(), previousLayer->getOutput() ); + inputWrapper = cl->wrap(previousLayer->getOutputNumElements(), previousLayer->getOutput()); inputWrapper->copyToDevice(); } CLWrapper *gradOutputWrapper = 0; bool weOwnGradOutputWrapper = false; - if( nextLayer->providesGradInputWrapper() ) { + if(nextLayer->providesGradInputWrapper()) { gradOutputWrapper = nextLayer->getGradInputWrapper(); } else { - gradOutputWrapper = cl->wrap( getOutputSize(), nextLayer->getGradInput() ); + gradOutputWrapper = cl->wrap(getOutputNumElements(), nextLayer->getGradInput()); gradOutputWrapper->copyToDevice(); weOwnGradOutputWrapper = true; } - if( previousLayer->needsBackProp() ) { - backwardImpl->backward( batchSize, inputWrapper, gradOutputWrapper, weightsWrapper, gradInputWrapper ); - StatefulTimer::instance()->timeCheck("backproperrors(): calced gradInput, layer " + ::toString( layerIndex ) ); + if(previousLayer->needsBackProp()) { + backwardImpl->backward(batchSize, inputWrapper, gradOutputWrapper, weightsWrapper, gradInputWrapper); + StatefulTimer::instance()->timeCheck("backproperrors(): calced gradInput, layer " + ::toString(layerIndex) ); } - backpropWeightsImpl->calcGradWeights( batchSize, gradOutputWrapper, inputWrapper, gradWeightsWrapper, gradBiasWrapper ); - StatefulTimer::instance()->timeCheck("backproperrors(): done calc gradWeights, layer " + ::toString( layerIndex ) ); + backpropWeightsImpl->calcGradWeights(batchSize, gradOutputWrapper, inputWrapper, gradWeightsWrapper, gradBiasWrapper); + StatefulTimer::instance()->timeCheck("backproperrors(): done calc gradWeights, layer " + ::toString(layerIndex) ); // gradWeightsCopiedToHost = false; // gradBiasCopiedToHost = false; - if( !previousLayer->hasOutputWrapper() ) { + if(!previousLayer->hasOutputWrapper()) { delete inputWrapper; } - if( weOwnGradOutputWrapper ) { + if(weOwnGradOutputWrapper) { delete gradOutputWrapper; } } -//VIRTUAL void ConvolutionalLayer::setWeights( CLWrapper *weightWrapper, CLWrapper *biasWrapper 
) { -// copyBuffer->copy( getWeightsSize(), weightWrapper, this->weightsWrapper ); -// if( dim.biased ) { -// copyBuffer->copy( getBiasSize(), biasWrapper, this->biasWrapper ); +//VIRTUAL void ConvolutionalLayer::setWeights(CLWrapper *weightWrapper, CLWrapper *biasWrapper) { +// copyBuffer->copy(getWeightsSize(), weightWrapper, this->weightsWrapper); +// if(dim.biased) { +// copyBuffer->copy(getBiasSize(), biasWrapper, this->biasWrapper); // } // weightsCopiedToHost = false; // biasCopiedToHost = false; -// StatefulTimer::instance()->timeCheck("ConvolutionalLayer::setWeights(): set weights, layer " + ::toString( layerIndex ) ); +// StatefulTimer::instance()->timeCheck("ConvolutionalLayer::setWeights(): set weights, layer " + ::toString(layerIndex) ); //} -//VIRTUAL void ConvolutionalLayer::updateWeights( CLWrapper *weightChangesWrapper, CLWrapper *biasChangesWrapper ) { -// gpuAdd->add( getWeightsSize(), weightsWrapper, weightChangesWrapper ); -// if( dim.biased ) { -// gpuAdd->add( getBiasSize(), biasWrapper, biasChangesWrapper ); +//VIRTUAL void ConvolutionalLayer::updateWeights(CLWrapper *weightChangesWrapper, CLWrapper *biasChangesWrapper) { +// gpuAdd->add(getWeightsSize(), weightsWrapper, weightChangesWrapper); +// if(dim.biased) { +// gpuAdd->add(getBiasSize(), biasWrapper, biasChangesWrapper); // } // weightsCopiedToHost = false; // biasCopiedToHost = false; -// StatefulTimer::instance()->timeCheck("ConvolutionalLayer::updateWeights(): updated weights, layer " + ::toString( layerIndex ) ); +// StatefulTimer::instance()->timeCheck("ConvolutionalLayer::updateWeights(): updated weights, layer " + ::toString(layerIndex) ); //} VIRTUAL std::string ConvolutionalLayer::asString() const { - return "ConvolutionalLayer{ " + toString( dim ) + " }"; + return "ConvolutionalLayer{ " + toString(dim) + " }"; } VIRTUAL bool ConvolutionalLayer::needsTrainerState() const { return true; @@ -475,12 +475,12 @@ VIRTUAL TrainerState *ConvolutionalLayer::getTrainerState() { VIRTUAL TrainerState *ConvolutionalLayer::getBiasTrainerState() { return biasTrainerState; } -VIRTUAL void ConvolutionalLayer::setTrainerState( TrainerStateMaker *trainerStateMaker ) { +VIRTUAL void ConvolutionalLayer::setTrainerState(TrainerStateMaker *trainerStateMaker) { delete trainerState; delete biasTrainerState; - this->trainerState = trainerStateMaker->instance( cl, getWeightsSize() ); - if( dim.biased ) { - this->biasTrainerState = trainerStateMaker->instance( cl, getBiasSize() ); + this->trainerState = trainerStateMaker->instance(cl, getWeightsSize()); + if(dim.biased) { + this->biasTrainerState = trainerStateMaker->instance(cl, getBiasSize()); } } diff --git a/src/conv/ConvolutionalLayer.h b/src/conv/ConvolutionalLayer.h index 8c71ef92..d933abd9 100644 --- a/src/conv/ConvolutionalLayer.h +++ b/src/conv/ConvolutionalLayer.h @@ -72,38 +72,38 @@ class ConvolutionalLayer : public Layer { GpuAdd *gpuAdd; CopyBuffer *copyBuffer; - inline int getWeightIndex( int filterId, int inputPlane, int filterRow, int filterCol ) const { - return ( ( filterId - * dim.inputPlanes + inputPlane ) - * dim.filterSize + filterRow ) + inline int getWeightIndex(int filterId, int inputPlane, int filterRow, int filterCol) const { + return (( filterId + * dim.inputPlanes + inputPlane) + * dim.filterSize + filterRow) * dim.filterSize + filterCol; } - inline float getWeight( int filterId, int inputPlane, int filterRow, int filterCol ) const { + inline float getWeight(int filterId, int inputPlane, int filterRow, int filterCol) const { // getWeights(); - 
return weights[ getWeightIndex( filterId, inputPlane, filterRow, filterCol ) ]; + return weights[ getWeightIndex(filterId, inputPlane, filterRow, filterCol) ]; } - inline int getOutputIndex( int n, int outPlane, int outRow, int outCol ) const { - return ( ( n - * dim.numFilters + outPlane ) - * dim.outputImageSize + outRow ) - * dim.outputImageSize + outCol; + inline int getOutputIndex(int n, int outPlane, int outRow, int outCol) const { + return (( n + * dim.numFilters + outPlane) + * dim.outputSize + outRow) + * dim.outputSize + outCol; } - inline float getOutput( int n, int outPlane, int outRow, int outCol ) const { - return output[ getOutputIndex(n,outPlane, outRow, outCol ) ]; + inline float getOutput(int n, int outPlane, int outRow, int outCol) const { + return output[ getOutputIndex(n,outPlane, outRow, outCol) ]; } -// ConvolutionalLayer( Layer *previousLayer, ConvolutionalMaker const*maker ); +// ConvolutionalLayer(Layer *previousLayer, ConvolutionalMaker const*maker); // images are organized like [imageId][plane][imagerow][imagecol] // filters are organized like [filterid][plane][filterrow][filtercol] // output are organized like [imageid][filterid][imagerow][imagecol] -// inline int getWeightIndex( int outPlane, int inPlane, int filterrow, int filtercol ) const { -// return ( ( outPlane * upstreamNumPlanes -// + inPlane ) * filterSize -// + filterrow ) * filterSize +// inline int getWeightIndex(int outPlane, int inPlane, int filterrow, int filtercol) const { +// return (( outPlane * upstreamNumPlanes +// + inPlane) * filterSize +// + filterrow) * filterSize // + filtercol; // } -// inline float getWeight( int outPlane, int inPlane, int filterrow, int filtercol ) const { -// return weights[getWeightIndex( outPlane, inPlane, filterrow, filtercol ) ]; +// inline float getWeight(int outPlane, int inPlane, int filterrow, int filtercol) const { +// return weights[getWeightIndex(outPlane, inPlane, filterrow, filtercol) ]; // } // [[[cog @@ -111,7 +111,7 @@ class ConvolutionalLayer : public Layer { // cog_addheaders.add() // ]]] // generated, using cog: - ConvolutionalLayer( EasyCL *cl, Layer *previousLayer, ConvolutionalMaker *maker ); + ConvolutionalLayer(EasyCL *cl, Layer *previousLayer, ConvolutionalMaker *maker); VIRTUAL ~ConvolutionalLayer(); VIRTUAL std::string getClassName() const; VIRTUAL float *getGradInput(); @@ -126,21 +126,21 @@ class ConvolutionalLayer : public Layer { VIRTUAL bool hasOutputWrapper() const; VIRTUAL CLWrapper *getOutputWrapper(); VIRTUAL bool needsBackProp(); - VIRTUAL int getOutputSize() const; + VIRTUAL int getOutputNumElements() const; VIRTUAL int getOutputPlanes() const; - VIRTUAL int getOutputImageSize() const; - void randomizeWeights( WeightsInitializer *weightsInitializer ); + VIRTUAL int getOutputSize() const; + void randomizeWeights(WeightsInitializer *weightsInitializer); VIRTUAL void print(); VIRTUAL void printWeights(); VIRTUAL void printOutput(); - VIRTUAL void setBatchSize( int batchSize ); - VIRTUAL void setWeights( float *weights, float *bias ); + VIRTUAL void setBatchSize(int batchSize); + VIRTUAL void setWeights(float *weights, float *bias); VIRTUAL int getOutputCubeSize() const; - VIRTUAL int getPersistSize( int version ) const; - VIRTUAL void persistToArray( int version, float *array ); - VIRTUAL void unpersistFromArray( int version, float const*array ); - VIRTUAL void initWeights( float const*weights ); - VIRTUAL void initBias( float const*bias ); + VIRTUAL int getPersistSize(int version) const; + VIRTUAL void persistToArray(int version, 
float *array); + VIRTUAL void unpersistFromArray(int version, float const*array); + VIRTUAL void initWeights(float const*weights); + VIRTUAL void initBias(float const*bias); VIRTUAL int getWeightsSize() const; VIRTUAL int getBiasSize() const; VIRTUAL float const *getWeights() const; @@ -155,7 +155,7 @@ class ConvolutionalLayer : public Layer { VIRTUAL bool biased(); VIRTUAL TrainerState *getTrainerState(); VIRTUAL TrainerState *getBiasTrainerState(); - VIRTUAL void setTrainerState( TrainerStateMaker *trainerStateMaker ); + VIRTUAL void setTrainerState(TrainerStateMaker *trainerStateMaker); // [[[end]]] }; diff --git a/src/conv/ConvolutionalMaker.cpp b/src/conv/ConvolutionalMaker.cpp index 91bb0f93..7aa003c1 100644 --- a/src/conv/ConvolutionalMaker.cpp +++ b/src/conv/ConvolutionalMaker.cpp @@ -10,14 +10,14 @@ using namespace std; -Layer *ConvolutionalMaker::createLayer( Layer *previousLayer ) { - if( _numFilters == 0 ) { +Layer *ConvolutionalMaker::createLayer(Layer *previousLayer) { + if(_numFilters == 0) { throw runtime_error("Must provide ->numFilters(numFilters)"); } - if( _filterSize == 0 ) { + if(_filterSize == 0) { throw runtime_error("Must provide ->filterSize(filterSize)"); } - Layer *layer = new ConvolutionalLayer( cl, previousLayer, this ); + Layer *layer = new ConvolutionalLayer(cl, previousLayer, this); return layer; } diff --git a/src/conv/ConvolutionalMaker.h b/src/conv/ConvolutionalMaker.h index 7fb54626..5b74c986 100644 --- a/src/conv/ConvolutionalMaker.h +++ b/src/conv/ConvolutionalMaker.h @@ -50,7 +50,7 @@ class DeepCL_EXPORT ConvolutionalMaker : public LayerMaker2 { this->_padZeros = true; return this; } - PUBLICAPI ConvolutionalMaker *padZeros( bool value ) { + PUBLICAPI ConvolutionalMaker *padZeros(bool value) { this->_padZeros = value; return this; } @@ -64,9 +64,9 @@ class DeepCL_EXPORT ConvolutionalMaker : public LayerMaker2 { } virtual ConvolutionalMaker *clone() const { ConvolutionalMaker *thisClone = new ConvolutionalMaker(); - memcpy( thisClone, this, sizeof( ConvolutionalMaker ) ); // this will copy the activationfunction pointer too + memcpy(thisClone, this, sizeof(ConvolutionalMaker) ); // this will copy the activationfunction pointer too return thisClone; } - virtual Layer *createLayer( Layer *previousLayer ); + virtual Layer *createLayer(Layer *previousLayer); }; diff --git a/src/conv/Forward.cpp b/src/conv/Forward.cpp index 1b8ed71a..0553a06d 100644 --- a/src/conv/Forward.cpp +++ b/src/conv/Forward.cpp @@ -16,6 +16,7 @@ #include "conv/Forward4.h" #include "conv/ForwardFc.h" #include "conv/ForwardByInputPlane.h" +#include "conv/ForwardIm2Col.h" #include "conv/ForwardAuto.h" #include "util/StatefulTimer.h" @@ -27,135 +28,137 @@ using namespace std; #undef VIRTUAL #define VIRTUAL -Forward::Forward( EasyCL *cl, LayerDimensions layerDimensions ) : - cl( cl ), - dim( layerDimensions ) { +Forward::Forward(EasyCL *cl, LayerDimensions layerDimensions) : + cl(cl), + dim(layerDimensions) { } -STATIC Forward *Forward::instance(EasyCL *cl, LayerDimensions dim ) { - return new ForwardAuto( cl, dim ); -// return new ForwardByInputPlane( cl, dim ); +STATIC Forward *Forward::instance(EasyCL *cl, LayerDimensions dim) { + return new ForwardAuto(cl, dim); +// return new ForwardByInputPlane(cl, dim); -// if( dim.filterSize == dim.inputImageSize && dim.padZeros == false && dim.numFilters >= 64 -// && dim.filterSize >= 11 ) { -// return new ForwardFc( cl, dim ); +// if(dim.filterSize == dim.inputSize && dim.padZeros == false && dim.numFilters >= 64 +// && dim.filterSize >= 11) { 
+// return new ForwardFc(cl, dim); // } else { // } -// if( dim.filterSize == dim.inputImageSize && dim.padZeros == false && dim.numFilters >= 64 -// && dim.filterSize >= 11 ) { -// return new ForwardFc( cl, dim ); -// } else if( square( dim.outputImageSize ) < 32 || square( dim.outputImageSize ) > cl->getMaxWorkgroupSize() ) { -// return new Forward1( cl, dim ); +// if(dim.filterSize == dim.inputSize && dim.padZeros == false && dim.numFilters >= 64 +// && dim.filterSize >= 11) { +// return new ForwardFc(cl, dim); +// } else if(square(dim.outputSize) < 32 || square(dim.outputSize) > cl->getMaxWorkgroupSize()) { +// return new Forward1(cl, dim); // } else { -// return new Forward3( cl, dim ); +// return new Forward3(cl, dim); // } } -STATIC Forward *Forward::instanceTest(EasyCL *cl, LayerDimensions layerDimensions ) { - return new Forward2( cl, layerDimensions ); +STATIC Forward *Forward::instanceTest(EasyCL *cl, LayerDimensions layerDimensions) { + return new Forward2(cl, layerDimensions); } STATIC int Forward::getNumImplementations() { - return 7; + return 8; } -STATIC bool Forward::plausiblyOptimal( int index, int batchSize, LayerDimensions dim ) { - if( index == 0 ) { +STATIC bool Forward::plausiblyOptimal(int index, int batchSize, LayerDimensions dim) { + if(index == 0) { return false; } - if( index > 6 ) { + if(index > 7) { return false; } return true; } -STATIC Forward *Forward::instanceSpecific( int idx, EasyCL *cl, LayerDimensions layerDimensions ) { - if( idx == 0 ) { - return new ForwardCpu( cl, layerDimensions ); - } else if( idx == -1 ) { - return instance( cl, layerDimensions ); - } else if( idx == -2 ) { +STATIC Forward *Forward::instanceSpecific(int idx, EasyCL *cl, LayerDimensions layerDimensions) { + if(idx == 0) { + return new ForwardCpu(cl, layerDimensions); + } else if(idx == -1) { + return instance(cl, layerDimensions); + } else if(idx == -2) { cout << "Forward::instanceSpeicfic, choosing: ForwardAuto" << endl; - return new ForwardAuto( cl, layerDimensions ); - } else if( idx == 1 ) { - return new Forward1( cl, layerDimensions ); - } else if( idx == 2 ) { - return new Forward2( cl, layerDimensions ); - } else if( idx == 3 ) { - return new Forward3( cl, layerDimensions ); - } else if( idx == 4 ) { - return new Forward4( cl, layerDimensions ); - } else if( idx == 5 ) { - return new ForwardFc( cl, layerDimensions ); - } else if( idx == 6 ) { - return new ForwardByInputPlane( cl, layerDimensions ); + return new ForwardAuto(cl, layerDimensions); + } else if(idx == 1) { + return new Forward1(cl, layerDimensions); + } else if(idx == 2) { + return new Forward2(cl, layerDimensions); + } else if(idx == 3) { + return new Forward3(cl, layerDimensions); + } else if(idx == 4) { + return new Forward4(cl, layerDimensions); + } else if(idx == 5) { + return new ForwardFc(cl, layerDimensions); + } else if(idx == 6) { + return new ForwardByInputPlane(cl, layerDimensions); + } else if(idx == 7) { + return new ForwardIm2Col(cl, layerDimensions); } else { - throw runtime_error( string("") + __FILE__ + ":" + toString( __LINE__ ) + " Forward::instanceSpecific: no instance defined for index " + toString(idx) ); + throw runtime_error(string("") + __FILE__ + ":" + toString(__LINE__) + " Forward::instanceSpecific: no instance defined for index " + toString(idx)); } } -STATIC Forward *Forward::instanceSpecific( std::string name, EasyCL *cl, LayerDimensions layerDimensions ) { - if( name == "cpu" ) { - return new ForwardCpu( cl, layerDimensions ); - } else if( name == "prop1" ) { - return new 
Forward1( cl, layerDimensions ); - } else if( name == "prop3" ) { - return new Forward3( cl, layerDimensions ); - } else if( name == "prop4" ) { - return new Forward4( cl, layerDimensions ); - } else if( name == "fc" ) { - return new ForwardFc( cl, layerDimensions ); - } else if( name == "byinplane" ) { - return new ForwardByInputPlane( cl, layerDimensions ); +STATIC Forward *Forward::instanceSpecific(std::string name, EasyCL *cl, LayerDimensions layerDimensions) { + if(name == "cpu") { + return new ForwardCpu(cl, layerDimensions); + } else if(name == "prop1") { + return new Forward1(cl, layerDimensions); + } else if(name == "prop3") { + return new Forward3(cl, layerDimensions); + } else if(name == "prop4") { + return new Forward4(cl, layerDimensions); + } else if(name == "fc") { + return new ForwardFc(cl, layerDimensions); + } else if(name == "byinplane") { + return new ForwardByInputPlane(cl, layerDimensions); } else { - throw runtime_error( string("") + __FILE__ + ":" + toString( __LINE__ ) + " Forward::instanceSpecific: no instance defined for name " + name ); + throw runtime_error(string("") + __FILE__ + ":" + toString(__LINE__) + " Forward::instanceSpecific: no instance defined for name " + name); } } // you own the returned output array, and are responsible for deleting it -//VIRTUAL float * Forward::forward( int batchSize, float *inputData, float *filters, float *biases ) { +//VIRTUAL float * Forward::forward(int batchSize, float *inputData, float *filters, float *biases) { // float *output = new float[batchSize * dim.outputCubeSize]; -// forward( batchSize, inputData, filters, biases, output ); +// forward(batchSize, inputData, filters, biases, output); // return output; //} VIRTUAL int Forward::getOutputTotalSize(int batchSize) { return batchSize * dim.outputCubeSize; } // must allocate output yourself before the call -VIRTUAL void Forward::forward( int batchSize, float *inputData, float *filters, float *biases, float *output ) { +VIRTUAL void Forward::forward(int batchSize, float *inputData, float *filters, float *biases, float *output) { StatefulTimer::timeCheck("Forward::forward begin"); int inputDataSize = batchSize * dim.inputCubeSize; - CLWrapper *dataWrapper = cl->wrap( inputDataSize, inputData ); + CLWrapper *dataWrapper = cl->wrap(inputDataSize, inputData); dataWrapper->copyToDevice(); int weightsSize = dim.filtersSize; - CLWrapper *weightsWrapper = cl->wrap( weightsSize, filters ); + CLWrapper *weightsWrapper = cl->wrap(weightsSize, filters); weightsWrapper->copyToDevice(); CLWrapper *biasWrapper = 0; - if( dim.biased ) { + if(dim.biased) { int biasWrapperSize = dim.numFilters; - biasWrapper = cl->wrap( biasWrapperSize, biases ); + biasWrapper = cl->wrap(biasWrapperSize, biases); biasWrapper->copyToDevice(); } // int outputDataSize = batchSize * dim.outputCubeSize; // cout << " batchsize " << batchSize << " " << dim << endl; -// int allocatedOutputSize = std::max(5000, outputDataSize ); -// int allocatedOutputSize = outputDataSize; -// float *output = new float[allocatedOutputSize]; - CLWrapper *outputWrapper = cl->wrap( batchSize * dim.outputCubeSize, output ); +// int allocatedOutputNumElements = std::max(5000, outputDataSize); +// int allocatedOutputNumElements = outputDataSize; +// float *output = new float[allocatedOutputNumElements]; + CLWrapper *outputWrapper = cl->wrap(batchSize * dim.outputCubeSize, output); outputWrapper->createOnDevice(); cl->finish(); StatefulTimer::timeCheck("Forward::forward after copied to device"); - forward( batchSize, dataWrapper, 
weightsWrapper, biasWrapper, - outputWrapper ); + forward(batchSize, dataWrapper, weightsWrapper, biasWrapper, + outputWrapper); StatefulTimer::timeCheck("Forward::forward after call forward"); cl->finish(); outputWrapper->copyToHost(); StatefulTimer::timeCheck("Forward::forward after copytohost"); -// for( int i = 0; i < 20; i++ ) { +// for(int i = 0; i < 20; i++) { // cout << "output[" << i << "]=" << output[i] << endl; // } delete outputWrapper; delete dataWrapper; delete weightsWrapper; - if( dim.biased ) { + if(dim.biased) { delete biasWrapper; } diff --git a/src/conv/Forward.h b/src/conv/Forward.h index 9581c76c..9b8b3daf 100644 --- a/src/conv/Forward.h +++ b/src/conv/Forward.h @@ -17,7 +17,7 @@ using namespace std; -//inline float square( float value ) { +//inline float square(float value) { // return value * value; //} @@ -30,24 +30,24 @@ class DeepCL_EXPORT Forward { LayerDimensions dim; virtual ~Forward() {} - virtual void forward( int batchSize, + virtual void forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, - CLWrapper *outputWrapper ) = 0; + CLWrapper *outputWrapper) = 0; // [[[cog // import cog_addheaders // cog_addheaders.add() // ]]] // generated, using cog: - Forward( EasyCL *cl, LayerDimensions layerDimensions ); - STATIC Forward *instance(EasyCL *cl, LayerDimensions dim ); - STATIC Forward *instanceTest(EasyCL *cl, LayerDimensions layerDimensions ); + Forward(EasyCL *cl, LayerDimensions layerDimensions); + STATIC Forward *instance(EasyCL *cl, LayerDimensions dim); + STATIC Forward *instanceTest(EasyCL *cl, LayerDimensions layerDimensions); STATIC int getNumImplementations(); - STATIC bool plausiblyOptimal( int index, int batchSize, LayerDimensions dim ); - STATIC Forward *instanceSpecific( int idx, EasyCL *cl, LayerDimensions layerDimensions ); - STATIC Forward *instanceSpecific( std::string name, EasyCL *cl, LayerDimensions layerDimensions ); + STATIC bool plausiblyOptimal(int index, int batchSize, LayerDimensions dim); + STATIC Forward *instanceSpecific(int idx, EasyCL *cl, LayerDimensions layerDimensions); + STATIC Forward *instanceSpecific(std::string name, EasyCL *cl, LayerDimensions layerDimensions); VIRTUAL int getOutputTotalSize(int batchSize); - VIRTUAL void forward( int batchSize, float *inputData, float *filters, float *biases, float *output ); + VIRTUAL void forward(int batchSize, float *inputData, float *filters, float *biases, float *output); // [[[end]]] diff --git a/src/conv/Forward1.cpp b/src/conv/Forward1.cpp index 638c35be..9e4ec42e 100644 --- a/src/conv/Forward1.cpp +++ b/src/conv/Forward1.cpp @@ -20,42 +20,42 @@ VIRTUAL Forward1::~Forward1() { delete kernel; delete addBias; } -VIRTUAL void Forward1::forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, - CLWrapper *outputWrapper ) { +VIRTUAL void Forward1::forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, + CLWrapper *outputWrapper) { StatefulTimer::timeCheck("Forward1::forward START"); kernel->in(batchSize); - kernel->input( dataWrapper ); - kernel->input( weightsWrapper); - kernel->output( outputWrapper ); + kernel->input(dataWrapper); + kernel->input(weightsWrapper); + kernel->output(outputWrapper); int globalSize = batchSize * dim.outputCubeSize; - int workgroupsize = std::min( globalSize, cl->getMaxWorkgroupSize() ); - globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize; + int workgroupsize = std::min(globalSize, 
cl->getMaxWorkgroupSize()); + globalSize = (( globalSize + workgroupsize - 1) / workgroupsize) * workgroupsize; // cout << "forward1 globalsize " << globalSize << " workgroupsize " << workgroupsize << endl; - kernel->run_1d( globalSize, workgroupsize ); + kernel->run_1d(globalSize, workgroupsize); cl->finish(); StatefulTimer::timeCheck("Forward1::forward after call forward"); - if( dim.biased ) { + if(dim.biased) { addBias->forward( - batchSize, dim.numFilters, dim.outputImageSize, - outputWrapper, biasWrapper ); + batchSize, dim.numFilters, dim.outputSize, + outputWrapper, biasWrapper); } StatefulTimer::timeCheck("Forward1::forward END"); } -Forward1::Forward1( EasyCL *cl, LayerDimensions dim ) : - Forward( cl, dim ) +Forward1::Forward1(EasyCL *cl, LayerDimensions dim) : + Forward(cl, dim) { - addBias = new AddBias( cl ); + addBias = new AddBias(cl); std::string options = ""; options += dim.buildOptionsString(); // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/forward1.cl", "convolve_imagecubes_float2", 'options' ) + // stringify.write_kernel2("kernel", "cl/forward1.cl", "convolve_imagecubes_float2", 'options') // ]]] // generated using cog, from cl/forward1.cl: const char * kernelSource = @@ -124,45 +124,45 @@ Forward1::Forward1( EasyCL *cl, LayerDimensions dim ) : "void kernel convolve_imagecubes_float2(\n" " const int numExamples,\n" " global const float *inputs, global const float *filters,\n" - " global float *output ) {\n" + " global float *output) {\n" " int globalId = get_global_id(0);\n" "\n" - " int outputImage2Id = globalId / gOutputImageSizeSquared;\n" + " int outputImage2Id = globalId / gOutputSizeSquared;\n" " int exampleId = outputImage2Id / gNumFilters;\n" " int filterId = outputImage2Id % gNumFilters;\n" "\n" " // intraimage coords\n" - " int localid = globalId % gOutputImageSizeSquared;\n" - " int outputRow = localid / gOutputImageSize;\n" - " int outputCol = localid % gOutputImageSize;\n" + " int localid = globalId % gOutputSizeSquared;\n" + " int outputRow = localid / gOutputSize;\n" + " int outputCol = localid % gOutputSize;\n" "\n" - " global float const*inputCube = inputs + exampleId * gNumInputPlanes * gInputImageSizeSquared;\n" + " global float const*inputCube = inputs + exampleId * gNumInputPlanes * gInputSizeSquared;\n" " global float const*filterCube = filters + filterId * gNumInputPlanes * gFilterSizeSquared;\n" "\n" " float sum = 0;\n" - " if( exampleId < numExamples ) {\n" - " for( int inputPlaneIdx = 0; inputPlaneIdx < gNumInputPlanes; inputPlaneIdx++ ) {\n" - " global float const*inputPlane = inputCube + inputPlaneIdx * gInputImageSizeSquared;\n" + " if (exampleId < numExamples) {\n" + " for (int inputPlaneIdx = 0; inputPlaneIdx < gNumInputPlanes; inputPlaneIdx++) {\n" + " global float const*inputPlane = inputCube + inputPlaneIdx * gInputSizeSquared;\n" " global float const*filterPlane = filterCube + inputPlaneIdx * gFilterSizeSquared;\n" - " for( int u = -gHalfFilterSize; u <= gHalfFilterSize - gEven; u++ ) {\n" + " for (int u = -gHalfFilterSize; u <= gHalfFilterSize - gEven; u++) {\n" " // trying to reduce register pressure...\n" " #if gPadZeros == 1\n" - " #define inputRowIdx ( outputRow + u )\n" + " #define inputRowIdx (outputRow + u)\n" " #else\n" - " #define inputRowIdx ( outputRow + u + gHalfFilterSize )\n" + " #define inputRowIdx (outputRow + u + gHalfFilterSize)\n" " #endif\n" - " global float const *inputRow = inputPlane + inputRowIdx * gInputImageSize;\n" + " global float const *inputRow = inputPlane + inputRowIdx * 
gInputSize;\n" " global float const *filterRow = filterPlane + (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;\n" - " bool rowOk = inputRowIdx >= 0 && inputRowIdx < gInputImageSize;\n" + " bool rowOk = inputRowIdx >= 0 && inputRowIdx < gInputSize;\n" " #pragma unroll\n" - " for( int v = -gHalfFilterSize; v <= gHalfFilterSize - gEven; v++ ) {\n" + " for (int v = -gHalfFilterSize; v <= gHalfFilterSize - gEven; v++) {\n" " #if gPadZeros == 1\n" - " #define inputColIdx ( outputCol + v )\n" + " #define inputColIdx (outputCol + v)\n" " #else\n" - " #define inputColIdx ( outputCol + v + gHalfFilterSize )\n" + " #define inputColIdx (outputCol + v + gHalfFilterSize)\n" " #endif\n" - " bool process = rowOk && inputColIdx >= 0 && inputColIdx < gInputImageSize;\n" - " if( process ) {\n" + " bool process = rowOk && inputColIdx >= 0 && inputColIdx < gInputSize;\n" + " if (process) {\n" " sum += inputRow[inputColIdx] * filterRow[v];\n" " }\n" " }\n" @@ -170,7 +170,7 @@ Forward1::Forward1( EasyCL *cl, LayerDimensions dim ) : " }\n" " }\n" "\n" - " if( exampleId < numExamples ) {\n" + " if (exampleId < numExamples) {\n" " output[globalId] = sum;\n" " }\n" "}\n" diff --git a/src/conv/Forward1.h b/src/conv/Forward1.h index bee923fd..8c2dddf1 100644 --- a/src/conv/Forward1.h +++ b/src/conv/Forward1.h @@ -21,9 +21,9 @@ class Forward1 : public Forward { // ]]] // generated, using cog: VIRTUAL ~Forward1(); - VIRTUAL void forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, - CLWrapper *outputWrapper ); - Forward1( EasyCL *cl, LayerDimensions dim ); + VIRTUAL void forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, + CLWrapper *outputWrapper); + Forward1(EasyCL *cl, LayerDimensions dim); // [[[end]]] }; diff --git a/src/conv/Forward2.cpp b/src/conv/Forward2.cpp index 004503d9..c083efde 100644 --- a/src/conv/Forward2.cpp +++ b/src/conv/Forward2.cpp @@ -23,50 +23,50 @@ VIRTUAL Forward2::~Forward2() { delete addBias; } // only works for small filters -// condition: square( dim.filterSize ) * dim.inputPlanes * 4 < 5000 (about 5KB) -VIRTUAL void Forward2::forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, - CLWrapper *outputWrapper ) { +// condition: square(dim.filterSize) * dim.inputPlanes * 4 < 5000 (about 5KB) +VIRTUAL void Forward2::forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, + CLWrapper *outputWrapper) { StatefulTimer::timeCheck("Forward2::forward START"); kernel->in(batchSize); - kernel->input( dataWrapper ); - kernel->input( weightsWrapper); - kernel->output( outputWrapper ); -// cout << "square(outputImageSize) " << square( outputImageSize ) << endl; - kernel->localFloats( square( dim.inputImageSize ) ); - kernel->localFloats( square( dim.filterSize ) * dim.inputPlanes ); + kernel->input(dataWrapper); + kernel->input(weightsWrapper); + kernel->output(outputWrapper); +// cout << "square(outputSize) " << square(outputSize) << endl; + kernel->localFloats(square(dim.inputSize) ); + kernel->localFloats(square(dim.filterSize) * dim.inputPlanes); // cout << "forward2 globalsize " << globalSize << " workgroupsize " << workgroupsize << endl; - kernel->run_1d( globalSize, workgroupSize ); + kernel->run_1d(globalSize, workgroupSize); cl->finish(); StatefulTimer::timeCheck("Forward2::forward after call forward"); - if( dim.biased ) { + if(dim.biased) { addBias->forward( - batchSize, dim.numFilters, dim.outputImageSize, - 
outputWrapper, biasWrapper ); + batchSize, dim.numFilters, dim.outputSize, + outputWrapper, biasWrapper); } StatefulTimer::timeCheck("Forward2::forward END"); } -Forward2::Forward2( EasyCL *cl, LayerDimensions dim ) : - Forward( cl, dim ) +Forward2::Forward2(EasyCL *cl, LayerDimensions dim) : + Forward(cl, dim) { - if( square( dim.outputImageSize ) > cl->getMaxWorkgroupSize() ) { + if(square(dim.outputSize) > cl->getMaxWorkgroupSize()) { throw runtime_error("cannot use forward2, since outputimagesize * outputimagesize > maxworkgroupsize"); } - addBias = new AddBias( cl ); + addBias = new AddBias(cl); - this->workgroupSize = square( dim.outputImageSize ); + this->workgroupSize = square(dim.outputSize); // round up to nearest 32, so dont waste threads: - this->workgroupSize = ( ( workgroupSize + 32 - 1 ) / 32 ) * 32; + this->workgroupSize = (( workgroupSize + 32 - 1) / 32) * 32; this->numWorkgroups = dim.numFilters; this->globalSize = this->workgroupSize * this->numWorkgroups; std::string options = ""; // "-D " + fn->getDefineName(); options += dim.buildOptionsString(); - options += " -DgWorkgroupSize=" + toString( this->workgroupSize ); + options += " -DgWorkgroupSize=" + toString(this->workgroupSize); // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/forward2.cl", "forward_2_by_outplane", 'options' ) + // stringify.write_kernel2("kernel", "cl/forward2.cl", "forward_2_by_outplane", 'options') // ]]] // generated using cog, from cl/forward2.cl: const char * kernelSource = @@ -76,17 +76,17 @@ Forward2::Forward2( EasyCL *cl, LayerDimensions dim ) : "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "void copyLocal( local float *target, global float const *source, const int N ) {\n" - " int numLoops = ( N + gWorkgroupSize - 1 ) / gWorkgroupSize;\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyLocal(local float *target, global float const *source, const int N) {\n" + " int numLoops = (N + gWorkgroupSize - 1) / gWorkgroupSize;\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " int offset = loop * gWorkgroupSize + get_local_id(0);\n" - " if( offset < N ) {\n" + " if (offset < N) {\n" " target[offset] = source[offset];\n" " }\n" " }\n" "}\n" "\n" - "#ifdef gOutputImageSize // for previous tests that dont define it\n" + "#ifdef gOutputSize // for previous tests that dont define it\n" "// workgroup id organized like: [outplane]\n" "// local id organized like: [outrow][outcol]\n" "// each thread iterates over: [imageid][upstreamplane][filterrow][filtercol]\n" @@ -102,7 +102,7 @@ Forward2::Forward2( EasyCL *cl, LayerDimensions dim ) : " const int batchSize,\n" " global const float *images, global const float *filters,\n" " global float *output,\n" - " local float *_inputPlane, local float *_filterCube ) {\n" + " local float *_inputPlane, local float *_filterCube) {\n" " const int globalId = get_global_id(0);\n" "\n" " const int workgroupId = get_group_id(0);\n" @@ -110,14 +110,14 @@ Forward2::Forward2( EasyCL *cl, LayerDimensions dim ) : " const int outPlane = workgroupId;\n" "\n" " const int localId = get_local_id(0);\n" - " const int outputRow = localId / gOutputImageSize;\n" - " const int outputCol = localId % gOutputImageSize;\n" + " const int outputRow = localId / gOutputSize;\n" + " const int outputCol = localId % gOutputSize;\n" "\n" " #if gPadZeros == 1\n" - " const int minu = max( -gHalfFilterSize, -outputRow );\n" - " const int maxu = min( gHalfFilterSize, 
gOutputImageSize - 1 - outputRow ) - gEven;\n" - " const int minv = max( -gHalfFilterSize, -outputCol );\n" - " const int maxv = min( gHalfFilterSize, gOutputImageSize - 1 - outputCol ) - gEven;\n" + " const int minu = max(-gHalfFilterSize, -outputRow);\n" + " const int maxu = min(gHalfFilterSize, gOutputSize - 1 - outputRow) - gEven;\n" + " const int minv = max(-gHalfFilterSize, -outputCol);\n" + " const int maxv = min(gHalfFilterSize, gOutputSize - 1 - outputCol) - gEven;\n" " #else\n" " const int minu = -gHalfFilterSize;\n" " const int maxu = gHalfFilterSize - gEven;\n" @@ -127,30 +127,30 @@ Forward2::Forward2( EasyCL *cl, LayerDimensions dim ) : "\n" " {\n" " const int filterCubeLength = gInputPlanes * gFilterSizeSquared;\n" - " copyLocal( _filterCube,\n" + " copyLocal(_filterCube,\n" " filters + outPlane * filterCubeLength,\n" - " filterCubeLength );\n" + " filterCubeLength);\n" " }\n" " // dont need a barrier, since we'll just run behind the barrier from the upstream image download\n" "\n" - " for( int n = 0; n < batchSize; n++ ) {\n" + " for (int n = 0; n < batchSize; n++) {\n" " float sum = 0;\n" - " for( int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++ ) {\n" + " for (int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++) {\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - " copyLocal( _inputPlane,\n" - " images + ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared,\n" - " gInputImageSizeSquared );\n" + " copyLocal(_inputPlane,\n" + " images + (n * gInputPlanes + upstreamPlane) * gInputSizeSquared,\n" + " gInputSizeSquared);\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " int filterImageOffset = upstreamPlane * gFilterSizeSquared;\n" - " if( localId < gOutputImageSizeSquared ) {\n" - " for( int u = minu; u <= maxu; u++ ) {\n" + " if (localId < gOutputSizeSquared) {\n" + " for (int u = minu; u <= maxu; u++) {\n" " int inputRow = outputRow + u;\n" " #if gPadZeros == 0\n" " inputRow += gHalfFilterSize;\n" " #endif\n" - " int inputimagerowoffset = inputRow * gInputImageSize;\n" + " int inputimagerowoffset = inputRow * gInputSize;\n" " int filterrowoffset = filterImageOffset + (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;\n" - " for( int v = minv; v <= maxv; v++ ) {\n" + " for (int v = minv; v <= maxv; v++) {\n" " int inputCol = outputCol + v;\n" " #if gPadZeros == 0\n" " inputCol += gHalfFilterSize;\n" @@ -161,8 +161,8 @@ Forward2::Forward2( EasyCL *cl, LayerDimensions dim ) : " }\n" " }\n" " // output are organized like [imageid][filterid][row][col]\n" - " int resultIndex = ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared + localId;\n" - " if( localId < gOutputImageSizeSquared ) {\n" + " int resultIndex = (n * gNumFilters + outPlane) * gOutputSizeSquared + localId;\n" + " if (localId < gOutputSizeSquared) {\n" " output[resultIndex ] = sum;\n" " }\n" " }\n" diff --git a/src/conv/Forward2.h b/src/conv/Forward2.h index e33016ac..014e3cf4 100644 --- a/src/conv/Forward2.h +++ b/src/conv/Forward2.h @@ -24,9 +24,9 @@ class Forward2 : public Forward { // ]]] // generated, using cog: VIRTUAL ~Forward2(); - VIRTUAL void forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, - CLWrapper *outputWrapper ); - Forward2( EasyCL *cl, LayerDimensions dim ); + VIRTUAL void forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, + CLWrapper *outputWrapper); + Forward2(EasyCL *cl, LayerDimensions dim); // [[[end]]] diff --git a/src/conv/Forward3.cpp b/src/conv/Forward3.cpp index 
711cbe9f..3d01f422 100644 --- a/src/conv/Forward3.cpp +++ b/src/conv/Forward3.cpp @@ -22,39 +22,39 @@ VIRTUAL Forward3::~Forward3() { delete kernel; delete addBias; } -VIRTUAL void Forward3::forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, - CLWrapper *outputWrapper ) { +VIRTUAL void Forward3::forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, + CLWrapper *outputWrapper) { StatefulTimer::timeCheck("Forward3::forward begin"); // const int maxWorkgroupSize = cl->getMaxWorkgroupSize(); // int maxglobalId = 0; kernel->in(batchSize); - kernel->input( dataWrapper ); - kernel->input( weightsWrapper); - kernel->output( outputWrapper ); - kernel->localFloats( square( dim.inputImageSize ) ); - kernel->localFloats( square( dim.filterSize ) * dim.inputPlanes ); + kernel->input(dataWrapper); + kernel->input(weightsWrapper); + kernel->output(outputWrapper); + kernel->localFloats(square(dim.inputSize) ); + kernel->localFloats(square(dim.filterSize) * dim.inputPlanes); - int workgroupsize = std::max( 32, square( dim.outputImageSize ) ); // no point in wasting threads.... + int workgroupsize = std::max(32, square(dim.outputSize) ); // no point in wasting threads.... int numWorkgroups = dim.numFilters * batchSize; int globalSize = workgroupsize * numWorkgroups; - kernel->run_1d( globalSize, workgroupsize ); + kernel->run_1d(globalSize, workgroupsize); cl->finish(); StatefulTimer::timeCheck("Forward3::forward after kernel1"); - if( dim.biased ) { - addBias->forward( batchSize, dim.numFilters, dim.outputImageSize, - outputWrapper, biasWrapper ); + if(dim.biased) { + addBias->forward(batchSize, dim.numFilters, dim.outputSize, + outputWrapper, biasWrapper); } } -Forward3::Forward3( EasyCL *cl, LayerDimensions dim ) : - Forward( cl, dim ) +Forward3::Forward3(EasyCL *cl, LayerDimensions dim) : + Forward(cl, dim) { - addBias = new AddBias( cl ); + addBias = new AddBias(cl); - if( square( dim.outputImageSize ) > cl->getMaxWorkgroupSize() ) { + if(square(dim.outputSize) > cl->getMaxWorkgroupSize()) { throw runtime_error("cannot use forward3, since outputimagesize * outputimagesize > maxworkgroupsize"); } @@ -63,8 +63,8 @@ Forward3::Forward3( EasyCL *cl, LayerDimensions dim ) : // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/forward3.cl", "forward_3_by_n_outplane", 'options' ) - // # stringify.write_kernel2( "repeatedAdd", "cl/per_element_add.cl", "repeated_add", 'options' ) + // stringify.write_kernel2("kernel", "cl/forward3.cl", "forward_3_by_n_outplane", 'options') + // # stringify.write_kernel2("repeatedAdd", "cl/per_element_add.cl", "repeated_add", 'options') // ]]] // generated using cog, from cl/forward3.cl: const char * kernelSource = @@ -85,10 +85,10 @@ Forward3::Forward3( EasyCL *cl, LayerDimensions dim ) : "// one filter cube (corresponding to one outplane) = 5*5 * 32 * 4 = 3.2KB (ok)\n" "// all filter cubes = 3.2KB * 32 = 102KB (too big)\n" "// output are organized like [imageid][filterid][row][col]\n" - "void kernel forward_3_by_n_outplane( const int batchSize,\n" + "void kernel forward_3_by_n_outplane(const int batchSize,\n" " global const float *images, global const float *filters,\n" " global float *output,\n" - " local float *_upstreamImage, local float *_filterCube ) {\n" + " local float *_upstreamImage, local float *_filterCube) {\n" " const int globalId = get_global_id(0);\n" "\n" " const int workgroupId = get_group_id(0);\n" @@ -97,52 +97,52 @@ Forward3::Forward3( EasyCL *cl, 
LayerDimensions dim ) : " const int outPlane = workgroupId % gNumFilters;\n" "\n" " const int localId = get_local_id(0);\n" - " const int outputRow = localId / gOutputImageSize;\n" - " const int outputCol = localId % gOutputImageSize;\n" + " const int outputRow = localId / gOutputSize;\n" + " const int outputCol = localId % gOutputSize;\n" "\n" - " const int minu = gPadZeros ? max( -gHalfFilterSize, -outputRow ) : -gHalfFilterSize;\n" - " const int maxu = gPadZeros ? min( gHalfFilterSize - gEven, gOutputImageSize - 1 - outputRow - gEven) : gHalfFilterSize - gEven;\n" - " const int minv = gPadZeros ? max( -gHalfFilterSize, -outputCol ) : - gHalfFilterSize;\n" - " const int maxv = gPadZeros ? min( gHalfFilterSize - gEven, gOutputImageSize - 1 - outputCol - gEven) : gHalfFilterSize - gEven;\n" + " const int minu = gPadZeros ? max(-gHalfFilterSize, -outputRow) : -gHalfFilterSize;\n" + " const int maxu = gPadZeros ? min(gHalfFilterSize - gEven, gOutputSize - 1 - outputRow - gEven) : gHalfFilterSize - gEven;\n" + " const int minv = gPadZeros ? max(-gHalfFilterSize, -outputCol) : - gHalfFilterSize;\n" + " const int maxv = gPadZeros ? min(gHalfFilterSize - gEven, gOutputSize - 1 - outputCol - gEven) : gHalfFilterSize - gEven;\n" "\n" - " const int numUpstreamsPerThread = ( gInputImageSizeSquared + workgroupSize - 1 ) / workgroupSize;\n" + " const int numUpstreamsPerThread = (gInputSizeSquared + workgroupSize - 1) / workgroupSize;\n" "\n" " const int filterCubeLength = gInputPlanes * gFilterSizeSquared;\n" " const int filterCubeGlobalOffset = outPlane * filterCubeLength;\n" - " const int numPixelsPerThread = ( filterCubeLength + workgroupSize - 1 ) / workgroupSize;\n" - " for( int i = 0; i < numPixelsPerThread; i++ ) {\n" + " const int numPixelsPerThread = (filterCubeLength + workgroupSize - 1) / workgroupSize;\n" + " for (int i = 0; i < numPixelsPerThread; i++) {\n" " int thisOffset = localId + i * workgroupSize;\n" - " if( thisOffset < filterCubeLength ) {\n" + " if (thisOffset < filterCubeLength) {\n" " _filterCube[thisOffset] = filters[filterCubeGlobalOffset + thisOffset];\n" " }\n" " }\n" " // dont need a barrier, since we'll just run behind the barrier from the upstream image download\n" "\n" " float sum = 0;\n" - " for( int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++ ) {\n" - " int thisUpstreamImageOffset = ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared;\n" + " for (int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++) {\n" + " int thisUpstreamImageOffset = (n * gInputPlanes + upstreamPlane) * gInputSizeSquared;\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - " for( int i = 0; i < numUpstreamsPerThread; i++ ) {\n" + " for (int i = 0; i < numUpstreamsPerThread; i++) {\n" " int thisOffset = workgroupSize * i + localId;\n" - " if( thisOffset < gInputImageSizeSquared ) {\n" + " if (thisOffset < gInputSizeSquared) {\n" " _upstreamImage[ thisOffset ] = images[ thisUpstreamImageOffset + thisOffset ];\n" " }\n" " }\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " int filterImageOffset = upstreamPlane * gFilterSizeSquared;\n" - " for( int u = minu; u <= maxu; u++ ) {\n" + " for (int u = minu; u <= maxu; u++) {\n" " int inputRow = outputRow + u;\n" " #if gPadZeros == 0\n" " inputRow += gHalfFilterSize;\n" " #endif\n" - " int inputimagerowoffset = inputRow * gInputImageSize;\n" + " int inputimagerowoffset = inputRow * gInputSize;\n" " int filterrowoffset = filterImageOffset + (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;\n" - " for( int v = minv; v <= maxv; v++ 
) {\n" + " for (int v = minv; v <= maxv; v++) {\n" " int inputCol = outputCol + v;\n" " #if gPadZeros == 0\n" " inputCol += gHalfFilterSize;\n" " #endif\n" - " if( localId < gOutputImageSizeSquared ) {\n" + " if (localId < gOutputSizeSquared) {\n" " sum += _upstreamImage[ inputimagerowoffset + inputCol] * _filterCube[ filterrowoffset + v ];\n" " }\n" " }\n" @@ -150,8 +150,8 @@ Forward3::Forward3( EasyCL *cl, LayerDimensions dim ) : " }\n" "\n" " // output are organized like [imageid][filterid][row][col]\n" - " int resultIndex = ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared + localId;\n" - " if( localId < gOutputImageSizeSquared ) {\n" + " int resultIndex = (n * gNumFilters + outPlane) * gOutputSizeSquared + localId;\n" + " if (localId < gOutputSizeSquared) {\n" " output[resultIndex ] = sum;\n" " }\n" "}\n" diff --git a/src/conv/Forward3.h b/src/conv/Forward3.h index 6ec3aa8c..4ac00d24 100644 --- a/src/conv/Forward3.h +++ b/src/conv/Forward3.h @@ -15,9 +15,9 @@ class Forward3 : public Forward { // ]]] // generated, using cog: VIRTUAL ~Forward3(); - VIRTUAL void forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, - CLWrapper *outputWrapper ); - Forward3( EasyCL *cl, LayerDimensions dim ); + VIRTUAL void forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, + CLWrapper *outputWrapper); + Forward3(EasyCL *cl, LayerDimensions dim); // [[[end]]] }; diff --git a/src/conv/Forward4.cpp b/src/conv/Forward4.cpp index 56380928..6e424b48 100644 --- a/src/conv/Forward4.cpp +++ b/src/conv/Forward4.cpp @@ -20,36 +20,36 @@ VIRTUAL Forward4::~Forward4() { delete kernel; delete addBias; } -VIRTUAL void Forward4::forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, - CLWrapper *outputWrapper ) { +VIRTUAL void Forward4::forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, + CLWrapper *outputWrapper) { StatefulTimer::timeCheck("Forward4::forward start"); int numWorkgroups = dim.numFilters * batchSize * pixelsPerThread; int globalSize = workgroupSize * numWorkgroups; kernel->in(batchSize); - kernel->input( dataWrapper ); - kernel->input( weightsWrapper); - kernel->output( outputWrapper ); - kernel->localFloats( square( dim.inputImageSize ) ); - kernel->localFloats( square( dim.filterSize ) ); + kernel->input(dataWrapper); + kernel->input(weightsWrapper); + kernel->output(outputWrapper); + kernel->localFloats(square(dim.inputSize) ); + kernel->localFloats(square(dim.filterSize) ); - kernel->run_1d( globalSize, workgroupSize ); + kernel->run_1d(globalSize, workgroupSize); cl->finish(); StatefulTimer::timeCheck("Forward4::forward after call forward"); - if( dim.biased ) { + if(dim.biased) { addBias->forward( - batchSize, dim.numFilters, dim.outputImageSize, - outputWrapper, biasWrapper ); + batchSize, dim.numFilters, dim.outputSize, + outputWrapper, biasWrapper); } } -Forward4::Forward4( EasyCL *cl, LayerDimensions dim ) : - Forward( cl, dim ) +Forward4::Forward4(EasyCL *cl, LayerDimensions dim) : + Forward(cl, dim) { - addBias = new AddBias( cl ); + addBias = new AddBias(cl); - workgroupSize = std::max( 32, square( dim.outputImageSize ) ); // no point in wasting threads.... + workgroupSize = std::max(32, square(dim.outputSize) ); // no point in wasting threads.... 
const int maxWorkgroupSize = cl->getMaxWorkgroupSize(); // see comments in forward4.cl, // if the outputimagesize * outputimagesize > maxWorkgroupSize, @@ -59,20 +59,20 @@ Forward4::Forward4( EasyCL *cl, LayerDimensions dim ) : // here, we calculate how many workgroups we will need, in powers // of two: pixelsPerThread = 1; - while( workgroupSize > maxWorkgroupSize ) { - workgroupSize = (workgroupSize + 1 ) >> 1; + while(workgroupSize > maxWorkgroupSize) { + workgroupSize = (workgroupSize + 1) >> 1; pixelsPerThread <<= 1; } //cout << "workgroupSize=" << workgroupSize << " pixelsPerThread=" << pixelsPerThread << endl; std::string options = ""; // "-D " + fn->getDefineName(); - options += " -D gWorkgroupSize=" + toString( workgroupSize ); - options += " -D gPixelsPerThread=" + toString( pixelsPerThread ); + options += " -D gWorkgroupSize=" + toString(workgroupSize); + options += " -D gPixelsPerThread=" + toString(pixelsPerThread); // cout << "options " << options << endl; options += dim.buildOptionsString(); // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/forward4.cl", "forward_4_by_n_outplane_smallercache", 'options' ) + // stringify.write_kernel2("kernel", "cl/forward4.cl", "forward_4_by_n_outplane_smallercache", 'options') // ]]] // generated using cog, from cl/forward4.cl: const char * kernelSource = @@ -82,17 +82,17 @@ Forward4::Forward4( EasyCL *cl, LayerDimensions dim ) : "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "void copyLocal( local float *target, global float const *source, int N ) {\n" - " int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyLocal(local float *target, global float const *source, int N) {\n" + " int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " int offset = loop * get_local_size(0) + get_local_id(0);\n" - " if( offset < N ) {\n" + " if (offset < N) {\n" " target[offset] = source[offset];\n" " }\n" " }\n" "}\n" "\n" - "#ifdef gOutputImageSize // for previous tests that dont define it\n" + "#ifdef gOutputSize // for previous tests that dont define it\n" "// workgroup id organized like: [n][filterid]\n" "// local id organized like: [outrow][outcol]\n" "// each thread iterates over: [upstreamplane][filterrow][filtercol]\n" @@ -121,21 +121,21 @@ Forward4::Forward4( EasyCL *cl, LayerDimensions dim ) : "// basically, it's a hack, so larger images actually run, without\n" "// crashing, and we can probably improve it a lot :-)\n" "//\n" - "// So, when outputImageSize * outputImageSize > workgroupSize, then\n" + "// So, when outputSize * outputSize > workgroupSize, then\n" "// multiple workgroups will be created for each output plane\n" "// the number of such workgroups is given by: `gPixelsPerThread`\n" "// the id of our workgroup within such a set of workgroups is calculated\n" "// as `pixel`\n" "// effectiveLocalId is our local id if we had one enormous workgroup\n" "// containing the whole output image plane\n" - "void kernel forward_4_by_n_outplane_smallercache( const int batchSize,\n" + "void kernel forward_4_by_n_outplane_smallercache(const int batchSize,\n" " global const float *images, global const float *filters,\n" " global float *output,\n" - " local float *_inputPlane, local float *_filterPlane ) {\n" - " #define globalId ( get_global_id(0) )\n" + " local float *_inputPlane, local float 
*_filterPlane) {\n" + " #define globalId (get_global_id(0))\n" "\n" - " #define localId ( get_local_id(0) )\n" - " #define workgroupId ( get_group_id(0) )\n" + " #define localId (get_local_id(0))\n" + " #define workgroupId (get_group_id(0))\n" "// const int workgroupSize = get_local_size(0);\n" " const int effectiveWorkgroupId = workgroupId / gPixelsPerThread;\n" " const int pixel = workgroupId % gPixelsPerThread;\n" @@ -143,35 +143,35 @@ Forward4::Forward4( EasyCL *cl, LayerDimensions dim ) : " const int n = effectiveWorkgroupId / gNumFilters;\n" " const int outPlane = effectiveWorkgroupId % gNumFilters;\n" "\n" - " const int outputRow = effectiveLocalId / gOutputImageSize;\n" - " const int outputCol = effectiveLocalId % gOutputImageSize;\n" + " const int outputRow = effectiveLocalId / gOutputSize;\n" + " const int outputCol = effectiveLocalId % gOutputSize;\n" "\n" " float sum = 0;\n" - " for( int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++ ) {\n" + " for (int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++) {\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - " copyLocal( _inputPlane, images + ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared, gInputImageSizeSquared );\n" - " copyLocal( _filterPlane, filters + ( outPlane * gInputPlanes + upstreamPlane ) * gFilterSizeSquared, gFilterSizeSquared );\n" + " copyLocal(_inputPlane, images + (n * gInputPlanes + upstreamPlane) * gInputSizeSquared, gInputSizeSquared);\n" + " copyLocal(_filterPlane, filters + (outPlane * gInputPlanes + upstreamPlane) * gFilterSizeSquared, gFilterSizeSquared);\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" "\n" - " if( effectiveLocalId < gOutputImageSizeSquared ) {\n" - " for( int u = -gHalfFilterSize; u <= gHalfFilterSize - gEven; u++ ) {\n" + " if (effectiveLocalId < gOutputSizeSquared) {\n" + " for (int u = -gHalfFilterSize; u <= gHalfFilterSize - gEven; u++) {\n" " // trying to reduce register pressure...\n" " #if gPadZeros == 1\n" - " #define inputRow ( outputRow + u )\n" + " #define inputRow (outputRow + u)\n" " #else\n" - " #define inputRow ( outputRow + u + gHalfFilterSize )\n" + " #define inputRow (outputRow + u + gHalfFilterSize)\n" " #endif\n" - " int inputimagerowoffset = inputRow * gInputImageSize;\n" + " int inputimagerowoffset = inputRow * gInputSize;\n" " int filterrowoffset = (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;\n" - " bool rowOk = inputRow >= 0 && inputRow < gInputImageSize;\n" - " for( int v = -gHalfFilterSize; v <= gHalfFilterSize - gEven; v++ ) {\n" + " bool rowOk = inputRow >= 0 && inputRow < gInputSize;\n" + " for (int v = -gHalfFilterSize; v <= gHalfFilterSize - gEven; v++) {\n" " #if gPadZeros == 1\n" - " #define inputCol ( outputCol + v )\n" + " #define inputCol (outputCol + v)\n" " #else\n" - " #define inputCol ( outputCol + v + gHalfFilterSize )\n" + " #define inputCol (outputCol + v + gHalfFilterSize)\n" " #endif\n" - " bool process = rowOk && inputCol >= 0 && inputCol < gInputImageSize;\n" - " if( process ) {\n" + " bool process = rowOk && inputCol >= 0 && inputCol < gInputSize;\n" + " if (process) {\n" " sum += _inputPlane[ inputimagerowoffset + inputCol] * _filterPlane[ filterrowoffset + v ];\n" " }\n" " }\n" @@ -179,8 +179,8 @@ Forward4::Forward4( EasyCL *cl, LayerDimensions dim ) : " }\n" " }\n" " // output are organized like [imageid][filterid][row][col]\n" - " #define resultIndex ( ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared + effectiveLocalId )\n" - " if( effectiveLocalId < gOutputImageSizeSquared ) {\n" + " #define 
resultIndex (( n * gNumFilters + outPlane) * gOutputSizeSquared + effectiveLocalId)\n" + " if (effectiveLocalId < gOutputSizeSquared) {\n" " output[resultIndex ] = sum;\n" " }\n" "}\n" diff --git a/src/conv/Forward4.h b/src/conv/Forward4.h index 62a4c294..1a367eb0 100644 --- a/src/conv/Forward4.h +++ b/src/conv/Forward4.h @@ -18,9 +18,9 @@ class Forward4 : public Forward { // ]]] // generated, using cog: VIRTUAL ~Forward4(); - VIRTUAL void forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, - CLWrapper *outputWrapper ); - Forward4( EasyCL *cl, LayerDimensions dim ); + VIRTUAL void forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, + CLWrapper *outputWrapper); + Forward4(EasyCL *cl, LayerDimensions dim); // [[[end]]] }; diff --git a/src/conv/ForwardAuto.cpp b/src/conv/ForwardAuto.cpp index 5236f9bd..dfa3526a 100644 --- a/src/conv/ForwardAuto.cpp +++ b/src/conv/ForwardAuto.cpp @@ -21,18 +21,18 @@ using namespace std; #undef VIRTUAL #define VIRTUAL -ForwardAuto::ForwardAuto( EasyCL *cl, LayerDimensions dim ) : - Forward( cl, dim ), - milliseconds( 0 ), - valid( 0 ), - chosenIndex( -1 ), - instances( 0 ) +ForwardAuto::ForwardAuto(EasyCL *cl, LayerDimensions dim) : + Forward(cl, dim), + milliseconds(0), + valid(0), + chosenIndex(-1), + instances(0) { num = Forward::getNumImplementations(); milliseconds = new int[ num]; valid = new bool[ num ]; instances = new Forward *[ num ]; - for( int i = 0; i < num; i++ ) { + for(int i = 0; i < num; i++) { instances[i] = 0; valid[i] = false; milliseconds[i] = -1; @@ -40,73 +40,79 @@ ForwardAuto::ForwardAuto( EasyCL *cl, LayerDimensions dim ) : nextIndex = 0; } VIRTUAL ForwardAuto::~ForwardAuto() { - for( int i = 0; i < num; i++ ) { - if( instances[i] != 0 ) { + for(int i = 0; i < num; i++) { + if(instances[i] != 0) { delete instances[i]; } } } -VIRTUAL void ForwardAuto::forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, - CLWrapper *biasWrapper, CLWrapper *outputWrapper ) { +VIRTUAL void ForwardAuto::forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, + CLWrapper *biasWrapper, CLWrapper *outputWrapper) { // Forward *instance = 0; // cout << "ForwardAuto::forward" << endl; - while( chosenIndex == -1 && nextIndex < num ) { + while(chosenIndex == -1 && nextIndex < num) { int thisIndex = nextIndex; nextIndex++; - if( Forward::plausiblyOptimal( thisIndex, batchSize, dim ) ) { + cout << "forward try kernel " << thisIndex << endl; + if(Forward::plausiblyOptimal(thisIndex, batchSize, dim)) { Forward *candidate = 0; try { - candidate = Forward::instanceSpecific( thisIndex, cl, dim ); + candidate = Forward::instanceSpecific(thisIndex, cl, dim); instances[thisIndex] = candidate; valid[thisIndex] = true; - } catch( runtime_error &e ) { -// cout << StatefulTimer::instance()->prefix << "ForwardAuto: instance " << thisIndex << ": this instance cant be used: " << e.what() << endl; + cout << " ... 
seems valid" << endl; + } catch(runtime_error &e) { + cout << StatefulTimer::instance()->prefix << "ForwardAuto: kernel " << thisIndex << ": this instance cant be used: " << e.what() << endl; valid[thisIndex] = false; } - if( valid[thisIndex] ) { + if(valid[thisIndex]) { Timer timer; try { - candidate->forward( batchSize, dataWrapper, weightsWrapper, biasWrapper, outputWrapper ); + candidate->forward(batchSize, dataWrapper, weightsWrapper, biasWrapper, outputWrapper); milliseconds[thisIndex] = (int)timer.lap(); -// cout << StatefulTimer::instance()->prefix << "ForwardAuto: instance " << thisIndex << " " << milliseconds[thisIndex] << "ms" << endl; + cout << StatefulTimer::instance()->prefix << "ForwardAuto: kernel " << thisIndex << " " << milliseconds[thisIndex] << "ms" << endl; return; - } catch( runtime_error &e ) { -// cout << StatefulTimer::instance()->prefix << "ForwardAuto: instance " << thisIndex << " this instance cant be used: " << e.what() << endl; + } catch(runtime_error &e) { + cout << StatefulTimer::instance()->prefix << "ForwardAuto: kernel " << thisIndex << " this instance cant be used: " << e.what() << endl; valid[thisIndex] = false; delete instances[thisIndex]; instances[thisIndex] = 0; } + } else { + cout << " ... not valid" << endl; } + } else { + cout << " ... not plausibly optimal, skipping" << endl; } } - if( chosenIndex == -1 ) { + if(chosenIndex == -1) { // cout << StatefulTimer::instance()->prefix + "ForwardAuto::forward choosing best instance:" << endl; int bestIndex = -1; int bestTime = 0; - for( int i = 0; i < num; i++ ) { - if( !valid[i] ) { -// cout << " instance " << i << ": cannot be used" << endl; + for(int i = 0; i < num; i++) { + if(!valid[i]) { + cout << " forward kernel " << i << ": cannot be used" << endl; continue; } -// cout << " instance " << i << ": " << milliseconds[i] << "ms" << endl; - if( bestIndex == -1 ) { + cout << " forward kernel " << i << " time: " << milliseconds[i] << "ms" << endl; + if(bestIndex == -1) { bestIndex = i; bestTime = milliseconds[i]; continue; } - if( milliseconds[i] < bestTime ) { + if(milliseconds[i] < bestTime) { bestTime = milliseconds[i]; bestIndex = i; } } - if( bestIndex != -1 ) { -// cout << " selected: instance " << bestIndex << endl; + if(bestIndex != -1) { + cout << " forward layer selected kernel " << bestIndex << endl; this->chosenIndex = bestIndex; } else { - throw runtime_error(StatefulTimer::instance()->prefix + "No valid forward implementations found" ); + throw runtime_error(StatefulTimer::instance()->prefix + "No valid forward implementations found"); } } // cout << "ForwardAuto::forward using instance index: " << chosenIndex << endl; - instances[chosenIndex]->forward( batchSize, dataWrapper, weightsWrapper, biasWrapper, outputWrapper ); + instances[chosenIndex]->forward(batchSize, dataWrapper, weightsWrapper, biasWrapper, outputWrapper); } diff --git a/src/conv/ForwardAuto.h b/src/conv/ForwardAuto.h index e144d127..c6c23981 100644 --- a/src/conv/ForwardAuto.h +++ b/src/conv/ForwardAuto.h @@ -18,7 +18,7 @@ using namespace std; -//inline float square( float value ) { +//inline float square(float value) { // return value * value; //} @@ -43,10 +43,10 @@ class DeepCL_EXPORT ForwardAuto : public Forward { // cog_addheaders.add() // ]]] // generated, using cog: - ForwardAuto( EasyCL *cl, LayerDimensions dim ); + ForwardAuto(EasyCL *cl, LayerDimensions dim); VIRTUAL ~ForwardAuto(); - VIRTUAL void forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, - CLWrapper *biasWrapper, CLWrapper 
*outputWrapper ); + VIRTUAL void forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, + CLWrapper *biasWrapper, CLWrapper *outputWrapper); // [[[end]]] diff --git a/src/conv/ForwardByInputPlane.cpp b/src/conv/ForwardByInputPlane.cpp index 9bcfc0f1..77e1f12b 100644 --- a/src/conv/ForwardByInputPlane.cpp +++ b/src/conv/ForwardByInputPlane.cpp @@ -23,74 +23,74 @@ VIRTUAL ForwardByInputPlane::~ForwardByInputPlane() { delete repeatedAdd; // delete activate; } -VIRTUAL void ForwardByInputPlane::forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, - CLWrapper *outputWrapper ) { +VIRTUAL void ForwardByInputPlane::forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, + CLWrapper *outputWrapper) { StatefulTimer::timeCheck("ForwardByInputPlane::forward begin"); const int maxWorkgroupSize = cl->getMaxWorkgroupSize(); int maxglobalId = 0; - int MBAllocRequired = (int) ( (long)batchSize * dim.numFilters * dim.outputImageSizeSquared * dim.numInputPlanes * 4 / 1024 / 1024 ); - if( MBAllocRequired >= cl->getMaxAllocSizeMB() ) { - throw runtime_error( "memallocsize too small to use this kernel on this device. Need: " + - toString( MBAllocRequired ) + "MB, but only have: " + - toString( cl->getMaxAllocSizeMB() ) + "MB max alloc size" ); + int MBAllocRequired = (int) ((long)batchSize * dim.numFilters * dim.outputSizeSquared * dim.numInputPlanes * 4 / 1024 / 1024); + if(MBAllocRequired >= cl->getMaxAllocSizeMB()) { + throw runtime_error("memallocsize too small to use this kernel on this device. Need: " + + toString(MBAllocRequired) + "MB, but only have: " + + toString(cl->getMaxAllocSizeMB()) + "MB max alloc size"); } // [n][filterId][outRow][outCol][inputPlane] - int output1Size = batchSize * dim.numFilters * dim.outputImageSizeSquared * dim.numInputPlanes; + int output1Size = batchSize * dim.numFilters * dim.outputSizeSquared * dim.numInputPlanes; // cout << "output1size: " << output1Size << endl; float *output1 = new float[output1Size]; - CLWrapper *output1Wrapper = cl->wrap( output1Size, output1 ); + CLWrapper *output1Wrapper = cl->wrap(output1Size, output1); kernel->in(batchSize); - kernel->input( dataWrapper ); - kernel->input( weightsWrapper); - kernel->output( output1Wrapper ); - kernel->localFloats( square( dim.inputImageSize ) ); - kernel->localFloats( square( dim.filterSize ) * dim.numFilters ); + kernel->input(dataWrapper); + kernel->input(weightsWrapper); + kernel->output(output1Wrapper); + kernel->localFloats(square(dim.inputSize) ); + kernel->localFloats(square(dim.filterSize) * dim.numFilters); - int workgroupsize = std::max( 32, dim.numFilters * dim.outputImageSize ); // no point in wasting threads.... - while( workgroupsize > cl->getMaxWorkgroupSize() ) { + int workgroupsize = std::max(32, dim.numFilters * dim.outputSize); // no point in wasting threads.... 
+ while(workgroupsize > cl->getMaxWorkgroupSize()) { workgroupsize >>= 1; } int numWorkgroups = dim.numInputPlanes; int globalSize = workgroupsize * numWorkgroups; // cout << "forwardbyinputplane numworkgroups " << numWorkgroups << " globalsize " << globalSize << " workgroupsize " << workgroupsize << " numinputplanes=" << dim.numInputPlanes << endl; - kernel->run_1d( globalSize, workgroupsize ); + kernel->run_1d(globalSize, workgroupsize); cl->finish(); StatefulTimer::timeCheck("ForwardByInputPlane::forward after kernel1"); // { // output1Wrapper->copyToHost(); -// for( int i = 0; i < output1Size + 10; i++ ) { -// cout << "output1[" << i << "]=" << output1[i] << " " << ( i < output1Size ) << endl; +// for(int i = 0; i < output1Size + 10; i++) { +// cout << "output1[" << i << "]=" << output1[i] << " " << (i < output1Size) << endl; // } // } - reduceSegments->in( batchSize * dim.numFilters * dim.outputImageSizeSquared )->in( dim.numInputPlanes )->in( output1Wrapper )->out( outputWrapper ); - maxglobalId = batchSize * dim.numFilters * dim.outputImageSize * dim.outputImageSize; - numWorkgroups = ( maxglobalId + maxWorkgroupSize - 1 ) / maxWorkgroupSize; - reduceSegments->run_1d( numWorkgroups * maxWorkgroupSize, maxWorkgroupSize ); + reduceSegments->in(batchSize * dim.numFilters * dim.outputSizeSquared)->in(dim.numInputPlanes)->in(output1Wrapper)->out(outputWrapper); + maxglobalId = batchSize * dim.numFilters * dim.outputSize * dim.outputSize; + numWorkgroups = (maxglobalId + maxWorkgroupSize - 1) / maxWorkgroupSize; + reduceSegments->run_1d(numWorkgroups * maxWorkgroupSize, maxWorkgroupSize); cl->finish(); StatefulTimer::timeCheck("ForwardByInputPlane::forward after reduce over inputplanes"); - if( dim.biased ) { - repeatedAdd->in( batchSize * dim.numFilters * dim.outputImageSize * dim.outputImageSize ) - ->in( dim.numFilters ) - ->in( dim.outputImageSize * dim.outputImageSize ) - ->inout( outputWrapper )->in( biasWrapper ); - maxglobalId = batchSize * dim.numFilters * dim.outputImageSize * dim.outputImageSize; - numWorkgroups = ( maxglobalId + maxWorkgroupSize - 1 ) / maxWorkgroupSize; - repeatedAdd->run_1d( numWorkgroups * maxWorkgroupSize, maxWorkgroupSize ); + if(dim.biased) { + repeatedAdd->in(batchSize * dim.numFilters * dim.outputSize * dim.outputSize) + ->in(dim.numFilters) + ->in(dim.outputSize * dim.outputSize) + ->inout(outputWrapper)->in(biasWrapper); + maxglobalId = batchSize * dim.numFilters * dim.outputSize * dim.outputSize; + numWorkgroups = (maxglobalId + maxWorkgroupSize - 1) / maxWorkgroupSize; + repeatedAdd->run_1d(numWorkgroups * maxWorkgroupSize, maxWorkgroupSize); cl->finish(); StatefulTimer::timeCheck("ForwardByInputPlane::forward after repeatedAdd"); } -// activate->in( batchSize * dim.numFilters * dim.outputImageSize * dim.outputImageSize ) -// ->inout( outputWrapper ); -// maxglobalId = batchSize * dim.numFilters * dim.outputImageSize * dim.outputImageSize; -// numWorkgroups = ( maxglobalId + maxWorkgroupSize - 1 ) / maxWorkgroupSize; -// activate->run_1d( numWorkgroups * maxWorkgroupSize, maxWorkgroupSize ); +// activate->in(batchSize * dim.numFilters * dim.outputSize * dim.outputSize) +// ->inout(outputWrapper); +// maxglobalId = batchSize * dim.numFilters * dim.outputSize * dim.outputSize; +// numWorkgroups = (maxglobalId + maxWorkgroupSize - 1) / maxWorkgroupSize; +// activate->run_1d(numWorkgroups * maxWorkgroupSize, maxWorkgroupSize); // cl->finish(); // StatefulTimer::timeCheck("ForwardByInputPlane::forward after activate"); @@ -99,8 +99,8 @@ VIRTUAL void 
ForwardByInputPlane::forward( int batchSize, CLWrapper *dataWrapper StatefulTimer::timeCheck("ForwardByInputPlane::forward after call forward"); } -ForwardByInputPlane::ForwardByInputPlane( EasyCL *cl, LayerDimensions dim ) : - Forward( cl, dim ) +ForwardByInputPlane::ForwardByInputPlane(EasyCL *cl, LayerDimensions dim) : + Forward(cl, dim) { std::string options = ""; // "-D " + fn->getDefineName(); @@ -108,10 +108,10 @@ ForwardByInputPlane::ForwardByInputPlane( EasyCL *cl, LayerDimensions dim ) : // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/forward_byinputplane.cl", "forward_byinputplane", 'options' ) - // stringify.write_kernel2( "reduceSegments", "cl/reduce_segments.cl", "reduce_segments", 'options' ) - // stringify.write_kernel2( "repeatedAdd", "cl/per_element_add.cl", "repeated_add", 'options' ) - // # stringify.write_kernel2( "activate", "cl/activate.cl", "activate", 'options' ) + // stringify.write_kernel2("kernel", "cl/forward_byinputplane.cl", "forward_byinputplane", 'options') + // stringify.write_kernel2("reduceSegments", "cl/reduce_segments.cl", "reduce_segments", 'options') + // stringify.write_kernel2("repeatedAdd", "cl/per_element_add.cl", "repeated_add", 'options') + // # stringify.write_kernel2("activate", "cl/activate.cl", "activate", 'options') // ]]] // generated using cog, from cl/forward_byinputplane.cl: const char * kernelSource = @@ -133,10 +133,10 @@ ForwardByInputPlane::ForwardByInputPlane( EasyCL *cl, LayerDimensions dim ) : "// iterate over: [n][outCol]\n" "// output: [n][filterId][outRow][outCol][inputPlane]\n" "// need to later reduce output over: [inputPlane]\n" - "void kernel forward_byinputplane( const int batchSize,\n" + "void kernel forward_byinputplane(const int batchSize,\n" " global const float *images, global const float *filters,\n" " global float *output,\n" - " local float *_inputPlane, local float *_filterPlanes ) {\n" + " local float *_inputPlane, local float *_filterPlanes) {\n" "// const int evenPadding = gFilterSize % 2 == 0 ? 
1 : 0;\n" "\n" " const int globalId = get_global_id(0);\n" @@ -145,71 +145,71 @@ ForwardByInputPlane::ForwardByInputPlane( EasyCL *cl, LayerDimensions dim ) : " const int localId = get_local_id(0);\n" "\n" " const int inputPlaneId = workgroupId;\n" - " const int numLoops = ( gNumFilters * gOutputImageSize + workgroupSize - 1 ) / workgroupSize;\n" - " const int numFilterCopyLoops = ( gFilterSizeSquared + gOutputImageSize - 1 ) / gOutputImageSize;\n" - " const int numImageCopyLoops = ( gInputImageSizeSquared + workgroupSize - 1 ) / workgroupSize;\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + " const int numLoops = (gNumFilters * gOutputSize + workgroupSize - 1) / workgroupSize;\n" + " const int numFilterCopyLoops = (gFilterSizeSquared + gOutputSize - 1) / gOutputSize;\n" + " const int numImageCopyLoops = (gInputSizeSquared + workgroupSize - 1) / workgroupSize;\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " const int loopLocalId = localId + loop * workgroupSize;\n" - " const int filterId = loopLocalId / gOutputImageSize;\n" - " const int outRow = loopLocalId % gOutputImageSize;\n" + " const int filterId = loopLocalId / gOutputSize;\n" + " const int outRow = loopLocalId % gOutputSize;\n" "\n" - " // copy down our filter, we have gOutputImageSize threads to do this\n" + " // copy down our filter, we have gOutputSize threads to do this\n" " global float const *globalFilterPlane = filters +\n" - " ( filterId * gNumInputPlanes + inputPlaneId ) * gFilterSizeSquared;\n" + " (filterId * gNumInputPlanes + inputPlaneId) * gFilterSizeSquared;\n" " local float *_localFilterPlane = _filterPlanes + filterId * gFilterSizeSquared;\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - " for( int i = 0; i < numFilterCopyLoops; i++ ) {\n" - " const int offset = i * gOutputImageSize + outRow;\n" + " for (int i = 0; i < numFilterCopyLoops; i++) {\n" + " const int offset = i * gOutputSize + outRow;\n" " bool process = filterId < gNumFilters && offset < gFilterSizeSquared;\n" - " if( process ) {\n" + " if (process) {\n" " _localFilterPlane[ offset ] = globalFilterPlane[ offset ];\n" " }\n" " }\n" " // loop over n ...\n" - " for( int n = 0; n < batchSize; n++ ) {\n" + " for (int n = 0; n < batchSize; n++) {\n" " // copy down our imageplane, we have workgroupSize threads to do this\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " global float const *globalImagePlane = images +\n" - " ( n * gNumInputPlanes + inputPlaneId ) * gInputImageSizeSquared;\n" - " for( int i = 0; i< numImageCopyLoops; i++ ) {\n" + " (n * gNumInputPlanes + inputPlaneId) * gInputSizeSquared;\n" + " for (int i = 0; i< numImageCopyLoops; i++) {\n" " const int offset = i * workgroupSize + localId;\n" - " if( offset < gInputImageSizeSquared ) {\n" + " if (offset < gInputSizeSquared) {\n" " _inputPlane[ offset ] = globalImagePlane[ offset ];\n" " }\n" " }\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " // calc output for each [outrow][outcol]\n" " bool filterPlaneOk = filterId < gNumFilters;\n" - " for( int outCol = 0; outCol < gOutputImageSize; outCol++ ) {\n" + " for (int outCol = 0; outCol < gOutputSize; outCol++) {\n" " float sum = 0;\n" - " for( int filterRow = 0; filterRow < gFilterSize; filterRow++ ) {\n" + " for (int filterRow = 0; filterRow < gFilterSize; filterRow++) {\n" " int inRow = outRow + filterRow;\n" " #if gPadZeros == 1\n" " inRow -= gHalfFilterSize;\n" " #endif\n" - " bool rowOk = filterPlaneOk && inRow >= 0 && inRow < gInputImageSize;\n" - " for( int filterCol = 0; filterCol < gFilterSize; filterCol++ ) {\n" + " bool rowOk = 
filterPlaneOk && inRow >= 0 && inRow < gInputSize;\n" + " for (int filterCol = 0; filterCol < gFilterSize; filterCol++) {\n" " int inCol = outCol + filterCol;\n" " #if gPadZeros == 1\n" " inCol -= gHalfFilterSize;\n" " #endif\n" - " bool process = rowOk && inCol >= 0 && inCol < gInputImageSize;\n" - " if( process ) {\n" - " float imageValue = _inputPlane[ inRow * gInputImageSize + inCol ];\n" + " bool process = rowOk && inCol >= 0 && inCol < gInputSize;\n" + " if (process) {\n" + " float imageValue = _inputPlane[ inRow * gInputSize + inCol ];\n" " float filterValue = _localFilterPlane[ filterRow * gFilterSize + filterCol ];\n" " sum += imageValue * filterValue;\n" " }\n" " }\n" " }\n" - " if( filterId < gNumFilters ) {\n" + " if (filterId < gNumFilters) {\n" " // [n][filterId][outRow][outCol][inputPlane]\n" - " int resultIndex = ( ( ( n\n" - " * gNumFilters + filterId )\n" - " * gOutputImageSize + outRow )\n" - " * gOutputImageSize + outCol )\n" + " int resultIndex = (( (n\n" + " * gNumFilters + filterId)\n" + " * gOutputSize + outRow)\n" + " * gOutputSize + outCol)\n" " * gNumInputPlanes + inputPlaneId;\n" " output[resultIndex] = sum;\n" - " //if( globalId == 2 ) output[0] = resultIndex;\n" + " //if (globalId == 2) output[0] = resultIndex;\n" "// output[resultIndex] = outRow;\n" " }\n" "// output[localId] = _localFilterPlane[localId];\n" @@ -228,18 +228,18 @@ ForwardByInputPlane::ForwardByInputPlane( EasyCL *cl, LayerDimensions dim ) : "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "kernel void reduce_segments( const int numSegments, const int segmentLength,\n" - " global float const *in, global float* out ) {\n" + "kernel void reduce_segments(const int numSegments, const int segmentLength,\n" + " global float const *in, global float* out) {\n" " const int globalId = get_global_id(0);\n" " const int segmentId = globalId;\n" "\n" - " if( segmentId >= numSegments ) {\n" + " if (segmentId >= numSegments) {\n" " return;\n" " }\n" "\n" " float sum = 0;\n" " global const float *segment = in + segmentId * segmentLength;\n" - " for( int i = 0; i < segmentLength; i++ ) {\n" + " for (int i = 0; i < segmentLength; i++) {\n" " sum += segment[i];\n" " }\n" " out[segmentId] = sum;\n" @@ -256,9 +256,9 @@ ForwardByInputPlane::ForwardByInputPlane( EasyCL *cl, LayerDimensions dim ) : "// v. 2.0. 
If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "kernel void per_element_add( const int N, global float *target, global const float *source ) {\n" + "kernel void per_element_add(const int N, global float *target, global const float *source) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " target[globalId] += source[globalId];\n" @@ -266,20 +266,20 @@ ForwardByInputPlane::ForwardByInputPlane( EasyCL *cl, LayerDimensions dim ) : "\n" "// adds source to target\n" "// tiles source as necessary, according to tilingSize\n" - "kernel void per_element_tiled_add( const int N, const int tilingSize, global float *target, global const float *source ) {\n" + "kernel void per_element_tiled_add(const int N, const int tilingSize, global float *target, global const float *source) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " target[globalId] += source[globalId % tilingSize];\n" "}\n" "\n" - "kernel void repeated_add( const int N, const int sourceSize, const int repeatSize, global float *target, global const float *source ) {\n" + "kernel void repeated_add(const int N, const int sourceSize, const int repeatSize, global float *target, global const float *source) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" - " target[globalId] += source[ ( globalId / repeatSize ) % sourceSize ];\n" + " target[globalId] += source[ (globalId / repeatSize) % sourceSize ];\n" "}\n" "\n" ""; diff --git a/src/conv/ForwardByInputPlane.h b/src/conv/ForwardByInputPlane.h index 87619860..6875fd33 100644 --- a/src/conv/ForwardByInputPlane.h +++ b/src/conv/ForwardByInputPlane.h @@ -15,9 +15,9 @@ class ForwardByInputPlane : public Forward { // ]]] // generated, using cog: VIRTUAL ~ForwardByInputPlane(); - VIRTUAL void forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, - CLWrapper *outputWrapper ); - ForwardByInputPlane( EasyCL *cl, LayerDimensions dim ); + VIRTUAL void forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, + CLWrapper *outputWrapper); + ForwardByInputPlane(EasyCL *cl, LayerDimensions dim); // [[[end]]] }; diff --git a/src/conv/ForwardCpu.cpp b/src/conv/ForwardCpu.cpp index 46b56afd..bedab9e4 100644 --- a/src/conv/ForwardCpu.cpp +++ b/src/conv/ForwardCpu.cpp @@ -15,65 +15,65 @@ using namespace std; #define VIRTUAL #define STATIC -ForwardCpu::ForwardCpu( EasyCL *cl, LayerDimensions dim ) : - Forward( cl, dim ) +ForwardCpu::ForwardCpu(EasyCL *cl, LayerDimensions dim) : + Forward(cl, dim) { } -VIRTUAL void ForwardCpu::forward( int batchSize, CLWrapper *inputDataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, CLWrapper *outputWrapper ) { +VIRTUAL void ForwardCpu::forward(int batchSize, CLWrapper *inputDataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, CLWrapper *outputWrapper) { inputDataWrapper->copyToHost(); weightsWrapper->copyToHost(); // weightsWrapper->copyToHost(); // biasWrapper->copyToHost(); float *bias = 0; - if( dim.biased ) { + if(dim.biased) { biasWrapper->copyToHost(); bias = (float *)biasWrapper->getHostArray(); } - float *output = forward( batchSize, (float *)inputDataWrapper->getHostArray(), (float *)weightsWrapper->getHostArray(), bias ); - int outputSize = batchSize * 
dim.outputCubeSize; -// memcpy( (float *)outputWrapper->getHostArray(), output, sizeof(float) * outputSize ); + float *output = forward(batchSize, (float *)inputDataWrapper->getHostArray(), (float *)weightsWrapper->getHostArray(), bias); + int outputNumElements = batchSize * dim.outputCubeSize; +// memcpy((float *)outputWrapper->getHostArray(), output, sizeof(float) * outputNumElements); float *hostArray = (float *)outputWrapper->getHostArray(); - for( int i = 0; i < outputSize; i++ ) { + for(int i = 0; i < outputNumElements; i++) { hostArray[i] = output[i]; } outputWrapper->copyToDevice(); delete[] output; } -VIRTUAL float *ForwardCpu::forward( int batchSize, float *inputData, float *weights, float *bias ) { +VIRTUAL float *ForwardCpu::forward(int batchSize, float *inputData, float *weights, float *bias) { // cout << "ForwardCpu::forward outputcubesize=" << dim.outputCubeSize << " batchSize=" << batchSize << endl; float *output = new float[ dim.outputCubeSize * batchSize ]; - for( int n = 0; n < batchSize; n++ ) { - for( int filter = 0; filter < dim.numFilters; filter++ ) { - for( int outRow = 0; outRow < dim.outputImageSize; outRow += 1 + dim.skip ) { - for( int outCol = 0; outCol < dim.outputImageSize; outCol += 1 + dim.skip ) { + for(int n = 0; n < batchSize; n++) { + for(int filter = 0; filter < dim.numFilters; filter++) { + for(int outRow = 0; outRow < dim.outputSize; outRow += 1 + dim.skip) { + for(int outCol = 0; outCol < dim.outputSize; outCol += 1 + dim.skip) { float sum = 0; - for( int inPlane = 0; inPlane < dim.inputPlanes; inPlane++ ) { + for(int inPlane = 0; inPlane < dim.inputPlanes; inPlane++) { // cout << "inplane=" << inPlane << endl; - for( int u = -dim.halfFilterSize; u <= dim.halfFilterSize; u++ ) { - int inRow = outRow * ( dim.skip + 1 ) + u + ( dim.padZeros ? 0 : dim.halfFilterSize ); + for(int u = -dim.halfFilterSize; u <= dim.halfFilterSize; u++) { + int inRow = outRow * (dim.skip + 1) + u + (dim.padZeros ? 0 : dim.halfFilterSize); // cout << "candidate inRow " << inRow << endl; - if( inRow < 0 || inRow > dim.inputImageSize - 1 ) { + if(inRow < 0 || inRow > dim.inputSize - 1) { continue; } int filterRow = u + dim.halfFilterSize; - for( int v = -dim.halfFilterSize; v <= dim.halfFilterSize; v++ ) { - int inCol = outCol * ( dim.skip + 1 ) + v + ( dim.padZeros ? 0 : dim.halfFilterSize ); + for(int v = -dim.halfFilterSize; v <= dim.halfFilterSize; v++) { + int inCol = outCol * (dim.skip + 1) + v + (dim.padZeros ? 
0 : dim.halfFilterSize); int filterCol = v + dim.halfFilterSize; - if( inCol < 0 || inCol > dim.inputImageSize - 1 ) { + if(inCol < 0 || inCol > dim.inputSize - 1) { continue; } - int inputIndex = ( ( n - * dim.inputPlanes + inPlane ) - * dim.inputImageSize + inRow ) - * dim.inputImageSize + inCol; - int weightIndex = ( ( filter - * dim.inputPlanes + inPlane ) - * dim.filterSize + filterRow ) + int inputIndex = (( n + * dim.inputPlanes + inPlane) + * dim.inputSize + inRow) + * dim.inputSize + inCol; + int weightIndex = (( filter + * dim.inputPlanes + inPlane) + * dim.filterSize + filterRow) * dim.filterSize + filterCol; // cout << "inpos " << inRow << "," << inCol << " outpos " << outRow << "," << outCol // << " filterpos " << filterRow << "," << filterCol << endl; float sumchange = inputData[ inputIndex] * weights[ weightIndex ]; - if( sumchange != 0 ) { + if(sumchange != 0) { // cout << inputData[inputIndex] << " * " << weights[weightIndex] << " = " << sumchange << endl; } sum += sumchange; @@ -82,14 +82,14 @@ VIRTUAL float *ForwardCpu::forward( int batchSize, float *inputData, float *weig } } } - if( dim.biased ) { + if(dim.biased) { sum += bias[filter]; } -// sum = fn->calc( sum ); - int outputIndex = ( ( n - * dim.numFilters + filter ) - * dim.outputImageSize + outRow ) - * dim.outputImageSize + outCol; +// sum = fn->calc(sum); + int outputIndex = (( n + * dim.numFilters + filter) + * dim.outputSize + outRow) + * dim.outputSize + outCol; output[outputIndex] = sum; // cout << "outputIndex=" << outputIndex << " sum=" << sum << " output[outputIndex]=" << // output[outputIndex] << endl; diff --git a/src/conv/ForwardCpu.h b/src/conv/ForwardCpu.h index ce1e87e5..b8fdcd9f 100644 --- a/src/conv/ForwardCpu.h +++ b/src/conv/ForwardCpu.h @@ -19,9 +19,9 @@ class ForwardCpu : public Forward { // cog_addheaders.add() // ]]] // generated, using cog: - ForwardCpu( EasyCL *cl, LayerDimensions dim ); - VIRTUAL void forward( int batchSize, CLWrapper *inputDataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, CLWrapper *outputWrapper ); - VIRTUAL float *forward( int batchSize, float *inputData, float *weights, float *bias ); + ForwardCpu(EasyCL *cl, LayerDimensions dim); + VIRTUAL void forward(int batchSize, CLWrapper *inputDataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, CLWrapper *outputWrapper); + VIRTUAL float *forward(int batchSize, float *inputData, float *weights, float *bias); // [[[end]]] }; diff --git a/src/conv/ForwardFc.cpp b/src/conv/ForwardFc.cpp index fcd7cc1c..6edbbc53 100644 --- a/src/conv/ForwardFc.cpp +++ b/src/conv/ForwardFc.cpp @@ -25,7 +25,7 @@ VIRTUAL ForwardFc::~ForwardFc() { delete addBias; delete reduceSegments; } -VIRTUAL void ForwardFc::forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, CLWrapper *outputWrapper ) { +VIRTUAL void ForwardFc::forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, CLWrapper *outputWrapper) { StatefulTimer::timeCheck("ForwardFc::forward begin"); // const int maxWorkgroupSize = cl->getMaxWorkgroupSize(); @@ -36,38 +36,38 @@ VIRTUAL void ForwardFc::forward( int batchSize, CLWrapper *dataWrapper, CLWrappe // const int output1Size = batchSize * dim.numFilters * dim.numInputPlanes * dim.filterSize; float *output1 = new float[ output1Size ]; - CLWrapper *output1Wrapper = cl->wrap( output1Size, output1 ); + CLWrapper *output1Wrapper = cl->wrap(output1Size, output1); output1Wrapper->createOnDevice(); // const int output2Size = batchSize * 
dim.numFilters * dim.numInputPlanes; float *output2 = new float[ output2Size ]; - CLWrapper *output2Wrapper = cl->wrap( output2Size, output2 ); + CLWrapper *output2Wrapper = cl->wrap(output2Size, output2); output2Wrapper->createOnDevice(); - kernel1->in( batchSize ); - kernel1->input( dataWrapper ); - kernel1->input( weightsWrapper); - kernel1->output( output1Wrapper ); - kernel1->localFloats( dim.inputImageSize ); - kernel1->localFloats( dim.numFilters * dim.filterSize ); + kernel1->in(batchSize); + kernel1->input(dataWrapper); + kernel1->input(weightsWrapper); + kernel1->output(output1Wrapper); + kernel1->localFloats(dim.inputSize); + kernel1->localFloats(dim.numFilters * dim.filterSize ); int workgroupSize = dim.numFilters; // uncommenting next line causes out-of-bounds access currently: - workgroupSize = ( ( workgroupSize + 32 - 1 ) / 32 ) * 32; // round up to nearest 32 + workgroupSize = (( workgroupSize + 32 - 1) / 32) * 32; // round up to nearest 32 int numWorkgroups = dim.filterSize * dim.numInputPlanes; - kernel1->run_1d( workgroupSize * numWorkgroups, workgroupSize ); + kernel1->run_1d(workgroupSize * numWorkgroups, workgroupSize); cl->finish(); StatefulTimer::timeCheck("ForwardFc::forward after first kernel"); - reduceSegments->reduce( output1Size, dim.filterSize, output1Wrapper, output2Wrapper ); - reduceSegments->reduce( output2Size, dim.numInputPlanes, output2Wrapper, outputWrapper ); + reduceSegments->reduce(output1Size, dim.filterSize, output1Wrapper, output2Wrapper); + reduceSegments->reduce(output2Size, dim.numInputPlanes, output2Wrapper, outputWrapper); // add bias... - if( dim.biased ) { + if(dim.biased) { addBias->forward( - batchSize, dim.numFilters, dim.outputImageSize, - outputWrapper, biasWrapper ); + batchSize, dim.numFilters, dim.outputSize, + outputWrapper, biasWrapper); } delete output2Wrapper; @@ -77,27 +77,27 @@ VIRTUAL void ForwardFc::forward( int batchSize, CLWrapper *dataWrapper, CLWrappe delete[] output1; StatefulTimer::timeCheck("ForwardFc::forward end"); } -ForwardFc::ForwardFc( EasyCL *cl, LayerDimensions dim ) : - Forward( cl, dim ) +ForwardFc::ForwardFc(EasyCL *cl, LayerDimensions dim) : + Forward(cl, dim) { - if( dim.inputImageSize != dim.filterSize ) { + if(dim.inputSize != dim.filterSize) { throw runtime_error("For ForwardFc, filtersize and inputimagesize must be identical"); } - if( dim.padZeros ) { + if(dim.padZeros) { throw runtime_error("For ForwardFc, padzeros must be disabled"); } - this->addBias = new AddBias( cl ); - this->reduceSegments = new ReduceSegments( cl ); + this->addBias = new AddBias(cl); + this->reduceSegments = new ReduceSegments(cl); std::string options = ""; options += dim.buildOptionsString(); // [[[cog // import stringify - // stringify.write_kernel2( "kernel1", "cl/forward_fc_wgperrow.cl", "forward_fc_workgroup_perrow", 'options' ) - // # stringify.write_kernel2( "kernel_reduce", "cl/reduce_segments.cl", "reduce_segments", 'options' ) + // stringify.write_kernel2("kernel1", "cl/forward_fc_wgperrow.cl", "forward_fc_workgroup_perrow", 'options') + // # stringify.write_kernel2("kernel_reduce", "cl/reduce_segments.cl", "reduce_segments", 'options') // ]]] // generated using cog, from cl/forward_fc_wgperrow.cl: const char * kernel1Source = @@ -107,11 +107,11 @@ ForwardFc::ForwardFc( EasyCL *cl, LayerDimensions dim ) : "// v. 2.0. 
If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "void copyLocal( local float *restrict target, global float const *restrict source, int N ) {\n" - " int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyLocal(local float *restrict target, global float const *restrict source, int N) {\n" + " int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " int offset = loop * get_local_size(0) + get_local_id(0);\n" - " if( offset < N ) {\n" + " if (offset < N) {\n" " target[offset] = source[offset];\n" " }\n" " }\n" @@ -144,15 +144,15 @@ ForwardFc::ForwardFc( EasyCL *cl, LayerDimensions dim ) : "// filtersize == inputimagesize (mandatory)\n" "// inputimagesize == 19\n" "// filtersize == 19\n" - "// outputImageSize == 1\n" + "// outputSize == 1\n" "// lots of outplanes/filters, hundreds, but less than max work groupsize, eg 350, 500, 361\n" "// lots of inplanes, eg 32-128\n" "// inputimagesize around 19, not too small\n" - "#if (gFilterSize == gInputImageSize) && (gPadZeros == 0)\n" - "void kernel forward_fc_workgroup_perrow( const int batchSize,\n" + "#if (gFilterSize == gInputSize) && (gPadZeros == 0)\n" + "void kernel forward_fc_workgroup_perrow(const int batchSize,\n" " global const float *images, global const float *filters,\n" " global float *output1,\n" - " local float *_imageRow, local float *_filterRows ) {\n" + " local float *_imageRow, local float *_filterRows) {\n" " const int globalId = get_global_id(0);\n" "\n" " const int workgroupId = get_group_id(0);\n" @@ -170,32 +170,32 @@ ForwardFc::ForwardFc( EasyCL *cl, LayerDimensions dim ) : " + inputPlaneId * gFilterSizeSquared\n" " + filterRowId * gFilterSize;\n" " local float *_threadFilterRow = _filterRows + localId * gFilterSize;\n" - " if( localId < gNumFilters ) {\n" - " for( int i = 0; i < gFilterSize; i++ ) {\n" + " if (localId < gNumFilters) {\n" + " for (int i = 0; i < gFilterSize; i++) {\n" " _threadFilterRow[i] = filterRow[i];\n" " }\n" " }\n" - " const int loopsPerExample = ( gInputImageSize + workgroupSize - 1 ) / workgroupSize;\n" + " const int loopsPerExample = (gInputSize + workgroupSize - 1) / workgroupSize;\n" " // now loop over examples...\n" - " for( int n = 0; n < batchSize; n++ ) {\n" + " for (int n = 0; n < batchSize; n++) {\n" " // copy down example row, which is global to all threads in workgroup\n" " // hopefully should be enough threads....\n" " // but we should check anyway really, since depends on number of filters configured,\n" " // not on relative size of filter and input image\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - " copyLocal( _imageRow, images\n" - " + ( ( n\n" - " * gNumInputPlanes + inputPlaneId )\n" - " * gInputImageSize + filterRowId )\n" - " * gInputImageSize,\n" - " gInputImageSize );\n" + " copyLocal(_imageRow, images\n" + " + (( n\n" + " * gNumInputPlanes + inputPlaneId)\n" + " * gInputSize + filterRowId)\n" + " * gInputSize,\n" + " gInputSize);\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " // add up the values in our row...\n" " // note: dont activate yet, since need to reduce again\n" " // output structured as: [n][filter][inputplane][filterrow], need to reduce again after\n" - " if( localId < gNumFilters ) {\n" + " if (localId < gNumFilters) {\n" " float sum = 0;\n" - " for( int filterCol = 0; filterCol < gFilterSize; filterCol++ ) {\n" + " for (int filterCol = 0; filterCol < 
gFilterSize; filterCol++) {\n" " sum += _imageRow[ filterCol ] * _threadFilterRow[ filterCol ];\n" " }\n" " output1[ n * gNumInputPlanes * gNumFilters * gFilterSize\n" diff --git a/src/conv/ForwardFc.h b/src/conv/ForwardFc.h index dbf5f03d..5e957cce 100644 --- a/src/conv/ForwardFc.h +++ b/src/conv/ForwardFc.h @@ -27,8 +27,8 @@ class ForwardFc : public Forward { // ]]] // generated, using cog: VIRTUAL ~ForwardFc(); - VIRTUAL void forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, CLWrapper *outputWrapper ); - ForwardFc( EasyCL *cl, LayerDimensions dim ); + VIRTUAL void forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, CLWrapper *outputWrapper); + ForwardFc(EasyCL *cl, LayerDimensions dim); // [[[end]]] }; diff --git a/src/conv/ForwardIm2Col.cpp b/src/conv/ForwardIm2Col.cpp new file mode 100644 index 00000000..08611a0f --- /dev/null +++ b/src/conv/ForwardIm2Col.cpp @@ -0,0 +1,82 @@ +// Copyright Hugh Perkins 2014 hughperkins at gmail +// +// This Source Code Form is subject to the terms of the Mozilla Public License, +// v. 2.0. If a copy of the MPL was not distributed with this file, You can +// obtain one at http://mozilla.org/MPL/2.0/. + +#include "conv/ForwardIm2Col.h" +#include "util/stringhelper.h" +#include "util/StatefulTimer.h" +#include "conv/AddBias.h" +//#include "clblas/ClBlasInstance.h" +#include "clblas/ClBlasHelper.h" +#include "conv/Im2Col.h" + +#include +#include +#include + +using namespace std; + +#undef VIRTUAL +#undef STATIC +#define VIRTUAL +#define STATIC +#define PUBLIC + +PUBLIC ForwardIm2Col::ForwardIm2Col(EasyCL *cl, LayerDimensions dim) : + Forward(cl, dim) + { +// ClBlasInstance::initializeIfNecessary(); + + addBias = new AddBias(cl); + im2Col = new Im2Col(cl, dim); +} +PUBLIC VIRTUAL ForwardIm2Col::~ForwardIm2Col() { + delete addBias; + delete im2Col; +} +PUBLIC VIRTUAL void ForwardIm2Col::forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, CLWrapper *outputWrapper) { + StatefulTimer::timeCheck("ForwardIm2Col::forward START"); + + int columnsSize= dim.inputPlanes * dim.filterSizeSquared * dim.outputSizeSquared; + float *columns = new float[columnsSize]; + CLWrapper *columnsWrapper = cl->wrap(columnsSize, columns); + columnsWrapper->createOnDevice(); +// cout << "columnsSize: " << columnsSize << endl; +// cout << "weightsize: " << weightsWrapper->size() << endl; + + StatefulTimer::timeCheck("ForwardIm2Col::forward after alloc"); + + for (int b = 0; b < batchSize; b ++) { + im2Col->im2Col(dataWrapper, b * dim.inputCubeSize, columnsWrapper); + + long m = dim.outputSizeSquared; + long n = dim.numFilters; + long k = dim.inputPlanes * dim.filterSizeSquared; +// cout << "m=" << m << " n=" << n << " k=" << k << endl; + + ClBlasHelper::Gemm( + cl, clblasColumnMajor, clblasNoTrans, clblasNoTrans, + m, k, n, + 1, + columnsWrapper, 0, + weightsWrapper, 0, + 0, + outputWrapper, b * dim.outputCubeSize + ); + } + + delete columnsWrapper; + delete[] columns; + + StatefulTimer::timeCheck("ForwardIm2Col::forward after call forward"); + + if(dim.biased) { + addBias->forward( + batchSize, dim.numFilters, dim.outputSize, + outputWrapper, biasWrapper); + } + StatefulTimer::timeCheck("ForwardIm2Col::forward END"); +} + diff --git a/src/conv/ForwardIm2Col.h b/src/conv/ForwardIm2Col.h new file mode 100644 index 00000000..f4ff3e4e --- /dev/null +++ b/src/conv/ForwardIm2Col.h @@ -0,0 +1,43 @@ +// Copyright Hugh Perkins 2014 hughperkins at gmail +// 
+// This Source Code Form is subject to the terms of the Mozilla Public License, +// v. 2.0. If a copy of the MPL was not distributed with this file, You can +// obtain one at http://mozilla.org/MPL/2.0/. + +#pragma once + +#include "Forward.h" + +class AddBias; +class Im2Col; + +#include "DeepCLDllExport.h" + +#define VIRTUAL virtual +#define STATIC static + +class DeepCL_EXPORT ForwardIm2Col : public Forward { + private: +// CLKernel *kernelIm2Col; +// CLKernel *kernelCol2Im; + AddBias *addBias; + Im2Col *im2Col; + + float *columns; + CLWrapper *columnsWrapper; + int numKernels; + + // [[[cog + // import cog_addheaders + // cog_addheaders.addv2() + // ]]] + // generated, using cog: + + public: + ForwardIm2Col(EasyCL *cl, LayerDimensions dim); + VIRTUAL ~ForwardIm2Col(); + VIRTUAL void forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, CLWrapper *outputWrapper); + + // [[[end]]] +}; + diff --git a/src/conv/Im2Col.cpp b/src/conv/Im2Col.cpp new file mode 100644 index 00000000..5fda7a3a --- /dev/null +++ b/src/conv/Im2Col.cpp @@ -0,0 +1,181 @@ +//#include "clblas/ClBlasInstance.h" +#include "clblas/ClBlasHelper.h" +#include "EasyCL.h" +#include "templates/TemplatedKernel.h" + +#include "Im2Col.h" + +#include +#include +using namespace std; + +#undef STATIC +#undef VIRTUAL +#define STATIC +#define VIRTUAL +#define PUBLIC + +PUBLIC Im2Col::Im2Col(EasyCL *cl, LayerDimensions dim) : + cl(cl), + dim(dim) { +// ClBlasInstance::initializeIfNecessary(); + this->kernelIm2Col = 0; + this->kernelCol2Im = 0; +} +PUBLIC VIRTUAL Im2Col::~Im2Col() { + delete kernelIm2Col; + delete kernelCol2Im; +} +void Im2Col::setupBuilder(TemplatedKernel *builder) { + int size = dim.inputSize; + int padding = dim.padZeros ? dim.halfFilterSize : 0; + int stride = 1; + int channels = dim.inputPlanes; + int size_col = (size + 2 * padding - dim.filterSize) / stride + 1; + + this->numKernelsIm2Col = channels * size_col * size_col; + this->numKernelsCol2Im = channels * dim.inputSizeSquared; + + builder->set("padding", dim.padZeros ? dim.halfFilterSize : 0);
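// [reviewer note, not part of the patch] size_col above is the usual convolution output-size
// formula, (size + 2*padding - filterSize)/stride + 1; the numbers here are illustrative:
// with size=28 and filterSize=5, padZeros gives padding=halfFilterSize=2 and
// size_col=(28+4-5)/1+1=28, while no padding gives size_col=(28+0-5)/1+1=24, matching
// outputSize from LayerDimensions::deriveOthers. The buffer im2col fills holds
// inputPlanes*filterSizeSquared rows by outputSizeSquared columns per sample, i.e. exactly
// the k-by-m operand that ForwardIm2Col::forward hands to ClBlasHelper::Gemm
// (m=outputSizeSquared, n=numFilters, k=inputPlanes*filterSizeSquared), so the whole
// convolution collapses to one GEMM per sample plus the AddBias pass.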
+ builder->set("stride", 1); + builder->set("colSize", size_col); + builder->set("channels", dim.inputPlanes); + builder->set("filterSize", dim.filterSize); + builder->set("size", dim.inputSize); +} +void Im2Col::buildKernelIm2Col() { + TemplatedKernel builder(cl); + setupBuilder(&builder); + this->kernelIm2Col = builder.buildKernel( + "im2col", + "ForwardIm2Col.cl", + getKernelTemplate(), + "im2col", + false + ); +} +void Im2Col::buildKernelCol2Im() { + TemplatedKernel builder(cl); + setupBuilder(&builder); + this->kernelCol2Im = builder.buildKernel( + "col2im", + "ForwardIm2Col.cl", + getKernelTemplate(), + "col2im", + false + ); +} +PUBLIC void Im2Col::im2Col(CLWrapper *imagesWrapper, int imagesOffset, CLWrapper *columnsWrapper) { + if(kernelIm2Col == 0) { + buildKernelIm2Col(); + } + kernelIm2Col->in(numKernelsIm2Col); + kernelIm2Col->in(imagesWrapper); + kernelIm2Col->in(imagesOffset); + kernelIm2Col->out(columnsWrapper); + + int workgroupSize = cl->getMaxWorkgroupSize(); + int numWorkgroups = this->numKernelsIm2Col; + + kernelIm2Col->run_1d(numWorkgroups * workgroupSize, workgroupSize); +} +PUBLIC void Im2Col::col2Im(CLWrapper *columnsWrapper, CLWrapper *imagesWrapper, int imagesOffset) { + if(kernelCol2Im == 0) { + buildKernelCol2Im(); + } + kernelCol2Im->in(numKernelsCol2Im); + kernelCol2Im->in(columnsWrapper); + kernelCol2Im->out(imagesWrapper); + kernelCol2Im->in(imagesOffset); + + int workgroupSize = cl->getMaxWorkgroupSize(); + int numWorkgroups = this->numKernelsCol2Im; + +// cout << "numworkgroups=" << numWorkgroups << " workgroupSize=" << workgroupSize << endl; + kernelCol2Im->run_1d(numWorkgroups * workgroupSize, workgroupSize); +} +STATIC std::string Im2Col::getKernelTemplate() { + // [[[cog + // import stringify + // stringify.write_kernel("kernel", "cl/ForwardIm2Col.cl") + // ]]] + // generated using cog, from cl/ForwardIm2Col.cl: + const char * kernelSource = + "// from SpatialConvolutionMM.cu:\n" + "\n" + "// CL: grid stride looping\n" + "#define CL_KERNEL_LOOP(i, n) \\\n" + " for (int i = get_group_id(0) * get_local_size(0) + get_local_id(0); \\\n" + " i < (n); \\\n" + " i += get_local_size(0) * get_num_groups(0))\n" + "\n" + "//#define gPadding {{padding}}\n" + "//#define gStride {{stride}}\n" + "//#define gColSize {{colSize}}\n" + "//#define gFilterSize {{filterSize}}\n" + "//#define gSize {{size}}\n" + "\n" + "// Kernel for fast unfold+copy\n" + "// (adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)\n" + "kernel void im2col(\n" + " const int n,\n" + " global float const * im_data, int im_offset,\n" + " global float* data_col) {\n" + " global const float *data_im = im_data + im_offset;\n" + "\n" + " CL_KERNEL_LOOP(index, n) {\n" + " int w_out = index % {{colSize}};\n" + " index /= {{colSize}};\n" + " int h_out = index % {{colSize}};\n" + " int channel_in = index / {{colSize}};\n" + " int channel_out = channel_in * {{filterSize}} * {{filterSize}};\n" + " int h_in = h_out * {{stride}} - {{padding}};\n" + " int w_in = w_out * {{stride}} - {{padding}};\n" + " data_col += (channel_out * {{colSize}} + h_out) * {{colSize}} + w_out;\n" + " data_im += (channel_in * {{size}} + h_in) * {{size}} + w_in;\n" + " for (int i = 0; i < {{filterSize}}; ++i) {\n" + " for (int j = 0; j < {{filterSize}}; ++j) {\n" + " int h = h_in + i;\n" + " int w = w_in + j;\n" + " *data_col = (h >= 0 && w >= 0 && h < {{size}} && w < {{size}}) ?\n" + " data_im[i * {{size}} + j] : 0;\n" + " data_col += {{colSize}} * {{colSize}};\n" + "
}\n" + " }\n" + " }\n" + "}\n" + "\n" + "kernel void col2im(\n" + " const int n,\n" + " global float const *data_col,\n" + " global float* im_data, int im_offset) {\n" + " global float *data_im = im_data + im_offset;\n" + "\n" + " for (int index = get_group_id(0) * get_local_size(0) + get_local_id(0); index < (n); index += get_local_size(0) * get_num_groups(0)) {\n" + " float val = 0;\n" + " int w = index % {{size}} + {{padding}};\n" + " int h = (index / {{size}}) % {{size}} + {{padding}};\n" + " int c = index / ({{size}} * {{size}});\n" + " // compute the start and end of the output\n" + " int w_col_start = (w < {{filterSize}}) ? 0 : (w - {{filterSize}}) / {{stride}} + 1;\n" + " int w_col_end = min(w / {{stride}} + 1, {{colSize}});\n" + " int h_col_start = (h < {{filterSize}}) ? 0 : (h - {{filterSize}}) / {{stride}} + 1;\n" + " int h_col_end = min(h / {{stride}} + 1, {{colSize}});\n" + "\n" + " int offset = (c * {{filterSize}} * {{filterSize}} + h * {{filterSize}} + w) * {{colSize}} * {{colSize}};\n" + " int coeff_h_col = (1 - {{stride}} * {{filterSize}} * {{colSize}}) * {{colSize}};\n" + " int coeff_w_col = (1 - {{stride}} * {{colSize}} * {{colSize}});\n" + " for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {\n" + " for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {\n" + " val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];\n" + " }\n" + " }\n" + " data_im[index] = val;\n" + " }\n" + "}\n" + "\n" + ""; + // [[[end]]] + return kernelSource; +} + diff --git a/src/conv/Im2Col.h b/src/conv/Im2Col.h new file mode 100644 index 00000000..c0acf03a --- /dev/null +++ b/src/conv/Im2Col.h @@ -0,0 +1,45 @@ +#pragma once + +#include "LayerDimensions.h" + +class EasyCL; +class CLWrapper; +class CLKernel; +class TemplatedKernel; + +#include "DeepCLDllExport.h" + +#define STATIC static +#define VIRTUAL virtual + +class Im2Col { + EasyCL *cl; + LayerDimensions dim; + + CLKernel *kernelIm2Col; + CLKernel *kernelCol2Im; + + int numKernelsIm2Col; + int numKernelsCol2Im; + + // [[[cog + // import cog_addheaders + // cog_addheaders.addv2() + // ]]] + // generated, using cog: + + public: + Im2Col(EasyCL *cl, LayerDimensions dim); + VIRTUAL ~Im2Col(); + void im2Col(CLWrapper *imagesWrapper, int imagesOffset, CLWrapper *columnsWrapper); + void col2Im(CLWrapper *columnsWrapper, CLWrapper *imagesWrapper, int imagesOffset); + + private: + void setupBuilder(TemplatedKernel *builder); + void buildKernelIm2Col(); + void buildKernelCol2Im(); + STATIC std::string getKernelTemplate(); + + // [[[end]]] +}; + diff --git a/src/conv/LayerDimensions.cpp b/src/conv/LayerDimensions.cpp index b59066ac..78c9eb47 100644 --- a/src/conv/LayerDimensions.cpp +++ b/src/conv/LayerDimensions.cpp @@ -7,13 +7,13 @@ using namespace std; -ostream &operator<<( ostream &os, const LayerDimensions &dim ) { +ostream &operator<<(ostream &os, const LayerDimensions &dim) { os << "LayerDimensions{"; os << " inputPlanes=" << dim.inputPlanes; - os << " inputImageSize=" << dim.inputImageSize; + os << " inputSize=" << dim.inputSize; os << " numFilters=" << dim.numFilters; os << " filterSize=" << dim.filterSize; - os << " outputImageSize=" << dim.outputImageSize; + os << " outputSize=" << dim.outputSize; os << " padZeros=" << dim.padZeros; os << " biased=" << dim.biased; os << " skip=" << dim.skip; @@ -24,17 +24,17 @@ ostream &operator<<( ostream &os, const LayerDimensions &dim ) { void LayerDimensions::deriveOthers() { this->numInputPlanes = inputPlanes; this->isEven = filterSize % 2 == 0; - this->outputImageSize = 
- this->inputImageSizeSquared = inputImageSize * inputImageSize; + this->inputSizeSquared = inputSize * inputSize; this->filterSizeSquared = filterSize * filterSize; - this->outputImageSizeSquared = outputImageSize * outputImageSize; + this->outputSizeSquared = outputSize * outputSize; - this->inputCubeSize = inputPlanes * inputImageSizeSquared; + this->inputCubeSize = inputPlanes * inputSizeSquared; this->filtersSize = inputPlanes * numFilters * filterSizeSquared; - this->outputCubeSize = numFilters * outputImageSizeSquared; + this->outputCubeSize = numFilters * outputSizeSquared; this->halfFilterSize = filterSize >> 1; // cout << "deriveOthers()" << *this << endl; @@ -42,21 +42,21 @@ void LayerDimensions::deriveOthers() { string LayerDimensions::buildOptionsString() { string options = ""; - if( biased ) { + if(biased) { options += " -D BIASED"; } options += " -D gNumInputPlanes=" + toString(inputPlanes); options += " -D gInputPlanes=" + toString(inputPlanes); - options += " -D gInputImageSize=" + toString(inputImageSize); - options += " -D gInputImageSizeSquared=" + toString(square(inputImageSize)); + options += " -D gInputSize=" + toString(inputSize); + options += " -D gInputSizeSquared=" + toString(square(inputSize)); options += " -D gNumFilters=" + toString(numFilters); options += " -D gFilterSize=" + toString(filterSize); - options += " -D gHalfFilterSize=" + toString( filterSize >> 1 ); + options += " -D gHalfFilterSize=" + toString(filterSize >> 1); options += " -D gFilterSizeSquared=" + toString(square(filterSize)); options += " -D gNumOutputPlanes=" + toString(numFilters); options += " -D gOutputPlanes=" + toString(numFilters); - options += " -D gOutputImageSize=" + toString(outputImageSize); - options += " -D gOutputImageSizeSquared=" + toString(square(outputImageSize)); + options += " -D gOutputSize=" + toString(outputSize); + options += " -D gOutputSizeSquared=" + toString(square(outputSize)); options += " -D gPadZeros=" + toString(padZeros ? 1 : 0); options += " -D gMargin=" + toString(padZeros ? filterSize >> 1 : 0); options += " -D gEven=" + toString(filterSize % 2 == 0 ?
1 : 0); diff --git a/src/conv/LayerDimensions.h b/src/conv/LayerDimensions.h index 2d1542d8..882de874 100644 --- a/src/conv/LayerDimensions.h +++ b/src/conv/LayerDimensions.h @@ -5,13 +5,13 @@ #include "DeepCLDllExport.h" -inline int square( int value ) { +inline int square(int value) { return value * value; } class DeepCL_EXPORT LayerDimensions { public: - int inputPlanes, inputImageSize, numFilters, filterSize, outputImageSize; + int inputPlanes, inputSize, numFilters, filterSize, outputSize; bool padZeros, isEven; bool biased; int skip; @@ -21,66 +21,66 @@ class DeepCL_EXPORT LayerDimensions { int outputCubeSize; int numInputPlanes; - int outputImageSizeSquared; + int outputSizeSquared; int filterSizeSquared; - int inputImageSizeSquared; + int inputSizeSquared; int halfFilterSize; LayerDimensions() { - memset( this, 0, sizeof( LayerDimensions ) ); + memset(this, 0, sizeof(LayerDimensions) ); } - LayerDimensions( int inputPlanes, int inputImageSize, + LayerDimensions(int inputPlanes, int inputSize, int numFilters, int filterSize, - bool padZeros, bool biased ) : - inputPlanes( inputPlanes ), - inputImageSize( inputImageSize ), - numFilters( numFilters ), - filterSize( filterSize ), - padZeros( padZeros ), - biased( biased ) + bool padZeros, bool biased) : + inputPlanes(inputPlanes), + inputSize(inputSize), + numFilters(numFilters), + filterSize(filterSize), + padZeros(padZeros), + biased(biased) { skip = 0; deriveOthers(); -// std::cout << "outputImageSize " << outputImageSize << " padZeros " << padZeros << " filtersize " -// << filterSize << " inputImageSize " << inputImageSize << std::endl; +// std::cout << "outputSize " << outputSize << " padZeros " << padZeros << " filtersize " +// << filterSize << " inputSize " << inputSize << std::endl; } - LayerDimensions &setInputPlanes( int _planes ) { + LayerDimensions &setInputPlanes(int _planes) { this->inputPlanes = _planes; deriveOthers(); return *this; } - LayerDimensions &setNumInputPlanes( int _planes ) { + LayerDimensions &setNumInputPlanes(int _planes) { this->inputPlanes = _planes; deriveOthers(); return *this; } - LayerDimensions &setInputImageSize( int inputImageSize ) { - this->inputImageSize = inputImageSize; + LayerDimensions &setInputSize(int inputSize) { + this->inputSize = inputSize; deriveOthers(); return *this; } - LayerDimensions &setSkip( int skip ) { + LayerDimensions &setSkip(int skip) { this->skip = skip; deriveOthers(); return *this; } - LayerDimensions &setNumFilters( int numFilters ) { + LayerDimensions &setNumFilters(int numFilters) { this->numFilters = numFilters; deriveOthers(); return *this; } - LayerDimensions &setFilterSize( int filterSize ) { + LayerDimensions &setFilterSize(int filterSize) { this->filterSize = filterSize; deriveOthers(); return *this; } - LayerDimensions &setBiased( bool biased ) { + LayerDimensions &setBiased(bool biased) { this->biased = biased; deriveOthers(); return *this; } - LayerDimensions &setPadZeros( bool padZeros ) { + LayerDimensions &setPadZeros(bool padZeros) { this->padZeros = padZeros; deriveOthers(); return *this; @@ -89,6 +89,6 @@ class DeepCL_EXPORT LayerDimensions { std::string buildOptionsString(); }; -DeepCL_EXPORT std::ostream &operator<<( std::ostream &os, const LayerDimensions &dim ); +DeepCL_EXPORT std::ostream &operator<<(std::ostream &os, const LayerDimensions &dim); diff --git a/src/conv/ReduceSegments.cpp b/src/conv/ReduceSegments.cpp index 5e014515..acdcca7e 100644 --- a/src/conv/ReduceSegments.cpp +++ b/src/conv/ReduceSegments.cpp @@ -26,27 +26,27 @@ VIRTUAL 
void ReduceSegments::reduce( ) { StatefulTimer::timeCheck("ReduceSegments::reduce begin"); - if( totalLength % segmentLength != 0 ) { + if(totalLength % segmentLength != 0) { throw runtime_error("ReduceSegments: totalLength should be multiple of segmentLength"); } const int numSegments = totalLength / segmentLength; kernel - ->in( numSegments ) - ->in( segmentLength ) - ->in( inputWrapper ) - ->out( outputWrapper ); - int numWorkgroups = ( numSegments + 64 - 1 ) / 64; - kernel->run_1d( numWorkgroups * 64, 64 ); + ->in(numSegments) + ->in(segmentLength) + ->in(inputWrapper) + ->out(outputWrapper); + int numWorkgroups = (numSegments + 64 - 1) / 64; + kernel->run_1d(numWorkgroups * 64, 64); cl->finish(); StatefulTimer::timeCheck("ReduceSegments::reduce end"); } -ReduceSegments::ReduceSegments( EasyCL *cl ) : - cl( cl ) +ReduceSegments::ReduceSegments(EasyCL *cl) : + cl(cl) { string kernelName = "ReduceSegments.reduce_segments"; - if( cl->kernelExists( kernelName ) ) { - this->kernel = cl->getKernel( kernelName ); + if(cl->kernelExists(kernelName) ) { + this->kernel = cl->getKernel(kernelName); return; } @@ -54,7 +54,7 @@ ReduceSegments::ReduceSegments( EasyCL *cl ) : // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/reduce_segments.cl", "reduce_segments", 'options' ) + // stringify.write_kernel2("kernel", "cl/reduce_segments.cl", "reduce_segments", 'options') // ]]] // generated using cog, from cl/reduce_segments.cl: const char * kernelSource = @@ -64,18 +64,18 @@ ReduceSegments::ReduceSegments( EasyCL *cl ) : "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "kernel void reduce_segments( const int numSegments, const int segmentLength,\n" - " global float const *in, global float* out ) {\n" + "kernel void reduce_segments(const int numSegments, const int segmentLength,\n" + " global float const *in, global float* out) {\n" " const int globalId = get_global_id(0);\n" " const int segmentId = globalId;\n" "\n" - " if( segmentId >= numSegments ) {\n" + " if (segmentId >= numSegments) {\n" " return;\n" " }\n" "\n" " float sum = 0;\n" " global const float *segment = in + segmentId * segmentLength;\n" - " for( int i = 0; i < segmentLength; i++ ) {\n" + " for (int i = 0; i < segmentLength; i++) {\n" " sum += segment[i];\n" " }\n" " out[segmentId] = sum;\n" @@ -86,6 +86,6 @@ ReduceSegments::ReduceSegments( EasyCL *cl ) : kernel = cl->buildKernelFromString( kernelSource, "reduce_segments", options, "cl/reduce_segments.cl" ); // [[[end]]] - cl->storeKernel( kernelName, kernel, true ); + cl->storeKernel(kernelName, kernel, true); } diff --git a/src/conv/ReduceSegments.h b/src/conv/ReduceSegments.h index 0f67f78e..322eb6d9 100644 --- a/src/conv/ReduceSegments.h +++ b/src/conv/ReduceSegments.h @@ -37,7 +37,7 @@ class DeepCL_EXPORT ReduceSegments { CLWrapper *inputWrapper, CLWrapper *outputWrapper ); - ReduceSegments( EasyCL *cl ); + ReduceSegments(EasyCL *cl); // [[[end]]] }; diff --git a/src/conv/files.txt b/src/conv/files.txt index a0b335a0..a74529ef 100644 --- a/src/conv/files.txt +++ b/src/conv/files.txt @@ -1,3 +1,9 @@ +Im2Col.cpp +BackpropWeightsIm2Col.cpp +BackpropWeightsAuto.cpp +BackwardAuto.cpp +BackwardIm2Col.cpp +ForwardIm2Col.cpp ReduceSegments.cpp AddBias.cpp BackpropWeights.cpp @@ -21,3 +27,4 @@ Forward.cpp ForwardCpu.cpp ForwardFc.cpp LayerDimensions.cpp + diff --git a/src/conv/unused/ForwardFc_workgroupPerFilterPlane.cpp b/src/conv/unused/ForwardFc_workgroupPerFilterPlane.cpp index 
8ff613e6..89e022e6 100644 --- a/src/conv/unused/ForwardFc_workgroupPerFilterPlane.cpp +++ b/src/conv/unused/ForwardFc_workgroupPerFilterPlane.cpp @@ -21,43 +21,43 @@ VIRTUAL ForwardFc_workgroupPerFilterPlane::~ForwardFc_workgroupPerFilterPlane() delete kernel1; delete kernel2; } -VIRTUAL void ForwardFc_workgroupPerFilterPlane::forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, CLWrapper *outputWrapper ) { +VIRTUAL void ForwardFc_workgroupPerFilterPlane::forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, CLWrapper *outputWrapper) { StatefulTimer::timeCheck("ForwardFc_workgroupPerFilterPlane::forward begin"); const int output1Size = batchSize * dim.numFilters * dim.filterSize; float *output1 = new float[ output1Size ]; - CLWrapper *output1Wrapper = cl->wrap( output1Size, output1 ); + CLWrapper *output1Wrapper = cl->wrap(output1Size, output1); output1Wrapper->createOnDevice(); kernel1->in(batchSize); - kernel1->input( dataWrapper ); - kernel1->input( weightsWrapper); - if( dim.biased ) kernel1->input( biasWrapper ); - kernel1->output( output1Wrapper ); - kernel1->localFloats( dim.inputImageSize ); - kernel1->localFloats( batchSize * dim.filterSize ); + kernel1->input(dataWrapper); + kernel1->input(weightsWrapper); + if(dim.biased) kernel1->input(biasWrapper); + kernel1->output(output1Wrapper); + kernel1->localFloats(dim.inputSize); + kernel1->localFloats(batchSize * dim.filterSize); int workgroupSize = dim.numFilters; int numWorkgroups = dim.filterSize; int globalSize = workgroupSize * numWorkgroups; ///// cout << "forward3 numworkgroups " << numWorkgroups << " globalsize " << globalSize << " workgroupsize " << workgroupsize << endl; - kernel1->run_1d( globalSize, workgroupSize ); + kernel1->run_1d(globalSize, workgroupSize); cl->finish(); StatefulTimer::timeCheck("ForwardFc_workgroupPerFilterPlane::forward after first kernel"); // now reduce again... 
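// [reviewer note, not part of the patch] both FC forward paths finish with plain segment
// sums, so a commented CPU reference of the ReduceSegments contract may help; names here are
// illustrative, and totalLength must be an exact multiple of segmentLength:
//   void reduceSegmentsCpu(int totalLength, int segmentLength, const float *in, float *out) {
//       int numSegments = totalLength / segmentLength;
//       for(int s = 0; s < numSegments; s++) {
//           float sum = 0;
//           for(int i = 0; i < segmentLength; i++) {
//               sum += in[s * segmentLength + i];   // sum one contiguous segment
//           }
//           out[s] = sum;   // one partial result per segment
//       }
//   }
// ForwardFc applies this twice: output1, laid out as [n][filter][inputplane][filterrow], is
// reduced over filterrow (segmentLength=filterSize), then over inputplane, leaving [n][filter].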
- kernel2->in(batchSize)->in( output1Wrapper )->out( outputWrapper ); + kernel2->in(batchSize)->in(output1Wrapper)->out(outputWrapper); int maxWorkgroupSize = cl->getMaxWorkgroupSize(); - numWorkgroups = ( batchSize * dim.numFilters + maxWorkgroupSize - 1 ) / maxWorkgroupSize; - kernel2->run_1d( numWorkgroups * maxWorkgroupSize, maxWorkgroupSize ); + numWorkgroups = (batchSize * dim.numFilters + maxWorkgroupSize - 1) / maxWorkgroupSize; + kernel2->run_1d(numWorkgroups * maxWorkgroupSize, maxWorkgroupSize); cl->finish(); delete output1Wrapper; delete[] output1; StatefulTimer::timeCheck("ForwardFc_workgroupPerFilterPlane::forward end"); } -ForwardFc_workgroupPerFilterPlane::ForwardFc_workgroupPerFilterPlane( EasyCL *cl, LayerDimensions dim ) : - Forward( cl, dim ) +ForwardFc_workgroupPerFilterPlane::ForwardFc_workgroupPerFilterPlane(EasyCL *cl, LayerDimensions dim) : + Forward(cl, dim) { std::string options = ""; // "-D " + fn->getDefineName(); @@ -65,8 +65,8 @@ ForwardFc_workgroupPerFilterPlane::ForwardFc_workgroupPerFilterPlane( EasyCL *cl // [[[cog // import stringify - // stringify.write_kernel2( "kernel1", "cl/forward_fc_wgperrow.cl", "forward_fc_workgroup_perrow", 'options' ) - // stringify.write_kernel2( "kernel2", "cl/forward_fc.cl", "reduce_rows", 'options' ) + // stringify.write_kernel2("kernel1", "cl/forward_fc_wgperrow.cl", "forward_fc_workgroup_perrow", 'options') + // stringify.write_kernel2("kernel2", "cl/forward_fc.cl", "reduce_rows", 'options') // ]]] // generated using cog, from cl/forward_fc_wgperrow.cl: const char * kernel1Source = @@ -76,11 +76,11 @@ ForwardFc_workgroupPerFilterPlane::ForwardFc_workgroupPerFilterPlane( EasyCL *cl "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "void copyLocal( local float *restrict target, global float const *restrict source, int N ) {\n" - " int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyLocal(local float *restrict target, global float const *restrict source, int N) {\n" + " int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);\n" + " for(int loop = 0; loop < numLoops; loop++) {\n" " int offset = loop * get_local_size(0) + get_local_id(0);\n" - " if( offset < N ) {\n" + " if(offset < N) {\n" " target[offset] = source[offset];\n" " }\n" " }\n" @@ -113,15 +113,15 @@ ForwardFc_workgroupPerFilterPlane::ForwardFc_workgroupPerFilterPlane( EasyCL *cl "// filtersize == inputimagesize (mandatory)\n" "// inputimagesize == 19\n" "// filtersize == 19\n" - "// outputImageSize == 1\n" + "// outputSize == 1\n" "// lots of outplanes/filters, hundreds, but less than max work groupsize, eg 350, 500, 361\n" "// lots of inplanes, eg 32-128\n" "// inputimagesize around 19, not too small\n" - "#if (gFilterSize == gInputImageSize) && (gPadZeros == 0)\n" - "void kernel forward_fc_workgroup_perrow( const int batchSize,\n" + "#if (gFilterSize == gInputSize) && (gPadZeros == 0)\n" + "void kernel forward_fc_workgroup_perrow(const int batchSize,\n" " global const float *images, global const float *filters,\n" " global float *output1,\n" - " local float *_imageRow, local float *_filterRows ) {\n" + " local float *_imageRow, local float *_filterRows) {\n" " const int globalId = get_global_id(0);\n" "\n" " const int workgroupId = get_group_id(0);\n" @@ -139,32 +139,32 @@ ForwardFc_workgroupPerFilterPlane::ForwardFc_workgroupPerFilterPlane( EasyCL *cl " + inputPlaneId * 
gFilterSizeSquared\n" " + filterRowId * gFilterSize;\n" " local float *_threadFilterRow = _filterRows + localId * gFilterSize;\n" - " for( int i = 0; i < gFilterSize; i++ ) {\n" + " for(int i = 0; i < gFilterSize; i++) {\n" " _threadFilterRow[i] = filterRow[i];\n" " }\n" - " const int loopsPerExample = ( gInputImageSize + workgroupSize - 1 ) / workgroupSize;\n" + " const int loopsPerExample = (gInputSize + workgroupSize - 1) / workgroupSize;\n" " // now loop over examples...\n" - " for( int n = 0; n < batchSize; n++ ) {\n" + " for(int n = 0; n < batchSize; n++) {\n" " // copy down example row, which is global to all threads in workgroup\n" " // hopefully should be enough threads....\n" " // but we should check anyway really, since depends on number of filters configured,\n" " // not on relative size of filter and input image\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - " copyLocal( _imageRow, images\n" - " + ( ( n\n" - " * gNumInputPlanes + inputPlaneId )\n" - " * gInputImageSize + filterRowId )\n" - " * gInputImageSize,\n" - " gInputImageSize );\n" + " copyLocal(_imageRow, images\n" + " + (( n\n" + " * gNumInputPlanes + inputPlaneId)\n" + " * gInputSize + filterRowId)\n" + " * gInputSize,\n" + " gInputSize);\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " // add up the values in our row...\n" " float sum = 0;\n" - " for( int filterCol = 0; filterCol < gFilterSize; filterCol++ ) {\n" + " for(int filterCol = 0; filterCol < gFilterSize; filterCol++) {\n" " sum += _imageRow[ filterCol ] * _threadFilterRow[ filterCol ];\n" " }\n" " // note: dont activate yet, since need to reduce again\n" " // output structured as: [n][filter][inputplane][filterrow], need to reduce again after\n" - " if( localId < gNumFilters ) {\n" + " if(localId < gNumFilters) {\n" " output1[ n * gNumInputPlanes * gNumFilters * gFilterSize\n" " + inputPlaneId * gFilterSize\n" " + filterId * gNumInputPlanes * gFilterSize + filterRowId ] = sum;\n" @@ -174,7 +174,7 @@ ForwardFc_workgroupPerFilterPlane::ForwardFc_workgroupPerFilterPlane( EasyCL *cl "#endif\n" "\n" ""; - kernel1 = cl->buildKernelFromString( kernel1Source, "forward_fc_workgroup_perrow", options, "cl/forward_fc_wgperrow.cl" ); + kernel1 = cl->buildKernelFromString(kernel1Source, "forward_fc_workgroup_perrow", options, "cl/forward_fc_wgperrow.cl"); // generated using cog, from cl/forward_fc.cl: const char * kernel2Source = "// Copyright Hugh Perkins 2014, 2015 hughperkins at gmail\n" @@ -190,15 +190,15 @@ ForwardFc_workgroupPerFilterPlane::ForwardFc_workgroupPerFilterPlane( EasyCL *cl "// this kernel assumes:\n" "// padzeros == 0 (mandatory)\n" "// filtersize == inputimagesize (mandatory)\n" - "// outputImageSize == 1\n" + "// outputSize == 1\n" "// lots of outplanes, hundreds, but less than max work groupsize, eg 350, 500, 361\n" "// lots of inplanes, eg 32\n" "// inputimagesize around 19, not too small\n" - "#if gFilterSize == gInputImageSize && gPadZeros == 0\n" - "void kernel forward_filter_matches_inimage( const int batchSize,\n" + "#if gFilterSize == gInputSize && gPadZeros == 0\n" + "void kernel forward_filter_matches_inimage(const int batchSize,\n" " global const float *images, global const float *filters,\n" " global float *output,\n" - " local float *_upstreamImage, local float *_filterImage ) {\n" + " local float *_upstreamImage, local float *_filterImage) {\n" " const int globalId = get_global_id(0);\n" "\n" " const int workgroupId = get_group_id(0);\n" @@ -211,38 +211,38 @@ ForwardFc_workgroupPerFilterPlane::ForwardFc_workgroupPerFilterPlane( EasyCL *cl " const int 
filterCol = localId % gFilterSize;\n" "\n" " float sum = 0;\n" - " for( int upstreamPlane = 0; upstreamPlane < gUpstreamNumPlanes; upstreamPlane++ ) {\n" - " int thisUpstreamImageOffset = ( n * gUpstreamNumPlanes + upstreamPlane ) * gUpstreamImageSizeSquared;\n" + " for(int upstreamPlane = 0; upstreamPlane < gUpstreamNumPlanes; upstreamPlane++) {\n" + " int thisUpstreamImageOffset = (n * gUpstreamNumPlanes + upstreamPlane) * gUpstreamImageSizeSquared;\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - " for( int i = 0; i < numUpstreamsPerThread; i++ ) {\n" + " for(int i = 0; i < numUpstreamsPerThread; i++) {\n" " int thisOffset = workgroupSize * i + localId;\n" - " if( thisOffset < gUpstreamImageSizeSquared ) {\n" + " if(thisOffset < gUpstreamImageSizeSquared) {\n" " _upstreamImage[ thisOffset ] = images[ thisUpstreamImageOffset + thisOffset ];\n" " }\n" " }\n" - " const int filterGlobalOffset = ( outPlane * gUpstreamNumPlanes + upstreamPlane ) * gFilterSizeSquared;\n" - " for( int i = 0; i < numFilterPixelsPerThread; i++ ) {\n" + " const int filterGlobalOffset = (outPlane * gUpstreamNumPlanes + upstreamPlane) * gFilterSizeSquared;\n" + " for(int i = 0; i < numFilterPixelsPerThread; i++) {\n" " int thisOffset = workgroupSize * i + localId;\n" - " if( thisOffset < gFilterSizeSquared ) {\n" + " if(thisOffset < gFilterSizeSquared) {\n" " _filterCube[thisOffset] = filters[filterGlobalOffset + thisOffset];\n" " }\n" " }\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - " if( localId < gOutImageSizeSquared ) {\n" - " for( int u = minu; u <= maxu; u++ ) {\n" - " int inputRow = outputRow + u + ( gPadZeros ? 0 : gHalfFilterSize );\n" + " if(localId < gOutImageSizeSquared) {\n" + " for(int u = minu; u <= maxu; u++) {\n" + " int inputRow = outputRow + u + (gPadZeros ? 0 : gHalfFilterSize);\n" " int inputimagerowoffset = inputRow * gUpstreamImageSize;\n" " int filterrowoffset = (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;\n" - " for( int v = minv; v <= maxv; v++ ) {\n" - " int inputCol = outputCol + v + ( gPadZeros ? 0 : gHalfFilterSize );\n" + " for(int v = minv; v <= maxv; v++) {\n" + " int inputCol = outputCol + v + (gPadZeros ? 
0 : gHalfFilterSize);\n" " sum += _upstreamImage[ inputimagerowoffset + inputCol] * _filterCube[ filterrowoffset + v ];\n" " }\n" " }\n" " }\n" " }\n" " // output are organized like [imageid][filterid][row][col]\n" - " int resultIndex = ( n * gNumOutPlanes + outPlane ) * gOutImageSizeSquared + localId;\n" - " if( localId < gOutImageSizeSquared ) {\n" + " int resultIndex = (n * gNumOutPlanes + outPlane) * gOutImageSizeSquared + localId;\n" + " if(localId < gOutImageSizeSquared) {\n" " output[resultIndex ] = sum;\n" " }\n" "}\n" @@ -251,7 +251,7 @@ ForwardFc_workgroupPerFilterPlane::ForwardFc_workgroupPerFilterPlane( EasyCL *cl "\n" "\n" ""; - kernel2 = cl->buildKernelFromString( kernel2Source, "reduce_rows", options, "cl/forward_fc.cl" ); + kernel2 = cl->buildKernelFromString(kernel2Source, "reduce_rows", options, "cl/forward_fc.cl"); // [[[end]]] } diff --git a/src/conv/unused/ForwardFc_workgroupPerFilterPlane.h b/src/conv/unused/ForwardFc_workgroupPerFilterPlane.h index 0e784f4c..bdc65eb3 100644 --- a/src/conv/unused/ForwardFc_workgroupPerFilterPlane.h +++ b/src/conv/unused/ForwardFc_workgroupPerFilterPlane.h @@ -19,8 +19,8 @@ class ForwardFc_workgroupPerFilterPlane : public Forward { // ]]] // generated, using cog: VIRTUAL ~ForwardFc_workgroupPerFilterPlane(); - VIRTUAL void forward( int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, CLWrapper *outputWrapper ); - ForwardFc_workgroupPerFilterPlane( EasyCL *cl, LayerDimensions dim ); + VIRTUAL void forward(int batchSize, CLWrapper *dataWrapper, CLWrapper *weightsWrapper, CLWrapper *biasWrapper, CLWrapper *outputWrapper); + ForwardFc_workgroupPerFilterPlane(EasyCL *cl, LayerDimensions dim); // [[[end]]] }; diff --git a/src/dropout/DropoutBackward.cpp b/src/dropout/DropoutBackward.cpp index f7f63f8f..2d3c3bed 100644 --- a/src/dropout/DropoutBackward.cpp +++ b/src/dropout/DropoutBackward.cpp @@ -23,59 +23,59 @@ using namespace std; #undef STATIC #define STATIC -STATIC DropoutBackward *DropoutBackward::instance( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ) { - return new DropoutBackwardGpuNaive( cl, numPlanes, inputImageSize, dropRatio ); +STATIC DropoutBackward *DropoutBackward::instance(EasyCL *cl, int numPlanes, int inputSize, float dropRatio) { + return new DropoutBackwardGpuNaive(cl, numPlanes, inputSize, dropRatio); } -STATIC DropoutBackward *DropoutBackward::instanceForTest( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio) { - return new DropoutBackwardGpuNaive( cl, numPlanes, inputImageSize, dropRatio ); +STATIC DropoutBackward *DropoutBackward::instanceForTest(EasyCL *cl, int numPlanes, int inputSize, float dropRatio) { + return new DropoutBackwardGpuNaive(cl, numPlanes, inputSize, dropRatio); } -STATIC DropoutBackward *DropoutBackward::instanceSpecific( int idx, EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ) { - if( idx == 0 ) { - return new DropoutBackwardCpu( cl, numPlanes, inputImageSize, dropRatio ); +STATIC DropoutBackward *DropoutBackward::instanceSpecific(int idx, EasyCL *cl, int numPlanes, int inputSize, float dropRatio) { + if(idx == 0) { + return new DropoutBackwardCpu(cl, numPlanes, inputSize, dropRatio); } - if( idx == 1 ) { - return new DropoutBackwardGpuNaive( cl, numPlanes, inputImageSize, dropRatio ); + if(idx == 1) { + return new DropoutBackwardGpuNaive(cl, numPlanes, inputSize, dropRatio); } - throw runtime_error("DropoutBackward::instanceSpecific, idx not known: " + toString( idx ) ); + throw 
runtime_error("DropoutBackward::instanceSpecific, idx not known: " + toString(idx) ); } -DropoutBackward::DropoutBackward( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ) : - cl( cl ), - numPlanes( numPlanes ), - inputImageSize( inputImageSize ), - dropRatio( dropRatio ), -// dropoutSizeSquared( dropoutSize * dropoutSize ), - outputImageSize( inputImageSize ) { -// if( inputImageSize % dropoutSize != 0 ) { -// throw runtime_error("inputImageSize should be an exact multiple of dropoutsize: " + toString( inputImageSize ) + " " + toString(dropoutSize ) ); +DropoutBackward::DropoutBackward(EasyCL *cl, int numPlanes, int inputSize, float dropRatio) : + cl(cl), + numPlanes(numPlanes), + inputSize(inputSize), + dropRatio(dropRatio), +// dropoutSizeSquared(dropoutSize * dropoutSize), + outputSize(inputSize) { +// if(inputSize % dropoutSize != 0) { +// throw runtime_error("inputSize should be an exact multiple of dropoutsize: " + toString(inputSize) + " " + toString(dropoutSize) ); // } } -VIRTUAL int DropoutBackward::getInputSize( int batchSize ) { - return batchSize * numPlanes * inputImageSize * inputImageSize; +VIRTUAL int DropoutBackward::getInputNumElements(int batchSize) { + return batchSize * numPlanes * inputSize * inputSize; } -VIRTUAL int DropoutBackward::getOutputSize(int batchSize) { - return batchSize * numPlanes * outputImageSize * outputImageSize; +VIRTUAL int DropoutBackward::getOutputNumElements(int batchSize) { + return batchSize * numPlanes * outputSize * outputSize; } -VIRTUAL void DropoutBackward::backward( int batchSize, uchar *mask, float *gradOutput, float *gradInput ) { -// cout << "DropoutBackward::backward( float * )" << endl; - StatefulTimer::instance()->timeCheck("DropoutBackward::backward float->wrapper start" ); - CLWrapper *maskWrapper = cl->wrap( getOutputSize(batchSize), mask ); - CLWrapper *gradOutputWrapper = cl->wrap( getOutputSize(batchSize), gradOutput ); - CLWrapper *gradInputWrapper = cl->wrap( getInputSize(batchSize), gradInput ); +VIRTUAL void DropoutBackward::backward(int batchSize, uchar *mask, float *gradOutput, float *gradInput) { +// cout << "DropoutBackward::backward(float *)" << endl; + StatefulTimer::instance()->timeCheck("DropoutBackward::backward float->wrapper start"); + CLWrapper *maskWrapper = cl->wrap(getOutputNumElements(batchSize), mask); + CLWrapper *gradOutputWrapper = cl->wrap(getOutputNumElements(batchSize), gradOutput); + CLWrapper *gradInputWrapper = cl->wrap(getInputNumElements(batchSize), gradInput); maskWrapper->copyToDevice(); gradOutputWrapper->copyToDevice(); gradInputWrapper->createOnDevice(); - backward( batchSize, maskWrapper, gradOutputWrapper, gradInputWrapper ); + backward(batchSize, maskWrapper, gradOutputWrapper, gradInputWrapper); gradInputWrapper->copyToHost(); delete maskWrapper; delete gradOutputWrapper; delete gradInputWrapper; - StatefulTimer::instance()->timeCheck("DropoutBackward::backward float->wrapper end" ); + StatefulTimer::instance()->timeCheck("DropoutBackward::backward float->wrapper end"); } -VIRTUAL void DropoutBackward::backward( int batchSize, CLWrapper *maskWrapper, CLWrapper *gradOutputWrapper, CLWrapper *gradInputWrapper ) { - throw runtime_error("DropoutBackward::backward wrappers not implemented" ); +VIRTUAL void DropoutBackward::backward(int batchSize, CLWrapper *maskWrapper, CLWrapper *gradOutputWrapper, CLWrapper *gradInputWrapper) { + throw runtime_error("DropoutBackward::backward wrappers not implemented"); } diff --git a/src/dropout/DropoutBackward.h 
b/src/dropout/DropoutBackward.h index 98c0066f..6821d989 100644 --- a/src/dropout/DropoutBackward.h +++ b/src/dropout/DropoutBackward.h @@ -19,23 +19,23 @@ class DeepCL_EXPORT DropoutBackward { EasyCL *cl; const int numPlanes; - const int inputImageSize; + const int inputSize; const float dropRatio; - const int outputImageSize; + const int outputSize; virtual ~DropoutBackward() {} - inline int getInputIndex( int n, int plane, int row, int col ) { - return ( ( n - * numPlanes + plane ) - * inputImageSize + row ) - * inputImageSize + col; + inline int getInputIndex(int n, int plane, int row, int col) { + return (( n + * numPlanes + plane) + * inputSize + row) + * inputSize + col; } - inline int getResultIndex( int n, int plane, int row, int col ) { - return ( ( n - * numPlanes + plane ) - * outputImageSize + row ) - * outputImageSize + col; + inline int getResultIndex(int n, int plane, int row, int col) { + return (( n + * numPlanes + plane) + * outputSize + row) + * outputSize + col; } // [[[cog @@ -43,14 +43,14 @@ class DeepCL_EXPORT DropoutBackward { // cog_addheaders.add() // ]]] // generated, using cog: - STATIC DropoutBackward *instance( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ); - STATIC DropoutBackward *instanceForTest( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio); - STATIC DropoutBackward *instanceSpecific( int idx, EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ); - DropoutBackward( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ); - VIRTUAL int getInputSize( int batchSize ); - VIRTUAL int getOutputSize(int batchSize); - VIRTUAL void backward( int batchSize, uchar *mask, float *gradOutput, float *gradInput ); - VIRTUAL void backward( int batchSize, CLWrapper *maskWrapper, CLWrapper *gradOutputWrapper, CLWrapper *gradInputWrapper ); + STATIC DropoutBackward *instance(EasyCL *cl, int numPlanes, int inputSize, float dropRatio); + STATIC DropoutBackward *instanceForTest(EasyCL *cl, int numPlanes, int inputSize, float dropRatio); + STATIC DropoutBackward *instanceSpecific(int idx, EasyCL *cl, int numPlanes, int inputSize, float dropRatio); + DropoutBackward(EasyCL *cl, int numPlanes, int inputSize, float dropRatio); + VIRTUAL int getInputNumElements(int batchSize); + VIRTUAL int getOutputNumElements(int batchSize); + VIRTUAL void backward(int batchSize, uchar *mask, float *gradOutput, float *gradInput); + VIRTUAL void backward(int batchSize, CLWrapper *maskWrapper, CLWrapper *gradOutputWrapper, CLWrapper *gradInputWrapper); // [[[end]]] }; diff --git a/src/dropout/DropoutBackwardCpu.cpp b/src/dropout/DropoutBackwardCpu.cpp index 36063f23..c1350b84 100644 --- a/src/dropout/DropoutBackwardCpu.cpp +++ b/src/dropout/DropoutBackwardCpu.cpp @@ -21,34 +21,34 @@ using namespace std; #undef STATIC #define STATIC -DropoutBackwardCpu::DropoutBackwardCpu( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ) : - DropoutBackward( cl, numPlanes, inputImageSize, dropRatio ) { +DropoutBackwardCpu::DropoutBackwardCpu(EasyCL *cl, int numPlanes, int inputSize, float dropRatio) : + DropoutBackward(cl, numPlanes, inputSize, dropRatio) { } -VIRTUAL void DropoutBackwardCpu::backward( int batchSize, uchar *mask, float *gradOutput, float *gradInput ) { - int totalLinearSize = batchSize * numPlanes * inputImageSize * inputImageSize; - for( int i = 0; i < totalLinearSize; i++ ) { +VIRTUAL void DropoutBackwardCpu::backward(int batchSize, uchar *mask, float *gradOutput, float *gradInput) { + int totalLinearSize = batchSize * 
numPlanes * inputSize * inputSize; + for(int i = 0; i < totalLinearSize; i++) { gradInput[i] = mask[i] == 1 ? gradOutput[i] : 0.0f; } } -VIRTUAL void DropoutBackwardCpu::backward( int batchSize, CLWrapper *maskWrapper, CLWrapper *gradOutputWrapper, - CLWrapper *gradInputWrapper ) { - StatefulTimer::instance()->timeCheck("DropoutBackwardCpu::backward start" ); +VIRTUAL void DropoutBackwardCpu::backward(int batchSize, CLWrapper *maskWrapper, CLWrapper *gradOutputWrapper, + CLWrapper *gradInputWrapper) { + StatefulTimer::instance()->timeCheck("DropoutBackwardCpu::backward start"); maskWrapper->copyToHost(); gradOutputWrapper->copyToHost(); - uchar *mask = reinterpret_cast( maskWrapper->getHostArray() ); - float *gradOutput = reinterpret_cast( gradOutputWrapper->getHostArray() ); - float *gradInput = new float[ getInputSize( batchSize ) ]; + uchar *mask = reinterpret_cast(maskWrapper->getHostArray()); + float *gradOutput = reinterpret_cast(gradOutputWrapper->getHostArray()); + float *gradInput = new float[ getInputNumElements(batchSize) ]; - backward( batchSize, mask, gradOutput, gradInput ); + backward(batchSize, mask, gradOutput, gradInput); - float *gradInputHostArray = reinterpret_cast( gradInputWrapper->getHostArray() ); - memcpy( gradInputHostArray, gradInput, sizeof(float) * getInputSize( batchSize ) ); + float *gradInputHostArray = reinterpret_cast(gradInputWrapper->getHostArray()); + memcpy(gradInputHostArray, gradInput, sizeof(float) * getInputNumElements(batchSize) ); gradInputWrapper->copyToDevice(); delete[] gradInput; - StatefulTimer::instance()->timeCheck("DropoutBackwardCpu::backward end" ); + StatefulTimer::instance()->timeCheck("DropoutBackwardCpu::backward end"); } diff --git a/src/dropout/DropoutBackwardCpu.h b/src/dropout/DropoutBackwardCpu.h index 806c982c..9a5d38d6 100644 --- a/src/dropout/DropoutBackwardCpu.h +++ b/src/dropout/DropoutBackwardCpu.h @@ -19,10 +19,10 @@ class DropoutBackwardCpu : public DropoutBackward { // cog_addheaders.add() // ]]] // generated, using cog: - DropoutBackwardCpu( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ); - VIRTUAL void backward( int batchSize, uchar *mask, float *gradOutput, float *gradInput ); - VIRTUAL void backward( int batchSize, CLWrapper *maskWrapper, CLWrapper *gradOutputWrapper, - CLWrapper *gradInputWrapper ); + DropoutBackwardCpu(EasyCL *cl, int numPlanes, int inputSize, float dropRatio); + VIRTUAL void backward(int batchSize, uchar *mask, float *gradOutput, float *gradInput); + VIRTUAL void backward(int batchSize, CLWrapper *maskWrapper, CLWrapper *gradOutputWrapper, + CLWrapper *gradInputWrapper); // [[[end]]] }; diff --git a/src/dropout/DropoutBackwardGpuNaive.cpp b/src/dropout/DropoutBackwardGpuNaive.cpp index 1803f74d..73ea68d9 100644 --- a/src/dropout/DropoutBackwardGpuNaive.cpp +++ b/src/dropout/DropoutBackwardGpuNaive.cpp @@ -30,45 +30,45 @@ VIRTUAL void DropoutBackwardGpuNaive::backward( int batchSize, CLWrapper *maskWrapper, CLWrapper *gradOutputWrapper, - CLWrapper *gradInputWrapper ) + CLWrapper *gradInputWrapper) { - StatefulTimer::instance()->timeCheck("DropoutBackwardGpuNaive::backward start" ); + StatefulTimer::instance()->timeCheck("DropoutBackwardGpuNaive::backward start"); // first, memset errors to 0 ... 
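// [reviewer note, not part of the patch] the kMemset below can stay commented out because
// the backpropNaive kernel writes all N output elements itself, storing 0 wherever the mask
// is 0, so no separate clear pass is needed. The rule both the CPU and GPU paths implement:
//   gradInput[i] = (mask[i] == 1) ? gradOutput[i] : 0.0f;
// e.g. mask {1,0,1}, gradOutput {0.5, 2.0, -1.0} -> gradInput {0.5, 0.0, -1.0}: dropped
// units pass no gradient back, mirroring the forward pass that zeroed their activations.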
-// kMemset ->out( gradInputWrapper ) -// ->in( 0.0f ) -// ->in( batchSize * numPlanes * inputImageSize * inputImageSize ); -// int globalSize = batchSize * numPlanes * inputImageSize * inputImageSize; +// kMemset ->out(gradInputWrapper) +// ->in(0.0f) +// ->in(batchSize * numPlanes * inputSize * inputSize); +// int globalSize = batchSize * numPlanes * inputSize * inputSize; // int workgroupSize = 64; -// int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; -// kMemset->run_1d( numWorkgroups * workgroupSize, workgroupSize ); +// int numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; +// kMemset->run_1d(numWorkgroups * workgroupSize, workgroupSize); // cl->finish(); - kernel ->in( batchSize * numPlanes * outputImageSize * outputImageSize ) - ->in( maskWrapper ) - ->in( gradOutputWrapper ) - ->out( gradInputWrapper ); - int globalSize = batchSize * numPlanes * outputImageSize * outputImageSize; + kernel ->in(batchSize * numPlanes * outputSize * outputSize) + ->in(maskWrapper) + ->in(gradOutputWrapper) + ->out(gradInputWrapper); + int globalSize = batchSize * numPlanes * outputSize * outputSize; int workgroupSize = 64; - int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); + int numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; + kernel->run_1d(numWorkgroups * workgroupSize, workgroupSize); cl->finish(); - StatefulTimer::instance()->timeCheck("DropoutBackwardGpuNaive::backward end" ); + StatefulTimer::instance()->timeCheck("DropoutBackwardGpuNaive::backward end"); } -DropoutBackwardGpuNaive::DropoutBackwardGpuNaive( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ) : - DropoutBackward( cl, numPlanes, inputImageSize, dropRatio ) { +DropoutBackwardGpuNaive::DropoutBackwardGpuNaive(EasyCL *cl, int numPlanes, int inputSize, float dropRatio) : + DropoutBackward(cl, numPlanes, inputSize, dropRatio) { // std::string options = "-D " + fn->getDefineName(); string options = ""; - options += " -D gNumPlanes=" + toString( numPlanes ); - options += " -D gInputImageSize=" + toString( inputImageSize ); - options += " -D gInputImageSizeSquared=" + toString( inputImageSize * inputImageSize ); - options += " -D gOutputImageSize=" + toString( outputImageSize ); - options += " -D gOutputImageSizeSquared=" + toString( outputImageSize * outputImageSize ); + options += " -D gNumPlanes=" + toString(numPlanes); + options += " -D gInputSize=" + toString(inputSize); + options += " -D gInputSizeSquared=" + toString(inputSize * inputSize); + options += " -D gOutputSize=" + toString(outputSize); + options += " -D gOutputSizeSquared=" + toString(outputSize * outputSize); // float inverseDropRatio = 1.0f / dropRatio; - string dropRatioString = toString( dropRatio ); - if( dropRatioString.find( "." 
) == string::npos ) { + string dropRatioString = toString(dropRatio); + if(dropRatioString.find(".") == string::npos) { dropRatioString += ".0f"; } else { dropRatioString += "f"; @@ -78,8 +78,8 @@ DropoutBackwardGpuNaive::DropoutBackwardGpuNaive( EasyCL *cl, int numPlanes, int // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/dropout.cl", "backpropNaive", 'options' ) - // # stringify.write_kernel2( "kMemset", "cl/memset.cl", "memset", '""' ) + // stringify.write_kernel2("kernel", "cl/dropout.cl", "backpropNaive", 'options') + // # stringify.write_kernel2("kMemset", "cl/memset.cl", "memset", '""') // ]]] // generated using cog, from cl/dropout.cl: const char * kernelSource = @@ -93,9 +93,9 @@ DropoutBackwardGpuNaive::DropoutBackwardGpuNaive( EasyCL *cl, int numPlanes, int " const int N,\n" " global const unsigned char *mask,\n" " global const float *input,\n" - " global float *output ) {\n" + " global float *output) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " output[globalId] = mask[globalId] == 1 ? input[globalId] : 0.0f;\n" @@ -107,7 +107,7 @@ DropoutBackwardGpuNaive::DropoutBackwardGpuNaive( EasyCL *cl, int numPlanes, int " global const float *gradOutput,\n" " global float *output) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " output[globalId] = mask[globalId] == 1 ? gradOutput[globalId] : 0.0f;\n" diff --git a/src/dropout/DropoutBackwardGpuNaive.h b/src/dropout/DropoutBackwardGpuNaive.h index 72d4e281..d2d218d6 100644 --- a/src/dropout/DropoutBackwardGpuNaive.h +++ b/src/dropout/DropoutBackwardGpuNaive.h @@ -26,8 +26,8 @@ class DropoutBackwardGpuNaive : public DropoutBackward { int batchSize, CLWrapper *maskWrapper, CLWrapper *gradOutputWrapper, - CLWrapper *gradInputWrapper ); - DropoutBackwardGpuNaive( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ); + CLWrapper *gradInputWrapper); + DropoutBackwardGpuNaive(EasyCL *cl, int numPlanes, int inputSize, float dropRatio); // [[[end]]] }; diff --git a/src/dropout/DropoutForward.cpp b/src/dropout/DropoutForward.cpp index a3158298..745f1eb5 100644 --- a/src/dropout/DropoutForward.cpp +++ b/src/dropout/DropoutForward.cpp @@ -20,57 +20,57 @@ using namespace std; #undef STATIC #define STATIC -DropoutForward::DropoutForward( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ) : - cl( cl ), - numPlanes( numPlanes ), - inputImageSize( inputImageSize ), - dropRatio( dropRatio ), - outputImageSize( inputImageSize ) { -// if( inputImageSize % dropoutSize != 0 ) { -// throw runtime_error("inputImageSize should be an exact multiple of dropoutsize: " + toString( inputImageSize ) + " " + toString(dropoutSize ) ); +DropoutForward::DropoutForward(EasyCL *cl, int numPlanes, int inputSize, float dropRatio) : + cl(cl), + numPlanes(numPlanes), + inputSize(inputSize), + dropRatio(dropRatio), + outputSize(inputSize) { +// if(inputSize % dropoutSize != 0) { +// throw runtime_error("inputSize should be an exact multiple of dropoutsize: " + toString(inputSize) + " " + toString(dropoutSize) ); // } } -STATIC DropoutForward *DropoutForward::instance( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ) { - return new DropoutForwardGpuNaive( cl, numPlanes, inputImageSize, dropRatio ); -// return new DropoutForwardCpu( cl, padZeros, numPlanes, inputImageSize, dropoutSize ); +STATIC DropoutForward *DropoutForward::instance(EasyCL *cl, int 
numPlanes, int inputImageSize, float dropRatio ) { - return new DropoutForwardGpuNaive( cl, numPlanes, inputImageSize, dropRatio ); -// return new DropoutForwardCpu( cl, padZeros, numPlanes, inputImageSize, dropoutSize ); +STATIC DropoutForward *DropoutForward::instance(EasyCL *cl, int numPlanes, int inputSize, float dropRatio) { + return new DropoutForwardGpuNaive(cl, numPlanes, inputSize, dropRatio); +// return new DropoutForwardCpu(cl, padZeros, numPlanes, inputSize, dropoutSize); } -STATIC DropoutForward *DropoutForward::instanceForTest( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ) { - return new DropoutForwardCpu( cl, numPlanes, inputImageSize, dropRatio ); +STATIC DropoutForward *DropoutForward::instanceForTest(EasyCL *cl, int numPlanes, int inputSize, float dropRatio) { + return new DropoutForwardCpu(cl, numPlanes, inputSize, dropRatio); } -STATIC DropoutForward *DropoutForward::instanceSpecific( int idx, EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ) { - if( idx == 0 ) { - return new DropoutForwardCpu( cl, numPlanes, inputImageSize, dropRatio ); +STATIC DropoutForward *DropoutForward::instanceSpecific(int idx, EasyCL *cl, int numPlanes, int inputSize, float dropRatio) { + if(idx == 0) { + return new DropoutForwardCpu(cl, numPlanes, inputSize, dropRatio); } - if( idx == 1 ) { - return new DropoutForwardGpuNaive( cl, numPlanes, inputImageSize, dropRatio ); + if(idx == 1) { + return new DropoutForwardGpuNaive(cl, numPlanes, inputSize, dropRatio); } cout << "idx " << idx << " not known" << endl; - throw runtime_error("DropoutForward::instanceSpecific idx not known: " + toString( idx ) ); + throw runtime_error("DropoutForward::instanceSpecific idx not known: " + toString(idx) ); } -VIRTUAL void DropoutForward::forward( int batchSize, CLWrapper *masksWrapper, CLWrapper *inputData, CLWrapper *outputData ) { +VIRTUAL void DropoutForward::forward(int batchSize, CLWrapper *masksWrapper, CLWrapper *inputData, CLWrapper *outputData) { throw runtime_error("forward not implemented for this child type"); } -VIRTUAL void DropoutForward::forward( int batchSize, unsigned char *masks, float *input, float *output ) { -// cout << "DropoutForward::forward( float * )" << endl; - int inputLinearSize = getInputSize( batchSize ); - CLWrapper *masksWrapper = cl->wrap( inputLinearSize, masks ); - CLWrapper *inputWrapper = cl->wrap( inputLinearSize, input ); - CLWrapper *outputWrapper = cl->wrap( getOutputSize( batchSize ), output ); +VIRTUAL void DropoutForward::forward(int batchSize, unsigned char *masks, float *input, float *output) { +// cout << "DropoutForward::forward(float *)" << endl; + int inputLinearSize = getInputNumElements(batchSize); + CLWrapper *masksWrapper = cl->wrap(inputLinearSize, masks); + CLWrapper *inputWrapper = cl->wrap(inputLinearSize, input); + CLWrapper *outputWrapper = cl->wrap(getOutputNumElements(batchSize), output); masksWrapper->copyToDevice(); inputWrapper->copyToDevice(); - forward( batchSize, masksWrapper, inputWrapper, outputWrapper ); + forward(batchSize, masksWrapper, inputWrapper, outputWrapper); outputWrapper->copyToHost(); delete outputWrapper; delete inputWrapper; delete masksWrapper; } -VIRTUAL int DropoutForward::getInputSize( int batchSize ) { - return batchSize * numPlanes * inputImageSize * inputImageSize; +VIRTUAL int DropoutForward::getInputNumElements(int batchSize) { + return batchSize * numPlanes * inputSize * inputSize; } -VIRTUAL int DropoutForward::getOutputSize(int batchSize) { - return batchSize * numPlanes * outputImageSize * outputImageSize; +VIRTUAL int DropoutForward::getOutputNumElements(int batchSize) { + return batchSize * numPlanes * outputSize * outputSize; } diff --git a/src/dropout/DropoutForward.h b/src/dropout/DropoutForward.h index 7fa4c736..23731e4c 100644 --- 
a/src/dropout/DropoutForward.h +++ b/src/dropout/DropoutForward.h @@ -19,23 +19,23 @@ class DeepCL_EXPORT DropoutForward { EasyCL *cl; const int numPlanes; - const int inputImageSize; + const int inputSize; const float dropRatio; - const int outputImageSize; + const int outputSize; virtual ~DropoutForward() {} - inline int getInputIndex( int n, int plane, int row, int col ) { - return ( ( n - * numPlanes + plane ) - * inputImageSize + row ) - * inputImageSize + col; + inline int getInputIndex(int n, int plane, int row, int col) { + return (( n + * numPlanes + plane) + * inputSize + row) + * inputSize + col; } - inline int getResultIndex( int n, int plane, int row, int col ) { - return ( ( n - * numPlanes + plane ) - * outputImageSize + row ) - * outputImageSize + col; + inline int getResultIndex(int n, int plane, int row, int col) { + return (( n + * numPlanes + plane) + * outputSize + row) + * outputSize + col; } // [[[cog @@ -43,14 +43,14 @@ class DeepCL_EXPORT DropoutForward { // cog_addheaders.add() // ]]] // generated, using cog: - DropoutForward( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ); - STATIC DropoutForward *instance( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ); - STATIC DropoutForward *instanceForTest( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ); - STATIC DropoutForward *instanceSpecific( int idx, EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ); - VIRTUAL void forward( int batchSize, CLWrapper *masksWrapper, CLWrapper *inputData, CLWrapper *outputData ); - VIRTUAL void forward( int batchSize, unsigned char *masks, float *input, float *output ); - VIRTUAL int getInputSize( int batchSize ); - VIRTUAL int getOutputSize(int batchSize); + DropoutForward(EasyCL *cl, int numPlanes, int inputSize, float dropRatio); + STATIC DropoutForward *instance(EasyCL *cl, int numPlanes, int inputSize, float dropRatio); + STATIC DropoutForward *instanceForTest(EasyCL *cl, int numPlanes, int inputSize, float dropRatio); + STATIC DropoutForward *instanceSpecific(int idx, EasyCL *cl, int numPlanes, int inputSize, float dropRatio); + VIRTUAL void forward(int batchSize, CLWrapper *masksWrapper, CLWrapper *inputData, CLWrapper *outputData); + VIRTUAL void forward(int batchSize, unsigned char *masks, float *input, float *output); + VIRTUAL int getInputNumElements(int batchSize); + VIRTUAL int getOutputNumElements(int batchSize); // [[[end]]] }; diff --git a/src/dropout/DropoutForwardCpu.cpp b/src/dropout/DropoutForwardCpu.cpp index 1444c32a..37b6bdeb 100644 --- a/src/dropout/DropoutForwardCpu.cpp +++ b/src/dropout/DropoutForwardCpu.cpp @@ -20,37 +20,37 @@ using namespace std; #undef STATIC #define STATIC -DropoutForwardCpu::DropoutForwardCpu( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ) : - DropoutForward( cl, numPlanes, inputImageSize, dropRatio ) { +DropoutForwardCpu::DropoutForwardCpu(EasyCL *cl, int numPlanes, int inputSize, float dropRatio) : + DropoutForward(cl, numPlanes, inputSize, dropRatio) { } -VIRTUAL void DropoutForwardCpu::forward( int batchSize, CLWrapper *masksWrapper, CLWrapper *inputWrapper, CLWrapper *outputWrapper ) { -// cout << "DropoutForwardCpu::forward( CLWrapper * )" << endl; +VIRTUAL void DropoutForwardCpu::forward(int batchSize, CLWrapper *masksWrapper, CLWrapper *inputWrapper, CLWrapper *outputWrapper) { +// cout << "DropoutForwardCpu::forward(CLWrapper *)" << endl; inputWrapper->copyToHost(); - unsigned char *masks = reinterpret_cast<unsigned char *>( masksWrapper->getHostArray() ); - float 
*input = reinterpret_cast<float *>( inputWrapper->getHostArray() ); - float *output = new float[ getOutputSize( batchSize ) ]; + unsigned char *masks = reinterpret_cast<unsigned char *>(masksWrapper->getHostArray()); + float *input = reinterpret_cast<float *>(inputWrapper->getHostArray()); + float *output = new float[ getOutputNumElements(batchSize) ]; - forward( batchSize, masks, input, output ); + forward(batchSize, masks, input, output); - float *outputHostArray = reinterpret_cast<float *>( outputWrapper->getHostArray() ); - memcpy( outputHostArray, output, sizeof(float) * getOutputSize( batchSize ) ); + float *outputHostArray = reinterpret_cast<float *>(outputWrapper->getHostArray()); + memcpy(outputHostArray, output, sizeof(float) * getOutputNumElements(batchSize) ); outputWrapper->copyToDevice(); delete[] output; } -VIRTUAL void DropoutForwardCpu::forward( int batchSize, unsigned char *masks, float *input, float *output ) { -// float *output = new float[ getOutputSize( batchSize ) ]; -// cout << "DropoutForwardCpu::forward( float * )" << endl; - StatefulTimer::instance()->timeCheck("DropoutForwardCpu::forward start" ); - int totalLinearSize = batchSize * numPlanes * inputImageSize * inputImageSize; +VIRTUAL void DropoutForwardCpu::forward(int batchSize, unsigned char *masks, float *input, float *output) { +// float *output = new float[ getOutputNumElements(batchSize) ]; +// cout << "DropoutForwardCpu::forward(float *)" << endl; + StatefulTimer::instance()->timeCheck("DropoutForwardCpu::forward start"); + int totalLinearSize = batchSize * numPlanes * inputSize * inputSize; // float inverseDropRatio = 1.0f / dropRatio; // since multiply faster than divide, just divide once - for( int i = 0; i < totalLinearSize; i++ ) { + for(int i = 0; i < totalLinearSize; i++) { output[i] = masks[i] == 1 ? input[i] : 0; } - StatefulTimer::instance()->timeCheck("DropoutForwardCpu::forward end" ); + StatefulTimer::instance()->timeCheck("DropoutForwardCpu::forward end"); // return output; } diff --git a/src/dropout/DropoutForwardCpu.h b/src/dropout/DropoutForwardCpu.h index 0f0d3afc..101c4969 100644 --- a/src/dropout/DropoutForwardCpu.h +++ b/src/dropout/DropoutForwardCpu.h @@ -19,9 +19,9 @@ class DropoutForwardCpu : public DropoutForward { // cog_addheaders.add() // ]]] // generated, using cog: - DropoutForwardCpu( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ); - VIRTUAL void forward( int batchSize, CLWrapper *masksWrapper, CLWrapper *inputWrapper, CLWrapper *outputWrapper ); - VIRTUAL void forward( int batchSize, unsigned char *masks, float *input, float *output ); + DropoutForwardCpu(EasyCL *cl, int numPlanes, int inputSize, float dropRatio); + VIRTUAL void forward(int batchSize, CLWrapper *masksWrapper, CLWrapper *inputWrapper, CLWrapper *outputWrapper); + VIRTUAL void forward(int batchSize, unsigned char *masks, float *input, float *output); // [[[end]]] }; diff --git a/src/dropout/DropoutForwardGpuNaive.cpp b/src/dropout/DropoutForwardGpuNaive.cpp index 45dcb58f..130b6a77 100644 --- a/src/dropout/DropoutForwardGpuNaive.cpp +++ b/src/dropout/DropoutForwardGpuNaive.cpp @@ -26,37 +26,37 @@ using namespace std; VIRTUAL DropoutForwardGpuNaive::~DropoutForwardGpuNaive() { delete kernel; } -VIRTUAL void DropoutForwardGpuNaive::forward( int batchSize, CLWrapper *masksWrapper, CLWrapper *inputWrapper, CLWrapper *outputWrapper ) { -// cout << StatefulTimer::instance()->prefix << "DropoutForwardGpuNaive::forward( CLWrapper * )" << endl; - StatefulTimer::instance()->timeCheck("DropoutForwardGpuNaive::forward start" ); +VIRTUAL void 
DropoutForwardGpuNaive::forward(int batchSize, CLWrapper *masksWrapper, CLWrapper *inputWrapper, CLWrapper *outputWrapper) { +// cout << StatefulTimer::instance()->prefix << "DropoutForwardGpuNaive::forward(CLWrapper *)" << endl; + StatefulTimer::instance()->timeCheck("DropoutForwardGpuNaive::forward start"); - kernel ->input( batchSize * numPlanes * outputImageSize * outputImageSize ) - ->input( masksWrapper ) - ->input( inputWrapper ) - ->output( outputWrapper ); - int globalSize = batchSize * numPlanes * outputImageSize * outputImageSize; + kernel ->input(batchSize * numPlanes * outputSize * outputSize) + ->input(masksWrapper) + ->input(inputWrapper) + ->output(outputWrapper); + int globalSize = batchSize * numPlanes * outputSize * outputSize; int workgroupsize = cl->getMaxWorkgroupSize(); - globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize; + globalSize = (( globalSize + workgroupsize - 1) / workgroupsize) * workgroupsize; // cout << "DropoutForwardGpuNaive::forward batchsize=" << batchSize << " g=" << globalSize << " w=" << workgroupsize << endl; kernel->run_1d(globalSize, workgroupsize); cl->finish(); // cout << "DropoutForwardGpuNaive::forward selectorswrapper:" << endl; -// PrintBuffer::printInts( cl, selectorsWrapper, outputImageSize, outputImageSize ); +// PrintBuffer::printInts(cl, selectorsWrapper, outputSize, outputSize); - StatefulTimer::instance()->timeCheck("DropoutForwardGpuNaive::forward end" ); + StatefulTimer::instance()->timeCheck("DropoutForwardGpuNaive::forward end"); } -DropoutForwardGpuNaive::DropoutForwardGpuNaive( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ) : - DropoutForward( cl, numPlanes, inputImageSize, dropRatio ) { +DropoutForwardGpuNaive::DropoutForwardGpuNaive(EasyCL *cl, int numPlanes, int inputSize, float dropRatio) : + DropoutForward(cl, numPlanes, inputSize, dropRatio) { string options = ""; - options += " -DgOutputImageSize=" + toString( outputImageSize ); - options += " -DgOutputImageSizeSquared=" + toString( outputImageSize * outputImageSize ); - options += " -DgInputImageSize=" + toString( inputImageSize ); - options += " -DgInputImageSizeSquared=" + toString( inputImageSize * inputImageSize ); - options += " -DgNumPlanes=" + toString( numPlanes ); + options += " -DgOutputSize=" + toString(outputSize); + options += " -DgOutputSizeSquared=" + toString(outputSize * outputSize); + options += " -DgInputSize=" + toString(inputSize); + options += " -DgInputSizeSquared=" + toString(inputSize * inputSize); + options += " -DgNumPlanes=" + toString(numPlanes); // float inverseDropRatio = 1.0f / dropRatio; -// string inverseDropRatioString = toString( inverseDropRatio ); -// if( inverseDropRatioString.find( "." 
) == string::npos ) { +// string inverseDropRatioString = toString(inverseDropRatio); +// if(inverseDropRatioString.find(".") == string::npos) { // inverseDropRatioString += ".0f"; // } else { // inverseDropRatioString += "f"; @@ -66,7 +66,7 @@ DropoutForwardGpuNaive::DropoutForwardGpuNaive( EasyCL *cl, int numPlanes, int i // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/dropout.cl", "forwardNaive", 'options' ) + // stringify.write_kernel2("kernel", "cl/dropout.cl", "forwardNaive", 'options') // ]]] // generated using cog, from cl/dropout.cl: const char * kernelSource = @@ -80,9 +80,9 @@ DropoutForwardGpuNaive::DropoutForwardGpuNaive( EasyCL *cl, int numPlanes, int i " const int N,\n" " global const unsigned char *mask,\n" " global const float *input,\n" - " global float *output ) {\n" + " global float *output) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " output[globalId] = mask[globalId] == 1 ? input[globalId] : 0.0f;\n" @@ -94,7 +94,7 @@ DropoutForwardGpuNaive::DropoutForwardGpuNaive( EasyCL *cl, int numPlanes, int i " global const float *gradOutput,\n" " global float *output) {\n" " const int globalId = get_global_id(0);\n" - " if( globalId >= N ) {\n" + " if (globalId >= N) {\n" " return;\n" " }\n" " output[globalId] = mask[globalId] == 1 ? gradOutput[globalId] : 0.0f;\n" @@ -103,6 +103,6 @@ DropoutForwardGpuNaive::DropoutForwardGpuNaive( EasyCL *cl, int numPlanes, int i ""; kernel = cl->buildKernelFromString( kernelSource, "forwardNaive", options, "cl/dropout.cl" ); // [[[end]]] -// kernel = cl->buildKernel( "dropout.cl", "forwardNaive", options ); +// kernel = cl->buildKernel("dropout.cl", "forwardNaive", options); } diff --git a/src/dropout/DropoutForwardGpuNaive.h b/src/dropout/DropoutForwardGpuNaive.h index 3a6005ca..f64a95cb 100644 --- a/src/dropout/DropoutForwardGpuNaive.h +++ b/src/dropout/DropoutForwardGpuNaive.h @@ -23,8 +23,8 @@ class DropoutForwardGpuNaive : public DropoutForward { // ]]] // generated, using cog: VIRTUAL ~DropoutForwardGpuNaive(); - VIRTUAL void forward( int batchSize, CLWrapper *masksWrapper, CLWrapper *inputWrapper, CLWrapper *outputWrapper ); - DropoutForwardGpuNaive( EasyCL *cl, int numPlanes, int inputImageSize, float dropRatio ); + VIRTUAL void forward(int batchSize, CLWrapper *masksWrapper, CLWrapper *inputWrapper, CLWrapper *outputWrapper); + DropoutForwardGpuNaive(EasyCL *cl, int numPlanes, int inputSize, float dropRatio); // [[[end]]] }; diff --git a/src/dropout/DropoutLayer.cpp b/src/dropout/DropoutLayer.cpp index 98e71878..b96bad8e 100644 --- a/src/dropout/DropoutLayer.cpp +++ b/src/dropout/DropoutLayer.cpp @@ -24,14 +24,14 @@ using namespace std; #undef STATIC #define STATIC -DropoutLayer::DropoutLayer( EasyCL *cl, Layer *previousLayer, DropoutMaker *maker ) : - Layer( previousLayer, maker ), - numPlanes ( previousLayer->getOutputPlanes() ), - inputImageSize( previousLayer->getOutputImageSize() ), - dropRatio( maker->_dropRatio ), - outputImageSize( previousLayer->getOutputImageSize() ), - random( RandomSingleton::instance() ), - cl( cl ), +DropoutLayer::DropoutLayer(EasyCL *cl, Layer *previousLayer, DropoutMaker *maker) : + Layer(previousLayer, maker), + numPlanes (previousLayer->getOutputPlanes()), + inputSize(previousLayer->getOutputSize()), + dropRatio(maker->_dropRatio), + outputSize(previousLayer->getOutputSize()), + random(RandomSingleton::instance()), + cl(cl), masks(0), output(0), gradInput(0), @@ -42,86 +42,86 @@ 
DropoutLayer::DropoutLayer( EasyCL *cl, Layer *previousLayer, DropoutMaker *make // gradInputCopiedToHost(false), batchSize(0), allocatedSize(0) { - if( inputImageSize == 0 ){ + if(inputSize == 0){ // maker->net->print(); - throw runtime_error("Error: Dropout layer " + toString( layerIndex ) + ": input image size is 0" ); + throw runtime_error("Error: Dropout layer " + toString(layerIndex) + ": input image size is 0"); } - if( outputImageSize == 0 ){ + if(outputSize == 0){ // maker->net->print(); - throw runtime_error("Error: Dropout layer " + toString( layerIndex ) + ": output image size is 0" ); + throw runtime_error("Error: Dropout layer " + toString(layerIndex) + ": output image size is 0"); } - dropoutForwardImpl = DropoutForward::instance( cl, numPlanes, inputImageSize, dropRatio ); - dropoutBackwardImpl = DropoutBackward::instance( cl, numPlanes, inputImageSize, dropRatio ); - multiplyBuffer = new MultiplyBuffer( cl ); + dropoutForwardImpl = DropoutForward::instance(cl, numPlanes, inputSize, dropRatio); + dropoutBackwardImpl = DropoutBackward::instance(cl, numPlanes, inputSize, dropRatio); + multiplyBuffer = new MultiplyBuffer(cl); } VIRTUAL DropoutLayer::~DropoutLayer() { delete multiplyBuffer; delete dropoutForwardImpl; delete dropoutBackwardImpl; - if( maskWrapper != 0 ) { + if(maskWrapper != 0) { delete maskWrapper; } - if( outputWrapper != 0 ) { + if(outputWrapper != 0) { delete outputWrapper; } - if( masks != 0 ) { + if(masks != 0) { delete[] masks; } - if( output != 0 ) { + if(output != 0) { delete[] output; } - if( gradInputWrapper != 0 ) { + if(gradInputWrapper != 0) { delete gradInputWrapper; } - if( gradInput != 0 ) { + if(gradInput != 0) { delete[] gradInput; } } VIRTUAL std::string DropoutLayer::getClassName() const { return "DropoutLayer"; } -VIRTUAL void DropoutLayer::fortesting_setRandomSingleton( RandomSingleton *random ) { +VIRTUAL void DropoutLayer::fortesting_setRandomSingleton(RandomSingleton *random) { this->random = random; } -VIRTUAL void DropoutLayer::setBatchSize( int batchSize ) { +VIRTUAL void DropoutLayer::setBatchSize(int batchSize) { // cout << "DropoutLayer::setBatchSize" << endl; - if( batchSize <= allocatedSize ) { + if(batchSize <= allocatedSize) { this->batchSize = batchSize; return; } - if( maskWrapper != 0 ) { + if(maskWrapper != 0) { delete maskWrapper; } - if( outputWrapper != 0 ) { + if(outputWrapper != 0) { delete outputWrapper; } - if( masks != 0 ) { + if(masks != 0) { delete[] masks; } - if( output != 0 ) { + if(output != 0) { delete[] output; } - if( gradInputWrapper != 0 ) { + if(gradInputWrapper != 0) { delete gradInputWrapper; } - if( gradInput != 0 ) { + if(gradInput != 0) { delete[] gradInput; } this->batchSize = batchSize; this->allocatedSize = batchSize; - masks = new unsigned char[ getOutputSize() ]; - maskWrapper = cl->wrap( getOutputSize(), masks ); - output = new float[ getOutputSize() ]; - outputWrapper = cl->wrap( getOutputSize(), output ); - gradInput = new float[ previousLayer->getOutputSize() ]; - gradInputWrapper = cl->wrap( previousLayer->getOutputSize(), gradInput ); + masks = new unsigned char[ getOutputNumElements() ]; + maskWrapper = cl->wrap(getOutputNumElements(), masks); + output = new float[ getOutputNumElements() ]; + outputWrapper = cl->wrap(getOutputNumElements(), output); + gradInput = new float[ previousLayer->getOutputNumElements() ]; + gradInputWrapper = cl->wrap(previousLayer->getOutputNumElements(), gradInput); gradInputWrapper->createOnDevice(); } -VIRTUAL int DropoutLayer::getOutputSize() { - return 
batchSize * numPlanes * outputImageSize * outputImageSize; +VIRTUAL int DropoutLayer::getOutputNumElements() { + return batchSize * numPlanes * outputSize * outputSize; } VIRTUAL float *DropoutLayer::getOutput() { - if( outputWrapper->isDeviceDirty() ) { + if(outputWrapper->isDeviceDirty()) { outputWrapper->copyToHost(); // outputCopiedToHost = true; } @@ -132,17 +132,17 @@ VIRTUAL bool DropoutLayer::needsBackProp() { // but anyway, we dont have any weights ourselves // so just depends on upstream } -VIRTUAL int DropoutLayer::getOutputSize() const { -// int outputImageSize = inputImageSize / dropoutSize; - return batchSize * numPlanes * outputImageSize * outputImageSize; +VIRTUAL int DropoutLayer::getOutputNumElements() const { +// int outputSize = inputSize / dropoutSize; + return batchSize * numPlanes * outputSize * outputSize; } -VIRTUAL int DropoutLayer::getOutputImageSize() const { - return outputImageSize; +VIRTUAL int DropoutLayer::getOutputSize() const { + return outputSize; } VIRTUAL int DropoutLayer::getOutputPlanes() const { return numPlanes; } -VIRTUAL int DropoutLayer::getPersistSize( int version ) const { +VIRTUAL int DropoutLayer::getPersistSize(int version) const { return 0; } VIRTUAL bool DropoutLayer::providesGradInputWrapper() const { @@ -164,14 +164,14 @@ VIRTUAL ActivationFunction const *DropoutLayer::getActivationFunction() { return new LinearActivation(); } //VIRTUAL void DropoutLayer::generateMasks() { -// int totalInputLinearSize = getOutputSize(); +// int totalInputLinearSize = getOutputNumElements(); //// int numBytes = (totalInputLinearSize+8-1)/8; //// unsigned char *bitsField = new unsigned char[numBytes]; // int idx = 0; // unsigned char thisByte = 0; // int bitsPacked = 0; -// for( int i = 0; i < totalInputLinearSize; i++ ) { -// //double value = ( (int)random() % 10000 ) / 20000.0f + 0.5f; +// for(int i = 0; i < totalInputLinearSize; i++) { +// //double value = ((int)random() % 10000) / 20000.0f + 0.5f; // // 1 means we pass value through, 0 means we drop // // dropRatio is probability that mask value is 0 therefore // // so higher dropRatio => more likely to be 0 @@ -180,7 +180,7 @@ VIRTUAL ActivationFunction const *DropoutLayer::getActivationFunction() { // thisByte <<= 1; // thisByte |= bit; // bitsPacked++; -// if( bitsPacked >= 8 ) { +// if(bitsPacked >= 8) { // masks[idx] = thisByte; // idx++; // bitsPacked = 0; @@ -188,32 +188,32 @@ VIRTUAL ActivationFunction const *DropoutLayer::getActivationFunction() { // } //} VIRTUAL void DropoutLayer::generateMasks() { - int totalInputLinearSize = getOutputSize(); - for( int i = 0; i < totalInputLinearSize; i++ ) { + int totalInputLinearSize = getOutputNumElements(); + for(int i = 0; i < totalInputLinearSize; i++) { masks[i] = random->_uniform() <= dropRatio ? 0 : 1; } } VIRTUAL void DropoutLayer::forward() { CLWrapper *upstreamOutputWrapper = 0; - if( previousLayer->hasOutputWrapper() ) { + if(previousLayer->hasOutputWrapper()) { upstreamOutputWrapper = previousLayer->getOutputWrapper(); } else { float *upstreamOutput = previousLayer->getOutput(); - upstreamOutputWrapper = cl->wrap( previousLayer->getOutputSize(), upstreamOutput ); + upstreamOutputWrapper = cl->wrap(previousLayer->getOutputNumElements(), upstreamOutput); upstreamOutputWrapper->copyToDevice(); } // cout << "training: " << training << endl; - if( training ) { + if(training) { // create new masks... 
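Aside: DropoutLayer::generateMasks() above draws each mask byte as an independent Bernoulli trial — a unit-uniform sample at or below dropRatio drops the activation (mask 0), anything above keeps it (mask 1) — so dropRatio is the probability of dropping. A minimal sketch of the same rule (hypothetical names, using <random> rather than DeepCL's RandomSingleton):

#include <random>
#include <vector>

// One byte per output element: 0 = dropped, 1 = kept.
std::vector<unsigned char> generateDropoutMasks(int numElements, float dropRatio,
                                                std::mt19937 &rng) {
    std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
    std::vector<unsigned char> masks(numElements);
    for(int i = 0; i < numElements; i++) {
        masks[i] = uniform(rng) <= dropRatio ? 0 : 1; // P(drop) = dropRatio
    }
    return masks;
}

At inference time (the else branch just below), the layer skips mask sampling entirely and instead scales the activations by dropRatio via multiplyBuffer.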
generateMasks(); maskWrapper->copyToDevice(); - dropoutForwardImpl->forward( batchSize, maskWrapper, upstreamOutputWrapper, outputWrapper ); + dropoutForwardImpl->forward(batchSize, maskWrapper, upstreamOutputWrapper, outputWrapper); } else { // if not training, then simply skip the dropout bit, copy the buffers directly - multiplyBuffer->multiply( getOutputSize(), dropRatio, upstreamOutputWrapper, outputWrapper ); + multiplyBuffer->multiply(getOutputNumElements(), dropRatio, upstreamOutputWrapper, outputWrapper); } - if( !previousLayer->hasOutputWrapper() ) { + if(!previousLayer->hasOutputWrapper()) { delete upstreamOutputWrapper; } } @@ -222,15 +222,15 @@ VIRTUAL void DropoutLayer::backward() { CLWrapper *gradOutputWrapper = 0; bool weOwnErrorsWrapper = false; - if( nextLayer->providesGradInputWrapper() ) { + if(nextLayer->providesGradInputWrapper()) { gradOutputWrapper = nextLayer->getGradInputWrapper(); } else { - gradOutputWrapper = cl->wrap( getOutputSize(), nextLayer->getGradInput() ); + gradOutputWrapper = cl->wrap(getOutputNumElements(), nextLayer->getGradInput()); gradOutputWrapper->copyToDevice(); weOwnErrorsWrapper = true; } - dropoutBackwardImpl->backward( batchSize, maskWrapper, gradOutputWrapper, gradInputWrapper ); - if( weOwnErrorsWrapper ) { + dropoutBackwardImpl->backward(batchSize, maskWrapper, gradOutputWrapper, gradInputWrapper); + if(weOwnErrorsWrapper) { delete gradOutputWrapper; } } diff --git a/src/dropout/DropoutLayer.h b/src/dropout/DropoutLayer.h index 825a0816..37ac7370 100644 --- a/src/dropout/DropoutLayer.h +++ b/src/dropout/DropoutLayer.h @@ -22,10 +22,10 @@ class MultiplyBuffer; class DropoutLayer : public Layer { public: const int numPlanes; - const int inputImageSize; + const int inputSize; const float dropRatio; - const int outputImageSize; + const int outputSize; RandomSingleton *random; @@ -53,18 +53,18 @@ class DropoutLayer : public Layer { // cog_addheaders.add() // ]]] // generated, using cog: - DropoutLayer( EasyCL *cl, Layer *previousLayer, DropoutMaker *maker ); + DropoutLayer(EasyCL *cl, Layer *previousLayer, DropoutMaker *maker); VIRTUAL ~DropoutLayer(); VIRTUAL std::string getClassName() const; - VIRTUAL void fortesting_setRandomSingleton( RandomSingleton *random ); - VIRTUAL void setBatchSize( int batchSize ); - VIRTUAL int getOutputSize(); + VIRTUAL void fortesting_setRandomSingleton(RandomSingleton *random); + VIRTUAL void setBatchSize(int batchSize); + VIRTUAL int getOutputNumElements(); VIRTUAL float *getOutput(); VIRTUAL bool needsBackProp(); + VIRTUAL int getOutputNumElements() const; VIRTUAL int getOutputSize() const; - VIRTUAL int getOutputImageSize() const; VIRTUAL int getOutputPlanes() const; - VIRTUAL int getPersistSize( int version ) const; + VIRTUAL int getPersistSize(int version) const; VIRTUAL bool providesGradInputWrapper() const; VIRTUAL CLWrapper *getGradInputWrapper(); VIRTUAL bool hasOutputWrapper() const; diff --git a/src/dropout/DropoutMaker.cpp b/src/dropout/DropoutMaker.cpp index a2c417e1..6368c112 100644 --- a/src/dropout/DropoutMaker.cpp +++ b/src/dropout/DropoutMaker.cpp @@ -9,7 +9,7 @@ using namespace std; -Layer *DropoutMaker::createLayer( Layer *previousLayer ) { - return new DropoutLayer( cl, previousLayer, this ); +Layer *DropoutMaker::createLayer(Layer *previousLayer) { + return new DropoutLayer(cl, previousLayer, this); } diff --git a/src/dropout/DropoutMaker.h b/src/dropout/DropoutMaker.h index d61c0411..62b984e7 100644 --- a/src/dropout/DropoutMaker.h +++ b/src/dropout/DropoutMaker.h @@ -14,9 +14,9 @@ 
class DeepCL_EXPORT DropoutMaker : public LayerMaker2 { public: float _dropRatio; // 0.0 -> 1.0 DropoutMaker() : - _dropRatio( 0.5f ) { + _dropRatio(0.5f) { } - DropoutMaker *dropRatio( float _dropRatio ) { + DropoutMaker *dropRatio(float _dropRatio) { this->_dropRatio = _dropRatio; return this; } @@ -25,10 +25,10 @@ class DeepCL_EXPORT DropoutMaker : public LayerMaker2 { } virtual DropoutMaker *clone() const { DropoutMaker *thisClone = new DropoutMaker(); - memcpy( thisClone, this, sizeof( DropoutMaker ) ); + memcpy(thisClone, this, sizeof(DropoutMaker) ); return thisClone; } - virtual Layer *createLayer( Layer *previousLayer ); + virtual Layer *createLayer(Layer *previousLayer); }; diff --git a/src/fc/FullyConnectedLayer.cpp b/src/fc/FullyConnectedLayer.cpp index 79bfbdc2..489b72b5 100644 --- a/src/fc/FullyConnectedLayer.cpp +++ b/src/fc/FullyConnectedLayer.cpp @@ -15,18 +15,18 @@ using namespace std; #undef VIRTUAL #define VIRTUAL -FullyConnectedLayer::FullyConnectedLayer( EasyCL *cl, Layer *previousLayer, FullyConnectedMaker *maker ) : - Layer( previousLayer, maker ), - numPlanes( maker->_numPlanes ), - imageSize( maker->_imageSize ), -// fn( maker->_activationFunction ), +FullyConnectedLayer::FullyConnectedLayer(EasyCL *cl, Layer *previousLayer, FullyConnectedMaker *maker) : + Layer(previousLayer, maker), + numPlanes(maker->_numPlanes), + imageSize(maker->_imageSize), +// fn(maker->_activationFunction), batchSize(0) { ConvolutionalMaker *convolutionalMaker = new ConvolutionalMaker(); - convolutionalMaker->numFilters( numPlanes * imageSize * imageSize ) - ->filterSize( previousLayer->getOutputImageSize() ) - ->biased( maker->_biased ) - ->weightsInitializer( maker->_weightsInitializer ); - convolutionalLayer = new ConvolutionalLayer( cl, previousLayer, convolutionalMaker ); + convolutionalMaker->numFilters(numPlanes * imageSize * imageSize) + ->filterSize(previousLayer->getOutputSize()) + ->biased(maker->_biased) + ->weightsInitializer(maker->_weightsInitializer); + convolutionalLayer = new ConvolutionalLayer(cl, previousLayer, convolutionalMaker); // delete convolutionalMaker; } @@ -36,33 +36,33 @@ VIRTUAL FullyConnectedLayer::~FullyConnectedLayer() { VIRTUAL std::string FullyConnectedLayer::getClassName() const { return "FullyConnectedLayer"; } -VIRTUAL void FullyConnectedLayer::setBatchSize( int batchSize ) { +VIRTUAL void FullyConnectedLayer::setBatchSize(int batchSize) { convolutionalLayer->previousLayer = this->previousLayer; convolutionalLayer->nextLayer = this->nextLayer; - convolutionalLayer->setBatchSize( batchSize ); + convolutionalLayer->setBatchSize(batchSize); this->batchSize = batchSize; } VIRTUAL int FullyConnectedLayer::getOutputCubeSize() const { return numPlanes * imageSize * imageSize; } -VIRTUAL int FullyConnectedLayer::getOutputImageSize() const { +VIRTUAL int FullyConnectedLayer::getOutputSize() const { return imageSize; } VIRTUAL int FullyConnectedLayer::getOutputPlanes() const { return numPlanes; } -VIRTUAL int FullyConnectedLayer::getPersistSize( int version ) const { - return convolutionalLayer->getPersistSize( version ); +VIRTUAL int FullyConnectedLayer::getPersistSize(int version) const { + return convolutionalLayer->getPersistSize(version); } -VIRTUAL void FullyConnectedLayer::persistToArray( int version, float *array ) { - convolutionalLayer->persistToArray( version, array ); +VIRTUAL void FullyConnectedLayer::persistToArray(int version, float *array) { + convolutionalLayer->persistToArray(version, array); } -VIRTUAL void 
FullyConnectedLayer::unpersistFromArray( int version, float const*array ) { - convolutionalLayer->unpersistFromArray( version, array ); +VIRTUAL void FullyConnectedLayer::unpersistFromArray(int version, float const*array) { + convolutionalLayer->unpersistFromArray(version, array); } -VIRTUAL void FullyConnectedLayer::setWeights( float *weights, float *bias ) { - convolutionalLayer->initWeights( weights ); - convolutionalLayer->initBias( bias ); +VIRTUAL void FullyConnectedLayer::setWeights(float *weights, float *bias) { + convolutionalLayer->initWeights(weights); + convolutionalLayer->initBias(bias); } VIRTUAL float * FullyConnectedLayer::getWeights() { return convolutionalLayer->getWeights(); @@ -73,8 +73,8 @@ VIRTUAL int FullyConnectedLayer::getWeightsSize() const { VIRTUAL int FullyConnectedLayer::getBiasSize() const { return convolutionalLayer->getBiasSize(); } -VIRTUAL int FullyConnectedLayer::getOutputSize() const { - return convolutionalLayer->getOutputSize(); +VIRTUAL int FullyConnectedLayer::getOutputNumElements() const { + return convolutionalLayer->getOutputNumElements(); } VIRTUAL float *FullyConnectedLayer::getOutput() { return convolutionalLayer->getOutput(); @@ -130,10 +130,10 @@ VIRTUAL TrainerState *FullyConnectedLayer::getTrainerState() { VIRTUAL TrainerState *FullyConnectedLayer::getBiasTrainerState() { return convolutionalLayer->getBiasTrainerState(); } -VIRTUAL void FullyConnectedLayer::setTrainerState( TrainerStateMaker *TrainerStateMaker ) { - convolutionalLayer->setTrainerState( TrainerStateMaker ); +VIRTUAL void FullyConnectedLayer::setTrainerState(TrainerStateMaker *TrainerStateMaker) { + convolutionalLayer->setTrainerState(TrainerStateMaker); } VIRTUAL std::string FullyConnectedLayer::asString() const { - return "FullyConnectedLayer{ numPlanes=" + toString( numPlanes ) + " imageSize=" + toString( imageSize ) + " }"; + return "FullyConnectedLayer{ numPlanes=" + toString(numPlanes) + " imageSize=" + toString(imageSize) + " }"; } diff --git a/src/fc/FullyConnectedLayer.h b/src/fc/FullyConnectedLayer.h index 8e9be33b..defc22ea 100644 --- a/src/fc/FullyConnectedLayer.h +++ b/src/fc/FullyConnectedLayer.h @@ -28,21 +28,21 @@ class FullyConnectedLayer : public Layer { // cog_addheaders.add() // ]]] // generated, using cog: - FullyConnectedLayer( EasyCL *cl, Layer *previousLayer, FullyConnectedMaker *maker ); + FullyConnectedLayer(EasyCL *cl, Layer *previousLayer, FullyConnectedMaker *maker); VIRTUAL ~FullyConnectedLayer(); VIRTUAL std::string getClassName() const; - VIRTUAL void setBatchSize( int batchSize ); + VIRTUAL void setBatchSize(int batchSize); VIRTUAL int getOutputCubeSize() const; - VIRTUAL int getOutputImageSize() const; + VIRTUAL int getOutputSize() const; VIRTUAL int getOutputPlanes() const; - VIRTUAL int getPersistSize( int version ) const; - VIRTUAL void persistToArray( int version, float *array ); - VIRTUAL void unpersistFromArray( int version, float const*array ); - VIRTUAL void setWeights( float *weights, float *bias ); + VIRTUAL int getPersistSize(int version) const; + VIRTUAL void persistToArray(int version, float *array); + VIRTUAL void unpersistFromArray(int version, float const*array); + VIRTUAL void setWeights(float *weights, float *bias); VIRTUAL float * getWeights(); VIRTUAL int getWeightsSize() const; VIRTUAL int getBiasSize() const; - VIRTUAL int getOutputSize() const; + VIRTUAL int getOutputNumElements() const; VIRTUAL float *getOutput(); VIRTUAL float *getGradInput(); VIRTUAL CLWrapper *getGradWeightsWrapper(); @@ -60,7 +60,7 @@ class 
FullyConnectedLayer : public Layer { VIRTUAL bool needsTrainerState() const; VIRTUAL TrainerState *getTrainerState(); VIRTUAL TrainerState *getBiasTrainerState(); - VIRTUAL void setTrainerState( TrainerStateMaker *TrainerStateMaker ); + VIRTUAL void setTrainerState(TrainerStateMaker *TrainerStateMaker); VIRTUAL std::string asString() const; // [[[end]]] diff --git a/src/fc/FullyConnectedMaker.cpp b/src/fc/FullyConnectedMaker.cpp index 62032acd..dfec8a8f 100644 --- a/src/fc/FullyConnectedMaker.cpp +++ b/src/fc/FullyConnectedMaker.cpp @@ -10,8 +10,8 @@ using namespace std; -Layer *FullyConnectedMaker::createLayer( Layer *previousLayer ) { - return new FullyConnectedLayer( cl, previousLayer, this ); +Layer *FullyConnectedMaker::createLayer(Layer *previousLayer) { + return new FullyConnectedLayer(cl, previousLayer, this); } diff --git a/src/fc/FullyConnectedMaker.h b/src/fc/FullyConnectedMaker.h index 4e44c36d..db841346 100644 --- a/src/fc/FullyConnectedMaker.h +++ b/src/fc/FullyConnectedMaker.h @@ -27,7 +27,7 @@ class DeepCL_EXPORT FullyConnectedMaker : public LayerMaker2 { _numPlanes(0), _imageSize(0), _biased(true), - _weightsInitializer( new OriginalInitializer() ) { + _weightsInitializer(new OriginalInitializer()) { } FullyConnectedMaker *weightsInitializer(WeightsInitializer *weightsInitializer) { this->_weightsInitializer = weightsInitializer; @@ -54,10 +54,10 @@ class DeepCL_EXPORT FullyConnectedMaker : public LayerMaker2 { } virtual FullyConnectedMaker *clone() const { FullyConnectedMaker *thisClone = new FullyConnectedMaker(); - memcpy( thisClone, this, sizeof( FullyConnectedMaker ) ); + memcpy(thisClone, this, sizeof(FullyConnectedMaker) ); return thisClone; } - virtual Layer *createLayer( Layer *previousLayer ); + virtual Layer *createLayer(Layer *previousLayer); }; diff --git a/src/forcebackprop/ForceBackpropLayer.cpp b/src/forcebackprop/ForceBackpropLayer.cpp index 7addaa46..9ccdebb5 100644 --- a/src/forcebackprop/ForceBackpropLayer.cpp +++ b/src/forcebackprop/ForceBackpropLayer.cpp @@ -13,60 +13,60 @@ using namespace std; #undef VIRTUAL #define VIRTUAL -ForceBackpropLayer::ForceBackpropLayer( Layer *previousLayer, ForceBackpropLayerMaker *maker ) : - Layer( previousLayer, maker ), - outputPlanes( previousLayer->getOutputPlanes() ), - outputImageSize( previousLayer->getOutputImageSize() ), +ForceBackpropLayer::ForceBackpropLayer(Layer *previousLayer, ForceBackpropLayerMaker *maker) : + Layer(previousLayer, maker), + outputPlanes(previousLayer->getOutputPlanes()), + outputSize(previousLayer->getOutputSize()), batchSize(0), allocatedSize(0), output(0) { } VIRTUAL ForceBackpropLayer::~ForceBackpropLayer() { - if( output != 0 ) { + if(output != 0) { delete[] output; } } VIRTUAL std::string ForceBackpropLayer::getClassName() const { return "ForceBackpropLayer"; } -VIRTUAL void ForceBackpropLayer::backward( float learningRate ) { +VIRTUAL void ForceBackpropLayer::backward(float learningRate) { // do nothing... 
} VIRTUAL float *ForceBackpropLayer::getOutput() { return output; } -VIRTUAL int ForceBackpropLayer::getPersistSize( int version ) const { +VIRTUAL int ForceBackpropLayer::getPersistSize(int version) const { return 0; } VIRTUAL bool ForceBackpropLayer::needsBackProp() { return true; } VIRTUAL void ForceBackpropLayer::printOutput() { - if( output == 0 ) { + if(output == 0) { return; } - for( int n = 0; n < std::min(5,batchSize); n++ ) { + for(int n = 0; n < std::min(5,batchSize); n++) { std::cout << "ForceBackpropLayer n " << n << ":" << std::endl; - for( int plane = 0; plane < std::min( 5, outputPlanes); plane++ ) { - if( outputPlanes > 1 ) std::cout << " plane " << plane << ":" << std::endl; - for( int i = 0; i < std::min(5, outputImageSize); i++ ) { + for(int plane = 0; plane < std::min(5, outputPlanes); plane++) { + if(outputPlanes > 1) std::cout << " plane " << plane << ":" << std::endl; + for(int i = 0; i < std::min(5, outputSize); i++) { std::cout << " "; - for( int j = 0; j < std::min(5, outputImageSize); j++ ) { - std::cout << getOutput( n, plane, i, j ) << " "; + for(int j = 0; j < std::min(5, outputSize); j++) { + std::cout << getOutput(n, plane, i, j) << " "; //output[ // n * numPlanes * imageSize*imageSize + // plane*imageSize*imageSize + // i * imageSize + // j ] << " "; } - if( outputImageSize > 5 ) std::cout << " ... "; + if(outputSize > 5) std::cout << " ... "; std::cout << std::endl; } - if( outputImageSize > 5 ) std::cout << " ... " << std::endl; + if(outputSize > 5) std::cout << " ... " << std::endl; } - if( outputPlanes > 5 ) std::cout << " ... other planes ... " << std::endl; + if(outputPlanes > 5) std::cout << " ... other planes ... " << std::endl; } - if( batchSize > 5 ) std::cout << " ... other n ... " << std::endl; + if(batchSize > 5) std::cout << " ... other n ... " << std::endl; } VIRTUAL void ForceBackpropLayer::print() { printOutput(); @@ -74,45 +74,45 @@ VIRTUAL void ForceBackpropLayer::print() { //VIRTUAL bool ForceBackpropLayer::needErrorsBackprop() { // return true; // the main reason for this layer :-) //} -VIRTUAL void ForceBackpropLayer::setBatchSize( int batchSize ) { - if( batchSize <= allocatedSize ) { +VIRTUAL void ForceBackpropLayer::setBatchSize(int batchSize) { + if(batchSize <= allocatedSize) { this->batchSize = batchSize; return; } - if( output != 0 ) { + if(output != 0) { delete[] output; } this->batchSize = batchSize; this->allocatedSize = allocatedSize; - output = new float[ getOutputSize() ]; + output = new float[ getOutputNumElements() ]; } VIRTUAL void ForceBackpropLayer::forward() { - int totalLinearLength = getOutputSize(); + int totalLinearLength = getOutputNumElements(); float *input = previousLayer->getOutput(); - for( int i = 0; i < totalLinearLength; i++ ) { + for(int i = 0; i < totalLinearLength; i++) { output[i] = input[i]; } } VIRTUAL void ForceBackpropLayer::backward() { // do nothing... ? 
} -VIRTUAL int ForceBackpropLayer::getOutputImageSize() const { - return outputImageSize; +VIRTUAL int ForceBackpropLayer::getOutputSize() const { + return outputSize; } VIRTUAL int ForceBackpropLayer::getOutputPlanes() const { return outputPlanes; } VIRTUAL int ForceBackpropLayer::getOutputCubeSize() const { - return outputPlanes * outputImageSize * outputImageSize; + return outputPlanes * outputSize * outputSize; } -VIRTUAL int ForceBackpropLayer::getOutputSize() const { +VIRTUAL int ForceBackpropLayer::getOutputNumElements() const { return batchSize * getOutputCubeSize(); } VIRTUAL std::string ForceBackpropLayer::toString() { return toString(); } VIRTUAL std::string ForceBackpropLayer::asString() const { - return std::string("") + "ForceBackpropLayer{ outputPlanes=" + ::toString( outputPlanes ) + " outputImageSize=" + ::toString( outputImageSize ) + " }"; + return std::string("") + "ForceBackpropLayer{ outputPlanes=" + ::toString(outputPlanes) + " outputSize=" + ::toString(outputSize) + " }"; } diff --git a/src/forcebackprop/ForceBackpropLayer.h b/src/forcebackprop/ForceBackpropLayer.h index 3dd954eb..ebc39522 100644 --- a/src/forcebackprop/ForceBackpropLayer.h +++ b/src/forcebackprop/ForceBackpropLayer.h @@ -26,20 +26,20 @@ class ForceBackpropLayerMaker; class ForceBackpropLayer : public Layer, IHasToString { public: const int outputPlanes; - const int outputImageSize; + const int outputSize; int batchSize; int allocatedSize; float *output; - inline int getOutputIndex( int n, int outPlane, int outRow, int outCol ) const { - return ( ( n - * outputPlanes + outPlane ) - * outputImageSize + outRow ) - * outputImageSize + outCol; + inline int getOutputIndex(int n, int outPlane, int outRow, int outCol) const { + return (( n + * outputPlanes + outPlane) + * outputSize + outRow) + * outputSize + outCol; } - inline float getOutput( int n, int outPlane, int outRow, int outCol ) const { - return output[ getOutputIndex(n,outPlane, outRow, outCol ) ]; + inline float getOutput(int n, int outPlane, int outRow, int outCol) const { + return output[ getOutputIndex(n,outPlane, outRow, outCol) ]; } // [[[cog @@ -47,28 +47,28 @@ class ForceBackpropLayer : public Layer, IHasToString { // cog_addheaders.add() // ]]] // generated, using cog: - ForceBackpropLayer( Layer *previousLayer, ForceBackpropLayerMaker *maker ); + ForceBackpropLayer(Layer *previousLayer, ForceBackpropLayerMaker *maker); VIRTUAL ~ForceBackpropLayer(); VIRTUAL std::string getClassName() const; - VIRTUAL void backward( float learningRate ); + VIRTUAL void backward(float learningRate); VIRTUAL float *getOutput(); - VIRTUAL int getPersistSize( int version ) const; + VIRTUAL int getPersistSize(int version) const; VIRTUAL bool needsBackProp(); VIRTUAL void printOutput(); VIRTUAL void print(); - VIRTUAL void setBatchSize( int batchSize ); + VIRTUAL void setBatchSize(int batchSize); VIRTUAL void forward(); VIRTUAL void backward(); - VIRTUAL int getOutputImageSize() const; + VIRTUAL int getOutputSize() const; VIRTUAL int getOutputPlanes() const; VIRTUAL int getOutputCubeSize() const; - VIRTUAL int getOutputSize() const; + VIRTUAL int getOutputNumElements() const; VIRTUAL std::string toString(); VIRTUAL std::string asString() const; // [[[end]]] }; -std::ostream &operator<<( std::ostream &os, ForceBackpropLayer &layer ); -std::ostream &operator<<( std::ostream &os, ForceBackpropLayer const*layer ); +std::ostream &operator<<(std::ostream &os, ForceBackpropLayer &layer); +std::ostream &operator<<(std::ostream &os, ForceBackpropLayer const*layer); 
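Aside: the inline getOutputIndex()/getInputIndex() helpers that recur throughout these headers flatten an (n, plane, row, col) coordinate into a row-major offset over a batch of square planes. A minimal worked check (hypothetical values, not code from this patch):

#include <cassert>

// Row-major offset into a [batch][numPlanes][size][size] buffer.
int getOutputIndex(int n, int plane, int row, int col, int numPlanes, int size) {
    return ((n * numPlanes + plane) * size + row) * size + col;
}

int main() {
    // n=1, plane=2, row=3, col=4 with 5 planes of 8x8:
    // ((1*5 + 2)*8 + 3)*8 + 4 = (7*8 + 3)*8 + 4 = 59*8 + 4 = 476
    assert(getOutputIndex(1, 2, 3, 4, 5, 8) == 476);
    return 0;
}

The same flattening explains the batchSize * numPlanes * outputSize * outputSize element counts returned by the getOutputNumElements() methods throughout the patch.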
diff --git a/src/forcebackprop/ForceBackpropLayerMaker.cpp b/src/forcebackprop/ForceBackpropLayerMaker.cpp index 559eb3d4..f6adbb0a 100644 --- a/src/forcebackprop/ForceBackpropLayerMaker.cpp +++ b/src/forcebackprop/ForceBackpropLayerMaker.cpp @@ -13,7 +13,7 @@ using namespace std; #undef VIRTUAL #define VIRTUAL -Layer *ForceBackpropLayerMaker::createLayer( Layer *previousLayer ) { - return new ForceBackpropLayer( previousLayer, this ); +Layer *ForceBackpropLayerMaker::createLayer(Layer *previousLayer) { + return new ForceBackpropLayer(previousLayer, this); } diff --git a/src/forcebackprop/ForceBackpropLayerMaker.h b/src/forcebackprop/ForceBackpropLayerMaker.h index 9916a6bd..f2c3dd06 100644 --- a/src/forcebackprop/ForceBackpropLayerMaker.h +++ b/src/forcebackprop/ForceBackpropLayerMaker.h @@ -20,9 +20,9 @@ class DeepCL_EXPORT ForceBackpropLayerMaker : public LayerMaker2 { } virtual ForceBackpropLayerMaker *clone() const { ForceBackpropLayerMaker *thisClone = new ForceBackpropLayerMaker(); - memcpy( thisClone, this, sizeof( ForceBackpropLayerMaker ) ); + memcpy(thisClone, this, sizeof(ForceBackpropLayerMaker) ); return thisClone; } - virtual Layer *createLayer( Layer *previousLayer ); + virtual Layer *createLayer(Layer *previousLayer); }; diff --git a/src/input/InputLayer.cpp b/src/input/InputLayer.cpp index ceeea8e8..cec2a793 100644 --- a/src/input/InputLayer.cpp +++ b/src/input/InputLayer.cpp @@ -13,12 +13,12 @@ using namespace std; #undef VIRTUAL #define VIRTUAL -InputLayer::InputLayer( InputLayerMaker *maker ) : - Layer( 0, maker ), +InputLayer::InputLayer(InputLayerMaker *maker) : + Layer(0, maker), batchSize(0), allocatedSize(0), - outputPlanes( maker->_numPlanes ), - outputImageSize( maker->_imageSize ), + outputPlanes(maker->_numPlanes), + outputSize(maker->_imageSize), input(0), output(0) { } @@ -33,40 +33,40 @@ VIRTUAL float *InputLayer::getOutput() { VIRTUAL bool InputLayer::needsBackProp() { return false; } -VIRTUAL int InputLayer::getPersistSize( int version ) const { +VIRTUAL int InputLayer::getPersistSize(int version) const { return 0; } VIRTUAL void InputLayer::printOutput() { - if( output == 0 ) { + if(output == 0) { return; } - for( int n = 0; n < std::min(5,batchSize); n++ ) { + for(int n = 0; n < std::min(5,batchSize); n++) { std::cout << "InputLayer n " << n << ":" << std::endl; - for( int plane = 0; plane < std::min( 5, outputPlanes); plane++ ) { - if( outputPlanes > 1 ) std::cout << " plane " << plane << ":" << std::endl; - for( int i = 0; i < std::min(5, outputImageSize); i++ ) { + for(int plane = 0; plane < std::min(5, outputPlanes); plane++) { + if(outputPlanes > 1) std::cout << " plane " << plane << ":" << std::endl; + for(int i = 0; i < std::min(5, outputSize); i++) { std::cout << " "; - for( int j = 0; j < std::min(5, outputImageSize); j++ ) { - std::cout << getOutput( n, plane, i, j ) << " "; + for(int j = 0; j < std::min(5, outputSize); j++) { + std::cout << getOutput(n, plane, i, j) << " "; //output[ // n * numPlanes * imageSize*imageSize + // plane*imageSize*imageSize + // i * imageSize + // j ] << " "; } - if( outputImageSize > 5 ) std::cout << " ... "; + if(outputSize > 5) std::cout << " ... "; std::cout << std::endl; } - if( outputImageSize > 5 ) std::cout << " ... " << std::endl; + if(outputSize > 5) std::cout << " ... " << std::endl; } - if( outputPlanes > 5 ) std::cout << " ... other planes ... " << std::endl; + if(outputPlanes > 5) std::cout << " ... other planes ... " << std::endl; } - if( batchSize > 5 ) std::cout << " ... other n ... 
" << std::endl; + if(batchSize > 5) std::cout << " ... other n ... " << std::endl; } VIRTUAL void InputLayer::print() { printOutput(); } - void InputLayer::in( float const*images ) { + void InputLayer::in(float const*images) { // std::cout << "InputLayer::in()" << std::endl; this->input = images; // this->batchStart = batchStart; @@ -76,13 +76,13 @@ VIRTUAL void InputLayer::print() { VIRTUAL bool InputLayer::needErrorsBackprop() { return false; } -VIRTUAL void InputLayer::setBatchSize( int batchSize ) { +VIRTUAL void InputLayer::setBatchSize(int batchSize) { // std::cout << "inputlayer setting batchsize " << batchSize << std::endl; - if( batchSize <= allocatedSize ) { + if(batchSize <= allocatedSize) { this->batchSize = batchSize; return; } - if( output != 0 ) { + if(output != 0) { delete[] output; } this->batchSize = batchSize; @@ -90,38 +90,38 @@ VIRTUAL void InputLayer::setBatchSize( int batchSize ) { output = new float[batchSize * getOutputCubeSize() ]; } VIRTUAL void InputLayer::forward() { - int totalLinearLength = getOutputSize(); - for( int i = 0; i < totalLinearLength; i++ ) { + int totalLinearLength = getOutputNumElements(); + for(int i = 0; i < totalLinearLength; i++) { output[i] = input[i]; } } -VIRTUAL void InputLayer::backward( float learningRate, float const *gradOutput ) { +VIRTUAL void InputLayer::backward(float learningRate, float const *gradOutput) { } -VIRTUAL int InputLayer::getOutputImageSize() const { - return outputImageSize; +VIRTUAL int InputLayer::getOutputSize() const { + return outputSize; } VIRTUAL int InputLayer::getOutputPlanes() const { return outputPlanes; } VIRTUAL int InputLayer::getOutputCubeSize() const { - return outputPlanes * outputImageSize * outputImageSize; + return outputPlanes * outputSize * outputSize; } -VIRTUAL int InputLayer::getOutputSize() const { +VIRTUAL int InputLayer::getOutputNumElements() const { return batchSize * getOutputCubeSize(); } VIRTUAL std::string InputLayer::toString() { return asString(); } VIRTUAL std::string InputLayer::asString() const { - return std::string("") + "InputLayer{ outputPlanes=" + ::toString( outputPlanes ) + " outputImageSize=" + ::toString( outputImageSize ) + " }"; + return std::string("") + "InputLayer{ outputPlanes=" + ::toString(outputPlanes) + " outputSize=" + ::toString(outputSize) + " }"; } //template<>VIRTUAL std::string InputLayer::asString() const { -// return std::string("") + "InputLayer{ outputPlanes=" + ::toString( outputPlanes ) + " outputImageSize=" + ::toString( outputImageSize ) + " }"; +// return std::string("") + "InputLayer{ outputPlanes=" + ::toString(outputPlanes) + " outputSize=" + ::toString(outputSize) + " }"; //} //template<>VIRTUAL std::string InputLayer::asString() const { -// return std::string("") + "InputLayer{ outputPlanes=" + ::toString( outputPlanes ) + " outputImageSize=" + ::toString( outputImageSize ) + " }"; +// return std::string("") + "InputLayer{ outputPlanes=" + ::toString(outputPlanes) + " outputSize=" + ::toString(outputSize) + " }"; //} diff --git a/src/input/InputLayer.h b/src/input/InputLayer.h index bd16236c..9f3ec18d 100644 --- a/src/input/InputLayer.h +++ b/src/input/InputLayer.h @@ -22,19 +22,19 @@ class DeepCL_EXPORT InputLayer : public Layer, IHasToString { int allocatedSize; const int outputPlanes; - const int outputImageSize; + const int outputSize; float const*input; // we dont own this float *output; // we own this :-) - inline int getOutputIndex( int n, int outPlane, int outRow, int outCol ) const { - return ( ( n - * outputPlanes + outPlane ) - 
* outputImageSize + outRow ) - * outputImageSize + outCol; + inline int getOutputIndex(int n, int outPlane, int outRow, int outCol) const { + return (( n + * outputPlanes + outPlane) + * outputSize + outRow) + * outputSize + outCol; } - inline float getOutput( int n, int outPlane, int outRow, int outCol ) const { - return output[ getOutputIndex(n,outPlane, outRow, outCol ) ]; + inline float getOutput(int n, int outPlane, int outRow, int outCol) const { + return output[ getOutputIndex(n,outPlane, outRow, outCol) ]; } // [[[cog @@ -42,29 +42,29 @@ class DeepCL_EXPORT InputLayer : public Layer, IHasToString { // cog_addheaders.add() // ]]] // generated, using cog: - InputLayer( InputLayerMaker *maker ); + InputLayer(InputLayerMaker *maker); VIRTUAL ~InputLayer(); VIRTUAL std::string getClassName() const; VIRTUAL float *getOutput(); VIRTUAL bool needsBackProp(); - VIRTUAL int getPersistSize( int version ) const; + VIRTUAL int getPersistSize(int version) const; VIRTUAL void printOutput(); VIRTUAL void print(); - void in( float const*images ); + void in(float const*images); VIRTUAL bool needErrorsBackprop(); - VIRTUAL void setBatchSize( int batchSize ); + VIRTUAL void setBatchSize(int batchSize); VIRTUAL void forward(); - VIRTUAL void backward( float learningRate, float const *gradOutput ); - VIRTUAL int getOutputImageSize() const; + VIRTUAL void backward(float learningRate, float const *gradOutput); + VIRTUAL int getOutputSize() const; VIRTUAL int getOutputPlanes() const; VIRTUAL int getOutputCubeSize() const; - VIRTUAL int getOutputSize() const; + VIRTUAL int getOutputNumElements() const; VIRTUAL std::string toString(); VIRTUAL std::string asString() const; // [[[end]]] }; - std::ostream &operator<<( std::ostream &os, InputLayer &layer ); - std::ostream &operator<<( std::ostream &os, InputLayer const*layer ); + std::ostream &operator<<(std::ostream &os, InputLayer &layer); + std::ostream &operator<<(std::ostream &os, InputLayer const*layer); diff --git a/src/input/InputLayerMaker.cpp b/src/input/InputLayerMaker.cpp index 37195203..0a37f83e 100644 --- a/src/input/InputLayerMaker.cpp +++ b/src/input/InputLayerMaker.cpp @@ -11,7 +11,7 @@ using namespace std; -Layer *InputLayerMaker::createLayer( Layer *previousLayer ) { - return new InputLayer( this ); +Layer *InputLayerMaker::createLayer(Layer *previousLayer) { + return new InputLayer(this); } diff --git a/src/input/InputLayerMaker.h b/src/input/InputLayerMaker.h index cc929bbe..1b84a838 100644 --- a/src/input/InputLayerMaker.h +++ b/src/input/InputLayerMaker.h @@ -24,15 +24,15 @@ class DeepCL_EXPORT InputLayerMaker : public LayerMaker2 { int _numPlanes; int _imageSize; PUBLICAPI InputLayerMaker() : -// LayerMaker( net, 0 ), +// LayerMaker(net, 0), _numPlanes(0), _imageSize(0) { } - PUBLICAPI InputLayerMaker *numPlanes( int _numPlanes ) { + PUBLICAPI InputLayerMaker *numPlanes(int _numPlanes) { this->_numPlanes = _numPlanes; return this; } - PUBLICAPI InputLayerMaker *imageSize( int _imageSize ) { + PUBLICAPI InputLayerMaker *imageSize(int _imageSize) { this->_imageSize = _imageSize; return this; } @@ -41,9 +41,9 @@ class DeepCL_EXPORT InputLayerMaker : public LayerMaker2 { } virtual InputLayerMaker *clone() const { InputLayerMaker *thisClone = new InputLayerMaker(); - memcpy( thisClone, this, sizeof( InputLayerMaker ) ); + memcpy(thisClone, this, sizeof(InputLayerMaker) ); return thisClone; } - virtual Layer *createLayer( Layer *previousLayer ); + virtual Layer *createLayer(Layer *previousLayer); }; diff --git a/src/layer/Layer.cpp 
b/src/layer/Layer.cpp index fcf65d41..35e5eb8f 100644 --- a/src/layer/Layer.cpp +++ b/src/layer/Layer.cpp @@ -1,57 +1,61 @@ #include "layer/Layer.h" #include "weights/WeightsPersister.h" +#include "CppRuntimeBoundary.h" using namespace std; #undef VIRTUAL #define VIRTUAL -PUBLICAPI Layer::Layer( Layer *previousLayer, LayerMaker2 *maker ) : - previousLayer( previousLayer ), - nextLayer( 0 ), - layerIndex( previousLayer == 0 ? 0 : previousLayer->layerIndex + 1 ), - training( false ), - maker( maker ) +PUBLICAPI Layer::Layer(Layer *previousLayer, LayerMaker2 *maker) : + previousLayer(previousLayer), + nextLayer(0), + layerIndex(previousLayer == 0 ? 0 : previousLayer->layerIndex + 1), + training(false), + maker(maker) { - if( previousLayer != 0 ) { + if(previousLayer != 0) { previousLayer->nextLayer = this; } } VIRTUAL Layer::~Layer() { - if( maker != 0 ) { + if(maker != 0) { //delete maker; // this segfaults sometimes, (probably because it already // self-deleted) } } /// \brief Are we training or predicting? /// Only affects the Random translations and patches layers currently -PUBLICAPI VIRTUAL void Layer::setTraining( bool training ) { +PUBLICAPI VIRTUAL void Layer::setTraining(bool training) { this->training = training; } /// used to set up internal buffers and stuff -PUBLICAPI VIRTUAL void Layer::setBatchSize( int batchSize ) { +PUBLICAPI VIRTUAL void Layer::setBatchSize(int batchSize) { throw std::runtime_error("setBatchsize not implemetned for this layer type"); } VIRTUAL bool Layer::providesGradInputWrapper() const { return false; } +VIRTUAL const char *Layer::getClassNameAsCharStar() const { + return deepcl_stringToCharStar(getClassName()); +} VIRTUAL float *Layer::getGradInput() { - throw std::runtime_error("getGradInput not implemented for " + getClassName() ); + throw std::runtime_error("getGradInput not implemented for " + getClassName()); } VIRTUAL CLWrapper *Layer::getGradWeightsWrapper() { - throw std::runtime_error("getGradWeightsWrapper not implemented for " + getClassName() ); + throw std::runtime_error("getGradWeightsWrapper not implemented for " + getClassName()); } VIRTUAL CLWrapper *Layer::getGradBiasWrapper() { - throw std::runtime_error("getGradBiasWrapper not implemented for " + getClassName() ); + throw std::runtime_error("getGradBiasWrapper not implemented for " + getClassName()); } VIRTUAL CLWrapper *Layer::getWeightsWrapper() { - throw std::runtime_error("getWeightsWrapper not implemented for " + getClassName() ); + throw std::runtime_error("getWeightsWrapper not implemented for " + getClassName()); } VIRTUAL CLWrapper *Layer::getBiasWrapper() { - throw std::runtime_error("getBiasWrapper not implemented for " + getClassName() ); + throw std::runtime_error("getBiasWrapper not implemented for " + getClassName()); } VIRTUAL CLWrapper *Layer::getGradInputWrapper() { - throw std::runtime_error("getGradInputWrapper not implemented for " + getClassName() ); + throw std::runtime_error("getGradInputWrapper not implemented for " + getClassName()); } PUBLICAPI VIRTUAL bool Layer::getBiased() const { throw std::runtime_error("getBiased not implemented for " + getClassName()); @@ -60,44 +64,44 @@ PUBLICAPI VIRTUAL bool Layer::hasOutputWrapper() const { return false; } PUBLICAPI VIRTUAL CLWrapper *Layer::getOutputWrapper() { - throw std::runtime_error("getOutputWrapper not implemetned for " + getClassName() ); + throw std::runtime_error("getOutputWrapper not implemented for " + getClassName()); } PUBLICAPI VIRTUAL int Layer::getOutputCubeSize() const { - throw 
std::runtime_error("getOutputCubeSize not implemetned for " + getClassName() ); + throw std::runtime_error("getOutputCubeSize not implemetned for " + getClassName()); // return numPlanes * imageSize * imageSize * batchSize; } PUBLICAPI VIRTUAL int Layer::getOutputPlanes() const { - throw std::runtime_error("getOutputPlanes not implemetned for " + getClassName() ); + throw std::runtime_error("getOutputPlanes not implemetned for " + getClassName()); } -PUBLICAPI VIRTUAL int Layer::getOutputImageSize() const { - throw std::runtime_error("getOutputImageSize not implemetned for " + getClassName() ); +PUBLICAPI VIRTUAL int Layer::getOutputSize() const { + throw std::runtime_error("getOutputSize not implemetned for " + getClassName()); } VIRTUAL void Layer::forward() { - throw std::runtime_error("forward not implemented for " + getClassName() ); + throw std::runtime_error("forward not implemented for " + getClassName()); } VIRTUAL bool Layer::needsBackProp() { - throw std::runtime_error("needsBackProp not implemented for " + getClassName() ); + throw std::runtime_error("needsBackProp not implemented for " + getClassName()); } VIRTUAL void Layer::print() { // printWeights(); -// if( output != 0 ) { +// if(output != 0) { printOutput(); printWeights(); // } else { // std::cout << "No output yet " << std::endl; // } } -VIRTUAL void Layer::initWeights( float const*weights ) { - throw std::runtime_error("initWeights not implemetned for " + getClassName() ); +VIRTUAL void Layer::initWeights(float const*weights) { + throw std::runtime_error("initWeights not implemetned for " + getClassName()); // int numWeights = getWeightsSize(); -// for( int i = 0; i < numWeights; i++ ) { +// for(int i = 0; i < numWeights; i++) { // this->weights[i] = weights[i]; // } } -VIRTUAL void Layer::initBias( float const *bias ) { - throw std::runtime_error("initBias not implemetned for " + getClassName() ); +VIRTUAL void Layer::initBias(float const *bias) { + throw std::runtime_error("initBias not implemetned for " + getClassName()); // int numBias = getBiasSize(); -// for( int i = 0; i < numBias; i++ ) { +// for(int i = 0; i < numBias; i++) { // this->bias[i] = bias[i]; // } } @@ -105,82 +109,85 @@ int Layer::getLayerIndex() { return layerIndex; } VIRTUAL void Layer::printWeights() { - throw std::runtime_error("printWeights not implemented for " + getClassName() ); + throw std::runtime_error("printWeights not implemented for " + getClassName()); } VIRTUAL void Layer::printOutput() { - throw std::runtime_error("printOutput not implemented for " + getClassName() ); + throw std::runtime_error("printOutput not implemented for " + getClassName()); } PUBLICAPI VIRTUAL void Layer::backward() { - throw std::runtime_error("backward not implemented for " + getClassName() ); + throw std::runtime_error("backward not implemented for " + getClassName()); } VIRTUAL float *Layer::getGradWeights() { - throw std::runtime_error("getGradWeights not implemented for " + getClassName() ); + throw std::runtime_error("getGradWeights not implemented for " + getClassName()); } VIRTUAL float *Layer::getGradBias() { - throw std::runtime_error("getGradBias not implemented for " + getClassName() ); + throw std::runtime_error("getGradBias not implemented for " + getClassName()); } VIRTUAL bool Layer::biased() { - throw std::runtime_error("biased not implemented for " + getClassName() ); + throw std::runtime_error("biased not implemented for " + getClassName()); } PUBLICAPI VIRTUAL int Layer::getWeightsSize() const { - throw 
std::runtime_error("getWeightsSize not implemented for " + getClassName() ); + throw std::runtime_error("getWeightsSize not implemented for " + getClassName()); } PUBLICAPI VIRTUAL int Layer::getBiasSize() const { - throw std::runtime_error("getBiasSize not implemented for " + getClassName() ); + throw std::runtime_error("getBiasSize not implemented for " + getClassName()); } PUBLICAPI VIRTUAL int Layer::getPersistSize() const { - return getPersistSize( WeightsPersister::latestVersion ); + return getPersistSize(WeightsPersister::latestVersion); } -PUBLICAPI VIRTUAL void Layer::persistToArray( float *array ) { - persistToArray( WeightsPersister::latestVersion, array ); +PUBLICAPI VIRTUAL void Layer::persistToArray(float *array) { + persistToArray(WeightsPersister::latestVersion, array); } /// \brief store the current weights and biases to array /// Note that you need to allocate array first -PUBLICAPI VIRTUAL void Layer::persistToArray( int version, float *array ) { - throw std::runtime_error("persistToArray not implemented for " + getClassName() ); +PUBLICAPI VIRTUAL void Layer::persistToArray(int version, float *array) { + throw std::runtime_error("persistToArray not implemented for " + getClassName()); } -PUBLICAPI VIRTUAL void Layer::unpersistFromArray( float const*array ) { - unpersistFromArray( WeightsPersister::latestVersion, array ); +PUBLICAPI VIRTUAL void Layer::unpersistFromArray(float const*array) { + unpersistFromArray(WeightsPersister::latestVersion, array); } /// \brief initialize the current weights and biases from array -PUBLICAPI VIRTUAL void Layer::unpersistFromArray( int version, float const*array ) { - throw std::runtime_error("unpersistFromArray not implemented for " + getClassName() ); +PUBLICAPI VIRTUAL void Layer::unpersistFromArray(int version, float const*array) { + throw std::runtime_error("unpersistFromArray not implemented for " + getClassName()); } VIRTUAL void Layer::setWeights(float *weights, float *bias) { - throw std::runtime_error("setWeights not implemented for " + getClassName() ); + throw std::runtime_error("setWeights not implemented for " + getClassName()); } VIRTUAL float const *Layer::getWeights() const { - throw std::runtime_error("getWeights const not implemented for " + getClassName() ); + throw std::runtime_error("getWeights const not implemented for " + getClassName()); } VIRTUAL float *Layer::getWeights() { - throw std::runtime_error("getWeights not implemented for " + getClassName() ); + throw std::runtime_error("getWeights not implemented for " + getClassName()); } VIRTUAL float *Layer::getBias() { - throw std::runtime_error("getBias not implemented for " + getClassName() ); + throw std::runtime_error("getBias not implemented for " + getClassName()); } VIRTUAL float const*Layer::getBias() const { - throw std::runtime_error("getBias const not implemented for " + getClassName() ); + throw std::runtime_error("getBias const not implemented for " + getClassName()); } /// \brief Get a string representation of the layer -PUBLICAPI VIRTUAL std::string Layer::asString() const { +VIRTUAL std::string Layer::asString() const { return "Layer{}"; } +VIRTUAL const char *Layer::asNewCharStar() const { + return deepcl_stringToCharStar(asString()); +} VIRTUAL bool Layer::needsTrainerState () const { return false; } // This transfers ownership of the trainer to the layer, // which is responsible for deleting it // probably should pass in a Maker class instead -VIRTUAL void Layer::setTrainerState( TrainerStateMaker *trainerMaker ) { - throw 
std::runtime_error("setTrainer not implemented for " + getClassName() ); +VIRTUAL void Layer::setTrainerState(TrainerStateMaker *trainerMaker) { + throw std::runtime_error("setTrainer not implemented for " + getClassName()); } VIRTUAL TrainerState *Layer::getTrainerState() { - throw std::runtime_error("getTrainerState not implemented for " + getClassName() ); + throw std::runtime_error("getTrainerState not implemented for " + getClassName()); } VIRTUAL TrainerState *Layer::getBiasTrainerState() { - throw std::runtime_error("getBiasTrainerState not implemented for " + getClassName() ); + throw std::runtime_error("getBiasTrainerState not implemented for " + getClassName()); } -VIRTUAL void Layer::updateWeights( CLWrapper *weightChangesWrapper, CLWrapper *biasChangesWrapper ) { - throw std::runtime_error("updateWeights not implemented for " + getClassName() ); +VIRTUAL void Layer::updateWeights(CLWrapper *weightChangesWrapper, CLWrapper *biasChangesWrapper) { + throw std::runtime_error("updateWeights not implemented for " + getClassName()); } diff --git a/src/layer/Layer.h b/src/layer/Layer.h index d129422d..e91cc252 100644 --- a/src/layer/Layer.h +++ b/src/layer/Layer.h @@ -38,9 +38,9 @@ class DeepCL_EXPORT Layer { PUBLICAPI virtual float * getOutput() = 0; // virtual Layer *clone() = 0; /// \brief Get the size of array needed for persisting to/from an array - PUBLICAPI virtual int getPersistSize( int version ) const = 0; + PUBLICAPI virtual int getPersistSize(int version) const = 0; /// \brief Get the size of the activated output from this layer - PUBLICAPI virtual int getOutputSize() const = 0; + PUBLICAPI virtual int getOutputNumElements() const = 0; virtual std::string getClassName() const = 0; // [[[cog @@ -48,11 +48,12 @@ class DeepCL_EXPORT Layer { // cog_addheaders.add() // ]]] // generated, using cog: - PUBLICAPI Layer( Layer *previousLayer, LayerMaker2 *maker ); + PUBLICAPI Layer(Layer *previousLayer, LayerMaker2 *maker); VIRTUAL ~Layer(); - PUBLICAPI VIRTUAL void setTraining( bool training ); - PUBLICAPI VIRTUAL void setBatchSize( int batchSize ); + PUBLICAPI VIRTUAL void setTraining(bool training); + PUBLICAPI VIRTUAL void setBatchSize(int batchSize); VIRTUAL bool providesGradInputWrapper() const; + VIRTUAL const char *getClassNameAsCharStar() const; VIRTUAL float *getGradInput(); VIRTUAL CLWrapper *getGradWeightsWrapper(); VIRTUAL CLWrapper *getGradBiasWrapper(); @@ -64,12 +65,12 @@ class DeepCL_EXPORT Layer { PUBLICAPI VIRTUAL CLWrapper *getOutputWrapper(); PUBLICAPI VIRTUAL int getOutputCubeSize() const; PUBLICAPI VIRTUAL int getOutputPlanes() const; - PUBLICAPI VIRTUAL int getOutputImageSize() const; + PUBLICAPI VIRTUAL int getOutputSize() const; VIRTUAL void forward(); VIRTUAL bool needsBackProp(); VIRTUAL void print(); - VIRTUAL void initWeights( float const*weights ); - VIRTUAL void initBias( float const *bias ); + VIRTUAL void initWeights(float const*weights); + VIRTUAL void initBias(float const *bias); int getLayerIndex(); VIRTUAL void printWeights(); VIRTUAL void printOutput(); @@ -80,21 +81,22 @@ class DeepCL_EXPORT Layer { PUBLICAPI VIRTUAL int getWeightsSize() const; PUBLICAPI VIRTUAL int getBiasSize() const; PUBLICAPI VIRTUAL int getPersistSize() const; - PUBLICAPI VIRTUAL void persistToArray( float *array ); - PUBLICAPI VIRTUAL void persistToArray( int version, float *array ); - PUBLICAPI VIRTUAL void unpersistFromArray( float const*array ); - PUBLICAPI VIRTUAL void unpersistFromArray( int version, float const*array ); + PUBLICAPI VIRTUAL void persistToArray(float 
*array); + PUBLICAPI VIRTUAL void persistToArray(int version, float *array); + PUBLICAPI VIRTUAL void unpersistFromArray(float const*array); + PUBLICAPI VIRTUAL void unpersistFromArray(int version, float const*array); VIRTUAL void setWeights(float *weights, float *bias); VIRTUAL float const *getWeights() const; VIRTUAL float *getWeights(); VIRTUAL float *getBias(); VIRTUAL float const*getBias() const; - PUBLICAPI VIRTUAL std::string asString() const; + VIRTUAL std::string asString() const; + VIRTUAL const char *asNewCharStar() const; VIRTUAL bool needsTrainerState () const; - VIRTUAL void setTrainerState( TrainerStateMaker *trainerMaker ); + VIRTUAL void setTrainerState(TrainerStateMaker *trainerMaker); VIRTUAL TrainerState *getTrainerState(); VIRTUAL TrainerState *getBiasTrainerState(); - VIRTUAL void updateWeights( CLWrapper *weightChangesWrapper, CLWrapper *biasChangesWrapper ); + VIRTUAL void updateWeights(CLWrapper *weightChangesWrapper, CLWrapper *biasChangesWrapper); // [[[end]]] diff --git a/src/layer/LayerMaker.cpp b/src/layer/LayerMaker.cpp index af190941..4d270e4a 100644 --- a/src/layer/LayerMaker.cpp +++ b/src/layer/LayerMaker.cpp @@ -22,13 +22,13 @@ using namespace std; -Layer *SquareLossMaker::createLayer( Layer *previousLayer ) { - return new SquareLossLayer( previousLayer, this ); +Layer *SquareLossMaker::createLayer(Layer *previousLayer) { + return new SquareLossLayer(previousLayer, this); } -Layer *CrossEntropyLossMaker::createLayer( Layer *previousLayer ) { - return new CrossEntropyLoss( previousLayer, this ); +Layer *CrossEntropyLossMaker::createLayer(Layer *previousLayer) { + return new CrossEntropyLoss(previousLayer, this); } -Layer *SoftMaxMaker::createLayer( Layer *previousLayer ) { - return new SoftMaxLayer( previousLayer, this ); +Layer *SoftMaxMaker::createLayer(Layer *previousLayer) { + return new SoftMaxLayer(previousLayer, this); } diff --git a/src/layer/LayerMaker.h b/src/layer/LayerMaker.h index 5fdc305a..dedc51eb 100644 --- a/src/layer/LayerMaker.h +++ b/src/layer/LayerMaker.h @@ -37,10 +37,10 @@ class DeepCL_EXPORT LayerMaker2 { cl(0) { } virtual ~LayerMaker2() {} - void setCl( EasyCL *cl ) { + void setCl(EasyCL *cl) { this->cl = cl; } - virtual Layer *createLayer( Layer *previousLayer ) = 0; + virtual Layer *createLayer(Layer *previousLayer) = 0; virtual LayerMaker2 *clone() const = 0; }; @@ -48,22 +48,22 @@ class DeepCL_EXPORT LayerMaker2 { //public: // Layer *previousLayer; // NeuralNet *net; // only used for 'insert' -// virtual int getOutputImageSize() const = 0; +// virtual int getOutputSize() const = 0; // virtual int getOutputPlanes() const = 0; // virtual int getBiased() const = 0; // virtual ActivationFunction const*getActivationFunction() const { // throw std::runtime_error("getactivationfunction not impelmented for this maker type"); // } -// LayerMaker( NeuralNet *net, Layer *previousLayer ) : -// net( net ), -// previousLayer( previousLayer ) { +// LayerMaker(NeuralNet *net, Layer *previousLayer) : +// net(net), +// previousLayer(previousLayer) { // } -// void setPreviousLayer( Layer *previousLayer ) { +// void setPreviousLayer(Layer *previousLayer) { // this->previousLayer = previousLayer; // } // virtual Layer *insert(); // virtual Layer *instance() const = 0; -// virtual LayerMaker *clone( Layer *clonePreviousLayer ) const = 0; +// virtual LayerMaker *clone(Layer *clonePreviousLayer) const = 0; //}; class DeepCL_EXPORT LossLayerMaker : public LayerMaker2 { @@ -82,10 +82,10 @@ class DeepCL_EXPORT SquareLossMaker : public LossLayerMaker { } 
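[editor's note] The maker clone() methods just below (SquareLossMaker, CrossEntropyLossMaker, SoftMaxMaker, like InputLayerMaker::clone() earlier) duplicate the object with a raw memcpy, which is only well-defined because these makers hold nothing beyond ints and pointers. A minimal sketch of an alternative that stays correct even if a non-trivially-copyable member (e.g. a std::string) is ever added; illustrative only, not part of this patch:

    virtual SquareLossMaker *clone() const {
        // member-wise copy via the compiler-generated copy constructor
        return new SquareLossMaker(*this);
    }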
virtual SquareLossMaker *clone() const { SquareLossMaker *thisClone = new SquareLossMaker(); - memcpy( thisClone, this, sizeof( SquareLossMaker ) ); + memcpy(thisClone, this, sizeof(SquareLossMaker) ); return thisClone; } - virtual Layer *createLayer( Layer *previousLayer ); + virtual Layer *createLayer(Layer *previousLayer); }; class DeepCL_EXPORT CrossEntropyLossMaker : public LossLayerMaker { @@ -97,10 +97,10 @@ class DeepCL_EXPORT CrossEntropyLossMaker : public LossLayerMaker { } virtual CrossEntropyLossMaker *clone() const { CrossEntropyLossMaker *thisClone = new CrossEntropyLossMaker(); - memcpy( thisClone, this, sizeof( CrossEntropyLossMaker ) ); + memcpy(thisClone, this, sizeof(CrossEntropyLossMaker) ); return thisClone; } - virtual Layer *createLayer( Layer *previousLayer ); + virtual Layer *createLayer(Layer *previousLayer); }; // by default, it will be per-plane @@ -124,10 +124,10 @@ class DeepCL_EXPORT SoftMaxMaker : public LossLayerMaker { } virtual SoftMaxMaker *clone() const { SoftMaxMaker *thisClone = new SoftMaxMaker(); - memcpy( thisClone, this, sizeof( SoftMaxMaker ) ); + memcpy(thisClone, this, sizeof(SoftMaxMaker) ); return thisClone; } - virtual Layer *createLayer( Layer *previousLayer ); + virtual Layer *createLayer(Layer *previousLayer); }; diff --git a/src/loaders/GenericLoader.cpp b/src/loaders/GenericLoader.cpp index a680342f..dac7ef92 100644 --- a/src/loaders/GenericLoader.cpp +++ b/src/loaders/GenericLoader.cpp @@ -25,64 +25,68 @@ using namespace std; #define STATIC #define VIRTUAL -PUBLIC PUBLICAPI STATIC void GenericLoader::getDimensions( std::string trainFilepath, int *p_numExamples, int *p_numPlanes, int *p_imageSize ) { - char *headerBytes = FileHelper::readBinaryChunk( trainFilepath, 0, 1024 ); +PUBLIC PUBLICAPI STATIC void GenericLoader::getDimensions(const char * trainFilepath, int *p_numExamples, int *p_numPlanes, int *p_imageSize) { + cout << "GenericLoader::getDimensions" << endl; + cout << "trainFilepath: " << trainFilepath << endl; + char *headerBytes = FileHelper::readBinaryChunk(trainFilepath, 0, 1024); char type[1025]; - strncpy( type, headerBytes, 4 ); + strncpy(type, headerBytes, 4); type[4] = 0; - unsigned int *headerInts = reinterpret_cast< unsigned int *>( headerBytes ); + unsigned int *headerInts = reinterpret_cast< unsigned int *>(headerBytes); - if( string(type) == "mlv2" ) { + if(string(type) == "mlv2") { // cout << "Loading as a Kgsv2 file" << endl; - Kgsv2Loader::getDimensions( trainFilepath, p_numExamples, p_numPlanes, p_imageSize ); - } else if( headerInts[0] == 0x1e3d4c55 ) { + Kgsv2Loader::getDimensions(trainFilepath, p_numExamples, p_numPlanes, p_imageSize); + } else if(headerInts[0] == 0x1e3d4c55) { // cout << "Loading as a Norb mat file" << endl; - NorbLoader::getDimensions( trainFilepath, p_numExamples, p_numPlanes, p_imageSize ); - } else if( headerInts[0] == 0x03080000 ) { - MnistLoader::getDimensions( trainFilepath, p_numExamples, p_numPlanes, p_imageSize ); + NorbLoader::getDimensions(trainFilepath, p_numExamples, p_numPlanes, p_imageSize); + } else if(headerInts[0] == 0x03080000) { + MnistLoader::getDimensions(trainFilepath, p_numExamples, p_numPlanes, p_imageSize); } else { cout << "headstring" << type << endl; - throw runtime_error("Filetype of " + trainFilepath + " not recognised" ); + throw runtime_error(string("Filetype of ") + trainFilepath + " not recognised"); } } -PUBLIC PUBLICAPI STATIC void GenericLoader::load( std::string imagesFilePath, float *images, int *labels, int startN, int numExamples ) { +PUBLIC 
PUBLICAPI STATIC void GenericLoader::load(const char * imagesFilePath, float *images, int *labels, int startN, int numExamples) { // cout << "GenericLoader::load " << numExamples << endl; + cout << "GenericLoader::load " << endl; + cout << imagesFilePath << endl; int N, planes, size; - getDimensions( imagesFilePath, &N, &planes, &size ); + getDimensions(imagesFilePath, &N, &planes, &size); unsigned char *ucImages = new unsigned char[ numExamples * planes * size * size ]; - load( imagesFilePath, ucImages, labels, startN, numExamples ); + load(imagesFilePath, ucImages, labels, startN, numExamples); int linearSize = numExamples * planes * size * size; - for( int i = 0; i < linearSize; i++ ) { + for(int i = 0; i < linearSize; i++) { images[i] = ucImages[i]; } delete[] ucImages; } -PUBLIC STATIC void GenericLoader::load( std::string trainFilepath, unsigned char *images, int *labels ) { - load( trainFilepath, images, labels, 0, 0 ); +PUBLIC STATIC void GenericLoader::load(const char * trainFilepath, unsigned char *images, int *labels) { + load(trainFilepath, images, labels, 0, 0); } // for now, if pass in 0 for labels, it wont read labels -PUBLIC STATIC void GenericLoader::load( std::string trainFilepath, unsigned char *images, int *labels, int startN, int numExamples ) { +PUBLIC STATIC void GenericLoader::load(const char * trainFilepath, unsigned char *images, int *labels, int startN, int numExamples) { StatefulTimer::timeCheck("GenericLoader::load start"); - char *headerBytes = FileHelper::readBinaryChunk( trainFilepath, 0, 1024 ); + char *headerBytes = FileHelper::readBinaryChunk(trainFilepath, 0, 1024); char type[1025]; - strncpy( type, headerBytes, 4 ); + strncpy(type, headerBytes, 4); type[4] = 0; - unsigned int *headerInts = reinterpret_cast< unsigned int *>( headerBytes ); + unsigned int *headerInts = reinterpret_cast< unsigned int *>(headerBytes); - if( string(type) == "mlv2" ) { + if(string(type) == "mlv2") { // cout << "Loading as a Kgsv2 file" << endl; - Kgsv2Loader::load( trainFilepath, images, labels, startN, numExamples ); - } else if( headerInts[0] == 0x1e3d4c55 ) { + Kgsv2Loader::load(trainFilepath, images, labels, startN, numExamples); + } else if(headerInts[0] == 0x1e3d4c55) { // cout << "Loading as a Norb mat file" << endl; - NorbLoader::load( trainFilepath, images, labels, startN, numExamples ); - } else if( headerInts[0] == 0x03080000 ) { - MnistLoader::load( trainFilepath, images, labels, startN, numExamples ); + NorbLoader::load(trainFilepath, images, labels, startN, numExamples); + } else if(headerInts[0] == 0x03080000) { + MnistLoader::load(trainFilepath, images, labels, startN, numExamples); } else { cout << "headstring" << type << endl; - throw runtime_error("Filetype of " + trainFilepath + " not recognised" ); + throw runtime_error(string("Filetype of ") + trainFilepath + " not recognised"); } StatefulTimer::timeCheck("GenericLoader::load end"); } diff --git a/src/loaders/GenericLoader.h b/src/loaders/GenericLoader.h index 4c6879ea..87730cc0 100644 --- a/src/loaders/GenericLoader.h +++ b/src/loaders/GenericLoader.h @@ -32,10 +32,10 @@ class DeepCL_EXPORT GenericLoader { // generated, using cog: public: - PUBLICAPI STATIC void getDimensions( std::string trainFilepath, int *p_numExamples, int *p_numPlanes, int *p_imageSize ); - PUBLICAPI STATIC void load( std::string imagesFilePath, float *images, int *labels, int startN, int numExamples ); - STATIC void load( std::string trainFilepath, unsigned char *images, int *labels ); - STATIC void load( std::string 
trainFilepath, unsigned char *images, int *labels, int startN, int numExamples ); + PUBLICAPI STATIC void getDimensions(const char * trainFilepath, int *p_numExamples, int *p_numPlanes, int *p_imageSize); + PUBLICAPI STATIC void load(const char * imagesFilePath, float *images, int *labels, int startN, int numExamples); + STATIC void load(const char * trainFilepath, unsigned char *images, int *labels); + STATIC void load(const char * trainFilepath, unsigned char *images, int *labels, int startN, int numExamples); // [[[end]]] }; diff --git a/src/loaders/GenericLoaderv1Wrapper.cpp b/src/loaders/GenericLoaderv1Wrapper.cpp index 41d0ab93..8272058a 100644 --- a/src/loaders/GenericLoaderv1Wrapper.cpp +++ b/src/loaders/GenericLoaderv1Wrapper.cpp @@ -30,14 +30,14 @@ PUBLIC VIRTUAL int GenericLoaderv1Wrapper::getPlanes() { PUBLIC VIRTUAL int GenericLoaderv1Wrapper::getImageSize() { return size; } -PUBLIC GenericLoaderv1Wrapper::GenericLoaderv1Wrapper( std::string imagesFilepath ) { +PUBLIC GenericLoaderv1Wrapper::GenericLoaderv1Wrapper(std::string imagesFilepath) { this->imagesFilepath = imagesFilepath; - GenericLoader::getDimensions( imagesFilepath, &N, &planes, &size ); + GenericLoader::getDimensions(imagesFilepath.c_str(), &N, &planes, &size); } PUBLIC VIRTUAL int GenericLoaderv1Wrapper::getImageCubeSize() { return planes * size * size; } -PUBLIC VIRTUAL void GenericLoaderv1Wrapper::load( unsigned char *data, int *labels, int startRecord, int numRecords ) { - GenericLoader::load( imagesFilepath, data, labels, startRecord, numRecords ); +PUBLIC VIRTUAL void GenericLoaderv1Wrapper::load(unsigned char *data, int *labels, int startRecord, int numRecords) { + GenericLoader::load(imagesFilepath.c_str(), data, labels, startRecord, numRecords); } diff --git a/src/loaders/GenericLoaderv1Wrapper.h b/src/loaders/GenericLoaderv1Wrapper.h index 058990ab..5e00381f 100644 --- a/src/loaders/GenericLoaderv1Wrapper.h +++ b/src/loaders/GenericLoaderv1Wrapper.h @@ -38,9 +38,9 @@ class GenericLoaderv1Wrapper : public Loader { VIRTUAL int getN(); VIRTUAL int getPlanes(); VIRTUAL int getImageSize(); - GenericLoaderv1Wrapper( std::string imagesFilepath ); + GenericLoaderv1Wrapper(std::string imagesFilepath); VIRTUAL int getImageCubeSize(); - VIRTUAL void load( unsigned char *data, int *labels, int startRecord, int numRecords ); + VIRTUAL void load(unsigned char *data, int *labels, int startRecord, int numRecords); // [[[end]]] }; diff --git a/src/loaders/GenericLoaderv2.cpp b/src/loaders/GenericLoaderv2.cpp index d66a51ce..4681a33f 100644 --- a/src/loaders/GenericLoaderv2.cpp +++ b/src/loaders/GenericLoaderv2.cpp @@ -27,25 +27,25 @@ using namespace std; #define STATIC #define VIRTUAL -PUBLIC GenericLoaderv2::GenericLoaderv2( std::string imagesFilepath ) { +PUBLIC GenericLoaderv2::GenericLoaderv2(std::string imagesFilepath) { loader = 0; #ifdef LIBJPEG_FOUND - if( ManifestLoaderv1::isFormatFor( imagesFilepath ) ) { - loader = new ManifestLoaderv1( imagesFilepath ); + if(ManifestLoaderv1::isFormatFor(imagesFilepath) ) { + loader = new ManifestLoaderv1(imagesFilepath); } #endif - if( loader == 0 ) { - loader = new GenericLoaderv1Wrapper( imagesFilepath ); + if(loader == 0) { + loader = new GenericLoaderv1Wrapper(imagesFilepath); } } -PUBLIC void GenericLoaderv2::load( float *images, int *labels, int startN, int numExamples ) { +PUBLIC void GenericLoaderv2::load(float *images, int *labels, int startN, int numExamples) { int linearSize = numExamples * loader->getImageCubeSize(); unsigned char *ucImages = new unsigned char[ 
linearSize ]; - load( ucImages, labels, startN, numExamples ); + load(ucImages, labels, startN, numExamples); - for( int i = 0; i < linearSize; i++ ) { + for(int i = 0; i < linearSize; i++) { images[i] = ucImages[i]; } delete[] ucImages; @@ -59,14 +59,14 @@ PUBLIC int GenericLoaderv2::getPlanes() { PUBLIC int GenericLoaderv2::getImageSize() { return loader->getImageSize(); } -PUBLIC void GenericLoaderv2::load( unsigned char *images, int *labels ) { - load( images, labels, 0, 0 ); +PUBLIC void GenericLoaderv2::load(unsigned char *images, int *labels) { + load(images, labels, 0, 0); } -PUBLIC void GenericLoaderv2::load( unsigned char *images, int *labels, int startN, int numExamples ) { +PUBLIC void GenericLoaderv2::load(unsigned char *images, int *labels, int startN, int numExamples) { StatefulTimer::timeCheck("GenericLoaderv2::load start"); - loader->load( images, labels, startN, numExamples ); + loader->load(images, labels, startN, numExamples); StatefulTimer::timeCheck("GenericLoaderv2::load end"); } diff --git a/src/loaders/GenericLoaderv2.h b/src/loaders/GenericLoaderv2.h index 40aff078..0c3fd465 100644 --- a/src/loaders/GenericLoaderv2.h +++ b/src/loaders/GenericLoaderv2.h @@ -32,13 +32,13 @@ class DeepCL_EXPORT GenericLoaderv2 { // generated, using cog: public: - GenericLoaderv2( std::string imagesFilepath ); - void load( float *images, int *labels, int startN, int numExamples ); + GenericLoaderv2(std::string imagesFilepath); + void load(float *images, int *labels, int startN, int numExamples); int getN(); int getPlanes(); int getImageSize(); - void load( unsigned char *images, int *labels ); - void load( unsigned char *images, int *labels, int startN, int numExamples ); + void load(unsigned char *images, int *labels); + void load(unsigned char *images, int *labels, int startN, int numExamples); // [[[end]]] }; diff --git a/src/loaders/Kgsv2Loader.cpp b/src/loaders/Kgsv2Loader.cpp index 31226070..c73246b5 100644 --- a/src/loaders/Kgsv2Loader.cpp +++ b/src/loaders/Kgsv2Loader.cpp @@ -21,20 +21,20 @@ using namespace std; #define STATIC #define VIRTUAL -STATIC void Kgsv2Loader::getDimensions( std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize ) { - char *headerBytes = FileHelper::readBinaryChunk( filepath, 0, 1024 ); +STATIC void Kgsv2Loader::getDimensions(std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize) { + char *headerBytes = FileHelper::readBinaryChunk(filepath, 0, 1024); headerBytes[1023] = 0; - string headerString = string( headerBytes ); - vector<string> splitHeader = split( headerString, "-" ); - if( splitHeader[0] != "mlv2" ) { - throw runtime_error( "file " + filepath + " is not an mlv2 (kgsgo) data file" ); + string headerString = string(headerBytes); + vector<string> splitHeader = split(headerString, "-"); + if(splitHeader[0] != "mlv2") { + throw runtime_error("file " + filepath + " is not an mlv2 (kgsgo) data file"); } - int N = atoi( split( split( headerString, "-n=" )[1], "-" )[0] ); - int numPlanes = atoi( split( split( headerString, "-numplanes=" )[1], "-" )[0] ); - int imageSize = atoi( split( split( headerString, "-imagewidth=" )[1], "-" )[0] ); - int imageSizeRepeated = atoi( split( split( headerString, "-imageheight=" )[1], "-" )[0] ); - if( imageSize != imageSizeRepeated ) { - throw runtime_error( "file " + filepath + " contains non-square images. Not handled for now." 
); + int N = atoi(split(split(headerString, "-n=")[1], "-")[0]); + int numPlanes = atoi(split(split(headerString, "-numplanes=")[1], "-")[0]); + int imageSize = atoi(split(split(headerString, "-imagewidth=")[1], "-")[0]); + int imageSizeRepeated = atoi(split(split(headerString, "-imageheight=")[1], "-")[0]); + if(imageSize != imageSizeRepeated) { + throw runtime_error("file " + filepath + " contains non-square images. Not handled for now."); } *p_N = N; *p_numPlanes = numPlanes; @@ -42,28 +42,28 @@ STATIC void Kgsv2Loader::getDimensions( std::string filepath, int *p_N, int *p_n // *p_totalImagesLinearSize = N * numPlanes * imageSize * imageSize; } -//STATIC int Kgsv2Loader::getNumRecords( std::string filepath ) { -// long filesize = FileHelper::getFilesize( filepath ); +//STATIC int Kgsv2Loader::getNumRecords(std::string filepath) { +// long filesize = FileHelper::getFilesize(filepath); // int recordsSize = filesize - 4; // because of 'END' at the end // int numRecords = recordsSize / getRecordSize(); // return numRecords; //} -//STATIC int Kgsv2Loader::loadKgs( std::string filepath, int *p_numPlanes, int *p_imageSize, unsigned char *data, int *labels ) { -// return loadKgs( filepath, p_numPlanes, p_imageSize, data, labels, 0, getNumRecords( filepath ) ); +//STATIC int Kgsv2Loader::loadKgs(std::string filepath, int *p_numPlanes, int *p_imageSize, unsigned char *data, int *labels) { +// return loadKgs(filepath, p_numPlanes, p_imageSize, data, labels, 0, getNumRecords(filepath) ); //} -STATIC void Kgsv2Loader::load( std::string filepath, unsigned char *data, int *labels ) { - load( filepath, data, labels, 0, 0 ); +STATIC void Kgsv2Loader::load(std::string filepath, unsigned char *data, int *labels) { + load(filepath, data, labels, 0, 0); } -STATIC void Kgsv2Loader::load( std::string filepath, unsigned char *data, int *labels, int startRecord, int numRecords ) { +STATIC void Kgsv2Loader::load(std::string filepath, unsigned char *data, int *labels, int startRecord, int numRecords) { int N; int imageSize; int numPlanes; // int imagesSize; - getDimensions( filepath, &N, &numPlanes, &imageSize ); - if( numRecords == 0 ) { + getDimensions(filepath, &N, &numPlanes, &imageSize); + if(numRecords == 0) { numRecords = N - startRecord; } const int imageSizeSquared = imageSize * imageSize; @@ -71,22 +71,22 @@ STATIC void Kgsv2Loader::load( std::string filepath, unsigned char *data, int *l long pos = (long)startRecord * recordSize + 1024 /* for header */; long chunkByteSize = (long)numRecords * recordSize; // cout << "chunkByteSize: " << chunkByteSize << endl; - unsigned char *kgsData = reinterpret_cast<unsigned char *>( FileHelper::readBinaryChunk( filepath, pos, chunkByteSize ) ); - for( int n = 0; n < numRecords; n++ ) { + unsigned char *kgsData = reinterpret_cast<unsigned char *>(FileHelper::readBinaryChunk(filepath, pos, chunkByteSize) ); + for(int n = 0; n < numRecords; n++) { long recordOffset = (long)n * recordSize; // cout << "recordOffset: " << recordOffset << endl; unsigned char *record = kgsData + recordOffset; - if( record[ 0 ] != 'G' ) { - throw std::runtime_error("alignment error, for record " + toString(n) ); + if(record[ 0 ] != 'G') { + throw std::runtime_error("alignment error, for record " + toString(n)); } - if( record[ 1 ] != 'O' ) { - throw std::runtime_error("alignment error, for record " + toString(n) ); + if(record[ 1 ] != 'O') { + throw std::runtime_error("alignment error, for record " + toString(n)); } - if( labels != 0 ) { - int *p_label = reinterpret_cast< int * >( record + 2 ); + if(labels != 0) { + int 
*p_label = reinterpret_cast< int * >(record + 2); int label = p_label[0]; labels[n] = label; - if( label < 0 ) { + if(label < 0) { throw runtime_error("Error: label " + toString(label) + " is negative"); } } @@ -94,14 +94,14 @@ STATIC void Kgsv2Loader::load( std::string filepath, unsigned char *data, int *l int bitPos = 0; int intraRecordPos = 0; unsigned char thisrecordbyte = recordImage[ intraRecordPos ]; - for( int plane = 0; plane < numPlanes; plane++ ) { - unsigned char *dataPlane = data + ( (long)n * numPlanes + plane ) * imageSizeSquared; - for( int intraImagePos = 0; intraImagePos < imageSizeSquared; intraImagePos++ ) { - unsigned char thisbyte = ( thisrecordbyte >> ( 7 - bitPos ) ) & 1; + for(int plane = 0; plane < numPlanes; plane++) { + unsigned char *dataPlane = data + ((long)n * numPlanes + plane) * imageSizeSquared; + for(int intraImagePos = 0; intraImagePos < imageSizeSquared; intraImagePos++) { + unsigned char thisbyte = (thisrecordbyte >> (7 - bitPos) ) & 1; // cout << "thisbyte: " << (int)thisbyte << endl; dataPlane[ intraImagePos ] = thisbyte * 255; bitPos++; - if( bitPos == 8 ) { + if(bitPos == 8) { bitPos = 0; intraRecordPos++; thisrecordbyte = recordImage[ intraRecordPos ]; @@ -113,26 +113,26 @@ STATIC void Kgsv2Loader::load( std::string filepath, unsigned char *data, int *l // return numRecords; } -//STATIC int Kgsv2Loader::loadKgs( std::string filepath, int *p_numPlanes, int *p_imageSize, unsigned char *data, int *labels, int recordStart, int numRecords ) { +//STATIC int Kgsv2Loader::loadKgs(std::string filepath, int *p_numPlanes, int *p_imageSize, unsigned char *data, int *labels, int recordStart, int numRecords) { // long pos = (long)recordStart * getRecordSize(); // const int recordSize = getRecordSize(); // const int imageSize = 19; // const int numPlanes = 8; // const int imageSizeSquared = imageSize * imageSize; -// unsigned char *kgsData = reinterpret_cast<unsigned char *>( FileHelper::readBinaryChunk( filepath, pos, (long)numRecords * recordSize ) ); -// for( int n = 0; n < numRecords; n++ ) { +// unsigned char *kgsData = reinterpret_cast<unsigned char *>(FileHelper::readBinaryChunk(filepath, pos, (long)numRecords * recordSize) ); +// for(int n = 0; n < numRecords; n++) { // long recordPos = n * recordSize; -// if( kgsData[recordPos + 0 ] != 'G' ) { -// throw std::runtime_error("alignment error, for record " + toString(n) ); +// if(kgsData[recordPos + 0 ] != 'G') { +// throw std::runtime_error("alignment error, for record " + toString(n)); // } // int row = kgsData[ recordPos + 2 ]; // int col = kgsData[ recordPos + 3 ]; // labels[n] = row * imageSize + col; -// for( int plane = 0; plane < numPlanes; plane++ ) { -// for( int intraImagePos = 0; intraImagePos < imageSizeSquared; intraImagePos++ ) { +// for(int plane = 0; plane < numPlanes; plane++) { +// for(int intraImagePos = 0; intraImagePos < imageSizeSquared; intraImagePos++) { // unsigned char thisbyte = kgsData[ recordPos + intraImagePos + 4 ]; -// thisbyte = ( thisbyte >> plane ) & 1; -// data[ ( n * numPlanes + plane * imageSizeSquared ) + intraImagePos ] = thisbyte; +// thisbyte = (thisbyte >> plane) & 1; +// data[ (n * numPlanes + plane * imageSizeSquared) + intraImagePos ] = thisbyte; // } // } // } @@ -141,12 +141,12 @@ STATIC void Kgsv2Loader::load( std::string filepath, unsigned char *data, int *l // return numRecords; //} -STATIC int Kgsv2Loader::getRecordSize( int numPlanes, int imageSize ) { +STATIC int Kgsv2Loader::getRecordSize(int numPlanes, int imageSize) { // const int imageSizeSquared = imageSize * imageSize; int 
recordSize = 2 /* "GO" */ + 4 /* label */; // + imageSizeSquared; int numBits = numPlanes * imageSize * imageSize; - int numBytes = ( numBits + 8 - 1 ) / 8; + int numBytes = (numBits + 8 - 1) / 8; recordSize += numBytes; // cout << "numBits " << numBits << " numBytes " << numBytes << " recordSize " << recordSize << endl; return recordSize; diff --git a/src/loaders/Kgsv2Loader.h b/src/loaders/Kgsv2Loader.h index e7344288..177bb836 100644 --- a/src/loaders/Kgsv2Loader.h +++ b/src/loaders/Kgsv2Loader.h @@ -21,10 +21,10 @@ class DeepCL_EXPORT Kgsv2Loader { // cog_addheaders.add() // ]]] // generated, using cog: - STATIC void getDimensions( std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize ); - STATIC void load( std::string filepath, unsigned char *data, int *labels ); - STATIC void load( std::string filepath, unsigned char *data, int *labels, int startRecord, int numRecords ); - STATIC int getRecordSize( int numPlanes, int imageSize ); + STATIC void getDimensions(std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize); + STATIC void load(std::string filepath, unsigned char *data, int *labels); + STATIC void load(std::string filepath, unsigned char *data, int *labels, int startRecord, int numRecords); + STATIC int getRecordSize(int numPlanes, int imageSize); // [[[end]]] }; diff --git a/src/loaders/Loader.h b/src/loaders/Loader.h index 26f1052d..689febb5 100644 --- a/src/loaders/Loader.h +++ b/src/loaders/Loader.h @@ -17,7 +17,7 @@ class Loader { public: VIRTUAL std::string getType() = 0; - VIRTUAL void load( unsigned char *data, int *labels, int startRecord, int numRecords ) = 0; + VIRTUAL void load(unsigned char *data, int *labels, int startRecord, int numRecords) = 0; VIRTUAL int getImageCubeSize() = 0; VIRTUAL int getN() = 0; VIRTUAL int getPlanes() = 0; diff --git a/src/loaders/ManifestLoaderv1.cpp b/src/loaders/ManifestLoaderv1.cpp index 105a555a..bfaa5e28 100644 --- a/src/loaders/ManifestLoaderv1.cpp +++ b/src/loaders/ManifestLoaderv1.cpp @@ -24,44 +24,44 @@ using namespace std; #define STATIC #define VIRTUAL -PUBLIC STATIC bool ManifestLoaderv1::isFormatFor( std::string imagesFilepath ) { +PUBLIC STATIC bool ManifestLoaderv1::isFormatFor(std::string imagesFilepath) { cout << "ManifestLoaderv1 checking format for " << imagesFilepath << endl; - char *headerBytes = FileHelper::readBinaryChunk( imagesFilepath, 0, 1024 ); + char *headerBytes = FileHelper::readBinaryChunk(imagesFilepath, 0, 1024); string sigString = "# format=deepcl-jpeg-list-v1 "; headerBytes[sigString.length()] = 0; - bool matched = string( headerBytes ) == sigString; + bool matched = string(headerBytes) == sigString; cout << "matched: " << matched << endl; return matched; } -PUBLIC ManifestLoaderv1::ManifestLoaderv1( std::string imagesFilepath ) { - init( imagesFilepath, true ); +PUBLIC ManifestLoaderv1::ManifestLoaderv1(std::string imagesFilepath) { + init(imagesFilepath, true); } -PUBLIC ManifestLoaderv1::ManifestLoaderv1( std::string imagesFilepath, bool includeLabels ) { - init( imagesFilepath, includeLabels ); +PUBLIC ManifestLoaderv1::ManifestLoaderv1(std::string imagesFilepath, bool includeLabels) { + init(imagesFilepath, includeLabels); } -PRIVATE void ManifestLoaderv1::init( std::string imagesFilepath, bool includeLabels ) { +PRIVATE void ManifestLoaderv1::init(std::string imagesFilepath, bool includeLabels) { this->includeLabels = includeLabels; this->imagesFilepath = imagesFilepath; // by reading the number of lines in the manifest, we can get the number of examples, *p_N // number 
of planes is .... 1 // imageSize is ... - if( !isFormatFor( imagesFilepath ) ) { - throw runtime_error( "file " + imagesFilepath + " is not a deepcl-jpeg-list-v1 manifest file" ); + if(!isFormatFor(imagesFilepath) ) { + throw runtime_error("file " + imagesFilepath + " is not a deepcl-jpeg-list-v1 manifest file"); } - ifstream infile( imagesFilepath ); + ifstream infile(imagesFilepath); char lineChars[1024]; - infile.getline( lineChars, 1024 ); // skip first, header, line - string firstLine = string( lineChars ); + infile.getline(lineChars, 1024); // skip first, header, line + string firstLine = string(lineChars); // cout << "firstline: [" << firstLine << "]" << endl; - vector<string> splitLine = split( firstLine, " " ); - N = readIntValue( splitLine, "N" ); - planes = readIntValue( splitLine, "planes" ); - size = readIntValue( splitLine, "width" ); - int imageSizeRepeated = readIntValue( splitLine, "height" ); - if( size != imageSizeRepeated ) { - throw runtime_error( "file " + imagesFilepath + " contains non-square images. Not handled for now." ); + vector<string> splitLine = split(firstLine, " "); + N = readIntValue(splitLine, "N"); + planes = readIntValue(splitLine, "planes"); + size = readIntValue(splitLine, "width"); + int imageSizeRepeated = readIntValue(splitLine, "height"); + if(size != imageSizeRepeated) { + throw runtime_error("file " + imagesFilepath + " contains non-square images. Not handled for now."); } // now we should load into memory, since the file is not fixed-size records, and cannot be loaded partially easily @@ -69,25 +69,25 @@ PRIVATE void ManifestLoaderv1::init( std::string imagesFilepath, bool includeLab labels = new int[N]; int n = 0; - while( infile ) { - infile.getline( lineChars, 1024 ); - if( !infile ) { + while(infile) { + infile.getline(lineChars, 1024); + if(!infile) { break; } - string line = string( lineChars ); - if( line == "" ) { + string line = string(lineChars); + if(line == "") { continue; } vector<string> splitLine = split(line, " "); - if( (int)splitLine.size() == 0 ) { + if((int)splitLine.size() == 0) { continue; } - if( includeLabels && (int)splitLine.size() != 2 ) { - throw runtime_error("Error reading " + imagesFilepath + ". Following line not parseable:\n" + line ); + if(includeLabels && (int)splitLine.size() != 2) { + throw runtime_error("Error reading " + imagesFilepath + ". 
Following line not parseable:\n" + line); } string jpegFile = splitLine[0]; files[n] = jpegFile; - if( includeLabels ) { + if(includeLabels) { int label = atoi(splitLine[1]); labels[n] = label; } @@ -113,26 +113,26 @@ PUBLIC VIRTUAL int ManifestLoaderv1::getPlanes() { PUBLIC VIRTUAL int ManifestLoaderv1::getImageSize() { return size; } -int ManifestLoaderv1::readIntValue( std::vector< std::string > splitLine, std::string key ) { - for( int i = 0; i < (int)splitLine.size(); i++ ) { - vector<string> splitPair = split( splitLine[i], "=" ); - if( (int)splitPair.size() == 2 ) { - if( splitPair[0] == key ) { - return atoi( splitPair[1] ); +int ManifestLoaderv1::readIntValue(std::vector< std::string > splitLine, std::string key) { + for(int i = 0; i < (int)splitLine.size(); i++) { + vector<string> splitPair = split(splitLine[i], "="); + if((int)splitPair.size() == 2) { + if(splitPair[0] == key) { + return atoi(splitPair[1]); } } } - throw runtime_error("Key " + key + " not found in file header" ); + throw runtime_error("Key " + key + " not found in file header"); } -PUBLIC VIRTUAL void ManifestLoaderv1::load( unsigned char *data, int *labels, int startRecord, int numRecords ) { +PUBLIC VIRTUAL void ManifestLoaderv1::load(unsigned char *data, int *labels, int startRecord, int numRecords) { int imageCubeSize = planes * size * size; // cout << "ManifestLoaderv1, loading " << numRecords << " jpegs" << endl; - for( int localN = 0; localN < numRecords; localN++ ) { + for(int localN = 0; localN < numRecords; localN++) { int globalN = localN + startRecord; - JpegHelper::read( files[globalN], planes, size, size, data + localN * imageCubeSize ); - if( labels != 0 ) { - if( !includeLabels ) { - throw runtime_error( "ManifestLoaderv1: labels reqested in load() method, but not activated in constructor" ); + JpegHelper::read(files[globalN], planes, size, size, data + localN * imageCubeSize); + if(labels != 0) { + if(!includeLabels) { + throw runtime_error("ManifestLoaderv1: labels requested in load() method, but not activated in constructor"); } labels[localN] = this->labels[globalN]; } diff --git a/src/loaders/ManifestLoaderv1.h b/src/loaders/ManifestLoaderv1.h index 4f1633a5..42df1169 100644 --- a/src/loaders/ManifestLoaderv1.h +++ b/src/loaders/ManifestLoaderv1.h @@ -34,19 +34,19 @@ class ManifestLoaderv1 : public Loader { // generated, using cog: public: - STATIC bool isFormatFor( std::string imagesFilepath ); - ManifestLoaderv1( std::string imagesFilepath ); - ManifestLoaderv1( std::string imagesFilepath, bool includeLabels ); + STATIC bool isFormatFor(std::string imagesFilepath); + ManifestLoaderv1(std::string imagesFilepath); + ManifestLoaderv1(std::string imagesFilepath, bool includeLabels); VIRTUAL std::string getType(); VIRTUAL int getImageCubeSize(); VIRTUAL int getN(); VIRTUAL int getPlanes(); VIRTUAL int getImageSize(); - VIRTUAL void load( unsigned char *data, int *labels, int startRecord, int numRecords ); + VIRTUAL void load(unsigned char *data, int *labels, int startRecord, int numRecords); private: - void init( std::string imagesFilepath, bool includeLabels ); - int readIntValue( std::vector< std::string > splitLine, std::string key ); + void init(std::string imagesFilepath, bool includeLabels); + int readIntValue(std::vector< std::string > splitLine, std::string key); // [[[end]]] }; diff --git a/src/loaders/MnistLoader.cpp b/src/loaders/MnistLoader.cpp index 0298851b..6bc638eb 100644 --- a/src/loaders/MnistLoader.cpp +++ b/src/loaders/MnistLoader.cpp @@ -18,15 +18,15 @@ using namespace std; #undef STATIC 
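[editor's note] MnistLoader below decodes the MNIST/IDX header, whose 32-bit words are stored big-endian: word 0 is the magic number (0x00000803 for image files, which is why GenericLoader earlier matches the byte-swapped value 0x03080000 when reading the words natively on a little-endian machine), word 1 the example count, words 2 and 3 the image height and width. A self-contained sketch of the decoding that readUInt performs; the helper name idxWord is illustrative, not part of the patch:

    #include <cstdint>
    // return the i-th big-endian 32-bit word of an IDX header
    static uint32_t idxWord(const unsigned char *data, int i) {
        return (uint32_t(data[i * 4]) << 24) | (uint32_t(data[i * 4 + 1]) << 16)
             | (uint32_t(data[i * 4 + 2]) << 8) | uint32_t(data[i * 4 + 3]);
    }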
#define STATIC -STATIC void MnistLoader::getDimensions( std::string imagesFilePath, - int *p_numExamples, int *p_numPlanes, int *p_imageSize ) { - char*headerBytes = FileHelper::readBinaryChunk( imagesFilePath, 0, 4 * 4 ); - unsigned char *headerValues = reinterpret_cast< unsigned char *>( headerBytes ); - *p_numExamples = readUInt( headerValues, 1 ); +STATIC void MnistLoader::getDimensions(std::string imagesFilePath, + int *p_numExamples, int *p_numPlanes, int *p_imageSize) { + char*headerBytes = FileHelper::readBinaryChunk(imagesFilePath, 0, 4 * 4); + unsigned char *headerValues = reinterpret_cast< unsigned char *>(headerBytes); + *p_numExamples = readUInt(headerValues, 1); *p_numPlanes = 1; - *p_imageSize = readUInt( headerValues, 2 ); - int imageSizeRepeat = readUInt( headerValues, 3 ); - if( *p_imageSize != imageSizeRepeat ) { + *p_imageSize = readUInt(headerValues, 2); + int imageSizeRepeat = readUInt(headerValues, 3); + if(*p_imageSize != imageSizeRepeat) { throw runtime_error("error reading mnist-format file " + imagesFilePath + ": height and width not equal. We only support square images currently."); } @@ -36,52 +36,52 @@ STATIC void MnistLoader::getDimensions( std::string imagesFilePath, // in // oh, except the data is stored as unsigned char, not int, so need to convert // new: if labels is 0, then it wont read labels -STATIC void MnistLoader::load( std::string imagesFilePath, unsigned char *images, int *labels, int startN, int numExamples ) { +STATIC void MnistLoader::load(std::string imagesFilePath, unsigned char *images, int *labels, int startN, int numExamples) { int N, numPlanes, imageSize; - getDimensions( imagesFilePath, &N, &numPlanes, &imageSize ); - if( numExamples == 0 ) { + getDimensions(imagesFilePath, &N, &numPlanes, &imageSize); + if(numExamples == 0) { numExamples = N - startN; } long fileStartPos = 4 * 4 + (long)startN * numPlanes * imageSize * imageSize; long fileReadLength = (long)numExamples * numPlanes * imageSize * imageSize; - char *imagesAsCharArray = reinterpret_cast< char *>(images ); - FileHelper::readBinaryChunk( imagesAsCharArray, imagesFilePath, fileStartPos, fileReadLength ); + char *imagesAsCharArray = reinterpret_cast< char *>(images); + FileHelper::readBinaryChunk(imagesAsCharArray, imagesFilePath, fileStartPos, fileReadLength); // now do labels... 
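[editor's note] The labels live in a sibling file whose name swaps "-images-idx3-ubyte" for "-labels-idx1-ubyte"; its layout is a two-word header (magic, then count) followed by one byte per label, hence the seek to 2 * 4 + startN and the byte-per-example read below. Usage sketch, assuming a standard 28x28 single-plane MNIST file in the working directory:

    unsigned char images[10 * 28 * 28];
    int labels[10];
    // reads examples 100..109 plus their labels
    MnistLoader::load("train-images-idx3-ubyte", images, labels, 100, 10);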
- if( labels == 0 ) { + if(labels == 0) { return; } - string labelsFilePath = replace( imagesFilePath, "-images-idx3-ubyte", "-labels-idx1-ubyte" ); + string labelsFilePath = replace(imagesFilePath, "-images-idx3-ubyte", "-labels-idx1-ubyte"); // cout << "labelsfilepath: " << labelsFilePath << endl; fileStartPos = 2 * 4 + (long)startN; fileReadLength = (long)numExamples; char *labelsAsCharArray = new char[fileReadLength]; - unsigned char *labelsAsUCharArray = reinterpret_cast< unsigned char *>(labelsAsCharArray ); + unsigned char *labelsAsUCharArray = reinterpret_cast< unsigned char *>(labelsAsCharArray); // cout << "labels path " << labelsFilePath << " startpos " << fileStartPos << " read length " << fileReadLength << endl; - FileHelper::readBinaryChunk( labelsAsCharArray, labelsFilePath, fileStartPos, fileReadLength ); - for( int i = 0; i < numExamples; i++ ) { + FileHelper::readBinaryChunk(labelsAsCharArray, labelsFilePath, fileStartPos, fileReadLength); + for(int i = 0; i < numExamples; i++) { labels[i] = labelsAsUCharArray[i]; } delete[]labelsAsCharArray; } -//STATIC int **MnistLoader::loadImage( std::string dir, std::string set, int idx, int *p_size ) { +//STATIC int **MnistLoader::loadImage(std::string dir, std::string set, int idx, int *p_size) { // long imagesFilesize = 0; // long labelsFilesize = 0; -// char *imagesDataSigned = FileHelper::readBinary( dir + "/" + set + "-images-idx3-ubyte", &imagesFilesize ); -// char *labelsDataSigned = FileHelper::readBinary( dir + "/" + set + "-labels-idx1-ubyte", &labelsFilesize ); +// char *imagesDataSigned = FileHelper::readBinary(dir + "/" + set + "-images-idx3-ubyte", &imagesFilesize); +// char *labelsDataSigned = FileHelper::readBinary(dir + "/" + set + "-labels-idx1-ubyte", &labelsFilesize); // unsigned char *imagesData = reinterpret_cast< unsigned char *>(imagesDataSigned); //// unsigned char *labelsData = reinterpret_cast< unsigned char *>(labelsDataSigned); //// int numImages = readUInt( imagesData, 1 ); -// int numRows = readUInt( imagesData, 2 ); -// int numCols = readUInt( imagesData, 3 ); +//// int numImages = readUInt(imagesData, 1); +// int numRows = readUInt(imagesData, 2); +// int numCols = readUInt(imagesData, 3); // *p_size = numRows; //// std::cout << "numimages " << numImages << " " << numRows << "*" << numCols << std::endl; -// int **image = ImageHelper::allocateImage( numRows ); -// for( int i = 0; i < numRows; i++ ) { -// for( int j = 0; j < numRows; j++ ) { +// int **image = ImageHelper::allocateImage(numRows); +// for(int i = 0; i < numRows; i++) { +// for(int j = 0; j < numRows; j++) { // image[i][j] = (int)imagesData[idx * numRows * numCols + i * numCols + j]; // } // } @@ -89,21 +89,21 @@ STATIC void MnistLoader::load( std::string imagesFilePath, unsigned char *images // delete[] labelsDataSigned; // return image; //} -//STATIC int ***MnistLoader::loadImages( std::string dir, std::string set, int *p_numImages, int *p_size ) { +//STATIC int ***MnistLoader::loadImages(std::string dir, std::string set, int *p_numImages, int *p_size) { // long imagesFilesize = 0; -// char *imagesDataSigned = FileHelper::readBinary( dir + "/" + set + "-images-idx3-ubyte", &imagesFilesize ); +// char *imagesDataSigned = FileHelper::readBinary(dir + "/" + set + "-images-idx3-ubyte", &imagesFilesize); // unsigned char *imagesData = reinterpret_cast<unsigned char *>(imagesDataSigned); -// int totalNumImages = readUInt( imagesData, 1 ); -// int numRows = readUInt( imagesData, 2 ); -// int numCols = readUInt( imagesData, 3 ); +// int totalNumImages = 
readUInt(imagesData, 1); +// int numRows = readUInt(imagesData, 2); +// int numCols = readUInt(imagesData, 3); //// *p_numImages = min(100,totalNumImages); // *p_numImages = totalNumImages; // *p_size = numRows; //// std::cout << "totalNumImages " << *p_numImages << " " << *p_size << "*" << numCols << std::endl; -// int ***images = ImagesHelper::allocateImages( *p_numImages, numRows ); -// for( int n = 0; n < *p_numImages; n++ ) { -// for( int i = 0; i < numRows; i++ ) { -// for( int j = 0; j < numRows; j++ ) { +// int ***images = ImagesHelper::allocateImages(*p_numImages, numRows); +// for(int n = 0; n < *p_numImages; n++) { +// for(int i = 0; i < numRows; i++) { +// for(int j = 0; j < numRows; j++) { // images[n][i][j] = (int)imagesData[16 + n * numRows * numCols + i * numCols + j]; // } // } @@ -111,25 +111,25 @@ STATIC void MnistLoader::load( std::string imagesFilePath, unsigned char *images // delete[] imagesDataSigned; // return images; //} -STATIC int *MnistLoader::loadLabels( std::string dir, std::string set, int *p_numImages ) { +STATIC int *MnistLoader::loadLabels(std::string dir, std::string set, int *p_numImages) { long labelsFilesize = 0; - char *labelsDataSigned = FileHelper::readBinary( dir + "/" + set + "-labels-idx1-ubyte", &labelsFilesize ); + char *labelsDataSigned = FileHelper::readBinary(dir + "/" + set + "-labels-idx1-ubyte", &labelsFilesize); unsigned char *labelsData = reinterpret_cast<unsigned char *>(labelsDataSigned); - int totalNumImages = readUInt( labelsData, 1 ); + int totalNumImages = readUInt(labelsData, 1); // *p_numImages = min(100,totalNumImages); *p_numImages = totalNumImages; // std::cout << "set " << set << " num labels " << *p_numImages << std::endl; int *labels = new int[*p_numImages]; - for( int n = 0; n < *p_numImages; n++ ) { + for(int n = 0; n < *p_numImages; n++) { labels[n] = (int)labelsData[8 + n]; } delete[] labelsDataSigned; return labels; } -STATIC int MnistLoader::readUInt( unsigned char *data, int location ) { +STATIC int MnistLoader::readUInt(unsigned char *data, int location) { unsigned int value = 0; - for( int i = 0; i < 4; i++ ) { + for(int i = 0; i < 4; i++) { int thisbyte = data[location*4+i]; value += thisbyte << ((3-i) * 8); } @@ -137,8 +137,8 @@ STATIC int MnistLoader::readUInt( unsigned char *data, int location ) { return value; } -STATIC void MnistLoader::writeUInt( unsigned char *data, int location, int value ) { - for( int i = 0; i < 4; i++ ) { +STATIC void MnistLoader::writeUInt(unsigned char *data, int location, int value) { + for(int i = 0; i < 4; i++) { data[location*4+i] = ((value >> ((3-i)*8))&255); } } diff --git a/src/loaders/MnistLoader.h b/src/loaders/MnistLoader.h index e6d221ac..cb9c773a 100644 --- a/src/loaders/MnistLoader.h +++ b/src/loaders/MnistLoader.h @@ -23,12 +23,12 @@ class DeepCL_EXPORT MnistLoader { // cog_addheaders.add() // ]]] // generated, using cog: - STATIC void getDimensions( std::string imagesFilePath, - int *p_numExamples, int *p_numPlanes, int *p_imageSize ); - STATIC void load( std::string imagesFilePath, unsigned char *images, int *labels, int startN, int numExamples ); - STATIC int *loadLabels( std::string dir, std::string set, int *p_numImages ); - STATIC int readUInt( unsigned char *data, int location ); - STATIC void writeUInt( unsigned char *data, int location, int value ); + STATIC void getDimensions(std::string imagesFilePath, + int *p_numExamples, int *p_numPlanes, int *p_imageSize); + STATIC void load(std::string imagesFilePath, unsigned char *images, int *labels, int startN, int numExamples); + 
STATIC int *loadLabels(std::string dir, std::string set, int *p_numImages); + STATIC int readUInt(unsigned char *data, int location); + STATIC void writeUInt(unsigned char *data, int location, int value); // [[[end]]] }; diff --git a/src/loaders/NorbLoader.cpp b/src/loaders/NorbLoader.cpp index fc67be3d..489e8b6c 100644 --- a/src/loaders/NorbLoader.cpp +++ b/src/loaders/NorbLoader.cpp @@ -20,14 +20,14 @@ using namespace std; #define STATIC #define VIRTUAL -STATIC void NorbLoader::getDimensions( std::string trainFilepath, int *p_N, int *p_numPlanes, int *p_imageSize ) { - char*headerBytes = FileHelper::readBinaryChunk( trainFilepath, 0, 6 * 4 ); - unsigned int *headerValues = reinterpret_cast< unsigned int *>( headerBytes ); +STATIC void NorbLoader::getDimensions(std::string trainFilepath, int *p_N, int *p_numPlanes, int *p_imageSize) { + char*headerBytes = FileHelper::readBinaryChunk(trainFilepath, 0, 6 * 4); + unsigned int *headerValues = reinterpret_cast< unsigned int *>(headerBytes); int magic = headerValues[0]; // std::cout << "magic: " << magic << std::endl; - if( magic != 0x1e3d4c55 ) { - throw std::runtime_error("magic value doesnt match expections: " + toString(magic) ); + if(magic != 0x1e3d4c55) { + throw std::runtime_error("magic value doesnt match expections: " + toString(magic)); } // int ndim = headerValues[1]; int N = headerValues[2]; @@ -35,10 +35,10 @@ STATIC void NorbLoader::getDimensions( std::string trainFilepath, int *p_N, int int imageSize = headerValues[4]; int imageSizeRepeated = headerValues[5]; // std::cout << "ndim " << ndim << " " << N << " " << numPlanes << " " << imageSize << " " << imageSizeRepeated << std::endl; - checkSame( "imageSize", imageSize, imageSizeRepeated ); + checkSame("imageSize", imageSize, imageSizeRepeated); -// if( maxN > 0 ) { -// N = min( maxN, N ); +// if(maxN > 0) { +// N = min(maxN, N); // } // int totalLinearSize = N * numPlanes * imageSize * imageSize; *p_N = N; @@ -47,62 +47,62 @@ STATIC void NorbLoader::getDimensions( std::string trainFilepath, int *p_N, int // *p_imagesLinearSize = totalLinearSize; } -STATIC void NorbLoader::load( std::string trainFilepath, unsigned char *images, int *labels ) { - load( trainFilepath, images, labels, 0, 0 ); +STATIC void NorbLoader::load(std::string trainFilepath, unsigned char *images, int *labels) { + load(trainFilepath, images, labels, 0, 0); } -STATIC void NorbLoader::load( std::string trainFilepath, unsigned char *images, int *labels, int startN, int numExamples ) { +STATIC void NorbLoader::load(std::string trainFilepath, unsigned char *images, int *labels, int startN, int numExamples) { int N, numPlanes, imageSize; // I know, this could be optimized a bit, to remove the intermediate arrays... 
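A note on the MnistLoader hunks above: the MNIST IDX format stores each 32-bit header field big-endian, which is why readUInt reassembles the bytes most-significant first and writeUInt emits them in the same order. A minimal standalone sketch of the same decode (helper name and sample bytes are illustrative, not part of DeepCL):

    #include <cstdio>

    // Big-endian 32-bit read, same arithmetic as MnistLoader::readUInt above.
    static unsigned int readBigEndianUInt(const unsigned char *data, int location) {
        unsigned int value = 0;
        for (int i = 0; i < 4; i++) {
            value += (unsigned int)data[location * 4 + i] << ((3 - i) * 8);  // MSB first
        }
        return value;
    }

    int main() {
        // 0x00000803 is the magic number at the start of an idx3-ubyte images file.
        const unsigned char header[4] = {0x00, 0x00, 0x08, 0x03};
        printf("magic = 0x%08x\n", readBigEndianUInt(header, 0));
        return 0;
    }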
diff --git a/src/loaders/NorbLoader.cpp b/src/loaders/NorbLoader.cpp
index fc67be3d..489e8b6c 100644
--- a/src/loaders/NorbLoader.cpp
+++ b/src/loaders/NorbLoader.cpp
@@ -20,14 +20,14 @@ using namespace std;
 #define STATIC
 #define VIRTUAL
-STATIC void NorbLoader::getDimensions( std::string trainFilepath, int *p_N, int *p_numPlanes, int *p_imageSize ) {
-    char*headerBytes = FileHelper::readBinaryChunk( trainFilepath, 0, 6 * 4 );
-    unsigned int *headerValues = reinterpret_cast< unsigned int *>( headerBytes );
+STATIC void NorbLoader::getDimensions(std::string trainFilepath, int *p_N, int *p_numPlanes, int *p_imageSize) {
+    char*headerBytes = FileHelper::readBinaryChunk(trainFilepath, 0, 6 * 4);
+    unsigned int *headerValues = reinterpret_cast< unsigned int *>(headerBytes);
     int magic = headerValues[0];
 //    std::cout << "magic: " << magic << std::endl;
-    if( magic != 0x1e3d4c55 ) {
-        throw std::runtime_error("magic value doesnt match expections: " + toString(magic) );
+    if(magic != 0x1e3d4c55) {
+        throw std::runtime_error("magic value doesnt match expections: " + toString(magic));
     }
 //    int ndim = headerValues[1];
     int N = headerValues[2];
@@ -35,10 +35,10 @@ STATIC void NorbLoader::getDimensions( std::string trainFilepath, int *p_N, int
     int imageSize = headerValues[4];
     int imageSizeRepeated = headerValues[5];
 //    std::cout << "ndim " << ndim << " " << N << " " << numPlanes << " " << imageSize << " " << imageSizeRepeated << std::endl;
-    checkSame( "imageSize", imageSize, imageSizeRepeated );
+    checkSame("imageSize", imageSize, imageSizeRepeated);
-//    if( maxN > 0 ) {
-//        N = min( maxN, N );
+//    if(maxN > 0) {
+//        N = min(maxN, N);
 //    }
 //    int totalLinearSize = N * numPlanes * imageSize * imageSize;
     *p_N = N;
@@ -47,62 +47,62 @@ STATIC void NorbLoader::getDimensions( std::string trainFilepath, int *p_N, int
 //    *p_imagesLinearSize = totalLinearSize;
 }
-STATIC void NorbLoader::load( std::string trainFilepath, unsigned char *images, int *labels ) {
-    load( trainFilepath, images, labels, 0, 0 );
+STATIC void NorbLoader::load(std::string trainFilepath, unsigned char *images, int *labels) {
+    load(trainFilepath, images, labels, 0, 0);
 }
-STATIC void NorbLoader::load( std::string trainFilepath, unsigned char *images, int *labels, int startN, int numExamples ) {
+STATIC void NorbLoader::load(std::string trainFilepath, unsigned char *images, int *labels, int startN, int numExamples) {
     int N, numPlanes, imageSize;
     // I know, this could be optimized a bit, to remove the intermediate arrays...
-    loadImages( images, trainFilepath, &N, &numPlanes, &imageSize, startN, numExamples );
+    loadImages(images, trainFilepath, &N, &numPlanes, &imageSize, startN, numExamples);
 //    int totalLinearSize = numExamples * numPlanes * imageSize * imageSize;
-//    memcpy( images, loadedImages + startN * numPlanes * imageSize * imageSize, numExamples * numPlanes * imageSize * imageSize * sizeof( unsigned char ) );
-    if( labels == 0 ) {
+//    memcpy(images, loadedImages + startN * numPlanes * imageSize * imageSize, numExamples * numPlanes * imageSize * imageSize * sizeof(unsigned char) );
+    if(labels == 0) {
         return;
     }
-    loadLabels( labels, replace( trainFilepath, "-dat.mat","-cat.mat"), startN, numExamples );
-//    memcpy( labels, loadedLabels + startN, sizeof( int ) * numExamples );
+    loadLabels(labels, replace(trainFilepath, "-dat.mat","-cat.mat"), startN, numExamples);
+//    memcpy(labels, loadedLabels + startN, sizeof(int) * numExamples);
 //    delete []loadedImages;
 //    delete[] loadedLabels;
 }
-//STATIC unsigned char *NorbLoader::loadImages( std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize ) {
-//    return loadImages( filepath, p_N, p_numPlanes, p_imageSize, 0, 0 );
+//STATIC unsigned char *NorbLoader::loadImages(std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize) {
+//    return loadImages(filepath, p_N, p_numPlanes, p_imageSize, 0, 0);
 //}
-STATIC int *NorbLoader::loadLabels( std::string labelsfilepath, int numExamples ) {
+STATIC int *NorbLoader::loadLabels(std::string labelsfilepath, int numExamples) {
     int *labels = new int[numExamples];
-    loadLabels( labels, labelsfilepath, 0, numExamples );
+    loadLabels(labels, labelsfilepath, 0, numExamples);
     return labels;
 }
-STATIC unsigned char *NorbLoader::loadImages( std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize ) {
-    return loadImages( filepath, p_N, p_numPlanes, p_imageSize, 0, 0 );
+STATIC unsigned char *NorbLoader::loadImages(std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize) {
+    return loadImages(filepath, p_N, p_numPlanes, p_imageSize, 0, 0);
 }
-STATIC unsigned char *NorbLoader::loadImages( std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize, int numExamples ) {
-    return loadImages( filepath, p_N, p_numPlanes, p_imageSize, 0, numExamples );
+STATIC unsigned char *NorbLoader::loadImages(std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize, int numExamples) {
+    return loadImages(filepath, p_N, p_numPlanes, p_imageSize, 0, numExamples);
 }
-STATIC unsigned char *NorbLoader::loadImages( std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize, int startN, int numExamples ) {
-    getDimensions( filepath, p_N, p_numPlanes, p_imageSize );
-    if( numExamples == 0 ) {
+STATIC unsigned char *NorbLoader::loadImages(std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize, int startN, int numExamples) {
+    getDimensions(filepath, p_N, p_numPlanes, p_imageSize);
+    if(numExamples == 0) {
         numExamples = *p_N - startN;
     }
     unsigned char *images = new unsigned char[ (long)numExamples * *p_numPlanes * *p_imageSize * *p_imageSize ];
-    loadImages( images, filepath, p_N, p_numPlanes, p_imageSize, startN, numExamples );
+    loadImages(images, filepath, p_N, p_numPlanes, p_imageSize, startN, numExamples);
     return images;
 }
 // you need to allocate this yourself, before use
-STATIC void NorbLoader::loadImages( unsigned char *images, std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize, int startN, int numExamples ) {
-    char*headerBytes = FileHelper::readBinaryChunk( filepath, 0, 6 * 4 );
-    unsigned int *headerValues = reinterpret_cast< unsigned int *>( headerBytes );
+STATIC void NorbLoader::loadImages(unsigned char *images, std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize, int startN, int numExamples) {
+    char*headerBytes = FileHelper::readBinaryChunk(filepath, 0, 6 * 4);
+    unsigned int *headerValues = reinterpret_cast< unsigned int *>(headerBytes);
     int magic = headerValues[0];
 //    std::cout << "magic: " << magic << std::endl;
-    if( magic != 0x1e3d4c55 ) {
-        throw std::runtime_error("magic value doesnt match expections: " + toString(magic) );
+    if(magic != 0x1e3d4c55) {
+        throw std::runtime_error("magic value doesnt match expections: " + toString(magic));
     }
 //    int ndim = headerValues[1];
     int N = headerValues[2];
@@ -110,22 +110,22 @@ STATIC void NorbLoader::loadImages( unsigned char *images, std::string filepath,
     int imageSize = headerValues[4];
     int imageSizeRepeated = headerValues[5];
 //    std::cout << "ndim " << ndim << " " << N << " " << numPlanes << " " << imageSize << " " << imageSizeRepeated << std::endl;
-    checkSame( "imageSize", imageSize, imageSizeRepeated );
+    checkSame("imageSize", imageSize, imageSizeRepeated);
-    if( numExamples > 0 && numExamples > ( N - startN ) ) {
-        throw runtime_error("You requested " + toString( numExamples ) + " but there are only " + toString( N - startN ) + " avialalbe after start N " + toString( startN ) );
+    if(numExamples > 0 && numExamples > (N - startN) ) {
+        throw runtime_error("You requested " + toString(numExamples) + " but there are only " + toString(N - startN) + " avialalbe after start N " + toString(startN) );
     }
-    if( numExamples == 0 ) {
+    if(numExamples == 0) {
         numExamples = N - startN;
     }
-//    if( maxN > 0 ) {
-//        N = min( maxN, N );
+//    if(maxN > 0) {
+//        N = min(maxN, N);
 //    }
     long fileStartPos = 6 * 4 + (long)startN * numPlanes * imageSize * imageSize;
     long fileReadLength = (long)numExamples * numPlanes * imageSize * imageSize;
-    char *imagesAsCharArray = reinterpret_cast< char *>(images );
+    char *imagesAsCharArray = reinterpret_cast< char *>(images);
 //    cout << "images, filestartpos " << fileStartPos << " readlength " << fileReadLength << endl;
-    FileHelper::readBinaryChunk( imagesAsCharArray, filepath, fileStartPos, fileReadLength );
+    FileHelper::readBinaryChunk(imagesAsCharArray, filepath, fileStartPos, fileReadLength);
 //    unsigned char *imagesDataUnsigned = reinterpret_cast< unsigned char *>(imagesDataSigned);
     *p_N = N;
@@ -134,41 +134,41 @@ STATIC void NorbLoader::loadImages( unsigned char *images, std::string filepath,
 //    return imagesDataUnsigned;
 }
 // you need to allocate this yourself, before use
-STATIC void NorbLoader::loadLabels( int *labels, std::string filepath, int startN, int numExamples ) {
-    char*headerBytes = FileHelper::readBinaryChunk( filepath, 0, 6 * 5 );
-    unsigned int *headerValues = reinterpret_cast< unsigned int *>( headerBytes );
+STATIC void NorbLoader::loadLabels(int *labels, std::string filepath, int startN, int numExamples) {
+    char*headerBytes = FileHelper::readBinaryChunk(filepath, 0, 6 * 5);
+    unsigned int *headerValues = reinterpret_cast< unsigned int *>(headerBytes);
     int magic = headerValues[0];
 //    std::cout << "magic: " << magic << std::endl;
-    if( magic != 0x1e3d4c54 ) {
-        throw std::runtime_error("magic value doesnt match expections: " + toString(magic) + " expected: " + toString( 0x1e3d4c54 ) );
+    if(magic != 0x1e3d4c54) {
+        throw std::runtime_error("magic value doesnt match expections: " + toString(magic) + " expected: " + toString(0x1e3d4c54) );
     }
 //    int ndim = headerValues[1];
     int N = headerValues[2];
-//    checkSame( "ndim", 1, ndim );
-    if( numExamples > 0 && numExamples > ( N - startN ) ) {
-        throw runtime_error("You requested " + toString( numExamples ) + " but there are only " + toString( N - startN ) + " avialalbe after start N " + toString( startN ) );
+//    checkSame("ndim", 1, ndim);
+    if(numExamples > 0 && numExamples > (N - startN) ) {
+        throw runtime_error("You requested " + toString(numExamples) + " but there are only " + toString(N - startN) + " avialalbe after start N " + toString(startN) );
     }
-    if( numExamples == 0 ) {
+    if(numExamples == 0) {
        numExamples = N - startN;
     }
 //    N = Ntoget;
 //    int totalLinearSize = N;
-    char *labelsAsCharArray = reinterpret_cast< char *>( labels );
+    char *labelsAsCharArray = reinterpret_cast< char *>(labels);
     long fileStartPos = 5 * 4 + (long)startN * 4;
     long fileReadLength = (long)numExamples * 4;
 //    cout << "labels file read start " << fileStartPos << " length " << fileReadLength << endl;
-    FileHelper::readBinaryChunk( labelsAsCharArray, filepath, fileStartPos, fileReadLength );
+    FileHelper::readBinaryChunk(labelsAsCharArray, filepath, fileStartPos, fileReadLength);
 //    int *labels = reinterpret_cast< int *>(labelsAsByteArray);
 //    return labels;
 }
-STATIC void NorbLoader::writeImages( std::string filepath, unsigned char *images, int N, int numPlanes, int imageSize ) {
+STATIC void NorbLoader::writeImages(std::string filepath, unsigned char *images, int N, int numPlanes, int imageSize) {
     int totalLinearSize = N * numPlanes * imageSize * imageSize;
     long imagesFilesize = totalLinearSize + 6 * 4; // magic, plus num dimensions, plus 4 dimensions
     char *imagesDataSigned = new char[ imagesFilesize ];
-    unsigned int *imagesDataInt = reinterpret_cast< unsigned int *>( imagesDataSigned );
+    unsigned int *imagesDataInt = reinterpret_cast< unsigned int *>(imagesDataSigned);
     unsigned char *imagesDataUnsigned = reinterpret_cast< unsigned char *>(imagesDataSigned);
     imagesDataInt[0] = 0x1e3d4c55;
     imagesDataInt[1] = 4;
@@ -176,22 +176,22 @@ STATIC void NorbLoader::writeImages( std::string filepath, unsigned char *images
     imagesDataInt[3] = numPlanes;
     imagesDataInt[4] = imageSize;
     imagesDataInt[5] = imageSize;
-    memcpy( imagesDataUnsigned + 6 * sizeof(int), images, totalLinearSize * sizeof( unsigned char ) );
-    FileHelper::writeBinary( filepath, imagesDataSigned, imagesFilesize );
+    memcpy(imagesDataUnsigned + 6 * sizeof(int), images, totalLinearSize * sizeof(unsigned char) );
+    FileHelper::writeBinary(filepath, imagesDataSigned, imagesFilesize);
 }
-STATIC void NorbLoader::writeLabels( std::string filepath, int *labels, int N ) {
+STATIC void NorbLoader::writeLabels(std::string filepath, int *labels, int N) {
     int totalLinearSize = N;
     long imagesFilesize = totalLinearSize * 4 + 5 * 4; // magic, plus num dimensions, plus 3 dimensions
     char *imagesDataSigned = new char[ imagesFilesize ];
-    unsigned int *imagesDataInt = reinterpret_cast< unsigned int *>( imagesDataSigned );
+    unsigned int *imagesDataInt = reinterpret_cast< unsigned int *>(imagesDataSigned);
     unsigned char *imagesDataUnsigned = reinterpret_cast< unsigned char *>(imagesDataSigned);
     imagesDataInt[0] = 0x1e3d4c54;
     imagesDataInt[1] = 1;
     imagesDataInt[2] = N;
     imagesDataInt[3] = 1;
     imagesDataInt[4] = 1;
-    memcpy( imagesDataUnsigned + 5 * sizeof(int), labels, totalLinearSize * sizeof( int ) );
-    FileHelper::writeBinary( filepath, imagesDataSigned, imagesFilesize );
+    memcpy(imagesDataUnsigned + 5 * sizeof(int), labels, totalLinearSize * sizeof(int) );
+    FileHelper::writeBinary(filepath, imagesDataSigned, imagesFilesize);
 }
diff --git a/src/loaders/NorbLoader.h b/src/loaders/NorbLoader.h
index d27cebe7..6623a635 100644
--- a/src/loaders/NorbLoader.h
+++ b/src/loaders/NorbLoader.h
@@ -24,24 +24,24 @@ class DeepCL_EXPORT NorbLoader {
     // cog_addheaders.add()
     // ]]]
     // generated, using cog:
-    STATIC void getDimensions( std::string trainFilepath, int *p_N, int *p_numPlanes, int *p_imageSize );
-    STATIC void load( std::string trainFilepath, unsigned char *images, int *labels );
-    STATIC void load( std::string trainFilepath, unsigned char *images, int *labels, int startN, int numExamples );
-    STATIC int *loadLabels( std::string labelsfilepath, int numExamples );
-    STATIC unsigned char *loadImages( std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize );
-    STATIC unsigned char *loadImages( std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize, int numExamples );
-    STATIC unsigned char *loadImages( std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize, int startN, int numExamples );
-    STATIC void loadImages( unsigned char *images, std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize, int startN, int numExamples );
-    STATIC void loadLabels( int *labels, std::string filepath, int startN, int numExamples );
-    STATIC void writeImages( std::string filepath, unsigned char *images, int N, int numPlanes, int imageSize );
-    STATIC void writeLabels( std::string filepath, int *labels, int N );
+    STATIC void getDimensions(std::string trainFilepath, int *p_N, int *p_numPlanes, int *p_imageSize);
+    STATIC void load(std::string trainFilepath, unsigned char *images, int *labels);
+    STATIC void load(std::string trainFilepath, unsigned char *images, int *labels, int startN, int numExamples);
+    STATIC int *loadLabels(std::string labelsfilepath, int numExamples);
+    STATIC unsigned char *loadImages(std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize);
+    STATIC unsigned char *loadImages(std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize, int numExamples);
+    STATIC unsigned char *loadImages(std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize, int startN, int numExamples);
+    STATIC void loadImages(unsigned char *images, std::string filepath, int *p_N, int *p_numPlanes, int *p_imageSize, int startN, int numExamples);
+    STATIC void loadLabels(int *labels, std::string filepath, int startN, int numExamples);
+    STATIC void writeImages(std::string filepath, unsigned char *images, int N, int numPlanes, int imageSize);
+    STATIC void writeLabels(std::string filepath, int *labels, int N);
     // [[[end]]]
 protected:
-    static void checkSame( std::string name, int one, int two ) {
-        if( one != two ) {
-            throw std::runtime_error( "Error, didnt match: " + name + " " + toString(one) + " != " + toString(two ) );
+    static void checkSame(std::string name, int one, int two) {
+        if(one != two) {
+            throw std::runtime_error("Error, didnt match: " + name + " " + toString(one) + " != " + toString(two) );
         }
     }
 };
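For context on the NorbLoader changes: a NORB .mat file begins with six 32-bit fields, a magic value (0x1e3d4c55 for unsigned-byte image data, 0x1e3d4c54 for the int32 label files), the number of dimensions, then up to four dimension sizes, and the loader reads them with a plain reinterpret_cast, so it assumes a little-endian host. A sketch of the same header check under that assumption (the struct and names here are illustrative only, not DeepCL types):

    #include <cstring>
    #include <stdexcept>

    // Header layout NorbLoader::loadImages expects at the start of a -dat.mat file.
    struct NorbDatHeader {
        unsigned int magic;              // 0x1e3d4c55 => matrix of unsigned bytes
        unsigned int ndim;               // number of dimensions
        unsigned int n;                  // number of examples
        unsigned int numPlanes;          // planes per example
        unsigned int imageSize;          // rows
        unsigned int imageSizeRepeated;  // cols; must equal rows here
    };

    void checkNorbDatHeader(const char *headerBytes) {
        NorbDatHeader header;
        std::memcpy(&header, headerBytes, sizeof(header));  // little-endian host assumed
        if (header.magic != 0x1e3d4c55) {
            throw std::runtime_error("not a NORB unsigned-byte data file");
        }
        if (header.imageSize != header.imageSizeRepeated) {
            throw std::runtime_error("images are not square");
        }
    }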
diff --git a/src/loss/CrossEntropyLoss.cpp b/src/loss/CrossEntropyLoss.cpp
index c5041661..fdd09268 100644
--- a/src/loss/CrossEntropyLoss.cpp
+++ b/src/loss/CrossEntropyLoss.cpp
@@ -14,13 +14,13 @@ using namespace std;
 #define VIRTUAL
 #define STATIC
-CrossEntropyLoss::CrossEntropyLoss( Layer *previousLayer, CrossEntropyLossMaker *maker ) :
-    LossLayer( previousLayer, maker ),
-    gradInput( 0 ),
-    allocatedSize( 0 ) {
+CrossEntropyLoss::CrossEntropyLoss(Layer *previousLayer, CrossEntropyLossMaker *maker) :
+    LossLayer(previousLayer, maker),
+    gradInput(0),
+    allocatedSize(0) {
 }
 VIRTUAL CrossEntropyLoss::~CrossEntropyLoss(){
-    if( gradInput != 0 ) {
+    if(gradInput != 0) {
         delete[] gradInput;
     }
 }
@@ -30,41 +30,41 @@ VIRTUAL std::string CrossEntropyLoss::getClassName() const {
 VIRTUAL float*CrossEntropyLoss::getGradInput() {
     return gradInput;
 }
-VIRTUAL int CrossEntropyLoss::getPersistSize( int version ) const {
+VIRTUAL int CrossEntropyLoss::getPersistSize(int version) const {
     return 0;
 }
-VIRTUAL float CrossEntropyLoss::calcLoss( float const *expected ) {
+VIRTUAL float CrossEntropyLoss::calcLoss(float const *expected) {
     float loss = 0;
-    int inputSize = previousLayer->getOutputSize();
+    int inputNumElements = previousLayer->getOutputNumElements();
     float *input = previousLayer->getOutput();
 //    cout << "CrossEntropyLoss::calcLoss" << endl;
-    for( int i = 0; i < inputSize; i++ ) {
+    for(int i = 0; i < inputNumElements; i++) {
         float expectedOutput = expected[i];
         float inputValue = input[i];
-        float negthisloss = expectedOutput * log( inputValue )
-            + ( 1 - expectedOutput ) * log( 1 - inputValue );
+        float negthisloss = expectedOutput * log(inputValue)
+            + (1 - expectedOutput) * log(1 - inputValue);
         loss -= negthisloss;
     }
     return loss;
 }
-VIRTUAL void CrossEntropyLoss::setBatchSize( int batchSize ) {
-    if( batchSize <= allocatedSize ) {
+VIRTUAL void CrossEntropyLoss::setBatchSize(int batchSize) {
+    if(batchSize <= allocatedSize) {
         this->batchSize = batchSize;
         return;
     }
-    if( gradInput != 0 ) {
+    if(gradInput != 0) {
         delete[] gradInput;
     }
-    gradInput = new float[ batchSize * previousLayer->getOutputSize() ];
+    gradInput = new float[ batchSize * previousLayer->getOutputNumElements() ];
     this->batchSize = batchSize;
     allocatedSize = batchSize;
 }
 // just do naively for now, then add sigmoid short-cutting later
-VIRTUAL void CrossEntropyLoss::calcGradInput( float const*expectedOutput ) {
-    int inputSize = previousLayer->getOutputSize();
+VIRTUAL void CrossEntropyLoss::calcGradInput(float const*expectedOutput) {
+    int inputNumElements = previousLayer->getOutputNumElements();
     float *input = previousLayer->getOutput();
-    for( int i = 0; i < inputSize; i++ ) {
-        gradInput[i] = ( input[i] - expectedOutput[i] ) / input[i] / ( 1.0f - input[i] );
+    for(int i = 0; i < inputNumElements; i++) {
+        gradInput[i] = (input[i] - expectedOutput[i]) / input[i] / (1.0f - input[i]);
     }
 }
diff --git a/src/loss/CrossEntropyLoss.h b/src/loss/CrossEntropyLoss.h
index 94a12b7c..3a4dfe3c 100644
--- a/src/loss/CrossEntropyLoss.h
+++ b/src/loss/CrossEntropyLoss.h
@@ -26,14 +26,14 @@ class CrossEntropyLoss : public LossLayer {
     // cog_addheaders.add()
     // ]]]
     // generated, using cog:
-    CrossEntropyLoss( Layer *previousLayer, CrossEntropyLossMaker *maker );
+    CrossEntropyLoss(Layer *previousLayer, CrossEntropyLossMaker *maker);
     VIRTUAL ~CrossEntropyLoss();
     VIRTUAL std::string getClassName() const;
     VIRTUAL float*getGradInput();
-    VIRTUAL int getPersistSize( int version ) const;
-    VIRTUAL float calcLoss( float const *expected );
-    VIRTUAL void setBatchSize( int batchSize );
-    VIRTUAL void calcGradInput( float const*expectedOutput );
+    VIRTUAL int getPersistSize(int version) const;
+    VIRTUAL float calcLoss(float const *expected);
+    VIRTUAL void setBatchSize(int batchSize);
+    VIRTUAL void calcGradInput(float const*expectedOutput);
     // [[[end]]]
 };
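The gradient in CrossEntropyLoss::calcGradInput above is the derivative of the binary cross-entropy -(t*log(x) + (1-t)*log(1-x)) with respect to the output x; the "sigmoid short-cutting" mentioned in the comment works because multiplying by the sigmoid derivative x*(1-x) cancels the denominator, leaving just x - t. A quick numeric check (values arbitrary, not from DeepCL):

    #include <cstdio>

    int main() {
        float x = 0.8f;  // layer output, assumed to come from a sigmoid
        float t = 1.0f;  // expected value
        float gradWrtOutput = (x - t) / x / (1.0f - x);               // -1.25
        float gradWrtPreActivation = gradWrtOutput * x * (1.0f - x);  // x - t = -0.2
        printf("%f %f\n", gradWrtOutput, gradWrtPreActivation);
        return 0;
    }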
diff --git a/src/loss/IAcceptsLabels.h b/src/loss/IAcceptsLabels.h
index 12f27f6a..84415034 100644
--- a/src/loss/IAcceptsLabels.h
+++ b/src/loss/IAcceptsLabels.h
@@ -10,9 +10,9 @@ class OutputData;
 class IAcceptsLabels {
 public:
-    virtual float calcLossFromLabels( int const*labels ) = 0;
-    virtual void calcGradInputFromLabels( int const*labels ) = 0;
-    virtual int calcNumRight( int const*labels ) = 0;
+    virtual float calcLossFromLabels(int const*labels) = 0;
+    virtual void calcGradInputFromLabels(int const*labels) = 0;
+    virtual int calcNumRight(int const*labels) = 0;
     virtual int getNumLabelsPerExample() = 0;
 };
diff --git a/src/loss/LossLayer.cpp b/src/loss/LossLayer.cpp
index b527813e..aed7ce8e 100644
--- a/src/loss/LossLayer.cpp
+++ b/src/loss/LossLayer.cpp
@@ -15,8 +15,8 @@ using namespace std;
 #define VIRTUAL
 #define STATIC
-LossLayer::LossLayer( Layer *previousLayer, LossLayerMaker *maker ) :
-    Layer( previousLayer, maker ) {
+LossLayer::LossLayer(Layer *previousLayer, LossLayerMaker *maker) :
+    Layer(previousLayer, maker) {
 }
 VIRTUAL void LossLayer::forward() {
 }
@@ -26,14 +26,14 @@ VIRTUAL bool LossLayer::needsBackProp() {
 VIRTUAL float *LossLayer::getOutput() {
     return previousLayer->getOutput();
 }
-VIRTUAL int LossLayer::getOutputSize() const {
-    return previousLayer->getOutputSize();
+VIRTUAL int LossLayer::getOutputNumElements() const {
+    return previousLayer->getOutputNumElements();
 }
 VIRTUAL int LossLayer::getOutputCubeSize() const {
     return previousLayer->getOutputCubeSize();
 }
-VIRTUAL int LossLayer::getOutputImageSize() const {
-    return previousLayer->getOutputImageSize();
+VIRTUAL int LossLayer::getOutputSize() const {
+    return previousLayer->getOutputSize();
 }
 VIRTUAL int LossLayer::getOutputPlanes() const {
     return previousLayer->getOutputPlanes();
@@ -42,42 +42,42 @@ VIRTUAL int LossLayer::getWeightsSize() const {
     return previousLayer->getWeightsSize();
 }
-VIRTUAL float LossLayer::calcLoss( OutputData *outputData ) {
-    ExpectedData *expectedData = dynamic_cast< ExpectedData * >( outputData );
-    LabeledData *labeledData = dynamic_cast< LabeledData * >( outputData );
-    if( expectedData != 0 ) {
-        return this->calcLoss( expectedData->expected );
-    } else if( labeledData != 0 ) {
-        IAcceptsLabels *labeled = dynamic_cast< IAcceptsLabels * >( this );
-        return labeled->calcLossFromLabels( labeledData->labels );
+VIRTUAL float LossLayer::calcLoss(OutputData *outputData) {
+    ExpectedData *expectedData = dynamic_cast< ExpectedData * >(outputData);
+    LabeledData *labeledData = dynamic_cast< LabeledData * >(outputData);
+    if(expectedData != 0) {
+        return this->calcLoss(expectedData->expected);
+    } else if(labeledData != 0) {
+        IAcceptsLabels *labeled = dynamic_cast< IAcceptsLabels * >(this);
+        return labeled->calcLossFromLabels(labeledData->labels);
     } else {
-        throw runtime_error( "OutputData child class not implemeneted in LossLayer::calcLoss" );
+        throw runtime_error("OutputData child class not implemeneted in LossLayer::calcLoss");
     }
 }
-VIRTUAL void LossLayer::calcGradInput( OutputData *outputData ) {
-    ExpectedData *expectedData = dynamic_cast< ExpectedData * >( outputData );
-    LabeledData *labeledData = dynamic_cast< LabeledData * >( outputData );
-    if( expectedData != 0 ) {
-        this->calcGradInput( expectedData->expected );
-    } else if( labeledData != 0 ) {
-        IAcceptsLabels *labeled = dynamic_cast< IAcceptsLabels * >( this );
-        labeled->calcGradInputFromLabels( labeledData->labels );
+VIRTUAL void LossLayer::calcGradInput(OutputData *outputData) {
+    ExpectedData *expectedData = dynamic_cast< ExpectedData * >(outputData);
+    LabeledData *labeledData = dynamic_cast< LabeledData * >(outputData);
+    if(expectedData != 0) {
+        this->calcGradInput(expectedData->expected);
+    } else if(labeledData != 0) {
+        IAcceptsLabels *labeled = dynamic_cast< IAcceptsLabels * >(this);
+        labeled->calcGradInputFromLabels(labeledData->labels);
     } else {
-        throw runtime_error( "OutputData child class not implemeneted in LossLayer::calcGradInput" );
+        throw runtime_error("OutputData child class not implemeneted in LossLayer::calcGradInput");
     }
 }
-VIRTUAL int LossLayer::calcNumRight( OutputData *outputData ) {
-    ExpectedData *expectedData = dynamic_cast< ExpectedData * >( outputData );
-    LabeledData *labeledData = dynamic_cast< LabeledData * >( outputData );
-    if( expectedData != 0 ) {
+VIRTUAL int LossLayer::calcNumRight(OutputData *outputData) {
+    ExpectedData *expectedData = dynamic_cast< ExpectedData * >(outputData);
+    LabeledData *labeledData = dynamic_cast< LabeledData * >(outputData);
+    if(expectedData != 0) {
         return 0; // how are we going to calculate num right, if not labeled?
-    } else if( labeledData != 0 ) {
-        IAcceptsLabels *labeled = dynamic_cast< IAcceptsLabels * >( this );
-        return labeled->calcNumRight( labeledData->labels );
+    } else if(labeledData != 0) {
+        IAcceptsLabels *labeled = dynamic_cast< IAcceptsLabels * >(this);
+        return labeled->calcNumRight(labeledData->labels);
     } else {
-        throw runtime_error( "OutputData child class not implemeneted in LossLayer::calcNumRight" );
+        throw runtime_error("OutputData child class not implemeneted in LossLayer::calcNumRight");
     }
 }
diff --git a/src/loss/LossLayer.h b/src/loss/LossLayer.h
index f7bef4f5..9e89f8c8 100644
--- a/src/loss/LossLayer.h
+++ b/src/loss/LossLayer.h
@@ -12,26 +12,26 @@ class OutputData;
 class LossLayer : public Layer {
 public:
-    virtual float calcLoss( float const*expectedValue ) = 0;
-    virtual void calcGradInput( float const*expectedOutput ) = 0;
+    virtual float calcLoss(float const*expectedValue) = 0;
+    virtual void calcGradInput(float const*expectedOutput) = 0;
     // [[[cog
     // import cog_addheaders
     // cog_addheaders.add()
     // ]]]
     // generated, using cog:
-    LossLayer( Layer *previousLayer, LossLayerMaker *maker );
+    LossLayer(Layer *previousLayer, LossLayerMaker *maker);
     VIRTUAL void forward();
     VIRTUAL bool needsBackProp();
     VIRTUAL float *getOutput();
-    VIRTUAL int getOutputSize() const;
+    VIRTUAL int getOutputNumElements() const;
     VIRTUAL int getOutputCubeSize() const;
-    VIRTUAL int getOutputImageSize() const;
+    VIRTUAL int getOutputSize() const;
     VIRTUAL int getOutputPlanes() const;
     VIRTUAL int getWeightsSize() const;
-    VIRTUAL float calcLoss( OutputData *outputData );
-    VIRTUAL void calcGradInput( OutputData *outputData );
-    VIRTUAL int calcNumRight( OutputData *outputData );
+    VIRTUAL float calcLoss(OutputData *outputData);
+    VIRTUAL void calcGradInput(OutputData *outputData);
+    VIRTUAL int calcNumRight(OutputData *outputData);
     // [[[end]]]
 };
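The three LossLayer methods above share one dispatch idiom: they probe the runtime type of OutputData with dynamic_cast, treating ExpectedData as dense float targets and LabeledData as integer class labels routed through the IAcceptsLabels interface. A condensed sketch of that pattern with stand-in types (not the real DeepCL classes):

    #include <stdexcept>

    struct OutputData { virtual ~OutputData() {} };
    struct ExpectedData : OutputData { float *expected; };
    struct LabeledData : OutputData { int *labels; };

    static float lossFromExpected(const float *) { return 0.0f; }  // stand-in
    static float lossFromLabels(const int *) { return 0.0f; }      // stand-in

    float calcLossDispatch(OutputData *outputData) {
        if (ExpectedData *e = dynamic_cast< ExpectedData * >(outputData)) {
            return lossFromExpected(e->expected);  // dense targets
        } else if (LabeledData *l = dynamic_cast< LabeledData * >(outputData)) {
            return lossFromLabels(l->labels);      // integer class labels
        }
        throw std::runtime_error("unknown OutputData subclass");
    }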
diff --git a/src/loss/MultinomialCrossEntropy.cpp b/src/loss/MultinomialCrossEntropy.cpp
index 94a0bfde..ba82b206 100644
--- a/src/loss/MultinomialCrossEntropy.cpp
+++ b/src/loss/MultinomialCrossEntropy.cpp
@@ -14,38 +14,38 @@ using namespace std;
 #define VIRTUAL
 #define STATIC
-MultinomialCrossEntropy::MultinomialCrossEntropy( Layer *previousLayer, MultinomialCrossEntropyMaker const*maker ) :
-    LossLayer( previousLayer, maker ),
-    errors( 0 ),
-    allocatedSize( 0 ) {
+MultinomialCrossEntropy::MultinomialCrossEntropy(Layer *previousLayer, MultinomialCrossEntropyMaker const*maker) :
+    LossLayer(previousLayer, maker),
+    errors(0),
+    allocatedSize(0) {
 }
 VIRTUAL MultinomialCrossEntropy::~MultinomialCrossEntropy(){
-    if( errors != 0 ) {
+    if(errors != 0) {
         delete[] errors;
     }
 }
 VIRTUAL float*MultinomialCrossEntropy::getGradInput() {
     return errors;
 }
-VIRTUAL float MultinomialCrossEntropy::calcLoss( float const *expected ) {
+VIRTUAL float MultinomialCrossEntropy::calcLoss(float const *expected) {
     float loss = 0;
     float *output = getOutput();
 //    cout << "MultinomialCrossEntropy::calcLoss" << endl;
     // this is matrix subtraction, then element-wise square, then aggregation
     int numPlanes = previousLayer->getOutputPlanes();
-    int imageSize = previousLayer->getOutputImageSize();
-    for( int imageId = 0; imageId < batchSize; imageId++ ) {
-        for( int plane = 0; plane < numPlanes; plane++ ) {
-            for( int outRow = 0; outRow < imageSize; outRow++ ) {
-                for( int outCol = 0; outCol < imageSize; outCol++ ) {
-                    int resultOffset = ( ( imageId
-                         * numPlanes + plane )
-                         * imageSize + outRow )
+    int imageSize = previousLayer->getOutputSize();
+    for(int imageId = 0; imageId < batchSize; imageId++) {
+        for(int plane = 0; plane < numPlanes; plane++) {
+            for(int outRow = 0; outRow < imageSize; outRow++) {
+                for(int outCol = 0; outCol < imageSize; outCol++) {
+                    int resultOffset = (( imageId
+                         * numPlanes + plane)
+                         * imageSize + outRow)
                          * imageSize + outCol;
-                    // int resultOffset = getResultIndex( imageId, plane, outRow, outCol ); //imageId * numPlanes + out;
+                    // int resultOffset = getResultIndex(imageId, plane, outRow, outCol); //imageId * numPlanes + out;
                     float expectedOutput = expected[resultOffset];
                     float actualOutput = output[resultOffset];
-                    float negthisloss = expectedOutput * log( actualOutput );
+                    float negthisloss = expectedOutput * log(actualOutput);
                     loss -= negthisloss;
                 }
             }
@@ -55,26 +55,26 @@ VIRTUAL float MultinomialCrossEntropy::calcLoss( float const *expected ) {
 //    cout << "loss " << loss << endl;
     return loss;
 }
-VIRTUAL void MultinomialCrossEntropy::setBatchSize( int batchSize ) {
-    if( batchSize <= allocatedSize ) {
+VIRTUAL void MultinomialCrossEntropy::setBatchSize(int batchSize) {
+    if(batchSize <= allocatedSize) {
         this->batchSize = batchSize;
         return;
     }
-    if( errors != 0 ) {
+    if(errors != 0) {
         delete[] errors;
     }
-    errors = new float[ batchSize * previousLayer->getOutputSize() ];
+    errors = new float[ batchSize * previousLayer->getOutputNumElements() ];
     this->batchSize = batchSize;
     allocatedSize = batchSize;
 }
 // just do naively for now, then add sigmoid short-cutting later
-VIRTUAL void MultinomialCrossEntropy::calcGradInput( float const*expectedOutput ) {
+VIRTUAL void MultinomialCrossEntropy::calcGradInput(float const*expectedOutput) {
     ActivationFunction const*fn = previousLayer->getActivationFunction();
-    int outputSize = previousLayer->getOutputSize();
+    int outputNumElements = previousLayer->getOutputNumElements();
     float *output = previousLayer->getOutput();
-    for( int i = 0; i < outputSize; i++ ) {
+    for(int i = 0; i < outputNumElements; i++) {
         float result = output[i];
-        float partialOutBySum = fn->calcDerivative( result );
+        float partialOutBySum = fn->calcDerivative(result);
         float partialLossByOut = - expectedOutput[i] / result;
         errors[i] = partialLossByOut * partialOutBySum;
     }
diff --git a/src/loss/MultinomialCrossEntropy.h b/src/loss/MultinomialCrossEntropy.h
index 3428276f..1c442acc 100644
--- a/src/loss/MultinomialCrossEntropy.h
+++ b/src/loss/MultinomialCrossEntropy.h
@@ -26,12 +26,12 @@ class MultinomialCrossEntropy : public LossLayer {
     // cog_addheaders.add()
     // ]]]
     // generated, using cog:
-    MultinomialCrossEntropy( Layer *previousLayer, MultinomialCrossEntropyMaker const*maker );
+    MultinomialCrossEntropy(Layer *previousLayer, MultinomialCrossEntropyMaker const*maker);
     VIRTUAL ~MultinomialCrossEntropy();
     VIRTUAL float*getGradInput();
-    VIRTUAL float calcLoss( float const *expected );
-    VIRTUAL void setBatchSize( int batchSize );
-    VIRTUAL void calcGradInput( float const*expectedOutput );
+    VIRTUAL float calcLoss(float const *expected);
+    VIRTUAL void setBatchSize(int batchSize);
+    VIRTUAL void calcGradInput(float const*expectedOutput);
     // [[[end]]]
 };
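The quadruple loop in MultinomialCrossEntropy::calcLoss linearizes a (batch, plane, row, col) coordinate the usual NCHW way, with the renamed getOutputSize (formerly getOutputImageSize) supplying the image edge length. A tiny sketch of that offset arithmetic (names illustrative):

    #include <cassert>

    // Same offset expression as resultOffset in calcLoss above.
    inline int nchwOffset(int imageId, int plane, int row, int col,
                          int numPlanes, int imageSize) {
        return ((imageId * numPlanes + plane) * imageSize + row) * imageSize + col;
    }

    int main() {
        // 2 planes of 3x3: (image 1, plane 0, row 2, col 1) => 1*2*9 + 0*9 + 2*3 + 1 = 25
        assert(nchwOffset(1, 0, 2, 1, 2, 3) == 25);
        return 0;
    }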
diff --git a/src/loss/SoftMaxLayer.cpp b/src/loss/SoftMaxLayer.cpp
index 309f05a0..6ca01548 100644
--- a/src/loss/SoftMaxLayer.cpp
+++ b/src/loss/SoftMaxLayer.cpp
@@ -16,23 +16,23 @@ using namespace std;
 #undef STATIC
 #define STATIC
-SoftMaxLayer::SoftMaxLayer( Layer *previousLayer, SoftMaxMaker *maker ) :
-    LossLayer( previousLayer, maker ),
-    perPlane( maker->_perPlane ),
-    imageSize( previousLayer->getOutputImageSize() ),
-    numPlanes( previousLayer->getOutputPlanes() ),
-    imageSizeSquared( previousLayer->getOutputImageSize() * previousLayer->getOutputImageSize() ),
-    output( 0 ),
-    gradInput( 0 ),
-    allocatedSize( 0 ),
-    batchSize( 0 )
+SoftMaxLayer::SoftMaxLayer(Layer *previousLayer, SoftMaxMaker *maker) :
+    LossLayer(previousLayer, maker),
+    perPlane(maker->_perPlane),
+    imageSize(previousLayer->getOutputSize()),
+    numPlanes(previousLayer->getOutputPlanes()),
+    imageSizeSquared(previousLayer->getOutputSize() * previousLayer->getOutputSize()),
+    output(0),
+    gradInput(0),
+    allocatedSize(0),
+    batchSize(0)
 {
 }
 VIRTUAL SoftMaxLayer::~SoftMaxLayer() {
-    if( gradInput != 0 ) {
+    if(gradInput != 0) {
         delete[] gradInput;
     }
-    if( output != 0 ) {
+    if(output != 0) {
         delete[] output;
     }
 }
@@ -45,60 +45,60 @@ VIRTUAL float *SoftMaxLayer::getOutput() {
 VIRTUAL float *SoftMaxLayer::getGradInput() {
     return gradInput;
 }
-VIRTUAL void SoftMaxLayer::setBatchSize( int batchSize ) {
+VIRTUAL void SoftMaxLayer::setBatchSize(int batchSize) {
     this->batchSize = batchSize;
-    if( batchSize <= this->allocatedSize ) {
+    if(batchSize <= this->allocatedSize) {
         return;
     }
-    if( output != 0 ) {
+    if(output != 0) {
         delete[] output;
     }
-    if( gradInput != 0 ) {
+    if(gradInput != 0) {
         delete[] gradInput;
     }
-    output = new float[ getOutputSize() ];
-    gradInput = new float[ previousLayer-> getOutputSize() ];
+    output = new float[ getOutputNumElements() ];
+    gradInput = new float[ previousLayer-> getOutputNumElements() ];
     allocatedSize = batchSize;
 }
 // need to calculate multinomial logistic /cross-entropy loss
-VIRTUAL float SoftMaxLayer::calcLossFromLabels( int const *labels ) {
+VIRTUAL float SoftMaxLayer::calcLossFromLabels(int const *labels) {
 //    cout << "softmaxlayer::calcloss" << endl;
     StatefulTimer::timeCheck("start SoftMaxLayer calcLossfromlabels");
     float loss = 0;
-    if( perPlane ) {
-        for( int n = 0; n < batchSize; n++ ) {
-            for( int plane = 0; plane < numPlanes; plane++ ) {
+    if(perPlane) {
+        for(int n = 0; n < batchSize; n++) {
+            for(int plane = 0; plane < numPlanes; plane++) {
                 int label = labels[n * numPlanes + plane];
-                int imageOffset = ( n * numPlanes + plane ) * imageSizeSquared;
-                loss += - log( output[ imageOffset + label ] );
+                int imageOffset = (n * numPlanes + plane) * imageSizeSquared;
+                loss += - log(output[ imageOffset + label ]);
             }
         }
    } else {
         // force imagesize of 1 for now
-        if( imageSize != 1 ) {
+        if(imageSize != 1) {
             throw std::runtime_error("perColumn only supported for imagesize 1 for now. Sit tight :-) (But please raise an issue to highlight your need)");
         }
-        for( int n = 0; n < batchSize; n++ ) {
+        for(int n = 0; n < batchSize; n++) {
             int imageOffset = n * numPlanes * imageSizeSquared;
             int label = labels[n];
-            loss += - log( output[imageOffset + label] );
+            loss += - log(output[imageOffset + label]);
         }
     }
     StatefulTimer::timeCheck("end SoftMaxLayer calcLossfromlabels");
     return loss;
 }
 // need to calculate multinomial logistic /cross-entropy loss
-VIRTUAL float SoftMaxLayer::calcLoss( float const *expectedValues ) {
+VIRTUAL float SoftMaxLayer::calcLoss(float const *expectedValues) {
     StatefulTimer::timeCheck("start SoftMaxLayer calcLoss");
     float loss = 0;
-    if( perPlane ) {
-        for( int n = 0; n < batchSize; n++ ) {
-            for( int plane = 0; plane < numPlanes; plane++ ) {
-                int imageOffset = ( n * numPlanes + plane ) * imageSizeSquared;
-                for( int i = 0; i < imageSizeSquared; i++ ) {
-                    if( expectedValues[ imageOffset + i ] != 0 ) {
-                        float thisloss = - expectedValues[ imageOffset + i ] * log( output[ imageOffset + i ] );
+    if(perPlane) {
+        for(int n = 0; n < batchSize; n++) {
+            for(int plane = 0; plane < numPlanes; plane++) {
+                int imageOffset = (n * numPlanes + plane) * imageSizeSquared;
+                for(int i = 0; i < imageSizeSquared; i++) {
+                    if(expectedValues[ imageOffset + i ] != 0) {
+                        float thisloss = - expectedValues[ imageOffset + i ] * log(output[ imageOffset + i ]);
                         loss += thisloss;
                     }
                 }
@@ -106,13 +106,13 @@ VIRTUAL float SoftMaxLayer::calcLoss( float const *expectedValues ) {
         }
     } else {
         // force imagesize of 1 for now
-        if( imageSize != 1 ) {
+        if(imageSize != 1) {
             throw std::runtime_error("perColumn only supported for imagesize 1 for now. Sit tight :-) (But please raise an issue to highlight your need)");
         }
-        for( int n = 0; n < batchSize; n++ ) {
+        for(int n = 0; n < batchSize; n++) {
             int imageOffset = n * numPlanes * imageSizeSquared;
-            for( int plane = 0; plane < numPlanes; plane++ ) {
-                float thisloss = - expectedValues[imageOffset + plane] * log( output[imageOffset + plane] );
+            for(int plane = 0; plane < numPlanes; plane++) {
+                float thisloss = - expectedValues[imageOffset + plane] * log(output[imageOffset + plane]);
                 loss += thisloss;
             }
         }
@@ -123,15 +123,15 @@ VIRTUAL float SoftMaxLayer::calcLoss( float const *expectedValues ) {
 // calculate partial deriv loss wrt our inputs, in other words, product of
 // (multinomial cross-entropy) loss derivative wrt our output, and
 // derivative of softmax wrt our inputs
-VIRTUAL void SoftMaxLayer::calcGradInputFromLabels( int const *labels ) {
+VIRTUAL void SoftMaxLayer::calcGradInputFromLabels(int const *labels) {
 //    cout << "softmaxlayer::calcerrors" << endl;
     StatefulTimer::timeCheck("start SoftMaxLayer calcGradInputfromlabels");
-    if( perPlane ) {
-        for( int n = 0; n < batchSize; n++ ) {
-            for( int plane = 0; plane < numPlanes; plane++ ) {
-                int imageOffset = ( n * numPlanes + plane ) * imageSizeSquared;
+    if(perPlane) {
+        for(int n = 0; n < batchSize; n++) {
+            for(int plane = 0; plane < numPlanes; plane++) {
+                int imageOffset = (n * numPlanes + plane) * imageSizeSquared;
                 int label = labels[n * numPlanes + plane];
-                for( int i = 0; i < imageSizeSquared; i++ ) {
+                for(int i = 0; i < imageSizeSquared; i++) {
                     gradInput[imageOffset + i] = output[imageOffset + i];
                 }
                 gradInput[imageOffset + label] -= 1;
@@ -139,19 +139,19 @@ VIRTUAL void SoftMaxLayer::calcGradInputFromLabels( int const *labels ) {
         }
     } else {
         // force imagesize of 1 for now
-        if( imageSize != 1 ) {
+        if(imageSize != 1) {
             throw std::runtime_error("perColumn only supported for imagesize 1 for now. Sit tight :-) (But please raise an issue to highlight your need)");
         }
-        for( int n = 0; n < batchSize; n++ ) {
+        for(int n = 0; n < batchSize; n++) {
             int imageOffset = n * numPlanes * imageSizeSquared;
             int label = labels[n];
-            for( int plane = 0; plane < numPlanes; plane++ ) {
+            for(int plane = 0; plane < numPlanes; plane++) {
                 gradInput[imageOffset + plane] = output[imageOffset + plane];
             }
-            if( label >= numPlanes ) {
-                throw runtime_error("Label " + toString( label ) + " exceeds number of softmax planes " + toString( numPlanes ) );
-            } else if( label < 0 ) {
-                throw runtime_error("Label " + toString( label ) + " negative" );
+            if(label >= numPlanes) {
+                throw runtime_error("Label " + toString(label) + " exceeds number of softmax planes " + toString(numPlanes) );
+            } else if(label < 0) {
+                throw runtime_error("Label " + toString(label) + " negative");
             }
             gradInput[imageOffset + label] -= 1;
         }
@@ -161,14 +161,14 @@ VIRTUAL void SoftMaxLayer::calcGradInputFromLabels( int const *labels ) {
 // calculate partial deriv loss wrt our inputs, in other words, product of
 // (multinomial cross-entropy) loss derivative wrt our output, and
 // derivative of softmax wrt our inputs
-VIRTUAL void SoftMaxLayer::calcGradInput( float const *expectedValues ) {
+VIRTUAL void SoftMaxLayer::calcGradInput(float const *expectedValues) {
 //    cout << "softmaxlayer::calcerrors" << endl;
     StatefulTimer::timeCheck("start SoftMaxLayer calcGradInput");
-    if( perPlane ) {
-        for( int n = 0; n < batchSize; n++ ) {
-            for( int plane = 0; plane < numPlanes; plane++ ) {
-                int imageOffset = ( n * numPlanes + plane ) * imageSizeSquared;
-                for( int i = 0; i < imageSizeSquared; i++ ) {
+    if(perPlane) {
+        for(int n = 0; n < batchSize; n++) {
+            for(int plane = 0; plane < numPlanes; plane++) {
+                int imageOffset = (n * numPlanes + plane) * imageSizeSquared;
+                for(int i = 0; i < imageSizeSquared; i++) {
                     int resultIndex = imageOffset + i;
                     gradInput[resultIndex] = output[resultIndex] - expectedValues[resultIndex];
                 }
@@ -176,12 +176,12 @@ VIRTUAL void SoftMaxLayer::calcGradInput( float const *expectedValues ) {
         }
     } else {
         // force imagesize of 1 for now
-        if( imageSize != 1 ) {
+        if(imageSize != 1) {
             throw std::runtime_error("perColumn only supported for imagesize 1 for now. Sit tight :-) (But please raise an issue to highlight your need)");
         }
-        for( int n = 0; n < batchSize; n++ ) {
+        for(int n = 0; n < batchSize; n++) {
             int imageOffset = n * numPlanes * imageSizeSquared;
-            for( int plane = 0; plane < numPlanes; plane++ ) {
+            for(int plane = 0; plane < numPlanes; plane++) {
                 int resultIndex = imageOffset + plane;
                 gradInput[resultIndex] = output[resultIndex] - expectedValues[resultIndex];
             }
@@ -190,33 +190,33 @@ VIRTUAL void SoftMaxLayer::calcGradInput( float const *expectedValues ) {
     StatefulTimer::timeCheck("end SoftMaxLayer calcGradInput");
 }
 VIRTUAL int SoftMaxLayer::getNumLabelsPerExample() {
-    if( perPlane ) {
+    if(perPlane) {
         return numPlanes;
     } else {
         return imageSizeSquared;
     }
 }
-VIRTUAL int SoftMaxLayer::getPersistSize( int version ) const {
+VIRTUAL int SoftMaxLayer::getPersistSize(int version) const {
     return 0;
 }
-VIRTUAL int SoftMaxLayer::calcNumRight( int const*labels ) {
+VIRTUAL int SoftMaxLayer::calcNumRight(int const*labels) {
     StatefulTimer::timeCheck("start SoftMaxLayer calcNumRight");
 //    float *input = previousLayer->getOutput(); // just retrieve as host-side array for now
     int numRight = 0;
-    if( perPlane ) {
-        for( int n = 0; n < batchSize; n++ ) {
-            for( int plane = 0; plane < numPlanes; plane++ ) {
-                int imageOffset = ( n * numPlanes + plane ) * imageSizeSquared;
+    if(perPlane) {
+        for(int n = 0; n < batchSize; n++) {
+            for(int plane = 0; plane < numPlanes; plane++) {
+                int imageOffset = (n * numPlanes + plane) * imageSizeSquared;
                 int label = labels[n * numPlanes + plane];
                 float thisMax = output[imageOffset + 0];
                 int iMax = 0;
-                for( int i = 1; i < imageSizeSquared; i++ ) {
-                    if( output[imageOffset + i] > thisMax ) {
+                for(int i = 1; i < imageSizeSquared; i++) {
+                    if(output[imageOffset + i] > thisMax) {
                         thisMax = output[imageOffset + i];
                         iMax = i;
                     }
                 }
-                if( label == iMax ) {
+                if(label == iMax) {
 //                    cout << "n " << n << " plane " << plane << " label " << label << endl;
                     numRight++;
                 }
@@ -224,21 +224,21 @@ VIRTUAL int SoftMaxLayer::calcNumRight( int const*labels ) {
         }
     } else {
         // force imagesize of 1 for now
-        if( imageSize != 1 ) {
+        if(imageSize != 1) {
            throw std::runtime_error("perColumn only supported for imagesize 1 for now. Sit tight :-) (But please raise an issue to highlight your need)");
         }
-        for( int n = 0; n < batchSize; n++ ) {
+        for(int n = 0; n < batchSize; n++) {
            int imageOffset = n * numPlanes * imageSizeSquared;
            int label = labels[n];
            float thisMax = output[imageOffset + 0];
            int iMax = 0;
-            for( int i = 1; i < numPlanes; i++ ) {
-                if( output[imageOffset + i] > thisMax ) {
+            for(int i = 1; i < numPlanes; i++) {
+                if(output[imageOffset + i] > thisMax) {
                     thisMax = output[imageOffset + i];
                     iMax = i;
                 }
             }
-            if( label == iMax ) {
+            if(label == iMax) {
                 numRight++;
             }
         }
@@ -252,61 +252,61 @@ VIRTUAL void SoftMaxLayer::forward() {
 //    cout << "softmaxlayer::forward" << endl;
     StatefulTimer::timeCheck("start SoftMaxLayer forward");
     float *input = previousLayer->getOutput(); // just retrieve as host-side array for now
-    if( perPlane ) {
-        for( int n = 0; n < batchSize; n++ ) {
-            for( int plane = 0; plane < numPlanes; plane++ ) {
-                int imageOffset = ( n * numPlanes + plane ) * imageSizeSquared;
+    if(perPlane) {
+        for(int n = 0; n < batchSize; n++) {
+            for(int plane = 0; plane < numPlanes; plane++) {
+                int imageOffset = (n * numPlanes + plane) * imageSizeSquared;
                 float maxValue = input[imageOffset + 0];
-                for( int i = 1; i < imageSizeSquared; i++ ) {
-                    maxValue = std::max( maxValue, input[imageOffset + i] );
+                for(int i = 1; i < imageSizeSquared; i++) {
+                    maxValue = std::max(maxValue, input[imageOffset + i]);
                 }
                 float denominator = 0;
-                for( int i = 0; i < imageSizeSquared; i++ ) {
-                    denominator += exp( input[imageOffset + i] - maxValue );
+                for(int i = 0; i < imageSizeSquared; i++) {
+                    denominator += exp(input[imageOffset + i] - maxValue);
                 }
-                for( int i = 0; i < imageSizeSquared; i++ ) {
-                    output[imageOffset + i] = exp( input[imageOffset + i] - maxValue ) / denominator;
+                for(int i = 0; i < imageSizeSquared; i++) {
+                    output[imageOffset + i] = exp(input[imageOffset + i] - maxValue) / denominator;
                 }
             }
         }
     } else {
         // force imagesize of 1 for now
-        if( imageSize != 1 ) {
+        if(imageSize != 1) {
             throw std::runtime_error("perColumn only supported for imagesize 1 for now. Sit tight :-) (But please raise an issue to highlight your need)");
         }
-        for( int n = 0; n < batchSize; n++ ) {
+        for(int n = 0; n < batchSize; n++) {
             int imageOffset = n * numPlanes * imageSizeSquared;
             // first get the max
             float maxValue = input[imageOffset + 0]; // since we assume imagesize 1, this is correct
-            for( int plane = 1; plane < numPlanes; plane++ ) {
-                maxValue = std::max( maxValue, input[imageOffset + plane] );
+            for(int plane = 1; plane < numPlanes; plane++) {
+                maxValue = std::max(maxValue, input[imageOffset + plane]);
             }
             // calculate sum, under this max
             float denominator = 0;
-            for( int plane = 0; plane < numPlanes; plane++ ) {
-                denominator += exp( input[imageOffset + plane] - maxValue );
+            for(int plane = 0; plane < numPlanes; plane++) {
+                denominator += exp(input[imageOffset + plane] - maxValue);
            }
            // now calc the softmaxes:
-            for( int plane = 0; plane < numPlanes; plane++ ) {
-                output[imageOffset + plane] = exp( input[imageOffset + plane] - maxValue ) / denominator;
+            for(int plane = 0; plane < numPlanes; plane++) {
+                output[imageOffset + plane] = exp(input[imageOffset + plane] - maxValue) / denominator;
             }
         }
     }
     StatefulTimer::timeCheck("end SoftMaxLayer forward");
 }
-VIRTUAL void SoftMaxLayer::getLabels( int *labels ) { // need to allocate labels array first, and have called 'forward' first
-    if( perPlane ) {
+VIRTUAL void SoftMaxLayer::getLabels(int *labels) { // need to allocate labels array first, and have called 'forward' first
+    if(perPlane) {
         throw std::runtime_error("getLabels doesnt work with 'perPlane' option currently, though it wouldnt be hard to add, so ask if you need");
     }
-    if( imageSize != 1 ) {
+    if(imageSize != 1) {
         throw std::runtime_error("perColumn only supported for imagesize 1 for now. Sit tight :-) (But please raise an issue to highlight your need)");
     }
-    for( int n = 0; n < batchSize; n++ ) {
+    for(int n = 0; n < batchSize; n++) {
         float *outputStack = output + n * numPlanes;
         float highestProb = outputStack[0];
         int bestPlane = 0;
-        for( int plane = 1; plane < numPlanes; plane++ ) {
-            if( outputStack[plane] > highestProb ) {
+        for(int plane = 1; plane < numPlanes; plane++) {
+            if(outputStack[plane] > highestProb) {
                 bestPlane = plane;
                 highestProb = outputStack[plane];
             }
@@ -315,15 +315,15 @@ VIRTUAL void SoftMaxLayer::getLabels( int *labels ) { // need to allocate labels
     }
 }
 // this seems to be handled by calcGradInput? So, just to a nop?
-// (cos this layer kind of combines loss layer and a 'normal' propagation layer )
+// (cos this layer kind of combines loss layer and a 'normal' propagation layer)
 // certainly, we dont have any weights to update, and we already handled error
 // propagation in 'calcGradInput' method above
-VIRTUAL void SoftMaxLayer::backward( float learningRate ) {
+VIRTUAL void SoftMaxLayer::backward(float learningRate) {
 //    cout << "softmaxlayer::backproperrors" << endl;
     // nop, do nothing :-)
 }
 VIRTUAL std::string SoftMaxLayer::asString() const {
-    return "SoftMaxLayer{ perPlane=" + toString( perPlane ) + " numPlanes=" + toString( numPlanes )
-        + " imageSize=" + toString( imageSize ) + " }";
+    return "SoftMaxLayer{ perPlane=" + toString(perPlane) + " numPlanes=" + toString(numPlanes)
+        + " imageSize=" + toString(imageSize) + " }";
 }
diff --git a/src/loss/SoftMaxLayer.h b/src/loss/SoftMaxLayer.h
index bcb3d1b2..227fcfbe 100644
--- a/src/loss/SoftMaxLayer.h
+++ b/src/loss/SoftMaxLayer.h
@@ -36,22 +36,22 @@ class SoftMaxLayer : public LossLayer, public IAcceptsLabels {
     // cog_addheaders.add()
     // ]]]
     // generated, using cog:
-    SoftMaxLayer( Layer *previousLayer, SoftMaxMaker *maker );
+    SoftMaxLayer(Layer *previousLayer, SoftMaxMaker *maker);
     VIRTUAL ~SoftMaxLayer();
     VIRTUAL std::string getClassName() const;
     VIRTUAL float *getOutput();
     VIRTUAL float *getGradInput();
-    VIRTUAL void setBatchSize( int batchSize );
-    VIRTUAL float calcLossFromLabels( int const *labels );
-    VIRTUAL float calcLoss( float const *expectedValues );
-    VIRTUAL void calcGradInputFromLabels( int const *labels );
-    VIRTUAL void calcGradInput( float const *expectedValues );
+    VIRTUAL void setBatchSize(int batchSize);
+    VIRTUAL float calcLossFromLabels(int const *labels);
+    VIRTUAL float calcLoss(float const *expectedValues);
+    VIRTUAL void calcGradInputFromLabels(int const *labels);
+    VIRTUAL void calcGradInput(float const *expectedValues);
     VIRTUAL int getNumLabelsPerExample();
-    VIRTUAL int getPersistSize( int version ) const;
-    VIRTUAL int calcNumRight( int const*labels );
+    VIRTUAL int getPersistSize(int version) const;
+    VIRTUAL int calcNumRight(int const*labels);
     VIRTUAL void forward();
-    VIRTUAL void getLabels( int *labels ); // need to allocate labels array first, and have called 'forward' first
-    VIRTUAL void backward( float learningRate );
+    VIRTUAL void getLabels(int *labels); // need to allocate labels array first, and have called 'forward' first
+    VIRTUAL void backward(float learningRate);
     VIRTUAL std::string asString() const;
     // [[[end]]]
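SoftMaxLayer::forward above uses the standard max-subtraction trick: subtracting the per-example maximum before exponentiating keeps every exp argument at or below zero, so nothing overflows, and the shift cancels between numerator and denominator, leaving the probabilities unchanged. A standalone sketch of the same computation (function name is illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    void stableSoftMax(const float *input, float *output, int numPlanes) {
        float maxValue = input[0];
        for (int plane = 1; plane < numPlanes; plane++) {
            maxValue = std::max(maxValue, input[plane]);
        }
        float denominator = 0;
        for (int plane = 0; plane < numPlanes; plane++) {
            denominator += std::exp(input[plane] - maxValue);  // argument <= 0, no overflow
        }
        for (int plane = 0; plane < numPlanes; plane++) {
            output[plane] = std::exp(input[plane] - maxValue) / denominator;
        }
    }

    int main() {
        // Without the shift, exp(1002.0f) would overflow float to inf.
        const float input[3] = {1000.0f, 1001.0f, 1002.0f};
        float output[3];
        stableSoftMax(input, output, 3);
        printf("%f %f %f\n", output[0], output[1], output[2]);
        return 0;
    }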
diff --git a/src/loss/SquareLossLayer.cpp b/src/loss/SquareLossLayer.cpp
index e706b1ae..e61bbb5a 100644
--- a/src/loss/SquareLossLayer.cpp
+++ b/src/loss/SquareLossLayer.cpp
@@ -15,13 +15,13 @@ using namespace std;
 #define VIRTUAL
 #define STATIC
-SquareLossLayer::SquareLossLayer( Layer *previousLayer, SquareLossMaker *maker ) :
-    LossLayer( previousLayer, maker ),
-    gradInput( 0 ),
-    allocatedSize( 0 ) {
+SquareLossLayer::SquareLossLayer(Layer *previousLayer, SquareLossMaker *maker) :
+    LossLayer(previousLayer, maker),
+    gradInput(0),
+    allocatedSize(0) {
 }
 VIRTUAL SquareLossLayer::~SquareLossLayer(){
-    if( gradInput != 0 ) {
+    if(gradInput != 0) {
         delete[] gradInput;
     }
 }
@@ -31,16 +31,16 @@ VIRTUAL std::string SquareLossLayer::getClassName() const {
 VIRTUAL float*SquareLossLayer::getGradInput() {
     return gradInput;
 }
-VIRTUAL float SquareLossLayer::calcLoss( float const *expected ) {
+VIRTUAL float SquareLossLayer::calcLoss(float const *expected) {
     float loss = 0;
 //    float *output = getOutput();
     float *input = previousLayer->getOutput();
 //    cout << "SquareLossLayer::calcLoss" << endl;
     int numPlanes = previousLayer->getOutputPlanes();
-    int imageSize = previousLayer->getOutputImageSize();
+    int imageSize = previousLayer->getOutputSize();
     int totalLinearSize = batchSize * numPlanes * imageSize * imageSize;
-    for( int i = 0; i < totalLinearSize; i++ ) {
-//        if( i < 5 ) cout << "input[" << i << "]=" << input[i] << endl;
+    for(int i = 0; i < totalLinearSize; i++) {
+//        if(i < 5) cout << "input[" << i << "]=" << input[i] << endl;
         float diff = input[i] - expected[i];
         float diffSquared = diff * diff;
         loss += diffSquared;
@@ -49,26 +49,26 @@ VIRTUAL float SquareLossLayer::calcLoss( float const *expected ) {
 //    cout << "loss " << loss << endl;
     return loss;
 }
-VIRTUAL void SquareLossLayer::setBatchSize( int batchSize ) {
-    if( batchSize <= allocatedSize ) {
+VIRTUAL void SquareLossLayer::setBatchSize(int batchSize) {
+    if(batchSize <= allocatedSize) {
         this->batchSize = batchSize;
         return;
     }
-    if( gradInput != 0 ) {
+    if(gradInput != 0) {
         delete[] gradInput;
     }
     this->batchSize = batchSize;
     allocatedSize = batchSize;
-    gradInput = new float[ batchSize * previousLayer->getOutputSize() ];
+    gradInput = new float[ batchSize * previousLayer->getOutputNumElements() ];
 }
-VIRTUAL void SquareLossLayer::calcGradInput( float const*expectedOutput ) {
-    int inputSize = previousLayer->getOutputSize();
+VIRTUAL void SquareLossLayer::calcGradInput(float const*expectedOutput) {
+    int inputNumElements = previousLayer->getOutputNumElements();
     float *input = previousLayer->getOutput();
-    for( int i = 0; i < inputSize; i++ ) {
+    for(int i = 0; i < inputNumElements; i++) {
         gradInput[i] = input[i] - expectedOutput[i];
     }
 }
-VIRTUAL int SquareLossLayer::getPersistSize( int version ) const {
+VIRTUAL int SquareLossLayer::getPersistSize(int version) const {
     return 0;
 }
 VIRTUAL std::string SquareLossLayer::asString() const {
diff --git a/src/loss/SquareLossLayer.h b/src/loss/SquareLossLayer.h
index 8ca52563..2bc27cff 100644
--- a/src/loss/SquareLossLayer.h
+++ b/src/loss/SquareLossLayer.h
@@ -27,14 +27,14 @@ class SquareLossLayer : public LossLayer {
     // cog_addheaders.add()
     // ]]]
     // generated, using cog:
-    SquareLossLayer( Layer *previousLayer, SquareLossMaker *maker );
+    SquareLossLayer(Layer *previousLayer, SquareLossMaker *maker);
     VIRTUAL ~SquareLossLayer();
     VIRTUAL std::string getClassName() const;
     VIRTUAL float*getGradInput();
-    VIRTUAL float calcLoss( float const *expected );
-    VIRTUAL void setBatchSize( int batchSize );
-    VIRTUAL void calcGradInput( float const*expectedOutput );
-    VIRTUAL int getPersistSize( int version ) const;
+    VIRTUAL float calcLoss(float const *expected);
+    VIRTUAL void setBatchSize(int batchSize);
+    VIRTUAL void calcGradInput(float const*expectedOutput);
+    VIRTUAL int getPersistSize(int version) const;
     VIRTUAL std::string asString() const;
     // [[[end]]]
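One detail of SquareLossLayer worth noting: calcLoss reports the raw sum of squared differences, while calcGradInput returns input - expected, which is the derivative of half that sum; the conventional factor of 2 is simply dropped, as is common. A one-line numeric illustration (values arbitrary):

    #include <cstdio>

    int main() {
        float input = 0.7f, expected = 1.0f;
        float diff = input - expected;
        printf("loss term = %f, gradient = %f\n", diff * diff, diff);  // 0.09, -0.3
        return 0;
    }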
not in defaultString: defaultString += '.0' defaultString += 'f' - cog.outl( option['name'] + ' = ' + defaultString + ';') + cog.outl(option['name'] + ' = ' + defaultString + ';') */// ]]] // generated using cog: gpuIndex = -1; @@ -86,7 +87,7 @@ class Config { void go(Config config) { bool verbose = true; - if( config.outputFile == "" ) { + if(config.outputFile == "") { verbose = false; } @@ -94,18 +95,18 @@ void go(Config config) { int numPlanes; int imageSize; int imageSizeCheck; - if( config.inputFile == "" ) { + if(config.inputFile == "") { int dims[3]; - cin.read( reinterpret_cast< char * >( dims ), 3 * 4l ); + cin.read(reinterpret_cast< char * >(dims), 3 * 4l); numPlanes = dims[0]; imageSize = dims[1]; imageSizeCheck = dims[2]; - if( imageSize != imageSizeCheck ) { - throw std::runtime_error( "imageSize doesnt match imageSizeCheck, image not square" ); + if(imageSize != imageSizeCheck) { + throw std::runtime_error("imageSize doesnt match imageSizeCheck, image not square"); } } else { - GenericLoader::getDimensions( config.inputFile, &N, &numPlanes, &imageSize ); - if( verbose ) cout << "N " << N << " planes " << numPlanes << " size " << imageSize << endl; + GenericLoader::getDimensions(config.inputFile.c_str(), &N, &numPlanes, &imageSize); + if(verbose) cout << "N " << N << " planes " << numPlanes << " size " << imageSize << endl; } const long inputCubeSize = numPlanes * imageSize * imageSize ; @@ -115,11 +116,12 @@ void go(Config config) { // EasyCL *cl = 0; - if( config.gpuIndex >= 0 ) { - cl = EasyCL::createForIndexedGpu( config.gpuIndex, verbose ); + if(config.gpuIndex >= 0) { + cl = EasyCL::createForIndexedGpu(config.gpuIndex, verbose); } else { - cl = EasyCL::createForFirstGpuOtherwiseCpu( verbose ); + cl = EasyCL::createForFirstGpuOtherwiseCpu(verbose); } + ClBlasInstance blasInstance; NeuralNet *net; net = new NeuralNet(cl); @@ -127,22 +129,22 @@ void go(Config config) { // just use the default for net creation, weights are overriden from the weightsFile WeightsInitializer *weightsInitializer = new OriginalInitializer(); - if( config.weightsFile == "" ) { + if(config.weightsFile == "") { cout << "weightsFile not specified" << endl; return; } string netDef; - if ( !WeightsPersister::loadConfigString( config.weightsFile, netDef ) ){ + if (!WeightsPersister::loadConfigString(config.weightsFile, netDef) ){ cout << "Cannot load network definition from weightsFile." << endl; return; } // cout << "net def from weights file: " << netDef << endl; - net->addLayer( InputLayerMaker::instance()->numPlanes(numPlanes)->imageSize(imageSize) ); - net->addLayer( NormalizationLayerMaker::instance()->translate( 0.0f )->scale( 1.0f ) ); // This will be read from weights file + net->addLayer(InputLayerMaker::instance()->numPlanes(numPlanes)->imageSize(imageSize)); + net->addLayer(NormalizationLayerMaker::instance()->translate(0.0f)->scale(1.0f) ); // This will be read from weights file - if( !NetdefToNet::createNetFromNetdef( net, netDef, weightsInitializer ) ) { + if(!NetdefToNet::createNetFromNetdef(net, netDef, weightsInitializer) ) { return; } @@ -152,16 +154,16 @@ void go(Config config) { // weights file contains normalization layer parameters as 'weights' now. 
We should probably rename weights to parameters // sooner or later, but anyway, technically, works for now - if( !WeightsPersister::loadWeights( config.weightsFile, string("netDef=")+netDef, net, &ignI, &ignI, &ignF, &ignI, &ignF ) ){ + if(!WeightsPersister::loadWeights(config.weightsFile, string("netDef=")+netDef, net, &ignI, &ignI, &ignF, &ignI, &ignF) ){ cout << "Cannot load network weights from weightsFile." << endl; return; } - if( verbose ) { + if(verbose) { net->print(); } net->setBatchSize(config.batchSize); - if( verbose ) cout << "batchSize: " << config.batchSize << endl; + if(verbose) cout << "batchSize: " << config.batchSize << endl; // @@ -174,55 +176,55 @@ void go(Config config) { int n = 0; bool more = true; ostream *outFile = 0; - if( verbose ) cout << "outputFile: '" << config.outputFile << "'"<< endl; - if( config.outputFile == "" ) { + if(verbose) cout << "outputFile: '" << config.outputFile << "'"<< endl; + if(config.outputFile == "") { #ifdef _WIN32 // refs: // http://www.thecodingforums.com/threads/binary-output-to-stdout-in-windows.317367/ // http://www.cplusplus.com/forum/windows/77812/ - _setmode( _fileno( stdout ), _O_BINARY ); + _setmode(_fileno(stdout), _O_BINARY); #endif outFile = &cout; } else { - if( config.outputFormat == "text" ) { - outFile = new ofstream( config.outputFile, ios::out ); - } else if( config.outputFormat == "binary" ) { - outFile = new ofstream( config.outputFile, ios::out | std::ios::binary ); + if(config.outputFormat == "text") { + outFile = new ofstream(config.outputFile, ios::out); + } else if(config.outputFormat == "binary") { + outFile = new ofstream(config.outputFile, ios::out | std::ios::binary); } else { - throw runtime_error( "outputFormat " + config.outputFormat + " not recognized" ); + throw runtime_error("outputFormat " + config.outputFormat + " not recognized"); } } - if( config.outputLayer == -1 ) { + if(config.outputLayer == -1) { config.outputLayer = net->getNumLayers() - 1; } - if( verbose ) cout << "inputFile: '" << config.inputFile << "'"<< endl; - if( config.inputFile == "" ) { - cin.read( reinterpret_cast< char * >( inputData ), inputCubeSize * config.batchSize * 4l ); + if(verbose) cout << "inputFile: '" << config.inputFile << "'"<< endl; + if(config.inputFile == "") { + cin.read(reinterpret_cast< char * >(inputData), inputCubeSize * config.batchSize * 4l); more = !cin.eof(); } else { // pass 0 for labels, and this will cause GenericLoader to simply not try to load any labels // now, after modifying GenericLoader to have this new behavior - GenericLoader::load( config.inputFile, inputData, 0, n, config.batchSize ); + GenericLoader::load(config.inputFile.c_str(), inputData, 0, n, config.batchSize); } - while( more ) { + while(more) { // no point in forwarding through all, so forward through each, one by one - if( config.outputLayer < 0 || config.outputLayer > net->getNumLayers() ) { - throw runtime_error( "outputLayer should be the layer number of one of the layers in the network" ); + if(config.outputLayer < 0 || config.outputLayer > net->getNumLayers()) { + throw runtime_error("outputLayer should be the layer number of one of the layers in the network"); } - dynamic_cast< InputLayer * >( net->getLayer(0) )->in( inputData ); - for( int layerId = 0; layerId <= config.outputLayer; layerId++ ) { - StatefulTimer::setPrefix("layer" + toString(layerId) + " " ); - net->getLayer( layerId )->forward(); - StatefulTimer::setPrefix("" ); + dynamic_cast< InputLayer * >(net->getLayer(0))->in(inputData); + for(int layerId = 0; layerId <= config.outputLayer;
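// (Sketch, not part of the patch: this per-layer loop is what lets predict.cpp
// expose intermediate activations. Assuming a loaded `net`, a filled `inputData`
// buffer, and an illustrative `targetLayer` index, the same pattern is:)
//
//     dynamic_cast< InputLayer * >(net->getLayer(0))->in(inputData);
//     for(int layerId = 0; layerId <= targetLayer; layerId++) {
//         net->getLayer(layerId)->forward();   // each layer consumes its predecessor's output
//     }
//     float const *activations = net->getLayer(targetLayer)->getOutput();
//
// Layers past targetLayer are never run, which is the point of forwarding
// "each, one by one" rather than calling net->forward().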
layerId++) { + StatefulTimer::setPrefix("layer" + toString(layerId) + " "); + net->getLayer(layerId)->forward(); + StatefulTimer::setPrefix(""); } - if( !config.writeLabels ) { - if( config.outputFormat == "text" ) { - float const*output = net->getLayer( config.outputLayer )->getOutput(); - const int numFields = net->getLayer( config.outputLayer )->getOutputCubeSize(); - for( int i = 0; i < config.batchSize; i++ ) { - for( int f = 0; f < numFields; f++ ) { - if( f > 0 ) { + if(!config.writeLabels) { + if(config.outputFormat == "text") { + float const*output = net->getLayer(config.outputLayer)->getOutput(); + const int numFields = net->getLayer(config.outputLayer)->getOutputCubeSize(); + for(int i = 0; i < config.batchSize; i++) { + for(int f = 0; f < numFields; f++) { + if(f > 0) { *outFile << " "; } *outFile << output[ i * numFields + f ]; @@ -230,40 +232,40 @@ void go(Config config) { *outFile << "\n"; } } else { - outFile->write( reinterpret_cast(net->getOutput()), net->getOutputSize() * 4 * config.batchSize); + outFile->write(reinterpret_cast(net->getOutput()), net->getOutputNumElements() * 4 * config.batchSize); } } else { - SoftMaxLayer *softMaxLayer = dynamic_cast< SoftMaxLayer *>(net->getLayer( config.outputLayer ) ); - if( softMaxLayer == 0 ) { + SoftMaxLayer *softMaxLayer = dynamic_cast< SoftMaxLayer *>(net->getLayer(config.outputLayer) ); + if(softMaxLayer == 0) { cout << "must choose softmaxlayer, if want to output labels" << endl; return; } softMaxLayer->getLabels(labels); - if( config.outputFormat == "text" ) { - for( int i = 0; i < config.batchSize; i++ ) { + if(config.outputFormat == "text") { + for(int i = 0; i < config.batchSize; i++) { *outFile << labels[i] << "\n"; } } else { - outFile->write( reinterpret_cast< char * >( labels ), config.batchSize * 4l ); + outFile->write(reinterpret_cast< char * >(labels), config.batchSize * 4l); } outFile->flush(); } n += config.batchSize; - if( config.inputFile == "" ) { - cin.read( reinterpret_cast< char * >( inputData ), inputCubeSize * config.batchSize * 4l ); + if(config.inputFile == "") { + cin.read(reinterpret_cast< char * >(inputData), inputCubeSize * config.batchSize * 4l); more = !cin.eof(); } else { - if( n + config.batchSize < N ) { - GenericLoader::load( config.inputFile, inputData, 0, n, config.batchSize ); + if(n + config.batchSize < N) { + GenericLoader::load(config.inputFile.c_str(), inputData, 0, n, config.batchSize); } else { more = false; - if( n != N ) { + if(n != N) { cout << "breaking prematurely, since file is not an exact multiple of batchsize, and we didnt handle this yet" << endl; } } } } - if( config.outputFile != "" ) { + if(config.outputFile != "") { delete outFile; } @@ -274,7 +276,7 @@ void go(Config config) { delete cl; } -void printUsage( char *argv[], Config config ) { +void printUsage(char *argv[], Config config) { cout << "Usage: " << argv[0] << " [key]=[value] [[key]=[value]] ..." << endl; cout << endl; cout << "Possible key=value pairs:" << endl; @@ -285,14 +287,14 @@ void printUsage( char *argv[], Config config ) { name = option['name'] description = option['description'] if 'ispublicapi' in option and option['ispublicapi']: - cog.outl( 'cout << " ' + name.lower() + '=[' + description + '] (" << config.' + name + ' << ")" << endl;') + cog.outl('cout << " ' + name.lower() + '=[' + description + '] (" << config.' 
+ name + ' << ")" << endl;') cog.outl('cout << "" << endl; ') cog.outl('cout << "unstable, might change within major version:" << endl; ') for option in options: if 'ispublicapi' not in option or not option['ispublicapi']: name = option['name'] description = option['description'] - cog.outl( 'cout << " ' + name.lower() + '=[' + description + '] (" << config.' + name + ' << ")" << endl;') + cog.outl('cout << " ' + name.lower() + '=[' + description + '] (" << config.' + name + ' << ")" << endl;') *///]]] // generated using cog: cout << "public api, shouldnt change within major version:" << endl; @@ -309,14 +311,14 @@ void printUsage( char *argv[], Config config ) { // [[[end]]] } -int main( int argc, char *argv[] ) { +int main(int argc, char *argv[]) { Config config; - if( argc == 2 && ( string(argv[1]) == "--help" || string(argv[1]) == "--?" || string(argv[1]) == "-?" || string(argv[1]) == "-h" ) ) { - printUsage( argv, config ); + if(argc == 2 && (string(argv[1]) == "--help" || string(argv[1]) == "--?" || string(argv[1]) == "-?" || string(argv[1]) == "-h") ) { + printUsage(argv, config); } - for( int i = 1; i < argc; i++ ) { - vector splitkeyval = split( argv[i], "=" ); - if( splitkeyval.size() != 2 ) { + for(int i = 1; i < argc; i++) { + vector splitkeyval = split(argv[i], "="); + if(splitkeyval.size() != 2) { cout << "Usage: " << argv[0] << " [key]=[value] [[key]=[value]] ..." << endl; exit(1); } else { @@ -325,56 +327,56 @@ int main( int argc, char *argv[] ) { // cout << "key [" << key << "]" << endl; /* [[[cog cog.outl('// generated using cog:') - cog.outl('if( false ) {') + cog.outl('if(false) {') for option in options: name = option['name'] type = option['type'] - cog.outl( '} else if( key == "' + name.lower() + '" ) {') + cog.outl('} else if(key == "' + name.lower() + '") {') converter = ''; if type == 'int': converter = 'atoi'; elif type == 'float': converter = 'atof'; - cog.outl( ' config.' + name + ' = ' + converter + '(value);') + cog.outl(' config.' 
+ name + ' = ' + converter + '(value);') */// ]]] // generated using cog: - if( false ) { - } else if( key == "gpuindex" ) { + if(false) { + } else if(key == "gpuindex") { config.gpuIndex = atoi(value); - } else if( key == "weightsfile" ) { + } else if(key == "weightsfile") { config.weightsFile = (value); - } else if( key == "batchsize" ) { + } else if(key == "batchsize") { config.batchSize = atoi(value); - } else if( key == "inputfile" ) { + } else if(key == "inputfile") { config.inputFile = (value); - } else if( key == "outputfile" ) { + } else if(key == "outputfile") { config.outputFile = (value); - } else if( key == "outputlayer" ) { + } else if(key == "outputlayer") { config.outputLayer = atoi(value); - } else if( key == "writelabels" ) { + } else if(key == "writelabels") { config.writeLabels = atoi(value); - } else if( key == "outputformat" ) { + } else if(key == "outputformat") { config.outputFormat = (value); // [[[end]]] } else { cout << endl; cout << "Error: key '" << key << "' not recognised" << endl; cout << endl; - printUsage( argv, config ); + printUsage(argv, config); cout << endl; return -1; } } } - if( config.outputFormat != "text" && config.outputFormat != "binary" ) { + if(config.outputFormat != "text" && config.outputFormat != "binary") { cout << endl; cout << "outputformat must be 'text' or 'binary'" << endl; cout << endl; return -1; } try { - go( config ); - } catch( runtime_error e ) { + go(config); + } catch(runtime_error e) { cout << "Something went wrong: " << e.what() << endl; return -1; } diff --git a/src/main/train.cpp b/src/main/train.cpp index 4b98d8b5..855d57d8 100644 --- a/src/main/train.cpp +++ b/src/main/train.cpp @@ -10,13 +10,14 @@ #include "DeepCL.h" //#include "test/Sampler.h" // TODO: REMOVE THIS +#include "clblas/ClBlasInstance.h" using namespace std; /* [[[cog # These are used in the later cog sections in this file: # format: - # ( name, type, description, default, ispublicapi ) + # (name, type, description, default, ispublicapi) options = [ ('gpuIndex', 'int', 'gpu device index; default value is gpu if present, cpu otw.', -1, True), ('dataDir', 'string', 'directory to search for train and validate files', '../data/mnist', True), @@ -38,9 +39,9 @@ using namespace std; ('loadOnDemand', 'int', 'load data on demand [1|0]', 0, True), ('fileReadBatches', 'int', 'how many batches to read from file each time? 
(for loadondemand=1)', 50, True), ('normalizationExamples', 'int', 'number of examples to read to determine normalization parameters', 10000, True), - ('weightsInitializer', 'string', 'initializer for weights, choices: original, uniform (default: original)', 'original', True ), - ('initialWeights', 'float', 'for uniform initializer, weights will be initialized randomly within range -initialweights to +initialweights, divided by fanin, (default: 1.0f)', 1.0, False ), - ('trainer', 'string', 'which trainer, sgd, anneal, nesterov, adagrad, rmsprop, or adadelta (default: sgd)', 'sgd', True ), + ('weightsInitializer', 'string', 'initializer for weights, choices: original, uniform (default: original)', 'original', True), + ('initialWeights', 'float', 'for uniform initializer, weights will be initialized randomly within range -initialweights to +initialweights, divided by fanin, (default: 1.0f)', 1.0, False), + ('trainer', 'string', 'which trainer, sgd, anneal, nesterov, adagrad, rmsprop, or adadelta (default: sgd)', 'sgd', True), ('learningRate', 'float', 'learning rate, a float value, used by all trainers', 0.002, True), ('rho', 'float', 'rho decay, in adadelta trainer. 1 is no decay. 0 is full decay (default 0.9)', 0.9, False), ('momentum', 'float', 'momentum, used by sgd and nesterov trainers', 0.0, True), @@ -55,7 +56,7 @@ class Config { /* [[[cog cog.outl('// generated using cog:') for (name,type,description,default,_) in options: - cog.outl( type + ' ' + name + ';') + cog.outl(type + ' ' + name + ';') */// ]]] // generated using cog: int gpuIndex; @@ -102,7 +103,7 @@ class Config { if '.' not in defaultString: defaultString += '.0' defaultString += 'f' - cog.outl( name + ' = ' + defaultString + ';') + cog.outl(name + ' = ' + defaultString + ';') */// ]]] // generated using cog: gpuIndex = -1; @@ -167,40 +168,45 @@ void go(Config config) { int trainAllocateN = 0; int testAllocateN = 0; + if(config.dumpTimings) { + StatefulTimer::setEnabled(true); + } + cout << "Statefultimer enabled: " << StatefulTimer::enabled << endl; + // int totalLinearSize; - GenericLoaderv2 trainLoader( config.dataDir + "/" + config.trainFile ); + GenericLoaderv2 trainLoader(config.dataDir + "/" + config.trainFile); Ntrain = trainLoader.getN(); numPlanes = trainLoader.getPlanes(); imageSize = trainLoader.getImageSize(); - // GenericLoader::getDimensions( , &Ntrain, &numPlanes, &imageSize ); + // GenericLoader::getDimensions(, &Ntrain, &numPlanes, &imageSize); Ntrain = config.numTrain == -1 ? Ntrain : config.numTrain; // long allocateSize = (long)Ntrain * numPlanes * imageSize * imageSize; cout << "Ntrain " << Ntrain << " numPlanes " << numPlanes << " imageSize " << imageSize << endl; - if( config.loadOnDemand ) { + if(config.loadOnDemand) { trainAllocateN = config.batchSize; // can improve this later } else { trainAllocateN = Ntrain; } trainData = new float[ (long)trainAllocateN * numPlanes * imageSize * imageSize ]; trainLabels = new int[trainAllocateN]; - if( !config.loadOnDemand && Ntrain > 0 ) { - trainLoader.load( trainData, trainLabels, 0, Ntrain ); + if(!config.loadOnDemand && Ntrain > 0) { + trainLoader.load(trainData, trainLabels, 0, Ntrain); } - GenericLoaderv2 testLoader( config.dataDir + "/" + config.validateFile ); + GenericLoaderv2 testLoader(config.dataDir + "/" + config.validateFile); Ntest = testLoader.getN(); numPlanes = testLoader.getPlanes(); imageSize = testLoader.getImageSize(); Ntest = config.numTest == -1 ? 
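// (Illustrative arithmetic for the loadOnDemand trade-off implemented by the
// allocations around here, assuming MNIST-like dimensions; the numbers are
// examples, not measurements:)
//
//     // full load:       60000 * 1 * 28 * 28 * 4 bytes  ~ 188 MB resident
//     // loadondemand=1:    128 * 1 * 28 * 28 * 4 bytes  ~ 400 KB resident
//     long fullBytes   = (long)Ntrain * numPlanes * imageSize * imageSize * 4l;
//     long demandBytes = (long)config.batchSize * numPlanes * imageSize * imageSize * 4l;
//
// loadOnDemand trades that memory for repeated file reads, pulled in
// config.fileReadBatches batches at a time.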
Ntest : config.numTest; - if( config.loadOnDemand ) { + if(config.loadOnDemand) { testAllocateN = config.batchSize; // can improve this later } else { testAllocateN = Ntest; } testData = new float[ (long)testAllocateN * numPlanes * imageSize * imageSize ]; testLabels = new int[testAllocateN]; - if( !config.loadOnDemand && Ntest > 0 ) { - testLoader.load( testData, testLabels, 0, Ntest ); + if(!config.loadOnDemand && Ntest > 0) { + testLoader.load(testData, testLabels, 0, Ntest); } cout << "Ntest " << Ntest << " Ntest" << endl; @@ -210,16 +216,16 @@ void go(Config config) { float translate; float scale; int normalizationExamples = config.normalizationExamples > Ntrain ? Ntrain : config.normalizationExamples; - if( !config.loadOnDemand ) { - if( config.normalization == "stddev" ) { + if(!config.loadOnDemand) { + if(config.normalization == "stddev") { float mean, stdDev; - NormalizationHelper::getMeanAndStdDev( trainData, normalizationExamples * inputCubeSize, &mean, &stdDev ); + NormalizationHelper::getMeanAndStdDev(trainData, normalizationExamples * inputCubeSize, &mean, &stdDev); cout << " image stats mean " << mean << " stdDev " << stdDev << endl; translate = - mean; scale = 1.0f / stdDev / config.normalizationNumStds; - } else if( config.normalization == "maxmin" ) { + } else if(config.normalization == "maxmin") { float mean, stdDev; - NormalizationHelper::getMinMax( trainData, normalizationExamples * inputCubeSize, &mean, &stdDev ); + NormalizationHelper::getMinMax(trainData, normalizationExamples * inputCubeSize, &mean, &stdDev); translate = - mean; scale = 1.0f / stdDev; } else { @@ -227,18 +233,18 @@ void go(Config config) { return; } } else { - if( config.normalization == "stddev" ) { + if(config.normalization == "stddev") { float mean, stdDev; - NormalizeGetStdDev normalizeGetStdDev( trainData, trainLabels ); - BatchProcessv2::run( &trainLoader, 0, config.batchSize, normalizationExamples, inputCubeSize, &normalizeGetStdDev ); - normalizeGetStdDev.calcMeanStdDev( &mean, &stdDev ); + NormalizeGetStdDev normalizeGetStdDev(trainData, trainLabels); + BatchProcessv2::run(&trainLoader, 0, config.batchSize, normalizationExamples, inputCubeSize, &normalizeGetStdDev); + normalizeGetStdDev.calcMeanStdDev(&mean, &stdDev); cout << " image stats mean " << mean << " stdDev " << stdDev << endl; translate = - mean; scale = 1.0f / stdDev / config.normalizationNumStds; - } else if( config.normalization == "maxmin" ) { - NormalizeGetMinMax normalizeGetMinMax( trainData, trainLabels ); - BatchProcessv2::run( &trainLoader, 0, config.batchSize, normalizationExamples, inputCubeSize, &normalizeGetMinMax ); - normalizeGetMinMax.calcMinMaxTransform( &translate, &scale ); + } else if(config.normalization == "maxmin") { + NormalizeGetMinMax normalizeGetMinMax(trainData, trainLabels); + BatchProcessv2::run(&trainLoader, 0, config.batchSize, normalizationExamples, inputCubeSize, &normalizeGetMinMax); + normalizeGetMinMax.calcMinMaxTransform(&translate, &scale); } else { cout << "Error: Unknown normalization: " << config.normalization << endl; return; @@ -251,68 +257,69 @@ void go(Config config) { // const int batchSize = config.batchSize; EasyCL *cl = 0; - if( config.gpuIndex >= 0 ) { - cl = EasyCL::createForIndexedGpu( config.gpuIndex ); + if(config.gpuIndex >= 0) { + cl = EasyCL::createForIndexedGpu(config.gpuIndex); } else { cl = EasyCL::createForFirstGpuOtherwiseCpu(); } + ClBlasInstance blasInstance; NeuralNet *net; net = new NeuralNet(cl); WeightsInitializer *weightsInitializer = 0; - if( toLower( 
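// (Note on the dispatch split across this line: "original" keeps the library's
// default scheme, while "uniform", per the option help text above, draws each
// weight from -initialweights..+initialweights divided by fan-in. A rough
// sketch of that rule, with `fanIn` as an assumed name:)
//
//     float range = config.initialWeights / fanIn;            // e.g. 1.0f / 25 for a 5x5, 1-plane filter
//     float w = range * (2.0f * rand() / RAND_MAX - 1.0f);    // uniform in [-range, +range]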
config.weightsInitializer ) == "original" ) { + if(toLower(config.weightsInitializer) == "original") { weightsInitializer = new OriginalInitializer(); - } else if( toLower( config.weightsInitializer ) == "uniform" ) { - weightsInitializer = new UniformInitializer( config.initialWeights ); + } else if(toLower(config.weightsInitializer) == "uniform") { + weightsInitializer = new UniformInitializer(config.initialWeights); } else { cout << "Unknown weights initializer " << config.weightsInitializer << endl; return; } // net->inputMaker()->numPlanes(numPlanes)->imageSize(imageSize)->insert(); - net->addLayer( InputLayerMaker::instance()->numPlanes(numPlanes)->imageSize(imageSize) ); - net->addLayer( NormalizationLayerMaker::instance()->translate(translate)->scale(scale) ); - if( !NetdefToNet::createNetFromNetdef( net, config.netDef, weightsInitializer ) ) { + net->addLayer(InputLayerMaker::instance()->numPlanes(numPlanes)->imageSize(imageSize)); + net->addLayer(NormalizationLayerMaker::instance()->translate(translate)->scale(scale)); + if(!NetdefToNet::createNetFromNetdef(net, config.netDef, weightsInitializer)) { return; } // apply the trainer Trainer *trainer = 0; - if( toLower( config.trainer ) == "sgd" ) { - SGD *sgd = new SGD( cl ); - sgd->setLearningRate( config.learningRate ); - sgd->setMomentum( config.momentum ); - sgd->setWeightDecay( config.weightDecay ); + if(toLower(config.trainer) == "sgd") { + SGD *sgd = new SGD(cl); + sgd->setLearningRate(config.learningRate); + sgd->setMomentum(config.momentum); + sgd->setWeightDecay(config.weightDecay); trainer = sgd; - } else if( toLower( config.trainer ) == "anneal" ) { - Annealer *annealer = new Annealer( cl ); - annealer->setLearningRate( config.learningRate ); - annealer->setAnneal( config.anneal ); + } else if(toLower(config.trainer) == "anneal") { + Annealer *annealer = new Annealer(cl); + annealer->setLearningRate(config.learningRate); + annealer->setAnneal(config.anneal); trainer = annealer; - } else if( toLower( config.trainer ) == "nesterov" ) { - Nesterov *nesterov = new Nesterov( cl ); - nesterov->setLearningRate( config.learningRate ); - nesterov->setMomentum( config.momentum ); + } else if(toLower(config.trainer) == "nesterov") { + Nesterov *nesterov = new Nesterov(cl); + nesterov->setLearningRate(config.learningRate); + nesterov->setMomentum(config.momentum); trainer = nesterov; - } else if( toLower( config.trainer ) == "adagrad" ) { - Adagrad *adagrad = new Adagrad( cl ); - adagrad->setLearningRate( config.learningRate ); + } else if(toLower(config.trainer) == "adagrad") { + Adagrad *adagrad = new Adagrad(cl); + adagrad->setLearningRate(config.learningRate); trainer = adagrad; - } else if( toLower( config.trainer ) == "rmsprop" ) { - Rmsprop *rmsprop = new Rmsprop( cl ); - rmsprop->setLearningRate( config.learningRate ); + } else if(toLower(config.trainer) == "rmsprop") { + Rmsprop *rmsprop = new Rmsprop(cl); + rmsprop->setLearningRate(config.learningRate); trainer = rmsprop; - } else if( toLower( config.trainer ) == "adadelta" ) { - Adadelta *adadelta = new Adadelta( cl, config.rho ); + } else if(toLower(config.trainer) == "adadelta") { + Adadelta *adadelta = new Adadelta(cl, config.rho); trainer = adadelta; } else { cout << "trainer " << config.trainer << " unknown." 
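// (Summary of the trainer dispatch chain above, listing which config keys each
// branch actually reads; taken from the setter calls visible in this hunk:)
//
//     // sgd      : learningrate, momentum, weightdecay
//     // anneal   : learningrate, anneal
//     // nesterov : learningrate, momentum
//     // adagrad  : learningrate
//     // rmsprop  : learningrate
//     // adadelta : rho (constructor argument)
//
// A new trainer would slot in as one more else-if branch that constructs the
// trainer, applies its setters, and assigns `trainer` before the chain ends here.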
<< endl; return; } cout << "Using trainer " << trainer->asString() << endl; -// trainer->bindTo( net ); -// net->setTrainer( trainer ); - net->setBatchSize( config.batchSize ); +// trainer->bindTo(net); +// net->setTrainer(trainer); + net->setBatchSize(config.batchSize); net->print(); bool afterRestart = false; @@ -321,19 +328,19 @@ void go(Config config) { float restartAnnealedLearningRate = 0; int restartNumRight = 0; float restartLoss = 0; - if( config.loadWeights && config.weightsFile != "" ) { + if(config.loadWeights && config.weightsFile != "") { cout << "loadingweights" << endl; - afterRestart = WeightsPersister::loadWeights( config.weightsFile, config.getTrainingString(), net, &restartEpoch, &restartBatch, &restartAnnealedLearningRate, &restartNumRight, &restartLoss ); - if( !afterRestart && FileHelper::exists( config.weightsFile ) ) { + afterRestart = WeightsPersister::loadWeights(config.weightsFile, config.getTrainingString(), net, &restartEpoch, &restartBatch, &restartAnnealedLearningRate, &restartNumRight, &restartLoss); + if(!afterRestart && FileHelper::exists(config.weightsFile)) { // try old trainingstring - afterRestart = WeightsPersister::loadWeights( config.weightsFile, config.getOldTrainingString(), net, &restartEpoch, &restartBatch, &restartAnnealedLearningRate, &restartNumRight, &restartLoss ); + afterRestart = WeightsPersister::loadWeights(config.weightsFile, config.getOldTrainingString(), net, &restartEpoch, &restartBatch, &restartAnnealedLearningRate, &restartNumRight, &restartLoss); } - if( !afterRestart && FileHelper::exists( config.weightsFile ) ) { + if(!afterRestart && FileHelper::exists(config.weightsFile)) { cout << "Weights file " << config.weightsFile << " exists, but doesnt match training options provided." << endl; cout << "Continue loading anyway (might crash, or weights might be completely inappropriate)? (y/n)" << endl; string response; cin >> response; - if( response != "y" ) { + if(response != "y") { cout << "Please either check the training options, or choose a weights file that doesnt exist yet" << endl; return; } @@ -342,70 +349,73 @@ void go(Config config) { } timer.timeCheck("before learning start"); - if( config.dumpTimings ) { - StatefulTimer::dump( true ); + if(config.dumpTimings) { + StatefulTimer::dump(true); } StatefulTimer::timeCheck("START"); Trainable *trainable = net; MultiNet *multiNet = 0; - if( config.multiNet > 1 ) { - multiNet = new MultiNet( config.multiNet, net ); + if(config.multiNet > 1) { + multiNet = new MultiNet(config.multiNet, net); trainable = multiNet; } NetLearnerBase *netLearner = 0; - if( config.loadOnDemand ) { - netLearner = new NetLearnerOnDemandv2( trainer, trainable, + if(config.loadOnDemand) { + netLearner = new NetLearnerOnDemandv2(trainer, trainable, &trainLoader, Ntrain, &testLoader, Ntest, config.fileReadBatches, config.batchSize ); } else { - netLearner = new NetLearner( trainer, trainable, + netLearner = new NetLearner(trainer, trainable, Ntrain, trainData, trainLabels, Ntest, testData, testLabels, config.batchSize ); } -// netLearner->setTrainer( trainer ); +// netLearner->setTrainer(trainer); netLearner->reset(); - netLearner->setSchedule( config.numEpochs, afterRestart ? restartEpoch : 0 ); - if( afterRestart ) { - netLearner->setBatchState( restartBatch, restartNumRight, restartLoss ); + netLearner->setSchedule(config.numEpochs, afterRestart ? 
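// (Sketch of the resume handshake around this call: loadWeights() only succeeds
// when the stored training string matches, and hands back the counters that
// setSchedule() here and setBatchState() just below restore. Variable names are
// illustrative:)
//
//     int epoch = 0, batch = 0, numRight = 0;
//     float annealedLr = 0, loss = 0;
//     bool resumed = WeightsPersister::loadWeights(config.weightsFile,
//         config.getTrainingString(), net, &epoch, &batch, &annealedLr, &numRight, &loss);
//     // resumed == true: training picks up at `epoch`/`batch` instead of starting over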
restartEpoch : 0); + if(afterRestart) { + netLearner->setBatchState(restartBatch, restartNumRight, restartLoss); } - netLearner->setDumpTimings( config.dumpTimings ); -// netLearner->setLearningRate( config.learningRate, config.annealLearningRate ); + netLearner->setDumpTimings(config.dumpTimings); +// netLearner->setLearningRate(config.learningRate, config.annealLearningRate); Timer weightsWriteTimer; - while( !netLearner->isLearningDone() ) { + while(!netLearner->isLearningDone()) { // netLearnerBase->tickEpoch(); netLearner->tickBatch(); - if( netLearner->getEpochDone() ) { + if(netLearner->getEpochDone()) { // cout << "epoch done" << endl; - if( config.weightsFile != "" ) { + if(config.weightsFile != "") { cout << "record epoch=" << netLearner->getNextEpoch() << endl; - WeightsPersister::persistWeights( config.weightsFile, config.getTrainingString(), net, netLearner->getNextEpoch(), 0, 0, 0, 0 ); + WeightsPersister::persistWeights(config.weightsFile, config.getTrainingString(), net, netLearner->getNextEpoch(), 0, 0, 0, 0); weightsWriteTimer.lap(); } -// Sampler::sampleFloatWrapper( "conv weights", net->getLayer(6)->getWeightsWrapper() ); -// Sampler::sampleFloatWrapper( "fc weights", net->getLayer(11)->getWeightsWrapper() ); -// Sampler::sampleFloatWrapper( "conv bias", net->getLayer(6)->getBiasWrapper() ); -// Sampler::sampleFloatWrapper( "fc bias", net->getLayer(11)->getBiasWrapper() ); +// Sampler::sampleFloatWrapper("conv weights", net->getLayer(6)->getWeightsWrapper()); +// Sampler::sampleFloatWrapper("fc weights", net->getLayer(11)->getWeightsWrapper()); +// Sampler::sampleFloatWrapper("conv bias", net->getLayer(6)->getBiasWrapper()); +// Sampler::sampleFloatWrapper("fc bias", net->getLayer(11)->getBiasWrapper()); + if(config.dumpTimings) { + StatefulTimer::dump(true); + } } else { - if( config.writeWeightsInterval > 0 ) { + if(config.writeWeightsInterval > 0) { // cout << "batch done" << endl; float timeMinutes = weightsWriteTimer.interval() / 1000.0f / 60.0f; // cout << "timeMinutes " << timeMinutes << endl; - if( timeMinutes >= config.writeWeightsInterval ) { + if(timeMinutes >= config.writeWeightsInterval) { int nextEpoch = netLearner->getNextEpoch(); int nextBatch = netLearner->getNextBatch(); int batchNumRight = netLearner->getBatchNumRight(); float batchLoss = netLearner->getBatchLoss(); cout << "record epoch=" << nextEpoch << " batch=" << nextBatch << - "(" << ( (float)nextBatch * 100.0f / netLearner->getNTrain() * config.batchSize ) << "% of epoch)" << - " numRight=" << batchNumRight << "(" << (batchNumRight * 100.0f / nextBatch / config.batchSize ) << "%)" << + "(" << ((float)nextBatch * 100.0f / netLearner->getNTrain() * config.batchSize) << "% of epoch)" << + " numRight=" << batchNumRight << "(" << (batchNumRight * 100.0f / nextBatch / config.batchSize) << "%)" << " loss=" << batchLoss << endl; - WeightsPersister::persistWeights( config.weightsFile, config.getTrainingString(), net, - nextEpoch, nextBatch, 0, batchNumRight, batchLoss ); + WeightsPersister::persistWeights(config.weightsFile, config.getTrainingString(), net, + nextEpoch, nextBatch, 0, batchNumRight, batchLoss); weightsWriteTimer.lap(); } } @@ -415,26 +425,26 @@ void go(Config config) { delete weightsInitializer; delete trainer; delete netLearner; - if( multiNet != 0 ) { + if(multiNet != 0) { delete multiNet; } delete net; - if( trainData != 0 ) { + if(trainData != 0) { delete[] trainData; } - if( testData != 0 ) { + if(testData != 0) { delete[] testData; } - if( testLabels != 0 ) { + if(testLabels != 0) 
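// (Aside on the writeWeightsInterval logic above: Timer::interval() is evidently
// in milliseconds, so the conversion divides by 1000 for seconds, then 60 for minutes:)
//
//     float timeMinutes = weightsWriteTimer.interval() / 1000.0f / 60.0f;
//     // e.g. 90000 ms -> 90 s -> 1.5 min, so writeweightsinterval=1.0 triggers a snapshot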
{ delete[] testLabels; } - if( trainLabels != 0 ) { + if(trainLabels != 0) { delete[] trainLabels; } delete cl; } -void printUsage( char *argv[], Config config ) { +void printUsage(char *argv[], Config config) { cout << "Usage: " << argv[0] << " [key]=[value] [[key]=[value]] ..." << endl; cout << endl; cout << "Possible key=value pairs:" << endl; @@ -443,12 +453,12 @@ void printUsage( char *argv[], Config config ) { cog.outl('cout << "public api, shouldnt change within major version:" << endl;') for (name,type,description,_, is_public_api) in options: if is_public_api: - cog.outl( 'cout << " ' + name.lower() + '=[' + description + '] (" << config.' + name + ' << ")" << endl;') + cog.outl('cout << " ' + name.lower() + '=[' + description + '] (" << config.' + name + ' << ")" << endl;') cog.outl('cout << "" << endl; ') cog.outl('cout << "unstable, might change within major version:" << endl; ') for (name,type,description,_, is_public_api) in options: if not is_public_api: - cog.outl( 'cout << " ' + name.lower() + '=[' + description + '] (" << config.' + name + ' << ")" << endl;') + cog.outl('cout << " ' + name.lower() + '=[' + description + '] (" << config.' + name + ' << ")" << endl;') *///]]] // generated using cog: cout << "public api, shouldnt change within major version:" << endl; @@ -485,14 +495,14 @@ void printUsage( char *argv[], Config config ) { // [[[end]]] } -int main( int argc, char *argv[] ) { +int main(int argc, char *argv[]) { Config config; - if( argc == 2 && ( string(argv[1]) == "--help" || string(argv[1]) == "--?" || string(argv[1]) == "-?" || string(argv[1]) == "-h" ) ) { - printUsage( argv, config ); + if(argc == 2 && (string(argv[1]) == "--help" || string(argv[1]) == "--?" || string(argv[1]) == "-?" || string(argv[1]) == "-h")) { + printUsage(argv, config); } - for( int i = 1; i < argc; i++ ) { - vector splitkeyval = split( argv[i], "=" ); - if( splitkeyval.size() != 2 ) { + for(int i = 1; i < argc; i++) { + vector splitkeyval = split(argv[i], "="); + if(splitkeyval.size() != 2) { cout << "Usage: " << argv[0] << " [key]=[value] [[key]=[value]] ..." << endl; exit(1); } else { @@ -501,105 +511,105 @@ int main( int argc, char *argv[] ) { // cout << "key [" << key << "]" << endl; /* [[[cog cog.outl('// generated using cog:') - cog.outl('if( false ) {') + cog.outl('if(false) {') for (name,type,description,_,_) in options: - cog.outl( '} else if( key == "' + name.lower() + '" ) {') + cog.outl('} else if(key == "' + name.lower() + '") {') converter = ''; if type == 'int': converter = 'atoi'; elif type == 'float': converter = 'atof'; - cog.outl( ' config.' + name + ' = ' + converter + '(value);') + cog.outl(' config.' 
+ name + ' = ' + converter + '(value);') */// ]]] // generated using cog: - if( false ) { - } else if( key == "gpuindex" ) { + if(false) { + } else if(key == "gpuindex") { config.gpuIndex = atoi(value); - } else if( key == "datadir" ) { + } else if(key == "datadir") { config.dataDir = (value); - } else if( key == "trainfile" ) { + } else if(key == "trainfile") { config.trainFile = (value); - } else if( key == "dataset" ) { + } else if(key == "dataset") { config.dataset = (value); - } else if( key == "validatefile" ) { + } else if(key == "validatefile") { config.validateFile = (value); - } else if( key == "numtrain" ) { + } else if(key == "numtrain") { config.numTrain = atoi(value); - } else if( key == "numtest" ) { + } else if(key == "numtest") { config.numTest = atoi(value); - } else if( key == "batchsize" ) { + } else if(key == "batchsize") { config.batchSize = atoi(value); - } else if( key == "numepochs" ) { + } else if(key == "numepochs") { config.numEpochs = atoi(value); - } else if( key == "netdef" ) { + } else if(key == "netdef") { config.netDef = (value); - } else if( key == "loadweights" ) { + } else if(key == "loadweights") { config.loadWeights = atoi(value); - } else if( key == "weightsfile" ) { + } else if(key == "weightsfile") { config.weightsFile = (value); - } else if( key == "writeweightsinterval" ) { + } else if(key == "writeweightsinterval") { config.writeWeightsInterval = atof(value); - } else if( key == "normalization" ) { + } else if(key == "normalization") { config.normalization = (value); - } else if( key == "normalizationnumstds" ) { + } else if(key == "normalizationnumstds") { config.normalizationNumStds = atof(value); - } else if( key == "dumptimings" ) { + } else if(key == "dumptimings") { config.dumpTimings = atoi(value); - } else if( key == "multinet" ) { + } else if(key == "multinet") { config.multiNet = atoi(value); - } else if( key == "loadondemand" ) { + } else if(key == "loadondemand") { config.loadOnDemand = atoi(value); - } else if( key == "filereadbatches" ) { + } else if(key == "filereadbatches") { config.fileReadBatches = atoi(value); - } else if( key == "normalizationexamples" ) { + } else if(key == "normalizationexamples") { config.normalizationExamples = atoi(value); - } else if( key == "weightsinitializer" ) { + } else if(key == "weightsinitializer") { config.weightsInitializer = (value); - } else if( key == "initialweights" ) { + } else if(key == "initialweights") { config.initialWeights = atof(value); - } else if( key == "trainer" ) { + } else if(key == "trainer") { config.trainer = (value); - } else if( key == "learningrate" ) { + } else if(key == "learningrate") { config.learningRate = atof(value); - } else if( key == "rho" ) { + } else if(key == "rho") { config.rho = atof(value); - } else if( key == "momentum" ) { + } else if(key == "momentum") { config.momentum = atof(value); - } else if( key == "weightdecay" ) { + } else if(key == "weightdecay") { config.weightDecay = atof(value); - } else if( key == "anneal" ) { + } else if(key == "anneal") { config.anneal = atof(value); // [[[end]]] } else { cout << endl; cout << "Error: key '" << key << "' not recognised" << endl; cout << endl; - printUsage( argv, config ); + printUsage(argv, config); cout << endl; return -1; } } } - string dataset = toLower( config.dataset ); - if( dataset != "" ) { - if( dataset == "mnist" ) { + string dataset = toLower(config.dataset); + if(dataset != "") { + if(dataset == "mnist") { config.dataDir = "../data/mnist"; config.trainFile = "train-images-idx3-ubyte"; 
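// (Example invocation of this key=value parser, with illustrative values; the
// netdef string is just a plausible example, not something this patch mandates.
// The dataset presets here fill in datadir/trainfile/validatefile:)
//
//     ./deepcl_train dataset=mnist netdef=8c5z-relu-mp2-16c5z-relu-mp3-150n-tanh-10n \
//                    numepochs=20 learningrate=0.002 batchsize=128
//
// Unrecognised keys fall through to the error branch above and print the usage text.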
config.validateFile = "t10k-images-idx3-ubyte"; - } else if( dataset == "norb" ) { + } else if(dataset == "norb") { config.dataDir = "../data/norb"; config.trainFile = "training-shuffled-dat.mat"; config.validateFile = "testing-sampled-dat.mat"; - } else if( dataset == "cifar10" ) { + } else if(dataset == "cifar10") { config.dataDir = "../data/cifar10"; config.trainFile = "train-dat.mat"; config.validateFile = "test-dat.mat"; - } else if( dataset == "kgsgo" ) { + } else if(dataset == "kgsgo") { config.dataDir = "../data/kgsgo"; config.trainFile = "kgsgo-train10k-v2.dat"; config.validateFile = "kgsgo-test-v2.dat"; config.loadOnDemand = 1; - } else if( dataset == "kgsgoall" ) { + } else if(dataset == "kgsgoall") { config.dataDir = "../data/kgsgo"; config.trainFile = "kgsgo-trainall-v2.dat"; config.validateFile = "kgsgo-test-v2.dat"; @@ -614,8 +624,8 @@ int main( int argc, char *argv[] ) { cout << " validatefile: " << config.validateFile << ":" << endl; } try { - go( config ); - } catch( runtime_error e ) { + go(config); + } catch(runtime_error e) { cout << "Something went wrong: " << e.what() << endl; return -1; } diff --git a/src/net/MultiNet.cpp b/src/net/MultiNet.cpp index 92f62b0c..d299d57a 100644 --- a/src/net/MultiNet.cpp +++ b/src/net/MultiNet.cpp @@ -21,33 +21,33 @@ using namespace std; #define STATIC #define VIRTUAL -MultiNet::MultiNet( int numNets, NeuralNet *model ) : - output( 0 ), - batchSize( 0 ), - allocatedSize( 0 ), - proxyInputLayer( 0 ), - lossLayer( 0 ) { -// trainables.push_back( model ); - for( int i = 0; i < numNets; i++ ) { - trainables.push_back( model->clone() ); +MultiNet::MultiNet(int numNets, NeuralNet *model) : + output(0), + batchSize(0), + allocatedSize(0), + proxyInputLayer(0), + lossLayer(0) { +// trainables.push_back(model); + for(int i = 0; i < numNets; i++) { + trainables.push_back(model->clone()); } InputLayerMaker *inputLayerMaker = InputLayerMaker::instance(); - inputLayerMaker->numPlanes( trainables[0]->getOutputPlanes() ); - inputLayerMaker->imageSize( trainables[0]->getOutputImageSize() ); - proxyInputLayer = new InputLayer( inputLayerMaker ); - lossLayer = dynamic_cast< LossLayer *>( trainables[0]->cloneLossLayerMaker()->createLayer(proxyInputLayer) ); + inputLayerMaker->numPlanes(trainables[0]->getOutputPlanes()); + inputLayerMaker->imageSize(trainables[0]->getOutputSize()); + proxyInputLayer = new InputLayer(inputLayerMaker); + lossLayer = dynamic_cast< LossLayer *>(trainables[0]->cloneLossLayerMaker()->createLayer(proxyInputLayer)); } VIRTUAL MultiNet::~MultiNet() { - if( proxyInputLayer != 0 ) { + if(proxyInputLayer != 0) { delete proxyInputLayer; } - if( lossLayer != 0 ) { + if(lossLayer != 0) { delete lossLayer; } - if( output != 0 ) { + if(output != 0) { delete[] output; } - for( vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++ ) { + for(vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++) { delete (*it); } } @@ -57,125 +57,125 @@ VIRTUAL int MultiNet::getInputCubeSize() const { VIRTUAL int MultiNet::getOutputCubeSize() const { return trainables[0]->getOutputCubeSize(); } -VIRTUAL int MultiNet::getOutputSize() const { - return trainables[0]->getOutputSize(); +VIRTUAL int MultiNet::getOutputNumElements() const { + return trainables[0]->getOutputNumElements(); } VIRTUAL int MultiNet::getOutputPlanes() const { return trainables[0]->getOutputPlanes(); } -VIRTUAL int MultiNet::getOutputImageSize() const { - return trainables[0]->getOutputImageSize(); +VIRTUAL int 
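// (Summary of the rename that runs through this whole patch, visible in the
// paired old/new lines here; the old names conflated element counts with image
// dimensions:)
//
//     // old name              ->  new name                  meaning
//     // getOutputSize()       ->  getOutputNumElements()    total floats in the output
//     // getOutputImageSize()  ->  getOutputSize()           width/height of one output plane
//
// So after this patch getOutputSize() returns a side length, and code that wants
// a buffer length must call getOutputNumElements().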
MultiNet::getOutputSize() const { + return trainables[0]->getOutputSize(); } VIRTUAL LossLayerMaker *MultiNet::cloneLossLayerMaker() const { - throw runtime_error("need to implement MultiNet::cloneLossLayerMaker :-)" ); -// return dynamic_cast< LossLayerMaker *>( lossLayer->maker->clone( clonePreviousLayer ) ); + throw runtime_error("need to implement MultiNet::cloneLossLayerMaker :-)"); +// return dynamic_cast< LossLayerMaker *>(lossLayer->maker->clone(clonePreviousLayer) ); } -VIRTUAL float MultiNet::calcLoss(float const *expectedValues ) { - float loss = lossLayer->calcLoss( expectedValues ); +VIRTUAL float MultiNet::calcLoss(float const *expectedValues) { + float loss = lossLayer->calcLoss(expectedValues); return loss; // average across all, and then calc loss, right? // but .... we need a loss layer? // maybe just report average/total child loss, for now? // float totalLoss = 0.0f; -// const int outputSize = trainables[0]->getOutputSize(); +// const int outputNumElements = trainables[0]->getOutputNumElements(); // float *expectedValuesSum = new -// for( vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++ ) { -// //totalLoss += (*it)->calcLoss( expectedValues ); +// for(vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++) { +// //totalLoss += (*it)->calcLoss(expectedValues); // // } // return totalLoss; } -VIRTUAL float MultiNet::calcLossFromLabels(int const *labels ) { +VIRTUAL float MultiNet::calcLossFromLabels(int const *labels) { // average across all, and then calc loss, right? // but .... we need a loss layer? // maybe just report average/total child loss, for now? // float totalLoss = 0.0f; -// for( vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++ ) { -// totalLoss += (*it)->calcLossFromLabels( labels ); +// for(vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++) { +// totalLoss += (*it)->calcLossFromLabels(labels); // } // return totalLoss; - SoftMaxLayer *softMaxLayer = dynamic_cast< SoftMaxLayer *>( lossLayer ); - if( softMaxLayer == 0 ) { + SoftMaxLayer *softMaxLayer = dynamic_cast< SoftMaxLayer *>(lossLayer); + if(softMaxLayer == 0) { throw runtime_error("trying to call multinet::calcNumRight, but model networks dont have a SoftMax loss layer"); } - return softMaxLayer->calcLossFromLabels( labels ); + return softMaxLayer->calcLossFromLabels(labels); } -VIRTUAL void MultiNet::setBatchSize( int batchSize ) { +VIRTUAL void MultiNet::setBatchSize(int batchSize) { // do children first - for( vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++ ) { - (*it)->setBatchSize( batchSize ); + for(vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++) { + (*it)->setBatchSize(batchSize); } - proxyInputLayer->setBatchSize( batchSize ); - lossLayer->setBatchSize( batchSize ); + proxyInputLayer->setBatchSize(batchSize); + lossLayer->setBatchSize(batchSize); // now ourselves :-) - if( batchSize <= allocatedSize ) { + if(batchSize <= allocatedSize) { this->batchSize = batchSize; return; } - if( output != 0 ) { + if(output != 0) { delete[] output; } this->batchSize = batchSize; this->allocatedSize = batchSize; - output = new float[ trainables[0]->getOutputSize() ]; + output = new float[ trainables[0]->getOutputNumElements() ]; } -VIRTUAL void MultiNet::setTraining( bool training ) { - for( vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++ ) { - 
(*it)->setTraining( training ); +VIRTUAL void MultiNet::setTraining(bool training) { + for(vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++) { + (*it)->setTraining(training); } } -VIRTUAL int MultiNet::calcNumRight( int const *labels ) { +VIRTUAL int MultiNet::calcNumRight(int const *labels) { // cout << proxyInputLayer->asString() << endl; // cout << lossLayer->asString() << endl; -// proxyInputLayer->in( trainables[0]->getOutput() ); -// return dynamic_cast< SoftMaxLayer *>( lossLayer )->calcNumRight( labels ); -// return trainables[0]->calcNumRight( labels ); +// proxyInputLayer->in(trainables[0]->getOutput()); +// return dynamic_cast< SoftMaxLayer *>(lossLayer)->calcNumRight(labels); +// return trainables[0]->calcNumRight(labels); // call getOutput(), then work out the predictions, then compare with the labels // or, use a losslayer? // depends on the configuration of the softmax layer too, ie per-plane or not // SoftMaxMaker *maker = trainables[0]->cloneLossLayerMaker(); // SoftMaxLayer *clonedSoftMax = - SoftMaxLayer *softMaxLayer = dynamic_cast< SoftMaxLayer *>( lossLayer ); - if( softMaxLayer == 0 ) { + SoftMaxLayer *softMaxLayer = dynamic_cast< SoftMaxLayer *>(lossLayer); + if(softMaxLayer == 0) { throw runtime_error("trying to call multinet::calcNumRight, but model networks dont have a SoftMax loss layer"); } - return softMaxLayer->calcNumRight( labels ); + return softMaxLayer->calcNumRight(labels); } void MultiNet::forwardToOurselves() { // now forward to ourselves :-) // I suppose this could be done in GPU, but what if we want to split across mpi? - const int outputSize = trainables[0]->getOutputSize(); - memset( output, 0, sizeof( float ) * outputSize ); - for( vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++ ) { + const int outputNumElements = trainables[0]->getOutputNumElements(); + memset(output, 0, sizeof(float) * outputNumElements); + for(vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++) { float const*childOutput = (*it)->getOutput(); - for( int i = 0; i < outputSize; i++ ) { + for(int i = 0; i < outputNumElements; i++) { output[i] += childOutput[i]; } } const int numChildren = (int)trainables.size(); - for( int i = 0; i < outputSize; i++ ) { + for(int i = 0; i < outputNumElements; i++) { output[i] /= numChildren; } - memcpy( dynamic_cast< SoftMaxLayer * >( lossLayer )->output, output, sizeof(float) * lossLayer->getOutputSize() ); -// proxyInputLayer->in( output ); + memcpy(dynamic_cast< SoftMaxLayer * >(lossLayer)->output, output, sizeof(float) * lossLayer->getOutputNumElements()); +// proxyInputLayer->in(output); } -VIRTUAL void MultiNet::forward( float const*images) { - for( vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++ ) { - (*it)->forward( images ); +VIRTUAL void MultiNet::forward(float const*images) { + for(vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++) { + (*it)->forward(images); } forwardToOurselves(); } -VIRTUAL void MultiNet::backwardFromLabels( int const *labels) { +VIRTUAL void MultiNet::backwardFromLabels(int const *labels) { // dont think we need to backprop onto ourselves? Just directly onto children, right?
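// (What forwardToOurselves() above computes, as a formula: an elementwise mean
// over the K child nets, copied into the shared softmax loss layer:)
//
//     // output[i] = (1/K) * sum_k childOutput_k[i],  K = trainables.size()
//
// Backward never touches the averaged buffer: each child backprops from the
// labels independently, so MultiNet is a plain output-averaging ensemble.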
- for( vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++ ) { - (*it)->backwardFromLabels( labels ); + for(vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++) { + (*it)->backwardFromLabels(labels); } } -VIRTUAL void MultiNet::backward( float const *expectedOutput) { - for( vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++ ) { - (*it)->backward( expectedOutput ); +VIRTUAL void MultiNet::backward(float const *expectedOutput) { + for(vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++) { + (*it)->backward(expectedOutput); } } VIRTUAL float const *MultiNet::getOutput() const { @@ -184,13 +184,13 @@ VIRTUAL float const *MultiNet::getOutput() const { VIRTUAL int MultiNet::getNumNets() const { return trainables.size(); } -VIRTUAL Trainable *MultiNet::getNet( int idx ) { +VIRTUAL Trainable *MultiNet::getNet(int idx) { return trainables[ idx ]; } -//VIRTUAL void MultiNet::setTrainer( TrainerMaker *trainerMaker ) { -// for( vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++ ) { -// (*it)->setTrainer( trainerMaker ); +//VIRTUAL void MultiNet::setTrainer(TrainerMaker *trainerMaker) { +// for(vector< Trainable * >::iterator it = trainables.begin(); it != trainables.end(); it++) { +// (*it)->setTrainer(trainerMaker); // } //} diff --git a/src/net/MultiNet.h b/src/net/MultiNet.h index a1aac836..45b986b8 100644 --- a/src/net/MultiNet.h +++ b/src/net/MultiNet.h @@ -36,26 +36,26 @@ class DeepCL_EXPORT MultiNet : public Trainable { // cog_addheaders.add() // ]]] // generated, using cog: - MultiNet( int numNets, NeuralNet *model ); + MultiNet(int numNets, NeuralNet *model); VIRTUAL ~MultiNet(); VIRTUAL int getInputCubeSize() const; VIRTUAL int getOutputCubeSize() const; - VIRTUAL int getOutputSize() const; + VIRTUAL int getOutputNumElements() const; VIRTUAL int getOutputPlanes() const; - VIRTUAL int getOutputImageSize() const; + VIRTUAL int getOutputSize() const; VIRTUAL LossLayerMaker *cloneLossLayerMaker() const; - VIRTUAL float calcLoss(float const *expectedValues ); - VIRTUAL float calcLossFromLabels(int const *labels ); - VIRTUAL void setBatchSize( int batchSize ); - VIRTUAL void setTraining( bool training ); - VIRTUAL int calcNumRight( int const *labels ); + VIRTUAL float calcLoss(float const *expectedValues); + VIRTUAL float calcLossFromLabels(int const *labels); + VIRTUAL void setBatchSize(int batchSize); + VIRTUAL void setTraining(bool training); + VIRTUAL int calcNumRight(int const *labels); void forwardToOurselves(); - VIRTUAL void forward( float const*images); - VIRTUAL void backwardFromLabels( int const *labels); - VIRTUAL void backward( float const *expectedOutput); + VIRTUAL void forward(float const*images); + VIRTUAL void backwardFromLabels(int const *labels); + VIRTUAL void backward(float const *expectedOutput); VIRTUAL float const *getOutput() const; VIRTUAL int getNumNets() const; - VIRTUAL Trainable *getNet( int idx ); + VIRTUAL Trainable *getNet(int idx); // [[[end]]] }; diff --git a/src/net/NeuralNet.cpp b/src/net/NeuralNet.cpp index 3a49eeb2..699172b0 100644 --- a/src/net/NeuralNet.cpp +++ b/src/net/NeuralNet.cpp @@ -25,6 +25,7 @@ #include "trainers/Trainer.h" #include "trainers/TrainerMaker.h" #include "weights/WeightsPersister.h" +#include "CppRuntimeBoundary.h" #include "net/NeuralNet.h" @@ -37,56 +38,68 @@ using namespace std; #undef STATIC #define STATIC -NeuralNet::NeuralNet( EasyCL *cl ) : - cl( cl ) { 
+NeuralNet::NeuralNet(EasyCL *cl) : + cl(cl) { trainer = 0; isTraining = true; } +STATIC NeuralNet *NeuralNet::instance(EasyCL *cl) { + return new NeuralNet(cl); +} +STATIC NeuralNet *NeuralNet::instance(EasyCL *cl, int numPlanes, int imageSize) { + return new NeuralNet(cl, numPlanes, imageSize); +} +STATIC NeuralNet *NeuralNet::instance3(EasyCL *cl, int numPlanes, int imageSize) { + return new NeuralNet(cl, numPlanes, imageSize); +} +void NeuralNet::deleteMe() { + delete this; +} /// Constructor -PUBLICAPI NeuralNet::NeuralNet( EasyCL *cl, int numPlanes, int imageSize ) : - cl( cl ) { - addLayer( InputLayerMaker::instance()->numPlanes( numPlanes )->imageSize( imageSize ) ); +NeuralNet::NeuralNet(EasyCL *cl, int numPlanes, int imageSize) : + cl(cl) { + addLayer(InputLayerMaker::instance()->numPlanes(numPlanes)->imageSize(imageSize) ); trainer = 0; } NeuralNet::~NeuralNet() { - for( int i = 0; i < (int)layers.size(); i++ ) { + for(int i = 0; i < (int)layers.size(); i++) { delete layers[i]; } } -STATIC NeuralNetMould *NeuralNet::maker( EasyCL *cl ) { - return new NeuralNetMould( cl ); +STATIC NeuralNetMould *NeuralNet::maker(EasyCL *cl) { + return new NeuralNetMould(cl); } NeuralNet *NeuralNet::clone() { - NeuralNet *copy = new NeuralNet( cl ); - for( vector::iterator it = layers.begin(); it != layers.end(); it++ ) { + NeuralNet *copy = new NeuralNet(cl); + for(vector::iterator it = layers.begin(); it != layers.end(); it++) { LayerMaker2 *maker = (*it)->maker; LayerMaker2 *makerCopy = maker->clone(); - copy->addLayer( makerCopy ); + copy->addLayer(makerCopy); } copy->print(); - cout << "outputimagesize: " << copy->getOutputImageSize() << endl; + cout << "outputimagesize: " << copy->getOutputSize() << endl; return copy; } EasyCL *NeuralNet::getCl() { return cl; } /// Add a network layer, using a LayerMaker2 object -PUBLICAPI void NeuralNet::addLayer( LayerMaker2 *maker ) { +PUBLICAPI void NeuralNet::addLayer(LayerMaker2 *maker) { // cout << "neuralnet::insert numplanes " << inputLayerMaker._numPlanes << " imageSize " << inputLayerMaker._imageSize << endl; - maker->setCl( cl ); - Layer *layer = maker->createLayer( getLastLayer() ); - layers.push_back( layer ); + maker->setCl(cl); + Layer *layer = maker->createLayer(getLastLayer()); + layers.push_back(layer); } -PUBLICAPI void NeuralNet::initWeights( int layerIndex, float *weights, float *bias ) { - initWeights( layerIndex, weights ); - initBias( layerIndex, bias ); +PUBLICAPI void NeuralNet::initWeights(int layerIndex, float *weights, float *bias) { + initWeights(layerIndex, weights); + initBias(layerIndex, bias); } -PUBLICAPI void NeuralNet::initWeights( int layerIndex, float *weights ) { - layers[layerIndex]->initWeights( weights ); +PUBLICAPI void NeuralNet::initWeights(int layerIndex, float *weights) { + layers[layerIndex]->initWeights(weights); } -PUBLICAPI void NeuralNet::initBias( int layerIndex, float *weights ) { - layers[layerIndex]->initBias( weights ); +PUBLICAPI void NeuralNet::initBias(int layerIndex, float *weights) { + layers[layerIndex]->initBias(weights); } /// \brief calculate the loss, based on the passed in expectedValues array /// @@ -95,36 +108,36 @@ PUBLICAPI void NeuralNet::initBias( int layerIndex, float *weights ) { /// Calculate the loss, based on the passed in expectedValues array /// which should be the same size as the output of the final layer /// of the network -PUBLICAPI float NeuralNet::calcLoss(float const *expectedValues ) { - return dynamic_cast(getLastLayer())->calcLoss( expectedValues ); +PUBLICAPI float 
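// (Why instance()/deleteMe() were added above, and asNewCharStar() further down:
// they keep allocation and deallocation inside the DeepCL binary, which matters
// when the caller links a different C++ runtime, e.g. across a DLL boundary; the
// new CppRuntimeBoundary.h include supports this. A usage sketch, with names as
// they appear in this patch:)
//
//     NeuralNet *net = NeuralNet::instance(cl);   // allocated library-side
//     // ... build and run the net ...
//     const char *desc = net->asNewCharStar();    // string also allocated library-side
//     // ... log desc ...
//     deepcl_deleteCharStar(desc);                // freed by the runtime that allocated it
//     net->deleteMe();                            // instead of `delete net` in the host app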
NeuralNet::calcLoss(float const *expectedValues) { + return dynamic_cast(getLastLayer())->calcLoss(expectedValues); } -PUBLICAPI float NeuralNet::calcLossFromLabels(int const *labels ) { - return dynamic_cast(getLastLayer())->calcLossFromLabels( labels ); +PUBLICAPI float NeuralNet::calcLossFromLabels(int const *labels) { + return dynamic_cast(getLastLayer())->calcLossFromLabels(labels); } -float NeuralNet::calcLoss( OutputData *outputData ) { - return dynamic_cast(getLastLayer())->calcLoss( outputData ); +float NeuralNet::calcLoss(OutputData *outputData) { + return dynamic_cast(getLastLayer())->calcLoss(outputData); } -int NeuralNet::calcNumRight( OutputData *outputData ) { - return dynamic_cast(getLastLayer())->calcNumRight( outputData ); +int NeuralNet::calcNumRight(OutputData *outputData) { + return dynamic_cast(getLastLayer())->calcNumRight(outputData); } -EpochMaker *NeuralNet::epochMaker( Trainer *trainer ) { +EpochMaker *NeuralNet::epochMaker(Trainer *trainer) { return new EpochMaker(this, trainer); } VIRTUAL LossLayerMaker *NeuralNet::cloneLossLayerMaker() const { - LossLayer const *lossLayer = dynamic_cast< LossLayer const*>( getLastLayer() ); - if( lossLayer == 0 ) { + LossLayer const *lossLayer = dynamic_cast< LossLayer const*>(getLastLayer()); + if(lossLayer == 0) { throw runtime_error("error: last layer must be a losslayer"); } - return dynamic_cast< LossLayerMaker *>( lossLayer->maker->clone() ); -// throw runtime_error("need to implement neuralnet::clonelosslayermaker :-)" ); -// LossLayer const*lossLayer = dynamic_cast< LossLayer const*>( getLastLayer() ); -// return dynamic_cast< LossLayerMaker *>( lossLayer->maker->clone( clonePreviousLayer ) ) ; + return dynamic_cast< LossLayerMaker *>(lossLayer->maker->clone()); +// throw runtime_error("need to implement neuralnet::clonelosslayermaker :-)"); +// LossLayer const*lossLayer = dynamic_cast< LossLayer const*>(getLastLayer()); +// return dynamic_cast< LossLayerMaker *>(lossLayer->maker->clone(clonePreviousLayer) ) ; } PUBLICAPI InputLayer *NeuralNet::getFirstLayer() { - return dynamic_cast( layers[0] ); + return dynamic_cast(layers[0]); } PUBLICAPI Layer *NeuralNet::getLastLayer() { - if( layers.size() == 0 ) { + if(layers.size() == 0) { return 0; } return layers[layers.size() - 1]; @@ -132,17 +145,17 @@ PUBLICAPI Layer *NeuralNet::getLastLayer() { PUBLICAPI int NeuralNet::getNumLayers() const { return (int)layers.size(); } -PUBLICAPI Layer *NeuralNet::getLayer( int index ) { - if( layers.size() == 0 ) { +PUBLICAPI Layer *NeuralNet::getLayer(int index) { + if(layers.size() == 0) { return 0; } - if( index < 0 || index > (int)layers.size() - 1 ) { + if(index < 0 || index > (int)layers.size() - 1) { return 0; } return layers[index]; } PUBLICAPI Layer const*NeuralNet::getLastLayer() const { - if( layers.size() == 0 ) { + if(layers.size() == 0) { return 0; } return layers[layers.size() - 1]; @@ -150,81 +163,81 @@ PUBLICAPI Layer const*NeuralNet::getLastLayer() const { PUBLICAPI VIRTUAL int NeuralNet::getOutputPlanes() const { return getLastLayer()->getOutputPlanes(); } -PUBLICAPI VIRTUAL int NeuralNet::getOutputImageSize() const { - return getLastLayer()->getOutputImageSize(); +PUBLICAPI VIRTUAL int NeuralNet::getOutputSize() const { + return getLastLayer()->getOutputSize(); } -PUBLICAPI void NeuralNet::setBatchSize( int batchSize ) { - for( std::vector::iterator it = layers.begin(); it != layers.end(); it++ ) { - (*it)->setBatchSize( batchSize ); +PUBLICAPI void NeuralNet::setBatchSize(int batchSize) { + 
for(std::vector::iterator it = layers.begin(); it != layers.end(); it++) { + (*it)->setBatchSize(batchSize); } } -PUBLICAPI void NeuralNet::setTraining( bool training ) { - for( std::vector::iterator it = layers.begin(); it != layers.end(); it++ ) { - (*it)->setTraining( training ); +PUBLICAPI void NeuralNet::setTraining(bool training) { + for(std::vector::iterator it = layers.begin(); it != layers.end(); it++) { + (*it)->setTraining(training); } } -PUBLICAPI int NeuralNet::calcNumRight( int const *labels ) { +PUBLICAPI int NeuralNet::calcNumRight(int const *labels) { IAcceptsLabels *acceptsLabels = dynamic_cast(getLastLayer()); - if( acceptsLabels == 0 ) { + if(acceptsLabels == 0) { THROW("You need to add a IAcceptsLabels as the last layer, in order to use calcNumRight"); } - return acceptsLabels->calcNumRight( labels ); + return acceptsLabels->calcNumRight(labels); } -PUBLICAPI void NeuralNet::forward( float const*images) { +PUBLICAPI void NeuralNet::forward(float const*images) { // forward... - dynamic_cast(layers[0])->in( images ); - for( int layerId = 0; layerId < (int)layers.size(); layerId++ ) { - StatefulTimer::setPrefix("layer" + toString(layerId) + " " ); + dynamic_cast(layers[0])->in(images); + for(int layerId = 0; layerId < (int)layers.size(); layerId++) { + StatefulTimer::setPrefix("layer" + toString(layerId) + " "); layers[layerId]->forward(); - StatefulTimer::setPrefix("" ); + StatefulTimer::setPrefix(""); } } /// \brief note: this does no learning, just calculates the gradients -PUBLICAPI void NeuralNet::backwardFromLabels( int const *labels) { +PUBLICAPI void NeuralNet::backwardFromLabels(int const *labels) { IAcceptsLabels *acceptsLabels = dynamic_cast(getLastLayer()); - if( acceptsLabels == 0 ) { + if(acceptsLabels == 0) { throw std::runtime_error("Must add a child of IAcceptsLabels as last layer, to use backwardFromLabels"); } - acceptsLabels->calcGradInputFromLabels( labels ); - for( int layerIdx = (int)layers.size() - 2; layerIdx >= 1; layerIdx-- ) { // no point in propagating to input layer :-P - StatefulTimer::setPrefix("layer" + toString(layerIdx) + " " ); + acceptsLabels->calcGradInputFromLabels(labels); + for(int layerIdx = (int)layers.size() - 2; layerIdx >= 1; layerIdx--) { // no point in propagating to input layer :-P + StatefulTimer::setPrefix("layer" + toString(layerIdx) + " "); Layer *layer = layers[layerIdx]; - if( layer->needsBackProp() ) { + if(layer->needsBackProp()) { layer->backward(); } - StatefulTimer::setPrefix("" ); + StatefulTimer::setPrefix(""); } } /// \brief note: this does no learning, just calculates the gradients -PUBLICAPI void NeuralNet::backward( float const *expectedOutput) { +PUBLICAPI void NeuralNet::backward(float const *expectedOutput) { LossLayer *lossLayer = dynamic_cast(getLastLayer()); - if( lossLayer == 0 ) { + if(lossLayer == 0) { throw std::runtime_error("Must add a LossLayer as last layer of net"); } - lossLayer->calcGradInput( expectedOutput ); - for( int layerIdx = (int)layers.size() - 2; layerIdx >= 1; layerIdx-- ) { // no point in propagating to input layer - StatefulTimer::setPrefix("layer" + toString(layerIdx) + " " ); + lossLayer->calcGradInput(expectedOutput); + for(int layerIdx = (int)layers.size() - 2; layerIdx >= 1; layerIdx--) { // no point in propagating to input layer + StatefulTimer::setPrefix("layer" + toString(layerIdx) + " "); layers[layerIdx]->backward(); - StatefulTimer::setPrefix("" ); + StatefulTimer::setPrefix(""); } } -void NeuralNet::backward( OutputData *outputData ) { +void 
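// (Note the asymmetry between the gradient paths here: backwardFromLabels()
// above tests needsBackProp() per layer and keeps walking, while the OutputData
// overload that begins just below breaks out at the first layer that does not
// need backprop, assuming no earlier layer can need it either:)
//
//     for(int layerIdx = numLayers - 2; layerIdx >= 1; layerIdx--) {
//         if(!getLayer(layerIdx)->needsBackProp()) break;   // stop early
//         getLayer(layerIdx)->backward();
//     }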
-void NeuralNet::backward( OutputData *outputData ) { +void NeuralNet::backward(OutputData *outputData) { LossLayer *lossLayer = dynamic_cast<LossLayer *>(getLastLayer()); - lossLayer->calcGradInput( outputData ); - for( int layerIdx = (int)layers.size() - 2; layerIdx >= 1; layerIdx-- ) { // no point in propagating to input layer - Layer *layer = getLayer( layerIdx ); - if( !layer->needsBackProp() ) { + lossLayer->calcGradInput(outputData); + for(int layerIdx = (int)layers.size() - 2; layerIdx >= 1; layerIdx--) { // no point in propagating to input layer + Layer *layer = getLayer(layerIdx); + if(!layer->needsBackProp()) { break; } - StatefulTimer::setPrefix("layer" + toString(layerIdx) + " " ); + StatefulTimer::setPrefix("layer" + toString(layerIdx) + " "); layer->backward(); - StatefulTimer::setPrefix("" ); + StatefulTimer::setPrefix(""); } } PUBLICAPI int NeuralNet::getNumLayers() { return (int)layers.size(); } -PUBLICAPI float const *NeuralNet::getOutput( int layer ) const { +PUBLICAPI float const *NeuralNet::getOutput(int layer) const { return layers[layer]->getOutput(); } PUBLICAPI int NeuralNet::getInputCubeSize() const { @@ -234,23 +247,23 @@ PUBLICAPI int NeuralNet::getOutputCubeSize() const { return layers[ layers.size() - 1 ]->getOutputCubeSize(); } PUBLICAPI float const *NeuralNet::getOutput() const { - return getOutput( (int)layers.size() - 1 ); + return getOutput((int)layers.size() - 1); } -PUBLICAPI VIRTUAL int NeuralNet::getOutputSize() const { - return getLastLayer()->getOutputSize(); +PUBLICAPI VIRTUAL int NeuralNet::getOutputNumElements() const { + return getLastLayer()->getOutputNumElements(); } void NeuralNet::print() { cout << this->asString(); printParamStats(); // int i = 0; -// for( std::vector< Layer* >::iterator it = layers.begin(); it != layers.end(); it++ ) { +// for(std::vector< Layer* >::iterator it = layers.begin(); it != layers.end(); it++) { // std::cout << "layer " << i << ":" << (*it)->asString() << endl; // i++; // } } void NeuralNet::printWeights() { int i = 0; - for( std::vector< Layer* >::iterator it = layers.begin(); it != layers.end(); it++ ) { + for(std::vector< Layer* >::iterator it = layers.begin(); it != layers.end(); it++) { std::cout << "layer " << i << ":" << std::endl; (*it)->printWeights(); i++; @@ -258,13 +271,13 @@ void NeuralNet::printWeights() { } void NeuralNet::printOutput() { int i = 0; - for( std::vector< Layer* >::iterator it = layers.begin(); it != layers.end(); it++ ) { + for(std::vector< Layer* >::iterator it = layers.begin(); it != layers.end(); it++) { std::cout << "layer " << i << ":" << std::endl; (*it)->printOutput(); i++; } } -VIRTUAL void NeuralNet::setTrainer( Trainer *trainer ) { +VIRTUAL void NeuralNet::setTrainer(Trainer *trainer) { this->trainer = trainer; } void NeuralNet::printParamStats() { @@ -272,24 +285,24 @@ void NeuralNet::printParamStats() { int skip = 0; int precision = (int)std::cout.precision(); // cout << "precision: " << precision << endl; - for( std::vector< Layer* >::iterator it = layers.begin(); it != layers.end(); it++ ) { - int size = (*it)->getPersistSize( WeightsPersister::latestVersion ); + for(std::vector< Layer* >::iterator it = layers.begin(); it != layers.end(); it++) { + int size = (*it)->getPersistSize(WeightsPersister::latestVersion); sum += size; - if( !
size){ skip++; } } std::cout << "Parameters overview: (skipping " << skip << " layers with 0 params)" << std::endl; int i = 0; - for( std::vector< Layer* >::iterator it = layers.begin(); it != layers.end(); it++, i++ ) { - int size = (*it)->getPersistSize( WeightsPersister::latestVersion ); - if( size ) { + for(std::vector< Layer* >::iterator it = layers.begin(); it != layers.end(); it++, i++) { + int size = (*it)->getPersistSize(WeightsPersister::latestVersion); + if(size) { std::cout << "layer " << i << ": params=" << size << "\t"; std::cout << std::fixed << std::setprecision(1) << ((float) 100 * size)/sum << "%"; std::cout << std::endl; } } - if( i ){ + if(i){ std::cout << "TOTAL : params=" << sum << std::endl; } // reset the cout properties, so that I dont spend 2 hours figuring out why my weights @@ -300,10 +313,19 @@ void NeuralNet::printParamStats() { PUBLICAPI std::string NeuralNet::asString() { std::string result = ""; int i = 0; - for( std::vector< Layer* >::iterator it = layers.begin(); it != layers.end(); it++ ) { - result += "layer " + toString( i ) + ":" + (*it)->asString() + "\n"; + for(std::vector< Layer* >::iterator it = layers.begin(); it != layers.end(); it++) { + result += "layer " + toString(i) + ":" + (*it)->asString() + "\n"; i++; } return result; } +PUBLICAPI const char * NeuralNet::asNewCharStar() { // call deepcl_deleteCharStar to delete this + std::string result = ""; + int i = 0; + for(std::vector< Layer* >::iterator it = layers.begin(); it != layers.end(); it++) { + result += "layer " + toString(i) + ":" + (*it)->asString() + "\n"; + i++; + } + return deepcl_stringToCharStar(result); +} diff --git a/src/net/NeuralNet.h b/src/net/NeuralNet.h index 82e0f85f..c3074f26 100644 --- a/src/net/NeuralNet.h +++ b/src/net/NeuralNet.h @@ -35,11 +35,11 @@ PUBLICAPI class DeepCL_EXPORT NeuralNet : public Trainable { protected: #ifdef _WIN32 -#pragma warning( disable: 4251 ) +#pragma warning(disable: 4251) #endif std::vector< Layer *> layers; #ifdef _WIN32 -#pragma warning( default: 4251 ) +#pragma warning(default: 4251) #endif EasyCL *cl; // NOT owned by us, dont delete Trainer *trainer; // NOT owned by us, dont delete @@ -52,48 +52,53 @@ class DeepCL_EXPORT NeuralNet : public Trainable { // cog_addheaders.add() // ]]] // generated, using cog: - NeuralNet( EasyCL *cl ); - PUBLICAPI NeuralNet( EasyCL *cl, int numPlanes, int imageSize ); + NeuralNet(EasyCL *cl); + STATIC NeuralNet *instance(EasyCL *cl); + STATIC NeuralNet *instance(EasyCL *cl, int numPlanes, int imageSize); + STATIC NeuralNet *instance3(EasyCL *cl, int numPlanes, int imageSize); + void deleteMe(); + NeuralNet(EasyCL *cl, int numPlanes, int imageSize); ~NeuralNet(); - STATIC NeuralNetMould *maker( EasyCL *cl ); + STATIC NeuralNetMould *maker(EasyCL *cl); NeuralNet *clone(); EasyCL *getCl(); - PUBLICAPI void addLayer( LayerMaker2 *maker ); - PUBLICAPI void initWeights( int layerIndex, float *weights, float *bias ); - PUBLICAPI void initWeights( int layerIndex, float *weights ); - PUBLICAPI void initBias( int layerIndex, float *weights ); - PUBLICAPI float calcLoss(float const *expectedValues ); - PUBLICAPI float calcLossFromLabels(int const *labels ); - float calcLoss( OutputData *outputData ); - int calcNumRight( OutputData *outputData ); - EpochMaker *epochMaker( Trainer *trainer ); + PUBLICAPI void addLayer(LayerMaker2 *maker); + PUBLICAPI void initWeights(int layerIndex, float *weights, float *bias); + PUBLICAPI void initWeights(int layerIndex, float *weights); + PUBLICAPI void initBias(int layerIndex, float 
*weights); + PUBLICAPI float calcLoss(float const *expectedValues); + PUBLICAPI float calcLossFromLabels(int const *labels); + float calcLoss(OutputData *outputData); + int calcNumRight(OutputData *outputData); + EpochMaker *epochMaker(Trainer *trainer); VIRTUAL LossLayerMaker *cloneLossLayerMaker() const; PUBLICAPI InputLayer *getFirstLayer(); PUBLICAPI Layer *getLastLayer(); PUBLICAPI int getNumLayers() const; - PUBLICAPI Layer *getLayer( int index ); + PUBLICAPI Layer *getLayer(int index); PUBLICAPI Layer const*getLastLayer() const; PUBLICAPI VIRTUAL int getOutputPlanes() const; - PUBLICAPI VIRTUAL int getOutputImageSize() const; - PUBLICAPI void setBatchSize( int batchSize ); - PUBLICAPI void setTraining( bool training ); - PUBLICAPI int calcNumRight( int const *labels ); - PUBLICAPI void forward( float const*images); - PUBLICAPI void backwardFromLabels( int const *labels); - PUBLICAPI void backward( float const *expectedOutput); - void backward( OutputData *outputData ); + PUBLICAPI VIRTUAL int getOutputSize() const; + PUBLICAPI void setBatchSize(int batchSize); + PUBLICAPI void setTraining(bool training); + PUBLICAPI int calcNumRight(int const *labels); + PUBLICAPI void forward(float const*images); + PUBLICAPI void backwardFromLabels(int const *labels); + PUBLICAPI void backward(float const *expectedOutput); + void backward(OutputData *outputData); PUBLICAPI int getNumLayers(); - PUBLICAPI float const *getOutput( int layer ) const; + PUBLICAPI float const *getOutput(int layer) const; PUBLICAPI int getInputCubeSize() const; PUBLICAPI int getOutputCubeSize() const; PUBLICAPI float const *getOutput() const; - PUBLICAPI VIRTUAL int getOutputSize() const; + PUBLICAPI VIRTUAL int getOutputNumElements() const; void print(); void printWeights(); void printOutput(); - VIRTUAL void setTrainer( Trainer *trainer ); + VIRTUAL void setTrainer(Trainer *trainer); void printParamStats(); PUBLICAPI std::string asString(); + PUBLICAPI const char * asNewCharStar(); // call deepcl_deleteCharStar to delete this // [[[end]]] }; diff --git a/src/net/NeuralNetMould.cpp b/src/net/NeuralNetMould.cpp index 662adf2d..13d2457f 100644 --- a/src/net/NeuralNetMould.cpp +++ b/src/net/NeuralNetMould.cpp @@ -17,18 +17,18 @@ using namespace std; NeuralNet *NeuralNetMould::instance() { // cout << "neuralnetmould::instance imagesize " << _imageSize << " numPlanes " << _numPlanes << endl; - if( _numPlanes != 0 || _imageSize != 0 ) { - if( _numPlanes == 0 ) { + if(_numPlanes != 0 || _imageSize != 0) { + if(_numPlanes == 0) { throw runtime_error("Must provide ->planes(planes)"); } - if( _imageSize == 0 ) { + if(_imageSize == 0) { throw runtime_error("Must provide ->imageSize(imageSize)"); } - NeuralNet *net = new NeuralNet( cl, _numPlanes, _imageSize ); + NeuralNet *net = new NeuralNet(cl, _numPlanes, _imageSize); delete this; return net; } else { - NeuralNet *net = new NeuralNet( cl ); + NeuralNet *net = new NeuralNet(cl); delete this; return net; } diff --git a/src/net/NeuralNetMould.h b/src/net/NeuralNetMould.h index b060cc57..37be32e9 100644 --- a/src/net/NeuralNetMould.h +++ b/src/net/NeuralNetMould.h @@ -16,20 +16,20 @@ class DeepCL_EXPORT NeuralNetMould { EasyCL *cl; // NOT delete int _numPlanes; int _imageSize; - NeuralNetMould( EasyCL *cl ) : - cl( cl ) { + NeuralNetMould(EasyCL *cl) : + cl(cl) { _numPlanes = 0; _imageSize = 0; } - NeuralNetMould( int planes, int imageSize ){ + NeuralNetMould(int planes, int imageSize){ this->_numPlanes = planes; this->_imageSize = imageSize; } - NeuralNetMould *planes(int planes ) 
{ + NeuralNetMould *planes(int planes) { this->_numPlanes = planes; return this; } - NeuralNetMould *imageSize( int imageSize ) { + NeuralNetMould *imageSize(int imageSize) { this->_imageSize = imageSize; return this; } diff --git a/src/net/Trainable.cpp b/src/net/Trainable.cpp index 5188ff66..a1e755d7 100644 --- a/src/net/Trainable.cpp +++ b/src/net/Trainable.cpp @@ -16,14 +16,14 @@ using namespace std; #undef STATIC #define STATIC -//void Trainable::learnBatch( float learningRate, float const*images, float const *expectedOutput ) { -// setTraining( true ); -// forward( images); -// backward( learningRate, expectedOutput ); +//void Trainable::learnBatch(float learningRate, float const*images, float const *expectedOutput) { +// setTraining(true); +// forward(images); +// backward(learningRate, expectedOutput); //} -//void Trainable::learnBatchFromLabels( float learningRate, float const*images, int const *labels ) { -// setTraining( true ); -// forward( images); -// backwardFromLabels( learningRate, labels ); +//void Trainable::learnBatchFromLabels(float learningRate, float const*images, int const *labels) { +// setTraining(true); +// forward(images); +// backwardFromLabels(learningRate, labels); //} diff --git a/src/net/Trainable.h b/src/net/Trainable.h index b98e95f0..7dd97653 100644 --- a/src/net/Trainable.h +++ b/src/net/Trainable.h @@ -20,22 +20,22 @@ class TrainerMaker; class DeepCL_EXPORT Trainable { public: virtual ~Trainable() {} - virtual int getOutputSize() const = 0; - virtual float calcLoss(float const *expectedValues ) = 0; - virtual float calcLossFromLabels(int const *labels ) = 0; - virtual void setBatchSize( int batchSize ) = 0; - virtual void setTraining( bool training ) = 0; - virtual int calcNumRight( int const *labels ) = 0; - virtual void forward( float const*images) = 0; - virtual void backwardFromLabels( int const *labels) = 0; - virtual void backward( float const *expectedOutput) = 0; + virtual int getOutputNumElements() const = 0; + virtual float calcLoss(float const *expectedValues) = 0; + virtual float calcLossFromLabels(int const *labels) = 0; + virtual void setBatchSize(int batchSize) = 0; + virtual void setTraining(bool training) = 0; + virtual int calcNumRight(int const *labels) = 0; + virtual void forward(float const*images) = 0; + virtual void backwardFromLabels(int const *labels) = 0; + virtual void backward(float const *expectedOutput) = 0; virtual float const *getOutput() const = 0; virtual LossLayerMaker *cloneLossLayerMaker() const = 0; virtual int getOutputPlanes() const = 0; - virtual int getOutputImageSize() const = 0; + virtual int getOutputSize() const = 0; virtual int getInputCubeSize() const = 0; virtual int getOutputCubeSize() const = 0; -// virtual void setTrainer( TrainerMaker *trainer ) = 0; +// virtual void setTrainer(TrainerMaker *trainer) = 0; // [[[cog // import cog_addheaders diff --git a/src/netdef/NetdefToNet.cpp b/src/netdef/NetdefToNet.cpp index 33f8aab7..7e199864 100644 --- a/src/netdef/NetdefToNet.cpp +++ b/src/netdef/NetdefToNet.cpp @@ -25,42 +25,42 @@ using namespace std; // prefix-nn*(inner)-postfix // or: // prefix-nn*inner-postfix -STATIC std::string expandMultipliers( std::string netdef ) { +STATIC std::string expandMultipliers(std::string netdef) { int starPos = netdef.find("*"); - if( starPos != (int)string::npos ) { + if(starPos != (int)string::npos) { int prefixEnd = netdef.rfind("-", starPos); string prefix = ""; string nnString = ""; - if( prefixEnd == (int)string::npos ) { + if(prefixEnd == (int)string::npos) { 
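// note: this branch means no '-' was found to the left of the '*', so the
// repeat count opens the whole netdef (e.g. "2*32c5z"); otherwise whatever
// precedes the last '-' before the '*' is kept as an untouched prefix.
// Hand-traced expansion examples for this function (my reading of the
// substr arithmetic, worth double-checking):
//   "2*32c5z"                 -> "32c5z-32c5z"
//   "8c5z-3*(16c5z-mp2)-10n"  -> "8c5z-16c5z-mp2-16c5z-mp2-16c5z-mp2-10n"
// Nested brackets are not handled ("assume not nested for now" below).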
prefixEnd = -1; - nnString = netdef.substr(0, starPos ); + nnString = netdef.substr(0, starPos); } else { prefixEnd--; - prefix = netdef.substr(0, prefixEnd + 1 ); + prefix = netdef.substr(0, prefixEnd + 1); cout << "prefix: [" << prefix << "]" << endl; nnString = netdef.substr(prefixEnd + 2, starPos - prefixEnd - 2); } cout << "nnString: [" << nnString << "]" << endl; - int repeatNum = atoi( nnString); + int repeatNum = atoi(nnString); cout << "repeatNum " << repeatNum << endl; - string remainderString = netdef.substr( starPos + 1 ); + string remainderString = netdef.substr(starPos + 1); cout << "remainderString [" << remainderString << "]" << endl; string inner = ""; string postfix = ""; - if( remainderString.substr(0, 1 ) == "(" ) { + if(remainderString.substr(0, 1) == "(") { // need to find other ')', assume not nested for now... int rhBracket = remainderString.find(")"); - if( rhBracket == (int)string::npos ) { - throw runtime_error( "matching bracket not found in " + remainderString ); + if(rhBracket == (int)string::npos) { + throw runtime_error("matching bracket not found in " + remainderString); // return false; } - inner = remainderString.substr(1, rhBracket - 1 ); + inner = remainderString.substr(1, rhBracket - 1); cout << "inner [" << inner << "]" << endl; string newRemainder = remainderString.substr(rhBracket + 1); cout << "newRemainder [" << newRemainder << "]" << endl; - if( newRemainder != "" ) { - if( newRemainder[0] != '-' ) { - throw runtime_error( "expect '-' after ')' in " + remainderString ); + if(newRemainder != "") { + if(newRemainder[0] != '-') { + throw runtime_error("expect '-' after ')' in " + remainderString); // return false; } postfix = newRemainder.substr(1); @@ -68,33 +68,33 @@ STATIC std::string expandMultipliers( std::string netdef ) { } } else { int innerEnd = remainderString.find("-"); - if( innerEnd == (int)string::npos ) { + if(innerEnd == (int)string::npos) { innerEnd = remainderString.length(); } else { // innerEnd; - postfix = remainderString.substr( innerEnd + 1 ); + postfix = remainderString.substr(innerEnd + 1); cout << "postfix [" << postfix << "]" << endl; } - inner = remainderString.substr(0, innerEnd ); + inner = remainderString.substr(0, innerEnd); cout << "inner [" << inner << "]" << endl; -// if( remainderString.find("-") != string::npos ) { +// if(remainderString.find("-") != string::npos) { // sectionEndPos = remainderString.find("-"); // } } // return ""; - // if remainderString starts with (, then repeat up to next ) + // if remainderString starts with (, then repeat up to next) // otherwise, repeat up to next - // int sectionEndPos = remainderString.length(); // remainderString = string newString = prefix; - for( int i = 0; i < repeatNum; i++ ) { - if( newString != "" ) { + for(int i = 0; i < repeatNum; i++) { + if(newString != "") { newString += "-"; } - newString += expandMultipliers( inner ); + newString += expandMultipliers(inner); } - if( postfix != "" ) { - newString += "-" + expandMultipliers( postfix ); + if(postfix != "") { + newString += "-" + expandMultipliers(postfix); } cout << "multiplied string: " << newString << endl; return newString; @@ -103,54 +103,54 @@ STATIC std::string expandMultipliers( std::string netdef ) { } } -STATIC bool NetdefToNet::parseSubstring( WeightsInitializer *weightsInitializer, NeuralNet *net, std::string substring, bool isLast ) { +STATIC bool NetdefToNet::parseSubstring(WeightsInitializer *weightsInitializer, NeuralNet *net, std::string substring, bool isLast) { // cout << "substring [" << 
substring << "]" << endl; - vector<string> splitLayerDef = split( substring, "{" ); + vector<string> splitLayerDef = split(substring, "{"); string baseLayerDef = splitLayerDef[0]; // optionsDef = ""; vector<string> splitOptionsDef; // cout << "splitlayerdef.size() " << splitLayerDef.size() << endl; - if( splitLayerDef.size() == 2 ) { - string optionsDef = split( splitLayerDef[1], "}" )[0]; + if(splitLayerDef.size() == 2) { + string optionsDef = split(splitLayerDef[1], "}")[0]; // cout << "optionsDef [" << optionsDef << "]" << endl; - splitOptionsDef = split( optionsDef, "," ); + splitOptionsDef = split(optionsDef, ","); } - if( baseLayerDef.find("c") != string::npos ) { - vector<string> splitConvDef = split( baseLayerDef, "c" ); - int numFilters = atoi( splitConvDef[0] ); - vector<string> splitConvDef1 = split( splitConvDef[1], "z" ); - int filterSize = atoi( splitConvDef1[0] ); + if(baseLayerDef.find("c") != string::npos) { + vector<string> splitConvDef = split(baseLayerDef, "c"); + int numFilters = atoi(splitConvDef[0]); + vector<string> splitConvDef1 = split(splitConvDef[1], "z"); + int filterSize = atoi(splitConvDef1[0]); int skip = 0; ActivationFunction *fn = 0; int padZeros = 0; - if( splitConvDef1.size() == 2 ) { + if(splitConvDef1.size() == 2) { padZeros = 1; } - for( int i = 0; i < (int)splitOptionsDef.size(); i++ ) { + for(int i = 0; i < (int)splitOptionsDef.size(); i++) { string optionDef = splitOptionsDef[i]; // cout << "optionDef [" << optionDef << "]" << endl; - vector<string> splitOptionDef = split( optionDef, "="); + vector<string> splitOptionDef = split(optionDef, "="); string optionName = splitOptionDef[0]; - if( splitOptionDef.size() == 2 ) { + if(splitOptionDef.size() == 2) { string optionValue = splitOptionDef[1]; - if( optionName == "skip" ) { - skip = atoi( optionValue ); + if(optionName == "skip") { + skip = atoi(optionValue); cout << "got skip: " << skip << endl; } - } else if( splitOptionDef.size() == 1 ) { - if( optionName == "tanh" ) { + } else if(splitOptionDef.size() == 1) { + if(optionName == "tanh") { fn = new TanhActivation(); - } else if( optionName == "scaledtanh" ) { + } else if(optionName == "scaledtanh") { fn = new ScaledTanhActivation(); - } else if( optionName == "sigmoid" ) { + } else if(optionName == "sigmoid") { fn = new SigmoidActivation(); - } else if( optionName == "relu" ) { + } else if(optionName == "relu") { fn = new ReluActivation(); - } else if( optionName == "linear" ) { + } else if(optionName == "linear") { fn = new LinearActivation(); - } else if( optionName == "padzeros" ) { + } else if(optionName == "padzeros") { padZeros = 1; - } else if( optionName == "z" ) { + } else if(optionName == "z") { padZeros = 1; } else { cout << "Error: unknown subkey: [" << splitOptionsDef[i] << "]" << endl; @@ -161,56 +161,56 @@ STATIC bool NetdefToNet::parseSubstring( WeightsInitializer *weightsInitializer, return false; } } - net->addLayer( ConvolutionalMaker::instance()->numFilters(numFilters)->filterSize(filterSize)->padZeros( padZeros )->biased()->weightsInitializer( weightsInitializer ) ); - if( fn != 0 ) { - net->addLayer( ActivationMaker::instance()->fn( fn ) ); + net->addLayer(ConvolutionalMaker::instance()->numFilters(numFilters)->filterSize(filterSize)->padZeros(padZeros)->biased()->weightsInitializer(weightsInitializer) ); + if(fn != 0) { + net->addLayer(ActivationMaker::instance()->fn(fn) ); }
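// note: so a conv token such as "32c5z{tanh,skip=1}" reads as: 32 filters,
// filter size 5, "z" (or {padzeros}) for zero padding, with optional
// comma-separated flags in braces. A hedged sketch of what this branch
// emits for "32c5z{tanh}" (equivalent maker calls, not compiled):
//
//   net->addLayer(ConvolutionalMaker::instance()
//       ->numFilters(32)->filterSize(5)->padZeros(1)
//       ->biased()->weightsInitializer(weightsInitializer));
//   net->addLayer(ActivationMaker::instance()->fn(new TanhActivation()));
//
// The skip=N value is parsed here but only logged in this hunk.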
- } else if( baseLayerDef.find("mp") != string::npos ) { - vector<string> splitPoolDef = split( baseLayerDef, "mp" ); - int poolingSize = atoi( splitPoolDef[1] ); - net->addLayer( PoolingMaker::instance()->poolingSize(poolingSize) ); - } else if( baseLayerDef.find("drop") != string::npos ) { - net->addLayer( DropoutMaker::instance()->dropRatio(0.5f) ); - } else if( baseLayerDef.find("relu") != string::npos ) { - net->addLayer( ActivationMaker::instance()->relu() ); - } else if( baseLayerDef.find("tanh") != string::npos ) { - net->addLayer( ActivationMaker::instance()->tanh() ); - } else if( baseLayerDef.find("sigmoid") != string::npos ) { - net->addLayer( ActivationMaker::instance()->sigmoid() ); - } else if( baseLayerDef.find("linear") != string::npos ) { - net->addLayer( ActivationMaker::instance()->linear() ); // kind of pointless nop, but useful for testing - } else if( baseLayerDef.find("rp") != string::npos ) { - int patchSize = atoi( split( baseLayerDef, "rp" )[1] ); - net->addLayer( RandomPatchesMaker::instance()->patchSize( patchSize ) ); - } else if( baseLayerDef.find("rt") != string::npos ) { - int translateSize = atoi( split( baseLayerDef, "rt" )[1] ); - net->addLayer( RandomTranslationsMaker::instance()->translateSize( translateSize ) ); - } else if( baseLayerDef.find("n") != string::npos ) { - vector<string> fullDef = split( baseLayerDef, "n" ); - int numPlanes = atoi( fullDef[0] ); + } else if(baseLayerDef.find("mp") != string::npos) { + vector<string> splitPoolDef = split(baseLayerDef, "mp"); + int poolingSize = atoi(splitPoolDef[1]); + net->addLayer(PoolingMaker::instance()->poolingSize(poolingSize)); + } else if(baseLayerDef.find("drop") != string::npos) { + net->addLayer(DropoutMaker::instance()->dropRatio(0.5f)); + } else if(baseLayerDef.find("relu") != string::npos) { + net->addLayer(ActivationMaker::instance()->relu()); + } else if(baseLayerDef.find("tanh") != string::npos) { + net->addLayer(ActivationMaker::instance()->tanh()); + } else if(baseLayerDef.find("sigmoid") != string::npos) { + net->addLayer(ActivationMaker::instance()->sigmoid()); + } else if(baseLayerDef.find("linear") != string::npos) { + net->addLayer(ActivationMaker::instance()->linear()); // kind of pointless nop, but useful for testing + } else if(baseLayerDef.find("rp") != string::npos) { + int patchSize = atoi(split(baseLayerDef, "rp")[1]); + net->addLayer(RandomPatchesMaker::instance()->patchSize(patchSize) ); + } else if(baseLayerDef.find("rt") != string::npos) { + int translateSize = atoi(split(baseLayerDef, "rt")[1]); + net->addLayer(RandomTranslationsMaker::instance()->translateSize(translateSize) ); + } else if(baseLayerDef.find("n") != string::npos) { + vector<string> fullDef = split(baseLayerDef, "n"); + int numPlanes = atoi(fullDef[0]);
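// note: quick legend for the non-conv tokens handled above, as I read this
// hunk: "mp2" = 2x2 max-pooling, "drop" = dropout at a fixed 0.5 ratio,
// "relu"/"tanh"/"sigmoid"/"linear" = bare activation layers, "rp24" =
// random 24x24 training patches, "rt8" = random translations of up to 8
// pixels, and "NNn" (handled next) = a fully-connected layer with NN
// output planes. E.g. "rt2-8c5z-mp2-relu-150n-10n" is augmentation, a
// padded conv, pooling, relu, then two fully-connected layers.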
ActivationFunction *fn = 0; -// if( isLast ) { +// if(isLast) { // fn = new LinearActivation(); // } // int padZeros = 0; int biased = 1; - for( int i = 0; i < (int)splitOptionsDef.size(); i++ ) { + for(int i = 0; i < (int)splitOptionsDef.size(); i++) { string optionDef = splitOptionsDef[i]; // cout << "optionDef: " << optionDef << endl; - vector<string> splitOptionDef = split( optionDef, "="); + vector<string> splitOptionDef = split(optionDef, "="); string optionName = splitOptionDef[0]; - if( splitOptionDef.size() == 1 ) { - if( optionName == "tanh" ) { + if(splitOptionDef.size() == 1) { + if(optionName == "tanh") { fn = new TanhActivation(); - } else if( optionName == "scaledtanh" ) { + } else if(optionName == "scaledtanh") { fn = new ScaledTanhActivation(); - } else if( optionName == "sigmoid" ) { + } else if(optionName == "sigmoid") { fn = new SigmoidActivation(); - } else if( optionName == "relu" ) { + } else if(optionName == "relu") { fn = new ReluActivation(); - } else if( optionName == "nobias" ) { + } else if(optionName == "nobias") { biased = 0; - } else if( optionName == "linear" ) { + } else if(optionName == "linear") { fn = new LinearActivation(); } else { cout << "Error: unknown subkey: [" << splitOptionsDef[i] << "]" << endl; @@ -221,13 +221,13 @@ STATIC bool NetdefToNet::parseSubstring( WeightsInitializer *weightsInitializer, return false; } } - if( isLast && fn != 0 ) { + if(isLast && fn != 0) { cout << "Last fullyconnectedlayer must be linear (because softmax is the 'activationlayer' for this layer)" << endl; return false; } - net->addLayer( FullyConnectedMaker::instance()->numPlanes(numPlanes)->imageSize(1)->biased(biased)->weightsInitializer( weightsInitializer ) ); - if( fn != 0 ) { - net->addLayer( ActivationMaker::instance()->fn( fn ) ); + net->addLayer(FullyConnectedMaker::instance()->numPlanes(numPlanes)->imageSize(1)->biased(biased)->weightsInitializer(weightsInitializer) ); + if(fn != 0) { + net->addLayer(ActivationMaker::instance()->fn(fn) ); } } else { cout << "network definition " << baseLayerDef << " not recognised" << endl; @@ -236,32 +236,35 @@ STATIC bool NetdefToNet::parseSubstring( WeightsInitializer *weightsInitializer, return true; } -PUBLICAPI STATIC bool NetdefToNet::createNetFromNetdef( NeuralNet *net, std::string netdef ) { +PUBLICAPI STATIC bool NetdefToNet::createNetFromNetdef(NeuralNet *net, std::string netdef) { OriginalInitializer originalInitializer; - return createNetFromNetdef( net, netdef, &originalInitializer ); + return createNetFromNetdef(net, netdef, &originalInitializer); +} +PUBLICAPI STATIC bool NetdefToNet::createNetFromNetdefCharStar(NeuralNet *net, const char *netdef) { + OriginalInitializer originalInitializer; + return createNetFromNetdef(net, netdef, &originalInitializer); } -STATIC bool NetdefToNet::createNetFromNetdef( NeuralNet *net, std::string netdef, WeightsInitializer *weightsInitializer ) { - string netDefLower = toLower( netdef ); +STATIC bool NetdefToNet::createNetFromNetdef(NeuralNet *net, std::string netdef, WeightsInitializer *weightsInitializer) { + string netDefLower = toLower(netdef); // cout << "netDefLower [" << netDefLower << "]" << endl; try { - netDefLower = expandMultipliers( netDefLower ); - } catch( runtime_error &e ) { + netDefLower = expandMultipliers(netDefLower); + } catch(runtime_error &e) { cout << e.what() << endl; return false; } // cout << "netDefLower [" << netDefLower << "]" << endl; - vector<string> splitNetDef = split( netDefLower, "-" ); - if( netdef != "" ) { - for( int i = 0; i < (int)splitNetDef.size(); i++ ) { + vector<string> splitNetDef = split(netDefLower, "-"); + if(netdef != "") { + for(int i = 0; i < (int)splitNetDef.size(); i++) { string thisLayerDef = splitNetDef[i]; // cout << "thisLayerDef [" << thisLayerDef << "]" << endl; - if( !parseSubstring( weightsInitializer, net, thisLayerDef, i == (int)splitNetDef.size() - 1 ) ) { + if(!parseSubstring(weightsInitializer, net, thisLayerDef, i == (int)splitNetDef.size() - 1) ) { return false; } } } - net->addLayer( SoftMaxMaker::instance() ); + net->addLayer(SoftMaxMaker::instance()); return true; } - diff --git a/src/netdef/NetdefToNet.h b/src/netdef/NetdefToNet.h index d68b5ea2..5142f040 100644 --- a/src/netdef/NetdefToNet.h +++ b/src/netdef/NetdefToNet.h @@ -31,9 +31,10 @@ class DeepCL_EXPORT NetdefToNet { // cog_addheaders.add() // ]]] // generated, using cog: - STATIC bool parseSubstring( WeightsInitializer *weightsInitializer, NeuralNet *net, std::string substring, bool isLast ); - PUBLICAPI STATIC bool createNetFromNetdef( NeuralNet *net, std::string
netdef ); - STATIC bool createNetFromNetdef( NeuralNet *net, std::string netdef, WeightsInitializer *weightsInitializer ); + STATIC bool parseSubstring(WeightsInitializer *weightsInitializer, NeuralNet *net, std::string substring, bool isLast); + PUBLICAPI STATIC bool createNetFromNetdef(NeuralNet *net, std::string netdef); + PUBLICAPI STATIC bool createNetFromNetdefCharStar(NeuralNet *net, const char *netdef); + STATIC bool createNetFromNetdef(NeuralNet *net, std::string netdef, WeightsInitializer *weightsInitializer); // [[[end]]] }; diff --git a/src/normalize/NormalizationHelper.h b/src/normalize/NormalizationHelper.h index a5742ab9..cb99e75c 100644 --- a/src/normalize/NormalizationHelper.h +++ b/src/normalize/NormalizationHelper.h @@ -21,18 +21,18 @@ class DeepCL_EXPORT Statistics { float sumY; float sumYSquared; Statistics() { - memset( this, 0, sizeof( Statistics ) ); + memset(this, 0, sizeof(Statistics) ); } }; class DeepCL_EXPORT NormalizationHelper { public: - static void updateStatistics( float *Y, int length, int cubeSize, Statistics *statistics ) { + static void updateStatistics(float *Y, int length, int cubeSize, Statistics *statistics) { float thisSumY = 0; float thisSumYSquared = 0; float thisMin = Y[0]; float thisMax = Y[0]; - for( int i = 0; i < length * cubeSize; i++ ) { + for(int i = 0; i < length * cubeSize; i++) { float thisValue = Y[i]; thisSumY += thisValue; thisSumYSquared += (float)thisValue * (float)thisValue; @@ -48,76 +48,76 @@ class DeepCL_EXPORT NormalizationHelper { statistics->sumYSquared += thisSumYSquared; } - static void calcMeanAndStdDev( Statistics *statistics, float *p_mean, float *p_stdDev ) { + static void calcMeanAndStdDev(Statistics *statistics, float *p_mean, float *p_stdDev) { *p_mean = (float)statistics->sumY / statistics->count; - *p_stdDev = sqrt( ( statistics->sumYSquared - statistics->sumY * statistics->sumY / statistics->count ) / (statistics->count - 1 ) ); + *p_stdDev = sqrt(( statistics->sumYSquared - statistics->sumY * statistics->sumY / statistics->count) / (statistics->count - 1) ); } - static void getMeanAndStdDev( float *data, int length, float *p_mean, float *p_stdDev ) { + static void getMeanAndStdDev(float *data, int length, float *p_mean, float *p_stdDev) { // get mean of the dataset, and stddev // float thismax = 0; float sum = 0; - for( int i = 0; i < length; i++ ) { + for(int i = 0; i < length; i++) { float thisValue = data[i]; sum += thisValue; } float mean = sum / length; float sumSquaredDiff = 0; - for( int i = 0; i < length; i++ ) { + for(int i = 0; i < length; i++) { float thisValue = data[i]; // std::cout << "i " << i << "=" << thisValue << std::endl; float diffFromMean = thisValue - mean; float diffSquared = diffFromMean * diffFromMean; sumSquaredDiff += diffSquared; } - float stdDev = (float)std::sqrt( (double)(sumSquaredDiff / ( length - 1 )) ); + float stdDev = (float)std::sqrt((double)(sumSquaredDiff / (length - 1))); *p_mean = mean; *p_stdDev = stdDev; } - static void getMeanAndMaxDev( float *data, int length, float *p_mean, float *p_maxDev ) { + static void getMeanAndMaxDev(float *data, int length, float *p_mean, float *p_maxDev) { // get mean of the dataset, and stddev // float thismax = 0; float sum = 0; - for( int i = 0; i < length; i++ ) { + for(int i = 0; i < length; i++) { float thisValue = data[i]; sum += thisValue; } float mean = sum / length; // float sumSquaredDiff = 0; -// for( int i = 0; i < length; i++ ) { +// for(int i = 0; i < length; i++) { // int thisValue = (int)data[i]; // float diffFromMean = 
thisValue - mean; // float diffSquared = diffFromMean * diffFromMean; // sumSquaredDiff += diffSquared; // } -// float stdDev = sqrt( sumSquaredDiff / ( length - 1 ) ); +// float stdDev = sqrt(sumSquaredDiff / (length - 1) ); *p_mean = mean; - *p_maxDev = std::max( 255-mean, mean ); + *p_maxDev = std::max(255-mean, mean); } - static void getMinMax( float *data, int length, float *p_middle, float *p_maxDev ) { + static void getMinMax(float *data, int length, float *p_middle, float *p_maxDev) { // get mean of the dataset, and stddev float thismin = 0; float thismax = 0; // float sum = 0; - for( int i = 0; i < length; i++ ) { + for(int i = 0; i < length; i++) { float thisValue = data[i]; - thismin = std::min( thisValue, thismin ); - thismax = std::max( thisValue, thismax ); + thismin = std::min(thisValue, thismin); + thismax = std::max(thisValue, thismax); } - *p_middle = ( thismax + thismin ) / 2; // pick number in the middle - *p_maxDev = ( thismax - thismin ) / 2; // distance from middle of range to either end + *p_middle = (thismax + thismin) / 2; // pick number in the middle + *p_maxDev = (thismax - thismin) / 2; // distance from middle of range to either end } - static void normalize( float *data, int length, float mean, float scaling ) { - for( int i = 0; i < length; i++ ) { - data[i] = ( data[i] - mean ) / scaling; + static void normalize(float *data, int length, float mean, float scaling) { + for(int i = 0; i < length; i++) { + data[i] = (data[i] - mean) / scaling; } } }; diff --git a/src/normalize/NormalizationLayer.cpp b/src/normalize/NormalizationLayer.cpp index 12afc15b..7f3c2586 100644 --- a/src/normalize/NormalizationLayer.cpp +++ b/src/normalize/NormalizationLayer.cpp @@ -13,18 +13,18 @@ using namespace std; #undef VIRTUAL #define VIRTUAL -NormalizationLayer::NormalizationLayer( Layer *previousLayer, NormalizationLayerMaker *maker ) : - Layer( previousLayer, maker ), - translate( maker->_translate ), - scale( maker->_scale ), - outputPlanes( previousLayer->getOutputPlanes() ), - outputImageSize( previousLayer->getOutputImageSize() ), +NormalizationLayer::NormalizationLayer(Layer *previousLayer, NormalizationLayerMaker *maker) : + Layer(previousLayer, maker), + translate(maker->_translate), + scale(maker->_scale), + outputPlanes(previousLayer->getOutputPlanes()), + outputSize(previousLayer->getOutputSize()), batchSize(0), allocatedSize(0), output(0) { } VIRTUAL NormalizationLayer::~NormalizationLayer() { - if( output != 0 ) { + if(output != 0) { delete[] output; } } @@ -37,22 +37,22 @@ VIRTUAL float *NormalizationLayer::getOutput() { VIRTUAL ActivationFunction const *NormalizationLayer::getActivationFunction() { return new LinearActivation(); } -VIRTUAL int NormalizationLayer::getPersistSize( int version ) const { - if( version == 1 ) { +VIRTUAL int NormalizationLayer::getPersistSize(int version) const { + if(version == 1) { return 0; } return 2; } -VIRTUAL void NormalizationLayer::persistToArray( int version, float *array ) { - if( version == 1 ) { +VIRTUAL void NormalizationLayer::persistToArray(int version, float *array) { + if(version == 1) { return; } array[0] = translate; array[1] = scale; } /// \brief initialize the current weights and biases from array -VIRTUAL void NormalizationLayer::unpersistFromArray( int version, float const*array ) { - if( version == 1 ) { +VIRTUAL void NormalizationLayer::unpersistFromArray(int version, float const*array) { + if(version == 1) { return; } translate = array[0]; @@ -62,31 +62,31 @@ VIRTUAL bool NormalizationLayer::needsBackProp() { 
return previousLayer->needsBackProp(); } VIRTUAL void NormalizationLayer::printOutput() const { - if( output == 0 ) { + if(output == 0) { return; } - for( int n = 0; n < std::min(5,batchSize); n++ ) { + for(int n = 0; n < std::min(5,batchSize); n++) { std::cout << "NormalizationLayer n " << n << ":" << std::endl; - for( int plane = 0; plane < std::min( 5, outputPlanes); plane++ ) { - if( outputPlanes > 1 ) std::cout << " plane " << plane << ":" << std::endl; - for( int i = 0; i < std::min(5, outputImageSize); i++ ) { + for(int plane = 0; plane < std::min(5, outputPlanes); plane++) { + if(outputPlanes > 1) std::cout << " plane " << plane << ":" << std::endl; + for(int i = 0; i < std::min(5, outputSize); i++) { std::cout << " "; - for( int j = 0; j < std::min(5, outputImageSize); j++ ) { - std::cout << getResult( n, plane, i, j ) << " "; + for(int j = 0; j < std::min(5, outputSize); j++) { + std::cout << getResult(n, plane, i, j) << " "; //output[ // n * numPlanes * imageSize*imageSize + // plane*imageSize*imageSize + // i * imageSize + // j ] << " "; } - if( outputImageSize > 5 ) std::cout << " ... "; + if(outputSize > 5) std::cout << " ... "; std::cout << std::endl; } - if( outputImageSize > 5 ) std::cout << " ... " << std::endl; + if(outputSize > 5) std::cout << " ... " << std::endl; } - if( outputPlanes > 5 ) std::cout << " ... other planes ... " << std::endl; + if(outputPlanes > 5) std::cout << " ... other planes ... " << std::endl; } - if( batchSize > 5 ) std::cout << " ... other n ... " << std::endl; + if(batchSize > 5) std::cout << " ... other n ... " << std::endl; } VIRTUAL void NormalizationLayer::print() const { printOutput(); @@ -94,46 +94,46 @@ VIRTUAL void NormalizationLayer::print() const { VIRTUAL bool NormalizationLayer::needErrorsBackprop() { return false; } -VIRTUAL void NormalizationLayer::setBatchSize( int batchSize ) { - if( batchSize <= allocatedSize ) { +VIRTUAL void NormalizationLayer::setBatchSize(int batchSize) { + if(batchSize <= allocatedSize) { this->batchSize = batchSize; return; } - if( output != 0 ) { + if(output != 0) { delete[] output; } this->batchSize = batchSize; this->allocatedSize = allocatedSize; - output = new float[ getOutputSize() ]; + output = new float[ getOutputNumElements() ]; } VIRTUAL void NormalizationLayer::forward() { - int totalLinearLength = getOutputSize(); + int totalLinearLength = getOutputNumElements(); float *upstreamOutput = previousLayer->getOutput(); - for( int i = 0; i < totalLinearLength; i++ ) { - output[i] = ( upstreamOutput[i] + translate ) * scale; + for(int i = 0; i < totalLinearLength; i++) { + output[i] = (upstreamOutput[i] + translate) * scale; } } -VIRTUAL void NormalizationLayer::backward( float learningRate, float const *gradOutput ) { +VIRTUAL void NormalizationLayer::backward(float learningRate, float const *gradOutput) { // do nothing... 
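// note: this backward() is deliberately empty: translate and scale are
// fixed constants supplied by the maker (not trained), so there are no
// weight gradients to compute here, and as this hunk stands the layer
// does not pass gradients upstream either (needErrorsBackprop() returns
// false above).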
} -VIRTUAL int NormalizationLayer::getOutputImageSize() const { - return outputImageSize; +VIRTUAL int NormalizationLayer::getOutputSize() const { + return outputSize; } VIRTUAL int NormalizationLayer::getOutputPlanes() const { return outputPlanes; } VIRTUAL int NormalizationLayer::getOutputCubeSize() const { - return outputPlanes * outputImageSize * outputImageSize; + return outputPlanes * outputSize * outputSize; } -VIRTUAL int NormalizationLayer::getOutputSize() const { +VIRTUAL int NormalizationLayer::getOutputNumElements() const { return batchSize * getOutputCubeSize(); } VIRTUAL std::string NormalizationLayer::toString() { return toString(); } VIRTUAL std::string NormalizationLayer::asString() const { - return std::string("") + "NormalizationLayer{ outputPlanes=" + ::toString( outputPlanes ) + " outputImageSize=" + ::toString( outputImageSize ) + " translate=" + ::toString( translate ) + - " scale=" + ::toString( scale ) + " }"; + return std::string("") + "NormalizationLayer{ outputPlanes=" + ::toString(outputPlanes) + " outputSize=" + ::toString(outputSize) + " translate=" + ::toString(translate) + + " scale=" + ::toString(scale) + " }"; } diff --git a/src/normalize/NormalizationLayer.h b/src/normalize/NormalizationLayer.h index 19e0cb80..b5987246 100644 --- a/src/normalize/NormalizationLayer.h +++ b/src/normalize/NormalizationLayer.h @@ -20,20 +20,20 @@ class NormalizationLayer : public Layer, IHasToString { float scale; // then scale const int outputPlanes; - const int outputImageSize; + const int outputSize; int batchSize; int allocatedSize; float *output; - inline int getResultIndex( int n, int outPlane, int outRow, int outCol ) const { - return ( ( n - * outputPlanes + outPlane ) - * outputImageSize + outRow ) - * outputImageSize + outCol; + inline int getResultIndex(int n, int outPlane, int outRow, int outCol) const { + return (( n + * outputPlanes + outPlane) + * outputSize + outRow) + * outputSize + outCol; } - inline float getResult( int n, int outPlane, int outRow, int outCol ) const { - return output[ getResultIndex(n,outPlane, outRow, outCol ) ]; + inline float getResult(int n, int outPlane, int outRow, int outCol) const { + return output[ getResultIndex(n,outPlane, outRow, outCol) ]; } // [[[cog @@ -41,31 +41,31 @@ class NormalizationLayer : public Layer, IHasToString { // cog_addheaders.add() // ]]] // generated, using cog: - NormalizationLayer( Layer *previousLayer, NormalizationLayerMaker *maker ); + NormalizationLayer(Layer *previousLayer, NormalizationLayerMaker *maker); VIRTUAL ~NormalizationLayer(); VIRTUAL std::string getClassName() const; VIRTUAL float *getOutput(); VIRTUAL ActivationFunction const *getActivationFunction(); - VIRTUAL int getPersistSize( int version ) const; - VIRTUAL void persistToArray( int version, float *array ); - VIRTUAL void unpersistFromArray( int version, float const*array ); + VIRTUAL int getPersistSize(int version) const; + VIRTUAL void persistToArray(int version, float *array); + VIRTUAL void unpersistFromArray(int version, float const*array); VIRTUAL bool needsBackProp(); VIRTUAL void printOutput() const; VIRTUAL void print() const; VIRTUAL bool needErrorsBackprop(); - VIRTUAL void setBatchSize( int batchSize ); + VIRTUAL void setBatchSize(int batchSize); VIRTUAL void forward(); - VIRTUAL void backward( float learningRate, float const *gradOutput ); - VIRTUAL int getOutputImageSize() const; + VIRTUAL void backward(float learningRate, float const *gradOutput); + VIRTUAL int getOutputSize() const; VIRTUAL int getOutputPlanes() const; 
VIRTUAL int getOutputCubeSize() const; - VIRTUAL int getOutputSize() const; + VIRTUAL int getOutputNumElements() const; VIRTUAL std::string toString(); VIRTUAL std::string asString() const; // [[[end]]] }; -std::ostream &operator<<( std::ostream &os, NormalizationLayer &layer ); -std::ostream &operator<<( std::ostream &os, NormalizationLayer const*layer ); +std::ostream &operator<<(std::ostream &os, NormalizationLayer &layer); +std::ostream &operator<<(std::ostream &os, NormalizationLayer const*layer); diff --git a/src/normalize/NormalizationLayerMaker.cpp b/src/normalize/NormalizationLayerMaker.cpp index 937b9379..29f9324f 100644 --- a/src/normalize/NormalizationLayerMaker.cpp +++ b/src/normalize/NormalizationLayerMaker.cpp @@ -10,8 +10,8 @@ using namespace std; -Layer *NormalizationLayerMaker::createLayer( Layer *previousLayer ) { - return new NormalizationLayer( previousLayer, this ); +Layer *NormalizationLayerMaker::createLayer(Layer *previousLayer) { + return new NormalizationLayer(previousLayer, this); } diff --git a/src/normalize/NormalizationLayerMaker.h b/src/normalize/NormalizationLayerMaker.h index b27402c0..73955305 100644 --- a/src/normalize/NormalizationLayerMaker.h +++ b/src/normalize/NormalizationLayerMaker.h @@ -23,17 +23,17 @@ class DeepCL_EXPORT NormalizationLayerMaker : public LayerMaker2 { float _scale; PUBLICAPI NormalizationLayerMaker() : _translate(0.0f), - _scale( 1.0f ) { + _scale(1.0f) { } -// NormalizationLayerMaker( float _translate, float _scale ) : +// NormalizationLayerMaker(float _translate, float _scale) : // _translate(_translate), -// _scale( _scale ) { +// _scale(_scale) { // } - PUBLICAPI NormalizationLayerMaker *translate( float _translate ) { + PUBLICAPI NormalizationLayerMaker *translate(float _translate) { this->_translate = _translate; return this; } - PUBLICAPI NormalizationLayerMaker *scale( float _scale ) { + PUBLICAPI NormalizationLayerMaker *scale(float _scale) { this->_scale = _scale; return this; } @@ -42,9 +42,9 @@ class DeepCL_EXPORT NormalizationLayerMaker : public LayerMaker2 { } virtual NormalizationLayerMaker *clone() const { NormalizationLayerMaker *thisClone = new NormalizationLayerMaker(); - memcpy( thisClone, this, sizeof( NormalizationLayerMaker ) ); + memcpy(thisClone, this, sizeof(NormalizationLayerMaker) ); return thisClone; } - virtual Layer *createLayer( Layer *previousLayer ); + virtual Layer *createLayer(Layer *previousLayer); }; diff --git a/src/patches/PatchExtractor.cpp b/src/patches/PatchExtractor.cpp index 995d2993..690df329 100644 --- a/src/patches/PatchExtractor.cpp +++ b/src/patches/PatchExtractor.cpp @@ -10,16 +10,16 @@ using namespace std; -void PatchExtractor::extractPatch( int n, int numPlanes, int imageSize, int patchSize, int patchRow, int patchCol, float *source, float *destination ) { +void PatchExtractor::extractPatch(int n, int numPlanes, int imageSize, int patchSize, int patchRow, int patchCol, float *source, float *destination) { // int n = 0; - for( int plane = 0; plane < numPlanes; plane++ ) { - float *upstreamImage = source + ( n * numPlanes + plane ) * imageSize * imageSize; - float *outputImage = destination + ( n * numPlanes + plane ) * patchSize * patchSize; - for( int outRow = 0; outRow < patchSize; outRow++ ) { + for(int plane = 0; plane < numPlanes; plane++) { + float *upstreamImage = source + (n * numPlanes + plane) * imageSize * imageSize; + float *outputImage = destination + (n * numPlanes + plane) * patchSize * patchSize; + for(int outRow = 0; outRow < patchSize; outRow++) { const int 
inRow = outRow + patchRow; - memcpy( &(outputImage[ outRow * patchSize ]), + memcpy(&(outputImage[ outRow * patchSize ]), &(upstreamImage[ inRow * imageSize + patchCol ]), - patchSize * sizeof(float) ); + patchSize * sizeof(float)); } } } diff --git a/src/patches/PatchExtractor.h b/src/patches/PatchExtractor.h index c5e0e93f..c75f1197 100644 --- a/src/patches/PatchExtractor.h +++ b/src/patches/PatchExtractor.h @@ -8,6 +8,6 @@ class PatchExtractor { public: - static void extractPatch( int n, int numPlanes, int imageSize, int patchSize, int patchRow, int patchCol, float *source, float *destination ); + static void extractPatch(int n, int numPlanes, int imageSize, int patchSize, int patchRow, int patchCol, float *source, float *destination); }; diff --git a/src/patches/RandomPatches.cpp b/src/patches/RandomPatches.cpp index 2d7faf5d..06c095df 100644 --- a/src/patches/RandomPatches.cpp +++ b/src/patches/RandomPatches.cpp @@ -20,49 +20,49 @@ using namespace std; #undef STATIC #define STATIC -RandomPatches::RandomPatches( Layer *previousLayer, RandomPatchesMaker *maker ) : - Layer( previousLayer, maker ), - patchSize( maker->_patchSize ), - numPlanes ( previousLayer->getOutputPlanes() ), - inputImageSize( previousLayer->getOutputImageSize() ), - outputImageSize( maker->_patchSize ), +RandomPatches::RandomPatches(Layer *previousLayer, RandomPatchesMaker *maker) : + Layer(previousLayer, maker), + patchSize(maker->_patchSize), + numPlanes (previousLayer->getOutputPlanes()), + inputSize(previousLayer->getOutputSize()), + outputSize(maker->_patchSize), output(0), batchSize(0), allocatedSize(0) { - if( inputImageSize == 0 ) { + if(inputSize == 0) { // maker->net->print(); - throw runtime_error("Error: Pooling layer " + toString( layerIndex ) + ": input image size is 0" ); + throw runtime_error("Error: Pooling layer " + toString(layerIndex) + ": input image size is 0"); } - if( outputImageSize == 0 ) { + if(outputSize == 0) { // maker->net->print(); - throw runtime_error("Error: Pooling layer " + toString( layerIndex ) + ": output image size is 0" ); + throw runtime_error("Error: Pooling layer " + toString(layerIndex) + ": output image size is 0"); } - if( previousLayer->needsBackProp() ) { + if(previousLayer->needsBackProp()) { throw runtime_error("Error: RandomPatches layer does not provide backprop currently, so you cannot put it after a layer that needs backprop"); } } VIRTUAL RandomPatches::~RandomPatches() { - if( output != 0 ) { + if(output != 0) { delete[] output; } } VIRTUAL std::string RandomPatches::getClassName() const { return "RandomPatches"; } -VIRTUAL void RandomPatches::setBatchSize( int batchSize ) { - if( batchSize <= allocatedSize ) { +VIRTUAL void RandomPatches::setBatchSize(int batchSize) { + if(batchSize <= allocatedSize) { this->batchSize = batchSize; return; } - if( output != 0 ) { + if(output != 0) { delete[] output; } this->batchSize = batchSize; this->allocatedSize = batchSize; - output = new float[ getOutputSize() ]; + output = new float[ getOutputNumElements() ]; } -VIRTUAL int RandomPatches::getOutputSize() { - return batchSize * numPlanes * outputImageSize * outputImageSize; +VIRTUAL int RandomPatches::getOutputNumElements() { + return batchSize * numPlanes * outputSize * outputSize; } VIRTUAL float *RandomPatches::getOutput() { return output; @@ -70,16 +70,16 @@ VIRTUAL float *RandomPatches::getOutput() { VIRTUAL bool RandomPatches::needsBackProp() { return false; } -VIRTUAL int RandomPatches::getOutputSize() const { - return batchSize * numPlanes * outputImageSize * 
outputImageSize; +VIRTUAL int RandomPatches::getOutputNumElements() const { + return batchSize * numPlanes * outputSize * outputSize; } -VIRTUAL int RandomPatches::getOutputImageSize() const { - return outputImageSize; +VIRTUAL int RandomPatches::getOutputSize() const { + return outputSize; } VIRTUAL int RandomPatches::getOutputPlanes() const { return numPlanes; } -VIRTUAL int RandomPatches::getPersistSize( int version ) const { +VIRTUAL int RandomPatches::getPersistSize(int version) const { return 0; } VIRTUAL bool RandomPatches::providesGradInputWrapper() const { @@ -90,19 +90,19 @@ VIRTUAL bool RandomPatches::hasOutputWrapper() const { } VIRTUAL void RandomPatches::forward() { float *upstreamOutput = previousLayer->getOutput(); - for( int n = 0; n < batchSize; n++ ) { - int patchMargin = inputImageSize - outputImageSize; + for(int n = 0; n < batchSize; n++) { + int patchMargin = inputSize - outputSize; int patchRow = patchMargin / 2; int patchCol = patchMargin / 2; - if( training ) { - patchRow = RandomSingleton::instance()->uniformInt( 0, patchMargin ); - patchCol = RandomSingleton::instance()->uniformInt( 0, patchMargin ); + if(training) { + patchRow = RandomSingleton::instance()->uniformInt(0, patchMargin); + patchCol = RandomSingleton::instance()->uniformInt(0, patchMargin); } - PatchExtractor::extractPatch( n, numPlanes, inputImageSize, patchSize, patchRow, patchCol, upstreamOutput, output ); + PatchExtractor::extractPatch(n, numPlanes, inputSize, patchSize, patchRow, patchCol, upstreamOutput, output); } } VIRTUAL std::string RandomPatches::asString() const { - return "RandomPatches{ inputPlanes=" + toString(numPlanes) + " inputImageSize=" + toString(inputImageSize) + " patchSize=" + toString( patchSize ) + " }"; + return "RandomPatches{ inputPlanes=" + toString(numPlanes) + " inputSize=" + toString(inputSize) + " patchSize=" + toString(patchSize) + " }"; } diff --git a/src/patches/RandomPatches.h b/src/patches/RandomPatches.h index e3d6ebb5..b0cbba97 100644 --- a/src/patches/RandomPatches.h +++ b/src/patches/RandomPatches.h @@ -21,9 +21,9 @@ class RandomPatches : public Layer { public: const int patchSize; const int numPlanes; - const int inputImageSize; + const int inputSize; - const int outputImageSize; + const int outputSize; float *output; @@ -35,17 +35,17 @@ class RandomPatches : public Layer { // cog_addheaders.add() // ]]] // generated, using cog: - RandomPatches( Layer *previousLayer, RandomPatchesMaker *maker ); + RandomPatches(Layer *previousLayer, RandomPatchesMaker *maker); VIRTUAL ~RandomPatches(); VIRTUAL std::string getClassName() const; - VIRTUAL void setBatchSize( int batchSize ); - VIRTUAL int getOutputSize(); + VIRTUAL void setBatchSize(int batchSize); + VIRTUAL int getOutputNumElements(); VIRTUAL float *getOutput(); VIRTUAL bool needsBackProp(); + VIRTUAL int getOutputNumElements() const; VIRTUAL int getOutputSize() const; - VIRTUAL int getOutputImageSize() const; VIRTUAL int getOutputPlanes() const; - VIRTUAL int getPersistSize( int version ) const; + VIRTUAL int getPersistSize(int version) const; VIRTUAL bool providesGradInputWrapper() const; VIRTUAL bool hasOutputWrapper() const; VIRTUAL void forward(); diff --git a/src/patches/RandomPatchesMaker.cpp b/src/patches/RandomPatchesMaker.cpp index 03c80264..370929b0 100644 --- a/src/patches/RandomPatchesMaker.cpp +++ b/src/patches/RandomPatchesMaker.cpp @@ -10,7 +10,7 @@ using namespace std; -Layer *RandomPatchesMaker::createLayer( Layer *previousLayer ) { - return new RandomPatches( previousLayer, this ); +Layer 
*RandomPatchesMaker::createLayer(Layer *previousLayer) { + return new RandomPatches(previousLayer, this); } diff --git a/src/patches/RandomPatchesMaker.h b/src/patches/RandomPatchesMaker.h index 99110a62..952b504f 100644 --- a/src/patches/RandomPatchesMaker.h +++ b/src/patches/RandomPatchesMaker.h @@ -27,7 +27,7 @@ class DeepCL_EXPORT RandomPatchesMaker : public LayerMaker2 { PUBLICAPI RandomPatchesMaker() : _patchSize(0) { } - PUBLICAPI RandomPatchesMaker *patchSize( int _patchSize ) { + PUBLICAPI RandomPatchesMaker *patchSize(int _patchSize) { this->_patchSize = _patchSize; return this; } @@ -36,9 +36,9 @@ class DeepCL_EXPORT RandomPatchesMaker : public LayerMaker2 { } virtual RandomPatchesMaker *clone() const { RandomPatchesMaker *thisClone = new RandomPatchesMaker(); - memcpy( thisClone, this, sizeof( RandomPatchesMaker ) ); + memcpy(thisClone, this, sizeof(RandomPatchesMaker) ); return thisClone; } - virtual Layer *createLayer( Layer *previousLayer ); + virtual Layer *createLayer(Layer *previousLayer); }; diff --git a/src/patches/RandomTranslations.cpp b/src/patches/RandomTranslations.cpp index d3dbedd8..428b8f9f 100644 --- a/src/patches/RandomTranslations.cpp +++ b/src/patches/RandomTranslations.cpp @@ -20,49 +20,49 @@ using namespace std; #undef STATIC #define STATIC -RandomTranslations::RandomTranslations( Layer *previousLayer, RandomTranslationsMaker *maker ) : - Layer( previousLayer, maker ), - translateSize( maker->_translateSize ), - numPlanes ( previousLayer->getOutputPlanes() ), - inputImageSize( previousLayer->getOutputImageSize() ), - outputImageSize( previousLayer->getOutputImageSize() ), +RandomTranslations::RandomTranslations(Layer *previousLayer, RandomTranslationsMaker *maker) : + Layer(previousLayer, maker), + translateSize(maker->_translateSize), + numPlanes (previousLayer->getOutputPlanes()), + inputSize(previousLayer->getOutputSize()), + outputSize(previousLayer->getOutputSize()), output(0), batchSize(0), allocatedSize(0) { - if( inputImageSize == 0 ) { + if(inputSize == 0) { // maker->net->print(); - throw runtime_error("Error: Pooling layer " + toString( layerIndex ) + ": input image size is 0" ); + throw runtime_error("Error: Pooling layer " + toString(layerIndex) + ": input image size is 0"); } - if( outputImageSize == 0 ) { + if(outputSize == 0) { // maker->net->print(); - throw runtime_error("Error: Pooling layer " + toString( layerIndex ) + ": output image size is 0" ); + throw runtime_error("Error: Pooling layer " + toString(layerIndex) + ": output image size is 0"); } - if( previousLayer->needsBackProp() ) { + if(previousLayer->needsBackProp()) { throw runtime_error("Error: RandomTranslations layer does not provide backprop currently, so you cannot put it after a layer that needs backprop"); } } VIRTUAL RandomTranslations::~RandomTranslations() { - if( output != 0 ) { + if(output != 0) { delete[] output; } } VIRTUAL std::string RandomTranslations::getClassName() const { return "RandomTranslations"; } -VIRTUAL void RandomTranslations::setBatchSize( int batchSize ) { - if( batchSize <= allocatedSize ) { +VIRTUAL void RandomTranslations::setBatchSize(int batchSize) { + if(batchSize <= allocatedSize) { this->batchSize = batchSize; return; } - if( output != 0 ) { + if(output != 0) { delete[] output; } this->batchSize = batchSize; this->allocatedSize = batchSize; - output = new float[ getOutputSize() ]; + output = new float[ getOutputNumElements() ]; } -VIRTUAL int RandomTranslations::getOutputSize() { - return batchSize * numPlanes * outputImageSize * 
outputImageSize; +VIRTUAL int RandomTranslations::getOutputNumElements() { + return batchSize * numPlanes * outputSize * outputSize; } VIRTUAL float *RandomTranslations::getOutput() { return output; @@ -70,16 +70,16 @@ VIRTUAL float *RandomTranslations::getOutput() { VIRTUAL bool RandomTranslations::needsBackProp() { return false; } -VIRTUAL int RandomTranslations::getOutputSize() const { - return batchSize * numPlanes * outputImageSize * outputImageSize; +VIRTUAL int RandomTranslations::getOutputNumElements() const { + return batchSize * numPlanes * outputSize * outputSize; } -VIRTUAL int RandomTranslations::getOutputImageSize() const { - return outputImageSize; +VIRTUAL int RandomTranslations::getOutputSize() const { + return outputSize; } VIRTUAL int RandomTranslations::getOutputPlanes() const { return numPlanes; } -VIRTUAL int RandomTranslations::getPersistSize( int version ) const { +VIRTUAL int RandomTranslations::getPersistSize(int version) const { return 0; } VIRTUAL bool RandomTranslations::providesGradInputWrapper() const { @@ -90,18 +90,18 @@ VIRTUAL bool RandomTranslations::hasOutputWrapper() const { } VIRTUAL void RandomTranslations::forward() { float *upstreamOutput = previousLayer->getOutput(); - if( !training ) { - memcpy( output, upstreamOutput, sizeof(float) * getOutputSize() ); + if(!training) { + memcpy(output, upstreamOutput, sizeof(float) * getOutputNumElements()); return; } - for( int n = 0; n < batchSize; n++ ) { - const int translateRows = RandomSingleton::instance()->uniformInt( - translateSize, translateSize ); - const int translateCols = RandomSingleton::instance()->uniformInt( - translateSize, translateSize ); - Translator::translate( n, numPlanes, inputImageSize, translateRows, translateCols, upstreamOutput, output ); + for(int n = 0; n < batchSize; n++) { + const int translateRows = RandomSingleton::instance()->uniformInt(- translateSize, translateSize); + const int translateCols = RandomSingleton::instance()->uniformInt(- translateSize, translateSize); + Translator::translate(n, numPlanes, inputSize, translateRows, translateCols, upstreamOutput, output); } } VIRTUAL std::string RandomTranslations::asString() const { - return "RandomTranslations{ inputPlanes=" + toString(numPlanes) + " inputImageSize=" + toString(inputImageSize) + " translateSize=" + toString( translateSize ) + " }"; + return "RandomTranslations{ inputPlanes=" + toString(numPlanes) + " inputSize=" + toString(inputSize) + " translateSize=" + toString(translateSize) + " }"; } diff --git a/src/patches/RandomTranslations.h b/src/patches/RandomTranslations.h index ed82703b..feb342c0 100644 --- a/src/patches/RandomTranslations.h +++ b/src/patches/RandomTranslations.h @@ -21,9 +21,9 @@ class RandomTranslations : public Layer { public: const int translateSize; const int numPlanes; - const int inputImageSize; + const int inputSize; - const int outputImageSize; + const int outputSize; float *output; @@ -35,17 +35,17 @@ class RandomTranslations : public Layer { // cog_addheaders.add() // ]]] // generated, using cog: - RandomTranslations( Layer *previousLayer, RandomTranslationsMaker *maker ); + RandomTranslations(Layer *previousLayer, RandomTranslationsMaker *maker); VIRTUAL ~RandomTranslations(); VIRTUAL std::string getClassName() const; - VIRTUAL void setBatchSize( int batchSize ); - VIRTUAL int getOutputSize(); + VIRTUAL void setBatchSize(int batchSize); + VIRTUAL int getOutputNumElements(); VIRTUAL float *getOutput(); VIRTUAL bool needsBackProp(); + VIRTUAL int getOutputNumElements() const; VIRTUAL 
int getOutputSize() const; - VIRTUAL int getOutputImageSize() const; VIRTUAL int getOutputPlanes() const; - VIRTUAL int getPersistSize( int version ) const; + VIRTUAL int getPersistSize(int version) const; VIRTUAL bool providesGradInputWrapper() const; VIRTUAL bool hasOutputWrapper() const; VIRTUAL void forward(); diff --git a/src/patches/RandomTranslationsMaker.cpp b/src/patches/RandomTranslationsMaker.cpp index 3c33c11a..20722c39 100644 --- a/src/patches/RandomTranslationsMaker.cpp +++ b/src/patches/RandomTranslationsMaker.cpp @@ -8,7 +8,7 @@ #include "RandomTranslationsMaker.h" -Layer *RandomTranslationsMaker::createLayer( Layer *previousLayer ) { - return new RandomTranslations( previousLayer, this ); +Layer *RandomTranslationsMaker::createLayer(Layer *previousLayer) { + return new RandomTranslations(previousLayer, this); } diff --git a/src/patches/RandomTranslationsMaker.h b/src/patches/RandomTranslationsMaker.h index 5d52c12e..1e24037e 100644 --- a/src/patches/RandomTranslationsMaker.h +++ b/src/patches/RandomTranslationsMaker.h @@ -28,15 +28,15 @@ class DeepCL_EXPORT RandomTranslationsMaker : public LayerMaker2 { PUBLICAPI static RandomTranslationsMaker *instance() { return new RandomTranslationsMaker(); } - PUBLICAPI RandomTranslationsMaker *translateSize( int _translateSize ) { + PUBLICAPI RandomTranslationsMaker *translateSize(int _translateSize) { this->_translateSize = _translateSize; return this; } virtual RandomTranslationsMaker *clone() const { RandomTranslationsMaker *thisClone = new RandomTranslationsMaker(); - memcpy( thisClone, this, sizeof( RandomTranslationsMaker ) ); + memcpy(thisClone, this, sizeof(RandomTranslationsMaker) ); return thisClone; } - virtual Layer *createLayer( Layer *previousLayer ); + virtual Layer *createLayer(Layer *previousLayer); }; diff --git a/src/patches/Translator.cpp b/src/patches/Translator.cpp index bef91063..b16312bf 100644 --- a/src/patches/Translator.cpp +++ b/src/patches/Translator.cpp @@ -14,25 +14,25 @@ using namespace std; -void Translator::translate( int n, int numPlanes, int imageSize, int translateRows, int translateCols, float *source, float *destination ) { +void Translator::translate(int n, int numPlanes, int imageSize, int translateRows, int translateCols, float *source, float *destination) { const int cubeSize = numPlanes * imageSize * imageSize; // float *sourceCube = source + n * cubeSize; float *destinationCube = destination + n * cubeSize; - memset( destinationCube, 0, sizeof(float) * cubeSize ); - const int rowCopyLength = imageSize - abs( translateCols ); + memset(destinationCube, 0, sizeof(float) * cubeSize); + const int rowCopyLength = imageSize - abs(translateCols); const int outColStart = translateCols > 0 ? translateCols : 0; const int inColStart = translateCols > 0 ? 
0 : - translateCols; - for( int plane = 0; plane < numPlanes; plane++ ) { - float *upstreamImage = source + ( n * numPlanes + plane ) * imageSize * imageSize; - float *outputImage = destination + ( n * numPlanes + plane ) * imageSize * imageSize; - for( int inRow = 0; inRow < imageSize; inRow++ ) { + for(int plane = 0; plane < numPlanes; plane++) { + float *upstreamImage = source + (n * numPlanes + plane) * imageSize * imageSize; + float *outputImage = destination + (n * numPlanes + plane) * imageSize * imageSize; + for(int inRow = 0; inRow < imageSize; inRow++) { const int outRow = inRow + translateRows; - if( outRow < 0 || outRow > imageSize - 1 ) { + if(outRow < 0 || outRow > imageSize - 1) { continue; } - memcpy( &(outputImage[ outRow * imageSize + outColStart ]), + memcpy(&(outputImage[ outRow * imageSize + outColStart ]), &(upstreamImage[ inRow * imageSize + inColStart ]), - rowCopyLength * sizeof(float) ); + rowCopyLength * sizeof(float)); } } } diff --git a/src/patches/Translator.h b/src/patches/Translator.h index 34aeb53b..dffa9134 100644 --- a/src/patches/Translator.h +++ b/src/patches/Translator.h @@ -8,6 +8,6 @@ class Translator { public: - static void translate( int n, int numPlanes, int imageSize, int translateRows, int translateCols, float *source, float *destination ); + static void translate(int n, int numPlanes, int imageSize, int translateRows, int translateCols, float *source, float *destination); };
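(Aside: behaviorally, Translator::translate is untouched by this rename. It zeroes the destination cube, skips any row that shifts off the image, and block-copies imageSize - abs(translateCols) contiguous floats per surviving row. A minimal single-plane sketch of that logic, with hypothetical demo values; illustration only, not part of the patch:)

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    // Single-plane version of the row-shift-with-clipping scheme used by
    // Translator::translate: rows that land outside the image are skipped,
    // and each surviving row copies (imageSize - abs(translateCols)) floats.
    void translatePlane(int imageSize, int translateRows, int translateCols,
                        const float *source, float *destination) {
        memset(destination, 0, sizeof(float) * imageSize * imageSize);
        const int rowCopyLength = imageSize - abs(translateCols);
        const int outColStart = translateCols > 0 ? translateCols : 0;
        const int inColStart = translateCols > 0 ? 0 : -translateCols;
        for (int inRow = 0; inRow < imageSize; inRow++) {
            const int outRow = inRow + translateRows;
            if (outRow < 0 || outRow > imageSize - 1) {
                continue; // this row shifts off the image; leave zeros
            }
            memcpy(&destination[outRow * imageSize + outColStart],
                   &source[inRow * imageSize + inColStart],
                   rowCopyLength * sizeof(float));
        }
    }

    int main() {
        const int imageSize = 4; // hypothetical demo size
        float in[16], out[16];
        for (int i = 0; i < 16; i++) in[i] = static_cast<float>(i);
        translatePlane(imageSize, 1, -1, in, out); // down 1 row, left 1 column
        for (int r = 0; r < imageSize; r++) {
            for (int c = 0; c < imageSize; c++) printf("%5.0f", out[r * imageSize + c]);
            printf("\n");
        }
        return 0;
    }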
diff --git a/src/pooling/PoolingBackward.cpp b/src/pooling/PoolingBackward.cpp index 6f6e7e33..871ee0a4 100644 --- a/src/pooling/PoolingBackward.cpp +++ b/src/pooling/PoolingBackward.cpp @@ -23,50 +23,50 @@ using namespace std; #undef STATIC #define STATIC -STATIC PoolingBackward *PoolingBackward::instance( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ) { - return new PoolingBackwardGpuNaive( cl, padZeros, numPlanes, inputImageSize, poolingSize ); +STATIC PoolingBackward *PoolingBackward::instance(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize) { + return new PoolingBackwardGpuNaive(cl, padZeros, numPlanes, inputSize, poolingSize); } -STATIC PoolingBackward *PoolingBackward::instanceForTest( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize) { - return new PoolingBackwardCpu( cl, padZeros, numPlanes, inputImageSize, poolingSize ); +STATIC PoolingBackward *PoolingBackward::instanceForTest(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize) { + return new PoolingBackwardCpu(cl, padZeros, numPlanes, inputSize, poolingSize); } -STATIC PoolingBackward *PoolingBackward::instanceSpecific( int idx, EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ) { - if( idx == 0 ) { - return new PoolingBackwardCpu( cl, padZeros, numPlanes, inputImageSize, poolingSize ); +STATIC PoolingBackward *PoolingBackward::instanceSpecific(int idx, EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize) { + if(idx == 0) { + return new PoolingBackwardCpu(cl, padZeros, numPlanes, inputSize, poolingSize); } - if( idx == 1 ) { - return new PoolingBackwardGpuNaive( cl, padZeros, numPlanes, inputImageSize, poolingSize ); + if(idx == 1) { + return new PoolingBackwardGpuNaive(cl, padZeros, numPlanes, inputSize, poolingSize); } - throw runtime_error("PoolingBackward::instanceSpecific, idx not known: " + toString( idx ) ); + throw runtime_error("PoolingBackward::instanceSpecific, idx not known: " + toString(idx) ); } -PoolingBackward::PoolingBackward( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ) : - cl( cl ), - padZeros( padZeros ), - numPlanes( numPlanes ), - inputImageSize( inputImageSize ), - poolingSize( poolingSize ), -// poolingSizeSquared( poolingSize * poolingSize ), - outputImageSize( padZeros ? ( inputImageSize + poolingSize - 1 ) / poolingSize : inputImageSize / poolingSize ) { -// if( inputImageSize % poolingSize != 0 ) { -// throw runtime_error("inputImageSize should be an exact multiple of poolingsize: " + toString( inputImageSize ) + " " + toString(poolingSize ) ); +PoolingBackward::PoolingBackward(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize) : + cl(cl), + padZeros(padZeros), + numPlanes(numPlanes), + inputSize(inputSize), + poolingSize(poolingSize), +// poolingSizeSquared(poolingSize * poolingSize), + outputSize(padZeros ? (inputSize + poolingSize - 1) / poolingSize : inputSize / poolingSize) { +// if(inputSize % poolingSize != 0) { +// throw runtime_error("inputSize should be an exact multiple of poolingsize: " + toString(inputSize) + " " + toString(poolingSize) ); // } } -VIRTUAL int PoolingBackward::getInputSize( int batchSize ) { - return batchSize * numPlanes * inputImageSize * inputImageSize; +VIRTUAL int PoolingBackward::getInputNumElements(int batchSize) { + return batchSize * numPlanes * inputSize * inputSize; } -VIRTUAL int PoolingBackward::getOutputSize(int batchSize) { - return batchSize * numPlanes * outputImageSize * outputImageSize; +VIRTUAL int PoolingBackward::getOutputNumElements(int batchSize) { + return batchSize * numPlanes * outputSize * outputSize; } -VIRTUAL void PoolingBackward::backward( int batchSize, float *gradOutput, int *selectors, float *gradInput ) { -// cout << "PoolingBackward::backward( float * )" << endl; - StatefulTimer::instance()->timeCheck("PoolingBackward::backward float->wrapper start" ); - CLWrapper *gradOutputWrapper = cl->wrap( getOutputSize(batchSize), gradOutput ); - CLWrapper *selectorsWrapper = cl->wrap( getOutputSize(batchSize), selectors ); - CLWrapper *gradInputWrapper = cl->wrap( getInputSize(batchSize), gradInput ); +VIRTUAL void PoolingBackward::backward(int batchSize, float *gradOutput, int *selectors, float *gradInput) { +// cout << "PoolingBackward::backward(float *)" << endl; + StatefulTimer::instance()->timeCheck("PoolingBackward::backward float->wrapper start"); + CLWrapper *gradOutputWrapper = cl->wrap(getOutputNumElements(batchSize), gradOutput); + CLWrapper *selectorsWrapper = cl->wrap(getOutputNumElements(batchSize), selectors); + CLWrapper *gradInputWrapper = cl->wrap(getInputNumElements(batchSize), gradInput); gradOutputWrapper->copyToDevice(); selectorsWrapper->copyToDevice(); - backward( batchSize, gradOutputWrapper, selectorsWrapper, gradInputWrapper ); + backward(batchSize, gradOutputWrapper, selectorsWrapper, gradInputWrapper); selectorsWrapper->copyToHost(); gradInputWrapper->copyToHost(); @@ -74,9 +74,9 @@ VIRTUAL void PoolingBackward::backward( int batchSize, float *gradOutput, int *s delete gradOutputWrapper; delete selectorsWrapper; delete gradInputWrapper; - StatefulTimer::instance()->timeCheck("PoolingBackward::backward float->wrapper end" ); + StatefulTimer::instance()->timeCheck("PoolingBackward::backward float->wrapper end"); } -VIRTUAL void PoolingBackward::backward( int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *selectorsWrapper, CLWrapper *gradInputWrapper ) { - throw runtime_error("PoolingBackward::backward wrappers not implemented" ); +VIRTUAL void
PoolingBackward::backward(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *selectorsWrapper, CLWrapper *gradInputWrapper) { + throw runtime_error("PoolingBackward::backward wrappers not implemented"); } diff --git a/src/pooling/PoolingBackward.h b/src/pooling/PoolingBackward.h index c68c32e8..5c3d845f 100644 --- a/src/pooling/PoolingBackward.h +++ b/src/pooling/PoolingBackward.h @@ -20,24 +20,24 @@ class DeepCL_EXPORT PoolingBackward { const bool padZeros; const int numPlanes; - const int inputImageSize; + const int inputSize; const int poolingSize; - const int outputImageSize; + const int outputSize; // const int poolingSizeSquared; virtual ~PoolingBackward() {} - inline int getInputIndex( int n, int plane, int row, int col ) { - return ( ( n - * numPlanes + plane ) - * inputImageSize + row ) - * inputImageSize + col; + inline int getInputIndex(int n, int plane, int row, int col) { + return (( n + * numPlanes + plane) + * inputSize + row) + * inputSize + col; } - inline int getResultIndex( int n, int plane, int row, int col ) { - return ( ( n - * numPlanes + plane ) - * outputImageSize + row ) - * outputImageSize + col; + inline int getResultIndex(int n, int plane, int row, int col) { + return (( n + * numPlanes + plane) + * outputSize + row) + * outputSize + col; } // [[[cog @@ -45,14 +45,14 @@ class DeepCL_EXPORT PoolingBackward { // cog_addheaders.add() // ]]] // generated, using cog: - STATIC PoolingBackward *instance( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ); - STATIC PoolingBackward *instanceForTest( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize); - STATIC PoolingBackward *instanceSpecific( int idx, EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ); - PoolingBackward( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ); - VIRTUAL int getInputSize( int batchSize ); - VIRTUAL int getOutputSize(int batchSize); - VIRTUAL void backward( int batchSize, float *gradOutput, int *selectors, float *gradInput ); - VIRTUAL void backward( int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *selectorsWrapper, CLWrapper *gradInputWrapper ); + STATIC PoolingBackward *instance(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize); + STATIC PoolingBackward *instanceForTest(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize); + STATIC PoolingBackward *instanceSpecific(int idx, EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize); + PoolingBackward(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize); + VIRTUAL int getInputNumElements(int batchSize); + VIRTUAL int getOutputNumElements(int batchSize); + VIRTUAL void backward(int batchSize, float *gradOutput, int *selectors, float *gradInput); + VIRTUAL void backward(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *selectorsWrapper, CLWrapper *gradInputWrapper); // [[[end]]] }; diff --git a/src/pooling/PoolingBackwardCpu.cpp b/src/pooling/PoolingBackwardCpu.cpp index cc25058e..6608f52e 100644 --- a/src/pooling/PoolingBackwardCpu.cpp +++ b/src/pooling/PoolingBackwardCpu.cpp @@ -21,47 +21,47 @@ using namespace std; #undef STATIC #define STATIC -PoolingBackwardCpu::PoolingBackwardCpu( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ) : - PoolingBackward( cl, padZeros, numPlanes, inputImageSize, poolingSize ) { +PoolingBackwardCpu::PoolingBackwardCpu(EasyCL *cl, bool padZeros, int numPlanes, int 
inputSize, int poolingSize) : + PoolingBackward(cl, padZeros, numPlanes, inputSize, poolingSize) { } -VIRTUAL void PoolingBackwardCpu::backward( int batchSize, float *gradOutput, int *selectors, float *gradInput ) { - memset( gradInput, 0, sizeof( float ) * getInputSize( batchSize ) ); - for( int n = 0; n < batchSize; n++ ) { - for( int plane = 0; plane < numPlanes; plane++ ) { - for( int outputRow = 0; outputRow < outputImageSize; outputRow++ ) { +VIRTUAL void PoolingBackwardCpu::backward(int batchSize, float *gradOutput, int *selectors, float *gradInput) { + memset(gradInput, 0, sizeof(float) * getInputNumElements(batchSize) ); + for(int n = 0; n < batchSize; n++) { + for(int plane = 0; plane < numPlanes; plane++) { + for(int outputRow = 0; outputRow < outputSize; outputRow++) { int inputRow = outputRow * poolingSize; - for( int outputCol = 0; outputCol < outputImageSize; outputCol++ ) { + for(int outputCol = 0; outputCol < outputSize; outputCol++) { int inputCol = outputCol * poolingSize; - int outputIndex = getResultIndex( n, plane, outputRow, outputCol ); + int outputIndex = getResultIndex(n, plane, outputRow, outputCol); int selector = selectors[outputIndex]; int drow = selector / poolingSize; int dcol = selector % poolingSize; - int inputIndex = getInputIndex( n, plane, inputRow + drow, inputCol + dcol ); + int inputIndex = getInputIndex(n, plane, inputRow + drow, inputCol + dcol); gradInput[ inputIndex ] = gradOutput[outputIndex]; } } } } } -VIRTUAL void PoolingBackwardCpu::backward( int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *selectorsWrapper, - CLWrapper *gradInputWrapper ) { - StatefulTimer::instance()->timeCheck("PoolingBackwardCpu::backward start" ); +VIRTUAL void PoolingBackwardCpu::backward(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *selectorsWrapper, + CLWrapper *gradInputWrapper) { + StatefulTimer::instance()->timeCheck("PoolingBackwardCpu::backward start"); gradOutputWrapper->copyToHost(); selectorsWrapper->copyToHost(); - float *gradOutput = reinterpret_cast< float * >( gradOutputWrapper->getHostArray() ); - int *selectors = reinterpret_cast< int * >( selectorsWrapper->getHostArray() ); - float *gradInput = new float[ getInputSize( batchSize ) ]; + float *gradOutput = reinterpret_cast< float * >(gradOutputWrapper->getHostArray()); + int *selectors = reinterpret_cast< int * >(selectorsWrapper->getHostArray()); + float *gradInput = new float[ getInputNumElements(batchSize) ]; - backward( batchSize, gradOutput, selectors, gradInput ); + backward(batchSize, gradOutput, selectors, gradInput); - float *gradInputHostArray = reinterpret_cast< float * >( gradInputWrapper->getHostArray() ); - memcpy( gradInputHostArray, gradInput, sizeof(float) * getInputSize( batchSize ) ); + float *gradInputHostArray = reinterpret_cast< float * >(gradInputWrapper->getHostArray()); + memcpy(gradInputHostArray, gradInput, sizeof(float) * getInputNumElements(batchSize) ); gradInputWrapper->copyToDevice(); delete[] gradInput; - StatefulTimer::instance()->timeCheck("PoolingBackwardCpu::backward end" ); + StatefulTimer::instance()->timeCheck("PoolingBackwardCpu::backward end"); }
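(Aside: the CPU backward pass above is a pure scatter. The forward pass records each pooled output's winner as selector = drow * poolingSize + dcol, so backward routes each gradOutput element to that single input cell and leaves every other cell at zero. A single-plane sketch with hypothetical values; illustration only, not part of the patch:)

    #include <cstdio>
    #include <cstring>

    // Scatter gradients back through max pooling for one plane, mirroring
    // PoolingBackwardCpu::backward: exactly one input cell per output
    // receives the gradient; every other cell stays zero.
    void poolBackwardPlane(int inputSize, int outputSize, int poolingSize,
                           const float *gradOutput, const int *selectors,
                           float *gradInput) {
        memset(gradInput, 0, sizeof(float) * inputSize * inputSize);
        for (int outRow = 0; outRow < outputSize; outRow++) {
            for (int outCol = 0; outCol < outputSize; outCol++) {
                const int outIdx = outRow * outputSize + outCol;
                const int drow = selectors[outIdx] / poolingSize;
                const int dcol = selectors[outIdx] % poolingSize;
                const int inRow = outRow * poolingSize + drow;
                const int inCol = outCol * poolingSize + dcol;
                gradInput[inRow * inputSize + inCol] = gradOutput[outIdx];
            }
        }
    }

    int main() {
        // hypothetical 4x4 input, 2x2 pooling -> 2x2 output
        const float gradOutput[4] = {0.1f, 0.2f, 0.3f, 0.4f};
        const int selectors[4] = {3, 0, 1, 2}; // winner offsets from forward
        float gradInput[16];
        poolBackwardPlane(4, 2, 2, gradOutput, selectors, gradInput);
        for (int r = 0; r < 4; r++) {
            for (int c = 0; c < 4; c++) printf(" %.1f", gradInput[r * 4 + c]);
            printf("\n");
        }
        return 0;
    }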
diff --git a/src/pooling/PoolingBackwardCpu.h b/src/pooling/PoolingBackwardCpu.h index 6b678e7a..58afe6d7 100644 --- a/src/pooling/PoolingBackwardCpu.h +++ b/src/pooling/PoolingBackwardCpu.h @@ -19,10 +19,10 @@ class PoolingBackwardCpu : public PoolingBackward { // cog_addheaders.add() // ]]] // generated, using cog: - PoolingBackwardCpu( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ); - VIRTUAL void backward( int batchSize, float *gradOutput, int *selectors, float *gradInput ); - VIRTUAL void backward( int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *selectorsWrapper, - CLWrapper *gradInputWrapper ); + PoolingBackwardCpu(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize); + VIRTUAL void backward(int batchSize, float *gradOutput, int *selectors, float *gradInput); + VIRTUAL void backward(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *selectorsWrapper, + CLWrapper *gradInputWrapper); // [[[end]]] }; diff --git a/src/pooling/PoolingBackwardGpuNaive.cpp b/src/pooling/PoolingBackwardGpuNaive.cpp index 7e33dc45..7a404be9 100644 --- a/src/pooling/PoolingBackwardGpuNaive.cpp +++ b/src/pooling/PoolingBackwardGpuNaive.cpp @@ -26,44 +26,44 @@ VIRTUAL PoolingBackwardGpuNaive::~PoolingBackwardGpuNaive() { delete kernel; delete kMemset; } -VIRTUAL void PoolingBackwardGpuNaive::backward( int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *selectorsWrapper, - CLWrapper *gradInputWrapper ) { +VIRTUAL void PoolingBackwardGpuNaive::backward(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *selectorsWrapper, + CLWrapper *gradInputWrapper) { - StatefulTimer::instance()->timeCheck("PoolingBackwardGpuNaive::backward start" ); + StatefulTimer::instance()->timeCheck("PoolingBackwardGpuNaive::backward start"); // first, memset errors to 0 ... - kMemset->out( gradInputWrapper )->in( 0.0f )->in( batchSize * numPlanes * inputImageSize * inputImageSize ); - int globalSize = batchSize * numPlanes * inputImageSize * inputImageSize; + kMemset->out(gradInputWrapper)->in(0.0f)->in(batchSize * numPlanes * inputSize * inputSize); + int globalSize = batchSize * numPlanes * inputSize * inputSize; int workgroupSize = 64; - int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kMemset->run_1d( numWorkgroups * workgroupSize, workgroupSize ); + int numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; + kMemset->run_1d(numWorkgroups * workgroupSize, workgroupSize); cl->finish(); - kernel->in( batchSize )->inout( gradOutputWrapper )->in( selectorsWrapper )->in( gradInputWrapper ); - globalSize = batchSize * numPlanes * outputImageSize * outputImageSize; + kernel->in(batchSize)->inout(gradOutputWrapper)->in(selectorsWrapper)->in(gradInputWrapper); + globalSize = batchSize * numPlanes * outputSize * outputSize; workgroupSize = 64; - numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; - kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); + numWorkgroups = (globalSize + workgroupSize - 1) / workgroupSize; + kernel->run_1d(numWorkgroups * workgroupSize, workgroupSize); cl->finish(); - StatefulTimer::instance()->timeCheck("PoolingBackwardGpuNaive::backward end" ); + StatefulTimer::instance()->timeCheck("PoolingBackwardGpuNaive::backward end"); } -PoolingBackwardGpuNaive::PoolingBackwardGpuNaive( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ) : - PoolingBackward( cl, padZeros, numPlanes, inputImageSize, poolingSize ) { +PoolingBackwardGpuNaive::PoolingBackwardGpuNaive(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize) : + PoolingBackward(cl, padZeros, numPlanes, inputSize, poolingSize) { // std::string options = "-D " + fn->getDefineName(); string options = ""; - options += " -D gNumPlanes=" + toString( numPlanes ); - options += " -D gInputImageSize=" + toString( inputImageSize ); - options += " -D gInputImageSizeSquared=" + toString( inputImageSize * inputImageSize ); - options += " -D
gOutputImageSize=" + toString( outputImageSize ); - options += " -D gOutputImageSizeSquared=" + toString( outputImageSize * outputImageSize ); - options += " -D gPoolingSize=" + toString( poolingSize ); - options += " -D gPadZeros=" + toString( padZeros ? 1 : 0 ); + options += " -D gNumPlanes=" + toString(numPlanes); + options += " -D gInputSize=" + toString(inputSize); + options += " -D gInputSizeSquared=" + toString(inputSize * inputSize); + options += " -D gOutputSize=" + toString(outputSize); + options += " -D gOutputSizeSquared=" + toString(outputSize * outputSize); + options += " -D gPoolingSize=" + toString(poolingSize); + options += " -D gPadZeros=" + toString(padZeros ? 1 : 0); // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/PoolingBackwardGpuNaive.cl", "backward", 'options' ) - // stringify.write_kernel2( "kMemset", "cl/memset.cl", "memset", '""' ) + // stringify.write_kernel2("kernel", "cl/PoolingBackwardGpuNaive.cl", "backward", 'options') + // stringify.write_kernel2("kMemset", "cl/memset.cl", "memset", '""') // ]]] // generated using cog, from cl/PoolingBackwardGpuNaive.cl: const char * kernelSource = @@ -80,37 +80,37 @@ PoolingBackwardGpuNaive::PoolingBackwardGpuNaive( EasyCL *cl, bool padZeros, int "// wont use workgroups (since 'naive')\n" "// one thread per: [n][plane][outrow][outcol]\n" "// globalId: [n][plane][outrow][outcol]\n" - "kernel void backward( const int batchSize,\n" - " global const float *gradOutput, global const int *selectors, global float *gradInput ) {\n" + "kernel void backward(const int batchSize,\n" + " global const float *gradOutput, global const int *selectors, global float *gradInput) {\n" "\n" " #define globalId get_global_id(0)\n" - " #define nPlaneCombo ( globalId / gOutputImageSizeSquared )\n" - " #define outputPosCombo ( globalId % gOutputImageSizeSquared )\n" + " #define nPlaneCombo (globalId / gOutputSizeSquared)\n" + " #define outputPosCombo (globalId % gOutputSizeSquared)\n" "\n" " const int n = nPlaneCombo / gNumPlanes;\n" " const int plane = nPlaneCombo % gNumPlanes;\n" - " const int outputRow = outputPosCombo / gOutputImageSize;\n" - " const int outputCol = outputPosCombo % gOutputImageSize;\n" + " const int outputRow = outputPosCombo / gOutputSize;\n" + " const int outputCol = outputPosCombo % gOutputSize;\n" "\n" - " if( n >= batchSize ) {\n" + " if (n >= batchSize) {\n" " return;\n" " }\n" "\n" - " int resultIndex = ( ( n\n" - " * gNumPlanes + plane )\n" - " * gOutputImageSize + outputRow )\n" - " * gOutputImageSize + outputCol;\n" - " #define error ( gradOutput[resultIndex] )\n" - " int selector = ( selectors[resultIndex] );\n" - " #define drow ( selector / gPoolingSize )\n" - " #define dcol ( selector % gPoolingSize )\n" - " #define inputRow ( outputRow * gPoolingSize + drow )\n" - " #define inputCol ( outputCol * gPoolingSize + dcol )\n" - " int inputIndex = ( ( n\n" - " * gNumPlanes + plane )\n" - " * gInputImageSize + inputRow )\n" - " * gInputImageSize + inputCol;\n" - "// if( n < batchSize ) {\n" + " int resultIndex = (( n\n" + " * gNumPlanes + plane)\n" + " * gOutputSize + outputRow)\n" + " * gOutputSize + outputCol;\n" + " #define error (gradOutput[resultIndex])\n" + " int selector = (selectors[resultIndex]);\n" + " #define drow (selector / gPoolingSize)\n" + " #define dcol (selector % gPoolingSize)\n" + " #define inputRow (outputRow * gPoolingSize + drow)\n" + " #define inputCol (outputCol * gPoolingSize + dcol)\n" + " int inputIndex = (( n\n" + " * gNumPlanes + plane)\n" + " * gInputSize + 
inputRow)\n" + " * gInputSize + inputCol;\n" + "// if (n < batchSize) {\n" " gradInput[ inputIndex ] = error;\n" "// }\n" "}\n" @@ -125,9 +125,9 @@ PoolingBackwardGpuNaive::PoolingBackwardGpuNaive( EasyCL *cl, bool padZeros, int "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "kernel void memset( global float *target, const float value, const int N ) {\n" + "kernel void memset(global float *target, const float value, const int N) {\n" " #define globalId get_global_id(0)\n" - " if( globalId < N ) {\n" + " if (globalId < N) {\n" " target[globalId] = value;\n" " }\n" "}\n" diff --git a/src/pooling/PoolingBackwardGpuNaive.h b/src/pooling/PoolingBackwardGpuNaive.h index 98efdf20..aa1bb9c5 100644 --- a/src/pooling/PoolingBackwardGpuNaive.h +++ b/src/pooling/PoolingBackwardGpuNaive.h @@ -22,9 +22,9 @@ class PoolingBackwardGpuNaive : public PoolingBackward { // ]]] // generated, using cog: VIRTUAL ~PoolingBackwardGpuNaive(); - VIRTUAL void backward( int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *selectorsWrapper, - CLWrapper *gradInputWrapper ); - PoolingBackwardGpuNaive( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ); + VIRTUAL void backward(int batchSize, CLWrapper *gradOutputWrapper, CLWrapper *selectorsWrapper, + CLWrapper *gradInputWrapper); + PoolingBackwardGpuNaive(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize); // [[[end]]] }; diff --git a/src/pooling/PoolingForward.cpp b/src/pooling/PoolingForward.cpp index 3e013f5b..1a2d502f 100644 --- a/src/pooling/PoolingForward.cpp +++ b/src/pooling/PoolingForward.cpp @@ -20,45 +20,45 @@ using namespace std; #undef STATIC #define STATIC -PoolingForward::PoolingForward( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ) : - cl( cl ), - padZeros( padZeros ), - numPlanes( numPlanes ), - inputImageSize( inputImageSize ), - poolingSize( poolingSize ), - outputImageSize( padZeros ? ( inputImageSize + poolingSize - 1 ) / poolingSize : inputImageSize / poolingSize ) { -// if( inputImageSize % poolingSize != 0 ) { -// throw runtime_error("inputImageSize should be an exact multiple of poolingsize: " + toString( inputImageSize ) + " " + toString(poolingSize ) ); +PoolingForward::PoolingForward(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize) : + cl(cl), + padZeros(padZeros), + numPlanes(numPlanes), + inputSize(inputSize), + poolingSize(poolingSize), + outputSize(padZeros ? 
(inputSize + poolingSize - 1) / poolingSize : inputSize / poolingSize) { +// if(inputSize % poolingSize != 0) { +// throw runtime_error("inputSize should be an exact multiple of poolingsize: " + toString(inputSize) + " " + toString(poolingSize) ); // } } -STATIC PoolingForward *PoolingForward::instance( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ) { - return new PoolingForwardGpuNaive( cl, padZeros, numPlanes, inputImageSize, poolingSize ); -// return new PoolingForwardCpu( cl, padZeros, numPlanes, inputImageSize, poolingSize ); +STATIC PoolingForward *PoolingForward::instance(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize) { + return new PoolingForwardGpuNaive(cl, padZeros, numPlanes, inputSize, poolingSize); +// return new PoolingForwardCpu(cl, padZeros, numPlanes, inputSize, poolingSize); } -STATIC PoolingForward *PoolingForward::instanceForTest( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ) { - return new PoolingForwardGpuNaive( cl, padZeros, numPlanes, inputImageSize, poolingSize ); +STATIC PoolingForward *PoolingForward::instanceForTest(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize) { + return new PoolingForwardGpuNaive(cl, padZeros, numPlanes, inputSize, poolingSize); } -STATIC PoolingForward *PoolingForward::instanceSpecific( int idx, EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ) { - if( idx == 0 ) { - return new PoolingForwardCpu( cl, padZeros, numPlanes, inputImageSize, poolingSize ); +STATIC PoolingForward *PoolingForward::instanceSpecific(int idx, EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize) { + if(idx == 0) { + return new PoolingForwardCpu(cl, padZeros, numPlanes, inputSize, poolingSize); } - if( idx == 1 ) { - return new PoolingForwardGpuNaive( cl, padZeros, numPlanes, inputImageSize, poolingSize ); + if(idx == 1) { + return new PoolingForwardGpuNaive(cl, padZeros, numPlanes, inputSize, poolingSize); } cout << "idx " << idx << " not known" << endl; - throw runtime_error("PoolingForward::instanceSpecific idx not known: " + toString( idx ) ); + throw runtime_error("PoolingForward::instanceSpecific idx not known: " + toString(idx) ); } -VIRTUAL void PoolingForward::forward( int batchSize, CLWrapper *inputData, CLWrapper *selectors, CLWrapper *outputData ) { +VIRTUAL void PoolingForward::forward(int batchSize, CLWrapper *inputData, CLWrapper *selectors, CLWrapper *outputData) { throw runtime_error("forward not implemented for this child type"); } -VIRTUAL void PoolingForward::forward( int batchSize, float *input, int *selectors, float *output ) { -// cout << "PoolingForward::forward( float * )" << endl; - CLWrapper *inputWrapper = cl->wrap( getInputSize( batchSize ), input ); - CLWrapper *selectorsWrapper = cl->wrap( getOutputSize( batchSize ), selectors ); - CLWrapper *outputWrapper = cl->wrap( getOutputSize( batchSize ), output ); +VIRTUAL void PoolingForward::forward(int batchSize, float *input, int *selectors, float *output) { +// cout << "PoolingForward::forward(float *)" << endl; + CLWrapper *inputWrapper = cl->wrap(getInputNumElements(batchSize), input); + CLWrapper *selectorsWrapper = cl->wrap(getOutputNumElements(batchSize), selectors); + CLWrapper *outputWrapper = cl->wrap(getOutputNumElements(batchSize), output); inputWrapper->copyToDevice(); - forward( batchSize, inputWrapper, selectorsWrapper, outputWrapper ); + forward(batchSize, inputWrapper, selectorsWrapper, outputWrapper); 
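// Aside (illustration only): this host-pointer overload is a convenience
// shim around the wrapper overload: wrap the raw arrays, copy the input to
// the device, dispatch, then (below) copy selectors and output back to the
// host and free the wrappers. For a sense of scale, with hypothetical
// numPlanes=8, inputSize=5, poolingSize=2, batchSize=16:
// getInputNumElements(16) = 16 * 8 * 5 * 5 = 3200 elements, and outputSize
// is 5 / 2 = 2 without padZeros versus (5 + 2 - 1) / 2 = 3 with padZeros,
// the ceiling division used in the constructor above.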
selectorsWrapper->copyToHost(); outputWrapper->copyToHost(); @@ -66,11 +66,11 @@ VIRTUAL void PoolingForward::forward( int batchSize, float *input, int *selector delete selectorsWrapper; delete inputWrapper; } -VIRTUAL int PoolingForward::getInputSize( int batchSize ) { - return batchSize * numPlanes * inputImageSize * inputImageSize; +VIRTUAL int PoolingForward::getInputNumElements(int batchSize) { + return batchSize * numPlanes * inputSize * inputSize; } -VIRTUAL int PoolingForward::getOutputSize(int batchSize) { - return batchSize * numPlanes * outputImageSize * outputImageSize; +VIRTUAL int PoolingForward::getOutputNumElements(int batchSize) { + return batchSize * numPlanes * outputSize * outputSize; } diff --git a/src/pooling/PoolingForward.h b/src/pooling/PoolingForward.h index e23bd28a..86ed1f47 100644 --- a/src/pooling/PoolingForward.h +++ b/src/pooling/PoolingForward.h @@ -20,23 +20,23 @@ class DeepCL_EXPORT PoolingForward { const bool padZeros; const int numPlanes; - const int inputImageSize; + const int inputSize; const int poolingSize; - const int outputImageSize; + const int outputSize; virtual ~PoolingForward() {} - inline int getInputIndex( int n, int plane, int row, int col ) { - return ( ( n - * numPlanes + plane ) - * inputImageSize + row ) - * inputImageSize + col; + inline int getInputIndex(int n, int plane, int row, int col) { + return (( n + * numPlanes + plane) + * inputSize + row) + * inputSize + col; } - inline int getResultIndex( int n, int plane, int row, int col ) { - return ( ( n - * numPlanes + plane ) - * outputImageSize + row ) - * outputImageSize + col; + inline int getResultIndex(int n, int plane, int row, int col) { + return (( n + * numPlanes + plane) + * outputSize + row) + * outputSize + col; } // [[[cog @@ -44,14 +44,14 @@ class DeepCL_EXPORT PoolingForward { // cog_addheaders.add() // ]]] // generated, using cog: - PoolingForward( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ); - STATIC PoolingForward *instance( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ); - STATIC PoolingForward *instanceForTest( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ); - STATIC PoolingForward *instanceSpecific( int idx, EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ); - VIRTUAL void forward( int batchSize, CLWrapper *inputData, CLWrapper *selectors, CLWrapper *outputData ); - VIRTUAL void forward( int batchSize, float *input, int *selectors, float *output ); - VIRTUAL int getInputSize( int batchSize ); - VIRTUAL int getOutputSize(int batchSize); + PoolingForward(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize); + STATIC PoolingForward *instance(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize); + STATIC PoolingForward *instanceForTest(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize); + STATIC PoolingForward *instanceSpecific(int idx, EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize); + VIRTUAL void forward(int batchSize, CLWrapper *inputData, CLWrapper *selectors, CLWrapper *outputData); + VIRTUAL void forward(int batchSize, float *input, int *selectors, float *output); + VIRTUAL int getInputNumElements(int batchSize); + VIRTUAL int getOutputNumElements(int batchSize); // [[[end]]] }; diff --git a/src/pooling/PoolingForwardCpu.cpp b/src/pooling/PoolingForwardCpu.cpp index e76a6f63..c9e2b3ed 100644 --- a/src/pooling/PoolingForwardCpu.cpp +++ 
b/src/pooling/PoolingForwardCpu.cpp @@ -20,25 +20,25 @@ using namespace std; #undef STATIC #define STATIC -PoolingForwardCpu::PoolingForwardCpu( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ) : - PoolingForward( cl, padZeros, numPlanes, inputImageSize, poolingSize ) { +PoolingForwardCpu::PoolingForwardCpu(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize) : + PoolingForward(cl, padZeros, numPlanes, inputSize, poolingSize) { } -VIRTUAL void PoolingForwardCpu::forward( int batchSize, CLWrapper *inputWrapper, CLWrapper *selectorsWrapper, CLWrapper *outputWrapper ) { -// cout << "PoolingForwardCpu::forward( CLWrapper * )" << endl; +VIRTUAL void PoolingForwardCpu::forward(int batchSize, CLWrapper *inputWrapper, CLWrapper *selectorsWrapper, CLWrapper *outputWrapper) { +// cout << "PoolingForwardCpu::forward(CLWrapper *)" << endl; inputWrapper->copyToHost(); - float *input = reinterpret_cast< float * >( inputWrapper->getHostArray() ); - int *selectors = new int[ getOutputSize( batchSize ) ]; - float *output = new float[ getOutputSize( batchSize ) ]; + float *input = reinterpret_cast< float * >(inputWrapper->getHostArray()); + int *selectors = new int[ getOutputNumElements(batchSize) ]; + float *output = new float[ getOutputNumElements(batchSize) ]; - forward( batchSize, input, selectors, output ); + forward(batchSize, input, selectors, output); - int *selectorsHostArray = reinterpret_cast< int * >( selectorsWrapper->getHostArray() ); - memcpy( selectorsHostArray, selectors, sizeof(int) * getOutputSize( batchSize ) ); + int *selectorsHostArray = reinterpret_cast< int * >(selectorsWrapper->getHostArray()); + memcpy(selectorsHostArray, selectors, sizeof(int) * getOutputNumElements(batchSize) ); - float *outputHostArray = reinterpret_cast< float * >( outputWrapper->getHostArray() ); - memcpy( outputHostArray, output, sizeof(float) * getOutputSize( batchSize ) ); + float *outputHostArray = reinterpret_cast< float * >(outputWrapper->getHostArray()); + memcpy(outputHostArray, output, sizeof(float) * getOutputNumElements(batchSize) ); selectorsWrapper->copyToDevice(); outputWrapper->copyToDevice(); @@ -46,37 +46,37 @@ VIRTUAL void PoolingForwardCpu::forward( int batchSize, CLWrapper *inputWrapper, delete[] selectors; delete[] output; } -VIRTUAL void PoolingForwardCpu::forward( int batchSize, float *input, int *selectors, float *output ) { -// float *output = new float[ getOutputSize( batchSize ) ]; -// cout << "PoolingForwardCpu::forward( float * )" << endl; - StatefulTimer::instance()->timeCheck("PoolingForwardCpu::forward start" ); - for( int n = 0; n < batchSize; n++ ) { - for( int plane = 0; plane < numPlanes; plane++ ) { - for( int outputRow = 0; outputRow < outputImageSize; outputRow++ ) { +VIRTUAL void PoolingForwardCpu::forward(int batchSize, float *input, int *selectors, float *output) { +// float *output = new float[ getOutputNumElements(batchSize) ]; +// cout << "PoolingForwardCpu::forward(float *)" << endl; + StatefulTimer::instance()->timeCheck("PoolingForwardCpu::forward start"); + for(int n = 0; n < batchSize; n++) { + for(int plane = 0; plane < numPlanes; plane++) { + for(int outputRow = 0; outputRow < outputSize; outputRow++) { int inputRow = outputRow * poolingSize; - for( int outputCol = 0; outputCol < outputImageSize; outputCol++ ) { + for(int outputCol = 0; outputCol < outputSize; outputCol++) { int inputCol = outputCol * poolingSize; int selector = 0; - float maxValue = input[ getInputIndex( n, plane, inputRow, inputCol ) ]; - for( int dx = 0; dx < poolingSize; dx++ ) { - for( int dy = 0;
dy < poolingSize; dy++ ) { - if( inputRow + dx < inputImageSize && inputCol + dy < inputImageSize ) { - float thisValue = input[ getInputIndex( n, plane, inputRow + dx, inputCol + dy ) ]; - if( thisValue > maxValue ) { + float maxValue = input[ getInputIndex(n, plane, inputRow, inputCol) ]; + for(int dx = 0; dx < poolingSize; dx++) { + for(int dy = 0; dy < poolingSize; dy++) { + if(inputRow + dx < inputSize && inputCol + dy < inputSize) { + float thisValue = input[ getInputIndex(n, plane, inputRow + dx, inputCol + dy) ]; + if(thisValue > maxValue) { maxValue = thisValue; selector = dx * poolingSize + dy; } } } } - int resultIndex = getResultIndex( n, plane, outputRow, outputCol ); + int resultIndex = getResultIndex(n, plane, outputRow, outputCol); output[ resultIndex ] = maxValue; selectors[ resultIndex ] = selector; } } } } - StatefulTimer::instance()->timeCheck("PoolingForwardCpu::forward end" ); + StatefulTimer::instance()->timeCheck("PoolingForwardCpu::forward end"); // return output; } diff --git a/src/pooling/PoolingForwardCpu.h b/src/pooling/PoolingForwardCpu.h index 94a107ea..ee158e2b 100644 --- a/src/pooling/PoolingForwardCpu.h +++ b/src/pooling/PoolingForwardCpu.h @@ -19,9 +19,9 @@ class PoolingForwardCpu : public PoolingForward { // cog_addheaders.add() // ]]] // generated, using cog: - PoolingForwardCpu( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ); - VIRTUAL void forward( int batchSize, CLWrapper *inputWrapper, CLWrapper *selectorsWrapper, CLWrapper *outputWrapper ); - VIRTUAL void forward( int batchSize, float *input, int *selectors, float *output ); + PoolingForwardCpu(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize); + VIRTUAL void forward(int batchSize, CLWrapper *inputWrapper, CLWrapper *selectorsWrapper, CLWrapper *outputWrapper); + VIRTUAL void forward(int batchSize, float *input, int *selectors, float *output); // [[[end]]] }; diff --git a/src/pooling/PoolingForwardGpuNaive.cpp b/src/pooling/PoolingForwardGpuNaive.cpp index a0368ca1..8eabf668 100644 --- a/src/pooling/PoolingForwardGpuNaive.cpp +++ b/src/pooling/PoolingForwardGpuNaive.cpp @@ -26,36 +26,36 @@ using namespace std; VIRTUAL PoolingForwardGpuNaive::~PoolingForwardGpuNaive() { delete kernel; } -VIRTUAL void PoolingForwardGpuNaive::forward( int batchSize, CLWrapper *inputWrapper, CLWrapper *selectorsWrapper, CLWrapper *outputWrapper ) { -// cout << StatefulTimer::instance()->prefix << "PoolingForwardGpuNaive::forward( CLWrapper * )" << endl; - StatefulTimer::instance()->timeCheck("PoolingForwardGpuNaive::forward start" ); +VIRTUAL void PoolingForwardGpuNaive::forward(int batchSize, CLWrapper *inputWrapper, CLWrapper *selectorsWrapper, CLWrapper *outputWrapper) { +// cout << StatefulTimer::instance()->prefix << "PoolingForwardGpuNaive::forward(CLWrapper *)" << endl; + StatefulTimer::instance()->timeCheck("PoolingForwardGpuNaive::forward start"); - kernel->input( batchSize )->input( inputWrapper )->output( selectorsWrapper )->output( outputWrapper ); - int globalSize = batchSize * numPlanes * outputImageSize * outputImageSize; + kernel->input(batchSize)->input(inputWrapper)->output(selectorsWrapper)->output(outputWrapper); + int globalSize = batchSize * numPlanes * outputSize * outputSize; int workgroupsize = cl->getMaxWorkgroupSize(); - globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize; + globalSize = (( globalSize + workgroupsize - 1) / workgroupsize) * workgroupsize; // cout << "PoolingForwardGpuNaive::forward 
batchsize=" << batchSize << " g=" << globalSize << " w=" << workgroupsize << endl; kernel->run_1d(globalSize, workgroupsize); cl->finish(); // cout << "PoolingForwardGpuNaive::forward selectorswrapper:" << endl; -// PrintBuffer::printInts( cl, selectorsWrapper, outputImageSize, outputImageSize ); +// PrintBuffer::printInts(cl, selectorsWrapper, outputSize, outputSize); - StatefulTimer::instance()->timeCheck("PoolingForwardGpuNaive::forward end" ); + StatefulTimer::instance()->timeCheck("PoolingForwardGpuNaive::forward end"); } -PoolingForwardGpuNaive::PoolingForwardGpuNaive( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ) : - PoolingForward( cl, padZeros, numPlanes, inputImageSize, poolingSize ) { +PoolingForwardGpuNaive::PoolingForwardGpuNaive(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize) : + PoolingForward(cl, padZeros, numPlanes, inputSize, poolingSize) { string options = ""; - options += " -DgOutputImageSize=" + toString( outputImageSize ); - options += " -DgOutputImageSizeSquared=" + toString( outputImageSize * outputImageSize ); - options += " -DgInputImageSize=" + toString( inputImageSize ); - options += " -DgInputImageSizeSquared=" + toString( inputImageSize * inputImageSize ); - options += " -DgPoolingSize=" + toString( poolingSize ); - options += " -DgNumPlanes=" + toString( numPlanes ); + options += " -DgOutputSize=" + toString(outputSize); + options += " -DgOutputSizeSquared=" + toString(outputSize * outputSize); + options += " -DgInputSize=" + toString(inputSize); + options += " -DgInputSizeSquared=" + toString(inputSize * inputSize); + options += " -DgPoolingSize=" + toString(poolingSize); + options += " -DgNumPlanes=" + toString(numPlanes); // [[[cog // import stringify - // stringify.write_kernel2( "kernel", "cl/pooling.cl", "forwardNaive", 'options' ) + // stringify.write_kernel2("kernel", "cl/pooling.cl", "forwardNaive", 'options') // ]]] // generated using cog, from cl/pooling.cl: const char * kernelSource = @@ -68,33 +68,33 @@ PoolingForwardGpuNaive::PoolingForwardGpuNaive( EasyCL *cl, bool padZeros, int n "// every plane is independent\n" "// every example is independent\n" "// so, globalid can be: [n][plane][outputRow][outputCol]\n" - "kernel void forwardNaive( const int batchSize, global const float *input, global int *selectors, global float *output ) {\n" + "kernel void forwardNaive(const int batchSize, global const float *input, global int *selectors, global float *output) {\n" " const int globalId = get_global_id(0);\n" "\n" - " const int intraImageOffset = globalId % gOutputImageSizeSquared;\n" - " const int outputRow = intraImageOffset / gOutputImageSize;\n" - " const int outputCol = intraImageOffset % gOutputImageSize;\n" + " const int intraImageOffset = globalId % gOutputSizeSquared;\n" + " const int outputRow = intraImageOffset / gOutputSize;\n" + " const int outputCol = intraImageOffset % gOutputSize;\n" "\n" - " const int image2dIdx = globalId / gOutputImageSizeSquared;\n" + " const int image2dIdx = globalId / gOutputSizeSquared;\n" " const int plane = image2dIdx % gNumPlanes;\n" " const int n = image2dIdx / gNumPlanes;\n" "\n" - " if( n >= batchSize ) {\n" + " if (n >= batchSize) {\n" " return;\n" " }\n" "\n" " const int inputRow = outputRow * gPoolingSize;\n" " const int inputCol = outputCol * gPoolingSize;\n" - " const int inputImageOffset = ( n * gNumPlanes + plane ) * gInputImageSizeSquared;\n" + " const int inputImageOffset = (n * gNumPlanes + plane) * gInputSizeSquared;\n" " int selector = 
0;\n" - " int poolInputOffset = inputImageOffset + inputRow * gInputImageSize + inputCol;\n" + " int poolInputOffset = inputImageOffset + inputRow * gInputSize + inputCol;\n" " float maxValue = input[ poolInputOffset ];\n" - " for( int dRow = 0; dRow < gPoolingSize; dRow++ ) {\n" - " for( int dCol = 0; dCol < gPoolingSize; dCol++ ) {\n" - " bool process = ( inputRow + dRow < gInputImageSize ) && ( inputCol + dCol < gInputImageSize );\n" - " if( process ) {\n" - " float thisValue = input[ poolInputOffset + dRow * gInputImageSize + dCol ];\n" - " if( thisValue > maxValue ) {\n" + " for (int dRow = 0; dRow < gPoolingSize; dRow++) {\n" + " for (int dCol = 0; dCol < gPoolingSize; dCol++) {\n" + " bool process = (inputRow + dRow < gInputSize) && (inputCol + dCol < gInputSize);\n" + " if (process) {\n" + " float thisValue = input[ poolInputOffset + dRow * gInputSize + dCol ];\n" + " if (thisValue > maxValue) {\n" " maxValue = thisValue;\n" " selector = dRow * gPoolingSize + dCol;\n" " }\n" @@ -109,6 +109,6 @@ PoolingForwardGpuNaive::PoolingForwardGpuNaive( EasyCL *cl, bool padZeros, int n ""; kernel = cl->buildKernelFromString( kernelSource, "forwardNaive", options, "cl/pooling.cl" ); // [[[end]]] -// kernel = cl->buildKernel( "pooling.cl", "forwardNaive", options ); +// kernel = cl->buildKernel("pooling.cl", "forwardNaive", options); } diff --git a/src/pooling/PoolingForwardGpuNaive.h b/src/pooling/PoolingForwardGpuNaive.h index f1b4842b..f3e136a3 100644 --- a/src/pooling/PoolingForwardGpuNaive.h +++ b/src/pooling/PoolingForwardGpuNaive.h @@ -23,8 +23,8 @@ class PoolingForwardGpuNaive : public PoolingForward { // ]]] // generated, using cog: VIRTUAL ~PoolingForwardGpuNaive(); - VIRTUAL void forward( int batchSize, CLWrapper *inputWrapper, CLWrapper *selectorsWrapper, CLWrapper *outputWrapper ); - PoolingForwardGpuNaive( EasyCL *cl, bool padZeros, int numPlanes, int inputImageSize, int poolingSize ); + VIRTUAL void forward(int batchSize, CLWrapper *inputWrapper, CLWrapper *selectorsWrapper, CLWrapper *outputWrapper); + PoolingForwardGpuNaive(EasyCL *cl, bool padZeros, int numPlanes, int inputSize, int poolingSize); // [[[end]]] }; diff --git a/src/pooling/PoolingLayer.cpp b/src/pooling/PoolingLayer.cpp index c332c3cd..c7fecb63 100644 --- a/src/pooling/PoolingLayer.cpp +++ b/src/pooling/PoolingLayer.cpp @@ -22,14 +22,14 @@ using namespace std; #undef STATIC #define STATIC -PoolingLayer::PoolingLayer( EasyCL *cl, Layer *previousLayer, PoolingMaker *maker ) : - Layer( previousLayer, maker ), - padZeros( maker->_padZeros ), - numPlanes ( previousLayer->getOutputPlanes() ), - inputImageSize( previousLayer->getOutputImageSize() ), - poolingSize( maker->_poolingSize ), - outputImageSize( maker->_padZeros ? ( previousLayer->getOutputImageSize() + maker->_poolingSize - 1 ) / maker->_poolingSize : previousLayer->getOutputImageSize() / maker->_poolingSize ), - cl( cl ), +PoolingLayer::PoolingLayer(EasyCL *cl, Layer *previousLayer, PoolingMaker *maker) : + Layer(previousLayer, maker), + padZeros(maker->_padZeros), + numPlanes (previousLayer->getOutputPlanes()), + inputSize(previousLayer->getOutputSize()), + poolingSize(maker->_poolingSize), + outputSize(maker->_padZeros ? 
(previousLayer->getOutputSize() + maker->_poolingSize - 1) / maker->_poolingSize : previousLayer->getOutputSize() / maker->_poolingSize), + cl(cl), output(0), selectors(0), gradInput(0), @@ -40,81 +40,81 @@ PoolingLayer::PoolingLayer( EasyCL *cl, Layer *previousLayer, PoolingMaker *make // gradInputCopiedToHost(false), batchSize(0), allocatedSize(0){ - if( inputImageSize == 0 ){ + if(inputSize == 0){ // maker->net->print(); - throw runtime_error("Error: Pooling layer " + toString( layerIndex ) + ": input image size is 0" ); + throw runtime_error("Error: Pooling layer " + toString(layerIndex) + ": input image size is 0"); } - if( outputImageSize == 0 ){ + if(outputSize == 0){ // maker->net->print(); - throw runtime_error("Error: Pooling layer " + toString( layerIndex ) + ": output image size is 0" ); + throw runtime_error("Error: Pooling layer " + toString(layerIndex) + ": output image size is 0"); } - poolingForwardImpl = PoolingForward::instance( cl, padZeros, numPlanes, inputImageSize, poolingSize ); - poolingBackpropImpl = PoolingBackward::instance( cl, padZeros, numPlanes, inputImageSize, poolingSize ); + poolingForwardImpl = PoolingForward::instance(cl, padZeros, numPlanes, inputSize, poolingSize); + poolingBackpropImpl = PoolingBackward::instance(cl, padZeros, numPlanes, inputSize, poolingSize); } VIRTUAL PoolingLayer::~PoolingLayer() { delete poolingForwardImpl; delete poolingBackpropImpl; - if( outputWrapper != 0 ) { + if(outputWrapper != 0) { delete outputWrapper; } - if( output != 0 ) { + if(output != 0) { delete[] output; } - if( selectorsWrapper != 0 ) { + if(selectorsWrapper != 0) { delete selectorsWrapper; } - if( selectors != 0 ) { + if(selectors != 0) { delete[] selectors; } - if( gradInputWrapper != 0 ) { + if(gradInputWrapper != 0) { delete gradInputWrapper; } - if( gradInput != 0 ) { + if(gradInput != 0) { delete[] gradInput; } } VIRTUAL std::string PoolingLayer::getClassName() const { return "PoolingLayer"; } -VIRTUAL void PoolingLayer::setBatchSize( int batchSize ) { +VIRTUAL void PoolingLayer::setBatchSize(int batchSize) { // cout << "PoolingLayer::setBatchSize" << endl; - if( batchSize <= allocatedSize ) { + if(batchSize <= allocatedSize) { this->batchSize = batchSize; return; } - if( outputWrapper != 0 ) { + if(outputWrapper != 0) { delete outputWrapper; } - if( output != 0 ) { + if(output != 0) { delete[] output; } - if( selectorsWrapper != 0 ) { + if(selectorsWrapper != 0) { delete selectorsWrapper; } - if( selectors != 0 ) { + if(selectors != 0) { delete[] selectors; } - if( gradInputWrapper != 0 ) { + if(gradInputWrapper != 0) { delete gradInputWrapper; } - if( gradInput != 0 ) { + if(gradInput != 0) { delete[] gradInput; } this->batchSize = batchSize; this->allocatedSize = batchSize; - output = new float[ getOutputSize() ]; - outputWrapper = cl->wrap( getOutputSize(), output ); - selectors = new int[ getOutputSize() ]; - selectorsWrapper = cl->wrap( getOutputSize(), selectors ); - gradInput = new float[ previousLayer->getOutputSize() ]; - gradInputWrapper = cl->wrap( previousLayer->getOutputSize(), gradInput ); + output = new float[ getOutputNumElements() ]; + outputWrapper = cl->wrap(getOutputNumElements(), output); + selectors = new int[ getOutputNumElements() ]; + selectorsWrapper = cl->wrap(getOutputNumElements(), selectors); + gradInput = new float[ previousLayer->getOutputNumElements() ]; + gradInputWrapper = cl->wrap(previousLayer->getOutputNumElements(), gradInput); gradInputWrapper->createOnDevice(); } -VIRTUAL int PoolingLayer::getOutputSize() { - 
return batchSize * numPlanes * outputImageSize * outputImageSize; +VIRTUAL int PoolingLayer::getOutputNumElements() { + return batchSize * numPlanes * outputSize * outputSize; } VIRTUAL float *PoolingLayer::getOutput() { - if( outputWrapper->isDeviceDirty() ) { + if(outputWrapper->isDeviceDirty()) { outputWrapper->copyToHost(); // outputCopiedToHost = true; } @@ -123,20 +123,20 @@ VIRTUAL float *PoolingLayer::getOutput() { VIRTUAL bool PoolingLayer::needsBackProp() { return previousLayer->needsBackProp(); } -VIRTUAL int PoolingLayer::getOutputSize() const { -// int outputImageSize = inputImageSize / poolingSize; - return batchSize * numPlanes * outputImageSize * outputImageSize; +VIRTUAL int PoolingLayer::getOutputNumElements() const { +// int outputSize = inputSize / poolingSize; + return batchSize * numPlanes * outputSize * outputSize; } -VIRTUAL int PoolingLayer::getOutputImageSize() const { - return outputImageSize; +VIRTUAL int PoolingLayer::getOutputSize() const { + return outputSize; } VIRTUAL int PoolingLayer::getOutputCubeSize() const { - return numPlanes * outputImageSize * outputImageSize; + return numPlanes * outputSize * outputSize; } VIRTUAL int PoolingLayer::getOutputPlanes() const { return numPlanes; } -VIRTUAL int PoolingLayer::getPersistSize( int version ) const { +VIRTUAL int PoolingLayer::getPersistSize(int version) const { return 0; } VIRTUAL bool PoolingLayer::providesGradInputWrapper() const { @@ -160,58 +160,58 @@ VIRTUAL ActivationFunction const *PoolingLayer::getActivationFunction() { } VIRTUAL void PoolingLayer::forward() { CLWrapper *upstreamOutputWrapper = 0; - if( previousLayer->hasOutputWrapper() ) { + if(previousLayer->hasOutputWrapper()) { upstreamOutputWrapper = previousLayer->getOutputWrapper(); } else { float *upstreamOutput = previousLayer->getOutput(); - upstreamOutputWrapper = cl->wrap( previousLayer->getOutputSize(), upstreamOutput ); + upstreamOutputWrapper = cl->wrap(previousLayer->getOutputNumElements(), upstreamOutput); upstreamOutputWrapper->copyToDevice(); } - poolingForwardImpl->forward( batchSize, upstreamOutputWrapper, selectorsWrapper, outputWrapper ); - if( !previousLayer->hasOutputWrapper() ) { + poolingForwardImpl->forward(batchSize, upstreamOutputWrapper, selectorsWrapper, outputWrapper); + if(!previousLayer->hasOutputWrapper()) { delete upstreamOutputWrapper; } // cout << "PoolingLayer::forward() selectors after forward: " << endl; -// for( int i = 0; i < outputImageSize; i++ ) { -// for( int j = 0; j < outputImageSize; j++ ) { -// cout << selectors[ i * outputImageSize + j ] << " "; +// for(int i = 0; i < outputSize; i++) { +// for(int j = 0; j < outputSize; j++) { +// cout << selectors[ i * outputSize + j ] << " "; // } // cout << endl; // } // cout << "PoolingLayer::forward() selectorsWrapper after forward: " << endl; -// PrintBuffer::printInts( cl, selectorsWrapper, outputImageSize, outputImageSize ); +// PrintBuffer::printInts(cl, selectorsWrapper, outputSize, outputSize); } VIRTUAL void PoolingLayer::backward() { // have no weights to backprop to, just need to backprop the errors CLWrapper *gradOutputWrapper = 0; bool weOwnErrorsWrapper = false; - if( nextLayer->providesGradInputWrapper() ) { + if(nextLayer->providesGradInputWrapper()) { gradOutputWrapper = nextLayer->getGradInputWrapper(); } else { - gradOutputWrapper = cl->wrap( getOutputSize(), nextLayer->getGradInput() ); + gradOutputWrapper = cl->wrap(getOutputNumElements(), nextLayer->getGradInput()); gradOutputWrapper->copyToDevice(); weOwnErrorsWrapper = true; } // cout 
<< "PoolingLayer::backward selectorsWrapper:" << endl; -// PrintBuffer::printInts( cl, selectorsWrapper, outputImageSize, outputImageSize ); +// PrintBuffer::printInts(cl, selectorsWrapper, outputSize, outputSize); -// int *selectors = reinterpret_cast< int * >( selectorsWrapper->getHostArray() ); +// int *selectors = reinterpret_cast< int * >(selectorsWrapper->getHostArray()); // cout << "PoolingLayer::backward selectors before copy to host:" << endl; -// for( int i = 0; i < outputImageSize; i++ ) { -// for( int j = 0; j < outputImageSize; j++ ) { -// cout << " " << selectors[i * outputImageSize + j]; +// for(int i = 0; i < outputSize; i++) { +// for(int j = 0; j < outputSize; j++) { +// cout << " " << selectors[i * outputSize + j]; // } // cout << endl; // } // selectorsWrapper->copyToHost(); // cout << "PoolingLayer::backward selectors after copy to host:" << endl; -// for( int i = 0; i < outputImageSize; i++ ) { -// for( int j = 0; j < outputImageSize; j++ ) { -// cout << " " << selectors[i * outputImageSize + j]; +// for(int i = 0; i < outputSize; i++) { +// for(int j = 0; j < outputSize; j++) { +// cout << " " << selectors[i * outputSize + j]; // } // cout << endl; // } @@ -219,15 +219,15 @@ VIRTUAL void PoolingLayer::backward() { // selectorsWrapper->copyToHost(); - poolingBackpropImpl->backward( batchSize, gradOutputWrapper, selectorsWrapper, gradInputWrapper ); + poolingBackpropImpl->backward(batchSize, gradOutputWrapper, selectorsWrapper, gradInputWrapper); // gradInputWrapper->copyToHost(); -// float *gradInput = reinterpret_cast< float * >( gradInputWrapper->getHostArray() ); +// float *gradInput = reinterpret_cast< float * >(gradInputWrapper->getHostArray()); // cout << "gradInput:" << endl; -// for( int i = 0; i < inputImageSize; i++ ) { -// for( int j = 0; j < inputImageSize; j++ ) { -//// cout << " " << gradInput[i * inputImageSize + j]; -// if( gradInput[i * inputImageSize + j] != 0 ) { +// for(int i = 0; i < inputSize; i++) { +// for(int j = 0; j < inputSize; j++) { +//// cout << " " << gradInput[i * inputSize + j]; +// if(gradInput[i * inputSize + j] != 0) { // cout << " *"; // } else { // cout << " ."; @@ -236,12 +236,12 @@ VIRTUAL void PoolingLayer::backward() { // cout << endl; // } - if( weOwnErrorsWrapper ) { + if(weOwnErrorsWrapper) { delete gradOutputWrapper; } } VIRTUAL std::string PoolingLayer::asString() const { - return "PoolingLayer{ inputPlanes=" + toString(numPlanes) + " inputImageSize=" + toString(inputImageSize) + " poolingSize=" + toString( poolingSize ) + " }"; + return "PoolingLayer{ inputPlanes=" + toString(numPlanes) + " inputSize=" + toString(inputSize) + " poolingSize=" + toString(poolingSize) + " }"; } diff --git a/src/pooling/PoolingLayer.h b/src/pooling/PoolingLayer.h index 40cb3350..425eac0c 100644 --- a/src/pooling/PoolingLayer.h +++ b/src/pooling/PoolingLayer.h @@ -22,10 +22,10 @@ class PoolingLayer : public Layer { public: const bool padZeros; const int numPlanes; - const int inputImageSize; + const int inputSize; const int poolingSize; - const int outputImageSize; + const int outputSize; EasyCL *const cl; // NOT owned by us PoolingForward *poolingForwardImpl; @@ -50,18 +50,18 @@ class PoolingLayer : public Layer { // cog_addheaders.add() // ]]] // generated, using cog: - PoolingLayer( EasyCL *cl, Layer *previousLayer, PoolingMaker *maker ); + PoolingLayer(EasyCL *cl, Layer *previousLayer, PoolingMaker *maker); VIRTUAL ~PoolingLayer(); VIRTUAL std::string getClassName() const; - VIRTUAL void setBatchSize( int batchSize ); - VIRTUAL int 
diff --git a/src/pooling/PoolingLayer.h b/src/pooling/PoolingLayer.h
index 40cb3350..425eac0c 100644
--- a/src/pooling/PoolingLayer.h
+++ b/src/pooling/PoolingLayer.h
@@ -22,10 +22,10 @@ class PoolingLayer : public Layer {
 public:
     const bool padZeros;
     const int numPlanes;
-    const int inputImageSize;
+    const int inputSize;
     const int poolingSize;
-    const int outputImageSize;
+    const int outputSize;
     EasyCL *const cl; // NOT owned by us
     PoolingForward *poolingForwardImpl;
@@ -50,18 +50,18 @@ class PoolingLayer : public Layer {
     // cog_addheaders.add()
     // ]]]
     // generated, using cog:
-    PoolingLayer( EasyCL *cl, Layer *previousLayer, PoolingMaker *maker );
+    PoolingLayer(EasyCL *cl, Layer *previousLayer, PoolingMaker *maker);
     VIRTUAL ~PoolingLayer();
     VIRTUAL std::string getClassName() const;
-    VIRTUAL void setBatchSize( int batchSize );
-    VIRTUAL int getOutputSize();
+    VIRTUAL void setBatchSize(int batchSize);
+    VIRTUAL int getOutputNumElements();
     VIRTUAL float *getOutput();
     VIRTUAL bool needsBackProp();
+    VIRTUAL int getOutputNumElements() const;
     VIRTUAL int getOutputSize() const;
-    VIRTUAL int getOutputImageSize() const;
     VIRTUAL int getOutputCubeSize() const;
     VIRTUAL int getOutputPlanes() const;
-    VIRTUAL int getPersistSize( int version ) const;
+    VIRTUAL int getPersistSize(int version) const;
     VIRTUAL bool providesGradInputWrapper() const;
     VIRTUAL CLWrapper *getGradInputWrapper();
     VIRTUAL bool hasOutputWrapper() const;
diff --git a/src/pooling/PoolingMaker.cpp b/src/pooling/PoolingMaker.cpp
index 69859102..b8e2a198 100644
--- a/src/pooling/PoolingMaker.cpp
+++ b/src/pooling/PoolingMaker.cpp
@@ -9,7 +9,7 @@
 using namespace std;
-Layer *PoolingMaker::createLayer( Layer *previousLayer ) {
-    return new PoolingLayer( cl, previousLayer, this );
+Layer *PoolingMaker::createLayer(Layer *previousLayer) {
+    return new PoolingLayer(cl, previousLayer, this);
 }
diff --git a/src/pooling/PoolingMaker.h b/src/pooling/PoolingMaker.h
index 2d650dbd..bb546177 100644
--- a/src/pooling/PoolingMaker.h
+++ b/src/pooling/PoolingMaker.h
@@ -20,10 +20,10 @@ class DeepCL_EXPORT PoolingMaker : public LayerMaker2 {
     int _poolingSize;
     bool _padZeros;
     PUBLICAPI PoolingMaker() :
-        _poolingSize( 2 ),
-        _padZeros( false ) {
+        _poolingSize(2),
+        _padZeros(false) {
     }
-    PUBLICAPI PoolingMaker *poolingSize( int _poolingSize ) {
+    PUBLICAPI PoolingMaker *poolingSize(int _poolingSize) {
         this->_poolingSize = _poolingSize;
         return this;
     }
@@ -36,10 +36,10 @@ class DeepCL_EXPORT PoolingMaker : public LayerMaker2 {
     }
     virtual PoolingMaker *clone() const {
         PoolingMaker *thisClone = new PoolingMaker();
-        memcpy( thisClone, this, sizeof( PoolingMaker ) );
+        memcpy(thisClone, this, sizeof(PoolingMaker) );
         return thisClone;
     }
-    virtual Layer *createLayer( Layer *previousLayer );
+    virtual Layer *createLayer(Layer *previousLayer);
 };
diff --git a/src/qlearning/QLearner.cpp b/src/qlearning/QLearner.cpp
index 21a5fc62..21d326cf 100644
--- a/src/qlearning/QLearner.cpp
+++ b/src/qlearning/QLearner.cpp
@@ -11,10 +11,10 @@
 using namespace std;
-QLearner::QLearner( Trainer *trainer, Scenario *scenario, NeuralNet *net ) :
-    trainer( trainer ),
-    scenario( scenario ),
-    net( net ) {
+QLearner::QLearner(Trainer *trainer, Scenario *scenario, NeuralNet *net) :
+    trainer(trainer),
+    scenario(scenario),
+    net(net) {
     epoch = 0;
     lambda = 0.9f;
     maxSamples = 32;
@@ -46,7 +46,7 @@ void QLearner::learnFromPast() {
     // draw samples
     Experience **experiences = new Experience *[ batchSize ];
-    for( int n = 0; n < batchSize; n++ ) {
+    for(int n = 0; n < batchSize; n++) {
         int sampleIdx = myrand() % availableSamples;
         Experience *experience = history[sampleIdx];
         experiences[n] = experience;
@@ -55,23 +55,23 @@ void QLearner::learnFromPast() {
     // copy in data
     float *afters = new float[ batchSize * planes * size * size ];
     float *befores = new float[ batchSize * planes * size * size ];
-    for( int n = 0; n < batchSize; n++ ) {
+    for(int n = 0; n < batchSize; n++) {
         Experience *experience = experiences[n];
-        arrayCopy( afters + n * planes * size * size, experience->after, planes * size * size );
-        arrayCopy( befores + n * planes * size * size, experience->before, planes * size * size );
+        arrayCopy(afters + n * planes * size * size, experience->after, planes * size * size);
+        arrayCopy(befores + n * planes * size * size, experience->before, planes * size * size);
     }
     // get next q values, based on forward prop 'afters'
-    net->forward( afters );
+    net->forward(afters);
     float const *allOutput = net->getOutput();
     float *bestQ = new float[ batchSize ];
     int *bestAction = new int[ batchSize ];
-    for( int n = 0; n < batchSize; n++ ) {
+    for(int n = 0; n < batchSize; n++) {
         float const *output = allOutput + n * numActions;
         float thisBestQ = output[0];
         int thisBestAction = 0;
-        for( int action = 1; action < numActions; action++ ) {
-            if( output[action] > thisBestQ ) {
+        for(int action = 1; action < numActions; action++) {
+            if(output[action] > thisBestQ) {
                 thisBestQ = output[action];
                 thisBestAction = action;
             }
@@ -81,13 +81,13 @@
     }
     // forward prop 'befores', set up expected values, and backprop
     // new q values
-    net->forward( befores );
+    net->forward(befores);
     allOutput = net->getOutput();
     float *expectedValues = new float[ numActions * batchSize ];
-    arrayCopy( expectedValues, allOutput, batchSize * numActions );
-    for( int n = 0; n < batchSize; n++ ) {
+    arrayCopy(expectedValues, allOutput, batchSize * numActions);
+    for(int n = 0; n < batchSize; n++) {
         Experience *experience = experiences[n];
-        if( experience->isEndState ) {
+        if(experience->isEndState) {
             expectedValues[ n * numActions + experience->action ] = experience->reward;
         } else {
             expectedValues[ n * numActions + experience->action ] = experience->reward + lambda * bestQ[n];
@@ -95,9 +95,9 @@
         }
     }
     // backprop...
-    TrainingContext context( epoch, 0 );
-    trainer->train( net, &context, befores, expectedValues );
-//    net->backward( learningRate / batchSize, expectedValues );
+    TrainingContext context(epoch, 0);
+    trainer->train(net, &context, befores, expectedValues);
+//    net->backward(learningRate / batchSize, expectedValues);
     net->setBatchSize(1);
     epoch++;
@@ -112,36 +112,36 @@ void QLearner::learnFromPast() {
 // this is now a scenario-free zone, and therefore no callbacks, and easy to wrap with
 // swig, cython etc.
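// What learnFromPast() above encodes is the standard Q-learning target; `lambda`
// plays the role of the discount factor (usually written gamma). Only the taken
// action's output gets this target - the other outputs keep the net's own
// predictions (via the arrayCopy from allOutput), so they contribute no gradient.
// Scalar sketch, hypothetical helper name:
float qTarget(float reward, bool isEndState, float lambda, float bestNextQ) {
    // a terminal transition has no future value to bootstrap from
    return isEndState ? reward : reward + lambda * bestNextQ;
}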
-int QLearner::step( float lastReward, bool wasReset, float *perception ) { // do one frame
-    if( lastAction != -1 ) {
+int QLearner::step(float lastReward, bool wasReset, float *perception) { // do one frame
+    if(lastAction != -1) {
         Experience *experience = new Experience();
         experience->action = lastAction;
         experience->reward = lastReward;
         experience->isEndState = wasReset;
         experience->before = new float[ size * size * planes ];
-        arrayCopy( experience->before, this->lastPerception, size * size * planes );
-//        scenario->getPerception( perception );
+        arrayCopy(experience->before, this->lastPerception, size * size * planes);
+//        scenario->getPerception(perception);
         experience->after = new float[ size * size * planes ];
-        arrayCopy( experience->after, perception, size * size * planes );
-        history.push_back( experience );
-        if( wasReset ) {
+        arrayCopy(experience->after, perception, size * size * planes);
+        history.push_back(experience);
+        if(wasReset) {
             game++;
         }
         learnFromPast();
     }
-//    cout << "see: " << toString( perception, perceptionSize + numActions ) << endl;
+//    cout << "see: " << toString(perception, perceptionSize + numActions) << endl;
     int action = -1;
-    if( lastAction == -1 || (myrand() % 10000 / 10000.0f) <= epsilon ) {
+    if(lastAction == -1 || (myrand() % 10000 / 10000.0f) <= epsilon) {
         action = myrand() % numActions;
 //        cout << "action, rand: " << action << endl;
     } else {
         net->setBatchSize(1);
-        net->forward( perception );
+        net->forward(perception);
         float highestQ = 0;
         int bestAction = 0;
         float const*output = net->getOutput();
-        for( int i = 0; i < numActions; i++ ) {
-            if( i == 0 || output[i] > highestQ ) {
+        for(int i = 0; i < numActions; i++) {
+            if(i == 0 || output[i] > highestQ) {
                 highestQ = output[i];
                 bestAction = i;
             }
@@ -149,8 +149,8 @@ int QLearner::step( float lastReward, bool wasReset, float *perception ) { // do
         action = bestAction;
 //        cout << "action, q: " << action << endl;
     }
-    arrayCopy( this->lastPerception, perception, size * size * planes );
-//    printDirections( net, scenario->height, scenario->width );
+    arrayCopy(this->lastPerception, perception, size * size * planes);
+//    printDirections(net, scenario->height, scenario->width);
     this->lastAction = action;
     return action;
 }
@@ -162,11 +162,11 @@ void QLearner::run() {
 //    int selectedAction = -1;
     float *perception = new float[ size * size * planes ];
     bool wasReset = false;
-    while( true ) {
-        scenario->getPerception( perception );
-        int action = step( lastReward, wasReset, perception );
-        lastReward = scenario->act( action );
-        if( scenario->hasFinished() ) {
+    while(true) {
+        scenario->getPerception(perception);
+        int action = step(lastReward, wasReset, perception);
+        lastReward = scenario->act(action);
+        if(scenario->hasFinished()) {
             scenario->reset();
             wasReset = true;
         } else {
diff --git a/src/qlearning/QLearner.h b/src/qlearning/QLearner.h
index f4187a31..08e1f801 100644
--- a/src/qlearning/QLearner.h
+++ b/src/qlearning/QLearner.h
@@ -38,18 +38,18 @@ class DeepCL_EXPORT QLearner {
 //    float learningRate; // learning rate for the neuralnet; depends on what is appropriate for your particular
 //                        // network design
-    QLearner( Trainer *trainer, Scenario *scenario, NeuralNet *net );
+    QLearner(Trainer *trainer, Scenario *scenario, NeuralNet *net);
     // do one frame:
-    int step( float lastReward, bool wasReset, float *perception );
+    int step(float lastReward, bool wasReset, float *perception);
     void run(); // main entry point
     virtual ~QLearner();
     void learnFromPast(); // internal method; probably not useful to user, but who knows, so leaving it
                           // public :-)
-    void setLambda( float lambda ) { this->lambda = lambda; }
-    void setMaxSamples( int maxSamples ) { this->maxSamples = maxSamples; }
-    void setEpsilon( float epsilon ) { this->epsilon = epsilon; }
-//    void setLearningRate( float learningRate ) { this->learningRate = learningRate; }
+    void setLambda(float lambda) { this->lambda = lambda; }
+    void setMaxSamples(int maxSamples) { this->maxSamples = maxSamples; }
+    void setEpsilon(float epsilon) { this->epsilon = epsilon; }
+//    void setLearningRate(float learningRate) { this->learningRate = learningRate; }
 protected:
     int size;
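// step() above is epsilon-greedy action selection. A scalar sketch with a
// hypothetical helper; like the source, it derives the explore/exploit coin
// flip from an integer rand:
#include <cstdlib>

int chooseAction(float epsilon, int numActions, const float *qValues) {
    if((std::rand() % 10000) / 10000.0f <= epsilon) {
        return std::rand() % numActions;      // explore: uniform random action
    }
    int best = 0;
    for(int i = 1; i < numActions; i++) {     // exploit: argmax over Q values
        if(qValues[i] > qValues[best]) {
            best = i;
        }
    }
    return best;
}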
diff --git a/src/qlearning/QLearner2.h b/src/qlearning/QLearner2.h
index b0979ac5..b74274dd 100644
--- a/src/qlearning/QLearner2.h
+++ b/src/qlearning/QLearner2.h
@@ -25,7 +25,7 @@ class ScenarioProxy : public Scenario {
 //    float lastReward;
 //    int thisAction;
 //    bool isReset;
-    ScenarioProxy( int numActions, int planes, int size ) :
+    ScenarioProxy(int numActions, int planes, int size) :
         numActions(numActions), planes(planes), size(size) {
     }
     virtual int getPerceptionSize() {
@@ -34,7 +34,7 @@ class ScenarioProxy : public Scenario {
     virtual int getPerceptionPlanes() {
         return planes;
     }
-    virtual void getPerception( float *perception ) {
+    virtual void getPerception(float *perception) {
 //        perception = this->perception;
         throw std::runtime_error("getPerception not implemented");
     }
@@ -47,7 +47,7 @@ class ScenarioProxy : public Scenario {
         return numActions;
 //        throw runtime_error("getNumActions not implemented");
     }
-    virtual float act( int index ) {
+    virtual float act(int index) {
 //        this->thisAction = index;
 //        return lastReward;
         throw std::runtime_error("act not implemented");
@@ -69,25 +69,25 @@ class QLearner2 {
 //    int size;
 //    int numActions;
 public:
-    QLearner2( Trainer *trainer, NeuralNet *net, int numActions, int planes, int size ) : net(net) {
-        scenario = new ScenarioProxy( numActions, planes, size );
-        qlearner = new QLearner( trainer, scenario, net );
+    QLearner2(Trainer *trainer, NeuralNet *net, int numActions, int planes, int size) : net(net) {
+        scenario = new ScenarioProxy(numActions, planes, size);
+        qlearner = new QLearner(trainer, scenario, net);
     }
     ~QLearner2() {
         delete qlearner;
         delete scenario;
     }
-//    QLearner2 *setPlanes( int planes ) {
+//    QLearner2 *setPlanes(int planes) {
 //        this->planes = planes;
 //        scenario->planes = planes;
 //        return this;
 //    }
-//    QLearner2 *setSize( int size ) {
+//    QLearner2 *setSize(int size) {
 //        this->size = size;
 //        scenario->size = size;
 //        return this;
 //    }
-//    QLearner2 *setNumActions( int numActions ) {
+//    QLearner2 *setNumActions(int numActions) {
 //        this->numActions = numActions;
 //        scenario->numActions = numActions;
 //        return this;
@@ -96,12 +96,12 @@ class QLearner2 {
 //        scenario->lastReward = lastReward;
 //        scenario->isReset = isReset;
 //        scenario->perception = currentPerception;
-        int action = qlearner->step( lastReward, wasReset, perception );
+        int action = qlearner->step(lastReward, wasReset, perception);
         return action;
     }
-    void setLambda( float lambda ) { qlearner->setLambda( lambda ); }
-    void setMaxSamples( int maxSamples ) { qlearner->setMaxSamples( maxSamples ); }
-    void setEpsilon( float epsilon ) { qlearner->setEpsilon( epsilon ); }
-//    void setLearningRate( float learningRate ) { qlearner->setLearningRate( learningRate ); }
+    void setLambda(float lambda) { qlearner->setLambda(lambda); }
+    void setMaxSamples(int maxSamples) { qlearner->setMaxSamples(maxSamples); }
+    void setEpsilon(float epsilon) { qlearner->setEpsilon(epsilon); }
+//    void setLearningRate(float learningRate) { qlearner->setLearningRate(learningRate); }
 };
diff --git a/src/qlearning/Scenario.h b/src/qlearning/Scenario.h
index 72dc973e..cfd9e4fb 100644
--- a/src/qlearning/Scenario.h
+++ b/src/qlearning/Scenario.h
@@ -15,10 +15,10 @@ class Scenario {
 //    virtual void printQRepresentation(NeuralNet *net) {} // optional implementation
     virtual int getPerceptionSize() = 0;
     virtual int getPerceptionPlanes() = 0;
-    virtual void getPerception( float *perception ) = 0;
+    virtual void getPerception(float *perception) = 0;
     virtual void reset() = 0;
     virtual int getNumActions() = 0;
-    virtual float act( int index ) = 0; // returns reward
+    virtual float act(int index) = 0; // returns reward
     virtual bool hasFinished() = 0;
 //    virtual int getWorldSize() = 0;
 };
diff --git a/src/qlearning/array_helper.cpp b/src/qlearning/array_helper.cpp
index 7c0bb039..dad9ffab 100644
--- a/src/qlearning/array_helper.cpp
+++ b/src/qlearning/array_helper.cpp
@@ -9,22 +9,22 @@
 using namespace std;
-void arrayCopy( float *dest, float const*src, int N ) {
-    for( int i = 0; i < N; i++ ) {
+void arrayCopy(float *dest, float const*src, int N) {
+    for(int i = 0; i < N; i++) {
         dest[i] = src[i];
     }
 }
-void arrayZero( float *array, int N ) {
-    for( int i = 0; i < N; i++ ) {
+void arrayZero(float *array, int N) {
+    for(int i = 0; i < N; i++) {
         array[i] = 0;
     }
 }
-string toString( float const*array, int N ) {
+string toString(float const*array, int N) {
     string result = "";
-    for( int i = 0; i < N; i++ ) {
-        result += toString( array[i] );
+    for(int i = 0; i < N; i++) {
+        result += toString(array[i]);
     }
     return result;
 }
diff --git a/src/qlearning/array_helper.h b/src/qlearning/array_helper.h
index 845f6035..70141d53 100644
--- a/src/qlearning/array_helper.h
+++ b/src/qlearning/array_helper.h
@@ -10,8 +10,8 @@
 #include "DeepCLDllExport.h"
-DeepCL_EXPORT void arrayCopy( float *dest, float const*src, int N );
-DeepCL_EXPORT void arrayZero( float *array, int N );
-DeepCL_EXPORT std::string toString( float const*array, int N );
+DeepCL_EXPORT void arrayCopy(float *dest, float const*src, int N);
+DeepCL_EXPORT void arrayZero(float *array, int N);
+DeepCL_EXPORT std::string toString(float const*array, int N);
diff --git a/src/trainers/Adadelta.cpp b/src/trainers/Adadelta.cpp
index c19e3782..14f93061 100644
--- a/src/trainers/Adadelta.cpp
+++ b/src/trainers/Adadelta.cpp
@@ -31,31 +31,31 @@
 using namespace std;
 VIRTUAL Adadelta::~Adadelta() {
 }
 VIRTUAL std::string Adadelta::asString() {
-    return "Adadelta{ learningRate=" + toString( learningRate ) + " }";
+    return "Adadelta{ learningRate=" + toString(learningRate) + " }";
 }
-VIRTUAL void Adadelta::updateWeights( CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
-        AdadeltaState *trainerState ) {
+VIRTUAL void Adadelta::updateWeights(CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
+        AdadeltaState *trainerState) {
     // need to calculate
-    // sumGradSquared = decay * sumGradSquared + (1 - decay ) * grad.square()
+    // sumGradSquared = decay * sumGradSquared + (1 - decay) * grad.square()
     // update = - sumUpdateSquared.sqrt() / sumGradSquared.sqrt() * grad
-    // sumUpdateSquared = decay * sumUpdateSquared + ( 1 - decay ) * update.squared()
+    // sumUpdateSquared = decay * sumUpdateSquared + (1 - decay) * update.squared()
     // weights += update
     int numWeights = trainerState->numWeights;
     float *working = new float[ numWeights ];
-    CLWrapper *workingWrapper = cl->wrap( numWeights, working );
+    CLWrapper *workingWrapper = cl->wrap(numWeights, working);
     workingWrapper->createOnDevice();
-    CLMathWrapper clWeights( weightsWrapper );
-    CLMathWrapper clGradWeights( gradWeightsWrapper );
-    CLMathWrapper clSumGradSquared( trainerState->sumGradSquaredWrapper );
-    CLMathWrapper clSumUpdateSquared( trainerState->sumUpdateSquaredWrapper );
-    CLMathWrapper clWorking( workingWrapper );
+    CLMathWrapper clWeights(weightsWrapper);
+    CLMathWrapper clGradWeights(gradWeightsWrapper);
+    CLMathWrapper clSumGradSquared(trainerState->sumGradSquaredWrapper);
+    CLMathWrapper clSumUpdateSquared(trainerState->sumUpdateSquaredWrapper);
+    CLMathWrapper clWorking(workingWrapper);
     // following all happens on gpu, via clmathwrapper:
     clWorking = clGradWeights;
     clWorking.squared();
-    clWorking *= ( 1 - decay );
+    clWorking *= (1 - decay);
     clSumGradSquared *= decay;
     clSumGradSquared += clWorking;
@@ -70,62 +70,62 @@ VIRTUAL void Adadelta::updateWeights( CLWrapper *weightsWrapper, CLWrapper *grad
     clSumUpdateSquared *= decay;
     clWorking.squared();
-    clWorking *= ( 1 - decay );
+    clWorking *= (1 - decay);
     clSumUpdateSquared += clWorking;
     delete workingWrapper;
     delete[] working;
 }
-VIRTUAL BatchResult Adadelta::train( NeuralNet *net, TrainingContext *context,
-    float const*input, OutputData *outputData ) {
+VIRTUAL BatchResult Adadelta::train(NeuralNet *net, TrainingContext *context,
+    float const*input, OutputData *outputData) {
     // learns one batch, including updating weights
     // doesn't have to think about running multiple batches,
     // or loading data, or anything like that
-    bindState( net );
+    bindState(net);
-    net->forward( input );
-    int numRight = net->calcNumRight( outputData );
-    float loss = net->calcLoss( outputData );
-    net->backward( outputData );
+    net->forward(input);
+    int numRight = net->calcNumRight(outputData);
+    float loss = net->calcLoss(outputData);
+    net->backward(outputData);
     int numLayers = net->getNumLayers();
-    for( int layerIdx = numLayers - 2; layerIdx > 0; layerIdx-- ) {
-        Layer *layer = net->getLayer( layerIdx );
-        if( !layer->needsBackProp() ) {
+    for(int layerIdx = numLayers - 2; layerIdx > 0; layerIdx--) {
+        Layer *layer = net->getLayer(layerIdx);
+        if(!layer->needsBackProp()) {
             break;
         }
-        if( layer->needsTrainerState() ) {
-            updateWeights( layer->getWeightsWrapper(), layer->getGradWeightsWrapper(),
-                dynamic_cast< AdadeltaState * >( layer->getTrainerState() ) );
-            if( layer->biased() ) {
-                updateWeights( layer->getBiasWrapper(), layer->getGradBiasWrapper(),
-                    dynamic_cast< AdadeltaState * >( layer->getBiasTrainerState() ) );
+        if(layer->needsTrainerState()) {
+            updateWeights(layer->getWeightsWrapper(), layer->getGradWeightsWrapper(),
+                dynamic_cast< AdadeltaState * >(layer->getTrainerState()) );
+            if(layer->biased()) {
+                updateWeights(layer->getBiasWrapper(), layer->getGradBiasWrapper(),
+                    dynamic_cast< AdadeltaState * >(layer->getBiasTrainerState()) );
             }
         }
     }
-    return BatchResult( loss, numRight );
+    return BatchResult(loss, numRight);
 }
-VIRTUAL BatchResult Adadelta::train( NeuralNet *net, TrainingContext *context,
-    float const*input, float const*expectedOutput ) {
-    ExpectedData expectedData( net, expectedOutput );
-    return this->train( net, context, input, &expectedData );
+VIRTUAL BatchResult Adadelta::train(NeuralNet *net, TrainingContext *context,
+    float const*input, float const*expectedOutput) {
+    ExpectedData expectedData(net, expectedOutput);
+    return this->train(net, context, input, &expectedData);
 }
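// Per-weight scalar form of the buffer algebra in updateWeights() above (the
// CLMathWrapper calls perform the same element-wise steps on the GPU); both
// accumulators start at 1e-7 in AdadeltaState, so the sqrt ratio stays well
// defined. Hypothetical helper, not DeepCL API:
#include <cmath>

void adadeltaStep(float &weight, float grad, float &sumGradSquared,
        float &sumUpdateSquared, float decay) {
    sumGradSquared = decay * sumGradSquared + (1 - decay) * grad * grad;
    float update = -std::sqrt(sumUpdateSquared) / std::sqrt(sumGradSquared) * grad;
    sumUpdateSquared = decay * sumUpdateSquared + (1 - decay) * update * update;
    weight += update;
}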
-VIRTUAL BatchResult Adadelta::trainFromLabels( NeuralNet *net, TrainingContext *context,
-    float const*input, int const*labels ) {
-    LabeledData labeledData( net, labels );
-    return this->train( net, context, input, &labeledData );
+VIRTUAL BatchResult Adadelta::trainFromLabels(NeuralNet *net, TrainingContext *context,
+    float const*input, int const*labels) {
+    LabeledData labeledData(net, labels);
+    return this->train(net, context, input, &labeledData);
 }
-VIRTUAL void Adadelta::bindState( NeuralNet *net ) {
+VIRTUAL void Adadelta::bindState(NeuralNet *net) {
     AdadeltaStateMaker stateMaker;
-    this->_bindState( net, &stateMaker );
+    this->_bindState(net, &stateMaker);
 }
-STATIC Adadelta *Adadelta::instance( EasyCL *cl, float decay ) {
-    Adadelta *trainer = new Adadelta( cl, decay );
+STATIC Adadelta *Adadelta::instance(EasyCL *cl, float decay) {
+    Adadelta *trainer = new Adadelta(cl, decay);
     return trainer;
 }
-Adadelta::Adadelta( EasyCL *cl, float decay ) :
-    Trainer( cl ),
-    decay( decay ) {
-    this->setLearningRate( 0.0f );
+Adadelta::Adadelta(EasyCL *cl, float decay) :
+    Trainer(cl),
+    decay(decay) {
+    this->setLearningRate(0.0f);
 }
diff --git a/src/trainers/Adadelta.h b/src/trainers/Adadelta.h
index 6465125a..da1e418c 100644
--- a/src/trainers/Adadelta.h
+++ b/src/trainers/Adadelta.h
@@ -35,17 +35,17 @@ class DeepCL_EXPORT Adadelta : public Trainer{
     // generated, using cog:
     VIRTUAL ~Adadelta();
     VIRTUAL std::string asString();
-    VIRTUAL void updateWeights( CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
-        AdadeltaState *trainerState );
-    VIRTUAL BatchResult train( NeuralNet *net, TrainingContext *context,
-        float const*input, OutputData *outputData );
-    VIRTUAL BatchResult train( NeuralNet *net, TrainingContext *context,
-        float const*input, float const*expectedOutput );
-    VIRTUAL BatchResult trainFromLabels( NeuralNet *net, TrainingContext *context,
-        float const*input, int const*labels );
-    VIRTUAL void bindState( NeuralNet *net );
-    STATIC Adadelta *instance( EasyCL *cl, float decay );
-    Adadelta( EasyCL *cl, float decay );
+    VIRTUAL void updateWeights(CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
+        AdadeltaState *trainerState);
+    VIRTUAL BatchResult train(NeuralNet *net, TrainingContext *context,
+        float const*input, OutputData *outputData);
+    VIRTUAL BatchResult train(NeuralNet *net, TrainingContext *context,
+        float const*input, float const*expectedOutput);
+    VIRTUAL BatchResult trainFromLabels(NeuralNet *net, TrainingContext *context,
+        float const*input, int const*labels);
+    VIRTUAL void bindState(NeuralNet *net);
+    STATIC Adadelta *instance(EasyCL *cl, float decay);
+    Adadelta(EasyCL *cl, float decay);
     // [[[end]]]
 };
diff --git a/src/trainers/AdadeltaState.cpp b/src/trainers/AdadeltaState.cpp
index b4e3cd26..a0f2e8c4 100644
--- a/src/trainers/AdadeltaState.cpp
+++ b/src/trainers/AdadeltaState.cpp
@@ -24,16 +24,16 @@ VIRTUAL AdadeltaState::~AdadeltaState() {
     delete[] sumUpdateSquared;
 }
-AdadeltaState::AdadeltaState( EasyCL *cl, int numWeights ) :
-    numWeights( numWeights ) {
+AdadeltaState::AdadeltaState(EasyCL *cl, int numWeights) :
+    numWeights(numWeights) {
     sumGradSquared = new float[numWeights];
     sumUpdateSquared = new float[numWeights];
-    for( int i = 0; i < numWeights; i++ ) {
+    for(int i = 0; i < numWeights; i++) {
         sumGradSquared[i] = 0.0000001f; // should move this into fudgefactor I guess?
         sumUpdateSquared[i] = 0.0000001f; // should move this into fudgefactor I guess?
     }
-    sumGradSquaredWrapper = cl->wrap( numWeights, sumGradSquared );
-    sumUpdateSquaredWrapper = cl->wrap( numWeights, sumUpdateSquared );
+    sumGradSquaredWrapper = cl->wrap(numWeights, sumGradSquared);
+    sumUpdateSquaredWrapper = cl->wrap(numWeights, sumUpdateSquared);
     sumGradSquaredWrapper->copyToDevice();
     sumUpdateSquaredWrapper->copyToDevice();
 }
diff --git a/src/trainers/AdadeltaState.h b/src/trainers/AdadeltaState.h
index 6bd1377a..5c676830 100644
--- a/src/trainers/AdadeltaState.h
+++ b/src/trainers/AdadeltaState.h
@@ -37,7 +37,7 @@ class DeepCL_EXPORT AdadeltaState : public TrainerState {
     // ]]]
     // generated, using cog:
     VIRTUAL ~AdadeltaState();
-    AdadeltaState( EasyCL *cl, int numWeights );
+    AdadeltaState(EasyCL *cl, int numWeights);
     // [[[end]]]
 };
diff --git a/src/trainers/AdadeltaStateMaker.cpp b/src/trainers/AdadeltaStateMaker.cpp
index e2168330..880cea87 100644
--- a/src/trainers/AdadeltaStateMaker.cpp
+++ b/src/trainers/AdadeltaStateMaker.cpp
@@ -16,11 +16,11 @@
 using namespace std;
 #define STATIC
 #define VIRTUAL
-TrainerState *AdadeltaStateMaker::instance( EasyCL *cl, int numWeights ) {
-    AdadeltaState *state = new AdadeltaState( cl, numWeights );
+TrainerState *AdadeltaStateMaker::instance(EasyCL *cl, int numWeights) {
+    AdadeltaState *state = new AdadeltaState(cl, numWeights);
     return state;
 }
-VIRTUAL bool AdadeltaStateMaker::created( TrainerState *state ) {
+VIRTUAL bool AdadeltaStateMaker::created(TrainerState *state) {
     return dynamic_cast< AdadeltaState * >(state) != 0;
 }
diff --git a/src/trainers/AdadeltaStateMaker.h b/src/trainers/AdadeltaStateMaker.h
index 04bf17f2..ab44be75 100644
--- a/src/trainers/AdadeltaStateMaker.h
+++ b/src/trainers/AdadeltaStateMaker.h
@@ -24,8 +24,8 @@ class AdadeltaStateMaker : public TrainerStateMaker {
     // cog_addheaders.add()
     // ]]]
     // generated, using cog:
-    TrainerState *instance( EasyCL *cl, int numWeights );
-    VIRTUAL bool created( TrainerState *state );
+    TrainerState *instance(EasyCL *cl, int numWeights);
+    VIRTUAL bool created(TrainerState *state);
     // [[[end]]]
 };
diff --git a/src/trainers/Adagrad.cpp b/src/trainers/Adagrad.cpp
index e5f56926..f50da674 100644
--- a/src/trainers/Adagrad.cpp
+++ b/src/trainers/Adagrad.cpp
@@ -30,25 +30,25 @@
 using namespace std;
 VIRTUAL Adagrad::~Adagrad() {
 }
-VIRTUAL void Adagrad::setFudgeFactor( float fudgeFactor ) {
+VIRTUAL void Adagrad::setFudgeFactor(float fudgeFactor) {
     this->fudgeFactor = fudgeFactor;
 }
 VIRTUAL std::string Adagrad::asString() {
-    return "Adagrad{ learningRate=" + toString( learningRate ) + ", fudgeFactor=" +
-        toString( fudgeFactor ) + " }"; // if you have a better name, let me know :-)
+    return "Adagrad{ learningRate=" + toString(learningRate) + ", fudgeFactor=" +
+        toString(fudgeFactor) + " }"; // if you have a better name, let me know :-)
 }
-VIRTUAL void Adagrad::updateWeights( CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
-        AdagradState *trainerState ) {
+VIRTUAL void Adagrad::updateWeights(CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
+        AdagradState *trainerState) {
     int numWeights = trainerState->numWeights;
     float *working = new float[ numWeights ];
-    CLWrapper *workingWrapper = cl->wrap( numWeights, working );
+    CLWrapper *workingWrapper = cl->wrap(numWeights, working);
     workingWrapper->createOnDevice();
-    CLMathWrapper clWeights( weightsWrapper );
-    CLMathWrapper clGradWeights( gradWeightsWrapper );
-    CLMathWrapper clSumSquares( trainerState->sumSquaresWrapper );
-    CLMathWrapper clWorking( workingWrapper );
+    CLMathWrapper clWeights(weightsWrapper);
+    CLMathWrapper clGradWeights(gradWeightsWrapper);
+    CLMathWrapper clSumSquares(trainerState->sumSquaresWrapper);
+    CLMathWrapper clWorking(workingWrapper);
     // following all happens on gpu, via clmathwrapper:
     clWorking = clGradWeights;
@@ -65,56 +65,56 @@ VIRTUAL void Adagrad::updateWeights( CLWrapper *weightsWrapper, CLWrapper *gradW
     delete workingWrapper;
     delete[] working;
 }
-VIRTUAL BatchResult Adagrad::train( NeuralNet *net, TrainingContext *context,
-    float const*input, OutputData *outputData ) {
+VIRTUAL BatchResult Adagrad::train(NeuralNet *net, TrainingContext *context,
+    float const*input, OutputData *outputData) {
     // learns one batch, including updating weights
     // doesn't have to think about running multiple batches,
     // or loading data, or anything like that
-    bindState( net );
+    bindState(net);
-    net->forward( input );
-    int numRight = net->calcNumRight( outputData );
-    float loss = net->calcLoss( outputData );
-    net->backward( outputData );
+    net->forward(input);
+    int numRight = net->calcNumRight(outputData);
+    float loss = net->calcLoss(outputData);
+    net->backward(outputData);
     int numLayers = net->getNumLayers();
-    for( int layerIdx = numLayers - 2; layerIdx > 0; layerIdx-- ) {
-        Layer *layer = net->getLayer( layerIdx );
-        if( !layer->needsBackProp() ) {
+    for(int layerIdx = numLayers - 2; layerIdx > 0; layerIdx--) {
+        Layer *layer = net->getLayer(layerIdx);
+        if(!layer->needsBackProp()) {
             break;
         }
-        if( layer->needsTrainerState() ) {
-            updateWeights( layer->getWeightsWrapper(), layer->getGradWeightsWrapper(),
-                dynamic_cast< AdagradState * >( layer->getTrainerState() ) );
-            if( layer->biased() ) {
-                updateWeights( layer->getBiasWrapper(), layer->getGradBiasWrapper(),
-                    dynamic_cast< AdagradState * >( layer->getBiasTrainerState() ) );
+        if(layer->needsTrainerState()) {
+            updateWeights(layer->getWeightsWrapper(), layer->getGradWeightsWrapper(),
+                dynamic_cast< AdagradState * >(layer->getTrainerState()) );
+            if(layer->biased()) {
+                updateWeights(layer->getBiasWrapper(), layer->getGradBiasWrapper(),
+                    dynamic_cast< AdagradState * >(layer->getBiasTrainerState()) );
             }
         }
     }
-    return BatchResult( loss, numRight );
+    return BatchResult(loss, numRight);
 }
-VIRTUAL BatchResult Adagrad::train( NeuralNet *net, TrainingContext *context,
-    float const*input, float const*expectedOutput ) {
-    ExpectedData expectedData( net, expectedOutput );
-    return this->train( net, context, input, &expectedData );
+VIRTUAL BatchResult Adagrad::train(NeuralNet *net, TrainingContext *context,
+    float const*input, float const*expectedOutput) {
+    ExpectedData expectedData(net, expectedOutput);
+    return this->train(net, context, input, &expectedData);
 }
-VIRTUAL BatchResult Adagrad::trainFromLabels( NeuralNet *net, TrainingContext *context,
-    float const*input, int const*labels ) {
-    LabeledData labeledData( net, labels );
-    return this->train( net, context, input, &labeledData );
+VIRTUAL BatchResult Adagrad::trainFromLabels(NeuralNet *net, TrainingContext *context,
+    float const*input, int const*labels) {
+    LabeledData labeledData(net, labels);
+    return this->train(net, context, input, &labeledData);
 }
-VIRTUAL void Adagrad::bindState( NeuralNet *net ) {
-    AdagradStateMaker stateMaker( fudgeFactor );
-    this->_bindState( net, &stateMaker );
+VIRTUAL void Adagrad::bindState(NeuralNet *net) {
+    AdagradStateMaker stateMaker(fudgeFactor);
+    this->_bindState(net, &stateMaker);
 }
-STATIC Adagrad *Adagrad::instance( EasyCL *cl, float learningRate ) {
-    Adagrad *sgd = new Adagrad( cl );
-    sgd->setLearningRate( learningRate );
+STATIC Adagrad *Adagrad::instance(EasyCL *cl, float learningRate) {
+    Adagrad *sgd = new Adagrad(cl);
+    sgd->setLearningRate(learningRate);
     return sgd;
 }
-Adagrad::Adagrad( EasyCL *cl ) :
-    Trainer( cl ),
-    fudgeFactor( 0.000001f ) {
+Adagrad::Adagrad(EasyCL *cl) :
+    Trainer(cl),
+    fudgeFactor(0.000001f) {
 }
diff --git a/src/trainers/Adagrad.h b/src/trainers/Adagrad.h
index ae3eb342..7f4a0909 100644
--- a/src/trainers/Adagrad.h
+++ b/src/trainers/Adagrad.h
@@ -35,19 +35,19 @@ class DeepCL_EXPORT Adagrad : public Trainer{
     // ]]]
     // generated, using cog:
     VIRTUAL ~Adagrad();
-    VIRTUAL void setFudgeFactor( float fudgeFactor );
+    VIRTUAL void setFudgeFactor(float fudgeFactor);
     VIRTUAL std::string asString();
-    VIRTUAL void updateWeights( CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
-        AdagradState *trainerState );
-    VIRTUAL BatchResult train( NeuralNet *net, TrainingContext *context,
-        float const*input, OutputData *outputData );
-    VIRTUAL BatchResult train( NeuralNet *net, TrainingContext *context,
-        float const*input, float const*expectedOutput );
-    VIRTUAL BatchResult trainFromLabels( NeuralNet *net, TrainingContext *context,
-        float const*input, int const*labels );
-    VIRTUAL void bindState( NeuralNet *net );
-    STATIC Adagrad *instance( EasyCL *cl, float learningRate );
-    Adagrad( EasyCL *cl );
+    VIRTUAL void updateWeights(CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
+        AdagradState *trainerState);
+    VIRTUAL BatchResult train(NeuralNet *net, TrainingContext *context,
+        float const*input, OutputData *outputData);
+    VIRTUAL BatchResult train(NeuralNet *net, TrainingContext *context,
+        float const*input, float const*expectedOutput);
+    VIRTUAL BatchResult trainFromLabels(NeuralNet *net, TrainingContext *context,
+        float const*input, int const*labels);
+    VIRTUAL void bindState(NeuralNet *net);
+    STATIC Adagrad *instance(EasyCL *cl, float learningRate);
+    Adagrad(EasyCL *cl);
     // [[[end]]]
 };
diff --git a/src/trainers/AdagradState.cpp b/src/trainers/AdagradState.cpp
index 7bcf15e2..76e12448 100644
--- a/src/trainers/AdagradState.cpp
+++ b/src/trainers/AdagradState.cpp
@@ -22,13 +22,13 @@ VIRTUAL AdagradState::~AdagradState() {
     delete[] sumSquares;
 }
-AdagradState::AdagradState( EasyCL *cl, int numWeights, float fudgeFactor ) :
-    numWeights( numWeights ) {
+AdagradState::AdagradState(EasyCL *cl, int numWeights, float fudgeFactor) :
+    numWeights(numWeights) {
     sumSquares = new float[numWeights];
-    for( int i = 0; i < numWeights; i++ ) {
+    for(int i = 0; i < numWeights; i++) {
         sumSquares[i] = fudgeFactor;
     }
-    sumSquaresWrapper = cl->wrap( numWeights, sumSquares );
+    sumSquaresWrapper = cl->wrap(numWeights, sumSquares);
     sumSquaresWrapper->copyToDevice();
 }
diff --git a/src/trainers/AdagradState.h b/src/trainers/AdagradState.h
index 63f7713e..ae6cebee 100644
--- a/src/trainers/AdagradState.h
+++ b/src/trainers/AdagradState.h
@@ -43,7 +43,7 @@ class DeepCL_EXPORT AdagradState : public TrainerState {
     // ]]]
     // generated, using cog:
     VIRTUAL ~AdagradState();
-    AdagradState( EasyCL *cl, int numWeights, float fudgeFactor );
+    AdagradState(EasyCL *cl, int numWeights, float fudgeFactor);
     // [[[end]]]
 };
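// The hunks above elide the middle of Adagrad::updateWeights; assuming the
// standard Adagrad rule, the per-weight computation the sumSquares buffer
// supports looks like this (fudgeFactor seeds sumSquares in AdagradState,
// keeping the divisor non-zero). Hypothetical helper, not DeepCL API:
#include <cmath>

void adagradStep(float &weight, float grad, float &sumSquares, float learningRate) {
    sumSquares += grad * grad;                              // lifetime sum of squared gradients
    weight -= learningRate * grad / std::sqrt(sumSquares);  // per-weight scaled step
}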
diff --git a/src/trainers/AdagradStateMaker.cpp b/src/trainers/AdagradStateMaker.cpp
index 295c7ca3..adc04ad1 100644
--- a/src/trainers/AdagradStateMaker.cpp
+++ b/src/trainers/AdagradStateMaker.cpp
@@ -16,14 +16,14 @@
 using namespace std;
 #define STATIC
 #define VIRTUAL
-AdagradStateMaker::AdagradStateMaker( float fudgeFactor ) {
+AdagradStateMaker::AdagradStateMaker(float fudgeFactor) {
     this->fudgeFactor = fudgeFactor;
 }
-TrainerState *AdagradStateMaker::instance( EasyCL *cl, int numWeights ) {
-    AdagradState *state = new AdagradState( cl, numWeights, fudgeFactor );
+TrainerState *AdagradStateMaker::instance(EasyCL *cl, int numWeights) {
+    AdagradState *state = new AdagradState(cl, numWeights, fudgeFactor);
     return state;
 }
-VIRTUAL bool AdagradStateMaker::created( TrainerState *state ) {
+VIRTUAL bool AdagradStateMaker::created(TrainerState *state) {
     return dynamic_cast< AdagradState * >(state) != 0;
 }
diff --git a/src/trainers/AdagradStateMaker.h b/src/trainers/AdagradStateMaker.h
index aafde89c..fdda71df 100644
--- a/src/trainers/AdagradStateMaker.h
+++ b/src/trainers/AdagradStateMaker.h
@@ -25,9 +25,9 @@ class AdagradStateMaker : public TrainerStateMaker {
     // cog_addheaders.add()
     // ]]]
     // generated, using cog:
-    AdagradStateMaker( float fudgeFactor );
-    TrainerState *instance( EasyCL *cl, int numWeights );
-    VIRTUAL bool created( TrainerState *state );
+    AdagradStateMaker(float fudgeFactor);
+    TrainerState *instance(EasyCL *cl, int numWeights);
+    VIRTUAL bool created(TrainerState *state);
     // [[[end]]]
 };
diff --git a/src/trainers/Annealer.cpp b/src/trainers/Annealer.cpp
index 4b81cec3..241c97a4 100644
--- a/src/trainers/Annealer.cpp
+++ b/src/trainers/Annealer.cpp
@@ -24,19 +24,19 @@
 using namespace std;
 #define STATIC
 #define VIRTUAL
-STATIC Annealer *Annealer::instance( EasyCL *cl, float learningRate, float anneal ) {
-    Annealer *annealer = new Annealer( cl );
-    annealer->setLearningRate( learningRate );
-    annealer->setAnneal( anneal );
+STATIC Annealer *Annealer::instance(EasyCL *cl, float learningRate, float anneal) {
+    Annealer *annealer = new Annealer(cl);
+    annealer->setLearningRate(learningRate);
+    annealer->setAnneal(anneal);
     return annealer;
 }
-Annealer::Annealer( EasyCL *cl ) :
-    Trainer( cl ) {
+Annealer::Annealer(EasyCL *cl) :
+    Trainer(cl) {
     anneal = 1.0f;
 //    epoch = -1;
-//    copyBuffer = new CopyBuffer( cl );
-//    gpuAdd = new GpuAdd( cl );
-//    multiplyInPlace = new MultiplyInPlace( cl );
+//    copyBuffer = new CopyBuffer(cl);
+//    gpuAdd = new GpuAdd(cl);
+//    multiplyInPlace = new MultiplyInPlace(cl);
 }
 VIRTUAL Annealer::~Annealer() {
 //    delete copyBuffer;
@@ -44,26 +44,26 @@ VIRTUAL Annealer::~Annealer() {
 //    delete multiplyInPlace;
 }
 VIRTUAL std::string Annealer::asString() {
-    return "Annealer{ learningRate=" + toString( learningRate ) + ", anneal=" +
-        toString( anneal ) + " }";
+    return "Annealer{ learningRate=" + toString(learningRate) + ", anneal=" +
+        toString(anneal) + " }";
 }
-VIRTUAL void Annealer::setAnneal( float anneal ) {
+VIRTUAL void Annealer::setAnneal(float anneal) {
     this->anneal = anneal;
 }
-VIRTUAL void Annealer::updateWeights( float annealedLearningRate, CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper ) {
+VIRTUAL void Annealer::updateWeights(float annealedLearningRate, CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper) {
     // hmmmm, so all we need to do is calculate:
-    // annealedLearningRate = learningRate * pow( anneal, epoch )
+    // annealedLearningRate = learningRate * pow(anneal, epoch)
     // weightsWrapper = weightsWrapper - annealedLearningRate * gradWeightsWrapper
     int numWeights = weightsWrapper->size();
     float *gradWeightsCopy = new float[ numWeights ];
-    CLWrapper *gradWeightsCopyWrapper = cl->wrap( numWeights, gradWeightsCopy );
+    CLWrapper *gradWeightsCopyWrapper = cl->wrap(numWeights, gradWeightsCopy);
     gradWeightsCopyWrapper->createOnDevice();
-    CLMathWrapper gradWeights_( gradWeightsWrapper );
-    CLMathWrapper gradWeightsCopy_( gradWeightsCopyWrapper );
-    CLMathWrapper weights_( weightsWrapper );
+    CLMathWrapper gradWeights_(gradWeightsWrapper);
+    CLMathWrapper gradWeightsCopy_(gradWeightsCopyWrapper);
+    CLMathWrapper weights_(weightsWrapper);
     // following all happens on gpu, via CLMathWrapper:
     gradWeightsCopy_ = gradWeights_;
@@ -75,60 +75,60 @@
 }
 VIRTUAL BatchResult Annealer::train( NeuralNet *net, TrainingContext *context,
-    float const *input, OutputData *outputData ) {
+    float const *input, OutputData *outputData) {
     // hmmmm, so all we need to do is calculate:
-    // annealedLearningRate = learningRate * pow( anneal, epoch )
+    // annealedLearningRate = learningRate * pow(anneal, epoch)
     // weightsWrapper = weightsWrapper - annealedLearningRate * gradWeightsWrapper
 //    cout << " epoch=" << epoch << " learningrate=" << learningRate << " anneal=" << anneal << endl;
-    float annealedLearningRate = learningRate * pow( anneal, context->epoch );
-    if( context->batch == 0 ) {
+    float annealedLearningRate = learningRate * pow(anneal, context->epoch);
+    if(context->batch == 0) {
         cout << "Annealer annealedLearningRate=" << annealedLearningRate << endl;
     }
-    bindState( net );
+    bindState(net);
-    net->forward( input );
-    int numRight = net->calcNumRight( outputData );
-    float loss = net->calcLoss( outputData );
-    net->backward( outputData );
+    net->forward(input);
+    int numRight = net->calcNumRight(outputData);
+    float loss = net->calcLoss(outputData);
+    net->backward(outputData);
     int numLayers = net->getNumLayers();
-    for( int layerIdx = numLayers - 2; layerIdx > 0; layerIdx-- ) {
-        Layer *layer = net->getLayer( layerIdx );
-        if( !layer->needsBackProp() ) {
+    for(int layerIdx = numLayers - 2; layerIdx > 0; layerIdx--) {
+        Layer *layer = net->getLayer(layerIdx);
+        if(!layer->needsBackProp()) {
             break;
         }
-        if( layer->needsTrainerState() ) {
-            updateWeights( annealedLearningRate, layer->getWeightsWrapper(), layer->getGradWeightsWrapper() );
-            if( layer->biased() ) {
-                updateWeights( annealedLearningRate, layer->getBiasWrapper(), layer->getGradBiasWrapper() );
+        if(layer->needsTrainerState()) {
+            updateWeights(annealedLearningRate, layer->getWeightsWrapper(), layer->getGradWeightsWrapper());
+            if(layer->biased()) {
+                updateWeights(annealedLearningRate, layer->getBiasWrapper(), layer->getGradBiasWrapper());
            }
         }
     }
-    return BatchResult( loss, numRight );
+    return BatchResult(loss, numRight);
 }
-VIRTUAL BatchResult Annealer::train( NeuralNet *net, TrainingContext *context,
-    float const*input, float const*expectedOutput ) {
-    ExpectedData expectedData( net, expectedOutput );
-    return this->train( net, context, input, &expectedData );
+VIRTUAL BatchResult Annealer::train(NeuralNet *net, TrainingContext *context,
+    float const*input, float const*expectedOutput) {
+    ExpectedData expectedData(net, expectedOutput);
+    return this->train(net, context, input, &expectedData);
 }
-VIRTUAL BatchResult Annealer::trainFromLabels( NeuralNet *net, TrainingContext *context,
-    float const*input, int const*labels ) {
-    LabeledData labeledData( net, labels );
-    return this->train( net, context, input, &labeledData );
+VIRTUAL BatchResult Annealer::trainFromLabels(NeuralNet *net, TrainingContext *context,
+    float const*input, int const*labels) {
+    LabeledData labeledData(net, labels);
+    return this->train(net, context, input, &labeledData);
 }
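// The schedule computed in train() above, as a standalone function; e.g. with
// learningRate=0.1 and anneal=0.95, epoch 0 gives 0.1 and epoch 10 gives ~0.0599:
#include <cmath>

float annealedRate(float learningRate, float anneal, int epoch) {
    return learningRate * std::pow(anneal, (float)epoch); // zero-based epoch
}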
-VIRTUAL void Annealer::bindState( NeuralNet *net ) {
+VIRTUAL void Annealer::bindState(NeuralNet *net) {
     // since we have no state, all we will do is strip any existing state,
     // so that if another trainer trains the net, it won't come across
     // some stale state
-    for( int layerIdx = 0; layerIdx < net->getNumLayers(); layerIdx++ ) {
-        Layer *layer = net->getLayer( layerIdx );
-        if( layer->needsTrainerState() ) {
+    for(int layerIdx = 0; layerIdx < net->getNumLayers(); layerIdx++) {
+        Layer *layer = net->getLayer(layerIdx);
+        if(layer->needsTrainerState()) {
             TrainerState *state = layer->getTrainerState();
-            if( state != 0 ) {
-                layer->setTrainerState( 0 );
+            if(state != 0) {
+                layer->setTrainerState(0);
             }
         }
     }
diff --git a/src/trainers/Annealer.h b/src/trainers/Annealer.h
index 2e6d9119..3208c5dc 100644
--- a/src/trainers/Annealer.h
+++ b/src/trainers/Annealer.h
@@ -24,7 +24,7 @@ class OutputData;
 #define STATIC static
 // anneals learning, so actual learning rate =
-//    learning rate * pow( anneal, epoch )
+//    learning rate * pow(anneal, epoch)
 // (for zero-based epoch number)
 class DeepCL_EXPORT Annealer : public Trainer {
 public:
@@ -40,20 +40,20 @@ class DeepCL_EXPORT Annealer : public Trainer {
     // cog_addheaders.add()
     // ]]]
     // generated, using cog:
-    STATIC Annealer *instance( EasyCL *cl, float learningRate, float anneal );
-    Annealer( EasyCL *cl );
+    STATIC Annealer *instance(EasyCL *cl, float learningRate, float anneal);
+    Annealer(EasyCL *cl);
     VIRTUAL ~Annealer();
     VIRTUAL std::string asString();
-    VIRTUAL void setAnneal( float anneal );
-    VIRTUAL void updateWeights( float annealedLearningRate, CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper );
+    VIRTUAL void setAnneal(float anneal);
+    VIRTUAL void updateWeights(float annealedLearningRate, CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper);
     VIRTUAL BatchResult train( NeuralNet *net, TrainingContext *context,
-        float const *input, OutputData *outputData );
-    VIRTUAL BatchResult train( NeuralNet *net, TrainingContext *context,
-        float const*input, float const*expectedOutput );
-    VIRTUAL BatchResult trainFromLabels( NeuralNet *net, TrainingContext *context,
-        float const*input, int const*labels );
-    VIRTUAL void bindState( NeuralNet *net );
+        float const *input, OutputData *outputData);
+    VIRTUAL BatchResult train(NeuralNet *net, TrainingContext *context,
+        float const*input, float const*expectedOutput);
+    VIRTUAL BatchResult trainFromLabels(NeuralNet *net, TrainingContext *context,
+        float const*input, int const*labels);
+    VIRTUAL void bindState(NeuralNet *net);
     // [[[end]]]
 };
diff --git a/src/trainers/Nesterov.cpp b/src/trainers/Nesterov.cpp
index 46837017..bb7161f8 100644
--- a/src/trainers/Nesterov.cpp
+++ b/src/trainers/Nesterov.cpp
@@ -28,23 +28,23 @@
 using namespace std;
 VIRTUAL Nesterov::~Nesterov() {
 }
-VIRTUAL void Nesterov::setMomentum( float momentum ) {
+VIRTUAL void Nesterov::setMomentum(float momentum) {
     this->momentum = momentum;
 }
 VIRTUAL std::string Nesterov::asString() {
-    return "Nesterov{ learningRate=" + toString( learningRate ) + ", momentum=" +
-        toString( momentum ) + " }";
+    return "Nesterov{ learningRate=" + toString(learningRate) + ", momentum=" +
+        toString(momentum) + " }";
 }
 VIRTUAL void Nesterov::loadFutureWeights( CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
-        NesterovState *trainerState ) {
+        NesterovState *trainerState) {
     // this will save the old weights, into the trainerState,
     // and then add mom * dweights to them
     // create CLMathWrapper objects, so we can do per-element maths on the gpu:
-    CLMathWrapper clOldWeights( trainerState->oldWeightsWrapper );
-    CLMathWrapper clWeights( weightsWrapper );
-    CLMathWrapper clGradWeights( gradWeightsWrapper );
+    CLMathWrapper clOldWeights(trainerState->oldWeightsWrapper);
+    CLMathWrapper clWeights(weightsWrapper);
+    CLMathWrapper clGradWeights(gradWeightsWrapper);
     // following happens on the gpu:
     clOldWeights = clWeights;
@@ -52,22 +52,22 @@ VIRTUAL void Nesterov::loadFutureWeights(
     clWeights *= momentum;
     clWeights += clOldWeights;
 }
-VIRTUAL void Nesterov::updateWeights( CLWrapper *weightsWrapper,
+VIRTUAL void Nesterov::updateWeights(CLWrapper *weightsWrapper,
         CLWrapper *gradWeightsWrapper,
-        NesterovState *trainerState ) {
-    // we have: gradWeights = gradient( weights[t] + mom * dweights[t] )
+        NesterovState *trainerState) {
+    // we have: gradWeights = gradient(weights[t] + mom * dweights[t])
     //          trainerState->oldWeights = weights[t]
     //          trainerState->lastUpdate = dweights[t]
     // and so we can calculate
     //    dweights[t+1] = mom * dweights[t] - learningrate * gradient(
-    //                        weights[t] + mom * dweights[t] )
+    //                        weights[t] + mom * dweights[t])
     //    weights[t+1] = weights[t] + dweights[t+1]
     // create CLMathWrapper objects, so we can do per-element maths on the gpu:
-    CLMathWrapper clLastUpdate( trainerState->lastUpdateWrapper );
-    CLMathWrapper clOldWeights( trainerState->oldWeightsWrapper );
-    CLMathWrapper clGradWeights( gradWeightsWrapper );
-    CLMathWrapper clWeights( weightsWrapper );
+    CLMathWrapper clLastUpdate(trainerState->lastUpdateWrapper);
+    CLMathWrapper clOldWeights(trainerState->oldWeightsWrapper);
+    CLMathWrapper clGradWeights(gradWeightsWrapper);
+    CLMathWrapper clWeights(weightsWrapper);
     // following happens on the gpu, via CLMathWrapper:
@@ -79,95 +79,95 @@
 }
 VIRTUAL BatchResult Nesterov::train( NeuralNet *net, TrainingContext *context,
-    float const *input, OutputData *outputData ) {
+    float const *input, OutputData *outputData) {
     // learns one batch, including updating weights
     // doesn't have to think about running multiple batches,
     // or loading data, or anything like that
     //    dweights[t+1] = mom * dweights[t] - learningrate * gradient(
-    //                        weights[t] + mom * dweights[t] )
+    //                        weights[t] + mom * dweights[t])
     //    weights[t+1] = weights[t] + dweights[t+1]
     //
     // given weights[t], dweights[t]:
     //    forward/backprop weights[t] + mom * dweights[t]
     //    => calc dweights[t+1]
     //    => calc weights[t+1]
-    bindState( net );
+    bindState(net);
     // first, substitute weights + mom * dweights into the weights
     // calculate them first
     // save old weights first I suppose?
     int numLayers = net->getNumLayers();
-    for( int layerIdx = numLayers - 2; layerIdx > 0; layerIdx-- ) {
-        Layer *layer = net->getLayer( layerIdx );
-        if( !layer->needsBackProp() ) {
+    for(int layerIdx = numLayers - 2; layerIdx > 0; layerIdx--) {
+        Layer *layer = net->getLayer(layerIdx);
+        if(!layer->needsBackProp()) {
             break;
         }
-        if( layer->needsTrainerState() ) {
-            loadFutureWeights( layer->getWeightsWrapper(), layer->getGradWeightsWrapper(),
-                dynamic_cast< NesterovState * >( layer->getTrainerState() ) );
-            if( layer->biased() ) {
-                loadFutureWeights( layer->getBiasWrapper(), layer->getGradBiasWrapper(),
-                    dynamic_cast< NesterovState * >( layer->getBiasTrainerState() ) );
+        if(layer->needsTrainerState()) {
+            loadFutureWeights(layer->getWeightsWrapper(), layer->getGradWeightsWrapper(),
+                dynamic_cast< NesterovState * >(layer->getTrainerState()) );
+            if(layer->biased()) {
+                loadFutureWeights(layer->getBiasWrapper(), layer->getGradBiasWrapper(),
+                    dynamic_cast< NesterovState * >(layer->getBiasTrainerState()) );
             }
         }
     }
     // now, we have loaded in weights + mom * dweights into the weights
     // do forward/backward:
-    net->forward( input );
-    int numRight = net->calcNumRight( outputData );
-    float loss = net->calcLoss( outputData );
-    net->backward( outputData );
+    net->forward(input);
+    int numRight = net->calcNumRight(outputData);
+    float loss = net->calcLoss(outputData);
+    net->backward(outputData);
     // now, calculate the new weights
-    for( int layerIdx = numLayers - 2; layerIdx > 0; layerIdx-- ) {
-        Layer *layer = net->getLayer( layerIdx );
-        if( !layer->needsBackProp() ) {
+    for(int layerIdx = numLayers - 2; layerIdx > 0; layerIdx--) {
+        Layer *layer = net->getLayer(layerIdx);
+        if(!layer->needsBackProp()) {
             break;
         }
-        if( layer->needsTrainerState() ) {
-            updateWeights( layer->getWeightsWrapper(), layer->getGradWeightsWrapper(),
-                dynamic_cast< NesterovState * >( layer->getTrainerState() ) );
-            if( layer->biased() ) {
-                updateWeights( layer->getBiasWrapper(), layer->getGradBiasWrapper(),
-                    dynamic_cast< NesterovState * >( layer->getBiasTrainerState() ) );
+        if(layer->needsTrainerState()) {
+            updateWeights(layer->getWeightsWrapper(), layer->getGradWeightsWrapper(),
+                dynamic_cast< NesterovState * >(layer->getTrainerState()) );
+            if(layer->biased()) {
+                updateWeights(layer->getBiasWrapper(), layer->getGradBiasWrapper(),
+                    dynamic_cast< NesterovState * >(layer->getBiasTrainerState()) );
             }
         }
     }
-    return BatchResult( loss, numRight );
+    return BatchResult(loss, numRight);
 }
-VIRTUAL BatchResult Nesterov::train( NeuralNet *net, TrainingContext *context,
-    float const*input, float const*expectedOutput ) {
+VIRTUAL BatchResult Nesterov::train(NeuralNet *net, TrainingContext *context,
+    float const*input, float const*expectedOutput) {
-    ExpectedData expectedData( net, expectedOutput );
-    return this->train( net, context, input, &expectedData );
+    ExpectedData expectedData(net, expectedOutput);
+    return this->train(net, context, input, &expectedData);
 }
-VIRTUAL BatchResult Nesterov::trainFromLabels( NeuralNet *net, TrainingContext *context,
-    float const*input, int const*labels ) {
+VIRTUAL BatchResult Nesterov::trainFromLabels(NeuralNet *net, TrainingContext *context,
+    float const*input, int const*labels) {
-    LabeledData labeledData( net, labels );
-    return this->train( net, context, input, &labeledData );
+    LabeledData labeledData(net, labels);
+    return this->train(net, context, input, &labeledData);
 }
-VIRTUAL void Nesterov::bindState( NeuralNet *net ) {
+VIRTUAL void Nesterov::bindState(NeuralNet *net) {
     NesterovStateMaker stateMaker;
-    this->_bindState( net, &stateMaker );
+    this->_bindState(net, &stateMaker);
 }
-STATIC Nesterov *Nesterov::instance( EasyCL *cl, float learningRate ) {
-    Nesterov *sgd = new Nesterov( cl );
-    sgd->setLearningRate( learningRate );
+STATIC Nesterov *Nesterov::instance(EasyCL *cl, float learningRate) {
+    Nesterov *sgd = new Nesterov(cl);
+    sgd->setLearningRate(learningRate);
     return sgd;
 }
-STATIC Nesterov *Nesterov::instance( EasyCL *cl, float learningRate, float momentum ) {
-    Nesterov *sgd = new Nesterov( cl );
-    sgd->setLearningRate( learningRate );
-    sgd->setMomentum( momentum );
+STATIC Nesterov *Nesterov::instance(EasyCL *cl, float learningRate, float momentum) {
+    Nesterov *sgd = new Nesterov(cl);
+    sgd->setLearningRate(learningRate);
+    sgd->setMomentum(momentum);
     return sgd;
 }
-Nesterov::Nesterov( EasyCL *cl ) :
-    Trainer( cl ),
-    momentum( 0.0f ) {
+Nesterov::Nesterov(EasyCL *cl) :
+    Trainer(cl),
+    momentum(0.0f) {
 }
diff --git a/src/trainers/Nesterov.h b/src/trainers/Nesterov.h
index a9d2c52d..c804728f 100644
--- a/src/trainers/Nesterov.h
+++ b/src/trainers/Nesterov.h
@@ -22,7 +22,7 @@ class NesterovState;
 // implements Nesterov momentum
 // Nesterov momentum defined eg in http://www.cs.toronto.edu/~gdahl/papers/momentumNesterovDeepLearning.pdf
-// dweights[t+1] = mom * dweights[t] - learningrate * gradient( weights[t] + mom * dweights[t] )
+// dweights[t+1] = mom * dweights[t] - learningrate * gradient(weights[t] + mom * dweights[t])
 // weights[t+1] = weights[t] + dweights[t+1]
 //
 // given weights[t], dweights[t]:
@@ -40,25 +40,25 @@ class DeepCL_EXPORT Nesterov : public Trainer {
     // ]]]
     // generated, using cog:
     VIRTUAL ~Nesterov();
-    VIRTUAL void setMomentum( float momentum );
+    VIRTUAL void setMomentum(float momentum);
     VIRTUAL std::string asString();
     VIRTUAL void loadFutureWeights( CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
-        NesterovState *trainerState );
-    VIRTUAL void updateWeights( CLWrapper *weightsWrapper,
+        NesterovState *trainerState);
+    VIRTUAL void updateWeights(CLWrapper *weightsWrapper,
         CLWrapper *gradWeightsWrapper,
-        NesterovState *trainerState );
+        NesterovState *trainerState);
     VIRTUAL BatchResult train( NeuralNet *net, TrainingContext *context,
-        float const *input, OutputData *outputData );
-    VIRTUAL BatchResult train( NeuralNet *net, TrainingContext *context,
-        float const*input, float const*expectedOutput );
-    VIRTUAL BatchResult trainFromLabels( NeuralNet *net, TrainingContext *context,
-        float const*input, int const*labels );
-    VIRTUAL void bindState( NeuralNet *net );
-    STATIC Nesterov *instance( EasyCL *cl, float learningRate );
-    STATIC Nesterov *instance( EasyCL *cl, float learningRate, float momentum );
-    Nesterov( EasyCL *cl );
+        float const *input, OutputData *outputData);
+    VIRTUAL BatchResult train(NeuralNet *net, TrainingContext *context,
+        float const*input, float const*expectedOutput);
+    VIRTUAL BatchResult trainFromLabels(NeuralNet *net, TrainingContext *context,
+        float const*input, int const*labels);
+    VIRTUAL void bindState(NeuralNet *net);
+    STATIC Nesterov *instance(EasyCL *cl, float learningRate);
+    STATIC Nesterov *instance(EasyCL *cl, float learningRate, float momentum);
+    Nesterov(EasyCL *cl);
     // [[[end]]]
 };
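// Scalar view of the two-phase scheme implemented above (hypothetical names):
// loadFutureWeights() moves the net to the lookahead point before the
// forward/backward pass, updateWeights() then folds the gradient measured
// there into the velocity and the weights.
void nesterovLookahead(float &weight, float &oldWeight, float lastUpdate,
        float momentum) {
    oldWeight = weight;                          // remember weights[t]
    weight = oldWeight + momentum * lastUpdate;  // weights[t] + mom * dweights[t]
}

void nesterovUpdate(float &weight, float oldWeight, float &lastUpdate,
        float gradAtLookahead, float momentum, float learningRate) {
    lastUpdate = momentum * lastUpdate - learningRate * gradAtLookahead; // dweights[t+1]
    weight = oldWeight + lastUpdate;                                     // weights[t+1]
}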
diff --git a/src/trainers/NesterovState.cpp b/src/trainers/NesterovState.cpp
index af2855d9..d94883eb 100644
--- a/src/trainers/NesterovState.cpp
+++ b/src/trainers/NesterovState.cpp
@@ -24,8 +24,8 @@ VIRTUAL NesterovState::~NesterovState() {
     delete[] oldWeightsWrapper;
 }
-NesterovState::NesterovState( EasyCL *cl, int numWeights ) :
-    numWeights( numWeights )
+NesterovState::NesterovState(EasyCL *cl, int numWeights) :
+    numWeights(numWeights)
 {
     // should we handle bias separately? maybe... not?
     // or each layer could have one trainer for biases, and one for the
     // non-biases? Maybe kind of ok?
@@ -33,14 +33,14 @@ NesterovState::NesterovState( EasyCL *cl, int numWeights ) :
     // lastUpdate buffer never needs to change size,
     // since number of weights is invariant with batchSize etc
     lastUpdate = new float[numWeights];
-    for( int i = 0; i < numWeights; i++ ) {
+    for(int i = 0; i < numWeights; i++) {
         lastUpdate[i] = 0.0f;
     }
-    lastUpdateWrapper = cl->wrap( numWeights, lastUpdate );
+    lastUpdateWrapper = cl->wrap(numWeights, lastUpdate);
     lastUpdateWrapper->copyToDevice();
     oldWeights = new float[numWeights];
-    oldWeightsWrapper = cl->wrap( numWeights, oldWeights );
+    oldWeightsWrapper = cl->wrap(numWeights, oldWeights);
     oldWeightsWrapper->createOnDevice();
 }
diff --git a/src/trainers/NesterovState.h b/src/trainers/NesterovState.h
index f892e3c6..8ac6f1cf 100644
--- a/src/trainers/NesterovState.h
+++ b/src/trainers/NesterovState.h
@@ -46,7 +46,7 @@ class DeepCL_EXPORT NesterovState : public TrainerState {
     // ]]]
     // generated, using cog:
     VIRTUAL ~NesterovState();
-    NesterovState( EasyCL *cl, int numWeights );
+    NesterovState(EasyCL *cl, int numWeights);
     // [[[end]]]
 };
diff --git a/src/trainers/NesterovStateMaker.cpp b/src/trainers/NesterovStateMaker.cpp
index fec92781..a07e17ab 100644
--- a/src/trainers/NesterovStateMaker.cpp
+++ b/src/trainers/NesterovStateMaker.cpp
@@ -16,11 +16,11 @@
 using namespace std;
 #define STATIC
 #define VIRTUAL
-TrainerState *NesterovStateMaker::instance( EasyCL *cl, int numWeights ) {
-    NesterovState *sgd = new NesterovState( cl, numWeights );
+TrainerState *NesterovStateMaker::instance(EasyCL *cl, int numWeights) {
+    NesterovState *sgd = new NesterovState(cl, numWeights);
     return sgd;
 }
-VIRTUAL bool NesterovStateMaker::created( TrainerState *state ) {
+VIRTUAL bool NesterovStateMaker::created(TrainerState *state) {
     return dynamic_cast< NesterovState * >(state) != 0;
 }
diff --git a/src/trainers/NesterovStateMaker.h b/src/trainers/NesterovStateMaker.h
index e2370360..661950af 100644
--- a/src/trainers/NesterovStateMaker.h
+++ b/src/trainers/NesterovStateMaker.h
@@ -23,8 +23,8 @@ class NesterovStateMaker : public TrainerStateMaker {
     // cog_addheaders.add()
     // ]]]
     // generated, using cog:
-    TrainerState *instance( EasyCL *cl, int numWeights );
-    VIRTUAL bool created( TrainerState *state );
+    TrainerState *instance(EasyCL *cl, int numWeights);
+    VIRTUAL bool created(TrainerState *state);
     // [[[end]]]
 };
diff --git a/src/trainers/Rmsprop.cpp b/src/trainers/Rmsprop.cpp
index e829d16f..d143045a 100644
--- a/src/trainers/Rmsprop.cpp
+++ b/src/trainers/Rmsprop.cpp
@@ -31,20 +31,20 @@
 using namespace std;
 VIRTUAL Rmsprop::~Rmsprop() {
 }
 VIRTUAL std::string Rmsprop::asString() {
-    return "Rmsprop{ learningRate=" + toString( learningRate ) + " }";
+    return "Rmsprop{ learningRate=" + toString(learningRate) + " }";
 }
-VIRTUAL void Rmsprop::updateWeights( CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
-        RmspropState *trainerState ) {
+VIRTUAL void Rmsprop::updateWeights(CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
+        RmspropState *trainerState) {
     int numWeights = trainerState->numWeights;
     float *working = new float[ numWeights ];
-    CLWrapper *workingWrapper = cl->wrap( numWeights, working );
+    CLWrapper *workingWrapper = cl->wrap(numWeights, working);
     workingWrapper->createOnDevice();
-    CLMathWrapper clWeights( weightsWrapper );
-    CLMathWrapper clGradWeights( gradWeightsWrapper );
-    CLMathWrapper clMeanSquares( trainerState->meanSquareWrapper );
-    CLMathWrapper clWorking( workingWrapper );
+    CLMathWrapper clWeights(weightsWrapper);
+    CLMathWrapper clGradWeights(gradWeightsWrapper);
+    CLMathWrapper clMeanSquares(trainerState->meanSquareWrapper);
+    CLMathWrapper clWorking(workingWrapper);
     // following all happens on gpu, via clmathwrapper:
     clWorking = clGradWeights;
@@ -63,55 +63,55 @@ VIRTUAL void Rmsprop::updateWeights( CLWrapper *weightsWrapper, CLWrapper *gradW
     delete workingWrapper;
     delete[] working;
 }
-VIRTUAL BatchResult Rmsprop::train( NeuralNet *net, TrainingContext *context,
-    float const*input, OutputData *outputData ) {
+VIRTUAL BatchResult Rmsprop::train(NeuralNet *net, TrainingContext *context,
+    float const*input, OutputData *outputData) {
     // learns one batch, including updating weights
     // doesn't have to think about running multiple batches,
     // or loading data, or anything like that
-    bindState( net );
+    bindState(net);
-    net->forward( input );
-    int numRight = net->calcNumRight( outputData );
-    float loss = net->calcLoss( outputData );
-    net->backward( outputData );
+    net->forward(input);
+    int numRight = net->calcNumRight(outputData);
+    float loss = net->calcLoss(outputData);
+    net->backward(outputData);
     int numLayers = net->getNumLayers();
-    for( int layerIdx = numLayers - 2; layerIdx > 0; layerIdx-- ) {
-        Layer *layer = net->getLayer( layerIdx );
-        if( !layer->needsBackProp() ) {
+    for(int layerIdx = numLayers - 2; layerIdx > 0; layerIdx--) {
+        Layer *layer = net->getLayer(layerIdx);
+        if(!layer->needsBackProp()) {
             break;
         }
-        if( layer->needsTrainerState() ) {
-            updateWeights( layer->getWeightsWrapper(), layer->getGradWeightsWrapper(),
-                dynamic_cast< RmspropState * >( layer->getTrainerState() ) );
-            if( layer->biased() ) {
-                updateWeights( layer->getBiasWrapper(), layer->getGradBiasWrapper(),
-                    dynamic_cast< RmspropState * >( layer->getBiasTrainerState() ) );
+        if(layer->needsTrainerState()) {
+            updateWeights(layer->getWeightsWrapper(), layer->getGradWeightsWrapper(),
+                dynamic_cast< RmspropState * >(layer->getTrainerState()) );
+            if(layer->biased()) {
+                updateWeights(layer->getBiasWrapper(), layer->getGradBiasWrapper(),
+                    dynamic_cast< RmspropState * >(layer->getBiasTrainerState()) );
             }
         }
     }
-    return BatchResult( loss, numRight );
+    return BatchResult(loss, numRight);
 }
-VIRTUAL BatchResult Rmsprop::train( NeuralNet *net, TrainingContext *context,
-    float const*input, float const*expectedOutput ) {
-    ExpectedData expectedData( net, expectedOutput );
-    return this->train( net, context, input, &expectedData );
+VIRTUAL BatchResult Rmsprop::train(NeuralNet *net, TrainingContext *context,
+    float const*input, float const*expectedOutput) {
+    ExpectedData expectedData(net, expectedOutput);
+    return this->train(net, context, input, &expectedData);
 }
-VIRTUAL BatchResult Rmsprop::trainFromLabels( NeuralNet *net, TrainingContext *context,
-    float const*input, int const*labels ) {
-    LabeledData labeledData( net, labels );
-    return this->train( net, context, input, &labeledData );
+VIRTUAL BatchResult Rmsprop::trainFromLabels(NeuralNet *net, TrainingContext *context,
+    float const*input, int const*labels) {
+    LabeledData labeledData(net, labels);
+    return this->train(net, context, input, &labeledData);
 }
-VIRTUAL void Rmsprop::bindState( NeuralNet *net ) {
+VIRTUAL void Rmsprop::bindState(NeuralNet
*net) { RmspropStateMaker stateMaker; - this->_bindState( net, &stateMaker ); + this->_bindState(net, &stateMaker); } -STATIC Rmsprop *Rmsprop::instance( EasyCL *cl, float learningRate ) { - Rmsprop *sgd = new Rmsprop( cl ); - sgd->setLearningRate( learningRate ); +STATIC Rmsprop *Rmsprop::instance(EasyCL *cl, float learningRate) { + Rmsprop *sgd = new Rmsprop(cl); + sgd->setLearningRate(learningRate); return sgd; } -Rmsprop::Rmsprop( EasyCL *cl ) : - Trainer( cl ) { +Rmsprop::Rmsprop(EasyCL *cl) : + Trainer(cl) { } diff --git a/src/trainers/Rmsprop.h b/src/trainers/Rmsprop.h index 2e234a91..528c0374 100644 --- a/src/trainers/Rmsprop.h +++ b/src/trainers/Rmsprop.h @@ -35,17 +35,17 @@ class DeepCL_EXPORT Rmsprop : public Trainer{ // generated, using cog: VIRTUAL ~Rmsprop(); VIRTUAL std::string asString(); - VIRTUAL void updateWeights( CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper, - RmspropState *trainerState ); - VIRTUAL BatchResult train( NeuralNet *net, TrainingContext *context, - float const*input, OutputData *outputData ); - VIRTUAL BatchResult train( NeuralNet *net, TrainingContext *context, - float const*input, float const*expectedOutput ); - VIRTUAL BatchResult trainFromLabels( NeuralNet *net, TrainingContext *context, - float const*input, int const*labels ); - VIRTUAL void bindState( NeuralNet *net ); - STATIC Rmsprop *instance( EasyCL *cl, float learningRate ); - Rmsprop( EasyCL *cl ); + VIRTUAL void updateWeights(CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper, + RmspropState *trainerState); + VIRTUAL BatchResult train(NeuralNet *net, TrainingContext *context, + float const*input, OutputData *outputData); + VIRTUAL BatchResult train(NeuralNet *net, TrainingContext *context, + float const*input, float const*expectedOutput); + VIRTUAL BatchResult trainFromLabels(NeuralNet *net, TrainingContext *context, + float const*input, int const*labels); + VIRTUAL void bindState(NeuralNet *net); + STATIC Rmsprop *instance(EasyCL *cl, float learningRate); + Rmsprop(EasyCL *cl); // [[[end]]] }; diff --git a/src/trainers/RmspropState.cpp b/src/trainers/RmspropState.cpp index 3d81d243..9be7a286 100644 --- a/src/trainers/RmspropState.cpp +++ b/src/trainers/RmspropState.cpp @@ -22,13 +22,13 @@ VIRTUAL RmspropState::~RmspropState() { delete[] meanSquare; } -RmspropState::RmspropState( EasyCL *cl, int numWeights ) : - numWeights( numWeights ) { +RmspropState::RmspropState(EasyCL *cl, int numWeights) : + numWeights(numWeights) { meanSquare = new float[numWeights]; - for( int i = 0; i < numWeights; i++ ) { + for(int i = 0; i < numWeights; i++) { meanSquare[i] = 0.0000001f; // should move this into fudgefactor I guess? 
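RmspropState above seeds every meanSquare entry with 0.0000001f; that small constant is the fudge factor the comment mentions, and it keeps the divide-by-root-mean-square in the update finite on the first few batches. The update kernels themselves are outside this hunk; a minimal CPU sketch of the usual RMSprop rule, with an assumed decay constant that is not taken from DeepCL:

    #include <cmath>

    // Plain-CPU sketch of one RMSprop update; decay is an assumed
    // hyperparameter, not a value read from DeepCL's kernels.
    void rmspropStep(float *weights, const float *gradWeights, float *meanSquare,
            int numWeights, float learningRate, float decay) {
        for(int i = 0; i < numWeights; i++) {
            // running average of the squared gradient
            meanSquare[i] = decay * meanSquare[i]
                + (1.0f - decay) * gradWeights[i] * gradWeights[i];
            // step scaled by 1/sqrt(meanSquare); the 0.0000001f seed above
            // keeps this denominator non-zero on the very first updates
            weights[i] -= learningRate * gradWeights[i] / std::sqrt(meanSquare[i]);
        }
    }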
} - meanSquareWrapper = cl->wrap( numWeights, meanSquare ); + meanSquareWrapper = cl->wrap(numWeights, meanSquare); meanSquareWrapper->copyToDevice(); } diff --git a/src/trainers/RmspropState.h b/src/trainers/RmspropState.h index 0be22b0d..27c9cc39 100644 --- a/src/trainers/RmspropState.h +++ b/src/trainers/RmspropState.h @@ -34,7 +34,7 @@ class DeepCL_EXPORT RmspropState : public TrainerState { // ]]] // generated, using cog: VIRTUAL ~RmspropState(); - RmspropState( EasyCL *cl, int numWeights ); + RmspropState(EasyCL *cl, int numWeights); // [[[end]]] }; diff --git a/src/trainers/RmspropStateMaker.cpp b/src/trainers/RmspropStateMaker.cpp index ab295c67..af3758fe 100644 --- a/src/trainers/RmspropStateMaker.cpp +++ b/src/trainers/RmspropStateMaker.cpp @@ -16,11 +16,11 @@ using namespace std; #define STATIC #define VIRTUAL -TrainerState *RmspropStateMaker::instance( EasyCL *cl, int numWeights ) { - RmspropState *state = new RmspropState( cl, numWeights ); +TrainerState *RmspropStateMaker::instance(EasyCL *cl, int numWeights) { + RmspropState *state = new RmspropState(cl, numWeights); return state; } -VIRTUAL bool RmspropStateMaker::created( TrainerState *state ) { +VIRTUAL bool RmspropStateMaker::created(TrainerState *state) { return dynamic_cast< RmspropState * >(state) != 0; } diff --git a/src/trainers/RmspropStateMaker.h b/src/trainers/RmspropStateMaker.h index c2cc0500..7248677d 100644 --- a/src/trainers/RmspropStateMaker.h +++ b/src/trainers/RmspropStateMaker.h @@ -24,8 +24,8 @@ class RmspropStateMaker : public TrainerStateMaker { // cog_addheaders.add() // ]]] // generated, using cog: - TrainerState *instance( EasyCL *cl, int numWeights ); - VIRTUAL bool created( TrainerState *state ); + TrainerState *instance(EasyCL *cl, int numWeights); + VIRTUAL bool created(TrainerState *state); // [[[end]]] }; diff --git a/src/trainers/SGD.cpp b/src/trainers/SGD.cpp index f5c5b48d..6b77efc3 100644 --- a/src/trainers/SGD.cpp +++ b/src/trainers/SGD.cpp @@ -28,28 +28,28 @@ using namespace std; VIRTUAL SGD::~SGD() { } -VIRTUAL void SGD::setMomentum( float momentum ) { +VIRTUAL void SGD::setMomentum(float momentum) { this->momentum = momentum; } -VIRTUAL void SGD::setWeightDecay( float weightDecay ) { +VIRTUAL void SGD::setWeightDecay(float weightDecay) { this->weightDecay = weightDecay; } VIRTUAL std::string SGD::asString() { - return "SGD{ learningRate=" + toString( learningRate ) + ", momentum=" + - toString( momentum ) + " }"; + return "SGD{ learningRate=" + toString(learningRate) + ", momentum=" + + toString(momentum) + " }"; } -VIRTUAL void SGD::updateWeights( CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper, - SGDState *trainerState ) { +VIRTUAL void SGD::updateWeights(CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper, + SGDState *trainerState) { int numWeights = trainerState->numWeights; CLWrapper *lastUpdateWrapper = trainerState->lastUpdateWrapper; float *gradWeightsCopy = new float[ numWeights ]; - CLWrapper *gradWeightsCopyWrapper = cl->wrap( numWeights, gradWeightsCopy ); + CLWrapper *gradWeightsCopyWrapper = cl->wrap(numWeights, gradWeightsCopy); gradWeightsCopyWrapper->createOnDevice(); - CLMathWrapper lastUpdates_( lastUpdateWrapper ); - CLMathWrapper gradWeights_( gradWeightsWrapper ); - CLMathWrapper gradWeightsCopy_( gradWeightsCopyWrapper ); - CLMathWrapper weights_( weightsWrapper ); + CLMathWrapper lastUpdates_(lastUpdateWrapper); + CLMathWrapper gradWeights_(gradWeightsWrapper); + CLMathWrapper gradWeightsCopy_(gradWeightsCopyWrapper); + CLMathWrapper 
weights_(weightsWrapper); // following all happens on gpu, via clmathwrapper: lastUpdates_ *= momentum; @@ -58,7 +58,7 @@ VIRTUAL void SGD::updateWeights( CLWrapper *weightsWrapper, CLWrapper *gradWeigh lastUpdates_ += gradWeightsCopy_; weights_ += lastUpdates_; - if( weightDecay > 0 ) { + if(weightDecay > 0) { // apply weight decay, by multiplying the weights by (1.0f - weightDecay) // so weightDecay == 0 means no decay; and weightDecay == 1.0f means // weights go immediately to zero @@ -68,63 +68,63 @@ VIRTUAL void SGD::updateWeights( CLWrapper *weightsWrapper, CLWrapper *gradWeigh delete gradWeightsCopyWrapper; delete[] gradWeightsCopy; } -VIRTUAL BatchResult SGD::train( NeuralNet *net, TrainingContext *context, - float const*input, OutputData *outputData ) { +VIRTUAL BatchResult SGD::train(NeuralNet *net, TrainingContext *context, + float const*input, OutputData *outputData) { // learns one batch, including updating weights // doesnt have to think about running multiple batches, // or loading data, or anything like that - bindState( net ); + bindState(net); - net->forward( input ); - int numRight = net->calcNumRight( outputData ); - float loss = net->calcLoss( outputData ); - net->backward( outputData ); + net->forward(input); + int numRight = net->calcNumRight(outputData); + float loss = net->calcLoss(outputData); + net->backward(outputData); int numLayers = net->getNumLayers(); - for( int layerIdx = numLayers - 2; layerIdx > 0; layerIdx-- ) { - Layer *layer = net->getLayer( layerIdx ); - if( !layer->needsBackProp() ) { + for(int layerIdx = numLayers - 2; layerIdx > 0; layerIdx--) { + Layer *layer = net->getLayer(layerIdx); + if(!layer->needsBackProp()) { break; } - if( layer->needsTrainerState() ) { - updateWeights( layer->getWeightsWrapper(), layer->getGradWeightsWrapper(), - dynamic_cast< SGDState * >( layer->getTrainerState() ) ); - if( layer->biased() ) { - updateWeights( layer->getBiasWrapper(), layer->getGradBiasWrapper(), - dynamic_cast< SGDState * >( layer->getBiasTrainerState() ) ); + if(layer->needsTrainerState()) { + updateWeights(layer->getWeightsWrapper(), layer->getGradWeightsWrapper(), + dynamic_cast< SGDState * >(layer->getTrainerState()) ); + if(layer->biased()) { + updateWeights(layer->getBiasWrapper(), layer->getGradBiasWrapper(), + dynamic_cast< SGDState * >(layer->getBiasTrainerState()) ); } } } - return BatchResult( loss, numRight ); + return BatchResult(loss, numRight); } -VIRTUAL BatchResult SGD::train( NeuralNet *net, TrainingContext *context, - float const*input, float const*expectedOutput ) { - ExpectedData expectedData( net, expectedOutput ); - return this->train( net, context, input, &expectedData ); +VIRTUAL BatchResult SGD::train(NeuralNet *net, TrainingContext *context, + float const*input, float const*expectedOutput) { + ExpectedData expectedData(net, expectedOutput); + return this->train(net, context, input, &expectedData); } -VIRTUAL BatchResult SGD::trainFromLabels( NeuralNet *net, TrainingContext *context, - float const*input, int const*labels ) { - LabeledData labeledData( net, labels ); - return this->train( net, context, input, &labeledData ); +VIRTUAL BatchResult SGD::trainFromLabels(NeuralNet *net, TrainingContext *context, + float const*input, int const*labels) { + LabeledData labeledData(net, labels); + return this->train(net, context, input, &labeledData); } -VIRTUAL void SGD::bindState( NeuralNet *net ) { +VIRTUAL void SGD::bindState(NeuralNet *net) { SGDStateMaker stateMaker; - this->_bindState( net, &stateMaker ); + 
this->_bindState(net, &stateMaker); } -STATIC SGD *SGD::instance( EasyCL *cl, float learningRate ) { - SGD *sgd = new SGD( cl ); - sgd->setLearningRate( learningRate ); +STATIC SGD *SGD::instance(EasyCL *cl, float learningRate) { + SGD *sgd = new SGD(cl); + sgd->setLearningRate(learningRate); return sgd; } -STATIC SGD *SGD::instance( EasyCL *cl, float learningRate, float momentum ) { - SGD *sgd = new SGD( cl ); - sgd->setLearningRate( learningRate ); - sgd->setMomentum( momentum ); +STATIC SGD *SGD::instance(EasyCL *cl, float learningRate, float momentum) { + SGD *sgd = new SGD(cl); + sgd->setLearningRate(learningRate); + sgd->setMomentum(momentum); return sgd; } -SGD::SGD( EasyCL *cl ) : - Trainer( cl ), - momentum( 0.0f ), - weightDecay( 0.0f ) { +SGD::SGD(EasyCL *cl) : + Trainer(cl), + momentum(0.0f), + weightDecay(0.0f) { } diff --git a/src/trainers/SGD.h b/src/trainers/SGD.h index 95ecb3b8..0d6f36f2 100644 --- a/src/trainers/SGD.h +++ b/src/trainers/SGD.h @@ -26,7 +26,7 @@ class OutputData; // implements SGD, including momentum // momentum defined eg in http://www.cs.toronto.edu/~gdahl/papers/momentumNesterovDeepLearning.pdf // standard momentum: -// dweights[t+1] = mom * dweights[t] - learningrate * gradient( weights[t] ) +// dweights[t+1] = mom * dweights[t] - learningrate * gradient(weights[t]) // weights[t+1] = weights[t] + dweights[t+1] // //training: @@ -46,21 +46,21 @@ class DeepCL_EXPORT SGD : public Trainer{ // ]]] // generated, using cog: VIRTUAL ~SGD(); - VIRTUAL void setMomentum( float momentum ); - VIRTUAL void setWeightDecay( float weightDecay ); + VIRTUAL void setMomentum(float momentum); + VIRTUAL void setWeightDecay(float weightDecay); VIRTUAL std::string asString(); - VIRTUAL void updateWeights( CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper, - SGDState *trainerState ); - VIRTUAL BatchResult train( NeuralNet *net, TrainingContext *context, - float const*input, OutputData *outputData ); - VIRTUAL BatchResult train( NeuralNet *net, TrainingContext *context, - float const*input, float const*expectedOutput ); - VIRTUAL BatchResult trainFromLabels( NeuralNet *net, TrainingContext *context, - float const*input, int const*labels ); - VIRTUAL void bindState( NeuralNet *net ); - STATIC SGD *instance( EasyCL *cl, float learningRate ); - STATIC SGD *instance( EasyCL *cl, float learningRate, float momentum ); - SGD( EasyCL *cl ); + VIRTUAL void updateWeights(CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper, + SGDState *trainerState); + VIRTUAL BatchResult train(NeuralNet *net, TrainingContext *context, + float const*input, OutputData *outputData); + VIRTUAL BatchResult train(NeuralNet *net, TrainingContext *context, + float const*input, float const*expectedOutput); + VIRTUAL BatchResult trainFromLabels(NeuralNet *net, TrainingContext *context, + float const*input, int const*labels); + VIRTUAL void bindState(NeuralNet *net); + STATIC SGD *instance(EasyCL *cl, float learningRate); + STATIC SGD *instance(EasyCL *cl, float learningRate, float momentum); + SGD(EasyCL *cl); // [[[end]]] }; diff --git a/src/trainers/SGDMaker.cpp b/src/trainers/SGDMaker.cpp index 2a6a9628..a8c8e641 100644 --- a/src/trainers/SGDMaker.cpp +++ b/src/trainers/SGDMaker.cpp @@ -17,7 +17,7 @@ using namespace std; #define STATIC #define VIRTUAL -VIRTUAL Trainer *SGDMaker::instance( EasyCL *cl ) { - return new SGD( cl ); +VIRTUAL Trainer *SGDMaker::instance(EasyCL *cl) { + return new SGD(cl); } diff --git a/src/trainers/SGDMaker.h b/src/trainers/SGDMaker.h index 67575336..e098ce73 100644 --- 
a/src/trainers/SGDMaker.h +++ b/src/trainers/SGDMaker.h @@ -23,7 +23,7 @@ class SGDMaker : public TrainerMaker { // cog_addheaders.add() // ]]] // generated, using cog: - VIRTUAL Trainer *instance( EasyCL *cl ); + VIRTUAL Trainer *instance(EasyCL *cl); // [[[end]]] }; diff --git a/src/trainers/SGDState.cpp b/src/trainers/SGDState.cpp index 775c0f1a..3d8b145b 100644 --- a/src/trainers/SGDState.cpp +++ b/src/trainers/SGDState.cpp @@ -22,8 +22,8 @@ VIRTUAL SGDState::~SGDState() { delete[] lastUpdate; } -SGDState::SGDState( EasyCL *cl, int numWeights ) : - numWeights( numWeights ) +SGDState::SGDState(EasyCL *cl, int numWeights) : + numWeights(numWeights) { // should we handle bias separately? maybe... not? // or each layer could have one trainer for biases, and one for the // non-biases? Maybe kind of ok? @@ -31,10 +31,10 @@ SGDState::SGDState( EasyCL *cl, int numWeights ) : // lastUpdate buffer never needs to change size, // since number of weights is invariant with batchSize etc lastUpdate = new float[numWeights]; - for( int i = 0; i < numWeights; i++ ) { + for(int i = 0; i < numWeights; i++) { lastUpdate[i] = 0.0f; } - lastUpdateWrapper = cl->wrap( numWeights, lastUpdate ); + lastUpdateWrapper = cl->wrap(numWeights, lastUpdate); lastUpdateWrapper->copyToDevice(); } diff --git a/src/trainers/SGDState.h b/src/trainers/SGDState.h index e3d4c7a0..ac0fbe21 100644 --- a/src/trainers/SGDState.h +++ b/src/trainers/SGDState.h @@ -44,7 +44,7 @@ class DeepCL_EXPORT SGDState : public TrainerState { // ]]] // generated, using cog: VIRTUAL ~SGDState(); - SGDState( EasyCL *cl, int numWeights ); + SGDState(EasyCL *cl, int numWeights); // [[[end]]] }; diff --git a/src/trainers/SGDStateMaker.cpp b/src/trainers/SGDStateMaker.cpp index 3ded4aa7..1ff98141 100644 --- a/src/trainers/SGDStateMaker.cpp +++ b/src/trainers/SGDStateMaker.cpp @@ -16,11 +16,11 @@ using namespace std; #define STATIC #define VIRTUAL -TrainerState *SGDStateMaker::instance( EasyCL *cl, int numWeights ) { - SGDState *sgd = new SGDState( cl, numWeights ); +TrainerState *SGDStateMaker::instance(EasyCL *cl, int numWeights) { + SGDState *sgd = new SGDState(cl, numWeights); return sgd; } -VIRTUAL bool SGDStateMaker::created( TrainerState *state ) { +VIRTUAL bool SGDStateMaker::created(TrainerState *state) { return dynamic_cast< SGDState * >(state) != 0; } diff --git a/src/trainers/SGDStateMaker.h b/src/trainers/SGDStateMaker.h index f7ad75c2..450c7953 100644 --- a/src/trainers/SGDStateMaker.h +++ b/src/trainers/SGDStateMaker.h @@ -23,8 +23,8 @@ class SGDStateMaker : public TrainerStateMaker { // cog_addheaders.add() // ]]] // generated, using cog: - TrainerState *instance( EasyCL *cl, int numWeights ); - VIRTUAL bool created( TrainerState *state ); + TrainerState *instance(EasyCL *cl, int numWeights); + VIRTUAL bool created(TrainerState *state); // [[[end]]] }; diff --git a/src/trainers/Trainer.cpp b/src/trainers/Trainer.cpp index f40d2dc4..454f5a2e 100644 --- a/src/trainers/Trainer.cpp +++ b/src/trainers/Trainer.cpp @@ -24,62 +24,62 @@ using namespace std; #define VIRTUAL -Trainer::Trainer( EasyCL *cl ) : - cl( cl ), - learningRate( 0 ) { +Trainer::Trainer(EasyCL *cl) : + cl(cl), + learningRate(0) { } VIRTUAL Trainer::~Trainer() { } -VIRTUAL void Trainer::setLearningRate( float learningRate ) { +VIRTUAL void Trainer::setLearningRate(float learningRate) { this->learningRate = learningRate; } VIRTUAL std::string Trainer::asString() { - return "Trainer{ learningRate=" + toString( learningRate ) + " }"; + return "Trainer{ learningRate=" + 
toString(learningRate) + " }"; } -VIRTUAL BatchResult Trainer::train( Trainable *trainable, +VIRTUAL BatchResult Trainer::train(Trainable *trainable, TrainingContext *context, - float const*input, float const*expectedOutput ) { - MultiNet *multiNet = dynamic_cast< MultiNet *>( trainable ); + float const*input, float const*expectedOutput) { + MultiNet *multiNet = dynamic_cast< MultiNet *>(trainable); float loss = 0; - if( multiNet != 0 ) { - for( int i = 0; i < multiNet->getNumNets(); i++ ) { - Trainable *child = multiNet->getNet( i ); - BatchResult result = this->train( child, context, input, expectedOutput ); + if(multiNet != 0) { + for(int i = 0; i < multiNet->getNumNets(); i++) { + Trainable *child = multiNet->getNet(i); + BatchResult result = this->train(child, context, input, expectedOutput); loss += result.loss; } } else { - NeuralNet *net = dynamic_cast< NeuralNet * > ( trainable ); - return this->train( net, context, input, expectedOutput ); + NeuralNet *net = dynamic_cast< NeuralNet * > (trainable); + return this->train(net, context, input, expectedOutput); } - return BatchResult( loss, 0 ); + return BatchResult(loss, 0); } -VIRTUAL BatchResult Trainer::trainFromLabels( Trainable *trainable, +VIRTUAL BatchResult Trainer::trainFromLabels(Trainable *trainable, TrainingContext *context, - float const*input, int const*labels ) { - MultiNet *multiNet = dynamic_cast< MultiNet *>( trainable ); + float const*input, int const*labels) { + MultiNet *multiNet = dynamic_cast< MultiNet *>(trainable); float loss = 0; int numRight = 0; - if( multiNet != 0 ) { - for( int i = 0; i < multiNet->getNumNets(); i++ ) { - Trainable *child = multiNet->getNet( i ); - BatchResult result = this->trainFromLabels( child, context, input, labels ); + if(multiNet != 0) { + for(int i = 0; i < multiNet->getNumNets(); i++) { + Trainable *child = multiNet->getNet(i); + BatchResult result = this->trainFromLabels(child, context, input, labels); loss += result.loss; numRight += result.numRight; } } else { - NeuralNet *net = dynamic_cast< NeuralNet * > ( trainable ); - return this->trainFromLabels( net, context, input, labels ); + NeuralNet *net = dynamic_cast< NeuralNet * > (trainable); + return this->trainFromLabels(net, context, input, labels); } - return BatchResult( loss, numRight ); + return BatchResult(loss, numRight); } -VIRTUAL void Trainer::_bindState( NeuralNet *net, TrainerStateMaker *stateMaker ) { +VIRTUAL void Trainer::_bindState(NeuralNet *net, TrainerStateMaker *stateMaker) { // go through network layers, and assign TrainerState objects - for( int layerIdx = 0; layerIdx < net->getNumLayers(); layerIdx++ ) { - Layer *layer = net->getLayer( layerIdx ); - if( layer->needsTrainerState() ) { + for(int layerIdx = 0; layerIdx < net->getNumLayers(); layerIdx++) { + Layer *layer = net->getLayer(layerIdx); + if(layer->needsTrainerState()) { TrainerState *state = layer->getTrainerState(); - if( !stateMaker->created( state ) ) { - layer->setTrainerState( stateMaker ); + if(!stateMaker->created(state) ) { + layer->setTrainerState(stateMaker); } } } diff --git a/src/trainers/Trainer.h b/src/trainers/Trainer.h index ea72f7f1..7cac2263 100644 --- a/src/trainers/Trainer.h +++ b/src/trainers/Trainer.h @@ -33,7 +33,7 @@ class BatchResult { loss = 0; numRight = 0; } - BatchResult( float loss, int numRight ) { + BatchResult(float loss, int numRight) { this->loss = loss; this->numRight = numRight; } @@ -59,29 +59,29 @@ class DeepCL_EXPORT Trainer{ float learningRate; - virtual BatchResult train( NeuralNet *net, + virtual 
BatchResult train(NeuralNet *net, TrainingContext *context, - float const*input, float const*expectedOutput ) = 0; - virtual BatchResult trainFromLabels( NeuralNet *net, + float const*input, float const*expectedOutput) = 0; + virtual BatchResult trainFromLabels(NeuralNet *net, TrainingContext *context, - float const*input, int const*labels ) = 0; + float const*input, int const*labels) = 0; // [[[cog // import cog_addheaders // cog_addheaders.add() // ]]] // generated, using cog: - Trainer( EasyCL *cl ); + Trainer(EasyCL *cl); VIRTUAL ~Trainer(); - VIRTUAL void setLearningRate( float learningRate ); + VIRTUAL void setLearningRate(float learningRate); VIRTUAL std::string asString(); - VIRTUAL BatchResult train( Trainable *trainable, + VIRTUAL BatchResult train(Trainable *trainable, TrainingContext *context, - float const*input, float const*expectedOutput ); - VIRTUAL BatchResult trainFromLabels( Trainable *trainable, + float const*input, float const*expectedOutput); + VIRTUAL BatchResult trainFromLabels(Trainable *trainable, TrainingContext *context, - float const*input, int const*labels ); - VIRTUAL void _bindState( NeuralNet *net, TrainerStateMaker *stateMaker ); + float const*input, int const*labels); + VIRTUAL void _bindState(NeuralNet *net, TrainerStateMaker *stateMaker); // [[[end]]] }; diff --git a/src/trainers/TrainerMaker.h b/src/trainers/TrainerMaker.h index 3f0e817e..cdf6c438 100644 --- a/src/trainers/TrainerMaker.h +++ b/src/trainers/TrainerMaker.h @@ -20,7 +20,7 @@ class NeuralNet; class TrainerMaker { public: - virtual Trainer *instance( EasyCL *cl, NeuralNet *net ) = 0; + virtual Trainer *instance(EasyCL *cl, NeuralNet *net) = 0; // [[[cog // import cog_addheaders diff --git a/src/trainers/TrainerStateMaker.cpp b/src/trainers/TrainerStateMaker.cpp index 34259af9..bfb307cb 100644 --- a/src/trainers/TrainerStateMaker.cpp +++ b/src/trainers/TrainerStateMaker.cpp @@ -16,7 +16,7 @@ using namespace std; #define STATIC #define VIRTUAL -VIRTUAL bool TrainerStateMaker::created( TrainerState *state ) { +VIRTUAL bool TrainerStateMaker::created(TrainerState *state) { throw runtime_error("TrainerStateMaker::created not implemented for .. 
this class"); } diff --git a/src/trainers/TrainerStateMaker.h b/src/trainers/TrainerStateMaker.h index a7c4935d..1898a17b 100644 --- a/src/trainers/TrainerStateMaker.h +++ b/src/trainers/TrainerStateMaker.h @@ -19,14 +19,14 @@ class EasyCL; class TrainerStateMaker { public: - virtual TrainerState *instance( EasyCL *cl, int numWeights ) = 0; + virtual TrainerState *instance(EasyCL *cl, int numWeights) = 0; // [[[cog // import cog_addheaders // cog_addheaders.add() // ]]] // generated, using cog: - VIRTUAL bool created( TrainerState *state ); + VIRTUAL bool created(TrainerState *state); // [[[end]]] }; diff --git a/src/trainers/TrainingContext.cpp b/src/trainers/TrainingContext.cpp index 6af546ba..8d79013d 100644 --- a/src/trainers/TrainingContext.cpp +++ b/src/trainers/TrainingContext.cpp @@ -15,9 +15,9 @@ using namespace std; #define STATIC #define VIRTUAL -TrainingContext::TrainingContext( int epoch, int batch ) : - epoch( epoch ), - batch( batch ) { +TrainingContext::TrainingContext(int epoch, int batch) : + epoch(epoch), + batch(batch) { } int TrainingContext::getEpoch() { return epoch; diff --git a/src/trainers/TrainingContext.h b/src/trainers/TrainingContext.h index bec2e396..8d001c92 100644 --- a/src/trainers/TrainingContext.h +++ b/src/trainers/TrainingContext.h @@ -28,7 +28,7 @@ class DeepCL_EXPORT TrainingContext { // cog_addheaders.add() // ]]] // generated, using cog: - TrainingContext( int epoch, int batch ); + TrainingContext(int epoch, int batch); int getEpoch(); int getBatch(); diff --git a/src/util/ExceptionMacros.h b/src/util/ExceptionMacros.h index 05cf5749..d843b499 100644 --- a/src/util/ExceptionMacros.h +++ b/src/util/ExceptionMacros.h @@ -1,4 +1,4 @@ #pragma once -#define THROW( message ) throw std::runtime_error( std::string(__FILE__) + " " + toString(__LINE__) + ": " + message ); +#define THROW(message) throw std::runtime_error(std::string(__FILE__) + " " + toString(__LINE__) + ": " + message); diff --git a/src/util/FileHelper.cpp b/src/util/FileHelper.cpp index 7b1f8d5f..028529dd 100644 --- a/src/util/FileHelper.cpp +++ b/src/util/FileHelper.cpp @@ -24,95 +24,95 @@ #undef STATIC #define STATIC -PUBLIC STATIC char *FileHelper::readBinary( std::string filepath, long *p_filesize ) { - std::string localPath = localizePath( filepath ); - std::ifstream file( localPath.c_str(), std::ios::in | std::ios::binary | std::ios::ate); +PUBLIC STATIC char *FileHelper::readBinary(std::string filepath, long *p_filesize) { + std::string localPath = localizePath(filepath); + std::ifstream file(localPath.c_str(), std::ios::in | std::ios::binary | std::ios::ate); if(!file.is_open()) { throw std::runtime_error("couldnt open file " + localPath); } - *p_filesize = static_cast<long>( file.tellg() ); + *p_filesize = static_cast<long>(file.tellg()); // std::cout << " filesize " << *p_filesize << std::endl; char *data = new char[*p_filesize]; file.seekg(0, std::ios::beg); - if(!file.read( data, *p_filesize )) { - throw std::runtime_error("failed to read from " + localPath ); + if(!file.read(data, *p_filesize)) { + throw std::runtime_error("failed to read from " + localPath); } file.close(); return data; } -PUBLIC STATIC long FileHelper::getFilesize( std::string filepath ) { - std::ifstream in( localizePath( filepath ).c_str(), std::ifstream::ate | std::ifstream::binary); - return static_cast<long>( in.tellg() ); +PUBLIC STATIC long FileHelper::getFilesize(std::string filepath) { + std::ifstream in(localizePath(filepath).c_str(), std::ifstream::ate | std::ifstream::binary); + return static_cast<long>(in.tellg()); }
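readBinary above allocates the result with new[] and hands ownership to the caller, reports the size through p_filesize, and throws std::runtime_error on failure. A minimal usage sketch (the file names are hypothetical, and the include path is assumed from the src/util layout; writeBinary is defined just below):

    #include <iostream>
    #include "util/FileHelper.h"  // assumed include path

    int main() {
        long filesize = 0;
        // readBinary allocates with new[]; ownership passes to the caller
        char *data = FileHelper::readBinary("weights.dat", &filesize);  // hypothetical file
        std::cout << "read " << filesize << " bytes" << std::endl;
        FileHelper::writeBinary("weights-copy.dat", data, filesize);
        delete[] data;  // caller releases the buffer
        return 0;
    }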
-PUBLIC STATIC char *FileHelper::readBinaryChunk( std::string filepath, long start, long length ) { - std::string localPath = localizePath( filepath ); - std::ifstream file( localPath.c_str(), std::ios::in | std::ios::binary | std::ios::ate); +PUBLIC STATIC char *FileHelper::readBinaryChunk(std::string filepath, long start, long length) { + std::string localPath = localizePath(filepath); + std::ifstream file(localPath.c_str(), std::ios::in | std::ios::binary | std::ios::ate); if(!file.is_open()) { throw std::runtime_error("failed to open file: " + localPath); } - file.seekg( start, std::ios::beg ); + file.seekg(start, std::ios::beg); char *data = new char[length]; - if(!file.read( data, length )) { - throw std::runtime_error("failed to read from " + localPath ); + if(!file.read(data, length)) { + throw std::runtime_error("failed to read from " + localPath); } file.close(); return data; } // need to allocate targetArray yourself, beforehand -PUBLIC STATIC void FileHelper::readBinaryChunk( char *targetArray, std::string filepath, long start, long length ) { - std::string localPath = localizePath( filepath ); - std::ifstream file( localPath.c_str(), std::ios::in | std::ios::binary | std::ios::ate); +PUBLIC STATIC void FileHelper::readBinaryChunk(char *targetArray, std::string filepath, long start, long length) { + std::string localPath = localizePath(filepath); + std::ifstream file(localPath.c_str(), std::ios::in | std::ios::binary | std::ios::ate); if(!file.is_open()) { throw std::runtime_error("failed to open file: " + localPath); } - file.seekg( start, std::ios::beg ); + file.seekg(start, std::ios::beg); // char *data = new char[length]; - if(!file.read( targetArray, length )) { - throw std::runtime_error("failed to read from " + localPath ); + if(!file.read(targetArray, length)) { + throw std::runtime_error("failed to read from " + localPath); } file.close(); // return data; } -PUBLIC STATIC void FileHelper::writeBinary( std::string filepath, char const*data, long filesize ) { - std::string localPath = localizePath( filepath ); - std::ofstream file( localPath.c_str(), std::ios::out | std::ios::binary ); +PUBLIC STATIC void FileHelper::writeBinary(std::string filepath, char const*data, long filesize) { + std::string localPath = localizePath(filepath); + std::ofstream file(localPath.c_str(), std::ios::out | std::ios::binary); if(!file.is_open()) { - throw std::runtime_error("cannot open file " + localPath ); + throw std::runtime_error("cannot open file " + localPath); } - if( !file.write( (char *)data, filesize ) ) { - throw std::runtime_error("failed to write to " + localPath ); + if(!file.write((char *)data, filesize) ) { + throw std::runtime_error("failed to write to " + localPath); } file.close(); } -PUBLIC STATIC void FileHelper::writeBinaryChunk( std::string filepath, char const*data, long startPos, long filesize ) { - std::string localPath = localizePath( filepath ); - std::ofstream file( localPath.c_str(), std::ios::out | std::ios::binary ); - file.seekp( startPos, std::ios::beg ); +PUBLIC STATIC void FileHelper::writeBinaryChunk(std::string filepath, char const*data, long startPos, long filesize) { + std::string localPath = localizePath(filepath); + std::ofstream file(localPath.c_str(), std::ios::out | std::ios::binary); + file.seekp(startPos, std::ios::beg); if(!file.is_open()) { - throw std::runtime_error("cannot open file " + localPath ); + throw std::runtime_error("cannot open file " + localPath); } - if( !file.write( (char *)data, filesize ) ) { - throw 
std::runtime_error("failed to write to " + localPath ); + if(!file.write((char *)data, filesize) ) { + throw std::runtime_error("failed to write to " + localPath); } file.close(); } -PUBLIC STATIC bool FileHelper::exists( const std::string filepath ) { - std::string localPath = localizePath( filepath ); - std::ifstream testifstream( localPath.c_str() ); +PUBLIC STATIC bool FileHelper::exists(const std::string filepath) { + std::string localPath = localizePath(filepath); + std::ifstream testifstream(localPath.c_str()); bool exists = testifstream.good(); testifstream.close(); return exists; } -PUBLIC STATIC void FileHelper::rename( std::string oldname, std::string newname ) { - ::rename( localizePath( oldname ).c_str(), localizePath( newname ).c_str() ); +PUBLIC STATIC void FileHelper::rename(std::string oldname, std::string newname) { + ::rename(localizePath(oldname).c_str(), localizePath(newname).c_str()); } -PUBLIC STATIC void FileHelper::remove( std::string filename ) { - ::remove( localizePath( filename ).c_str() ); +PUBLIC STATIC void FileHelper::remove(std::string filename) { + ::remove(localizePath(filename).c_str()); } -PUBLIC STATIC std::string FileHelper::localizePath( std::string path ) { - std::replace( path.begin(), path.end(), '/', pathSeparator().c_str()[0] ); +PUBLIC STATIC std::string FileHelper::localizePath(std::string path) { + std::replace(path.begin(), path.end(), '/', pathSeparator().c_str()[0]); //std::cout << "localized path: " << path << std::endl; return path; } @@ -123,23 +123,23 @@ PUBLIC STATIC std::string FileHelper::pathSeparator() { return "/"; #endif } -PUBLIC STATIC void FileHelper::createDirectory( std::string path ) { +PUBLIC STATIC void FileHelper::createDirectory(std::string path) { #ifdef _WIN32 - if( CreateDirectory( path.c_str(), NULL ) == 0 ) { - throw std::runtime_error( "Failed to create directory " + path ); + if(CreateDirectory(path.c_str(), NULL) == 0) { + throw std::runtime_error("Failed to create directory " + path); } #else - if( ::mkdir( path.c_str(), 0775 ) == -1 ) { - throw std::runtime_error( "Failed to create directory " + path ); + if(::mkdir(path.c_str(), 0775) == -1 ) { + throw std::runtime_error("Failed to create directory " + path); } #endif } -PUBLIC STATIC bool FileHelper::folderExists( std::string path ) { +PUBLIC STATIC bool FileHelper::folderExists(std::string path) { #ifdef _WIN32 return GetFileAttributes(path.c_str()) == INVALID_FILE_ATTRIBUTES; #else struct stat status; - stat( path.c_str(), &status ); + stat(path.c_str(), &status); return S_ISDIR(status.st_mode); #endif } diff --git a/src/util/FileHelper.h b/src/util/FileHelper.h index 6693ea1a..f33b631d 100644 --- a/src/util/FileHelper.h +++ b/src/util/FileHelper.h @@ -24,19 +24,19 @@ class DeepCL_EXPORT FileHelper { // generated, using cog: public: - STATIC char *readBinary( std::string filepath, long *p_filesize ); - STATIC long getFilesize( std::string filepath ); - STATIC char *readBinaryChunk( std::string filepath, long start, long length ); - STATIC void readBinaryChunk( char *targetArray, std::string filepath, long start, long length ); - STATIC void writeBinary( std::string filepath, char const*data, long filesize ); - STATIC void writeBinaryChunk( std::string filepath, char const*data, long startPos, long filesize ); - STATIC bool exists( const std::string filepath ); - STATIC void rename( std::string oldname, std::string newname ); - STATIC void remove( std::string filename ); - STATIC std::string localizePath( std::string path ); + STATIC char 
*readBinary(std::string filepath, long *p_filesize); + STATIC long getFilesize(std::string filepath); + STATIC char *readBinaryChunk(std::string filepath, long start, long length); + STATIC void readBinaryChunk(char *targetArray, std::string filepath, long start, long length); + STATIC void writeBinary(std::string filepath, char const*data, long filesize); + STATIC void writeBinaryChunk(std::string filepath, char const*data, long startPos, long filesize); + STATIC bool exists(const std::string filepath); + STATIC void rename(std::string oldname, std::string newname); + STATIC void remove(std::string filename); + STATIC std::string localizePath(std::string path); STATIC std::string pathSeparator(); - STATIC void createDirectory( std::string path ); - STATIC bool folderExists( std::string path ); + STATIC void createDirectory(std::string path); + STATIC bool folderExists(std::string path); // [[[end]]] }; diff --git a/src/util/ImageHelper.h b/src/util/ImageHelper.h index b4d017d1..3bcc59e4 100644 --- a/src/util/ImageHelper.h +++ b/src/util/ImageHelper.h @@ -14,42 +14,42 @@ class ImageHelper { public: -//static int **allocateImage( int imageSize ) { +//static int **allocateImage(int imageSize) { //int **image = new int*[imageSize]; -// for( int i = 0; i < imageSize; i++ ) { +// for(int i = 0; i < imageSize; i++) { // image[i] = new int[imageSize]; -// for( int j = 0; j < imageSize; j++ ) { +// for(int j = 0; j < imageSize; j++) { // image[i][j] = 0; // } // } // int *contiguousarray = new int[ imageSize * imageSize ]; -// memset(contiguousarray, 0, sizeof(int) * imageSize * imageSize ); +// memset(contiguousarray, 0, sizeof(int) * imageSize * imageSize); // int **image = new int*[imageSize]; -// for( int i = 0; i < imageSize; i++ ) { +// for(int i = 0; i < imageSize; i++) { // image[i] = &(contiguousarray[i*imageSize]); // } // return image; //} -//static float **allocateFloats( int imageSize ) { +//static float **allocateFloats(int imageSize) { // //int **image = new int*[imageSize]; -//// for( int i = 0; i < imageSize; i++ ) { +//// for(int i = 0; i < imageSize; i++) { //// image[i] = new int[imageSize]; -//// for( int j = 0; j < imageSize; j++ ) { +//// for(int j = 0; j < imageSize; j++) { //// image[i][j] = 0; //// } //// } // float *contiguousarray = new float[ imageSize * imageSize ]; -// memset(contiguousarray, 0, sizeof(float) * imageSize * imageSize ); +// memset(contiguousarray, 0, sizeof(float) * imageSize * imageSize); // float **image = new float*[imageSize]; -// for( int i = 0; i < imageSize; i++ ) { +// for(int i = 0; i < imageSize; i++) { // image[i] = &(contiguousarray[i*imageSize]); // } // return image; //} -//static void deleteImage( int ***p_image, int imageSize ) { -// if( p_image == 0 ) { +//static void deleteImage(int ***p_image, int imageSize) { +// if(p_image == 0) { // return; // } // delete[] (*p_image)[0]; @@ -57,8 +57,8 @@ class ImageHelper { // *p_image = 0; //} -//static void deleteImage( float ***p_image, int imageSize ) { -// if( p_image == 0 ) { +//static void deleteImage(float ***p_image, int imageSize) { +// if(p_image == 0) { // return; // } // delete[] (*p_image)[0]; @@ -66,39 +66,39 @@ class ImageHelper { // *p_image = 0; //} -//static void copyImage( int *const*const dst, int const*const *const src, int imageSize ) { -// for( int i = 0; i < imageSize; i++ ) { -// for( int j = 0; j < imageSize; j++ ) { -// if( dst[i][j] != src[i][j] ) { +//static void copyImage(int *const*const dst, int const*const *const src, int imageSize) { +// for(int i = 0; i < 
imageSize; i++) { +// for(int j = 0; j < imageSize; j++) { +// if(dst[i][j] != src[i][j]) { // dst[i][j] = src[i][j]; // } // } // } //} -//static void copyImage( float *const*const dst, int const*const *const src, int imageSize ) { -// for( int i = 0; i < imageSize; i++ ) { -// for( int j = 0; j < imageSize; j++ ) { -// if( dst[i][j] != src[i][j] ) { +//static void copyImage(float *const*const dst, int const*const *const src, int imageSize) { +// for(int i = 0; i < imageSize; i++) { +// for(int j = 0; j < imageSize; j++) { +// if(dst[i][j] != src[i][j]) { // dst[i][j] = src[i][j]; // } // } // } //} -//static void wipeImage( int *const*const image, int imageSize ) { -// for( int i = 0; i < imageSize; i++ ) { -// for( int j = 0; j < imageSize; j++ ) { +//static void wipeImage(int *const*const image, int imageSize) { +// for(int i = 0; i < imageSize; i++) { +// for(int j = 0; j < imageSize; j++) { // image[i][j] = 0; // } // } //} -//static void printInts( int const*const*const image, int imageSize ) { +//static void printInts(int const*const*const image, int imageSize) { // std::ostringstream ss; // ss << "\n"; -// for( int i = 0; i < imageSize; i++ ) { -// for( int j = 0; j < imageSize; j++ ) { +// for(int i = 0; i < imageSize; i++) { +// for(int j = 0; j < imageSize; j++) { // ss << image[i][j] << " "; // } // ss << "\n"; @@ -106,11 +106,11 @@ class ImageHelper { // std::cout << ss.str() << std::endl; //} -//static void print( float const*const*const image, int imageSize ) { +//static void print(float const*const*const image, int imageSize) { // std::ostringstream ss; // ss << "\n"; -// for( int i = 0; i < imageSize; i++ ) { -// for( int j = 0; j < imageSize; j++ ) { +// for(int i = 0; i < imageSize; i++) { +// for(int j = 0; j < imageSize; j++) { // ss << image[i][j] << " "; // } // ss << "\n"; @@ -118,23 +118,23 @@ class ImageHelper { // std::cout << ss.str() << std::endl; //} -static void _printImage( int *image, int imageSize ) { +static void _printImage(int *image, int imageSize) { std::ostringstream ss; ss << "\n"; - for( int i = 0; i < imageSize; i++ ) { - for( int j = 0; j < imageSize; j++ ) { + for(int i = 0; i < imageSize; i++) { + for(int j = 0; j < imageSize; j++) { int offset = i * imageSize + j; int value = image[offset]; - if( value == 0 ) { + if(value == 0) { ss << "."; } - if( value == 1 ) { + if(value == 1) { ss << "*"; } - if( value == 2 ) { + if(value == 2) { ss << "O"; } - if( value == 3 ) { + if(value == 3) { ss << "+"; } } @@ -143,23 +143,23 @@ static void _printImage( int *image, int imageSize ) { std::cout << ss.str() << std::endl; } -static void _printImage( float *image, int imageSize ) { +static void _printImage(float *image, int imageSize) { std::ostringstream ss; ss << "\n"; - for( int i = 0; i < imageSize; i++ ) { - for( int j = 0; j < imageSize; j++ ) { + for(int i = 0; i < imageSize; i++) { + for(int j = 0; j < imageSize; j++) { int offset = i * imageSize + j; float value = image[offset]; - if( value == 0 ) { + if(value == 0) { ss << "."; } - if( value == 1 ) { + if(value == 1) { ss << "*"; } - if( value == 2 ) { + if(value == 2) { ss << "O"; } - if( value == 3 ) { + if(value == 3) { ss << "+"; } } @@ -168,61 +168,61 @@ static void _printImage( float *image, int imageSize ) { std::cout << ss.str() << std::endl; } -//static void printImage( int const *const *const image, int imageSize ) { +//static void printImage(int const *const *const image, int imageSize) { ///* int numdigits = 1; -// for( int i = 0; i < imageSize; i++ ) { -// for( int j = 0; j < 
imageSize; j++ ) { -// std::string thisnum = toString( image[i][j] ); +// for(int i = 0; i < imageSize; i++) { +// for(int j = 0; j < imageSize; j++) { +// std::string thisnum = toString(image[i][j]); // int thisdigits = thisnum.length(); // numdigits = thisdigits > numdigits ? thisdigits : numdigits; // } // }*/ // ostringstream ss; // ss << "\n"; -// for( int i = 0; i < imageSize; i++ ) { -// for( int j = 0; j < imageSize; j++ ) { -// if( image[i][j] == 0 ) { +// for(int i = 0; i < imageSize; i++) { +// for(int j = 0; j < imageSize; j++) { +// if(image[i][j] == 0) { // ss << "."; // } -// if( image[i][j] == 1 ) { +// if(image[i][j] == 1) { // ss << "*"; // } -// if( image[i][j] == 2 ) { +// if(image[i][j] == 2) { // ss << "O"; // } -// if( image[i][j] == 3 ) { +// if(image[i][j] == 3) { // ss << "+"; // } // } // ss << "\n"; // } -// debug( ss.str() ); +// debug(ss.str()); //} -//static int **loadImage( std::string filepath, int *p_imageSize ) { +//static int **loadImage(std::string filepath, int *p_imageSize) { // std::ifstream f; -// f.open( filepath.c_str() ); +// f.open(filepath.c_str()); // //f >> imageSize; // //int **image = 0; // std::string thisline; // f >> thisline; // *p_imageSize = (int)thisline.length(); -// if( *p_imageSize == 0 ) { +// if(*p_imageSize == 0) { // std::cout << "imagehelper::loadImage. error: imagesize 0, " << filepath << std::endl; // throw "imagehelper::loadImage. error: imagesize 0 " + filepath; // } // //cout << "imagesize: " << imageSize << std::endl; -// int **image = allocateImage( *p_imageSize ); -// for( int i = 0; i < *p_imageSize; i++ ) { -// if( i == 0 ) { +// int **image = allocateImage(*p_imageSize); +// for(int i = 0; i < *p_imageSize; i++) { +// if(i == 0) { // } -// for( int j = 0; j < *p_imageSize; j++ ) { +// for(int j = 0; j < *p_imageSize; j++) { // std::string thischar = std::string("") + thisline[j]; -// if( thischar == "*" ) { +// if(thischar == "*") { // image[i][j] = 1; //// (*p_piecesPlaced)++; // } -// if( thischar == "O" ) { +// if(thischar == "O") { // image[i][j] = 2; //// (*p_piecesPlaced)++; // } diff --git a/src/util/ImagePng.h b/src/util/ImagePng.h index a2ccff60..e2959f37 100644 --- a/src/util/ImagePng.h +++ b/src/util/ImagePng.h @@ -11,200 +11,200 @@ class ImagePng { public: - static int getImageMax( int ** image, int imageSize ) { + static int getImageMax(int ** image, int imageSize) { int maxvalue = 0; - for( int i = 0; i < imageSize; i++ ) { - for( int j = 0; j < imageSize; j++ ) { - maxvalue = std::max( maxvalue, image[i][j] ); + for(int i = 0; i < imageSize; i++) { + for(int j = 0; j < imageSize; j++) { + maxvalue = std::max(maxvalue, image[i][j]); } } return maxvalue; } - static float getImageMax( float ** image, int imageSize ) { + static float getImageMax(float ** image, int imageSize) { float maxvalue = 0; - for( int i = 0; i < imageSize; i++ ) { - for( int j = 0; j < imageSize; j++ ) { - maxvalue = std::max( maxvalue, image[i][j] ); + for(int i = 0; i < imageSize; i++) { + for(int j = 0; j < imageSize; j++) { + maxvalue = std::max(maxvalue, image[i][j]); } } return maxvalue; } - static float getImageMax( float const* image, int imageSize ) { + static float getImageMax(float const* image, int imageSize) { float maxvalue = 0; - for( int i = 0; i < imageSize; i++ ) { - for( int j = 0; j < imageSize; j++ ) { - maxvalue = std::max( maxvalue, image[i*imageSize + j] ); + for(int i = 0; i < imageSize; i++) { + for(int j = 0; j < imageSize; j++) { + maxvalue = std::max(maxvalue, image[i*imageSize + j]); } } return 
maxvalue; } - static float getImageMin( float const* image, int imageSize ) { + static float getImageMin(float const* image, int imageSize) { float minvalue = 0; - for( int i = 0; i < imageSize; i++ ) { - for( int j = 0; j < imageSize; j++ ) { - minvalue = std::min( minvalue, image[i*imageSize + j] ); + for(int i = 0; i < imageSize; i++) { + for(int j = 0; j < imageSize; j++) { + minvalue = std::min(minvalue, image[i*imageSize + j]); } } return minvalue; } -// static float getImageMax( unsigned char const* image, int imageSize ) { +// static float getImageMax(unsigned char const* image, int imageSize) { // float maxvalue = 0; -// for( int i = 0; i < imageSize; i++ ) { -// for( int j = 0; j < imageSize; j++ ) { -// maxvalue = std::max( maxvalue, image[i*imageSize + j] ); +// for(int i = 0; i < imageSize; i++) { +// for(int j = 0; j < imageSize; j++) { +// maxvalue = std::max(maxvalue, image[i*imageSize + j]); // } // } // return maxvalue; // } -// static float getImageMin( unsigned char const* image, int imageSize ) { +// static float getImageMin(unsigned char const* image, int imageSize) { // float minvalue = 0; -// for( int i = 0; i < imageSize; i++ ) { -// for( int j = 0; j < imageSize; j++ ) { -// minvalue = std::min( minvalue, image[i*imageSize + j] ); +// for(int i = 0; i < imageSize; i++) { +// for(int j = 0; j < imageSize; j++) { +// minvalue = std::min(minvalue, image[i*imageSize + j]); // } // } // return minvalue; // } - static void writeImageToPng( std::string filename, int **inimage, int imageSize ) { - int maxvalue = getImageMax( inimage, imageSize ); - png::image< png::rgb_pixel > *image = new png::image< png::rgb_pixel >( imageSize, imageSize ); - for( int i = 0; i < imageSize; i++ ) { - for( int j = 0; j < imageSize; j++ ) { - (*image)[i][j] = png::rgb_pixel( inimage[i][j] * 255 / maxvalue, inimage[i][j] * 255 / maxvalue, inimage[i][j] * 255 / maxvalue ); + static void writeImageToPng(std::string filename, int **inimage, int imageSize) { + int maxvalue = getImageMax(inimage, imageSize); + png::image< png::rgb_pixel > *image = new png::image< png::rgb_pixel >(imageSize, imageSize); + for(int i = 0; i < imageSize; i++) { + for(int j = 0; j < imageSize; j++) { + (*image)[i][j] = png::rgb_pixel(inimage[i][j] * 255 / maxvalue, inimage[i][j] * 255 / maxvalue, inimage[i][j] * 255 / maxvalue); } } - remove( filename.c_str() ); - image->write( filename ); + remove(filename.c_str()); + image->write(filename); delete image; } - static void writeImagesToPng( std::string filename, int ***images, int numImages, int imageSize ) { - int cols = sqrt( numImages ); - if( cols * cols < numImages ) { + static void writeImagesToPng(std::string filename, int ***images, int numImages, int imageSize) { + int cols = sqrt(numImages); + if(cols * cols < numImages) { cols++; } - int rows = ( numImages + cols - 1 ) / cols; + int rows = (numImages + cols - 1) / cols; std::cout << "numImages " << numImages << " rows " << rows << " cols " << cols << std::endl; - png::image< png::rgb_pixel > *image = new png::image< png::rgb_pixel >( imageSize * rows, imageSize * cols ); + png::image< png::rgb_pixel > *image = new png::image< png::rgb_pixel >(imageSize * rows, imageSize * cols); - for( int x = 0; x < cols; x++ ) { - for( int y = 0; y < rows; y++ ) { - if( x * rows + y >= numImages ) { + for(int x = 0; x < cols; x++) { + for(int y = 0; y < rows; y++) { + if(x * rows + y >= numImages) { continue; } // cout << "image at x " << x << " y " << y << endl; int **imagearray = images[x*rows + y]; - int maxvalue = 
std::max( 1, getImageMax( imagearray, imageSize ) ); - for( int i = 0; i < imageSize; i++ ) { - for( int j = 0; j < imageSize; j++ ) { - (*image)[x*imageSize + i][y*imageSize + j] = png::rgb_pixel( imagearray[i][j] * 255 / maxvalue, imagearray[i][j] * 255 / maxvalue, imagearray[i][j] * 255 / maxvalue ); + int maxvalue = std::max(1, getImageMax(imagearray, imageSize) ); + for(int i = 0; i < imageSize; i++) { + for(int j = 0; j < imageSize; j++) { + (*image)[x*imageSize + i][y*imageSize + j] = png::rgb_pixel(imagearray[i][j] * 255 / maxvalue, imagearray[i][j] * 255 / maxvalue, imagearray[i][j] * 255 / maxvalue); } } } } - remove( filename.c_str() ); - image->write( filename ); + remove(filename.c_str()); + image->write(filename); delete image; } - static void writeImagesToPng( std::string filename, float ***images, int numImages, int imageSize ) { - int cols = sqrt( numImages ); - if( cols * cols < numImages ) { + static void writeImagesToPng(std::string filename, float ***images, int numImages, int imageSize) { + int cols = sqrt(numImages); + if(cols * cols < numImages) { cols++; } - int rows = ( numImages + cols - 1 ) / cols; + int rows = (numImages + cols - 1) / cols; std::cout << "numImages " << numImages << " rows " << rows << " cols " << cols << std::endl; - png::image< png::rgb_pixel > *image = new png::image< png::rgb_pixel >( imageSize * rows, imageSize * cols ); + png::image< png::rgb_pixel > *image = new png::image< png::rgb_pixel >(imageSize * rows, imageSize * cols); - for( int x = 0; x < cols; x++ ) { - for( int y = 0; y < rows; y++ ) { - if( x * rows + y >= numImages ) { + for(int x = 0; x < cols; x++) { + for(int y = 0; y < rows; y++) { + if(x * rows + y >= numImages) { continue; } // cout << "image at x " << x << " y " << y << endl; float **imagearray = images[x*rows + y]; - float maxvalue = std::max( 1.0f, getImageMax( imagearray, imageSize ) ); - for( int i = 0; i < imageSize; i++ ) { - for( int j = 0; j < imageSize; j++ ) { - (*image)[x*imageSize + i][y*imageSize + j] = png::rgb_pixel( imagearray[i][j] * 255 / maxvalue, imagearray[i][j] * 255 / maxvalue, imagearray[i][j] * 255 / maxvalue ); + float maxvalue = std::max(1.0f, getImageMax(imagearray, imageSize) ); + for(int i = 0; i < imageSize; i++) { + for(int j = 0; j < imageSize; j++) { + (*image)[x*imageSize + i][y*imageSize + j] = png::rgb_pixel(imagearray[i][j] * 255 / maxvalue, imagearray[i][j] * 255 / maxvalue, imagearray[i][j] * 255 / maxvalue); } } } } - remove( filename.c_str() ); - image->write( filename ); + remove(filename.c_str()); + image->write(filename); delete image; } - static void writeImagesToPng( std::string filename, float const*images, int numImages, int imageSize ) { - int cols = sqrt( numImages ); - if( cols * cols < numImages ) { + static void writeImagesToPng(std::string filename, float const*images, int numImages, int imageSize) { + int cols = sqrt(numImages); + if(cols * cols < numImages) { cols++; } - int rows = ( numImages + cols - 1 ) / cols; + int rows = (numImages + cols - 1) / cols; std::cout << "numImages " << numImages << " rows " << rows << " cols " << cols << std::endl; - png::image< png::rgb_pixel > *image = new png::image< png::rgb_pixel >( imageSize * rows, imageSize * cols ); + png::image< png::rgb_pixel > *image = new png::image< png::rgb_pixel >(imageSize * rows, imageSize * cols); - for( int x = 0; x < cols; x++ ) { - for( int y = 0; y < rows; y++ ) { - if( x * rows + y >= numImages ) { + for(int x = 0; x < cols; x++) { + for(int y = 0; y < rows; y++) { + if(x * rows + y >= 
numImages) { continue; } // cout << "image at x " << x << " y " << y << endl; - float const*imagearray = &(images[imageSize * imageSize * ( x*rows + y ) ]); - float maxValue = getImageMax( imagearray, imageSize ); - float minValue = getImageMin( imagearray, imageSize ); - for( int i = 0; i < imageSize; i++ ) { - for( int j = 0; j < imageSize; j++ ) { - float normValue = ( imagearray[i*imageSize + j] + minValue ) * 255.0f / (maxValue - minValue ); - (*image)[x*imageSize + i][y*imageSize + j] = png::rgb_pixel( normValue, normValue, normValue ); + float const*imagearray = &(images[imageSize * imageSize * (x*rows + y) ]); + float maxValue = getImageMax(imagearray, imageSize); + float minValue = getImageMin(imagearray, imageSize); + for(int i = 0; i < imageSize; i++) { + for(int j = 0; j < imageSize; j++) { + float normValue = (imagearray[i*imageSize + j] + minValue) * 255.0f / (maxValue - minValue); + (*image)[x*imageSize + i][y*imageSize + j] = png::rgb_pixel(normValue, normValue, normValue); } } } } - remove( filename.c_str() ); - image->write( filename ); + remove(filename.c_str()); + image->write(filename); delete image; } - static void writeImagesToPng( std::string filename, unsigned char const*images, int numImages, int imageSize ) { - int cols = sqrt( numImages ); - if( cols * cols < numImages ) { + static void writeImagesToPng(std::string filename, unsigned char const*images, int numImages, int imageSize) { + int cols = sqrt(numImages); + if(cols * cols < numImages) { cols++; } - int rows = ( numImages + cols - 1 ) / cols; + int rows = (numImages + cols - 1) / cols; std::cout << "numImages " << numImages << " rows " << rows << " cols " << cols << std::endl; - png::image< png::rgb_pixel > *image = new png::image< png::rgb_pixel >( imageSize * rows, imageSize * cols ); + png::image< png::rgb_pixel > *image = new png::image< png::rgb_pixel >(imageSize * rows, imageSize * cols); - for( int x = 0; x < cols; x++ ) { - for( int y = 0; y < rows; y++ ) { - if( x * rows + y >= numImages ) { + for(int x = 0; x < cols; x++) { + for(int y = 0; y < rows; y++) { + if(x * rows + y >= numImages) { continue; } // cout << "image at x " << x << " y " << y << endl; - unsigned char const*imagearray = &(images[imageSize * imageSize * ( x*rows + y ) ]); - float maxValue = 255; // getImageMax( image, imageSize ); - float minValue = 0; // getImageMin( image, imageSize ); - for( int i = 0; i < imageSize; i++ ) { - for( int j = 0; j < imageSize; j++ ) { - float normValue = ( imagearray[i*imageSize + j] + minValue ) * 255.0f / (maxValue - minValue ); - (*image)[x*imageSize + i][y*imageSize + j] = png::rgb_pixel( normValue, normValue, normValue ); + unsigned char const*imagearray = &(images[imageSize * imageSize * (x*rows + y) ]); + float maxValue = 255; // getImageMax(image, imageSize); + float minValue = 0; // getImageMin(image, imageSize); + for(int i = 0; i < imageSize; i++) { + for(int j = 0; j < imageSize; j++) { + float normValue = (imagearray[i*imageSize + j] + minValue) * 255.0f / (maxValue - minValue); + (*image)[x*imageSize + i][y*imageSize + j] = png::rgb_pixel(normValue, normValue, normValue); } } } } - remove( filename.c_str() ); - image->write( filename ); + remove(filename.c_str()); + image->write(filename); delete image; } }; diff --git a/src/util/JpegHelper.cpp b/src/util/JpegHelper.cpp index 65156da3..f78b3293 100644 --- a/src/util/JpegHelper.cpp +++ b/src/util/JpegHelper.cpp @@ -21,17 +21,17 @@ using namespace std; #define STATIC #define VIRTUAL -PUBLIC STATIC void JpegHelper::write( std::string 
filename, int planes, int width, int height, unsigned char *values ) { +PUBLIC STATIC void JpegHelper::write(std::string filename, int planes, int width, int height, unsigned char *values) { unsigned char *image_buffer = new unsigned char[width * height * planes]; -// for( int i = 0 ; i < 28 *28 *3; i++ ) { +// for(int i = 0 ; i < 28 *28 *3; i++) { //= image_buffer[i] = i * 255 / 28 * 28 / 3; //image_buffer[i] = 128; // } - for( int row = 0; row < height; row++ ) { - for( int col = 0; col < width; col++ ) { - for( int plane = 0; plane < planes; plane++ ) { + for(int row = 0; row < height; row++) { + for(int col = 0; col < width; col++) { + for(int plane = 0; plane < planes; plane++) { image_buffer[row*width*planes + col*planes + plane] = values[plane*width*height + row*width + col]; -// if( ( y % 2 == 0 & x % 2 == 0 ) ) { +// if(( y % 2 == 0 & x % 2 == 0) ) { // image_buffer[x * 28 * 3 + y *3 + c] = 255; // } } @@ -46,7 +46,7 @@ PUBLIC STATIC void JpegHelper::write( std::string filename, int planes, int widt // string filename = "foo.jpeg"; FILE * outfile; if ((outfile = fopen(filename.c_str(), "wb")) == NULL) { - throw runtime_error( "can't open " + filename ); + throw runtime_error("can't open " + filename); } jpeg_stdio_dest(&cinfo, outfile); @@ -54,9 +54,9 @@ PUBLIC STATIC void JpegHelper::write( std::string filename, int planes, int widt cinfo.image_width = width; /* image width and height, in pixels */ cinfo.image_height = height; cinfo.input_components = planes; /* # of color components per pixel */ - if( planes == 3 ) { + if(planes == 3) { cinfo.in_color_space = JCS_RGB; /* colorspace of input image */ - } else if( planes == 1 ) { + } else if(planes == 1) { cinfo.in_color_space = JCS_GRAYSCALE; } else { throw runtime_error("num planes " + toString(planes) + " not handled"); @@ -83,7 +83,7 @@ PUBLIC STATIC void JpegHelper::write( std::string filename, int planes, int widt delete[] image_buffer; } -PUBLIC STATIC void JpegHelper::read( std::string filename, int planes, int width, int height, unsigned char *values ) { +PUBLIC STATIC void JpegHelper::read(std::string filename, int planes, int width, int height, unsigned char *values) { unsigned char *image_buffer = new unsigned char[width * height * planes]; struct jpeg_decompress_struct cinfo; @@ -94,26 +94,26 @@ PUBLIC STATIC void JpegHelper::read( std::string filename, int planes, int width // string filename = "foo.jpeg"; FILE * infile; if ((infile = fopen(filename.c_str(), "rb")) == NULL) { - throw runtime_error( "can't open " + filename ); + throw runtime_error("can't open " + filename); } jpeg_stdio_src(&cinfo, infile); jpeg_read_header(&cinfo, TRUE); jpeg_start_decompress(&cinfo); - if( (int)cinfo.output_width != width ) { + if((int)cinfo.output_width != width) { throw runtime_error("error reading " + filename + ":" + - " width is " + toString( cinfo.output_width ) + - " and not " + toString( width ) ); + " width is " + toString(cinfo.output_width) + + " and not " + toString(width) ); } - if( (int)cinfo.output_height != height ) { + if((int)cinfo.output_height != height) { throw runtime_error("error reading " + filename + ":" + - " height is " + toString( cinfo.output_height ) + - " and not " + toString( height ) ); + " height is " + toString(cinfo.output_height) + + " and not " + toString(height) ); } - if( (int)cinfo.output_components != planes ) { + if((int)cinfo.output_components != planes) { throw runtime_error("error reading " + filename + ":" + - " planes is " + toString( cinfo.output_components ) + - " and not " + 
toString( planes ) ); + " planes is " + toString(cinfo.output_components) + + " and not " + toString(planes) ); } JSAMPROW row_pointer[1]; /* pointer to a single row */ @@ -131,9 +131,9 @@ PUBLIC STATIC void JpegHelper::read( std::string filename, int planes, int width fclose(infile); - for( int row = 0; row < height; row++ ) { - for( int col = 0; col < width; col++ ) { - for( int plane = 0; plane < planes; plane++ ) { + for(int row = 0; row < height; row++) { + for(int col = 0; col < width; col++) { + for(int plane = 0; plane < planes; plane++) { values[plane*width*height + row*width + col] = image_buffer[row*width*planes + col*planes + plane]; } } diff --git a/src/util/JpegHelper.h b/src/util/JpegHelper.h index f35200ee..28bf4a82 100644 --- a/src/util/JpegHelper.h +++ b/src/util/JpegHelper.h @@ -22,8 +22,8 @@ class DeepCL_EXPORT JpegHelper { // generated, using cog: public: - STATIC void write( std::string filename, int planes, int width, int height, unsigned char *values ); - STATIC void read( std::string filename, int planes, int width, int height, unsigned char *values ); + STATIC void write(std::string filename, int planes, int width, int height, unsigned char *values); + STATIC void read(std::string filename, int planes, int width, int height, unsigned char *values); // [[[end]]] }; diff --git a/src/util/RandomSingleton.cpp b/src/util/RandomSingleton.cpp index 0f75a72f..8cd4366b 100644 --- a/src/util/RandomSingleton.cpp +++ b/src/util/RandomSingleton.cpp @@ -22,18 +22,18 @@ PUBLIC RandomSingleton::RandomSingleton() { #else { std::chrono::time_point<std::chrono::high_resolution_clock> thistime = std::chrono::high_resolution_clock::now(); - time = static_cast<int>( std::chrono::duration_cast<std::chrono::milliseconds> ( thistime.time_since_epoch() ).count() ); + time = static_cast<int>(std::chrono::duration_cast<std::chrono::milliseconds> (thistime.time_since_epoch()).count()); } #endif srand(time); - unsigned long seed = ( rand() << 8 ) + rand(); - myrandom.seed( seed ); + unsigned long seed = (rand() << 8) + rand(); + myrandom.seed(seed); } PUBLIC STATIC RandomSingleton *RandomSingleton::instance() { static RandomSingleton *thisinstance = new RandomSingleton(); return thisinstance; // assume single-threaded, which... we are :-) } -// void testingonly_setInstance( RandomSingleton *testInstance ) { +// void testingonly_setInstance(RandomSingleton *testInstance) { // _instance = testinstance; // } PUBLIC VIRTUAL float RandomSingleton::_uniform() { @@ -42,9 +42,9 @@ PUBLIC VIRTUAL float RandomSingleton::_uniform() { PUBLIC STATIC float RandomSingleton::uniform() { return instance()->_uniform(); } -PUBLIC STATIC int RandomSingleton::uniformInt( int minValueInclusive, int maxValueInclusive ) { - return ( instance()->myrandom() % - ( maxValueInclusive - minValueInclusive + 1 ) ) +PUBLIC STATIC int RandomSingleton::uniformInt(int minValueInclusive, int maxValueInclusive) { + return (instance()->myrandom() % + (maxValueInclusive - minValueInclusive + 1) ) + minValueInclusive; } diff --git a/src/util/RandomSingleton.h b/src/util/RandomSingleton.h index 7ce2106b..c171365c 100644 --- a/src/util/RandomSingleton.h +++ b/src/util/RandomSingleton.h @@ -35,11 +35,11 @@ class DeepCL_EXPORT RandomSingleton { private: // as long as myrandom stays as private, should be ok to disable the warnings I think?
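// note: C4251 here just means the MT19937 member has no dll-interface for clients
// of the exported class; since it stays private, suppressing it should be fine.
// a push/pop pair might be a little safer in an installed header than
// disable/default, since it restores whatever warning state the includer had,
// something like (a sketch only, not what the patch does):
//   #pragma warning(push)
//   #pragma warning(disable: 4251)
//   MT19937 myrandom;   // member that triggers C4251 on export
//   #pragma warning(pop)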
#ifdef _WIN32 - #pragma warning( disable: 4251 ) + #pragma warning(disable: 4251) #endif MT19937 myrandom; #ifdef _WIN32 - #pragma warning( default: 4251 ) + #pragma warning(default: 4251) #endif // [[[cog @@ -53,7 +53,7 @@ class DeepCL_EXPORT RandomSingleton { STATIC RandomSingleton *instance(); VIRTUAL float _uniform(); STATIC float uniform(); - STATIC int uniformInt( int minValueInclusive, int maxValueInclusive ); + STATIC int uniformInt(int minValueInclusive, int maxValueInclusive); // [[[end]]] }; diff --git a/src/util/StatefulTimer.h b/src/util/StatefulTimer.h deleted file mode 100644 index 388dd0a9..00000000 --- a/src/util/StatefulTimer.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright Hugh Perkins 2014 hughperkins at gmail -// -// This Source Code Form is subject to the terms of the Mozilla Public License, -// v. 2.0. If a copy of the MPL was not distributed with this file, You can -// obtain one at http://mozilla.org/MPL/2.0/. - -#pragma once - -#include <iostream> - -//#if (_MSC_VER == 1500 || _MSC_VER == 1600 ) // visual studio 2008 or 2010 -#ifdef _MSC_VER // make consistent across all msvc versions, so dont have to retest on different msvc versions... -#define WINNOCHRONO -//#include <ctime> -#define NOMINMAX // prevents errors compiling std::max and std::min -#include <Windows.h> -#else -#include <chrono> -#endif - -#include <map> -#include <string> -#include <sstream> - -#include "DeepCLDllExport.h" - -class StatefulTimer { -public: - static StatefulTimer *instance() { - static StatefulTimer *_instance = new StatefulTimer(); - return _instance; - } - #ifdef WINNOCHRONO - DWORD last; - #else - std::chrono::time_point<std::chrono::high_resolution_clock> last; - #endif - std::map< std::string, float > timeByState; - std::string prefix; // = ""; - StatefulTimer() : prefix("") { - #ifdef WINNOCHRONO - last = timeGetTime(); - #else - last = std::chrono::high_resolution_clock::now(); - #endif - } - ~StatefulTimer() { - std::cout << "StatefulTimer readings:" << std::endl; - for( std::map< std::string, float >::iterator it = timeByState.begin(); it != timeByState.end(); it++ ) { - std::cout << " " << it->first << ": " << it->second << std::endl; - } - } - void _dump(bool force = false) { - double totalTimings = 0; - for( std::map< std::string, float >::iterator it = timeByState.begin(); it != timeByState.end(); it++ ) { -// std::cout << " " << it->first << ": " << it->second << std::endl; - totalTimings += it->second; - } - if( !force && totalTimings < 800 ) { - return; - } - std::cout << "StatefulTimer readings:" << std::endl; - for( std::map< std::string, float >::iterator it = timeByState.begin(); it != timeByState.end(); it++ ) { - if( it->second > 0 ) { - std::cout << " " << it->first << ": " << it->second << "ms" << std::endl; - } - } - timeByState.clear(); - } - static void setPrefix( std::string _prefix ) { - instance()->prefix = _prefix; - } - static void dump(bool force = false) { - instance()->_dump(force); - } - static void timeCheck( std::string state ) { - instance()->_timeCheck( state ); - } - void _timeCheck( std::string state ) { - state = prefix + state; - #ifdef WINNOCHRONO - DWORD thistime = timeGetTime(); - DWORD timemilliseconds = thistime - last; - #else - std::chrono::time_point<std::chrono::high_resolution_clock> thistime = std::chrono::high_resolution_clock::now(); - std::chrono::duration<float> change = thistime - last; - float timemilliseconds = static_cast<float>( std::chrono::duration_cast<std::chrono::milliseconds> ( change ).count() ); - #endif -// if( timeByState.has_key( state ) ) { - timeByState[state] += timemilliseconds; -// } else { -// timeByState[state] = timemilliseconds; -// } - #ifdef WINNOCHRONO - last = thistime; -
#else - last = thistime; - #endif - } -}; - diff --git a/src/util/Timer.h b/src/util/Timer.h index ee16bc3e..28ddbe4c 100644 --- a/src/util/Timer.h +++ b/src/util/Timer.h @@ -8,7 +8,7 @@ #include <iostream> -//#if (_MSC_VER == 1500 || _MSC_VER == 1600 ) // visual studio 2008 or 2010 +//#if (_MSC_VER == 1500 || _MSC_VER == 1600) // visual studio 2008 or 2010 #ifdef _MSC_VER // make consistent across all msvc versions, so dont have to retest on different msvc versions... #define WINNOCHRONO #define NOMINMAX // prevents errors compiling std::max and std::min @@ -41,7 +41,7 @@ class DeepCL_EXPORT Timer{ // time_t thistime; // time(&thistime); // struct std::timeval tm; -// gettimeofday( &tm, NULL ); +// gettimeofday(&tm, NULL); // return (double)tm.tv_sec + (double)tm.tv_usec / 1000000.0; return timeGetTime(); } @@ -51,15 +51,15 @@ class DeepCL_EXPORT Timer{ } #endif - void timeCheck(std::string label ) { + void timeCheck(std::string label) { // #ifdef _WIN32 #ifdef WINNOCHRONO DWORD thistime = getCount(); - DWORD timemilliseconds = ( thistime - last ); + DWORD timemilliseconds = (thistime - last); #else std::chrono::time_point<std::chrono::high_resolution_clock> thistime = getCount(); std::chrono::duration<double> change = thistime - last; - double timemilliseconds = static_cast<double>( std::chrono::duration_cast<std::chrono::milliseconds> ( change ).count() ); + double timemilliseconds = static_cast<double>(std::chrono::duration_cast<std::chrono::milliseconds> (change).count()); #endif last = thistime; std::cout << label << " " << timemilliseconds << " ms" << std::endl; @@ -69,11 +69,11 @@ class DeepCL_EXPORT Timer{ // without updating 'last' #ifdef WINNOCHRONO DWORD thistime = getCount(); - DWORD timemilliseconds = ( thistime - last ); + DWORD timemilliseconds = (thistime - last); #else std::chrono::time_point<std::chrono::high_resolution_clock> thistime = getCount(); std::chrono::duration<double> change = thistime - last; - double timemilliseconds = static_cast<double>( std::chrono::duration_cast<std::chrono::milliseconds> ( change ).count() ); + double timemilliseconds = static_cast<double>(std::chrono::duration_cast<std::chrono::milliseconds> (change).count()); #endif return timemilliseconds; } @@ -82,11 +82,11 @@ class DeepCL_EXPORT Timer{ // #ifdef _WIN32 #ifdef WINNOCHRONO DWORD thistime = getCount(); - DWORD timemilliseconds = ( thistime - last ); + DWORD timemilliseconds = (thistime - last); #else std::chrono::time_point<std::chrono::high_resolution_clock> thistime = getCount(); std::chrono::duration<double> change = thistime - last; - double timemilliseconds = static_cast<double>( std::chrono::duration_cast<std::chrono::milliseconds> ( change ).count() ); + double timemilliseconds = static_cast<double>(std::chrono::duration_cast<std::chrono::milliseconds> (change).count()); #endif last = thistime; return timemilliseconds; diff --git a/src/util/stringhelper.cpp b/src/util/stringhelper.cpp index 98300c66..7d9c01dc 100644 --- a/src/util/stringhelper.cpp +++ b/src/util/stringhelper.cpp @@ -11,67 +11,67 @@ using namespace std; #include "util/stringhelper.h" -vector<string> split(const string &str, const string &separator ) { +vector<string> split(const string &str, const string &separator) { vector<string> splitstring; int start = 0; int npos = (int)str.find(separator); - while (npos != (int)str.npos ) { - splitstring.push_back( str.substr(start, npos-start) ); + while (npos != (int)str.npos) { + splitstring.push_back(str.substr(start, npos-start)); start = npos + (int)separator.length(); npos = (int)str.find(separator, start); } - splitstring.push_back( str.substr( start ) ); + splitstring.push_back(str.substr(start) ); return splitstring; } -string trim( const string &target ) { +string trim(const string &target) { int origlen = (int)target.size(); int startpos = -1; - for( int i = 0; i < origlen; i++ ) { - if( target[i] != ' ' && target[i] != '\r' &&
target[i] != '\n' ) { + for(int i = 0; i < origlen; i++) { + if(target[i] != ' ' && target[i] != '\r' && target[i] != '\n') { startpos = i; break; } } int endpos = -1; - for( int i = origlen - 1; i >= 0; i-- ) { - if( target[i] != ' ' && target[i] != '\r' && target[i] != '\n' ) { + for(int i = origlen - 1; i >= 0; i--) { + if(target[i] != ' ' && target[i] != '\r' && target[i] != '\n') { endpos = i; break; } } - if( startpos == -1 || endpos == -1 ) { + if(startpos == -1 || endpos == -1) { return ""; } - return target.substr(startpos, endpos-startpos + 1 ); + return target.substr(startpos, endpos-startpos + 1); } -string replace( string targetString, string oldValue, string newValue ) { - size_t pos = targetString.find( oldValue ); - if( pos == string::npos ) { +string replace(string targetString, string oldValue, string newValue) { + size_t pos = targetString.find(oldValue); + if(pos == string::npos) { return targetString; } - return targetString.replace( pos, oldValue.length(), newValue ); + return targetString.replace(pos, oldValue.length(), newValue); } -string replaceGlobal( string targetString, string oldValue, string newValue ) { +string replaceGlobal(string targetString, string oldValue, string newValue) { int pos = 0; string resultString = ""; - size_t targetPos = targetString.find( oldValue, pos ); - while( targetPos != string::npos ) { - string preOld = targetString.substr( pos, targetPos - pos ); + size_t targetPos = targetString.find(oldValue, pos); + while(targetPos != string::npos) { + string preOld = targetString.substr(pos, targetPos - pos); resultString += preOld + newValue; pos = targetPos + oldValue.length(); - targetPos = targetString.find( oldValue, pos ); + targetPos = targetString.find(oldValue, pos); } resultString += targetString.substr(pos); return resultString; } -std::string toLower(std::string in ) { - int len = static_cast<int>( in.size() ); +std::string toLower(std::string in) { + int len = static_cast<int>(in.size()); char *buffer = new char[len + 1]; - for( int i = 0; i < len; i++ ) { + for(int i = 0; i < len; i++) { char thischar = in[i]; thischar = tolower(thischar); buffer[i] = thischar; @@ -82,11 +82,11 @@ std::string toLower(std::string in ) { return result; } -void strcpy_safe( char *destination, char const*source, int maxLength ) { +void strcpy_safe(char *destination, char const*source, int maxLength) { int i = 0; - for( i = 0; i < maxLength; i++ ) { + for(i = 0; i < maxLength; i++) { destination[i] = source[i]; - if( source[i] == 0 ) { + if(source[i] == 0) { break; } } diff --git a/src/util/stringhelper.h b/src/util/stringhelper.h index c30f7cfc..d7f58346 100644 --- a/src/util/stringhelper.h +++ b/src/util/stringhelper.h @@ -19,43 +19,43 @@ class IHasToString { virtual std::string toString() = 0; }; -//std::string toString( IHasToString *val ); // { // not terribly efficient, but works... +//std::string toString(IHasToString *val); // { // not terribly efficient, but works... // std::ostringstream myostringstream; // myostringstream << val->toString(); // return myostringstream.str(); //} template<typename T> -std::string toString(T val ) { // not terribly efficient, but works... +std::string toString(T val) { // not terribly efficient, but works...
std::ostringstream myostringstream; myostringstream << val; return myostringstream.str(); } -std::vector<std::string> split(const std::string &str, const std::string &separator = " " ); -std::string trim( const std::string &target ); +std::vector<std::string> split(const std::string &str, const std::string &separator = " "); +std::string trim(const std::string &target); -inline float atof( std::string stringvalue ) { +inline float atof(std::string stringvalue) { return (float)std::atof(stringvalue.c_str()); } -inline int atoi( std::string stringvalue ) { +inline int atoi(std::string stringvalue) { return std::atoi(stringvalue.c_str()); } // returns empty string if off the end of the number of available tokens -inline std::string getToken( std::string targetstring, int tokenIndexFromZero, std::string separator = " " ) { - std::vector<std::string> splitstring = split( targetstring, separator ); - if( tokenIndexFromZero < (int)splitstring.size() ) { +inline std::string getToken(std::string targetstring, int tokenIndexFromZero, std::string separator = " ") { + std::vector<std::string> splitstring = split(targetstring, separator); + if(tokenIndexFromZero < (int)splitstring.size()) { return splitstring[tokenIndexFromZero]; } else { return ""; } } -std::string replace( std::string targetString, std::string oldValue, std::string newValue ); -std::string replaceGlobal( std::string targetString, std::string oldValue, std::string newValue ); +std::string replace(std::string targetString, std::string oldValue, std::string newValue); +std::string replaceGlobal(std::string targetString, std::string oldValue, std::string newValue); -std::string toLower(std::string in ); +std::string toLower(std::string in); -void strcpy_safe( char *destination, char const*source, int maxLength ); +void strcpy_safe(char *destination, char const*source, int maxLength); diff --git a/src/weights/OriginalInitializer.cpp b/src/weights/OriginalInitializer.cpp index 17e20d13..e6feb2e3 100644 --- a/src/weights/OriginalInitializer.cpp +++ b/src/weights/OriginalInitializer.cpp @@ -16,19 +16,19 @@ using namespace std; #define STATIC #define VIRTUAL -VIRTUAL void OriginalInitializer::initializeWeights( int numWeights, float *weights, int fanin ) { +VIRTUAL void OriginalInitializer::initializeWeights(int numWeights, float *weights, int fanin) { float rangesize = sqrt(12.0f / (float)fanin) ; - for( int i = 0; i < numWeights; i++ ) { + for(int i = 0; i < numWeights; i++) { float uniformrand = RandomSingleton::uniform(); - float weight = rangesize * ( uniformrand - 0.5f ); + float weight = rangesize * (uniformrand - 0.5f); weights[i] = weight; } } -VIRTUAL void OriginalInitializer::initializeBias( int numBias, float *bias, int fanin ) { +VIRTUAL void OriginalInitializer::initializeBias(int numBias, float *bias, int fanin) { float rangesize = sqrt(12.0f / (float)fanin) ; - for( int i = 0; i < numBias; i++ ) { + for(int i = 0; i < numBias; i++) { float uniformrand = RandomSingleton::uniform(); - float weight = rangesize * ( uniformrand - 0.5f ); + float weight = rangesize * (uniformrand - 0.5f); bias[i] = weight; } } diff --git a/src/weights/OriginalInitializer.h b/src/weights/OriginalInitializer.h index 1a09c357..d9a4fa45 100644 --- a/src/weights/OriginalInitializer.h +++ b/src/weights/OriginalInitializer.h @@ -27,8 +27,8 @@ class DeepCL_EXPORT OriginalInitializer : public WeightsInitializer { // cog_addheaders.add() // ]]] // generated, using cog: - VIRTUAL void initializeWeights( int numWeights, float *weights, int fanin ); - VIRTUAL void initializeBias( int numBias, float *bias, int
fanin ); + VIRTUAL void initializeWeights(int numWeights, float *weights, int fanin); + VIRTUAL void initializeBias(int numBias, float *bias, int fanin); // [[[end]]] }; diff --git a/src/weights/UniformInitializer.cpp b/src/weights/UniformInitializer.cpp index 0e6c9f27..09d9fb7b 100644 --- a/src/weights/UniformInitializer.cpp +++ b/src/weights/UniformInitializer.cpp @@ -16,19 +16,19 @@ using namespace std; #define STATIC #define VIRTUAL -UniformInitializer::UniformInitializer( float multiplier ) { +UniformInitializer::UniformInitializer(float multiplier) { this->multiplier = multiplier; } -VIRTUAL void UniformInitializer::initializeWeights( int numWeights, float *weights, int fanin ) { +VIRTUAL void UniformInitializer::initializeWeights(int numWeights, float *weights, int fanin) { float range = multiplier / (float)fanin; - for( int i = 0; i < numWeights; i++ ) { + for(int i = 0; i < numWeights; i++) { float uniformrand = RandomSingleton::uniform() * 2.0f - 1.0f; weights[i] = range * uniformrand; } } -VIRTUAL void UniformInitializer::initializeBias( int numBias, float *bias, int fanin ) { +VIRTUAL void UniformInitializer::initializeBias(int numBias, float *bias, int fanin) { float range = multiplier / (float)fanin; - for( int i = 0; i < numBias; i++ ) { + for(int i = 0; i < numBias; i++) { float uniformrand = RandomSingleton::uniform() * 2.0f - 1.0f; bias[i] = range * uniformrand; } diff --git a/src/weights/UniformInitializer.h b/src/weights/UniformInitializer.h index 80e7fbd5..12a7e93e 100644 --- a/src/weights/UniformInitializer.h +++ b/src/weights/UniformInitializer.h @@ -19,7 +19,7 @@ #define STATIC static // idea of this is that it will assign random floats uniformly sampled -// in range ( - multiplier / fanin ) to ( + multiplier / fanin ) +// in range (- multiplier / fanin) to (+ multiplier / fanin) class DeepCL_EXPORT UniformInitializer : public WeightsInitializer { public: float multiplier; @@ -29,9 +29,9 @@ class DeepCL_EXPORT UniformInitializer : public WeightsInitializer { // cog_addheaders.add() // ]]] // generated, using cog: - UniformInitializer( float multiplier ); - VIRTUAL void initializeWeights( int numWeights, float *weights, int fanin ); - VIRTUAL void initializeBias( int numBias, float *bias, int fanin ); + UniformInitializer(float multiplier); + VIRTUAL void initializeWeights(int numWeights, float *weights, int fanin); + VIRTUAL void initializeBias(int numBias, float *bias, int fanin); // [[[end]]] }; diff --git a/src/weights/WeightsHelper.h b/src/weights/WeightsHelper.h index a8a6198a..60542291 100644 --- a/src/weights/WeightsHelper.h +++ b/src/weights/WeightsHelper.h @@ -8,7 +8,7 @@ class WeightsHelper { public: - static inline float generateWeight( float rangesize ) { + static inline float generateWeight(float rangesize) { // float rangesize = sqrt(12.0f / (float)fanin) ; // float uniformrand = random() / (float)random.max(); float signeduniformrand = RandomSingleton::uniform() * 2.0f - 1.0f; diff --git a/src/weights/WeightsInitializer.h b/src/weights/WeightsInitializer.h index 16a238a5..6e20dd1d 100644 --- a/src/weights/WeightsInitializer.h +++ b/src/weights/WeightsInitializer.h @@ -18,8 +18,8 @@ class DeepCL_EXPORT WeightsInitializer { public: - virtual void initializeWeights( int numWeights, float *weights, int fanin ) = 0; - virtual void initializeBias( int numBias, float *bias, int fanin ) = 0; + virtual void initializeWeights(int numWeights, float *weights, int fanin) = 0; + virtual void initializeBias(int numBias, float *bias, int fanin) = 0; virtual 
~WeightsInitializer() { } diff --git a/src/weights/WeightsPersister.cpp b/src/weights/WeightsPersister.cpp index 8c988e18..87127d7b 100644 --- a/src/weights/WeightsPersister.cpp +++ b/src/weights/WeightsPersister.cpp @@ -19,59 +19,59 @@ using namespace std; #undef STATIC #define STATIC -template< typename T > STATIC void WeightsPersister::copyArray( T *dst, T const*src, int length ) { // this might already be in standard C++ library? - memcpy( dst, src, length * sizeof(T) ); +template< typename T > STATIC void WeightsPersister::copyArray(T *dst, T const*src, int length) { // this might already be in standard C++ library? + memcpy(dst, src, length * sizeof(T)); } -STATIC int WeightsPersister::getTotalNumWeights( NeuralNet *net ) { - return getTotalNumWeights( latestVersion, net ); +STATIC int WeightsPersister::getTotalNumWeights(NeuralNet *net) { + return getTotalNumWeights(latestVersion, net); } -STATIC int WeightsPersister::getTotalNumWeights( int version, NeuralNet *net ) { +STATIC int WeightsPersister::getTotalNumWeights(int version, NeuralNet *net) { int totalWeightsSize = 0; // cout << "layers size " << net->layers.size() << endl; - for( int layerIdx = 1; layerIdx < net->getNumLayers(); layerIdx++ ) { - Layer *layer = net->getLayer( layerIdx ); - int thisPersistSize = layer->getPersistSize( version ); + for(int layerIdx = 1; layerIdx < net->getNumLayers(); layerIdx++) { + Layer *layer = net->getLayer(layerIdx); + int thisPersistSize = layer->getPersistSize(version); // cout << "layer " << layerIdx << " this persist size " << thisPersistSize << endl; totalWeightsSize += thisPersistSize; } return totalWeightsSize; } -STATIC void WeightsPersister::copyNetWeightsToArray( NeuralNet *net, float *target ) { - copyNetWeightsToArray( latestVersion, net, target ); +STATIC void WeightsPersister::copyNetWeightsToArray(NeuralNet *net, float *target) { + copyNetWeightsToArray(latestVersion, net, target); } -STATIC void WeightsPersister::copyNetWeightsToArray( int version, NeuralNet *net, float *target ) { +STATIC void WeightsPersister::copyNetWeightsToArray(int version, NeuralNet *net, float *target) { int pos = 0; - for( int layerIdx = 1; layerIdx < net->getNumLayers(); layerIdx++ ) { - Layer *layer = net->getLayer( layerIdx ); - int persistSize = layer->getPersistSize( version ); - if( persistSize > 0 ) { - layer->persistToArray( version, &(target[pos]) ); + for(int layerIdx = 1; layerIdx < net->getNumLayers(); layerIdx++) { + Layer *layer = net->getLayer(layerIdx); + int persistSize = layer->getPersistSize(version); + if(persistSize > 0) { + layer->persistToArray(version, &(target[pos])); } pos += persistSize; } } -STATIC void WeightsPersister::copyArrayToNetWeights( float const*source, NeuralNet *net ) { - copyArrayToNetWeights( latestVersion, source, net ); +STATIC void WeightsPersister::copyArrayToNetWeights(float const*source, NeuralNet *net) { + copyArrayToNetWeights(latestVersion, source, net); } -STATIC void WeightsPersister::copyArrayToNetWeights( int version, float const*source, NeuralNet *net ) { +STATIC void WeightsPersister::copyArrayToNetWeights(int version, float const*source, NeuralNet *net) { int pos = 0; - for( int layerIdx = 1; layerIdx < net->getNumLayers(); layerIdx++ ) { - Layer *layer = net->getLayer( layerIdx ); - int persistSize = layer->getPersistSize( version ); - if( persistSize > 0 ) { - layer->unpersistFromArray( version, &(source[pos]) ); + for(int layerIdx = 1; layerIdx < net->getNumLayers(); layerIdx++) { + Layer *layer = net->getLayer(layerIdx); + int persistSize 
= layer->getPersistSize(version); + if(persistSize > 0) { + layer->unpersistFromArray(version, &(source[pos])); } pos += persistSize; } } -STATIC int WeightsPersister::getArrayOffsetForLayer( NeuralNet *net, int layer ) { - return getArrayOffsetForLayer( latestVersion, net, layer ); +STATIC int WeightsPersister::getArrayOffsetForLayer(NeuralNet *net, int layer) { + return getArrayOffsetForLayer(latestVersion, net, layer); } -STATIC int WeightsPersister::getArrayOffsetForLayer( int version, NeuralNet *net, int layer ) { +STATIC int WeightsPersister::getArrayOffsetForLayer(int version, NeuralNet *net, int layer) { int pos = 0; - for( int layerIdx = 1; layerIdx < layer; layerIdx++ ) { - Layer *layer = net->getLayer( layerIdx ); - pos += layer->getPersistSize( version ); + for(int layerIdx = 1; layerIdx < layer; layerIdx++) { + Layer *layer = net->getLayer(layerIdx); + pos += layer->getPersistSize(version); } return pos; } @@ -80,55 +80,55 @@ STATIC int WeightsPersister::getArrayOffsetForLayer( int version, NeuralNet *net // the machine fails right in between the 'delete' and the 'rename', but you // should ideally never actually lose the weights file (unless the drive itself // fails of course...) -STATIC void WeightsPersister::persistWeights( std::string filepath, std::string trainingConfigString, NeuralNet *net, int epoch, int batch, float annealedLearningRate, int numRight, float loss ) { // we should probably rename 'weights' to 'model' now that we are storing normalization data too? +STATIC void WeightsPersister::persistWeights(std::string filepath, std::string trainingConfigString, NeuralNet *net, int epoch, int batch, float annealedLearningRate, int numRight, float loss) { // we should probably rename 'weights' to 'model' now that we are storing normalization data too? 
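// note: for orientation, the layout the code below writes into the 1024-byte
// header, read off the assignments that follow (a sketch, not a formal spec):
//   bytes 0-3      magic "ClCn"
//   int[1]         file format version (latestVersion)
//   int[2]         epoch
//   int[3]         batch
//   int[4]         numRight
//   float[5]       loss
//   float[6]       annealedLearningRate
//   bytes 28..827  trainingConfigString, null-terminated, max 800 chars
//   byte 1024 on   the weights themselves, one layer after another
// writing to filepath + "~" first, then remove + rename, is what makes the save
// near-atomic: the old file only disappears once the new one is fully on disk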
int headerLength = 1024; - int totalWeightsSize = getTotalNumWeights( latestVersion, net ); + int totalWeightsSize = getTotalNumWeights(latestVersion, net); char *persistArray = new char[headerLength + totalWeightsSize * sizeof(float) ]; int *persistArrayInts = reinterpret_cast<int *>(persistArray); float *persistArrayFloats = reinterpret_cast<float *>(persistArray); - strcpy_safe( persistArray, "ClCn", 4 ); // so easy to recognise file type + strcpy_safe(persistArray, "ClCn", 4); // so easy to recognise file type persistArrayInts[1] = latestVersion; // data file version number persistArrayInts[2] = epoch; persistArrayInts[3] = batch; persistArrayInts[4] = numRight; persistArrayFloats[5] = loss; persistArrayFloats[6] = annealedLearningRate; - strcpy_safe( persistArray + 7 * 4, trainingConfigString.c_str(), 800 ); - copyNetWeightsToArray( latestVersion, net, reinterpret_cast<float *>(persistArray + headerLength) ); - FileHelper::writeBinary( filepath + "~", reinterpret_cast<char *>(persistArray), - headerLength + totalWeightsSize * sizeof(float) ); - FileHelper::remove( filepath ); - FileHelper::rename( filepath + "~", filepath ); - std::cout << "wrote weights to file, filesize " << ( ( headerLength + totalWeightsSize ) *sizeof(float)/1024) << "KB" << std::endl; + strcpy_safe(persistArray + 7 * 4, trainingConfigString.c_str(), 800); + copyNetWeightsToArray(latestVersion, net, reinterpret_cast<float *>(persistArray + headerLength)); + FileHelper::writeBinary(filepath + "~", reinterpret_cast<char *>(persistArray), + headerLength + totalWeightsSize * sizeof(float)); + FileHelper::remove(filepath); + FileHelper::rename(filepath + "~", filepath); + std::cout << "wrote weights to file, filesize " << (( headerLength + totalWeightsSize) *sizeof(float)/1024) << "KB" << std::endl; delete[] persistArray; } -STATIC bool WeightsPersister::loadWeights( std::string filepath, std::string trainingConfigString, NeuralNet *net, int *p_epoch, int *p_batch, float *p_annealedLearningRate, int *p_numRight, float *p_loss ) { - if( FileHelper::exists( filepath ) ){ +STATIC bool WeightsPersister::loadWeights(std::string filepath, std::string trainingConfigString, NeuralNet *net, int *p_epoch, int *p_batch, float *p_annealedLearningRate, int *p_numRight, float *p_loss) { + if(FileHelper::exists(filepath) ){ int headerSize = 1024; long fileSize; - char * data = FileHelper::readBinary( filepath, &fileSize ); + char * data = FileHelper::readBinary(filepath, &fileSize); - if( !checkData( data, headerSize, fileSize ) ){ + if(!checkData(data, headerSize, fileSize) ){ delete [] data; return false; } int *dataAsInts = reinterpret_cast<int *>(data); int version = dataAsInts[1]; - if( version == 1 || version == 3 ) { - return loadWeightsv1or3( data, fileSize, trainingConfigString, net, p_epoch, p_batch, p_annealedLearningRate, p_numRight, p_loss ); + if(version == 1 || version == 3) { + return loadWeightsv1or3(data, fileSize, trainingConfigString, net, p_epoch, p_batch, p_annealedLearningRate, p_numRight, p_loss); } else { - throw std::runtime_error( "weights version " + toString( version ) + " not recognized" ); + throw std::runtime_error("weights version " + toString(version) + " not recognized"); } } return false; } -STATIC bool WeightsPersister::loadWeightsv1or3( char *data, long fileSize, std::string trainingConfigString, NeuralNet *net, int *p_epoch, int
*p_batch, float *p_annealedLearningRate, int *p_numRight, float *p_loss ) { +STATIC bool WeightsPersister::loadWeightsv1or3(char *data, long fileSize, std::string trainingConfigString, NeuralNet *net, int *p_epoch, int *p_batch, float *p_annealedLearningRate, int *p_numRight, float *p_loss) { int headerSize = 1024; data[headerSize - 1] = 0; // null-terminate the string, if not already done - if( trainingConfigString != std::string(data + 7 * 4) ) { + if(trainingConfigString != std::string(data + 7 * 4)) { std::cout << "training options dont match weights file" << std::endl; - std::cout << "in file: [" + std::string( data + 7 * 4 ) + "]" << std::endl; + std::cout << "in file: [" + std::string(data + 7 * 4) + "]" << std::endl; std::cout << "current options: [" + trainingConfigString + "]" << std::endl; delete [] data; @@ -139,56 +139,56 @@ STATIC bool WeightsPersister::loadWeightsv1or3( char *data, long fileSize, std:: float *dataAsFloats = reinterpret_cast<float *>(data); float *allWeightsArray = reinterpret_cast<float *>(data + headerSize); int version = dataAsInts[1]; - if( version == 1 || version == 3 ) { + if(version == 1 || version == 3) { *p_epoch = dataAsInts[2]; *p_batch = dataAsInts[3]; *p_numRight = dataAsInts[4]; *p_loss = dataAsFloats[5]; *p_annealedLearningRate = dataAsFloats[6]; } else { - throw runtime_error("Unrecognized version " + toString( version ) ); + throw runtime_error("Unrecognized version " + toString(version) ); } // std::cout << "read weights from file " << (fileSize/1024) << "KB" << std::endl; - int expectedTotalWeightsSize = getTotalNumWeights( version, net ); - int numFloatsRead = ( fileSize - headerSize ) / sizeof( float ); + int expectedTotalWeightsSize = getTotalNumWeights(version, net); + int numFloatsRead = (fileSize - headerSize) / sizeof(float); - if( expectedTotalWeightsSize != numFloatsRead ) { + if(expectedTotalWeightsSize != numFloatsRead) { delete [] data; - throw std::runtime_error("weights file contains " + toString(numFloatsRead) + " floats, but we expect to see: " + toString( expectedTotalWeightsSize ) + ". So there is probably some mismatch between the weights file, and the settings, or network version, used." ); + throw std::runtime_error("weights file contains " + toString(numFloatsRead) + " floats, but we expect to see: " + toString(expectedTotalWeightsSize) + ".
So there is probably some mismatch between the weights file, and the settings, or network version, used."); } - copyArrayToNetWeights( version, allWeightsArray, net ); + copyArrayToNetWeights(version, allWeightsArray, net); delete [] data; return true; } -STATIC bool WeightsPersister::checkData( const char * data, long headerSize, long fileSize ) { - if( fileSize < headerSize ) { +STATIC bool WeightsPersister::checkData(const char * data, long headerSize, long fileSize) { + if(fileSize < headerSize) { std::cout << "weights file has invalid size" << std::endl; return false; } - if( data[0] != 'C' || data[1] != 'l' || data[2] != 'C' || data[3] != 'n' ) { + if(data[0] != 'C' || data[1] != 'l' || data[2] != 'C' || data[3] != 'n') { std::cout << "weights file not ClConvolve format" << std::endl; return false; } const int *dataAsInts = reinterpret_cast<const int *>(data); - if( dataAsInts[1] != 1 && dataAsInts[1] != 3 ) { + if(dataAsInts[1] != 1 && dataAsInts[1] != 3) { std::cout << "weights file version not known" << std::endl; return false; } return true; } -STATIC bool WeightsPersister::loadConfigString( std::string filepath, std::string & configString ) { - if( FileHelper::exists( filepath ) ){ +STATIC bool WeightsPersister::loadConfigString(std::string filepath, std::string & configString) { + if(FileHelper::exists(filepath) ){ int headerSize = 1024; long fileSize; - char * data = FileHelper::readBinary( filepath, &fileSize ); + char * data = FileHelper::readBinary(filepath, &fileSize); - if( !checkData( data, headerSize, fileSize ) ) { + if(!checkData(data, headerSize, fileSize) ) { delete [] data; return false; } @@ -198,10 +198,10 @@ STATIC bool WeightsPersister::loadConfigString( std::string filepath, std::strin // + skip the 'netdef=' const int *dataAsInts = reinterpret_cast<const int *>(data); int version = dataAsInts[1]; - if( version == 1 || version == 3 ) { - configString = std::string( data + 7 * 4 + 7 ); + if(version == 1 || version == 3) { + configString = std::string(data + 7 * 4 + 7); } else { - throw std::runtime_error( "unknown versoin " + toString(version) ); + throw std::runtime_error("unknown version " + toString(version)); } delete [] data; diff --git a/src/weights/WeightsPersister.h b/src/weights/WeightsPersister.h index 6a1b95e9..f2aef219 100644 --- a/src/weights/WeightsPersister.h +++ b/src/weights/WeightsPersister.h @@ -36,20 +36,20 @@ class DeepCL_EXPORT WeightsPersister { // cog_addheaders.add() // ]]] // generated, using cog: - template< typename T > STATIC void copyArray( T *dst, T const*src, int length ); // this might already be in standard C++ library? - STATIC int getTotalNumWeights( NeuralNet *net ); - STATIC int getTotalNumWeights( int version, NeuralNet *net ); - STATIC void copyNetWeightsToArray( NeuralNet *net, float *target ); - STATIC void copyNetWeightsToArray( int version, NeuralNet *net, float *target ); - STATIC void copyArrayToNetWeights( float const*source, NeuralNet *net ); - STATIC void copyArrayToNetWeights( int version, float const*source, NeuralNet *net ); - STATIC int getArrayOffsetForLayer( NeuralNet *net, int layer ); - STATIC int getArrayOffsetForLayer( int version, NeuralNet *net, int layer ); - STATIC void persistWeights( std::string filepath, std::string trainingConfigString, NeuralNet *net, int epoch, int batch, float annealedLearningRate, int numRight, float loss ); // we should probably rename 'weights' to 'model' now that we are storing normalization data too?
- STATIC bool loadWeights( std::string filepath, std::string trainingConfigString, NeuralNet *net, int *p_epoch, int *p_batch, float *p_annealedLearningRate, int *p_numRight, float *p_loss ); - STATIC bool loadWeightsv1or3( char *data, long fileSize, std::string trainingConfigString, NeuralNet *net, int *p_epoch, int *p_batch, float *p_annealedLearningRate, int *p_numRight, float *p_loss ); - STATIC bool checkData( const char * data, long headerSize, long fileSize ); - STATIC bool loadConfigString( std::string filepath, std::string & configString ); + template< typename T > STATIC void copyArray(T *dst, T const*src, int length); // this might already be in standard C++ library? + STATIC int getTotalNumWeights(NeuralNet *net); + STATIC int getTotalNumWeights(int version, NeuralNet *net); + STATIC void copyNetWeightsToArray(NeuralNet *net, float *target); + STATIC void copyNetWeightsToArray(int version, NeuralNet *net, float *target); + STATIC void copyArrayToNetWeights(float const*source, NeuralNet *net); + STATIC void copyArrayToNetWeights(int version, float const*source, NeuralNet *net); + STATIC int getArrayOffsetForLayer(NeuralNet *net, int layer); + STATIC int getArrayOffsetForLayer(int version, NeuralNet *net, int layer); + STATIC void persistWeights(std::string filepath, std::string trainingConfigString, NeuralNet *net, int epoch, int batch, float annealedLearningRate, int numRight, float loss); // we should probably rename 'weights' to 'model' now that we are storing normalization data too? + STATIC bool loadWeights(std::string filepath, std::string trainingConfigString, NeuralNet *net, int *p_epoch, int *p_batch, float *p_annealedLearningRate, int *p_numRight, float *p_loss); + STATIC bool loadWeightsv1or3(char *data, long fileSize, std::string trainingConfigString, NeuralNet *net, int *p_epoch, int *p_batch, float *p_annealedLearningRate, int *p_numRight, float *p_loss); + STATIC bool checkData(const char * data, long headerSize, long fileSize); + STATIC bool loadConfigString(std::string filepath, std::string & configString); // [[[end]]] }; diff --git a/test/DimFromArgs.cpp b/test/DimFromArgs.cpp index a6719dc7..e45e6f5e 100644 --- a/test/DimFromArgs.cpp +++ b/test/DimFromArgs.cpp @@ -18,7 +18,7 @@ using namespace std; void DimFromArgs::arg( LayerDimensions *p_dim ) { TestArgsParser::arg( "inputplanes", &(p_dim->inputPlanes) ); TestArgsParser::arg( "numinputplanes", &(p_dim->inputPlanes) ); - TestArgsParser::arg( "inputimagesize", &(p_dim->inputImageSize) ); + TestArgsParser::arg( "inputsize", &(p_dim->inputSize) ); TestArgsParser::arg( "numfilters", &(p_dim->numFilters) ); TestArgsParser::arg( "filtersize", &(p_dim->filterSize) ); TestArgsParser::arg( "padzeros", &(p_dim->padZeros) ); diff --git a/test/mnist-to-floats.cpp b/test/mnist-to-floats.cpp index 29a919c9..2ed24585 100644 --- a/test/mnist-to-floats.cpp +++ b/test/mnist-to-floats.cpp @@ -25,11 +25,11 @@ int main( int argc, char *argv[] ) { int numExamples = atoi(argv[3]); int N, planes, size; - GenericLoader::getDimensions( mnistImagesFile, &N, &planes, &size ); + GenericLoader::getDimensions( mnistImagesFile.c_str(), &N, &planes, &size ); float *imageData = new float[ N * planes * size * size ]; int *labels = new int[N]; // we'll just throw this away, but it keeps the genericloader happy // probably want an option to not load this actually... 
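// note: GenericLoader's getDimensions and load now take char const* rather than
// std::string, hence the .c_str() calls below; plain C strings keep std::string
// out of the exported interface, which avoids runtime/allocator mismatches across
// the DLL boundary. assumed signatures, sketched from these call sites only:
//   static void getDimensions(char const *imagesFilepath, int *p_N, int *p_planes, int *p_size);
//   static void load(char const *imagesFilepath, float *images, int *labels, int startRecord, int numRecords);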
- GenericLoader::load( mnistImagesFile, imageData, labels, 0, numExamples ); + GenericLoader::load( mnistImagesFile.c_str(), imageData, labels, 0, numExamples ); // now we've loaded the data, write it out in deepclexec-expecting format int linearLength = numExamples * planes * size * size; diff --git a/test/mnist-to-pipe.cpp b/test/mnist-to-pipe.cpp index 65b90406..a438092d 100644 --- a/test/mnist-to-pipe.cpp +++ b/test/mnist-to-pipe.cpp @@ -29,11 +29,11 @@ int main( int argc, char *argv[] ) { int numExamples = atoi(argv[2]); int N, planes, size; - GenericLoader::getDimensions( mnistImagesFile, &N, &planes, &size ); + GenericLoader::getDimensions( mnistImagesFile.c_str(), &N, &planes, &size ); float *imageData = new float[ N * planes * size * size ]; int *labels = new int[N]; // we'll just throw this away, but it keeps the genericloader happy // probably want an option to not load this actually... - GenericLoader::load( mnistImagesFile, imageData, labels, 0, numExamples ); + GenericLoader::load( mnistImagesFile.c_str(), imageData, labels, 0, numExamples ); // now we've loaded the data, write it out to ... stdout? int linearLength = numExamples * planes * size * size; diff --git a/test/testCLMathWrapper.cpp b/test/testCLMathWrapper.cpp index 563ee80a..892f2276 100644 --- a/test/testCLMathWrapper.cpp +++ b/test/testCLMathWrapper.cpp @@ -14,25 +14,25 @@ using namespace std; -TEST( testCLMathWrapper, assign ) { +TEST(testCLMathWrapper, assign) { EasyCL *cl = new EasyCL(); float adat[] = { 1,3,9,12.5f,2.5f }; float bdat[] = { 4,2.1f, 5,3,9.2f }; - CLWrapper *a_ = cl->wrap( 5,adat ); - CLWrapper *b_ = cl->wrap( 5,bdat ); + CLWrapper *a_ = cl->wrap(5,adat); + CLWrapper *b_ = cl->wrap(5,bdat); a_->copyToDevice(); b_->copyToDevice(); - CLMathWrapper a( a_ ); - CLMathWrapper b( b_ ); + CLMathWrapper a(a_); + CLMathWrapper b(b_); a = b; a_->copyToHost(); - for( int i = 0; i < 5; i++ ) { + for(int i = 0; i < 5; i++) { cout << "a[" << i << "]=" << adat[i] << endl; } - EXPECT_FLOAT_NEAR( 2.1f, adat[1] ); - EXPECT_FLOAT_NEAR( 9.2f, adat[4] ); + EXPECT_FLOAT_NEAR(2.1f, adat[1]); + EXPECT_FLOAT_NEAR(9.2f, adat[4]); // delete a; // delete b; @@ -41,26 +41,46 @@ TEST( testCLMathWrapper, assign ) { delete cl; } -TEST( testCLMathWrapper, addinplace ) { +TEST(testCLMathWrapper, assignScalar) { + EasyCL *cl = new EasyCL(); + float adat[] = { 1,3,9,12.5f,2.5f }; + CLWrapper *a_ = cl->wrap(5,adat); + a_->copyToDevice(); + + CLMathWrapper a(a_); + a = 3.4f; + a_->copyToHost(); + + for(int i = 0; i < 5; i++) { + cout << "a[" << i << "]=" << adat[i] << endl; + } + EXPECT_FLOAT_NEAR(3.4f, adat[1]); + EXPECT_FLOAT_NEAR(3.4f, adat[4]); + + delete a_; + delete cl; +} + +TEST(testCLMathWrapper, addinplace) { EasyCL *cl = new EasyCL(); float adat[] = { 1,3,9,12.5f,2.5f }; float bdat[] = { 4,2.1f, 5,3,9.2f }; - CLWrapper *a_ = cl->wrap( 5,adat ); - CLWrapper *b_ = cl->wrap( 5,bdat ); + CLWrapper *a_ = cl->wrap(5,adat); + CLWrapper *b_ = cl->wrap(5,bdat); a_->copyToDevice(); b_->copyToDevice(); - CLMathWrapper a( a_ ); - CLMathWrapper b( b_ ); + CLMathWrapper a(a_); + CLMathWrapper b(b_); a += b; a_->copyToHost(); - for( int i = 0; i < 5; i++ ) { + for(int i = 0; i < 5; i++) { cout << "a[" << i << "]=" << adat[i] << endl; } - EXPECT_FLOAT_NEAR( 5.0f, adat[0] ); - EXPECT_FLOAT_NEAR( 5.1f, adat[1] ); - EXPECT_FLOAT_NEAR( 2.5f + 9.2f, adat[4] ); + EXPECT_FLOAT_NEAR(5.0f, adat[0]); + EXPECT_FLOAT_NEAR(5.1f, adat[1]); + EXPECT_FLOAT_NEAR(2.5f + 9.2f, adat[4]); // delete a; // delete b; @@ -69,27 +89,27 @@ TEST( testCLMathWrapper, 
addinplace ) { delete cl; } -TEST( testCLMathWrapper, multiplyinplace ) { +TEST(testCLMathWrapper, multiplyinplace) { EasyCL *cl = new EasyCL(); float adat[] = { 1,3,9,12.5f,2.5f }; // float bdat[] = { 4,2.1f, 5,3,9.2f }; - CLWrapper *a_ = cl->wrap( 5,adat ); -// CLWrapper *b_ = cl->wrap( 5,bdat ); + CLWrapper *a_ = cl->wrap(5,adat); +// CLWrapper *b_ = cl->wrap(5,bdat); a_->copyToDevice(); // b_->copyToDevice(); - CLMathWrapper a( a_ ); -// CLMathWrapper b( b_ ); + CLMathWrapper a(a_); +// CLMathWrapper b(b_); // a += b; a *= 1.5f; a_->copyToHost(); - for( int i = 0; i < 5; i++ ) { + for(int i = 0; i < 5; i++) { cout << "a[" << i << "]=" << adat[i] << endl; } - EXPECT_FLOAT_NEAR( 1.5f, adat[0] ); - EXPECT_FLOAT_NEAR( 4.5f, adat[1] ); - EXPECT_FLOAT_NEAR( 3.75f, adat[4] ); + EXPECT_FLOAT_NEAR(1.5f, adat[0]); + EXPECT_FLOAT_NEAR(4.5f, adat[1]); + EXPECT_FLOAT_NEAR(3.75f, adat[4]); // delete a; // delete b; @@ -98,113 +118,113 @@ TEST( testCLMathWrapper, multiplyinplace ) { delete cl; } -TEST( testCLMathWrapper, addscalar ) { +TEST(testCLMathWrapper, addscalar) { EasyCL *cl = new EasyCL(); float adat[] = { 1,3,9,12.5f,2.5f }; - CLWrapper *a_ = cl->wrap( 5,adat ); + CLWrapper *a_ = cl->wrap(5,adat); a_->copyToDevice(); - CLMathWrapper a( a_ ); + CLMathWrapper a(a_); a += 1.5f; a_->copyToHost(); - for( int i = 0; i < 5; i++ ) { + for(int i = 0; i < 5; i++) { cout << "a[" << i << "]=" << adat[i] << endl; } - EXPECT_FLOAT_NEAR( 2.5f, adat[0] ); - EXPECT_FLOAT_NEAR( 4.5f, adat[1] ); - EXPECT_FLOAT_NEAR( 4.0f, adat[4] ); + EXPECT_FLOAT_NEAR(2.5f, adat[0]); + EXPECT_FLOAT_NEAR(4.5f, adat[1]); + EXPECT_FLOAT_NEAR(4.0f, adat[4]); delete a_; delete cl; } -TEST( testCLMathWrapper, sqrt ) { +TEST(testCLMathWrapper, sqrt) { EasyCL *cl = new EasyCL(); float adat[] = { 1,3,9,12.5f,2.5f }; - CLWrapper *a_ = cl->wrap( 5,adat ); + CLWrapper *a_ = cl->wrap(5,adat); a_->copyToDevice(); - CLMathWrapper a( a_ ); + CLMathWrapper a(a_); a.sqrt(); a_->copyToHost(); - for( int i = 0; i < 5; i++ ) { + for(int i = 0; i < 5; i++) { cout << "a[" << i << "]=" << adat[i] << endl; } - EXPECT_FLOAT_NEAR( 1, adat[0] ); - EXPECT_FLOAT_NEAR( 1.73205f, adat[1] ); - EXPECT_FLOAT_NEAR( 3, adat[2] ); - EXPECT_FLOAT_NEAR( sqrt(2.5f), adat[4] ); + EXPECT_FLOAT_NEAR(1, adat[0]); + EXPECT_FLOAT_NEAR(1.73205f, adat[1]); + EXPECT_FLOAT_NEAR(3, adat[2]); + EXPECT_FLOAT_NEAR(sqrt(2.5f), adat[4]); delete a_; delete cl; } -TEST( testCLMathWrapper, squared ) { +TEST(testCLMathWrapper, squared) { EasyCL *cl = new EasyCL(); float adat[] = { 1,3,9,12.5f,2.5f }; - CLWrapper *a_ = cl->wrap( 5,adat ); + CLWrapper *a_ = cl->wrap(5,adat); a_->copyToDevice(); - CLMathWrapper a( a_ ); + CLMathWrapper a(a_); a.squared(); a_->copyToHost(); - for( int i = 0; i < 5; i++ ) { + for(int i = 0; i < 5; i++) { cout << "a[" << i << "]=" << adat[i] << endl; } - EXPECT_FLOAT_NEAR( 1, adat[0] ); - EXPECT_FLOAT_NEAR( 9, adat[1] ); - EXPECT_FLOAT_NEAR( 81, adat[2] ); - EXPECT_FLOAT_NEAR( 2.5f * 2.5f, adat[4] ); + EXPECT_FLOAT_NEAR(1, adat[0]); + EXPECT_FLOAT_NEAR(9, adat[1]); + EXPECT_FLOAT_NEAR(81, adat[2]); + EXPECT_FLOAT_NEAR(2.5f * 2.5f, adat[4]); delete a_; delete cl; } -TEST( testCLMathWrapper, inverse ) { +TEST(testCLMathWrapper, inverse) { EasyCL *cl = new EasyCL(); float adat[] = { 1,3,9,12.5f,2.5f }; - CLWrapper *a_ = cl->wrap( 5,adat ); + CLWrapper *a_ = cl->wrap(5,adat); a_->copyToDevice(); - CLMathWrapper a( a_ ); + CLMathWrapper a(a_); a.inv(); a_->copyToHost(); - for( int i = 0; i < 5; i++ ) { + for(int i = 0; i < 5; i++) { cout << "a[" << i << "]=" << 
adat[i] << endl; } - EXPECT_FLOAT_NEAR( 1, adat[0] ); - EXPECT_FLOAT_NEAR( 0.333333f, adat[1] ); - EXPECT_FLOAT_NEAR( 1.0f / 9.0f, adat[2] ); - EXPECT_FLOAT_NEAR( 1.0f / 2.5f, adat[4] ); + EXPECT_FLOAT_NEAR(1, adat[0]); + EXPECT_FLOAT_NEAR(0.333333f, adat[1]); + EXPECT_FLOAT_NEAR(1.0f / 9.0f, adat[2]); + EXPECT_FLOAT_NEAR(1.0f / 2.5f, adat[4]); delete a_; delete cl; } -TEST( testCLMathWrapper, perelementmult ) { +TEST(testCLMathWrapper, perelementmult) { EasyCL *cl = new EasyCL(); float adat[] = { 1,3,9,12.5f,2.5f }; float bdat[] = { 4,2.1f, 5,3,9.2f }; - CLWrapper *a_ = cl->wrap( 5,adat ); - CLWrapper *b_ = cl->wrap( 5,bdat ); + CLWrapper *a_ = cl->wrap(5,adat); + CLWrapper *b_ = cl->wrap(5,bdat); a_->copyToDevice(); b_->copyToDevice(); - CLMathWrapper a( a_ ); - CLMathWrapper b( b_ ); + CLMathWrapper a(a_); + CLMathWrapper b(b_); a *= b; a_->copyToHost(); - for( int i = 0; i < 5; i++ ) { + for(int i = 0; i < 5; i++) { cout << "a[" << i << "]=" << adat[i] << endl; } - EXPECT_FLOAT_NEAR( 4.0f, adat[0] ); - EXPECT_FLOAT_NEAR( 6.3f, adat[1] ); - EXPECT_FLOAT_NEAR( 2.5f * 9.2f, adat[4] ); + EXPECT_FLOAT_NEAR(4.0f, adat[0]); + EXPECT_FLOAT_NEAR(6.3f, adat[1]); + EXPECT_FLOAT_NEAR(2.5f * 9.2f, adat[4]); // delete a; // delete b; diff --git a/test/testClBlas.cpp b/test/testClBlas.cpp new file mode 100644 index 00000000..80314a7a --- /dev/null +++ b/test/testClBlas.cpp @@ -0,0 +1,374 @@ +#include "clblas/ClBlasHelper.h" +#include "clblas/ClBlasInstance.h" +#include "clBLAS.h" +#include "EasyCL.h" + +#include +#include +#include + +#include "gtest/gtest.h" + +using namespace std; + +#include "test/gtest_supp.h" + +TEST(testClBlas, basic) { + EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + + float A[] = {1, 3, + 2, 7, + 9, 5}; + float B[] = {3, + -1}; + + float C[3]; + ClBlasInstance clblasInstance; + CLWrapper *AWrap = cl->wrap(6, A); + CLWrapper *BWrap = cl->wrap(2, B); + CLWrapper *CWrap = cl->wrap(3, C); + AWrap->copyToDevice(); + BWrap->copyToDevice(); + CWrap->createOnDevice(); + ClBlasHelper::Gemm( + cl, + clblasRowMajor, + clblasNoTrans, clblasNoTrans, + 3, 2, 1, + 1, + AWrap, 0, + BWrap, 0, + 0, + CWrap, 0 + ); + cl->finish(); + CWrap->copyToHost(); + EXPECT_EQ(0, C[0]); + EXPECT_EQ(-1, C[1]); + EXPECT_EQ(22, C[2]); + + cl->finish(); + + delete CWrap; + delete BWrap; + delete AWrap; + + cl->finish(); + + delete cl; + clblasTeardown(); +} + +static void transpose(float *matrix, int rows, int cols) { + float *tempMatrix = new float[rows * cols]; + int newRows = cols; + int newCols = rows; + for(int row = 0; row < rows; row++) { + for(int col = 0; col < cols; col++) { + int pos1 = row * cols + col; + int pos2 = col * rows + row; +// float old = matrix[pos1]; + tempMatrix[pos2] = matrix[pos1]; + } + } + for(int i = 0; i < rows * cols; i++) { + matrix[i] = tempMatrix[i]; + } + delete[] tempMatrix; +} + +TEST(testClBlas, transA) { + EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + + float A[] = {1, 3, + 2, 7, + 9, 5}; + float B[] = {3, + -1}; + + float C[3]; + transpose(A, 3, 2); + for(int row=0; row < 2; row++) { + for(int col=0; col < 3; col++) { + cout << A[row*3 + col] << " "; + } + cout << endl; + } + ClBlasInstance clblasInstance; +// ClBlasInstance::initializeIfNecessary(); + CLWrapper *AWrap = cl->wrap(6, A); + CLWrapper *BWrap = cl->wrap(2, B); + CLWrapper *CWrap = cl->wrap(3, C); + AWrap->copyToDevice(); + BWrap->copyToDevice(); + ClBlasHelper::Gemm( + cl, + clblasRowMajor, + clblasTrans, clblasNoTrans, + 3, 2, 1, + 1, + AWrap, 0, + BWrap, 0, + 0, + CWrap, 0 + ); +// 
cl->finish(); + CWrap->copyToHost(); + EXPECT_EQ(0, C[0]); + EXPECT_EQ(-1, C[1]); + EXPECT_EQ(22, C[2]); + + delete CWrap; + delete BWrap; + delete AWrap; + + delete cl; +} + +TEST(testClBlas, transB) { + EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + + float A[] = {1, 3, + 2, 7, + 9, 5}; + float B[] = {3, + -1}; + + float C[3]; + transpose(B, 2, 1); + for(int row=0; row < 2; row++) { + for(int col=0; col < 1; col++) { + cout << B[row*1 + col] << " "; + } + cout << endl; + } + ClBlasInstance clblasInstance; +// ClBlasInstance::initializeIfNecessary(); + CLWrapper *AWrap = cl->wrap(6, A); + CLWrapper *BWrap = cl->wrap(2, B); + CLWrapper *CWrap = cl->wrap(3, C); + AWrap->copyToDevice(); + BWrap->copyToDevice(); + ClBlasHelper::Gemm( + cl, + clblasRowMajor, + clblasNoTrans, clblasTrans, + 3, 2, 1, + 1, + AWrap, 0, + BWrap, 0, + 0, + CWrap, 0 + ); +// cl->finish(); + CWrap->copyToHost(); + EXPECT_EQ(0, C[0]); + EXPECT_EQ(-1, C[1]); + EXPECT_EQ(22, C[2]); + + delete CWrap; + delete BWrap; + delete AWrap; + + delete cl; +} + +TEST(testClBlas, colMajor) { + EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + + float A[] = {1, 3, + 2, 7, + 9, 5}; + float B[] = {3, + -1}; + + float C[3]; + transpose(A, 3, 2); + transpose(B, 2, 1); +// for(int row=0; row < 2; row++) { +// for(int col=0; col < 1; col++) { +// cout << B[row*1 + col] << " "; +// } +// cout << endl; +// } + ClBlasInstance clblasInstance; +// ClBlasInstance::initializeIfNecessary(); + CLWrapper *AWrap = cl->wrap(6, A); + CLWrapper *BWrap = cl->wrap(2, B); + CLWrapper *CWrap = cl->wrap(3, C); + AWrap->copyToDevice(); + BWrap->copyToDevice(); + ClBlasHelper::Gemm( + cl, + clblasColumnMajor, + clblasNoTrans, clblasNoTrans, + 3, 2, 1, + 1, + AWrap, 0, + BWrap, 0, + 0, + CWrap, 0 + ); +// cl->finish(); + CWrap->copyToHost(); + transpose(C, 1, 3); + EXPECT_EQ(0, C[0]); + EXPECT_EQ(-1, C[1]); + EXPECT_EQ(22, C[2]); + + delete CWrap; + delete BWrap; + delete AWrap; + + delete cl; +} +TEST(testClBlas, colMajor2) { + EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + + float A[] = {1, 3, + 2, 7, + 9, 5, + 0, -2}; + float B[] = {3,2,8, + -1,0,4}; + + float C[4*3]; + transpose(A, 4, 2); + transpose(B, 2, 3); +// for(int row=0; row < 2; row++) { +// for(int col=0; col < 1; col++) { +// cout << B[row*1 + col] << " "; +// } +// cout << endl; +// } + ClBlasInstance clblasInstance; +// ClBlasInstance::initializeIfNecessary(); + CLWrapper *AWrap = cl->wrap(4*2, A); + CLWrapper *BWrap = cl->wrap(2*3, B); + CLWrapper *CWrap = cl->wrap(4*3, C); + AWrap->copyToDevice(); + BWrap->copyToDevice(); + ClBlasHelper::Gemm( + cl, + clblasColumnMajor, + clblasNoTrans, clblasNoTrans, + 4, 2, 3, + 1, + AWrap, 0, + BWrap, 0, + 0, + CWrap, 0 + ); +// cl->finish(); + CWrap->copyToHost(); + transpose(C, 3, 4); + EXPECT_EQ(1*3-1*3, C[0]); + EXPECT_EQ(1*2+3*0, C[1]); + EXPECT_EQ(1*8+4*3, C[2]); + EXPECT_EQ(-8, C[11]); + + delete CWrap; + delete BWrap; + delete AWrap; + + delete cl; +} +TEST(testClBlas, colMajorTransA) { + EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + + float A[] = {1, 3, + 2, 7, + 9, 5}; + float B[] = {3, + -1}; + + float C[3]; +// transpose(A, 3, 2); + transpose(B, 2, 1); +// for(int row=0; row < 2; row++) { +// for(int col=0; col < 1; col++) { +// cout << B[row*1 + col] << " "; +// } +// cout << endl; +// } + ClBlasInstance clblasInstance; +// ClBlasInstance::initializeIfNecessary(); + CLWrapper *AWrap = cl->wrap(6, A); + CLWrapper *BWrap = cl->wrap(2, B); + CLWrapper *CWrap = cl->wrap(3, C); + AWrap->copyToDevice(); + 
BWrap->copyToDevice(); + ClBlasHelper::Gemm( + cl, + clblasColumnMajor, + clblasTrans, clblasNoTrans, + 3, 2, 1, + 1, + AWrap, 0, + BWrap, 0, + 0, + CWrap, 0 + ); +// cl->finish(); + CWrap->copyToHost(); + transpose(C, 1, 3); + EXPECT_EQ(0, C[0]); + EXPECT_EQ(-1, C[1]); + EXPECT_EQ(22, C[2]); + + delete CWrap; + delete BWrap; + delete AWrap; + + delete cl; +} +TEST(testClBlas, colMajorTransB) { + EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + + float A[] = {1, 3, + 2, 7, + 9, 5}; + float B[] = {3, + -1}; + + float C[3]; + transpose(A, 3, 2); +// transpose(B, 2, 1); +// for(int row=0; row < 2; row++) { +// for(int col=0; col < 1; col++) { +// cout << B[row*1 + col] << " "; +// } +// cout << endl; +// } + ClBlasInstance clblasInstance; +// ClBlasInstance::initializeIfNecessary(); + CLWrapper *AWrap = cl->wrap(6, A); + CLWrapper *BWrap = cl->wrap(2, B); + CLWrapper *CWrap = cl->wrap(3, C); + AWrap->copyToDevice(); + BWrap->copyToDevice(); + ClBlasHelper::Gemm( + cl, + clblasColumnMajor, + clblasNoTrans, clblasTrans, + 3, 2, 1, + 1, + AWrap, 0, + BWrap, 0, + 0, + CWrap, 0 + ); +// cl->finish(); + CWrap->copyToHost(); + transpose(C, 1, 3); + EXPECT_EQ(0, C[0]); + EXPECT_EQ(-1, C[1]); + EXPECT_EQ(22, C[2]); + + delete CWrap; + delete BWrap; + delete AWrap; + + delete cl; +} + diff --git a/test/testCopyBlock.cpp b/test/testCopyBlock.cpp index f48fa25b..f1109657 100644 --- a/test/testCopyBlock.cpp +++ b/test/testCopyBlock.cpp @@ -138,35 +138,35 @@ CLKernel *makeTestPosKernel( EasyCL *cl ) { "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "int posToRow( int pos ) {\n" - " return ( pos >> 10 ) & ( (1<<10)-1);\n" + "int posToRow(int pos) {\n" + " return (pos >> 10) & ((1<<10)-1);\n" "// return 53\n" "}\n" - "int posToCol( int pos ) {\n" + "int posToCol(int pos) {\n" " return pos & ((1<<10)-1);\n" " // return 67;\n" " //return ((1<<11)-1);\n" "}\n" - "int rowColToPos( int row, int col ) {\n" - " return ( row << 10 ) | col;\n" + "int rowColToPos(int row, int col) {\n" + " return (row << 10) | col;\n" "}\n" - "int linearIdToPos( int linearId, int base ) {\n" - " return rowColToPos( ( linearId / base ), ( linearId % base ) );\n" + "int linearIdToPos(int linearId, int base) {\n" + " return rowColToPos(( linearId / base), (linearId % base) );\n" "}\n" - "int posToOffset( int pos, int rowLength ) {\n" + "int posToOffset(int pos, int rowLength) {\n" " return posToRow(pos) * rowLength + posToCol(pos);\n" "}\n" "\n" "// assumes that the block will fit exactly into the target\n" - "void copyBlock( local float *target, global float const *source,\n" - " const int sourceSize, const int blockStart, const int blockSize ) {\n" - " const int totalLinearSize = posToRow( blockSize ) * posToCol( blockSize );\n" - " const int numLoops = ( totalLinearSize + get_local_size(0) - 1 ) / get_local_size(0);\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyBlock(local float *target, global float const *source,\n" + " const int sourceSize, const int blockStart, const int blockSize) {\n" + " const int totalLinearSize = posToRow(blockSize) * posToCol(blockSize);\n" + " const int numLoops = (totalLinearSize + get_local_size(0) - 1) / get_local_size(0);\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " const int offset = get_local_id(0) + loop * get_local_size(0);\n" - " if( offset < totalLinearSize ) {\n" - " const int offsetAsPos = linearIdToPos( offset, posToCol( blockSize ) );\n" - " target[ offset ] = 
source[ posToOffset( blockStart + offsetAsPos, posToCol( sourceSize ) ) ];\n" + " if (offset < totalLinearSize) {\n" + " const int offsetAsPos = linearIdToPos(offset, posToCol(blockSize) );\n" + " target[ offset ] = source[ posToOffset(blockStart + offsetAsPos, posToCol(sourceSize) ) ];\n" " }\n" " }\n" "}\n" @@ -180,10 +180,10 @@ CLKernel *makeTestPosKernel( EasyCL *cl ) { "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "#define globalId ( get_global_id(0) )\n" - "#define localId ( get_local_id(0) )\n" - "#define workgroupId ( get_group_id(0) )\n" - "#define workgroupSize ( get_local_size(0) )\n" + "#define globalId (get_global_id(0))\n" + "#define localId (get_local_id(0) )\n" + "#define workgroupId (get_group_id(0))\n" + "#define workgroupSize (get_local_size(0))\n" "\n" "\n" "\n" @@ -194,21 +194,21 @@ CLKernel *makeTestPosKernel( EasyCL *cl ) { "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "void copyLocal( local float *target, global float const *source, int N ) {\n" - " int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyLocal(local float *target, global float const *source, int N) {\n" + " int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " int offset = loop * get_local_size(0) + get_local_id(0);\n" - " if( offset < N ) {\n" + " if (offset < N) {\n" " target[offset] = source[offset];\n" " }\n" " }\n" "}\n" "\n" - "void copyGlobal( global float *target, local float const *source, int N ) {\n" - " int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyGlobal(global float *target, local float const *source, int N) {\n" + " int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " int offset = loop * get_local_size(0) + get_local_id(0);\n" - " if( offset < N ) {\n" + " if (offset < N) {\n" " target[offset] = source[offset];\n" " }\n" " }\n" @@ -263,35 +263,35 @@ CLKernel *makeBasicKernel( EasyCL *cl ) { "// v. 2.0. 
If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "int posToRow( int pos ) {\n" - " return ( pos >> 10 ) & ( (1<<10)-1);\n" + "int posToRow(int pos) {\n" + " return (pos >> 10) & ((1<<10)-1);\n" "// return 53\n" "}\n" - "int posToCol( int pos ) {\n" + "int posToCol(int pos) {\n" " return pos & ((1<<10)-1);\n" " // return 67;\n" " //return ((1<<11)-1);\n" "}\n" - "int rowColToPos( int row, int col ) {\n" - " return ( row << 10 ) | col;\n" + "int rowColToPos(int row, int col) {\n" + " return (row << 10) | col;\n" "}\n" - "int linearIdToPos( int linearId, int base ) {\n" - " return rowColToPos( ( linearId / base ), ( linearId % base ) );\n" + "int linearIdToPos(int linearId, int base) {\n" + " return rowColToPos(( linearId / base), (linearId % base) );\n" "}\n" - "int posToOffset( int pos, int rowLength ) {\n" + "int posToOffset(int pos, int rowLength) {\n" " return posToRow(pos) * rowLength + posToCol(pos);\n" "}\n" "\n" "// assumes that the block will fit exactly into the target\n" - "void copyBlock( local float *target, global float const *source,\n" - " const int sourceSize, const int blockStart, const int blockSize ) {\n" - " const int totalLinearSize = posToRow( blockSize ) * posToCol( blockSize );\n" - " const int numLoops = ( totalLinearSize + get_local_size(0) - 1 ) / get_local_size(0);\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyBlock(local float *target, global float const *source,\n" + " const int sourceSize, const int blockStart, const int blockSize) {\n" + " const int totalLinearSize = posToRow(blockSize) * posToCol(blockSize);\n" + " const int numLoops = (totalLinearSize + get_local_size(0) - 1) / get_local_size(0);\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " const int offset = get_local_id(0) + loop * get_local_size(0);\n" - " if( offset < totalLinearSize ) {\n" - " const int offsetAsPos = linearIdToPos( offset, posToCol( blockSize ) );\n" - " target[ offset ] = source[ posToOffset( blockStart + offsetAsPos, posToCol( sourceSize ) ) ];\n" + " if (offset < totalLinearSize) {\n" + " const int offsetAsPos = linearIdToPos(offset, posToCol(blockSize) );\n" + " target[ offset ] = source[ posToOffset(blockStart + offsetAsPos, posToCol(sourceSize) ) ];\n" " }\n" " }\n" "}\n" @@ -305,10 +305,10 @@ CLKernel *makeBasicKernel( EasyCL *cl ) { "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "#define globalId ( get_global_id(0) )\n" - "#define localId ( get_local_id(0) )\n" - "#define workgroupId ( get_group_id(0) )\n" - "#define workgroupSize ( get_local_size(0) )\n" + "#define globalId (get_global_id(0))\n" + "#define localId (get_local_id(0) )\n" + "#define workgroupId (get_group_id(0))\n" + "#define workgroupSize (get_local_size(0))\n" "\n" "\n" "\n" @@ -319,21 +319,21 @@ CLKernel *makeBasicKernel( EasyCL *cl ) { "// v. 2.0. 
If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "void copyLocal( local float *target, global float const *source, int N ) {\n" - " int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyLocal(local float *target, global float const *source, int N) {\n" + " int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " int offset = loop * get_local_size(0) + get_local_id(0);\n" - " if( offset < N ) {\n" + " if (offset < N) {\n" " target[offset] = source[offset];\n" " }\n" " }\n" "}\n" "\n" - "void copyGlobal( global float *target, local float const *source, int N ) {\n" - " int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyGlobal(global float *target, local float const *source, int N) {\n" + " int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " int offset = loop * get_local_size(0) + get_local_id(0);\n" - " if( offset < N ) {\n" + " if (offset < N) {\n" " target[offset] = source[offset];\n" " }\n" " }\n" diff --git a/test/testCopyLocal.cpp b/test/testCopyLocal.cpp index aecfe571..a0a398c9 100644 --- a/test/testCopyLocal.cpp +++ b/test/testCopyLocal.cpp @@ -77,35 +77,35 @@ CLKernel *makeKernel( EasyCL *cl ) { "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "int posToRow( int pos ) {\n" - " return ( pos >> 10 ) & ( (1<<10)-1);\n" + "int posToRow(int pos) {\n" + " return (pos >> 10) & ((1<<10)-1);\n" "// return 53\n" "}\n" - "int posToCol( int pos ) {\n" + "int posToCol(int pos) {\n" " return pos & ((1<<10)-1);\n" " // return 67;\n" " //return ((1<<11)-1);\n" "}\n" - "int rowColToPos( int row, int col ) {\n" - " return ( row << 10 ) | col;\n" + "int rowColToPos(int row, int col) {\n" + " return (row << 10) | col;\n" "}\n" - "int linearIdToPos( int linearId, int base ) {\n" - " return rowColToPos( ( linearId / base ), ( linearId % base ) );\n" + "int linearIdToPos(int linearId, int base) {\n" + " return rowColToPos(( linearId / base), (linearId % base) );\n" "}\n" - "int posToOffset( int pos, int rowLength ) {\n" + "int posToOffset(int pos, int rowLength) {\n" " return posToRow(pos) * rowLength + posToCol(pos);\n" "}\n" "\n" "// assumes that the block will fit exactly into the target\n" - "void copyBlock( local float *target, global float const *source,\n" - " const int sourceSize, const int blockStart, const int blockSize ) {\n" - " const int totalLinearSize = posToRow( blockSize ) * posToCol( blockSize );\n" - " const int numLoops = ( totalLinearSize + get_local_size(0) - 1 ) / get_local_size(0);\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyBlock(local float *target, global float const *source,\n" + " const int sourceSize, const int blockStart, const int blockSize) {\n" + " const int totalLinearSize = posToRow(blockSize) * posToCol(blockSize);\n" + " const int numLoops = (totalLinearSize + get_local_size(0) - 1) / get_local_size(0);\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " const int offset = get_local_id(0) + loop * get_local_size(0);\n" - " if( offset < totalLinearSize ) {\n" - " const int offsetAsPos = linearIdToPos( offset, posToCol( blockSize ) );\n" - " target[ offset ] = source[ posToOffset( 
blockStart + offsetAsPos, posToCol( sourceSize ) ) ];\n" + " if (offset < totalLinearSize) {\n" + " const int offsetAsPos = linearIdToPos(offset, posToCol(blockSize) );\n" + " target[ offset ] = source[ posToOffset(blockStart + offsetAsPos, posToCol(sourceSize) ) ];\n" " }\n" " }\n" "}\n" @@ -119,10 +119,10 @@ CLKernel *makeKernel( EasyCL *cl ) { "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "#define globalId ( get_global_id(0) )\n" - "#define localId ( get_local_id(0) )\n" - "#define workgroupId ( get_group_id(0) )\n" - "#define workgroupSize ( get_local_size(0) )\n" + "#define globalId (get_global_id(0))\n" + "#define localId (get_local_id(0) )\n" + "#define workgroupId (get_group_id(0))\n" + "#define workgroupSize (get_local_size(0))\n" "\n" "\n" "\n" @@ -133,21 +133,21 @@ CLKernel *makeKernel( EasyCL *cl ) { "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "void copyLocal( local float *target, global float const *source, int N ) {\n" - " int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyLocal(local float *target, global float const *source, int N) {\n" + " int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " int offset = loop * get_local_size(0) + get_local_id(0);\n" - " if( offset < N ) {\n" + " if (offset < N) {\n" " target[offset] = source[offset];\n" " }\n" " }\n" "}\n" "\n" - "void copyGlobal( global float *target, local float const *source, int N ) {\n" - " int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);\n" - " for( int loop = 0; loop < numLoops; loop++ ) {\n" + "void copyGlobal(global float *target, local float const *source, int N) {\n" + " int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);\n" + " for (int loop = 0; loop < numLoops; loop++) {\n" " int offset = loop * get_local_size(0) + get_local_id(0);\n" - " if( offset < N ) {\n" + " if (offset < N) {\n" " target[offset] = source[offset];\n" " }\n" " }\n" diff --git a/test/testDeepCL.cpp b/test/testDeepCL.cpp new file mode 100644 index 00000000..d50b7d6e --- /dev/null +++ b/test/testDeepCL.cpp @@ -0,0 +1,63 @@ +// Copyright Hugh Perkins 2014, 2015 hughperkins at gmail +// +// This Source Code Form is subject to the terms of the Mozilla Public License, +// v. 2.0. If a copy of the MPL was not distributed with this file, You can +// obtain one at http://mozilla.org/MPL/2.0/. 
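+//
+// In brief: this test drives a convolutional forward pass end-to-end and
+// compares against hand-computed values. With a 2x2 filter on a 2x2 input
+// and no zero-padding, each (image, filter) pair reduces to one dot-product;
+// for image 0 against filter 0 that is (the first expectedOutput entry below):
+//   0*0 + 0*0 + 0.5f*(-0.5f) + 0.5f*0.5f = 0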
+ +#include +#include +#include + +#include "DeepCL.h" +#include "conv/Forward.h" + +#include "gtest/gtest.h" + +using namespace std; + +#include "test/gtest_supp.h" + +TEST(testDeepCL, basic) { + DeepCL *cl = DeepCL::createForFirstGpuOtherwiseCpu(); + + int batchSize = 2; + int numInPlanes = 1; int imageSize = 2; + int numOutPlanes = 2; int filterWidth = 2; + int padZeros = 0; + float data[] = { 0, 0, + 0.5f, 0.5f, + + 13, 17, + -19, 2.3f, +}; + float filter1[] = { 0, 0, + -0.5f, 0.5f, + + 0.2f, 0.3f, + 0.7f, -1.1f, + }; + int resultSize = 4; + float expectedOutput[] = { + -0.5f * 0.5f + 0.5f * 0.5f, + 0.7f * 0.5f -1.1f * 0.5f, + (-0.5f) * (-19) + 0.5f * 2.3f, + 0.2f*13 + 0.3f* 17 + 0.7f *(-19) -1.1f * 2.3f + }; + cout << "expected number of output: " << resultSize << endl; +// int outputSize = 0; + for( int i = 1; i <= 4; i++ ) { + Forward *forward = Forward::instanceSpecific( 3, cl, + LayerDimensions( numInPlanes, imageSize, numOutPlanes, filterWidth, + padZeros == 1, false ) ); + float *output = new float[forward->getOutputTotalSize(batchSize)]; + forward->forward( batchSize, data, filter1, 0, output ); + for( int result = 0; result < resultSize; result++ ) { + ASSERT_EQ( expectedOutput[result], output[result] ); + } + delete forward; + delete[] output; + } + + delete cl; +} + diff --git a/test/testMemset.cpp b/test/testMemset.cpp index 99e3c8f2..4a0ce007 100644 --- a/test/testMemset.cpp +++ b/test/testMemset.cpp @@ -26,9 +26,9 @@ TEST( testMemset, basic ) { "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" - "kernel void memset( global float *target, const float value, const int N ) {\n" + "kernel void memset(global float *target, const float value, const int N) {\n" " #define globalId get_global_id(0)\n" - " if( globalId < N ) {\n" + " if (globalId < N) {\n" " target[globalId] = value;\n" " }\n" "}\n" diff --git a/test/testNetdefToNet.cpp b/test/testNetdefToNet.cpp index 5d59d896..86b1a52c 100644 --- a/test/testNetdefToNet.cpp +++ b/test/testNetdefToNet.cpp @@ -140,7 +140,7 @@ TEST( testNetdefToNet, 3x32c5zmp2 ) { EXPECT_TRUE( dynamic_cast< SoftMaxLayer * >( net->getLayer(8) ) != 0 ); ConvolutionalLayer *conv = dynamic_cast< ConvolutionalLayer * >( net->getLayer(1) ); - EXPECT_EQ( 128, conv->dim.inputImageSize ); + EXPECT_EQ( 128, conv->dim.inputSize ); EXPECT_EQ( true, conv->dim.padZeros ); EXPECT_EQ( 1, conv->dim.inputPlanes ); EXPECT_EQ( 32, conv->dim.numFilters ); @@ -163,7 +163,7 @@ TEST( testNetdefToNet, 2x32c7_3x32c5z ) { EXPECT_TRUE( dynamic_cast< ConvolutionalLayer * >( net->getLayer(5) ) != 0 ); EXPECT_TRUE( dynamic_cast< SoftMaxLayer * >( net->getLayer(7) ) != 0 ); ConvolutionalLayer *conv = dynamic_cast< ConvolutionalLayer * >( net->getLayer(1) ); - EXPECT_EQ( 19, conv->dim.inputImageSize ); + EXPECT_EQ( 19, conv->dim.inputSize ); EXPECT_EQ( true, conv->dim.padZeros ); EXPECT_EQ( 1, conv->dim.inputPlanes ); EXPECT_EQ( 32, conv->dim.numFilters ); @@ -171,7 +171,7 @@ TEST( testNetdefToNet, 2x32c7_3x32c5z ) { // EXPECT_EQ( "RELU", conv->activationFunction->getDefineName() ); conv = dynamic_cast< ConvolutionalLayer * >( net->getLayer(2) ); - EXPECT_EQ( 19, conv->dim.inputImageSize ); + EXPECT_EQ( 19, conv->dim.inputSize ); EXPECT_EQ( true, conv->dim.padZeros ); EXPECT_EQ( 32, conv->dim.inputPlanes ); EXPECT_EQ( 32, conv->dim.numFilters ); @@ -179,7 +179,7 @@ TEST( testNetdefToNet, 2x32c7_3x32c5z ) { // EXPECT_EQ( "RELU", conv->activationFunction->getDefineName() ); conv = dynamic_cast< 
ConvolutionalLayer * >( net->getLayer(3) ); - EXPECT_EQ( 19, conv->dim.inputImageSize ); + EXPECT_EQ( 19, conv->dim.inputSize ); EXPECT_EQ( true, conv->dim.padZeros ); EXPECT_EQ( 32, conv->dim.inputPlanes ); EXPECT_EQ( 32, conv->dim.numFilters ); @@ -187,7 +187,7 @@ TEST( testNetdefToNet, 2x32c7_3x32c5z ) { // EXPECT_EQ( "RELU", conv->activationFunction->getDefineName() ); conv = dynamic_cast< ConvolutionalLayer * >( net->getLayer(5) ); - EXPECT_EQ( 19, conv->dim.inputImageSize ); + EXPECT_EQ( 19, conv->dim.inputSize ); EXPECT_EQ( true, conv->dim.padZeros ); EXPECT_EQ( 32, conv->dim.inputPlanes ); EXPECT_EQ( 32, conv->dim.numFilters ); diff --git a/test/testactivationbackward.cpp b/test/testactivationbackward.cpp index 14fe4b4b..476d605b 100644 --- a/test/testactivationbackward.cpp +++ b/test/testactivationbackward.cpp @@ -33,14 +33,14 @@ TEST( testactivationbackward, basic ) { 2, -9, 2.1f, 0, -1.1f, 3.5f }; - int inputTotalSize = activationBackprop->getInputSize( batchSize ); + int inputTotalSize = activationBackprop->getInputNumElements( batchSize ); EXPECT_EQ( batchSize * imageSize * imageSize, inputTotalSize ); float *gradInput = new float[ inputTotalSize ]; activationBackprop->backward( batchSize, outputs, gradOutput, gradInput ); -// float *expectedGradInput = new float[ activationForward->getInputSize( batchSize ) ]; -// memset( expectedGradInput, 0, sizeof(float) * activationForward->getInputSize( batchSize ) ]; +// float *expectedGradInput = new float[ activationForward->getInputNumElements( batchSize ) ]; +// memset( expectedGradInput, 0, sizeof(float) * activationForward->getInputNumElements( batchSize ) ]; // float expectedGradInput[] = { // 3,0,-2.7f, // 2,0,2.1f, @@ -84,12 +84,12 @@ TEST( testactivationbackward, basic_2plane_batchsize2 ) { 2, 9 }; - float *gradInput = new float[ activationBackprop->getInputSize( batchSize ) ]; + float *gradInput = new float[ activationBackprop->getInputNumElements( batchSize ) ]; activationBackprop->backward( batchSize, outputs, gradOutput, gradInput ); -// float *expectedGradInput = new float[ activationForward->getInputSize( batchSize ) ]; -// memset( expectedGradInput, 0, sizeof(float) * activationForward->getInputSize( batchSize ) ]; +// float *expectedGradInput = new float[ activationForward->getInputNumElements( batchSize ) ]; +// memset( expectedGradInput, 0, sizeof(float) * activationForward->getInputNumElements( batchSize ) ]; float expectedGradInput[] = { 3, 0, @@ -106,7 +106,7 @@ TEST( testactivationbackward, basic_2plane_batchsize2 ) { } TEST( SLOW_testactivationbackward, compare_args ) { - int inputImageSize = 9; + int inputSize = 9; std::string activation = "relu"; int instance0 = 0; int instance1 = 1; @@ -118,24 +118,24 @@ TEST( SLOW_testactivationbackward, compare_args ) { TestArgsParser::arg( "activation", &activation ); // TestArgsParser::arg( "activationsize", &activationSize ); TestArgsParser::arg( "numplanes", &numPlanes ); - TestArgsParser::arg( "inputimagesize", &inputImageSize ); + TestArgsParser::arg( "inputimagesize", &inputSize ); TestArgsParser::arg( "instance0", &instance0 ); TestArgsParser::arg( "instance1", &instance1 ); TestArgsParser::go(); EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - ActivationBackward *p0 = ActivationBackward::instanceSpecific( instance0, cl, numPlanes, inputImageSize, ActivationFunction::fromName( activation ) ); - ActivationBackward *p1 = ActivationBackward::instanceSpecific( instance1, cl, numPlanes, inputImageSize, ActivationFunction::fromName( activation ) ); - int 
outputImageSize = p1->outputImageSize; - int gradOutputSize = batchSize * outputImageSize * outputImageSize * numPlanes; - float *gradOutput = new float[ gradOutputSize ]; - int inputSize = batchSize * inputImageSize * inputImageSize * numPlanes; - float *gradInput0 = new float[ inputSize ]; - float *gradInput1 = new float[ inputSize ]; + ActivationBackward *p0 = ActivationBackward::instanceSpecific( instance0, cl, numPlanes, inputSize, ActivationFunction::fromName( activation ) ); + ActivationBackward *p1 = ActivationBackward::instanceSpecific( instance1, cl, numPlanes, inputSize, ActivationFunction::fromName( activation ) ); + int outputSize = p1->outputSize; + int gradOutputNumElements = batchSize * outputSize * outputSize * numPlanes; + float *gradOutput = new float[ gradOutputNumElements ]; + int inputNumElements = batchSize * inputSize * inputSize * numPlanes; + float *gradInput0 = new float[ inputNumElements ]; + float *gradInput1 = new float[ inputNumElements ]; - ActivationForward *forwardprop = ActivationForward::instanceSpecific( 0, cl, numPlanes, inputImageSize, ActivationFunction::fromName( activation ) ); - float *output = new float[gradOutputSize]; - float *input = new float[inputSize]; + ActivationForward *forwardprop = ActivationForward::instanceSpecific( 0, cl, numPlanes, inputSize, ActivationFunction::fromName( activation ) ); + float *output = new float[gradOutputNumElements]; + float *input = new float[inputNumElements]; float *gradInput[2]; gradInput[0] = gradInput0; gradInput[1] = gradInput1; @@ -146,8 +146,8 @@ TEST( SLOW_testactivationbackward, compare_args ) { // selectors might go over the edge if we just choose random ints // easiest way to select valid selectors might be to just forwardforward first? - WeightRandomizer::randomize( it, gradOutput, gradOutputSize, -0.1f, 0.1f ); - WeightRandomizer::randomize( it, input, inputSize, -0.1f, 0.1f ); + WeightRandomizer::randomize( it, gradOutput, gradOutputNumElements, -0.1f, 0.1f ); + WeightRandomizer::randomize( it, input, inputNumElements, -0.1f, 0.1f ); forwardprop->forward( batchSize, input, output ); for( int instance = 0; instance < 2; instance++ ) { @@ -155,7 +155,7 @@ TEST( SLOW_testactivationbackward, compare_args ) { } bool ok = true; int numErrors = 0; - for( int i = 0; i < inputSize; i++ ) { + for( int i = 0; i < inputNumElements; i++ ) { if( gradInput0[i] != gradInput1[i] ) { cout << "diff: i=" << i << " " << gradInput0[i] << " != " << gradInput1[i] << endl; ok = false; @@ -204,9 +204,9 @@ TEST( testactivationforward, basic_2plane_batchsize2 ) { -1, -3.5f, 37.4f,5 }; - int outputSize = activationForward->getOutputSize( batchSize ); - int *selectors = new int[outputSize]; - float *output = new float[outputSize]; + int outputNumElements = activationForward->getOutputNumElements( batchSize ); + int *selectors = new int[outputNumElements]; + float *output = new float[outputNumElements]; activationForward->forward( batchSize, data, selectors, output ); diff --git a/test/testactivationforward.cpp b/test/testactivationforward.cpp index 4c1f2660..9e6eace3 100644 --- a/test/testactivationforward.cpp +++ b/test/testactivationforward.cpp @@ -28,9 +28,9 @@ TEST( testactivationforward, basic ) { 3, 33, 14,23, -1, -3.5f,37.4f,5 }; - int outputSize = activationForward->getOutputSize( batchSize ); - EXPECT_EQ( outputSize, imageSize * imageSize ); - float *output = new float[outputSize]; + int outputNumElements = activationForward->getOutputNumElements( batchSize ); + EXPECT_EQ( outputNumElements, imageSize * imageSize 
); + float *output = new float[outputNumElements]; activationForward->forward( batchSize, data, output ); @@ -65,8 +65,8 @@ TEST( testactivationforward, basic_2plane_batchsize2 ) { -1, -3.5f, 37.4f,5 }; - int outputSize = activationForward->getOutputSize( batchSize ); - float *output = new float[outputSize]; + int outputNumElements = activationForward->getOutputNumElements( batchSize ); + float *output = new float[outputNumElements]; activationForward->forward( batchSize, data, output ); @@ -94,12 +94,12 @@ TEST( testactivationforward, fromwrappers ) { 3, 33, 14,23, -1, -3.5f,37.4f,5 }; - int outputSize = activationForward->getOutputSize( batchSize ); - float *output = new float[outputSize]; + int outputNumElements = activationForward->getOutputNumElements( batchSize ); + float *output = new float[outputNumElements]; - const int inputSize = batchSize * numPlanes * imageSize * imageSize; - CLWrapper *inputWrapper = cl->wrap( inputSize, input ); - CLWrapper *outputWrapper = cl->wrap( outputSize, output ); + const int inputNumElements = batchSize * numPlanes * imageSize * imageSize; + CLWrapper *inputWrapper = cl->wrap( inputNumElements, input ); + CLWrapper *outputWrapper = cl->wrap( outputNumElements, output ); inputWrapper->copyToDevice(); @@ -198,18 +198,18 @@ void compareSpecific( CompareSpecificArgs args ) { ActivationForward *activationForward0 = ActivationForward::instanceSpecific( args._instance0, cl, numPlanes, imageSize, ActivationFunction::fromName( args._activation ) ); ActivationForward *activationForward1 = ActivationForward::instanceSpecific( args._instance1, cl, numPlanes, imageSize, ActivationFunction::fromName( args._activation ) ); - const int inputSize = batchSize * numPlanes * imageSize * imageSize; - int outputSize = activationForward0->getOutputSize( batchSize ); + const int inputNumElements = batchSize * numPlanes * imageSize * imageSize; + int outputNumElements = activationForward0->getOutputNumElements( batchSize ); - float *input = new float[ inputSize ]; - float *output = new float[ outputSize ]; + float *input = new float[ inputNumElements ]; + float *output = new float[ outputNumElements ]; - CLWrapper *inputWrapper = cl->wrap( inputSize, input ); - CLWrapper *outputWrapper = cl->wrap( outputSize, output ); + CLWrapper *inputWrapper = cl->wrap( inputNumElements, input ); + CLWrapper *outputWrapper = cl->wrap( outputNumElements, output ); - WeightRandomizer::randomize( input, inputSize, -0.1f, 0.1f ); + WeightRandomizer::randomize( input, inputNumElements, -0.1f, 0.1f ); - memset( output, 99, sizeof(int) * outputSize ); + memset( output, 99, sizeof(int) * outputNumElements ); inputWrapper->copyToDevice(); outputWrapper->copyToDevice(); @@ -217,10 +217,10 @@ void compareSpecific( CompareSpecificArgs args ) { activationForward0->forward( batchSize, inputWrapper, outputWrapper ); outputWrapper->copyToHost(); - float *output0 = new float[ outputSize ]; - memcpy( output0, output, sizeof(float) * outputSize ); + float *output0 = new float[ outputNumElements ]; + memcpy( output0, output, sizeof(float) * outputNumElements ); - memset( output, 99, sizeof(int) * outputSize ); + memset( output, 99, sizeof(int) * outputNumElements ); inputWrapper->copyToDevice(); outputWrapper->copyToDevice(); @@ -229,7 +229,7 @@ void compareSpecific( CompareSpecificArgs args ) { outputWrapper->copyToHost(); int numErrors = 0; - for( int i = 0; i < outputSize; i++ ) { + for( int i = 0; i < outputNumElements; i++ ) { bool ok = true; if( ( output[i] > 0 && output0[i] < 0 ) || ( output[i] < 0 
&& output0[i] > 0 ) ) { cout << "signs differ" << endl; @@ -262,7 +262,7 @@ void compareSpecific( CompareSpecificArgs args ) { } EXPECT_EQ( 0, numErrors ); if( numErrors > 0 ) { - int num2dPlanes = inputSize / imageSize / imageSize; + int num2dPlanes = inputNumElements / imageSize / imageSize; for( int plane = 0; plane < num2dPlanes; plane++ ) { cout << "2dplane " << plane << ":" << endl; for( int i = 0; i < imageSize; i++ ) { diff --git a/test/testbackward.cpp b/test/testbackward.cpp index 30f9740a..fc0c456f 100644 --- a/test/testbackward.cpp +++ b/test/testbackward.cpp @@ -18,16 +18,21 @@ #include "conv/ConvolutionalLayer.h" #include "input/InputLayer.h" #include "trainers/SGD.h" +#include "clblas/ClBlasInstance.h" + +#include "clBLAS.h" #include "gtest/gtest.h" #include "test/gtest_supp.h" #include "test/Sampler.h" #include "test/WeightRandomizer.h" +#include "test/TestArgsParser.h" +#include "test/DimFromArgs.h" using namespace std; -TEST( testbackward, squareloss ) { +TEST(testbackward, squareloss) { // here's the plan: // generate some input, randomly // generate some expected output, randomly @@ -37,9 +42,9 @@ TEST( testbackward, squareloss ) { // change some of the inputs, forward prop, recalculate loss, check corresponds // to the gradient EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - NeuralNet *net = new NeuralNet( cl, 3, 5 ); - net->addLayer( ForceBackpropLayerMaker::instance() ); - net->addLayer( SquareLossMaker::instance() ); + NeuralNet *net = new NeuralNet(cl, 3, 5); + net->addLayer(ForceBackpropLayerMaker::instance()); + net->addLayer(SquareLossMaker::instance()); cout << net->asString() << endl; int batchSize = 32; @@ -56,27 +61,27 @@ TEST( testbackward, squareloss ) { float *input = new float[inputTotalSize]; float *expectedOutput = new float[outputTotalSize]; - WeightRandomizer::randomize( 0, input, inputTotalSize, -2.0f, 2.0f ); - WeightRandomizer::randomize( 1, expectedOutput, outputTotalSize, -2.0f, 2.0f ); + WeightRandomizer::randomize(0, input, inputTotalSize, -2.0f, 2.0f); + WeightRandomizer::randomize(1, expectedOutput, outputTotalSize, -2.0f, 2.0f); // now, forward prop -// net->input( input ); - net->forward( input ); +// net->input(input); + net->forward(input); net->print(); // net->printOutput(); // calculate loss - float lossBefore = net->calcLoss( expectedOutput ); + float lossBefore = net->calcLoss(expectedOutput); // calculate gradInput - net->backward( expectedOutput); + net->backward(expectedOutput); // modify input slightly mt19937 random; const int numSamples = 10; - for( int i = 0; i < numSamples; i++ ) { + for(int i = 0; i < numSamples; i++) { int inputIndex; - WeightRandomizer::randomizeInts( i, &inputIndex, 1, 0, inputTotalSize ); + WeightRandomizer::randomizeInts(i, &inputIndex, 1, 0, inputTotalSize); // cout << "i=" << i << " index " << inputIndex << endl; float oldValue = input[inputIndex]; // grad for this index is.... 
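The testbackward hunks above and below all reformat the same pattern: a finite-difference gradient check. One input coordinate is perturbed, the forward pass is re-run, and the measured loss change is compared against the change predicted from the backward pass. A minimal sketch of that check, with gradFromBackprop standing in for the gradient read-out that this excerpt elides:

    // finite-difference check for a single input coordinate (sketch)
    float delta = 0.001f;                                  // perturbation step
    float oldValue = input[inputIndex];
    float predictedLossChange = gradFromBackprop * delta;  // dL/dx * dx
    input[inputIndex] = oldValue + delta;
    net->forward(input);                                   // re-evaluate loss
    float lossChange = net->calcLoss(expectedOutput) - lossBefore;
    input[inputIndex] = oldValue;                          // restore the input
    // for small delta, lossChange should approach predictedLossChange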
@@ -89,10 +94,10 @@ TEST( testbackward, squareloss ) { input[inputIndex] = newValue; // cout << "oldvalue=" << oldValue << " newvalue=" << newValue << endl; // forwardProp - net->forward( input ); + net->forward(input); input[inputIndex] = oldValue; // net->printOutput(); - float lossAfter = net->calcLoss( expectedOutput ); + float lossAfter = net->calcLoss(expectedOutput); float lossChange = lossAfter - lossBefore; cout << "idx=" << inputIndex << " predicted losschange=" << predictedLossChange << " actual=" << lossChange << endl; } @@ -104,7 +109,7 @@ TEST( testbackward, squareloss ) { delete cl; } -TEST( testbackward, crossentropyloss ) { +TEST(testbackward, crossentropyloss) { // here's the plan: // generate some input, randomly // generate some expected output, randomly @@ -114,9 +119,9 @@ TEST( testbackward, crossentropyloss ) { // change some of the inputs, forward prop, recalculate loss, check corresponds // to the gradient EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - NeuralNet *net = new NeuralNet( cl, 3, 5 ); - net->addLayer( ForceBackpropLayerMaker::instance() ); - net->addLayer( CrossEntropyLossMaker::instance() ); + NeuralNet *net = new NeuralNet(cl, 3, 5); + net->addLayer(ForceBackpropLayerMaker::instance()); + net->addLayer(CrossEntropyLossMaker::instance()); cout << net->asString() << endl; int batchSize = 4; @@ -133,27 +138,27 @@ TEST( testbackward, crossentropyloss ) { float *input = new float[inputTotalSize]; float *expectedOutput = new float[outputTotalSize]; - WeightRandomizer::randomize( 0, input, inputTotalSize, 0.0f, 1.0f ); - WeightRandomizer::randomize( 1, expectedOutput, outputTotalSize, 0.0f, 1.0f ); + WeightRandomizer::randomize(0, input, inputTotalSize, 0.0f, 1.0f); + WeightRandomizer::randomize(1, expectedOutput, outputTotalSize, 0.0f, 1.0f); // now, forward prop -// net->input( input ); - net->forward( input ); +// net->input(input); + net->forward(input); net->print(); // net->printOutput(); // calculate loss - float lossBefore = net->calcLoss( expectedOutput ); + float lossBefore = net->calcLoss(expectedOutput); // calculate gradInput - net->backward( expectedOutput); + net->backward(expectedOutput); // modify input slightly mt19937 random; const int numSamples = 10; - for( int i = 0; i < numSamples; i++ ) { + for(int i = 0; i < numSamples; i++) { int inputIndex; - WeightRandomizer::randomizeInts( i, &inputIndex, 1, 0, inputTotalSize ); + WeightRandomizer::randomizeInts(i, &inputIndex, 1, 0, inputTotalSize); // cout << "i=" << i << " index " << inputIndex << endl; float oldValue = input[inputIndex]; // grad for this index is.... 
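The crossentropyloss test applies the same perturb-and-compare loop to a cross-entropy head, and its console output is read the same way: if the backward pass reports a gradient of, say, 0.8 at the sampled index and the perturbation step is +0.01, the predicted loss change is 0.8 * 0.01 = 0.008, and the printed "actual" value should match it up to a curvature error of order delta squared.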
@@ -166,10 +171,10 @@ TEST( testbackward, crossentropyloss ) { input[inputIndex] = newValue; // cout << "oldvalue=" << oldValue << " newvalue=" << newValue << endl; // forwardProp - net->forward( input ); + net->forward(input); input[inputIndex] = oldValue; // net->printOutput(); - float lossAfter = net->calcLoss( expectedOutput ); + float lossAfter = net->calcLoss(expectedOutput); float lossChange = lossAfter - lossBefore; cout << "idx=" << inputIndex << " predicted losschange=" << predictedLossChange << " actual=" << lossChange << endl; } @@ -181,22 +186,22 @@ TEST( testbackward, crossentropyloss ) { delete cl; } -void normalizeAsProbabilityDistribution( int numPlanes, float *values, int N ) { +void normalizeAsProbabilityDistribution(int numPlanes, float *values, int N) { int batchSize = N / numPlanes; // int cubeSize = numPlanes; - for( int n = 0; n < batchSize; n++ ) { + for(int n = 0; n < batchSize; n++) { float *thisCube = values + n * numPlanes; float total = 0; - for( int i = 0; i < numPlanes; i++ ) { + for(int i = 0; i < numPlanes; i++) { total += thisCube[i]; } - for( int i = 0; i < numPlanes; i++ ) { + for(int i = 0; i < numPlanes; i++) { thisCube[i] /= total; } } } -TEST( testbackward, softmaxloss ) { +TEST(testbackward, softmaxloss) { // here's the plan: // generate some input, randomly // generate some expected output, randomly @@ -206,9 +211,9 @@ TEST( testbackward, softmaxloss ) { // change some of the inputs, forward prop, recalculate loss, check corresponds // to the gradient EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - NeuralNet *net = new NeuralNet( cl, 5, 1 ); - net->addLayer( ForceBackpropLayerMaker::instance() ); - net->addLayer( SoftMaxMaker::instance() ); + NeuralNet *net = new NeuralNet(cl, 5, 1); + net->addLayer(ForceBackpropLayerMaker::instance()); + net->addLayer(SoftMaxMaker::instance()); cout << net->asString() << endl; const int batchSize = 2; @@ -226,45 +231,45 @@ TEST( testbackward, softmaxloss ) { float *input = new float[inputTotalSize]; float *expectedOutput = new float[outputTotalSize]; - WeightRandomizer::randomize( 0, input, inputTotalSize, 0.0f, 1.0f ); - WeightRandomizer::randomize( 1, expectedOutput, outputTotalSize, 0.0f, 1.0f ); + WeightRandomizer::randomize(0, input, inputTotalSize, 0.0f, 1.0f); + WeightRandomizer::randomize(1, expectedOutput, outputTotalSize, 0.0f, 1.0f); // we should make the input and output a probability distribution I think // so: add up the input, and divide each by that. do same for expectedoutput (?) 
-// normalizeAsProbabilityDistribution( input, inputTotalSize ); - normalizeAsProbabilityDistribution( outputPlanes, expectedOutput, outputTotalSize ); +// normalizeAsProbabilityDistribution(input, inputTotalSize); + normalizeAsProbabilityDistribution(outputPlanes, expectedOutput, outputTotalSize); // set all to zero, and one to 1, ie like labelled data -// for( int i = 0; i < outputTotalSize; i++ ) { +// for(int i = 0; i < outputTotalSize; i++) { // expectedOutput[i] = 0; // } -// for( int n = 0; n < batchSize; n++ ) { +// for(int n = 0; n < batchSize; n++) { // int chosenLabel = 0; -// WeightRandomizer::randomizeInts( n, &chosenLabel, 1, 0, net->getOutputPlanes() ); +// WeightRandomizer::randomizeInts(n, &chosenLabel, 1, 0, net->getOutputPlanes()); // expectedOutput[ n * outputPlanes + chosenLabel ] = 1; // } -// for( int i = 0; i < outputTotalSize; i++ ) { +// for(int i = 0; i < outputTotalSize; i++) { // cout << "expected[" << i << "]=" << expectedOutput[i] << endl; // } // // now, forward prop -// net->input( input ); - net->forward( input ); +// net->input(input); + net->forward(input); net->print(); // net->printOutput(); // calculate loss - float lossBefore = net->calcLoss( expectedOutput ); + float lossBefore = net->calcLoss(expectedOutput); // calculate gradInput - net->backward( expectedOutput); + net->backward(expectedOutput); // modify input slightly mt19937 random; const int numSamples = 10; - for( int i = 0; i < numSamples; i++ ) { + for(int i = 0; i < numSamples; i++) { int inputIndex; - WeightRandomizer::randomizeInts( i, &inputIndex, 1, 0, inputTotalSize ); + WeightRandomizer::randomizeInts(i, &inputIndex, 1, 0, inputTotalSize); // cout << "i=" << i << " index " << inputIndex << endl; float oldValue = input[inputIndex]; // grad for this index is.... 
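normalizeAsProbabilityDistribution, introduced in the hunk above, rescales each cube of numPlanes values to sum to 1, which is what turns the randomly drawn expectedOutput into a legal softmax target. A quick usage sketch:

    float v[4] = {1, 3,  2, 2};             // two cubes of numPlanes=2
    normalizeAsProbabilityDistribution(2, v, 4);
    // v is now {0.25f, 0.75f,  0.5f, 0.5f}: each cube sums to 1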
@@ -277,10 +282,10 @@ TEST( testbackward, softmaxloss ) { input[inputIndex] = newValue; // cout << "oldvalue=" << oldValue << " newvalue=" << newValue << endl; // forwardProp - net->forward( input ); + net->forward(input); input[inputIndex] = oldValue; // net->printOutput(); - float lossAfter = net->calcLoss( expectedOutput ); + float lossAfter = net->calcLoss(expectedOutput); float lossChange = lossAfter - lossBefore; cout << "idx=" << inputIndex << " predicted losschange=" << predictedLossChange << " actual=" << lossChange << endl; } @@ -292,7 +297,7 @@ TEST( testbackward, softmaxloss ) { delete cl; } -void checkLayer( NeuralNet *net, int targetLayerIndex ) { +void checkLayer(NeuralNet *net, int targetLayerIndex) { // here's the plan: // generate some input, randomly // generate some expected output, randomly @@ -320,50 +325,50 @@ void checkLayer( NeuralNet *net, int targetLayerIndex ) { Layer *layer = net->getLayer(targetLayerIndex); // in fact we dont really need to randomize the weights, since // the weights are randomized anyway -// if( layer->getPersistSize() > 0 ) { +// if(layer->getPersistSize() > 0) { // int weightsSize = layer->getWeightsSize(); // int biasSize = layer->getBiasSize(); // cout << "weightsize=" << weightsSize << " biassize=" << biasSize << endl; // float *weights = new float[weightsSize]; // float *bias = new float[biasSize]; -// WeightRandomizer::randomize( 2, weights, weightsSize, -0.1f, 0.1f ); -// WeightRandomizer::randomize( 3, bias, biasSize, -0.1f, 0.1f ); -// if( weightsSize > 0 || biasSize > 0 ) { -// layer->setWeights( weights, bias ); +// WeightRandomizer::randomize(2, weights, weightsSize, -0.1f, 0.1f); +// WeightRandomizer::randomize(3, bias, biasSize, -0.1f, 0.1f); +// if(weightsSize > 0 || biasSize > 0) { +// layer->setWeights(weights, bias); // } // delete[] weights; // delete[] bias; // } cout << "layer " << layer->asString() << endl; - WeightRandomizer::randomize( 0, input, inputTotalSize, -1.0f, 1.0f ); - WeightRandomizer::randomize( 1, expectedOutput, outputTotalSize, 0.0f, 1.0f ); + WeightRandomizer::randomize(0, input, inputTotalSize, -1.0f, 1.0f); + WeightRandomizer::randomize(1, expectedOutput, outputTotalSize, 0.0f, 1.0f); // we should make the input and output a probability distribution I think // so: add up the input, and divide each by that. do same for expectedoutput (?) 
-// normalizeAsProbabilityDistribution( input, inputTotalSize ); - normalizeAsProbabilityDistribution( outputPlanes, expectedOutput, outputTotalSize ); +// normalizeAsProbabilityDistribution(input, inputTotalSize); + normalizeAsProbabilityDistribution(outputPlanes, expectedOutput, outputTotalSize); // now, forward prop -// net->input( input ); - net->forward( input ); +// net->input(input); + net->forward(input); net->print(); // net->printOutput(); // calculate loss - float lossBefore = net->calcLoss( expectedOutput ); + float lossBefore = net->calcLoss(expectedOutput); // calculate gradInput // should be zero, so we dont modify the weights // otherwise the losses will be really strange :-) - net->backward( expectedOutput); + net->backward(expectedOutput); // modify input slightly mt19937 random; const int numSamples = 10; - for( int i = 0; i < numSamples; i++ ) { + for(int i = 0; i < numSamples; i++) { int inputIndex; - WeightRandomizer::randomizeInts( i, &inputIndex, 1, 0, inputTotalSize ); + WeightRandomizer::randomizeInts(i, &inputIndex, 1, 0, inputTotalSize); // cout << "i=" << i << " index " << inputIndex << endl; float oldValue = input[inputIndex]; // grad for this index is.... @@ -376,10 +381,10 @@ void checkLayer( NeuralNet *net, int targetLayerIndex ) { input[inputIndex] = newValue; // cout << "oldvalue=" << oldValue << " newvalue=" << newValue << endl; // forwardProp - net->forward( input ); + net->forward(input); input[inputIndex] = oldValue; // net->printOutput(); - float lossAfter = net->calcLoss( expectedOutput ); + float lossAfter = net->calcLoss(expectedOutput); float lossChange = lossAfter - lossBefore; cout << "idx=" << inputIndex << " predicted losschange=" << predictedLossChange << " actual=" << lossChange << endl; } @@ -388,158 +393,161 @@ void checkLayer( NeuralNet *net, int targetLayerIndex ) { delete[] input; } -TEST( testbackward, squareloss2 ) { +TEST(testbackward, squareloss2) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - NeuralNet *net = new NeuralNet( cl, 5, 1 ); - net->addLayer( ForceBackpropLayerMaker::instance() ); - net->addLayer( SquareLossMaker::instance() ); + NeuralNet *net = new NeuralNet(cl, 5, 1); + net->addLayer(ForceBackpropLayerMaker::instance()); + net->addLayer(SquareLossMaker::instance()); cout << net->asString() << endl; // int batchSize = ; net->setBatchSize(32); - checkLayer( net, 2 ); + checkLayer(net, 2); delete net; delete cl; } -TEST( testbackward, crossentropy2 ) { +TEST(testbackward, crossentropy2) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - NeuralNet *net = new NeuralNet( cl, 5, 1 ); - net->addLayer( ForceBackpropLayerMaker::instance() ); - net->addLayer( CrossEntropyLossMaker::instance() ); + NeuralNet *net = new NeuralNet(cl, 5, 1); + net->addLayer(ForceBackpropLayerMaker::instance()); + net->addLayer(CrossEntropyLossMaker::instance()); cout << net->asString() << endl; // int batchSize = ; net->setBatchSize(2); - checkLayer( net, 2 ); + checkLayer(net, 2); delete net; delete cl; } -TEST( testbackward, softmax2 ) { +TEST(testbackward, softmax2) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - NeuralNet *net = new NeuralNet( cl, 5, 1 ); - net->addLayer( ForceBackpropLayerMaker::instance() ); - net->addLayer( SoftMaxMaker::instance() ); + NeuralNet *net = new NeuralNet(cl, 5, 1); + net->addLayer(ForceBackpropLayerMaker::instance()); + net->addLayer(SoftMaxMaker::instance()); cout << net->asString() << endl; // int batchSize = ; net->setBatchSize(2); - checkLayer( net, 2 ); + checkLayer(net, 2); 
delete net; delete cl; } -TEST( testbackward, conv1 ) { +TEST(testbackward, conv1) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - NeuralNet *net = new NeuralNet( cl, 2, 4 ); - net->addLayer( ForceBackpropLayerMaker::instance() ); - net->addLayer( ConvolutionalMaker::instance()->numFilters(2)->filterSize(3)->biased(0)->padZeros(0) ); - net->addLayer( SquareLossMaker::instance() ); -// net->addLayer( SoftMaxMaker::instance() ); // maybe should use square loss maker, or cross entropy, + ClBlasInstance blasInstance; + NeuralNet *net = new NeuralNet(cl, 2, 4); + net->addLayer(ForceBackpropLayerMaker::instance()); + net->addLayer(ConvolutionalMaker::instance()->numFilters(2)->filterSize(3)->biased(0)->padZeros(0)); + net->addLayer(SquareLossMaker::instance()); +// net->addLayer(SoftMaxMaker::instance()); // maybe should use square loss maker, or cross entropy, // so that dont have to make filtersize == input image size? cout << net->asString() << endl; net->setBatchSize(4); - checkLayer( net, 2 ); + checkLayer(net, 2); delete net; delete cl; } -TEST( testbackward, fc1 ) { +TEST(testbackward, fc1) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - NeuralNet *net = new NeuralNet( cl, 2, 4 ); - net->addLayer( ForceBackpropLayerMaker::instance() ); - net->addLayer( FullyConnectedMaker::instance()->numPlanes(4)->imageSize(1)->biased(0) ); - net->addLayer( SquareLossMaker::instance() ); -// net->addLayer( SoftMaxMaker::instance() ); // maybe should use square loss maker, or cross entropy, + ClBlasInstance blasInstance; + NeuralNet *net = new NeuralNet(cl, 2, 4); + net->addLayer(ForceBackpropLayerMaker::instance()); + net->addLayer(FullyConnectedMaker::instance()->numPlanes(4)->imageSize(1)->biased(0)); + net->addLayer(SquareLossMaker::instance()); +// net->addLayer(SoftMaxMaker::instance()); // maybe should use square loss maker, or cross entropy, // so that dont have to make filtersize == input image size? cout << net->asString() << endl; net->setBatchSize(4); - checkLayer( net, 2 ); + checkLayer(net, 2); delete net; delete cl; } -TEST( testbackward, act1 ) { +TEST(testbackward, act1) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - NeuralNet *net = new NeuralNet( cl, 1, 2 ); - net->addLayer( ForceBackpropLayerMaker::instance() ); - net->addLayer( ActivationMaker::instance()->relu() ); - net->addLayer( SquareLossMaker::instance() ); -// net->addLayer( SoftMaxMaker::instance() ); // maybe should use square loss maker, or cross entropy, + NeuralNet *net = new NeuralNet(cl, 1, 2); + net->addLayer(ForceBackpropLayerMaker::instance()); + net->addLayer(ActivationMaker::instance()->relu()); + net->addLayer(SquareLossMaker::instance()); +// net->addLayer(SoftMaxMaker::instance()); // maybe should use square loss maker, or cross entropy, // so that dont have to make filtersize == input image size? 
cout << net->asString() << endl; net->setBatchSize(1); - checkLayer( net, 2 ); + checkLayer(net, 2); delete net; delete cl; } // This file contains tests for calculating errors for the upstream layer
-void testNumerically( float learningRate, int batchSize, int imageSize, int filterSize, int numPlanes, ActivationFunction *fn, bool padZeros, int its = 20 ) {
+void testNumerically(float learningRate, int batchSize, int imageSize, int filterSize, int numPlanes, ActivationFunction *fn, bool padZeros, int its = 20) {
 EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance clblasInstance; NeuralNet *net = NeuralNet::maker(cl)->planes(numPlanes)->imageSize(imageSize)->instance();
- net->addLayer( ConvolutionalMaker::instance()->numFilters(1)->filterSize(filterSize)->biased(0)->padZeros(padZeros) ); - net->addLayer( ActivationMaker::instance()->fn(fn) );
- net->addLayer( ConvolutionalMaker::instance()->numFilters(1)->filterSize(filterSize)->biased(0)->padZeros(padZeros) ); - net->addLayer( ActivationMaker::instance()->fn(fn) );
- net->addLayer( SquareLossMaker::instance() ); - net->setBatchSize( batchSize ); - - int inputSize = net->getLayer(0)->getOutputSize(); - int outputSize = net->getLastLayer()->getOutputSize();
+ net->addLayer(ConvolutionalMaker::instance()->numFilters(1)->filterSize(filterSize)->biased(0)->padZeros(padZeros)); + net->addLayer(ActivationMaker::instance()->fn(fn));
+ net->addLayer(ConvolutionalMaker::instance()->numFilters(1)->filterSize(filterSize)->biased(0)->padZeros(padZeros)); + net->addLayer(ActivationMaker::instance()->fn(fn));
+ net->addLayer(SquareLossMaker::instance()); + net->setBatchSize(batchSize); + + int inputNumElements = net->getLayer(0)->getOutputNumElements(); + int outputNumElements = net->getLastLayer()->getOutputNumElements();
 int weightsSize1 = net->getLayer(1)->getWeightsSize(); int weightsSize2 = net->getLayer(3)->getWeightsSize();
- float *inputData = new float[std::max(10000, inputSize )]; - float *expectedOutput = new float[std::max(10000, outputSize )]; - memset( inputData, 0, sizeof(float) * std::max(10000, inputSize ) ); - memset( expectedOutput, 0, sizeof(float) * std::max(10000, outputSize ) );
+ float *inputData = new float[std::max(10000, inputNumElements)]; + float *expectedOutput = new float[std::max(10000, outputNumElements)]; + memset(inputData, 0, sizeof(float) * std::max(10000, inputNumElements)); + memset(expectedOutput, 0, sizeof(float) * std::max(10000, outputNumElements));
 // int seed = 0;
- std::mt19937 random = WeightRandomizer::randomize( inputData, std::max(10000, inputSize ), -2.0f, 2.0f ); - WeightRandomizer::randomize( random, expectedOutput, std::max(10000, outputSize ), -2.0f, 2.0f ); - WeightRandomizer::randomize( random, dynamic_cast<ConvolutionalLayer *>(net->getLayer(1))->weights, weightsSize1, -2.0f, 2.0f );
+ std::mt19937 random = WeightRandomizer::randomize(inputData, std::max(10000, inputNumElements), -2.0f, 2.0f); + WeightRandomizer::randomize(random, expectedOutput, std::max(10000, outputNumElements), -2.0f, 2.0f); + WeightRandomizer::randomize(random, dynamic_cast<ConvolutionalLayer *>(net->getLayer(1))->weights, weightsSize1, -2.0f, 2.0f);
 dynamic_cast<ConvolutionalLayer *>(net->getLayer(1))->weightsWrapper->copyToDevice();
- WeightRandomizer::randomize( random, dynamic_cast<ConvolutionalLayer *>(net->getLayer(3))->weights, weightsSize2, -2.0f, 2.0f );
+ WeightRandomizer::randomize(random, dynamic_cast<ConvolutionalLayer *>(net->getLayer(3))->weights, weightsSize2, -2.0f, 2.0f);
 dynamic_cast<ConvolutionalLayer *>(net->getLayer(3))->weightsWrapper->copyToDevice();
- SGD *sgd = SGD::instance( cl, learningRate, 0.0f ); - for( int it = 0; it < its; it++ ) { + SGD *sgd = SGD::instance(cl, learningRate, 0.0f); + for(int it = 0; it < its; it++) {
 float *weightsBefore1 = new float[weightsSize1]; float *currentWeights = net->getLayer(1)->getWeights(); - for( int i = 0; i < weightsSize1; i++ ) { + for(int i = 0; i < weightsSize1; i++) { weightsBefore1[i] = currentWeights[i]; }
 float *weightsBefore2 = new float[weightsSize2]; currentWeights = net->getLayer(3)->getWeights(); - for( int i = 0; i < weightsSize2; i++ ) { + for(int i = 0; i < weightsSize2; i++) { weightsBefore2[i] = currentWeights[i]; }
- net->forward( inputData ); + net->forward(inputData); // net->print(); float loss = net->calcLoss(expectedOutput); dynamic_cast<LossLayer *>(net->getLayer(5))->calcLoss(expectedOutput);
-// net->backward( expectedOutput ); +// net->backward(expectedOutput); TrainingContext context(0, 0); - sgd->train( net, &context, inputData, expectedOutput ); + sgd->train(net, &context, inputData, expectedOutput);
 dynamic_cast<ConvolutionalLayer *>(net->getLayer(1))->weightsWrapper->copyToHost();
 // restore 2nd layer weights :-) - for( int i = 0; i < weightsSize2; i++ ) { + for(int i = 0; i < weightsSize2; i++) { // dynamic_cast<ConvolutionalLayer *>(net->getLayer(2))->weights[i] = weightsBefore2[i]; }
 dynamic_cast<ConvolutionalLayer *>(net->getLayer(3))->weightsWrapper->copyToDevice();
- net->forward( inputData ); + net->forward(inputData); float loss2 = net->calcLoss(expectedOutput); float lossChange = loss - loss2;
@@ -548,40 +556,43 @@ void testNumerically( float learningRate, int batchSize, int imageSize, int filt
 float *newWeights = net->getLayer(1)->getWeights(); float sumWeightDiff = 0; float sumWeightDiffSquared = 0;
- for( int i = 0; i < weightsSize1; i++ ) { + for(int i = 0; i < weightsSize1; i++) { float diff = newWeights[i] - weightsBefore1[i]; sumWeightDiff += diff; sumWeightDiffSquared += diff * diff; }
 newWeights = net->getLayer(3)->getWeights();
- for( int i = 0; i < weightsSize2; i++ ) { + for(int i = 0; i < weightsSize2; i++) { float diff = newWeights[i] - weightsBefore2[i]; sumWeightDiff += diff; sumWeightDiffSquared += diff * diff; }
 cout << "sumweightsdiff " << sumWeightDiff << endl;
- // cout << "sumweightsdiff / learningrate " << (sumWeightDiff / learningRate ) << endl; - // cout << "sum weightsdiffsquared " << (sumWeightDiffSquared/ learningRate / learningRate * imageSize ) << endl;
+ // cout << "sumweightsdiff / learningrate " << (sumWeightDiff / learningRate) << endl; + // cout << "sum weightsdiffsquared " << (sumWeightDiffSquared/ learningRate / learningRate * imageSize) << endl;
 float estimatedLossChangeFromW = sumWeightDiffSquared/ learningRate; // / filterSize;
 cout << " loss change " << lossChange << endl; cout << " estimatedLossChangeFromW " << estimatedLossChangeFromW << endl;
- // cout << abs(estimatedLossChangeFromW - lossChange ) / lossChange << endl; - // cout << abs(estimatedLossChangeFromW - lossChange ) / estimatedLossChangeFromW << endl;
- EXPECT_GT( 0.01f * imageSize * imageSize, abs(estimatedLossChangeFromW - lossChange ) / lossChange ); - EXPECT_GT( 0.01f * imageSize * imageSize, abs(estimatedLossChangeFromW - lossChange ) / estimatedLossChangeFromW );
+ // cout << abs(estimatedLossChangeFromW - lossChange) / lossChange << endl; + // cout << abs(estimatedLossChangeFromW - lossChange) / estimatedLossChangeFromW << endl;
+ EXPECT_GT(0.01f * imageSize * imageSize, abs(estimatedLossChangeFromW - lossChange) / lossChange); + EXPECT_GT(0.01f * imageSize * imageSize, abs(estimatedLossChangeFromW - lossChange) / estimatedLossChangeFromW);
+ delete[] weightsBefore1; + delete[]
weightsBefore2; } - // delete[] weights1; // delete[] errors; // delete[] output; + delete sgd; delete[] inputData; + delete[] expectedOutput; delete net; delete cl; } -TEST( testbackward, checknumerically ) { +TEST(testbackward, checknumerically) { float learningRate = 0.1f; const int batchSize = 1; const int imageSize = 1; @@ -589,10 +600,10 @@ TEST( testbackward, checknumerically ) { const int numPlanes = 1; bool padZeros = false; - testNumerically( learningRate, batchSize, imageSize, filterSize, numPlanes, new TanhActivation(), padZeros, 5 ); + testNumerically(learningRate, batchSize, imageSize, filterSize, numPlanes, new TanhActivation(), padZeros, 5); } -TEST( testbackward, checknumerically_imagesize5_filter3_relu ) { +TEST(testbackward, checknumerically_imagesize5_filter3_relu) { float learningRate = 0.0001f; const int batchSize = 1; const int imageSize = 5; @@ -601,40 +612,40 @@ TEST( testbackward, checknumerically_imagesize5_filter3_relu ) { ActivationFunction *fn = new ReluActivation(); bool padZeros = true; - testNumerically( learningRate, batchSize, imageSize, filterSize, numPlanes, fn, padZeros ); + testNumerically(learningRate, batchSize, imageSize, filterSize, numPlanes, fn, padZeros); } -void measurePerf( int instance, int batchSize, LayerDimensions dim ) { +void measurePerf(int instance, int batchSize, LayerDimensions dim) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - int inputSize = dim.inputCubeSize * batchSize; + int inputNumElements = dim.inputCubeSize * batchSize; int errorsSize = dim.outputCubeSize * batchSize; int weightsSize = dim.filtersSize; int errorsForUpstreamSize = dim.inputCubeSize * batchSize; - float *input = new float[inputSize]; + float *input = new float[inputNumElements]; float *errors = new float[errorsSize]; float *weights = new float[weightsSize]; - WeightRandomizer::randomize( input, inputSize, -0.1f, 0.1f ); - WeightRandomizer::randomize( errors, errorsSize, -0.1f, 0.1f ); - WeightRandomizer::randomize( weights, weightsSize, -0.1f, 0.1f ); + WeightRandomizer::randomize(input, inputNumElements, -0.1f, 0.1f); + WeightRandomizer::randomize(errors, errorsSize, -0.1f, 0.1f); + WeightRandomizer::randomize(weights, weightsSize, -0.1f, 0.1f); float *errorsForUpstream = new float[errorsForUpstreamSize]; - CLWrapper *inputWrapper = cl->wrap( inputSize, input ); - CLWrapper *errorsWrapper = cl->wrap( errorsSize, errors ); - CLWrapper *weightsWrapper = cl->wrap( weightsSize, weights ); - CLWrapper *errorsForUpstreamWrapper = cl->wrap( errorsForUpstreamSize, errorsForUpstream ); + CLWrapper *inputWrapper = cl->wrap(inputNumElements, input); + CLWrapper *errorsWrapper = cl->wrap(errorsSize, errors); + CLWrapper *weightsWrapper = cl->wrap(weightsSize, weights); + CLWrapper *errorsForUpstreamWrapper = cl->wrap(errorsForUpstreamSize, errorsForUpstream); inputWrapper->copyToDevice(); errorsWrapper->copyToDevice(); weightsWrapper->copyToDevice(); errorsForUpstreamWrapper->createOnDevice(); StatefulTimer::timeCheck("after init"); - Backward *backwardImpl = Backward::instanceSpecific( instance, cl, dim ); - for( int it = 0; it < 40; it++ ) { - backwardImpl->backward( batchSize, + Backward *backwardImpl = Backward::instanceSpecific(instance, cl, dim); + for(int it = 0; it < 40; it++) { + backwardImpl->backward(batchSize, inputWrapper, errorsWrapper, weightsWrapper, - errorsForUpstreamWrapper ); + errorsForUpstreamWrapper); } StatefulTimer::timeCheck("after backprop"); StatefulTimer::dump(true); @@ -653,40 +664,42 @@ void measurePerf( int instance, int 
batchSize, LayerDimensions dim ) { delete cl; } -TEST( SLOW_testbackward, perf_kgsgo_32c5 ) { +TEST(SLOW_testbackward, perf_kgsgo_32c5) { int batchSize = 128; LayerDimensions dim; - dim.setInputPlanes( 32 ).setInputImageSize(19).setNumFilters( 32 ).setFilterSize( 5 ) - .setPadZeros( true ).setBiased( true ); + dim.setInputPlanes(32).setInputSize(19).setNumFilters(32).setFilterSize(5) + .setPadZeros(true).setBiased(true); cout << dim.buildOptionsString() << endl; // ActivationFunction *fn = new ReluActivation(); - measurePerf( 2, batchSize, dim ); + measurePerf(2, batchSize, dim); } -void compareSpecific( int instance0, int instance1, int batchSize, LayerDimensions dim ) { +void compareSpecific(int instance0, int instance1, int numIts, int batchSize, LayerDimensions dim) { + cout << "batchsize=" << batchSize << " " << dim << endl; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance clblasInstance; - int inputSize = dim.inputCubeSize * batchSize; + int inputNumElements = dim.inputCubeSize * batchSize; int errorsSize = dim.outputCubeSize * batchSize; int weightsSize = dim.filtersSize; int errorsForUpstreamSize = dim.inputCubeSize * batchSize; - float *input = new float[inputSize]; + float *input = new float[inputNumElements]; float *errors = new float[errorsSize]; float *weights = new float[weightsSize]; float *errorsForUpstream0 = new float[errorsForUpstreamSize]; float *errorsForUpstream1 = new float[errorsForUpstreamSize]; - WeightRandomizer::randomize( input, inputSize, -0.1f, 0.1f ); - WeightRandomizer::randomize( errors, errorsSize, -0.1f, 0.1f ); - WeightRandomizer::randomize( weights, weightsSize, -0.1f, 0.1f ); + WeightRandomizer::randomize(0, input, inputNumElements, -0.1f, 0.1f); + WeightRandomizer::randomize(1, errors, errorsSize, -0.1f, 0.1f); + WeightRandomizer::randomize(2, weights, weightsSize, -0.1f, 0.1f); - CLWrapper *inputWrapper = cl->wrap( inputSize, input ); - CLWrapper *errorsWrapper = cl->wrap( errorsSize, errors ); - CLWrapper *weightsWrapper = cl->wrap( weightsSize, weights ); - CLWrapper *errorsForUpstreamWrapper0 = cl->wrap( errorsForUpstreamSize, errorsForUpstream0 ); - CLWrapper *errorsForUpstreamWrapper1 = cl->wrap( errorsForUpstreamSize, errorsForUpstream1 ); + CLWrapper *inputWrapper = cl->wrap(inputNumElements, input); + CLWrapper *errorsWrapper = cl->wrap(errorsSize, errors); + CLWrapper *weightsWrapper = cl->wrap(weightsSize, weights); + CLWrapper *errorsForUpstreamWrapper0 = cl->wrap(errorsForUpstreamSize, errorsForUpstream0); + CLWrapper *errorsForUpstreamWrapper1 = cl->wrap(errorsForUpstreamSize, errorsForUpstream1); inputWrapper->copyToDevice(); errorsWrapper->copyToDevice(); @@ -694,50 +707,52 @@ void compareSpecific( int instance0, int instance1, int batchSize, LayerDimensio errorsForUpstreamWrapper0->createOnDevice(); errorsForUpstreamWrapper1->createOnDevice(); - Backward *bp0 = Backward::instanceSpecific( instance0, cl, dim ); - Backward *bp1 = Backward::instanceSpecific( instance1, cl, dim ); + Backward *bp0 = Backward::instanceSpecific(instance0, cl, dim); + Backward *bp1 = Backward::instanceSpecific(instance1, cl, dim); - bp0->backward( batchSize, - inputWrapper, errorsWrapper, weightsWrapper, - errorsForUpstreamWrapper0 ); - bp1->backward( batchSize, - inputWrapper, errorsWrapper, weightsWrapper, - errorsForUpstreamWrapper1 ); - - errorsForUpstreamWrapper0->copyToHost(); - errorsForUpstreamWrapper1->copyToHost(); - - int outputSize = errorsForUpstreamSize; - cout << dim << endl; - bool same = true; - for( int i = 0; i < max( 
20, outputSize ); i++ ) { - if( i < outputSize ) { - if( abs( errorsForUpstream0[i] - errorsForUpstream1[i] ) < 0.000001 || abs( errorsForUpstream0[i] - errorsForUpstream1[i] ) <= 0.001 * max( abs( errorsForUpstream0[i] ), abs( errorsForUpstream1[i] ) ) ) { - if( i < 20 ) { + for(int it=0; it < numIts; it++ ) { + bp0->backward(batchSize, + inputWrapper, errorsWrapper, weightsWrapper, + errorsForUpstreamWrapper0); + bp1->backward(batchSize, + inputWrapper, errorsWrapper, weightsWrapper, + errorsForUpstreamWrapper1); + + errorsForUpstreamWrapper0->copyToHost(); + errorsForUpstreamWrapper1->copyToHost(); + + int outputNumElements = errorsForUpstreamSize; + cout << dim << endl; + bool same = true; + for(int i = 0; i < max(20, outputNumElements); i++) { + if(i < outputNumElements) { + if(abs(errorsForUpstream0[i] - errorsForUpstream1[i]) < 0.000001 || abs(errorsForUpstream0[i] - errorsForUpstream1[i]) <= 0.001 * max(abs(errorsForUpstream0[i]), abs(errorsForUpstream1[i]))) { + if(it == 0 && i < 20) { + cout << "output[" << i << "]=" << errorsForUpstream0[i] << " " << errorsForUpstream1[i]; + cout << " SAME"; + } + } else { cout << "output[" << i << "]=" << errorsForUpstream0[i] << " " << errorsForUpstream1[i]; - cout << " SAME"; + cout << " DIFF"; + same = false; } } else { - cout << "output[" << i << "]=" << errorsForUpstream0[i] << " " << errorsForUpstream1[i]; - cout << " DIFF"; - same = false; + if(it == 0 && i < 20) { + cout << " "; + } + } + if(it == 0 && i < 20) { + cout << " || " << errorsForUpstream1[100+i] ; + cout << " || " << errorsForUpstream1[200+i] ; + cout << " || " << errorsForUpstream1[300+i] ; + cout << " || " << errorsForUpstream1[400+i] ; + cout << " || " << errorsForUpstream1[500+i] ; + cout << " || " << errorsForUpstream1[600+i] ; + cout << " || " << errorsForUpstream1[700+i] << endl; } - } else { - if( i < 20 ) { - cout << " "; - } - } - if( i < 20 ) { - cout << " || " << errorsForUpstream1[100+i] ; - cout << " || " << errorsForUpstream1[200+i] ; - cout << " || " << errorsForUpstream1[300+i] ; - cout << " || " << errorsForUpstream1[400+i] ; - cout << " || " << errorsForUpstream1[500+i] ; - cout << " || " << errorsForUpstream1[600+i] ; - cout << " || " << errorsForUpstream1[700+i] << endl; } + EXPECT_EQ(true, same); } - EXPECT_EQ( true, same ); delete inputWrapper; delete errorsWrapper; @@ -755,70 +770,96 @@ void compareSpecific( int instance0, int instance1, int batchSize, LayerDimensio delete[] weights; } -TEST( SLOW_testbackward, compare_kgsgo_32c5 ) { +TEST(SLOW_testbackward, compare_specific_args) { + LayerDimensions dim; int batchSize = 128; + int numIts = 1; + int instance0 = 1; + int instance1 = 3; +// int N = 128; +// bool debug = false; + dim.setInputPlanes(64).setInputSize(19).setNumFilters(64) + .setFilterSize(7) + .setPadZeros(true).setBiased(false); + + TestArgsParser::arg("its", &numIts); + DimFromArgs::arg(&dim); + TestArgsParser::arg("instance0", &instance0); + TestArgsParser::arg("instance1", &instance1); +// TestArgsParser::arg("debug", &debug); + TestArgsParser::arg("batchsize", &batchSize); + TestArgsParser::go(); + dim.deriveOthers(); + + compareSpecific(instance0, instance1, numIts, batchSize, dim); +} + +TEST(testbackward, compare_1_n_kgsgo_32c5) { + int batchSize = 8; LayerDimensions dim; - dim.setInputPlanes( 32 ).setInputImageSize(19).setNumFilters( 32 ).setFilterSize( 5 ) - .setPadZeros( true ).setBiased( true ); + dim.setInputPlanes(32).setInputSize(19).setNumFilters(32).setFilterSize(5) + .setPadZeros(true).setBiased(true); cout << 
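
The comparison loop above treats the two Backward outputs as matching when they agree within a small absolute epsilon or within 0.1% relative error; that slack is what lets differently ordered floating-point accumulation in the two kernels still count as equal. A minimal standalone sketch of the same rule, with an illustrative helper name rather than anything from the DeepCL sources:

#include <algorithm>
#include <cmath>

// true when a and b agree within 1e-6 absolutely,
// or within 0.1% of the larger magnitude
bool nearlyEqual(float a, float b) {
    float diff = std::abs(a - b);
    return diff < 0.000001f
        || diff <= 0.001f * std::max(std::abs(a), std::abs(b));
}
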
dim.buildOptionsString() << endl; // ActivationFunction *fn = new ReluActivation(); - compareSpecific( 1, 2, batchSize, dim ); - + compareSpecific(0, 1, 1, batchSize, dim); + for(int i=2; i < Backward::getNumImplementations(); i++) { + compareSpecific(1, i, 1, batchSize, dim); + } } -TEST( SLOW_testbackward, compare_kgsgo_32c5mini ) { +TEST(SLOW_testbackward, compare_kgsgo_32c5mini) { int batchSize = 4; LayerDimensions dim; - dim.setInputPlanes( 2 ).setInputImageSize(3).setNumFilters( 2 ).setFilterSize( 3 ) - .setPadZeros( true ).setBiased( true ); + dim.setInputPlanes(2).setInputSize(3).setNumFilters(2).setFilterSize(3) + .setPadZeros(true).setBiased(true); cout << dim.buildOptionsString() << endl; // ActivationFunction *fn = new ReluActivation(); - compareSpecific( 1, 2, batchSize, dim ); + compareSpecific(1, 2, 1, batchSize, dim); } -TEST( SLOW_testbackward, compare_kgsgo_32c5mini2 ) { +TEST(SLOW_testbackward, compare_kgsgo_32c5mini2) { int batchSize = 1; int imageSize = 2; LayerDimensions dim; - dim.setInputPlanes( 1 ).setInputImageSize(imageSize).setNumFilters( 1 ).setFilterSize( imageSize ) - .setPadZeros( true ).setBiased( true ); + dim.setInputPlanes(1).setInputSize(imageSize).setNumFilters(1).setFilterSize(imageSize) + .setPadZeros(true).setBiased(true); cout << dim.buildOptionsString() << endl; // ActivationFunction *fn = new ReluActivation(); - compareSpecific( 1, 2, batchSize, dim ); + compareSpecific(1, 2, 1, batchSize, dim); } /* -float *test( int imageSize ) { +float *test(int imageSize) { const int batchSize = 128; LayerDimensions dim; - dim.setInputPlanes( 32 ).setInputImageSize( 28 ).setNumFilters( 32 ).setFilterSize( 5 ) - .setBiased( true ).setPadZeros( true ); + dim.setInputPlanes(32).setInputSize(28).setNumFilters(32).setFilterSize(5) + .setBiased(true).setPadZeros(true); int weightsSize = dim.filtersSize; int biasSize = dim.numFilters; - int outputSize = batchSize * dim.outputCubeSize; - float *weights = new float[max(10000, weightsSize ) ]; - float *bias = new float[max( 10000, biasSize)]; - float *errors = new float[max(10000, outputSize )]; - float *output = new float[max(10000, outputSize )]; - WeightRandomizer::randomize( weights, max(10000, weightsSize ), -1, 1 ); - WeightRandomizer::randomize( bias, max( 10000, biasSize), -1, 1 ); - WeightRandomizer::randomize( errors, max(10000, outputSize ), -1, 1 ); - WeightRandomizer::randomize( output, max(10000, outputSize ), -1, 1 ); + int outputNumElements = batchSize * dim.outputCubeSize; + float *weights = new float[max(10000, weightsSize) ]; + float *bias = new float[max(10000, biasSize)]; + float *errors = new float[max(10000, outputNumElements)]; + float *output = new float[max(10000, outputNumElements)]; + WeightRandomizer::randomize(weights, max(10000, weightsSize), -1, 1); + WeightRandomizer::randomize(bias, max(10000, biasSize), -1, 1); + WeightRandomizer::randomize(errors, max(10000, outputNumElements), -1, 1); + WeightRandomizer::randomize(output, max(10000, outputNumElements), -1, 1); EasyCL cl; - Backward *backwardImpl = Backward::instanceForTest( &cl, dim, new ReluActivation() ); + Backward *backwardImpl = Backward::instanceForTest(&cl, dim, new ReluActivation()); Timer timer; - float *errorsForUpstream = backwardImpl->backward( batchSize, output, weights, bias, errors ); + float *errorsForUpstream = backwardImpl->backward(batchSize, output, weights, bias, errors); StatefulTimer::dump(true); timer.timeCheck("after calcing errors"); - Sampler::printSamples( "errorsForUpstream", batchSize * 
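
compareSpecific now seeds each buffer explicitly (seed 0 for input, 1 for errors, 2 for weights), so both implementations, and every rerun, see identical test data. Assuming WeightRandomizer is a thin wrapper over a seeded mt19937, a rough standard-library equivalent of one such fill would be:

#include <random>

// deterministically fill buf with n floats drawn uniformly from [lo, hi)
void fillUniform(unsigned int seed, float *buf, int n, float lo, float hi) {
    std::mt19937 gen(seed);
    std::uniform_real_distribution<float> dist(lo, hi);
    for(int i = 0; i < n; i++) {
        buf[i] = dist(gen);
    }
}

With fixed seeds, a failure in compare_1_n_kgsgo_32c5 reproduces exactly on the next run, which is the property these element-wise comparison tests depend on.
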
dim.inputCubeSize, errorsForUpstream ); + Sampler::printSamples("errorsForUpstream", batchSize * dim.inputCubeSize, errorsForUpstream); delete backwardImpl; @@ -831,62 +872,62 @@ float *test( int imageSize ) { */ // we want to test calcerrors for layer 2 in a network like: // NeuralNet *net = NeuralNet::maker()->planes(1)->imageSize(28)->instance(); -// net->addLayer( ConvolutionalMaker::instance()->numFilters(32)->filterSize(5)->relu()->biased()->insert(); -// net->addLayer( ConvolutionalMaker::instance()->numFilters(32)->filterSize(5)->relu()->biased()->insert(); -// net->addLayer( ConvolutionalMaker::instance()->numFilters(10)->filterSize(20)->tanh()->biased(config.biased)->insert(); -//TEST( testbackward, DISABLED_image28 ) { +// net->addLayer(ConvolutionalMaker::instance()->numFilters(32)->filterSize(5)->relu()->biased()->insert(); +// net->addLayer(ConvolutionalMaker::instance()->numFilters(32)->filterSize(5)->relu()->biased()->insert(); +// net->addLayer(ConvolutionalMaker::instance()->numFilters(10)->filterSize(20)->tanh()->biased(config.biased)->insert(); +//TEST(testbackward, DISABLED_image28) { // float *errorsForUpstream = test(28); -// EXPECT_FLOAT_NEAR( -1.66007, errorsForUpstream[68268] ); -// EXPECT_FLOAT_NEAR( 0.823709, errorsForUpstream[2927151] ); -// EXPECT_FLOAT_NEAR( 6.99365, errorsForUpstream[1746549] ); -// EXPECT_FLOAT_NEAR( 7.25249, errorsForUpstream[576704] ); -// EXPECT_FLOAT_NEAR( 7.88787, errorsForUpstream[570179] ); +// EXPECT_FLOAT_NEAR(-1.66007, errorsForUpstream[68268]); +// EXPECT_FLOAT_NEAR(0.823709, errorsForUpstream[2927151]); +// EXPECT_FLOAT_NEAR(6.99365, errorsForUpstream[1746549]); +// EXPECT_FLOAT_NEAR(7.25249, errorsForUpstream[576704]); +// EXPECT_FLOAT_NEAR(7.88787, errorsForUpstream[570179]); // delete[] errorsForUpstream; //} -//TEST( testbackward, DISABLED_image19 ) { // make it work for a image19 first :-) +//TEST(testbackward, DISABLED_image19) { // make it work for a image19 first :-) // float *errorsForUpstream = test(19); -// EXPECT_FLOAT_NEAR( -24.5602, errorsForUpstream[158380] ); -// EXPECT_FLOAT_NEAR( 7.39012, errorsForUpstream[2607] ); -// EXPECT_FLOAT_NEAR( -6.50315, errorsForUpstream[546421] ); -// EXPECT_FLOAT_NEAR( -1.22025, errorsForUpstream[429248] ); -// EXPECT_FLOAT_NEAR( -8.89935, errorsForUpstream[1200963] ); +// EXPECT_FLOAT_NEAR(-24.5602, errorsForUpstream[158380]); +// EXPECT_FLOAT_NEAR(7.39012, errorsForUpstream[2607]); +// EXPECT_FLOAT_NEAR(-6.50315, errorsForUpstream[546421]); +// EXPECT_FLOAT_NEAR(-1.22025, errorsForUpstream[429248]); +// EXPECT_FLOAT_NEAR(-8.89935, errorsForUpstream[1200963]); // delete[] errorsForUpstream; // const int batchSize = 128; // LayerDimensions dim; -// dim.setInputPlanes( 32 ).setInputImageSize( 19 ).setNumFilters( 32 ).setFilterSize( 5 ) -// .setBiased( true ).setPadZeros( true ); const int batchSize = 128; +// dim.setInputPlanes(32).setInputSize(19).setNumFilters(32).setFilterSize(5) +// .setBiased(true).setPadZeros(true); const int batchSize = 128; // LayerDimensions dim; -// dim.setInputPlanes( 32 ).setInputImageSize( 28 ).setNumFilters( 32 ).setFilterSize( 5 ) -// .setBiased( true ).setPadZeros( true ); +// dim.setInputPlanes(32).setInputSize(28).setNumFilters(32).setFilterSize(5) +// .setBiased(true).setPadZeros(true); // int weightsSize = dim.filtersSize; // int biasSize = dim.numFilters; -// int outputSize = batchSize * dim.outputCubeSize; -// float *weights = new float[max(10000, weightsSize ) ]; -// float *bias = new float[max( 10000, biasSize)]; -// float *errors = new 
float[max(10000, outputSize )]; -// float *output = new float[max(10000, outputSize )]; -// WeightRandomizer::randomize( weights, max(10000, weightsSize ), -1, 1 ); -// WeightRandomizer::randomize( bias, max( 10000, biasSize), -1, 1 ); -// WeightRandomizer::randomize( errors, max(10000, outputSize ), -1, 1 ); -// WeightRandomizer::randomize( output, max(10000, outputSize ), -1, 1 ); +// int outputNumElements = batchSize * dim.outputCubeSize; +// float *weights = new float[max(10000, weightsSize) ]; +// float *bias = new float[max(10000, biasSize)]; +// float *errors = new float[max(10000, outputNumElements)]; +// float *output = new float[max(10000, outputNumElements)]; +// WeightRandomizer::randomize(weights, max(10000, weightsSize), -1, 1); +// WeightRandomizer::randomize(bias, max(10000, biasSize), -1, 1); +// WeightRandomizer::randomize(errors, max(10000, outputNumElements), -1, 1); +// WeightRandomizer::randomize(output, max(10000, outputNumElements), -1, 1); // EasyCL cl; -// BackpropErrors *backwardImpl = BackpropErrors::instanceForTest( &cl, dim, new ReluActivation() ); +// BackpropErrors *backwardImpl = BackpropErrors::instanceForTest(&cl, dim, new ReluActivation()); // Timer timer; -// float *errorsForUpstream = backwardImpl->backward( batchSize, output, weights, bias, errors ); +// float *errorsForUpstream = backwardImpl->backward(batchSize, output, weights, bias, errors); // StatefulTimer::dump(true); // timer.timeCheck("after calcing errors"); -// Sampler::printSamples( "errorsForUpstream", batchSize * dim.inputCubeSize, errorsForUpstream ); +// Sampler::printSamples("errorsForUpstream", batchSize * dim.inputCubeSize, errorsForUpstream); -// EXPECT_FLOAT_NEAR( -1.66007, errorsForUpstream[68268] ); -// EXPECT_FLOAT_NEAR( 0.823709, errorsForUpstream[2927151] ); -// EXPECT_FLOAT_NEAR( 6.99365, errorsForUpstream[1746549] ); -// EXPECT_FLOAT_NEAR( 7.25249, errorsForUpstream[576704] ); -// EXPECT_FLOAT_NEAR( 7.88787, errorsForUpstream[570179] ); +// EXPECT_FLOAT_NEAR(-1.66007, errorsForUpstream[68268]); +// EXPECT_FLOAT_NEAR(0.823709, errorsForUpstream[2927151]); +// EXPECT_FLOAT_NEAR(6.99365, errorsForUpstream[1746549]); +// EXPECT_FLOAT_NEAR(7.25249, errorsForUpstream[576704]); +// EXPECT_FLOAT_NEAR(7.88787, errorsForUpstream[570179]); // delete backwardImpl; @@ -898,30 +939,30 @@ float *test( int imageSize ) { // int weightsSize = dim.filtersSize; // int biasSize = dim.numFilters; -// int outputSize = batchSize * dim.outputCubeSize; -// float *weights = new float[max(10000, weightsSize ) ]; -// float *bias = new float[max( 10000, biasSize)]; -// float *errors = new float[max(10000, outputSize )]; -// float *output = new float[max(10000, outputSize )]; -// WeightRandomizer::randomize( weights, max(10000, weightsSize ), -1, 1 ); -// WeightRandomizer::randomize( bias, max( 10000, biasSize), -1, 1 ); -// WeightRandomizer::randomize( errors, max(10000, outputSize ), -1, 1 ); -// WeightRandomizer::randomize( output, max(10000, outputSize ), -1, 1 ); +// int outputNumElements = batchSize * dim.outputCubeSize; +// float *weights = new float[max(10000, weightsSize) ]; +// float *bias = new float[max(10000, biasSize)]; +// float *errors = new float[max(10000, outputNumElements)]; +// float *output = new float[max(10000, outputNumElements)]; +// WeightRandomizer::randomize(weights, max(10000, weightsSize), -1, 1); +// WeightRandomizer::randomize(bias, max(10000, biasSize), -1, 1); +// WeightRandomizer::randomize(errors, max(10000, outputNumElements), -1, 1); +// 
WeightRandomizer::randomize(output, max(10000, outputNumElements), -1, 1); // EasyCL cl; -// BackpropErrors *backwardImpl = BackpropErrors::instanceForTest( &cl, dim, new ReluActivation() ); +// BackpropErrors *backwardImpl = BackpropErrors::instanceForTest(&cl, dim, new ReluActivation()); // Timer timer; -// float *errorsForUpstream = backwardImpl->backward( batchSize, output, weights, bias, errors ); +// float *errorsForUpstream = backwardImpl->backward(batchSize, output, weights, bias, errors); // StatefulTimer::dump(true); // timer.timeCheck("after calcing errors"); -// Sampler::printSamples( "errorsForUpstream", batchSize * dim.inputCubeSize, errorsForUpstream ); +// Sampler::printSamples("errorsForUpstream", batchSize * dim.inputCubeSize, errorsForUpstream); -// EXPECT_FLOAT_NEAR( -24.5602, errorsForUpstream[158380] ); -// EXPECT_FLOAT_NEAR( 7.39012, errorsForUpstream[2607] ); -// EXPECT_FLOAT_NEAR( -6.50315, errorsForUpstream[546421] ); -// EXPECT_FLOAT_NEAR( -1.22025, errorsForUpstream[429248] ); -// EXPECT_FLOAT_NEAR( -8.89935, errorsForUpstream[1200963] ); +// EXPECT_FLOAT_NEAR(-24.5602, errorsForUpstream[158380]); +// EXPECT_FLOAT_NEAR(7.39012, errorsForUpstream[2607]); +// EXPECT_FLOAT_NEAR(-6.50315, errorsForUpstream[546421]); +// EXPECT_FLOAT_NEAR(-1.22025, errorsForUpstream[429248]); +// EXPECT_FLOAT_NEAR(-8.89935, errorsForUpstream[1200963]); // delete backwardImpl; @@ -932,31 +973,31 @@ float *test( int imageSize ) { //} /* -TEST( testbackward, comparespecific ) { +TEST(testbackward, comparespecific) { const int batchSize = 5; LayerDimensions dim; - dim.setInputPlanes( 1 ).setInputImageSize( 5 ).setNumFilters( 1 ).setFilterSize( 3 ) - .setBiased( true ).setPadZeros( false ); + dim.setInputPlanes(1).setInputSize(5).setNumFilters(1).setFilterSize(3) + .setBiased(true).setPadZeros(false); int weightsSize = dim.filtersSize; int biasSize = dim.numFilters; - int outputSize = batchSize * dim.outputCubeSize; - float *weights = new float[max(10000, weightsSize ) ]; - float *bias = new float[max( 10000, biasSize)]; - float *errors = new float[max(10000, outputSize )]; - float *output = new float[max(10000, outputSize )]; - memset( weights, 0, sizeof(float) * max(10000, weightsSize ) ); - memset( bias, 0, sizeof(float) * max(10000, biasSize ) ); - memset( errors, 0, sizeof(float) * max(10000, outputSize ) ); - memset( output, 0, sizeof(float) * max(10000, outputSize ) ); - mt19937 random = WeightRandomizer::randomize( weights, max(10000, weightsSize ), -1, 1 ); - WeightRandomizer::randomize( random, bias, max( 10000, biasSize), -1, 1 ); - WeightRandomizer::randomize( random, errors, max(10000, outputSize ), -1, 1 ); - WeightRandomizer::randomize( random, output, max(10000, outputSize ), -1, 1 ); -// WeightRandomizer::randomizeInts( weights, max(10000, weightsSize ), 1, 3 ); -// WeightRandomizer::randomizeInts( bias, max( 10000, biasSize), 0, 3 ); -// WeightRandomizer::randomizeInts( errors, max(10000, outputSize ), 0, 3 ); -// WeightRandomizer::randomizeInts( output, max(10000, outputSize ), 0, 3 ); + int outputNumElements = batchSize * dim.outputCubeSize; + float *weights = new float[max(10000, weightsSize) ]; + float *bias = new float[max(10000, biasSize)]; + float *errors = new float[max(10000, outputNumElements)]; + float *output = new float[max(10000, outputNumElements)]; + memset(weights, 0, sizeof(float) * max(10000, weightsSize)); + memset(bias, 0, sizeof(float) * max(10000, biasSize)); + memset(errors, 0, sizeof(float) * max(10000, outputNumElements)); + memset(output, 0, 
sizeof(float) * max(10000, outputNumElements)); + mt19937 random = WeightRandomizer::randomize(weights, max(10000, weightsSize), -1, 1); + WeightRandomizer::randomize(random, bias, max(10000, biasSize), -1, 1); + WeightRandomizer::randomize(random, errors, max(10000, outputNumElements), -1, 1); + WeightRandomizer::randomize(random, output, max(10000, outputNumElements), -1, 1); +// WeightRandomizer::randomizeInts(weights, max(10000, weightsSize), 1, 3); +// WeightRandomizer::randomizeInts(bias, max(10000, biasSize), 0, 3); +// WeightRandomizer::randomizeInts(errors, max(10000, outputNumElements), 0, 3); +// WeightRandomizer::randomizeInts(output, max(10000, outputNumElements), 0, 3); // weights[0] = 3; // weights[1] = 5; @@ -982,17 +1023,17 @@ TEST( testbackward, comparespecific ) { // errors[5] = 6; EasyCL cl; - Backward *backwardImpl1 = Backward::instanceSpecific( 0, &cl, dim, new ReluActivation() ); - float *errorsForUpstream1 = backwardImpl1->backward( batchSize, output, weights, bias, errors ); - Backward *backwardImpl2 = Backward::instanceSpecific( 1, &cl, dim, new ReluActivation() ); - float *errorsForUpstream2 = backwardImpl2->backward( batchSize, output, weights, bias, errors ); + Backward *backwardImpl1 = Backward::instanceSpecific(0, &cl, dim, new ReluActivation()); + float *errorsForUpstream1 = backwardImpl1->backward(batchSize, output, weights, bias, errors); + Backward *backwardImpl2 = Backward::instanceSpecific(1, &cl, dim, new ReluActivation()); + float *errorsForUpstream2 = backwardImpl2->backward(batchSize, output, weights, bias, errors); int errorsForUpstreamSize = batchSize * dim.inputCubeSize; cout << dim << endl; - for( int i = 0; i < 25; i++ ) { + for(int i = 0; i < 25; i++) { cout << "output[" << i << "]=" << errorsForUpstream1[i] << " " << errorsForUpstream2[i]; - if( i < outputSize ) { - if( errorsForUpstream1[i] == errorsForUpstream2[i] ) { + if(i < outputNumElements) { + if(errorsForUpstream1[i] == errorsForUpstream2[i]) { cout << " SAME"; } else { cout << " DIFF"; @@ -1010,18 +1051,18 @@ TEST( testbackward, comparespecific ) { } bool same = true; int errCount = 0; - for( int i = 0; i < errorsForUpstreamSize; i++ ) { - if( errorsForUpstream1[i] != errorsForUpstream2[i] ) { + for(int i = 0; i < errorsForUpstreamSize; i++) { + if(errorsForUpstream1[i] != errorsForUpstream2[i]) { cout << "DIFF: i " << i << " " << errorsForUpstream1[i] << " != " << errorsForUpstream2[i] << endl; same = false; errCount++; - if( errCount == 5 ) { + if(errCount == 5) { cout << " ... 
" << endl; break; } } } - EXPECT_EQ( true, same ); + EXPECT_EQ(true, same); delete backwardImpl1; delete backwardImpl2; diff --git a/test/testdropoutbackward.cpp b/test/testdropoutbackward.cpp index 2f9ac22f..75d44ba1 100644 --- a/test/testdropoutbackward.cpp +++ b/test/testdropoutbackward.cpp @@ -33,7 +33,7 @@ TEST( testdropoutbackward, basic ) { 2, -9, 2.1f, 0, -1.1f, 3.5f }; - int inputTotalSize = dropoutBackprop->getInputSize( batchSize ); + int inputTotalSize = dropoutBackprop->getInputNumElements( batchSize ); EXPECT_FLOAT_NEAR( batchSize * imageSize * imageSize, inputTotalSize ); float *errorsForUpstream = new float[ inputTotalSize ]; @@ -77,12 +77,12 @@ TEST( testdropoutbackward, basic_2plane_batchsize2 ) { 2, 9 }; - float *errorsForUpstream = new float[ dropoutBackprop->getInputSize( batchSize ) ]; + float *errorsForUpstream = new float[ dropoutBackprop->getInputNumElements( batchSize ) ]; dropoutBackprop->backward( batchSize, mask, errors, errorsForUpstream ); -// float *expectedErrorsForUpstream = new float[ dropoutForward->getInputSize( batchSize ) ]; -// memset( expectedErrorsForUpstream, 0, sizeof(float) * dropoutForward->getInputSize( batchSize ) ]; +// float *expectedErrorsForUpstream = new float[ dropoutForward->getInputNumElements( batchSize ) ]; +// memset( expectedErrorsForUpstream, 0, sizeof(float) * dropoutForward->getInputNumElements( batchSize ) ]; float expectedErrorsForUpstream[] = { 3, 5, @@ -99,7 +99,7 @@ TEST( testdropoutbackward, basic_2plane_batchsize2 ) { } TEST( testdropoutbackward, compare_args ) { - int inputImageSize = 9; + int inputSize = 9; float dropRatio = 0.6f; int instance0 = 0; int instance1 = 1; @@ -111,25 +111,25 @@ TEST( testdropoutbackward, compare_args ) { TestArgsParser::arg( "dropratio", &dropRatio ); // TestArgsParser::arg( "dropoutsize", &dropoutSize ); TestArgsParser::arg( "numplanes", &numPlanes ); - TestArgsParser::arg( "inputimagesize", &inputImageSize ); + TestArgsParser::arg( "inputimagesize", &inputSize ); TestArgsParser::arg( "instance0", &instance0 ); TestArgsParser::arg( "instance1", &instance1 ); TestArgsParser::go(); EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - DropoutBackward *p0 = DropoutBackward::instanceSpecific( instance0, cl, numPlanes, inputImageSize, dropRatio ); - DropoutBackward *p1 = DropoutBackward::instanceSpecific( instance1, cl, numPlanes, inputImageSize, dropRatio ); - int outputImageSize = p1->outputImageSize; - int errorsSize = batchSize * outputImageSize * outputImageSize * numPlanes; + DropoutBackward *p0 = DropoutBackward::instanceSpecific( instance0, cl, numPlanes, inputSize, dropRatio ); + DropoutBackward *p1 = DropoutBackward::instanceSpecific( instance1, cl, numPlanes, inputSize, dropRatio ); + int outputSize = p1->outputSize; + int errorsSize = batchSize * outputSize * outputSize * numPlanes; float *errors = new float[ errorsSize ]; - int inputSize = batchSize * inputImageSize * inputImageSize * numPlanes; - float *errorsForUpstream0 = new float[ inputSize ]; - float *errorsForUpstream1 = new float[ inputSize ]; + int inputNumElements = batchSize * inputSize * inputSize * numPlanes; + float *errorsForUpstream0 = new float[ inputNumElements ]; + float *errorsForUpstream1 = new float[ inputNumElements ]; - DropoutForward *forwardprop = DropoutForward::instanceSpecific( 0, cl, numPlanes, inputImageSize, dropRatio ); - float *input = new float[inputSize]; + DropoutForward *forwardprop = DropoutForward::instanceSpecific( 0, cl, numPlanes, inputSize, dropRatio ); + float *input = new 
float[inputNumElements]; float *output = new float[errorsSize]; - uchar *mask = new uchar[inputSize]; + uchar *mask = new uchar[inputNumElements]; float *errorsForUpstream[2]; errorsForUpstream[0] = errorsForUpstream0; errorsForUpstream[1] = errorsForUpstream1; @@ -141,8 +141,8 @@ TEST( testdropoutbackward, compare_args ) { // easiest way to select valid selectors might be to just forwardforward first? WeightRandomizer::randomize( it, errors, errorsSize, -0.1f, 0.1f ); - WeightRandomizer::randomize( it, input, inputSize, -0.1f, 0.1f ); - WeightRandomizer::randomizeInts( it, mask, inputSize, 0, 2 ); + WeightRandomizer::randomize( it, input, inputNumElements, -0.1f, 0.1f ); + WeightRandomizer::randomizeInts( it, mask, inputNumElements, 0, 2 ); forwardprop->forward( batchSize, mask, input, output ); for( int instance = 0; instance < 2; instance++ ) { @@ -150,7 +150,7 @@ TEST( testdropoutbackward, compare_args ) { } bool ok = true; int numErrors = 0; - for( int i = 0; i < inputSize; i++ ) { + for( int i = 0; i < inputNumElements; i++ ) { if( errorsForUpstream0[i] != errorsForUpstream1[i] ) { cout << "diff: i=" << i << " " << errorsForUpstream0[i] << " != " << errorsForUpstream1[i] << endl; ok = false; @@ -199,9 +199,9 @@ TEST( testdropoutforward, basic_2plane_batchsize2 ) { -1, -3.5f, 37.4f,5 }; - int outputSize = dropoutForward->getOutputSize( batchSize ); - int *selectors = new int[outputSize]; - float *output = new float[outputSize]; + int outputNumElements = dropoutForward->getOutputNumElements( batchSize ); + int *selectors = new int[outputNumElements]; + float *output = new float[outputNumElements]; dropoutForward->forward( batchSize, data, selectors, output ); diff --git a/test/testdropoutforward.cpp b/test/testdropoutforward.cpp index 53b324a3..56e668d1 100644 --- a/test/testdropoutforward.cpp +++ b/test/testdropoutforward.cpp @@ -54,9 +54,9 @@ TEST( testdropoutforward, basic ) { 3, 8.2f, 4.1f, 3, -33.1f, 14.2f, }; - int outputSize = dropoutForward->getOutputSize( batchSize ); - EXPECT_FLOAT_NEAR( outputSize, imageSize * imageSize ); - float *output = new float[outputSize]; + int outputNumElements = dropoutForward->getOutputNumElements( batchSize ); + EXPECT_FLOAT_NEAR( outputNumElements, imageSize * imageSize ); + float *output = new float[outputNumElements]; dropoutForward->forward( batchSize, mask, data, output ); @@ -108,8 +108,8 @@ TEST( testdropoutforward, basic_2plane_batchsize2 ) { 1,1, 0,1 }; - int outputSize = dropoutForward->getOutputSize( batchSize ); - float *output = new float[outputSize]; + int outputNumElements = dropoutForward->getOutputNumElements( batchSize ); + float *output = new float[outputNumElements]; dropoutForward->forward( batchSize, mask, data, output ); @@ -144,13 +144,13 @@ TEST( testdropoutforward, fromwrappers ) { 1,0,1,0, 0,0,1,1 }; - int outputSize = dropoutForward->getOutputSize( batchSize ); - float *output = new float[outputSize]; + int outputNumElements = dropoutForward->getOutputNumElements( batchSize ); + float *output = new float[outputNumElements]; - const int inputSize = batchSize * numPlanes * imageSize * imageSize; - CLWrapper *maskWrapper = cl->wrap( inputSize, mask ); - CLWrapper *inputWrapper = cl->wrap( inputSize, input ); - CLWrapper *outputWrapper = cl->wrap( outputSize, output ); + const int inputNumElements = batchSize * numPlanes * imageSize * imageSize; + CLWrapper *maskWrapper = cl->wrap( inputNumElements, mask ); + CLWrapper *inputWrapper = cl->wrap( inputNumElements, input ); + CLWrapper *outputWrapper = cl->wrap( 
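
The bulk of this patch is one consistent rename: inputSize and outputSize now always mean the edge length of a single square plane, while inputNumElements and outputNumElements mean the count of floats in an entire buffer. The relationship between the two units, with illustrative values only:

int batchSize = 2, numPlanes = 4;
int inputSize = 9;  // edge length of one square plane
int inputNumElements = batchSize * numPlanes * inputSize * inputSize;  // floats in the whole buffer
float *input = new float[inputNumElements];  // allocations are sized by element count, never by edge length

The separate names keep the per-plane unit from being confused with the per-buffer unit in expressions like batchSize * inputSize * inputSize * numPlanes, which these tests write out repeatedly.
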
outputNumElements, output ); maskWrapper->copyToDevice(); inputWrapper->copyToDevice(); @@ -245,25 +245,25 @@ void compareSpecific( CompareSpecificArgs args ) { DropoutForward *dropoutForward0 = DropoutForward::instanceSpecific( args._instance0, cl, numPlanes, imageSize, args._dropRatio ); DropoutForward *dropoutForward1 = DropoutForward::instanceSpecific( args._instance1, cl, numPlanes, imageSize, args._dropRatio ); - const int inputSize = batchSize * numPlanes * imageSize * imageSize; - int outputSize = dropoutForward0->getOutputSize( batchSize ); + const int inputNumElements = batchSize * numPlanes * imageSize * imageSize; + int outputNumElements = dropoutForward0->getOutputNumElements( batchSize ); - unsigned char *mask = new unsigned char[ inputSize ]; - float *input = new float[ inputSize ]; - float *output = new float[ outputSize ]; + unsigned char *mask = new unsigned char[ inputNumElements ]; + float *input = new float[ inputNumElements ]; + float *output = new float[ outputNumElements ]; - CLWrapper *maskWrapper = cl->wrap( inputSize, mask ); - CLWrapper *inputWrapper = cl->wrap( inputSize, input ); - CLWrapper *outputWrapper = cl->wrap( outputSize, output ); + CLWrapper *maskWrapper = cl->wrap( inputNumElements, mask ); + CLWrapper *inputWrapper = cl->wrap( inputNumElements, input ); + CLWrapper *outputWrapper = cl->wrap( outputNumElements, output ); - WeightRandomizer::randomizeInts( mask, inputSize, 0, 2 ); -// for( int i = 0; i < inputSize; i++ ) { + WeightRandomizer::randomizeInts( mask, inputNumElements, 0, 2 ); +// for( int i = 0; i < inputNumElements; i++ ) { // cout << (int)mask[i] << " "; // } // cout << endl; - WeightRandomizer::randomize( input, inputSize, -0.1f, 0.1f ); + WeightRandomizer::randomize( input, inputNumElements, -0.1f, 0.1f ); - memset( output, 99, sizeof(int) * outputSize ); + memset( output, 99, sizeof(int) * outputNumElements ); maskWrapper->copyToDevice(); inputWrapper->copyToDevice(); @@ -272,10 +272,10 @@ void compareSpecific( CompareSpecificArgs args ) { dropoutForward0->forward( batchSize, maskWrapper, inputWrapper, outputWrapper ); outputWrapper->copyToHost(); - float *output0 = new float[ outputSize ]; - memcpy( output0, output, sizeof(float) * outputSize ); + float *output0 = new float[ outputNumElements ]; + memcpy( output0, output, sizeof(float) * outputNumElements ); - memset( output, 99, sizeof(int) * outputSize ); + memset( output, 99, sizeof(int) * outputNumElements ); maskWrapper->copyToDevice(); inputWrapper->copyToDevice(); @@ -285,7 +285,7 @@ void compareSpecific( CompareSpecificArgs args ) { outputWrapper->copyToHost(); int numErrors = 0; - for( int i = 0; i < outputSize; i++ ) { + for( int i = 0; i < outputNumElements; i++ ) { bool ok = true; if( ( output[i] > 0 && output0[i] < 0 ) || ( output[i] < 0 && output0[i] > 0 ) ) { cout << "signs differ" << endl; @@ -318,7 +318,7 @@ void compareSpecific( CompareSpecificArgs args ) { } EXPECT_FLOAT_NEAR( 0, numErrors ); if( numErrors > 0 ) { - int num2dPlanes = inputSize / imageSize / imageSize; + int num2dPlanes = inputNumElements / imageSize / imageSize; for( int plane = 0; plane < num2dPlanes; plane++ ) { cout << "2dplane " << plane << ":" << endl; for( int i = 0; i < imageSize; i++ ) { diff --git a/test/testforward.cpp b/test/testforward.cpp index 1616c848..0c00d5fa 100644 --- a/test/testforward.cpp +++ b/test/testforward.cpp @@ -12,6 +12,8 @@ #include "layer/LayerMakers.h" #include "util/StatefulTimer.h" #include "net/NeuralNetMould.h" +#include "clblas/ClBlasInstance.h" +#include 
"clBLAS.h" #include "test/WeightRandomizer.h" #include "test/GtestGlobals.h" @@ -87,7 +89,7 @@ TEST( testforward, imagesize2_nopadzeros ) { 0.2f*13 + 0.3f* 17 + 0.7f *(-19) -1.1f * 2.3f }; cout << "expected number of output: " << resultSize << endl; -// int outputImageSize = 0; +// int outputSize = 0; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); for( int i = 1; i <= 4; i++ ) { Forward *forward = Forward::instanceSpecific( 3, cl, @@ -127,10 +129,10 @@ TEST( testforward, DISABLED_imagesize2_nopadzeros_skip1 ) { 0.2f, 0.3f, 0.7f, -1.1f, }; - int outputImageSize = ( imageSize - filterWidth ) / ( skip + 1 ) + 1; - cout << "outputimagesize: " << outputImageSize << endl; - int outputSize = outputImageSize * numOutPlanes * batchSize; - cout << "outputsize: " << outputSize << endl; + int outputSize = ( imageSize - filterWidth ) / ( skip + 1 ) + 1; + cout << "outputimagesize: " << outputSize << endl; + int outputNumElements = outputSize * numOutPlanes * batchSize; + cout << "outputsize: " << outputNumElements << endl; float expectedOutput[] = { -2, 0, 0, 0, @@ -146,8 +148,8 @@ TEST( testforward, DISABLED_imagesize2_nopadzeros_skip1 ) { }; - cout << "expected number of output: " << outputSize << endl; -// int outputImageSize = 0; + cout << "expected number of output: " << outputNumElements << endl; +// int outputSize = 0; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); for( int i = 1; i <= 1; i++ ) { Forward *forward = Forward::instanceSpecific( 0, cl, @@ -155,7 +157,7 @@ TEST( testforward, DISABLED_imagesize2_nopadzeros_skip1 ) { padZeros == 1, false ).setSkip(1) ); float *output = new float[forward->getOutputTotalSize(batchSize)]; forward->forward( batchSize, data, filter1, 0, output ); - for( int result = 0; result < outputSize; result++ ) { + for( int result = 0; result < outputNumElements; result++ ) { cout << "checking result " << result << endl; EXPECT_EQ( expectedOutput[result], output[result] ); } @@ -233,7 +235,7 @@ TEST( testforward, imagesize2_padzeros ) { // 0.2f*13 + 0.3f* 17 + 0.7f *(-19) -1.1f * 2.3f // }; -// int outputImageSize = 0; +// int outputSize = 0; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); Forward *forward = Forward::instanceTest( cl, LayerDimensions( numInPlanes, imageSize, numOutPlanes, filterWidth, padZeros == 1, false ) ); @@ -295,7 +297,7 @@ TEST( testforward, imagesize3 ) { }; -// int outputImageSize = 0; +// int outputSize = 0; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); Forward *forward = Forward::instanceTest( cl, LayerDimensions( numInPlanes, imageSize, numOutPlanes, filterWidth, padZeros == 1, false ) ); @@ -322,7 +324,7 @@ TEST( testforward, imagesize3 ) { TEST( testforward, test2 ) { int batchSize = 2; LayerDimensions dim; - dim.setNumFilters(2).setNumInputPlanes(1).setInputImageSize(3).setFilterSize(3) + dim.setNumFilters(2).setNumInputPlanes(1).setInputSize(3).setFilterSize(3) .setPadZeros(false).setBiased(false); float data[] = { 0, 0, 0, @@ -375,7 +377,7 @@ TEST( testforward, test3 ) { float filter[] = {0.2f,0.3f, 0.5f,0.7f}; -// int outputImageSize = 0; +// int outputSize = 0; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); Forward *forward = Forward::instanceTest( cl, LayerDimensions( numInPlanes, inImageSize, numOutPlanes, filterSize, padZeros == 1, false ) ); @@ -408,6 +410,7 @@ TEST( testforward, test3 ) { void compareSpecific( bool debug, int N, int batchSize, LayerDimensions dim, int instance0, int instance1 ) { cout << dim << endl; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance 
clblasInstance; int inputsSize = N * dim.inputCubeSize; int filtersSize = dim.filtersSize; @@ -440,9 +443,9 @@ void compareSpecific( bool debug, int N, int batchSize, LayerDimensions dim, int if( debug ) cout << "i " << i << " input[i]=" << inputs[i] << " filters[i]=" << filters[i] << endl; } - int outputSize = N * dim.outputCubeSize; - float *output1 = new float[ outputSize ]; - float *output2 = new float[ outputSize ]; + int outputNumElements = N * dim.outputCubeSize; + float *output1 = new float[ outputNumElements ]; + float *output2 = new float[ outputNumElements ]; int numBatches = ( N + batchSize - 1 ) / batchSize; Forward *p1 = Forward::instanceSpecific( instance0, cl, dim ); @@ -482,8 +485,8 @@ void compareSpecific( bool debug, int N, int batchSize, LayerDimensions dim, int cout << dim << endl; bool same = true; int numDiff = 0; - for( int i = 0; i < max( 20, outputSize ); i++ ) { - if( i < outputSize ) { + for( int i = 0; i < max( 20, outputNumElements ); i++ ) { + if( i < outputNumElements ) { if( abs( output1[i] - output2[i] ) < 0.00001f || abs( output1[i] - output2[i] ) <= 0.001f * max( abs( output1[i] ), abs( output2[i] ) ) ) { if( i < 20 ) { if( debug ) cout << "output[" << i << "]=" << output1[i] << " " << output2[i]; @@ -519,10 +522,10 @@ void compareSpecific( bool debug, int N, int batchSize, LayerDimensions dim, int delete[] output2; delete p1; delete p2; - delete cl; delete[] inputs; delete[] filters; delete[] biasFilters; + delete cl; } // first, compare the slow, but probably correct, cpu version, with forward1 @@ -535,7 +538,7 @@ TEST( testforward, compare_0_1_biased_nopad ) { // int instance1 = 1; int N = 4; string activationName = "tanh"; - dim.setInputPlanes( 8 ).setInputImageSize(19).setNumFilters( 8 ) + dim.setInputPlanes( 8 ).setInputSize(19).setNumFilters( 8 ) .setFilterSize( 5 ) .setPadZeros( false ).setBiased( true ); compareSpecific( false, N, batchSize, dim, 0, 1 ); @@ -548,7 +551,7 @@ TEST( testforward, compare_0_1_biased_pad ) { // int instance1 = 1; int N = 4; string activationName = "tanh"; - dim.setInputPlanes( 8 ).setInputImageSize(19).setNumFilters( 8 ) + dim.setInputPlanes( 8 ).setInputSize(19).setNumFilters( 8 ) .setFilterSize( 5 ) .setPadZeros( true ).setBiased( true ); compareSpecific( false, N, batchSize, dim, 0, 1 ); @@ -561,10 +564,10 @@ TEST( testforward, compare_1_n_biased_nopad ) { // int instance1 = 1; int N = 4; string activationName = "tanh"; - dim.setInputPlanes( 8 ).setInputImageSize(19).setNumFilters( 8 ) + dim.setInputPlanes( 8 ).setInputSize(19).setNumFilters( 8 ) .setFilterSize( 5 ) .setPadZeros( false ).setBiased( true ); - for( int instance = 2; instance <= 6; instance++ ) { + for( int instance = 2; instance <= 7; instance++ ) { if( instance == 5 ) { continue; // forwardfc, cant use for inputimagesize != filtersize } @@ -578,10 +581,10 @@ TEST( testforward, compare_1_n_biased_pad ) { int batchSize = 4; int N = 4; string activationName = "tanh"; - dim.setInputPlanes( 8 ).setInputImageSize(19).setNumFilters( 8 ) + dim.setInputPlanes( 8 ).setInputSize(19).setNumFilters( 8 ) .setFilterSize( 5 ) .setPadZeros( true ).setBiased( true ); - for( int instance = 2; instance <= 6; instance++ ) { + for( int instance = 2; instance <= 7; instance++ ) { if( instance == 5 ) { continue; // forwardfc, cant use for inputimagesize != filtersize } @@ -596,7 +599,7 @@ TEST( testforward, compare_1_5_biased_nopad ) { // only need to do nopad, since // int instance0 = 1; // int instance1 = 1; int N = 4; - dim.setInputPlanes( 8 
).setInputImageSize(19).setNumFilters( 8 ) + dim.setInputPlanes( 8 ).setInputSize(19).setNumFilters( 8 ) .setFilterSize( 19 ) .setPadZeros( false ).setBiased( true ); compareSpecific( false, N, batchSize, dim, 1, 5 ); @@ -606,7 +609,7 @@ TEST( testforward, compare_1_4_fcscenario ) { // only need to do nopad, since fc LayerDimensions dim; int batchSize = 4; int N = 4; - dim.setInputPlanes( 10 ).setInputImageSize(24).setNumFilters( 10 ) + dim.setInputPlanes( 10 ).setInputSize(24).setNumFilters( 10 ) .setFilterSize( 24 ) .setPadZeros( false ).setBiased( true ); compareSpecific( false, N, batchSize, dim, 1, 4 ); @@ -617,7 +620,7 @@ TEST( testforward, compare_1_4_fcscenario ) { // only need to do nopad, since fc cog.outl( 'TEST( testforward, compare_break1_0_{n} ) {{\n' ' LayerDimensions dim;\n' - ' dim.setInputPlanes( 1 ).setInputImageSize( 33 ).setNumFilters( 1 ).setFilterSize( 1 )\n' + ' dim.setInputPlanes( 1 ).setInputSize( 33 ).setNumFilters( 1 ).setFilterSize( 1 )\n' ' .setPadZeros( false ).setBiased( false );\n' ' compareSpecific( false, 1, 1, dim, 0, {n} );\n' '}}\n'.format( @@ -625,14 +628,14 @@ TEST( testforward, compare_1_4_fcscenario ) { // only need to do nopad, since fc *///]]] TEST( testforward, compare_break1_0_1 ) { LayerDimensions dim; - dim.setInputPlanes( 1 ).setInputImageSize( 33 ).setNumFilters( 1 ).setFilterSize( 1 ) + dim.setInputPlanes( 1 ).setInputSize( 33 ).setNumFilters( 1 ).setFilterSize( 1 ) .setPadZeros( false ).setBiased( false ); compareSpecific( false, 1, 1, dim, 0, 1 ); } TEST( testforward, compare_break1_0_4 ) { LayerDimensions dim; - dim.setInputPlanes( 1 ).setInputImageSize( 33 ).setNumFilters( 1 ).setFilterSize( 1 ) + dim.setInputPlanes( 1 ).setInputSize( 33 ).setNumFilters( 1 ).setFilterSize( 1 ) .setPadZeros( false ).setBiased( false ); compareSpecific( false, 1, 1, dim, 0, 4 ); } @@ -641,7 +644,7 @@ TEST( testforward, compare_break1_0_4 ) { //TEST( SLOW_testforward, comparespecific ) { // LayerDimensions dim; -// dim.setInputPlanes( 2 ).setInputImageSize(5).setNumFilters( 1 ).setFilterSize( 5 ) +// dim.setInputPlanes( 2 ).setInputSize(5).setNumFilters( 1 ).setFilterSize( 5 ) // .setPadZeros( true ).setBiased( false ); // compareSpecific( 1, dim, 1, 3 ); //} @@ -649,7 +652,7 @@ TEST( testforward, compare_break1_0_4 ) { //TEST( SLOW_testforward, comparespecific_fc500unbiased ) { // LayerDimensions dim; // const int imageSize = 19; -// dim.setInputPlanes( 32 ).setInputImageSize(imageSize).setNumFilters( 500 ).setFilterSize( imageSize ) +// dim.setInputPlanes( 32 ).setInputSize(imageSize).setNumFilters( 500 ).setFilterSize( imageSize ) // .setPadZeros( false ).setBiased( false ); // compareSpecific( 4, dim, 1, 5 ); //} @@ -657,7 +660,7 @@ TEST( testforward, compare_break1_0_4 ) { //TEST( SLOW_testforward, comparespecific_fc500biased ) { // LayerDimensions dim; // const int imageSize = 19; -// dim.setInputPlanes( 32 ).setInputImageSize(imageSize).setNumFilters( 500 ).setFilterSize( imageSize ) +// dim.setInputPlanes( 32 ).setInputSize(imageSize).setNumFilters( 500 ).setFilterSize( imageSize ) // .setPadZeros( false ).setBiased( true ); // compareSpecific( 4, dim, 1, 5 ); //} @@ -665,7 +668,7 @@ TEST( testforward, compare_break1_0_4 ) { //TEST( SLOW_testforward, comparespecific_kgsgo_64c7 ) { // LayerDimensions dim; // const int imageSize = 19; -// dim.setInputPlanes( 64 ).setInputImageSize(imageSize).setNumFilters( 64 ).setFilterSize( 7 ) +// dim.setInputPlanes( 64 ).setInputSize(imageSize).setNumFilters( 64 ).setFilterSize( 7 ) // .setPadZeros( true 
).setBiased( true ); // compareSpecific( 128, dim, new ReluActivation(), 1, 6 ); //} @@ -677,7 +680,7 @@ TEST( SLOW_testforward, compare_args ) { int instance1 = 3; int N = 128; bool debug = false; - dim.setInputPlanes( 64 ).setInputImageSize(19).setNumFilters( 64 ) + dim.setInputPlanes( 64 ).setInputSize(19).setNumFilters( 64 ) .setFilterSize( 7 ) .setPadZeros( true ).setBiased( false ); @@ -700,7 +703,7 @@ TEST( testforward, comparespecific_break2 ) { // this breaks on v5.7.0 for examp int instance1 = 5; int N = 4; bool debug = false; - dim.setInputPlanes( 64 ).setInputImageSize(19).setNumFilters( 64 ) + dim.setInputPlanes( 64 ).setInputSize(19).setNumFilters( 64 ) .setFilterSize( 19 ) .setPadZeros( false ).setBiased( false ); @@ -719,7 +722,7 @@ TEST( testforward, comparespecific_break2 ) { // this breaks on v5.7.0 for examp //TEST( SLOW_testforward, comparespecific_kgsgo_64c7mini ) { // LayerDimensions dim; // const int imageSize = 9; -// dim.setInputPlanes( 4 ).setInputImageSize(imageSize).setNumFilters( 4 ).setFilterSize( 5 ) +// dim.setInputPlanes( 4 ).setInputSize(imageSize).setNumFilters( 4 ).setFilterSize( 5 ) // .setPadZeros( true ).setBiased( false ); // compareSpecific( 4, dim, new ReluActivation(), 1, 6 ); //} @@ -789,7 +792,7 @@ TEST( testforward, softmax_byplane ) { NeuralNet *net = NeuralNet::maker(cl)->imageSize(2)->planes(1)->instance(); net->addLayer( SoftMaxMaker::instance()->perPlane() ); net->setBatchSize( 1 ); - int imageSizeSquared = net->getLayer(0)->getOutputImageSize() * net->getLayer(0)->getOutputImageSize(); + int imageSizeSquared = net->getLayer(0)->getOutputSize() * net->getLayer(0)->getOutputSize(); float *input = new float[imageSizeSquared]; input[0] = 0; input[1] = 1; @@ -886,7 +889,7 @@ void testPerf( int instance, int N, int batchSize, LayerDimensions dim ) { TEST( SLOW_testforward, perf_kgsgo_fc500 ) { int batchSize = 128; LayerDimensions dim; - dim.setInputPlanes( 32 ).setInputImageSize(19).setNumFilters( 500 ).setFilterSize( 19 ) + dim.setInputPlanes( 32 ).setInputSize(19).setNumFilters( 500 ).setFilterSize( 19 ) .setPadZeros( false ).setBiased( true ); testPerf( -1, 128, batchSize, dim ); } @@ -894,7 +897,7 @@ TEST( SLOW_testforward, perf_kgsgo_fc500 ) { TEST( SLOW_testforward, perf_mnist_firstconvlayer ) { int batchSize = 128; LayerDimensions dim; - dim.setInputPlanes( 1 ).setInputImageSize(28).setNumFilters( 32 ).setFilterSize( 5 ) + dim.setInputPlanes( 1 ).setInputSize(28).setNumFilters( 32 ).setFilterSize( 5 ) .setPadZeros( true ).setBiased( true ); testPerf( -1, 128, batchSize, dim ); } @@ -902,7 +905,7 @@ TEST( SLOW_testforward, perf_mnist_firstconvlayer ) { TEST( SLOW_testforward, perf_mnist_intlayers_128ex ) { int batchSize = 128; LayerDimensions dim; - dim.setInputPlanes( 32 ).setInputImageSize(28).setNumFilters( 32 ).setFilterSize( 5 ) + dim.setInputPlanes( 32 ).setInputSize(28).setNumFilters( 32 ).setFilterSize( 5 ) .setPadZeros( true ).setBiased( true ); testPerf( -1, 128, batchSize, dim ); } @@ -910,7 +913,7 @@ TEST( SLOW_testforward, perf_mnist_intlayers_128ex ) { TEST( SLOW_testforward, perf_mnist_intlayers_1024ex ) { int batchSize = 1024; LayerDimensions dim; - dim.setInputPlanes( 32 ).setInputImageSize(28).setNumFilters( 32 ).setFilterSize( 5 ) + dim.setInputPlanes( 32 ).setInputSize(28).setNumFilters( 32 ).setFilterSize( 5 ) .setPadZeros( true ).setBiased( true ); testPerf( -1, 128, batchSize, dim ); } @@ -918,7 +921,7 @@ TEST( SLOW_testforward, perf_mnist_intlayers_1024ex ) { TEST( SLOW_testforward, perf_mnist_finallayer ) { int 
batchSize = 128; LayerDimensions dim; - dim.setInputPlanes( 32 ).setInputImageSize(28).setNumFilters( 10 ).setFilterSize( 28 ) + dim.setInputPlanes( 32 ).setInputSize(28).setNumFilters( 10 ).setFilterSize( 28 ) .setPadZeros( false ).setBiased( true ); testPerf( -1, 128, batchSize, dim ); } @@ -928,7 +931,7 @@ TEST( testforward, crash_from_jm ) { int batchSize = 64; int N = 64; LayerDimensions dim; - dim.setInputPlanes( 32 ).setInputImageSize(28).setNumFilters( 20 ).setFilterSize( 28 ) + dim.setInputPlanes( 32 ).setInputSize(28).setNumFilters( 20 ).setFilterSize( 28 ) .setPadZeros( false ).setBiased( false ); DimFromArgs::arg( &dim ); TestArgsParser::arg( "instance", &instance ); @@ -943,7 +946,7 @@ TEST( SLOW_testforward, perf_kgsgo_64c7_args ) { int batchSize = 128; int N = 1000; LayerDimensions dim; - dim.setInputPlanes( 64 ).setInputImageSize(19).setNumFilters( 64 ).setFilterSize( 7 ) + dim.setInputPlanes( 64 ).setInputSize(19).setNumFilters( 64 ).setFilterSize( 7 ) .setPadZeros( true ).setBiased( true ); DimFromArgs::arg( &dim ); TestArgsParser::arg( "instance", &instance ); @@ -961,7 +964,7 @@ TEST( SLOW_testforward, soumith2 ) { TestArgsParser::arg( "instance", &instance ); TestArgsParser::arg( "biased", &biased ); TestArgsParser::go(); - dim.setInputPlanes( 64 ).setInputImageSize( 64 ).setNumFilters( 128 ).setFilterSize( 9 ) + dim.setInputPlanes( 64 ).setInputSize( 64 ).setNumFilters( 128 ).setFilterSize( 9 ) .setPadZeros( false ).setBiased( biased ); testPerf( instance, 128, batchSize, dim ); } diff --git a/test/testlogicaloperators.cpp b/test/testlogicaloperators.cpp index 62140b2f..10a3377a 100644 --- a/test/testlogicaloperators.cpp +++ b/test/testlogicaloperators.cpp @@ -16,6 +16,7 @@ using namespace std; #include "EasyCL.h" #include "batch/EpochMaker.h" #include "layer/LayerMakers.h" +#include "clblas/ClBlasInstance.h" //TEST( testlogicaloperators, DISABLED_FullyConnected_Biased_Tanh_And_1layer ) { //// cout << "And" << endl; @@ -125,6 +126,7 @@ TEST( testlogicaloperators, DISABLED_Convolve_1layer_And_Nobias ) { LogicalDataCreator ldc; ldc.applyAndGate(); EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance blasInstance; NeuralNet *net = NeuralNet::maker(cl)->planes(2)->imageSize(1)->instance(); net->addLayer( ConvolutionalMaker::instance()->numFilters(2)->filterSize(1)->biased(0) ); SGD *sgd = SGD::instance( cl, 4.0f, 0 ); @@ -148,6 +150,7 @@ TEST( testlogicaloperators, Convolve_1layer_biased_And ) { LogicalDataCreator ldc; ldc.applyAndGate(); EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance blasInstance; NeuralNet *net = NeuralNet::maker(cl)->planes(2)->imageSize(1)->instance(); net->addLayer( ConvolutionalMaker::instance()->numFilters(2)->filterSize(1)->biased(1) ); net->addLayer( SquareLossMaker::instance() );; @@ -177,6 +180,7 @@ TEST( testlogicaloperators, Convolve_1layerbiased_Or ) { LogicalDataCreator ldc; ldc.applyOrGate(); EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance blasInstance; NeuralNet *net = NeuralNet::maker(cl)->planes(2)->imageSize(1)->instance(); net->addLayer( ConvolutionalMaker::instance()->numFilters(2)->filterSize(1)->biased(1) ); net->addLayer( SquareLossMaker::instance() );; @@ -257,6 +261,7 @@ TEST( testlogicaloperators, Convolve_2layers_relu_Xor ) { }; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance blasInstance; NeuralNet *net = NeuralNet::maker(cl)->planes(2)->imageSize(1)->instance(); net->addLayer( 
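
Each of these tests now constructs a ClBlasInstance immediately after creating the EasyCL context, matching the clblas source directory and testClBlas.cpp added elsewhere in this patch. Assuming it is the usual RAII guard over the real clBLAS entry points, its shape would be roughly:

#include <clBLAS.h>

// Assumed sketch only: initialize clBLAS for the enclosing scope,
// tear it down automatically when the scope exits.
class ClBlasInstance {
public:
    ClBlasInstance() { clblasSetup(); }
    ~ClBlasInstance() { clblasTeardown(); }
};

Declaring it by value inside the test body means teardown still runs when an ASSERT_ macro returns from the test early.
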
ConvolutionalMaker::instance()->numFilters(2)->filterSize(1)->biased(1) ); net->addLayer( ActivationMaker::instance()->relu() ); diff --git a/test/testpoolingbackward.cpp b/test/testpoolingbackward.cpp index 73b5f9fb..15cca727 100644 --- a/test/testpoolingbackward.cpp +++ b/test/testpoolingbackward.cpp @@ -31,12 +31,12 @@ TEST( testpoolingbackward, basic ) { 2, 1, 0, 3 }; - float *errorsForUpstream = new float[ poolingBackprop->getInputSize( batchSize ) ]; + float *errorsForUpstream = new float[ poolingBackprop->getInputNumElements( batchSize ) ]; poolingBackprop->backward( batchSize, errors, selectors, errorsForUpstream ); -// float *expectedErrorsForUpstream = new float[ poolingForward->getInputSize( batchSize ) ]; -// memset( expectedErrorsForUpstream, 0, sizeof(float) * poolingForward->getInputSize( batchSize ) ]; +// float *expectedErrorsForUpstream = new float[ poolingForward->getInputNumElements( batchSize ) ]; +// memset( expectedErrorsForUpstream, 0, sizeof(float) * poolingForward->getInputNumElements( batchSize ) ]; float expectedErrorsForUpstream[] = { 0,0,0,5, 3,0,0,0, @@ -71,12 +71,12 @@ TEST( testpoolingbackward, basic_2plane_batchsize2 ) { 0, 3 }; - float *errorsForUpstream = new float[ poolingBackprop->getInputSize( batchSize ) ]; + float *errorsForUpstream = new float[ poolingBackprop->getInputNumElements( batchSize ) ]; poolingBackprop->backward( batchSize, errors, selectors, errorsForUpstream ); -// float *expectedErrorsForUpstream = new float[ poolingForward->getInputSize( batchSize ) ]; -// memset( expectedErrorsForUpstream, 0, sizeof(float) * poolingForward->getInputSize( batchSize ) ]; +// float *expectedErrorsForUpstream = new float[ poolingForward->getInputNumElements( batchSize ) ]; +// memset( expectedErrorsForUpstream, 0, sizeof(float) * poolingForward->getInputNumElements( batchSize ) ]; float expectedErrorsForUpstream[] = { 0,0, 3,0, @@ -100,7 +100,7 @@ TEST( testpoolingbackward, basic_2plane_batchsize2 ) { } TEST( SLOW_testpoolingbackward, compare_args ) { - int inputImageSize = 9; + int inputSize = 9; int poolingSize = 2; int instance0 = 0; int instance1 = 1; @@ -111,7 +111,7 @@ TEST( SLOW_testpoolingbackward, compare_args ) { TestArgsParser::arg( "batchSize", &batchSize ); TestArgsParser::arg( "poolingsize", &poolingSize ); TestArgsParser::arg( "numplanes", &numPlanes ); - TestArgsParser::arg( "inputimagesize", &inputImageSize ); + TestArgsParser::arg( "inputimagesize", &inputSize ); TestArgsParser::arg( "instance0", &instance0 ); TestArgsParser::arg( "instance1", &instance1 ); TestArgsParser::go(); @@ -119,19 +119,19 @@ TEST( SLOW_testpoolingbackward, compare_args ) { bool padZeros = true; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - PoolingBackward *p0 = PoolingBackward::instanceSpecific( instance0, cl, padZeros, numPlanes, inputImageSize, poolingSize ); - PoolingBackward *p1 = PoolingBackward::instanceSpecific( instance1, cl, padZeros, numPlanes, inputImageSize, poolingSize ); - int outputImageSize = p1->outputImageSize; - int errorsSize = batchSize * outputImageSize * outputImageSize * numPlanes; + PoolingBackward *p0 = PoolingBackward::instanceSpecific( instance0, cl, padZeros, numPlanes, inputSize, poolingSize ); + PoolingBackward *p1 = PoolingBackward::instanceSpecific( instance1, cl, padZeros, numPlanes, inputSize, poolingSize ); + int outputSize = p1->outputSize; + int errorsSize = batchSize * outputSize * outputSize * numPlanes; float *errors = new float[ errorsSize ]; - int inputSize = batchSize * inputImageSize * inputImageSize * 
numPlanes; + int inputNumElements = batchSize * inputSize * inputSize * numPlanes; int *selectors = new int[ errorsSize ]; - float *errorsForUpstream0 = new float[ inputSize ]; - float *errorsForUpstream1 = new float[ inputSize ]; + float *errorsForUpstream0 = new float[ inputNumElements ]; + float *errorsForUpstream1 = new float[ inputNumElements ]; - PoolingForward *forwardprop = PoolingForward::instanceSpecific( 0, cl, padZeros, numPlanes, inputImageSize, poolingSize ); + PoolingForward *forwardprop = PoolingForward::instanceSpecific( 0, cl, padZeros, numPlanes, inputSize, poolingSize ); float *output = new float[errorsSize]; - float *input = new float[inputSize]; + float *input = new float[inputNumElements]; float *errorsForUpstream[2]; errorsForUpstream[0] = errorsForUpstream0; errorsForUpstream[1] = errorsForUpstream1; @@ -143,7 +143,7 @@ TEST( SLOW_testpoolingbackward, compare_args ) { // easiest way to select valid selectors might be to just forwardforward first? WeightRandomizer::randomize( it, errors, errorsSize, -0.1f, 0.1f ); - WeightRandomizer::randomize( it, input, inputSize, -0.1f, 0.1f ); + WeightRandomizer::randomize( it, input, inputNumElements, -0.1f, 0.1f ); forwardprop->forward( batchSize, input, selectors, output ); for( int instance = 0; instance < 2; instance++ ) { @@ -151,7 +151,7 @@ TEST( SLOW_testpoolingbackward, compare_args ) { } bool ok = true; int numErrors = 0; - for( int i = 0; i < inputSize; i++ ) { + for( int i = 0; i < inputNumElements; i++ ) { if( errorsForUpstream0[i] != errorsForUpstream1[i] ) { cout << "diff: i=" << i << " " << errorsForUpstream0[i] << " != " << errorsForUpstream1[i] << endl; ok = false; @@ -201,9 +201,9 @@ TEST( testpoolingforward, basic_2plane_batchsize2 ) { -1, -3.5f, 37.4f,5 }; - int outputSize = poolingForward->getOutputSize( batchSize ); - int *selectors = new int[outputSize]; - float *output = new float[outputSize]; + int outputNumElements = poolingForward->getOutputNumElements( batchSize ); + int *selectors = new int[outputNumElements]; + float *output = new float[outputNumElements]; poolingForward->forward( batchSize, data, selectors, output ); diff --git a/test/testpoolingforward.cpp b/test/testpoolingforward.cpp index 7c4d0d18..e36f7e3c 100644 --- a/test/testpoolingforward.cpp +++ b/test/testpoolingforward.cpp @@ -29,9 +29,9 @@ TEST( testpoolingforward, basic ) { 3, 33, 14,23, -1, -3.5f,37.4f,5 }; - int outputSize = poolingForward->getOutputSize( batchSize ); - int *selectors = new int[outputSize]; - float *output = new float[outputSize]; + int outputNumElements = poolingForward->getOutputNumElements( batchSize ); + int *selectors = new int[outputNumElements]; + float *output = new float[outputNumElements]; poolingForward->forward( batchSize, data, selectors, output ); @@ -70,9 +70,9 @@ TEST( testpoolingforward, basic_2plane_batchsize2 ) { -1, -3.5f, 37.4f,5 }; - int outputSize = poolingForward->getOutputSize( batchSize ); - int *selectors = new int[outputSize]; - float *output = new float[outputSize]; + int outputNumElements = poolingForward->getOutputNumElements( batchSize ); + int *selectors = new int[outputNumElements]; + float *output = new float[outputNumElements]; poolingForward->forward( batchSize, data, selectors, output ); @@ -104,14 +104,14 @@ TEST( testpoolingforward, fromwrappers ) { 3, 33, 14,23, -1, -3.5f,37.4f,5 }; - int outputSize = poolingForward->getOutputSize( batchSize ); - int *selectors = new int[outputSize]; - float *output = new float[outputSize]; + int outputNumElements = 
poolingForward->getOutputNumElements( batchSize ); + int *selectors = new int[outputNumElements]; + float *output = new float[outputNumElements]; - const int inputSize = batchSize * numPlanes * imageSize * imageSize; - CLWrapper *inputWrapper = cl->wrap( inputSize, input ); - CLWrapper *selectorsWrapper = cl->wrap( outputSize, selectors ); - CLWrapper *outputWrapper = cl->wrap( outputSize, output ); + const int inputNumElements = batchSize * numPlanes * imageSize * imageSize; + CLWrapper *inputWrapper = cl->wrap( inputNumElements, input ); + CLWrapper *selectorsWrapper = cl->wrap( outputNumElements, selectors ); + CLWrapper *outputWrapper = cl->wrap( outputNumElements, output ); inputWrapper->copyToDevice(); @@ -214,21 +214,21 @@ void compareSpecific( CompareSpecificArgs args ) { PoolingForward *poolingForward0 = PoolingForward::instanceSpecific( args._instance0, cl, args._padZeros, numPlanes, imageSize, poolingSize ); PoolingForward *poolingForward1 = PoolingForward::instanceSpecific( args._instance1, cl, args._padZeros, numPlanes, imageSize, poolingSize ); - const int inputSize = batchSize * numPlanes * imageSize * imageSize; - int outputSize = poolingForward0->getOutputSize( batchSize ); + const int inputNumElements = batchSize * numPlanes * imageSize * imageSize; + int outputNumElements = poolingForward0->getOutputNumElements( batchSize ); - float *input = new float[ inputSize ]; - int *selectors = new int[ outputSize ]; - float *output = new float[ outputSize ]; + float *input = new float[ inputNumElements ]; + int *selectors = new int[ outputNumElements ]; + float *output = new float[ outputNumElements ]; - CLWrapper *inputWrapper = cl->wrap( inputSize, input ); - CLWrapper *selectorsWrapper = cl->wrap( outputSize, selectors ); - CLWrapper *outputWrapper = cl->wrap( outputSize, output ); + CLWrapper *inputWrapper = cl->wrap( inputNumElements, input ); + CLWrapper *selectorsWrapper = cl->wrap( outputNumElements, selectors ); + CLWrapper *outputWrapper = cl->wrap( outputNumElements, output ); - WeightRandomizer::randomize( input, inputSize, -0.1f, 0.1f ); + WeightRandomizer::randomize( input, inputNumElements, -0.1f, 0.1f ); - memset( selectors, 99, sizeof(int) * outputSize ); - memset( output, 99, sizeof(int) * outputSize ); + memset( selectors, 99, sizeof(int) * outputNumElements ); + memset( output, 99, sizeof(int) * outputNumElements ); inputWrapper->copyToDevice(); selectorsWrapper->copyToDevice(); @@ -238,13 +238,13 @@ void compareSpecific( CompareSpecificArgs args ) { selectorsWrapper->copyToHost(); outputWrapper->copyToHost(); - int *selectors0 = new int[ outputSize ]; - float *output0 = new float[ outputSize ]; - memcpy( selectors0, selectors, sizeof(int) * outputSize ); - memcpy( output0, output, sizeof(float) * outputSize ); + int *selectors0 = new int[ outputNumElements ]; + float *output0 = new float[ outputNumElements ]; + memcpy( selectors0, selectors, sizeof(int) * outputNumElements ); + memcpy( output0, output, sizeof(float) * outputNumElements ); - memset( selectors, 99, sizeof(int) * outputSize ); - memset( output, 99, sizeof(int) * outputSize ); + memset( selectors, 99, sizeof(int) * outputNumElements ); + memset( output, 99, sizeof(int) * outputNumElements ); inputWrapper->copyToDevice(); selectorsWrapper->copyToDevice(); @@ -255,7 +255,7 @@ void compareSpecific( CompareSpecificArgs args ) { outputWrapper->copyToHost(); int numErrors = 0; - for( int i = 0; i < outputSize; i++ ) { + for( int i = 0; i < outputNumElements; i++ ) { if( selectors[i] != selectors0[i] ) { 
cout << "ERROR: selectors[" << i << "] instance0:" << selectors0[i] << " != instance1:" << selectors[i] << endl; numErrors++; @@ -271,7 +271,7 @@ void compareSpecific( CompareSpecificArgs args ) { } EXPECT_EQ( 0, numErrors ); if( numErrors > 0 ) { - int num2dPlanes = inputSize / imageSize / imageSize; + int num2dPlanes = inputNumElements / imageSize / imageSize; for( int plane = 0; plane < num2dPlanes; plane++ ) { cout << "2dplane " << plane << ":" << endl; for( int i = 0; i < imageSize; i++ ) { diff --git a/test/testsimpleconvolvenet.cpp b/test/testsimpleconvolvenet.cpp index e1583fe2..015f50ef 100644 --- a/test/testsimpleconvolvenet.cpp +++ b/test/testsimpleconvolvenet.cpp @@ -16,6 +16,7 @@ #include "layer/LayerMakers.h" #include "batch/EpochMaker.h" #include "batch/Batcher2.h" +#include "clblas/ClBlasInstance.h" #include "test/NetTestHelper.h" @@ -39,6 +40,7 @@ TEST( testsimpleconvolvenet, imagesize1_planes2_filters2_unbiased_tanh ) { expectedOutput[2] = -0.5f; expectedOutput[3] = 0.5f; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance blasInstance; NeuralNet *net = NeuralNet::maker(cl)->instance(); net->addLayer( InputLayerMaker::instance()->numPlanes(1)->imageSize(1) ); net->addLayer( ConvolutionalMaker::instance()->numFilters(2)->filterSize(1)->biased(0) ); @@ -94,6 +96,7 @@ TEST( testsimpleconvolvenet, imagesize1_planes2_filters2_tanh ) { expectedOutput[2] = -0.5f; expectedOutput[3] = 0.5f; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance blasInstance; NeuralNet *net = NeuralNet::maker(cl)->instance(); net->addLayer( InputLayerMaker::instance()->numPlanes(1)->imageSize(1) ); net->addLayer( ConvolutionalMaker::instance()->numFilters(2)->filterSize(1)->biased() ); @@ -173,6 +176,7 @@ TEST( testsimpleconvolvenet, imagesize3_n4_filtersize3_tanh ) { expectedOutput[6] = -0.5f; expectedOutput[7] = 0.5f; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance blasInstance; NeuralNet *net = NeuralNet::maker(cl)->instance(); net->addLayer( InputLayerMaker::instance()->numPlanes(1)->imageSize(3) ); net->addLayer( ConvolutionalMaker::instance()->numFilters(2)->filterSize(3)->biased() ); @@ -227,6 +231,7 @@ TEST( testsimpleconvolvenet, imagesize1_2planes_filtersize1 ) { expectedOutput[2] = 0; expectedOutput[3] = 1; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance blasInstance; NeuralNet *net = new NeuralNet(cl); net->addLayer( InputLayerMaker::instance()->numPlanes(1)->imageSize(1) ); // net->inputMaker()->numPlanes(1)->imageSize(1)->insert(); @@ -307,6 +312,7 @@ TEST( testsimpleconvolvenet, imagesize3_n4_filtersize3_relu ) { expectedOutput[6] = 0; expectedOutput[7] = 1; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance blasInstance; NeuralNet *net = NeuralNet::maker(cl)->instance(); net->addLayer( InputLayerMaker::instance()->numPlanes(1)->imageSize(3) ); // net->inputMaker()->numPlanes(1)->imageSize(3)->insert(); @@ -386,6 +392,7 @@ TEST( testsimpleconvolvenet, imagesize3_n4_filtersize3_linear ) { expectedOutput[6] = 0; expectedOutput[7] = 1; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance blasInstance; NeuralNet *net = NeuralNet::maker(cl)->instance(); net->addLayer( InputLayerMaker::instance()->numPlanes(1)->imageSize(3) ); // net->inputMaker()->numPlanes(1)->imageSize(3)->insert(); @@ -441,6 +448,7 @@ TEST( testsimpleconvolvenet, imagesize1_n2_2layers_unbiased ) { expectedOutput[2] = -0.5f; expectedOutput[3] = 0.5f; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); 
+ ClBlasInstance blasInstance; NeuralNet *net = NeuralNet::maker(cl)->instance(); net->addLayer( InputLayerMaker::instance()->numPlanes(1)->imageSize(1) ); // net->inputMaker()->numPlanes(1)->imageSize(1)->insert(); @@ -509,6 +517,7 @@ TEST( testsimpleconvolvenet, imagesize1_n2_2layers_biased ) { expectedOutput[2] = -0.5f; expectedOutput[3] = 0.5f; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance blasInstance; NeuralNet *net = NeuralNet::maker(cl)->instance(); net->addLayer( InputLayerMaker::instance()->numPlanes(1)->imageSize(1) ); // net->inputMaker()->numPlanes(1)->imageSize(1)->insert(); @@ -586,13 +595,13 @@ TEST( testsimpleconvolvenet, imagesize_5_4_2layers_filtersize_2_4_biased_n3 ) { 0,0,0,0,0, 1,1,1,1,1, }; - int inputSize = imageSize * imageSize * numInPlanes * N; - for( int i = 0; i < inputSize; i++ ) { + int inputNumElements = imageSize * imageSize * numInPlanes * N; + for( int i = 0; i < inputNumElements; i++ ) { data[i] -= 0.5f; } int labels[] = { 0, 1, 2 }; - int outputSize = numOutPlanes * N; - float *expectedOutput = new float[outputSize]; + int outputNumElements = numOutPlanes * N; + float *expectedOutput = new float[outputNumElements]; for( int n = 0; n < N; n++ ) { for( int plane = 0; plane < numOutPlanes; plane++ ) { expectedOutput[ n * numOutPlanes + plane] = -0.5f; @@ -600,6 +609,7 @@ TEST( testsimpleconvolvenet, imagesize_5_4_2layers_filtersize_2_4_biased_n3 ) { expectedOutput[ n * numOutPlanes + labels[n]] = +0.5f; } EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance blasInstance; NeuralNet *net = NeuralNet::maker(cl)->instance(); net->addLayer( InputLayerMaker::instance()->numPlanes(1)->imageSize(5) ); // net->inputMaker()->numPlanes(1)->imageSize(5)->insert(); @@ -683,13 +693,13 @@ TEST( testsimpleconvolvenet, imagesize_5_4_2layers_filtersize_2_4_biased_n6 ) { 1,1,1,1,1, 0,0,0,0,0, }; - int inputSize = imageSize * imageSize * numInPlanes * N; - for( int i = 0; i < inputSize; i++ ) { + int inputNumElements = imageSize * imageSize * numInPlanes * N; + for( int i = 0; i < inputNumElements; i++ ) { data[i] -= 0.5f; } int labels[] = { 0, 1, 2, 0, 1, 2 }; - int outputSize = numOutPlanes * N; - float *expectedOutput = new float[outputSize]; + int outputNumElements = numOutPlanes * N; + float *expectedOutput = new float[outputNumElements]; for( int n = 0; n < N; n++ ) { for( int plane = 0; plane < numOutPlanes; plane++ ) { expectedOutput[ n * numOutPlanes + plane] = -0.5f; @@ -697,6 +707,7 @@ TEST( testsimpleconvolvenet, imagesize_5_4_2layers_filtersize_2_4_biased_n6 ) { expectedOutput[ n * numOutPlanes + labels[n]] = +0.5f; } EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); + ClBlasInstance blasInstance; NeuralNet *net = NeuralNet::maker(cl)->instance(); net->addLayer( InputLayerMaker::instance()->numPlanes(1)->imageSize(5) ); // net->inputMaker()->numPlanes(1)->imageSize(5)->insert(); @@ -801,13 +812,13 @@ TEST( testsimpleconvolvenet, imagesize_5_3_2layers_filtersize_3_3_biased_n6 ) { 1,1,1,1,1, 0,0,0,0,0, }; - int inputSize = imageSize * imageSize * numInPlanes * N; - for( int i = 0; i < inputSize; i++ ) { + int inputNumElements = imageSize * imageSize * numInPlanes * N; + for( int i = 0; i < inputNumElements; i++ ) { data[i] -= 0.5f; } int labels[] = { 0, 1, 2, 0, 1, 2 }; - int outputSize = numOutPlanes * N; - float *expectedOutput = new float[outputSize]; + int outputNumElements = numOutPlanes * N; + float *expectedOutput = new float[outputNumElements]; for( int n = 0; n < N; n++ ) { for( int plane = 0; plane < 
numOutPlanes; plane++ ) {
 expectedOutput[ n * numOutPlanes + plane] = -0.5f;
@@ -815,6 +826,7 @@ TEST( testsimpleconvolvenet, imagesize_5_3_2layers_filtersize_3_3_biased_n6 ) {
 expectedOutput[ n * numOutPlanes + labels[n]] = +0.5f;
 }
 EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
+ ClBlasInstance blasInstance;
 NeuralNet *net = NeuralNet::maker(cl)->instance();
 net->addLayer( InputLayerMaker::instance()->numPlanes(1)->imageSize(5) );
 // net->inputMaker()->numPlanes(1)->imageSize(5)->insert();
@@ -990,15 +1002,15 @@ TEST( testsimpleconvolvenet, imagesize_5_3_2layers_filtersize_3_3_biased_n18 ) {
 1,1,1,0,0,
 0,0,0,0,0,
 };
- int inputSize = imageSize * imageSize * numInPlanes * N;
- for( int i = 0; i < inputSize; i++ ) {
+ int inputNumElements = imageSize * imageSize * numInPlanes * N;
+ for( int i = 0; i < inputNumElements; i++ ) {
 data[i] -= 0.5f;
 }
 int labels[] = { 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2 };
- int outputSize = numOutPlanes * N;
- float *expectedOutput = new float[outputSize];
+ int outputNumElements = numOutPlanes * N;
+ float *expectedOutput = new float[outputNumElements];
 for( int n = 0; n < N; n++ ) {
 for( int plane = 0; plane < numOutPlanes; plane++ ) {
 expectedOutput[ n * numOutPlanes + plane] = -0.5f;
@@ -1006,6 +1018,7 @@ TEST( testsimpleconvolvenet, imagesize_5_3_2layers_filtersize_3_3_biased_n18 ) {
 expectedOutput[ n * numOutPlanes + labels[n]] = +0.5f;
 }
 EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
+ ClBlasInstance blasInstance;
 NeuralNet *net = NeuralNet::maker(cl)->instance();
 net->addLayer( InputLayerMaker::instance()->numPlanes(1)->imageSize(5) );
 // net->inputMaker()->numPlanes(1)->imageSize(5)->insert();
diff --git a/test/testsinglebatch.cpp b/test/testsinglebatch.cpp
index 033b84b3..03da89d7 100644
--- a/test/testsinglebatch.cpp
+++ b/test/testsinglebatch.cpp
@@ -12,6 +12,7 @@
 #include "net/NeuralNetMould.h"
 #include "layer/LayerMakers.h"
 #include "trainers/SGD.h"
+#include "clblas/ClBlasInstance.h"
 #include "gtest/gtest.h"
 #include "test/gtest_supp.h"
@@ -129,9 +130,9 @@ void test( float learningRate, int numEpochs, int batchSize, NeuralNet *net, flo
 for( int i = 0; i < inputsSize; i++ ) {
 inputData[i] = random() / (float)random.max() * 0.2f - 0.1f;
 }
- const int outputSize = net->getLastLayer()->getOutputSize();
- float *expectedOutput = new float[outputSize];
- for( int i = 0; i < outputSize; i++ ) {
+ const int outputNumElements = net->getLastLayer()->getOutputNumElements();
+ float *expectedOutput = new float[outputNumElements];
+ for( int i = 0; i < outputNumElements; i++ ) {
 expectedOutput[i] = random() / (float)random.max() * 0.2f - 0.1f;
 }
@@ -220,7 +221,7 @@ void test( float learningRate, int numEpochs, int batchSize, NeuralNet *net, flo
 EXPECT_EQ( true, allOk );
 // float *output = (float*)(net->getOutput());
-// Sampler::printSamples( "net->getOutput()", outputSize, (float*)output );
+// Sampler::printSamples( "net->getOutput()", outputNumElements, (float*)output );
 delete sgd;
 delete[]currentWeights;
@@ -231,6 +232,7 @@ void test( float learningRate, int numEpochs, int batchSize, NeuralNet *net, flo
 void test( ActivationFunction *fn, TestArgs args, float tolerance = 1.3f ) {
 EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
+ ClBlasInstance blasInstance;
 NeuralNet *net = NeuralNet::maker(cl)->planes(1)->imageSize(args.imageSize)->instance();
 for( int i = 0; i < args.numLayers; i++ ) {
 net->addLayer( ConvolutionalMaker::instance()->numFilters(args.numFilters)->filterSize(args.filterSize)->biased() );
@@ -321,6 +323,7 @@ void checkErrorsForLayer( int layerId, float lastLoss, NeuralNet *net, float *la
 void testLabelled( TestArgs args ) {
 EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
+ ClBlasInstance blasInstance;
 NeuralNet *net = NeuralNet::maker(cl)->planes(1)->imageSize(args.imageSize)->instance();
 for( int i = 0; i < args.numLayers; i++ ) {
 net->addLayer( ConvolutionalMaker::instance()->numFilters(args.numFilters)->filterSize(args.filterSize)->biased()->padZeros() );
@@ -341,7 +344,7 @@ void testLabelled( TestArgs args ) {
 for( int i = 0; i < inputsSize; i++ ) {
 inputData[i] = random() / (float)random.max() * 0.2f - 0.1f;
 }
-// const int outputSize = net->getLastLayer()->getOutputSize();
+// const int outputNumElements = net->getLastLayer()->getOutputNumElements();
 int *labels = new int[args.batchSize];
 for( int i = 0; i < args.batchSize; i++ ) {
 labels[i] = random() % args.numCats;
@@ -424,7 +427,7 @@ void testLabelled( TestArgs args ) {
 StatefulTimer::dump(true);
 // float *output = (float*)(net->getOutput());
-// Sampler::printSamples( "net->getOutput()", outputSize, (float*)output );
+// Sampler::printSamples( "net->getOutput()", outputNumElements, (float*)output );
 delete[]currentWeights;
 delete[]lastWeights;
@@ -471,14 +474,14 @@ TEST( testsinglebatch, detailedregression ) {
 mt19937 random;
 random.seed(0); // so always gives same output
- const int inputsSize = net->getInputSizePerExample() * batchSize;
+ const int inputsSize = net->getInputNumElementsPerExample() * batchSize;
 float *inputData = new float[ inputsSize ];
 for( int i = 0; i < inputsSize; i++ ) {
 inputData[i] = random() / (float)mt19937::max() * 0.2f - 0.1f;
 }
- const int outputSize = net->getLastLayer()->getOutputSize();
- float *expectedOutput = new float[outputSize];
- for( int i = 0; i < outputSize; i++ ) {
+ const int outputNumElements = net->getLastLayer()->getOutputNumElements();
+ float *expectedOutput = new float[outputNumElements];
+ for( int i = 0; i < outputNumElements; i++ ) {
 expectedOutput[i] = random() / (float)mt19937::max() * 0.2f - 0.1f;
 }
@@ -501,7 +504,7 @@ TEST( testsinglebatch, detailedregression ) {
 ConvolutionalLayer *layer = dynamic_cast<ConvolutionalLayer *>( net->layers[layerIndex] );
 float const*output = layer->getOutput();
 cout << "layer " << layerIndex << endl;
- Sampler::printSamples( "output", outputSize, (float*)output, 3 );
+ Sampler::printSamples( "output", outputNumElements, (float*)output, 3 );
 }
 float *output = (float*)(net->layers[1]->getOutput());
 EXPECT_FLOAT_NEAR( 0.0767364, output[684] );
@@ -533,9 +536,9 @@ ExpectedValuesLayer *expectedValuesLayer = dynamic_cast(ne
 //ExpectedValuesLayer *expectedValuesLayer = (new ExpectedValuesLayerMaker(net, net->layers[3]))->instance();
 expectedValuesLayer->setBatchSize(batchSize);
 expectedValuesLayer->calcErrors( expectedOutput );
-int layer3OutputSize = layer3->getOutputSize();
+int layer3OutputNumElements = layer3->getOutputNumElements();
 float *layer3errors = expectedValuesLayer->errors;
-Sampler::printSamples( "layer3errors", layer3OutputSize, layer3errors, 3 );
+Sampler::printSamples( "layer3errors", layer3OutputNumElements, layer3errors, 3 );
 EXPECT_FLOAT_NEAR( -0.296495, layer3errors[684] );
 EXPECT_FLOAT_NEAR( 0.214934, layer3errors[559] );
@@ -544,11 +547,11 @@ EXPECT_FLOAT_NEAR( 0.1246, layer3errors[373] );
 cout << endl;
 ConvolutionalLayer *layer2 = dynamic_cast<ConvolutionalLayer *>( net->layers[2] );
-int layer2OutputSize = layer2->getOutputSize();
+int layer2OutputNumElements = layer2->getOutputNumElements();
 layer3->nextLayer = expectedValuesLayer;
 layer3->backward(
learningRate ); float *layer2errors = layer3->getDerivLossBySumForUpstream(); -Sampler::printSamples( "layer2errors", layer2OutputSize, layer2errors ); +Sampler::printSamples( "layer2errors", layer2OutputNumElements, layer2errors ); //EXPECT_FLOAT_NEAR( -0.296495, layer2errors[684] ); //EXPECT_FLOAT_NEAR( 0.214934, layer2errors[559] ); @@ -563,11 +566,11 @@ EXPECT_FLOAT_NEAR( 0.0361823, layer2errors[176963] ); cout << endl; ConvolutionalLayer *layer1 = dynamic_cast( net->layers[1] ); -int layer1OutputSize = layer1->getOutputSize(); +int layer1OutputNumElements = layer1->getOutputNumElements(); layer2->nextLayer = layer3; layer2->backward( learningRate ); float *layer1errors = layer2->getDerivLossBySumForUpstream(); -Sampler::printSamples( "layer1errors", layer1OutputSize, layer1errors ); +Sampler::printSamples( "layer1errors", layer1OutputNumElements, layer1errors ); EXPECT_FLOAT_NEAR( -0.0137842, layer1errors[199340] ); EXPECT_FLOAT_NEAR( -0.015897, layer1errors[567855] ); @@ -621,7 +624,7 @@ EXPECT_FLOAT_NEAR( -0.0601025, bias[15] ); EXPECT_FLOAT_NEAR( 0.000941529, bias[21] ); output = (float*)(net->getOutput()); -Sampler::printSamples( "net->getOutput()", outputSize, (float*)output, 3 ); +Sampler::printSamples( "net->getOutput()", outputNumElements, (float*)output, 3 ); EXPECT_FLOAT_NEAR( -0.232493, net->getOutput()[684] ); EXPECT_FLOAT_NEAR( 0.179215, net->getOutput()[559] ); EXPECT_FLOAT_NEAR( 0.14498, net->getOutput()[373] ); @@ -630,7 +633,7 @@ EXPECT_FLOAT_NEAR( 0.14498, net->getOutput()[373] ); net->forward( inputData ); output = (float*)(net->getOutput()); -Sampler::printSamples( "net->getOutput()", outputSize, (float*)output, 3 ); +Sampler::printSamples( "net->getOutput()", outputNumElements, (float*)output, 3 ); EXPECT_FLOAT_NEAR( 0.549084, net->getOutput()[684] ); EXPECT_FLOAT_NEAR( -0.00702396, net->getOutput()[559] ); @@ -689,7 +692,7 @@ EXPECT_FLOAT_NEAR( 0.0122473, bias[21] ); StatefulTimer::dump(true); output = (float*)(net->getOutput()); - Sampler::printSamples( "net->getOutput()", outputSize, (float*)output ); + Sampler::printSamples( "net->getOutput()", outputNumElements, (float*)output ); EXPECT_FLOAT_NEAR( -0.15081, net->getOutput()[684] ); EXPECT_FLOAT_NEAR( -0.0236106, net->getOutput()[559] ); @@ -717,14 +720,14 @@ TEST( SLOW_testsinglebatch, perf ) { mt19937 random; random.seed(0); // so always gives same output - const int inputsSize = net->getInputSizePerExample() * batchSize; + const int inputsSize = net->getInputNumElementsPerExample() * batchSize; float *inputData = new float[ inputsSize ]; for( int i = 0; i < inputsSize; i++ ) { inputData[i] = random() / (float)mt19937::max() * 0.2f - 0.1f; } - const int outputSize = net->getLastLayer()->getOutputSize(); - float *expectedOutput = new float[outputSize]; - for( int i = 0; i < outputSize; i++ ) { + const int outputNumElements = net->getLastLayer()->getOutputNumElements(); + float *expectedOutput = new float[outputNumElements]; + for( int i = 0; i < outputNumElements; i++ ) { expectedOutput[i] = random() / (float)mt19937::max() * 0.2f - 0.1f; } @@ -749,7 +752,7 @@ TEST( SLOW_testsinglebatch, perf ) { StatefulTimer::dump(true); float *output = (float*)(net->getOutput()); - Sampler::printSamples( "net->getOutput()", outputSize, (float*)output ); + Sampler::printSamples( "net->getOutput()", outputNumElements, (float*)output ); EXPECT_FLOAT_NEAR( -0.121662, net->getOutput()[684] ); EXPECT_FLOAT_NEAR( 0.0783329, net->getOutput()[559] ); @@ -778,14 +781,14 @@ TEST( testsinglebatch, perf19 ) { mt19937 random; 
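// ---- editor's aside, not part of the patch: these regression and perf tests
// seed mt19937 with a constant so every run sees identical "random" data, which
// is what makes the hard-coded EXPECT_FLOAT_NEAR reference values reproducible.
// A self-contained illustration of the same pattern (the function name is mine):
#include <random>
#include <vector>
std::vector<float> makeDeterministicInputs(int n) {
    std::mt19937 random;
    random.seed(0); // fixed seed => identical sequence on every run
    std::vector<float> data(n);
    for(int i = 0; i < n; i++) {
        // scale a 32-bit draw into [-0.1, 0.1), as the tests above do
        data[i] = random() / (float)std::mt19937::max() * 0.2f - 0.1f;
    }
    return data;
}
// ---- end aside ----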
random.seed(0); // so always gives same output - const int inputsSize = net->getInputSizePerExample() * batchSize; + const int inputsSize = net->getInputNumElementsPerExample() * batchSize; float *inputData = new float[ inputsSize ]; for( int i = 0; i < inputsSize; i++ ) { inputData[i] = random() / (float)mt19937::max() * 0.2f - 0.1f; } - const int outputSize = net->getLastLayer()->getOutputSize(); - float *expectedOutput = new float[outputSize]; - for( int i = 0; i < outputSize; i++ ) { + const int outputNumElements = net->getLastLayer()->getOutputNumElements(); + float *expectedOutput = new float[outputNumElements]; + for( int i = 0; i < outputNumElements; i++ ) { expectedOutput[i] = random() / (float)mt19937::max() * 0.2f - 0.1f; } @@ -810,7 +813,7 @@ TEST( testsinglebatch, perf19 ) { StatefulTimer::dump(true); float *output = (float*)(net->getOutput()); - Sampler::printSamples( "net->getOutput()", outputSize, (float*)output ); + Sampler::printSamples( "net->getOutput()", outputNumElements, (float*)output ); //EXPECT_FLOAT_NEAR( -0.121662, net->getOutput()[684] ); //EXPECT_FLOAT_NEAR( 0.0783329, net->getOutput()[559] ); @@ -840,14 +843,14 @@ TEST( SLOW_testsinglebatch, perf19_depth12 ) { mt19937 random; random.seed(0); // so always gives same output - const int inputsSize = net->getInputSizePerExample() * batchSize; + const int inputsSize = net->getInputNumElementsPerExample() * batchSize; float *inputData = new float[ inputsSize ]; for( int i = 0; i < inputsSize; i++ ) { inputData[i] = random() / (float)mt19937::max() * 0.2f - 0.1f; } - const int outputSize = net->getLastLayer()->getOutputSize(); - float *expectedOutput = new float[outputSize]; - for( int i = 0; i < outputSize; i++ ) { + const int outputNumElements = net->getLastLayer()->getOutputNumElements(); + float *expectedOutput = new float[outputNumElements]; + for( int i = 0; i < outputNumElements; i++ ) { expectedOutput[i] = random() / (float)mt19937::max() * 0.2f - 0.1f; } @@ -872,7 +875,7 @@ TEST( SLOW_testsinglebatch, perf19_depth12 ) { StatefulTimer::dump(true); float *output = (float*)(net->getOutput()); - Sampler::printSamples( "net->getOutput()", outputSize, (float*)output ); + Sampler::printSamples( "net->getOutput()", outputNumElements, (float*)output ); //EXPECT_FLOAT_NEAR( -0.121662, net->getOutput()[684] ); //EXPECT_FLOAT_NEAR( 0.0783329, net->getOutput()[559] ); diff --git a/test/testupdateweights.cpp b/test/testupdateweights.cpp index cfa38e15..8ffcd956 100644 --- a/test/testupdateweights.cpp +++ b/test/testupdateweights.cpp @@ -19,6 +19,7 @@ #include "input/InputLayer.h" #include "layer/LayerMakers.h" #include "trainers/SGD.h" +#include "clblas/ClBlasInstance.h" #include "gtest/gtest.h" #include "test/gtest_supp.h" @@ -28,7 +29,7 @@ using namespace std; -void checkWeightsUpdate( NeuralNet *net, int targetLayerIndex ) { +void checkWeightsUpdate(NeuralNet *net, int targetLayerIndex) { // here's the plan: // generate some input, randomly // generate some expected output, randomly @@ -56,43 +57,43 @@ void checkWeightsUpdate( NeuralNet *net, int targetLayerIndex ) { Layer *layer = net->getLayer(targetLayerIndex); cout << "layer " << layer->asString() << endl; - WeightRandomizer::randomize( 0, input, inputTotalSize, -1.0f, 1.0f ); - WeightRandomizer::randomize( 1, expectedOutput, outputTotalSize, -1.0f, 1.0f ); + WeightRandomizer::randomize(0, input, inputTotalSize, -1.0f, 1.0f); + WeightRandomizer::randomize(1, expectedOutput, outputTotalSize, -1.0f, 1.0f); int weightsSize = layer->getWeightsSize(); int biasSize = 
layer->getBiasSize(); cout << "weightsize=" << weightsSize << " biassize=" << biasSize << endl; float *weights = new float[weightsSize]; - WeightRandomizer::randomize( 2, weights, weightsSize, -0.1f, 0.1f ); + WeightRandomizer::randomize(2, weights, weightsSize, -0.1f, 0.1f); float *bias = 0; - if( layer->biased() ) { + if(layer->biased()) { bias = new float[biasSize]; - WeightRandomizer::randomize( 3, bias, biasSize, -0.1f, 0.1f ); + WeightRandomizer::randomize(3, bias, biasSize, -0.1f, 0.1f); } - if( weightsSize > 0 || biasSize > 0 ) { - layer->setWeights( weights, bias ); + if(weightsSize > 0 || biasSize > 0) { + layer->setWeights(weights, bias); } // now, forward prop - net->forward( input ); + net->forward(input); net->print(); // net->printOutput(); // calculate loss - float lossBefore = net->calcLoss( expectedOutput ); + float lossBefore = net->calcLoss(expectedOutput); // calculate gradInput // should be zero, so we dont modify the weights // otherwise the losses will be really strange :-) // temporarily putting 1.0f, because of the way this works currently... - net->backward( expectedOutput); + net->backward(expectedOutput); // modify input slightly mt19937 random; const int numSamples = 10; - for( int i = 0; i < numSamples; i++ ) { + for(int i = 0; i < numSamples; i++) { int weightIndex; - WeightRandomizer::randomizeInts( i, &weightIndex, 1, 0, weightsSize ); + WeightRandomizer::randomizeInts(i, &weightIndex, 1, 0, weightsSize); // cout << "i=" << i << " index " << inputIndex << endl; float oldValue = weights[weightIndex]; // grad for this index is.... @@ -103,14 +104,14 @@ void checkWeightsUpdate( NeuralNet *net, int targetLayerIndex ) { float inputDelta = newValue - oldValue; float predictedLossChange = inputDelta * grad; weights[weightIndex] = newValue; - layer->setWeights( weights, bias ); + layer->setWeights(weights, bias); // cout << "oldvalue=" << oldValue << " newvalue=" << newValue << endl; // forwardProp - net->forward( input ); + net->forward(input); weights[weightIndex] = oldValue; - layer->setWeights( weights, bias ); + layer->setWeights(weights, bias); // net->printOutput(); - float lossAfter = net->calcLoss( expectedOutput ); + float lossAfter = net->calcLoss(expectedOutput); float lossChange = lossAfter - lossBefore; cout << "idx=" << weightIndex << " predicted losschange=" << predictedLossChange << " actual=" << lossChange << endl; } @@ -121,72 +122,74 @@ void checkWeightsUpdate( NeuralNet *net, int targetLayerIndex ) { delete[] input; } -TEST( testupdateweights, conv1 ) { +TEST(testupdateweights, conv1) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - NeuralNet *net = new NeuralNet( cl, 2, 5 ); - net->addLayer( ConvolutionalMaker::instance()->numFilters(2)->filterSize(3)->biased(0)->padZeros(0) ); - net->addLayer( SquareLossMaker::instance() ); + ClBlasInstance blasInstance; + NeuralNet *net = new NeuralNet(cl, 2, 5); + net->addLayer(ConvolutionalMaker::instance()->numFilters(2)->filterSize(3)->biased(0)->padZeros(0)); + net->addLayer(SquareLossMaker::instance()); cout << net->asString() << endl; net->setBatchSize(4); - checkWeightsUpdate( net, 1 ); + checkWeightsUpdate(net, 1); delete net; delete cl; } -TEST( testupdateweights, conv1z ) { +TEST(testupdateweights, conv1z) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - NeuralNet *net = new NeuralNet( cl, 2, 3 ); - net->addLayer( ConvolutionalMaker::instance()->numFilters(2)->filterSize(3)->biased(0)->padZeros(1) ); - net->addLayer( SquareLossMaker::instance() ); + ClBlasInstance blasInstance; 
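// ---- editor's aside, not part of the patch: checkWeightsUpdate above is a
// finite-difference gradient check: perturb one weight by delta and compare the
// actual loss change against the first-order prediction grad * delta. A tiny
// standalone demonstration of that idea on a 1-D quadratic loss:
#include <iostream>
int main() {
    auto loss = [](float w) { return (w - 1.0f) * (w - 1.0f); };
    float w = 0.3f;
    float grad = 2.0f * (w - 1.0f); // analytic dLoss/dw
    float delta = 1e-3f;
    float predicted = grad * delta; // first-order Taylor prediction
    float actual = loss(w + delta) - loss(w);
    // the two agree to O(delta^2); a large mismatch would indicate a gradient bug
    std::cout << "predicted=" << predicted << " actual=" << actual << std::endl;
    return 0;
}
// ---- end aside ----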
+ NeuralNet *net = new NeuralNet(cl, 2, 3); + net->addLayer(ConvolutionalMaker::instance()->numFilters(2)->filterSize(3)->biased(0)->padZeros(1)); + net->addLayer(SquareLossMaker::instance()); cout << net->asString() << endl; net->setBatchSize(4); - checkWeightsUpdate( net, 1 ); + checkWeightsUpdate(net, 1); delete net; delete cl; } -void test( int imageSize, int filterSize, int numPlanes, int batchSize ) { +void test(int imageSize, int filterSize, int numPlanes, int batchSize) { float learningRate = 0.01f; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); NeuralNet *net = NeuralNet::maker(cl)->instance(); - net->addLayer( InputLayerMaker::instance()->numPlanes(numPlanes)->imageSize(imageSize) ); - net->addLayer( ConvolutionalMaker::instance()->numFilters(1)->filterSize(filterSize)->biased(0) ); - net->addLayer( ActivationMaker::instance()->tanh() ); - net->addLayer( SquareLossMaker::instance() );; - net->setBatchSize( batchSize ); - - int inputSize = net->getLayer(0)->getOutputSize(); - int outputSize = net->getLayer(1)->getOutputSize(); + net->addLayer(InputLayerMaker::instance()->numPlanes(numPlanes)->imageSize(imageSize)); + net->addLayer(ConvolutionalMaker::instance()->numFilters(1)->filterSize(filterSize)->biased(0)); + net->addLayer(ActivationMaker::instance()->tanh()); + net->addLayer(SquareLossMaker::instance());; + net->setBatchSize(batchSize); + + int inputNumElements = net->getLayer(0)->getOutputNumElements(); + int outputNumElements = net->getLayer(1)->getOutputNumElements(); int weightsSize = net->getLayer(1)->getWeightsSize(); - float *inputData = new float[max(10000, inputSize )]; - float *expectedOutput = new float[max(10000, outputSize )]; - memset( inputData, 0, sizeof(float) * max(10000, inputSize ) ); - memset( expectedOutput, 0, sizeof(float) * max(10000, outputSize ) ); - std::mt19937 random = WeightRandomizer::randomize( inputData, max(10000, inputSize ), -1.0f, 1.0f ); - WeightRandomizer::randomize( random, expectedOutput, max(10000, outputSize ), -1.0f, 1.0f ); - WeightRandomizer::randomize( random, net->getLayer(1)->getWeights(), weightsSize, -0.1f, 0.1f ); + float *inputData = new float[max(10000, inputNumElements)]; + float *expectedOutput = new float[max(10000, outputNumElements)]; + memset(inputData, 0, sizeof(float) * max(10000, inputNumElements)); + memset(expectedOutput, 0, sizeof(float) * max(10000, outputNumElements)); + std::mt19937 random = WeightRandomizer::randomize(inputData, max(10000, inputNumElements), -1.0f, 1.0f); + WeightRandomizer::randomize(random, expectedOutput, max(10000, outputNumElements), -1.0f, 1.0f); + WeightRandomizer::randomize(random, net->getLayer(1)->getWeights(), weightsSize, -0.1f, 0.1f); dynamic_cast(net->getLayer(1))->weightsWrapper->copyToDevice(); float *weightsBefore = new float[weightsSize]; float const*currentWeights = net->getLayer(1)->getWeights(); - for( int i = 0; i < weightsSize; i++ ) { + for(int i = 0; i < weightsSize; i++) { weightsBefore[i] = currentWeights[i]; } - net->forward( inputData ); + net->forward(inputData); float loss = net->calcLoss(expectedOutput); net->print(); - SGD *sgd = SGD::instance( cl, learningRate, 0.0f ); + SGD *sgd = SGD::instance(cl, learningRate, 0.0f); TrainingContext context(0, 0); - sgd->train( net, &context, inputData, expectedOutput ); - net->forward( inputData ); + sgd->train(net, &context, inputData, expectedOutput); + net->forward(inputData); net->print(); float loss2 = net->calcLoss(expectedOutput); float lossChange = loss - loss2; @@ -196,7 +199,7 @@ void test( int 
imageSize, int filterSize, int numPlanes, int batchSize ) { float const*newWeights = net->getLayer(1)->getWeights(); float sumWeightDiff = 0; float sumWeightDiffSquared = 0; - for( int i = 0; i < weightsSize; i++ ) { + for(int i = 0; i < weightsSize; i++) { float diff = newWeights[i] - weightsBefore[i]; sumWeightDiff += diff; sumWeightDiffSquared += diff * diff; @@ -207,8 +210,8 @@ void test( int imageSize, int filterSize, int numPlanes, int batchSize ) { cout << " loss change " << lossChange << endl; cout << " estimatedLossChangeFromW " << estimatedLossChangeFromW << endl; - EXPECT_GT( 0.01f * imageSize * imageSize, abs(estimatedLossChangeFromW - lossChange ) / lossChange ); - EXPECT_GT( 0.01f * imageSize * imageSize, abs(estimatedLossChangeFromW - lossChange ) / estimatedLossChangeFromW ); + EXPECT_GT(0.01f * imageSize * imageSize, abs(estimatedLossChangeFromW - lossChange) / lossChange); + EXPECT_GT(0.01f * imageSize * imageSize, abs(estimatedLossChangeFromW - lossChange) / estimatedLossChangeFromW); delete[] weightsBefore; delete sgd; @@ -218,91 +221,91 @@ void test( int imageSize, int filterSize, int numPlanes, int batchSize ) { delete cl; } -TEST( testupdateweights, numericallytest ) { +TEST(testupdateweights, numericallytest) { // do one learning, with very small learning rate, and check that loss function changed by // the amount that we kind of expect - test(1, 1, 1, 1 ); + test(1, 1, 1, 1); } -TEST( testupdateweights, numericallytest_imagesize3 ) { +TEST(testupdateweights, numericallytest_imagesize3) { // do one learning, with very small learning rate, and check that loss function changed by // the amount that we kind of expect - test(3, 1, 1, 1 ); + test(3, 1, 1, 1); } -TEST( testupdateweights, numericallytest_imagesize5 ) { +TEST(testupdateweights, numericallytest_imagesize5) { // do one learning, with very small learning rate, and check that loss function changed by // the amount that we kind of expect - test(5, 1, 1, 1 ); + test(5, 1, 1, 1); } -TEST( testupdateweights, numericallytest_imagesize9 ) { +TEST(testupdateweights, numericallytest_imagesize9) { // do one learning, with very small learning rate, and check that loss function changed by // the amount that we kind of expect - test(9, 1, 1, 1 ); + test(9, 1, 1, 1); } -TEST( testupdateweights, numericallytest_imagesize9_filtersize9 ) { +TEST(testupdateweights, numericallytest_imagesize9_filtersize9) { // do one learning, with very small learning rate, and check that loss function changed by // the amount that we kind of expect - test(9, 9, 1, 1 ); + test(9, 9, 1, 1); } -TEST( testupdateweights, numericallytest_imagesize9_filtersize3 ) { +TEST(testupdateweights, numericallytest_imagesize9_filtersize3) { // do one learning, with very small learning rate, and check that loss function changed by // the amount that we kind of expect - test(9, 3, 1, 1 ); + test(9, 3, 1, 1); } -TEST( testupdateweights, numericallytest_imagesize3_filtersize3 ) { +TEST(testupdateweights, numericallytest_imagesize3_filtersize3) { // do one learning, with very small learning rate, and check that loss function changed by // the amount that we kind of expect - test(3, 3, 1, 1 ); + test(3, 3, 1, 1); } -TEST( testupdateweights, numericallytest_imagesize5_filtersize3 ) { +TEST(testupdateweights, numericallytest_imagesize5_filtersize3) { // do one learning, with very small learning rate, and check that loss function changed by // the amount that we kind of expect - test(5, 3, 1, 1 ); + test(5, 3, 1, 1); } -TEST( testupdateweights, 
numericallytest_imagesize5_filtersize3_batchsize3 ) { +TEST(testupdateweights, numericallytest_imagesize5_filtersize3_batchsize3) { // do one learning, with very small learning rate, and check that loss function changed by // the amount that we kind of expect - test(5, 3, 1, 3 ); + test(5, 3, 1, 3); } -TEST( testupdateweights, numericallytest_imagesize5_filtersize3_planes3 ) { +TEST(testupdateweights, numericallytest_imagesize5_filtersize3_planes3) { // do one learning, with very small learning rate, and check that loss function changed by // the amount that we kind of expect - test(5, 3, 3, 1 ); + test(5, 3, 3, 1); } -TEST( testupdateweights, numericallytest_imagesize5_filtersize3_planes3_batchsize3 ) { +TEST(testupdateweights, numericallytest_imagesize5_filtersize3_planes3_batchsize3) { // do one learning, with very small learning rate, and check that loss function changed by // the amount that we kind of expect - test(5, 3, 3, 3 ); + test(5, 3, 3, 3); } -void testBackpropWeights( LayerDimensions &dim, int batchSize, float learningMultiplier, float *data, float *errors, float * expectedOutput ) { +void testBackpropWeights(LayerDimensions &dim, int batchSize, float learningMultiplier, float *data, float *errors, float * expectedOutput) { float *output = new float[batchSize * dim.outputCubeSize]; // ignored, for LINEAR float *weights = new float[max(dim.filtersSize,20)]; float *bias = new float[10]; - memset( weights, 0, sizeof( float ) * max( dim.filtersSize, 20 ) ); - memset( bias, 0, sizeof(float) * 10 ); + memset(weights, 0, sizeof(float) * max(dim.filtersSize, 20)); + memset(bias, 0, sizeof(float) * 10); EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - BackpropWeights *backpropWeightsImpl = BackpropWeights::instanceForTest( cl, dim ); - backpropWeightsImpl->calcGradWeights( batchSize, errors, data, weights, bias ); + BackpropWeights *backpropWeightsImpl = BackpropWeights::instanceForTest(cl, dim); + backpropWeightsImpl->calcGradWeights(batchSize, errors, data, weights, bias); delete backpropWeightsImpl; -// for( int i = 0; i < 20; i++ ) { +// for(int i = 0; i < 20; i++) { // cout << "weights[" << i << "]=" << weights[i] << endl; // } - for( int i = 0; i < dim.filtersSize; i++ ) { - if( expectedOutput[i] != -999 && expectedOutput[i] != weights[i] ) { + for(int i = 0; i < dim.filtersSize; i++) { + if(expectedOutput[i] != -999 && expectedOutput[i] != weights[i]) { cout << "mismatch for i " << i << endl; - EXPECT_EQ( - expectedOutput[i], weights[i] ); + EXPECT_EQ(- expectedOutput[i], weights[i]); } } delete[] output; @@ -311,10 +314,10 @@ void testBackpropWeights( LayerDimensions &dim, int batchSize, float learningMul delete cl; } -TEST( testupdateweights, backprop_weights_2 ) { +TEST(testupdateweights, backprop_weights_2) { LayerDimensions dim; - dim.setInputImageSize( 1 ).setInputPlanes( 1 ).setNumFilters( 1 ).setFilterSize( 1 ) - .setBiased( 0 ).setPadZeros( 0 ); + dim.setInputSize(1).setInputPlanes(1).setNumFilters(1).setFilterSize(1) + .setBiased(0).setPadZeros(0); const int batchSize = 1; const float learningMultiplier = 1; @@ -322,14 +325,14 @@ TEST( testupdateweights, backprop_weights_2 ) { float data[] = { 3.0f }; float errors[] = { 7.0f }; float expectedOutput[] = { - 3 * 7 }; - testBackpropWeights( dim, batchSize, learningMultiplier, data, errors, expectedOutput ); + testBackpropWeights(dim, batchSize, learningMultiplier, data, errors, expectedOutput); } -TEST( testupdateweights, backprop_weights_2_upstreamimagesize2 ) { +TEST(testupdateweights, 
backprop_weights_2_upstreamimagesize2) { LayerDimensions dim; - dim.setInputImageSize( 2 ).setInputPlanes( 1 ).setNumFilters( 1 ).setFilterSize( 1 ) - .setBiased( 0 ).setPadZeros( 0 ); + dim.setInputSize(2).setInputPlanes(1).setNumFilters(1).setFilterSize(1) + .setBiased(0).setPadZeros(0); int batchSize = 1; const float learningMultiplier = 1; @@ -340,13 +343,13 @@ TEST( testupdateweights, backprop_weights_2_upstreamimagesize2 ) { float expectedOutput[] = { -3 * 7 - 13 * 2 // -191 -17*4 -19*4 }; // - testBackpropWeights( dim, batchSize, learningMultiplier, data, DerivLossBySum, expectedOutput ); + testBackpropWeights(dim, batchSize, learningMultiplier, data, DerivLossBySum, expectedOutput); } -TEST( testupdateweights, backprop_weights_2_upstreamimagesize3_filtersize3 ) { +TEST(testupdateweights, backprop_weights_2_upstreamimagesize3_filtersize3) { LayerDimensions dim; - dim.setInputImageSize( 3 ).setInputPlanes( 1 ).setNumFilters( 1 ).setFilterSize( 3 ) - .setBiased( 0 ).setPadZeros( 0 ); + dim.setInputSize(3).setInputPlanes(1).setNumFilters(1).setFilterSize(3) + .setBiased(0).setPadZeros(0); int batchSize = 1; const float learningMultiplier = 1; @@ -358,13 +361,13 @@ TEST( testupdateweights, backprop_weights_2_upstreamimagesize3_filtersize3 ) { -7 * 17, - 7 * 19, 7 * 3, // -119, 133, 21 - 7 * 2, 7 * 4, - 7 * 7 }; // -14, 28, -49 - testBackpropWeights( dim, batchSize, learningMultiplier, data, errors, expectedOutput ); + testBackpropWeights(dim, batchSize, learningMultiplier, data, errors, expectedOutput); } -TEST( testupdateweights, backprop_weights_2_upstreamimagesize4_filtersize3 ) { +TEST(testupdateweights, backprop_weights_2_upstreamimagesize4_filtersize3) { LayerDimensions dim; - dim.setInputImageSize( 4 ).setInputPlanes( 1 ).setNumFilters( 1 ).setFilterSize( 3 ) - .setBiased( 0 ).setPadZeros( 0 ); + dim.setInputSize(4).setInputPlanes(1).setNumFilters(1).setFilterSize(3) + .setBiased(0).setPadZeros(0); int batchSize = 1; const float learningMultiplier = 1; @@ -378,13 +381,13 @@ TEST( testupdateweights, backprop_weights_2_upstreamimagesize4_filtersize3 ) { -999, -999, -999, -999, -999, -49+27 }; // -22 - testBackpropWeights( dim, batchSize, learningMultiplier, data, errors, expectedOutput ); + testBackpropWeights(dim, batchSize, learningMultiplier, data, errors, expectedOutput); } -TEST( testupdateweights, backprop_weights_2_upstreamimagesize5_filtersize3 ) { +TEST(testupdateweights, backprop_weights_2_upstreamimagesize5_filtersize3) { LayerDimensions dim; - dim.setInputImageSize( 5 ).setInputPlanes( 1 ).setNumFilters( 1 ).setFilterSize( 3 ) - .setBiased( 0 ).setPadZeros( 0 ); + dim.setInputSize(5).setInputPlanes(1).setNumFilters(1).setFilterSize(3) + .setBiased(0).setPadZeros(0); int batchSize = 1; const float learningMultiplier = 1; @@ -399,153 +402,153 @@ TEST( testupdateweights, backprop_weights_2_upstreamimagesize5_filtersize3 ) { float expectedOutput[] = { -(3*7+13*2-1*5+0*17-3*19-1*3+2*2+1*4+0*7), -999, -999 , // 10 -999, -(19*7-3*2-2*1+ 0-3*7+0*1 +2*6-1*8+0), -999, -999, -999, -(7*7+0+2*1 +0-3*9+1*4 +5*2-1*3+0) }; // -22 - testBackpropWeights( dim, batchSize, learningMultiplier, data, errors, expectedOutput ); + testBackpropWeights(dim, batchSize, learningMultiplier, data, errors, expectedOutput); } -float *allocateInputCleared( int batchSize, LayerDimensions &dim ) { - int inputSize = batchSize * dim.inputCubeSize; - float *data = new float[ inputSize ]; - memset( data, 0, sizeof(float) * inputSize ); +float *allocateInputCleared(int batchSize, LayerDimensions &dim) { + int 
inputNumElements = batchSize * dim.inputCubeSize; + float *data = new float[ inputNumElements ]; + memset(data, 0, sizeof(float) * inputNumElements); return data; } -float *allocateErrorsCleared( int batchSize, LayerDimensions &dim ) { - int outputSize = batchSize * dim.outputCubeSize; - float *errors = new float[ outputSize ]; - memset( errors, 0, sizeof(float) * outputSize ); +float *allocateErrorsCleared(int batchSize, LayerDimensions &dim) { + int outputNumElements = batchSize * dim.outputCubeSize; + float *errors = new float[ outputNumElements ]; + memset(errors, 0, sizeof(float) * outputNumElements); return errors; } -TEST( testupdateweights, backprop_weights_2_upstreamimagesize3_filtersize1 ) { +TEST(testupdateweights, backprop_weights_2_upstreamimagesize3_filtersize1) { LayerDimensions dim; - dim.setInputImageSize( 3 ).setInputPlanes( 1 ).setNumFilters( 1 ).setFilterSize( 1 ) - .setBiased( 0 ).setPadZeros( 0 ); + dim.setInputSize(3).setInputPlanes(1).setNumFilters(1).setFilterSize(1) + .setBiased(0).setPadZeros(0); int batchSize = 1; const float learningMultiplier = 1; - float *data = allocateInputCleared( batchSize, dim ); + float *data = allocateInputCleared(batchSize, dim); data[0] = 2; - data[1 * dim.inputImageSize + 1] = 7; - data[2 * dim.inputImageSize + 2] = 5; + data[1 * dim.inputSize + 1] = 7; + data[2 * dim.inputSize + 2] = 5; - float *errors = allocateErrorsCleared( batchSize, dim ); + float *errors = allocateErrorsCleared(batchSize, dim); errors[0] = 5; - errors[1 * dim.outputImageSize + 1] = 11; - errors[2 * dim.outputImageSize + 2] = 3; + errors[1 * dim.outputSize + 1] = 11; + errors[2 * dim.outputSize + 2] = 3; - float expectedOutput[] = { -(2 * 5 + 5 * 3 + 7 * 11 ) }; // + float expectedOutput[] = { -(2 * 5 + 5 * 3 + 7 * 11) }; // - testBackpropWeights( dim, batchSize, learningMultiplier, data, errors, expectedOutput ); + testBackpropWeights(dim, batchSize, learningMultiplier, data, errors, expectedOutput); } -TEST( testupdateweights, backprop_weights_2_upstreamimagesize16_filtersize1 ) { +TEST(testupdateweights, backprop_weights_2_upstreamimagesize16_filtersize1) { LayerDimensions dim; - dim.setInputImageSize( 16 ).setInputPlanes( 1 ).setNumFilters( 1 ).setFilterSize( 1 ) - .setBiased( 0 ).setPadZeros( 0 ); + dim.setInputSize(16).setInputPlanes(1).setNumFilters(1).setFilterSize(1) + .setBiased(0).setPadZeros(0); int batchSize = 1; const float learningMultiplier = 1; - float *data = allocateInputCleared( batchSize, dim ); + float *data = allocateInputCleared(batchSize, dim); data[0] = 2; - data[15 * dim.inputImageSize + 15] = 5; + data[15 * dim.inputSize + 15] = 5; - float *errors = allocateErrorsCleared( batchSize, dim ); + float *errors = allocateErrorsCleared(batchSize, dim); errors[0] = 4; - errors[15 * dim.outputImageSize + 15] = 3; + errors[15 * dim.outputSize + 15] = 3; - float expectedOutput[] = { -(2 * 4 + 3 * 5 ) }; // + float expectedOutput[] = { -(2 * 4 + 3 * 5) }; // - testBackpropWeights( dim, batchSize, learningMultiplier, data, errors, expectedOutput ); + testBackpropWeights(dim, batchSize, learningMultiplier, data, errors, expectedOutput); } -TEST( testupdateweights, backprop_weights_2_upstreamimagesize17_filtersize1 ) { +TEST(testupdateweights, backprop_weights_2_upstreamimagesize17_filtersize1) { LayerDimensions dim; - dim.setInputImageSize( 17 ).setInputPlanes( 1 ).setNumFilters( 1 ).setFilterSize( 1 ) - .setBiased( 0 ).setPadZeros( 0 ); + dim.setInputSize(17).setInputPlanes(1).setNumFilters(1).setFilterSize(1) + .setBiased(0).setPadZeros(0); int 
batchSize = 1; const float learningMultiplier = 1; cout << dim << endl; - float *data = allocateInputCleared( batchSize, dim ); + float *data = allocateInputCleared(batchSize, dim); data[0] = 2; data[1] = 3.2f; data[2] = 1.234f; - data[16 * dim.inputImageSize + 16] = 5; + data[16 * dim.inputSize + 16] = 5; - float *errors = allocateErrorsCleared( batchSize, dim ); + float *errors = allocateErrorsCleared(batchSize, dim); errors[0] = 4; errors[1] = -2.5f; errors[2] = 4.125f; - errors[16 * dim.outputImageSize + 16] = 3; + errors[16 * dim.outputSize + 16] = 3; - float expectedOutput[] = { -( 4*2 - 3.2f * 2.5f + 1.234f * 4.125f + 3*5 ) }; // + float expectedOutput[] = { -(4*2 - 3.2f * 2.5f + 1.234f * 4.125f + 3*5) }; // - testBackpropWeights( dim, batchSize, learningMultiplier, data, errors, expectedOutput ); + testBackpropWeights(dim, batchSize, learningMultiplier, data, errors, expectedOutput); } -TEST( testupdateweights, backprop_weights_2_upstreamimagesize17_filtersize1_moredata ) { +TEST(testupdateweights, backprop_weights_2_upstreamimagesize17_filtersize1_moredata) { LayerDimensions dim; - dim.setInputImageSize( 17 ).setInputPlanes( 1 ).setNumFilters( 1 ).setFilterSize( 1 ) - .setBiased( 0 ).setPadZeros( 0 ); + dim.setInputSize(17).setInputPlanes(1).setNumFilters(1).setFilterSize(1) + .setBiased(0).setPadZeros(0); int batchSize = 1; const float learningMultiplier = 1; - float *data = allocateInputCleared( batchSize, dim ); - for( int i = 0; i < square( dim.inputImageSize ); i++ ) { - data[i] = ( ( 1 + i ) % 20 ) / 5.3f; + float *data = allocateInputCleared(batchSize, dim); + for(int i = 0; i < square(dim.inputSize); i++) { + data[i] = ((1 + i) % 20) / 5.3f; } - float *errors = allocateErrorsCleared( batchSize, dim ); - for( int i = 0; i < square( dim.outputImageSize ); i++ ) { - errors[i] = ( ( 2 + i ) % 17 ) / 4.2f; + float *errors = allocateErrorsCleared(batchSize, dim); + for(int i = 0; i < square(dim.outputSize); i++) { + errors[i] = ((2 + i) % 17) / 4.2f; } float expectedOutput[1]; expectedOutput[0] = 0; - for ( int i = 0; i < square( dim.inputImageSize ); i++ ) { + for (int i = 0; i < square(dim.inputSize); i++) { expectedOutput[0] += - data[i] * errors[i]; } cout << "expectedresult: " << expectedOutput[0] << endl; - testBackpropWeights( dim, batchSize, learningMultiplier, data, errors, expectedOutput ); + testBackpropWeights(dim, batchSize, learningMultiplier, data, errors, expectedOutput); } -TEST( testupdateweights, backprop_instance3_smaller2 ) { +TEST(testupdateweights, backprop_instance3_smaller2) { LayerDimensions dim; - dim.setInputImageSize( 96 ).setInputPlanes( 1 ).setNumFilters( 1 ).setFilterSize( 6 ) - .setBiased( 0 ).setPadZeros( 0 ); + dim.setInputSize(96).setInputPlanes(1).setNumFilters(1).setFilterSize(6) + .setBiased(0).setPadZeros(0); int batchSize = 1; // const float learningRate = 1; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); - int outputSize = batchSize * dim.outputCubeSize; - int inputSize = batchSize * dim.inputCubeSize; + int outputNumElements = batchSize * dim.outputCubeSize; + int inputNumElements = batchSize * dim.inputCubeSize; int weightsSize = dim.filtersSize; // int biasSize = dim.numFilters; cout << "numweights: " << weightsSize << endl; - float *errors = new float[max(10000, outputSize )]; - float *inputData = new float[max(10000, inputSize )]; - float *weights0 = new float[max(10000, weightsSize ) ]; - float *weights1 = new float[max(10000, weightsSize ) ]; + float *errors = new float[max(10000, outputNumElements)]; + float *inputData = 
new float[max(10000, inputNumElements)]; + float *weights0 = new float[max(10000, weightsSize) ]; + float *weights1 = new float[max(10000, weightsSize) ]; - memset( errors, 0, sizeof(float) * max(10000, outputSize ) ); - memset( inputData, 0, sizeof(float) * max(10000, inputSize ) ); - memset( weights0, 0, sizeof(float) * max(10000, weightsSize ) ); - memset( weights1, 0, sizeof(float) * max(10000, weightsSize ) ); + memset(errors, 0, sizeof(float) * max(10000, outputNumElements)); + memset(inputData, 0, sizeof(float) * max(10000, inputNumElements)); + memset(weights0, 0, sizeof(float) * max(10000, weightsSize)); + memset(weights1, 0, sizeof(float) * max(10000, weightsSize)); - CLWrapper *errorsWrap = cl->wrap( 10000, errors ); - CLWrapper *inputWrap = cl->wrap( 10000, inputData ); - CLWrapper *weights0Wrap = cl->wrap( 10000, weights0 ); - CLWrapper *weights1Wrap = cl->wrap( 10000, weights1 ); + CLWrapper *errorsWrap = cl->wrap(10000, errors); + CLWrapper *inputWrap = cl->wrap(10000, inputData); + CLWrapper *weights0Wrap = cl->wrap(10000, weights0); + CLWrapper *weights1Wrap = cl->wrap(10000, weights1); - for( int i = 0 * dim.inputImageSize; i < dim.inputImageSize * dim.inputImageSize; i+= dim.inputImageSize * 4 ) { + for(int i = 0 * dim.inputSize; i < dim.inputSize * dim.inputSize; i+= dim.inputSize * 4) { inputData[i] = 3; } - for( int i = 0; i < dim.outputImageSize * dim.outputImageSize; i+= dim.outputImageSize ) { + for(int i = 0; i < dim.outputSize * dim.outputSize; i+= dim.outputSize) { errors[i] = 2; } @@ -554,24 +557,24 @@ TEST( testupdateweights, backprop_instance3_smaller2 ) { weights0Wrap->copyToDevice(); weights1Wrap->copyToDevice(); - BackpropWeights *backpropWeightsImpl0 = BackpropWeights::instanceSpecific( 0, cl, dim ); + BackpropWeights *backpropWeightsImpl0 = BackpropWeights::instanceSpecific(0, cl, dim); backpropWeightsImpl0->debug = true; - backpropWeightsImpl0->calcGradWeights( batchSize, errorsWrap, inputWrap, weights0Wrap, 0 ); - BackpropWeights *backpropWeightsImpl1 = BackpropWeights::instanceSpecific( 3, cl, dim ); + backpropWeightsImpl0->calcGradWeights(batchSize, errorsWrap, inputWrap, weights0Wrap, 0); + BackpropWeights *backpropWeightsImpl1 = BackpropWeights::instanceSpecific(3, cl, dim); backpropWeightsImpl1->debug = true; - backpropWeightsImpl1->calcGradWeights( batchSize, errorsWrap, inputWrap, weights1Wrap, 0 ); + backpropWeightsImpl1->calcGradWeights(batchSize, errorsWrap, inputWrap, weights1Wrap, 0); weights0Wrap->copyToHost(); weights1Wrap->copyToHost(); - for( int i = 0; i < 6; i++ ) { - for( int j = 0; j < 6; j++ ) { + for(int i = 0; i < 6; i++) { + for(int j = 0; j < 6; j++) { cout << weights0[i*6+j] << " "; } cout << endl; } cout << endl; - for( int i = 0; i < 6; i++ ) { - for( int j = 0; j < 6; j++ ) { + for(int i = 0; i < 6; i++) { + for(int j = 0; j < 6; j++) { cout << weights1[i*6+j] << " "; } cout << endl; @@ -579,9 +582,9 @@ TEST( testupdateweights, backprop_instance3_smaller2 ) { cout << endl; int isok = 1; - for( int i = 0; i < 6; i++ ) { - for( int j = 0; j < 6; j++ ) { - if( weights0[i*6+j] == weights1[i*6+j] ) { + for(int i = 0; i < 6; i++) { + for(int j = 0; j < 6; j++) { + if(weights0[i*6+j] == weights1[i*6+j]) { cout << "."; } else { cout << "!"; @@ -591,20 +594,20 @@ TEST( testupdateweights, backprop_instance3_smaller2 ) { cout << endl; } cout << endl; - EXPECT_EQ( 1, isok ); + EXPECT_EQ(1, isok); - for( int i = 0; i < 12; i++ ) { + for(int i = 0; i < 12; i++) { cout << i << "="; - for( int slice = 0; slice < 8; slice++ ) { + for(int 
slice = 0; slice < 8; slice++) { cout << weights1[100+ 12 * slice + i] << " "; } cout << endl; } cout << endl; - for( int i = 0; i < 20; i++ ) { + for(int i = 0; i < 20; i++) { cout << i << "="; - for( int slice = 0; slice < 8; slice++ ) { + for(int slice = 0; slice < 8; slice++) { cout << weights1[200+ 20 * slice + i] << " "; } cout << endl; @@ -628,14 +631,14 @@ class CompareSpecificArgs { // [[[cog // floats= [] - // ints = [ 'inputPlanes', 'inputImageSize', 'numFilters', 'filterSize', + // ints = [ 'inputPlanes', 'inputSize', 'numFilters', 'filterSize', // 'batchSize', 'biased', 'padZeros', 'instance0', 'instance1' ] // import cog_fluent - // cog_fluent.gov3( 'CompareSpecificArgs', ints = ints, floats = floats ) + // cog_fluent.gov3('CompareSpecificArgs', ints = ints, floats = floats) // ]]] // generated, using cog: int _inputPlanes; - int _inputImageSize; + int _inputSize; int _numFilters; int _filterSize; int _batchSize; @@ -645,7 +648,7 @@ class CompareSpecificArgs { int _instance1; CompareSpecificArgs() { _inputPlanes = 0; - _inputImageSize = 0; + _inputSize = 0; _numFilters = 0; _filterSize = 0; _batchSize = 0; @@ -658,8 +661,8 @@ class CompareSpecificArgs { this->_inputPlanes = _inputPlanes; return *this; } - CompareSpecificArgs inputImageSize( int _inputImageSize ) { - this->_inputImageSize = _inputImageSize; + CompareSpecificArgs inputSize( int _inputSize ) { + this->_inputSize = _inputSize; return *this; } CompareSpecificArgs numFilters( int _numFilters ) { @@ -695,41 +698,41 @@ class CompareSpecificArgs { namespace testupdateweights { -void compareSpecific( bool debug, float learningRate, int its, int batchSize, LayerDimensions dim, int instance0, int instance1 ) { +void compareSpecific(bool debug, float learningRate, int its, int batchSize, LayerDimensions dim, int instance0, int instance1) { cout << dim << endl; - int outputSize = batchSize * dim.outputCubeSize; - int inputSize = batchSize * dim.inputCubeSize; + int outputNumElements = batchSize * dim.outputCubeSize; + int inputNumElements = batchSize * dim.inputCubeSize; int weightsSize = dim.filtersSize; int biasSize = dim.numFilters; - int outputAllocated = max( 10000, outputSize ); - int inputAllocated = max( 10000, inputSize ); - int weightsAllocated = max( 10000, weightsSize ); - int biasAllocated = max( 10000, biasSize ); + int outputAllocated = max(10000, outputNumElements); + int inputAllocated = max(10000, inputNumElements); + int weightsAllocated = max(10000, weightsSize); + int biasAllocated = max(10000, biasSize); // cout << "numweights: " << weightsSize << endl; float *bias1 = new float[ biasAllocated ]; float *bias2 = new float[ biasAllocated ]; - memset( bias1, 0, sizeof(float) * biasAllocated ); - memset( bias2, 0, sizeof(float) * biasAllocated ); + memset(bias1, 0, sizeof(float) * biasAllocated); + memset(bias2, 0, sizeof(float) * biasAllocated); float *gradOutput = new float[outputAllocated]; float *inputData = new float[inputAllocated]; float *weights1 = new float[weightsAllocated]; float *weights2 = new float[weightsAllocated]; - memset( gradOutput, 0, sizeof(float) * outputAllocated ); - memset( inputData, 0, sizeof(float) * inputAllocated ); - memset( weights1, 0, sizeof(float) * weightsAllocated ); - memset( weights2, 0, sizeof(float) * weightsAllocated ); + memset(gradOutput, 0, sizeof(float) * outputAllocated); + memset(inputData, 0, sizeof(float) * inputAllocated); + memset(weights1, 0, sizeof(float) * weightsAllocated); + memset(weights2, 0, sizeof(float) * weightsAllocated); - 
WeightRandomizer::randomize( gradOutput, outputAllocated, -0.1f, 0.1f ); - WeightRandomizer::randomize( inputData, inputAllocated, -0.3f, 0.7f ); + WeightRandomizer::randomize(gradOutput, outputAllocated, -0.1f, 0.1f); + WeightRandomizer::randomize(inputData, inputAllocated, -0.3f, 0.7f); -// WeightRandomizer::randomizeInts( errors, outputAllocated, 0, 99 ); -// WeightRandomizer::randomizeInts( inputData, inputAllocated, 0, 99 ); +// WeightRandomizer::randomizeInts(errors, outputAllocated, 0, 99); +// WeightRandomizer::randomizeInts(inputData, inputAllocated, 0, 99); EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); @@ -743,71 +746,72 @@ void compareSpecific( bool debug, float learningRate, int its, int batchSize, La biasByInstance[0] = bias1; biasByInstance[1] = bias2; BackpropWeights *instanceObjects[2]; - instanceObjects[0] = BackpropWeights::instanceSpecific( instance0, cl, dim ); - instanceObjects[1] = BackpropWeights::instanceSpecific( instance1, cl, dim ); - for( int instance = 0; instance < 2; instance++ ) { + instanceObjects[0] = BackpropWeights::instanceSpecific(instance0, cl, dim); + instanceObjects[1] = BackpropWeights::instanceSpecific(instance1, cl, dim); + for(int instance = 0; instance < 2; instance++) { Timer timer; BackpropWeights *backpropWeightsImpl = instanceObjects[instance]; backpropWeightsImpl->debug = true; - for( int it = 0; it < its; it++ ) { - backpropWeightsImpl->calcGradWeights( batchSize, - gradOutput, inputData, weightsByInstance[instance], biasByInstance[instance] ); + for(int it = 0; it < its; it++) { + backpropWeightsImpl->calcGradWeights(batchSize, + gradOutput, inputData, weightsByInstance[instance], biasByInstance[instance]); } - timer.timeCheck("instance " + toString( instances[instance] ) + " backpropweights" ); + timer.timeCheck("instance " + toString(instances[instance]) + " backpropweights"); // delete backpropWeightsImpl; } delete instanceObjects[0]; delete instanceObjects[1]; cout << dim << endl; - for( int i = 0; i < 25; i++ ) { + for(int i = 0; i < 25; i++) { cout << "weights[" << i << "]=" << weights1[i] << " " << weights2[i]; - if( i < weightsSize ) { - if( abs( weights1[i] - weights2[i] ) <= abs(weights1[i]) / 10000.0f ) { - if( debug ) cout << " SAME"; + if(i < weightsSize) { + if(abs(weights1[i] - weights2[i]) <= abs(weights1[i]) / 10000.0f) { + if(debug) cout << " SAME"; } else { cout << " DIFF"; } } else { - if( debug ) cout << " "; + if(debug) cout << " "; } - if( debug ) cout << " || " << weights2[100+i] ; - if( debug ) cout << " || " << weights2[200+i] ; - if( debug ) cout << " || " << weights2[300+i] ; - if( debug ) cout << " || " << weights2[400+i] ; - if( debug ) cout << " || " << weights2[500+i] ; - if( debug ) cout << " || " << weights2[600+i] ; - if( debug ) cout << " || " << weights2[700+i] << endl; + if(debug) cout << " || " << weights2[100+i] ; + if(debug) cout << " || " << weights2[200+i] ; + if(debug) cout << " || " << weights2[300+i] ; + if(debug) cout << " || " << weights2[400+i] ; + if(debug) cout << " || " << weights2[500+i] ; + if(debug) cout << " || " << weights2[600+i] ; + if(debug) cout << " || " << weights2[700+i]; + cout << endl; } bool same = true; int errCount = 0; - for( int i = 0; i < weightsSize; i++ ) { - if( abs( weights1[i] - weights2[i] ) > 0.001 * max( abs( weights1[i] ), abs( weights2[i] ) ) ) { -// if( abs( weights1[i] - weights2[i] ) > abs(weights1[i]) / 10000.0f ) { + for(int i = 0; i < weightsSize; i++) { + if(abs(weights1[i] - weights2[i]) > 0.001 * max(abs(weights1[i]), abs(weights2[i]))) { 
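// ---- editor's aside, not part of the patch: the comparison above deliberately
// uses a relative tolerance, |a - b| <= 0.001 * max(|a|, |b|), so that large and
// small weight gradients are held to the same proportional accuracy when two
// kernel implementations are compared. The same predicate as a standalone helper
// (nearlyEqual is my name for it):
#include <algorithm>
#include <cmath>
bool nearlyEqual(float a, float b, float relTol = 0.001f) {
    // proportional, not absolute: the allowed gap scales with the larger magnitude
    return std::fabs(a - b) <= relTol * std::max(std::fabs(a), std::fabs(b));
}
// ---- end aside ----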
+// if(abs(weights1[i] - weights2[i]) > abs(weights1[i]) / 10000.0f) { cout << "DIFF: weights i " << i << " " << weights1[i] << " != " << weights2[i] << endl; same = false; errCount++; - if( errCount == 5 ) { + if(errCount == 5) { cout << " ... " << endl; break; } } } - if( dim.biased ) { + if(dim.biased) { errCount = 0; - for( int i = 0; i < biasSize; i++ ) { - if( abs( bias1[i] - bias2[i] ) > 0.001 * max( abs( bias1[i] ), abs( bias2[i] ) ) ) { - // if( abs( weights1[i] - weights2[i] ) > abs(weights1[i]) / 10000.0f ) { + for(int i = 0; i < biasSize; i++) { + if(abs(bias1[i] - bias2[i]) > 0.001 * max(abs(bias1[i]), abs(bias2[i]))) { + // if(abs(weights1[i] - weights2[i]) > abs(weights1[i]) / 10000.0f) { cout << "DIFF: bias i " << i << " " << bias1[i] << " != " << bias2[i] << endl; same = false; errCount++; - if( errCount == 5 ) { + if(errCount == 5) { cout << " etc ... " << endl; break; } } } } - EXPECT_EQ( true, same ); + EXPECT_EQ(true, same); // delete backpropWeightsImpl1; // delete backpropWeightsImpl2; @@ -820,120 +824,120 @@ void compareSpecific( bool debug, float learningRate, int its, int batchSize, La delete cl; } -TEST( SLOW_testupdateweights, compare_args ) { +TEST(SLOW_testupdateweights, compare_args) { bool debug = false; int instance0 = 1; int instance1 = 3; LayerDimensions dim; - dim.setInputImageSize( 28 ).setInputPlanes( 1 ).setNumFilters( 8 ).setFilterSize( 5 ) - .setBiased( 1 ).setPadZeros( 1 ); + dim.setInputSize(28).setInputPlanes(4).setNumFilters(8).setFilterSize(5) + .setBiased(1).setPadZeros(1); int batchSize = 4; int its = 1; // string activationName = "tanh"; float learningRate = 1.0f; - DimFromArgs::arg( &dim ); - TestArgsParser::arg( "debug", &debug ); - TestArgsParser::arg( "instance0", &instance0 ); - TestArgsParser::arg( "instance1", &instance1 ); - TestArgsParser::arg( "its", &its ); - TestArgsParser::arg( "batchsize", &batchSize ); -// TestArgsParser::arg( "activation", &activationName ); - TestArgsParser::arg( "learningrate", &learningRate ); + DimFromArgs::arg(&dim); + TestArgsParser::arg("debug", &debug); + TestArgsParser::arg("instance0", &instance0); + TestArgsParser::arg("instance1", &instance1); + TestArgsParser::arg("its", &its); + TestArgsParser::arg("batchsize", &batchSize); +// TestArgsParser::arg("activation", &activationName); + TestArgsParser::arg("learningrate", &learningRate); TestArgsParser::go(); dim.deriveOthers(); -// ActivationFunction *fn = ActivationFunction::fromName( activationName ); +// ActivationFunction *fn = ActivationFunction::fromName(activationName); - compareSpecific( debug, learningRate, its, batchSize, dim, instance0, instance1 ); + compareSpecific(debug, learningRate, its, batchSize, dim, instance0, instance1); } -// TEST( testupdateweights, compare_instance3_smaller2 ) { +// TEST(testupdateweights, compare_instance3_smaller2) { // LayerDimensions dim; -// dim.setInputImageSize( 96 ).setInputPlanes( 1 ).setNumFilters( 1 ).setFilterSize( 6 ) -// .setBiased( 0 ).setPadZeros( 0 ); +// dim.setInputSize(96).setInputPlanes(1).setNumFilters(1).setFilterSize(6) +// .setBiased(0).setPadZeros(0); // int batchSize = 1; // const float learningRate = 1; -// compareSpecific( CompareSpecificArgs::instance() -// .batchSize( 1 ).inputPlanes( 1 ).inputImageSize( 96 ).numFilters( 1 ) -// .filterSize( 6 ).biased( 0 ).padZeros( false ) -// .instance0(0).instance1(3) ); +// compareSpecific(CompareSpecificArgs::instance() +// .batchSize(1).inputPlanes(1).inputSize(96).numFilters(1) +// .filterSize(6).biased(0).padZeros(false) +// 
 
-// TEST( testupdateweights, compare_instance3_smaller2 ) {
+// TEST(testupdateweights, compare_instance3_smaller2) {
 // LayerDimensions dim;
-// dim.setInputImageSize( 96 ).setInputPlanes( 1 ).setNumFilters( 1 ).setFilterSize( 6 )
-//     .setBiased( 0 ).setPadZeros( 0 );
+// dim.setInputSize(96).setInputPlanes(1).setNumFilters(1).setFilterSize(6)
+//     .setBiased(0).setPadZeros(0);
 // int batchSize = 1;
 // const float learningRate = 1;
-// compareSpecific( CompareSpecificArgs::instance()
-//     .batchSize( 1 ).inputPlanes( 1 ).inputImageSize( 96 ).numFilters( 1 )
-//     .filterSize( 6 ).biased( 0 ).padZeros( false )
-//     .instance0(0).instance1(3) );
+// compareSpecific(CompareSpecificArgs::instance()
+//     .batchSize(1).inputPlanes(1).inputSize(96).numFilters(1)
+//     .filterSize(6).biased(0).padZeros(false)
+//     .instance0(0).instance1(3));
 // }
 
-// TEST( SLOW_testupdateweights, compare_specific ) {
-//     compareSpecific( CompareSpecificArgs::instance()
-//         .batchSize( 128 ).inputPlanes( 32 ).inputImageSize( 19 ).numFilters( 32 )
-//         .filterSize( 3 ).biased( 0 ).padZeros( false )
-//         .instance0(1).instance1(3) );
+// TEST(SLOW_testupdateweights, compare_specific) {
+//     compareSpecific(CompareSpecificArgs::instance()
+//         .batchSize(128).inputPlanes(32).inputSize(19).numFilters(32)
+//         .filterSize(3).biased(0).padZeros(false)
+//         .instance0(1).instance1(3));
 // }
 
-// TEST( SLOW_testupdateweights, compare_specific_96image ) {
-//     compareSpecific( CompareSpecificArgs::instance()
-//         .batchSize( 128 ).inputPlanes( 2 ).inputImageSize( 96 ).numFilters( 8 )
-//         .filterSize( 6 ).biased( 1 ).padZeros( false )
-//         .instance0(0).instance1(3) );
+// TEST(SLOW_testupdateweights, compare_specific_96image) {
+//     compareSpecific(CompareSpecificArgs::instance()
+//         .batchSize(128).inputPlanes(2).inputSize(96).numFilters(8)
+//         .filterSize(6).biased(1).padZeros(false)
+//         .instance0(0).instance1(3));
 // }
 
-// TEST( SLOW_testupdateweights, compare_specific_96image_smaller ) {
-//     compareSpecific( CompareSpecificArgs::instance()
-//         .batchSize( 1 ).inputPlanes( 1 ).inputImageSize( 48 ).numFilters( 1 )
-//         .filterSize( 2 ).biased( 1 ).padZeros( false )
-//         .instance0(0).instance1(3) );
+// TEST(SLOW_testupdateweights, compare_specific_96image_smaller) {
+//     compareSpecific(CompareSpecificArgs::instance()
+//         .batchSize(1).inputPlanes(1).inputSize(48).numFilters(1)
+//         .filterSize(2).biased(1).padZeros(false)
+//         .instance0(0).instance1(3));
 // }
 
-// TEST( SLOW_testupdateweights, compare_specific_96image_smaller2 ) {
-//     compareSpecific( CompareSpecificArgs::instance()
-//         .batchSize( 1 ).inputPlanes( 1 ).inputImageSize( 96 ).numFilters( 1 )
-//         .filterSize( 4 ).biased( 0 ).padZeros( false )
-//         .instance0(0).instance1(3) );
+// TEST(SLOW_testupdateweights, compare_specific_96image_smaller2) {
+//     compareSpecific(CompareSpecificArgs::instance()
+//         .batchSize(1).inputPlanes(1).inputSize(96).numFilters(1)
+//         .filterSize(4).biased(0).padZeros(false)
+//         .instance0(0).instance1(3));
 // }
 
-// TEST( SLOW_testupdateweights, compare_specific_96image_smaller3 ) {
-//     compareSpecific( CompareSpecificArgs::instance()
-//         .batchSize( 1 ).inputPlanes( 1 ).inputImageSize( 96 ).numFilters( 1 )
-//         .filterSize( 6 ).biased( false ).padZeros( false )
-//         .instance0(0).instance1(3) );
+// TEST(SLOW_testupdateweights, compare_specific_96image_smaller3) {
+//     compareSpecific(CompareSpecificArgs::instance()
+//         .batchSize(1).inputPlanes(1).inputSize(96).numFilters(1)
+//         .filterSize(6).biased(false).padZeros(false)
+//         .instance0(0).instance1(3));
 // }
 
-// TEST( SLOW_testupdateweights, compare_specific_96image_smaller4 ) {
-//     compareSpecific( CompareSpecificArgs::instance()
-//         .batchSize( 1 ).inputPlanes( 2 ).inputImageSize( 96 ).numFilters( 8 )
-//         .filterSize( 4 ).biased( 1 ).padZeros( false )
-//         .instance0(0).instance1(3) );
+// TEST(SLOW_testupdateweights, compare_specific_96image_smaller4) {
+//     compareSpecific(CompareSpecificArgs::instance()
+//         .batchSize(1).inputPlanes(2).inputSize(96).numFilters(8)
+//         .filterSize(4).biased(1).padZeros(false)
+//         .instance0(0).instance1(3));
 // }
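
In the measurePerf hunk below, outputSize/inputSize are renamed to outputNumElements/inputNumElements, which better reflects what they hold: per-batch element counts rather than image sizes. A minimal sketch of that arithmetic, assuming the usual cube definitions inputCubeSize = inputPlanes * inputSize^2 and outputCubeSize = numFilters * outputSize^2; those definitions live in LayerDimensions, outside this patch:

    #include <iostream>

    int main() {
        // Illustrative numbers matching compare_args above: inputSize 28,
        // inputPlanes 4, numFilters 8, filterSize 5, padZeros on, batchSize 4.
        // With zero-padding and an odd filter size the output stays 28x28.
        int batchSize = 4;
        int inputCubeSize = 4 * 28 * 28;  // inputPlanes * inputSize^2 = 3136
        int outputCubeSize = 8 * 28 * 28; // numFilters * outputSize^2 = 6272
        std::cout << "inputNumElements=" << batchSize * inputCubeSize    // 12544
                  << " outputNumElements=" << batchSize * outputCubeSize // 25088
                  << std::endl;
        return 0;
    }
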
 
-void measurePerf( int batchSize, LayerDimensions dim, int instance ) {
+void measurePerf(int batchSize, LayerDimensions dim, int instance) {
 
-    int outputSize = batchSize * dim.outputCubeSize;
-    int inputSize = batchSize * dim.inputCubeSize;
+    int outputNumElements = batchSize * dim.outputCubeSize;
+    int inputNumElements = batchSize * dim.inputCubeSize;
     int weightsSize = dim.filtersSize;
     int biasSize = dim.numFilters;
-    int outputAllocated = outputSize;
-    int inputAllocated = inputSize;
+    int outputAllocated = outputNumElements;
+    int inputAllocated = inputNumElements;
     int weightsAllocated = weightsSize;
     int biasAllocated = biasSize;
     cout << "numweights: " << weightsSize << endl;
     float *bias = new float[ biasAllocated ];
-    memset( bias, 0, sizeof(float) * biasAllocated );
+    memset(bias, 0, sizeof(float) * biasAllocated);
     float *gradOutput = new float[outputAllocated];
     float *inputData = new float[inputAllocated];
     float *weights = new float[weightsAllocated];
-    memset( gradOutput, 0, sizeof(float) * outputAllocated );
-    memset( inputData, 0, sizeof(float) * inputAllocated );
-    memset( weights, 0, sizeof(float) * weightsAllocated );
+    memset(gradOutput, 0, sizeof(float) * outputAllocated);
+    memset(inputData, 0, sizeof(float) * inputAllocated);
+    memset(weights, 0, sizeof(float) * weightsAllocated);
 
-    WeightRandomizer::randomizeInts( gradOutput, outputAllocated, 0, 99 );
-    WeightRandomizer::randomizeInts( inputData, inputAllocated, 0, 99 );
+    WeightRandomizer::randomizeInts(gradOutput, outputAllocated, 0, 99);
+    WeightRandomizer::randomizeInts(inputData, inputAllocated, 0, 99);
 
     EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
 
-    BackpropWeights *backpropWeightsImpl = BackpropWeights::instanceSpecific( instance, cl, dim );
+    BackpropWeights *backpropWeightsImpl = BackpropWeights::instanceSpecific(instance, cl, dim);
     Timer timer;
-    backpropWeightsImpl->calcGradWeights( batchSize, gradOutput, inputData, weights, bias );
+    backpropWeightsImpl->calcGradWeights(batchSize, gradOutput, inputData, weights, bias);
     timer.timeCheck("backprop time");
 
     delete backpropWeightsImpl;
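
The Timer/timeCheck pair above is a simple wall-clock measurement around a single calcGradWeights call. An equivalent sketch with std::chrono, as an illustration only, not the Timer class used here; note that for a fair GPU number a real harness would also need to block until the OpenCL queue has drained before stopping the clock:

    #include <chrono>
    #include <iostream>

    int main() {
        auto start = std::chrono::steady_clock::now();
        // ... work under test goes here, e.g. the calcGradWeights call above ...
        auto stop = std::chrono::steady_clock::now();
        auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count();
        std::cout << "backprop time: " << ms << " ms" << std::endl;
        return 0;
    }
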