From 551b3b70f1710b30739f69cdf3c20bf87877d70c Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 29 Aug 2014 18:31:24 -0700
Subject: [PATCH 001/166] check unity back

---
 src/io/io.cpp               |   2 +
 src/io/page_dmatrix-inl.hpp | 204 ++++++++++++++++++++++++
 src/utils/io.h              |  24 ++-
 src/utils/thread.h          | 146 ++++++++++++++++++
 src/utils/thread_buffer.h   | 200 ++++++++++++++++++++++++
 wrapper/xgboost.py          | 298 ++++++++++++++++++++++++++++++++----
 6 files changed, 843 insertions(+), 31 deletions(-)
 create mode 100644 src/io/page_dmatrix-inl.hpp
 create mode 100644 src/utils/thread.h
 create mode 100644 src/utils/thread_buffer.h

diff --git a/src/io/io.cpp b/src/io/io.cpp
index d251d7a96c03..f56cff6794a6 100644
--- a/src/io/io.cpp
+++ b/src/io/io.cpp
@@ -5,6 +5,8 @@
 #include "../utils/io.h"
 #include "../utils/utils.h"
 #include "simple_dmatrix-inl.hpp"
+#include "page_dmatrix-inl.hpp"
+
 // implements data loads using dmatrix simple for now
 
 namespace xgboost {
diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
new file mode 100644
index 000000000000..82a373352f35
--- /dev/null
+++ b/src/io/page_dmatrix-inl.hpp
@@ -0,0 +1,204 @@
+#ifndef XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
+#define XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
+/*!
+ * \file page_dmatrix-inl.hpp
+ * row iterator based on sparse page
+ * \author Tianqi Chen
+ */
+#include "../data.h"
+#include "../utils/iterator.h"
+#include "../utils/thread_buffer.h"
+namespace xgboost {
+namespace io {
+/*! \brief page structure that can be used to store a rowbatch */
+struct RowBatchPage {
+ public:
+  RowBatchPage(void)  {
+    data_ = new int[kPageSize];
+    utils::Assert(data_ != NULL, "fail to allocate row batch page");
+    this->Clear();
+  }
+  ~RowBatchPage(void) {
+    if (data_ != NULL) delete [] data_;
+  }
+  /*!
+   * \brief append one row to the page
+   * \param row the instance row to copy in
+   * \return true if the row was stored, false if the page has no room left
+   */
+  inline bool PushRow(const RowBatch::Inst &row) {
+    const size_t dsize = row.length * sizeof(RowBatch::Entry);
+    if (FreeBytes() < dsize + sizeof(int)) return false;
+    row_ptr(Size() + 1) = row_ptr(Size()) + row.length;
+    memcpy(data_ptr(row_ptr(Size())), row.data, dsize);  // offset counts entries, not rows
+    ++data_[0];
+    return true;
+  }
+  /*!
+   * \brief get a row batch representation from the page
+   * \param p_rptr a temporal space that can be used to provide
+   *  ind_ptr storage for RowBatch
+   * \return a new RowBatch object
+   */
+  inline RowBatch GetRowBatch(std::vector<size_t> *p_rptr, size_t base_rowid) {
+    RowBatch batch; 
+    batch.base_rowid = base_rowid;
+    batch.data_ptr = this->data_ptr(0);
+    batch.size = static_cast<size_t>(this->Size());
+    std::vector<size_t> &rptr = *p_rptr;
+    rptr.resize(this->Size()+1);
+    for (size_t i = 0; i < rptr.size(); ++i) {
+      rptr[i] = static_cast<size_t>(this->row_ptr(i));
+    }
+    batch.ind_ptr = &rptr[0];
+    return batch;
+  }
+  /*!
+   * \brief clear the page, cleanup the content
+   */
+  inline void Clear(void) {
+    memset(&data_[0], 0, sizeof(int) * kPageSize);
+  }
+  /*!
+   * \brief load one page from the input stream
+   * \return true if loading is successful
+   */
+  inline bool Load(utils::IStream &fi) {
+    return fi.Read(&data_[0], sizeof(int) * kPageSize) != 0;
+  }
+  /*! \brief save one page into outstream */
+  inline void Save(utils::IStream &fo) {
+    fo.Write(&data_[0], sizeof(int) * kPageSize);
+  }
+  /*! \return number of elements */
+  inline int Size(void) const {
+    return data_[0];
+  }
+  /*! \brief page size 64 MB */
+  static const size_t kPageSize = 64 << 18;
+
+ private:
+  /*! \return number of bytes still available for row data */
+  inline size_t FreeBytes(void) {
+    return (kPageSize - (Size() + 2)) * sizeof(int) 
+        - row_ptr(Size()) * sizeof(RowBatch::Entry) ;
+  }
+  /*! \brief equivalent row pointer at i */
+  inline int& row_ptr(int i) {
+    return data_[kPageSize - i - 1];
+  }
+  inline RowBatch::Entry* data_ptr(int i) {
+    return (RowBatch::Entry*)(&data_[1]) + i;
+  }
+  // content of data
+  int *data_;  
+};
+/*! \brief thread buffer iterator */
+class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
+ public:
+  ThreadRowPageIterator(void) {
+    itr.SetParam("buffer_size", "4");
+    page_ = NULL;
+    base_rowid_ = 0;
+    isend_ = false;
+  }
+  virtual ~ThreadRowPageIterator(void) {
+  }
+  virtual void Init(void) {
+  }
+  virtual void BeforeFirst(void) {
+    itr.BeforeFirst();
+    isend_ = false;
+    base_rowid_ = 0;
+    // no page is fetched here: Next() pulls the first page itself
+  }
+  virtual bool Next(void) {
+    if(!this->LoadNextPage()) return false;
+    out_ = page_->GetRowBatch(&tmp_ptr_, base_rowid_);
+    base_rowid_ += out_.size;
+    return true;
+  }
+  virtual const RowBatch &Value(void) const{
+    return out_;
+  }
+  /*! \brief load and initialize the iterator with fi */
+  inline void Load(const utils::FileStream &fi) {
+    itr.get_factory().SetFile(fi);
+    itr.Init();
+    this->BeforeFirst();
+  }
+  /*!
+   * \brief save a row iterator to output stream, in row iterator format
+   */
+  inline static void Save(utils::IIterator<RowBatch> *iter,
+                          utils::IStream &fo) {
+    RowBatchPage page;
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const RowBatch &batch = iter->Value();
+      for (size_t i = 0; i < batch.size; ++i) {
+        if (!page.PushRow(batch[i])) {
+          page.Save(fo);
+          page.Clear();
+          utils::Check(page.PushRow(batch[i]), "row is too big");
+        }
+      }
+    }
+    if (page.Size() != 0) page.Save(fo);
+  }
+ private:
+  // load in next page
+  inline bool LoadNextPage(void) {
+    ptop_ = 0;
+    bool ret = itr.Next(page_);
+    isend_ = !ret;
+    return ret;
+  }
+  // base row id
+  size_t base_rowid_;
+  // temporal ptr
+  std::vector<size_t> tmp_ptr_;
+  // output data
+  RowBatch out_;
+  // whether we reach end of file
+  bool isend_;
+  // page pointer type
+  typedef RowBatchPage* PagePtr;
+  // loader factory for page
+  struct Factory {
+   public:
+    size_t file_begin_;
+    utils::FileStream fi;
+    Factory(void) {}
+    inline void SetFile(const utils::FileStream &fi) {
+      this->fi = fi;
+      file_begin_ = this->fi.Tell();
+    }
+    inline bool Init(void) {
+      return true;
+    }
+    inline void SetParam(const char *name, const char *val) {}
+    inline bool LoadNext(PagePtr &val) {
+      return val->Load(fi);
+    }
+    inline PagePtr Create(void) {
+      PagePtr a = new RowBatchPage();
+      return a;
+    }
+    inline void FreeSpace(PagePtr &a) {
+      delete a;
+    }
+    inline void Destroy(void) {}
+    inline void BeforeFirst(void) {
+      fi.Seek(file_begin_);
+    }
+  };
+
+ protected:
+  PagePtr page_;
+  int ptop_;
+  utils::ThreadBuffer<PagePtr,Factory> itr;
+};
+}  // namespace io
+}  // namespace xgboost
+#endif  // XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
diff --git a/src/utils/io.h b/src/utils/io.h
index 4a80e9a5831a..23fa0d468746 100644
--- a/src/utils/io.h
+++ b/src/utils/io.h
@@ -88,11 +88,19 @@ class IStream {
   }
 };
 
+/*! \brief interface of i/o stream that support seek */
+class ISeekStream: public IStream {
+ public:
+  /*! \brief seek to certain position of the file */
+  virtual void Seek(size_t pos) = 0;
+  /*! \brief tell the position of the stream */
+  virtual size_t Tell(void) = 0;
+};
+
 /*! \brief implementation of file i/o stream */
-class FileStream : public IStream {
- private:
-  FILE *fp;
+class FileStream : public ISeekStream {
  public:
+  explicit FileStream(void) : fp(NULL) {}  // no file attached until assigned from another stream
   explicit FileStream(FILE *fp) {
     this->fp = fp;
   }
@@ -102,12 +110,18 @@ class FileStream : public IStream {
   virtual void Write(const void *ptr, size_t size) {
     fwrite(ptr, size, 1, fp);
   }
-  inline void Seek(size_t pos) {
-    fseek(fp, 0, SEEK_SET);
+  virtual void Seek(size_t pos) {
+    fseek(fp, pos, SEEK_SET);
+  }
+  virtual size_t Tell(void) {
+    return static_cast<size_t>(ftell(fp));
   }
   inline void Close(void) {
     fclose(fp);
   }
+
+ private:
+  FILE *fp;
 };
 
 }  // namespace utils
diff --git a/src/utils/thread.h b/src/utils/thread.h
new file mode 100644
index 000000000000..830b21cbeb5a
--- /dev/null
+++ b/src/utils/thread.h
@@ -0,0 +1,146 @@
+#ifndef XGBOOST_UTILS_THREAD_H
+#define XGBOOST_UTILS_THREAD_H
+/*!
+ * \file thread.h
+ * \brief this header include the minimum necessary resource for multi-threading
+ * \author Tianqi Chen
+ * Acknowledgement: this file is adapted from SVDFeature project, by same author. 
+ *  The MAC support part of this code is provided by Artemy Kolchinsky
+ */
+#ifdef _MSC_VER
+#include "utils.h"
+#include <windows.h>
+#include <process.h>
+namespace xgboost {
+namespace utils {
+/*! \brief simple semaphore used for synchronization */
+class Semaphore {
+ public :
+  inline void Init(int init_val) {
+    sem = CreateSemaphore(NULL, init_val, 10, NULL);
+    utils::Assert(sem != NULL, "create Semaphore error");
+  }
+  inline void Destroy(void) {
+    CloseHandle(sem);
+  }
+  inline void Wait(void) {
+    utils::Assert(WaitForSingleObject(sem, INFINITE) == WAIT_OBJECT_0, "WaitForSingleObject error");
+  }
+  inline void Post(void) {
+    utils::Assert(ReleaseSemaphore(sem, 1, NULL)  != 0, "ReleaseSemaphore error");
+  }
+ private:
+  HANDLE sem;
+};
+/*! \brief simple thread that wraps windows thread */
+class Thread {
+ private:
+  HANDLE    thread_handle;
+  unsigned  thread_id;            
+ public:
+  inline void Start(unsigned int __stdcall entry(void*), void *param) {
+    thread_handle = (HANDLE)_beginthreadex(NULL, 0, entry, param, 0, &thread_id);
+  }            
+  inline int Join(void) {
+    WaitForSingleObject(thread_handle, INFINITE);
+    return 0;
+  }
+};
+/*! \brief exit function called from thread */
+inline void ThreadExit(void *status) {
+  _endthreadex(0);
+}
+#define XGBOOST_THREAD_PREFIX unsigned int __stdcall
+}  // namespace utils
+}  // namespace xgboost
+#else
+// thread interface using g++     
+#include <semaphore.h>
+#include <pthread.h>
+namespace xgboost {
+namespace utils {
+/*!\brief semaphore class */
+class Semaphore {
+  #ifdef __APPLE__
+ private:
+  sem_t* semPtr;
+  char sema_name[20];            
+ private:
+  inline void GenRandomString(char *s, const int len) {
+    static const char alphanum[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" ;
+    for (int i = 0; i < len; ++i) {
+      s[i] = alphanum[rand() % (sizeof(alphanum) - 1)];
+    }
+    s[len] = 0;
+  }
+ public:
+  inline void Init(int init_val) {
+    // name layout: "/se/" + random suffix + '\0' must fit in sema_name
+    sema_name[0] = '/'; sema_name[1] = 's';
+    sema_name[2] = 'e'; sema_name[3] = '/';
+    // leave one slot for the terminator GenRandomString writes at s[len]
+    GenRandomString(&sema_name[4], static_cast<int>(sizeof(sema_name)) - 5);
+    if ((semPtr = sem_open(sema_name, O_CREAT, 0644, init_val)) == SEM_FAILED) {
+      perror("sem_open");
+      exit(1);
+    }
+    utils::Assert(semPtr != NULL, "create Semaphore error");
+  }
+  inline void Destroy(void) {
+    if (sem_close(semPtr) == -1) {
+      perror("sem_close");
+      exit(EXIT_FAILURE);
+    }
+    if (sem_unlink(sema_name) == -1) {
+      perror("sem_unlink");
+      exit(EXIT_FAILURE);
+    }
+  }
+  inline void Wait(void) {
+    sem_wait(semPtr);
+  }
+  inline void Post(void) {
+    sem_post(semPtr);
+  }               
+  #else
+ private:
+  sem_t sem;
+ public:
+  inline void Init(int init_val) {
+    sem_init(&sem, 0, init_val);
+  }
+  inline void Destroy(void) {
+    sem_destroy(&sem);
+  }
+  inline void Wait(void) {
+    sem_wait(&sem);
+  }
+  inline void Post(void) {
+    sem_post(&sem);
+  }
+  #endif  
+};
+/*!\brief simple thread class */
+class Thread {
+ private:
+  pthread_t thread;                
+ public :
+  inline void Start(void * entry(void*), void *param) {
+    pthread_attr_t attr;
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+    pthread_create(&thread, &attr, entry, param);
+  }  
+  inline int Join(void) {
+    void *status;
+    return pthread_join(thread, &status);
+  }
+};
+inline void ThreadExit(void *status) {
+  pthread_exit(status);
+}
+}  // namespace utils
+}  // namespace xgboost
+#define XGBOOST_THREAD_PREFIX void *
+#endif
+#endif
diff --git a/src/utils/thread_buffer.h b/src/utils/thread_buffer.h
new file mode 100644
index 000000000000..fa488a2204c7
--- /dev/null
+++ b/src/utils/thread_buffer.h
@@ -0,0 +1,200 @@
+#ifndef XGBOOST_UTILS_THREAD_BUFFER_H
+#define XGBOOST_UTILS_THREAD_BUFFER_H
+/*!
+ * \file thread_buffer.h
+ * \brief  multi-thread buffer, iterator, can be used to create parallel pipeline
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include <cstring>
+#include <cstdlib>
+#include "./utils.h"
+#include "./thread.h"
+namespace xgboost {
+namespace utils {
+/*!
+ * \brief buffered loading iterator that uses multithread
+ * this class template assumes the following parameters
+ * \tparam Elem element type to be buffered
+ * \tparam ElemFactory factory type to implement in order to use thread buffer
+ */
+template<typename Elem, typename ElemFactory>
+class ThreadBuffer {
+ public:
+  /*!\brief constructor */
+  ThreadBuffer(void) {
+    this->init_end = false;
+    this->buf_size = 30;
+  }
+  ~ThreadBuffer(void) {
+    if(init_end) this->Destroy();
+  }
+  /*!\brief set parameter, will also pass the parameter to factory */
+  inline void SetParam(const char *name, const char *val) {
+    if (!strcmp( name, "buffer_size")) buf_size = atoi(val);
+    factory.SetParam(name, val);
+  }
+  /*!
+   * \brief initialize the buffered iterator
+   * \param param an initialization parameter that will be passed to the factory, ignore it if not necessary
+   * \return false if the initialization can't be done, e.g. buffer file hasn't been created
+   */
+  inline bool Init(void) {
+    if (!factory.Init()) return false;
+    for (int i = 0; i < buf_size; ++i) {
+      bufA.push_back(factory.Create());
+      bufB.push_back(factory.Create());
+    }
+    this->init_end = true;
+    this->StartLoader();
+    return true;
+  }  
+  /*!\brief place the iterator before first value */
+  inline void BeforeFirst(void) {
+    // wait till last loader end
+    loading_end.Wait();
+    // critcal zone
+    current_buf = 1;
+    factory.BeforeFirst();
+    // reset terminate limit
+    endA = endB = buf_size;
+    // wake up loader for first part
+    loading_need.Post();
+    // wait til first part is loaded
+    loading_end.Wait();
+    // set current buf to right value
+    current_buf = 0;
+    // wake loader for next part
+    data_loaded = false;
+    loading_need.Post();
+    // set buffer value
+    buf_index = 0;
+  }  
+  /*! \brief destroy the buffer iterator, will deallocate the buffer */
+  inline void Destroy(void) {
+    // wait until the signal is consumed
+    this->destroy_signal = true;
+    loading_need.Post();
+    loader_thread.Join();
+    loading_need.Destroy();
+    loading_end.Destroy();    
+    for (size_t i = 0; i < bufA.size(); ++i) {
+      factory.FreeSpace(bufA[i]);
+    }
+    for (size_t i = 0; i < bufB.size(); ++i) {
+      factory.FreeSpace(bufB[i]);
+    }
+    bufA.clear(); bufB.clear();
+    factory.Destroy();
+    this->init_end = false;
+  }  
+  /*!
+   * \brief get the next element needed in buffer
+   * \param elem element to store into
+   * \return true if an element was fetched, false when the end of data is reached
+   */
+  inline bool Next(Elem &elem) {
+    // end of buffer try to switch
+    if (buf_index == buf_size) {
+      this->SwitchBuffer();
+      buf_index = 0;
+    }
+    if (buf_index >= (current_buf ? endA : endB)) { 
+      return false;
+    }
+    std::vector<Elem> &buf = current_buf ? bufA : bufB;
+    elem = buf[buf_index];
+    ++buf_index;
+    return true;
+  }      
+  /*!
+   * \brief get the factory object
+   */
+  inline ElemFactory &get_factory(void) {
+    return factory;
+  }
+  // size of buffer
+  int  buf_size;
+ private:
+  // factory object used to load configures
+  ElemFactory factory;
+  // index in current buffer
+  int buf_index;
+  // indicate which one is current buffer
+  int current_buf;
+  // max limit of visit, also marks termination
+  int endA, endB;
+  // double buffer, one is accessed by loader
+  // the other is accessed by consumer
+  // buffer of the data
+  std::vector<Elem> bufA, bufB;
+  // initialization end
+  bool init_end;
+  // signal whether the data is loaded
+  bool data_loaded;
+  // signal to kill the thread
+  bool destroy_signal;
+  // thread object
+  Thread loader_thread;
+  // signal of the buffer
+  Semaphore loading_end, loading_need;
+  /*!
+   * \brief slave thread
+   * this implementation is like producer-consumer style
+   */
+  inline void RunLoader(void) {
+    while(!destroy_signal) {
+      // sleep until loading is needed
+      loading_need.Wait();      
+      std::vector<Elem> &buf = current_buf ? bufB : bufA;
+      int i;
+      for (i = 0; i < buf_size ; ++i) {
+        if (!factory.LoadNext(buf[i])) {
+          int &end = current_buf ? endB : endA;
+          end = i; // marks the termination
+          break;
+        }
+      }
+      // signal that loading is done
+      data_loaded = true;
+      loading_end.Post();
+    }
+  }
+  /*!\brief entry point of loader thread */
+  inline static XGBOOST_THREAD_PREFIX LoaderEntry(void *pthread) {
+    static_cast< ThreadBuffer<Elem,ElemFactory>* >(pthread)->RunLoader();
+    ThreadExit(NULL);
+    return NULL;
+  }
+  /*!\brief start loader thread */
+  inline void StartLoader(void) {
+    destroy_signal = false;
+    // set param
+    current_buf = 1;    
+    loading_need.Init(1);
+    loading_end .Init(0);
+    // reset terminate limit
+    endA = endB = buf_size;
+    loader_thread.Start(LoaderEntry, this);
+    // wait until first part of data is loaded
+    loading_end.Wait();
+    // set current buf to right value
+    current_buf = 0;
+    // wake loader for next part
+    data_loaded = false;
+    loading_need.Post();    
+    buf_index = 0; 
+  }
+  /*!\brief switch double buffer */
+  inline void SwitchBuffer(void) {
+    loading_end.Wait();
+    // loader shall be sleep now, critcal zone!
+    current_buf = !current_buf;
+    // wake up loader
+    data_loaded = false;
+    loading_need.Post();
+  }
+};
+}  // namespace utils
+}  // namespace xgboost
+#endif
diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index e2cbdba2e161..adf59c829dcc 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -3,10 +3,11 @@
 import ctypes
 import os
 # optinally have scipy sparse, though not necessary
-import numpy
+import numpy as np
 import sys
 import numpy.ctypeslib
 import scipy.sparse as scp
+import random
 
 # set this line correctly
 if os.name == 'nt':
@@ -32,16 +33,28 @@
 
 
 def ctypes2numpy(cptr, length, dtype):
-    # convert a ctypes pointer array to numpy
+    """convert a ctypes pointer array to numpy array """
     assert isinstance(cptr, ctypes.POINTER(ctypes.c_float))
     res = numpy.zeros(length, dtype=dtype)
     assert ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0])
     return res
 
-# data matrix used in xgboost
 class DMatrix:
+    """data matrix used in xgboost"""
     # constructor
     def __init__(self, data, label=None, missing=0.0, weight = None):
+        """ constructor of DMatrix
+
+            Args:
+                data: string/numpy array/scipy.sparse
+                      data source, string type is the path of svmlight format txt file or xgb buffer
+                label: list or numpy 1d array, optional
+                       label of training data
+                missing: float
+                         value in data which need to be present as missing value
+                weight: list or numpy 1d array, optional
+                        weight for each instances                        
+        """
         # force into void_p, mac need to pass things in as void_p
         if data == None:
             self.handle = None
@@ -63,22 +76,25 @@ def __init__(self, data, label=None, missing=0.0, weight = None):
             self.set_label(label)
         if weight !=None:
             self.set_weight(weight)
-    # convert data from csr matrix
+
     def __init_from_csr(self, csr):
+        """convert data from csr matrix"""
         assert len(csr.indices) == len(csr.data)
         self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSR(
             (ctypes.c_ulong  * len(csr.indptr))(*csr.indptr),
             (ctypes.c_uint  * len(csr.indices))(*csr.indices),
             (ctypes.c_float * len(csr.data))(*csr.data),
             len(csr.indptr), len(csr.data)))
-    # convert data from numpy matrix
+
     def __init_from_npy2d(self,mat,missing):
+        """convert data from numpy matrix"""
         data = numpy.array(mat.reshape(mat.size), dtype='float32')
         self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromMat(
             data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
             mat.shape[0], mat.shape[1], ctypes.c_float(missing)))
-    # destructor
+
     def __del__(self):
+        """destructor"""
         xglib.XGDMatrixFree(self.handle)
     def get_float_info(self, field):
         length = ctypes.c_ulong()
@@ -96,16 +112,39 @@ def set_float_info(self, field, data):
     def set_uint_info(self, field, data):
         xglib.XGDMatrixSetUIntInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
                                    (ctypes.c_uint*len(data))(*data), len(data))
-    # load data from file
+
     def save_binary(self, fname, silent=True):
+        """save DMatrix to XGBoost buffer
+            Args:
+                fname: string
+                       name of buffer file
+                silent: bool, optional
+                       whether print info
+           Returns:
+                None
+        """
         xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent))
-    # set label of dmatrix
+
     def set_label(self, label):
+        """set label of dmatrix
+            Args:
+                label: list
+                       label for DMatrix
+            Returns:
+                None
+        """
         self.set_float_info('label', label)
-    # set weight of each instances
+
     def set_weight(self, weight):
+        """set weight of each instances
+            Args:
+                weight: list or numpy 1d array
+                        weight for each instance
+            Returns:
+                None
+        """
         self.set_float_info('weight', weight)
-    # set initialized margin prediction
+
     def set_base_margin(self, margin):
         """
         set base margin of booster to start from
@@ -116,31 +155,143 @@ def set_base_margin(self, margin):
         see also example/demo.py
         """
         self.set_float_info('base_margin', margin)
-    # set group size of dmatrix, used for rank
+
     def set_group(self, group):
+        """set group size of dmatrix, used for rank
+            Args:
+                group:
+
+            Returns:
+                None
+        """
         xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group))
-    # get label from dmatrix
+
     def get_label(self):
+        """get label from dmatrix
+            Args:
+                None
+            Returns:
+                list, label of data
+        """
         return self.get_float_info('label')
-    # get weight from dmatrix
+
     def get_weight(self):
+        """get weight from dmatrix
+            Args:
+                None
+            Returns:
+                float, weight
+        """
         return self.get_float_info('weight')
-    # get base_margin from dmatrix
     def get_base_margin(self):
+        """get base_margin from dmatrix
+            Args:
+                None
+            Returns:
+                float, base margin
+        """
         return self.get_float_info('base_margin')
     def num_row(self):
+        """get number of rows
+            Args:
+                None
+            Returns:
+                int, num rows
+        """
         return xglib.XGDMatrixNumRow(self.handle)
-    # slice the DMatrix to return a new DMatrix that only contains rindex
     def slice(self, rindex):
+        """slice the DMatrix to return a new DMatrix that only contains rindex
+            Args:
+                rindex: list
+                        list of index to be chosen
+            Returns:
+                res: DMatrix
+                     new DMatrix with chosen index
+        """
         res = DMatrix(None)
         res.handle = ctypes.c_void_p(xglib.XGDMatrixSliceDMatrix(
             self.handle, (ctypes.c_int*len(rindex))(*rindex), len(rindex)))
         return res
 
+class CVPack:  # one cross-validation fold: train/test DMatrix pair plus its own Booster
+    def __init__(self, dtrain, dtest, param):
+        self.dtrain = dtrain
+        self.dtest = dtest
+        self.watchlist = [ (dtrain,'train'), (dtest, 'test') ]
+        self.bst = Booster(param, [dtrain,dtest])
+    def update(self,r):  # run boosting iteration r on this fold's training data
+        self.bst.update(self.dtrain, r)
+    def eval(self,r):  # evaluation string for iteration r on train and test
+        return self.bst.eval_set(self.watchlist, r)
+
+def mknfold(dall, nfold, param, seed, weightscale=None):
+    """
+    split dall into nfold shuffled folds and return one CVPack per fold
+    """
+    randidx = list(range(dall.num_row()))  # shuffle needs a mutable sequence
+    random.seed(seed)
+    random.shuffle(randidx)
+
+    idxset = []
+    kstep = len(randidx) // nfold  # integer fold size; leftover rows are unused
+    for i in range(nfold):
+        idxset.append(randidx[ (i*kstep) : min(len(randidx),(i+1)*kstep) ])
+
+    ret = []
+    for k in range(nfold):
+        trainlst = []
+        for j in range(nfold):
+            if j == k:
+                testlst = idxset[j]
+            else:
+                trainlst += idxset[j]
+        dtrain = dall.slice(trainlst)
+        dtest = dall.slice(testlst)
+        # rescale weight of dtrain and dtest
+        if weightscale is not None:
+            dtrain.set_weight( dtrain.get_weight() * weightscale * dall.num_row() / dtrain.num_row() )
+            dtest.set_weight( dtest.get_weight() * weightscale * dall.num_row() / dtest.num_row() )
+
+        ret.append(CVPack(dtrain, dtest, param))
+    return ret
+
+def aggcv(rlist):
+    """
+    aggregate cross validation results
+    """
+    cvmap = {}
+    arr = rlist[0].split()
+    ret = arr[0]
+    for it in arr[1:]:
+        k, v  = it.split(':')
+        cvmap[k] = [float(v)]
+    for line in rlist[1:]:
+        arr = line.split()
+        assert ret == arr[0]
+        for it in arr[1:]:
+            k, v  = it.split(':')
+            cvmap[k].append(float(v))
+
+    for k, v in sorted(cvmap.items(), key = lambda x:x[0]):
+        v = np.array(v)
+        ret += '\t%s:%f+%f' % (k, np.mean(v), np.std(v))
+    return ret
+
+
 class Booster:
     """learner class """
     def __init__(self, params={}, cache=[], model_file = None):
-        """ constructor, param: """
+        """ constructor
+            Args:
+                params: dict
+                        params for boosters
+                cache: list
+                        list of cache item
+                model_file: string
+                        path of model file
+            Returns:
+                None
+        """
         for d in cache:
             assert isinstance(d, DMatrix)
         dmats = (ctypes.c_void_p  * len(cache))(*[ d.handle for d in cache])
@@ -166,16 +317,30 @@ def set_param(self, params, pv=None):
                 xglib.XGBoosterSetParam(
                     self.handle, ctypes.c_char_p(k.encode('utf-8')),
                     ctypes.c_char_p(str(v).encode('utf-8')))
+
     def update(self, dtrain, it):
         """
         update
-          dtrain: the training DMatrix
-          it: current iteration number
+            Args:
+                dtrain: DMatrix
+                        the training DMatrix
+                it: int
+                    current iteration number
+            Returns:
+                None
         """
         assert isinstance(dtrain, DMatrix)
         xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle)
     def boost(self, dtrain, grad, hess):
-        """ update """
+        """ update
+            Args:
+                dtrain: DMatrix
+                        the training DMatrix
+                grad: list
+                        the first order of gradient
+                hess: list
+                        the second order of gradient
+        """
         assert len(grad) == len(hess)
         assert isinstance(dtrain, DMatrix)
         xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle,
@@ -183,6 +348,14 @@ def boost(self, dtrain, grad, hess):
                                     (ctypes.c_float*len(hess))(*hess),
                                     len(grad))
     def eval_set(self, evals, it = 0):
+        """evaluates by metric
+            Args:
+                evals: list of tuple (DMatrix, string)
+                       lists of items to be evaluated
+                it: int
+            Returns:
+                evals result
+        """
         for d in evals:
             assert isinstance(d[0], DMatrix)
             assert isinstance(d[1], str)
@@ -195,21 +368,46 @@ def eval(self, mat, name = 'eval', it = 0):
     def predict(self, data, output_margin=False):
         """
         predict with data
-            data: the dmatrix storing the input
-            output_margin: whether output raw margin value that is untransformed
+            Args:
+                data: DMatrix
+                      the dmatrix storing the input
+                output_margin: bool
+                               whether output raw margin value that is untransformed
+            Returns:
+                numpy array of prediction
         """
         length = ctypes.c_ulong()
         preds = xglib.XGBoosterPredict(self.handle, data.handle,
                                        int(output_margin), ctypes.byref(length))
         return ctypes2numpy(preds, length.value, 'float32')
     def save_model(self, fname):
-        """ save model to file """
+        """ save model to file
+            Args:
+                fname: string
+                       file name of saving model
+            Returns:
+                None
+        """
         xglib.XGBoosterSaveModel(self.handle, ctypes.c_char_p(fname.encode('utf-8')))
     def load_model(self, fname):
-        """load model from file"""
+        """load model from file
+            Args:
+                fname: string
+                       file name of saving model
+            Returns:
+                None
+        """
         xglib.XGBoosterLoadModel( self.handle, ctypes.c_char_p(fname.encode('utf-8')) )
     def dump_model(self, fo, fmap=''):
-        """dump model into text file"""
+        """dump model into text file
+            Args:
+                fo: string
+                    file name to be dumped
+                fmap: string, optional
+                      file name of feature map names
+            Returns:
+                None
+        """
         if isinstance(fo,str):
             fo = open(fo,'w')
             need_close = True
@@ -248,7 +446,17 @@ def get_fscore(self, fmap=''):
         return fmap
 
 def evaluate(bst, evals, it, feval = None):
-    """evaluation on eval set"""
+    """evaluation on eval set
+        Args:
+            bst: XGBoost object
+                 object of XGBoost model
+            evals: list of tuple (DMatrix, string)
+                obj need to be evaluated
+            it: int
+            feval: optional
+        Returns:
+            eval result
+    """
     if feval != None:
         res = '[%d]' % it
         for dm, evname in evals:
@@ -259,8 +467,22 @@ def evaluate(bst, evals, it, feval = None):
 
     return res
 
+
+
 def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None):
-    """ train a booster with given paramaters """
+    """ train a booster with given parameters
+        Args:
+            params: dict
+                    params of booster
+            dtrain: DMatrix
+                    data to be trained
+            num_boost_round: int
+                             num of round to be boosted
+            evals: list
+                   list of items to be evaluated
+            obj:
+            feval:
+    """
     bst = Booster(params, [dtrain]+[ d[0] for d in evals ] )
     if obj == None:
         for i in range(num_boost_round):
@@ -276,3 +498,27 @@ def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None
             if len(evals) != 0:
                 sys.stderr.write(evaluate(bst, evals, i, feval)+'\n')
     return bst
+
+def cv(params, dtrain, num_boost_round = 10, nfold=3, evals = [], obj=None, feval=None):
+    """ run n-fold cross validation with the given parameters
+        Args:
+            params: dict
+                    params of booster; every item of evals is appended as an 'eval_metric' entry
+            dtrain: DMatrix
+                    data handed to mknfold together with nfold
+            num_boost_round: int
+                             number of rounds; each round updates and evaluates every fold
+            nfold: int
+                   number of folds, passed to mknfold
+            evals: list
+                   items forwarded to the folds as ('eval_metric', item) parameters
+            obj: not read anywhere in this function body
+            feval: not read anywhere in this function body
+    """
+    plst = list(params.items())+[('eval_metric', itm) for itm in evals]
+    cvfolds = mknfold(dtrain, nfold, plst, 0)  # last argument is the literal 0; presumably a seed -- TODO confirm
+    for i in range(num_boost_round):
+        for f in cvfolds:
+            f.update(i)
+        res = aggcv([f.eval(i) for f in cvfolds])
+        sys.stderr.write(res+'\n')  # the per-round result is only written to stderr; cv returns None

From ce2d34ecd4f4f4f7a965d997413144d6cd4d6f7a Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 29 Aug 2014 18:35:26 -0700
Subject: [PATCH 002/166] check unity back

---
 src/io/page_dmatrix-inl.hpp       |   2 +-
 src/tree/param.h                  |   6 +-
 src/tree/updater_colmaker-inl.hpp | 194 ++++++++++++++++++++++++++----
 src/utils/io.h                    |   1 -
 4 files changed, 177 insertions(+), 26 deletions(-)

diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index 82a373352f35..df43d3b7fdaa 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -18,7 +18,7 @@ struct RowBatchPage {
     utils::Assert(data_ != NULL, "fail to allocate row batch page");
     this->Clear();
   }
-  ~RowBatchPage(void) {
+  ~BinaryPage(void) {
     if (data_ != NULL) delete [] data_;
   }
   /*! 
diff --git a/src/tree/param.h b/src/tree/param.h
index 52c2737495be..92bc1c9907c0 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -37,7 +37,9 @@ struct TrainParam{
   // speed optimization for dense column
   float opt_dense_col;
   // leaf vector size
-  int size_leaf_vector;
+  int size_leaf_vector;  
+  // option for parallelization
+  int parallel_option;
   // number of threads to be used for tree construction,
   // if OpenMP is enabled, if equals 0, use system default
   int nthread;
@@ -55,6 +57,7 @@ struct TrainParam{
     opt_dense_col = 1.0f;
     nthread = 0;
     size_leaf_vector = 0;
+    parallel_option = 0;
   }
   /*! 
    * \brief set parameters from outside 
@@ -79,6 +82,7 @@ struct TrainParam{
     if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
     if (!strcmp(name, "max_depth")) max_depth = atoi(val);
     if (!strcmp(name, "nthread")) nthread = atoi(val);
+    if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
     if (!strcmp(name, "default_direction")) {
       if (!strcmp(val, "learn")) default_direction = 0;
       if (!strcmp(val, "left")) default_direction = 1;
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp
index a8cf6ea7fa58..bf93cb7b5a60 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -45,15 +45,19 @@ class ColMaker: public IUpdater {
   // data structure
   /*! \brief per thread x per node entry to store tmp data */
   struct ThreadEntry {
-    /*! \brief statistics of data*/
+    /*! \brief statistics of data */
     TStats stats;
+    /*! \brief extra statistics of data */
+    TStats stats_extra;
     /*! \brief last feature value scanned */
     float  last_fvalue;
+    /*! \brief first feature value scanned */
+    float  first_fvalue;
     /*! \brief current best solution */
     SplitEntry best;
     // constructor
     explicit ThreadEntry(const TrainParam &param)
-        : stats(param) {
+        : stats(param), stats_extra(param) {
     }
   };
   struct NodeEntry {
@@ -219,7 +223,137 @@ class ColMaker: public IUpdater {
       }
       // use new nodes for qexpand
       qexpand = newnodes;
-    }
+    }    
+    // find the best split of feature fid, parallelising over the entries of one column
+    // this function does not support nested parallelism: it opens its own omp parallel regions
+    inline void ParallelFindSplit(const ColBatch::Inst &col,
+                                  bst_uint fid,
+                                  const IFMatrix &fmat,
+                                  const std::vector<bst_gpair> &gpair,
+                                  const BoosterInfo &info) {
+      bool need_forward = param.need_forward_search(fmat.GetColDensity(fid));
+      bool need_backward = param.need_backward_search(fmat.GetColDensity(fid));
+      int nthread;  // team size, stored inside the first parallel region below
+      #pragma omp parallel
+      {
+        const int tid = omp_get_thread_num();
+        std::vector<ThreadEntry> &temp = stemp[tid];
+        // cleanup temp statistics
+        for (size_t j = 0; j < qexpand.size(); ++j) {
+          temp[qexpand[j]].stats.Clear();
+        }
+        nthread = omp_get_num_threads();  // every thread stores the same value
+        bst_uint step = (col.length + nthread - 1) / nthread;
+        bst_uint end = std::min(col.length, step * (tid + 1));  // this thread scans [tid * step, end)
+        for (bst_uint i = tid * step; i < end; ++i) {
+          const bst_uint ridx = col[i].index;
+          const int nid = position[ridx];
+          if (nid < 0) continue;
+          const float fvalue = col[i].fvalue;
+          if (temp[nid].stats.Empty()) {
+            temp[nid].first_fvalue = fvalue;
+          }
+          temp[nid].stats.Add(gpair, info, ridx);
+          temp[nid].last_fvalue = fvalue;
+        }
+      }
+      // turn the per-thread sums into exclusive prefix sums, then try the slice boundaries as splits
+      bst_omp_uint nnode = static_cast<bst_omp_uint>(qexpand.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint j = 0; j < nnode; ++j) {
+        const int nid = qexpand[j];
+        TStats sum(param), tmp(param), c(param);
+        for (int tid = 0; tid < nthread; ++tid) {
+          tmp = stemp[tid][nid].stats;
+          stemp[tid][nid].stats = sum;  // sum of the slices of all threads before tid
+          sum.Add(tmp);
+          if (tid != 0) {  // exchange the two values recorded on either side of the slice boundary
+            std::swap(stemp[tid - 1][nid].last_fvalue, stemp[tid][nid].first_fvalue);
+          }
+        }
+        for (int tid = 0; tid < nthread; ++tid) {
+          stemp[tid][nid].stats_extra = sum;  // total over the whole column for this node
+          ThreadEntry &e = stemp[tid][nid];
+          float fsplit;
+          if (tid != 0) {
+            if(fabsf(stemp[tid - 1][nid].last_fvalue - e.first_fvalue) > rt_2eps) {
+              fsplit = (stemp[tid - 1][nid].last_fvalue + e.first_fvalue) * 0.5f;  // midpoint
+            } else {
+              continue;
+            }
+          } else {
+            fsplit = e.first_fvalue - rt_eps;
+          }                        
+          if (need_forward && tid != 0) {
+            c.SetSubstract(snode[nid].stats, e.stats);
+            if (c.sum_hess >= param.min_child_weight && e.stats.sum_hess >= param.min_child_weight) {
+              bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+              e.best.Update(loss_chg, fid, fsplit, false);
+            }
+          }
+          if (need_backward) {
+            tmp.SetSubstract(sum, e.stats);
+            c.SetSubstract(snode[nid].stats, tmp);
+            if (c.sum_hess >= param.min_child_weight && tmp.sum_hess >= param.min_child_weight) {
+              bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+              e.best.Update(loss_chg, fid, fsplit, true);
+            }
+          }
+        }
+        if (need_backward) {
+          tmp = sum;
+          ThreadEntry &e = stemp[nthread-1][nid];
+          c.SetSubstract(snode[nid].stats, tmp);
+          if (c.sum_hess >= param.min_child_weight && tmp.sum_hess >= param.min_child_weight) {
+            bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+            e.best.Update(loss_chg, fid, e.last_fvalue + rt_eps, true);
+          }
+        }
+      }
+      // rescan every slice starting from its prefix sum and generate the candidate splits inside it
+      #pragma omp parallel
+      {
+        TStats c(param), cright(param);
+        const int tid = omp_get_thread_num();
+        std::vector<ThreadEntry> &temp = stemp[tid];
+        nthread = static_cast<bst_uint>(omp_get_num_threads());
+        bst_uint step = (col.length + nthread - 1) / nthread;
+        bst_uint end = std::min(col.length, step * (tid + 1));
+        for (bst_uint i = tid * step; i < end; ++i) {
+          const bst_uint ridx = col[i].index;
+          const int nid = position[ridx];
+          if (nid < 0) continue;
+          const float fvalue = col[i].fvalue;
+          // get the statistics of nid
+          ThreadEntry &e = temp[nid];
+          if (e.stats.Empty()) {
+            e.stats.Add(gpair, info, ridx);
+            e.first_fvalue = fvalue;
+          } else {
+            // forward default right
+            if (fabsf(fvalue - e.first_fvalue) > rt_2eps){
+              if (need_forward) { 
+                c.SetSubstract(snode[nid].stats, e.stats);
+                if (c.sum_hess >= param.min_child_weight && e.stats.sum_hess >= param.min_child_weight) {
+                  bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+                  e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, false);
+                }
+              }
+              if (need_backward) {
+                cright.SetSubstract(e.stats_extra, e.stats);
+                c.SetSubstract(snode[nid].stats, cright);
+                if (c.sum_hess >= param.min_child_weight && cright.sum_hess >= param.min_child_weight) {
+                  bst_float loss_chg = static_cast<bst_float>(cright.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+                  e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, true);
+                }
+              }
+            }          
+            e.stats.Add(gpair, info, ridx);
+            e.first_fvalue = fvalue;  // in this pass first_fvalue holds the previous value seen
+          }
+        }
+      }
+    }    
     // enumerate the split values of specific feature
     inline void EnumerateSplit(const ColBatch::Entry *begin,
                                const ColBatch::Entry *end,
@@ -272,6 +406,38 @@ class ColMaker: public IUpdater {
         }
       }
     }
+    // update the split candidates kept in stemp with every column of one batch
+    virtual void UpdateSolution(const ColBatch &batch,                                
+                                const std::vector<bst_gpair> &gpair,
+                                const IFMatrix &fmat,
+                                const BoosterInfo &info) {
+      // nsize is the number of columns in the batch; param.parallel_option selects the strategy
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+      #if defined(_OPENMP)                                                                
+      const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
+      #endif
+      if (param.parallel_option == 0) {  // option 0: the loop over columns is shared between threads
+        #pragma omp parallel for schedule(dynamic, batch_size)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          const bst_uint fid = batch.col_index[i];
+          const int tid = omp_get_thread_num();
+          const ColBatch::Inst c = batch[i];
+          if (param.need_forward_search(fmat.GetColDensity(fid))) {
+            this->EnumerateSplit(c.data, c.data + c.length, +1, 
+                                 fid, gpair, info, stemp[tid]);
+          }
+          if (param.need_backward_search(fmat.GetColDensity(fid))) {  // same column, last entry first, step -1
+            this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1, 
+                                 fid, gpair, info, stemp[tid]);
+          }
+        }
+      } else {  // any other value: serial loop over columns, ParallelFindSplit opens its own parallel regions
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          this->ParallelFindSplit(batch[i], batch.col_index[i],
+                                  fmat, gpair, info);
+        }
+      }      
+    }
     // find splits at current level, do split per level
     inline void FindSplit(int depth,
                           const std::vector<int> &qexpand,
@@ -288,26 +454,7 @@ class ColMaker: public IUpdater {
       }
       utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
       while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
-        // start enumeration
-        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-        #if defined(_OPENMP)                                                                
-        const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
-        #endif
-        #pragma omp parallel for schedule(dynamic, batch_size)
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          const bst_uint fid = batch.col_index[i];
-          const int tid = omp_get_thread_num();
-          const ColBatch::Inst c = batch[i];
-          if (param.need_forward_search(p_fmat->GetColDensity(fid))) {            
-            this->EnumerateSplit(c.data, c.data + c.length, +1, 
-                                 fid, gpair, info, stemp[tid]);
-          }
-          if (param.need_backward_search(p_fmat->GetColDensity(fid))) {
-            this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1, 
-                                 fid, gpair, info, stemp[tid]);
-          }
-        }
+        this->UpdateSolution(iter->Value(), gpair, *p_fmat, info);
       }
       // after this each thread's stemp will get the best candidates, aggregate results
       for (size_t i = 0; i < qexpand.size(); ++i) {
@@ -325,6 +472,7 @@ class ColMaker: public IUpdater {
         }
       }
     }
+
     // reset position of each data points after split is created in the tree
     inline void ResetPosition(const std::vector<int> &qexpand, IFMatrix *p_fmat, const RegTree &tree) {
       const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
diff --git a/src/utils/io.h b/src/utils/io.h
index 23fa0d468746..141d83f8cb49 100644
--- a/src/utils/io.h
+++ b/src/utils/io.h
@@ -100,7 +100,6 @@ class ISeekStream: public IStream {
 /*! \brief implementation of file i/o stream */
 class FileStream : public ISeekStream {
  public:
-  explicit FileStream(void) {}
   explicit FileStream(FILE *fp) {
     this->fp = fp;
   }

From d0e27482efe2917c60977f46fedf2836e6e18744 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 29 Aug 2014 18:44:02 -0700
Subject: [PATCH 003/166] fix compiler error

---
 src/io/page_dmatrix-inl.hpp | 2 +-
 src/utils/io.h              | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index df43d3b7fdaa..82a373352f35 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -18,7 +18,7 @@ struct RowBatchPage {
     utils::Assert(data_ != NULL, "fail to allocate row batch page");
     this->Clear();
   }
-  ~BinaryPage(void) {
+  ~RowBatchPage(void) {
     if (data_ != NULL) delete [] data_;
   }
   /*! 
diff --git a/src/utils/io.h b/src/utils/io.h
index 141d83f8cb49..23fa0d468746 100644
--- a/src/utils/io.h
+++ b/src/utils/io.h
@@ -100,6 +100,7 @@ class ISeekStream: public IStream {
 /*! \brief implementation of file i/o stream */
 class FileStream : public ISeekStream {
  public:
+  explicit FileStream(void) {}
   explicit FileStream(FILE *fp) {
     this->fp = fp;
   }

From ce772c2f3e9cfcc06ea44287c6825a0e08f0efb0 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 29 Aug 2014 19:59:19 -0700
Subject: [PATCH 004/166] first check of page

---
 src/io/io.cpp               | 13 +++++++-
 src/io/page_dmatrix-inl.hpp | 65 +++++++++++++++++++++++++++++++++++--
 src/utils/io.h              |  8 +++--
 3 files changed, 81 insertions(+), 5 deletions(-)

diff --git a/src/io/io.cpp b/src/io/io.cpp
index f56cff6794a6..e8b9ce33772b 100644
--- a/src/io/io.cpp
+++ b/src/io/io.cpp
@@ -22,7 +22,13 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
     dmat->LoadBinary(fs, silent, fname);
     fs.Close();
     return dmat;
-  } 
+  }
+  if (magic == DMatrixPage::kMagic) {
+    DMatrixPage *dmat = new DMatrixPage();
+    dmat->Load(fs, silent, fname);
+    // the file pointer is now held by the page matrix, so fs is not closed here
+    return dmat;
+  }
   fs.Close();
  
   DMatrixSimple *dmat = new DMatrixSimple();
@@ -31,6 +37,11 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
 }
 
 void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
+  if (!strcmp(fname + strlen(fname) - 5, ".page")) {
+    
+    DMatrixPage::Save(fname, dmat, silent);
+    return;
+  }
   if (dmat.magic == DMatrixSimple::kMagic) {
     const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat);
     p_dmat->SaveBinary(fname, silent);
diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index 82a373352f35..4d13a0bc5a31 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -8,6 +8,8 @@
 #include "../data.h"
 #include "../utils/iterator.h"
 #include "../utils/thread_buffer.h"
+#include "./simple_fmatrix-inl.hpp"
+
 namespace xgboost {
 namespace io {
 /*! \brief page structure that can be used to store a rowbatch */
@@ -102,7 +104,7 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
     base_rowid_ = 0;
     isend_ = false;
   }
-  virtual ~ThreadRowPageIterator(void) {
+  virtual ~ThreadRowPageIterator(void) {    
   }
   virtual void Init(void) {
   }
@@ -188,7 +190,9 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
     inline void FreeSpace(PagePtr &a) {
       delete a;
     }
-    inline void Destroy(void) {}
+    inline void Destroy(void) {
+      fi.Close();
+    }
     inline void BeforeFirst(void) {
       fi.Seek(file_begin_);
     }
@@ -199,6 +203,63 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
   int ptop_;
   utils::ThreadBuffer<PagePtr,Factory> itr;
 };
+
+/*! \brief data matrix built from a ThreadRowPageIterator that is handed to an FMatrixS */
+class DMatrixPage : public DataMatrix {
+ public:
+  DMatrixPage(void) : DataMatrix(kMagic) {
+    iter_ = new ThreadRowPageIterator();
+    fmat_ = new FMatrixS(iter_);
+  }
+  // virtual destructor; NOTE(review): only fmat_ is deleted, presumably FMatrixS frees iter_ -- confirm
+  virtual ~DMatrixPage(void) {
+    delete fmat_;
+  }
+  virtual IFMatrix *fmat(void) const {
+    return fmat_;
+  }
+  /*! \brief read and check the magic number from fi, load info, then initialize the iterator with fi */
+  inline void Load(utils::FileStream &fi,
+                   bool silent = false,
+                   const char *fname = NULL){
+    int magic;
+    utils::Check(fi.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
+    utils::Check(magic == kMagic, "invalid format,magic number mismatch");    
+    this->info.LoadBinary(fi);
+    iter_->Load(fi);  // fi is passed on to the iterator and is not closed in this function
+    if (!silent) {
+      printf("DMatrixPage: %lux%lu matrix is loaded",  // NOTE(review): %lu assumes unsigned long -- confirm
+             info.num_row(), info.num_col());
+      if (fname != NULL) {
+        printf(" from %s\n", fname);
+      } else {
+        printf("\n");
+      }
+      if (info.group_ptr.size() != 0) {
+        printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
+      }
+    }
+  }
+  /*! \brief write magic number, info and the rows of mat to fname; any DataMatrix can be the source */
+  inline static void Save(const char* fname, const DataMatrix &mat, bool silent) {
+    utils::FileStream fs(utils::FopenCheck(fname, "wb"));
+    int magic = kMagic;
+    fs.Write(&magic, sizeof(magic));
+    mat.info.SaveBinary(fs);
+    ThreadRowPageIterator::Save(mat.fmat()->RowIterator(), fs);  // rows come from the source row iterator
+    fs.Close();
+    if (!silent) {
+      printf("DMatrixPage: %lux%lu is saved to %s\n",
+             mat.info.num_row(), mat.info.num_col(), fname);
+    }
+  }
+  /*! \brief the feature matrix returned by fmat(), constructed from iter_ */
+  FMatrixS *fmat_;
+  /*! \brief row iterator, receives the file stream in Load */
+  ThreadRowPageIterator *iter_;
+  /*! \brief magic number written first by Save and checked by Load */
+  static const int kMagic = 0xffffab02;
+};
 }  // namespace io
 }  // namespace xgboost
 #endif  // XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
diff --git a/src/utils/io.h b/src/utils/io.h
index 23fa0d468746..dbfcee3f6ddc 100644
--- a/src/utils/io.h
+++ b/src/utils/io.h
@@ -100,7 +100,9 @@ class ISeekStream: public IStream {
 /*! \brief implementation of file i/o stream */
 class FileStream : public ISeekStream {
  public:
-  explicit FileStream(void) {}
+  explicit FileStream(void) {
+    this->fp = NULL;
+  }
   explicit FileStream(FILE *fp) {
     this->fp = fp;
   }
@@ -117,7 +119,9 @@ class FileStream : public ISeekStream {
     return static_cast<size_t>(ftell(fp));
   }
   inline void Close(void) {
-    fclose(fp);
+    if (fp != NULL){
+      fclose(fp); fp = NULL;
+    }
   }
 
  private:

From 7bc1c3ee79e31afebafe149f96a456d3a2f0ec82 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 29 Aug 2014 20:54:24 -0700
Subject: [PATCH 005/166] various fix of page

---
 src/io/io.cpp                 |  7 ++++---
 src/io/page_dmatrix-inl.hpp   | 12 ++++++++----
 src/io/simple_dmatrix-inl.hpp |  2 +-
 src/io/simple_fmatrix-inl.hpp |  2 +-
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/io/io.cpp b/src/io/io.cpp
index e8b9ce33772b..e413b2799a8d 100644
--- a/src/io/io.cpp
+++ b/src/io/io.cpp
@@ -37,8 +37,7 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
 }
 
 void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
-  if (!strcmp(fname + strlen(fname) - 5, ".page")) {
-    
+  if (!strcmp(fname + strlen(fname) - 5, ".page")) {    
     DMatrixPage::Save(fname, dmat, silent);
     return;
   }
@@ -46,7 +45,9 @@ void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
     const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat);
     p_dmat->SaveBinary(fname, silent);
   } else {
-    utils::Error("not implemented");
+    DMatrixSimple smat;
+    smat.CopyFrom(dmat);
+    smat.SaveBinary(fname, silent);
   }
 }
 
diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index 4d13a0bc5a31..2701fb3b3bd6 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -32,7 +32,7 @@ struct RowBatchPage {
     const size_t dsize = row.length * sizeof(RowBatch::Entry);
     if (FreeBytes() < dsize+ sizeof(int)) return false;
     row_ptr(Size() + 1) = row_ptr(Size()) + row.length;    
-    memcpy(data_ptr(Size()) , row.data, dsize);
+    memcpy(data_ptr(row_ptr(Size())) , row.data, dsize);
     ++ data_[0];
     return true;    
   }
@@ -48,13 +48,18 @@ struct RowBatchPage {
     batch.data_ptr = this->data_ptr(0);
     batch.size = static_cast<size_t>(this->Size());
     std::vector<size_t> &rptr = *p_rptr;
-    rptr.resize(this->Size()+1);
+    rptr.resize(this->Size() + 1);
     for (size_t i = 0; i < rptr.size(); ++i) {
       rptr[i] = static_cast<size_t>(this->row_ptr(i));
     }
     batch.ind_ptr = &rptr[0];
     return batch;
   }
+  /*! \brief get i-th row from the batch */
+  inline RowBatch::Inst operator[](size_t i) {
+    return RowBatch::Inst(data_ptr(0) + row_ptr(i), 
+                          static_cast<bst_uint>(row_ptr(i+1) - row_ptr(i)));
+  }
   /*!
    * \brief clear the page, cleanup the content
    */
@@ -77,7 +82,7 @@ struct RowBatchPage {
     return data_[0];
   }
   /*! \brief page size 64 MB */
-  static const size_t kPageSize = 64 << 18;
+  static const size_t kPageSize = 64 << 8;
 
  private:
   /*! \return number of elements */
@@ -112,7 +117,6 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
     itr.BeforeFirst();
     isend_ = false;
     base_rowid_ = 0;
-    utils::Assert(this->LoadNextPage(), "ThreadRowPageIterator");
   }
   virtual bool Next(void) {
     if(!this->LoadNextPage()) return false;
diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp
index 47be8a41a96b..8d7064bdd1bb 100644
--- a/src/io/simple_dmatrix-inl.hpp
+++ b/src/io/simple_dmatrix-inl.hpp
@@ -44,8 +44,8 @@ class DMatrixSimple : public DataMatrix {
   }
   /*! \brief copy content data from source matrix */
   inline void CopyFrom(const DataMatrix &src) {
-    this->info = src.info;
     this->Clear();
+    this->info = src.info;
     // clone data content in thos matrix
     utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
     iter->BeforeFirst();
diff --git a/src/io/simple_fmatrix-inl.hpp b/src/io/simple_fmatrix-inl.hpp
index 86763a105986..f099eb1a94f0 100644
--- a/src/io/simple_fmatrix-inl.hpp
+++ b/src/io/simple_fmatrix-inl.hpp
@@ -150,7 +150,7 @@ class FMatrixS : public IFMatrix{
     iter_->BeforeFirst();
     while (iter_->Next()) {
       const RowBatch &batch = iter_->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
+      for (size_t i = 0; i < batch.size; ++i) {        
         if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
           buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
           RowBatch::Inst inst = batch[i];

From 9830674b75c954a6ca02546d60367fae9be1e6d9 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 29 Aug 2014 21:04:40 -0700
Subject: [PATCH 006/166] seems page is ok, try add col tmr

---
 src/io/page_dmatrix-inl.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index 2701fb3b3bd6..e16beb4b612c 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -82,7 +82,7 @@ struct RowBatchPage {
     return data_[0];
   }
   /*! \brief page size 64 MB */
-  static const size_t kPageSize = 64 << 8;
+  static const size_t kPageSize = 64 << 18;
 
  private:
   /*! \return number of elements */
@@ -104,7 +104,7 @@ struct RowBatchPage {
 class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
  public:
   ThreadRowPageIterator(void) {
-    itr.SetParam("buffer_size", "4");
+    itr.SetParam("buffer_size", "2");
     page_ = NULL;
     base_rowid_ = 0;
     isend_ = false;

From 366ac95ad331fe68970c39cb74e4fad95cde6045 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <workcrow@gmail.com>
Date: Fri, 29 Aug 2014 21:27:03 -0700
Subject: [PATCH 007/166] windows check

---
 .gitignore                  |  2 +-
 R-package/src/Makevars      |  5 +++--
 R-package/src/Makevars.win  |  5 +++--
 src/io/page_dmatrix-inl.hpp |  6 +++---
 src/utils/io.h              | 10 +++++-----
 wrapper/xgboost.py          |  4 ++--
 6 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/.gitignore b/.gitignore
index f1f9400abfc5..4551c79cc00a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@
 *.slo
 *.lo
 *.o
-
+*.page
 # Compiled Dynamic libraries
 *.so
 *.dylib
diff --git a/R-package/src/Makevars b/R-package/src/Makevars
index 7dfda4d57799..b0d3283b9665 100644
--- a/R-package/src/Makevars
+++ b/R-package/src/Makevars
@@ -5,9 +5,10 @@ CXX=`R CMD config CXX`
 CFLAGS=`R CMD config CFLAGS` 
 # expose these flags to R CMD SHLIB
 PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_ERROR_ -I$(PKGROOT)  $(SHLIB_OPENMP_CFLAGS)
-XGBFLAG= $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC  $(SHLIB_OPENMP_CFLAGS)
+PKG_CPPFLAGS+= $(SHLIB_PTHREAD_FLAGS)
+XGBFLAG= $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC  $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
 
-PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
+PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
 
 ifeq ($(no_omp),1)
 	PKG_CPPFLAGS += -DDISABLE_OPENMP 
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index 3df9891fc115..8f5f7ed98467 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -5,8 +5,9 @@ CXX=`Rcmd config CXX`
 CFLAGS=`Rcmd config CFLAGS` 
 # expose these flags to R CMD SHLIB
 PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_ERROR_ -I$(PKGROOT)  $(SHLIB_OPENMP_CFLAGS)
-XGBFLAG= $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC  $(SHLIB_OPENMP_CFLAGS)
-PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
+PKG_CPPFLAGS+= $(SHLIB_PTHREAD_FLAGS)
+XGBFLAG= $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC  $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
+PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
 
 ifeq ($(no_omp),1)
 	PKG_CPPFLAGS += -DDISABLE_OPENMP 
diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index e16beb4b612c..8db944c85345 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -50,13 +50,13 @@ struct RowBatchPage {
     std::vector<size_t> &rptr = *p_rptr;
     rptr.resize(this->Size() + 1);
     for (size_t i = 0; i < rptr.size(); ++i) {
-      rptr[i] = static_cast<size_t>(this->row_ptr(i));
+      rptr[i] = static_cast<size_t>(this->row_ptr(static_cast<int>(i)));
     }
     batch.ind_ptr = &rptr[0];
     return batch;
   }
   /*! \brief get i-th row from the batch */
-  inline RowBatch::Inst operator[](size_t i) {
+  inline RowBatch::Inst operator[](int i) {
     return RowBatch::Inst(data_ptr(0) + row_ptr(i), 
                           static_cast<bst_uint>(row_ptr(i+1) - row_ptr(i)));
   }
@@ -173,7 +173,7 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
   // loader factory for page
   struct Factory {
    public:
-    size_t file_begin_;
+    long file_begin_;
     utils::FileStream fi;
     Factory(void) {}
     inline void SetFile(const utils::FileStream &fi) {
diff --git a/src/utils/io.h b/src/utils/io.h
index dbfcee3f6ddc..276dd7312b11 100644
--- a/src/utils/io.h
+++ b/src/utils/io.h
@@ -92,9 +92,9 @@ class IStream {
 class ISeekStream: public IStream {
  public:
   /*! \brief seek to certain position of the file */
-  virtual void Seek(size_t pos) = 0;
+  virtual void Seek(long pos) = 0;
   /*! \brief tell the position of the stream */
-  virtual size_t Tell(void) = 0;
+  virtual long Tell(void) = 0;
 };
 
 /*! \brief implementation of file i/o stream */
@@ -112,11 +112,11 @@ class FileStream : public ISeekStream {
   virtual void Write(const void *ptr, size_t size) {
     fwrite(ptr, size, 1, fp);
   }
-  virtual void Seek(size_t pos) {
+  virtual void Seek(long pos) {
     fseek(fp, pos, SEEK_SET);
   }
-  virtual size_t Tell(void) {
-    return static_cast<size_t>(ftell(fp));
+  virtual long Tell(void) {
+    return ftell(fp);
   }
   inline void Close(void) {
     if (fp != NULL){
diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index adf59c829dcc..e4338e0cd27b 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -56,7 +56,7 @@ def __init__(self, data, label=None, missing=0.0, weight = None):
                         weight for each instances                        
         """
         # force into void_p, mac need to pass things in as void_p
-        if data == None:
+        if data is None:
             self.handle = None
             return
         if isinstance(data, str):
@@ -484,7 +484,7 @@ def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None
             feval:
     """
     bst = Booster(params, [dtrain]+[ d[0] for d in evals ] )
-    if obj == None:
+    if obj is None:
         for i in range(num_boost_round):
             bst.update( dtrain, i )
             if len(evals) != 0:

From 0a7cfb32c6f453482bdc8a8f5fa4f2e2db1308fc Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 31 Aug 2014 21:58:01 -0700
Subject: [PATCH 008/166] add fmatrix, fight tmr

---
 src/io/page_dmatrix-inl.hpp |  1 +
 src/io/page_fmatrix-inl.hpp | 75 +++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 src/io/page_fmatrix-inl.hpp

diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index 8db944c85345..23013b98b85d 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -9,6 +9,7 @@
 #include "../utils/iterator.h"
 #include "../utils/thread_buffer.h"
 #include "./simple_fmatrix-inl.hpp"
+#include "./page_fmatrix-inl.hpp"
 
 namespace xgboost {
 namespace io {
diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
new file mode 100644
index 000000000000..156cddb63277
--- /dev/null
+++ b/src/io/page_fmatrix-inl.hpp
@@ -0,0 +1,75 @@
+#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
+#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
+/*!
+ * \file page_fmatrix-inl.hpp
+ * sparse page manager for fmatrix
+ * \author Tianqi Chen
+ */
+#include "../data.h"
+#include "../utils/iterator.h"
+#include "../utils/thread_buffer.h"
+namespace xgboost {
+namespace io {
+
+class CSCMatrixManager {
+ public:
+  /*! \brief in memory page */
+  struct Page {
+   public:
+    /*! \brief initialize the page */
+    inline void Init(size_t size) {
+      buffer.resize(size);
+      num_entry = 0;
+      col_index.clear();
+      col_data.clear();
+    }    
+    /*! \brief number of used entries */
+    size_t num_entry;
+    /*! \brief column index */
+    std::vector<bst_uint> col_index;
+    /*! \brief column data */
+    std::vector<ColBatch::Inst> col_data;            
+    /*! \brief number of free entries */
+    inline size_t NumFreeEntry(void) const {
+      return buffer.size() - num_entry;
+    }
+    inline ColBatch::Entry* AllocEntry(size_t len) {
+      ColBatch::Entry *p_data = &buffer[0] + num_entry;
+      num_entry += len;
+      return p_data;
+    }
+    /*! \brief get underlying batch */
+    inline ColBatch GetBatch(void) const {
+      ColBatch batch; 
+      batch.col_index = &col_index[0];
+      batch.col_data  = &col_data[0];
+      return batch;
+    }
+   private:
+    /*! \brief buffer space, not to be changed since ready */
+    std::vector<ColBatch::Entry> buffer;
+  };
+  
+ private:
+  /*! \brief fill a page with */
+  inline bool Fill(size_t cidx, Page *p_page) {
+    size_t len = col_ptr_[cidx+1] - col_ptr_[cidx];
+    if (p_page->NumFreeEntry() < len) return false;
+    ColBatch::Entry *p_data = p_page->AllocEntry(len);
+    fi->Seek(col_ptr_[cidx]);
+    utils::Check(fi->Read(p_data, sizeof(ColBatch::Entry) * len) != 0,
+                 "invalid column buffer format");
+    p_page->col_data.push_back(ColBatch::Inst(p_data, len));
+    p_page->col_index.push_back(cidx);
+  }
+  /*! \brief size of data content */
+  size_t data_size_;
+  /*! \brief input stream */
+  utils::ISeekStream *fi;
+  /*! \brief column pointer of CSC format */
+  std::vector<size_t> col_ptr_;  
+};
+
+}  // namespace io
+}  // namespace xgboost
+#endif  // XGBOOST_IO_PAGE_FMATRIX_INL_HPP_

From e3153b976c589ce0d49c555188809b7c2ab160da Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 31 Aug 2014 22:25:30 -0700
Subject: [PATCH 009/166] chgs

---
 src/io/page_fmatrix-inl.hpp | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
index 156cddb63277..f077f0dde408 100644
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -17,12 +17,17 @@ class CSCMatrixManager {
   struct Page {
    public:
     /*! \brief initialize the page */
-    inline void Init(size_t size) {
+    explicit Page(size_t size) {
       buffer.resize(size);
+      col_index.reserve(10);
+      col_data.reserved(10);
+    }
+    /*! \brief clear the page */
+    inline void Clear(void) {
       num_entry = 0;
       col_index.clear();
       col_data.clear();
-    }    
+    }
     /*! \brief number of used entries */
     size_t num_entry;
     /*! \brief column index */
@@ -49,6 +54,33 @@ class CSCMatrixManager {
     /*! \brief buffer space, not to be changed since ready */
     std::vector<ColBatch::Entry> buffer;
   };
+  /*! \brief define type of page pointer */
+  typedef Page *PagePtr;
+  /*! \brief get column pointer */
+  const std::vector<size_t> &col_ptr(void) const {
+    return col_ptr_;
+  }
+  inline bool Init(void) {
+    return true;
+  }
+  inline void SetParam(const char *name, const char *val) {
+  }
+  inline bool LoadNext(PagePtr &val) {
+    
+  }
+  inline PagePtr Create(void) {
+    PagePtr a = new Page();
+    return a;
+  }
+  inline void FreeSpace(PagePtr &a) {
+    delete a;
+  }
+  inline void Destroy(void) {
+    fi.Close();
+  }
+  inline void BeforeFirst(void) {
+    fi.Seek(file_begin_);
+  }
   
  private:
   /*! \brief fill a page with */

From 7d1e9f06d43b2601963e91ebb6fa871c0dbf7426 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 1 Sep 2014 10:45:05 -0700
Subject: [PATCH 010/166] add fmatrix in, todo add buffer file

---
 src/io/page_dmatrix-inl.hpp |  20 +--
 src/io/page_fmatrix-inl.hpp | 248 +++++++++++++++++++++++++++++++++---
 src/utils/thread_buffer.h   |   3 +
 3 files changed, 237 insertions(+), 34 deletions(-)

diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index 23013b98b85d..01b7f8fc7a8f 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -108,7 +108,6 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
     itr.SetParam("buffer_size", "2");
     page_ = NULL;
     base_rowid_ = 0;
-    isend_ = false;
   }
   virtual ~ThreadRowPageIterator(void) {    
   }
@@ -116,11 +115,10 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
   }
   virtual void BeforeFirst(void) {
     itr.BeforeFirst();
-    isend_ = false;
     base_rowid_ = 0;
   }
   virtual bool Next(void) {
-    if(!this->LoadNextPage()) return false;
+    if(!itr.Next(page_)) return false;
     out_ = page_->GetRowBatch(&tmp_ptr_, base_rowid_);
     base_rowid_ += out_.size;
     return true;
@@ -154,21 +152,12 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
     if (page.Size() != 0) page.Save(fo);
   }
  private:
-  // load in next page
-  inline bool LoadNextPage(void) {
-    ptop_ = 0;
-    bool ret = itr.Next(page_);
-    isend_ = !ret;
-    return ret;
-  }
   // base row id
   size_t base_rowid_;
   // temporal ptr
   std::vector<size_t> tmp_ptr_;
   // output data
   RowBatch out_;
-  // whether we reach end of file
-  bool isend_;
   // page pointer type
   typedef RowBatchPage* PagePtr;
   // loader factory for page
@@ -205,7 +194,6 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
 
  protected:
   PagePtr page_;
-  int ptop_;
   utils::ThreadBuffer<PagePtr,Factory> itr;
 };
 
@@ -234,7 +222,8 @@ class DMatrixPage : public DataMatrix {
     iter_->Load(fi);
     if (!silent) {
       printf("DMatrixPage: %lux%lu matrix is loaded",
-             info.num_row(), info.num_col());
+             static_cast<unsigned long>(info.num_row()),
+             static_cast<unsigned long>(info.num_col()));
       if (fname != NULL) {
         printf(" from %s\n", fname);
       } else {
@@ -255,7 +244,8 @@ class DMatrixPage : public DataMatrix {
     fs.Close();
     if (!silent) {
       printf("DMatrixPage: %lux%lu is saved to %s\n",
-             mat.info.num_row(), mat.info.num_col(), fname);
+             static_cast<unsigned long>(mat.info.num_row()),
+             static_cast<unsigned long>(mat.info.num_col()), fname);
     }
   }
   /*! \brief the real fmatrix */
diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
index f077f0dde408..cf4923b7b9ba 100644
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -20,7 +20,7 @@ class CSCMatrixManager {
     explicit Page(size_t size) {
       buffer.resize(size);
       col_index.reserve(10);
-      col_data.reserved(10);
+      col_data.reserve(10);
     }
     /*! \brief clear the page */
     inline void Clear(void) {
@@ -57,49 +57,259 @@ class CSCMatrixManager {
   /*! \brief define type of page pointer */
   typedef Page *PagePtr;
   /*! \brief get column pointer */
-  const std::vector<size_t> &col_ptr(void) const {
+  inline const std::vector<size_t> &col_ptr(void) const {
     return col_ptr_;
   }
-  inline bool Init(void) {
-    return true;
-  }
   inline void SetParam(const char *name, const char *val) {
-  }
-  inline bool LoadNext(PagePtr &val) {
-    
   }
   inline PagePtr Create(void) {
-    PagePtr a = new Page();
-    return a;
+    return new Page(page_size_);
   }
   inline void FreeSpace(PagePtr &a) {
     delete a;
   }
   inline void Destroy(void) {
-    fi.Close();
   }
   inline void BeforeFirst(void) {
-    fi.Seek(file_begin_);
+    col_index_ = col_todo_;
+    read_top_ = 0;
+  }
+  inline bool LoadNext(PagePtr &val) {
+    val->Clear();
+    if (read_top_ >= col_index_.size()) return false;
+    while (read_top_ < col_index_.size()) {
+      if (!this->TryFill(col_index_[read_top_], val)) return true;
+      ++read_top_;
+    }
+    return true;
+  }
+  inline bool Init(void) {
+    this->BeforeFirst();
+    return true;
+  }
+  inline void Setup(utils::ISeekStream *fi, double page_ratio) {
+    fi_ = fi;
+    fi_->Read(&begin_meta_ , sizeof(size_t));
+    fi_->Seek(begin_meta_);
+    fi_->Read(&col_ptr_);
+    size_t psmax = 0;
+    for (size_t i = 0; i < col_ptr_.size() - 1; ++i) {
+      psmax = std::max(psmax, col_ptr_[i+1] - col_ptr_[i]);
+    }
+    utils::Check(page_ratio >= 1.0f, "col_page_ratio must be at least 1");
+    page_size_ = std::max(static_cast<size_t>(psmax * page_ratio), psmax);    
+  }
+  inline void SetColSet(const std::vector<bst_uint> &cset, bool setall) {
+    if (!setall) {
+      col_todo_.resize(cset.size());
+      for (size_t i = 0; i < cset.size(); ++i) {
+        col_todo_[i] = cset[i];
+        utils::Assert(col_todo_[i] < static_cast<bst_uint>(col_ptr_.size() - 1),
+                      "CSCMatrixManager: column index exceed bound");
+      }
+      std::sort(col_todo_.begin(), col_todo_.end());
+    } else {
+      col_todo_.resize(col_ptr_.size()-1);
+      for (size_t i = 0; i < col_todo_.size(); ++i) {
+        col_todo_[i] = static_cast<bst_uint>(i);
+      }
+    }
   }
-  
  private:
   /*! \brief fill a page with */
-  inline bool Fill(size_t cidx, Page *p_page) {
+  inline bool TryFill(size_t cidx, Page *p_page) {
     size_t len = col_ptr_[cidx+1] - col_ptr_[cidx];
     if (p_page->NumFreeEntry() < len) return false;
     ColBatch::Entry *p_data = p_page->AllocEntry(len);
-    fi->Seek(col_ptr_[cidx]);
-    utils::Check(fi->Read(p_data, sizeof(ColBatch::Entry) * len) != 0,
+    fi_->Seek(col_ptr_[cidx]);
+    utils::Check(fi_->Read(p_data, sizeof(ColBatch::Entry) * len) != 0,
                  "invalid column buffer format");
     p_page->col_data.push_back(ColBatch::Inst(p_data, len));
     p_page->col_index.push_back(cidx);
   }
+  // the following are in memory auxiliary data structure
+  /*! \brief top of reader position */
+  size_t read_top_;
+  /*! \brief size of page */
+  size_t page_size_;
+  /*! \brief column index to be loaded */
+  std::vector<bst_uint> col_index_;
+  /*! \brief column index to be after calling before first */
+  std::vector<bst_uint> col_todo_;
+  // the following are input content
   /*! \brief size of data content */
-  size_t data_size_;
+  size_t begin_meta_;
   /*! \brief input stream */
-  utils::ISeekStream *fi;
+  utils::ISeekStream *fi_;
   /*! \brief column pointer of CSC format */
-  std::vector<size_t> col_ptr_;  
+  std::vector<size_t> col_ptr_;
+};
+
+class ThreadColPageIterator : public utils::IIterator<ColBatch> {
+ public:
+  ThreadColPageIterator(void) {
+    itr_.SetParam("buffer_size", "2");
+    page_ = NULL;
+    fi_ = NULL;
+    silent = 0;
+  }
+  virtual ~ThreadColPageIterator(void) {
+    if (fi_ != NULL) {
+      fi_->Close(); delete fi_;
+    }
+  }
+  virtual void Init(void) {
+    fi_ = new utils::FileStream(utils::FopenCheck(col_pagefile_.c_str(), "rb"));
+    itr_.get_factory().Setup(fi_, col_pageratio_);
+    if (silent == 0) {
+      printf("ThreadColPageIterator: finish initialzing from %s, %u columns\n",
+             col_pagefile_.c_str(), static_cast<unsigned>(col_ptr().size() - 1));
+    }
+  }
+  virtual void SetParam(const char *name, const char *val) {
+    if (!strcmp("col_pageratio", val)) col_pageratio_ = atof(val);
+    if (!strcmp("col_pagefile", val)) col_pagefile_ = val;
+    if (!strcmp("silent", val)) silent = atoi(val);
+  }
+  virtual void BeforeFirst(void) {
+    itr_.BeforeFirst();
+  } 
+  virtual bool Next(void) {
+    if(!itr_.Next(page_)) return false;
+    out_ = page_->GetBatch();
+    return true;
+  }
+  virtual const ColBatch &Value(void) const{
+    return out_;
+  }
+  inline const std::vector<size_t> &col_ptr(void) const {
+    return itr_.get_factory().col_ptr();
+  }
+  inline void SetColSet(const std::vector<bst_uint> &cset, bool setall = false) {
+    itr_.get_factory().SetColSet(cset, setall);
+  }
+
+ private:
+  // shutup
+  int silent;
+  // input file
+  utils::FileStream *fi_;
+  // size of page
+  float col_pageratio_;
+  // name of file
+  std::string col_pagefile_;
+  // output data
+  ColBatch out_;
+  // page to be loaded
+  CSCMatrixManager::PagePtr page_;
+  // internal iterator
+  utils::ThreadBuffer<CSCMatrixManager::PagePtr,CSCMatrixManager> itr_;
+};
+
+/*!
+ * \brief sparse matrix that support column access
+ */
+class FMatrixPage : public IFMatrix {
+ public:
+  /*! \brief constructor */
+  FMatrixPage(utils::IIterator<RowBatch> *iter) {
+    this->row_iter_ = iter;
+    this->col_iter_ = NULL;
+  }
+  // destructor
+  virtual ~FMatrixPage(void) {
+    if (row_iter_ != NULL) delete row_iter_;
+    if (col_iter_ != NULL) delete col_iter_;
+  }
+  /*! \return whether column access is enabled */
+  virtual bool HaveColAccess(void) const {
+    return col_iter_ != NULL;
+  }
+  /*! \brief get number of colmuns */
+  virtual size_t NumCol(void) const {
+    utils::Check(this->HaveColAccess(), "NumCol:need column access");
+    return col_iter_->col_ptr().size() - 1;
+  }
+  /*! \brief get number of buffered rows */
+  virtual const std::vector<bst_uint> &buffered_rowset(void) const {
+    return buffered_rowset_;
+  }
+  /*! \brief get column size */
+  virtual size_t GetColSize(size_t cidx) const {
+    const std::vector<size_t> &col_ptr = col_iter_->col_ptr();
+    return col_ptr[cidx+1] - col_ptr[cidx];
+  }
+  /*! \brief get column density */
+  virtual float GetColDensity(size_t cidx) const {
+    const std::vector<size_t> &col_ptr = col_iter_->col_ptr();
+    size_t nmiss = buffered_rowset_.size() - (col_ptr[cidx+1] - col_ptr[cidx]);
+    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
+  }
+  virtual void InitColAccess(float pkeep = 1.0f) {
+    if (this->HaveColAccess()) return;
+    this->InitColData(pkeep);
+  }
+  /*!
+   * \brief get the row iterator associated with FMatrix
+   */
+  virtual utils::IIterator<RowBatch>* RowIterator(void) {
+    row_iter_->BeforeFirst();
+    return row_iter_;
+  }
+  /*!
+   * \brief get the column based  iterator
+   */
+  virtual utils::IIterator<ColBatch>* ColIterator(void) {
+    std::vector<bst_uint> cset;
+    col_iter_->SetColSet(cset, true);
+    col_iter_->BeforeFirst();
+    return col_iter_;
+  }
+  /*!
+   * \brief colmun based iterator
+   */
+  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
+    col_iter_->SetColSet(fset, false);
+    col_iter_->BeforeFirst();
+    return col_iter_;
+  }
+  
+ protected:
+  /*!
+   * \brief intialize column data
+   * \param pkeep probability to keep a row
+   */
+  inline void InitColData(float pkeep) {
+    buffered_rowset_.clear();    
+    // start working
+    row_iter_->BeforeFirst();
+    while (row_iter_->Next()) {
+      const RowBatch &batch = row_iter_->Value();
+      for (size_t i = 0; i < batch.size; ++i) {
+      }
+    }
+    row_iter_->BeforeFirst();
+    size_t ktop = 0;
+    while (row_iter_->Next()) {
+      const RowBatch &batch = row_iter_->Value();
+      for (size_t i = 0; i < batch.size; ++i) {
+        if (ktop < buffered_rowset_.size() &&
+            buffered_rowset_[ktop] == batch.base_rowid+i) {
+          ++ktop;
+          // TODO1
+        }
+      }
+    }
+    // sort columns
+  }
+
+ private:
+  // row iterator
+  utils::IIterator<RowBatch> *row_iter_;
+  // column iterator
+  ThreadColPageIterator *col_iter_;
+  /*! \brief list of row index that are buffered */
+  std::vector<bst_uint> buffered_rowset_;
 };
 
 }  // namespace io
diff --git a/src/utils/thread_buffer.h b/src/utils/thread_buffer.h
index fa488a2204c7..ace50c4b8903 100644
--- a/src/utils/thread_buffer.h
+++ b/src/utils/thread_buffer.h
@@ -113,6 +113,9 @@ class ThreadBuffer {
   inline ElemFactory &get_factory(void) {
     return factory;
   }
+  inline const ElemFactory &get_factory(void) const{
+    return factory;
+  }
   // size of buffer
   int  buf_size;
  private:

From 9d3e09ff2a3fded1dbd1204ddb0f0722955ff24d Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 1 Sep 2014 20:44:15 -0700
Subject: [PATCH 011/166] make rowbatch page flexible

---
 src/io/page_dmatrix-inl.hpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index 01b7f8fc7a8f..76767d9423b5 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -16,7 +16,7 @@ namespace io {
 /*! \brief page structure that can be used to store a rowbatch */
 struct RowBatchPage {
  public:
-  RowBatchPage(void)  {
+  RowBatchPage(size_t page_size) : kPageSize(page_size) {
     data_ = new int[kPageSize];
     utils::Assert(data_ != NULL, "fail to allocate row batch page");
     this->Clear();
@@ -82,8 +82,6 @@ struct RowBatchPage {
   inline int Size(void) const {
     return data_[0];
   }
-  /*! \brief page size 64 MB */
-  static const size_t kPageSize = 64 << 18;
 
  private:
   /*! \return number of elements */
@@ -98,6 +96,8 @@ struct RowBatchPage {
   inline RowBatch::Entry* data_ptr(int i) {
     return (RowBatch::Entry*)(&data_[1]) + i;
   }
+  // page size
+  const size_t kPageSize;
   // content of data
   int *data_;  
 };
@@ -137,7 +137,7 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
    */
   inline static void Save(utils::IIterator<RowBatch> *iter,
                           utils::IStream &fo) {
-    RowBatchPage page;
+    RowBatchPage page(kPageSize);
     iter->BeforeFirst();
     while (iter->Next()) {
       const RowBatch &batch = iter->Value();
@@ -151,6 +151,8 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
     }
     if (page.Size() != 0) page.Save(fo);
   }
+  /*! \brief page size 64 MB */
+  static const size_t kPageSize = 64 << 18;
  private:
   // base row id
   size_t base_rowid_;
@@ -178,7 +180,7 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
       return val->Load(fi);
     }
     inline PagePtr Create(void) {
-      PagePtr a = new RowBatchPage();
+      PagePtr a = new RowBatchPage(kPageSize);
       return a;
     }
     inline void FreeSpace(PagePtr &a) {

From e43bb9118541c3ad15adba958b2d7b3d5a885087 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 1 Sep 2014 21:30:03 -0700
Subject: [PATCH 012/166] add matrix builder

---
 src/io/page_fmatrix-inl.hpp |   7 +--
 src/utils/matrix_csr.h      | 112 ++++++++++++++++++++++++++++++++++++
 2 files changed, 115 insertions(+), 4 deletions(-)

diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
index cf4923b7b9ba..b2ce76faf85c 100644
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -121,7 +121,7 @@ class CSCMatrixManager {
     size_t len = col_ptr_[cidx+1] - col_ptr_[cidx];
     if (p_page->NumFreeEntry() < len) return false;
     ColBatch::Entry *p_data = p_page->AllocEntry(len);
-    fi_->Seek(col_ptr_[cidx]);
+    fi_->Seek(col_ptr_[cidx] * sizeof(ColBatch::Entry) + sizeof(size_t));
     utils::Check(fi_->Read(p_data, sizeof(ColBatch::Entry) * len) != 0,
                  "invalid column buffer format");
     p_page->col_data.push_back(ColBatch::Inst(p_data, len));
@@ -285,8 +285,7 @@ class FMatrixPage : public IFMatrix {
     row_iter_->BeforeFirst();
     while (row_iter_->Next()) {
       const RowBatch &batch = row_iter_->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-      }
+      
     }
     row_iter_->BeforeFirst();
     size_t ktop = 0;
@@ -294,7 +293,7 @@ class FMatrixPage : public IFMatrix {
       const RowBatch &batch = row_iter_->Value();
       for (size_t i = 0; i < batch.size; ++i) {
         if (ktop < buffered_rowset_.size() &&
-            buffered_rowset_[ktop] == batch.base_rowid+i) {
+            buffered_rowset_[ktop] == batch.base_rowid + i) {
           ++ktop;
           // TODO1
         }
diff --git a/src/utils/matrix_csr.h b/src/utils/matrix_csr.h
index 0f3b20a14a49..44a3b88186fe 100644
--- a/src/utils/matrix_csr.h
+++ b/src/utils/matrix_csr.h
@@ -7,6 +7,7 @@
  */
 #include <vector>
 #include <algorithm>
+#include "./io.h"
 #include "./utils.h"
 
 namespace xgboost {
@@ -118,6 +119,117 @@ struct SparseCSRMBuilder {
   }
 };
 
+/*!
+ * \brief a class used to help construct CSR format matrix file
+ * \tparam IndexType type of index used to store the index position
+ * \tparam SizeType type of size used in row pointer
+ */
+template<typename IndexType, typename SizeType = size_t>
+struct SparseCSRFileBuilder {
+ public:
+  explicit SparseCSRFileBuilder(utils::ISeekStream *fo, size_t buffer_size) 
+      : fo(fo), buffer_size(buffer_size) {
+  }
+  /*!
+   * \brief step 1: initialize the number of rows in the data, not necessary exact
+   * \nrows number of rows in the matrix, can be smaller than expected
+   */
+  inline void InitBudget(size_t nrows = 0) {
+    rptr.clear();
+    rptr.resize(nrows + 1, 0);
+  }
+  /*!
+   * \brief step 2: add budget to each rows
+   * \param row_id the id of the row
+   * \param nelem  number of element budget add to this row
+   */
+  inline void AddBudget(size_t row_id, SizeType nelem = 1) {
+    if (rptr.size() < row_id + 2) {
+      rptr.resize(row_id + 2, 0);
+    }
+    rptr[row_id + 1] += nelem;
+  }
+  /*! \brief step 3: initialize the necessary storage */
+  inline void InitStorage(void) {
+    SizeType nelem = 0;
+    for (size_t i = 1; i < rptr.size(); i++) {
+      nelem += rptr[i];
+      rptr[i] = nelem;
+    }
+    SizeType begin_meta = sizeof(SizeType) + nelem * sizeof(IndexType);
+    fo->Seek(0);
+    fo->Write(&begin_meta, sizeof(begin_meta));
+    fo->Seek(begin_meta);
+    fo->Write(rptr);
+    // setup buffer space
+    buffer_rptr.resize(rptr.size());
+    buffer.reserve(buffer_size);
+    buffer_data.resize(buffer_size);
+    saved_offset.clear();
+    saved_offset.resize(rptr.size() - 1, 0);
+    this->ClearBuffer();
+  }
+  /*! \brief step 4: push element into buffer */
+  inline void PushElem(SizeType row_id, IndexType col_id) {
+    if (buffer_temp.size() == buffer_size) {
+      this->WriteBuffer();
+      this->ClearBuffer();
+    }
+    buffer_temp.push_back(std::make_pair(row_id, col_id));    
+  }
+  /*! \brief finalize the construction */
+  inline void Finalize(void) {
+    this->WriteBuffer();
+    for (size_t i = 0; i < saved_offset.size(); ++i) {
+      utils::Assert(saved_offset[i] == rptr[i+1], "some block not write out");
+    }
+  }
+  
+ protected:
+  inline void WriteBuffer(void) {
+    SizeType start = 0;
+    for (size_t i = 1; i < buffer_rptr.size(); ++i) {
+      size_t rlen = buffer_rptr[i];
+      buffer_rptr[i] = start;
+      start += rlen;
+    }
+    for (size_t i = 0; i < buffer_temp.size(); ++i) {
+      SizeType &rp = buffer_rptr[buffer_temp[i].first + 1];
+      buffer_data[rp++] = buffer_temp[i].second;
+    }
+    // write out
+    for (size_t i = 0; i < buffer_rptr.size(); ++i) {
+      size_t nelem = buffer_rptr[i+1] - buffer_rptr[i];
+      if (nelem != 0) {
+        utils::Assert(saved_offset[i] < rptr[i+1], "data exceed bound");
+        fo->Seek((rptr[i] + saved_offset[i]) * sizeof(IndexType) + sizeof(SizeType));
+        fo->Write(&buffer_data[0] + buffer_rptr[i], nelem * sizeof(IndexType));
+        saved_offset[i] += nelem;
+      }
+    }
+  }
+  inline void ClearBuffer(void) {
+    buffer_temp.clear();
+    std::fill(buffer_rptr.begin(), buffer_rptr.end(), 0);
+  }
+ private:
+  /*! \brief output file pointer the data */
+  utils::ISeekStream *fo;
+  /*! \brief pointer to each of the row */
+  std::vector<SizeType> rptr;
+  /*! \brief saved top space of each item */
+  std::vector<SizeType> saved_offset;
+  // ----- the following are buffer space
+  /*! \brief maximum size of content buffer*/
+  size_t buffer_size;
+  /*! \brief store the data content */
+  std::vector< std::pair<SizeType, IndexType> > buffer_temp;
+  /*! \brief saved top space of each item */
+  std::vector<SizeType> buffer_rptr;
+  /*! \brief saved top space of each item */
+  std::vector<IndexType> buffer_data;
+};
+
 }  // namespace utils
 }  // namespace xgboost
 #endif

From 4b9aeea89c4c6fba24a8e0d487df65babb67392f Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 2 Sep 2014 13:14:54 -0700
Subject: [PATCH 013/166] finish the fmatrix

---
 src/io/page_dmatrix-inl.hpp |  18 +++---
 src/io/page_fmatrix-inl.hpp | 108 +++++++++++++++++++++++-------------
 src/utils/io.h              |   1 -
 src/utils/matrix_csr.h      |  34 ++++++++++--
 4 files changed, 106 insertions(+), 55 deletions(-)

diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index 76767d9423b5..83c7455995ff 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -223,16 +223,16 @@ class DMatrixPage : public DataMatrix {
     this->info.LoadBinary(fi);
     iter_->Load(fi);
     if (!silent) {
-      printf("DMatrixPage: %lux%lu matrix is loaded",
-             static_cast<unsigned long>(info.num_row()),
-             static_cast<unsigned long>(info.num_col()));
+      utils::Printf("DMatrixPage: %lux%lu matrix is loaded",
+                    static_cast<unsigned long>(info.num_row()),
+                    static_cast<unsigned long>(info.num_col()));
       if (fname != NULL) {
-        printf(" from %s\n", fname);
+        utils::Printf(" from %s\n", fname);
       } else {
-        printf("\n");
+        utils::Printf("\n");
       }
       if (info.group_ptr.size() != 0) {
-        printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
+        utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
       }
     }
   }
@@ -245,9 +245,9 @@ class DMatrixPage : public DataMatrix {
     ThreadRowPageIterator::Save(mat.fmat()->RowIterator(), fs);
     fs.Close();
     if (!silent) {
-      printf("DMatrixPage: %lux%lu is saved to %s\n",
-             static_cast<unsigned long>(mat.info.num_row()),
-             static_cast<unsigned long>(mat.info.num_col()), fname);
+      utils::Printf("DMatrixPage: %lux%lu is saved to %s\n",
+                    static_cast<unsigned long>(mat.info.num_row()),
+                    static_cast<unsigned long>(mat.info.num_col()), fname);
     }
   }
   /*! \brief the real fmatrix */
diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
index b2ce76faf85c..7e9903be4aa7 100644
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -7,10 +7,11 @@
  */
 #include "../data.h"
 #include "../utils/iterator.h"
+#include "../utils/io.h"
+#include "../utils/matrix_csr.h"
 #include "../utils/thread_buffer.h"
 namespace xgboost {
 namespace io {
-
 class CSCMatrixManager {
  public:
   /*! \brief in memory page */
@@ -56,6 +57,10 @@ class CSCMatrixManager {
   };
   /*! \brief define type of page pointer */
   typedef Page *PagePtr;
+  // constructor
+  CSCMatrixManager(void) {
+    fi_ = NULL;
+  }
   /*! \brief get column pointer */
   inline const std::vector<size_t> &col_ptr(void) const {
     return col_ptr_;
@@ -89,7 +94,8 @@ class CSCMatrixManager {
   }
   inline void Setup(utils::ISeekStream *fi, double page_ratio) {
     fi_ = fi;
-    fi_->Read(&begin_meta_ , sizeof(size_t));
+    fi_->Read(&begin_meta_ , sizeof(begin_meta_));
+    begin_data_ = static_cast<size_t>(fi->Tell());
     fi_->Seek(begin_meta_);
     fi_->Read(&col_ptr_);
     size_t psmax = 0;
@@ -121,7 +127,7 @@ class CSCMatrixManager {
     size_t len = col_ptr_[cidx+1] - col_ptr_[cidx];
     if (p_page->NumFreeEntry() < len) return false;
     ColBatch::Entry *p_data = p_page->AllocEntry(len);
-    fi_->Seek(col_ptr_[cidx] * sizeof(ColBatch::Entry) + sizeof(size_t));
+    fi_->Seek(col_ptr_[cidx] * sizeof(ColBatch::Entry) + begin_data_);
     utils::Check(fi_->Read(p_data, sizeof(ColBatch::Entry) * len) != 0,
                  "invalid column buffer format");
     p_page->col_data.push_back(ColBatch::Inst(p_data, len));
@@ -137,6 +143,8 @@ class CSCMatrixManager {
   /*! \brief column index to be after calling before first */
   std::vector<bst_uint> col_todo_;
   // the following are input content
+  /*! \brief beginning position of data content */
+  size_t begin_data_;
   /*! \brief size of data content */
   size_t begin_meta_;
   /*! \brief input stream */
@@ -147,36 +155,25 @@ class CSCMatrixManager {
 
 class ThreadColPageIterator : public utils::IIterator<ColBatch> {
  public:
-  ThreadColPageIterator(void) {
+  explicit ThreadColPageIterator(utils::ISeekStream *fi,
+                                 float page_ratio, bool silent) {
     itr_.SetParam("buffer_size", "2");
-    page_ = NULL;
-    fi_ = NULL;
-    silent = 0;
-  }
-  virtual ~ThreadColPageIterator(void) {
-    if (fi_ != NULL) {
-      fi_->Close(); delete fi_;
+    itr_.get_factory().Setup(fi, page_ratio);
+    if (!silent) {
+      utils::Printf("ThreadColPageIterator: finish initialzing, %u columns\n",
+                    static_cast<unsigned>(col_ptr().size() - 1));
     }
   }
-  virtual void Init(void) {
-    fi_ = new utils::FileStream(utils::FopenCheck(col_pagefile_.c_str(), "rb"));
-    itr_.get_factory().Setup(fi_, col_pageratio_);
-    if (silent == 0) {
-      printf("ThreadColPageIterator: finish initialzing from %s, %u columns\n",
-             col_pagefile_.c_str(), static_cast<unsigned>(col_ptr().size() - 1));
-    }
-  }
-  virtual void SetParam(const char *name, const char *val) {
-    if (!strcmp("col_pageratio", val)) col_pageratio_ = atof(val);
-    if (!strcmp("col_pagefile", val)) col_pagefile_ = val;
-    if (!strcmp("silent", val)) silent = atoi(val);
+  virtual ~ThreadColPageIterator(void) {
   }
   virtual void BeforeFirst(void) {
     itr_.BeforeFirst();
   } 
   virtual bool Next(void) {
-    if(!itr_.Next(page_)) return false;
-    out_ = page_->GetBatch();
+    // page to be loaded
+    CSCMatrixManager::PagePtr page;
+    if(!itr_.Next(page)) return false;
+    out_ = page->GetBatch();
     return true;
   }
   virtual const ColBatch &Value(void) const{
@@ -190,18 +187,8 @@ class ThreadColPageIterator : public utils::IIterator<ColBatch> {
   }
 
  private:
-  // shutup
-  int silent;
-  // input file
-  utils::FileStream *fi_;
-  // size of page
-  float col_pageratio_;
-  // name of file
-  std::string col_pagefile_;
   // output data
   ColBatch out_;
-  // page to be loaded
-  CSCMatrixManager::PagePtr page_;
   // internal iterator
   utils::ThreadBuffer<CSCMatrixManager::PagePtr,CSCMatrixManager> itr_;
 };
@@ -212,14 +199,18 @@ class ThreadColPageIterator : public utils::IIterator<ColBatch> {
 class FMatrixPage : public IFMatrix {
  public:
   /*! \brief constructor */
-  FMatrixPage(utils::IIterator<RowBatch> *iter) {
+  FMatrixPage(utils::IIterator<RowBatch> *iter, std::string fname_buffer) {
     this->row_iter_ = iter;
     this->col_iter_ = NULL;
+    this->fi_ = NULL;
   }
   // destructor
   virtual ~FMatrixPage(void) {
     if (row_iter_ != NULL) delete row_iter_;
     if (col_iter_ != NULL) delete col_iter_;
+    if (fi_ != NULL) {
+      fi_->Close(); delete fi_;
+    } 
   }
   /*! \return whether column access is enabled */
   virtual bool HaveColAccess(void) const {
@@ -275,18 +266,44 @@ class FMatrixPage : public IFMatrix {
   }
   
  protected:
+  /*!
+   * \brief try load column data from file
+   */
+  inline bool LoadColData(void) {
+    FILE *fp = fopen64(fname_cbuffer_.c_str(), "rb");
+    if (fp == NULL) return false;
+    fi_ = new utils::FileStream(fp);
+    static_cast<utils::IStream*>(fi_)->Read(&buffered_rowset_);
+    col_iter_ = new ThreadColPageIterator(fi_, 2.0f, false);    
+    return true;
+  }
   /*!
    * \brief intialize column data
    * \param pkeep probability to keep a row
    */
   inline void InitColData(float pkeep) {
-    buffered_rowset_.clear();    
+    buffered_rowset_.clear();
+    utils::FileStream fo(utils::FopenCheck(fname_cbuffer_.c_str(), "wb+"));
+    // use 64M buffer
+    utils::SparseCSRFileBuilder<ColBatch::Entry> builder(&fo, 64<<20);
+    
     // start working
     row_iter_->BeforeFirst();
     while (row_iter_->Next()) {
       const RowBatch &batch = row_iter_->Value();
-      
+      for (size_t i = 0; i < batch.size; ++i) { 
+        if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
+          buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
+          RowBatch::Inst inst = batch[i];
+          for (bst_uint j = 0; j < inst.length; ++j) {
+            builder.AddBudget(inst[j].index);
+          }
+        }
+      }
     }
+    // write buffered rowset
+    static_cast<utils::IStream*>(&fo)->Write(buffered_rowset_);
+    builder.InitStorage();
     row_iter_->BeforeFirst();
     size_t ktop = 0;
     while (row_iter_->Next()) {
@@ -295,11 +312,18 @@ class FMatrixPage : public IFMatrix {
         if (ktop < buffered_rowset_.size() &&
             buffered_rowset_[ktop] == batch.base_rowid + i) {
           ++ktop;
-          // TODO1
+          RowBatch::Inst inst = batch[i];
+          for (bst_uint j = 0; j < inst.length; ++j) {
+            builder.PushElem(inst[j].index,
+                             ColBatch::Entry((bst_uint)(batch.base_rowid+i),
+                                             inst[j].fvalue));
+          }
         }
       }
     }
-    // sort columns
+    builder.Finalize();
+    builder.SortRows(ColBatch::Entry::CmpValue, 5);
+    fo.Close();
   }
 
  private:
@@ -307,6 +331,10 @@ class FMatrixPage : public IFMatrix {
   utils::IIterator<RowBatch> *row_iter_;
   // column iterator
   ThreadColPageIterator *col_iter_;
+  // file pointer to data
+  utils::FileStream *fi_;
+  // file name of column buffer
+  std::string fname_cbuffer_;
   /*! \brief list of row index that are buffered */
   std::vector<bst_uint> buffered_rowset_;
 };
diff --git a/src/utils/io.h b/src/utils/io.h
index d98b3e4dc7df..37f489955c4e 100644
--- a/src/utils/io.h
+++ b/src/utils/io.h
@@ -125,7 +125,6 @@ class FileStream : public ISeekStream {
  private:
   FILE *fp;
 };
-
 }  // namespace utils
 }  // namespace xgboost
 #endif
diff --git a/src/utils/matrix_csr.h b/src/utils/matrix_csr.h
index b2768b2eafbb..e4c410511e1e 100644
--- a/src/utils/matrix_csr.h
+++ b/src/utils/matrix_csr.h
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include "./io.h"
 #include "./utils.h"
+#include "./omp.h"
 
 namespace xgboost {
 namespace utils {
@@ -155,9 +156,9 @@ struct SparseCSRFileBuilder {
     for (size_t i = 1; i < rptr.size(); i++) {
       nelem += rptr[i];
       rptr[i] = nelem;
-    }
-    SizeType begin_meta = sizeof(SizeType) + nelem * sizeof(IndexType);
-    fo->Seek(0);
+    }    
+    begin_data = static_cast<SizeType>(fo->Tell()) + sizeof(SizeType);
+    SizeType begin_meta = begin_data + nelem * sizeof(IndexType);
     fo->Write(&begin_meta, sizeof(begin_meta));
     fo->Seek(begin_meta);
     fo->Write(rptr);
@@ -184,7 +185,28 @@ struct SparseCSRFileBuilder {
       utils::Assert(saved_offset[i] == rptr[i+1], "some block not write out");
     }
   }
-  
+  /*! \brief content must be in wb+ */
+  template<typename Comparator>
+  inline void SortRows(Comparator comp, size_t step) {
+    for (size_t i = 0; i < rptr.size() - 1; i += step) {
+      bst_omp_uint begin = static_cast<bst_omp_uint>(i);
+      bst_omp_uint end = static_cast<bst_omp_uint>(std::min(rptr.size(), i + step));
+      if (rptr[end] != rptr[begin]) {
+        fo->Seek(begin_data + rptr[begin] * sizeof(IndexType));
+        buffer_data.resize(rptr[end] - rptr[begin]);
+        fo->Read(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType));
+        // do parallel sorting
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint j = begin; j < end; ++j){
+          std::sort(&buffer_data[0] + rptr[j] - rptr[begin],
+                    &buffer_data[0] + rptr[j+1] - rptr[begin],
+                    comp);
+        }
+        fo->Seek(begin_data + rptr[begin] * sizeof(IndexType));
+        fo->Write(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType));
+      }
+    }
+  }
  protected:
   inline void WriteBuffer(void) {
     SizeType start = 0;
@@ -202,7 +224,7 @@ struct SparseCSRFileBuilder {
       size_t nelem = buffer_rptr[i+1] - buffer_rptr[i];
       if (nelem != 0) {
         utils::Assert(saved_offset[i] < rptr[i+1], "data exceed bound");
-        fo->Seek((rptr[i] + saved_offset[i]) * sizeof(IndexType) + sizeof(SizeType));
+        fo->Seek((rptr[i] + saved_offset[i]) * sizeof(IndexType) + begin_data);
         fo->Write(&buffer_data[0] + buffer_rptr[i], nelem * sizeof(IndexType));
         saved_offset[i] += nelem;
       }
@@ -219,6 +241,8 @@ struct SparseCSRFileBuilder {
   std::vector<SizeType> rptr;
   /*! \brief saved top space of each item */
   std::vector<SizeType> saved_offset;
+  /*! \brief beginning position of data */
+  size_t begin_data;
   // ----- the following are buffer space
   /*! \brief maximum size of content buffer*/
   size_t buffer_size;

From a89e3063e6cc327f1552cdea154864fa510f8040 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 2 Sep 2014 15:34:11 -0700
Subject: [PATCH 014/166] untested version of cpage

---
 src/io/io.cpp               | 11 ++++++++++
 src/io/page_dmatrix-inl.hpp | 44 +++++++++++++++++++++++--------------
 src/io/page_fmatrix-inl.hpp | 32 ++++++++++++++++++++++-----
 3 files changed, 65 insertions(+), 22 deletions(-)

diff --git a/src/io/io.cpp b/src/io/io.cpp
index c2d9e26d3d05..faed31f13ee4 100644
--- a/src/io/io.cpp
+++ b/src/io/io.cpp
@@ -7,6 +7,7 @@ using namespace std;
 #include "../utils/utils.h"
 #include "simple_dmatrix-inl.hpp"
 #include "page_dmatrix-inl.hpp"
+#include "page_fmatrix-inl.hpp"
 
 // implements data loads using dmatrix simple for now
 
@@ -30,6 +31,12 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
     // the file pointer is hold in page matrix
     return dmat;
   }
+  if (magic == DMatrixColPage::kMagic) {
+    DMatrixColPage *dmat = new DMatrixColPage(fname);
+    dmat->Load(fs, silent, fname);
+    // the file pointer is hold in page matrix
+    return dmat;
+  }
   fs.Close();
  
   DMatrixSimple *dmat = new DMatrixSimple();
@@ -42,6 +49,10 @@ void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
     DMatrixPage::Save(fname, dmat, silent);
     return;
   }
+  if (!strcmp(fname + strlen(fname) - 6, ".cpage")) {
+    DMatrixColPage::Save(fname, dmat, silent);
+    return;
+  }
   if (dmat.magic == DMatrixSimple::kMagic) {
     const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat);
     p_dmat->SaveBinary(fname, silent);
diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index 83c7455995ff..63010d882dfa 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -9,7 +9,6 @@
 #include "../utils/iterator.h"
 #include "../utils/thread_buffer.h"
 #include "./simple_fmatrix-inl.hpp"
-#include "./page_fmatrix-inl.hpp"
 
 namespace xgboost {
 namespace io {
@@ -200,26 +199,24 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
 };
 
 /*! \brief data matrix using page */
-class DMatrixPage : public DataMatrix {
+template<int TKMagic>
+class DMatrixPageBase : public DataMatrix {
  public:
-  DMatrixPage(void) : DataMatrix(kMagic) {
+  DMatrixPageBase(void) : DataMatrix(kMagic) {
     iter_ = new ThreadRowPageIterator();
-    fmat_ = new FMatrixS(iter_);
   }
   // virtual destructor
-  virtual ~DMatrixPage(void) {
-    delete fmat_;
-  }
-  virtual IFMatrix *fmat(void) const {
-    return fmat_;
+  virtual ~DMatrixPageBase(void) {
+    // do not delete row iterator, since it is owned by fmat
+    // to be cleaned up in a more clear way
   }
   /*! \brief load and initialize the iterator with fi */
   inline void Load(utils::FileStream &fi,
                    bool silent = false,
                    const char *fname = NULL){
-    int magic;
-    utils::Check(fi.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
-    utils::Check(magic == kMagic, "invalid format,magic number mismatch");    
+    int tmagic;
+    utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
+    utils::Check(tmagic == magic, "invalid format,magic number mismatch");    
     this->info.LoadBinary(fi);
     iter_->Load(fi);
     if (!silent) {
@@ -250,12 +247,27 @@ class DMatrixPage : public DataMatrix {
                     static_cast<unsigned long>(mat.info.num_col()), fname);
     }
   }
-  /*! \brief the real fmatrix */
-  FMatrixS *fmat_;
+  /*! \brief magic number used to identify DMatrix */
+  static const int kMagic = TKMagic;
+ protected:
+
   /*! \brief row iterator */
   ThreadRowPageIterator *iter_;
-  /*! \brief magic number used to identify DMatrix */
-  static const int kMagic = 0xffffab02;
+};
+
+class DMatrixPage : public DMatrixPageBase<0xffffab02> {
+ public:
+  DMatrixPage(void) {
+    fmat_ = new FMatrixS(iter_);
+  }
+  virtual ~DMatrixPage(void) {
+    delete fmat_;
+  }
+  virtual IFMatrix *fmat(void) const {
+    return fmat_;
+  }
+  /*! \brief the real fmatrix */
+  IFMatrix *fmat_;
 };
 }  // namespace io
 }  // namespace xgboost
diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
index 7e9903be4aa7..4189c0c8501f 100644
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -199,7 +199,8 @@ class ThreadColPageIterator : public utils::IIterator<ColBatch> {
 class FMatrixPage : public IFMatrix {
  public:
   /*! \brief constructor */
-  FMatrixPage(utils::IIterator<RowBatch> *iter, std::string fname_buffer) {
+  FMatrixPage(utils::IIterator<RowBatch> *iter, std::string fname_buffer)
+      : fname_cbuffer_(fname_buffer) {
     this->row_iter_ = iter;
     this->col_iter_ = NULL;
     this->fi_ = NULL;
@@ -238,7 +239,8 @@ class FMatrixPage : public IFMatrix {
   }
   virtual void InitColAccess(float pkeep = 1.0f) {
     if (this->HaveColAccess()) return;
-    this->InitColData(pkeep);
+    this->InitColData(pkeep, fname_cbuffer_.c_str(),
+                      64 << 20, 5);
   }
   /*!
    * \brief get the row iterator associated with FMatrix
@@ -281,11 +283,12 @@ class FMatrixPage : public IFMatrix {
    * \brief intialize column data
    * \param pkeep probability to keep a row
    */
-  inline void InitColData(float pkeep) {
+  inline void InitColData(float pkeep, const char *fname, 
+                          size_t buffer_size, size_t col_step) {
     buffered_rowset_.clear();
-    utils::FileStream fo(utils::FopenCheck(fname_cbuffer_.c_str(), "wb+"));
+    utils::FileStream fo(utils::FopenCheck(fname, "wb+"));
     // use 64M buffer
-    utils::SparseCSRFileBuilder<ColBatch::Entry> builder(&fo, 64<<20);
+    utils::SparseCSRFileBuilder<ColBatch::Entry> builder(&fo, buffer_size);
     
     // start working
     row_iter_->BeforeFirst();
@@ -322,7 +325,7 @@ class FMatrixPage : public IFMatrix {
       }
     }
     builder.Finalize();
-    builder.SortRows(ColBatch::Entry::CmpValue, 5);
+    builder.SortRows(ColBatch::Entry::CmpValue, col_step);
     fo.Close();
   }
 
@@ -339,6 +342,23 @@ class FMatrixPage : public IFMatrix {
   std::vector<bst_uint> buffered_rowset_;
 };
 
+class DMatrixColPage : public DMatrixPageBase<0xffffab03> {
+ public:
+  DMatrixColPage(const char *fname) {
+    std::string fext = fname;
+    fext += ".col";
+    fmat_ = new FMatrixPage(iter_, fext.c_str());
+  }
+  virtual ~DMatrixColPage(void) {
+    delete fmat_;
+  }
+  virtual IFMatrix *fmat(void) const {
+    return fmat_;
+  }
+  /*! \brief the real fmatrix */
+  IFMatrix *fmat_;
+};
+
 }  // namespace io
 }  // namespace xgboost
 #endif  // XGBOOST_IO_PAGE_FMATRIX_INL_HPP_

From 226d26d40c7a7c44e607dab2a7ae476b3e15fd58 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 2 Sep 2014 17:18:17 -0700
Subject: [PATCH 015/166] still buggy

---
 src/io/page_fmatrix-inl.hpp |  9 +++++++--
 src/utils/matrix_csr.h      | 21 ++++++++++++---------
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
index 4189c0c8501f..9e586e1c4540 100644
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -132,6 +132,7 @@ class CSCMatrixManager {
                  "invalid column buffer format");
     p_page->col_data.push_back(ColBatch::Inst(p_data, len));
     p_page->col_index.push_back(cidx);
+    return true;
   }
   // the following are in memory auxiliary data structure
   /*! \brief top of reader position */
@@ -159,6 +160,7 @@ class ThreadColPageIterator : public utils::IIterator<ColBatch> {
                                  float page_ratio, bool silent) {
     itr_.SetParam("buffer_size", "2");
     itr_.get_factory().Setup(fi, page_ratio);
+    itr_.Init();
     if (!silent) {
       utils::Printf("ThreadColPageIterator: finish initialzing, %u columns\n",
                     static_cast<unsigned>(col_ptr().size() - 1));
@@ -239,8 +241,11 @@ class FMatrixPage : public IFMatrix {
   }
   virtual void InitColAccess(float pkeep = 1.0f) {
     if (this->HaveColAccess()) return;
-    this->InitColData(pkeep, fname_cbuffer_.c_str(),
-                      64 << 20, 5);
+    if (!this->LoadColData()) {
+      this->InitColData(pkeep, fname_cbuffer_.c_str(),
+                        64 << 20, 5);
+      utils::Check(this->LoadColData(), "fail to read in column data");
+    }
   }
   /*!
    * \brief get the row iterator associated with FMatrix
diff --git a/src/utils/matrix_csr.h b/src/utils/matrix_csr.h
index e4c410511e1e..ea5bc8b2dcb6 100644
--- a/src/utils/matrix_csr.h
+++ b/src/utils/matrix_csr.h
@@ -6,6 +6,7 @@
  * \author Tianqi Chen
  */
 #include <vector>
+#include <utility>
 #include <algorithm>
 #include "./io.h"
 #include "./utils.h"
@@ -156,7 +157,7 @@ struct SparseCSRFileBuilder {
     for (size_t i = 1; i < rptr.size(); i++) {
       nelem += rptr[i];
       rptr[i] = nelem;
-    }    
+    }
     begin_data = static_cast<SizeType>(fo->Tell()) + sizeof(SizeType);
     SizeType begin_meta = begin_data + nelem * sizeof(IndexType);
     fo->Write(&begin_meta, sizeof(begin_meta));
@@ -166,8 +167,8 @@ struct SparseCSRFileBuilder {
     buffer_rptr.resize(rptr.size());
     buffer_temp.reserve(buffer_size);
     buffer_data.resize(buffer_size);
-    saved_offset.clear();
-    saved_offset.resize(rptr.size() - 1, 0);
+    saved_offset = rptr;
+    saved_offset.resize(rptr.size() - 1);
     this->ClearBuffer();
   }
   /*! \brief step 4: push element into buffer */
@@ -176,7 +177,8 @@ struct SparseCSRFileBuilder {
       this->WriteBuffer();
       this->ClearBuffer();
     }
-    buffer_temp.push_back(std::make_pair(row_id, col_id));    
+    buffer_rptr[row_id + 1] += 1;
+    buffer_temp.push_back(std::make_pair(row_id, col_id));
   }
   /*! \brief finalize the construction */
   inline void Finalize(void) {
@@ -190,14 +192,14 @@ struct SparseCSRFileBuilder {
   inline void SortRows(Comparator comp, size_t step) {
     for (size_t i = 0; i < rptr.size() - 1; i += step) {
       bst_omp_uint begin = static_cast<bst_omp_uint>(i);
-      bst_omp_uint end = static_cast<bst_omp_uint>(std::min(rptr.size(), i + step));
+      bst_omp_uint end = static_cast<bst_omp_uint>(std::min(rptr.size() - 1, i + step));
       if (rptr[end] != rptr[begin]) {
         fo->Seek(begin_data + rptr[begin] * sizeof(IndexType));
         buffer_data.resize(rptr[end] - rptr[begin]);
         fo->Read(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType));
         // do parallel sorting
         #pragma omp parallel for schedule(static)
-        for (bst_omp_uint j = begin; j < end; ++j){
+        for (bst_omp_uint j = begin; j < end; ++j) {
           std::sort(&buffer_data[0] + rptr[j] - rptr[begin],
                     &buffer_data[0] + rptr[j+1] - rptr[begin],
                     comp);
@@ -206,6 +208,7 @@ struct SparseCSRFileBuilder {
         fo->Write(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType));
       }
     }
+    printf("CSV::begin_dat=%lu\n", begin_data);
   }
  protected:
   inline void WriteBuffer(void) {
@@ -220,11 +223,11 @@ struct SparseCSRFileBuilder {
       buffer_data[rp++] = buffer_temp[i].second;
     }
     // write out
-    for (size_t i = 0; i < buffer_rptr.size(); ++i) {
+    for (size_t i = 0; i < buffer_rptr.size() - 1; ++i) {
       size_t nelem = buffer_rptr[i+1] - buffer_rptr[i];
       if (nelem != 0) {
-        utils::Assert(saved_offset[i] < rptr[i+1], "data exceed bound");
-        fo->Seek((rptr[i] + saved_offset[i]) * sizeof(IndexType) + begin_data);
+        utils::Assert(saved_offset[i] + nelem <= rptr[i+1], "data exceed bound");
+        fo->Seek(saved_offset[i] * sizeof(IndexType) + begin_data);
         fo->Write(&buffer_data[0] + buffer_rptr[i], nelem * sizeof(IndexType));
         saved_offset[i] += nelem;
       }

From f3360d173b1eaf2a703c1cf5b7345513734f3fa7 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 2 Sep 2014 17:38:51 -0700
Subject: [PATCH 016/166] pass trivial test

---
 src/io/page_fmatrix-inl.hpp       | 21 +++++++++++----------
 src/tree/updater_colmaker-inl.hpp |  3 ++-
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
index 9e586e1c4540..22766ab658ca 100644
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -46,9 +46,10 @@ class CSCMatrixManager {
     }
     /*! \brief get underlying batch */
     inline ColBatch GetBatch(void) const {
-      ColBatch batch; 
-      batch.col_index = &col_index[0];
-      batch.col_data  = &col_data[0];
+      ColBatch batch;
+      batch.size = col_index.size();
+      batch.col_index = BeginPtr(col_index);
+      batch.col_data  = BeginPtr(col_data);
       return batch;
     }
    private:
@@ -79,11 +80,13 @@ class CSCMatrixManager {
     col_index_ = col_todo_;
     read_top_ = 0;
   }
-  inline bool LoadNext(PagePtr &val) {
+  inline bool LoadNext(PagePtr &val) {    
     val->Clear();
     if (read_top_ >= col_index_.size()) return false;
     while (read_top_ < col_index_.size()) {
-      if (!this->TryFill(col_index_[read_top_], val)) return true;
+      if (!this->TryFill(col_index_[read_top_], val)) {
+        return true;
+      }
       ++read_top_;
     }
     return true;
@@ -241,11 +244,9 @@ class FMatrixPage : public IFMatrix {
   }
   virtual void InitColAccess(float pkeep = 1.0f) {
     if (this->HaveColAccess()) return;
-    if (!this->LoadColData()) {
-      this->InitColData(pkeep, fname_cbuffer_.c_str(),
-                        64 << 20, 5);
-      utils::Check(this->LoadColData(), "fail to read in column data");
-    }
+    this->InitColData(pkeep, fname_cbuffer_.c_str(),
+                      64 << 20, 5);
+    utils::Check(this->LoadColData(), "fail to read in column data");
   }
   /*!
    * \brief get the row iterator associated with FMatrix
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp
index bf93cb7b5a60..12f808ce4768 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -421,7 +421,7 @@ class ColMaker: public IUpdater {
         for (bst_omp_uint i = 0; i < nsize; ++i) {
           const bst_uint fid = batch.col_index[i];
           const int tid = omp_get_thread_num();
-          const ColBatch::Inst c = batch[i];
+          const ColBatch::Inst c = batch[i];          
           if (param.need_forward_search(fmat.GetColDensity(fid))) {
             this->EnumerateSplit(c.data, c.data + c.length, +1, 
                                  fid, gpair, info, stemp[tid]);
@@ -452,6 +452,7 @@ class ColMaker: public IUpdater {
         utils::Check(n > 0, "colsample_bylevel is too small that no feature can be included");
         feat_set.resize(n);
       }
+      std::sort(feat_set.begin(), feat_set.end());
       utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
       while (iter->Next()) {
         this->UpdateSolution(iter->Value(), gpair, *p_fmat, info);

From e6e467ad6093533c6130746e1755346ef1b4fbb8 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 2 Sep 2014 17:40:30 -0700
Subject: [PATCH 017/166] more ignore

---
 .gitignore                        | 2 ++
 src/tree/updater_colmaker-inl.hpp | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 220fc602a785..1a2a4b48ee46 100644
--- a/.gitignore
+++ b/.gitignore
@@ -44,3 +44,5 @@ Debug
 *dump
 *save
 *csv
+*.cpage.col
+*.cpage
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp
index 12f808ce4768..566e5775288e 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -452,7 +452,6 @@ class ColMaker: public IUpdater {
         utils::Check(n > 0, "colsample_bylevel is too small that no feature can be included");
         feat_set.resize(n);
       }
-      std::sort(feat_set.begin(), feat_set.end());
       utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
       while (iter->Next()) {
         this->UpdateSolution(iter->Value(), gpair, *p_fmat, info);

From 401d648372bcae81346e4ca5226a640abf073e41 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 2 Sep 2014 17:49:39 -0700
Subject: [PATCH 018/166] some lint

---
 src/io/page_dmatrix-inl.hpp | 37 +++++++++++++++++++------------------
 src/io/page_fmatrix-inl.hpp | 36 ++++++++++++++++++++----------------
 2 files changed, 39 insertions(+), 34 deletions(-)

diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index 63010d882dfa..7a0781621e69 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -5,6 +5,7 @@
  * row iterator based on sparse page
  * \author Tianqi Chen
  */
+#include <vector>
 #include "../data.h"
 #include "../utils/iterator.h"
 #include "../utils/thread_buffer.h"
@@ -15,7 +16,7 @@ namespace io {
 /*! \brief page structure that can be used to store a rowbatch */
 struct RowBatchPage {
  public:
-  RowBatchPage(size_t page_size) : kPageSize(page_size) {
+  explicit RowBatchPage(size_t page_size) : kPageSize(page_size) {
     data_ = new int[kPageSize];
     utils::Assert(data_ != NULL, "fail to allocate row batch page");
     this->Clear();
@@ -31,10 +32,10 @@ struct RowBatchPage {
   inline bool PushRow(const RowBatch::Inst &row) {
     const size_t dsize = row.length * sizeof(RowBatch::Entry);
     if (FreeBytes() < dsize+ sizeof(int)) return false;
-    row_ptr(Size() + 1) = row_ptr(Size()) + row.length;    
+    row_ptr(Size() + 1) = row_ptr(Size()) + row.length;
     memcpy(data_ptr(row_ptr(Size())) , row.data, dsize);
-    ++ data_[0];
-    return true;    
+    ++data_[0];
+    return true;
   }
   /*!
    * \brief get a row batch representation from the page
@@ -43,7 +44,7 @@ struct RowBatchPage {
    * \return a new RowBatch object
    */
   inline RowBatch GetRowBatch(std::vector<size_t> *p_rptr, size_t base_rowid) {
-    RowBatch batch; 
+    RowBatch batch;
     batch.base_rowid = base_rowid;
     batch.data_ptr = this->data_ptr(0);
     batch.size = static_cast<size_t>(this->Size());
@@ -57,7 +58,7 @@ struct RowBatchPage {
   }
   /*! \brief get i-th row from the batch */
   inline RowBatch::Inst operator[](int i) {
-    return RowBatch::Inst(data_ptr(0) + row_ptr(i), 
+    return RowBatch::Inst(data_ptr(0) + row_ptr(i),
                           static_cast<bst_uint>(row_ptr(i+1) - row_ptr(i)));
   }
   /*!
@@ -85,8 +86,8 @@ struct RowBatchPage {
  private:
   /*! \return number of elements */
   inline size_t FreeBytes(void) {
-    return (kPageSize - (Size() + 2)) * sizeof(int) 
-        - row_ptr(Size()) * sizeof(RowBatch::Entry) ;
+    return (kPageSize - (Size() + 2)) * sizeof(int) -
+        row_ptr(Size()) * sizeof(RowBatch::Entry);
   }
   /*! \brief equivalent row pointer at i */
   inline int& row_ptr(int i) {
@@ -98,7 +99,7 @@ struct RowBatchPage {
   // page size
   const size_t kPageSize;
   // content of data
-  int *data_;  
+  int *data_;
 };
 /*! \brief thread buffer iterator */
 class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
@@ -108,8 +109,7 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
     page_ = NULL;
     base_rowid_ = 0;
   }
-  virtual ~ThreadRowPageIterator(void) {    
-  }
+  virtual ~ThreadRowPageIterator(void) {}
   virtual void Init(void) {
   }
   virtual void BeforeFirst(void) {
@@ -117,12 +117,12 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
     base_rowid_ = 0;
   }
   virtual bool Next(void) {
-    if(!itr.Next(page_)) return false;
+    if (!itr.Next(page_)) return false;
     out_ = page_->GetRowBatch(&tmp_ptr_, base_rowid_);
     base_rowid_ += out_.size;
     return true;
   }
-  virtual const RowBatch &Value(void) const{
+  virtual const RowBatch &Value(void) const {
     return out_;
   }
   /*! \brief load and initialize the iterator with fi */
@@ -152,6 +152,7 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
   }
   /*! \brief page size 64 MB */
   static const size_t kPageSize = 64 << 18;
+
  private:
   // base row id
   size_t base_rowid_;
@@ -195,7 +196,7 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
 
  protected:
   PagePtr page_;
-  utils::ThreadBuffer<PagePtr,Factory> itr;
+  utils::ThreadBuffer<PagePtr, Factory> itr;
 };
 
 /*! \brief data matrix using page */
@@ -213,10 +214,10 @@ class DMatrixPageBase : public DataMatrix {
   /*! \brief load and initialize the iterator with fi */
   inline void Load(utils::FileStream &fi,
                    bool silent = false,
-                   const char *fname = NULL){
+                   const char *fname = NULL) {
     int tmagic;
     utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
-    utils::Check(tmagic == magic, "invalid format,magic number mismatch");    
+    utils::Check(tmagic == magic, "invalid format,magic number mismatch");
     this->info.LoadBinary(fi);
     iter_->Load(fi);
     if (!silent) {
@@ -229,7 +230,7 @@ class DMatrixPageBase : public DataMatrix {
         utils::Printf("\n");
       }
       if (info.group_ptr.size() != 0) {
-        utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
+        utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size() - 1);
       }
     }
   }
@@ -249,8 +250,8 @@ class DMatrixPageBase : public DataMatrix {
   }
   /*! \brief magic number used to identify DMatrix */
   static const int kMagic = TKMagic;
- protected:
 
+ protected:
   /*! \brief row iterator */
   ThreadRowPageIterator *iter_;
 };
diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
index 22766ab658ca..327e5c144b2c 100644
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -5,6 +5,9 @@
  * sparse page manager for fmatrix
  * \author Tianqi Chen
  */
+#include <vector>
+#include <string>
+#include <algorithm>
 #include "../data.h"
 #include "../utils/iterator.h"
 #include "../utils/io.h"
@@ -34,7 +37,7 @@ class CSCMatrixManager {
     /*! \brief column index */
     std::vector<bst_uint> col_index;
     /*! \brief column data */
-    std::vector<ColBatch::Inst> col_data;            
+    std::vector<ColBatch::Inst> col_data;
     /*! \brief number of free entries */
     inline size_t NumFreeEntry(void) const {
       return buffer.size() - num_entry;
@@ -52,6 +55,7 @@ class CSCMatrixManager {
       batch.col_data  = BeginPtr(col_data);
       return batch;
     }
+
    private:
     /*! \brief buffer space, not to be changed since ready */
     std::vector<ColBatch::Entry> buffer;
@@ -80,7 +84,7 @@ class CSCMatrixManager {
     col_index_ = col_todo_;
     read_top_ = 0;
   }
-  inline bool LoadNext(PagePtr &val) {    
+  inline bool LoadNext(PagePtr &val) {
     val->Clear();
     if (read_top_ >= col_index_.size()) return false;
     while (read_top_ < col_index_.size()) {
@@ -106,7 +110,7 @@ class CSCMatrixManager {
       psmax = std::max(psmax, col_ptr_[i+1] - col_ptr_[i]);
     }
     utils::Check(page_ratio >= 1.0f, "col_page_ratio must be at least 1");
-    page_size_ = std::max(static_cast<size_t>(psmax * page_ratio), psmax);    
+    page_size_ = std::max(static_cast<size_t>(psmax * page_ratio), psmax);
   }
   inline void SetColSet(const std::vector<bst_uint> &cset, bool setall) {
     if (!setall) {
@@ -124,6 +128,7 @@ class CSCMatrixManager {
       }
     }
   }
+
  private:
   /*! \brief fill a page with */
   inline bool TryFill(size_t cidx, Page *p_page) {
@@ -173,21 +178,22 @@ class ThreadColPageIterator : public utils::IIterator<ColBatch> {
   }
   virtual void BeforeFirst(void) {
     itr_.BeforeFirst();
-  } 
+  }
   virtual bool Next(void) {
     // page to be loaded
     CSCMatrixManager::PagePtr page;
-    if(!itr_.Next(page)) return false;
+    if (!itr_.Next(page)) return false;
     out_ = page->GetBatch();
     return true;
   }
-  virtual const ColBatch &Value(void) const{
+  virtual const ColBatch &Value(void) const {
     return out_;
   }
   inline const std::vector<size_t> &col_ptr(void) const {
     return itr_.get_factory().col_ptr();
   }
-  inline void SetColSet(const std::vector<bst_uint> &cset, bool setall = false) {
+  inline void SetColSet(const std::vector<bst_uint> &cset,
+                        bool setall = false) {
     itr_.get_factory().SetColSet(cset, setall);
   }
 
@@ -195,9 +201,8 @@ class ThreadColPageIterator : public utils::IIterator<ColBatch> {
   // output data
   ColBatch out_;
   // internal iterator
-  utils::ThreadBuffer<CSCMatrixManager::PagePtr,CSCMatrixManager> itr_;
+  utils::ThreadBuffer<CSCMatrixManager::PagePtr, CSCMatrixManager> itr_;
 };
-
 /*!
  * \brief sparse matrix that support column access
  */
@@ -216,7 +221,7 @@ class FMatrixPage : public IFMatrix {
     if (col_iter_ != NULL) delete col_iter_;
     if (fi_ != NULL) {
       fi_->Close(); delete fi_;
-    } 
+    }
   }
   /*! \return whether column access is enabled */
   virtual bool HaveColAccess(void) const {
@@ -272,7 +277,7 @@ class FMatrixPage : public IFMatrix {
     col_iter_->BeforeFirst();
     return col_iter_;
   }
-  
+
  protected:
   /*!
    * \brief try load column data from file
@@ -282,25 +287,24 @@ class FMatrixPage : public IFMatrix {
     if (fp == NULL) return false;
     fi_ = new utils::FileStream(fp);
     static_cast<utils::IStream*>(fi_)->Read(&buffered_rowset_);
-    col_iter_ = new ThreadColPageIterator(fi_, 2.0f, false);    
+    col_iter_ = new ThreadColPageIterator(fi_, 2.0f, false);
     return true;
   }
   /*!
    * \brief intialize column data
    * \param pkeep probability to keep a row
    */
-  inline void InitColData(float pkeep, const char *fname, 
+  inline void InitColData(float pkeep, const char *fname,
                           size_t buffer_size, size_t col_step) {
     buffered_rowset_.clear();
     utils::FileStream fo(utils::FopenCheck(fname, "wb+"));
     // use 64M buffer
     utils::SparseCSRFileBuilder<ColBatch::Entry> builder(&fo, buffer_size);
-    
     // start working
     row_iter_->BeforeFirst();
     while (row_iter_->Next()) {
       const RowBatch &batch = row_iter_->Value();
-      for (size_t i = 0; i < batch.size; ++i) { 
+      for (size_t i = 0; i < batch.size; ++i) {
         if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
           buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
           RowBatch::Inst inst = batch[i];
@@ -350,7 +354,7 @@ class FMatrixPage : public IFMatrix {
 
 class DMatrixColPage : public DMatrixPageBase<0xffffab03> {
  public:
-  DMatrixColPage(const char *fname) {
+  explicit DMatrixColPage(const char *fname) {
     std::string fext = fname;
     fext += ".col";
     fmat_ = new FMatrixPage(iter_, fext.c_str());

From 244a589e5de7f1df7e4418a5fec28a3604217353 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 3 Sep 2014 11:31:05 -0700
Subject: [PATCH 019/166] change include order, so that Rinternal does not
 disturb us

---
 R-package/src/xgboost_R.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp
index a7753dfa5ad6..9171948eb961 100644
--- a/R-package/src/xgboost_R.cpp
+++ b/R-package/src/xgboost_R.cpp
@@ -3,12 +3,13 @@
 #include <utility>
 #include <cstring>
 #include <cstdio>
-#include "xgboost_R.h"
 #include "wrapper/xgboost_wrapper.h"
 #include "src/utils/utils.h"
 #include "src/utils/omp.h"
 #include "src/utils/matrix_csr.h"
-using namespace std;
+
+#include "xgboost_R.h"
+
 using namespace xgboost;
 
 extern "C" {

From df3eafc5ba877e0e1490f93b9e2d3f45d65d4fe3 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 4 Sep 2014 14:20:52 -0700
Subject: [PATCH 020/166] chg mldata to page

---
 src/io/page_dmatrix-inl.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index 7a0781621e69..d52700e8777f 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -83,7 +83,7 @@ struct RowBatchPage {
     return data_[0];
   }
 
- private:
+ protected:
   /*! \return number of elements */
   inline size_t FreeBytes(void) {
     return (kPageSize - (Size() + 2)) * sizeof(int) -
@@ -96,10 +96,10 @@ struct RowBatchPage {
   inline RowBatch::Entry* data_ptr(int i) {
     return (RowBatch::Entry*)(&data_[1]) + i;
   }
-  // page size
-  const size_t kPageSize;
   // content of data
   int *data_;
+  // page size
+  const size_t kPageSize;
 };
 /*! \brief thread buffer iterator */
 class ThreadRowPageIterator: public utils::IIterator<RowBatch> {

From 19a1ee24a525a65f98505c47d29cf679826d3512 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 7 Sep 2014 18:40:15 -0700
Subject: [PATCH 021/166] try predpath

---
 src/gbm/gbtree-inl.hpp | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
index ed52afa7df90..6688e3829273 100644
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -107,7 +107,7 @@ class GBTree : public IGradBooster {
                        int64_t buffer_offset,
                        const BoosterInfo &info,
                        std::vector<float> *out_preds,
-                       unsigned ntree_limit = 0) {
+                       unsigned ntree_limit = 0) {    
     int nthread;
     #pragma omp parallel
     {
@@ -117,6 +117,10 @@ class GBTree : public IGradBooster {
     for (int i = 0; i < nthread; ++i) {
       thread_temp[i].Init(mparam.num_feature);
     }
+    if (tparam.pred_path != 0) {
+      this->PredPath(p_fmat, info, out_preds);
+      return;
+    }
 
     std::vector<float> &preds = *out_preds;
     const size_t stride = info.num_row * mparam.num_output_group;
@@ -144,7 +148,7 @@ class GBTree : public IGradBooster {
         }
       }
     }
-  }
+  }  
   virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
     std::vector<std::string> dump;
     for (size_t i = 0; i < trees.size(); i++) {
@@ -258,6 +262,34 @@ class GBTree : public IGradBooster {
       out_pred[stride * (i + 1)] = vec_psum[i];
     }
   }
+  // predict leaf indices: writes one float per (row, j) slot into out_preds
+  inline void PredPath(IFMatrix *p_fmat,
+                       const BoosterInfo &info,
+                       std::vector<float> *out_preds) {
+    std::vector<float> &preds = *out_preds;
+    preds.resize(info.num_row * mparam.num_trees);  // num_trees slots per row
+    // iterate over all row batches of p_fmat
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const RowBatch &batch = iter->Value();
+      // parallel over the rows of the local batch
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
+        const int tid = omp_get_thread_num();  // picks this thread's scratch FVec
+        int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);  // row offset by base_rowid
+        tree::RegTree::FVec &feats = thread_temp[tid];
+        feats.Fill(batch[i]);
+        for (size_t j = 0; j < trees.size(); ++j) {
+          int tid = trees[i]->GetLeafIndex(feats, info.GetRoot(ridx));
+          preds[ridx * mparam.num_trees + j] = static_cast<float>(tid);
+        }
+        feats.Drop(batch[i]);
+      }
+    }
+  }
+                       
   // --- data structure ---
   /*! \brief training parameters */
   struct TrainParam {
@@ -268,6 +300,8 @@ class GBTree : public IGradBooster {
      *  use this option to support boosted random forest
      */
     int num_parallel_tree;
+    /*! \brief predict path in prediction */
+    int pred_path;
     /*! \brief whether updater is already initialized */
     int updater_initialized;
     /*! \brief tree updater sequence */
@@ -278,6 +312,7 @@ class GBTree : public IGradBooster {
       updater_seq = "grow_colmaker,prune";
       num_parallel_tree = 1;
       updater_initialized = 0;
+      pred_path = 0;
     }
     inline void SetParam(const char *name, const char *val){
       using namespace std;
@@ -292,6 +327,7 @@ class GBTree : public IGradBooster {
       if (!strcmp(name, "num_parallel_tree")) {
         num_parallel_tree = atoi(val);
       }
+      if (!strcmp(name, "pred_path")) pred_path = atoi(val);
     }
   };
   /*! \brief model parameters */

From d4ab359be16bdcbd82213f5c14bf6d3eb791f3d8 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 7 Sep 2014 20:01:03 -0700
Subject: [PATCH 022/166] fix

---
 src/gbm/gbtree-inl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
index 6688e3829273..08d2164bc3fc 100644
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -282,7 +282,7 @@ class GBTree : public IGradBooster {
         tree::RegTree::FVec &feats = thread_temp[tid];
         feats.Fill(batch[i]);
         for (size_t j = 0; j < trees.size(); ++j) {
-          int tid = trees[i]->GetLeafIndex(feats, info.GetRoot(ridx));
+          int tid = trees[j]->GetLeafIndex(feats, info.GetRoot(ridx));
           preds[ridx * mparam.num_trees + j] = static_cast<float>(tid);
         }
         feats.Drop(batch[i]);

From e90b25a38102cd36e30e0f02d2ff0a1ed26422e4 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 8 Sep 2014 16:20:41 -0700
Subject: [PATCH 023/166] add object bound checking

---
 src/learner/objective-inl.hpp | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/src/learner/objective-inl.hpp b/src/learner/objective-inl.hpp
index 96aacf12d05d..69b7ae4fd446 100644
--- a/src/learner/objective-inl.hpp
+++ b/src/learner/objective-inl.hpp
@@ -41,6 +41,25 @@ struct LossType {
       default: utils::Error("unknown loss_type"); return 0.0f;
     }
   }
+  /*!
+   * \brief validate a label: any value for kLinearSquare, otherwise it must lie in [0,1]
+   */
+  inline bool CheckLabel(float x) const {
+    // only the non-kLinearSquare losses restrict the label range
+    const bool any_label_ok = (loss_type == kLinearSquare);
+    const bool in_unit_range = (x >= 0.0f && x <= 1.0f);
+    return any_label_ok || in_unit_range;
+  }
+  /*!
+   * \brief message to report when CheckLabel rejects a label; empty for kLinearSquare
+   */
+  inline const char * CheckLabelErrorMsg(void) const {
+    // kLinearSquare accepts every label, so it has nothing to report
+    const bool any_label_ok = (loss_type == kLinearSquare);
+    return any_label_ok
+        ? ""
+        : "label must be in [0,1] for logistic regression";
+  }
   /*!
    * \brief calculate first order gradient of loss, given transformed prediction
    * \param predt transformed prediction
@@ -115,6 +134,8 @@ class RegLossObj : public IObjFunction{
                  "labels are not correctly provided");
     std::vector<bst_gpair> &gpair = *out_gpair;
     gpair.resize(preds.size());
+    // check if label in range
+    bool label_correct = true;
     // start calculating gradient
     const unsigned nstep = static_cast<unsigned>(info.labels.size());
     const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
@@ -124,9 +145,11 @@ class RegLossObj : public IObjFunction{
       float p = loss.PredTransform(preds[i]);
       float w = info.GetWeight(j);
       if (info.labels[j] == 1.0f) w *= scale_pos_weight;
+      if (!loss.CheckLabel(info.labels[j])) label_correct = false;
       gpair[i] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
                            loss.SecondOrderGradient(p, info.labels[j]) * w);
     }
+    utils::Check(label_correct, loss.CheckLabelErrorMsg());
   }
   virtual const char* DefaultEvalMetric(void) const {
     return loss.DefaultEvalMetric();

From a3806398b9471384e89f9c7b58081aa458ef3e68 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 8 Sep 2014 21:34:42 -0700
Subject: [PATCH 024/166] delete old cvpack

---
 wrapper/xgboost.py | 65 ----------------------------------------------
 1 file changed, 65 deletions(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index 04dcfa781c8d..134926c8ab6f 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -227,71 +227,6 @@ def slice(self, rindex):
             self.handle, (ctypes.c_int*len(rindex))(*rindex), len(rindex)))
         return res
 
-class CVPack:
-    def __init__(self, dtrain, dtest, param):
-        self.dtrain = dtrain
-        self.dtest = dtest
-        self.watchlist = watchlist = [ (dtrain,'train'), (dtest, 'test') ]
-        self.bst = Booster(param, [dtrain,dtest])
-    def update(self,r):
-        self.bst.update(self.dtrain, r)
-    def eval(self,r):
-        return self.bst.eval_set(self.watchlist, r)
-
-def mknfold(dall, nfold, param, seed, weightscale=None):
-    """
-    mk nfold list of cvpack from randidx
-    """
-    randidx = range(dall.num_row())
-    random.seed(seed)
-    random.shuffle(randidx)
-
-    idxset = []
-    kstep = len(randidx) / nfold
-    for i in range(nfold):
-        idxset.append(randidx[ (i*kstep) : min(len(randidx),(i+1)*kstep) ])
-
-    ret = []
-    for k in range(nfold):
-        trainlst = []
-        for j in range(nfold):
-            if j == k:
-                testlst = idxset[j]
-            else:
-                trainlst += idxset[j]
-        dtrain = dall.slice(trainlst)
-        dtest = dall.slice(testlst)
-        # rescale weight of dtrain and dtest
-        if weightscale != None:
-            dtrain.set_weight( dtrain.get_weight() * weightscale * dall.num_row() / dtrain.num_row() )
-            dtest.set_weight( dtest.get_weight() * weightscale * dall.num_row() / dtest.num_row() )
-
-        ret.append(CVPack(dtrain, dtest, param))
-    return ret
-
-def aggcv(rlist):
-    """
-    aggregate cross validation results
-    """
-    cvmap = {}
-    arr = rlist[0].split()
-    ret = arr[0]
-    for it in arr[1:]:
-        k, v  = it.split(':')
-        cvmap[k] = [float(v)]
-    for line in rlist[1:]:
-        arr = line.split()
-        assert ret == arr[0]
-        for it in arr[1:]:
-            k, v  = it.split(':')
-            cvmap[k].append(float(v))
-
-    for k, v in sorted(cvmap.items(), key = lambda x:x[0]):
-        v = np.array(v)
-        ret += '\t%s:%f+%f' % (k, np.mean(v), np.std(v))
-    return ret
-
-
 class Booster:
     """learner class """
     def __init__(self, params={}, cache=[], model_file = None):

From 0e8846a42f3602455d3142566d39ed720f9e16df Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 10 Sep 2014 13:51:34 -0700
Subject: [PATCH 025/166] ok

---
 src/learner/learner-inl.hpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 88026975db3b..6257658c1f05 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -86,7 +86,10 @@ class BoostLearner {
       this->SetParam(n.c_str(), val);
     }
     if (!strcmp(name, "silent")) silent = atoi(val);
-    if (!strcmp(name, "prob_buffer_row")) prob_buffer_row = static_cast<float>(atof(val));
+    if (!strcmp(name, "prob_buffer_row")) {
+      prob_buffer_row = static_cast<float>(atof(val));
+      this->SetParam("updater", "grow_colmaker,refresh,prune");
+    }
     if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
     if (!strcmp("seed", name)) random::Seed(atoi(val));
     if (!strcmp(name, "num_class")) this->SetParam("num_output_group", val);
@@ -240,6 +243,7 @@ class BoostLearner {
     utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
     obj_ = CreateObjFunction(name_obj_.c_str());
     gbm_ = gbm::CreateGradBooster(name_gbm_.c_str());
+    
     for (size_t i = 0; i < cfg_.size(); ++i) {
       obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
       gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());

From 87cc53f0cd42c3451b28b78d499a459387e934bb Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 10 Sep 2014 21:38:50 -0700
Subject: [PATCH 026/166] make basic combine buf

---
 tools/xgcombine_buffer.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/tools/xgcombine_buffer.cpp b/tools/xgcombine_buffer.cpp
index 84bc996a4b7a..872586536a26 100644
--- a/tools/xgcombine_buffer.cpp
+++ b/tools/xgcombine_buffer.cpp
@@ -11,6 +11,7 @@
 #include <ctime>
 #include <cmath>
 #include "../src/io/simple_dmatrix-inl.hpp"
+#include "../src/io/io.cpp"
 #include "../src/utils/utils.h"
 
 using namespace xgboost;
@@ -94,14 +95,14 @@ class DataLoader: public DMatrixSimple {
     for( size_t i = 0; i < vec.size(); i ++ ){
       if( !vec[i].is_dense ) { 
         for( int j = 0; j < vec[i].tmp_num; j ++ ){
-          utils::Assert( fscanf ( vec[i].fi, "%u:%f", &e.findex, &e.fvalue ) == 2, "Error when load feat" );  
-          vec[i].CheckBase( e.findex );
-          e.findex += vec[i].base;
+          utils::Assert( fscanf ( vec[i].fi, "%u:%f", &e.index, &e.fvalue ) == 2, "Error when load feat" );  
+          vec[i].CheckBase( e.index );
+          e.index += vec[i].base;
           feats.push_back(e);
         }
       }else{
         utils::Assert( fscanf ( vec[i].fi, "%f", &e.fvalue ) == 1, "load feat" );  
-        e.findex = vec[i].base;
+        e.index = vec[i].base;
         feats.push_back(e);
       }
     }
@@ -163,7 +164,7 @@ class DataLoader: public DMatrixSimple {
       }             
       if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){
         info.group_ptr.push_back( info.group_ptr.back() + ngacc );
-        utils::Assert( info.group_ptr.back() == info.num_row, "group size must match num rows" );
+        utils::Assert( info.group_ptr.back() == info.num_row(), "group size must match num rows" );
         ngacc = 0;
       }
       // linelimit
@@ -173,7 +174,7 @@ class DataLoader: public DMatrixSimple {
     }
     if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){
       info.group_ptr.push_back( info.group_ptr.back() + ngacc );
-      utils::Assert( info.group_ptr.back() == info.num_row, "group size must match num rows" );
+      utils::Assert( info.group_ptr.back() == info.num_row(), "group size must match num rows" );
     }
   }
   
@@ -236,7 +237,7 @@ int main( int argc, char *argv[] ){
   printf("num_features=%d\n", norm( loader.fheader ) ); 
   printf("start creating buffer...\n");
   loader.Load();
-  loader.SaveBinary( argv[2] );
+  io::SaveDataMatrix(loader, argv[2]);
   // close files
   fclose( loader.fp );
   if( loader.fwlist != NULL ) fclose( loader.fwlist );    

From 3a0be47b1c3008174e43cac8d2a04c9bd9f81c1f Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 12 Sep 2014 15:52:39 -0700
Subject: [PATCH 027/166] add tmp file

---
 src/io/io.cpp               | 28 ++++++++++++++++++++++------
 src/io/page_dmatrix-inl.hpp |  7 +++++--
 src/io/page_fmatrix-inl.hpp |  4 +---
 src/tree/updater.cpp        |  2 ++
 4 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/src/io/io.cpp b/src/io/io.cpp
index a8aed0d43f88..8a4579ab81f3 100644
--- a/src/io/io.cpp
+++ b/src/io/io.cpp
@@ -13,6 +13,14 @@
 namespace xgboost {
 namespace io {
 DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
+  std::string tmp_fname;
+  const char *fname_ext = NULL;
+  if (strchr(fname, ';') != NULL) {
+    tmp_fname = fname;
+    char *ptr = strchr(&tmp_fname[0], ';');
+    ptr[0] = '\0'; fname = &tmp_fname[0];
+    fname_ext = ptr + 1;
+  }
   int magic;
   utils::FileStream fs(utils::FopenCheck(fname, "rb"));
   utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
@@ -25,15 +33,23 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
     return dmat;
   }
   if (magic == DMatrixPage::kMagic) {
-    DMatrixPage *dmat = new DMatrixPage();
-    dmat->Load(fs, silent, fname);
-    // the file pointer is hold in page matrix
-    return dmat;
+    if (fname_ext == NULL) {
+      DMatrixPage *dmat = new DMatrixPage();
+      dmat->Load(fs, silent, fname);
+      return dmat;
+    } else {
+      DMatrixColPage *dmat = new DMatrixColPage(fname_ext);
+      dmat->Load(fs, silent, fname, true);
+      return dmat;
+    }
   }
   if (magic == DMatrixColPage::kMagic) {
-    DMatrixColPage *dmat = new DMatrixColPage(fname);
+    std::string sfname = fname;
+    if (fname_ext == NULL) {
+      sfname += ".col"; fname_ext = sfname.c_str();
+    }
+    DMatrixColPage *dmat = new DMatrixColPage(fname_ext);
     dmat->Load(fs, silent, fname);
-    // the file pointer is hold in page matrix
     return dmat;
   }
   fs.Close();
diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index d52700e8777f..41ad19be5cc9 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -214,10 +214,13 @@ class DMatrixPageBase : public DataMatrix {
   /*! \brief load and initialize the iterator with fi */
   inline void Load(utils::FileStream &fi,
                    bool silent = false,
-                   const char *fname = NULL) {
+                   const char *fname = NULL,
+                   bool skip_magic_check = false) {
     int tmagic;
     utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
-    utils::Check(tmagic == magic, "invalid format,magic number mismatch");
+    if (!skip_magic_check) {
+      utils::Check(tmagic == magic, "invalid format,magic number mismatch");
+    }
     this->info.LoadBinary(fi);
     iter_->Load(fi);
     if (!silent) {
diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
index 327e5c144b2c..3b53c24842ed 100644
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -355,9 +355,7 @@ class FMatrixPage : public IFMatrix {
 class DMatrixColPage : public DMatrixPageBase<0xffffab03> {
  public:
   explicit DMatrixColPage(const char *fname) {
-    std::string fext = fname;
-    fext += ".col";
-    fmat_ = new FMatrixPage(iter_, fext.c_str());
+    fmat_ = new FMatrixPage(iter_, fname);
   }
   virtual ~DMatrixColPage(void) {
     delete fmat_;
diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index 2cb6552fe8ed..5879b2bbd343 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -13,6 +13,8 @@ IUpdater* CreateUpdater(const char *name) {
   if (!strcmp(name, "prune")) return new TreePruner();
   if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
   if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
+  if (!strcmp(name, "grow_colmaker5")) return new ColMaker< CVGradStats<5> >();
+  if (!strcmp(name, "grow_colmaker3")) return new ColMaker< CVGradStats<3> >();
   utils::Error("unknown updater:%s", name);
   return NULL;
 }

From bf2426f3cdd8ec13f88345f2cd010285db89d3a7 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 12 Sep 2014 17:31:06 -0700
Subject: [PATCH 028/166] some changes

---
 src/io/page_fmatrix-inl.hpp | 12 +++++++++++-
 src/learner/learner-inl.hpp |  2 +-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
index 3b53c24842ed..af8be333f053 100644
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -249,9 +249,15 @@ class FMatrixPage : public IFMatrix {
   }
   virtual void InitColAccess(float pkeep = 1.0f) {
     if (this->HaveColAccess()) return;
+    utils::Printf("start to initialize page col access\n");
+    if (this->LoadColData()) {
+      utils::Printf("loading previously saved col data\n");
+      return;
+    }
     this->InitColData(pkeep, fname_cbuffer_.c_str(),
-                      64 << 20, 5);
+                      1 << 30, 5);
     utils::Check(this->LoadColData(), "fail to read in column data");
+    utils::Printf("finish initialize page col access\n");
   }
   /*!
    * \brief get the row iterator associated with FMatrix
@@ -331,6 +337,10 @@ class FMatrixPage : public IFMatrix {
                              ColBatch::Entry((bst_uint)(batch.base_rowid+i),
                                              inst[j].fvalue));
           }
+          if (ktop % 100000 == 0) {
+            utils::Printf("\r                         \r");
+            utils::Printf("InitCol: %lu rows ", static_cast<unsigned long>(ktop));            
+          }
         }
       }
     }
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 6257658c1f05..e132349896c4 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -162,7 +162,7 @@ class BoostLearner {
    *  if not intialize it
    * \param p_train pointer to the matrix used by training
    */
-  inline void CheckInit(DMatrix *p_train) {
+  inline void CheckInit(DMatrix *p_train) {    
     p_train->fmat()->InitColAccess(prob_buffer_row);
   }
   /*!

From 91e34c6fb4f88ac425fb36242cfbb6a6a5fad5e3 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 12 Sep 2014 21:26:38 -0700
Subject: [PATCH 029/166] ok

---
 src/tree/param.h                  | 2 +-
 src/tree/updater_colmaker-inl.hpp | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/tree/param.h b/src/tree/param.h
index aab11ad1e18c..4a7c790a6aeb 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -57,7 +57,7 @@ struct TrainParam{
     opt_dense_col = 1.0f;
     nthread = 0;
     size_leaf_vector = 0;
-    parallel_option = 0;
+    parallel_option = 2;
   }
   /*! 
    * \brief set parameters from outside 
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp
index 1ccf32a411fd..9c2740264534 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -418,7 +418,11 @@ class ColMaker: public IUpdater {
       #if defined(_OPENMP)                                                                
       const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
       #endif
-      if (param.parallel_option == 0) {
+      int poption = param.parallel_option;
+      if (poption == 2) {
+        poption = nsize * 2 < nthread ? 1 : 0;
+      }
+      if (poption == 0) {
         #pragma omp parallel for schedule(dynamic, batch_size)
         for (bst_omp_uint i = 0; i < nsize; ++i) {
           const bst_uint fid = batch.col_index[i];

From d0daecb4d36a2b29e6c9fca101f0ed72cbf7f3d9 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 15 Oct 2014 14:30:06 -0700
Subject: [PATCH 030/166] add bitmap .

---
 src/sync/sync.h                  |  20 +++++
 src/tree/updater_distcol-inl.hpp | 123 +++++++++++++++++++++++++++++++
 src/utils/bitmap.h               |  45 +++++++++++
 3 files changed, 188 insertions(+)
 create mode 100644 src/sync/sync.h
 create mode 100644 src/tree/updater_distcol-inl.hpp
 create mode 100644 src/utils/bitmap.h

diff --git a/src/sync/sync.h b/src/sync/sync.h
new file mode 100644
index 000000000000..1d9be719c47f
--- /dev/null
+++ b/src/sync/sync.h
@@ -0,0 +1,20 @@
+#ifndef XGBOOST_SYNC_SYNC_H_
+#define XGBOOST_SYNC_SYNC_H_
+/*!
+ * \file sync.h
+ * \brief interface to do synchronization
+ * \author Tianqi Chen
+ */
+namespace xgboost {
+namespace sync {
+/*!
+ * \brief synchronization context interface of xgboost,
+ *        will be provided as a singleton
+ */
+class IContext {
+  // placeholder: no members are declared yet
+};
+
+}  // namespace sync
+}  // namespace xgboost
+#endif
diff --git a/src/tree/updater_distcol-inl.hpp b/src/tree/updater_distcol-inl.hpp
new file mode 100644
index 000000000000..f5d37c1fc59c
--- /dev/null
+++ b/src/tree/updater_distcol-inl.hpp
@@ -0,0 +1,123 @@
+#ifndef XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
+#define XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
+/*!
+ * \file updater_distcol-inl.hpp
+ * \brief beta distributed version that takes a sub-column
+ *        and constructs a tree
+ * \author Tianqi Chen
+ */
+#include "../utils/bitmap.h"
+#include "./updater_colmaker-inl.hpp"
+
+namespace xgboost {
+namespace tree {
+/*! \brief column-wise tree maker derived from ColMaker; the cross-worker sync calls are still TODO stubs */
+template<typename TStats>
+class DistColMaker : public ColMaker<TStats> {
+ public:
+  DistColMaker(void) : builder(param) {}
+  virtual ~DistColMaker(void) {}
+  // forward a training parameter to the local TrainParam
+  virtual void SetParam(const char *name, const char *val) {
+    param.SetParam(name, val);
+  }
+  // update a single tree; it is an error to pass any other number of trees
+  virtual void Update(const std::vector<bst_gpair> &gpair, IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) {
+    TStats::CheckInfo(info);
+    utils::Check(trees.size() == 1, "DistColMaker: only support one tree at a time");
+    builder.Update(gpair, p_fmat, info, trees[0]);
+  }
+ private:
+  struct Builder : public ColMaker<TStats>::Builder {
+   public:
+    explicit Builder(const TrainParam &param)
+        : ColMaker<TStats>::Builder(param) {}
+   protected:
+    virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
+                                       IFMatrix *p_fmat, const RegTree &tree) {
+      // collect the split features of the nodes in qexpand that are not leaves
+      std::vector<unsigned> fsplits;
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const int nid = qexpand[i];
+        if (!tree[nid].is_leaf()) {
+          fsplits.push_back(tree[nid].split_index());
+        }
+      }
+      // sort + deduplicate, then drop feature ids >= p_fmat->NumCol()
+      std::sort(fsplits.begin(), fsplits.end());
+      fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
+      while (fsplits.size() != 0 && fsplits.back() >= p_fmat->NumCol()) {
+        fsplits.pop_back();
+      }
+      // one bit per row, set when the row must leave its default child
+      bitmap.Resize(this->position.size());
+      bitmap.Clear();
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        for (size_t i = 0; i < batch.size; ++i) {
+          ColBatch::Inst col = batch[i];
+          const bst_uint fid = batch.col_index[i];
+          const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
+          // serial on purpose: BitMap::SetTrue is a non-atomic |= on a word shared by 32 rows
+          for (bst_omp_uint j = 0; j < ndata; ++j) {
+            const bst_uint ridx = col[j].index;
+            const float fvalue = col[j].fvalue;
+            int nid = this->position[ridx];
+            if (nid < 0) continue;
+            // go back to parent, correct those who are not default
+            nid = tree[nid].parent();
+            if (tree[nid].split_index() == fid) {
+              if (fvalue < tree[nid].split_cond()) {
+                if (!tree[nid].default_left()) bitmap.SetTrue(ridx);
+              } else {
+                if (tree[nid].default_left()) bitmap.SetTrue(ridx);
+              }
+            }
+          }
+        }
+      }
+      // TODO: communicate bitmap between workers
+      // sync::AllReduce();
+      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+      // assign every marked row to the non-default child of its parent
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
+        const bst_uint ridx = rowset[i];
+        int nid = this->position[ridx];
+        if (nid >= 0 && bitmap.Get(ridx)) {
+          nid = tree[nid].parent();
+          if (tree[nid].default_left()) {
+            this->position[ridx] = tree[nid].cright();
+          } else {
+            this->position[ridx] = tree[nid].cleft();
+          }
+        }
+      }
+    }
+    // merge the per-thread best split candidates into snode for each node in qexpand
+    virtual void SyncBestSolution(const std::vector<int> &qexpand) {
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const int nid = qexpand[i];
+        for (int tid = 0; tid < this->nthread; ++tid) {
+          this->snode[nid].best.Update(this->stemp[tid][nid].best);
+        }
+      }
+      // TODO: communicate best solution between workers
+      // sync::AllReduce
+    }
+   private:
+    // bit ridx is set when row ridx must move to the non-default child
+    utils::BitMap bitmap;
+  };
+  // training parameter
+  TrainParam param;
+  // builder that does the work, constructed from param
+  Builder builder;
+};
+}  // namespace tree
+}  // namespace xgboost
+#endif
diff --git a/src/utils/bitmap.h b/src/utils/bitmap.h
new file mode 100644
index 000000000000..9c7cf2fc28f6
--- /dev/null
+++ b/src/utils/bitmap.h
@@ -0,0 +1,45 @@
+#ifndef XGBOOST_UTILS_BITMAP_H_
+#define XGBOOST_UTILS_BITMAP_H_
+/*!
+ * \file bitmap.h
+ * \brief a simple implementation of a bitmap
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include "./utils.h"
+
+namespace xgboost {
+namespace utils {
+/*! \brief bit map that contains set of bit indicators, packed 32 per uint32_t word */
+struct BitMap {
+  /*! \brief internal data structure: bit i lives in data[i >> 5] at bit position (i & 31) */
+  std::vector<uint32_t> data;
+  /*!
+   * \brief resize the bitmap so it can hold at least size bits; newly added words are zero
+   * \param size the number of bits the bitmap must hold
+   */
+  inline void Resize(size_t size) {
+    data.resize((size + 31U) >> 5, 0);
+  }
+  /*!
+   * \brief query the i-th position of bitmap, no bounds check is performed
+   * \param i the position to query, must lie within the range set by Resize
+   */
+  inline bool Get(size_t i) const {
+    return ((data[i >> 5] >> (i & 31U)) & 1U) != 0;
+  }
+  /*!
+   * \brief set i-th position to true; plain read-modify-write, not safe for concurrent callers
+   * \param i position index, must lie within the range set by Resize
+   */
+  inline void SetTrue(size_t i) {
+    data[i >> 5] |= (static_cast<uint32_t>(1) << (i & 31U));
+  }
+  /*! \brief clear the bitmap, set all places to false */
+  inline void Clear(void) {
+    std::fill(data.begin(), data.end(), 0U);
+  }
+};
+}  // namespace utils
+}  // namespace xgboost
+#endif

From e29512897315cbc6092e668c4f48bb6ef62f3fd7 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 15 Oct 2014 14:30:09 -0700
Subject: [PATCH 031/166] add bitmap .

---
 src/tree/updater.cpp              |  2 ++
 src/tree/updater_colmaker-inl.hpp | 41 ++++++++++++++++++++++---------
 2 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index 5879b2bbd343..e2c5301429ad 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -5,6 +5,7 @@
 #include "./updater_prune-inl.hpp"
 #include "./updater_refresh-inl.hpp"
 #include "./updater_colmaker-inl.hpp"
+#include "./updater_distcol-inl.hpp"
 
 namespace xgboost {
 namespace tree {
@@ -12,6 +13,7 @@ IUpdater* CreateUpdater(const char *name) {
   using namespace std;
   if (!strcmp(name, "prune")) return new TreePruner();
   if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
+  if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
   if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
   if (!strcmp(name, "grow_colmaker5")) return new ColMaker< CVGradStats<5> >();
   if (!strcmp(name, "grow_colmaker3")) return new ColMaker< CVGradStats<3> >();
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp
index 9c2740264534..596c8c8f50c2 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -36,10 +36,11 @@ class ColMaker: public IUpdater {
       Builder builder(param);
       builder.Update(gpair, p_fmat, info, trees[i]);
     }
+
     param.learning_rate = lr;
   }
 
- private:
+ protected:
   // training parameter
   TrainParam param;
   // data structure
@@ -108,7 +109,7 @@ class ColMaker: public IUpdater {
       }
     }
 
-   private:
+   protected:
     // initialize temp data structure
     inline void InitData(const std::vector<bst_gpair> &gpair,
                          const IFMatrix &fmat,
@@ -409,7 +410,7 @@ class ColMaker: public IUpdater {
       }
     }
     // update the solution candidate 
-    virtual void UpdateSolution(const ColBatch &batch,                                
+    virtual void UpdateSolution(const ColBatch &batch,
                                 const std::vector<bst_gpair> &gpair,
                                 const IFMatrix &fmat,
                                 const BoosterInfo &info) {
@@ -463,12 +464,11 @@ class ColMaker: public IUpdater {
         this->UpdateSolution(iter->Value(), gpair, *p_fmat, info);
       }
       // after this each thread's stemp will get the best candidates, aggregate results
+      this->SyncBestSolution(qexpand);
+      // get the best result, we can synchronize the solution
       for (size_t i = 0; i < qexpand.size(); ++i) {
         const int nid = qexpand[i];
-        NodeEntry &e = snode[nid];
-        for (int tid = 0; tid < this->nthread; ++tid) {
-          e.best.Update(stemp[tid][nid].best);
-        }
+        NodeEntry &e = snode[nid];        
         // now we know the solution in snode[nid], set split
         if (e.best.loss_chg > rt_eps) {
           p_tree->AddChilds(nid);
@@ -476,9 +476,8 @@ class ColMaker: public IUpdater {
         } else {
           (*p_tree)[nid].set_leaf(e.weight * param.learning_rate);
         }
-      }
+      } 
     }
-
     // reset position of each data points after split is created in the tree
     inline void ResetPosition(const std::vector<int> &qexpand, IFMatrix *p_fmat, const RegTree &tree) {
       const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
@@ -490,18 +489,36 @@ class ColMaker: public IUpdater {
         const int nid = position[ridx];
         if (nid >= 0) {
           if (tree[nid].is_leaf()) {
-            position[ridx] = -1;
+            position[ridx] = - nid - 1;
           } else {
             // push to default branch, correct latter
             position[ridx] = tree[nid].default_left() ? tree[nid].cleft(): tree[nid].cright();
           }
         }
       }
+      // set the positions in the nondefault places
+      this->SetNonDefaultPosition(qexpand, p_fmat, tree);
+    }
+    // customization part
+    // synchronize the best solution of each node
+    virtual void SyncBestSolution(const std::vector<int> &qexpand) {
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const int nid = qexpand[i];
+        NodeEntry &e = snode[nid];
+        for (int tid = 0; tid < this->nthread; ++tid) {
+          e.best.Update(stemp[tid][nid].best);
+        }
+      }
+    }
+    virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
+                                       IFMatrix *p_fmat, const RegTree &tree) {
       // step 2, classify the non-default data into right places
       std::vector<unsigned> fsplits;
       for (size_t i = 0; i < qexpand.size(); ++i) {
         const int nid = qexpand[i];
-        if (!tree[nid].is_leaf()) fsplits.push_back(tree[nid].split_index());
+        if (!tree[nid].is_leaf()) {
+          fsplits.push_back(tree[nid].split_index());
+        }
       }
       std::sort(fsplits.begin(), fsplits.end());
       fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
@@ -518,7 +535,7 @@ class ColMaker: public IUpdater {
             const bst_uint ridx = col[j].index;
             const float fvalue = col[j].fvalue;
             int nid = position[ridx];
-            if (nid == -1) continue;
+            if (nid < 0) continue;
             // go back to parent, correct those who are not default
             nid = tree[nid].parent();
             if (tree[nid].split_index() == fid) {

From f2577fec862ae0f7bf0897586b42814d0cf14215 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 15 Oct 2014 21:39:42 -0700
Subject: [PATCH 032/166] intial version of sync wrapper

---
 Makefile          | 16 ++++++--
 src/sync/sync.cpp | 61 +++++++++++++++++++++++++++++++
 src/sync/sync.h   | 93 +++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 163 insertions(+), 7 deletions(-)
 create mode 100644 src/sync/sync.cpp

diff --git a/Makefile b/Makefile
index 3230661d4643..2852b7ac578b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
 export CC  = gcc
 export CXX = g++
+export MPICXX = mpicxx
 export LDFLAGS= -pthread -lm 
-
-export CFLAGS = -Wall -O3 -msse2  -Wno-unknown-pragmas -fPIC -pedantic 
+export CFLAGS = -Wall -O3 -msse2  -Wno-unknown-pragmas -fPIC 
 
 ifeq ($(no_omp),1)
 	CFLAGS += -DDISABLE_OPENMP 
@@ -13,11 +13,13 @@ endif
 # specify tensor path
 BIN = xgboost
 OBJ = updater.o gbm.o io.o
+MPIOBJ = sync.o
+MPIBIN = test/test
 SLIB = wrapper/libxgboostwrapper.so 
 
 .PHONY: clean all python Rpack
 
-all: $(BIN) $(OBJ) $(SLIB) 
+all: $(BIN) $(OBJ) $(SLIB) $(MPIOBJ) $(MPIBIN)
 
 python: wrapper/libxgboostwrapper.so
 # now the wrapper takes in two files. io and wrapper part
@@ -25,8 +27,10 @@ wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
 updater.o: src/tree/updater.cpp  src/tree/*.hpp src/*.h src/tree/*.h
 gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
 io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
+sync.o: src/sync/sync.cpp 
 xgboost: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
 wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
+test/test: test/test.cpp sync.o
 
 $(BIN) : 
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
@@ -37,6 +41,12 @@ $(SLIB) :
 $(OBJ) : 
 	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
 
+$(MPIOBJ) : 
+	$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
+
+$(MPIBIN) : 
+	$(MPICXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
+
 install:
 	cp -f -r $(BIN)  $(INSTALL_PATH)
 
diff --git a/src/sync/sync.cpp b/src/sync/sync.cpp
new file mode 100644
index 000000000000..705d19faed67
--- /dev/null
+++ b/src/sync/sync.cpp
@@ -0,0 +1,61 @@
+#include "./sync.h"
+#include "../utils/utils.h"
+#include "mpi.h"
+
+namespace xgboost {
+namespace sync {
+
+// code for reduce handle
+ReduceHandle::ReduceHandle(void) : handle(NULL) {
+}
+ReduceHandle::~ReduceHandle(void) {
+  if (handle != NULL) {
+    MPI::Op *op = reinterpret_cast<MPI::Op*>(handle);
+    op->Free();
+    delete op;
+  }
+}
+void ReduceHandle::Init(ReduceFunction redfunc, bool commute) {
+  utils::Assert(handle == NULL, "cannot initialize reduce handle twice");
+  MPI::Op *op = new MPI::Op();
+  MPI::User_function *pf = reinterpret_cast<MPI::User_function*>(redfunc);
+  op->Init(pf, commute);
+  handle = op;
+}
+void ReduceHandle::AllReduce(void *sendrecvbuf, size_t n4byte) {
+  utils::Assert(handle != NULL, "must intialize handle to call AllReduce");  
+  MPI::Op *op = reinterpret_cast<MPI::Op*>(handle);
+  MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, n4byte, MPI_INT, *op);
+}
+
+int GetRank(void) {
+  return MPI::COMM_WORLD.Get_rank();
+}
+
+void Init(int argc, char *argv[]) {
+  MPI::Init(argc, argv);
+}
+
+void Finalize(void) {
+  MPI::Finalize();
+}
+
+void AllReduce_(void *sendrecvbuf, int count, const MPI::Datatype &dtype, ReduceOp op) {
+  switch(op) {
+    case kBitwiseOR: MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, dtype, MPI::BOR); return;
+    case kSum: MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, dtype, MPI::SUM); return;
+  }
+}
+
+template<>
+void AllReduce<uint32_t>(uint32_t *sendrecvbuf, int count, ReduceOp op) {
+  AllReduce_(sendrecvbuf, count, MPI::UNSIGNED, op);
+}
+
+template<>
+void AllReduce<float>(float *sendrecvbuf, int count, ReduceOp op) {
+  AllReduce_(sendrecvbuf, count, MPI::FLOAT, op);
+}
+
+}  // namespace sync
+}  // namespace xgboost
diff --git a/src/sync/sync.h b/src/sync/sync.h
index 1d9be719c47f..0548a3c80acc 100644
--- a/src/sync/sync.h
+++ b/src/sync/sync.h
@@ -5,14 +5,99 @@
  * \brief interface to do synchronization
  * \author Tianqi Chen
  */
+#include <cstdio>
+#include <cstring>
+#include "../utils/utils.h"
+
 namespace xgboost {
+/*! \brief syncrhonizer module that minimum wraps MPI */
 namespace sync {
+/*! \brief reduce operator supported */
+enum ReduceOp {
+  kBitwiseOR,
+  kSum
+};
+
+typedef void (ReduceFunction) (const void *src, void *dst, int len);
+
+/* !\brief handle for customized reducer */
+class ReduceHandle {
+ public:
+  // constructor
+  ReduceHandle(void);
+  // destructor
+  ~ReduceHandle(void);
+  // initialize the reduce function
+  void Init(ReduceFunction redfunc, bool commute = true);
+  /*!
+   * \brief customized in-place all reduce operation 
+   * \param sendrecvbuf the in place send-recv buffer
+   * \param n4bytes number of nbytes send through all reduce
+   */
+  void AllReduce(void *sendrecvbuf, size_t n4bytes);
+  
+ private:
+  // handle data field
+  void *handle;
+};
+
+/*! \brief get rank of current process */
+int GetRank(void);
+/*! \brief intiialize the synchronization module */
+void Init(int argc, char *argv[]);
+/*! \brief finalize syncrhonization module */
+void Finalize(void);
+/*!
+ * \brief in-place all reduce operation 
+ * \param sendrecvbuf the in place send-recv buffer
+ * \param count count of data
+ * \param op reduction function
+ */
+template<typename DType>
+void AllReduce(DType *sendrecvbuf, int count, ReduceOp op);
+
 /*! 
- * \brief synchronization context interface of xgboost,
- *        will be provided as a singleton
+ * \brief template class to make customized reduce and all reduce easy  
+ * Do not use reducer directly in the function you call Finalize, because the destructor can happen after Finalize
+ * \tparam DType data type that to be reduced
+ *   DType must be a struct, with no pointer, and contains a function Reduce(const DType &d);
  */
-class IContext {
-  
+template<typename DType>
+class Reducer {
+ public:
+  Reducer(void) {
+    handle.Init(ReduceInner);
+    utils::Assert(sizeof(DType) % sizeof(int) == 0, "struct must be multiple of int");
+  }
+  /*!
+   * \brief customized in-place all reduce operation 
+   * \param sendrecvbuf the in place send-recv buffer
+   * \param bytes number of 4bytes send through all reduce
+   * \param reducer the reducer function
+   */
+  inline void AllReduce(DType *sendrecvbuf, int count) {
+    handle.AllReduce(sendrecvbuf, count * kUnit);
+  }
+
+ private:
+  // unit size 
+  static const size_t kUnit = sizeof(DType) / sizeof(int);
+  // inner implementation of reducer
+  inline static void ReduceInner(const void *src_, void *dst_, int len_) {
+    const int *psrc = reinterpret_cast<const int*>(src_);
+    int *pdst = reinterpret_cast<int*>(dst_);
+    DType tdst, tsrc;
+    utils::Assert(len_ % kUnit == 0, "length not divide by size");
+    for (size_t i = 0; i < len_; i += kUnit) {
+      // use memcpy to avoid alignment issue
+      std::memcpy(&tdst, pdst + i, sizeof(tdst));
+      std::memcpy(&tsrc, psrc + i, sizeof(tsrc));
+      tdst.Reduce(tsrc);
+      std::memcpy(pdst + i, &tdst, sizeof(tdst));      
+    }
+  }
+  // function handle
+  ReduceHandle handle;
 };
 
 }  // namespace sync

From 6680bffaaeabe52122fe7e52109559cdcb9ef36f Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 15 Oct 2014 21:45:13 -0700
Subject: [PATCH 033/166] chg

---
 src/sync/sync.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/sync/sync.h b/src/sync/sync.h
index 0548a3c80acc..57c27c1d823a 100644
--- a/src/sync/sync.h
+++ b/src/sync/sync.h
@@ -10,12 +10,12 @@
 #include "../utils/utils.h"
 
 namespace xgboost {
-/*! \brief syncrhonizer module that minimum wraps MPI */
+/*! \brief synchronizer module that minimally wraps the interface of MPI */
 namespace sync {
 /*! \brief reduce operator supported */
 enum ReduceOp {
-  kBitwiseOR,
-  kSum
+  kSum,
+  kBitwiseOR
 };
 
 typedef void (ReduceFunction) (const void *src, void *dst, int len);
@@ -75,7 +75,7 @@ class Reducer {
    * \param bytes number of 4bytes send through all reduce
    * \param reducer the reducer function
    */
-  inline void AllReduce(DType *sendrecvbuf, int count) {
+  inline void AllReduce(DType *sendrecvbuf, size_t count) {
     handle.AllReduce(sendrecvbuf, count * kUnit);
   }
 

From aefe58a2075908646b823427b5bbb21fb387cf65 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 16 Oct 2014 10:38:49 -0700
Subject: [PATCH 034/166] middle version

---
 Makefile                          | 13 ++++----
 src/sync/sync.cpp                 | 55 ++++++++++++++++++-------------
 src/sync/sync.h                   | 48 +++++++++++++++++----------
 src/tree/model.h                  | 14 ++++++--
 src/tree/param.h                  |  4 +++
 src/tree/updater_colmaker-inl.hpp | 19 +++++++----
 src/tree/updater_distcol-inl.hpp  | 53 ++++++++++++++++++++++++++---
 src/utils/io.h                    | 46 +++++++++++++++++++++++---
 8 files changed, 188 insertions(+), 64 deletions(-)

diff --git a/Makefile b/Makefile
index 2852b7ac578b..b952dbf5d9c4 100644
--- a/Makefile
+++ b/Makefile
@@ -11,11 +11,11 @@ else
 endif
 
 # specify tensor path
-BIN = xgboost
-OBJ = updater.o gbm.o io.o
+BIN = 
+OBJ = updater.o gbm.o io.o main.o
 MPIOBJ = sync.o
-MPIBIN = test/test
-SLIB = wrapper/libxgboostwrapper.so 
+MPIBIN = test/test xgboost
+SLIB = #wrapper/libxgboostwrapper.so 
 
 .PHONY: clean all python Rpack
 
@@ -28,8 +28,9 @@ updater.o: src/tree/updater.cpp  src/tree/*.hpp src/*.h src/tree/*.h
 gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
 io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
 sync.o: src/sync/sync.cpp 
-xgboost: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
-wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
+main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h 
+xgboost: $(OBJ) $(MPIOBJ)
+#wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
 test/test: test/test.cpp sync.o
 
 $(BIN) : 
diff --git a/src/sync/sync.cpp b/src/sync/sync.cpp
index 705d19faed67..ced5e2cb1728 100644
--- a/src/sync/sync.cpp
+++ b/src/sync/sync.cpp
@@ -5,29 +5,6 @@
 namespace xgboost {
 namespace sync {
 
-// code for reduce handle
-ReduceHandle::ReduceHandle(void) : handle(NULL) {
-}
-ReduceHandle::~ReduceHandle(void) {
-  if (handle != NULL) {
-    MPI::Op *op = reinterpret_cast<MPI::Op*>(handle);
-    op->Free();
-    delete op;
-  }
-}
-void ReduceHandle::Init(ReduceFunction redfunc, bool commute) {
-  utils::Assert(handle == NULL, "cannot initialize reduce handle twice");
-  MPI::Op *op = new MPI::Op();
-  MPI::User_function *pf = reinterpret_cast<MPI::User_function*>(redfunc);
-  op->Init(pf, commute);
-  handle = op;
-}
-void ReduceHandle::AllReduce(void *sendrecvbuf, size_t n4byte) {
-  utils::Assert(handle != NULL, "must intialize handle to call AllReduce");  
-  MPI::Op *op = reinterpret_cast<MPI::Op*>(handle);
-  MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, n4byte, MPI_INT, *op);
-}
-
 int GetRank(void) {
   return MPI::COMM_WORLD.Get_rank();
 }
@@ -57,5 +34,37 @@ void AllReduce<float>(float *sendrecvbuf, int count, ReduceOp op) {
   AllReduce_(sendrecvbuf, count, MPI::FLOAT, op);
 }
 
+void Bcast(std::string *sendrecv_data, int root) {
+  unsigned len = static_cast<unsigned>(sendrecv_data->length());  // local length; overwritten with root's value by the Bcast below
+  MPI::COMM_WORLD.Bcast(&len, 1, MPI::UNSIGNED, root);  // step 1: share the payload length
+  sendrecv_data->resize(len);  // size the local string to the broadcast length
+  if (len != 0) {  // skip the payload broadcast (and indexing [0]) for an empty string
+    MPI::COMM_WORLD.Bcast(&(*sendrecv_data)[0], len, MPI::CHAR, root);  
+  }
+}
+
+// code for reduce handle
+ReduceHandle::ReduceHandle(void) : handle(NULL) {
+}
+ReduceHandle::~ReduceHandle(void) {
+  if (handle != NULL) {
+    MPI::Op *op = reinterpret_cast<MPI::Op*>(handle);
+    op->Free();
+    delete op;
+  }
+}
+void ReduceHandle::Init(ReduceFunction redfunc, bool commute) {
+  utils::Assert(handle == NULL, "cannot initialize reduce handle twice");
+  MPI::Op *op = new MPI::Op();  // owned by this handle; freed and deleted in the destructor
+  MPI::User_function *pf = reinterpret_cast<MPI::User_function*>(redfunc);  // NOTE(review): MPI::User_function has a 4th `const Datatype&` parameter that ReduceFunction lacks - confirm this cast is safe on the target ABI
+  op->Init(pf, commute);
+  handle = op;  // kept as void*; presumably so that sync.h does not need to include mpi.h
+}
+void ReduceHandle::AllReduce(void *sendrecvbuf, size_t n4byte) {
+  utils::Assert(handle != NULL, "must intialize handle to call AllReduce");  
+  MPI::Op *op = reinterpret_cast<MPI::Op*>(handle);  // recover the MPI::Op stored by Init
+  MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, n4byte, MPI_INT, *op);  // in place: the buffer is reduced as n4byte MPI_INT elements
+}
+
 }  // namespace sync
 }  // namespace xgboost
diff --git a/src/sync/sync.h b/src/sync/sync.h
index 57c27c1d823a..cf82597e0063 100644
--- a/src/sync/sync.h
+++ b/src/sync/sync.h
@@ -18,11 +18,39 @@ enum ReduceOp {
   kBitwiseOR
 };
 
-typedef void (ReduceFunction) (const void *src, void *dst, int len);
+/*! \brief get rank of current process */
+int GetRank(void);
+/*! \brief initialize the synchronization module */
+void Init(int argc, char *argv[]);
+/*! \brief finalize the synchronization module */
+void Finalize(void);
+
+/*!
+ * \brief in-place all reduce operation 
+ * \param sendrecvbuf the in place send-recv buffer
+ * \param count count of data
+ * \param op reduction function
+ */
+template<typename DType>
+void AllReduce(DType *sendrecvbuf, int count, ReduceOp op);
+
+/*!
+ * \brief broadcast an std::string to all others from root
+ * \param sendrecv_data the pointer to send or receive buffer,
+ *                      receive buffer does not need to be pre-allocated
+ *                      and string will be resized to correct length
+ * \param root the root of process
+ */
+void Bcast(std::string *sendrecv_data, int root);
 
-/* !\brief handle for customized reducer */
+/*! 
+ * \brief handle for customized reducer 
+ * users do not need to use this directly; use Reducer instead
+ */
 class ReduceHandle {
  public:
+  // reduce function
+  typedef void (ReduceFunction) (const void *src, void *dst, int len);
   // constructor
   ReduceHandle(void);
   // destructor
@@ -41,22 +69,8 @@ class ReduceHandle {
   void *handle;
 };
 
-/*! \brief get rank of current process */
-int GetRank(void);
-/*! \brief intiialize the synchronization module */
-void Init(int argc, char *argv[]);
-/*! \brief finalize syncrhonization module */
-void Finalize(void);
+// ----- extensions for ease of use ------
 /*!
- * \brief in-place all reduce operation 
- * \param sendrecvbuf the in place send-recv buffer
- * \param count count of data
- * \param op reduction function
- */
-template<typename DType>
-void AllReduce(DType *sendrecvbuf, int count, ReduceOp op);
-
-/*! 
  * \brief template class to make customized reduce and all reduce easy  
  * Do not use reducer directly in the function you call Finalize, because the destructor can happen after Finalize
  * \tparam DType data type that to be reduced
diff --git a/src/tree/model.h b/src/tree/model.h
index 8049a160811a..dbc35b3b4c76 100644
--- a/src/tree/model.h
+++ b/src/tree/model.h
@@ -110,6 +110,10 @@ class TreeModel {
     inline bool is_left_child(void) const {
       return (parent_ & (1U << 31)) != 0;
     }
+    /*! \brief whether this node is deleted */
+    inline bool is_deleted(void) const {
+      return sindex_ == std::numeric_limits<unsigned>::max();
+    }
     /*! \brief whether current node is root */
     inline bool is_root(void) const {
       return parent_ == -1;
@@ -144,7 +148,11 @@ class TreeModel {
       this->cleft_ = -1;
       this->cright_ = right;
     }
-
+    /*! \brief mark that this node is deleted */
+    inline void mark_delete(void) {
+      this->sindex_ = std::numeric_limits<unsigned>::max();
+    }
+    
    private:
     friend class TreeModel<TSplitCond, TNodeStat>;
     /*! 
@@ -197,11 +205,11 @@ class TreeModel {
     leaf_vector.resize(param.num_nodes * param.size_leaf_vector); 
     return nd;
   }
-  // delete a tree node
+  // delete a tree node, keep the parent field to allow trace back
   inline void DeleteNode(int nid) {
     utils::Assert(nid >= param.num_roots, "can not delete root");
     deleted_nodes.push_back(nid);
-    nodes[nid].set_parent(-1);
+    nodes[nid].mark_delete();
     ++param.num_deleted;
   }
 
diff --git a/src/tree/param.h b/src/tree/param.h
index 4a7c790a6aeb..cf646a76e330 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -345,6 +345,10 @@ struct SplitEntry{
       return false;
     }
   }
+  /*! \brief same as update, used by AllReduce*/
+  inline void Reduce(const SplitEntry &e) {
+    this->Update(e);
+  }
   /*!\return feature index to split on */
   inline unsigned split_index(void) const {
     return sindex & ((1U << 31) - 1U);
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp
index 596c8c8f50c2..347326fe76bf 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -486,13 +486,17 @@ class ColMaker: public IUpdater {
       #pragma omp parallel for schedule(static)
       for (bst_omp_uint i = 0; i < ndata; ++i) {
         const bst_uint ridx = rowset[i];
-        const int nid = position[ridx];
-        if (nid >= 0) {
-          if (tree[nid].is_leaf()) {
-            position[ridx] = - nid - 1;
+        int nid = position[ridx];
+        if (nid < 0)  nid = ~nid;        
+        if (tree[nid].is_leaf()) {
+          position[ridx] = ~nid;          
+        } else {
+          // push to default branch, correct latter
+          int pid = tree[nid].default_left() ? tree[nid].cleft(): tree[nid].cright();
+          if (position[ridx] < 0) {
+            position[ridx] = ~pid;
           } else {
-            // push to default branch, correct latter
-            position[ridx] = tree[nid].default_left() ? tree[nid].cleft(): tree[nid].cright();
+            position[ridx] = pid;
           }
         }
       }
@@ -535,7 +539,8 @@ class ColMaker: public IUpdater {
             const bst_uint ridx = col[j].index;
             const float fvalue = col[j].fvalue;
             int nid = position[ridx];
-            if (nid < 0) continue;
+            if (nid < 0)  nid = ~nid;            
+
             // go back to parent, correct those who are not default
             nid = tree[nid].parent();
             if (tree[nid].split_index() == fid) {
diff --git a/src/tree/updater_distcol-inl.hpp b/src/tree/updater_distcol-inl.hpp
index f5d37c1fc59c..aae0845610f0 100644
--- a/src/tree/updater_distcol-inl.hpp
+++ b/src/tree/updater_distcol-inl.hpp
@@ -7,7 +7,10 @@
  * \author Tianqi Chen
  */
 #include "../utils/bitmap.h"
+#include "../utils/io.h"
+#include "../sync/sync.h"
 #include "./updater_colmaker-inl.hpp"
+#include "./updater_prune-inl.hpp"
 
 namespace xgboost {
 namespace tree {
@@ -19,6 +22,7 @@ class DistColMaker : public ColMaker<TStats> {
   // set training parameter
   virtual void SetParam(const char *name, const char *val) {
     param.SetParam(name, val);
+    pruner.SetParam(name, val);
   }
   virtual void Update(const std::vector<bst_gpair> &gpair,
                       IFMatrix *p_fmat,
@@ -26,15 +30,46 @@ class DistColMaker : public ColMaker<TStats> {
                       const std::vector<RegTree*> &trees) {    
     TStats::CheckInfo(info);
     utils::Check(trees.size() == 1, "DistColMaker: only support one tree at a time");
+    // build the tree
     builder.Update(gpair, p_fmat, info, trees[0]);
+    // prune the tree
+    pruner.Update(gpair, p_fmat, info, trees);
+    this->SyncTrees(trees[0]);
+    // update position after the tree is pruned
+    builder.UpdatePosition(p_fmat, *trees[0]);
   }
+
  private:
+  inline void SyncTrees(RegTree *tree) {
+    std::string s_model;
+    utils::MemoryBufferStream fs(&s_model);
+    int rank = sync::GetRank();
+    if (rank == 0) {
+      tree->SaveModel(fs);
+      sync::Bcast(&s_model, 0);
+    } else {
+      sync::Bcast(&s_model, 0);
+      tree->LoadModel(fs);
+    }
+  }  
   struct Builder : public ColMaker<TStats>::Builder {
    public:
     Builder(const TrainParam &param) 
         : ColMaker<TStats>::Builder(param) {
     }
-   protected:
+    inline void UpdatePosition(IFMatrix *p_fmat, const RegTree &tree) {
+      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
+        const bst_uint ridx = rowset[i];
+        int nid = this->position[ridx];
+        if (nid < 0) {
+          
+        }
+      }
+    }
+   protected:    
     virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
                                        IFMatrix *p_fmat, const RegTree &tree) {
       // step 2, classify the non-default data into right places
@@ -80,8 +115,8 @@ class DistColMaker : public ColMaker<TStats> {
         }
       }
       // communicate bitmap
-      //sync::AllReduce();
-      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();   
+      sync::AllReduce(BeginPtr(bitmap.data), bitmap.data.size(), sync::kBitwiseOR);
+      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
       // get the new position
       const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
       #pragma omp parallel for schedule(static)
@@ -100,19 +135,29 @@ class DistColMaker : public ColMaker<TStats> {
     }
     // synchronize the best solution of each node
     virtual void SyncBestSolution(const std::vector<int> &qexpand) {
+      std::vector<SplitEntry> vec;
       for (size_t i = 0; i < qexpand.size(); ++i) {
         const int nid = qexpand[i];
         for (int tid = 0; tid < this->nthread; ++tid) {
           this->snode[nid].best.Update(this->stemp[tid][nid].best);
         }
+        vec.push_back(this->snode[nid].best);
       }
       // communicate best solution
-      // sync::AllReduce
+      reducer.AllReduce(BeginPtr(vec), vec.size());
+      // assign solution back
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const int nid = qexpand[i];
+        this->snode[nid].best = vec[i];
+      }
     }
     
    private:
     utils::BitMap bitmap;
+    sync::Reducer<SplitEntry> reducer;
   };
+  // we directly introduce pruner here
+  TreePruner pruner;
   // training parameter
   TrainParam param;
   // pointer to the builder
diff --git a/src/utils/io.h b/src/utils/io.h
index d4746681a2bb..7dd550dc892e 100644
--- a/src/utils/io.h
+++ b/src/utils/io.h
@@ -92,11 +92,49 @@ class IStream {
 class ISeekStream: public IStream {
  public:
   /*! \brief seek to certain position of the file */
-  virtual void Seek(long pos) = 0;
+  virtual void Seek(size_t pos) = 0;
   /*! \brief tell the position of the stream */
-  virtual long Tell(void) = 0;
+  virtual size_t Tell(void) = 0;
 };
 
+/*! \brief an in-memory buffer that can be read and written through the stream interface */
+struct MemoryBufferStream : public ISeekStream {
+ public:
+  MemoryBufferStream(std::string *p_buffer) 
+      : p_buffer_(p_buffer) {
+    curr_ptr_ = 0;  // start at the beginning of the caller's string
+  }
+  virtual ~MemoryBufferStream(void) {}  // the string is not owned, nothing to release
+  virtual size_t Read(void *ptr, size_t size) {
+    utils::Assert(curr_ptr_ <= p_buffer_->length(),
+                  "read can not have position excceed buffer length");
+    size_t nread = std::min(p_buffer_->length() - curr_ptr_, size);  // clamp to the bytes remaining
+    if (nread != 0) memcpy(ptr, &(*p_buffer_)[0] + curr_ptr_, nread);
+    curr_ptr_ += nread;
+    return nread;  // may be less than size (0 at end of buffer)
+  }
+  virtual void Write(const void *ptr, size_t size) {
+    if (size == 0) return;
+    if (curr_ptr_ + size > p_buffer_->length()) {
+      p_buffer_->resize(curr_ptr_+size);  // grow the string so the write fits
+    }
+    memcpy(&(*p_buffer_)[0] + curr_ptr_, ptr, size); 
+    curr_ptr_ += size;
+  }
+  virtual void Seek(size_t pos) {
+    curr_ptr_ = static_cast<size_t>(pos);  // not range-checked here; Read asserts position <= length
+  }
+  virtual size_t Tell(void) {
+    return curr_ptr_;
+  }
+
+ private:
+  /*! \brief in memory buffer, supplied by the caller */
+  std::string *p_buffer_;
+  /*! \brief current read/write offset into the buffer, in bytes */
+  size_t curr_ptr_;
+}; // class MemoryBufferStream
+
 /*! \brief implementation of file i/o stream */
 class FileStream : public ISeekStream {
  public:
@@ -110,10 +148,10 @@ class FileStream : public ISeekStream {
   virtual void Write(const void *ptr, size_t size) {
     std::fwrite(ptr, size, 1, fp);
   }
-  virtual void Seek(long pos) {
+  virtual void Seek(size_t pos) {
     std::fseek(fp, pos, SEEK_SET);
   }
-  virtual long Tell(void) {
+  virtual size_t Tell(void) {
     return std::ftell(fp);
   }
   inline void Close(void) {

From 47145a7fac21ebeb68e76b1ed5cb4e2338e0575a Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 16 Oct 2014 11:56:55 -0700
Subject: [PATCH 035/166] ok, now work on update position

---
 Makefile                          |  2 +-
 src/tree/updater_colmaker-inl.hpp | 65 ++++++++++++++++++++-----------
 src/tree/updater_distcol-inl.hpp  | 17 ++++----
 3 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/Makefile b/Makefile
index b952dbf5d9c4..bdc4fb58349f 100644
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,7 @@ endif
 BIN = 
 OBJ = updater.o gbm.o io.o main.o
 MPIOBJ = sync.o
-MPIBIN = test/test xgboost
+MPIBIN = xgboost
 SLIB = #wrapper/libxgboostwrapper.so 
 
 .PHONY: clean all python Rpack
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp
index 347326fe76bf..6db19732e1b8 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -132,17 +132,17 @@ class ColMaker: public IUpdater {
         // mark delete for the deleted datas
         for (size_t i = 0; i < rowset.size(); ++i) {
           const bst_uint ridx = rowset[i];
-          if (gpair[ridx].hess < 0.0f) position[ridx] = -1;
+          if (gpair[ridx].hess < 0.0f) position[ridx] = ~position[ridx];
         }
         // mark subsample
         if (param.subsample < 1.0f) {
           for (size_t i = 0; i < rowset.size(); ++i) {
             const bst_uint ridx = rowset[i];
             if (gpair[ridx].hess < 0.0f) continue;
-            if (random::SampleBinary(param.subsample) == 0) position[ridx] = -1;
+            if (random::SampleBinary(param.subsample) == 0) position[ridx] = ~position[ridx];
           }
         }
-      }    
+      }
       {
         // initialize feature index
         unsigned ncol = static_cast<unsigned>(fmat.NumCol());
@@ -473,6 +473,9 @@ class ColMaker: public IUpdater {
         if (e.best.loss_chg > rt_eps) {
           p_tree->AddChilds(nid);
           (*p_tree)[nid].set_split(e.best.split_index(), e.best.split_value, e.best.default_left());
+          // mark right child as 0, to indicate fresh leaf
+          (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
+          (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
         } else {
           (*p_tree)[nid].set_leaf(e.weight * param.learning_rate);
         }
@@ -480,28 +483,33 @@ class ColMaker: public IUpdater {
     }
     // reset position of each data points after split is created in the tree
     inline void ResetPosition(const std::vector<int> &qexpand, IFMatrix *p_fmat, const RegTree &tree) {
+      // set the positions in the nondefault
+      this->SetNonDefaultPosition(qexpand, p_fmat, tree);      
+      // set rest of instances to default position
       const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
-      // step 1, set default direct nodes to default, and leaf nodes to -1
+      // set default direct nodes to default
+      // for leaf nodes that are not fresh, mark them as ~nid,
+      // so that they are ignored in future statistics collection
       const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+      
       #pragma omp parallel for schedule(static)
       for (bst_omp_uint i = 0; i < ndata; ++i) {
         const bst_uint ridx = rowset[i];
-        int nid = position[ridx];
-        if (nid < 0)  nid = ~nid;        
+        const int nid = this->DecodePosition(ridx);
         if (tree[nid].is_leaf()) {
-          position[ridx] = ~nid;          
+          // mark finish when it is not a fresh leaf
+          if (tree[nid].cright() == -1) {
+            position[ridx] = ~nid;
+          }
         } else {
-          // push to default branch, correct latter
-          int pid = tree[nid].default_left() ? tree[nid].cleft(): tree[nid].cright();
-          if (position[ridx] < 0) {
-            position[ridx] = ~pid;
+          // push to default branch
+          if (tree[nid].default_left()) {
+            this->SetEncodePosition(ridx, tree[nid].cleft());
           } else {
-            position[ridx] = pid;
+            this->SetEncodePosition(ridx, tree[nid].cright());
           }
         }
       }
-      // set the positions in the nondefault places
-      this->SetNonDefaultPosition(qexpand, p_fmat, tree);
     }
     // customization part
     // synchronize the best solution of each node
@@ -516,7 +524,7 @@ class ColMaker: public IUpdater {
     }
     virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
                                        IFMatrix *p_fmat, const RegTree &tree) {
-      // step 2, classify the non-default data into right places
+      // step 1, classify the non-default data into right places
       std::vector<unsigned> fsplits;
       for (size_t i = 0; i < qexpand.size(); ++i) {
         const int nid = qexpand[i];
@@ -538,22 +546,33 @@ class ColMaker: public IUpdater {
           for (bst_omp_uint j = 0; j < ndata; ++j) {
             const bst_uint ridx = col[j].index;
             const float fvalue = col[j].fvalue;
-            int nid = position[ridx];
-            if (nid < 0)  nid = ~nid;            
-
+            const int nid = this->DecodePosition(ridx);
             // go back to parent, correct those who are not default
-            nid = tree[nid].parent();
-            if (tree[nid].split_index() == fid) {
-              if (fvalue < tree[nid].split_cond()) {
-                position[ridx] = tree[nid].cleft();
+            if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
+              if(fvalue < tree[nid].split_cond()) {
+                this->SetEncodePosition(ridx, tree[nid].cleft());
               } else {
-                position[ridx] = tree[nid].cright();
+                this->SetEncodePosition(ridx, tree[nid].cright());
               }
             }
           }
         }
       }
     }
+    // utils to get/set position, with encoded format
+    // return decoded position
+    inline int DecodePosition(bst_uint ridx) const{
+      const int pid = position[ridx];
+      return pid < 0 ? ~pid : pid;
+    }
+    // set position of ridx to nid, stored as ~nid when the current value is negative
+    inline void SetEncodePosition(bst_uint ridx, int nid) {
+      if (position[ridx] < 0) {
+        position[ridx] = ~nid;
+      } else {
+        position[ridx] = nid;
+      }
+    }
     //--data fields--
     const TrainParam &param;
     // number of omp thread used during training
diff --git a/src/tree/updater_distcol-inl.hpp b/src/tree/updater_distcol-inl.hpp
index aae0845610f0..e5d1450a59de 100644
--- a/src/tree/updater_distcol-inl.hpp
+++ b/src/tree/updater_distcol-inl.hpp
@@ -100,11 +100,8 @@ class DistColMaker : public ColMaker<TStats> {
           for (bst_omp_uint j = 0; j < ndata; ++j) {
             const bst_uint ridx = col[j].index;
             const float fvalue = col[j].fvalue;
-            int nid = this->position[ridx];
-            if (nid < 0) continue;
-            // go back to parent, correct those who are not default
-            nid = tree[nid].parent();
-            if (tree[nid].split_index() == fid) {
+            const int nid = this->DecodePosition(ridx);
+            if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
               if (fvalue < tree[nid].split_cond()) {
                 if (!tree[nid].default_left()) bitmap.SetTrue(ridx);
               } else {
@@ -122,13 +119,13 @@ class DistColMaker : public ColMaker<TStats> {
       #pragma omp parallel for schedule(static)
       for (bst_omp_uint i = 0; i < ndata; ++i) {
         const bst_uint ridx = rowset[i];
-        int nid = this->position[ridx];
-        if (nid >= 0 && bitmap.Get(ridx)) {
-          nid = tree[nid].parent();
+        const int nid = this->DecodePosition(ridx);
+        if (bitmap.Get(ridx)) {
+          utils::Assert(!tree[nid].is_leaf(), "inconsistent reduce information");
           if (tree[nid].default_left()) {
-            this->position[ridx] = tree[nid].cright();
+            this->SetEncodePosition(ridx, tree[nid].cright());
           } else {
-            this->position[ridx] = tree[nid].cleft();
+            this->SetEncodePosition(ridx, tree[nid].cright());
           }
         }
       }

From a21df0770dc53a11147300e443ba159f8184add5 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 16 Oct 2014 13:03:42 -0700
Subject: [PATCH 036/166] make clear separation

---
 Makefile                             | 19 ++++++++++---------
 demo/binary_classification/runexp.sh |  8 ++++----
 src/sync/sync_empty.cpp              | 27 +++++++++++++++++++++++++++
 src/sync/{sync.cpp => sync_mpi.cpp}  |  3 +--
 src/tree/updater_distcol-inl.hpp     | 15 +++++++++------
 src/xgboost_main.cpp                 | 15 ++++++++++++---
 6 files changed, 63 insertions(+), 24 deletions(-)
 create mode 100644 src/sync/sync_empty.cpp
 rename src/sync/{sync.cpp => sync_mpi.cpp} (98%)

diff --git a/Makefile b/Makefile
index bdc4fb58349f..a6e7f3daa774 100644
--- a/Makefile
+++ b/Makefile
@@ -11,11 +11,11 @@ else
 endif
 
 # specify tensor path
-BIN = 
-OBJ = updater.o gbm.o io.o main.o
-MPIOBJ = sync.o
-MPIBIN = xgboost
-SLIB = #wrapper/libxgboostwrapper.so 
+BIN = xgboost
+OBJ = updater.o gbm.o io.o main.o sync_empty.o
+MPIOBJ = sync_mpi.o
+MPIBIN = xgboost-mpi
+SLIB = wrapper/libxgboostwrapper.so 
 
 .PHONY: clean all python Rpack
 
@@ -27,11 +27,12 @@ wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
 updater.o: src/tree/updater.cpp  src/tree/*.hpp src/*.h src/tree/*.h
 gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
 io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
-sync.o: src/sync/sync.cpp 
+sync_mpi.o: src/sync/sync_mpi.cpp 
+sync_empty.o: src/sync/sync_empty.cpp 
 main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h 
-xgboost: $(OBJ) $(MPIOBJ)
-#wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
-test/test: test/test.cpp sync.o
+xgboost:  updater.o gbm.o io.o main.o sync_empty.o
+xgboost-mpi:  updater.o gbm.o io.o main.o sync_mpi.o 
+wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
 
 $(BIN) : 
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
diff --git a/demo/binary_classification/runexp.sh b/demo/binary_classification/runexp.sh
index 68c3e6fb90f9..c1f191e61780 100755
--- a/demo/binary_classification/runexp.sh
+++ b/demo/binary_classification/runexp.sh
@@ -4,12 +4,12 @@ python mapfeat.py
 # split train and test
 python mknfold.py agaricus.txt 1
 # training and output the models
-../../xgboost mushroom.conf
+mpirun ../../xgboost mushroom.conf
 # output prediction task=pred 
-../../xgboost mushroom.conf task=pred model_in=0002.model
+mpirun ../../xgboost mushroom.conf task=pred model_in=0002.model
 # print the boosters of 00002.model in dump.raw.txt
-../../xgboost mushroom.conf task=dump model_in=0002.model name_dump=dump.raw.txt 
+mpirun ../../xgboost mushroom.conf task=dump model_in=0002.model name_dump=dump.raw.txt 
 # use the feature map in printing for better visualization
-../../xgboost mushroom.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
+mpirun ../../xgboost mushroom.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
 cat dump.nice.txt
 
diff --git a/src/sync/sync_empty.cpp b/src/sync/sync_empty.cpp
new file mode 100644
index 000000000000..e46a6906afd1
--- /dev/null
+++ b/src/sync/sync_empty.cpp
@@ -0,0 +1,27 @@
+#include "./sync.h"
+#include "../utils/utils.h"
+// no synchronization module, single thread mode does not need it anyway
+namespace xgboost {
+namespace sync {
+int GetRank(void) {
+  return 0;
+}
+void Init(int argc, char *argv[]) {
+}
+void Finalize(void) {
+}
+template<>
+void AllReduce<uint32_t>(uint32_t *sendrecvbuf, int count, ReduceOp op) {
+}
+template<>
+void AllReduce<float>(float *sendrecvbuf, int count, ReduceOp op) {
+}
+void Bcast(std::string *sendrecv_data, int root) {
+}
+ReduceHandle::ReduceHandle(void) : handle(NULL) {}
+ReduceHandle::~ReduceHandle(void) {}
+void ReduceHandle::Init(ReduceFunction redfunc, bool commute) {}
+void ReduceHandle::AllReduce(void *sendrecvbuf, size_t n4byte) {}
+}  // namespace sync
+}  // namespace xgboost
+
diff --git a/src/sync/sync.cpp b/src/sync/sync_mpi.cpp
similarity index 98%
rename from src/sync/sync.cpp
rename to src/sync/sync_mpi.cpp
index ced5e2cb1728..2890ab609755 100644
--- a/src/sync/sync.cpp
+++ b/src/sync/sync_mpi.cpp
@@ -1,10 +1,9 @@
 #include "./sync.h"
 #include "../utils/utils.h"
 #include "mpi.h"
-
+// use MPI to implement sync
 namespace xgboost {
 namespace sync {
-
 int GetRank(void) {
   return MPI::COMM_WORLD.Get_rank();
 }
diff --git a/src/tree/updater_distcol-inl.hpp b/src/tree/updater_distcol-inl.hpp
index e5d1450a59de..86fb558b26e6 100644
--- a/src/tree/updater_distcol-inl.hpp
+++ b/src/tree/updater_distcol-inl.hpp
@@ -32,13 +32,13 @@ class DistColMaker : public ColMaker<TStats> {
     utils::Check(trees.size() == 1, "DistColMaker: only support one tree at a time");
     // build the tree
     builder.Update(gpair, p_fmat, info, trees[0]);
-    // prune the tree
+    //// prune the tree
     pruner.Update(gpair, p_fmat, info, trees);
     this->SyncTrees(trees[0]);
     // update position after the tree is pruned
     builder.UpdatePosition(p_fmat, *trees[0]);
   }
-
+  
  private:
   inline void SyncTrees(RegTree *tree) {
     std::string s_model;
@@ -63,10 +63,12 @@ class DistColMaker : public ColMaker<TStats> {
       #pragma omp parallel for schedule(static)
       for (bst_omp_uint i = 0; i < ndata; ++i) {
         const bst_uint ridx = rowset[i];
-        int nid = this->position[ridx];
-        if (nid < 0) {
-          
+        int nid = this->DecodePosition(ridx);
+        while (tree[nid].is_deleted()) {
+          nid = tree[nid].parent();
+          utils::Assert(nid >=0, "distributed learning error");
         }
+        this->position[ridx] = nid;
       }
     }
    protected:    
@@ -111,6 +113,7 @@ class DistColMaker : public ColMaker<TStats> {
           }
         }
       }
+
       // communicate bitmap
       sync::AllReduce(BeginPtr(bitmap.data), bitmap.data.size(), sync::kBitwiseOR);
       const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
@@ -125,7 +128,7 @@ class DistColMaker : public ColMaker<TStats> {
           if (tree[nid].default_left()) {
             this->SetEncodePosition(ridx, tree[nid].cright());
           } else {
-            this->SetEncodePosition(ridx, tree[nid].cright());
+            this->SetEncodePosition(ridx, tree[nid].cleft());
           }
         }
       }
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index 75544dd0ec41..e96342f69510 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -5,6 +5,7 @@
 #include <string>
 #include <cstring>
 #include "io/io.h"
+#include "sync/sync.h"
 #include "utils/utils.h"
 #include "utils/config.h"
 #include "learner/learner-inl.hpp"
@@ -19,7 +20,7 @@ class BoostLearnTask{
     if (argc < 2) {
       printf("Usage: <config>\n");
       return 0;
-    }
+    }    
     utils::ConfigIterator itr(argv[1]);
     while (itr.Next()) {
       this->SetParam(itr.name(), itr.val());
@@ -30,6 +31,9 @@ class BoostLearnTask{
         this->SetParam(name, val);
       }
     }
+    if (sync::GetRank() != 0) {
+      this->SetParam("silent", "2");
+    }
     this->InitData();
     this->InitLearner();
     if (task == "dump") {
@@ -145,7 +149,9 @@ class BoostLearnTask{
       if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
       learner.UpdateOneIter(i, *data); 
       std::string res = learner.EvalOneIter(i, devalall, eval_data_names);
-      fprintf(stderr, "%s\n", res.c_str());
+      if (silent < 1) {
+        fprintf(stderr, "%s\n", res.c_str());
+      }
       if (save_period != 0 && (i + 1) % save_period == 0) {
         this->SaveModel(i);
       }
@@ -243,7 +249,10 @@ class BoostLearnTask{
 }
 
 int main(int argc, char *argv[]){
+  xgboost::sync::Init(argc, argv);
   xgboost::random::Seed(0);
   xgboost::BoostLearnTask tsk;
-  return tsk.Run(argc, argv);
+  int ret = tsk.Run(argc, argv);
+  xgboost::sync::Finalize();
+  return ret;
 }

From 0cf2dd39eafa807f5a610c267570637798fdfa52 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 16 Oct 2014 15:12:10 -0700
Subject: [PATCH 037/166] new change for mpi

---
 src/gbm/gblinear-inl.hpp         |  1 +
 src/gbm/gbm.h                    |  3 +++
 src/gbm/gbtree-inl.hpp           | 44 +++++++++++++++++++++++++++++---
 src/learner/learner-inl.hpp      |  4 +--
 src/tree/updater.cpp             |  2 +-
 src/tree/updater.h               | 10 ++++++++
 src/tree/updater_distcol-inl.hpp |  7 ++++-
 wrapper/xgboost_wrapper.cpp      |  2 +-
 8 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/src/gbm/gblinear-inl.hpp b/src/gbm/gblinear-inl.hpp
index 624f15c2882c..cae5cf4f3f7f 100644
--- a/src/gbm/gblinear-inl.hpp
+++ b/src/gbm/gblinear-inl.hpp
@@ -42,6 +42,7 @@ class GBLinear : public IGradBooster {
     model.InitModel();
   }
   virtual void DoBoost(IFMatrix *p_fmat,
+                       int64_t buffer_offset,
                        const BoosterInfo &info,
                        std::vector<bst_gpair> *in_gpair) {
     std::vector<bst_gpair> &gpair = *in_gpair;
diff --git a/src/gbm/gbm.h b/src/gbm/gbm.h
index 07dade4aca7e..00d0bc4445fe 100644
--- a/src/gbm/gbm.h
+++ b/src/gbm/gbm.h
@@ -41,11 +41,14 @@ class IGradBooster {
   /*!
    * \brief peform update to the model(boosting)
    * \param p_fmat feature matrix that provide access to features
+   * \param buffer_offset buffer index offset of these instances, if equals -1
+   *        this means we do not have buffer index allocated to the gbm
    * \param info meta information about training
    * \param in_gpair address of the gradient pair statistics of the data
    * the booster may change content of gpair
    */
   virtual void DoBoost(IFMatrix *p_fmat,
+                       int64_t buffer_offset,
                        const BoosterInfo &info,
                        std::vector<bst_gpair> *in_gpair) = 0;
   /*!
diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
index 08d2164bc3fc..eb526e43ca41 100644
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -19,6 +19,8 @@ namespace gbm {
  */
 class GBTree : public IGradBooster {
  public:
+  GBTree(void) {
+  }
   virtual ~GBTree(void) {
     this->Clear();
   }
@@ -83,11 +85,12 @@ class GBTree : public IGradBooster {
     utils::Assert(trees.size() == 0, "GBTree: model already initialized");
   }
   virtual void DoBoost(IFMatrix *p_fmat,
+                       int64_t buffer_offset,
                        const BoosterInfo &info,
                        std::vector<bst_gpair> *in_gpair) {
     const std::vector<bst_gpair> &gpair = *in_gpair;
     if (mparam.num_output_group == 1) {
-      this->BoostNewTrees(gpair, p_fmat, info, 0);
+      this->BoostNewTrees(gpair, p_fmat, buffer_offset, info, 0);
     } else {
       const int ngroup = mparam.num_output_group;
       utils::Check(gpair.size() % ngroup == 0,
@@ -99,7 +102,7 @@ class GBTree : public IGradBooster {
         for (bst_omp_uint i = 0; i < nsize; ++i) {
           tmp[i] = gpair[i * ngroup + gid];
         }
-        this->BoostNewTrees(tmp, p_fmat, info, gid);
+        this->BoostNewTrees(tmp, p_fmat, buffer_offset, info, gid);
       }
     }
   }
@@ -190,6 +193,7 @@ class GBTree : public IGradBooster {
   // do group specific group
   inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
                             IFMatrix *p_fmat,
+                            int64_t buffer_offset,
                             const BoosterInfo &info,
                             int bst_group) {
     this->InitUpdater();
@@ -206,6 +210,17 @@ class GBTree : public IGradBooster {
     for (size_t i = 0; i < updaters.size(); ++i) {
       updaters[i]->Update(gpair, p_fmat, info, new_trees);
     }
+    // optimization, update buffer, if possible
+    if (buffer_offset >= 0 &&
+        new_trees.size() == 1 && updaters.size() > 0 &&
+        updaters.back()->GetLeafPosition() != NULL) {
+      utils::Check(info.num_row == p_fmat->buffered_rowset().size(),
+                   "distributed mode is not compatible with prob_buffer_row");
+      this->UpdateBufferByPosition(p_fmat,
+                                   buffer_offset, bst_group,
+                                   *new_trees[0],
+                                   updaters.back()->GetLeafPosition());
+    }
     // push back to model
     for (size_t i = 0; i < new_trees.size(); ++i) {
       trees.push_back(new_trees[i]);
@@ -213,13 +228,36 @@ class GBTree : public IGradBooster {
     }
     mparam.num_trees += tparam.num_parallel_tree;
   }
+  // update buffer by pre-cached position
+  inline void UpdateBufferByPosition(IFMatrix *p_fmat,
+                                     int64_t buffer_offset, 
+                                     int bst_group,
+                                     const tree::RegTree &new_tree,
+                                     const int* leaf_position) {
+    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+    #pragma omp parallel for schedule(static)
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
+      const bst_uint ridx = rowset[i];
+      const int64_t bid = mparam.BufferOffset(buffer_offset + ridx, bst_group);
+      const int tid = leaf_position[ridx];
+      utils::Assert(pred_counter[bid] == trees.size(), "cached buffer not up to date");
+      utils::Assert(tid >= 0, "invalid leaf position");
+      pred_buffer[bid] += new_tree[tid].leaf_value();
+      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
+        pred_buffer[bid + i + 1] += new_tree.leafvec(tid)[i];
+      }
+      pred_counter[bid] += 1;
+    }
+  }
   // make a prediction for a single instance
   inline void Pred(const RowBatch::Inst &inst,
                    int64_t buffer_index,
                    int bst_group,
                    unsigned root_index,
                    tree::RegTree::FVec *p_feats,
-                   float *out_pred, size_t stride, unsigned ntree_limit) {
+                   float *out_pred, size_t stride, 
+                   unsigned ntree_limit) {
     size_t itop = 0;
     float  psum = 0.0f;
     // sum of leaf vector 
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index e132349896c4..c8c146b451ae 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -173,7 +173,7 @@ class BoostLearner {
   inline void UpdateOneIter(int iter, const DMatrix &train) {
     this->PredictRaw(train, &preds_);
     obj_->GetGradient(preds_, train.info, iter, &gpair_);
-    gbm_->DoBoost(train.fmat(), train.info.info, &gpair_);
+    gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
   }
   /*!
    * \brief evaluate the model for specific iteration
@@ -335,7 +335,7 @@ class BoostLearner {
   // gradient pairs
   std::vector<bst_gpair> gpair_;
 
- private:
+ protected:
   // cache entry object that helps handle feature caching
   struct CacheEntry {
     const DMatrix *mat_;
diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index e2c5301429ad..75fc39799b4d 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -13,8 +13,8 @@ IUpdater* CreateUpdater(const char *name) {
   using namespace std;
   if (!strcmp(name, "prune")) return new TreePruner();
   if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
-  if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
   if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
+  if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
   if (!strcmp(name, "grow_colmaker5")) return new ColMaker< CVGradStats<5> >();
   if (!strcmp(name, "grow_colmaker3")) return new ColMaker< CVGradStats<3> >();
   utils::Error("unknown updater:%s", name);
diff --git a/src/tree/updater.h b/src/tree/updater.h
index e3a05c84f46a..49adc8dca27c 100644
--- a/src/tree/updater.h
+++ b/src/tree/updater.h
@@ -37,6 +37,16 @@ class IUpdater {
                       IFMatrix *p_fmat,
                       const BoosterInfo &info,
                       const std::vector<RegTree*> &trees) = 0;
+
+  /*! 
+   * \brief this is simply a function for optimizing performance
+   * this function asks the updater to return the leaf position of each instance in the p_fmat,
+   * if it is cached in the updater, if it is not available, return NULL
+   * \return array of leaf position of each instance in the last updated tree
+   */
+  virtual const int* GetLeafPosition(void) const {
+    return NULL;
+  }
   // destructor
   virtual ~IUpdater(void) {}
 };
diff --git a/src/tree/updater_distcol-inl.hpp b/src/tree/updater_distcol-inl.hpp
index 86fb558b26e6..38249b7d49fa 100644
--- a/src/tree/updater_distcol-inl.hpp
+++ b/src/tree/updater_distcol-inl.hpp
@@ -38,7 +38,9 @@ class DistColMaker : public ColMaker<TStats> {
     // update position after the tree is pruned
     builder.UpdatePosition(p_fmat, *trees[0]);
   }
-  
+  virtual const int* GetLeafPosition(void) const {
+    return builder.GetLeafPosition();
+  }  
  private:
   inline void SyncTrees(RegTree *tree) {
     std::string s_model;
@@ -71,6 +73,9 @@ class DistColMaker : public ColMaker<TStats> {
         this->position[ridx] = nid;
       }
     }
+    virtual const int* GetLeafPosition(void) const {
+      return BeginPtr(this->position);
+    }
    protected:    
     virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
                                        IFMatrix *p_fmat, const RegTree &tree) {
diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp
index 2bd7340589c0..b829c55248b3 100644
--- a/wrapper/xgboost_wrapper.cpp
+++ b/wrapper/xgboost_wrapper.cpp
@@ -44,7 +44,7 @@ class Booster: public learner::BoostLearner {
     for (bst_omp_uint j = 0; j < ndata; ++j) {
       gpair_[j] = bst_gpair(grad[j], hess[j]);
     }
-    gbm_->DoBoost(train.fmat(), train.info.info, &gpair_);
+    gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
   }
   inline void CheckInitModel(void) {
     if (!init_model) {

From f512f08437bfa47621556f51e7e5789133825469 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 16 Oct 2014 18:06:47 -0700
Subject: [PATCH 038/166] finish mushroom example

---
 .gitignore                       |  3 +++
 Makefile                         |  4 ++--
 demo/mpi/README.md               |  3 +++
 demo/mpi/mpi.conf                | 36 ++++++++++++++++++++++++++++++++
 demo/mpi/runexp-mpi.sh           | 19 +++++++++++++++++
 demo/mpi/splitsvm.py             | 32 ++++++++++++++++++++++++++++
 src/learner/learner-inl.hpp      |  2 ++
 src/sync/sync.h                  |  5 +++++
 src/sync/sync_empty.cpp          | 10 +++++++++
 src/sync/sync_mpi.cpp            |  5 +++++
 src/tree/updater_distcol-inl.hpp | 20 ++++++++++++------
 src/utils/bitmap.h               | 20 ++++++++++++++++++
 src/xgboost_main.cpp             | 18 ++++++++++++++--
 13 files changed, 167 insertions(+), 10 deletions(-)
 create mode 100644 demo/mpi/README.md
 create mode 100644 demo/mpi/mpi.conf
 create mode 100755 demo/mpi/runexp-mpi.sh
 create mode 100644 demo/mpi/splitsvm.py

diff --git a/.gitignore b/.gitignore
index 1a2a4b48ee46..cb017114b0b3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,3 +46,6 @@ Debug
 *csv
 *.cpage.col
 *.cpage
+xgboost
+xgboost-mpi
+train*
diff --git a/Makefile b/Makefile
index a6e7f3daa774..c99a9a7fe2d8 100644
--- a/Makefile
+++ b/Makefile
@@ -30,9 +30,9 @@ io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
 sync_mpi.o: src/sync/sync_mpi.cpp 
 sync_empty.o: src/sync/sync_empty.cpp 
 main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h 
-xgboost:  updater.o gbm.o io.o main.o sync_empty.o
 xgboost-mpi:  updater.o gbm.o io.o main.o sync_mpi.o 
-wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
+xgboost:  updater.o gbm.o io.o main.o sync_empty.o
+wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h  updater.o gbm.o io.o sync_empty.o
 
 $(BIN) : 
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
diff --git a/demo/mpi/README.md b/demo/mpi/README.md
new file mode 100644
index 000000000000..60fd0eb6eed7
--- /dev/null
+++ b/demo/mpi/README.md
@@ -0,0 +1,3 @@
+This folder contains a toy example script to run xgboost-mpi.
+
+This is an experimental distributed version of xgboost
diff --git a/demo/mpi/mpi.conf b/demo/mpi/mpi.conf
new file mode 100644
index 000000000000..5b1f978d10c1
--- /dev/null
+++ b/demo/mpi/mpi.conf
@@ -0,0 +1,36 @@
+# General Parameters, see comment for each definition
+# choose the booster, can be gbtree or gblinear
+booster = gbtree
+# choose logistic regression loss function for binary classification
+objective = binary:logistic
+
+# Tree Booster Parameters
+# step size shrinkage
+eta = 1.0 
+# minimum loss reduction required to make a further partition
+gamma = 1.0 
+# minimum sum of instance weight(hessian) needed in a child
+min_child_weight = 1 
+# maximum depth of a tree
+max_depth = 3 
+
+# Task Parameters
+# the number of round to do boosting
+num_round = 2
+# 0 means do not save any model except the final round model
+save_period = 0 
+use_buffer = 0
+
+
+# The path of training data; %d is the wildcard for the rank of the data
+# The idea is each process take a feature matrix with subset of columns
+#
+data = "train.col%d" 
+
+# The path of validation data, used to monitor training process, here [test] sets name of the validation set
+eval[test] = "../data/agaricus.txt.test" 
+# evaluate on training data as well each round
+eval_train = 1
+
+# The path of test data; needs the full test data, so try not to use it, or keep a subsampled version
+test:data = "agaricus.txt.test"      
diff --git a/demo/mpi/runexp-mpi.sh b/demo/mpi/runexp-mpi.sh
new file mode 100755
index 000000000000..cc0c6d459a5e
--- /dev/null
+++ b/demo/mpi/runexp-mpi.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: nprocess"
+    exit -1
+fi
+
+rm -rf train.col*
+k=$1
+
+# split the lib svm file into k subfiles
+python splitsvm.py ../data/agaricus.txt.train train $k
+
+# run xgboost mpi
+mpirun -n $k ../../xgboost-mpi  mpi.conf 
+
+# the model can be directly loaded by the single-machine xgboost solver, as usual
+../../xgboost mpi.conf task=dump model_in=0002.model fmap=../data/featmap.txt name_dump=dump.nice.$k.txt
+cat dump.nice.$k.txt
diff --git a/demo/mpi/splitsvm.py b/demo/mpi/splitsvm.py
new file mode 100644
index 000000000000..365aef610c84
--- /dev/null
+++ b/demo/mpi/splitsvm.py
@@ -0,0 +1,32 @@
+#!/usr/bin/python
+import sys
+import random
+
+# split libsvm file into different subcolumns
+if len(sys.argv) < 4:
+    print ('Usage:<fin> <fo> k')
+    exit(0)
+
+random.seed(10)
+fmap = {}
+
+k = int(sys.argv[3])
+fi = open( sys.argv[1], 'r' )
+fos = []
+
+for i in range(k):
+    fos.append(open( sys.argv[2]+'.col%d' % i, 'w' ))
+    
+for l in open(sys.argv[1]):
+    arr = l.split()
+    for f in fos:
+        f.write(arr[0])
+    for it in arr[1:]:
+        fid = int(it.split(':')[0])
+        if fid not in fmap:
+            fmap[fid] = random.randint(0, k-1)
+        fos[fmap[fid]].write(' '+it)
+    for f in fos:
+        f.write('\n')
+for f in fos:    
+    f.close()
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index c8c146b451ae..c43ec7700054 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -10,6 +10,7 @@
 #include <utility>
 #include <string>
 #include <limits>
+#include "../sync/sync.h"
 #include "./objective.h"
 #include "./evaluation.h"
 #include "../gbm/gbm.h"
@@ -61,6 +62,7 @@ class BoostLearner {
       buffer_size += mats[i]->info.num_row();
       num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->info.num_col()));
     }
+    sync::AllReduce(&num_feature, 1, sync::kMax);
     char str_temp[25];
     if (num_feature > mparam.num_feature) {
       utils::SPrintf(str_temp, sizeof(str_temp), "%u", num_feature);
diff --git a/src/sync/sync.h b/src/sync/sync.h
index cf82597e0063..239840cb3ce7 100644
--- a/src/sync/sync.h
+++ b/src/sync/sync.h
@@ -15,11 +15,16 @@ namespace sync {
 /*! \brief reduce operator supported */
 enum ReduceOp {
   kSum,
+  kMax,
   kBitwiseOR
 };
 
 /*! \brief get rank of current process */
 int GetRank(void);
+/*! 
+ * \brief this is used to check if sync module is a true distributed implementation, or simply a dummy
+ */
+bool IsDistributed(void);
 /*! \brief intiialize the synchronization module */
 void Init(int argc, char *argv[]);
 /*! \brief finalize syncrhonization module */
diff --git a/src/sync/sync_empty.cpp b/src/sync/sync_empty.cpp
index e46a6906afd1..84e2f770bae5 100644
--- a/src/sync/sync_empty.cpp
+++ b/src/sync/sync_empty.cpp
@@ -6,18 +6,28 @@ namespace sync {
 int GetRank(void) {
   return 0;
 }
+
 void Init(int argc, char *argv[]) {
 }
+
 void Finalize(void) {
 }
+
+bool IsDistributed(void) {
+  return false;
+}
+
 template<>
 void AllReduce<uint32_t>(uint32_t *sendrecvbuf, int count, ReduceOp op) {
 }
+
 template<>
 void AllReduce<float>(float *sendrecvbuf, int count, ReduceOp op) {
 }
+
 void Bcast(std::string *sendrecv_data, int root) {
 }
+
 ReduceHandle::ReduceHandle(void) : handle(NULL) {}
 ReduceHandle::~ReduceHandle(void) {}
 void ReduceHandle::Init(ReduceFunction redfunc, bool commute) {}
diff --git a/src/sync/sync_mpi.cpp b/src/sync/sync_mpi.cpp
index 2890ab609755..ecd83c601b7c 100644
--- a/src/sync/sync_mpi.cpp
+++ b/src/sync/sync_mpi.cpp
@@ -12,6 +12,10 @@ void Init(int argc, char *argv[]) {
   MPI::Init(argc, argv);
 }
 
+bool IsDistributed(void) {
+  return true;
+}
+
 void Finalize(void) {
   MPI::Finalize();
 }
@@ -20,6 +24,7 @@ void AllReduce_(void *sendrecvbuf, int count, const MPI::Datatype &dtype, Reduce
   switch(op) {
     case kBitwiseOR: MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, dtype, MPI::BOR); return;
     case kSum: MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, dtype, MPI::SUM); return;
+    case kMax: MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, dtype, MPI::MAX); return;
   }
 }
 
diff --git a/src/tree/updater_distcol-inl.hpp b/src/tree/updater_distcol-inl.hpp
index 38249b7d49fa..d94cdf409df1 100644
--- a/src/tree/updater_distcol-inl.hpp
+++ b/src/tree/updater_distcol-inl.hpp
@@ -93,9 +93,15 @@ class DistColMaker : public ColMaker<TStats> {
       while (fsplits.size() != 0 && fsplits.back() >= p_fmat->NumCol()) {
         fsplits.pop_back();
       }
-      // setup BitMap
-      bitmap.Resize(this->position.size());
-      bitmap.Clear();
+      // bitmap is only word concurrent, set to bool first
+      {
+        bst_omp_uint ndata = static_cast<bst_omp_uint>(this->position.size());
+        boolmap.resize(ndata);
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint j = 0; j < ndata; ++j) {
+            boolmap[j] = 0;
+        }        
+      }
       utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
       while (iter->Next()) {
         const ColBatch &batch = iter->Value();
@@ -110,15 +116,16 @@ class DistColMaker : public ColMaker<TStats> {
             const int nid = this->DecodePosition(ridx);
             if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
               if (fvalue < tree[nid].split_cond()) {
-                if (!tree[nid].default_left()) bitmap.SetTrue(ridx);
+                if (!tree[nid].default_left()) boolmap[ridx] = 1;
               } else {
-                if (tree[nid].default_left()) bitmap.SetTrue(ridx);
+                if (tree[nid].default_left()) boolmap[ridx] = 1;
               }
             }
           }
         }
       }
-
+      
+      bitmap.InitFromBool(boolmap);
       // communicate bitmap
       sync::AllReduce(BeginPtr(bitmap.data), bitmap.data.size(), sync::kBitwiseOR);
       const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
@@ -159,6 +166,7 @@ class DistColMaker : public ColMaker<TStats> {
     
    private:
     utils::BitMap bitmap;
+    std::vector<int> boolmap;
     sync::Reducer<SplitEntry> reducer;
   };
   // we directly introduce pruner here
diff --git a/src/utils/bitmap.h b/src/utils/bitmap.h
index 9c7cf2fc28f6..92420656ad6c 100644
--- a/src/utils/bitmap.h
+++ b/src/utils/bitmap.h
@@ -7,6 +7,7 @@
  */
 #include <vector>
 #include "./utils.h"
+#include "./omp.h"
 
 namespace xgboost {
 namespace utils {
@@ -35,6 +36,25 @@ struct BitMap {
   inline void SetTrue(size_t i) {
     data[i >> 5] |= (1 << (i & 31U));
   }
+  /*! \brief rebuild the bitmap from vec: bit i is set iff vec[i] != 0 */
+  inline void InitFromBool(const std::vector<int> &vec) {
+    this->Resize(vec.size());
+    // pack each complete group of 32 entries into one word, in parallel
+    bst_omp_uint nsize = static_cast<bst_omp_uint>(vec.size() / 32);
+    #pragma omp parallel for schedule(static)
+    for (bst_omp_uint i = 0; i < nsize; ++i) {
+      uint32_t res = 0;
+      for (int k = 0; k < 32; ++k) {  // unsigned shift, so k == 31 is safe
+        res |= static_cast<uint32_t>(vec[(i << 5) | k] != 0) << k;
+      }
+      data[i] = res;
+    }
+    const size_t tail = static_cast<size_t>(nsize) << 5;  // first leftover index
+    if (tail != vec.size()) data.back() = 0;
+    for (size_t i = tail; i < vec.size(); ++i) {
+      if (vec[i]) this->SetTrue(i);
+    }
+  }
   /*! \brief clear the bitmap, set all places to false */
   inline void Clear(void) {
     std::fill(data.begin(), data.end(), 0U);
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index e96342f69510..690417855f91 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -14,7 +14,7 @@ namespace xgboost {
 /*!
  * \brief wrapping the training process 
  */
-class BoostLearnTask{
+class BoostLearnTask {
  public:
   inline int Run(int argc, char *argv[]) {
     if (argc < 2) {
@@ -31,6 +31,9 @@ class BoostLearnTask{
         this->SetParam(name, val);
       }
     }
+    if (sync::IsDistributed()) {
+      this->SetParam("updater", "distcol");
+    }
     if (sync::GetRank() != 0) {
       this->SetParam("silent", "2");
     }
@@ -93,6 +96,7 @@ class BoostLearnTask{
     name_pred = "pred.txt";
     name_dump = "dump.txt";
     model_dir_path = "./";
+    load_part = 0;
     data = NULL;
   }
   ~BoostLearnTask(void){
@@ -103,13 +107,20 @@ class BoostLearnTask{
   }
  private:
   inline void InitData(void) {
+    if (strchr(train_path.c_str(), '%') != NULL) {
+      char s_tmp[256];
+      utils::SPrintf(s_tmp, sizeof(s_tmp), train_path.c_str(), sync::GetRank());
+      train_path = s_tmp;
+      load_part = 1;
+    }
+
     if (name_fmap != "NULL") fmap.LoadText(name_fmap.c_str());
     if (task == "dump") return;
     if (task == "pred") {
       data = io::LoadDataMatrix(test_path.c_str(), silent != 0, use_buffer != 0);
     } else {
       // training
-      data = io::LoadDataMatrix(train_path.c_str(), silent != 0, use_buffer != 0);
+      data = io::LoadDataMatrix(train_path.c_str(), silent != 0 && load_part == 0, use_buffer != 0);
       utils::Assert(eval_data_names.size() == eval_data_paths.size(), "BUG");
       for (size_t i = 0; i < eval_data_names.size(); ++i) {
         deval.push_back(io::LoadDataMatrix(eval_data_paths[i].c_str(), silent != 0, use_buffer != 0));
@@ -182,6 +193,7 @@ class BoostLearnTask{
     fclose(fo);
   }
   inline void SaveModel(const char *fname) const {
+    if (sync::GetRank() != 0) return;
     utils::FileStream fo(utils::FopenCheck(fname, "wb"));
     learner.SaveModel(fo);
     fo.Close();
@@ -205,6 +217,8 @@ class BoostLearnTask{
  private:
   /*! \brief whether silent */
   int silent;
+  /*! \brief special load */
+  int load_part;
   /*! \brief whether use auto binary buffer */
   int use_buffer;
   /*! \brief whether evaluate training statistics */            

From 3f3c90c3c0b1b640389b5a596e39b5a3aedbb8ab Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 16 Oct 2014 19:41:43 -0700
Subject: [PATCH 039/166] add part_load col

---
 src/data.h                    |  3 ++-
 src/io/page_fmatrix-inl.hpp   |  2 +-
 src/io/simple_fmatrix-inl.hpp | 19 ++++++++++------
 src/learner/learner-inl.hpp   | 41 +++++++++++++++++++++++++++++++++--
 src/sync/sync.h               |  3 +++
 src/sync/sync_empty.cpp       |  4 ++++
 src/sync/sync_mpi.cpp         |  4 ++++
 src/xgboost_main.cpp          |  2 +-
 8 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/src/data.h b/src/data.h
index 2ea5f222a996..162a31bfe6c2 100644
--- a/src/data.h
+++ b/src/data.h
@@ -138,9 +138,10 @@ class IFMatrix {
   virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) = 0;
   /*!
    * \brief check if column access is supported, if not, initialize column access
+   * \param enabled whether certain feature should be included in column access
    * \param subsample subsample ratio when generating column access
    */
-  virtual void InitColAccess(float subsample) = 0;
+  virtual void InitColAccess(const std::vector<bool> &enabled, float subsample) = 0;
   // the following are column meta data, should be able to answer them fast
   /*! \return whether column access is enabled */
   virtual bool HaveColAccess(void) const = 0;
diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
index af8be333f053..971abbb0ed97 100644
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -247,7 +247,7 @@ class FMatrixPage : public IFMatrix {
     size_t nmiss = buffered_rowset_.size() - (col_ptr[cidx+1] - col_ptr[cidx]);
     return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
   }
-  virtual void InitColAccess(float pkeep = 1.0f) {
+  virtual void InitColAccess(const std::vector<bool> &enabled, float pkeep = 1.0f) {
     if (this->HaveColAccess()) return;
     utils::Printf("start to initialize page col access\n");
     if (this->LoadColData()) {
diff --git a/src/io/simple_fmatrix-inl.hpp b/src/io/simple_fmatrix-inl.hpp
index 997268ff35d8..88bc69019eb5 100644
--- a/src/io/simple_fmatrix-inl.hpp
+++ b/src/io/simple_fmatrix-inl.hpp
@@ -48,9 +48,10 @@ class FMatrixS : public IFMatrix{
     size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
     return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
   }
-  virtual void InitColAccess(float pkeep = 1.0f) {
+  virtual void InitColAccess(const std::vector<bool> &enabled, 
+                             float pkeep = 1.0f) {
     if (this->HaveColAccess()) return;
-    this->InitColData(pkeep);
+    this->InitColData(pkeep, enabled);
   }
   /*!
    * \brief get the row iterator associated with FMatrix
@@ -141,7 +142,7 @@ class FMatrixS : public IFMatrix{
    * \brief intialize column data
    * \param pkeep probability to keep a row
    */
-  inline void InitColData(float pkeep) {
+  inline void InitColData(float pkeep, const std::vector<bool> &enabled) {
     buffered_rowset_.clear();
     // note: this part of code is serial, todo, parallelize this transformer
     utils::SparseCSRMBuilder<RowBatch::Entry> builder(col_ptr_, col_data_);
@@ -155,7 +156,9 @@ class FMatrixS : public IFMatrix{
           buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
           RowBatch::Inst inst = batch[i];
           for (bst_uint j = 0; j < inst.length; ++j) {
-            builder.AddBudget(inst[j].index);
+            if (enabled[inst[j].index]){ 
+              builder.AddBudget(inst[j].index);
+            }
           }
         }
       }
@@ -172,9 +175,11 @@ class FMatrixS : public IFMatrix{
           ++ktop;
           RowBatch::Inst inst = batch[i];
           for (bst_uint j = 0; j < inst.length; ++j) {
-            builder.PushElem(inst[j].index,
-                             Entry((bst_uint)(batch.base_rowid+i),
-                                   inst[j].fvalue));
+            if (enabled[inst[j].index]) { 
+              builder.PushElem(inst[j].index,
+                               Entry((bst_uint)(batch.base_rowid+i),
+                                     inst[j].fvalue));
+            }
           }
         }
       }
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index c43ec7700054..1a001eb95274 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -31,6 +31,7 @@ class BoostLearner {
     name_gbm_ = "gbtree";
     silent= 0;
     prob_buffer_row = 1.0f;
+    part_load_col = 0;
   }
   ~BoostLearner(void) {
     if (obj_ != NULL) delete obj_;
@@ -88,6 +89,7 @@ class BoostLearner {
       this->SetParam(n.c_str(), val);
     }
     if (!strcmp(name, "silent")) silent = atoi(val);
+    if (!strcmp(name, "part_load_col")) part_load_col = atoi(val);
     if (!strcmp(name, "prob_buffer_row")) {
       prob_buffer_row = static_cast<float>(atof(val));
       this->SetParam("updater", "grow_colmaker,refresh,prune");
@@ -164,8 +166,41 @@ class BoostLearner {
    *  if not intialize it
    * \param p_train pointer to the matrix used by training
    */
-  inline void CheckInit(DMatrix *p_train) {    
-    p_train->fmat()->InitColAccess(prob_buffer_row);
+  inline void CheckInit(DMatrix *p_train) {
+    int ncol = p_train->info.info.num_col;    
+    std::vector<bool> enabled(ncol, true);  // by default every column is enabled
+    
+    if (part_load_col != 0) {  // keep only this process's slice of a shuffled column list
+      std::vector<unsigned> col_index;
+      for (int i = 0; i < ncol; ++i) {
+        col_index.push_back(i);
+      }
+      random::Shuffle(col_index);
+      std::string s_model;
+      utils::MemoryBufferStream ms(&s_model);
+      utils::IStream &fs = ms;
+      if (sync::GetRank() == 0) {  // rank 0 writes its permutation and broadcasts it ...
+        fs.Write(col_index);
+        sync::Bcast(&s_model, 0);
+      } else {  // ... every other rank replaces its own permutation with the received one
+        sync::Bcast(&s_model, 0);
+        fs.Read(&col_index);
+      }
+      int nsize = sync::GetWorldSize();
+      int step = (ncol + nsize -1) / nsize;  // ceil(ncol / nsize) columns per process
+      int pid = sync::GetRank();
+      std::fill(enabled.begin(), enabled.end(), false);
+      int start = step * pid;
+      int end = std::min(step * (pid + 1), ncol);
+      utils::Printf("rank %d idset:", pid);
+      for (int i = start; i < end; ++i) {
+        enabled[col_index[i]] = true;
+        utils::Printf(" %u", col_index[i]);
+      }
+      utils::Printf("\n");
+    }
+    // initialize column access
+    p_train->fmat()->InitColAccess(enabled, prob_buffer_row);    
   }
   /*!
    * \brief update the model for one iteration
@@ -316,6 +351,8 @@ class BoostLearner {
   // data fields
   // silent during training
   int silent;
+  // randomly load part of data
+  int part_load_col;
   // maximum buffred row value
   float prob_buffer_row;
   // evaluation set
diff --git a/src/sync/sync.h b/src/sync/sync.h
index 239840cb3ce7..293f53515fcf 100644
--- a/src/sync/sync.h
+++ b/src/sync/sync.h
@@ -21,6 +21,9 @@ enum ReduceOp {
 
 /*! \brief get rank of current process */
 int GetRank(void);
+/*! \brief get total number of process */
+int GetWorldSize(void);
+
 /*! 
  * \brief this is used to check if sync module is a true distributed implementation, or simply a dummpy
  */
diff --git a/src/sync/sync_empty.cpp b/src/sync/sync_empty.cpp
index 84e2f770bae5..108f170ef622 100644
--- a/src/sync/sync_empty.cpp
+++ b/src/sync/sync_empty.cpp
@@ -17,6 +17,10 @@ bool IsDistributed(void) {
   return false;
 }
 
+int GetWorldSize(void) {
+  return 1;
+}
+
 template<>
 void AllReduce<uint32_t>(uint32_t *sendrecvbuf, int count, ReduceOp op) {
 }
diff --git a/src/sync/sync_mpi.cpp b/src/sync/sync_mpi.cpp
index ecd83c601b7c..faf66ab6f4be 100644
--- a/src/sync/sync_mpi.cpp
+++ b/src/sync/sync_mpi.cpp
@@ -8,6 +8,10 @@ int GetRank(void) {
   return MPI::COMM_WORLD.Get_rank();
 }
 
+int GetWorldSize(void) {
+  return MPI::COMM_WORLD.Get_size();
+}
+
 void Init(int argc, char *argv[]) {
   MPI::Init(argc, argv);
 }
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index 690417855f91..dd549634c613 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -160,7 +160,7 @@ class BoostLearnTask {
       if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
       learner.UpdateOneIter(i, *data); 
       std::string res = learner.EvalOneIter(i, devalall, eval_data_names);
-      if (silent < 1) {
+      if (silent < 2) {
         fprintf(stderr, "%s\n", res.c_str());
       }
       if (save_period != 0 && (i + 1) % save_period == 0) {

From f6d61f02f674a7c17ab75bf00eaa11675f7e3ea1 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 16 Oct 2014 21:47:01 -0700
Subject: [PATCH 040/166] fix load bug

---
 src/learner/learner-inl.hpp | 3 ++-
 src/sync/sync.h             | 3 +++
 src/sync/sync_empty.cpp     | 4 ++++
 src/sync/sync_mpi.cpp       | 8 ++++++++
 src/tree/model.h            | 2 +-
 5 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 1a001eb95274..89bc28aec1d3 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -192,7 +192,8 @@ class BoostLearner {
       std::fill(enabled.begin(), enabled.end(), false);
       int start = step * pid;
       int end = std::min(step * (pid + 1), ncol);
-      utils::Printf("rank %d idset:", pid);
+      std::string name = sync::GetProcessorName();
+      utils::Printf("rank %d of %s idset:", pid, name.c_str());
       for (int i = start; i < end; ++i) {
         enabled[col_index[i]] = true;
         utils::Printf(" %u", col_index[i]);
diff --git a/src/sync/sync.h b/src/sync/sync.h
index 293f53515fcf..8d83ab5fb3b5 100644
--- a/src/sync/sync.h
+++ b/src/sync/sync.h
@@ -8,6 +8,7 @@
 #include <cstdio>
 #include <cstring>
 #include "../utils/utils.h"
+#include <string>
 
 namespace xgboost {
 /*! \brief syncrhonizer module that minimumly wraps interface of MPI */
@@ -23,6 +24,8 @@ enum ReduceOp {
 int GetRank(void);
 /*! \brief get total number of process */
 int GetWorldSize(void);
+/*! \brief get name of processor */
+std::string GetProcessorName(void);
 
 /*! 
  * \brief this is used to check if sync module is a true distributed implementation, or simply a dummpy
diff --git a/src/sync/sync_empty.cpp b/src/sync/sync_empty.cpp
index 108f170ef622..a86707d61e3b 100644
--- a/src/sync/sync_empty.cpp
+++ b/src/sync/sync_empty.cpp
@@ -21,6 +21,10 @@ int GetWorldSize(void) {
   return 1;
 }
 
+std::string GetProcessorName(void) {
+  return std::string("");
+}
+
 template<>
 void AllReduce<uint32_t>(uint32_t *sendrecvbuf, int count, ReduceOp op) {
 }
diff --git a/src/sync/sync_mpi.cpp b/src/sync/sync_mpi.cpp
index faf66ab6f4be..45f6c3d75259 100644
--- a/src/sync/sync_mpi.cpp
+++ b/src/sync/sync_mpi.cpp
@@ -20,6 +20,14 @@ bool IsDistributed(void) {
   return true;
 }
 
+std::string GetProcessorName(void) {  // processor name as reported by MPI for this process
+  int len;  // filled in by the call below and used as the name length
+  char name[MPI_MAX_PROCESSOR_NAME];
+  MPI::Get_processor_name(name, len);
+  name[len] = '\0';  // terminate explicitly at the reported length
+  return std::string(name);
+}
+
 void Finalize(void) {
   MPI::Finalize();
 }
diff --git a/src/tree/model.h b/src/tree/model.h
index dbc35b3b4c76..84010bcc0d2f 100644
--- a/src/tree/model.h
+++ b/src/tree/model.h
@@ -305,7 +305,7 @@ class TreeModel {
     // chg deleted nodes
     deleted_nodes.resize(0);
     for (int i = param.num_roots; i < param.num_nodes; i ++) {
-      if (nodes[i].is_root()) deleted_nodes.push_back(i);
+      if (nodes[i].is_deleted()) deleted_nodes.push_back(i);
     }
     utils::Assert(static_cast<int>(deleted_nodes.size()) == param.num_deleted,
                   "number of deleted nodes do not match");

From 9df9e07f9b5ef13762ec506937416f0aa5792fe3 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 17 Oct 2014 14:11:46 -0700
Subject: [PATCH 041/166] minor change in main

---
 src/xgboost_main.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index dd549634c613..ef3a9079c63b 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -264,6 +264,10 @@ class BoostLearnTask {
 
 int main(int argc, char *argv[]){
   xgboost::sync::Init(argc, argv);
+  if (xgboost::sync::IsDistributed()) {
+    std::string pname = xgboost::sync::GetProcessorName();
+    printf("start %s:%d\n", pname.c_str(), xgboost::sync::GetRank());
+  }
   xgboost::random::Seed(0);
   xgboost::BoostLearnTask tsk;
   int ret = tsk.Run(argc, argv);

From a68ac8033e3ec13ad6ea570e8d331937f355ed7f Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 17 Oct 2014 14:48:32 -0700
Subject: [PATCH 042/166] refresher is now distributed

---
 src/tree/param.h                 |  8 ++++++
 src/tree/updater_refresh-inl.hpp | 47 +++++++++++++++++++-------------
 2 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/src/tree/param.h b/src/tree/param.h
index cf646a76e330..8bd855554bb9 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -190,6 +190,10 @@ struct GradStats {
   inline void Add(const GradStats &b) {
     this->Add(b.sum_grad, b.sum_hess);
   }
+  /*! \brief same as add, reduce is used in All Reduce */
+  inline void Reduce(const GradStats &b) {
+    this->Add(b);
+  }
   /*! \brief set current value to a - b */
   inline void SetSubstract(const GradStats &a, const GradStats &b) {
     sum_grad = a.sum_grad - b.sum_grad;
@@ -266,6 +270,10 @@ struct CVGradStats : public GradStats {
       valid[i].Add(b.valid[i]);
     }
   }
+  /*! \brief same as add, reduce is used in All Reduce */
+  inline void Reduce(const CVGradStats &b) {
+    this->Add(b);
+  }
   /*! \brief set current value to a - b */
   inline void SetSubstract(const CVGradStats &a, const CVGradStats &b) {
     GradStats::SetSubstract(a, b);
diff --git a/src/tree/updater_refresh-inl.hpp b/src/tree/updater_refresh-inl.hpp
index a37630333611..579ff2bc323a 100644
--- a/src/tree/updater_refresh-inl.hpp
+++ b/src/tree/updater_refresh-inl.hpp
@@ -10,6 +10,7 @@
 #include "./param.h"
 #include "./updater.h"
 #include "../utils/omp.h"
+#include "../sync/sync.h"
 
 namespace xgboost {
 namespace tree {
@@ -26,7 +27,7 @@ class TreeRefresher: public IUpdater {
   virtual void Update(const std::vector<bst_gpair> &gpair,
                       IFMatrix *p_fmat,
                       const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {    
+                      const std::vector<RegTree*> &trees) {        
     if (trees.size() == 0) return;
     // number of threads
     // thread temporal space
@@ -39,15 +40,16 @@ class TreeRefresher: public IUpdater {
       nthread = omp_get_num_threads();
     }
     fvec_temp.resize(nthread, RegTree::FVec());
-    stemp.resize(trees.size() * nthread, std::vector<TStats>());
+    stemp.resize(nthread, std::vector<TStats>());
     #pragma omp parallel
     {
       int tid = omp_get_thread_num();
+      int num_nodes = 0;
       for (size_t i = 0; i < trees.size(); ++i) {
-        std::vector<TStats> &vec = stemp[tid * trees.size() + i];
-        vec.resize(trees[i]->param.num_nodes, TStats(param));
-        std::fill(vec.begin(), vec.end(), TStats(param));
+        num_nodes += trees[i]->param.num_nodes;
       }
+      stemp[tid].resize(num_nodes, TStats(param));
+      std::fill(stemp[tid].begin(), stemp[tid].end(), TStats(param));
       fvec_temp[tid].Init(trees[0]->param.num_feature);
     }
     // start accumulating statistics
@@ -65,28 +67,34 @@ class TreeRefresher: public IUpdater {
         const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
         RegTree::FVec &feats = fvec_temp[tid];
         feats.Fill(inst);
+        int offset = 0;
         for (size_t j = 0; j < trees.size(); ++j) {
           AddStats(*trees[j], feats, gpair, info, ridx,
-                   &stemp[tid * trees.size() + j]);
+                   BeginPtr(stemp[tid]) + offset);
+          offset += trees[j]->param.num_nodes;
         }
         feats.Drop(inst);
       }
     }
-    // start update the trees using the statistics
+    // aggregate the statistics
+    int num_nodes = static_cast<int>(stemp[0].size());
+    #pragma omp parallel for schedule(static)
+    for (int nid = 0; nid < num_nodes; ++nid) {
+      for (int tid = 1; tid < nthread; ++tid) {
+        stemp[0][nid].Add(stemp[tid][nid]);
+      }
+    }
+    // AllReduce, add statistics up
+    reducer.AllReduce(BeginPtr(stemp[0]), stemp[0].size());
     // rescale learning rate according to size of trees
     float lr = param.learning_rate;
     param.learning_rate = lr / trees.size();
-    for (size_t i = 0; i < trees.size(); ++i) {
-      // aggregate
-      #pragma omp parallel for schedule(static)
-      for (int nid = 0; nid < trees[i]->param.num_nodes; ++nid) {
-        for (int tid = 1; tid < nthread; ++tid) {
-          stemp[i][nid].Add(stemp[tid * trees.size() + i][nid]);
-        }
-      }
+    int offset = 0;
+    for (size_t i = 0; i < trees.size(); ++i) {      
       for (int rid = 0; rid < trees[i]->param.num_roots; ++rid) {
-        this->Refresh(stemp[i], rid, trees[i]);
+        this->Refresh(BeginPtr(stemp[0]) + offset, rid, trees[i]);
       }
+      offset += trees[i]->param.num_nodes;
     }
     // set learning rate back
     param.learning_rate = lr;
@@ -98,8 +106,7 @@ class TreeRefresher: public IUpdater {
                               const std::vector<bst_gpair> &gpair,
                               const BoosterInfo &info,
                               const bst_uint ridx,
-                              std::vector<TStats> *p_gstats) {
-    std::vector<TStats> &gstats = *p_gstats;
+                              TStats *gstats) {
     // start from groups that belongs to current data
     int pid = static_cast<int>(info.GetRoot(ridx));
     gstats[pid].Add(gpair, info, ridx);
@@ -110,7 +117,7 @@ class TreeRefresher: public IUpdater {
       gstats[pid].Add(gpair, info, ridx);
     }
   }
-  inline void Refresh(const std::vector<TStats> &gstats,
+  inline void Refresh(const TStats *gstats,
                       int nid, RegTree *p_tree) {
     RegTree &tree = *p_tree;
     tree.stat(nid).base_weight = static_cast<float>(gstats[nid].CalcWeight(param));
@@ -129,6 +136,8 @@ class TreeRefresher: public IUpdater {
   }
   // training parameter
   TrainParam param;
+  // reducer
+  sync::Reducer<TStats> reducer;  
 };
 
 }  // namespace tree

From c2fa3901814e06a1b2099633e0d48d8a564220aa Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 17 Oct 2014 14:53:43 -0700
Subject: [PATCH 043/166] move sync tree to pruner, pruner is now distributed

---
 src/tree/updater_distcol-inl.hpp | 15 +--------------
 src/tree/updater_prune-inl.hpp   | 23 +++++++++++++++++++++--
 2 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/src/tree/updater_distcol-inl.hpp b/src/tree/updater_distcol-inl.hpp
index d94cdf409df1..bce947fe850d 100644
--- a/src/tree/updater_distcol-inl.hpp
+++ b/src/tree/updater_distcol-inl.hpp
@@ -32,9 +32,8 @@ class DistColMaker : public ColMaker<TStats> {
     utils::Check(trees.size() == 1, "DistColMaker: only support one tree at a time");
     // build the tree
     builder.Update(gpair, p_fmat, info, trees[0]);
-    //// prune the tree
+    //// prune the tree, note that pruner will sync the tree
     pruner.Update(gpair, p_fmat, info, trees);
-    this->SyncTrees(trees[0]);
     // update position after the tree is pruned
     builder.UpdatePosition(p_fmat, *trees[0]);
   }
@@ -42,18 +41,6 @@ class DistColMaker : public ColMaker<TStats> {
     return builder.GetLeafPosition();
   }  
  private:
-  inline void SyncTrees(RegTree *tree) {
-    std::string s_model;
-    utils::MemoryBufferStream fs(&s_model);
-    int rank = sync::GetRank();
-    if (rank == 0) {
-      tree->SaveModel(fs);
-      sync::Bcast(&s_model, 0);
-    } else {
-      sync::Bcast(&s_model, 0);
-      tree->LoadModel(fs);
-    }
-  }  
   struct Builder : public ColMaker<TStats>::Builder {
    public:
     Builder(const TrainParam &param) 
diff --git a/src/tree/updater_prune-inl.hpp b/src/tree/updater_prune-inl.hpp
index 726999f55059..a68404ba7df7 100644
--- a/src/tree/updater_prune-inl.hpp
+++ b/src/tree/updater_prune-inl.hpp
@@ -8,6 +8,7 @@
 #include <vector>
 #include "./param.h"
 #include "./updater.h"
+#include "../sync/sync.h"
 
 namespace xgboost {
 namespace tree {
@@ -33,9 +34,27 @@ class TreePruner: public IUpdater {
       this->DoPrune(*trees[i]);
     }
     param.learning_rate = lr;
-  }
-
+    this->SyncTrees(trees);
+  }  
  private:
+  // make all processes hold identical trees: rank 0's trees overwrite everyone else's
+  inline void SyncTrees(const std::vector<RegTree *> &trees) {
+    if (sync::GetWorldSize() == 1) return;  // single process, nothing to exchange
+    std::string s_model;
+    utils::MemoryBufferStream fs(&s_model);
+    int rank = sync::GetRank();
+    if (rank == 0) {  // sender: save every tree through fs, then broadcast s_model
+      for (size_t i = 0; i < trees.size(); ++i) {
+        trees[i]->SaveModel(fs);
+      }
+      sync::Bcast(&s_model, 0);
+    } else {  // receiver: get s_model first, then load the trees in the same order
+      sync::Bcast(&s_model, 0);
+      for (size_t i = 0; i < trees.size(); ++i) {      
+        trees[i]->LoadModel(fs);
+      }
+    }
+  }
   // try to prune off current leaf
   inline int TryPruneLeaf(RegTree &tree, int nid, int depth, int npruned) {
     if (tree[nid].is_root()) return npruned;

From a7bc769971c7aea26aaa800640ad688cf4f7e544 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 17 Oct 2014 17:55:07 -0700
Subject: [PATCH 044/166] incomplete histmaker

---
 src/tree/updater.cpp               |   2 +
 src/tree/updater_histmaker-inl.hpp | 167 +++++++++++++++++++++++++++++
 2 files changed, 169 insertions(+)
 create mode 100644 src/tree/updater_histmaker-inl.hpp

diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index 75fc39799b4d..6efcc511f01c 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -6,6 +6,7 @@
 #include "./updater_refresh-inl.hpp"
 #include "./updater_colmaker-inl.hpp"
 #include "./updater_distcol-inl.hpp"
+#include "./updater_histmaker-inl.hpp"
 
 namespace xgboost {
 namespace tree {
@@ -14,6 +15,7 @@ IUpdater* CreateUpdater(const char *name) {
   if (!strcmp(name, "prune")) return new TreePruner();
   if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
   if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
+  if (!strcmp(name, "grow_histmaker")) return new HistMaker<GradStats>();
   if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
   if (!strcmp(name, "grow_colmaker5")) return new ColMaker< CVGradStats<5> >();
   if (!strcmp(name, "grow_colmaker3")) return new ColMaker< CVGradStats<3> >();
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
new file mode 100644
index 000000000000..3ecb022cbc61
--- /dev/null
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -0,0 +1,167 @@
+#ifndef XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
+#define XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
+/*!
+ * \file updater_histmaker-inl.hpp
+ * \brief use histogram counting to construct a tree
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include <algorithm>
+
+namespace xgboost {
+namespace tree {
+template<typename TStats>
+class HistMaker: public IUpdater {
+ public:
+  virtual ~HistMaker(void) {}
+  // set training parameter
+  virtual void SetParam(const char *name, const char *val) {
+    param.SetParam(name, val);
+  }
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) {
+    TStats::CheckInfo(info);
+    // rescale learning rate according to size of trees
+    float lr = param.learning_rate;
+    param.learning_rate = lr / trees.size();
+    // build tree
+    for (size_t i = 0; i < trees.size(); ++i) {
+      // TODO
+    }
+    param.learning_rate = lr;
+  }
+
+ protected:
+  /*! \brief a single histogram: a view of `size` cut points and `size` statistics */
+  struct HistUnit {
+    /*! \brief cut points of the bins; Add asserts that no value exceeds the last one */
+    const bst_float *cut;
+    /*! \brief statistics of the bins, data[i] belongs to cut[i] */
+    TStats *data;
+    /*! \brief number of bins, i.e. length of both cut and data */
+    const unsigned size;
+    // constructor, only stores the pointers and the size
+    HistUnit(const bst_float *cut, TStats *data, unsigned size)
+        : cut(cut), data(data), size(size) {}
+    /*! \brief add statistics of row ridx to the first bin whose cut point is >= fv */
+    inline void Add(bst_float fv, 
+                    const std::vector<bst_gpair> &gpair,
+                    const BoosterInfo &info,
+                    const bst_uint ridx) {
+      unsigned i = std::lower_bound(cut, cut + size, fv) - cut;  // needs cut sorted ascending
+      utils::Assert(i < size, "maximum value must be in cut");
+      data[i].Add(gpair, info, ridx);
+    }
+  };
+  /*! \brief a set of histograms, stored back to back and addressed by index */
+  struct HistSet {
+    /*! \brief offsets of the units: unit fid occupies [rptr[fid], rptr[fid+1]) */
+    const unsigned *rptr;
+    /*! \brief cut points of all units, indexed through rptr */
+    const bst_float *cut;
+    /*! \brief statistics of all units, indexed through rptr like cut */
+    std::vector<TStats> data;
+    /*! \brief view of unit fid: its cut points, its statistics and its length */
+    inline HistUnit operator[](bst_uint fid) {
+      return HistUnit(cut + rptr[fid],
+                      &data[0] + rptr[fid],
+                      rptr[fid+1] - rptr[fid]);
+    }
+  };
+  // thread workspace 
+  struct ThreadWSpace {
+    /*! \brief actual unit pointer */
+    std::vector<unsigned> rptr;
+    /*! \brief cut field */
+    std::vector<unsigned> cut;    
+    // per thread histset
+    std::vector<HistSet> hset;    
+    // initialize the hist set
+    inline void Init(const TrainParam &param) {
+      int nthread;
+      #pragma omp parallel
+      {
+        nthread = omp_get_num_threads();
+      }
+      hset.resize(nthread);
+      // cleanup statistics
+      #pragma omp parallel
+      {
+        int tid = omp_get_thread_num();
+        for (size_t i = 0; i < hset[tid].data.size(); ++i) {
+          hset[tid].data[i].Clear();
+        }
+      }
+      for (int i = 0; i < nthread; ++i) {
+        hset[i].rptr = BeginPtr(rptr);
+        hset[i].cut = BeginPtr(cut);
+        hset[i].data.resize(cut.size(), TStats(param));        
+      }
+    }
+    // aggregate all statistics to hset[0]
+    inline void Aggregate(void) {
+      bst_omp_uint nsize = static_cast<bst_omp_uint>(cut.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
+        for (size_t tid = 1; tid < hset.size(); ++tid) {
+          hset[0][i].Add(hset[tid][i]);
+        }
+      }
+    }
+    /*! \brief clear the workspace */
+    inline void Clear(void) {
+      cut.clear(); rptr.resize(1); rptr[0] = 0;
+    }
+    /*! \brief total size */
+    inline size_t Size(void) const {
+      return rptr.size() - 1;
+    }
+  };
+  // training parameter
+  TrainParam param;
+  // workspace of thread
+  ThreadWSpace wspace;
+  // position of each data
+  std::vector<int> position;
+ private:
+  // create histogram for a setup histset
+  inline void CreateHist(const std::vector<bst_gpair> &gpair,
+                         IFMatrix *p_fmat,
+                         const BoosterInfo &info,
+                         unsigned num_feature) {
+    // intialize work space
+    wspace.Init(param);
+    // start accumulating statistics
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const RowBatch &batch = iter->Value();
+      utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
+                   "too large batch size ");
+      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nbatch; ++i) {
+        RowBatch::Inst inst = batch[i];
+        const int tid = omp_get_thread_num();
+        HistSet &hset = wspace.hset[tid];
+        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+        int nid = position[ridx];
+        if (nid >= 0) {
+          for (bst_uint i = 0; i < inst.length; ++i) {
+            utils::Assert(inst[i].index < num_feature, "feature index exceed bound");
+            hset[inst[i].index + nid * num_feature]
+                .Add(inst[i].fvalue, gpair, info, ridx);
+          }
+        }
+      }
+    }
+    // accumulating statistics together
+    wspace.Aggregate();
+    // get the split solution    
+  }  
+};
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_

From dcd0dd5e26533cfe9cdfb1fbc9782234351aa2de Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 18 Oct 2014 10:24:29 -0700
Subject: [PATCH 045/166] finish find split, next to do quantile sketch

---
 src/tree/updater_histmaker-inl.hpp | 117 +++++++++++++++++++++++++++--
 1 file changed, 110 insertions(+), 7 deletions(-)

diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 3ecb022cbc61..48f34146952c 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -7,6 +7,7 @@
  */
 #include <vector>
 #include <algorithm>
+#include "../sync/sync.h"
 
 namespace xgboost {
 namespace tree {
@@ -75,7 +76,7 @@ class HistMaker: public IUpdater {
     /*! \brief actual unit pointer */
     std::vector<unsigned> rptr;
     /*! \brief cut field */
-    std::vector<unsigned> cut;    
+    std::vector<bst_float> cut;
     // per thread histset
     std::vector<HistSet> hset;    
     // initialize the hist set
@@ -106,7 +107,7 @@ class HistMaker: public IUpdater {
       #pragma omp parallel for schedule(static)
       for (bst_omp_uint i = 0; i < nsize; ++i) {
         for (size_t tid = 1; tid < hset.size(); ++tid) {
-          hset[0][i].Add(hset[tid][i]);
+          hset[0].data[i].Add(hset[tid].data[i]);
         }
       }
     }
@@ -125,12 +126,34 @@ class HistMaker: public IUpdater {
   ThreadWSpace wspace;
   // position of each data
   std::vector<int> position;
+  /*! \brief queue of nodes to be expanded */
+  std::vector<int> qexpand;
+  /*! \brief map active node to its working index offset in qexpand */
+  std::vector<int> node2workindex;
+  // reducer for histogram
+  sync::Reducer<TStats> histred;
  private:
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      RegTree *p_tree) {
+    //this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
+    //this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
+    for (int depth = 0; depth < param.max_depth; ++depth) {
+      this->FindSplit(depth, gpair, p_fmat, info, p_tree);
+      //this->ResetPosition(qexpand_, p_fmat, *p_tree);
+      //this->UpdateQueueExpand(*p_tree, &qexpand_);
+      //this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
+      // if nothing left to be expand, break
+      if (qexpand.size() == 0) break;
+    }   
+  }
   // create histogram for a setup histset
   inline void CreateHist(const std::vector<bst_gpair> &gpair,
                          IFMatrix *p_fmat,
                          const BoosterInfo &info,
-                         unsigned num_feature) {
+                         const RegTree &tree) {
+    bst_uint num_feature = tree.param.num_feature;
     // intialize work space
     wspace.Init(param);
     // start accumulating statistics
@@ -147,20 +170,100 @@ class HistMaker: public IUpdater {
         const int tid = omp_get_thread_num();
         HistSet &hset = wspace.hset[tid];
         const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        int nid = position[ridx];
+        const int nid = position[ridx];
         if (nid >= 0) {
+          utils::Assert(tree[nid].is_leaf(), "CreateHist happens in leaf");
+          const int wid = node2workindex[nid];
           for (bst_uint i = 0; i < inst.length; ++i) {
             utils::Assert(inst[i].index < num_feature, "feature index exceed bound");
-            hset[inst[i].index + nid * num_feature]
+            // feature histogram
+            hset[inst[i].index + wid * (num_feature+1)]
                 .Add(inst[i].fvalue, gpair, info, ridx);
           }
+          // node histogram, use num_feature to borrow space
+          hset[num_feature + wid * (num_feature + 1)]
+              .data[0].Add(gpair, info, ridx);
         }
       }
     }
     // accumulating statistics together
     wspace.Aggregate();
-    // get the split solution    
-  }  
+    // sync the histogram
+    histred.AllReduce(BeginPtr(wspace.hset[0].data), wspace.hset[0].data.size());
+  }
+  // enumerate split candidates of one feature histogram in both scan directions and
+  // fold them into *best; an empty histogram (hist.size == 0) yields no candidate
+  inline void EnumerateSplit(const HistUnit &hist, const TStats &node_sum,
+                             bst_uint fid, SplitEntry *best) {
+    double root_gain = node_sum.CalcGain(param);
+    TStats s(param), c(param);
+    for (bst_uint i = 0; i < hist.size; ++i) {  // forward scan: s covers bins [0, i]
+      s.Add(hist.data[i]);
+      if (s.sum_hess >= param.min_child_weight) {
+        c.SetSubstract(node_sum, s);  // presumably c = node_sum - s, the other side
+        if (c.sum_hess >= param.min_child_weight) {
+          double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
+          best->Update(loss_chg, fid, hist.cut[i], false);
+        }
+      }
+    }
+    s.Clear();
+    // backward scan over bins size-1 .. 1; counting i from size avoids unsigned wrap-around
+    for (bst_uint i = hist.size; i > 1; --i) {
+      s.Add(hist.data[i - 1]);
+      if (s.sum_hess >= param.min_child_weight) {
+        c.SetSubstract(node_sum, s);
+        if (c.sum_hess >= param.min_child_weight) {
+          double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
+          best->Update(loss_chg, fid, hist.cut[i - 2], true);
+        }
+      }
+    }
+  }
+  inline void FindSplit(int depth,  // NOTE: depth is not read anywhere in this function
+                        const std::vector<bst_gpair> &gpair,
+                        IFMatrix *p_fmat,
+                        const BoosterInfo &info,
+                        RegTree *p_tree) {
+    const bst_uint num_feature = p_tree->param.num_feature;
+    // create histogram
+    this->CreateHist(gpair, p_fmat, info, *p_tree);
+    // get the best split condition for each node
+    std::vector<SplitEntry> sol(qexpand.size());  // sol[wid]: best split of qexpand[wid]
+    bst_omp_uint nexpand = static_cast<bst_omp_uint>(qexpand.size());
+    #pragma omp parallel for schedule(dynamic, 1)
+    for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
+      const int nid = qexpand[wid];
+      utils::Assert(node2workindex[nid] == static_cast<int>(wid), "node2workindex inconsistent");
+      SplitEntry &best = sol[wid];     
+      TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
+      for (bst_uint fid = 0; fid < num_feature; ++ fid) {
+        EnumerateSplit(wspace.hset[0][fid + wid * (num_feature+1)],
+                       node_sum, fid, &best);
+      }
+    }    
+    // write the best result of each node back to the tree; the solution could be synchronized here
+    for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
+      const int nid = qexpand[wid];
+      const SplitEntry &best = sol[wid];
+      const TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
+      bst_float weight = node_sum.CalcWeight(param);
+      // record the node statistics, whether or not the node ends up being split
+      p_tree->stat(nid).loss_chg = best.loss_chg;
+      p_tree->stat(nid).base_weight = weight;
+      p_tree->stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);
+      node_sum.SetLeafVec(param, p_tree->leafvec(nid));
+      // now we know the solution for nid (held in best), set split
+      if (best.loss_chg > rt_eps) {
+        p_tree->AddChilds(nid);
+        (*p_tree)[nid].set_split(best.split_index(), best.split_value, best.default_left());
+        // mark right child as 0, to indicate fresh leaf
+        (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
+        (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
+      } else {
+        (*p_tree)[nid].set_leaf(weight * param.learning_rate);  // no useful split: finalize as leaf
+      }
+    }
+  }
 };
 }  // namespace tree
 }  // namespace xgboost

From 23eaa7ed3245a9c45d0fae35c88fc3d6653d138d Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 20 Oct 2014 18:04:39 -0700
Subject: [PATCH 046/166] add quantile sketch

---
 src/utils/bitmap.h   |   1 +
 src/utils/quantile.h | 274 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 275 insertions(+)
 create mode 100644 src/utils/quantile.h

diff --git a/src/utils/bitmap.h b/src/utils/bitmap.h
index 92420656ad6c..ba12caf41d4c 100644
--- a/src/utils/bitmap.h
+++ b/src/utils/bitmap.h
@@ -3,6 +3,7 @@
 /*!
  * \file bitmap.h
  * \brief a simple implement of bitmap
+ *  NOTE: bitmap is only threadsafe per word access, remember this when using bitmap
  * \author Tianqi Chen
  */
 #include <vector>
diff --git a/src/utils/quantile.h b/src/utils/quantile.h
new file mode 100644
index 000000000000..16f74df9e09e
--- /dev/null
+++ b/src/utils/quantile.h
@@ -0,0 +1,274 @@
+#ifndef XGBOOST_UTILS_QUANTILE_H_
+#define XGBOOST_UTILS_QUANTILE_H_
+/*!
+ * \file quantile.h
+ * \brief util to compute quantiles 
+ * \author Tianqi Chen
+ */
+#include <cmath>
+#include <vector>
+#include <cstring>
+#include <algorithm>
+#include "./utils.h"
+
+namespace xgboost {
+namespace utils {
+/*! 
+ * \brief a helper class to compute streaming quantile
+ * \tparam DType type of data content
+ * \tparam RType type of rank
+ */
+template<typename DType, typename RType=unsigned>
+class QuantileSketch {
+ public:
+  /*! \brief an entry in the sketch summary: a value together with bounds on its rank */
+  struct Entry {
+    /*! \brief minimum rank */
+    RType rmin;
+    /*! \brief maximum rank */
+    RType rmax;
+    /*! \brief the value of data */
+    DType value;
+    // default constructor, does not explicitly initialize any field
+    Entry(void) {}
+    // construct an entry from its rank bounds and its value
+    Entry(RType rmin, RType rmax, DType value)
+        : rmin(rmin), rmax(rmax), value(value) {}
+  };
+  /*! 
+   * \brief data structure representing one summary: a plain view over an array of entries
+   */
+  struct Summary {
+    /*! \brief pointer to the entries; nothing in this struct allocates or frees it */
+    Entry *data;
+    /*! \brief number of elements in the summary */
+    RType size;
+    /*! \return largest data[i].rmax - data[i-1].rmin over adjacent entries, 0 if size < 2 */
+    inline RType MaxError(void) const {
+      RType res = 0;
+      for (RType i = 1; i < size; ++i) {
+        res = std::max(data[i].rmax - data[i-1].rmin, res);
+      }
+      return res;
+    }
+    /*! \return maximum rank in the summary, i.e. rmax of the last entry; reads data[size - 1] */
+    inline RType MaxRank(void) const {
+      return data[size - 1].rmax;
+    }
+    /*! \brief set size to 0 */
+    inline void Clear(void) {
+      size = 0;
+    }
+    /*! 
+     * \brief copy content from src; data must already point to room for src.size entries
+     * \param src source sketch
+     */
+    inline void CopyFrom(const Summary &src) {
+      size = src.size;
+      std::memcpy(data, src.data, sizeof(Entry) * size);
+    }
+    /*! 
+     * \brief set current summary to be pruned summary of src
+     *        assume data field is already allocated to be at least maxsize
+     * \param src source summary
+     * \param maxsize size we can afford in the pruned sketch
+     */
+    inline void SetPrune(const Summary &src, RType maxsize) {
+      const RType max_rank = src.MaxRank();
+      this->size = maxsize;
+      data[0] = src.data[0];  // the first entry of src is always kept
+      RType n = maxsize - 1;
+      RType top = 1;
+      for (RType i = 1; i < n; ++i) {
+        RType k = (i * max_rank) / n;  // i-th of n evenly spaced target ranks
+        while (k > src.data[top + 1].rmax) ++top;
+        // assert src.data[top].rmin <= k
+        // because k > src.data[top].rmax >= src.data[top].rmin
+        if ((k - src.data[top].rmin) < (src.data[top+1].rmax - k)) {
+          data[i] = src.data[top];
+        } else {
+          data[i] = src.data[top + 1];
+        }
+      }
+      data[n] = src.data[src.size - 1];  // the last entry of src is always kept
+    }
+    inline void SetCombine(const Summary &sa,    // first input, asserted non-empty
+                           const Summary &sb) {  // second input, asserted non-empty
+      utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); 
+      const Entry *a = sa.data, *a_end = sa.data + sa.size;
+      const Entry *b = sb.data, *b_end = sb.data + sb.size;
+      this->size = sa.size + sb.size;  // every input entry is written to data exactly once
+      RType aprev_rmin = 0, bprev_rmin = 0;
+      Entry *dst = this->data;
+      while (a != a_end && b != b_end) {  // two-way merge, smaller value first, ties take b
+        if (a->value < b->value) {
+          *dst = Entry(bprev_rmin + a->rmin,
+                       a->rmax + b->rmax - 1, a->value);
+          aprev_rmin = a->rmin;
+          ++dst; ++a;
+        } else {
+          *dst = Entry(aprev_rmin + b->rmin, 
+                       b->rmax + a->rmax - 1, b->value);
+          bprev_rmin = b->rmin;
+          ++dst; ++b;
+        }
+      }
+      if (a != a_end) {  // b is exhausted: flush the remaining entries of a
+        RType bprev_rmax = (b_end - 1)->rmax;
+        do {
+          *dst = Entry(bprev_rmin + a->rmin, bprev_rmax + a->rmax, a->value);
+          ++dst; ++a;
+        } while (a != a_end);
+      }
+      if (b != b_end) {  // a is exhausted: flush the remaining entries of b
+        RType aprev_rmax = (a_end - 1)->rmax;
+        do {
+          *dst = Entry(aprev_rmin + b->rmin, aprev_rmax + b->rmax, b->value);
+          ++dst; ++b;
+        } while (b != b_end);
+      }
+      utils::Assert(dst == data + size, "bug in combine");
+    }
+  };
+  // same as Summary, but owns its storage: an STL vector backs the data pointer
+  struct SummaryContainer : public Summary {
+    std::vector<Entry> space;
+    /*! \brief resize the backing space to size entries and point data at it */
+    inline void Reserve(size_t size) {
+      space.resize(size);
+      this->data = BeginPtr(space);
+    }
+    /*! 
+     * \brief set this summary to be the merge of all summaries in [begin, end)
+     * \param begin beginning position in the Summary array
+     * \param end ending position (exclusive) in the Summary array
+     */
+    inline void SetMerge(const Summary *begin,
+                         const Summary *end) {
+      utils::Assert(begin < end, "can not set combine to empty instance");
+      size_t len = end - begin;
+      if (len == 1) {
+        this->Reserve(begin[0].size);
+        this->CopyFrom(begin[0]);
+      } else if (len == 2) {
+        this->Reserve(begin[0].size + begin[1].size);
+        this->SetCombine(begin[0], begin[1]);  // pairwise combine of two summaries
+      } else {
+        // merge each half of the range recursively, then combine the two partial results
+        SummaryContainer lhs, rhs;
+        lhs.SetMerge(begin, begin + len / 2);
+        rhs.SetMerge(begin + len / 2, end);
+        this->Reserve(lhs.size + rhs.size);
+        this->SetCombine(lhs, rhs);
+      }
+    }
+  };
+  /*! 
+   * \brief initialize the quantile sketch, given the performance specification;
+   *        appends to level without clearing it, so presumably called once per sketch - TODO confirm
+   * \param maxn maximum number of data points that can be encountered
+   * \param eps accuracy level of summary
+   */
+  inline void Init(RType maxn, double eps) {
+    eps  = eps * 0.5;  // the sizing below works with half of the requested error
+    size_t L = 0;
+    size_t b = std::max(floor(log2(eps * maxn) / eps), 8.0);  // initial queue size, at least 8
+    // grow b until the level count L computed from it satisfies L < eps * b
+    while (b < maxn) {
+      L = ceil(log2(maxn / b)) + 1;  // NOTE(review): maxn / b truncates when RType is integral - confirm
+      if (L < eps * b) break;
+      ++b;
+    }
+    L += 1;  // one more level: level[0] serves as scratch and result space
+    inqueue.resize(b);
+    level_batch = (b + 1) / 2 + 1;
+    temp.Reserve(level_batch * 2);  // room for the combination of two full levels
+    data.resize(level_batch * L);
+    for (size_t l = 0; l < L; ++l) {
+      Summary s; s.size = 0;
+      s.data = BeginPtr(data) + l * level_batch;
+      level.push_back(s);
+    }
+    printf("init L = %lu, b = %lu, %lu size\n",L, b, data.size());
+    qtail = 0;
+  }
+  /*! 
+   * \brief add an element to the sketch; the input queue is flushed into the levels once full
+   * \param x the element added to the sketch
+   */
+  inline void Add(DType x) {
+    inqueue[qtail++] = x;
+    if (qtail == inqueue.size()) {
+      // queue is full: sort it into an exact summary, rmin == rmax == 1-based position
+      std::sort(inqueue.begin(), inqueue.end());
+      for (size_t i = 0; i < qtail; ++i) {
+        temp.data[i] = Entry(i + 1, i + 1, inqueue[i]);
+      }
+      temp.size = static_cast<RType>(qtail);
+      // clean up queue
+      qtail = 0;
+      for (size_t l = 1; l < level.size(); ++l) {
+        // an empty level takes the pruned summary and the update stops there
+        if (level[l].size == 0) {
+          level[l].SetPrune(temp, level_batch);
+          return;
+        } else {
+          // level occupied: prune temp into level 0 (scratch), merge it with level l, free level l
+          level[0].SetPrune(temp, level_batch);
+          temp.SetCombine(level[0], level[l]);
+          level[l].size = 0;
+        }
+      }
+      utils::Error("adding more element than allowed");  // reached only if every level was occupied
+    }
+  }
+  /*! 
+   * \brief finalize the result after all data has been passed 
+   *        copy the final result to level 0
+   *        this can only be called once
+   */
+  inline void Finalize(void) {
+    // turn the (possibly partially filled) input queue into an exact summary in temp
+    std::sort(inqueue.begin(), inqueue.begin() + qtail);
+    for (size_t i = 0; i < qtail; ++i) {
+      temp.data[i] = Entry(i + 1, i + 1, inqueue[i]);
+    }
+    temp.size = static_cast<RType>(qtail);
+    if (temp.size < level_batch) {  // already fits into one level, keep it unpruned
+      level[0].CopyFrom(temp);
+    } else {
+      level[0].SetPrune(temp, level_batch);
+    }
+    // fold every non-empty level into level 0, pruning back to level_batch after each merge
+    for (size_t l = 1; l < level.size(); ++l) {
+      if (level[l].size == 0) continue;
+      if (level[0].size == 0) {
+        level[0].CopyFrom(level[l]);
+      } else {
+        temp.SetCombine(level[0], level[l]);
+        level[0].SetPrune(temp, level_batch);        
+      }
+      level[l].size = 0;
+    }
+  }
+  /*! \brief get the summary after finalize; returns a copy of level[0] whose data pointer is shared */
+  inline Summary GetSummary(void) const {
+    return level[0];
+  }  
+  
+ private:  
+  // the input queue
+  std::vector<DType> inqueue;
+  // end of the queue
+  size_t qtail;
+  // size of summary in each level
+  size_t level_batch;
+  // content of the summary
+  std::vector<Entry> data;
+  // different level of summary
+  std::vector<Summary> level;  
+  // temporal summary, used for temp-merge
+  SummaryContainer temp;  
+};
+}  // utils
+}  // xgboost
+#endif

From 96c5196647cad48139b9d5375dda7fc986f62c71 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 20 Oct 2014 18:06:15 -0700
Subject: [PATCH 047/166] remv debug

---
 src/utils/quantile.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index 16f74df9e09e..e123c23f64e8 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -188,7 +188,6 @@ class QuantileSketch {
       s.data = BeginPtr(data) + l * level_batch;
       level.push_back(s);
     }
-    printf("init L = %lu, b = %lu, %lu size\n",L, b, data.size());
     qtail = 0;
   }
   /*! 

From ca9646874572af2ff245ef817b220d802a8017bf Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 2 Nov 2014 21:52:59 -0800
Subject: [PATCH 048/166] everything is ready, except for propose

---
 src/tree/model.h                   |   2 +-
 src/tree/updater_histmaker-inl.hpp | 127 +++++++++++++++++++++++------
 2 files changed, 105 insertions(+), 24 deletions(-)

diff --git a/src/tree/model.h b/src/tree/model.h
index 84010bcc0d2f..a330e2960e20 100644
--- a/src/tree/model.h
+++ b/src/tree/model.h
@@ -68,7 +68,7 @@ class TreeModel {
     }
   };
   /*! \brief tree node */
-  class Node{
+  class Node {
    public:
     /*! \brief index of left child */
     inline int cleft(void) const {
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 48f34146952c..afbafdeace9e 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -29,7 +29,7 @@ class HistMaker: public IUpdater {
     param.learning_rate = lr / trees.size();
     // build tree
     for (size_t i = 0; i < trees.size(); ++i) {
-      // TODO
+      this->Update(gpair, p_fmat, info, trees[i]);
     }
     param.learning_rate = lr;
   }
@@ -80,25 +80,16 @@ class HistMaker: public IUpdater {
     // per thread histset
     std::vector<HistSet> hset;    
     // initialize the hist set
-    inline void Init(const TrainParam &param) {
-      int nthread;
-      #pragma omp parallel
-      {
-        nthread = omp_get_num_threads();
-      }
+    inline void Init(const TrainParam &param, int nthread) {
       hset.resize(nthread);
       // cleanup statistics
-      #pragma omp parallel
-      {
-        int tid = omp_get_thread_num();
+      for (int tid = 0; tid < nthread; ++tid) {
         for (size_t i = 0; i < hset[tid].data.size(); ++i) {
           hset[tid].data[i].Clear();
         }
-      }
-      for (int i = 0; i < nthread; ++i) {
-        hset[i].rptr = BeginPtr(rptr);
-        hset[i].cut = BeginPtr(cut);
-        hset[i].data.resize(cut.size(), TStats(param));        
+        hset[tid].rptr = BeginPtr(rptr);
+        hset[tid].cut = BeginPtr(cut);
+        hset[tid].data.resize(cut.size(), TStats(param));        
       }
     }
     // aggregate all statistics to hset[0]
@@ -119,7 +110,7 @@ class HistMaker: public IUpdater {
     inline size_t Size(void) const {
       return rptr.size() - 1;
     }
-  };
+  };  
   // training parameter
   TrainParam param;
   // workspace of thread
@@ -132,30 +123,116 @@ class HistMaker: public IUpdater {
   std::vector<int> node2workindex;
   // reducer for histogram
   sync::Reducer<TStats> histred;
+  
+  // helper function to get to next level of the tree: returns the child of nid that inst goes to
+  // must work on non-leaf node; takes the default child when inst lacks the split feature
+  inline static int NextLevel(const SparseBatch::Inst &inst, const RegTree &tree, int nid) {
+    const RegTree::Node &n = tree[nid];
+    bst_uint findex = n.split_index();
+    for (unsigned i = 0; i < inst.length; ++i) {  // linear search for the split feature in inst
+      if (findex == inst[i].index) {
+        if (inst[i].fvalue < n.split_cond()) {  // strictly below the split condition goes left
+          return n.cleft();
+        } else {
+          return n.cright();
+        }
+      }
+    }
+    return n.cdefault();
+  }
+  
  private:
   virtual void Update(const std::vector<bst_gpair> &gpair,
                       IFMatrix *p_fmat,
                       const BoosterInfo &info,
                       RegTree *p_tree) {
-    //this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
-    //this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
-    for (int depth = 0; depth < param.max_depth; ++depth) {
+    this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
+    this->UpdateNode2WorkIndex(*p_tree);
+    for (int depth = 0; depth < param.max_depth; ++depth) {      
       this->FindSplit(depth, gpair, p_fmat, info, p_tree);
-      //this->ResetPosition(qexpand_, p_fmat, *p_tree);
-      //this->UpdateQueueExpand(*p_tree, &qexpand_);
-      //this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
+      this->UpdateQueueExpand(*p_tree);
+      this->UpdateNode2WorkIndex(*p_tree);
       // if nothing left to be expand, break
       if (qexpand.size() == 0) break;
     }   
   }
+  // initialize position (one entry per element of gpair) and qexpand (all roots); fmat is not used
+  inline void InitData(const std::vector<bst_gpair> &gpair,
+                       const IFMatrix &fmat,
+                       const std::vector<unsigned> &root_index, const RegTree &tree) {
+    utils::Assert(tree.param.num_nodes == tree.param.num_roots, "HistMaker: can only grow new tree");
+    {// setup position
+      position.resize(gpair.size());
+      if (root_index.size() == 0) {
+        std::fill(position.begin(), position.end(), 0);
+      } else {
+        for (size_t i = 0; i < position.size(); ++i) {
+          position[i] = root_index[i];  // NOTE(review): assumes root_index.size() >= gpair.size() - confirm
+          utils::Assert(root_index[i] < (unsigned)tree.param.num_roots,
+                        "root index exceed setting");
+        }
+      }
+      // rows with negative hessian are treated as deleted: complement their position
+      for (size_t i = 0; i < position.size(); ++i) {
+        if (gpair[i].hess < 0.0f) position[i] = ~position[i];  // ~x is negative for x >= 0
+      }
+      // subsample: also complement the position of the rows that are not drawn
+      if (param.subsample < 1.0f) {
+        for (size_t i = 0; i < position.size(); ++i) {
+          if (gpair[i].hess < 0.0f) continue;  // already complemented by the loop above
+          if (random::SampleBinary(param.subsample) == 0) position[i] = ~position[i];
+        }
+      }
+    }
+    {// expand queue: start from every root node
+      qexpand.reserve(256); qexpand.clear();
+      for (int i = 0; i < tree.param.num_roots; ++i) {
+        qexpand.push_back(i);
+      }
+    }
+  }
+  /*! \brief replace qexpand by the children of every queued node that is no longer a leaf */
+  inline void UpdateQueueExpand(const RegTree &tree) {
+    std::vector<int> newnodes;
+    for (size_t i = 0; i < qexpand.size(); ++i) {
+      const int nid = qexpand[i];
+      if (!tree[nid].is_leaf()) {  // nodes that are still leaves drop out of the queue
+        newnodes.push_back(tree[nid].cleft());
+        newnodes.push_back(tree[nid].cright());
+      }
+    }
+    // use new nodes for qexpand
+    qexpand = newnodes;
+  }
+  inline void UpdateNode2WorkIndex(const RegTree &tree) {
+    // rebuild node2workindex; resize first so that newly added nodes are reset to -1 as well
+    node2workindex.resize(tree.param.num_nodes);
+    std::fill(node2workindex.begin(), node2workindex.end(), -1);
+    for (size_t i = 0; i < qexpand.size(); ++i) {
+      node2workindex[qexpand[i]] = static_cast<int>(i);
+    }
+  }
+  // this function does two jobs
+  // (1) reset the position in array position, to be the latest leaf id
+  // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly 
+  virtual void ResetPosAndPropose(IFMatrix *p_fmat,
+                                  const BoosterInfo &info,
+                                  const RegTree &tree) {
+    
+  }
   // create histogram for a setup histset
   inline void CreateHist(const std::vector<bst_gpair> &gpair,
                          IFMatrix *p_fmat,
                          const BoosterInfo &info,
                          const RegTree &tree) {
     bst_uint num_feature = tree.param.num_feature;
+    int nthread;
+    #pragma omp parallel
+    {
+      nthread = omp_get_num_threads();
+    }
     // intialize work space
-    wspace.Init(param);
+    wspace.Init(param, nthread);
     // start accumulating statistics
     utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
     iter->BeforeFirst();
@@ -225,6 +302,8 @@ class HistMaker: public IUpdater {
                         const BoosterInfo &info,
                         RegTree *p_tree) {
     const bst_uint num_feature = p_tree->param.num_feature;
+    // reset and propose candidate split
+    this->ResetPosAndPropose(p_fmat, info, *p_tree);
     // create histogram
     this->CreateHist(gpair, p_fmat, info, *p_tree);
     // get the best split condition for each node
@@ -265,6 +344,8 @@ class HistMaker: public IUpdater {
     }
   }
 };
+
+
 }  // namespace tree
 }  // namespace xgboost
 #endif  // XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_

From 539fce2856030ee548c7663a6ed02d9831d1e0c1 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 6 Nov 2014 15:37:23 -0800
Subject: [PATCH 049/166] ok

---
 Makefile                           |   2 +-
 src/tree/updater.cpp               |   2 +-
 src/tree/updater_histmaker-inl.hpp |  56 +++++++++++----
 src/utils/group_data.h             | 111 +++++++++++++++++++++++++++++
 src/utils/matrix_csr.h             |   1 -
 test/Makefile                      |  29 ++++++++
 test/test_group_data.cpp           |  72 +++++++++++++++++++
 7 files changed, 257 insertions(+), 16 deletions(-)
 create mode 100644 src/utils/group_data.h
 create mode 100644 test/Makefile
 create mode 100644 test/test_group_data.cpp

diff --git a/Makefile b/Makefile
index c99a9a7fe2d8..e483ecad4140 100644
--- a/Makefile
+++ b/Makefile
@@ -74,4 +74,4 @@ Rpack:
 	R CMD check --as-cran xgboost*.tar.gz
 
 clean:
-	$(RM) $(OBJ) $(BIN) $(SLIB) *.o  */*.o */*/*.o *~ */*~ */*/*~
+	$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o  */*.o */*/*.o *~ */*~ */*/*~
diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index 6efcc511f01c..faa38dc4bccf 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -15,7 +15,7 @@ IUpdater* CreateUpdater(const char *name) {
   if (!strcmp(name, "prune")) return new TreePruner();
   if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
   if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
-  if (!strcmp(name, "grow_histmaker")) return new HistMaker<GradStats>();
+  if (!strcmp(name, "grow_histmaker")) return new QuantileHistMaker<GradStats>();
   if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
   if (!strcmp(name, "grow_colmaker5")) return new ColMaker< CVGradStats<5> >();
   if (!strcmp(name, "grow_colmaker3")) return new ColMaker< CVGradStats<3> >();
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index afbafdeace9e..02dc5c8fca99 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -8,6 +8,7 @@
 #include <vector>
 #include <algorithm>
 #include "../sync/sync.h"
+#include "../utils/quantile.h"
 
 namespace xgboost {
 namespace tree {
@@ -140,7 +141,13 @@ class HistMaker: public IUpdater {
     }
     return n.cdefault();
   }
-  
+
+  // this function does two jobs
+  // (1) reset the position in array position, to be the latest leaf id
+  // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly 
+  virtual void ResetPosAndPropose(IFMatrix *p_fmat,
+                                  const BoosterInfo &info,
+                                  const RegTree &tree)  = 0;  
  private:
   virtual void Update(const std::vector<bst_gpair> &gpair,
                       IFMatrix *p_fmat,
@@ -160,7 +167,8 @@ class HistMaker: public IUpdater {
   inline void InitData(const std::vector<bst_gpair> &gpair,
                        const IFMatrix &fmat,
                        const std::vector<unsigned> &root_index, const RegTree &tree) {
-    utils::Assert(tree.param.num_nodes == tree.param.num_roots, "HistMaker: can only grow new tree");
+    utils::Assert(tree.param.num_nodes == tree.param.num_roots,
+                  "HistMaker: can only grow new tree");
     {// setup position
       position.resize(gpair.size());
       if (root_index.size() == 0) {
@@ -212,15 +220,6 @@ class HistMaker: public IUpdater {
       node2workindex[qexpand[i]] = static_cast<int>(i);
     }
   }
-  // this function does two jobs
-  // (1) reset the position in array position, to be the latest leaf id
-  // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly 
-  virtual void ResetPosAndPropose(IFMatrix *p_fmat,
-                                  const BoosterInfo &info,
-                                  const RegTree &tree) {
-    
-  }
-  // create histogram for a setup histset
   inline void CreateHist(const std::vector<bst_gpair> &gpair,
                          IFMatrix *p_fmat,
                          const BoosterInfo &info,
@@ -250,7 +249,7 @@ class HistMaker: public IUpdater {
         const int nid = position[ridx];
         if (nid >= 0) {
           utils::Assert(tree[nid].is_leaf(), "CreateHist happens in leaf");
-          const int wid = node2workindex[nid];
+           const int wid = node2workindex[nid];
           for (bst_uint i = 0; i < inst.length; ++i) {
             utils::Assert(inst[i].index < num_feature, "feature index exceed bound");
             // feature histogram
@@ -312,7 +311,8 @@ class HistMaker: public IUpdater {
     #pragma omp parallel for schedule(dynamic, 1)
     for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
       const int nid = qexpand[wid];
-      utils::Assert(node2workindex[nid] == static_cast<int>(wid), "node2workindex inconsistent");
+      utils::Assert(node2workindex[nid] == static_cast<int>(wid),
+                    "node2workindex inconsistent");
       SplitEntry &best = sol[wid];     
       TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
       for (bst_uint fid = 0; fid < num_feature; ++ fid) {
@@ -345,6 +345,36 @@ class HistMaker: public IUpdater {
   }
 };
 
+// hist maker that proposes candidate cuts using quantile sketch (the proposal itself is still a todo)
+template<typename TStats>
+class QuantileHistMaker: public HistMaker<TStats> {
+ protected:
+  virtual void ResetPosAndPropose(IFMatrix *p_fmat,
+                                  const BoosterInfo &info,
+                                  const RegTree &tree) {
+    // pass over all rows and move each active position one level down the tree
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const RowBatch &batch = iter->Value();
+      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nbatch; ++i) {
+        RowBatch::Inst inst = batch[i];
+        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+        int nid = this->position[ridx];
+        if (nid >= 0) {  // negative positions are inactive rows and are left untouched
+          if (tree[nid].is_leaf()) {  // NOTE(review): roots look like leaves on the first call, so all rows may be deactivated - confirm
+            this->position[ridx] = ~nid; 
+          } else {
+            this->position[ridx] = nid = HistMaker<TStats>::NextLevel(inst, tree, nid);            
+            // todo add the cut point setup
+          }
+        }
+      }      
+    }    
+  }
+};
 
 }  // namespace tree
 }  // namespace xgboost
diff --git a/src/utils/group_data.h b/src/utils/group_data.h
new file mode 100644
index 000000000000..a25eb1eddc9b
--- /dev/null
+++ b/src/utils/group_data.h
@@ -0,0 +1,134 @@
+#ifndef XGBOOST_UTILS_GROUP_DATA_H_
+#define XGBOOST_UTILS_GROUP_DATA_H_
+/*!
+ * \file group_data.h
+ * \brief this file defines utils to group data by integer keys
+ *     Input: given input sequence (key,value), (k1,v1), (k2,v2)
+ *     Output: an array of values data = [v1,v2,v3 .. vn]
+ *             and a group pointer ptr,
+ *             data[ptr[k]:ptr[k+1]] contains values that corresponds to key k
+ *
+ * This can be used to construct CSR/CSC matrix from un-ordered input
+ * The major algorithm is a linear scan that requires two passes over the data
+ * \author Tianqi Chen
+ */
+#include <algorithm>
+#include <cstddef>
+#include <vector>
+
+namespace xgboost {
+namespace utils {
+/*!
+ * \brief multi-thread version of group builder
+ *  every thread keeps its own counters in thread_rptr[threadid]
+ *  NOTE: the builder stores references to the vectors passed in, and may
+ *  refer to its own temp buffer, so it should not be copied
+ * \tparam ValueType type of entries in the sparse matrix
+ * \tparam SizeType type of the index range holder
+ */
+template<typename ValueType, typename SizeType = size_t>
+struct ParallelGroupBuilder {
+ public:
+  /*!
+   * \brief constructor that keeps the per-thread counters in an internal buffer
+   * \param p_rptr output group pointer, must outlive the builder
+   * \param p_data output value array, must outlive the builder
+   */
+  ParallelGroupBuilder(std::vector<SizeType> *p_rptr,
+                       std::vector<ValueType> *p_data)
+      : rptr(*p_rptr), data(*p_data), thread_rptr(tmp_thread_rptr) {
+  }
+  /*!
+   * \brief constructor that keeps the per-thread counters in a caller supplied buffer
+   * \param p_rptr output group pointer, must outlive the builder
+   * \param p_data output value array, must outlive the builder
+   * \param p_thread_rptr storage of the per-thread counters, must outlive the builder
+   */
+  ParallelGroupBuilder(std::vector<SizeType> *p_rptr,
+                       std::vector<ValueType> *p_data,
+                       std::vector< std::vector<SizeType> > *p_thread_rptr)
+      : rptr(*p_rptr), data(*p_data), thread_rptr(*p_thread_rptr) {
+  }
+
+ public:
+  /*!
+   * \brief step 1: initialize the helper, with hint of number keys
+   *                and thread used in the construction
+   * \param nkeys number of keys in the matrix, can be smaller than expected
+   * \param nthread number of thread that will be used in construction
+   */
+  inline void InitBudget(size_t nkeys = 0, int nthread = 1) {
+    thread_rptr.resize(nthread);
+    for (size_t i = 0;  i < thread_rptr.size(); ++i) {
+      thread_rptr[i].resize(nkeys);
+      std::fill(thread_rptr[i].begin(), thread_rptr[i].end(), 0);
+    }
+  }
+  /*!
+   * \brief step 2: add budget to each key
+   * \param key the key
+   * \param threadid the id of thread that calls this function
+   * \param nelem number of element budget add to this row
+   */
+  inline void AddBudget(size_t key, int threadid = 0, SizeType nelem = 1) {
+    std::vector<SizeType> &trptr = thread_rptr[threadid];
+    // grow the counter array of this thread on demand
+    if (trptr.size() < key + 1) {
+      trptr.resize(key + 1, 0);
+    }
+    trptr[key] += nelem;
+  }
+  /*! \brief step 3: initialize the necessary storage */
+  inline void InitStorage(void) {
+    // set rptr to correct size
+    for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
+      if (rptr.size() <= thread_rptr[tid].size()) {
+        rptr.resize(thread_rptr[tid].size() + 1);
+      }
+    }
+    // the loop below only writes rptr[1..], reset the first pointer explicitly
+    // so that a non-empty rptr passed by the caller can not leave a stale offset
+    if (!rptr.empty()) rptr[0] = 0;
+    // initialize rptr to be beginning of each segment,
+    // turn each thread counter into the write position of that thread
+    size_t start = 0;
+    for (size_t i = 0; i + 1 < rptr.size(); ++i) {
+      for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
+        std::vector<SizeType> &trptr = thread_rptr[tid];
+        if (i < trptr.size()) {
+          size_t ncnt = trptr[i];
+          trptr[i] = start;
+          start += ncnt;
+        }
+      }
+      rptr[i + 1] = start;
+    }
+    data.resize(start);
+  }
+  /*!
+   * \brief step 4: add data to the allocated space,
+   *   the calls to this function should exactly match the previous calls to AddBudget
+   *
+   * \param key the key of the group that the value belongs to
+   * \param value the value to be stored
+   * \param threadid the id of thread that calls this function
+   */
+  inline void Push(size_t key, ValueType value, int threadid = 0) {
+    SizeType &rp = thread_rptr[threadid][key];
+    data[rp++] = value;
+  }
+
+ private:
+  /*! \brief pointer to the beginning and end of each continuous key */
+  std::vector<SizeType> &rptr;
+  /*! \brief index of nonzero entries in each row */
+  std::vector<ValueType> &data;
+  /*! \brief thread local data structure */
+  std::vector< std::vector<SizeType> > &thread_rptr;
+  /*! \brief local temp thread ptr, use this if not specified by the constructor */
+  std::vector< std::vector<SizeType> > tmp_thread_rptr;
+};
+}  // namespace utils
+}  // namespace xgboost
+#endif
+
diff --git a/src/utils/matrix_csr.h b/src/utils/matrix_csr.h
index ea5bc8b2dcb6..bc9479cc3945 100644
--- a/src/utils/matrix_csr.h
+++ b/src/utils/matrix_csr.h
@@ -256,7 +256,6 @@ struct SparseCSRFileBuilder {
   /*! \brief saved top space of each item */
   std::vector<IndexType> buffer_data;
 };
-
 }  // namespace utils
 }  // namespace xgboost
 #endif
diff --git a/test/Makefile b/test/Makefile
new file mode 100644
index 000000000000..9f145085ee9a
--- /dev/null
+++ b/test/Makefile
@@ -0,0 +1,29 @@
+export CC  = gcc
+export CXX = g++
+export MPICXX = mpicxx
+export LDFLAGS= -pthread -lm 
+export CFLAGS = -Wall -O3 -msse2  -Wno-unknown-pragmas -fPIC -I../src
+
+ifeq ($(no_omp),1)
+	CFLAGS += -DDISABLE_OPENMP 
+else 
+	CFLAGS += -fopenmp
+endif
+
+# specify tensor path
+BIN = test_group_data
+
+.PHONY: clean all
+
+all: $(BIN) $(MPIBIN)
+
+test_group_data: test_group_data.cpp
+
+$(BIN) : 
+	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
+
+$(MPIBIN) : 
+	$(MPICXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
+
+clean:
+	$(RM) $(BIN) $(MPIBIN) *~
diff --git a/test/test_group_data.cpp b/test/test_group_data.cpp
new file mode 100644
index 000000000000..e5c1d006968f
--- /dev/null
+++ b/test/test_group_data.cpp
@@ -0,0 +1,80 @@
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+#include <utility>
+#include <ctime>
+#include <utils/group_data.h>
+#include <utils/random.h>
+#include <utils/omp.h>
+#include <utils/utils.h>
+
+using namespace xgboost::utils;
+using namespace xgboost;
+
+/*!
+ * \brief smoke test for ParallelGroupBuilder: groups ndata random keys that are
+ *  checked to be smaller than nkey, then verifies that every stored position maps back to its key
+ *  usage: test_group_data <nkey> <ndata> [nthread]
+ */
+int main(int argc, char *argv[]) {
+  if (argc < 3) {
+    printf("Usage: <nkey> <ndata> [nthread]\n");
+    return 0;
+  }
+  if (argc > 3) {
+    omp_set_num_threads(atoi(argv[3]));
+  }
+  random::Seed(0);
+  unsigned nkey = static_cast<unsigned>(atoi(argv[1]));
+  size_t ndata = static_cast<size_t>(atol(argv[2]));
+  
+  std::vector<unsigned> keys;
+  std::vector< std::pair<unsigned, unsigned> > raw;
+  raw.reserve(ndata); keys.reserve(ndata);
+  for (size_t i = 0; i < ndata; ++i) {
+    unsigned key = random::NextUInt32(nkey);
+    utils::Check(key < nkey, "key exceed bound\n");
+    raw.push_back(std::make_pair(key, i));
+    keys.push_back(key);
+  }
+  printf("loading finish, start working\n");
+  time_t start_t = time(NULL);
+  // query the team size from inside a parallel region; only the master
+  // thread stores it so that the threads do not race on the shared variable
+  int nthread = 1;
+  #pragma omp parallel
+  {
+    #pragma omp master
+    nthread = omp_get_num_threads();
+  }
+  std::vector<size_t> rptr;
+  std::vector<unsigned> data;
+  ParallelGroupBuilder<unsigned> builder(&rptr, &data);
+  builder.InitBudget(0, nthread);
+
+  bst_omp_uint nlen = raw.size();
+  #pragma omp parallel for schedule(static)
+  for (bst_omp_uint i = 0; i < nlen; ++i) {
+    builder.AddBudget(raw[i].first, omp_get_thread_num());
+  }
+  double first_cost = time(NULL) - start_t;
+  builder.InitStorage();  
+  #pragma omp parallel for schedule(static)
+  for (bst_omp_uint i = 0; i < nlen; ++i) {
+    builder.Push(raw[i].first, raw[i].second, omp_get_thread_num());
+  }  
+  double second_cost = time(NULL) - start_t;
+  printf("all finish, phase1=%g sec, phase2=%g sec\n", first_cost, second_cost);
+  Check(rptr.size() <= nkey+1, "nkey exceed bound");
+  Check(rptr.back() == ndata, "data shape inconsistent");
+  for (size_t i = 0; i < rptr.size()-1; ++ i) {
+    Check(rptr[i] <= rptr[i+1], "rptr error");
+    for (size_t j = rptr[i]; j < rptr[i+1]; ++j) {
+      unsigned pos = data[j];
+      Check(pos < keys.size(), "invalid pos");
+      Check(keys[pos] == i, "invalid key entry");
+    }
+  }
+  printf("all check pass\n");
+  return 0;
+}

From aace84c3499b0e7942da3daaac9e042eecb89f8a Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 6 Nov 2014 15:58:36 -0800
Subject: [PATCH 050/166] pass group data test

---
 test/test_group_data.cpp | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/test/test_group_data.cpp b/test/test_group_data.cpp
index e5c1d006968f..676d45e27bf0 100644
--- a/test/test_group_data.cpp
+++ b/test/test_group_data.cpp
@@ -44,17 +44,29 @@ int main(int argc, char *argv[]) {
   ParallelGroupBuilder<unsigned> builder(&rptr, &data);
   builder.InitBudget(0, nthread);
 
-  bst_omp_uint nlen = raw.size();
-  #pragma omp parallel for schedule(static)
-  for (bst_omp_uint i = 0; i < nlen; ++i) {
-    builder.AddBudget(raw[i].first, omp_get_thread_num());
+  size_t nstep = (raw.size() +nthread-1)/ nthread;
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num(); 
+    size_t begin = tid * nstep;
+    size_t end = std::min((tid + 1) * nstep, raw.size());
+    for (size_t i = begin; i < end; ++i) {
+      builder.AddBudget(raw[i].first, tid);
+    }
   }
   double first_cost = time(NULL) - start_t;
   builder.InitStorage();  
-  #pragma omp parallel for schedule(static)
-  for (bst_omp_uint i = 0; i < nlen; ++i) {
-    builder.Push(raw[i].first, raw[i].second, omp_get_thread_num());
-  }  
+
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num(); 
+    size_t begin = tid * nstep;
+    size_t end = std::min((tid + 1)* nstep, raw.size());
+    for (size_t i = begin; i < end; ++i) {
+      builder.Push(raw[i].first, raw[i].second, tid);
+    }
+  }
+
   double second_cost = time(NULL) - start_t;
   printf("all finish, phase1=%g sec, phase2=%g sec\n", first_cost, second_cost);
   Check(rptr.size() <= nkey+1, "nkey exceed bound");

From 0e6b899d070babcbd0b4c272d8f3638d1ede3fff Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 9 Nov 2014 16:02:38 -0800
Subject: [PATCH 051/166] quantile

---
 src/utils/quantile.h | 256 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 256 insertions(+)

diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index e123c23f64e8..3c5e50b8a45d 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -268,6 +268,262 @@ class QuantileSketch {
   // temporal summary, used for temp-merge
   SummaryContainer temp;  
 };
+
+/*! 
+ * \brief a helper class to compute streaming quantile
+ * \tparam DType type of data content
+ * \tparam RType type of rank
+ */
+template<typename DType, typename RType=unsigned>
+class GKQuantileSketch {
+ public:
+  /*! \brief an entry in the sketch summary */
+  struct Entry {
+    /*! \brief minimum rank */
+    RType rmin;
+    /*! \brief maximum rank */
+    RType rmax;
+    /*! \brief the value of data */
+    DType value;
+    // constructor
+    Entry(void) {}
+    // constructor
+    Entry(RType rmin, RType rmax, DType value)
+        : rmin(rmin), rmax(rmax), value(value) {}
+  };
+  /*! 
+   * \brief this is data structure presenting one summary
+   */
+  struct Summary {
+    /*! \brief data field */
+    Entry *data;
+    /*! \brief number of elements in the summary */
+    RType size;
+    /*! \brief the maximum error of the summary */
+    inline RType MaxError(void) const {
+      RType res = 0;
+      for (RType i = 1; i < size; ++i) {
+        res = std::max(data[i].rmax - data[i-1].rmin, res);
+      }
+      return res;
+    }
+    /*! \return maximum rank in the summary */
+    inline RType MaxRank(void) const {
+      return data[size - 1].rmax;
+    }
+    /*! \brief set size to 0 */
+    inline void Clear(void) {
+      size = 0;
+    }
+    /*! 
+     * \brief copy content from src
+     * \param src source sketch
+     */
+    inline void CopyFrom(const Summary &src) {
+      size = src.size;
+      std::memcpy(data, src.data, sizeof(Entry) * size);
+    }
+    /*! 
+     * \brief set current summary to be pruned summary of src
+     *        assume data field is already allocated to be at least maxsize
+     * \param src source summary
+     * \param maxsize size we can afford in the pruned sketch
+     */
+    inline void SetPrune(const Summary &src, RType maxsize) {
+      const RType max_rank = src.MaxRank();
+      this->size = maxsize;
+      data[0] = src.data[0];
+      RType n = maxsize - 1;
+      RType top = 1;
+      for (RType i = 1; i < n; ++i) {
+        RType k = (i * max_rank) / n;
+        while (k > src.data[top + 1].rmax) ++top;
+        // assert src.data[top].rmin <= k
+        // because k > src.data[top].rmax >= src.data[top].rmin
+        if ((k - src.data[top].rmin) < (src.data[top+1].rmax - k)) {
+          data[i] = src.data[top];
+        } else {
+          data[i] = src.data[top + 1];
+        }
+      }
+      data[n] = src.data[src.size - 1];
+    }
+    inline void SetCombine(const Summary &sa,
+                           const Summary &sb) {
+      utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); 
+      const Entry *a = sa.data, *a_end = sa.data + sa.size;
+      const Entry *b = sb.data, *b_end = sb.data + sb.size;
+      this->size = sa.size + sb.size;
+      RType aprev_rmin = 0, bprev_rmin = 0;
+      Entry *dst = this->data;
+      while (a != a_end && b != b_end) {
+        if (a->value < b->value) {
+          *dst = Entry(bprev_rmin + a->rmin,
+                       a->rmax + b->rmax - 1, a->value);
+          aprev_rmin = a->rmin;
+          ++dst; ++a;
+        } else {
+          *dst = Entry(aprev_rmin + b->rmin, 
+                       b->rmax + a->rmax - 1, b->value);
+          bprev_rmin = b->rmin;
+          ++dst; ++b;
+        }
+      }
+      if (a != a_end) {
+        RType bprev_rmax = (b_end - 1)->rmax;
+        do {
+          *dst = Entry(bprev_rmin + a->rmin, bprev_rmax + a->rmax, a->value);
+          ++dst; ++a;
+        } while (a != a_end);
+      }
+      if (b != b_end) {
+        RType aprev_rmax = (a_end - 1)->rmax;
+        do {
+          *dst = Entry(aprev_rmin + b->rmin, aprev_rmax + b->rmax, b->value);
+          ++dst; ++b;
+        } while (b != b_end);
+      }
+      utils::Assert(dst == data + size, "bug in combine");
+    }
+  };
+  // same as Summary, but backs the data pointer with an STL vector owned by the container
+  struct SummaryContainer : public Summary {
+    std::vector<Entry> space;
+    /*! \brief resize the backing vector to size entries and set data to BeginPtr(space) */
+    inline void Reserve(size_t size) {
+      space.resize(size);
+      this->data = BeginPtr(space);
+    }
+    /*!
+     * \brief set the space to be merge of all Summary arrays in [begin, end)
+     * \param begin beginning position in the Summary array
+     * \param end ending position in the Summary array
+     */
+    inline void SetMerge(const Summary *begin,
+                         const Summary *end) {
+      utils::Assert(begin < end, "can not set combine to empty instance");
+      size_t len = end - begin;
+      if (len == 1) {
+        this->Reserve(begin[0].size);
+        this->CopyFrom(begin[0]);
+      } else if (len == 2) {
+        this->Reserve(begin[0].size + begin[1].size);
+        this->SetCombine(begin[0], begin[1]);  // two summaries: combine them directly
+      } else {
+        // recursive merge: merge each half of the range, then combine the two halves
+        SummaryContainer lhs, rhs;
+        lhs.SetMerge(begin, begin + len / 2);
+        rhs.SetMerge(begin + len / 2, end);
+        this->Reserve(lhs.size + rhs.size);
+        this->SetCombine(lhs, rhs);
+      }
+    }
+  };
+  /*! 
+   * \brief initialize the quantile sketch, given the performance specification
+   * \param maxn maximum number of data points can be encountered
+   * \param eps accuracy level of summary
+   */
+  inline void Init(RType maxn, double eps) {
+    eps  = eps * 0.5;
+    size_t L = 0;
+    size_t b = std::max(floor(log2(eps * maxn) / eps), 8.0);
+    // check for lower 
+    while (b < maxn) {
+      L = ceil(log2(maxn / b)) + 1;
+      if (L < eps * b) break;
+      ++b;
+    }
+    L += 1;
+    inqueue.resize(b);
+    level_batch = (b + 1) / 2 + 1;
+    temp.Reserve(level_batch * 2);
+    data.resize(level_batch * L);
+    for (size_t l = 0; l < L; ++l) {
+      Summary s; s.size = 0;
+      s.data = BeginPtr(data) + l * level_batch;
+      level.push_back(s);
+    }
+    qtail = 0;
+  }
+  /*! 
+   * \brief add an element to a sketch 
+   * \param x the element added to the sketch
+   */
+  inline void Add(DType x) {
+    inqueue[qtail++] = x;
+    if (qtail == inqueue.size()) {
+      // start update sketch
+      std::sort(inqueue.begin(), inqueue.end());
+      for (size_t i = 0; i < qtail; ++i) {
+        temp.data[i] = Entry(i + 1, i + 1, inqueue[i]);
+      }
+      temp.size = static_cast<RType>(qtail);
+      // clean up queue
+      qtail = 0;
+      for (size_t l = 1; l < level.size(); ++l) {
+        // check if level l is empty
+        if (level[l].size == 0) {
+          level[l].SetPrune(temp, level_batch);
+          return;
+        } else {
+          // level 0 is actually temp space
+          level[0].SetPrune(temp, level_batch);
+          temp.SetCombine(level[0], level[l]);
+          level[l].size = 0;
+        }
+      }
+      utils::Error("adding more element than allowed");
+    }
+  }
+  /*! 
+   * \brief finalize the result after all data has been passed 
+   *        copy the final result to level 0
+   *        this can only be called once
+   */
+  inline void Finalize(void) {
+    // start update sketch
+    std::sort(inqueue.begin(), inqueue.begin() + qtail);
+    for (size_t i = 0; i < qtail; ++i) {
+      temp.data[i] = Entry(i + 1, i + 1, inqueue[i]);
+    }
+    temp.size = static_cast<RType>(qtail);
+    if (temp.size < level_batch) {
+      level[0].CopyFrom(temp);
+    } else {
+      level[0].SetPrune(temp, level_batch);
+    }
+    // start adding other things in
+    for (size_t l = 1; l < level.size(); ++l) {
+      if (level[l].size == 0) continue;
+      if (level[0].size == 0) {
+        level[0].CopyFrom(level[l]);
+      } else {
+        temp.SetCombine(level[0], level[l]);
+        level[0].SetPrune(temp, level_batch);        
+      }
+      level[l].size = 0;
+    }
+  }
+  /*! \brief get the summary after finalize */
+  inline Summary GetSummary(void) const {
+    return level[0];
+  }  
+  
+ private:  
+  // the input queue
+  std::vector<DType> inqueue;
+  // end of the queue
+  size_t qtail;
+  // size of summary in each level
+  size_t level_batch;
+  // content of the summary
+  std::vector<Entry> data;
+  // different level of summary
+  std::vector<Summary> level;  
+  // temporal summary, used for temp-merge
+  SummaryContainer temp;  
+};
 }  // utils
 }  // xgboost
 #endif

From 7c1ec78a013a393c373f8dfa8e4b432949359d23 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 9 Nov 2014 18:03:36 -0800
Subject: [PATCH 052/166] before test quantile

---
 src/utils/quantile.h | 297 ++++++++++++++++++++++++++++---------------
 test/Makefile        |   5 +-
 2 files changed, 199 insertions(+), 103 deletions(-)

diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index 3c5e50b8a45d..b46a89d7d1d8 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -19,7 +19,7 @@ namespace utils {
  * \tparam RType type of rank
  */
 template<typename DType, typename RType=unsigned>
-class QuantileSketch {
+class WQuantileSketch {
  public:
   /*! \brief an entry in the sketch summary */
   struct Entry {
@@ -27,13 +27,28 @@ class QuantileSketch {
     RType rmin;
     /*! \brief maximum rank */
     RType rmax;
+    /*! \brief maximum weight */
+    RType wmin;
     /*! \brief the value of data */
     DType value;
     // constructor
     Entry(void) {}
     // constructor
-    Entry(RType rmin, RType rmax, DType value)
-        : rmin(rmin), rmax(rmax), value(value) {}
+    Entry(RType rmin, RType rmax, RType wmin, DType value)
+        : rmin(rmin), rmax(rmax), wmin(wmin), value(value) {}
+    /*! \brief debug function, check the rank and weight constraints of this entry */
+    inline void CheckValid(void) const {
+      utils::Assert(rmin >= 0 && rmax >= 0 && wmin >= 0, "nonneg constraint");
+      utils::Assert(rmax >= rmin + wmin, "relation constraint");
+    }
+    /*! \return rmin estimation for v strictly bigger than value */
+    inline RType rmin_next(void) const {
+      return rmin + wmin;
+    }
+    /*! \return rmax estimation for v strictly smaller than value */
+    inline RType rmax_prev(void) const {
+      return rmax - wmin;
+    }
   };
   /*! 
    * \brief this is data structure presenting one summary
@@ -42,15 +57,34 @@ class QuantileSketch {
     /*! \brief data field */
     Entry *data;
     /*! \brief number of elements in the summary */
-    RType size;
-    /*! \brief the maximum error of the summary */
+    size_t size;
+    // constructor
+    Summary(void) : size(0) {
+    }
+    /*! 
+     * \brief the maximum error of the Summary
+     */
     inline RType MaxError(void) const {
-      RType res = 0;
+      RType res = data[0].rmax - data[0].rmin - data[0].wmin;  // rank uncertainty inside the first entry
       for (RType i = 1; i < size; ++i) {
-        res = std::max(data[i].rmax - data[i-1].rmin, res);
+        res = std::max(data[i].rmax_prev() - data[i - 1].rmin_next(), res);  // gap between neighbouring entries
+        res = std::max(data[i].rmax - data[i].rmin - data[i].wmin, res);  // rank uncertainty inside entry i
       }
       return res;
     }
+    /*! 
+     * \brief debug function, validate whether the summary 
+     *  run consistency check to check if it is a valid summary
+     */
+    inline void CheckValid(void) const {
+      for (RType i = 0; i < size; ++i) {
+        data[i].CheckValid();
+        if (i != 0) {
+          utils::Assert(data[i].rmin >= data[i - 1].rmin + data[i - 1].wmin, "rmin range constraint");
+          utils::Assert(data[i].rmax >= data[i - 1].rmax + data[i].wmin, "rmax range constraint");
+        }
+      }
+    }
     /*! \return maximum rank in the summary */
     inline RType MaxRank(void) const {
       return data[size - 1].rmax;
@@ -59,7 +93,7 @@ class QuantileSketch {
     inline void Clear(void) {
       size = 0;
     }
-    /*! 
+    /*!
      * \brief copy content from src
      * \param src source sketch
      */
@@ -67,67 +101,89 @@ class QuantileSketch {
       size = src.size;
       std::memcpy(data, src.data, sizeof(Entry) * size);
     }
-    /*! 
+    /*!
      * \brief set current summary to be pruned summary of src
      *        assume data field is already allocated to be at least maxsize
      * \param src source summary
      * \param maxsize size we can afford in the pruned sketch
      */
     inline void SetPrune(const Summary &src, RType maxsize) {
+      if (src.size <= maxsize) {
+        this->CopyFrom(src); return;
+      }
       const RType max_rank = src.MaxRank();
-      this->size = maxsize;
+      const size_t n = maxsize - 1;
       data[0] = src.data[0];
-      RType n = maxsize - 1;
-      RType top = 1;
-      for (RType i = 1; i < n; ++i) {
-        RType k = (i * max_rank) / n;
-        while (k > src.data[top + 1].rmax) ++top;
-        // assert src.data[top].rmin <= k
-        // because k > src.data[top].rmax >= src.data[top].rmin
-        if ((k - src.data[top].rmin) < (src.data[top+1].rmax - k)) {
-          data[i] = src.data[top];
+      this->size = 1;
+      // lastidx is used to avoid duplicated records
+      size_t i = 0, lastidx = 0;
+      for (RType k = 1; k < n; ++k) {
+        RType d2 = (k * max_rank) / n * 2;
+        // find first i such that  d < (rmax[i+1] + rmin[i+1]) / 2 
+        while (i < src.size - 1 &&
+               d2 < src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
+        if (i == src.size - 1) break;
+        if (d2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
+          if (i != lastidx) {
+            data[size++] = src.data[i]; lastidx = i;
+          }
         } else {
-          data[i] = src.data[top + 1];
+          if (i + 1 != lastidx) {
+            data[size++] = src.data[i + 1]; lastidx = i + 1;
+          }
         }
       }
-      data[n] = src.data[src.size - 1];
+      if (lastidx != src.size - 1) {
+        data[size++] = src.data[src.size - 1];
+      }
     }
     inline void SetCombine(const Summary &sa,
                            const Summary &sb) {
       utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); 
       const Entry *a = sa.data, *a_end = sa.data + sa.size;
       const Entry *b = sb.data, *b_end = sb.data + sb.size;
-      this->size = sa.size + sb.size;
+      // extended rmin value
       RType aprev_rmin = 0, bprev_rmin = 0;
       Entry *dst = this->data;
       while (a != a_end && b != b_end) {
-        if (a->value < b->value) {
+        // duplicated value entry
+        if (a->value == b->value) {
+          *dst = Entry(a->rmin + b->rmin,
+                       a->rmax + b->rmax,
+                       a->wmin + b->wmin, a->value);
+          aprev_rmin = a->rmin_next();
+          bprev_rmin = b->rmin_next();
+          ++dst; ++a; ++b;
+        } else if (a->value < b->value) {
           *dst = Entry(bprev_rmin + a->rmin,
-                       a->rmax + b->rmax - 1, a->value);
-          aprev_rmin = a->rmin;
+                       a->rmax + b->rmax_prev(),
+                       a->wmin, a->value);
+          aprev_rmin = a->rmin_next();
           ++dst; ++a;
         } else {
-          *dst = Entry(aprev_rmin + b->rmin, 
-                       b->rmax + a->rmax - 1, b->value);
-          bprev_rmin = b->rmin;
+          *dst = Entry(aprev_rmin + b->rmin,
+                       b->rmax + a->rmax_prev(),
+                       b->wmin, b->value);
+          bprev_rmin = b->rmin_next();
           ++dst; ++b;
         }
       }
       if (a != a_end) {
-        RType bprev_rmax = (b_end - 1)->rmax;
+        RType brmax = (b_end - 1)->rmax;
         do {
-          *dst = Entry(bprev_rmin + a->rmin, bprev_rmax + a->rmax, a->value);
+          *dst = Entry(bprev_rmin + a->rmin, brmax + a->rmax, a->wmin, a->value);
           ++dst; ++a;
         } while (a != a_end);
       }
       if (b != b_end) {
-        RType aprev_rmax = (a_end - 1)->rmax;
+        RType armax = (a_end - 1)->rmax;
         do {
-          *dst = Entry(aprev_rmin + b->rmin, aprev_rmax + b->rmax, b->value);
+          *dst = Entry(aprev_rmin + b->rmin, armax + b->rmax, b->wmin, b->value);
           ++dst; ++b;
         } while (b != b_end);
       }
-      utils::Assert(dst == data + size, "bug in combine");
+      this->size = dst - data;
+      utils::Assert(size <= sa.size + sb.size, "bug in combine");
     }
   };
   // same as summary, but use STL to backup the space
@@ -135,8 +191,10 @@ class QuantileSketch {
     std::vector<Entry> space;
     /*! \brief reserve space for summary */
     inline void Reserve(size_t size) {
-      space.resize(size);
-      this->data = BeginPtr(space);
+      if (size > space.size()) {
+        space.resize(size);
+        this->data = BeginPtr(space);
+      }
     }
     /*! 
      * \brief set the space to be merge of all Summary arrays
@@ -169,107 +227,144 @@ class QuantileSketch {
    * \param eps accuracy level of summary
    */
   inline void Init(RType maxn, double eps) {
-    eps  = eps * 0.5;
-    size_t L = 0;
+    nlevel = 0;
     size_t b = std::max(floor(log2(eps * maxn) / eps), 8.0);
-    // check for lower 
+    // check for small n case
     while (b < maxn) {
-      L = ceil(log2(maxn / b)) + 1;
-      if (L < eps * b) break;
+      nlevel = ceil(log2(maxn / b)) + 1;
+      if (nlevel < eps * b) break;
       ++b;
     }
-    L += 1;
-    inqueue.resize(b);
+    nlevel += 1;
     level_batch = (b + 1) / 2 + 1;
-    temp.Reserve(level_batch * 2);
-    data.resize(level_batch * L);
-    for (size_t l = 0; l < L; ++l) {
-      Summary s; s.size = 0;
-      s.data = BeginPtr(data) + l * level_batch;
-      level.push_back(s);
-    }
+    // lazy reserve the space, if there is only one value, no need to allocate space
+    inqueue.resize(1);
+    data.resize(0);
+    level.resize(0);
     qtail = 0;
   }
   /*! 
    * \brief add an element to a sketch 
    * \param x the elemented added to the sketch
    */
-  inline void Add(DType x) {
-    inqueue[qtail++] = x;
+  inline void Add(DType x, RType w = 1) {
     if (qtail == inqueue.size()) {
-      // start update sketch
-      std::sort(inqueue.begin(), inqueue.end());
-      for (size_t i = 0; i < qtail; ++i) {
-        temp.data[i] = Entry(i + 1, i + 1, inqueue[i]);
+      // jump from lazy one value to level_batch * 2
+      if (inqueue.size() == 1) {
+        inqueue.resize(level_batch * 2);
+      } else {
+        temp.Reserve(2 * level_batch);
+        this->Queue2Summary(&temp);
+        for (size_t l = 1; true; ++l) {
+          this->InitLevel(std::max(l + 1, nlevel));
+          // check if level l is empty
+          if (level[l].size == 0) {
+            level[l].SetPrune(temp, level_batch); break;            
+          } else {
+            // level 0 is actually temp space
+            level[0].SetPrune(temp, level_batch);
+            temp.SetCombine(level[0], level[l]);
+            if (temp.size > level_batch) {
+              // try next level
+              level[l].size = 0;
+            } else {
+              // if merged record is still smaller, no need to send to next level
+              level[l].CopyFrom(temp); break;
+            }
+          }
+        }
       }
-      temp.size = static_cast<RType>(qtail);
-      // clean up queue
-      qtail = 0;
+    }
+    if (qtail == 0 || inqueue[qtail - 1].value != x) {
+      inqueue[qtail++] = QEntry(x, w); 
+    } else {
+      inqueue[qtail - 1].weight += w;
+    }
+  } 
+  /*! \brief get the summary after finalize */
+  inline void GetSummary(SummaryContainer *out) {
+    if (level.size() != 0) {
+      out->Reserve(level_batch * 2);
+    }
+    this->Queue2Summary(out);
+    if (level.size() != 0) {
+      level[0].SetPrune(*out, level_batch);
       for (size_t l = 1; l < level.size(); ++l) {
-        // check if level l is empty
-        if (level[l].size == 0) {
-          level[l].SetPrune(temp, level_batch);
-          return;
+        if (level[l].size == 0) continue;
+        if (level[0].size == 0) {
+          level[0].CopyFrom(level[l]);
         } else {
-          // level 0 is actually temp space
-          level[0].SetPrune(temp, level_batch);
-          temp.SetCombine(level[0], level[l]);
-          level[l].size = 0;
+          out->SetCombine(level[0], level[l]);
+          level[0].SetPrune(*out, level_batch);
         }
       }
-      utils::Error("adding more element than allowed");
+      out->CopyFrom(level[0]);
     }
   }
-  /*! 
-   * \brief finalize the result after all data has been passed 
-   *        copy the final result to level 0
-   *        this can only be called once
-   */
-  inline void Finalize(void) {
+  
+ private:
+  // initialize level space to at least nlevel
+  inline void InitLevel(size_t nlevel) {
+    if (level.size() >= nlevel) return;
+    data.resize(level_batch * nlevel);
+    level.resize(nlevel, Summary());
+    for (size_t l = 0; l < level.size(); ++l) {
+      level[l].data = BeginPtr(data) + l * level_batch;
+    }
+  }
+  inline void Queue2Summary(SummaryContainer *temp) {
+    // reserve temp space
+    temp->Reserve(inqueue.size());
+    temp->size = 0;
     // start update sketch
     std::sort(inqueue.begin(), inqueue.begin() + qtail);
-    for (size_t i = 0; i < qtail; ++i) {
-      temp.data[i] = Entry(i + 1, i + 1, inqueue[i]);
-    }
-    temp.size = static_cast<RType>(qtail);
-    if (temp.size < level_batch) {
-      level[0].CopyFrom(temp);
-    } else {
-      level[0].SetPrune(temp, level_batch);
-    }
-    // start adding other things in
-    for (size_t l = 1; l < level.size(); ++l) {
-      if (level[l].size == 0) continue;
-      if (level[0].size == 0) {
-        level[0].CopyFrom(level[l]);
-      } else {
-        temp.SetCombine(level[0], level[l]);
-        level[0].SetPrune(temp, level_batch);        
+    RType wsum = 0;
+    // construct data with unique weights
+    for (size_t i = 0; i < qtail;) {
+      size_t j = 1;
+      RType w = inqueue[i].weight;
+      while (j < qtail && inqueue[j].value == inqueue[i].value) {
+        w += inqueue[j].weight; ++j;
       }
-      level[l].size = 0;
+      temp->data[temp->size++] = Entry(wsum, wsum + w, w, inqueue[i].value);
+      wsum += w; i = j;
     }
+    // clean up queue
+    qtail = 0;
   }
-  /*! \brief get the summary after finalize */
-  inline Summary GetSummary(void) const {
-    return level[0];
-  }  
-  
- private:  
+  // entry in the queue
+  struct QEntry {
+    // value of the instance
+    DType value;
+    // weight of instance
+    RType weight;
+    // default constructor
+    QEntry(void) {}
+    // constructor
+    QEntry(DType value, RType weight) 
+        : value(value), weight(weight) {}
+    // comparator on value
+    inline bool operator<(const QEntry &b) const {
+      return value < b.value;
+    }
+  };
   // the input queue
-  std::vector<DType> inqueue;
+  std::vector<QEntry> inqueue;
   // end of the queue
   size_t qtail;
+  // number of levels
+  size_t nlevel;
   // size of summary in each level
   size_t level_batch;
+  // the level of each summaries
+  std::vector<Summary> level;
   // content of the summary
   std::vector<Entry> data;
-  // different level of summary
-  std::vector<Summary> level;  
   // temporal summary, used for temp-merge
   SummaryContainer temp;  
 };
 
-/*! 
+/*!
  * \brief a helper class to compute streaming quantile
  * \tparam DType type of data content
  * \tparam RType type of rank
diff --git a/test/Makefile b/test/Makefile
index 9f145085ee9a..5057619ec890 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -11,13 +11,14 @@ else
 endif
 
 # specify tensor path
-BIN = test_group_data
+BIN = test_group_data test_quantile
 
 .PHONY: clean all
 
 all: $(BIN) $(MPIBIN)
 
-test_group_data: test_group_data.cpp
+test_group_data: test_group_data.cpp ../src/utils/*.h
+test_quantile: test_quantile.cpp ../src/utils/*.h
 
 $(BIN) : 
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)

From 5561dd9cb0954e8d8799abb279c03119c910d4f8 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 9 Nov 2014 21:09:07 -0800
Subject: [PATCH 053/166] fix bug in queue2summary

---
 src/utils/quantile.h | 73 ++++++++++++++++++++++----------------------
 1 file changed, 36 insertions(+), 37 deletions(-)

diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index b46a89d7d1d8..aa5edb2f66ce 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -59,8 +59,7 @@ class WQuantileSketch {
     /*! \brief number of elements in the summary */
     size_t size;
     // constructor
-    Summary(void) : size(0) {
-    }
+    Summary(void) : size(0) {}
     /*! 
      * \brief the maximum error of the Summary
      */
@@ -118,12 +117,12 @@ class WQuantileSketch {
       // lastidx is used to avoid duplicated records
       size_t i = 0, lastidx = 0;
       for (RType k = 1; k < n; ++k) {
-        RType d2 = (k * max_rank) / n * 2;
+        RType dx2 = (k * max_rank) / n * 2;
         // find first i such that  d < (rmax[i+1] + rmin[i+1]) / 2 
         while (i < src.size - 1 &&
-               d2 < src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
+               dx2 < src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
         if (i == src.size - 1) break;
-        if (d2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
+        if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
           if (i != lastidx) {
             data[size++] = src.data[i]; lastidx = i;
           }
@@ -155,13 +154,13 @@ class WQuantileSketch {
           bprev_rmin = b->rmin_next();
           ++dst; ++a; ++b;
         } else if (a->value < b->value) {
-          *dst = Entry(bprev_rmin + a->rmin,
+          *dst = Entry(a->rmin + bprev_rmin,
                        a->rmax + b->rmax_prev(),
                        a->wmin, a->value);
           aprev_rmin = a->rmin_next();
           ++dst; ++a;
         } else {
-          *dst = Entry(aprev_rmin + b->rmin,
+          *dst = Entry(b->rmin + aprev_rmin,
                        b->rmax + a->rmax_prev(),
                        b->wmin, b->value);
           bprev_rmin = b->rmin_next();
@@ -171,14 +170,14 @@ class WQuantileSketch {
       if (a != a_end) {
         RType brmax = (b_end - 1)->rmax;
         do {
-          *dst = Entry(bprev_rmin + a->rmin, brmax + a->rmax, a->wmin, a->value);
+          *dst = Entry(a->rmin + bprev_rmin, a->rmax + brmax, a->wmin, a->value);
           ++dst; ++a;
         } while (a != a_end);
       }
       if (b != b_end) {
         RType armax = (a_end - 1)->rmax;
         do {
-          *dst = Entry(aprev_rmin + b->rmin, armax + b->rmax, b->wmin, b->value);
+          *dst = Entry(b->rmin + aprev_rmin, b->rmax + armax, b->wmin, b->value);
           ++dst; ++b;
         } while (b != b_end);
       }
@@ -236,12 +235,12 @@ class WQuantileSketch {
       ++b;
     }
     nlevel += 1;
-    level_batch = (b + 1) / 2 + 1;
+    limit_size = (b + 1) / 2 + 1;
     // lazy reserve the space, if there is only one value, no need to allocate space
     inqueue.resize(1);
-    data.resize(0);
-    level.resize(0);
     qtail = 0;
+    data.clear();
+    level.clear();
   }
   /*! 
    * \brief add an element to a sketch 
@@ -249,22 +248,24 @@ class WQuantileSketch {
    */
   inline void Add(DType x, RType w = 1) {
     if (qtail == inqueue.size()) {
-      // jump from lazy one value to level_batch * 2
+      // jump from lazy one value to limit_size * 2
       if (inqueue.size() == 1) {
-        inqueue.resize(level_batch * 2);
+        inqueue.resize(limit_size * 2);
       } else {
-        temp.Reserve(2 * level_batch);
+        temp.Reserve(limit_size * 2);
         this->Queue2Summary(&temp);
+        // cleanup queue
+        qtail = 0;
         for (size_t l = 1; true; ++l) {
           this->InitLevel(std::max(l + 1, nlevel));
           // check if level l is empty
           if (level[l].size == 0) {
-            level[l].SetPrune(temp, level_batch); break;            
+            level[l].SetPrune(temp, limit_size); break;            
           } else {
             // level 0 is actually temp space
-            level[0].SetPrune(temp, level_batch);
+            level[0].SetPrune(temp, limit_size);
             temp.SetCombine(level[0], level[l]);
-            if (temp.size > level_batch) {
+            if (temp.size > limit_size) {
               // try next level
               level[l].size = 0;
             } else {
@@ -284,18 +285,18 @@ class WQuantileSketch {
   /*! \brief get the summary after finalize */
   inline void GetSummary(SummaryContainer *out) {
     if (level.size() != 0) {
-      out->Reserve(level_batch * 2);
+      out->Reserve(limit_size * 2);
     }
     this->Queue2Summary(out);
     if (level.size() != 0) {
-      level[0].SetPrune(*out, level_batch);
+      level[0].SetPrune(*out, limit_size);
       for (size_t l = 1; l < level.size(); ++l) {
         if (level[l].size == 0) continue;
         if (level[0].size == 0) {
           level[0].CopyFrom(level[l]);
         } else {
           out->SetCombine(level[0], level[l]);
-          level[0].SetPrune(*out, level_batch);
+          level[0].SetPrune(*out, limit_size);
         }
       }
       out->CopyFrom(level[0]);
@@ -306,10 +307,10 @@ class WQuantileSketch {
   // initialize level space to at least nlevel
   inline void InitLevel(size_t nlevel) {
     if (level.size() >= nlevel) return;
-    data.resize(level_batch * nlevel);
+    data.resize(limit_size * nlevel);
     level.resize(nlevel, Summary());
     for (size_t l = 0; l < level.size(); ++l) {
-      level[l].data = BeginPtr(data) + l * level_batch;
+      level[l].data = BeginPtr(data) + l * limit_size;
     }
   }
   inline void Queue2Summary(SummaryContainer *temp) {
@@ -321,7 +322,7 @@ class WQuantileSketch {
     RType wsum = 0;
     // construct data with unique weights
     for (size_t i = 0; i < qtail;) {
-      size_t j = 1;
+      size_t j = i + 1;
       RType w = inqueue[i].weight;
       while (j < qtail && inqueue[j].value == inqueue[i].value) {
         w += inqueue[j].weight; ++j;
@@ -329,8 +330,6 @@ class WQuantileSketch {
       temp->data[temp->size++] = Entry(wsum, wsum + w, w, inqueue[i].value);
       wsum += w; i = j;
     }
-    // clean up queue
-    qtail = 0;
   }
   // entry in the queue
   struct QEntry {
@@ -355,7 +354,7 @@ class WQuantileSketch {
   // number of levels
   size_t nlevel;
   // size of summary in each level
-  size_t level_batch;
+  size_t limit_size;
   // the level of each summaries
   std::vector<Summary> level;
   // content of the summary
@@ -531,12 +530,12 @@ class GKQuantileSketch {
     }
     L += 1;
     inqueue.resize(b);
-    level_batch = (b + 1) / 2 + 1;
-    temp.Reserve(level_batch * 2);
-    data.resize(level_batch * L);
+    limit_size = (b + 1) / 2 + 1;
+    temp.Reserve(limit_size * 2);
+    data.resize(limit_size * L);
     for (size_t l = 0; l < L; ++l) {
       Summary s; s.size = 0;
-      s.data = BeginPtr(data) + l * level_batch;
+      s.data = BeginPtr(data) + l * limit_size;
       level.push_back(s);
     }
     qtail = 0;
@@ -559,11 +558,11 @@ class GKQuantileSketch {
       for (size_t l = 1; l < level.size(); ++l) {
         // check if level l is empty
         if (level[l].size == 0) {
-          level[l].SetPrune(temp, level_batch);
+          level[l].SetPrune(temp, limit_size);
           return;
         } else {
           // level 0 is actually temp space
-          level[0].SetPrune(temp, level_batch);
+          level[0].SetPrune(temp, limit_size);
           temp.SetCombine(level[0], level[l]);
           level[l].size = 0;
         }
@@ -583,10 +582,10 @@ class GKQuantileSketch {
       temp.data[i] = Entry(i + 1, i + 1, inqueue[i]);
     }
     temp.size = static_cast<RType>(qtail);
-    if (temp.size < level_batch) {
+    if (temp.size < limit_size) {
       level[0].CopyFrom(temp);
     } else {
-      level[0].SetPrune(temp, level_batch);
+      level[0].SetPrune(temp, limit_size);
     }
     // start adding other things in
     for (size_t l = 1; l < level.size(); ++l) {
@@ -595,7 +594,7 @@ class GKQuantileSketch {
         level[0].CopyFrom(level[l]);
       } else {
         temp.SetCombine(level[0], level[l]);
-        level[0].SetPrune(temp, level_batch);        
+        level[0].SetPrune(temp, limit_size);        
       }
       level[l].size = 0;
     }
@@ -611,7 +610,7 @@ class GKQuantileSketch {
   // end of the queue
   size_t qtail;
   // size of summary in each level
-  size_t level_batch;
+  size_t limit_size;
   // content of the summary
   std::vector<Entry> data;
   // different level of summary

From 69874dc57185d795a7d4abb8f2740fd4fa04c9ea Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 9 Nov 2014 21:56:56 -0800
Subject: [PATCH 054/166] init check

---
 src/utils/quantile.h | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index aa5edb2f66ce..dffe99cb66e4 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -222,20 +222,15 @@ class WQuantileSketch {
   };
   /*! 
    * \brief intialize the quantile sketch, given the performance specification
-   * \param maxn maximum number of data points can be encountered
+   * \param maxn maximum number of data points can be feed into sketch
    * \param eps accuracy level of summary
    */
-  inline void Init(RType maxn, double eps) {
-    nlevel = 0;
-    size_t b = std::max(floor(log2(eps * maxn) / eps), 8.0);
-    // check for small n case
-    while (b < maxn) {
-      nlevel = ceil(log2(maxn / b)) + 1;
-      if (nlevel < eps * b) break;
-      ++b;
-    }
-    nlevel += 1;
-    limit_size = (b + 1) / 2 + 1;
+  inline void Init(size_t maxn, double eps) {
+    nlevel = std::max(ceil(maxn * eps), 1.0);
+    limit_size = ceil(nlevel / eps);
+    // check invariant
+    utils::Assert((1 << nlevel)  * limit_size > maxn, "invalid init parameter");
+    utils::Assert(nlevel <= limit_size * eps, "invalid init parameter");
     // lazy reserve the space, if there is only one value, no need to allocate space
     inqueue.resize(1);
     qtail = 0;
@@ -281,7 +276,7 @@ class WQuantileSketch {
     } else {
       inqueue[qtail - 1].weight += w;
     }
-  } 
+  }
   /*! \brief get the summary after finalize */
   inline void GetSummary(SummaryContainer *out) {
     if (level.size() != 0) {

From d4c4ee0b014c58e06123d6d5abca8389cb9fe149 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 9 Nov 2014 23:34:45 -0800
Subject: [PATCH 055/166] mostly correct\n

---
 src/utils/quantile.h | 93 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 70 insertions(+), 23 deletions(-)

diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index dffe99cb66e4..5b536bd00ec7 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -32,14 +32,27 @@ class WQuantileSketch {
     /*! \brief the value of data */
     DType value;
     // constructor
-    Entry(void) {}
+    Entry(void) {
+      rmin = rmax = wmin = 0.0;
+    }
     // constructor
     Entry(RType rmin, RType rmax, RType wmin, DType value)
-        : rmin(rmin), rmax(rmax), wmin(wmin), value(value) {}
-    /*! \brief debug function,  */
-    inline void CheckValid(void) const {
+        : rmin(rmin), rmax(rmax), wmin(wmin), value(value) {
+      if (!(this->rmax - this->rmin - this->wmin > -0.1)) {
+        rmax = rmin + wmin;
+        printf("correct\n");
+        printf("rmax=%f, rmin=%f, wmin=%f, plus=%f, v=%f\n", 
+               rmax, rmin, wmin, rmin+wmin, value);
+      }
+    }
+    /*! \brief debug function,  check Valid */
+    inline void CheckValid(RType eps = 0) const {
       utils::Assert(rmin >= 0 && rmax >= 0 && wmin >= 0, "nonneg constraint");
-      utils::Assert(rmax >= rmin + wmin, "relation constraint");
+      if (!(rmax - rmin - wmin > -eps)) {
+        printf("rmax=%f, rmin=%f, wmin=%f, plus=%f, v=%f\n", 
+               rmax, rmin, wmin, rmin+wmin, value);
+      }
+      utils::Assert(rmax- rmin - wmin > -eps  , "relation constraint: min/max");
     }
     /*! \return rmin estimation for v strictly bigger than value */
     inline RType rmin_next(void) const {
@@ -65,8 +78,8 @@ class WQuantileSketch {
      */
     inline RType MaxError(void) const {
       RType res = data[0].rmax - data[0].rmin - data[0].wmin;
-      for (RType i = 1; i < size; ++i) {
-        res = std::max(data[i].rmax_prev() - data[i - 1].rmax_next(), res);
+      for (size_t i = 1; i < size; ++i) {
+        res = std::max(data[i].rmax_prev() - data[i - 1].rmin_next(), res);
         res = std::max(data[i].rmax - data[i].rmin - data[i].wmin, res);
       }
       return res;
@@ -74,10 +87,12 @@ class WQuantileSketch {
     /*! 
      * \brief debug function, validate whether the summary 
      *  run consistency check to check if it is a valid summary
+     * \param eps the tolerate error level, used when RType is floating point and 
+     *        some inconsistency could occur due to rounding error
      */
-    inline void CheckValid(void) const {
-      for (RType i = 0; i < size; ++i) {
-        data[i].CheckValid();
+    inline void CheckValid(RType eps) const {
+      for (size_t i = 0; i < size; ++i) {
+        data[i].CheckValid(eps);
         if (i != 0) {
           utils::Assert(data[i].rmin >= data[i - 1].rmin + data[i - 1].wmin, "rmin range constraint");
           utils::Assert(data[i].rmax >= data[i - 1].rmax + data[i].wmin, "rmax range constraint");
@@ -92,6 +107,16 @@ class WQuantileSketch {
     inline void Clear(void) {
       size = 0;
     }
+    /*! \brief used for debug purpose, print the summary */
+    inline void Print(void) const {
+      for (size_t i = 0; i < size; ++i) {
+        printf("x=%f\t[%f, %f] wmin=%f\n", 
+               data[i].value, 
+               data[i].rmin,
+               data[i].rmax,
+               data[i].wmin);
+      }
+    }
     /*!
      * \brief copy content from src
      * \param src source sketch
@@ -117,10 +142,10 @@ class WQuantileSketch {
       // lastidx is used to avoid duplicated records
       size_t i = 0, lastidx = 0;
       for (RType k = 1; k < n; ++k) {
-        RType dx2 = (k * max_rank) / n * 2;
+        RType dx2 =  (2 * k * max_rank) / n;
         // find first i such that  d < (rmax[i+1] + rmin[i+1]) / 2 
         while (i < src.size - 1 &&
-               dx2 < src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
+               dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
         if (i == src.size - 1) break;
         if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
           if (i != lastidx) {
@@ -136,6 +161,11 @@ class WQuantileSketch {
         data[size++] = src.data[src.size - 1];
       }
     }
+    /*! 
+     * \brief set current summary to be merged summary of sa and sb
+     * \param sa first input summary to be merged
+     * \param sb second input summar to be merged
+     */
     inline void SetCombine(const Summary &sa,
                            const Summary &sb) {
       utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); 
@@ -226,10 +256,15 @@ class WQuantileSketch {
    * \param eps accuracy level of summary
    */
   inline void Init(size_t maxn, double eps) {
-    nlevel = std::max(ceil(maxn * eps), 1.0);
-    limit_size = ceil(nlevel / eps);
+    //nlevel = std::max(log2(ceil(maxn * eps)) - 2.0, 1.0);
+    nlevel = 1;
+    while (true) {
+      limit_size = ceil(nlevel / eps) + 1;
+      if ((1 << nlevel)  * limit_size >= maxn) break;
+      ++nlevel;
+    }
     // check invariant
-    utils::Assert((1 << nlevel)  * limit_size > maxn, "invalid init parameter");
+    utils::Assert((1 << nlevel) * limit_size >= maxn, "invalid init parameter");
     utils::Assert(nlevel <= limit_size * eps, "invalid init parameter");
     // lazy reserve the space, if there is only one value, no need to allocate space
     inqueue.resize(1);
@@ -241,7 +276,7 @@ class WQuantileSketch {
    * \brief add an element to a sketch 
    * \param x the elemented added to the sketch
    */
-  inline void Add(DType x, RType w = 1) {
+  inline void Push(DType x, RType w = 1) {
     if (qtail == inqueue.size()) {
       // jump from lazy one value to limit_size * 2
       if (inqueue.size() == 1) {
@@ -252,10 +287,11 @@ class WQuantileSketch {
         // cleanup queue
         qtail = 0;
         for (size_t l = 1; true; ++l) {
-          this->InitLevel(std::max(l + 1, nlevel));
+          this->InitLevel(l + 1);
           // check if level l is empty
           if (level[l].size == 0) {
-            level[l].SetPrune(temp, limit_size); break;            
+            level[l].SetPrune(temp, limit_size); 
+            break;            
           } else {
             // level 0 is actually temp space
             level[0].SetPrune(temp, limit_size);
@@ -295,10 +331,21 @@ class WQuantileSketch {
         }
       }
       out->CopyFrom(level[0]);
+    } else {
+      if (out->size > limit_size) {
+        temp.Reserve(limit_size);
+        temp.SetPrune(*out, limit_size);
+        out->CopyFrom(temp);
+      }
     }
   }
+  // used for debug, check if the sketch is valid
+  inline void CheckValid(RType eps) const {
+    for (size_t l = 1; l < level.size(); ++l) {
+      level[l].CheckValid(eps);
+    }    
+  }
   
- private:
   // initialize level space to at least nlevel
   inline void InitLevel(size_t nlevel) {
     if (level.size() >= nlevel) return;
@@ -355,7 +402,7 @@ class WQuantileSketch {
   // content of the summary
   std::vector<Entry> data;
   // temporal summary, used for temp-merge
-  SummaryContainer temp;  
+  SummaryContainer temp;
 };
 
 /*!
@@ -391,7 +438,7 @@ class GKQuantileSketch {
     /*! \brief the maximum error of the summary */
     inline RType MaxError(void) const {
       RType res = 0;
-      for (RType i = 1; i < size; ++i) {
+      for (size_t i = 1; i < size; ++i) {
         res = std::max(data[i].rmax - data[i-1].rmin, res);
       }
       return res;
@@ -424,7 +471,7 @@ class GKQuantileSketch {
       data[0] = src.data[0];
       RType n = maxsize - 1;
       RType top = 1;
-      for (RType i = 1; i < n; ++i) {
+      for (size_t i = 1; i < n; ++i) {
         RType k = (i * max_rank) / n;
         while (k > src.data[top + 1].rmax) ++top;
         // assert src.data[top].rmin <= k
@@ -539,7 +586,7 @@ class GKQuantileSketch {
    * \brief add an element to a sketch 
    * \param x the elemented added to the sketch
    */
-  inline void Add(DType x) {
+  inline void Push(DType x) {
     inqueue[qtail++] = x;
     if (qtail == inqueue.size()) {
       // start update sketch

From 7b8ba268dcf129ec51f0cf1b19e9fdca324843a9 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 10 Nov 2014 16:44:07 -0800
Subject: [PATCH 056/166] commit in quantile test

---
 src/utils/quantile.h   | 452 +++++++++++++++++++++--------------------
 test/Makefile          |   2 +-
 test/mkquantest.py     |  35 ++++
 test/test_quantile.cpp |  23 +++
 4 files changed, 296 insertions(+), 216 deletions(-)
 create mode 100755 test/mkquantest.py
 create mode 100644 test/test_quantile.cpp

diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index 5b536bd00ec7..0fadd9c35865 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -13,14 +13,14 @@
 
 namespace xgboost {
 namespace utils {
-/*! 
- * \brief a helper class to compute streaming quantile
+
+/*!
+ * \brief experimental wsummary
  * \tparam DType type of data content
  * \tparam RType type of rank
  */
-template<typename DType, typename RType=unsigned>
-class WQuantileSketch {
- public:
+template<typename DType, typename RType>
+struct WQSummary {
   /*! \brief an entry in the sketch summary */
   struct Entry {
     /*! \brief minimum rank */
@@ -32,27 +32,17 @@ class WQuantileSketch {
     /*! \brief the value of data */
     DType value;
     // constructor
-    Entry(void) {
-      rmin = rmax = wmin = 0.0;
-    }
+    Entry(void) {}
     // constructor
     Entry(RType rmin, RType rmax, RType wmin, DType value)
-        : rmin(rmin), rmax(rmax), wmin(wmin), value(value) {
-      if (!(this->rmax - this->rmin - this->wmin > -0.1)) {
-        rmax = rmin + wmin;
-        printf("correct\n");
-        printf("rmax=%f, rmin=%f, wmin=%f, plus=%f, v=%f\n", 
-               rmax, rmin, wmin, rmin+wmin, value);
-      }
-    }
-    /*! \brief debug function,  check Valid */
+        : rmin(rmin), rmax(rmax), wmin(wmin), value(value) {}
+    /*! 
+     * \brief debug function,  check Valid 
+     * \param eps the tolerate level for violating the relation
+     */
     inline void CheckValid(RType eps = 0) const {
       utils::Assert(rmin >= 0 && rmax >= 0 && wmin >= 0, "nonneg constraint");
-      if (!(rmax - rmin - wmin > -eps)) {
-        printf("rmax=%f, rmin=%f, wmin=%f, plus=%f, v=%f\n", 
-               rmax, rmin, wmin, rmin+wmin, value);
-      }
-      utils::Assert(rmax- rmin - wmin > -eps  , "relation constraint: min/max");
+      utils::Assert(rmax- rmin - wmin > -eps, "relation constraint: min/max");
     }
     /*! \return rmin estimation for v strictly bigger than value */
     inline RType rmin_next(void) const {
@@ -63,161 +53,222 @@ class WQuantileSketch {
       return rmax - wmin;
     }
   };
-  /*! 
-   * \brief this is data structure presenting one summary
-   */
-  struct Summary {
-    /*! \brief data field */
-    Entry *data;
-    /*! \brief number of elements in the summary */
-    size_t size;
-    // constructor
-    Summary(void) : size(0) {}
-    /*! 
-     * \brief the maximum error of the Summary
-     */
-    inline RType MaxError(void) const {
-      RType res = data[0].rmax - data[0].rmin - data[0].wmin;
-      for (size_t i = 1; i < size; ++i) {
-        res = std::max(data[i].rmax_prev() - data[i - 1].rmin_next(), res);
-        res = std::max(data[i].rmax - data[i].rmin - data[i].wmin, res);
+  /*! \brief input data queue before entering the summary */
+  struct Queue {
+    // entry in the queue
+    struct QEntry {
+      // value of the instance
+      DType value;
+      // weight of instance
+      RType weight;
+      // default constructor
+      QEntry(void) {}
+      // constructor
+      QEntry(DType value, RType weight) 
+          : value(value), weight(weight) {}
+      // comparator on value
+      inline bool operator<(const QEntry &b) const {
+        return value < b.value;
       }
-      return res;
-    }
-    /*! 
-     * \brief debug function, validate whether the summary 
-     *  run consistency check to check if it is a valid summary
-     * \param eps the tolerate error level, used when RType is floating point and 
-     *        some inconsistency could occur due to rounding error
-     */
-    inline void CheckValid(RType eps) const {
-      for (size_t i = 0; i < size; ++i) {
-        data[i].CheckValid(eps);
-        if (i != 0) {
-          utils::Assert(data[i].rmin >= data[i - 1].rmin + data[i - 1].wmin, "rmin range constraint");
-          utils::Assert(data[i].rmax >= data[i - 1].rmax + data[i].wmin, "rmax range constraint");
+    };
+    // the input queue
+    std::vector<QEntry> queue;
+    // end of the queue
+    size_t qtail;
+    // push data to the queue
+    inline void Push(DType x, RType w) {
+      if (qtail == 0 || queue[qtail - 1].value != x) {
+        queue[qtail++] = QEntry(x, w);
+      } else {
+        queue[qtail - 1].weight += w;
+      }
+    }   
+    inline void MakeSummary(WQSummary *out) {
+      std::sort(queue.begin(), queue.begin() + qtail);
+      out->size = 0;
+      // start update sketch      
+      RType wsum = 0;
+      // construct data with unique weights
+      for (size_t i = 0; i < qtail;) {
+        size_t j = i + 1;
+        RType w = queue[i].weight;
+        while (j < qtail && queue[j].value == queue[i].value) {
+          w += queue[j].weight; ++j;
         }
+        out->data[out->size++] = Entry(wsum, wsum + w, w, queue[i].value);
+        wsum += w; i = j;
       }
     }
-    /*! \return maximum rank in the summary */
-    inline RType MaxRank(void) const {
-      return data[size - 1].rmax;
-    }
-    /*! \brief set size to 0 */
-    inline void Clear(void) {
-      size = 0;
+  };
+
+  /*! \brief data field */
+  Entry *data;
+  /*! \brief number of elements in the summary */
+  size_t size;
+  // constructor
+  WQSummary(Entry *data, size_t size) 
+      : data(data), size(size) {}
+  /*!
+   * \return the maximum error of the Summary
+   */
+  inline RType MaxError(void) const {
+    RType res = data[0].rmax - data[0].rmin - data[0].wmin;
+    for (size_t i = 1; i < size; ++i) {
+      res = std::max(data[i].rmax_prev() - data[i - 1].rmin_next(), res);
+      res = std::max(data[i].rmax - data[i].rmin - data[i].wmin, res);
     }
-    /*! \brief used for debug purpose, print the summary */
-    inline void Print(void) const {
-      for (size_t i = 0; i < size; ++i) {
-        printf("x=%f\t[%f, %f] wmin=%f\n", 
-               data[i].value, 
-               data[i].rmin,
-               data[i].rmax,
-               data[i].wmin);
+    return res;
+  }
+  /*! \return maximum rank in the summary */
+  inline RType MaxRank(void) const {
+    return data[size - 1].rmax;
+  }
+  /*! \brief set size to 0 */
+  inline void Clear(void) {
+    size = 0;
+  }
+  /*!
+   * \brief copy content from src
+   * \param src source sketch
+   */
+  inline void CopyFrom(const WQSummary &src) {
+    size = src.size;
+    std::memcpy(data, src.data, sizeof(Entry) * size);
+  
+}  /*! 
+   * \brief debug function, validate whether the summary 
+   *  run consistency check to check if it is a valid summary
+   * \param eps the tolerate error level, used when RType is floating point and 
+   *        some inconsistency could occur due to rounding error
+   */
+  inline void CheckValid(RType eps) const {
+    for (size_t i = 0; i < size; ++i) {
+      data[i].CheckValid(eps);
+      if (i != 0) {
+        utils::Assert(data[i].rmin >= data[i - 1].rmin + data[i - 1].wmin, "rmin range constraint");
+        utils::Assert(data[i].rmax >= data[i - 1].rmax + data[i].wmin, "rmax range constraint");
       }
     }
-    /*!
-     * \brief copy content from src
-     * \param src source sketch
-     */
-    inline void CopyFrom(const Summary &src) {
-      size = src.size;
-      std::memcpy(data, src.data, sizeof(Entry) * size);
+  }
+  /*! \brief used for debug purpose, print the summary */
+  inline void Print(void) const {
+    for (size_t i = 0; i < size; ++i) {
+      printf("x=%f\t[%f, %f] wmin=%f\n", 
+             data[i].value, data[i].rmin,
+             data[i].rmax, data[i].wmin);
     }
-    /*!
-     * \brief set current summary to be pruned summary of src
-     *        assume data field is already allocated to be at least maxsize
-     * \param src source summary
-     * \param maxsize size we can afford in the pruned sketch
-     */
-    inline void SetPrune(const Summary &src, RType maxsize) {
-      if (src.size <= maxsize) {
-        this->CopyFrom(src); return;
-      }
-      const RType max_rank = src.MaxRank();
-      const size_t n = maxsize - 1;
-      data[0] = src.data[0];
-      this->size = 1;
-      // lastidx is used to avoid duplicated records
-      size_t i = 0, lastidx = 0;
-      for (RType k = 1; k < n; ++k) {
-        RType dx2 =  (2 * k * max_rank) / n;
+  }
+  /*!
+   * \brief set current summary to be pruned summary of src
+   *        assume data field is already allocated to be at least maxsize
+   * \param src source summary
+   * \param maxsize size we can afford in the pruned sketch
+   */
+
+  inline void SetPrune(const WQSummary &src, RType maxsize) {
+    if (src.size <= maxsize) {
+      this->CopyFrom(src); return;
+    }
+    const RType max_rank = src.MaxRank();
+    const size_t n = maxsize - 1;
+    data[0] = src.data[0];
+    this->size = 1;
+    // lastidx is used to avoid duplicated records
+    size_t i = 0, lastidx = 0;
+    for (RType k = 1; k < n; ++k) {
+      RType dx2 =  (2 * k * max_rank) / n;
         // find first i such that  d < (rmax[i+1] + rmin[i+1]) / 2 
-        while (i < src.size - 1 &&
-               dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
-        if (i == src.size - 1) break;
-        if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
-          if (i != lastidx) {
-            data[size++] = src.data[i]; lastidx = i;
-          }
-        } else {
-          if (i + 1 != lastidx) {
-            data[size++] = src.data[i + 1]; lastidx = i + 1;
-          }
+      while (i < src.size - 1 &&
+             dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
+      if (i == src.size - 1) break;
+      if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
+        if (i != lastidx) {
+          data[size++] = src.data[i]; lastidx = i;
         }
-      }
-      if (lastidx != src.size - 1) {
-        data[size++] = src.data[src.size - 1];
-      }
-    }
-    /*! 
-     * \brief set current summary to be merged summary of sa and sb
-     * \param sa first input summary to be merged
-     * \param sb second input summar to be merged
-     */
-    inline void SetCombine(const Summary &sa,
-                           const Summary &sb) {
-      utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); 
-      const Entry *a = sa.data, *a_end = sa.data + sa.size;
-      const Entry *b = sb.data, *b_end = sb.data + sb.size;
-      // extended rmin value
-      RType aprev_rmin = 0, bprev_rmin = 0;
-      Entry *dst = this->data;
-      while (a != a_end && b != b_end) {
-        // duplicated value entry
-        if (a->value == b->value) {
-          *dst = Entry(a->rmin + b->rmin,
-                       a->rmax + b->rmax,
-                       a->wmin + b->wmin, a->value);
-          aprev_rmin = a->rmin_next();
-          bprev_rmin = b->rmin_next();
-          ++dst; ++a; ++b;
-        } else if (a->value < b->value) {
-          *dst = Entry(a->rmin + bprev_rmin,
-                       a->rmax + b->rmax_prev(),
-                       a->wmin, a->value);
-          aprev_rmin = a->rmin_next();
-          ++dst; ++a;
-        } else {
-          *dst = Entry(b->rmin + aprev_rmin,
-                       b->rmax + a->rmax_prev(),
-                       b->wmin, b->value);
-          bprev_rmin = b->rmin_next();
-          ++dst; ++b;
+      } else {
+        if (i + 1 != lastidx) {
+          data[size++] = src.data[i + 1]; lastidx = i + 1;
         }
       }
-      if (a != a_end) {
-        RType brmax = (b_end - 1)->rmax;
-        do {
-          *dst = Entry(a->rmin + bprev_rmin, a->rmax + brmax, a->wmin, a->value);
-          ++dst; ++a;
-        } while (a != a_end);
-      }
-      if (b != b_end) {
-        RType armax = (a_end - 1)->rmax;
-        do {
-          *dst = Entry(b->rmin + aprev_rmin, b->rmax + armax, b->wmin, b->value);
-          ++dst; ++b;
-        } while (b != b_end);
+    }
+    if (lastidx != src.size - 1) {
+      data[size++] = src.data[src.size - 1];
+    }
+  }
+  /*! 
+   * \brief set current summary to be merged summary of sa and sb
+   * \param sa first input summary to be merged
+   * \param sb second input summar to be merged
+   */
+  inline void SetCombine(const WQSummary &sa,
+                         const WQSummary &sb) {
+    utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); 
+    const Entry *a = sa.data, *a_end = sa.data + sa.size;
+    const Entry *b = sb.data, *b_end = sb.data + sb.size;
+    // extended rmin value
+    RType aprev_rmin = 0, bprev_rmin = 0;
+    Entry *dst = this->data;
+    while (a != a_end && b != b_end) {
+      // duplicated value entry
+      if (a->value == b->value) {
+        *dst = Entry(a->rmin + b->rmin,
+                     a->rmax + b->rmax,
+                     a->wmin + b->wmin, a->value);
+        aprev_rmin = a->rmin_next();
+        bprev_rmin = b->rmin_next();
+        ++dst; ++a; ++b;
+      } else if (a->value < b->value) {
+        *dst = Entry(a->rmin + bprev_rmin,
+                     a->rmax + b->rmax_prev(),
+                     a->wmin, a->value);
+        aprev_rmin = a->rmin_next();
+        ++dst; ++a;
+      } else {
+        *dst = Entry(b->rmin + aprev_rmin,
+                     b->rmax + a->rmax_prev(),
+                     b->wmin, b->value);
+        bprev_rmin = b->rmin_next();
+        ++dst; ++b;
       }
-      this->size = dst - data;
-      utils::Assert(size <= sa.size + sb.size, "bug in combine");
     }
-  };
-  // same as summary, but use STL to backup the space
+    if (a != a_end) {
+      RType brmax = (b_end - 1)->rmax;
+      do {
+        *dst = Entry(a->rmin + bprev_rmin, a->rmax + brmax, a->wmin, a->value);
+        ++dst; ++a;
+      } while (a != a_end);
+    }
+    if (b != b_end) {
+      RType armax = (a_end - 1)->rmax;
+      do {
+        *dst = Entry(b->rmin + aprev_rmin, b->rmax + armax, b->wmin, b->value);
+        ++dst; ++b;
+      } while (b != b_end);
+    }
+    this->size = dst - data;
+    utils::Assert(size <= sa.size + sb.size, "bug in combine");
+  }
+};
+
+/*!
+ * \brief template for all quantle sketch algorithm
+ *        that uses merge/prune scheme
+ * \tparam DType type of data content
+ * \tparam RType type of rank
+ * \tparam TSummary actual summary data structure it uses
+ */
+template<typename DType, typename RType, class TSummary>
+class QuantileSketchTemplate {
+ public:
+  /*! \brief type of summary type */
+  typedef TSummary Summary;
+  /*! \brief the entry type */
+  typedef typename Summary::Entry Entry;   
+  /*! \brief same as summary, but use STL to backup the space */
   struct SummaryContainer : public Summary {
     std::vector<Entry> space;
+    SummaryContainer(void) : Summary(NULL, 0) { 
+    }
     /*! \brief reserve space for summary */
     inline void Reserve(size_t size) {
       if (size > space.size()) {
@@ -267,8 +318,8 @@ class WQuantileSketch {
     utils::Assert((1 << nlevel) * limit_size >= maxn, "invalid init parameter");
     utils::Assert(nlevel <= limit_size * eps, "invalid init parameter");
     // lazy reserve the space, if there is only one value, no need to allocate space
-    inqueue.resize(1);
-    qtail = 0;
+    inqueue.queue.resize(1);
+    inqueue.qtail = 0;
     data.clear();
     level.clear();
   }
@@ -277,15 +328,15 @@ class WQuantileSketch {
    * \param x the elemented added to the sketch
    */
   inline void Push(DType x, RType w = 1) {
-    if (qtail == inqueue.size()) {
+    if (inqueue.qtail == inqueue.queue.size()) {
       // jump from lazy one value to limit_size * 2
-      if (inqueue.size() == 1) {
-        inqueue.resize(limit_size * 2);
+      if (inqueue.queue.size() == 1) {
+        inqueue.queue.resize(limit_size * 2);
       } else {
         temp.Reserve(limit_size * 2);
-        this->Queue2Summary(&temp);
+        inqueue.MakeSummary(&temp);
         // cleanup queue
-        qtail = 0;
+        inqueue.qtail = 0;
         for (size_t l = 1; true; ++l) {
           this->InitLevel(l + 1);
           // check if level l is empty
@@ -307,18 +358,16 @@ class WQuantileSketch {
         }
       }
     }
-    if (qtail == 0 || inqueue[qtail - 1].value != x) {
-      inqueue[qtail++] = QEntry(x, w); 
-    } else {
-      inqueue[qtail - 1].weight += w;
-    }
+    inqueue.Push(x, w);
   }
   /*! \brief get the summary after finalize */
   inline void GetSummary(SummaryContainer *out) {
     if (level.size() != 0) {
       out->Reserve(limit_size * 2);
+    } else {
+      out->Reserve(inqueue.queue.size());
     }
-    this->Queue2Summary(out);
+    inqueue.MakeSummary(out);
     if (level.size() != 0) {
       level[0].SetPrune(*out, limit_size);
       for (size_t l = 1; l < level.size(); ++l) {
@@ -343,56 +392,19 @@ class WQuantileSketch {
   inline void CheckValid(RType eps) const {
     for (size_t l = 1; l < level.size(); ++l) {
       level[l].CheckValid(eps);
-    }    
+    }
   }
-  
   // initialize level space to at least nlevel
   inline void InitLevel(size_t nlevel) {
     if (level.size() >= nlevel) return;
     data.resize(limit_size * nlevel);
-    level.resize(nlevel, Summary());
+    level.resize(nlevel, Summary(NULL, 0));
     for (size_t l = 0; l < level.size(); ++l) {
       level[l].data = BeginPtr(data) + l * limit_size;
     }
   }
-  inline void Queue2Summary(SummaryContainer *temp) {
-    // reserve temp space
-    temp->Reserve(inqueue.size());
-    temp->size = 0;
-    // start update sketch
-    std::sort(inqueue.begin(), inqueue.begin() + qtail);
-    RType wsum = 0;
-    // construct data with unique weights
-    for (size_t i = 0; i < qtail;) {
-      size_t j = i + 1;
-      RType w = inqueue[i].weight;
-      while (j < qtail && inqueue[j].value == inqueue[i].value) {
-        w += inqueue[j].weight; ++j;
-      }
-      temp->data[temp->size++] = Entry(wsum, wsum + w, w, inqueue[i].value);
-      wsum += w; i = j;
-    }
-  }
-  // entry in the queue
-  struct QEntry {
-    // value of the instance
-    DType value;
-    // weight of instance
-    RType weight;
-    // default constructor
-    QEntry(void) {}
-    // constructor
-    QEntry(DType value, RType weight) 
-        : value(value), weight(weight) {}
-    // comparator on value
-    inline bool operator<(const QEntry &b) const {
-      return value < b.value;
-    }
-  };
-  // the input queue
-  std::vector<QEntry> inqueue;
-  // end of the queue
-  size_t qtail;
+  // input data queue
+  typename Summary::Queue inqueue;
   // number of levels
   size_t nlevel;
   // size of summary in each level
@@ -405,6 +417,16 @@ class WQuantileSketch {
   SummaryContainer temp;
 };
 
+/*!
+ * \brief Quantiel sketch use WQSummary
+ * \tparam DType type of data content
+ * \tparam RType type of rank
+ */
+template<typename DType, typename RType=unsigned>
+class WQuantileSketch : 
+      public QuantileSketchTemplate<DType, RType, WQSummary<DType, RType> >{
+};
+
 /*!
  * \brief a helper class to compute streaming quantile
  * \tparam DType type of data content
diff --git a/test/Makefile b/test/Makefile
index 5057619ec890..6d135e3171cd 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -1,5 +1,5 @@
 export CC  = gcc
-export CXX = g++
+export CXX = clang++
 export MPICXX = mpicxx
 export LDFLAGS= -pthread -lm 
 export CFLAGS = -Wall -O3 -msse2  -Wno-unknown-pragmas -fPIC -I../src
diff --git a/test/mkquantest.py b/test/mkquantest.py
new file mode 100755
index 000000000000..709d4bb784ae
--- /dev/null
+++ b/test/mkquantest.py
@@ -0,0 +1,35 @@
+#!/usr/bin/python
+import math
+import sys
+import random
+import subprocess
+
+funcs = {
+    'seq': 'lambda n: sorted([(x,1) for x in range(1,n+1)], key = lambda x:random.random())',
+    'seqlogw': 'lambda n: sorted([(x, math.log(x)) for x in range(1,n+1)], key = lambda x:random.random())'
+}
+
+if len(sys.argv) < 3:
+    print 'Usage: python mkquantest.py <maxn> <eps> [generate-type] [ndata]|./test_quantile'
+    print 'Possible generate-types:' 
+    for k, v in funcs.items():
+        print '\t%s: %s' % (k, v)
+    exit(-1)
+random.seed(0)
+maxn = int(sys.argv[1])
+eps = float(sys.argv[2])
+if len(sys.argv) > 3:
+    method = sys.argv[3]
+    assert method in funcs, ('cannot find method %s' % method)
+else:
+    method = 'seq'
+if len(sys.argv) > 4:
+    ndata = int(sys.argv[4])
+    assert ndata <= maxn, 'ndata must be smaller than maxn'
+else:
+    ndata = maxn
+    
+fo = sys.stdout
+fo.write('%d\t%g\n' % (maxn, eps))
+for x, w in eval(funcs[method])(ndata):
+    fo.write(str(x)+'\t'+str(w)+'\n')
diff --git a/test/test_quantile.cpp b/test/test_quantile.cpp
new file mode 100644
index 000000000000..89e97689754b
--- /dev/null
+++ b/test/test_quantile.cpp
@@ -0,0 +1,23 @@
+#include <vector>
+#include <utils/quantile.h>
+using namespace xgboost;
+
+int main(int argc, char *argv[]) {  
+  utils::WQuantileSketch<float, float> sketch;
+  size_t n;
+  double wsum = 0.0;
+  float eps, x, w;
+  utils::Check(scanf("%lu%f", &n, &eps) == 2, "needs to start with n eps");
+  sketch.Init(n, eps);
+  printf("nlevel = %lu, limit_size=%lu\n", sketch.nlevel, sketch.limit_size);
+  while (scanf("%f%f", &x, &w) == 2) {
+    sketch.Push(x, w);
+    wsum += w;
+  }
+  sketch.CheckValid(0.1);
+  utils::WQuantileSketch<float, float>::SummaryContainer out;
+  sketch.GetSummary(&out);
+  printf("MaxError=%f/%f = %g\n", out.MaxError(), wsum, out.MaxError() / wsum);
+  out.Print();
+  return 0;
+}

From 9855a90142d7f382b4fbdd65e46aa926daaddc1c Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 10 Nov 2014 17:06:10 -0800
Subject: [PATCH 057/166] unified gk and wq

---
 src/utils/quantile.h   | 415 +++++++++++++++--------------------------
 test/test_quantile.cpp |  26 ++-
 2 files changed, 173 insertions(+), 268 deletions(-)

diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index 0fadd9c35865..e51dc16bad36 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -9,6 +9,7 @@
 #include <vector>
 #include <cstring>
 #include <algorithm>
+#include <iostream>
 #include "./utils.h"
 
 namespace xgboost {
@@ -100,7 +101,6 @@ struct WQSummary {
       }
     }
   };
-
   /*! \brief data field */
   Entry *data;
   /*! \brief number of elements in the summary */
@@ -123,19 +123,15 @@ struct WQSummary {
   inline RType MaxRank(void) const {
     return data[size - 1].rmax;
   }
-  /*! \brief set size to 0 */
-  inline void Clear(void) {
-    size = 0;
-  }
   /*!
    * \brief copy content from src
    * \param src source sketch
    */
   inline void CopyFrom(const WQSummary &src) {
     size = src.size;
-    std::memcpy(data, src.data, sizeof(Entry) * size);
-  
-}  /*! 
+    std::memcpy(data, src.data, sizeof(Entry) * size);    
+  }  
+  /*! 
    * \brief debug function, validate whether the summary 
    *  run consistency check to check if it is a valid summary
    * \param eps the tolerate error level, used when RType is floating point and 
@@ -153,9 +149,9 @@ struct WQSummary {
   /*! \brief used for debug purpose, print the summary */
   inline void Print(void) const {
     for (size_t i = 0; i < size; ++i) {
-      printf("x=%f\t[%f, %f] wmin=%f\n", 
-             data[i].value, data[i].rmin,
-             data[i].rmax, data[i].wmin);
+      std::cout << "x=" << data[i].value << "\t"
+                << "[" << data[i].rmin << "," << data[i].rmax << "]"
+                << " wmin=" << data[i].wmin << std::endl;
     }
   }
   /*!
@@ -250,6 +246,147 @@ struct WQSummary {
   }
 };
 
+/*! 
+ * \brief traditional GK summary
+ */
+template<typename DType, typename RType>
+struct GKSummary {
+  /*! \brief an entry in the sketch summary */
+  struct Entry {
+    /*! \brief minimum rank */
+    RType rmin;
+    /*! \brief maximum rank */
+    RType rmax;
+    /*! \brief the value of data */
+    DType value;
+    // constructor
+    Entry(void) {}
+    // constructor
+    Entry(RType rmin, RType rmax, DType value)
+        : rmin(rmin), rmax(rmax), value(value) {}
+  };
+  /*! \brief input data queue before entering the summary */
+  struct Queue {
+    // the input queue
+    std::vector<DType> queue;
+    // end of the queue
+    size_t qtail;
+    // push data to the queue
+    inline void Push(DType x, RType w) {
+      queue[qtail++] = x;
+    }   
+    inline void MakeSummary(GKSummary *out) {
+      std::sort(queue.begin(), queue.begin() + qtail);
+      out->size = qtail;
+      for (size_t i = 0; i < qtail; ++i) {
+        out->data[i] = Entry(i + 1, i + 1, queue[i]);
+      }
+    }
+  };
+  /*! \brief data field */
+  Entry *data;
+  /*! \brief number of elements in the summary */
+  size_t size;
+  GKSummary(Entry *data, size_t size)
+      : data(data), size(size) {} 
+  /*! \brief the maximum error of the summary */
+  inline RType MaxError(void) const {
+    RType res = 0;
+    for (size_t i = 1; i < size; ++i) {
+      res = std::max(data[i].rmax - data[i-1].rmin, res);
+    }
+    return res;
+  }
+  /*! \return maximum rank in the summary */
+  inline RType MaxRank(void) const {
+    return data[size - 1].rmax;
+  }
+  /*! 
+   * \brief copy content from src
+   * \param src source sketch
+   */
+  inline void CopyFrom(const GKSummary &src) {
+    size = src.size;
+    std::memcpy(data, src.data, sizeof(Entry) * size);
+  }
+  inline void CheckValid(RType eps) const {
+    // assume always valid
+  }
+  /*! \brief used for debug purpose, print the summary */
+  inline void Print(void) const {
+    for (size_t i = 0; i < size; ++i) {
+      std::cout << "x=" << data[i].value << "\t"
+                << "[" << data[i].rmin << "," << data[i].rmax << "]"
+                << std::endl;
+    }
+  }  
+  /*! 
+   * \brief set current summary to be pruned summary of src
+   *        assume data field is already allocated to be at least maxsize
+   * \param src source summary
+   * \param maxsize size we can afford in the pruned sketch
+   */
+  inline void SetPrune(const GKSummary &src, RType maxsize) {
+    if (src.size <= maxsize) {
+      this->CopyFrom(src); return;
+    }
+    const RType max_rank = src.MaxRank();
+    this->size = maxsize;
+    data[0] = src.data[0];
+    size_t n = maxsize - 1;
+    RType top = 1;
+    for (size_t i = 1; i < n; ++i) {
+      RType k = (i * max_rank) / n;
+      while (k > src.data[top + 1].rmax) ++top;
+      // assert src.data[top].rmin <= k
+      // because k > src.data[top].rmax >= src.data[top].rmin
+      if ((k - src.data[top].rmin) < (src.data[top+1].rmax - k)) {
+        data[i] = src.data[top];
+      } else {
+        data[i] = src.data[top + 1];
+      }
+    }
+    data[n] = src.data[src.size - 1];
+  }
+  inline void SetCombine(const GKSummary &sa,
+                         const GKSummary &sb) {
+    utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); 
+    const Entry *a = sa.data, *a_end = sa.data + sa.size;
+    const Entry *b = sb.data, *b_end = sb.data + sb.size;
+    this->size = sa.size + sb.size;
+    RType aprev_rmin = 0, bprev_rmin = 0;
+    Entry *dst = this->data;
+    while (a != a_end && b != b_end) {
+      if (a->value < b->value) {
+        *dst = Entry(bprev_rmin + a->rmin,
+                     a->rmax + b->rmax - 1, a->value);
+        aprev_rmin = a->rmin;
+        ++dst; ++a;
+      } else {
+        *dst = Entry(aprev_rmin + b->rmin, 
+                     b->rmax + a->rmax - 1, b->value);
+        bprev_rmin = b->rmin;
+        ++dst; ++b;
+      }
+    }
+    if (a != a_end) {
+      RType bprev_rmax = (b_end - 1)->rmax;
+      do {
+        *dst = Entry(bprev_rmin + a->rmin, bprev_rmax + a->rmax, a->value);
+        ++dst; ++a;
+      } while (a != a_end);
+    }
+    if (b != b_end) {
+      RType aprev_rmax = (a_end - 1)->rmax;
+      do {
+        *dst = Entry(aprev_rmin + b->rmin, aprev_rmax + b->rmax, b->value);
+        ++dst; ++b;
+      } while (b != b_end);
+    }
+    utils::Assert(dst == data + size, "bug in combine");
+  }
+};
+
 /*!
  * \brief template for all quantle sketch algorithm
  *        that uses merge/prune scheme
@@ -418,7 +555,7 @@ class QuantileSketchTemplate {
 };
 
 /*!
- * \brief Quantiel sketch use WQSummary
+ * \brief Quantile sketch use WQSummary
  * \tparam DType type of data content
  * \tparam RType type of rank
  */
@@ -426,262 +563,16 @@ template<typename DType, typename RType=unsigned>
 class WQuantileSketch : 
       public QuantileSketchTemplate<DType, RType, WQSummary<DType, RType> >{
 };
-
 /*!
- * \brief a helper class to compute streaming quantile
+ * \brief Quantile sketch use WQSummary
  * \tparam DType type of data content
  * \tparam RType type of rank
  */
 template<typename DType, typename RType=unsigned>
-class GKQuantileSketch {
- public:
-  /*! \brief an entry in the sketch summary */
-  struct Entry {
-    /*! \brief minimum rank */
-    RType rmin;
-    /*! \brief maximum rank */
-    RType rmax;
-    /*! \brief the value of data */
-    DType value;
-    // constructor
-    Entry(void) {}
-    // constructor
-    Entry(RType rmin, RType rmax, DType value)
-        : rmin(rmin), rmax(rmax), value(value) {}
-  };
-  /*! 
-   * \brief this is data structure presenting one summary
-   */
-  struct Summary {
-    /*! \brief data field */
-    Entry *data;
-    /*! \brief number of elements in the summary */
-    RType size;
-    /*! \brief the maximum error of the summary */
-    inline RType MaxError(void) const {
-      RType res = 0;
-      for (size_t i = 1; i < size; ++i) {
-        res = std::max(data[i].rmax - data[i-1].rmin, res);
-      }
-      return res;
-    }
-    /*! \return maximum rank in the summary */
-    inline RType MaxRank(void) const {
-      return data[size - 1].rmax;
-    }
-    /*! \brief set size to 0 */
-    inline void Clear(void) {
-      size = 0;
-    }
-    /*! 
-     * \brief copy content from src
-     * \param src source sketch
-     */
-    inline void CopyFrom(const Summary &src) {
-      size = src.size;
-      std::memcpy(data, src.data, sizeof(Entry) * size);
-    }
-    /*! 
-     * \brief set current summary to be pruned summary of src
-     *        assume data field is already allocated to be at least maxsize
-     * \param src source summary
-     * \param maxsize size we can afford in the pruned sketch
-     */
-    inline void SetPrune(const Summary &src, RType maxsize) {
-      const RType max_rank = src.MaxRank();
-      this->size = maxsize;
-      data[0] = src.data[0];
-      RType n = maxsize - 1;
-      RType top = 1;
-      for (size_t i = 1; i < n; ++i) {
-        RType k = (i * max_rank) / n;
-        while (k > src.data[top + 1].rmax) ++top;
-        // assert src.data[top].rmin <= k
-        // because k > src.data[top].rmax >= src.data[top].rmin
-        if ((k - src.data[top].rmin) < (src.data[top+1].rmax - k)) {
-          data[i] = src.data[top];
-        } else {
-          data[i] = src.data[top + 1];
-        }
-      }
-      data[n] = src.data[src.size - 1];
-    }
-    inline void SetCombine(const Summary &sa,
-                           const Summary &sb) {
-      utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); 
-      const Entry *a = sa.data, *a_end = sa.data + sa.size;
-      const Entry *b = sb.data, *b_end = sb.data + sb.size;
-      this->size = sa.size + sb.size;
-      RType aprev_rmin = 0, bprev_rmin = 0;
-      Entry *dst = this->data;
-      while (a != a_end && b != b_end) {
-        if (a->value < b->value) {
-          *dst = Entry(bprev_rmin + a->rmin,
-                       a->rmax + b->rmax - 1, a->value);
-          aprev_rmin = a->rmin;
-          ++dst; ++a;
-        } else {
-          *dst = Entry(aprev_rmin + b->rmin, 
-                       b->rmax + a->rmax - 1, b->value);
-          bprev_rmin = b->rmin;
-          ++dst; ++b;
-        }
-      }
-      if (a != a_end) {
-        RType bprev_rmax = (b_end - 1)->rmax;
-        do {
-          *dst = Entry(bprev_rmin + a->rmin, bprev_rmax + a->rmax, a->value);
-          ++dst; ++a;
-        } while (a != a_end);
-      }
-      if (b != b_end) {
-        RType aprev_rmax = (a_end - 1)->rmax;
-        do {
-          *dst = Entry(aprev_rmin + b->rmin, aprev_rmax + b->rmax, b->value);
-          ++dst; ++b;
-        } while (b != b_end);
-      }
-      utils::Assert(dst == data + size, "bug in combine");
-    }
-  };
-  // same as summary, but use STL to backup the space
-  struct SummaryContainer : public Summary {
-    std::vector<Entry> space;
-    /*! \brief reserve space for summary */
-    inline void Reserve(size_t size) {
-      space.resize(size);
-      this->data = BeginPtr(space);
-    }
-    /*! 
-     * \brief set the space to be merge of all Summary arrays
-     * \param begin begining position in th summary array
-     * \param end ending position in the Summary array
-     */
-    inline void SetMerge(const Summary *begin,
-                         const Summary *end) {
-      utils::Assert(begin < end, "can not set combine to empty instance");
-      size_t len = end - begin;
-      if (len == 1) {
-        this->Reserve(begin[0].size);
-        this->CopyFrom(begin[0]);
-      } else if (len == 2) {
-        this->Reserve(begin[0].size + begin[1].size);
-        this->SetMerge(begin[0], begin[1]);
-      } else {
-        // recursive merge
-        SummaryContainer lhs, rhs;        
-        lhs.SetCombine(begin, begin + len / 2);
-        rhs.SetCombine(begin + len / 2, end);
-        this->Reserve(lhs.size + rhs.size);
-        this->SetCombine(lhs, rhs);
-      }
-    }
-  };
-  /*! 
-   * \brief intialize the quantile sketch, given the performance specification
-   * \param maxn maximum number of data points can be encountered
-   * \param eps accuracy level of summary
-   */
-  inline void Init(RType maxn, double eps) {
-    eps  = eps * 0.5;
-    size_t L = 0;
-    size_t b = std::max(floor(log2(eps * maxn) / eps), 8.0);
-    // check for lower 
-    while (b < maxn) {
-      L = ceil(log2(maxn / b)) + 1;
-      if (L < eps * b) break;
-      ++b;
-    }
-    L += 1;
-    inqueue.resize(b);
-    limit_size = (b + 1) / 2 + 1;
-    temp.Reserve(limit_size * 2);
-    data.resize(limit_size * L);
-    for (size_t l = 0; l < L; ++l) {
-      Summary s; s.size = 0;
-      s.data = BeginPtr(data) + l * limit_size;
-      level.push_back(s);
-    }
-    qtail = 0;
-  }
-  /*! 
-   * \brief add an element to a sketch 
-   * \param x the elemented added to the sketch
-   */
-  inline void Push(DType x) {
-    inqueue[qtail++] = x;
-    if (qtail == inqueue.size()) {
-      // start update sketch
-      std::sort(inqueue.begin(), inqueue.end());
-      for (size_t i = 0; i < qtail; ++i) {
-        temp.data[i] = Entry(i + 1, i + 1, inqueue[i]);
-      }
-      temp.size = static_cast<RType>(qtail);
-      // clean up queue
-      qtail = 0;
-      for (size_t l = 1; l < level.size(); ++l) {
-        // check if level l is empty
-        if (level[l].size == 0) {
-          level[l].SetPrune(temp, limit_size);
-          return;
-        } else {
-          // level 0 is actually temp space
-          level[0].SetPrune(temp, limit_size);
-          temp.SetCombine(level[0], level[l]);
-          level[l].size = 0;
-        }
-      }
-      utils::Error("adding more element than allowed");
-    }
-  }
-  /*! 
-   * \brief finalize the result after all data has been passed 
-   *        copy the final result to level 0
-   *        this can only be called once
-   */
-  inline void Finalize(void) {
-    // start update sketch
-    std::sort(inqueue.begin(), inqueue.begin() + qtail);
-    for (size_t i = 0; i < qtail; ++i) {
-      temp.data[i] = Entry(i + 1, i + 1, inqueue[i]);
-    }
-    temp.size = static_cast<RType>(qtail);
-    if (temp.size < limit_size) {
-      level[0].CopyFrom(temp);
-    } else {
-      level[0].SetPrune(temp, limit_size);
-    }
-    // start adding other things in
-    for (size_t l = 1; l < level.size(); ++l) {
-      if (level[l].size == 0) continue;
-      if (level[0].size == 0) {
-        level[0].CopyFrom(level[l]);
-      } else {
-        temp.SetCombine(level[0], level[l]);
-        level[0].SetPrune(temp, limit_size);        
-      }
-      level[l].size = 0;
-    }
-  }
-  /*! \brief get the summary after finalize */
-  inline Summary GetSummary(void) const {
-    return level[0];
-  }  
-  
- private:  
-  // the input queue
-  std::vector<DType> inqueue;
-  // end of the queue
-  size_t qtail;
-  // size of summary in each level
-  size_t limit_size;
-  // content of the summary
-  std::vector<Entry> data;
-  // different level of summary
-  std::vector<Summary> level;  
-  // temporal summary, used for temp-merge
-  SummaryContainer temp;  
+class GKQuantileSketch : 
+      public QuantileSketchTemplate<DType, RType, GKSummary<DType, RType> >{
 };
+
 }  // utils
 }  // xgboost
 #endif
diff --git a/test/test_quantile.cpp b/test/test_quantile.cpp
index 89e97689754b..e6af5b1ec508 100644
--- a/test/test_quantile.cpp
+++ b/test/test_quantile.cpp
@@ -2,8 +2,10 @@
 #include <utils/quantile.h>
 using namespace xgboost;
 
-int main(int argc, char *argv[]) {  
-  utils::WQuantileSketch<float, float> sketch;
+
+template<typename Sketch, typename RType>
+inline void test(void) {
+  Sketch sketch;
   size_t n;
   double wsum = 0.0;
   float eps, x, w;
@@ -11,13 +13,25 @@ int main(int argc, char *argv[]) {
   sketch.Init(n, eps);
   printf("nlevel = %lu, limit_size=%lu\n", sketch.nlevel, sketch.limit_size);
   while (scanf("%f%f", &x, &w) == 2) {
-    sketch.Push(x, w);
+    sketch.Push(x, static_cast<RType>(w));
     wsum += w;
   }
-  sketch.CheckValid(0.1);
-  utils::WQuantileSketch<float, float>::SummaryContainer out;
+  sketch.CheckValid(static_cast<RType>(0.1));
+  typename Sketch::SummaryContainer out;
   sketch.GetSummary(&out);
-  printf("MaxError=%f/%f = %g\n", out.MaxError(), wsum, out.MaxError() / wsum);
+  double maxerr = static_cast<double>(out.MaxError());
+  printf("MaxError=%g/%g = %g\n", maxerr, wsum, maxerr / wsum);
   out.Print();
+}
+
+int main(int argc, char *argv[]) {
+  const char *method = "wq";
+  if (argc > 1) method = argv[1];
+  if (!strcmp(method, "wq")) {
+    test<utils::WQuantileSketch<float, float>, float>();
+  }
+  if (!strcmp(method, "gk")) {
+    test<utils::GKQuantileSketch<float, unsigned>, unsigned>();
+  }
   return 0;
 }

From b426eef527b47695fd61650789cb1a49e430ec36 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 10 Nov 2014 17:24:44 -0800
Subject: [PATCH 058/166] chg begin end type

---
 src/utils/quantile.h | 9 +++++----
 test/mkquantest.py   | 3 ++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index e51dc16bad36..5dd3d90590e1 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -165,15 +165,16 @@ struct WQSummary {
     if (src.size <= maxsize) {
       this->CopyFrom(src); return;
     }
-    const RType max_rank = src.MaxRank();
+    const RType begin = src.data[0].rmax;
+    const RType range = src.data[src.size - 1].rmin - src.data[0].rmax;
     const size_t n = maxsize - 1;
     data[0] = src.data[0];
     this->size = 1;
     // lastidx is used to avoid duplicated records
-    size_t i = 0, lastidx = 0;
+    size_t i = 1, lastidx = 0;
     for (RType k = 1; k < n; ++k) {
-      RType dx2 =  (2 * k * max_rank) / n;
-        // find first i such that  d < (rmax[i+1] + rmin[i+1]) / 2 
+      RType dx2 =  2 * ((k * range) / n + begin);
+      // find first i such that  d < (rmax[i+1] + rmin[i+1]) / 2 
       while (i < src.size - 1 &&
              dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
       if (i == src.size - 1) break;
diff --git a/test/mkquantest.py b/test/mkquantest.py
index 709d4bb784ae..f228dc1eb62d 100755
--- a/test/mkquantest.py
+++ b/test/mkquantest.py
@@ -6,7 +6,8 @@
 
 funcs = {
     'seq': 'lambda n: sorted([(x,1) for x in range(1,n+1)], key = lambda x:random.random())',
-    'seqlogw': 'lambda n: sorted([(x, math.log(x)) for x in range(1,n+1)], key = lambda x:random.random())'
+    'seqlogw': 'lambda n: sorted([(x, math.log(x)) for x in range(1,n+1)], key = lambda x:random.random())',
+    'lots0': 'lambda n: sorted([(max(x - n*3/4,0), 1) for x in range(1,n+1)], key = lambda x:random.random())'
 }
 
 if len(sys.argv) < 3:

From 9d101b47f902d36cb4a7ac37d5963246eddc16f2 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 10 Nov 2014 21:18:37 -0800
Subject: [PATCH 059/166] optimize heavy hitter

---
 src/utils/quantile.h   | 94 ++++++++++++++++++++++++++++++++++++++++--
 test/mkquantest.py     |  6 ++-
 test/test_quantile.cpp |  9 ++--
 3 files changed, 101 insertions(+), 8 deletions(-)

diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index 5dd3d90590e1..5a002c4cb517 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -1,7 +1,7 @@
 #ifndef XGBOOST_UTILS_QUANTILE_H_
 #define XGBOOST_UTILS_QUANTILE_H_
 /*!
- * \file quantile
+ * \file quantile.h
  * \brief util to compute quantiles 
  * \author Tianqi Chen
  */
@@ -14,7 +14,6 @@
 
 namespace xgboost {
 namespace utils {
-
 /*!
  * \brief experimental wsummary
  * \tparam DType type of data content
@@ -172,7 +171,7 @@ struct WQSummary {
     this->size = 1;
     // lastidx is used to avoid duplicated records
     size_t i = 1, lastidx = 0;
-    for (RType k = 1; k < n; ++k) {
+    for (size_t k = 1; k < n; ++k) {
       RType dx2 =  2 * ((k * range) / n + begin);
       // find first i such that  d < (rmax[i+1] + rmin[i+1]) / 2 
       while (i < src.size - 1 &&
@@ -246,7 +245,84 @@ struct WQSummary {
     utils::Assert(size <= sa.size + sb.size, "bug in combine");
   }
 };
-
+/*! \brief try to do efficient prunning */
+template<typename DType, typename RType>
+struct WXQSummary : public WQSummary<DType, RType> {
+  // redefine entry type
+  typedef typename WQSummary<DType, RType>::Entry Entry;
+  // constructor
+  WXQSummary(Entry *data, size_t size)
+      : WQSummary<DType, RType>(data, size) {}
+  // check if the block is large chunk
+  inline static bool CheckLarge(const Entry &e, RType chunk) {
+    return  e.rmin_next() > e.rmax_prev() + chunk;
+  }
+  // set prune
+  inline void SetPrune(const WXQSummary &src, RType maxsize) {
+    if (src.size <= maxsize) {
+      this->CopyFrom(src); return;
+    }
+    RType begin = src.data[0].rmax;
+    size_t n = maxsize - 1, nbig = 0;
+    const RType range = src.data[src.size - 1].rmin - begin;
+    const RType chunk = 2 * range / n;
+    // minimized range
+    RType mrange = 0;
+    {
+      // first scan, grab all the big chunk
+      // moviing block index
+      size_t bid = 0;
+      for (size_t i = 1; i < src.size; ++i) {
+        if (CheckLarge(src.data[i], chunk)) {
+          if (bid != i - 1) {
+            mrange += src.data[i].rmax_prev() - src.data[bid].rmin_next();
+          }
+          bid = i; ++nbig;
+        }
+      }
+      if (bid != src.size - 2) {
+        mrange += src.data[src.size-1].rmax_prev() - src.data[bid].rmin_next();
+      }
+    }
+    utils::Assert(nbig < n - 1, "too many large chunk");
+    this->data[0] = src.data[0];
+    this->size = 1;
+    // use smaller size
+    n = n - nbig;
+    // find the rest of point
+    size_t bid = 0, k = 1, lastidx = 0;
+    for (size_t end = 1; end < src.size; ++end) {
+      if (end == src.size - 1 || CheckLarge(src.data[end], chunk)) {
+        if (bid != end - 1) {
+          size_t i = bid;
+          RType maxdx2 = src.data[end].rmax_prev() * 2;
+          for (; k < n; ++k) {
+            RType dx2 =  2 * ((k * mrange) / n + begin);
+            if (dx2 >= maxdx2) break; 
+            while (i < end &&
+                   dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
+            if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
+              if (i != lastidx) {
+                this->data[this->size++] = src.data[i]; lastidx = i;
+              }
+            } else {
+              if (i + 1 != lastidx) {
+                this->data[this->size++] = src.data[i + 1]; lastidx = i + 1;
+              }
+            }
+          }
+        }
+        if (lastidx != end) {
+          this->data[this->size++] = src.data[end];
+          lastidx = end;
+        }
+        bid = end;
+        // shift base by the gap 
+        begin += src.data[bid].rmin_next() - src.data[bid].rmax_prev();
+      }
+    }
+  }
+};
 /*! 
  * \brief traditional GK summary
  */
@@ -564,6 +640,16 @@ template<typename DType, typename RType=unsigned>
 class WQuantileSketch : 
       public QuantileSketchTemplate<DType, RType, WQSummary<DType, RType> >{
 };
+
+/*!
+ * \brief Quantile sketch use WXQSummary
+ * \tparam DType type of data content
+ * \tparam RType type of rank
+ */
+template<typename DType, typename RType=unsigned>
+class WXQuantileSketch : 
+      public QuantileSketchTemplate<DType, RType, WXQSummary<DType, RType> >{
+};
 /*!
  * \brief Quantile sketch use WQSummary
  * \tparam DType type of data content
diff --git a/test/mkquantest.py b/test/mkquantest.py
index f228dc1eb62d..48d8375774f4 100755
--- a/test/mkquantest.py
+++ b/test/mkquantest.py
@@ -7,7 +7,11 @@
 funcs = {
     'seq': 'lambda n: sorted([(x,1) for x in range(1,n+1)], key = lambda x:random.random())',
     'seqlogw': 'lambda n: sorted([(x, math.log(x)) for x in range(1,n+1)], key = lambda x:random.random())',
-    'lots0': 'lambda n: sorted([(max(x - n*3/4,0), 1) for x in range(1,n+1)], key = lambda x:random.random())'
+    'lots0': 'lambda n: sorted([(max(x - n*3/4,0), 1) for x in range(1,n+1)], key = lambda x:random.random())',
+    'lots9': 'lambda n: sorted([(9 if x > n / 4 else x, 1) for x in range(1,n+1)], key = lambda x:random.random())',
+    'lotsm': 'lambda n: sorted([(n/8 if x > n / 4 else x, 1) for x in range(1,n+1)], key = lambda x:random.random())',
+    'lotsmr': 'lambda n: sorted([( x * 4 / n + n / 20 if x > n / 10 else x, 1) for x in range(1,n+1)], key = lambda x:random.random())',
+    'lotsmr2': 'lambda n: sorted([( x * 10 / n + n / 20 if x > n / 10 else x, 1) for x in range(1,n+1)], key = lambda x:random.random())'
 }
 
 if len(sys.argv) < 3:
diff --git a/test/test_quantile.cpp b/test/test_quantile.cpp
index e6af5b1ec508..0fed6bf493ac 100644
--- a/test/test_quantile.cpp
+++ b/test/test_quantile.cpp
@@ -2,7 +2,6 @@
 #include <utils/quantile.h>
 using namespace xgboost;
 
-
 template<typename Sketch, typename RType>
 inline void test(void) {
   Sketch sketch;
@@ -11,7 +10,6 @@ inline void test(void) {
   float eps, x, w;
   utils::Check(scanf("%lu%f", &n, &eps) == 2, "needs to start with n eps");
   sketch.Init(n, eps);
-  printf("nlevel = %lu, limit_size=%lu\n", sketch.nlevel, sketch.limit_size);
   while (scanf("%f%f", &x, &w) == 2) {
     sketch.Push(x, static_cast<RType>(w));
     wsum += w;
@@ -20,8 +18,10 @@ inline void test(void) {
   typename Sketch::SummaryContainer out;
   sketch.GetSummary(&out);
   double maxerr = static_cast<double>(out.MaxError());
-  printf("MaxError=%g/%g = %g\n", maxerr, wsum, maxerr / wsum);
   out.Print();
+  
+  printf("MaxError=%g/%g = %g\n", maxerr, wsum, maxerr / wsum);
+  printf("maxlevel = %lu, usedlevel=%lu, limit_size=%lu\n", sketch.nlevel, sketch.level.size(), sketch.limit_size);
 }
 
 int main(int argc, char *argv[]) {
@@ -30,6 +30,9 @@ int main(int argc, char *argv[]) {
   if (!strcmp(method, "wq")) {
     test<utils::WQuantileSketch<float, float>, float>();
   }
+  if (!strcmp(method, "wx")) {
+    test<utils::WXQuantileSketch<float, float>, float>();
+  }
   if (!strcmp(method, "gk")) {
     test<utils::GKQuantileSketch<float, unsigned>, unsigned>();
   }

From e7ea87b5fd75b96b36111d0f6c2ad20fb549d3c1 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 10 Nov 2014 22:03:42 -0800
Subject: [PATCH 060/166] ok for now

---
 src/utils/quantile.h   |  6 ++--
 test/test_quantile.cpp | 74 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index 5a002c4cb517..62dc36e6ce56 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -174,9 +174,9 @@ struct WQSummary {
     for (size_t k = 1; k < n; ++k) {
       RType dx2 =  2 * ((k * range) / n + begin);
       // find first i such that  d < (rmax[i+1] + rmin[i+1]) / 2 
-      while (i < src.size - 1 &&
-             dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
-      if (i == src.size - 1) break;
+      while (i < src.size - 1 
+             && dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
+      utils::Assert(i != src.size - 1, "this cannot happen");
       if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
         if (i != lastidx) {
           data[size++] = src.data[i]; lastidx = i;
diff --git a/test/test_quantile.cpp b/test/test_quantile.cpp
index 0fed6bf493ac..c1b85668dc7f 100644
--- a/test/test_quantile.cpp
+++ b/test/test_quantile.cpp
@@ -1,40 +1,92 @@
 #include <vector>
 #include <utils/quantile.h>
+#include <ctime>
 using namespace xgboost;
 
+
+struct Entry {  // one (value, weight) record used to verify the sketch output
+  double x, w, rmin;  // value, weight, and total weight of strictly smaller values (rmin is filled by MakeQuantile)
+  inline bool operator<(const Entry &e) const {  // order by value only, weights are ignored
+    return x < e.x;
+  }
+};
+
+inline void MakeQuantile(std::vector<Entry> &dat) {  // in place: sort by value, merge equal values, fill rmin
+  std::sort(dat.begin(), dat.end());
+  size_t top = 0;  // next output slot for a merged entry
+  double wsum = 0.0;  // total weight of the entries written so far
+  for (size_t i = 0; i < dat.size();) {
+    size_t j = i + 1;
+    for (;j < dat.size() && dat[i].x == dat[j].x; ++j) {  // fold the run of equal values into dat[i]
+      dat[i].w += dat[j].w;
+    }
+    dat[top] = dat[i];
+    dat[top].rmin = wsum;  // weight of everything strictly smaller than this value
+    wsum += dat[top].w;
+    ++top;
+    i = j;  // continue right after the merged run
+  }
+  dat.resize(top);  // drop the slots left unused after merging
+}
+
+template<typename Summary>
+inline void verifyWQ(std::vector<Entry> &dat, Summary out) {  // check the bounds stored in out against exact ranks of dat
+ MakeQuantile(dat);  // dat is now sorted, unique in x, with exact rmin and merged w
+ size_t j = 0;
+ double err = 0.0;  // most negative slack seen so far, stays 0 while every bound holds
+ const double eps = 1e-4;
+ for (size_t i = 0; i < out.size; ++i) {
+   while (j < dat.size() && dat[j].x < out.data[i].value) ++j;
+   utils::Assert(j < dat.size() && fabs(dat[j].x - out.data[i].value) < eps, "bug");
+   err = std::min(dat[j].rmin - out.data[i].rmin, err);  // need out.rmin <= exact rmin
+   err = std::min(out.data[i].rmax - (dat[j].rmin + dat[j].w), err);  // need out.rmax >= exact rmin + w
+   err = std::min(dat[j].w - out.data[i].wmin, err);  // need out.wmin <= exact w
+ }
+ if (err < 0.0) err = -err;
+ printf("verify correctness, max-constraint-violation=%g (0 means perfect, coubld be nonzero due to floating point)\n", err);
+}
+
 template<typename Sketch, typename RType>
-inline void test(void) {
+inline typename Sketch::SummaryContainer test(std::vector<Entry> &dat) {
   Sketch sketch;
   size_t n;
   double wsum = 0.0;
-  float eps, x, w;
+  float eps;
   utils::Check(scanf("%lu%f", &n, &eps) == 2, "needs to start with n eps");
   sketch.Init(n, eps);
-  while (scanf("%f%f", &x, &w) == 2) {
-    sketch.Push(x, static_cast<RType>(w));
-    wsum += w;
+  Entry e;
+  while (scanf("%lf%lf", &e.x, &e.w) == 2) {
+    dat.push_back(e);
+    wsum += e.w;
+  }
+  clock_t start = clock();
+  for (size_t i = 0; i < dat.size(); ++i) {
+    sketch.Push(dat[i].x, dat[i].w);
   }
-  sketch.CheckValid(static_cast<RType>(0.1));
+  double tcost = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
   typename Sketch::SummaryContainer out;
-  sketch.GetSummary(&out);
+  sketch.GetSummary(&out); 
   double maxerr = static_cast<double>(out.MaxError());
   out.Print();
-  
+  printf("-------------------------\n");
+  printf("timecost=%g sec\n", tcost);
   printf("MaxError=%g/%g = %g\n", maxerr, wsum, maxerr / wsum);
   printf("maxlevel = %lu, usedlevel=%lu, limit_size=%lu\n", sketch.nlevel, sketch.level.size(), sketch.limit_size);
+  return out;
 }
 
 int main(int argc, char *argv[]) {
   const char *method = "wq";
   if (argc > 1) method = argv[1];
+  std::vector<Entry> dat;
   if (!strcmp(method, "wq")) {
-    test<utils::WQuantileSketch<float, float>, float>();
+    verifyWQ(dat, test<utils::WQuantileSketch<float, float>, float>(dat));
   }
   if (!strcmp(method, "wx")) {
-    test<utils::WXQuantileSketch<float, float>, float>();
+    verifyWQ(dat, test<utils::WXQuantileSketch<float, float>, float>(dat));
   }
   if (!strcmp(method, "gk")) {
-    test<utils::GKQuantileSketch<float, unsigned>, unsigned>();
+    test<utils::GKQuantileSketch<float, unsigned>, unsigned>(dat);
   }
   return 0;
 }

From 698c0102470bcd00692d8e408528607c22239d49 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 10 Nov 2014 22:09:01 -0800
Subject: [PATCH 061/166] add example

---
 test/mkquantest.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/mkquantest.py b/test/mkquantest.py
index 48d8375774f4..70c467b462e4 100755
--- a/test/mkquantest.py
+++ b/test/mkquantest.py
@@ -15,10 +15,12 @@
 }
 
 if len(sys.argv) < 3:
-    print 'Usage: python mkquantest.py <maxn> <eps> [generate-type] [ndata]|./test_quantile'
+    print 'Usage: python mkquantest.py <maxn> <eps> [generate-type] [ndata]|./test_quantile [solver]'
+    print 'test_quantile need to be compiled, solver can be gk(GK nonweight version), wq(weighted version), wx(weighthed version, with prune optimized for heavy hitter)'
     print 'Possible generate-types:' 
     for k, v in funcs.items():
         print '\t%s: %s' % (k, v)
+    print 'Example: ./mkquantest.py 50000 0.3 lotsmr |./test_quantile wq'
     exit(-1)
 random.seed(0)
 maxn = int(sys.argv[1])

From c1f1bb9206cf8059ac24398ca896b93a144051eb Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 15 Nov 2014 09:46:30 -0800
Subject: [PATCH 062/166] first ver

---
 src/tree/param.h                   |  4 ++
 src/tree/updater_histmaker-inl.hpp | 78 ++++++++++++++++++++++++++----
 2 files changed, 72 insertions(+), 10 deletions(-)

diff --git a/src/tree/param.h b/src/tree/param.h
index 8bd855554bb9..47d31df1e867 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -36,6 +36,8 @@ struct TrainParam{
   float colsample_bytree;
   // speed optimization for dense column
   float opt_dense_col;
+  // accuracy of sketch
+  float sketch_eps;
   // leaf vector size
   int size_leaf_vector;  
   // option for parallelization
@@ -58,6 +60,7 @@ struct TrainParam{
     nthread = 0;
     size_leaf_vector = 0;
     parallel_option = 2;
+    sketch_eps = 0.1f;
   }
   /*! 
    * \brief set parameters from outside 
@@ -79,6 +82,7 @@ struct TrainParam{
     if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val));
     if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val));
     if (!strcmp(name, "colsample_bytree")) colsample_bytree  = static_cast<float>(atof(val));
+    if (!strcmp(name, "sketch_eps")) sketch_eps  = static_cast<float>(atof(val));
     if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
     if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
     if (!strcmp(name, "max_depth")) max_depth = atoi(val);
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 02dc5c8fca99..40c4a54972d4 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include "../sync/sync.h"
 #include "../utils/quantile.h"
+#include "../utils/group_data.h"
 
 namespace xgboost {
 namespace tree {
@@ -145,7 +146,8 @@ class HistMaker: public IUpdater {
   // this function does two jobs
   // (1) reset the position in array position, to be the latest leaf id
   // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly 
-  virtual void ResetPosAndPropose(IFMatrix *p_fmat,
+  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
+                                  IFMatrix *p_fmat,
                                   const BoosterInfo &info,
                                   const RegTree &tree)  = 0;  
  private:
@@ -249,7 +251,7 @@ class HistMaker: public IUpdater {
         const int nid = position[ridx];
         if (nid >= 0) {
           utils::Assert(tree[nid].is_leaf(), "CreateHist happens in leaf");
-           const int wid = node2workindex[nid];
+          const int wid = this->node2workindex[nid];
           for (bst_uint i = 0; i < inst.length; ++i) {
             utils::Assert(inst[i].index < num_feature, "feature index exceed bound");
             // feature histogram
@@ -302,7 +304,7 @@ class HistMaker: public IUpdater {
                         RegTree *p_tree) {
     const bst_uint num_feature = p_tree->param.num_feature;
     // reset and propose candidate split
-    this->ResetPosAndPropose(p_fmat, info, *p_tree);
+    this->ResetPosAndPropose(gpair, p_fmat, info, *p_tree);
     // create histogram
     this->CreateHist(gpair, p_fmat, info, *p_tree);
     // get the best split condition for each node
@@ -347,17 +349,32 @@ class HistMaker: public IUpdater {
 
 // hist maker that propose using quantile sketch
 template<typename TStats>
-class QuantileHistMaker: public HistMaker<TStats> {
+class QuantileHistMaker: public HistMaker<TStats> {  
  protected:
-  virtual void ResetPosAndPropose(IFMatrix *p_fmat,
+  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
+                                  IFMatrix *p_fmat,
                                   const BoosterInfo &info,
                                   const RegTree &tree) {
+    // initialize the data structure
+    int nthread;
+    #pragma omp parallel
+    {
+      nthread = omp_get_num_threads();
+    }
+    sketchs.resize(this->qexpand.size() * tree.param.num_feature);
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      sketchs[i].Init(info.num_row, this->param.sketch_eps);
+    }    
     // start accumulating statistics
     utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
       const RowBatch &batch = iter->Value();
-      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
+      // parallel convert to column major format
+      utils::ParallelGroupBuilder<SparseBatch::Entry> builder(&col_ptr, &col_data, &thread_col_ptr);
+      builder.InitBudget(tree.param.num_feature, nthread);
+
+      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);      
       #pragma omp parallel for schedule(static)
       for (bst_omp_uint i = 0; i < nbatch; ++i) {
         RowBatch::Inst inst = batch[i];
@@ -367,13 +384,54 @@ class QuantileHistMaker: public HistMaker<TStats> {
           if (tree[nid].is_leaf()) {
             this->position[ridx] = ~nid; 
           } else {
-            this->position[ridx] = nid = HistMaker<TStats>::NextLevel(inst, tree, nid);            
-            // todo add the cut point setup
+            this->position[ridx] = nid = HistMaker<TStats>::NextLevel(inst, tree, nid);
+            for (bst_uint j = 0; j < inst.length; ++j) { 
+              builder.AddBudget(inst[j].index, omp_get_thread_num());
+            }
           }
         }
-      }      
-    }    
+      }
+      builder.InitStorage();
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nbatch; ++i) {
+        RowBatch::Inst inst = batch[i];
+        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+        const int nid = this->position[ridx];
+        if (nid >= 0) {
+          for (bst_uint j = 0; j < inst.length; ++j) {
+            builder.Push(inst[j].index,
+                         SparseBatch::Entry(nid, inst[j].fvalue),
+                         omp_get_thread_num());
+          }
+        }
+      }
+      // start putting things into sketch
+      const bst_omp_uint nfeat = tree.param.num_feature;
+      #pragma omp parallel for schedule(dynamic, 1)
+      for (bst_omp_uint k = 0; k < nfeat; ++k) {
+        for (size_t i = col_ptr[k]; i < col_ptr[k+1]; ++i) {
+          const SparseBatch::Entry &e = col_data[i];
+          const int wid = this->node2workindex[e.index];
+          sketchs[wid * tree.param.num_feature + k].Push(e.fvalue, gpair[e.index].hess);
+        }
+      }
+    }
+    // synchronize sketch
+    
+    
+    // now we have all the results in the sketchs, try to setup the cut point
   }
+
+ private:
+  //
+
+  // local temp column data structure
+  std::vector<size_t> col_ptr;
+  // local storage of column data
+  std::vector<SparseBatch::Entry> col_data;
+  std::vector< std::vector<size_t> > thread_col_ptr;
+  // per node, per feature sketch
+  std::vector< utils::WQuantileSketch<bst_float, bst_float> > sketchs;
 };
 
 }  // namespace tree

From c86b83ea0493ee8cb1941ded0b79b644509e2087 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 15 Nov 2014 17:41:03 -0800
Subject: [PATCH 063/166] a version that compile

---
 src/sync/sync.h                    | 49 +++++++++++++++++
 src/tree/param.h                   |  4 ++
 src/tree/updater_histmaker-inl.hpp | 37 ++++++++++---
 src/utils/io.h                     | 39 ++++++++++++++
 src/utils/quantile.h               | 84 +++++++++++++++++++++++++++++-
 5 files changed, 204 insertions(+), 9 deletions(-)

diff --git a/src/sync/sync.h b/src/sync/sync.h
index 8d83ab5fb3b5..fe34983ef235 100644
--- a/src/sync/sync.h
+++ b/src/sync/sync.h
@@ -8,6 +8,7 @@
 #include <cstdio>
 #include <cstring>
 #include "../utils/utils.h"
+#include "../utils/io.h"
 #include <string>
 
 namespace xgboost {
@@ -125,6 +126,54 @@ class Reducer {
   ReduceHandle handle;
 };
 
+/*!
+ * \brief template class for customized all-reduce of objects that can be
+ *        serialized/deserialized into a fixed size buffer
+ * Do not use reducer directly in the function you call Finalize, because the destructor can happen after Finalize
+ *
+ * \tparam DType data type to be reduced, DType must provide the following functions:
+ *   (1) Save(IStream &fs)  (2) Load(IStream &fs) (3) Reduce(const DType &d);
+ */
+template<typename DType>
+class ComplexReducer {
+ public:
+  ComplexReducer(void) {
+    handle.Init(ReduceInner);
+  }
+  /*!
+   * \brief customized in-place all reduce operation:
+   *        Save the object into the buffer, reduce the buffer, Load the result back
+   * \param sendrecvobj pointer to the object to be reduced
+   * \param max_n4byte size of the serialization buffer, counted in 4-byte units
+   */
+  inline void AllReduce(DType *sendrecvobj, size_t max_n4byte) {
+    buffer.resize(max_n4byte);
+    utils::MemoryFixSizeBuffer fs(BeginPtr(buffer), max_n4byte * 4);
+    sendrecvobj->Save(fs);
+    handle.AllReduce(BeginPtr(buffer), max_n4byte);
+    fs.Seek(0);
+    sendrecvobj->Load(fs);
+  }
+
+ private:
+  // reduce callback: dst = Reduce(dst, src), both arguments are in serialized form
+  // NOTE(review): len_ is used as a byte count here; confirm the unit ReduceHandle passes
+  inline static void ReduceInner(const void *src_, void *dst_, int len_) {
+    utils::MemoryFixSizeBuffer fsrc(const_cast<void*>(src_), len_);
+    utils::MemoryFixSizeBuffer fdst(dst_, len_);
+    DType tsrc, tdst;  // temp space
+    tsrc.Load(fsrc); tdst.Load(fdst);
+    // govern const check
+    tdst.Reduce(static_cast<const DType &>(tsrc));
+    // Load advanced fdst; rewind so the result overwrites dst_ from its start
+    fdst.Seek(0); tdst.Save(fdst);
+  }
+  // function handle
+  ReduceHandle handle;
+  // reduce buffer
+  std::vector<int> buffer;
+};
+
 }  // namespace sync
 }  // namespace xgboost
 #endif
diff --git a/src/tree/param.h b/src/tree/param.h
index 47d31df1e867..6402ef76a9b4 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -38,6 +38,8 @@ struct TrainParam{
   float opt_dense_col;
   // accuracy of sketch
   float sketch_eps;
+  // ratio that sets the maximum size of each synchronized summary: max_size = sketch_ratio / sketch_eps
+  float sketch_ratio;
   // leaf vector size
   int size_leaf_vector;  
   // option for parallelization
@@ -61,6 +63,7 @@ struct TrainParam{
     size_leaf_vector = 0;
     parallel_option = 2;
     sketch_eps = 0.1f;
+    sketch_ratio = 1.4f;
   }
   /*! 
    * \brief set parameters from outside 
@@ -83,6 +86,7 @@ struct TrainParam{
     if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val));
     if (!strcmp(name, "colsample_bytree")) colsample_bytree  = static_cast<float>(atof(val));
     if (!strcmp(name, "sketch_eps")) sketch_eps  = static_cast<float>(atof(val));
+    if (!strcmp(name, "sketch_ratio")) sketch_ratio  = static_cast<float>(atof(val));
     if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
     if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
     if (!strcmp(name, "max_depth")) max_depth = atoi(val);
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 40c4a54972d4..97e4d0aea24d 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -124,8 +124,7 @@ class HistMaker: public IUpdater {
   /*! \brief map active node to is working index offset in qexpand*/
   std::vector<int> node2workindex;
   // reducer for histogram
-  sync::Reducer<TStats> histred;
-  
+  sync::Reducer<TStats> histred;  
   // helper function to get to next level of the tree
   // must work on non-leaf node
   inline static int NextLevel(const SparseBatch::Inst &inst, const RegTree &tree, int nid) {
@@ -142,7 +141,6 @@ class HistMaker: public IUpdater {
     }
     return n.cdefault();
   }
-
   // this function does two jobs
   // (1) reset the position in array position, to be the latest leaf id
   // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly 
@@ -416,15 +414,38 @@ class QuantileHistMaker: public HistMaker<TStats> {
         }
       }
     }
+    // setup maximum size
+    size_t max_size = static_cast<size_t>(this->param.sketch_ratio / this->param.sketch_eps);
     // synchronize sketch
-    
-    
-    // now we have all the results in the sketchs, try to setup the cut point
+    summary_array.Init(sketchs.size(), max_size);
+    size_t n4bytes = (summary_array.MemSize() + 3) / 4;
+    sreducer.AllReduce(&summary_array, n4bytes);    
+    // now we get the final result of sketch, setup the cut
+    for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {      
+      for (size_t fid = 0; fid < tree.param.num_feature; ++fid) {
+        const WXQSketch::Summary a = summary_array[wid * tree.param.num_feature + fid];        
+        for (size_t i = 0; i < a.size; ++i) {
+          bst_float cpt = a.data[i].value + rt_eps;
+          if (i == 0 || cpt > this->wspace.cut.back()){
+            this->wspace.cut.push_back(cpt);
+          }
+        }
+        this->wspace.rptr.push_back(this->wspace.cut.size());
+      }
+      // reserve last value for global statistics
+      this->wspace.cut.push_back(0.0f);
+      this->wspace.rptr.push_back(this->wspace.cut.size());
+    }
+    utils::Assert(this->wspace.rptr.size() ==
+                  (tree.param.num_feature + 1) * this->qexpand.size(), "cut space inconsistent");
   }
 
  private:
-  //
-
+  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
+  // summary array
+  WXQSketch::SummaryArray summary_array;
+  // reducer for summary
+  sync::ComplexReducer<WXQSketch::SummaryArray> sreducer;
   // local temp column data structure
   std::vector<size_t> col_ptr;
   // local storage of column data
diff --git a/src/utils/io.h b/src/utils/io.h
index 7dd550dc892e..1a748feabdbd 100644
--- a/src/utils/io.h
+++ b/src/utils/io.h
@@ -97,6 +97,45 @@ class ISeekStream: public IStream {
   virtual size_t Tell(void) = 0;
 };
 
+/*! \brief seekable stream over a caller-provided, fixed size memory buffer (the buffer is not freed here) */
+struct MemoryFixSizeBuffer : public ISeekStream {
+ public:
+  MemoryFixSizeBuffer(void *p_buffer, size_t buffer_size)
+      : p_buffer_(reinterpret_cast<char*>(p_buffer)), buffer_size_(buffer_size) {
+    curr_ptr_ = 0;
+  }
+  virtual ~MemoryFixSizeBuffer(void) {}
+  virtual size_t Read(void *ptr, size_t size) {  // returns the bytes copied, can be less than size
+    utils::Assert(curr_ptr_ <= buffer_size_,
+                  "read can not have position excceed buffer length");
+    size_t nread = std::min(buffer_size_ - curr_ptr_, size);
+    if (nread != 0) memcpy(ptr, p_buffer_ + curr_ptr_, nread);
+    curr_ptr_ += nread;
+    return nread;
+  }
+  virtual void Write(const void *ptr, size_t size) {
+    if (size == 0) return;
+    utils::Assert(curr_ptr_ <= buffer_size_ && size <= buffer_size_ - curr_ptr_,  // cannot wrap around
+                  "write position exceed fixed buffer size");
+    memcpy(p_buffer_ + curr_ptr_, ptr, size);
+    curr_ptr_ += size;
+  }
+  virtual void Seek(size_t pos) {  // pos is not range checked here, Read/Write assert on it
+    curr_ptr_ = pos;
+  }
+  virtual size_t Tell(void) {
+    return curr_ptr_;
+  }
+
+ private:
+  /*! \brief in memory buffer */
+  char *p_buffer_;
+  /*! \brief total size of the buffer in bytes */
+  size_t buffer_size_;
+  /*! \brief current read/write position, as byte offset from the start of the buffer */
+  size_t curr_ptr_;
+}; // class MemoryFixSizeBuffer
+
 /*! \brief a in memory buffer that can be read and write as stream interface */
 struct MemoryBufferStream : public ISeekStream {
  public:
diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index 62dc36e6ce56..c27fa9bfe523 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -515,13 +515,95 @@ class QuantileSketchTemplate {
       }
     }
   };
+  /*!
+   * \brief represent an array of summaries stored in one flat buffer,
+   *  each summary owns a slot of fixed maximum size
+   */
+  class SummaryArray {
+   public:
+    /*!
+     * \brief initialize the SummaryArray
+     * \param num_summary number of summary in the array
+     * \param max_size maximum number of elements in each summary
+     */
+    inline void Init(unsigned num_summary, unsigned max_size) {
+      this->num_summary = num_summary;
+      this->max_size = max_size;
+      sizes.resize(num_summary);
+      data.resize(static_cast<size_t>(num_summary) * max_size);
+    }
+    /*!
+     * \brief set i-th element of array to be the src summary,
+     *   the summary can be pruned if it does not fit into max_size
+     * \param i the index in the array
+     * \param src the source summary
+     * \tparam TSrc the type of source summary
+     */
+    template<typename TSrc>
+    inline void Set(size_t i, const TSrc &src) {
+      Summary dst = (*this)[i];
+      dst.SetPrune(src, max_size);
+      this->sizes[i] = dst.size;
+    }
+    /*!
+     * \brief get i-th summary as a view into the array storage, only use this for read purpose
+     */
+    inline const Summary operator[](size_t i) const {
+      return Summary(const_cast<Entry*>(BeginPtr(data)) + i * max_size, sizes[i]);
+    }
+    /*!
+     * \brief do elementwise combination of summary array
+     *        this[i] = combine(this[i], src[i]) for each i
+     * \param src the source summary array, must have the same shape as this one
+     */
+    inline void Reduce(const SummaryArray &src) {
+      utils::Check(num_summary == src.num_summary &&
+                   max_size == src.max_size, "array shape mismatch in reduce");
+      SummaryContainer temp;
+      temp.Reserve(max_size * 2);
+      for (unsigned i = 0; i < num_summary; ++i) {
+        temp.SetCombine((*this)[i], src[i]);
+        this->Set(i, temp);
+      }
+    }
+    /*! \brief return the number of bytes this data structure cost in serialization */
+    inline size_t MemSize(void) const {
+      return sizeof(num_summary) + sizeof(max_size)
+          + data.size() * sizeof(Entry) + sizes.size() * sizeof(unsigned);
+    }
+    /*! \brief save the data structure into stream: num_summary, max_size, sizes, data */
+    inline void Save(IStream &fo) const {
+      fo.Write(&num_summary, sizeof(num_summary));
+      fo.Write(&max_size, sizeof(max_size));
+      fo.Write(BeginPtr(sizes), sizes.size() * sizeof(unsigned));
+      fo.Write(BeginPtr(data), data.size() * sizeof(Entry));
+    }
+    /*! \brief load data structure from input stream, a short read of any field is an error */
+    inline void Load(IStream &fi) {
+      utils::Check(fi.Read(&num_summary, sizeof(num_summary)) == sizeof(num_summary) &&
+                   fi.Read(&max_size, sizeof(max_size)) == sizeof(max_size), "invalid SummaryArray");
+      sizes.resize(num_summary); data.resize(static_cast<size_t>(num_summary) * max_size);
+      const size_t nsizes = sizes.size() * sizeof(unsigned), ndata = data.size() * sizeof(Entry);
+      utils::Check(fi.Read(BeginPtr(sizes), nsizes) == nsizes &&
+                   fi.Read(BeginPtr(data), ndata) == ndata, "invalid SummaryArray");
+    }
+
+   private:
+    /*! \brief number of summaries in the group */
+    unsigned num_summary;
+    /*! \brief maximum size of each summary */
+    unsigned max_size;
+    /*! \brief the current size of each summary */
+    std::vector<unsigned> sizes;
+    /*! \brief the data content, summary i occupies entries [i * max_size, (i + 1) * max_size) */
+    std::vector<Entry> data;
+  };
   /*! 
    * \brief intialize the quantile sketch, given the performance specification
    * \param maxn maximum number of data points can be feed into sketch
    * \param eps accuracy level of summary
    */
   inline void Init(size_t maxn, double eps) {
-    //nlevel = std::max(log2(ceil(maxn * eps)) - 2.0, 1.0);
     nlevel = 1;
     while (true) {
       limit_size = ceil(nlevel / eps) + 1;

From daa28f238e7d5097059dedbc341ed79c0138c51b Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 15 Nov 2014 21:02:19 -0800
Subject: [PATCH 064/166] fix compile, need final leaf node?

---
 src/tree/updater_histmaker-inl.hpp | 38 ++++++++++++++++++++----------
 src/utils/quantile.h               |  2 +-
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 97e4d0aea24d..72033613df8e 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -166,7 +166,8 @@ class HistMaker: public IUpdater {
   // initialize temp data structure
   inline void InitData(const std::vector<bst_gpair> &gpair,
                        const IFMatrix &fmat,
-                       const std::vector<unsigned> &root_index, const RegTree &tree) {
+                       const std::vector<unsigned> &root_index,
+                       const RegTree &tree) {
     utils::Assert(tree.param.num_nodes == tree.param.num_roots,
                   "HistMaker: can only grow new tree");
     {// setup position
@@ -271,6 +272,7 @@ class HistMaker: public IUpdater {
                              const TStats &node_sum,
                              bst_uint fid,
                              SplitEntry *best) {
+    if (hist.size == 0) return;
     double root_gain = node_sum.CalcGain(param);
     TStats s(param), c(param);
     for (bst_uint i = 0; i < hist.size; ++i) {
@@ -319,7 +321,7 @@ class HistMaker: public IUpdater {
         EnumerateSplit(wspace.hset[0][fid + wid * (num_feature+1)],
                        node_sum, fid, &best);
       }
-    }    
+    }
     // get the best result, we can synchronize the solution
     for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
       const int nid = qexpand[wid];
@@ -334,7 +336,8 @@ class HistMaker: public IUpdater {
       // now we know the solution in snode[nid], set split
       if (best.loss_chg > rt_eps) {
         p_tree->AddChilds(nid);
-        (*p_tree)[nid].set_split(best.split_index(), best.split_value, best.default_left());
+        (*p_tree)[nid].set_split(best.split_index(),
+                                 best.split_value, best.default_left());
         // mark right child as 0, to indicate fresh leaf
         (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
         (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
@@ -379,10 +382,12 @@ class QuantileHistMaker: public HistMaker<TStats> {
         const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
         int nid = this->position[ridx];
         if (nid >= 0) {
-          if (tree[nid].is_leaf()) {
-            this->position[ridx] = ~nid; 
-          } else {
+          if (!tree[nid].is_leaf()) {
             this->position[ridx] = nid = HistMaker<TStats>::NextLevel(inst, tree, nid);
+          } 
+          if (this->node2workindex[nid] < 0) {
+            this->position[ridx] = ~nid;
+          } else{
             for (bst_uint j = 0; j < inst.length; ++j) { 
               builder.AddBudget(inst[j].index, omp_get_thread_num());
             }
@@ -404,7 +409,7 @@ class QuantileHistMaker: public HistMaker<TStats> {
         }
       }
       // start putting things into sketch
-      const bst_omp_uint nfeat = tree.param.num_feature;
+      const bst_omp_uint nfeat = col_ptr.size() - 1;
       #pragma omp parallel for schedule(dynamic, 1)
       for (bst_omp_uint k = 0; k < nfeat; ++k) {
         for (size_t i = col_ptr[k]; i < col_ptr[k+1]; ++i) {
@@ -418,15 +423,23 @@ class QuantileHistMaker: public HistMaker<TStats> {
     size_t max_size = static_cast<size_t>(this->param.sketch_ratio / this->param.sketch_eps);
     // synchronize sketch
     summary_array.Init(sketchs.size(), max_size);
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      utils::WQuantileSketch<bst_float, bst_float>::SummaryContainer out;
+      sketchs[i].GetSummary(&out);
+      summary_array.Set(i, out);
+    }
     size_t n4bytes = (summary_array.MemSize() + 3) / 4;
-    sreducer.AllReduce(&summary_array, n4bytes);    
+    sreducer.AllReduce(&summary_array, n4bytes);
     // now we get the final result of sketch, setup the cut
-    for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {      
+    this->wspace.cut.clear();
+    this->wspace.rptr.clear();
+    this->wspace.rptr.push_back(0);
+    for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
       for (size_t fid = 0; fid < tree.param.num_feature; ++fid) {
-        const WXQSketch::Summary a = summary_array[wid * tree.param.num_feature + fid];        
+        const WXQSketch::Summary a = summary_array[wid * tree.param.num_feature + fid];
         for (size_t i = 0; i < a.size; ++i) {
           bst_float cpt = a.data[i].value + rt_eps;
-          if (i == 0 || cpt > this->wspace.cut.back()){
+          if (i == 0 || cpt > this->wspace.cut.back()) {
             this->wspace.cut.push_back(cpt);
           }
         }
@@ -437,7 +450,8 @@ class QuantileHistMaker: public HistMaker<TStats> {
       this->wspace.rptr.push_back(this->wspace.cut.size());
     }
     utils::Assert(this->wspace.rptr.size() ==
-                  (tree.param.num_feature + 1) * this->qexpand.size(), "cut space inconsistent");
+                  (tree.param.num_feature + 1) * this->qexpand.size() + 1,
+                  "cut space inconsistent");
   }
 
  private:
diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index c27fa9bfe523..a3b8c18dd611 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -258,7 +258,7 @@ struct WXQSummary : public WQSummary<DType, RType> {
     return  e.rmin_next() > e.rmax_prev() + chunk;
   }
   // set prune
-  inline void SetPrune(const WXQSummary &src, RType maxsize) {
+  inline void SetPrune(const WQSummary<DType, RType> &src, RType maxsize) {
     if (src.size <= maxsize) {
       this->CopyFrom(src); return;
     }

From 02c2278f9629020e153ba76040f9e44fdefa863b Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 15 Nov 2014 21:18:15 -0800
Subject: [PATCH 065/166] ok

---
 src/tree/updater_histmaker-inl.hpp | 42 +++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 72033613df8e..68ceca37195c 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -161,7 +161,11 @@ class HistMaker: public IUpdater {
       this->UpdateNode2WorkIndex(*p_tree);
       // if nothing left to be expand, break
       if (qexpand.size() == 0) break;
-    }   
+    }
+    for (size_t i = 0; i < qexpand.size(); ++i) {
+      const int nid = qexpand[i];
+      (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
+    }
   }
   // initialize temp data structure
   inline void InitData(const std::vector<bst_gpair> &gpair,
@@ -271,7 +275,8 @@ class HistMaker: public IUpdater {
   inline void EnumerateSplit(const HistUnit &hist, 
                              const TStats &node_sum,
                              bst_uint fid,
-                             SplitEntry *best) {
+                             SplitEntry *best,
+                             TStats *left_sum) {
     if (hist.size == 0) return;
     double root_gain = node_sum.CalcGain(param);
     TStats s(param), c(param);
@@ -281,7 +286,9 @@ class HistMaker: public IUpdater {
         c.SetSubstract(node_sum, s);
         if (c.sum_hess >= param.min_child_weight) {
           double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
-          best->Update(loss_chg, fid, hist.cut[i], false);
+          if (best->Update(loss_chg, fid, hist.cut[i], false)) {
+            *left_sum = s;
+          }
         }
       }
     }
@@ -292,7 +299,9 @@ class HistMaker: public IUpdater {
         c.SetSubstract(node_sum, s);
         if (c.sum_hess >= param.min_child_weight) {
           double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
-          best->Update(loss_chg, fid, hist.cut[i-1], true);
+          if (best->Update(loss_chg, fid, hist.cut[i-1], true)) {
+            *left_sum = c;
+          }
         }
       }
     }
@@ -309,17 +318,18 @@ class HistMaker: public IUpdater {
     this->CreateHist(gpair, p_fmat, info, *p_tree);
     // get the best split condition for each node
     std::vector<SplitEntry> sol(qexpand.size());
+    std::vector<TStats> left_sum(qexpand.size());    
     bst_omp_uint nexpand = static_cast<bst_omp_uint>(qexpand.size());
     #pragma omp parallel for schedule(dynamic, 1)
     for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
       const int nid = qexpand[wid];
       utils::Assert(node2workindex[nid] == static_cast<int>(wid),
                     "node2workindex inconsistent");
-      SplitEntry &best = sol[wid];     
+      SplitEntry &best = sol[wid];
       TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
       for (bst_uint fid = 0; fid < num_feature; ++ fid) {
         EnumerateSplit(wspace.hset[0][fid + wid * (num_feature+1)],
-                       node_sum, fid, &best);
+                       node_sum, fid, &best, &left_sum[wid]);
       }
     }
     // get the best result, we can synchronize the solution
@@ -327,25 +337,33 @@ class HistMaker: public IUpdater {
       const int nid = qexpand[wid];
       const SplitEntry &best = sol[wid];
       const TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
-      bst_float weight = node_sum.CalcWeight(param);
+      this->SetStats(p_tree, nid, node_sum);
       // set up the values
       p_tree->stat(nid).loss_chg = best.loss_chg;
-      p_tree->stat(nid).base_weight = weight;
-      p_tree->stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);
-      node_sum.SetLeafVec(param, p_tree->leafvec(nid));
       // now we know the solution in snode[nid], set split
       if (best.loss_chg > rt_eps) {
         p_tree->AddChilds(nid);
         (*p_tree)[nid].set_split(best.split_index(),
                                  best.split_value, best.default_left());
         // mark right child as 0, to indicate fresh leaf
-        (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
+        (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);        
         (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
+        // right side sum
+        TStats right_sum;
+        right_sum.SetSubstract(node_sum, left_sum[wid]);
+        this->SetStats(p_tree, (*p_tree)[nid].cleft(), left_sum[wid]);
+        this->SetStats(p_tree, (*p_tree)[nid].cright(), right_sum);
       } else {
-        (*p_tree)[nid].set_leaf(weight * param.learning_rate);
+        (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
       }
     }
   }
+  
+  inline void SetStats(RegTree *p_tree, int nid, const TStats &node_sum) {  // fill stat(nid) from node_sum
+      p_tree->stat(nid).base_weight = node_sum.CalcWeight(param);  // unscaled; callers apply learning_rate
+      p_tree->stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);  // stored narrowed to float
+      node_sum.SetLeafVec(param, p_tree->leafvec(nid));  // presumably fills the node's leaf vector - confirm
+  }
 };
 
 // hist maker that propose using quantile sketch

From 129fee64f32150547d331430bb4f208018ab468d Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 16 Nov 2014 11:38:21 -0800
Subject: [PATCH 066/166] fix regression

---
 demo/kaggle-higgs/higgs-cv.py      |  2 +-
 src/tree/updater_histmaker-inl.hpp | 22 +++++++++++++++-------
 src/utils/quantile.h               |  7 ++++++-
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/demo/kaggle-higgs/higgs-cv.py b/demo/kaggle-higgs/higgs-cv.py
index 3e36fa66b4e9..1d660aa8fa37 100755
--- a/demo/kaggle-higgs/higgs-cv.py
+++ b/demo/kaggle-higgs/higgs-cv.py
@@ -10,7 +10,7 @@
 data   = train[:,1:31]
 weight = train[:,31]
 dtrain = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
-param = {'max_depth':6, 'eta':0.1, 'silent':1, 'objective':'binary:logitraw', 'nthread':4}
+param = {'max_depth':6, 'eta':0.1, 'silent':1, 'objective':'binary:logitraw', 'nthread':4, 'updater':'grow_histmaker,prune'}
 num_round = 120
 
 print ('running cross validation, with preprocessing function')
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 68ceca37195c..8cade6313411 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -53,7 +53,7 @@ class HistMaker: public IUpdater {
                     const std::vector<bst_gpair> &gpair,
                     const BoosterInfo &info,
                     const bst_uint ridx) {
-      unsigned i = std::lower_bound(cut, cut + size, fv) - cut;
+      unsigned i = std::upper_bound(cut, cut + size, fv) - cut;      
       utils::Assert(i < size, "maximum value must be in cut");
       data[i].Add(gpair, info, ridx);
     }
@@ -155,7 +155,7 @@ class HistMaker: public IUpdater {
                       RegTree *p_tree) {
     this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
     this->UpdateNode2WorkIndex(*p_tree);
-    for (int depth = 0; depth < param.max_depth; ++depth) {      
+    for (int depth = 0; depth < param.max_depth; ++depth) {
       this->FindSplit(depth, gpair, p_fmat, info, p_tree);
       this->UpdateQueueExpand(*p_tree);
       this->UpdateNode2WorkIndex(*p_tree);
@@ -278,6 +278,7 @@ class HistMaker: public IUpdater {
                              SplitEntry *best,
                              TStats *left_sum) {
     if (hist.size == 0) return;
+
     double root_gain = node_sum.CalcGain(param);
     TStats s(param), c(param);
     for (bst_uint i = 0; i < hist.size; ++i) {
@@ -383,7 +384,7 @@ class QuantileHistMaker: public HistMaker<TStats> {
     sketchs.resize(this->qexpand.size() * tree.param.num_feature);
     for (size_t i = 0; i < sketchs.size(); ++i) {
       sketchs[i].Init(info.num_row, this->param.sketch_eps);
-    }    
+    }
     // start accumulating statistics
     utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
     iter->BeforeFirst();
@@ -453,14 +454,21 @@ class QuantileHistMaker: public HistMaker<TStats> {
     this->wspace.rptr.clear();
     this->wspace.rptr.push_back(0);
     for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
-      for (size_t fid = 0; fid < tree.param.num_feature; ++fid) {
+      for (int fid = 0; fid < tree.param.num_feature; ++fid) {
         const WXQSketch::Summary a = summary_array[wid * tree.param.num_feature + fid];
-        for (size_t i = 0; i < a.size; ++i) {
-          bst_float cpt = a.data[i].value + rt_eps;
-          if (i == 0 || cpt > this->wspace.cut.back()) {
+        for (size_t i = 1; i < a.size; ++i) {
+          bst_float cpt = a.data[i].value - rt_eps;
+          if (i == 1 || cpt > this->wspace.cut.back()) {
             this->wspace.cut.push_back(cpt);
           }
         }
+        // push a value that is greater than anything
+        if (a.size != 0) {
+          bst_float cpt = a.data[a.size - 1].value;
+          // this must be bigger than last value in a scale
+          bst_float last = cpt + fabs(cpt);
+          this->wspace.cut.push_back(last);
+        }
         this->wspace.rptr.push_back(this->wspace.cut.size());
       }
       // reserve last value for global statistics
diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index a3b8c18dd611..53117f28b3ab 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -10,6 +10,7 @@
 #include <cstring>
 #include <algorithm>
 #include <iostream>
+#include "./io.h"
 #include "./utils.h"
 
 namespace xgboost {
@@ -481,7 +482,11 @@ class QuantileSketchTemplate {
   /*! \brief same as summary, but use STL to backup the space */
   struct SummaryContainer : public Summary {
     std::vector<Entry> space;
-    SummaryContainer(void) : Summary(NULL, 0) { 
+    explicit SummaryContainer(void) : Summary(NULL, 0) { 
+    }
+    explicit SummaryContainer(const SummaryContainer &src) : Summary(NULL, src.size) { 
+      this->space = src.space;
+      this->data = BeginPtr(this->space);
     }
     /*! \brief reserve space for summary */
     inline void Reserve(size_t size) {

From 5061d55725bbae7d74883e151ad8f8bff159687a Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 16 Nov 2014 11:47:21 -0800
Subject: [PATCH 067/166] alrite

---
 demo/kaggle-higgs/speedtest.py     | 18 +++++++++---------
 src/tree/updater_histmaker-inl.hpp |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/demo/kaggle-higgs/speedtest.py b/demo/kaggle-higgs/speedtest.py
index c5cc2fd29dc5..2da9c86ef356 100755
--- a/demo/kaggle-higgs/speedtest.py
+++ b/demo/kaggle-higgs/speedtest.py
@@ -5,7 +5,6 @@
 # add path of xgboost python module
 sys.path.append('../../wrapper/')
 import xgboost as xgb
-from sklearn.ensemble import GradientBoostingClassifier
 import time
 test_size = 550000
 
@@ -38,29 +37,30 @@
 param['scale_pos_weight'] = sum_wneg/sum_wpos
 param['bst:eta'] = 0.1
 param['bst:max_depth'] = 6
-param['eval_metric'] = 'auc'
+#param['eval_metric'] = 'auc'
 param['silent'] = 1
+param['updater'] = sys.argv[1]
 param['nthread'] = 4
 
-plst = param.items()+[('eval_metric', 'ams@0.15')]
+#plst = param.items()+[('eval_metric', 'ams@0.15')]
 
 watchlist = [ (xgmat,'train') ]
 # boost 10 tres
 num_round = 10
 print ('loading data end, start to boost trees')
 print ("training GBM from sklearn")
-tmp = time.time()
-gbm = GradientBoostingClassifier(n_estimators=num_round, max_depth=6, verbose=2)
-gbm.fit(data, label)
-print ("sklearn.GBM costs: %s seconds" % str(time.time() - tmp))
+#tmp = time.time()
+#gbm = GradientBoostingClassifier(n_estimators=num_round, max_depth=6, verbose=2)
+#gbm.fit(data, label)
+#print ("sklearn.GBM costs: %s seconds" % str(time.time() - tmp))
 #raw_input()
 print ("training xgboost")
 threads = [1, 2, 4, 16]
 for i in threads:
     param['nthread'] = i
     tmp = time.time()
-    plst = param.items()+[('eval_metric', 'ams@0.15')]
-    bst = xgb.train( plst, xgmat, num_round, watchlist );
+    #plst = param.items()+[('eval_metric', 'ams@0.15')]
+    bst = xgb.train( param, xgmat, num_round, watchlist );
     print ("XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp)))
 
 print ('finish training')
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 8cade6313411..52622f00e672 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -466,7 +466,7 @@ class QuantileHistMaker: public HistMaker<TStats> {
         if (a.size != 0) {
           bst_float cpt = a.data[a.size - 1].value;
           // this must be bigger than last value in a scale
-          bst_float last = cpt + fabs(cpt);
+          bst_float last = cpt + fabs(cpt) + rt_eps;
           this->wspace.cut.push_back(last);
         }
         this->wspace.rptr.push_back(this->wspace.cut.size());

From 8ed585a7a20c59a32a09ee92a4ccdd7ade7b3727 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 16 Nov 2014 13:31:50 -0800
Subject: [PATCH 068/166] check in two bad ones, start think of column
 distribut cut row

---
 src/tree/updater.cpp               |  3 +-
 src/tree/updater_histmaker-inl.hpp | 88 +++++++++++++++++++++++++++++-
 2 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index faa38dc4bccf..0d0e6627440c 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -15,7 +15,8 @@ IUpdater* CreateUpdater(const char *name) {
   if (!strcmp(name, "prune")) return new TreePruner();
   if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
   if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
-  if (!strcmp(name, "grow_histmaker")) return new QuantileHistMaker<GradStats>();
+  if (!strcmp(name, "grow_qhistmaker")) return new QuantileHistMaker<GradStats>();
+  if (!strcmp(name, "grow_chistmaker")) return new ColumnHistMaker<GradStats>();
   if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
   if (!strcmp(name, "grow_colmaker5")) return new ColMaker< CVGradStats<5> >();
   if (!strcmp(name, "grow_colmaker3")) return new ColMaker< CVGradStats<3> >();
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 52622f00e672..7036010e750d 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -251,7 +251,10 @@ class HistMaker: public IUpdater {
         const int tid = omp_get_thread_num();
         HistSet &hset = wspace.hset[tid];
         const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        const int nid = position[ridx];
+        int nid = position[ridx];
+        if (!tree[nid].is_leaf()) {
+          this->position[ridx] = nid = HistMaker<TStats>::NextLevel(inst, tree, nid);
+        } 
         if (nid >= 0) {
           utils::Assert(tree[nid].is_leaf(), "CreateHist happens in leaf");
           const int wid = this->node2workindex[nid];
@@ -367,7 +370,88 @@ class HistMaker: public IUpdater {
   }
 };
 
-// hist maker that propose using quantile sketch
+template<typename TStats>
+class ColumnHistMaker: public HistMaker<TStats> {  // proposes cuts from per-feature column sketches
+ public:
+  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
+                                  IFMatrix *p_fmat,
+                                  const BoosterInfo &info,
+                                  const RegTree &tree) {  // gpair is not read; position is not touched here
+    sketchs.resize(tree.param.num_feature);  // one sketch per feature, shared by every expanding node
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      sketchs[i].Init(info.num_row, this->param.sketch_eps);
+    }
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+    while (iter->Next()) {
+      const ColBatch &batch = iter->Value();
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(dynamic, 1)
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
+        const bst_uint fid = batch.col_index[i];
+        const ColBatch::Inst &col = batch[i];
+        unsigned nstep = col.length * (this->param.sketch_eps / this->param.sketch_ratio);  // sampling stride
+        if (nstep == 0) nstep = 1;  // short columns: visit every entry and keep the loop advancing
+        for (unsigned j = 0; j < col.length; j += nstep) {  // j, so the OpenMP index i is not shadowed
+          sketchs[fid].Push(col[j].fvalue);
+        }
+        if (col.length != 0 && (col.length - 1) % nstep != 0)  {  // last entry was skipped by the stride
+          sketchs[fid].Push(col[col.length-1].fvalue);
+        }
+      }
+    }
+    // size handed to summary_array.Init for each of the per-feature summaries
+    size_t max_size = static_cast<size_t>(this->param.sketch_ratio / this->param.sketch_eps);
+    // pack every local summary into summary_array, then all-reduce it across nodes
+    summary_array.Init(sketchs.size(), max_size);
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      utils::WQuantileSketch<bst_float, bst_float>::SummaryContainer out;
+      sketchs[i].GetSummary(&out);
+      summary_array.Set(i, out);
+    }
+    size_t n4bytes = (summary_array.MemSize() + 3) / 4;  // byte size rounded up to 4-byte units
+    sreducer.AllReduce(&summary_array, n4bytes);
+    // summaries are final: emit (num_feature + 1) cut ranges per expanding node
+    this->wspace.cut.clear();
+    this->wspace.rptr.clear();
+    this->wspace.rptr.push_back(0);
+    for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
+      for (int fid = 0; fid < tree.param.num_feature; ++fid) {
+        const WXQSketch::Summary a = summary_array[fid];  // same summary for every wid
+        for (size_t i = 1; i < a.size; ++i) {
+          bst_float cpt = a.data[i].value - rt_eps;  // cut placed just below the i-th summary value
+          if (i == 1 || cpt > this->wspace.cut.back()) {  // skip cuts that do not increase
+            this->wspace.cut.push_back(cpt);
+          }
+        }
+        // close the feature's range with a cut above its largest summary value
+        if (a.size != 0) {
+          bst_float cpt = a.data[a.size - 1].value;
+          // cpt + |cpt| + rt_eps exceeds cpt for any sign of cpt, including zero
+          bst_float last = cpt + fabs(cpt) + rt_eps;
+          this->wspace.cut.push_back(last);
+        }
+        this->wspace.rptr.push_back(this->wspace.cut.size());
+      }
+      // one extra single-entry range per node, reserved for global statistics
+      this->wspace.cut.push_back(0.0f);
+      this->wspace.rptr.push_back(this->wspace.cut.size());
+    }
+    utils::Assert(this->wspace.rptr.size() ==
+                  (tree.param.num_feature + 1) * this->qexpand.size() + 1,
+                  "cut space inconsistent");
+  }
+
+ private:
+  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
+  // fixed-size array of summaries, the payload of the all-reduce
+  WXQSketch::SummaryArray summary_array;
+  // reducer used to all-reduce summary_array
+  sync::ComplexReducer<WXQSketch::SummaryArray> sreducer;
+  // local sketch of each feature, indexed by feature id
+  std::vector< utils::WQuantileSketch<bst_float, bst_float> > sketchs;
+};
+
+
 template<typename TStats>
 class QuantileHistMaker: public HistMaker<TStats> {  
  protected:

From d11445e0b11d5d7df30548aadf20898dfd4d5dcd Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 16 Nov 2014 22:01:22 -0800
Subject: [PATCH 069/166] add in sync

---
 src/tree/updater.cpp              |  2 ++
 src/tree/updater_colmaker-inl.hpp |  2 +-
 src/tree/updater_prune-inl.hpp    | 27 ++++------------
 src/tree/updater_sync-inl.hpp     | 54 +++++++++++++++++++++++++++++++
 4 files changed, 63 insertions(+), 22 deletions(-)
 create mode 100644 src/tree/updater_sync-inl.hpp

diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index 0d0e6627440c..f6e669ffad2c 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -2,6 +2,7 @@
 #define _CRT_SECURE_NO_DEPRECATE
 #include <cstring>
 #include "./updater.h"
+#include "./updater_sync-inl.hpp"
 #include "./updater_prune-inl.hpp"
 #include "./updater_refresh-inl.hpp"
 #include "./updater_colmaker-inl.hpp"
@@ -13,6 +14,7 @@ namespace tree {
 IUpdater* CreateUpdater(const char *name) {
   using namespace std;
   if (!strcmp(name, "prune")) return new TreePruner();
+  if (!strcmp(name, "sync")) return new TreeSyncher();
   if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
   if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
   if (!strcmp(name, "grow_qhistmaker")) return new QuantileHistMaker<GradStats>();
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp
index 6db19732e1b8..6f8cb35d3ad8 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -14,7 +14,7 @@
 
 namespace xgboost {
 namespace tree {
-/*! \brief pruner that prunes a tree after growing finishs */
+/*! \brief column-wise update to construct a tree */
 template<typename TStats>
 class ColMaker: public IUpdater {
  public:
diff --git a/src/tree/updater_prune-inl.hpp b/src/tree/updater_prune-inl.hpp
index a68404ba7df7..e7e5f9f0b752 100644
--- a/src/tree/updater_prune-inl.hpp
+++ b/src/tree/updater_prune-inl.hpp
@@ -8,7 +8,7 @@
 #include <vector>
 #include "./param.h"
 #include "./updater.h"
-#include "../sync/sync.h"
+#include "./updater_sync-inl.hpp"
 
 namespace xgboost {
 namespace tree {
@@ -20,6 +20,7 @@ class TreePruner: public IUpdater {
   virtual void SetParam(const char *name, const char *val) {
     using namespace std;
     param.SetParam(name, val);
+    syncher.SetParam(name, val);
     if (!strcmp(name, "silent")) silent = atoi(val);
   }
   // update the tree, do pruning
@@ -34,27 +35,9 @@ class TreePruner: public IUpdater {
       this->DoPrune(*trees[i]);
     }
     param.learning_rate = lr;
-    this->SyncTrees(trees);
-  }  
- private:
-  // synchronize the trees in different nodes, take tree from rank 0
-  inline void SyncTrees(const std::vector<RegTree *> &trees) {
-    if (sync::GetWorldSize() == 1) return;
-    std::string s_model;
-    utils::MemoryBufferStream fs(&s_model);
-    int rank = sync::GetRank();
-    if (rank == 0) {
-      for (size_t i = 0; i < trees.size(); ++i) {
-        trees[i]->SaveModel(fs);
-      }
-      sync::Bcast(&s_model, 0);
-    } else {
-      sync::Bcast(&s_model, 0);
-      for (size_t i = 0; i < trees.size(); ++i) {      
-        trees[i]->LoadModel(fs);
-      }
-    }
+    syncher.Update(gpair, p_fmat, info, trees);
   }
+ private:
   // try to prune off current leaf
   inline int TryPruneLeaf(RegTree &tree, int nid, int depth, int npruned) {
     if (tree[nid].is_root()) return npruned;
@@ -89,6 +72,8 @@ class TreePruner: public IUpdater {
   }
 
  private:
+  // synchronizer
+  TreeSyncher syncher;
   // shutup
   int silent;
   // training parameter
diff --git a/src/tree/updater_sync-inl.hpp b/src/tree/updater_sync-inl.hpp
new file mode 100644
index 000000000000..68a6096168d3
--- /dev/null
+++ b/src/tree/updater_sync-inl.hpp
@@ -0,0 +1,54 @@
+#ifndef XGBOOST_TREE_UPDATER_SYNC_INL_HPP_
+#define XGBOOST_TREE_UPDATER_SYNC_INL_HPP_
+/*!
+ * \file updater_sync-inl.hpp
+ * \brief synchronize the tree in all distributed nodes
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include <limits>
+#include "./updater.h"
+#include "../sync/sync.h"
+
+namespace xgboost {
+namespace tree {
+/*!
+ * \brief updater that synchronizes the trees across all distributed nodes;
+ * other strategies could be implemented, so far every node takes rank 0's trees
+ */
+class TreeSyncher: public IUpdater {
+ public:
+  virtual ~TreeSyncher(void) {}
+  virtual void SetParam(const char *name, const char *val) {  // intentionally empty: nothing to configure
+  }
+  // synchronize the given trees; gpair, p_fmat and info are not used
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) {
+    this->SyncTrees(trees);
+  }
+  
+ private:
+  // overwrite the trees on every other rank with the serialized trees of rank 0
+  inline void SyncTrees(const std::vector<RegTree *> &trees) {
+    if (sync::GetWorldSize() == 1) return;  // single process: nothing to synchronize
+    std::string s_model;
+    utils::MemoryBufferStream fs(&s_model);  // stream reading from / writing to s_model
+    int rank = sync::GetRank();
+    if (rank == 0) {
+      for (size_t i = 0; i < trees.size(); ++i) {
+        trees[i]->SaveModel(fs);  // serialize all trees back to back
+      }
+      sync::Bcast(&s_model, 0);  // presumably sends s_model from rank 0 - confirm in sync.h
+    } else {
+      sync::Bcast(&s_model, 0);  // presumably receives rank 0's buffer into s_model
+      for (size_t i = 0; i < trees.size(); ++i) {
+        trees[i]->LoadModel(fs);  // read the trees in the order they were saved
+      }
+    }
+  }
+};
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_UPDATER_SYNC_INL_HPP_

From 8874234e5e06b2712228f975248de6459b6b1efa Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 16 Nov 2014 22:23:33 -0800
Subject: [PATCH 070/166] check in basemaker

---
 src/tree/updater_basemaker-inl.hpp | 148 +++++++++++++++++++++++++++++
 src/tree/updater_histmaker-inl.hpp |  92 +-----------------
 2 files changed, 151 insertions(+), 89 deletions(-)
 create mode 100644 src/tree/updater_basemaker-inl.hpp

diff --git a/src/tree/updater_basemaker-inl.hpp b/src/tree/updater_basemaker-inl.hpp
new file mode 100644
index 000000000000..17469658f9dc
--- /dev/null
+++ b/src/tree/updater_basemaker-inl.hpp
@@ -0,0 +1,148 @@
+#ifndef XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
+#define XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
+/*!
+ * \file updater_basemaker-inl.hpp
+ * \brief implement a common tree constructor
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include <algorithm>
+#include "../utils/random.h"
+
+namespace xgboost {
+namespace tree {
+/*!
+ * \brief base class for tree makers: holds the training parameter, the expansion
+ *  queue and the per-row positions, plus helpers shared by the derived makers
+ */
+class BaseMaker: public IUpdater {
+ public:
+  // virtual destructor, nothing to release
+  virtual ~BaseMaker(void) {}
+  // forward a (name, value) setting to the training parameter
+  virtual void SetParam(const char *name, const char *val) {
+    param.SetParam(name, val);
+  }
+  
+ protected:
+  // ------static helper functions ------
+  // return the child of nid that inst goes to: left if the split feature's value is below
+  // split_cond(), right otherwise, cdefault() if the feature is absent; nid must be a non-leaf
+  inline static int NextLevel(const SparseBatch::Inst &inst, const RegTree &tree, int nid) {
+    const RegTree::Node &n = tree[nid];
+    bst_uint findex = n.split_index();
+    for (unsigned i = 0; i < inst.length; ++i) {  // linear scan over the entries of the row
+      if (findex == inst[i].index) {
+        if (inst[i].fvalue < n.split_cond()) {
+          return n.cleft();
+        } else {
+          return n.cright();
+        }
+      }
+    }
+    return n.cdefault();  // split feature not present in this row
+  }
+  /*! \brief number of threads OpenMP uses for a parallel region started from here */
+  inline static int get_nthread(void) {
+    int nthread;
+    #pragma omp parallel
+    {
+      nthread = omp_get_num_threads();  // NOTE(review): every thread writes the same value to the shared variable
+    }
+    return nthread;
+  }
+  // ------class member helpers---------
+  // return the node id stored for row ridx, undoing the ~ encoding used for negative entries
+  inline int DecodePosition(bst_uint ridx) const{
+    const int pid = position[ridx];
+    return pid < 0 ? ~pid : pid;
+  }
+  // store nid for row ridx, keeping the encoding (plain or ~) the row currently has
+  inline void SetEncodePosition(bst_uint ridx, int nid) {
+    if (position[ridx] < 0) {
+      position[ridx] = ~nid;
+    } else {
+      position[ridx] = nid;
+    }
+  }
+  /*! \brief set up position and qexpand for growing a fresh tree; fmat is not used */
+  inline void InitData(const std::vector<bst_gpair> &gpair,
+                       const IFMatrix &fmat,
+                       const std::vector<unsigned> &root_index,
+                       const RegTree &tree) {
+    utils::Assert(tree.param.num_nodes == tree.param.num_roots,
+                  "TreeMaker: can only grow new tree");
+    {// setup position: one entry per row, starting at the row's root node
+      position.resize(gpair.size());
+      if (root_index.size() == 0) {
+        std::fill(position.begin(), position.end(), 0);  // no root_index given: all rows start at node 0
+      } else {
+        for (size_t i = 0; i < position.size(); ++i) {
+          position[i] = root_index[i];
+          utils::Assert(root_index[i] < (unsigned)tree.param.num_roots,
+                        "root index exceed setting");
+        }
+      }
+      // rows with a negative hessian are treated as deleted: switch them to the ~ encoding
+      for (size_t i = 0; i < position.size(); ++i) {
+        if (gpair[i].hess < 0.0f) position[i] = ~position[i];
+      }
+      // subsample: ~-encode each remaining row for which SampleBinary(param.subsample) returns 0
+      if (param.subsample < 1.0f) {
+        for (size_t i = 0; i < position.size(); ++i) {
+          if (gpair[i].hess < 0.0f) continue;  // already encoded above, do not flip back
+          if (random::SampleBinary(param.subsample) == 0) position[i] = ~position[i];
+        }
+      }
+    }
+    {// expansion queue starts with all root nodes
+      qexpand.reserve(256); qexpand.clear();
+      for (int i = 0; i < tree.param.num_roots; ++i) {
+        qexpand.push_back(i);
+      }
+      this->UpdateNode2WorkIndex(tree);
+    }
+  }
+  /*! \brief replace qexpand with the children of the queued nodes that are no longer leaves */
+  inline void UpdateQueueExpand(const RegTree &tree) {
+    std::vector<int> newnodes;
+    for (size_t i = 0; i < qexpand.size(); ++i) {
+      const int nid = qexpand[i];
+      if (!tree[nid].is_leaf()) {
+        newnodes.push_back(tree[nid].cleft());
+        newnodes.push_back(tree[nid].cright());
+      }
+    }
+    // the children become the new queue; refresh the node -> work index map to match
+    qexpand = newnodes;
+    this->UpdateNode2WorkIndex(tree);
+  }
+  /*! \brief training parameter of tree grower */
+  TrainParam param;
+  /*! \brief queue of nodes to be expanded */
+  std::vector<int> qexpand;
+  /*!
+   * \brief map an active node to its working index (offset) in qexpand,
+   *   can be -1, which means the node is not actively expanding
+   */
+  std::vector<int> node2workindex;
+  /*!
+   * \brief position of each instance in the tree
+   *   can be negative, which means this position is no longer expanding
+   *   see also DecodePosition/SetEncodePosition
+   */
+  std::vector<int> position;
+
+ private:
+  inline void UpdateNode2WorkIndex(const RegTree &tree) {
+    // reset the existing entries to -1, grow to the current node count, then index qexpand
+    std::fill(node2workindex.begin(), node2workindex.end(), -1);
+    node2workindex.resize(tree.param.num_nodes);  // NOTE(review): new slots are 0, not -1, until set below
+    for (size_t i = 0; i < qexpand.size(); ++i) {
+      node2workindex[qexpand[i]] = static_cast<int>(i);
+    }
+  }
+};
+}  // namespace tree
+}  // namespace xgboost
+#endif // XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 7036010e750d..6080085157a3 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -10,17 +10,14 @@
 #include "../sync/sync.h"
 #include "../utils/quantile.h"
 #include "../utils/group_data.h"
+#include "./updater_basemaker-inl.hpp"
 
 namespace xgboost {
 namespace tree {
 template<typename TStats>
-class HistMaker: public IUpdater {
+class HistMaker: public BaseMaker {
  public:
   virtual ~HistMaker(void) {}
-  // set training parameter
-  virtual void SetParam(const char *name, const char *val) {
-    param.SetParam(name, val);
-  }
   virtual void Update(const std::vector<bst_gpair> &gpair,
                       IFMatrix *p_fmat,
                       const BoosterInfo &info,
@@ -113,34 +110,11 @@ class HistMaker: public IUpdater {
       return rptr.size() - 1;
     }
   };  
-  // training parameter
-  TrainParam param;
   // workspace of thread
   ThreadWSpace wspace;
-  // position of each data
-  std::vector<int> position;
-  /*! \brief queue of nodes to be expanded */
-  std::vector<int> qexpand;
-  /*! \brief map active node to is working index offset in qexpand*/
-  std::vector<int> node2workindex;
   // reducer for histogram
   sync::Reducer<TStats> histred;  
-  // helper function to get to next level of the tree
-  // must work on non-leaf node
-  inline static int NextLevel(const SparseBatch::Inst &inst, const RegTree &tree, int nid) {
-    const RegTree::Node &n = tree[nid];
-    bst_uint findex = n.split_index();
-    for (unsigned i = 0; i < inst.length; ++i) {
-      if (findex == inst[i].index) {
-        if (inst[i].fvalue < n.split_cond()) {
-          return n.cleft();
-        } else {
-          return n.cright();
-        }
-      }
-    }
-    return n.cdefault();
-  }
+
   // this function does two jobs
   // (1) reset the position in array position, to be the latest leaf id
   // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly 
@@ -154,11 +128,9 @@ class HistMaker: public IUpdater {
                       const BoosterInfo &info,
                       RegTree *p_tree) {
     this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
-    this->UpdateNode2WorkIndex(*p_tree);
     for (int depth = 0; depth < param.max_depth; ++depth) {
       this->FindSplit(depth, gpair, p_fmat, info, p_tree);
       this->UpdateQueueExpand(*p_tree);
-      this->UpdateNode2WorkIndex(*p_tree);
       // if nothing left to be expand, break
       if (qexpand.size() == 0) break;
     }
@@ -167,64 +139,6 @@ class HistMaker: public IUpdater {
       (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
     }
   }
-  // initialize temp data structure
-  inline void InitData(const std::vector<bst_gpair> &gpair,
-                       const IFMatrix &fmat,
-                       const std::vector<unsigned> &root_index,
-                       const RegTree &tree) {
-    utils::Assert(tree.param.num_nodes == tree.param.num_roots,
-                  "HistMaker: can only grow new tree");
-    {// setup position
-      position.resize(gpair.size());
-      if (root_index.size() == 0) {
-        std::fill(position.begin(), position.end(), 0);
-      } else {
-        for (size_t i = 0; i < position.size(); ++i) {
-          position[i] = root_index[i];
-          utils::Assert(root_index[i] < (unsigned)tree.param.num_roots,
-                        "root index exceed setting");
-        }
-      }
-      // mark delete for the deleted datas
-      for (size_t i = 0; i < position.size(); ++i) {
-        if (gpair[i].hess < 0.0f) position[i] = ~position[i];
-      }
-      // mark subsample
-      if (param.subsample < 1.0f) {
-        for (size_t i = 0; i < position.size(); ++i) {
-          if (gpair[i].hess < 0.0f) continue;
-          if (random::SampleBinary(param.subsample) == 0) position[i] = ~position[i];
-        }
-      }
-    }
-    {// expand query
-      qexpand.reserve(256); qexpand.clear();
-      for (int i = 0; i < tree.param.num_roots; ++i) {
-        qexpand.push_back(i);
-      }
-    }
-  }
-  /*! \brief update queue expand add in new leaves */
-  inline void UpdateQueueExpand(const RegTree &tree) {
-    std::vector<int> newnodes;
-    for (size_t i = 0; i < qexpand.size(); ++i) {
-      const int nid = qexpand[i];
-      if (!tree[nid].is_leaf()) {
-        newnodes.push_back(tree[nid].cleft());
-        newnodes.push_back(tree[nid].cright());
-      }
-    }
-    // use new nodes for qexpand
-    qexpand = newnodes;
-  }
-  inline void UpdateNode2WorkIndex(const RegTree &tree) {
-    // update the node2workindex
-    std::fill(node2workindex.begin(), node2workindex.end(), -1);
-    node2workindex.resize(tree.param.num_nodes);
-    for (size_t i = 0; i < qexpand.size(); ++i) {
-      node2workindex[qexpand[i]] = static_cast<int>(i);
-    }
-  }
   inline void CreateHist(const std::vector<bst_gpair> &gpair,
                          IFMatrix *p_fmat,
                          const BoosterInfo &info,

From 5e8e9a9b743e837ccb1e6bf491d153aef6977d9c Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 17 Nov 2014 10:49:53 -0800
Subject: [PATCH 071/166] updated base

---
 src/tree/updater_basemaker-inl.hpp | 110 +++++++++++++++++++++++++----
 1 file changed, 95 insertions(+), 15 deletions(-)

diff --git a/src/tree/updater_basemaker-inl.hpp b/src/tree/updater_basemaker-inl.hpp
index 17469658f9dc..f37335004e1d 100644
--- a/src/tree/updater_basemaker-inl.hpp
+++ b/src/tree/updater_basemaker-inl.hpp
@@ -27,8 +27,8 @@ class BaseMaker: public IUpdater {
  protected:
   // ------static helper functions ------
   // helper function to get to next level of the tree
-  // must work on non-leaf node
-  inline static int NextLevel(const SparseBatch::Inst &inst, const RegTree &tree, int nid) {
+  /*! \brief this is  helper function for row based data*/
+  inline static int NextLevel(const RowBatch::Inst &inst, const RegTree &tree, int nid) {
     const RegTree::Node &n = tree[nid];
     bst_uint findex = n.split_index();
     for (unsigned i = 0; i < inst.length; ++i) {
@@ -52,19 +52,6 @@ class BaseMaker: public IUpdater {
     return nthread;
   }
   // ------class member helpers---------
-  // return decoded position
-  inline int DecodePosition(bst_uint ridx) const{
-    const int pid = position[ridx];
-    return pid < 0 ? ~pid : pid;
-  }
-  // encode the encoded position value for ridx
-  inline void SetEncodePosition(bst_uint ridx, int nid) {
-    if (position[ridx] < 0) {
-      position[ridx] = ~nid;
-    } else {
-      position[ridx] = nid;
-    }
-  }
   /*! \brief initialize temp data structure */
   inline void InitData(const std::vector<bst_gpair> &gpair,
                        const IFMatrix &fmat,
@@ -117,6 +104,99 @@ class BaseMaker: public IUpdater {
     qexpand = newnodes;
     this->UpdateNode2WorkIndex(tree);
   }
+  // return decoded position
+  inline int DecodePosition(bst_uint ridx) const{
+    const int pid = position[ridx];
+    return pid < 0 ? ~pid : pid;
+  }
+  // encode the encoded position value for ridx
+  inline void SetEncodePosition(bst_uint ridx, int nid) {
+    if (position[ridx] < 0) {
+      position[ridx] = ~nid;
+    } else {
+      position[ridx] = nid;
+    }
+  }
+  /*! 
+   * \brief this is helper function uses column based data structure,
+   *        reset the positions to the lastest one
+   * \param nodes the set of nodes that contains the split to be used
+   * \param p_fmat feature matrix needed for tree construction
+   * \param tree the regression tree structure
+   */
+  inline void ResetPositionCol(const std::vector<int> &nodes, IFMatrix *p_fmat, const RegTree &tree) {
+    // set the positions in the nondefault
+    this->SetNonDefaultPositionCol(nodes, p_fmat, tree);
+    // set rest of instances to default position
+    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+    // set default direct nodes to default
+    // for leaf nodes that are not fresh, mark then to ~nid, 
+    // so that they are ignored in future statistics collection
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+    
+    #pragma omp parallel for schedule(static)
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
+      const bst_uint ridx = rowset[i];
+      const int nid = this->DecodePosition(ridx);
+      if (tree[nid].is_leaf()) {
+        // mark finish when it is not a fresh leaf
+        if (tree[nid].cright() == -1) {
+          position[ridx] = ~nid;
+        }
+        } else {
+        // push to default branch
+        if (tree[nid].default_left()) {
+          this->SetEncodePosition(ridx, tree[nid].cleft());
+        } else {
+          this->SetEncodePosition(ridx, tree[nid].cright());
+        }
+      }
+    }
+  }
+  /*!
+   * \brief this is helper function uses column based data structure,
+   *        update all positions into nondefault branch, if any, ignore the default branch
+   * \param nodes the set of nodes that contains the split to be used
+   * \param p_fmat feature matrix needed for tree construction
+   * \param tree the regression tree structure
+   */
+  virtual void SetNonDefaultPositionCol(const std::vector<int> &nodes,
+                                        IFMatrix *p_fmat, const RegTree &tree) {
+    // step 1, classify the non-default data into right places
+    std::vector<unsigned> fsplits;
+    for (size_t i = 0; i < nodes.size(); ++i) {
+      const int nid = nodes[i];
+      if (!tree[nid].is_leaf()) {
+        fsplits.push_back(tree[nid].split_index());
+      }
+    }
+    std::sort(fsplits.begin(), fsplits.end());
+    fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
+    
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
+    while (iter->Next()) {
+      const ColBatch &batch = iter->Value();
+      for (size_t i = 0; i < batch.size; ++i) {
+        ColBatch::Inst col = batch[i];
+        const bst_uint fid = batch.col_index[i];
+        const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint j = 0; j < ndata; ++j) {
+          const bst_uint ridx = col[j].index;
+          const float fvalue = col[j].fvalue;
+          const int nid = this->DecodePosition(ridx);
+          // go back to parent, correct those who are not default
+          if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
+            if(fvalue < tree[nid].split_cond()) {
+              this->SetEncodePosition(ridx, tree[nid].cleft());
+            } else {
+              this->SetEncodePosition(ridx, tree[nid].cright());
+            }
+          }
+        }
+      }
+    }
+  }
   /*! \brief training parameter of tree grower */
   TrainParam param;
   /*! \brief queue of nodes to be expanded */

From 5de0a2cdc0eabe7a052940532e4cffb9f80a939e Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 18 Nov 2014 10:19:18 -0800
Subject: [PATCH 072/166] sorted base sketch maker

---
 src/tree/updater.cpp               |   1 +
 src/tree/updater_colmaker-inl.hpp  |   2 +-
 src/tree/updater_histmaker-inl.hpp | 232 +++++++++++++++++++++++++++--
 src/utils/quantile.h               |  47 +++---
 4 files changed, 248 insertions(+), 34 deletions(-)

diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index f6e669ffad2c..495d589f7e19 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -18,6 +18,7 @@ IUpdater* CreateUpdater(const char *name) {
   if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
   if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
   if (!strcmp(name, "grow_qhistmaker")) return new QuantileHistMaker<GradStats>();
+  if (!strcmp(name, "grow_cqmaker")) return new CQHistMaker<GradStats>();
   if (!strcmp(name, "grow_chistmaker")) return new ColumnHistMaker<GradStats>();
   if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
   if (!strcmp(name, "grow_colmaker5")) return new ColMaker< CVGradStats<5> >();
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp
index 6f8cb35d3ad8..61b7fbf7a3ea 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -428,7 +428,7 @@ class ColMaker: public IUpdater {
         for (bst_omp_uint i = 0; i < nsize; ++i) {
           const bst_uint fid = batch.col_index[i];
           const int tid = omp_get_thread_num();
-          const ColBatch::Inst c = batch[i];          
+          const ColBatch::Inst c = batch[i];
           if (param.need_forward_search(fmat.GetColDensity(fid))) {
             this->EnumerateSplit(c.data, c.data + c.length, +1, 
                                  fid, gpair, info, stemp[tid]);
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 6080085157a3..ca5a5dc725a6 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -50,8 +50,10 @@ class HistMaker: public BaseMaker {
                     const std::vector<bst_gpair> &gpair,
                     const BoosterInfo &info,
                     const bst_uint ridx) {
-      unsigned i = std::upper_bound(cut, cut + size, fv) - cut;      
-      utils::Assert(i < size, "maximum value must be in cut");
+      unsigned i = std::upper_bound(cut, cut + size, fv) - cut;
+      utils::Assert(size != 0, "try insert into size=0");
+      utils::Assert(i < size, 
+                    "maximum value must be in cut, fv = %g, cutmax=%g", fv, cut[size-1]);
       data[i].Add(gpair, info, ridx);
     }
   };
@@ -122,7 +124,7 @@ class HistMaker: public BaseMaker {
                                   IFMatrix *p_fmat,
                                   const BoosterInfo &info,
                                   const RegTree &tree)  = 0;  
- private:
+
   virtual void Update(const std::vector<bst_gpair> &gpair,
                       IFMatrix *p_fmat,
                       const BoosterInfo &info,
@@ -130,6 +132,7 @@ class HistMaker: public BaseMaker {
     this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
     for (int depth = 0; depth < param.max_depth; ++depth) {
       this->FindSplit(depth, gpair, p_fmat, info, p_tree);
+      this->ResetPositionCol(this->qexpand, p_fmat, *p_tree);
       this->UpdateQueueExpand(*p_tree);
       // if nothing left to be expand, break
       if (qexpand.size() == 0) break;
@@ -139,6 +142,8 @@ class HistMaker: public BaseMaker {
       (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
     }
   }
+
+ private:
   inline void CreateHist(const std::vector<bst_gpair> &gpair,
                          IFMatrix *p_fmat,
                          const BoosterInfo &info,
@@ -166,11 +171,7 @@ class HistMaker: public BaseMaker {
         HistSet &hset = wspace.hset[tid];
         const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
         int nid = position[ridx];
-        if (!tree[nid].is_leaf()) {
-          this->position[ridx] = nid = HistMaker<TStats>::NextLevel(inst, tree, nid);
-        } 
         if (nid >= 0) {
-          utils::Assert(tree[nid].is_leaf(), "CreateHist happens in leaf");
           const int wid = this->node2workindex[nid];
           for (bst_uint i = 0; i < inst.length; ++i) {
             utils::Assert(inst[i].index < num_feature, "feature index exceed bound");
@@ -365,6 +366,217 @@ class ColumnHistMaker: public HistMaker<TStats> {
   std::vector< utils::WQuantileSketch<bst_float, bst_float> > sketchs;
 };
 
+template<typename TStats>
+class CQHistMaker: public HistMaker<TStats> {
+ protected:
+  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
+                                  IFMatrix *p_fmat,
+                                  const BoosterInfo &info,
+                                  const RegTree &tree) {
+    sketchs.resize(this->qexpand.size() * tree.param.num_feature);
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      sketchs[i].Init(info.num_row, this->param.sketch_eps);
+    }
+    std::vector< std::vector<SketchEntry> > stemp;
+    stemp.resize(this->get_nthread());
+
+    // start accumulating statistics
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const ColBatch &batch = iter->Value();
+      // start enumeration
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(dynamic, 1)
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
+        this->MakeSketch(gpair, batch[i], tree, batch.col_index[i],
+                         &stemp[omp_get_thread_num()]);       
+      }
+    }
+    // setup maximum size
+    size_t max_size = static_cast<size_t>(this->param.sketch_ratio / this->param.sketch_eps);
+    // synchronize sketch
+    summary_array.Init(sketchs.size(), max_size);
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
+      sketchs[i].GetSummary(&out);
+      summary_array.Set(i, out);
+    }
+    size_t n4bytes = (summary_array.MemSize() + 3) / 4;
+    sreducer.AllReduce(&summary_array, n4bytes);
+    // now we get the final result of sketch, setup the cut
+    this->wspace.cut.clear();
+    this->wspace.rptr.clear();
+    this->wspace.rptr.push_back(0);
+    for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
+      for (int fid = 0; fid < tree.param.num_feature; ++fid) {
+        const WXQSketch::Summary a = summary_array[wid * tree.param.num_feature + fid];
+        for (size_t i = 1; i < a.size; ++i) {
+          bst_float cpt = a.data[i].value - rt_eps;
+          if (i == 1 || cpt > this->wspace.cut.back()) {
+            this->wspace.cut.push_back(cpt);
+          }
+        }
+        // push a value that is greater than anything
+        if (a.size != 0) {
+          bst_float cpt = a.data[a.size - 1].value;
+          // this must be bigger than last value in a scale
+          bst_float last = cpt + fabs(cpt) + rt_eps;
+          this->wspace.cut.push_back(last);
+        }
+        this->wspace.rptr.push_back(this->wspace.cut.size());
+      }
+      // reserve last value for global statistics
+      this->wspace.cut.push_back(0.0f);
+      this->wspace.rptr.push_back(this->wspace.cut.size());
+    }
+    utils::Assert(this->wspace.rptr.size() ==
+                  (tree.param.num_feature + 1) * this->qexpand.size() + 1,
+                  "cut space inconsistent");    
+  }
+  // temporal space to build a sketch
+  struct SketchEntry {
+    /*! \brief total sum of */
+    bst_float sum_total;
+    /*! \brief statistics used in the sketch */
+    bst_float rmin, wmin;
+    /*! \brief last seen feature value */
+    bst_float last_fvalue;
+    /*! \brief current size of sketch */
+    bst_float next_goal;
+    // pointer to the sketch to put things in
+    utils::WXQuantileSketch<bst_float, bst_float> *sketch;
+    // initialize the space
+    inline void Init(unsigned max_size) {
+      next_goal = 0.0f;
+      rmin = wmin = 0.0f;
+      sketch->temp.Reserve(max_size + 1);
+      sketch->temp.size = 0;
+    }
+    /*!
+     * \brief push a new element to sketch 
+     * \param fvalue feature value, comes in sorted ascending order
+     * \param w weight
+     * \param max_size
+     */
+    inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
+      if (w == 0.0f) return;
+      if (wmin == 0.0f) {
+        last_fvalue = fvalue;
+        wmin = w;
+        return;
+      }
+      if (last_fvalue != fvalue) {
+        bst_float rmax = rmin + wmin;
+        if (rmax >= next_goal) {
+          if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+            // push to sketch
+            sketch->temp.data[sketch->temp.size] =
+                utils::WXQuantileSketch<bst_float, bst_float>::
+                Entry(rmin, rmax, wmin, last_fvalue);
+            utils::Assert(sketch->temp.size < max_size,
+                          "invalid maximum size max_size=%u, stemp.size=%lu\n",
+                          max_size, sketch->temp.size);
+            ++sketch->temp.size;
+          }
+          if (sketch->temp.size == max_size) {
+            next_goal = sum_total * 2.0f + 1e-5f;
+          } else{
+            next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
+          }
+        }
+        rmin = rmax;
+        wmin = w;
+        last_fvalue = fvalue;
+      } else {
+        wmin += w;
+      }
+    }
+    /*! \brief push final unfinished value to the sketch */
+    inline void Finalize(unsigned max_size) {
+      bst_float rmax = rmin + wmin;
+      //utils::Assert(fabs(rmax - sum_total) < 1e-4 + sum_total * 1e-5,
+      //"invalid sum value, rmax=%f, sum_total=%lf", rmax, sum_total);
+      if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+        utils::Assert(sketch->temp.size <= max_size,
+                      "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
+                      sketch->temp.size, max_size );
+        // push to sketch
+        sketch->temp.data[sketch->temp.size] =
+            utils::WXQuantileSketch<bst_float, bst_float>::
+            Entry(rmin, rmax, wmin, last_fvalue);
+        ++sketch->temp.size;
+      }
+      sketch->PushTemp();
+    }
+  };
+  
+ private:
+  inline void MakeSketch(const std::vector<bst_gpair> &gpair,
+                         const ColBatch::Inst &c,
+                         const RegTree &tree,
+                         bst_uint fid,
+                         std::vector<SketchEntry> *p_temp) {
+    if (c.length == 0) return;
+    // initialize sbuilder for use
+    std::vector<SketchEntry> &sbuilder = *p_temp;
+    sbuilder.resize(this->qexpand.size());
+    for (size_t i = 0; i < sbuilder.size(); ++i) {
+      sbuilder[i].sum_total = 0.0f;
+      sbuilder[i].sketch = &sketchs[i * tree.param.num_feature + fid];
+    }
+    // second pass, build the sketch
+    for (bst_uint j = 0; j < c.length; ++j) {
+      const bst_uint ridx = c[j].index;
+      const int nid = this->position[ridx];
+      if (nid >= 0) {
+        const int wid = this->node2workindex[nid];
+        sbuilder[wid].sketch->Push(c[j].fvalue, gpair[ridx].hess);
+      }
+    }
+    return;
+    // first pass, get sum of weight, TODO, optimization to skip first pass
+    for (bst_uint j = 0; j < c.length; ++j) {
+      const bst_uint ridx = c[j].index;
+      const int nid = this->position[ridx];
+      if (nid >= 0) {
+        const int wid = this->node2workindex[nid];
+        sbuilder[wid].sum_total += gpair[ridx].hess;
+      }
+    }
+    // if only one value, no need to do second pass
+    if (c[0].fvalue  == c[c.length-1].fvalue) {
+      for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
+        sbuilder[wid].sketch->Push(c[0].fvalue, sbuilder[wid].sum_total);
+      }
+      return;
+    }
+    // two pass scan
+    unsigned max_size = static_cast<unsigned>(this->param.sketch_ratio / this->param.sketch_eps);
+    for (size_t wid = 0; wid < sbuilder.size(); ++wid) {
+      sbuilder[wid].Init(max_size);
+    }
+    // second pass, build the sketch
+    for (bst_uint j = 0; j < c.length; ++j) {
+      const bst_uint ridx = c[j].index;
+      const int nid = this->position[ridx];
+      if (nid >= 0) {
+        const int wid = this->node2workindex[nid];
+        sbuilder[wid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
+      }
+    }
+    for (size_t wid = 0; wid < sbuilder.size(); ++wid) {
+      sbuilder[wid].Finalize(max_size);
+    }
+   }
+  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
+  // summary array
+  WXQSketch::SummaryArray summary_array;
+  // reducer for summary
+  sync::ComplexReducer<WXQSketch::SummaryArray> sreducer;
+  // per node, per feature sketch
+  std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;  
+};
 
 template<typename TStats>
 class QuantileHistMaker: public HistMaker<TStats> {  
@@ -374,11 +586,7 @@ class QuantileHistMaker: public HistMaker<TStats> {
                                   const BoosterInfo &info,
                                   const RegTree &tree) {
     // initialize the data structure
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-    }
+    int nthread = BaseMaker::get_nthread();
     sketchs.resize(this->qexpand.size() * tree.param.num_feature);
     for (size_t i = 0; i < sketchs.size(); ++i) {
       sketchs[i].Init(info.num_row, this->param.sketch_eps);
diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index 53117f28b3ab..dc53ac9e6611 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -624,8 +624,8 @@ class QuantileSketchTemplate {
     data.clear();
     level.clear();
   }
-  /*! 
-   * \brief add an element to a sketch 
+  /*!
+   * \brief add an element to a sketch
    * \param x the elemented added to the sketch
    */
   inline void Push(DType x, RType w = 1) {
@@ -638,29 +638,34 @@ class QuantileSketchTemplate {
         inqueue.MakeSummary(&temp);
         // cleanup queue
         inqueue.qtail = 0;
-        for (size_t l = 1; true; ++l) {
-          this->InitLevel(l + 1);
-          // check if level l is empty
-          if (level[l].size == 0) {
-            level[l].SetPrune(temp, limit_size); 
-            break;            
-          } else {
-            // level 0 is actually temp space
-            level[0].SetPrune(temp, limit_size);
-            temp.SetCombine(level[0], level[l]);
-            if (temp.size > limit_size) {
-              // try next level
-              level[l].size = 0;
-            } else {
-              // if merged record is still smaller, no need to send to next level
-              level[l].CopyFrom(temp); break;
-            }
-          }
-        }
+        this->PushTemp();
       }
     }
     inqueue.Push(x, w);
   }
+  /*! \brief push up temp */
+  inline void PushTemp(void) {
+    temp.Reserve(limit_size * 2);
+    for (size_t l = 1; true; ++l) {
+      this->InitLevel(l + 1);
+      // check if level l is empty
+      if (level[l].size == 0) {
+        level[l].SetPrune(temp, limit_size);
+        break;
+      } else {
+        // level 0 is actually temp space
+        level[0].SetPrune(temp, limit_size);
+        temp.SetCombine(level[0], level[l]);
+        if (temp.size > limit_size) {
+          // try next level
+          level[l].size = 0;
+        } else {
+          // if merged record is still smaller, no need to send to next level
+          level[l].CopyFrom(temp); break;
+        }
+      }
+    }
+  }
   /*! \brief get the summary after finalize */
   inline void GetSummary(SummaryContainer *out) {
     if (level.size() != 0) {

From ce7ecadf5e6792f9188592aa5678f2c0c769b206 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 18 Nov 2014 10:52:18 -0800
Subject: [PATCH 073/166] simplify

---
 src/tree/updater_histmaker-inl.hpp | 50 +++++++++++++-----------------
 1 file changed, 22 insertions(+), 28 deletions(-)

diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index ca5a5dc725a6..1dc586c26b6d 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -377,8 +377,7 @@ class CQHistMaker: public HistMaker<TStats> {
     for (size_t i = 0; i < sketchs.size(); ++i) {
       sketchs[i].Init(info.num_row, this->param.sketch_eps);
     }
-    std::vector< std::vector<SketchEntry> > stemp;
-    stemp.resize(this->get_nthread());
+    thread_temp.resize(this->get_nthread());
 
     // start accumulating statistics
     utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
@@ -390,7 +389,7 @@ class CQHistMaker: public HistMaker<TStats> {
       #pragma omp parallel for schedule(dynamic, 1)
       for (bst_omp_uint i = 0; i < nsize; ++i) {
         this->MakeSketch(gpair, batch[i], tree, batch.col_index[i],
-                         &stemp[omp_get_thread_num()]);       
+                         &thread_temp[omp_get_thread_num()]);       
       }
     }
     // setup maximum size
@@ -460,7 +459,6 @@ class CQHistMaker: public HistMaker<TStats> {
      * \param max_size
      */
     inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
-      if (w == 0.0f) return;
       if (wmin == 0.0f) {
         last_fvalue = fvalue;
         wmin = w;
@@ -520,56 +518,52 @@ class CQHistMaker: public HistMaker<TStats> {
     if (c.length == 0) return;
     // initialize sbuilder for use
     std::vector<SketchEntry> &sbuilder = *p_temp;
-    sbuilder.resize(this->qexpand.size());
-    for (size_t i = 0; i < sbuilder.size(); ++i) {
-      sbuilder[i].sum_total = 0.0f;
-      sbuilder[i].sketch = &sketchs[i * tree.param.num_feature + fid];
+    sbuilder.resize(tree.param.num_nodes);
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const unsigned nid = this->qexpand[i];
+      const unsigned wid = this->node2workindex[nid];
+      sbuilder[nid].sum_total = 0.0f;
+      sbuilder[nid].sketch = &sketchs[wid * tree.param.num_feature + fid];
     }
-    // second pass, build the sketch
-    for (bst_uint j = 0; j < c.length; ++j) {
-      const bst_uint ridx = c[j].index;
-      const int nid = this->position[ridx];
-      if (nid >= 0) {
-        const int wid = this->node2workindex[nid];
-        sbuilder[wid].sketch->Push(c[j].fvalue, gpair[ridx].hess);
-      }
-    }
-    return;
+
     // first pass, get sum of weight, TODO, optimization to skip first pass
     for (bst_uint j = 0; j < c.length; ++j) {
       const bst_uint ridx = c[j].index;
       const int nid = this->position[ridx];
       if (nid >= 0) {
-        const int wid = this->node2workindex[nid];
-        sbuilder[wid].sum_total += gpair[ridx].hess;
+        sbuilder[nid].sum_total += gpair[ridx].hess;
       }
     }
     // if only one value, no need to do second pass
     if (c[0].fvalue  == c[c.length-1].fvalue) {
-      for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
-        sbuilder[wid].sketch->Push(c[0].fvalue, sbuilder[wid].sum_total);
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {
+        const int nid = this->qexpand[i];
+        sbuilder[nid].sketch->Push(c[0].fvalue, sbuilder[nid].sum_total);
       }
       return;
     }
     // two pass scan
     unsigned max_size = static_cast<unsigned>(this->param.sketch_ratio / this->param.sketch_eps);
-    for (size_t wid = 0; wid < sbuilder.size(); ++wid) {
-      sbuilder[wid].Init(max_size);
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const int nid = this->qexpand[i];
+      sbuilder[nid].Init(max_size);
     }
     // second pass, build the sketch
     for (bst_uint j = 0; j < c.length; ++j) {
       const bst_uint ridx = c[j].index;
       const int nid = this->position[ridx];
       if (nid >= 0) {
-        const int wid = this->node2workindex[nid];
-        sbuilder[wid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
+        sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
       }
     }
-    for (size_t wid = 0; wid < sbuilder.size(); ++wid) {
-      sbuilder[wid].Finalize(max_size);
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const int nid = this->qexpand[i];
+      sbuilder[nid].Finalize(max_size);
     }
    }
   typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
+  // thread temp data
+  std::vector< std::vector<SketchEntry> > thread_temp;
   // summary array
   WXQSketch::SummaryArray summary_array;
   // reducer for summary

From 303f8b9bc5c3056002f346d940db4bd892b87bd5 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 18 Nov 2014 11:25:54 -0800
Subject: [PATCH 074/166] hack to make the propose fast in one pass, start
 sketchmaker

---
 src/tree/updater.cpp               |   4 +-
 src/tree/updater_histmaker-inl.hpp | 150 ++++++++++++-----------------
 2 files changed, 61 insertions(+), 93 deletions(-)

diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index 495d589f7e19..fb9e84c4f06b 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -19,10 +19,8 @@ IUpdater* CreateUpdater(const char *name) {
   if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
   if (!strcmp(name, "grow_qhistmaker")) return new QuantileHistMaker<GradStats>();
   if (!strcmp(name, "grow_cqmaker")) return new CQHistMaker<GradStats>();
-  if (!strcmp(name, "grow_chistmaker")) return new ColumnHistMaker<GradStats>();
   if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
-  if (!strcmp(name, "grow_colmaker5")) return new ColMaker< CVGradStats<5> >();
-  if (!strcmp(name, "grow_colmaker3")) return new ColMaker< CVGradStats<3> >();
+
   utils::Error("unknown updater:%s", name);
   return NULL;
 }
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 1dc586c26b6d..d893de52d6f1 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -285,87 +285,6 @@ class HistMaker: public BaseMaker {
   }
 };
 
-template<typename TStats>
-class ColumnHistMaker: public HistMaker<TStats> {
- public:
-  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
-                                  IFMatrix *p_fmat,
-                                  const BoosterInfo &info,
-                                  const RegTree &tree) {
-    sketchs.resize(tree.param.num_feature);
-    for (size_t i = 0; i < sketchs.size(); ++i) {
-      sketchs[i].Init(info.num_row, this->param.sketch_eps);
-    }
-    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
-    while (iter->Next()) {
-      const ColBatch &batch = iter->Value();
-      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(dynamic, 1)
-      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        const bst_uint fid = batch.col_index[i];
-        const ColBatch::Inst &col = batch[i];
-        unsigned nstep = col.length * (this->param.sketch_eps / this->param.sketch_ratio);
-        if (nstep == 0) nstep = 1; 
-        for (unsigned i = 0; i < col.length; i += nstep) {
-          sketchs[fid].Push(col[i].fvalue);
-        }
-        if (col.length != 0 && col.length - 1 % nstep != 0)  {
-          sketchs[fid].Push(col[col.length-1].fvalue);
-        }
-      }
-    }  
-    
-    size_t max_size = static_cast<size_t>(this->param.sketch_ratio / this->param.sketch_eps);
-    // synchronize sketch
-    summary_array.Init(sketchs.size(), max_size);
-    for (size_t i = 0; i < sketchs.size(); ++i) {
-      utils::WQuantileSketch<bst_float, bst_float>::SummaryContainer out;
-      sketchs[i].GetSummary(&out);
-      summary_array.Set(i, out);
-    }
-    size_t n4bytes = (summary_array.MemSize() + 3) / 4;
-    sreducer.AllReduce(&summary_array, n4bytes);
-    // now we get the final result of sketch, setup the cut
-    this->wspace.cut.clear();
-    this->wspace.rptr.clear();
-    this->wspace.rptr.push_back(0);
-    for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
-      for (int fid = 0; fid < tree.param.num_feature; ++fid) {
-        const WXQSketch::Summary a = summary_array[fid];
-        for (size_t i = 1; i < a.size; ++i) {
-          bst_float cpt = a.data[i].value - rt_eps;
-          if (i == 1 || cpt > this->wspace.cut.back()) {
-            this->wspace.cut.push_back(cpt);
-          }
-        }
-        // push a value that is greater than anything
-        if (a.size != 0) {
-          bst_float cpt = a.data[a.size - 1].value;
-          // this must be bigger than last value in a scale
-          bst_float last = cpt + fabs(cpt) + rt_eps;
-          this->wspace.cut.push_back(last);
-        }
-        this->wspace.rptr.push_back(this->wspace.cut.size());
-      }
-      // reserve last value for global statistics
-      this->wspace.cut.push_back(0.0f);
-      this->wspace.rptr.push_back(this->wspace.cut.size());
-    }
-    utils::Assert(this->wspace.rptr.size() ==
-                  (tree.param.num_feature + 1) * this->qexpand.size() + 1,
-                  "cut space inconsistent");    
-  }
-
- private:
-  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
-  // summary array
-  WXQSketch::SummaryArray summary_array;
-  // reducer for summary
-  sync::ComplexReducer<WXQSketch::SummaryArray> sreducer;
-  // per feature sketch
-  std::vector< utils::WQuantileSketch<bst_float, bst_float> > sketchs;
-};
-
 template<typename TStats>
 class CQHistMaker: public HistMaker<TStats> {
  protected:
@@ -378,7 +297,8 @@ class CQHistMaker: public HistMaker<TStats> {
       sketchs[i].Init(info.num_row, this->param.sketch_eps);
     }
     thread_temp.resize(this->get_nthread());
-
+    std::vector<bst_float> root_stats;
+    this->GetRootStats(gpair, *p_fmat, tree, &root_stats);
     // start accumulating statistics
     utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
     iter->BeforeFirst();
@@ -388,7 +308,10 @@ class CQHistMaker: public HistMaker<TStats> {
       const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
       #pragma omp parallel for schedule(dynamic, 1)
       for (bst_omp_uint i = 0; i < nsize; ++i) {
-        this->MakeSketch(gpair, batch[i], tree, batch.col_index[i],
+        this->MakeSketch(gpair, batch[i], tree,
+                         root_stats,
+                         batch.col_index[i],
+                         p_fmat->GetColDensity(batch.col_index[i]),
                          &thread_temp[omp_get_thread_num()]);       
       }
     }
@@ -513,7 +436,9 @@ class CQHistMaker: public HistMaker<TStats> {
   inline void MakeSketch(const std::vector<bst_gpair> &gpair,
                          const ColBatch::Inst &c,
                          const RegTree &tree,
+                         const std::vector<bst_float> &root_stats,
                          bst_uint fid,
+                         float col_density,
                          std::vector<SketchEntry> *p_temp) {
     if (c.length == 0) return;
     // initialize sbuilder for use
@@ -526,13 +451,20 @@ class CQHistMaker: public HistMaker<TStats> {
       sbuilder[nid].sketch = &sketchs[wid * tree.param.num_feature + fid];
     }
 
-    // first pass, get sum of weight, TODO, optimization to skip first pass
-    for (bst_uint j = 0; j < c.length; ++j) {
-      const bst_uint ridx = c[j].index;
-      const int nid = this->position[ridx];
-      if (nid >= 0) {
-        sbuilder[nid].sum_total += gpair[ridx].hess;
+    if (col_density != 1.0f) {
+      // first pass, get sum of weight, TODO, optimization to skip first pass
+      for (bst_uint j = 0; j < c.length; ++j) {
+        const bst_uint ridx = c[j].index;
+        const int nid = this->position[ridx];
+        if (nid >= 0) {
+          sbuilder[nid].sum_total += gpair[ridx].hess;
+        }
       }
+    } else {
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {
+        const unsigned nid = this->qexpand[i];        
+        sbuilder[nid].sum_total = root_stats[nid];
+      } 
     }
     // if only one value, no need to do second pass
     if (c[0].fvalue  == c[c.length-1].fvalue) {
@@ -560,7 +492,45 @@ class CQHistMaker: public HistMaker<TStats> {
       const int nid = this->qexpand[i];
       sbuilder[nid].Finalize(max_size);
     }
-   }
+  }
+  inline void GetRootStats(const std::vector<bst_gpair> &gpair,
+                           const IFMatrix &fmat,
+                           const RegTree &tree,
+                           std::vector<float> *p_snode) {
+    std::vector<float> &snode = *p_snode;
+    thread_temp.resize(this->get_nthread());
+    snode.resize(tree.param.num_nodes);
+    #pragma omp parallel
+    {
+      const int tid = omp_get_thread_num();
+      thread_temp[tid].resize(tree.param.num_nodes);
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {
+        const unsigned nid = this->qexpand[i];
+        thread_temp[tid][nid].sum_total = 0.0f;
+      }
+    }
+    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
+    // setup position
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+    #pragma omp parallel for schedule(static)
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
+      const bst_uint ridx = rowset[i];
+      const int tid = omp_get_thread_num();
+      if (this->position[ridx] < 0) continue;
+      thread_temp[tid][this->position[ridx]].sum_total += gpair[ridx].hess;
+    }
+    // sum the per thread statistics together
+    for (size_t j = 0; j < this->qexpand.size(); ++j) {
+      const int nid = this->qexpand[j];
+      double wsum = 0.0f;
+      for (size_t tid = 0; tid < thread_temp.size(); ++tid) {
+        wsum += thread_temp[tid][nid].sum_total; 
+      }
+      // update node statistics
+      snode[nid] = static_cast<bst_float>(wsum);
+    }
+  }
+
   typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
   // thread temp data
   std::vector< std::vector<SketchEntry> > thread_temp;

From 1b66a874569c17637da6301411841fd016b09f9d Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 18 Nov 2014 20:57:28 -0800
Subject: [PATCH 075/166] checkin skmaker

---
 Makefile                           |   4 +-
 src/tree/param.h                   |   6 +
 src/tree/updater.cpp               |   2 +
 src/tree/updater_basemaker-inl.hpp |  42 +++
 src/tree/updater_skmaker-inl.hpp   | 467 +++++++++++++++++++++++++++++
 src/utils/quantile.h               |  25 ++
 6 files changed, 544 insertions(+), 2 deletions(-)
 create mode 100644 src/tree/updater_skmaker-inl.hpp

diff --git a/Makefile b/Makefile
index e483ecad4140..72c981706dc8 100644
--- a/Makefile
+++ b/Makefile
@@ -24,8 +24,8 @@ all: $(BIN) $(OBJ) $(SLIB) $(MPIOBJ) $(MPIBIN)
 python: wrapper/libxgboostwrapper.so
 # now the wrapper takes in two files. io and wrapper part
 wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
-updater.o: src/tree/updater.cpp  src/tree/*.hpp src/*.h src/tree/*.h
-gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
+updater.o: src/tree/updater.cpp  src/tree/*.hpp src/*.h src/tree/*.h src/utils/*.h
+gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h 
 io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
 sync_mpi.o: src/sync/sync_mpi.cpp 
 sync_empty.o: src/sync/sync_empty.cpp 
diff --git a/src/tree/param.h b/src/tree/param.h
index 6402ef76a9b4..69ac0502d4a8 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -144,6 +144,12 @@ struct TrainParam{
   inline bool cannot_split(double sum_hess, int depth) const {
     return sum_hess < this->min_child_weight * 2.0;
   }
+  /*! \brief maximum sketch size */
+  inline unsigned max_sketch_size(void) const {
+    unsigned ret = static_cast<unsigned>(sketch_ratio / sketch_eps);
+    utils::Check(ret > 0, "sketch_ratio/sketch_eps must be bigger than 1");
+    return ret;
+  }
 
  protected:
   // functions for L1 cost
diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index fb9e84c4f06b..63f401af8b51 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -7,6 +7,7 @@
 #include "./updater_refresh-inl.hpp"
 #include "./updater_colmaker-inl.hpp"
 #include "./updater_distcol-inl.hpp"
+#include "./updater_skmaker-inl.hpp"
 #include "./updater_histmaker-inl.hpp"
 
 namespace xgboost {
@@ -19,6 +20,7 @@ IUpdater* CreateUpdater(const char *name) {
   if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
   if (!strcmp(name, "grow_qhistmaker")) return new QuantileHistMaker<GradStats>();
   if (!strcmp(name, "grow_cqmaker")) return new CQHistMaker<GradStats>();
+  if (!strcmp(name, "grow_skmaker")) return new SketchMaker();
   if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
 
   utils::Error("unknown updater:%s", name);
diff --git a/src/tree/updater_basemaker-inl.hpp b/src/tree/updater_basemaker-inl.hpp
index f37335004e1d..8152b86cb8cf 100644
--- a/src/tree/updater_basemaker-inl.hpp
+++ b/src/tree/updater_basemaker-inl.hpp
@@ -197,6 +197,48 @@ class BaseMaker: public IUpdater {
       }
     }
   }
+  /*! \brief helper function to get statistics from a tree */
+  template<typename TStats>
+  inline void GetNodeStats(const std::vector<bst_gpair> &gpair,
+                           const IFMatrix &fmat,
+                           const RegTree &tree,
+                           const BoosterInfo &info,
+                           std::vector< std::vector<TStats> > *p_thread_temp,
+                           std::vector<TStats> *p_node_stats) {
+    std::vector< std::vector<TStats> > &thread_temp = *p_thread_temp;
+    thread_temp.resize(this->get_nthread());
+    p_node_stats->resize(tree.param.num_nodes);
+    #pragma omp parallel
+    {
+      const int tid = omp_get_thread_num();
+      thread_temp[tid].resize(tree.param.num_nodes, TStats(param));
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const unsigned nid = qexpand[i];
+        thread_temp[tid][nid].Clear();
+      }
+    }
+    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
+    // accumulate each row's statistics into its node, one accumulator set per thread
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+    #pragma omp parallel for schedule(static)
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
+      const bst_uint ridx = rowset[i];
+      const int nid = position[ridx];
+      const int tid = omp_get_thread_num();
+      if (nid >= 0) {
+        thread_temp[tid][nid].Add(gpair, info, ridx);
+      }
+    }
+    // sum the per thread statistics together
+    for (size_t j = 0; j < qexpand.size(); ++j) {
+      const int nid = qexpand[j];
+      TStats &s = (*p_node_stats)[nid];
+      s.Clear();
+      for (size_t tid = 0; tid < thread_temp.size(); ++tid) {
+        s.Add(thread_temp[tid][nid]);
+      }
+    }
+  }  
   /*! \brief training parameter of tree grower */
   TrainParam param;
   /*! \brief queue of nodes to be expanded */
diff --git a/src/tree/updater_skmaker-inl.hpp b/src/tree/updater_skmaker-inl.hpp
new file mode 100644
index 000000000000..58f150ee616b
--- /dev/null
+++ b/src/tree/updater_skmaker-inl.hpp
@@ -0,0 +1,467 @@
+#ifndef XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_
+#define XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_
+/*!
+ * \file updater_skmaker-inl.hpp
+ * \brief use approximation sketch to construct a tree,
+          a refresh is needed to make the statistics exactly correct
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include <algorithm>
+#include "../sync/sync.h"
+#include "../utils/quantile.h"
+#include "./updater_basemaker-inl.hpp"
+
+namespace xgboost {
+namespace tree {
+class SketchMaker: public BaseMaker {
+ public:
+  virtual ~SketchMaker(void) {}
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) {
+    // rescale learning rate according to size of trees
+    float lr = param.learning_rate;
+    param.learning_rate = lr / trees.size();
+    // build tree
+    for (size_t i = 0; i < trees.size(); ++i) {
+      this->Update(gpair, p_fmat, info, trees[i]);
+    }
+    param.learning_rate = lr;
+  }
+ 
+ protected:
+  inline void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      RegTree *p_tree) {
+    this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
+    for (int depth = 0; depth < param.max_depth; ++depth) {
+      this->GetNodeStats(gpair, *p_fmat, *p_tree, info,
+                         &thread_stats, &node_stats);
+      this->BuildSketch(gpair, p_fmat, info, *p_tree);
+      this->SyncNodeStats();
+      this->FindSplit(depth, gpair, p_fmat, info, p_tree);
+      this->ResetPositionCol(qexpand, p_fmat, *p_tree);
+      this->UpdateQueueExpand(*p_tree);
+      // if nothing left to be expand, break
+      if (qexpand.size() == 0) break;
+    }
+    if (qexpand.size() != 0) {
+      this->GetNodeStats(gpair, *p_fmat, *p_tree, info,
+                         &thread_stats, &node_stats);
+      this->SyncNodeStats();
+    }
+    // set all statistics correctly
+    for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
+      this->SetStats(nid, node_stats[nid], p_tree);
+      if (!(*p_tree)[nid].is_leaf()) {
+        p_tree->stat(nid).loss_chg =
+            node_stats[(*p_tree)[nid].cleft()].CalcGain(param) +
+            node_stats[(*p_tree)[nid].cright()].CalcGain(param) -
+            node_stats[nid].CalcGain(param);
+      }
+    }
+    // nodes still remaining in the expand queue were never split: make them leaves
+    for (size_t i = 0; i < qexpand.size(); ++i) {
+      const int nid = qexpand[i];
+      (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
+    }
+  }
+  // define the sketch we want to use
+  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
+
+ private:
+  // statistics needed in the gradient calculation
+  struct SKStats {
+    /*! \brief sum of all positive gradient */
+    double pos_grad;
+    /*! \brief sum of all negative gradient */
+    double neg_grad;
+    /*! \brief sum of hessian statistics */    
+    double sum_hess;
+    explicit SKStats(void) {}
+    // constructor
+    explicit SKStats(const TrainParam &param) {
+      this->Clear();
+    }
+    /*! \brief clear the statistics */
+    inline void Clear(void) {
+      neg_grad = pos_grad = sum_hess = 0.0f;
+    }
+    // accumulate statistics
+    inline void Add(const std::vector<bst_gpair> &gpair,
+                    const BoosterInfo &info,
+                    bst_uint ridx) {
+      const bst_gpair &b = gpair[ridx];
+      if (b.grad >= 0.0f) {
+        pos_grad += b.grad;
+      } else {
+        neg_grad -= b.grad;
+      }
+      sum_hess += b.hess;
+    }
+    /*! \brief calculate gain of the solution */
+    inline double CalcGain(const TrainParam &param) const {
+      return param.CalcGain(pos_grad - neg_grad, sum_hess);
+    }
+    /*! \brief set current value to a - b */
+    inline void SetSubstract(const SKStats &a, const SKStats &b) {
+      pos_grad = a.pos_grad - b.pos_grad;
+      neg_grad = a.neg_grad - b.neg_grad;
+      sum_hess = a.sum_hess - b.sum_hess;
+    }
+    // calculate leaf weight
+    inline double CalcWeight(const TrainParam &param) const {
+      return param.CalcWeight(pos_grad - neg_grad, sum_hess);
+    }
+    /*! \brief add statistics to the data */
+    inline void Add(const SKStats &b) {
+      pos_grad += b.pos_grad;
+      neg_grad += b.neg_grad;
+      sum_hess += b.sum_hess;
+    }
+    /*! \brief same as add, reduce is used in All Reduce */
+    inline void Reduce(const SKStats &b) {
+      this->Add(b);
+    }
+    /*! \brief set leaf vector value based on statistics */
+    inline void SetLeafVec(const TrainParam &param, bst_float *vec) const {
+    }
+  };
+  // temporal space to build a sketch
+  struct SketchEntry {
+    /*! \brief total sum of amount to be met */
+    bst_float sum_total;
+    /*! \brief statistics used in the sketch */
+    bst_float rmin, wmin;
+    /*! \brief last seen feature value */
+    bst_float last_fvalue;
+    /*! \brief current size of sketch */
+    bst_float next_goal;
+    // pointer to the sketch to put things in
+    utils::WXQuantileSketch<bst_float, bst_float> *sketch;
+    // initialize the space
+    inline void Init(unsigned max_size) {
+      next_goal = -1.0f;
+      rmin = wmin = 0.0f;
+      sketch->temp.Reserve(max_size + 1);
+      sketch->temp.size = 0;
+    }
+    /*!
+     * \brief push a new element to sketch 
+     * \param fvalue feature value, comes in sorted ascending order
+     * \param w weight
+     * \param max_size
+     */
+    inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
+      if (next_goal == -1.0f) {
+        next_goal = 0.0f;
+        last_fvalue = fvalue;
+        wmin = w;
+        return;
+      }
+      if (last_fvalue != fvalue) {
+        bst_float rmax = rmin + wmin;
+        if (rmax >= next_goal) {
+          if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+            // push to sketch
+            sketch->temp.data[sketch->temp.size] =
+                utils::WXQuantileSketch<bst_float, bst_float>::
+                Entry(rmin, rmax, wmin, last_fvalue);
+            utils::Assert(sketch->temp.size < max_size,
+                          "invalid maximum size max_size=%u, stemp.size=%lu\n",
+                          max_size, sketch->temp.size);
+            ++sketch->temp.size;
+          }
+          if (sketch->temp.size == max_size) {
+            next_goal = sum_total * 2.0f + 1e-5f;
+          } else{
+            next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
+          }
+        }
+        rmin = rmax;
+        wmin = w;
+        last_fvalue = fvalue;
+      } else {
+        wmin += w;
+      }
+    }
+    /*! \brief push final unfinished value to the sketch */
+    inline void Finalize(unsigned max_size) {
+      bst_float rmax = rmin + wmin;
+      if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+        utils::Assert(sketch->temp.size <= max_size,
+                      "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
+                      sketch->temp.size, max_size );
+        // push to sketch
+        sketch->temp.data[sketch->temp.size] =
+            utils::WXQuantileSketch<bst_float, bst_float>::
+            Entry(rmin, rmax, wmin, last_fvalue);
+        ++sketch->temp.size;
+      }
+      sketch->PushTemp();
+    }
+  };
+  inline void BuildSketch(const std::vector<bst_gpair> &gpair,
+                          IFMatrix *p_fmat,
+                          const BoosterInfo &info,
+                          const RegTree &tree) {
+    sketchs.resize(this->qexpand.size() * tree.param.num_feature * 3);
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      sketchs[i].Init(info.num_row, this->param.sketch_eps);
+    }
+    thread_sketch.resize(this->get_nthread());
+    // number of rows in the buffered row set; a column with this many entries is passed as col_full
+    const size_t nrows = p_fmat->buffered_rowset().size();
+    // start accumulating statistics
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const ColBatch &batch = iter->Value();
+      // start enumeration
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(dynamic, 1)
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
+        this->UpdateSketchCol(gpair, batch[i], tree,
+                              node_stats,
+                              batch.col_index[i],
+                              batch[i].length == nrows,
+                              &thread_sketch[omp_get_thread_num()]);
+      }
+    }
+    // setup maximum size
+    unsigned max_size = param.max_sketch_size();
+    // synchronize sketch
+    summary_array.Init(sketchs.size(), max_size);
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
+      sketchs[i].GetSummary(&out);
+      summary_array.Set(i, out);
+    }
+    size_t n4bytes = (summary_array.MemSize() + 3) / 4;
+    sketch_reducer.AllReduce(&summary_array, n4bytes);    
+  }
+  // update sketch information in column fid
+  inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
+                              const ColBatch::Inst &c,
+                              const RegTree &tree,
+                              const std::vector<SKStats> &nstats,
+                              bst_uint fid,
+                              bool col_full,
+                              std::vector<SketchEntry> *p_temp) {
+    if (c.length == 0) return;
+    // initialize sbuilder for use
+    std::vector<SketchEntry> &sbuilder = *p_temp;
+    sbuilder.resize(tree.param.num_nodes * 3);
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const unsigned nid = this->qexpand[i];
+      const unsigned wid = this->node2workindex[nid];
+      for (int k = 0; k < 3; ++k) {
+        sbuilder[3 * nid + k].sum_total = 0.0f;
+        sbuilder[3 * nid + k].sketch = &sketchs[(wid * tree.param.num_feature + fid) * 3 + k];       
+      }
+    }
+    if (!col_full) {
+      for (bst_uint j = 0; j < c.length; ++j) {
+        const bst_uint ridx = c[j].index;
+        const int nid = this->position[ridx];
+        if (nid >= 0) {
+          const bst_gpair &e = gpair[ridx];
+          if (e.grad >= 0.0f) {
+            sbuilder[3 * nid + 0].sum_total += e.grad;
+          } else {
+            sbuilder[3 * nid + 1].sum_total -= e.grad;
+          }
+          sbuilder[3 * nid + 2].sum_total += e.hess;
+        }
+      }
+    } else {
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {
+        const unsigned nid = this->qexpand[i];
+        sbuilder[3 * nid + 0].sum_total = nstats[nid].pos_grad;
+        sbuilder[3 * nid + 1].sum_total = nstats[nid].neg_grad;
+        sbuilder[3 * nid + 2].sum_total = nstats[nid].sum_hess;        
+      }
+    }
+    // if only one value, no need to do second pass
+    if (c[0].fvalue  == c[c.length-1].fvalue) {
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {
+        const int nid = this->qexpand[i];
+        for (int k = 0; k < 3; ++k) {
+          sbuilder[3 * nid + k].sketch->Push(c[0].fvalue, sbuilder[3 * nid + k].sum_total);
+        }
+      }
+      return;
+    }
+    // two pass scan
+    unsigned max_size = param.max_sketch_size();
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const int nid = this->qexpand[i];
+      for (int k = 0; k < 3; ++k) {
+        sbuilder[3 * nid + k].Init(max_size);
+      }
+    }
+    // second pass, build the sketch
+    for (bst_uint j = 0; j < c.length; ++j) {
+      const bst_uint ridx = c[j].index;
+      const int nid = this->position[ridx];
+      if (nid >= 0) {
+        const bst_gpair &e = gpair[ridx];
+        if (e.grad >= 0.0f) {
+          sbuilder[3 * nid + 0].Push(c[j].fvalue, e.grad, max_size);
+        } else {
+          sbuilder[3 * nid + 1].Push(c[j].fvalue, -e.grad, max_size);
+        }
+        sbuilder[3 * nid + 2].Push(c[j].fvalue, e.hess, max_size);
+      }
+    }
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const int nid = this->qexpand[i];
+      for (int k = 0; k < 3; ++k) {
+        sbuilder[3 * nid + k].Finalize(max_size);
+      }
+    }
+  }  
+  inline void SyncNodeStats(void) {
+    utils::Assert(qexpand.size() != 0, "qexpand must not be empty");
+    std::vector<SKStats> tmp(qexpand.size());
+    for (size_t i = 0; i < qexpand.size(); ++i) {
+      tmp[i] = node_stats[qexpand[i]];
+    }
+    stats_reducer.AllReduce(BeginPtr(tmp), tmp.size());
+    for (size_t i = 0; i < qexpand.size(); ++i) {
+      node_stats[qexpand[i]] = tmp[i];
+    }
+  }
+  inline void FindSplit(int depth,
+                        const std::vector<bst_gpair> &gpair,
+                        IFMatrix *p_fmat,
+                        const BoosterInfo &info,
+                        RegTree *p_tree) {
+    const bst_uint num_feature = p_tree->param.num_feature;
+    // get the best split condition for each node
+    std::vector<SplitEntry> sol(qexpand.size());
+    bst_omp_uint nexpand = static_cast<bst_omp_uint>(qexpand.size());
+    #pragma omp parallel for schedule(dynamic, 1)
+    for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
+      const int nid = qexpand[wid];
+      utils::Assert(node2workindex[nid] == static_cast<int>(wid),
+                    "node2workindex inconsistent");
+      SplitEntry &best = sol[wid];
+      for (bst_uint fid = 0; fid < num_feature; ++ fid) {
+        unsigned base = (wid * p_tree->param.num_feature + fid) * 3;
+        EnumerateSplit(summary_array[base + 0],
+                       summary_array[base + 1],
+                       summary_array[base + 2],
+                       node_stats[nid], fid, &best);
+      }
+    }
+    // get the best result, we can synchronize the solution
+    for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
+      const int nid = qexpand[wid];
+      const SplitEntry &best = sol[wid];
+      // set up the values
+      p_tree->stat(nid).loss_chg = best.loss_chg;
+      this->SetStats(nid, node_stats[nid], p_tree);
+      // now we know the best solution in sol[wid], set split
+      if (best.loss_chg > rt_eps) {
+        p_tree->AddChilds(nid);
+        (*p_tree)[nid].set_split(best.split_index(),
+                                 best.split_value, best.default_left());
+        // mark both children as leaves with value 0, to indicate fresh leaves
+        (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
+        (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
+      } else {
+        (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
+      }
+    }
+  }
+  // set statistics on ptree
+  inline void SetStats(int nid, const SKStats &node_sum, RegTree *p_tree) {
+    p_tree->stat(nid).base_weight = node_sum.CalcWeight(param);
+    p_tree->stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);
+    node_sum.SetLeafVec(param, p_tree->leafvec(nid));
+  }
+  inline void EnumerateSplit(const WXQSketch::Summary &pos_grad,
+                             const WXQSketch::Summary &neg_grad,
+                             const WXQSketch::Summary &sum_hess,
+                             const SKStats &node_sum,
+                             bst_uint fid,
+                             SplitEntry *best) {
+    if (sum_hess.size == 0) return;
+    double root_gain = node_sum.CalcGain(param);
+    std::vector<bst_float> fsplits;
+    for (size_t i = 0; i < pos_grad.size; ++i) {
+      fsplits.push_back(pos_grad.data[i].value);
+    }
+    for (size_t i = 0; i < neg_grad.size; ++i) {
+      fsplits.push_back(neg_grad.data[i].value);
+    }
+    for (size_t i = 0; i < sum_hess.size; ++i) {
+      fsplits.push_back(sum_hess.data[i].value);
+    }
+    std::sort(fsplits.begin(), fsplits.end());
+    fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
+    // sum feature
+    SKStats feat_sum;
+    feat_sum.pos_grad = pos_grad.data[pos_grad.size - 1].rmax;
+    feat_sum.neg_grad = neg_grad.data[neg_grad.size - 1].rmax;
+    feat_sum.sum_hess = sum_hess.data[sum_hess.size - 1].rmax;
+    size_t ipos = 0, ineg = 0, ihess = 0;
+    for (size_t i = 1; i < fsplits.size(); ++i) {      
+      WXQSketch::Entry pos = pos_grad.Query(fsplits[i], ipos);
+      WXQSketch::Entry neg = neg_grad.Query(fsplits[i], ineg);
+      WXQSketch::Entry hess = sum_hess.Query(fsplits[i], ihess);
+      SKStats s, c;
+      s.pos_grad = 0.5f * (pos.rmin + pos.rmax - pos.wmin);
+      s.neg_grad = 0.5f * (neg.rmin + neg.rmax - neg.wmin);
+      s.sum_hess = 0.5f * (hess.rmin + hess.rmax - hess.wmin);
+      c.SetSubstract(node_sum, s);      
+      // forward
+      if (s.sum_hess >= param.min_child_weight &&
+          c.sum_hess >= param.min_child_weight) {
+        double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;        
+        best->Update(loss_chg, fid, fsplits[i], false);
+      }
+      // backward
+      c.SetSubstract(feat_sum, s);
+      s.SetSubstract(node_sum, c);
+      if (s.sum_hess >= param.min_child_weight &&
+          c.sum_hess >= param.min_child_weight) {
+        double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;        
+        best->Update(loss_chg, fid, fsplits[i], true);
+      }      
+    }
+    {// all including
+      SKStats s = feat_sum, c;
+      c.SetSubstract(node_sum, s);
+      if (s.sum_hess >= param.min_child_weight &&
+          c.sum_hess >= param.min_child_weight) {
+        bst_float cpt = fsplits.back();
+        double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;        
+        best->Update(loss_chg, fid, cpt + fabsf(cpt) + 1.0f, true);
+      }
+    }
+  }
+   
+  // thread temp data
+  // used to hold temporary per-thread sketch-building entries
+  std::vector< std::vector<SketchEntry> > thread_sketch;
+  // used to hold statistics
+  std::vector< std::vector<SKStats> > thread_stats;
+  // node statistics
+  std::vector<SKStats> node_stats;
+  // summary array
+  WXQSketch::SummaryArray summary_array;
+  // reducer for node statistics
+  sync::Reducer<SKStats> stats_reducer;
+  // reducer for summary
+  sync::ComplexReducer<WXQSketch::SummaryArray> sketch_reducer;
+  // per node, per feature sketch
+  std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;
+};
+}  // tree
+}  // xgboost
+#endif
diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index dc53ac9e6611..c3cdb86c2c26 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -119,6 +119,31 @@ struct WQSummary {
     }
     return res;
   }
+  /*!
+   * \brief find the rank bounds of qvalue, scanning forward from istart
+   * \param qvalue the value we query for
+   * \param istart in/out cursor, left at the first entry whose value is not below qvalue
+   * \return the stored entry on an exact match, otherwise a synthetic entry for qvalue
+   * \note data[size - 1] is read once the scan passes the end, so size must be > 0
+   */
+  inline Entry Query(DType qvalue, size_t &istart) const {
+    // skip every entry whose value is strictly below the query
+    for (; istart < size; ++istart) {
+      if (!(qvalue > data[istart].value)) break;
+    }
+    // past the last entry: both rank bounds collapse to the last entry's rmax
+    if (istart == size) {
+      const RType top = data[size - 1].rmax;
+      return Entry(top, top, 0.0f, qvalue);
+    }
+    if (qvalue == data[istart].value) return data[istart];
+    // below the smallest stored value: nothing ranks before it
+    if (istart == 0) return Entry(0.0f, 0.0f, 0.0f, qvalue);
+    // strictly between two neighbours: bound the rank by what they allow
+    return Entry(data[istart - 1].rmin_next(),
+                 data[istart].rmax_prev(),
+                 0.0f, qvalue);
+  }
   /*! \return maximum rank in the summary */
   inline RType MaxRank(void) const {
     return data[size - 1].rmax;

From 08e9813c9b66f521aa2239e6ddb8d4e731fadd5c Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 18 Nov 2014 21:23:36 -0800
Subject: [PATCH 076/166] potential BUG in skmaker?

---
 src/tree/param.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tree/param.h b/src/tree/param.h
index 69ac0502d4a8..701721c177f9 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -63,7 +63,7 @@ struct TrainParam{
     size_leaf_vector = 0;
     parallel_option = 2;
     sketch_eps = 0.1f;
-    sketch_ratio = 1.4f;
+    sketch_ratio = 2.0f;
   }
   /*! 
    * \brief set parameters from outside 

From 32beb56ba3b4eb55e7270f2987066ff50f997982 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 18 Nov 2014 22:21:41 -0800
Subject: [PATCH 077/166] only need to add in create hist col base

---
 src/tree/updater_basemaker-inl.hpp |  77 ++++++++++-
 src/tree/updater_histmaker-inl.hpp | 215 ++++++++---------------------
 src/tree/updater_skmaker-inl.hpp   |  74 ----------
 3 files changed, 135 insertions(+), 231 deletions(-)

diff --git a/src/tree/updater_basemaker-inl.hpp b/src/tree/updater_basemaker-inl.hpp
index 8152b86cb8cf..f414752d9652 100644
--- a/src/tree/updater_basemaker-inl.hpp
+++ b/src/tree/updater_basemaker-inl.hpp
@@ -8,6 +8,7 @@
 #include <vector>
 #include <algorithm>
 #include "../utils/random.h"
+#include "../utils/quantile.h"
 
 namespace xgboost {
 namespace tree {
@@ -238,7 +239,81 @@ class BaseMaker: public IUpdater {
         s.Add(thread_temp[tid][nid]);
       }
     }
-  }  
+  }
+  /*! \brief common helper data structure to build sketch*/
+  struct SketchEntry {
+    /*! \brief total sum of amount to be met */
+    bst_float sum_total;
+    /*! \brief statistics used in the sketch */
+    bst_float rmin, wmin;
+    /*! \brief last seen feature value */
+    bst_float last_fvalue;
+    /*! \brief rank to reach before the next entry is emitted, -1 until the first Push */
+    bst_float next_goal;
+    // pointer to the sketch to put things in
+    utils::WXQuantileSketch<bst_float, bst_float> *sketch;
+    // initialize the space, sketch must already point to a valid sketch
+    inline void Init(unsigned max_size) {
+      next_goal = -1.0f;
+      rmin = wmin = 0.0f;
+      sketch->temp.Reserve(max_size + 1);
+      sketch->temp.size = 0;
+    }
+    /*!
+     * \brief push a new element to sketch
+     * \param fvalue feature value, comes in sorted ascending order
+     * \param w weight
+     * \param max_size limit on the number of entries kept in sketch->temp
+     */
+    inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
+      if (next_goal == -1.0f) {
+        next_goal = 0.0f;
+        last_fvalue = fvalue;
+        wmin = w;
+        return;
+      }
+      if (last_fvalue != fvalue) {
+        bst_float rmax = rmin + wmin;
+        if (rmax >= next_goal) {
+          if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+            utils::Assert(sketch->temp.size < max_size,
+                          "invalid maximum size max_size=%u, stemp.size=%lu\n",
+                          max_size, sketch->temp.size);
+            // push to sketch, capacity was checked above before touching the slot
+            sketch->temp.data[sketch->temp.size] =
+                utils::WXQuantileSketch<bst_float, bst_float>::
+                Entry(rmin, rmax, wmin, last_fvalue);
+            ++sketch->temp.size;
+          }
+          if (sketch->temp.size == max_size) {
+            next_goal = sum_total * 2.0f + 1e-5f;
+          } else {
+            next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
+          }
+        }
+        rmin = rmax;
+        wmin = w;
+        last_fvalue = fvalue;
+      } else {
+        wmin += w;
+      }
+    }
+    /*! \brief push final unfinished value to the sketch */
+    inline void Finalize(unsigned max_size) {
+      bst_float rmax = rmin + wmin;
+      if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+        utils::Assert(sketch->temp.size <= max_size,
+                      "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
+                      max_size, sketch->temp.size);
+        // push to sketch
+        sketch->temp.data[sketch->temp.size] =
+            utils::WXQuantileSketch<bst_float, bst_float>::
+            Entry(rmin, rmax, wmin, last_fvalue);
+        ++sketch->temp.size;
+      }
+      sketch->PushTemp();
+    }
+  };
   /*! \brief training parameter of tree grower */
   TrainParam param;
   /*! \brief queue of nodes to be expanded */
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index d893de52d6f1..63b5e99f4a30 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -79,7 +79,7 @@ class HistMaker: public BaseMaker {
     /*! \brief cut field */
     std::vector<bst_float> cut;
     // per thread histset
-    std::vector<HistSet> hset;    
+    std::vector<HistSet> hset;
     // initialize the hist set
     inline void Init(const TrainParam &param, int nthread) {
       hset.resize(nthread);
@@ -111,28 +111,26 @@ class HistMaker: public BaseMaker {
     inline size_t Size(void) const {
       return rptr.size() - 1;
     }
-  };  
+  };
   // workspace of thread
   ThreadWSpace wspace;
   // reducer for histogram
-  sync::Reducer<TStats> histred;  
-
-  // this function does two jobs
-  // (1) reset the position in array position, to be the latest leaf id
-  // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly 
-  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
-                                  IFMatrix *p_fmat,
-                                  const BoosterInfo &info,
-                                  const RegTree &tree)  = 0;  
-
+  sync::Reducer<TStats> histred;
+  // update function implementation
   virtual void Update(const std::vector<bst_gpair> &gpair,
                       IFMatrix *p_fmat,
                       const BoosterInfo &info,
                       RegTree *p_tree) {
     this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
     for (int depth = 0; depth < param.max_depth; ++depth) {
+      // reset and propose candidate split
+      this->ResetPosAndPropose(gpair, p_fmat, info, *p_tree);
+      // create histogram
+      this->CreateHist(gpair, p_fmat, info, *p_tree);
+      // find split based on histogram statistics
       this->FindSplit(depth, gpair, p_fmat, info, p_tree);
-      this->ResetPositionCol(this->qexpand, p_fmat, *p_tree);
+      // reset position after split
+      this->ResetPositionAfterSplit(p_fmat, *p_tree);
       this->UpdateQueueExpand(*p_tree);
       // if nothing left to be expand, break
       if (qexpand.size() == 0) break;
@@ -142,12 +140,21 @@ class HistMaker: public BaseMaker {
       (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
     }
   }
-
- private:
-  inline void CreateHist(const std::vector<bst_gpair> &gpair,
-                         IFMatrix *p_fmat,
-                         const BoosterInfo &info,
-                         const RegTree &tree) {
+  // this function does two jobs
+  // (1) reset the position in array position, to be the latest leaf id
+  // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly 
+  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
+                                  IFMatrix *p_fmat,
+                                  const BoosterInfo &info,
+                                  const RegTree &tree)  = 0;  
+  // hook run by Update right after FindSplit; the default does nothing, override to reset positions
+  virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
+                                       const RegTree &tree) {
+  }
+  virtual void CreateHist(const std::vector<bst_gpair> &gpair,
+                          IFMatrix *p_fmat,
+                          const BoosterInfo &info,
+                          const RegTree &tree) {
     bst_uint num_feature = tree.param.num_feature;
     int nthread;
     #pragma omp parallel
@@ -190,6 +197,8 @@ class HistMaker: public BaseMaker {
     // sync the histogram
     histred.AllReduce(BeginPtr(wspace.hset[0].data), wspace.hset[0].data.size());
   }
+
+ private:
   inline void EnumerateSplit(const HistUnit &hist, 
                              const TStats &node_sum,
                              bst_uint fid,
@@ -231,10 +240,6 @@ class HistMaker: public BaseMaker {
                         const BoosterInfo &info,
                         RegTree *p_tree) {
     const bst_uint num_feature = p_tree->param.num_feature;
-    // reset and propose candidate split
-    this->ResetPosAndPropose(gpair, p_fmat, info, *p_tree);
-    // create histogram
-    this->CreateHist(gpair, p_fmat, info, *p_tree);
     // get the best split condition for each node
     std::vector<SplitEntry> sol(qexpand.size());
     std::vector<TStats> left_sum(qexpand.size());    
@@ -288,17 +293,23 @@ class HistMaker: public BaseMaker {
 template<typename TStats>
 class CQHistMaker: public HistMaker<TStats> {
  protected:
+  virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
+                                       const RegTree &tree) {
+    this->ResetPositionCol(this->qexpand, p_fmat, tree);
+  }
   virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
                                   IFMatrix *p_fmat,
                                   const BoosterInfo &info,
                                   const RegTree &tree) {
+    this->GetNodeStats(gpair, *p_fmat, tree, info,
+                       &thread_stats, &node_stats);
     sketchs.resize(this->qexpand.size() * tree.param.num_feature);
     for (size_t i = 0; i < sketchs.size(); ++i) {
       sketchs[i].Init(info.num_row, this->param.sketch_eps);
     }
-    thread_temp.resize(this->get_nthread());
-    std::vector<bst_float> root_stats;
-    this->GetRootStats(gpair, *p_fmat, tree, &root_stats);
+    thread_sketch.resize(this->get_nthread());
+    // number of rows in
+    const size_t nrows = p_fmat->buffered_rowset().size();
     // start accumulating statistics
     utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
     iter->BeforeFirst();
@@ -308,15 +319,15 @@ class CQHistMaker: public HistMaker<TStats> {
       const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
       #pragma omp parallel for schedule(dynamic, 1)
       for (bst_omp_uint i = 0; i < nsize; ++i) {
-        this->MakeSketch(gpair, batch[i], tree,
-                         root_stats,
-                         batch.col_index[i],
-                         p_fmat->GetColDensity(batch.col_index[i]),
-                         &thread_temp[omp_get_thread_num()]);       
+        this->UpdateSketchCol(gpair, batch[i], tree,
+                              node_stats,
+                              batch.col_index[i],
+                              batch[i].length == nrows,                              
+                              &thread_sketch[omp_get_thread_num()]);       
       }
     }
     // setup maximum size
-    size_t max_size = static_cast<size_t>(this->param.sketch_ratio / this->param.sketch_eps);
+    unsigned max_size = this->param.max_sketch_size();
     // synchronize sketch
     summary_array.Init(sketchs.size(), max_size);
     for (size_t i = 0; i < sketchs.size(); ++i) {
@@ -356,93 +367,18 @@ class CQHistMaker: public HistMaker<TStats> {
                   (tree.param.num_feature + 1) * this->qexpand.size() + 1,
                   "cut space inconsistent");    
   }
-  // temporal space to build a sketch
-  struct SketchEntry {
-    /*! \brief total sum of */
-    bst_float sum_total;
-    /*! \brief statistics used in the sketch */
-    bst_float rmin, wmin;
-    /*! \brief last seen feature value */
-    bst_float last_fvalue;
-    /*! \brief current size of sketch */
-    bst_float next_goal;
-    // pointer to the sketch to put things in
-    utils::WXQuantileSketch<bst_float, bst_float> *sketch;
-    // initialize the space
-    inline void Init(unsigned max_size) {
-      next_goal = 0.0f;
-      rmin = wmin = 0.0f;
-      sketch->temp.Reserve(max_size + 1);
-      sketch->temp.size = 0;
-    }
-    /*!
-     * \brief push a new element to sketch 
-     * \param fvalue feature value, comes in sorted ascending order
-     * \param w weight
-     * \param max_size
-     */
-    inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
-      if (wmin == 0.0f) {
-        last_fvalue = fvalue;
-        wmin = w;
-        return;
-      }
-      if (last_fvalue != fvalue) {
-        bst_float rmax = rmin + wmin;
-        if (rmax >= next_goal) {
-          if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
-            // push to sketch
-            sketch->temp.data[sketch->temp.size] =
-                utils::WXQuantileSketch<bst_float, bst_float>::
-                Entry(rmin, rmax, wmin, last_fvalue);
-            utils::Assert(sketch->temp.size < max_size,
-                          "invalid maximum size max_size=%u, stemp.size=%lu\n",
-                          max_size, sketch->temp.size);
-            ++sketch->temp.size;
-          }
-          if (sketch->temp.size == max_size) {
-            next_goal = sum_total * 2.0f + 1e-5f;
-          } else{
-            next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
-          }
-        }
-        rmin = rmax;
-        wmin = w;
-        last_fvalue = fvalue;
-      } else {
-        wmin += w;
-      }
-    }
-    /*! \brief push final unfinished value to the sketch */
-    inline void Finalize(unsigned max_size) {
-      bst_float rmax = rmin + wmin;
-      //utils::Assert(fabs(rmax - sum_total) < 1e-4 + sum_total * 1e-5,
-      //"invalid sum value, rmax=%f, sum_total=%lf", rmax, sum_total);
-      if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
-        utils::Assert(sketch->temp.size <= max_size,
-                      "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
-                      sketch->temp.size, max_size );
-        // push to sketch
-        sketch->temp.data[sketch->temp.size] =
-            utils::WXQuantileSketch<bst_float, bst_float>::
-            Entry(rmin, rmax, wmin, last_fvalue);
-        ++sketch->temp.size;
-      }
-      sketch->PushTemp();
-    }
-  };
   
  private:
-  inline void MakeSketch(const std::vector<bst_gpair> &gpair,
-                         const ColBatch::Inst &c,
-                         const RegTree &tree,
-                         const std::vector<bst_float> &root_stats,
-                         bst_uint fid,
-                         float col_density,
-                         std::vector<SketchEntry> *p_temp) {
+  inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
+                              const ColBatch::Inst &c,
+                              const RegTree &tree,
+                              const std::vector<TStats> &nstats,
+                              bst_uint fid,
+                              bool col_full,
+                              std::vector<BaseMaker::SketchEntry> *p_temp) {
     if (c.length == 0) return;
     // initialize sbuilder for use
-    std::vector<SketchEntry> &sbuilder = *p_temp;
+    std::vector<BaseMaker::SketchEntry> &sbuilder = *p_temp;
     sbuilder.resize(tree.param.num_nodes);
     for (size_t i = 0; i < this->qexpand.size(); ++i) {
       const unsigned nid = this->qexpand[i];
@@ -451,7 +387,7 @@ class CQHistMaker: public HistMaker<TStats> {
       sbuilder[nid].sketch = &sketchs[wid * tree.param.num_feature + fid];
     }
 
-    if (col_density != 1.0f) {
+    if (!col_full) {
       // first pass, get sum of weight, TODO, optimization to skip first pass
       for (bst_uint j = 0; j < c.length; ++j) {
         const bst_uint ridx = c[j].index;
@@ -463,7 +399,7 @@ class CQHistMaker: public HistMaker<TStats> {
     } else {
       for (size_t i = 0; i < this->qexpand.size(); ++i) {
         const unsigned nid = this->qexpand[i];        
-        sbuilder[nid].sum_total = root_stats[nid];
+        sbuilder[nid].sum_total = nstats[nid].sum_hess;
       } 
     }
     // if only one value, no need to do second pass
@@ -475,7 +411,7 @@ class CQHistMaker: public HistMaker<TStats> {
       return;
     }
     // two pass scan
-    unsigned max_size = static_cast<unsigned>(this->param.sketch_ratio / this->param.sketch_eps);
+    unsigned max_size = this->param.max_sketch_size();
     for (size_t i = 0; i < this->qexpand.size(); ++i) {
       const int nid = this->qexpand[i];
       sbuilder[nid].Init(max_size);
@@ -493,47 +429,14 @@ class CQHistMaker: public HistMaker<TStats> {
       sbuilder[nid].Finalize(max_size);
     }
   }
-  inline void GetRootStats(const std::vector<bst_gpair> &gpair,
-                           const IFMatrix &fmat,
-                           const RegTree &tree,
-                           std::vector<float> *p_snode) {
-    std::vector<float> &snode = *p_snode;
-    thread_temp.resize(this->get_nthread());
-    snode.resize(tree.param.num_nodes);
-    #pragma omp parallel
-    {
-      const int tid = omp_get_thread_num();
-      thread_temp[tid].resize(tree.param.num_nodes);
-      for (size_t i = 0; i < this->qexpand.size(); ++i) {
-        const unsigned nid = this->qexpand[i];
-        thread_temp[tid][nid].sum_total = 0.0f;
-      }
-    }
-    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
-    // setup position
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      const bst_uint ridx = rowset[i];
-      const int tid = omp_get_thread_num();
-      if (this->position[ridx] < 0) continue;
-      thread_temp[tid][this->position[ridx]].sum_total += gpair[ridx].hess;
-    }
-    // sum the per thread statistics together
-    for (size_t j = 0; j < this->qexpand.size(); ++j) {
-      const int nid = this->qexpand[j];
-      double wsum = 0.0f;
-      for (size_t tid = 0; tid < thread_temp.size(); ++tid) {
-        wsum += thread_temp[tid][nid].sum_total; 
-      }
-      // update node statistics
-      snode[nid] = static_cast<bst_float>(wsum);
-    }
-  }
 
   typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
   // thread temp data
-  std::vector< std::vector<SketchEntry> > thread_temp;
+  std::vector< std::vector<BaseMaker::SketchEntry> > thread_sketch;
+  // used to hold statistics
+  std::vector< std::vector<TStats> > thread_stats;
+  // node statistics
+  std::vector<TStats> node_stats;
   // summary array
   WXQSketch::SummaryArray summary_array;
   // reducer for summary
diff --git a/src/tree/updater_skmaker-inl.hpp b/src/tree/updater_skmaker-inl.hpp
index 58f150ee616b..dd23b22c146b 100644
--- a/src/tree/updater_skmaker-inl.hpp
+++ b/src/tree/updater_skmaker-inl.hpp
@@ -130,80 +130,6 @@ class SketchMaker: public BaseMaker {
     inline void SetLeafVec(const TrainParam &param, bst_float *vec) const {
     }
   };
-  // temporal space to build a sketch
-  struct SketchEntry {
-    /*! \brief total sum of amount to be met */
-    bst_float sum_total;
-    /*! \brief statistics used in the sketch */
-    bst_float rmin, wmin;
-    /*! \brief last seen feature value */
-    bst_float last_fvalue;
-    /*! \brief current size of sketch */
-    bst_float next_goal;
-    // pointer to the sketch to put things in
-    utils::WXQuantileSketch<bst_float, bst_float> *sketch;
-    // initialize the space
-    inline void Init(unsigned max_size) {
-      next_goal = -1.0f;
-      rmin = wmin = 0.0f;
-      sketch->temp.Reserve(max_size + 1);
-      sketch->temp.size = 0;
-    }
-    /*!
-     * \brief push a new element to sketch 
-     * \param fvalue feature value, comes in sorted ascending order
-     * \param w weight
-     * \param max_size
-     */
-    inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
-      if (next_goal == -1.0f) {
-        next_goal = 0.0f;
-        last_fvalue = fvalue;
-        wmin = w;
-        return;
-      }
-      if (last_fvalue != fvalue) {
-        bst_float rmax = rmin + wmin;
-        if (rmax >= next_goal) {
-          if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
-            // push to sketch
-            sketch->temp.data[sketch->temp.size] =
-                utils::WXQuantileSketch<bst_float, bst_float>::
-                Entry(rmin, rmax, wmin, last_fvalue);
-            utils::Assert(sketch->temp.size < max_size,
-                          "invalid maximum size max_size=%u, stemp.size=%lu\n",
-                          max_size, sketch->temp.size);
-            ++sketch->temp.size;
-          }
-          if (sketch->temp.size == max_size) {
-            next_goal = sum_total * 2.0f + 1e-5f;
-          } else{
-            next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
-          }
-        }
-        rmin = rmax;
-        wmin = w;
-        last_fvalue = fvalue;
-      } else {
-        wmin += w;
-      }
-    }
-    /*! \brief push final unfinished value to the sketch */
-    inline void Finalize(unsigned max_size) {
-      bst_float rmax = rmin + wmin;
-      if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
-        utils::Assert(sketch->temp.size <= max_size,
-                      "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
-                      sketch->temp.size, max_size );
-        // push to sketch
-        sketch->temp.data[sketch->temp.size] =
-            utils::WXQuantileSketch<bst_float, bst_float>::
-            Entry(rmin, rmax, wmin, last_fvalue);
-        ++sketch->temp.size;
-      }
-      sketch->PushTemp();
-    }
-  };
   inline void BuildSketch(const std::vector<bst_gpair> &gpair,
                           IFMatrix *p_fmat,
                           const BoosterInfo &info,

From fa1581b94c9d2918d65836314e75501a6f9f649f Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 19 Nov 2014 09:51:30 -0800
Subject: [PATCH 078/166] cqmaker ok

---
 src/tree/updater.cpp               |   3 +-
 src/tree/updater_basemaker-inl.hpp |  77 +++++++-
 src/tree/updater_histmaker-inl.hpp | 305 +++++++++++++----------------
 src/tree/updater_skmaker-inl.hpp   |  74 -------
 4 files changed, 215 insertions(+), 244 deletions(-)

diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index 63f401af8b51..a087bf9ed5ca 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -18,8 +18,7 @@ IUpdater* CreateUpdater(const char *name) {
   if (!strcmp(name, "sync")) return new TreeSyncher();
   if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
   if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
-  if (!strcmp(name, "grow_qhistmaker")) return new QuantileHistMaker<GradStats>();
-  if (!strcmp(name, "grow_cqmaker")) return new CQHistMaker<GradStats>();
+  if (!strcmp(name, "grow_histmaker")) return new CQHistMaker<GradStats>();
   if (!strcmp(name, "grow_skmaker")) return new SketchMaker();
   if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
 
diff --git a/src/tree/updater_basemaker-inl.hpp b/src/tree/updater_basemaker-inl.hpp
index 8152b86cb8cf..f414752d9652 100644
--- a/src/tree/updater_basemaker-inl.hpp
+++ b/src/tree/updater_basemaker-inl.hpp
@@ -8,6 +8,7 @@
 #include <vector>
 #include <algorithm>
 #include "../utils/random.h"
+#include "../utils/quantile.h"
 
 namespace xgboost {
 namespace tree {
@@ -238,7 +239,81 @@ class BaseMaker: public IUpdater {
         s.Add(thread_temp[tid][nid]);
       }
     }
-  }  
+  }
+  /*! \brief common helper data structure to build sketch*/
+  struct SketchEntry {
+    /*! \brief total sum of amount to be met */
+    bst_float sum_total;
+    /*! \brief statistics used in the sketch */
+    bst_float rmin, wmin;
+    /*! \brief last seen feature value */
+    bst_float last_fvalue;
+    /*! \brief rank to reach before the next entry is emitted, -1 until the first Push */
+    bst_float next_goal;
+    // pointer to the sketch to put things in
+    utils::WXQuantileSketch<bst_float, bst_float> *sketch;
+    // initialize the space, sketch must already point to a valid sketch
+    inline void Init(unsigned max_size) {
+      next_goal = -1.0f;
+      rmin = wmin = 0.0f;
+      sketch->temp.Reserve(max_size + 1);
+      sketch->temp.size = 0;
+    }
+    /*!
+     * \brief push a new element to sketch
+     * \param fvalue feature value, comes in sorted ascending order
+     * \param w weight
+     * \param max_size limit on the number of entries kept in sketch->temp
+     */
+    inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
+      if (next_goal == -1.0f) {
+        next_goal = 0.0f;
+        last_fvalue = fvalue;
+        wmin = w;
+        return;
+      }
+      if (last_fvalue != fvalue) {
+        bst_float rmax = rmin + wmin;
+        if (rmax >= next_goal) {
+          if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+            utils::Assert(sketch->temp.size < max_size,
+                          "invalid maximum size max_size=%u, stemp.size=%lu\n",
+                          max_size, sketch->temp.size);
+            // push to sketch, capacity was checked above before touching the slot
+            sketch->temp.data[sketch->temp.size] =
+                utils::WXQuantileSketch<bst_float, bst_float>::
+                Entry(rmin, rmax, wmin, last_fvalue);
+            ++sketch->temp.size;
+          }
+          if (sketch->temp.size == max_size) {
+            next_goal = sum_total * 2.0f + 1e-5f;
+          } else {
+            next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
+          }
+        }
+        rmin = rmax;
+        wmin = w;
+        last_fvalue = fvalue;
+      } else {
+        wmin += w;
+      }
+    }
+    /*! \brief push final unfinished value to the sketch */
+    inline void Finalize(unsigned max_size) {
+      bst_float rmax = rmin + wmin;
+      if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+        utils::Assert(sketch->temp.size <= max_size,
+                      "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
+                      max_size, sketch->temp.size);
+        // push to sketch
+        sketch->temp.data[sketch->temp.size] =
+            utils::WXQuantileSketch<bst_float, bst_float>::
+            Entry(rmin, rmax, wmin, last_fvalue);
+        ++sketch->temp.size;
+      }
+      sketch->PushTemp();
+    }
+  };
   /*! \brief training parameter of tree grower */
   TrainParam param;
   /*! \brief queue of nodes to be expanded */
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index d893de52d6f1..4c0136ac8809 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -41,7 +41,9 @@ class HistMaker: public BaseMaker {
     /*! \brief content of statistics data */    
     TStats *data;
     /*! \brief size of histogram */
-    const unsigned size;
+    unsigned size;
+    // default constructor, yields an empty unit (no cut, no data, size 0)
+    HistUnit(void) : cut(NULL), data(NULL), size(0) {}
     // constructor
     HistUnit(const bst_float *cut, TStats *data, unsigned size)
         : cut(cut), data(data), size(size) {}
@@ -79,7 +81,7 @@ class HistMaker: public BaseMaker {
     /*! \brief cut field */
     std::vector<bst_float> cut;
     // per thread histset
-    std::vector<HistSet> hset;    
+    std::vector<HistSet> hset;
     // initialize the hist set
     inline void Init(const TrainParam &param, int nthread) {
       hset.resize(nthread);
@@ -111,28 +113,26 @@ class HistMaker: public BaseMaker {
     inline size_t Size(void) const {
       return rptr.size() - 1;
     }
-  };  
+  };
   // workspace of thread
   ThreadWSpace wspace;
   // reducer for histogram
-  sync::Reducer<TStats> histred;  
-
-  // this function does two jobs
-  // (1) reset the position in array position, to be the latest leaf id
-  // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly 
-  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
-                                  IFMatrix *p_fmat,
-                                  const BoosterInfo &info,
-                                  const RegTree &tree)  = 0;  
-
+  sync::Reducer<TStats> histred;
+  // update function implementation
   virtual void Update(const std::vector<bst_gpair> &gpair,
                       IFMatrix *p_fmat,
                       const BoosterInfo &info,
                       RegTree *p_tree) {
     this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
     for (int depth = 0; depth < param.max_depth; ++depth) {
+      // reset and propose candidate split
+      this->ResetPosAndPropose(gpair, p_fmat, info, *p_tree);
+      // create histogram
+      this->CreateHist(gpair, p_fmat, info, *p_tree);
+      // find split based on histogram statistics
       this->FindSplit(depth, gpair, p_fmat, info, p_tree);
-      this->ResetPositionCol(this->qexpand, p_fmat, *p_tree);
+      // reset position after split
+      this->ResetPositionAfterSplit(p_fmat, *p_tree);
       this->UpdateQueueExpand(*p_tree);
       // if nothing left to be expand, break
       if (qexpand.size() == 0) break;
@@ -142,20 +142,24 @@ class HistMaker: public BaseMaker {
       (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
     }
   }
-
- private:
-  inline void CreateHist(const std::vector<bst_gpair> &gpair,
-                         IFMatrix *p_fmat,
-                         const BoosterInfo &info,
-                         const RegTree &tree) {
+  // this function does two jobs
+  // (1) reset the position in array position, to be the latest leaf id
+  // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly 
+  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
+                                  IFMatrix *p_fmat,
+                                  const BoosterInfo &info,
+                                  const RegTree &tree)  = 0;  
+  // hook run by Update right after FindSplit; the default does nothing, override to reset positions
+  virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
+                                       const RegTree &tree) {
+  }
+  virtual void CreateHist(const std::vector<bst_gpair> &gpair,
+                          IFMatrix *p_fmat,
+                          const BoosterInfo &info,
+                          const RegTree &tree) {
     bst_uint num_feature = tree.param.num_feature;
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-    }
     // intialize work space
-    wspace.Init(param, nthread);
+    wspace.Init(param, this->get_nthread());
     // start accumulating statistics
     utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
     iter->BeforeFirst();
@@ -190,6 +194,8 @@ class HistMaker: public BaseMaker {
     // sync the histogram
     histred.AllReduce(BeginPtr(wspace.hset[0].data), wspace.hset[0].data.size());
   }
+
+ private:
   inline void EnumerateSplit(const HistUnit &hist, 
                              const TStats &node_sum,
                              bst_uint fid,
@@ -231,10 +237,6 @@ class HistMaker: public BaseMaker {
                         const BoosterInfo &info,
                         RegTree *p_tree) {
     const bst_uint num_feature = p_tree->param.num_feature;
-    // reset and propose candidate split
-    this->ResetPosAndPropose(gpair, p_fmat, info, *p_tree);
-    // create histogram
-    this->CreateHist(gpair, p_fmat, info, *p_tree);
     // get the best split condition for each node
     std::vector<SplitEntry> sol(qexpand.size());
     std::vector<TStats> left_sum(qexpand.size());    
@@ -247,7 +249,7 @@ class HistMaker: public BaseMaker {
       SplitEntry &best = sol[wid];
       TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
       for (bst_uint fid = 0; fid < num_feature; ++ fid) {
-        EnumerateSplit(wspace.hset[0][fid + wid * (num_feature+1)],
+        EnumerateSplit(this->wspace.hset[0][fid + wid * (num_feature+1)],
                        node_sum, fid, &best, &left_sum[wid]);
       }
     }
@@ -279,26 +281,77 @@ class HistMaker: public BaseMaker {
   }
   
   inline void SetStats(RegTree *p_tree, int nid, const TStats &node_sum) {
-      p_tree->stat(nid).base_weight = node_sum.CalcWeight(param);
-      p_tree->stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);
-      node_sum.SetLeafVec(param, p_tree->leafvec(nid));    
+    p_tree->stat(nid).base_weight = node_sum.CalcWeight(param);
+    p_tree->stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);
+    node_sum.SetLeafVec(param, p_tree->leafvec(nid));    
   }
 };
 
 template<typename TStats>
 class CQHistMaker: public HistMaker<TStats> {
  protected:
+  struct HistEntry {
+    typename HistMaker<TStats>::HistUnit hist;
+    unsigned istart;
+    /*! 
+     * \brief add a histogram to data,
+     * do linear scan, start from istart
+     */
+    inline void Add(bst_float fv,
+                    const std::vector<bst_gpair> &gpair,
+                    const BoosterInfo &info,
+                    const bst_uint ridx) {
+      while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;
+      utils::Assert(istart != hist.size, "the bound variable must be max");
+      hist.data[istart].Add(gpair, info, ridx);
+    }
+  };
+  virtual void CreateHist(const std::vector<bst_gpair> &gpair,
+                          IFMatrix *p_fmat,
+                          const BoosterInfo &info,
+                          const RegTree &tree) {
+    this->wspace.Init(this->param, 1);
+    thread_hist.resize(this->get_nthread());
+    // start accumulating statistics
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const ColBatch &batch = iter->Value();
+      // start enumeration
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(dynamic, 1)
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
+        this->UpdateHistCol(gpair, batch[i], info, tree,
+                            batch.col_index[i],                        
+                            &thread_hist[omp_get_thread_num()]);       
+      }
+    }
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const int nid = this->qexpand[i];
+      const int wid = this->node2workindex[nid];
+      this->wspace.hset[0][tree.param.num_feature + wid * (tree.param.num_feature+1)]
+          .data[0] = node_stats[nid];
+    }
+    // sync the histogram
+    this->histred.AllReduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size());    
+  }
+  virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
+                                       const RegTree &tree) {
+    this->ResetPositionCol(this->qexpand, p_fmat, tree);
+  }
   virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
                                   IFMatrix *p_fmat,
                                   const BoosterInfo &info,
                                   const RegTree &tree) {
+    this->GetNodeStats(gpair, *p_fmat, tree, info,
+                       &thread_stats, &node_stats);
     sketchs.resize(this->qexpand.size() * tree.param.num_feature);
     for (size_t i = 0; i < sketchs.size(); ++i) {
       sketchs[i].Init(info.num_row, this->param.sketch_eps);
     }
-    thread_temp.resize(this->get_nthread());
-    std::vector<bst_float> root_stats;
-    this->GetRootStats(gpair, *p_fmat, tree, &root_stats);
+    thread_sketch.resize(this->get_nthread());
+    // number of rows in
+    const size_t nrows = p_fmat->buffered_rowset().size();
     // start accumulating statistics
     utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
     iter->BeforeFirst();
@@ -308,15 +361,15 @@ class CQHistMaker: public HistMaker<TStats> {
       const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
       #pragma omp parallel for schedule(dynamic, 1)
       for (bst_omp_uint i = 0; i < nsize; ++i) {
-        this->MakeSketch(gpair, batch[i], tree,
-                         root_stats,
-                         batch.col_index[i],
-                         p_fmat->GetColDensity(batch.col_index[i]),
-                         &thread_temp[omp_get_thread_num()]);       
+        this->UpdateSketchCol(gpair, batch[i], tree,
+                              node_stats,
+                              batch.col_index[i],
+                              batch[i].length == nrows,                              
+                              &thread_sketch[omp_get_thread_num()]);       
       }
     }
     // setup maximum size
-    size_t max_size = static_cast<size_t>(this->param.sketch_ratio / this->param.sketch_eps);
+    unsigned max_size = this->param.max_sketch_size();
     // synchronize sketch
     summary_array.Init(sketchs.size(), max_size);
     for (size_t i = 0; i < sketchs.size(); ++i) {
@@ -356,93 +409,42 @@ class CQHistMaker: public HistMaker<TStats> {
                   (tree.param.num_feature + 1) * this->qexpand.size() + 1,
                   "cut space inconsistent");    
   }
-  // temporal space to build a sketch
-  struct SketchEntry {
-    /*! \brief total sum of */
-    bst_float sum_total;
-    /*! \brief statistics used in the sketch */
-    bst_float rmin, wmin;
-    /*! \brief last seen feature value */
-    bst_float last_fvalue;
-    /*! \brief current size of sketch */
-    bst_float next_goal;
-    // pointer to the sketch to put things in
-    utils::WXQuantileSketch<bst_float, bst_float> *sketch;
-    // initialize the space
-    inline void Init(unsigned max_size) {
-      next_goal = 0.0f;
-      rmin = wmin = 0.0f;
-      sketch->temp.Reserve(max_size + 1);
-      sketch->temp.size = 0;
-    }
-    /*!
-     * \brief push a new element to sketch 
-     * \param fvalue feature value, comes in sorted ascending order
-     * \param w weight
-     * \param max_size
-     */
-    inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
-      if (wmin == 0.0f) {
-        last_fvalue = fvalue;
-        wmin = w;
-        return;
-      }
-      if (last_fvalue != fvalue) {
-        bst_float rmax = rmin + wmin;
-        if (rmax >= next_goal) {
-          if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
-            // push to sketch
-            sketch->temp.data[sketch->temp.size] =
-                utils::WXQuantileSketch<bst_float, bst_float>::
-                Entry(rmin, rmax, wmin, last_fvalue);
-            utils::Assert(sketch->temp.size < max_size,
-                          "invalid maximum size max_size=%u, stemp.size=%lu\n",
-                          max_size, sketch->temp.size);
-            ++sketch->temp.size;
-          }
-          if (sketch->temp.size == max_size) {
-            next_goal = sum_total * 2.0f + 1e-5f;
-          } else{
-            next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
-          }
-        }
-        rmin = rmax;
-        wmin = w;
-        last_fvalue = fvalue;
-      } else {
-        wmin += w;
-      }
+  
+ private:
+  inline void UpdateHistCol(const std::vector<bst_gpair> &gpair,
+                            const ColBatch::Inst &c,
+                            const BoosterInfo &info,
+                            const RegTree &tree,
+                            bst_uint fid,
+                            std::vector<HistEntry> *p_temp) {
+    if (c.length == 0) return;
+    // initialize sbuilder for use
+    std::vector<HistEntry> &hbuilder = *p_temp;
+    hbuilder.resize(tree.param.num_nodes);
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const unsigned nid = this->qexpand[i];
+      const unsigned wid = this->node2workindex[nid];
+      hbuilder[nid].istart = 0;
+      hbuilder[nid].hist = this->wspace.hset[0][fid + wid * (tree.param.num_feature+1)];
     }
-    /*! \brief push final unfinished value to the sketch */
-    inline void Finalize(unsigned max_size) {
-      bst_float rmax = rmin + wmin;
-      //utils::Assert(fabs(rmax - sum_total) < 1e-4 + sum_total * 1e-5,
-      //"invalid sum value, rmax=%f, sum_total=%lf", rmax, sum_total);
-      if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
-        utils::Assert(sketch->temp.size <= max_size,
-                      "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
-                      sketch->temp.size, max_size );
-        // push to sketch
-        sketch->temp.data[sketch->temp.size] =
-            utils::WXQuantileSketch<bst_float, bst_float>::
-            Entry(rmin, rmax, wmin, last_fvalue);
-        ++sketch->temp.size;
+    for (bst_uint j = 0; j < c.length; ++j) {
+      const bst_uint ridx = c[j].index;
+      const int nid = this->position[ridx];
+      if (nid >= 0) {
+        hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx);
       }
-      sketch->PushTemp();
     }
-  };
-  
- private:
-  inline void MakeSketch(const std::vector<bst_gpair> &gpair,
-                         const ColBatch::Inst &c,
-                         const RegTree &tree,
-                         const std::vector<bst_float> &root_stats,
-                         bst_uint fid,
-                         float col_density,
-                         std::vector<SketchEntry> *p_temp) {
+  }
+  inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
+                              const ColBatch::Inst &c,
+                              const RegTree &tree,
+                              const std::vector<TStats> &nstats,
+                              bst_uint fid,
+                              bool col_full,
+                              std::vector<BaseMaker::SketchEntry> *p_temp) {
     if (c.length == 0) return;
     // initialize sbuilder for use
-    std::vector<SketchEntry> &sbuilder = *p_temp;
+    std::vector<BaseMaker::SketchEntry> &sbuilder = *p_temp;
     sbuilder.resize(tree.param.num_nodes);
     for (size_t i = 0; i < this->qexpand.size(); ++i) {
       const unsigned nid = this->qexpand[i];
@@ -451,7 +453,7 @@ class CQHistMaker: public HistMaker<TStats> {
       sbuilder[nid].sketch = &sketchs[wid * tree.param.num_feature + fid];
     }
 
-    if (col_density != 1.0f) {
+    if (!col_full) {
       // first pass, get sum of weight, TODO, optimization to skip first pass
       for (bst_uint j = 0; j < c.length; ++j) {
         const bst_uint ridx = c[j].index;
@@ -463,7 +465,7 @@ class CQHistMaker: public HistMaker<TStats> {
     } else {
       for (size_t i = 0; i < this->qexpand.size(); ++i) {
         const unsigned nid = this->qexpand[i];        
-        sbuilder[nid].sum_total = root_stats[nid];
+        sbuilder[nid].sum_total = nstats[nid].sum_hess;
       } 
     }
     // if only one value, no need to do second pass
@@ -475,7 +477,7 @@ class CQHistMaker: public HistMaker<TStats> {
       return;
     }
     // two pass scan
-    unsigned max_size = static_cast<unsigned>(this->param.sketch_ratio / this->param.sketch_eps);
+    unsigned max_size = this->param.max_sketch_size();
     for (size_t i = 0; i < this->qexpand.size(); ++i) {
       const int nid = this->qexpand[i];
       sbuilder[nid].Init(max_size);
@@ -493,47 +495,16 @@ class CQHistMaker: public HistMaker<TStats> {
       sbuilder[nid].Finalize(max_size);
     }
   }
-  inline void GetRootStats(const std::vector<bst_gpair> &gpair,
-                           const IFMatrix &fmat,
-                           const RegTree &tree,
-                           std::vector<float> *p_snode) {
-    std::vector<float> &snode = *p_snode;
-    thread_temp.resize(this->get_nthread());
-    snode.resize(tree.param.num_nodes);
-    #pragma omp parallel
-    {
-      const int tid = omp_get_thread_num();
-      thread_temp[tid].resize(tree.param.num_nodes);
-      for (size_t i = 0; i < this->qexpand.size(); ++i) {
-        const unsigned nid = this->qexpand[i];
-        thread_temp[tid][nid].sum_total = 0.0f;
-      }
-    }
-    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
-    // setup position
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      const bst_uint ridx = rowset[i];
-      const int tid = omp_get_thread_num();
-      if (this->position[ridx] < 0) continue;
-      thread_temp[tid][this->position[ridx]].sum_total += gpair[ridx].hess;
-    }
-    // sum the per thread statistics together
-    for (size_t j = 0; j < this->qexpand.size(); ++j) {
-      const int nid = this->qexpand[j];
-      double wsum = 0.0f;
-      for (size_t tid = 0; tid < thread_temp.size(); ++tid) {
-        wsum += thread_temp[tid][nid].sum_total; 
-      }
-      // update node statistics
-      snode[nid] = static_cast<bst_float>(wsum);
-    }
-  }
-
+  
   typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
   // thread temp data
-  std::vector< std::vector<SketchEntry> > thread_temp;
+  std::vector< std::vector<BaseMaker::SketchEntry> > thread_sketch;
+  // used to hold statistics
+  std::vector< std::vector<TStats> > thread_stats;
+  // used to hold start pointer
+  std::vector< std::vector<HistEntry> > thread_hist;
+  // node statistics
+  std::vector<TStats> node_stats;
   // summary array
   WXQSketch::SummaryArray summary_array;
   // reducer for summary
diff --git a/src/tree/updater_skmaker-inl.hpp b/src/tree/updater_skmaker-inl.hpp
index 58f150ee616b..dd23b22c146b 100644
--- a/src/tree/updater_skmaker-inl.hpp
+++ b/src/tree/updater_skmaker-inl.hpp
@@ -130,80 +130,6 @@ class SketchMaker: public BaseMaker {
     inline void SetLeafVec(const TrainParam &param, bst_float *vec) const {
     }
   };
-  // temporal space to build a sketch
-  struct SketchEntry {
-    /*! \brief total sum of amount to be met */
-    bst_float sum_total;
-    /*! \brief statistics used in the sketch */
-    bst_float rmin, wmin;
-    /*! \brief last seen feature value */
-    bst_float last_fvalue;
-    /*! \brief current size of sketch */
-    bst_float next_goal;
-    // pointer to the sketch to put things in
-    utils::WXQuantileSketch<bst_float, bst_float> *sketch;
-    // initialize the space
-    inline void Init(unsigned max_size) {
-      next_goal = -1.0f;
-      rmin = wmin = 0.0f;
-      sketch->temp.Reserve(max_size + 1);
-      sketch->temp.size = 0;
-    }
-    /*!
-     * \brief push a new element to sketch 
-     * \param fvalue feature value, comes in sorted ascending order
-     * \param w weight
-     * \param max_size
-     */
-    inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
-      if (next_goal == -1.0f) {
-        next_goal = 0.0f;
-        last_fvalue = fvalue;
-        wmin = w;
-        return;
-      }
-      if (last_fvalue != fvalue) {
-        bst_float rmax = rmin + wmin;
-        if (rmax >= next_goal) {
-          if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
-            // push to sketch
-            sketch->temp.data[sketch->temp.size] =
-                utils::WXQuantileSketch<bst_float, bst_float>::
-                Entry(rmin, rmax, wmin, last_fvalue);
-            utils::Assert(sketch->temp.size < max_size,
-                          "invalid maximum size max_size=%u, stemp.size=%lu\n",
-                          max_size, sketch->temp.size);
-            ++sketch->temp.size;
-          }
-          if (sketch->temp.size == max_size) {
-            next_goal = sum_total * 2.0f + 1e-5f;
-          } else{
-            next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
-          }
-        }
-        rmin = rmax;
-        wmin = w;
-        last_fvalue = fvalue;
-      } else {
-        wmin += w;
-      }
-    }
-    /*! \brief push final unfinished value to the sketch */
-    inline void Finalize(unsigned max_size) {
-      bst_float rmax = rmin + wmin;
-      if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
-        utils::Assert(sketch->temp.size <= max_size,
-                      "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
-                      sketch->temp.size, max_size );
-        // push to sketch
-        sketch->temp.data[sketch->temp.size] =
-            utils::WXQuantileSketch<bst_float, bst_float>::
-            Entry(rmin, rmax, wmin, last_fvalue);
-        ++sketch->temp.size;
-      }
-      sketch->PushTemp();
-    }
-  };
   inline void BuildSketch(const std::vector<bst_gpair> &gpair,
                           IFMatrix *p_fmat,
                           const BoosterInfo &info,

From 54e2ed90d7b21aea2d7a302f72d6e9c440d10968 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 19 Nov 2014 11:21:07 -0800
Subject: [PATCH 079/166] recheck column mode

---
 Makefile                    |  5 +++--
 demo/mpi/README.md          |  3 ---
 demo/mpi/mpi.conf           | 36 ------------------------------------
 demo/mpi/runexp-mpi.sh      | 19 -------------------
 demo/mpi/splitsvm.py        | 32 --------------------------------
 multi-node/README.md        | 33 +++++++++++++++++++++++++++++++++
 src/learner/learner-inl.hpp | 14 ++++++++++++++
 src/xgboost_main.cpp        |  2 +-
 8 files changed, 51 insertions(+), 93 deletions(-)
 delete mode 100644 demo/mpi/README.md
 delete mode 100644 demo/mpi/mpi.conf
 delete mode 100755 demo/mpi/runexp-mpi.sh
 delete mode 100644 demo/mpi/splitsvm.py
 create mode 100644 multi-node/README.md

diff --git a/Makefile b/Makefile
index 72c981706dc8..51b7a578ac1b 100644
--- a/Makefile
+++ b/Makefile
@@ -17,9 +17,10 @@ MPIOBJ = sync_mpi.o
 MPIBIN = xgboost-mpi
 SLIB = wrapper/libxgboostwrapper.so 
 
-.PHONY: clean all python Rpack
+.PHONY: clean all mpi python Rpack
 
-all: $(BIN) $(OBJ) $(SLIB) $(MPIOBJ) $(MPIBIN)
+all: $(BIN) $(OBJ) $(SLIB) 
+mpi: $(MPIBIN)
 
 python: wrapper/libxgboostwrapper.so
 # now the wrapper takes in two files. io and wrapper part
diff --git a/demo/mpi/README.md b/demo/mpi/README.md
deleted file mode 100644
index 60fd0eb6eed7..000000000000
--- a/demo/mpi/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-This folder contains toy example script to run xgboost-mpi. 
-
-This is an experimental distributed version of xgboost
diff --git a/demo/mpi/mpi.conf b/demo/mpi/mpi.conf
deleted file mode 100644
index 5b1f978d10c1..000000000000
--- a/demo/mpi/mpi.conf
+++ /dev/null
@@ -1,36 +0,0 @@
-# General Parameters, see comment for each definition
-# choose the booster, can be gbtree or gblinear
-booster = gbtree
-# choose logistic regression loss function for binary classification
-objective = binary:logistic
-
-# Tree Booster Parameters
-# step size shrinkage
-eta = 1.0 
-# minimum loss reduction required to make a further partition
-gamma = 1.0 
-# minimum sum of instance weight(hessian) needed in a child
-min_child_weight = 1 
-# maximum depth of a tree
-max_depth = 3 
-
-# Task Parameters
-# the number of round to do boosting
-num_round = 2
-# 0 means do not save any model except the final round model
-save_period = 0 
-use_buffer = 0
-
-
-# The path of training data %d is the wildcard for the rank of the data
-# The idea is each process take a feature matrix with subset of columns
-#
-data = "train.col%d" 
-
-# The path of validation data, used to monitor training process, here [test] sets name of the validation set
-eval[test] = "../data/agaricus.txt.test" 
-# evaluate on training data as well each round
-eval_train = 1
-
-# The path of test data, need to use full data of test, try not use it, or keep an subsampled version
-test:data = "agaricus.txt.test"      
diff --git a/demo/mpi/runexp-mpi.sh b/demo/mpi/runexp-mpi.sh
deleted file mode 100755
index cc0c6d459a5e..000000000000
--- a/demo/mpi/runexp-mpi.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-if [[ $# -ne 1 ]]
-then
-    echo "Usage: nprocess"
-    exit -1
-fi
-
-rm -rf train.col*
-k=$1
-
-# split the lib svm file into k subfiles
-python splitsvm.py ../data/agaricus.txt.train train $k
-
-# run xgboost mpi
-mpirun -n $k ../../xgboost-mpi  mpi.conf 
-
-# the model can be directly loaded by single machine xgboost solver, as usuall
-../../xgboost mpi.conf task=dump model_in=0002.model fmap=../data/featmap.txt name_dump=dump.nice.$k.txt
-cat dump.nice.$k.txt
diff --git a/demo/mpi/splitsvm.py b/demo/mpi/splitsvm.py
deleted file mode 100644
index 365aef610c84..000000000000
--- a/demo/mpi/splitsvm.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/python
-import sys
-import random
-
-# split libsvm file into different subcolumns
-if len(sys.argv) < 4:
-    print ('Usage:<fin> <fo> k')
-    exit(0)
-
-random.seed(10)
-fmap = {}
-
-k = int(sys.argv[3])
-fi = open( sys.argv[1], 'r' )
-fos = []
-
-for i in range(k):
-    fos.append(open( sys.argv[2]+'.col%d' % i, 'w' ))
-    
-for l in open(sys.argv[1]):
-    arr = l.split()
-    for f in fos:
-        f.write(arr[0])
-    for it in arr[1:]:
-        fid = int(it.split(':')[0])
-        if fid not in fmap:
-            fmap[fid] = random.randint(0, k-1)
-        fos[fmap[fid]].write(' '+it)
-    for f in fos:
-        f.write('\n')
-for f in fos:    
-    f.close()
diff --git a/multi-node/README.md b/multi-node/README.md
new file mode 100644
index 000000000000..fab7472e7470
--- /dev/null
+++ b/multi-node/README.md
@@ -0,0 +1,33 @@
+Distributed XGBoost
+======
+This folder contains information about the experimental version of distributed xgboost.
+
+Build
+=====
+* You will need to have MPI
+* In the root folder, run ```make mpi```, this will give you xgboost-mpi
+
+Design Choice
+=====
+* Does distributed xgboost rely on MPI?
+  - Yes, but the dependency is isolated in the [sync](../src/sync/sync.h) module
+  - Specifically, xgboost relies on an MPI protocol that provides Broadcast and AllReduce;
+     if another platform/framework implements these protocols, xgboost should naturally extend to that platform
+* How is the data distributed?
+  - There are two solvers in distributed xgboost
+  - The column-based solver splits data by column; each node works on a subset of columns
+    and uses exactly the same algorithm as the single-node version.
+  - The row-based solver splits data by row; each node works on a subset of rows.
+    It uses an approximate histogram count algorithm and will only examine a subset of
+    potential split points, as opposed to all split points.
+* How to run the distributed version
+  - The current code runs in an MPI environment; you will need to have a network filesystem,
+    or copy the data to the local file system before running the code
+  - The distributed version is still multi-threading optimized.
+    You should run one xgboost-mpi process per node that takes most of the available CPUs;
+    this will reduce the communication overhead and improve the performance.
+  - One way to do that is to limit the MPI slots on each machine to 1, or to reserve nthread processors for each process.
+  
+Examples
+====
+* [Column-based version](col-split)
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 89bc28aec1d3..b1a95dd96d01 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -32,6 +32,7 @@ class BoostLearner {
     silent= 0;
     prob_buffer_row = 1.0f;
     part_load_col = 0;
+    distributed_mode = 0;
   }
   ~BoostLearner(void) {
     if (obj_ != NULL) delete obj_;
@@ -89,6 +90,17 @@ class BoostLearner {
       this->SetParam(n.c_str(), val);
     }
     if (!strcmp(name, "silent")) silent = atoi(val);
+    if (!strcmp(name, "dsplit")) {
+      if (!strcmp(val, "col")) {
+        this->SetParam("updater", "distcol,prune");
+        distributed_mode = 1;
+      } else if (!strcmp(val, "row")) {
+        this->SetParam("updater", "grow_histmaker,prune");
+        distributed_mode = 2;
+      } else {
+        utils::Error("%s is invalid value for dsplit, should be row or col", val);
+      }
+    }
     if (!strcmp(name, "part_load_col")) part_load_col = atoi(val);
     if (!strcmp(name, "prob_buffer_row")) {
       prob_buffer_row = static_cast<float>(atof(val));
@@ -352,6 +364,8 @@ class BoostLearner {
   // data fields
   // silent during training
   int silent;
+  // distributed learning mode, if any, 0:none, 1:col, 2:row
+  int distributed_mode;
   // randomly load part of data
   int part_load_col;
   // maximum buffred row value
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index ef3a9079c63b..1b596ebfb615 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -32,7 +32,7 @@ class BoostLearnTask {
       }
     }
     if (sync::IsDistributed()) {
-      this->SetParam("updater", "distcol");
+      this->SetParam("data_split", "col");
     }
     if (sync::GetRank() != 0) {
       this->SetParam("silent", "2");

From 03e24cf59088326e4ebd4a9f149fa761ae222ccd Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 19 Nov 2014 11:22:17 -0800
Subject: [PATCH 080/166] check multinode

---
 multi-node/col-split/README.md     |  2 ++
 multi-node/col-split/runexp-mpi.sh | 19 ++++++++++++++++++
 multi-node/col-split/splitsvm.py   | 32 ++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+)
 create mode 100644 multi-node/col-split/README.md
 create mode 100755 multi-node/col-split/runexp-mpi.sh
 create mode 100644 multi-node/col-split/splitsvm.py

diff --git a/multi-node/col-split/README.md b/multi-node/col-split/README.md
new file mode 100644
index 000000000000..14fe993d20f1
--- /dev/null
+++ b/multi-node/col-split/README.md
@@ -0,0 +1,2 @@
+Column Split Version of XGBoost
+====
diff --git a/multi-node/col-split/runexp-mpi.sh b/multi-node/col-split/runexp-mpi.sh
new file mode 100755
index 000000000000..d5469e7148ae
--- /dev/null
+++ b/multi-node/col-split/runexp-mpi.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: nprocess"
+    exit -1
+fi
+
+rm -rf train.col*
+k=$1
+
+# split the lib svm file into k subfiles
+python splitsvm.py ../../demo/data/agaricus.txt.train train $k
+
+# run xgboost mpi
+mpirun -n $k ../../xgboost-mpi  mushroom-col.conf dsplit=col
+
+# the model can be directly loaded by single machine xgboost solver, as usuall
+../../xgboost mpi.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
+cat dump.nice.$k.txt
diff --git a/multi-node/col-split/splitsvm.py b/multi-node/col-split/splitsvm.py
new file mode 100644
index 000000000000..365aef610c84
--- /dev/null
+++ b/multi-node/col-split/splitsvm.py
@@ -0,0 +1,32 @@
+#!/usr/bin/python
+import sys
+import random
+
+# split libsvm file into different subcolumns
+if len(sys.argv) < 4:
+    print ('Usage:<fin> <fo> k')
+    exit(0)
+
+random.seed(10)
+fmap = {}
+
+k = int(sys.argv[3])
+fi = open( sys.argv[1], 'r' )
+fos = []
+
+for i in range(k):
+    fos.append(open( sys.argv[2]+'.col%d' % i, 'w' ))
+    
+for l in open(sys.argv[1]):
+    arr = l.split()
+    for f in fos:
+        f.write(arr[0])
+    for it in arr[1:]:
+        fid = int(it.split(':')[0])
+        if fid not in fmap:
+            fmap[fid] = random.randint(0, k-1)
+        fos[fmap[fid]].write(' '+it)
+    for f in fos:
+        f.write('\n')
+for f in fos:    
+    f.close()

From da54f5e5d860b81284325ac93d82b5f1b78d7026 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 19 Nov 2014 11:37:54 -0800
Subject: [PATCH 081/166] add note for col

---
 multi-node/col-split/README.md       | 14 ++++++++++++++
 multi-node/col-split/run-mushroom.sh | 19 +++++++++++++++++++
 multi-node/col-split/runexp-mpi.sh   |  4 ++--
 3 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100755 multi-node/col-split/run-mushroom.sh
 mode change 100755 => 100644 multi-node/col-split/runexp-mpi.sh

diff --git a/multi-node/col-split/README.md b/multi-node/col-split/README.md
index 14fe993d20f1..b3053080f7a6 100644
--- a/multi-node/col-split/README.md
+++ b/multi-node/col-split/README.md
@@ -1,2 +1,16 @@
 Column Split Version of XGBoost
 ====
+* run ```bash run-mushroom.sh```
+
+Steps to use column split version
+====
+* First split the data by column, 
+* In the config, specify data file as containing a wildcard %d, where %d is the rank of the node, each node will load their part of data
+* Enable column split mode by ```dsplit=col```
+
+Note on the Column Split Version
+====
+* The code is multi-threaded, so you want to run one xgboost-mpi per node
+* The code will work correctly as long as union of each column subset is all the columns we are interested in.
+  - The column subset can overlap with each other.
+* It uses exactly the same algorithm as the single-node version and examines all potential split points.
diff --git a/multi-node/col-split/run-mushroom.sh b/multi-node/col-split/run-mushroom.sh
new file mode 100755
index 000000000000..5c4c06587f40
--- /dev/null
+++ b/multi-node/col-split/run-mushroom.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: nprocess"
+    exit -1
+fi
+
+rm -rf train.col*
+k=$1
+
+# split the lib svm file into k subfiles
+python splitsvm.py ../../demo/data/agaricus.txt.train train $k
+
+# run xgboost mpi
+mpirun -n $k ../../xgboost-mpi  mushroom-col.conf updater=distcol silent=0
+
+# the model can be directly loaded by single machine xgboost solver, as usuall
+../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
+cat dump.nice.$k.txt
diff --git a/multi-node/col-split/runexp-mpi.sh b/multi-node/col-split/runexp-mpi.sh
old mode 100755
new mode 100644
index d5469e7148ae..906ace94c32b
--- a/multi-node/col-split/runexp-mpi.sh
+++ b/multi-node/col-split/runexp-mpi.sh
@@ -12,8 +12,8 @@ k=$1
 python splitsvm.py ../../demo/data/agaricus.txt.train train $k
 
 # run xgboost mpi
-mpirun -n $k ../../xgboost-mpi  mushroom-col.conf dsplit=col
+mpirun -n $k ../../xgboost-mpi mushroom-col.conf dsplit=col
 
 # the model can be directly loaded by single machine xgboost solver, as usuall
-../../xgboost mpi.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
+../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
 cat dump.nice.$k.txt

From 55e62a7120705b411f9314f40c0c3533012fd722 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 19 Nov 2014 11:44:24 -0800
Subject: [PATCH 082/166] still need to test row merge

---
 multi-node/col-split/README.md                |  6 +++---
 .../{runexp-mpi.sh => mushroom-col.sh}        |  0
 multi-node/col-split/run-mushroom.sh          | 19 -------------------
 src/learner/learner-inl.hpp                   |  4 +++-
 4 files changed, 6 insertions(+), 23 deletions(-)
 rename multi-node/col-split/{runexp-mpi.sh => mushroom-col.sh} (100%)
 mode change 100644 => 100755
 delete mode 100755 multi-node/col-split/run-mushroom.sh

diff --git a/multi-node/col-split/README.md b/multi-node/col-split/README.md
index b3053080f7a6..c0b9fef7c68a 100644
--- a/multi-node/col-split/README.md
+++ b/multi-node/col-split/README.md
@@ -1,14 +1,14 @@
-Column Split Version of XGBoost
+Distributed XGBoost: Column Split Version
 ====
 * run ```bash run-mushroom.sh```
 
-Steps to use column split version
+How to Use
 ====
 * First split the data by column, 
 * In the config, specify data file as containing a wildcard %d, where %d is the rank of the node, each node will load their part of data
 * Enable column split mode by ```dsplit=col```
 
-Note on the Column Split Version
+Notes
 ====
 * The code is multi-threaded, so you want to run one xgboost-mpi per node
 * The code will work correctly as long as union of each column subset is all the columns we are interested in.
diff --git a/multi-node/col-split/runexp-mpi.sh b/multi-node/col-split/mushroom-col.sh
old mode 100644
new mode 100755
similarity index 100%
rename from multi-node/col-split/runexp-mpi.sh
rename to multi-node/col-split/mushroom-col.sh
diff --git a/multi-node/col-split/run-mushroom.sh b/multi-node/col-split/run-mushroom.sh
deleted file mode 100755
index 5c4c06587f40..000000000000
--- a/multi-node/col-split/run-mushroom.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-if [[ $# -ne 1 ]]
-then
-    echo "Usage: nprocess"
-    exit -1
-fi
-
-rm -rf train.col*
-k=$1
-
-# split the lib svm file into k subfiles
-python splitsvm.py ../../demo/data/agaricus.txt.train train $k
-
-# run xgboost mpi
-mpirun -n $k ../../xgboost-mpi  mushroom-col.conf updater=distcol silent=0
-
-# the model can be directly loaded by single machine xgboost solver, as usuall
-../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
-cat dump.nice.$k.txt
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index b1a95dd96d01..70e71cf57894 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -92,7 +92,7 @@ class BoostLearner {
     if (!strcmp(name, "silent")) silent = atoi(val);
     if (!strcmp(name, "dsplit")) {
       if (!strcmp(val, "col")) {
-        this->SetParam("updater", "distcol,prune");
+        this->SetParam("updater", "distcol");
         distributed_mode = 1;
       } else if (!strcmp(val, "row")) {
         this->SetParam("updater", "grow_histmaker,prune");
@@ -104,6 +104,8 @@ class BoostLearner {
     if (!strcmp(name, "part_load_col")) part_load_col = atoi(val);
     if (!strcmp(name, "prob_buffer_row")) {
       prob_buffer_row = static_cast<float>(atof(val));
+      utils::Check(distributed_mode == 0,
+                   "prob_buffer_row can only be used in single node mode so far");
       this->SetParam("updater", "grow_colmaker,refresh,prune");
     }
     if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);

From 7c3a392136d44e0f0a0da7e1d840b824a13a004b Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 19 Nov 2014 15:28:09 -0800
Subject: [PATCH 083/166] compile

---
 Makefile                             |  2 +-
 multi-node/col-split/mushroom-col.sh |  2 +-
 src/sync/sync.h                      | 84 ++++++++++++++----------
 src/sync/sync_empty.cpp              |  4 +-
 src/sync/sync_mpi.cpp                | 40 +++++++++---
 src/tree/updater.cpp                 |  6 +-
 src/tree/updater_histmaker-inl.hpp   | 36 ++++++-----
 src/utils/quantile.h                 | 95 ++++++++--------------------
 8 files changed, 136 insertions(+), 133 deletions(-)

diff --git a/Makefile b/Makefile
index 51b7a578ac1b..d5fd7c394069 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,7 @@ SLIB = wrapper/libxgboostwrapper.so
 
 .PHONY: clean all mpi python Rpack
 
-all: $(BIN) $(OBJ) $(SLIB) 
+all: $(BIN) $(OBJ) $(SLIB) mpi
 mpi: $(MPIBIN)
 
 python: wrapper/libxgboostwrapper.so
diff --git a/multi-node/col-split/mushroom-col.sh b/multi-node/col-split/mushroom-col.sh
index 906ace94c32b..63b5092ba2e1 100755
--- a/multi-node/col-split/mushroom-col.sh
+++ b/multi-node/col-split/mushroom-col.sh
@@ -5,7 +5,7 @@ then
     exit -1
 fi
 
-rm -rf train.col*
+rm -rf train.col* *.model
 k=$1
 
 # split the lib svm file into k subfiles
diff --git a/src/sync/sync.h b/src/sync/sync.h
index fe34983ef235..e728932e0887 100644
--- a/src/sync/sync.h
+++ b/src/sync/sync.h
@@ -11,6 +11,10 @@
 #include "../utils/io.h"
 #include <string>
 
+namespace MPI {
+// forward declaration of MPI::Datatype, without including its definition
+class Datatype;
+};
 namespace xgboost {
 /*! \brief syncrhonizer module that minimumly wraps interface of MPI */
 namespace sync {
@@ -62,23 +66,31 @@ void Bcast(std::string *sendrecv_data, int root);
 class ReduceHandle {
  public:
   // reduce function
-  typedef void (ReduceFunction) (const void *src, void *dst, int len);
+  typedef void (ReduceFunction) (const void *src, void *dst, int len, const MPI::Datatype &dtype);
   // constructor
   ReduceHandle(void);
   // destructor
   ~ReduceHandle(void);
-  // initialize the reduce function
-  void Init(ReduceFunction redfunc, bool commute = true);
+  /*!
+   * \brief initialize the reduce function, together with the type the reduce function needs to deal with
+   */
+  void Init(ReduceFunction redfunc, size_t type_n4bytes, bool commute = true);
   /*!
    * \brief customized in-place all reduce operation 
    * \param sendrecvbuf the in place send-recv buffer
-   * \param n4bytes number of nbytes send through all reduce
+   * \param type_n4bytes unit size of the type, in terms of 4bytes
+   * \param count number of elements to send
    */
-  void AllReduce(void *sendrecvbuf, size_t n4bytes);
-  
+  void AllReduce(void *sendrecvbuf, size_t type_n4bytes, size_t count);
+  /*! \return the number of bytes occupied by the type */
+  static int TypeSize(const MPI::Datatype &dtype);
  private:
   // handle data field
   void *handle;
+  // handle to the type field
+  void *htype;
+  // the created type in 4 bytes
+  size_t created_type_n4bytes;
 };
 
 // ----- extensions for ease of use ------
@@ -92,7 +104,7 @@ template<typename DType>
 class Reducer {
  public:
   Reducer(void) {
-    handle.Init(ReduceInner);
+    handle.Init(ReduceInner, kUnit);
     utils::Assert(sizeof(DType) % sizeof(int) == 0, "struct must be multiple of int");
   }
   /*!
@@ -102,24 +114,23 @@ class Reducer {
    * \param reducer the reducer function
    */
   inline void AllReduce(DType *sendrecvbuf, size_t count) {
-    handle.AllReduce(sendrecvbuf, count * kUnit);
+    handle.AllReduce(sendrecvbuf, kUnit, count);
   }
 
  private:
   // unit size 
   static const size_t kUnit = sizeof(DType) / sizeof(int);
   // inner implementation of reducer
-  inline static void ReduceInner(const void *src_, void *dst_, int len_) {
+  inline static void ReduceInner(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) {
     const int *psrc = reinterpret_cast<const int*>(src_);
     int *pdst = reinterpret_cast<int*>(dst_);
     DType tdst, tsrc;
-    utils::Assert(len_ % kUnit == 0, "length not divide by size");
-    for (size_t i = 0; i < len_; i += kUnit) {
+    for (size_t i = 0; i < len_; ++i) {
       // use memcpy to avoid alignment issue
-      std::memcpy(&tdst, pdst + i, sizeof(tdst));
-      std::memcpy(&tsrc, psrc + i, sizeof(tsrc));
+      std::memcpy(&tdst, pdst + i * kUnit, sizeof(tdst));
+      std::memcpy(&tsrc, psrc + i * kUnit, sizeof(tsrc));
       tdst.Reduce(tsrc);
-      std::memcpy(pdst + i, &tdst, sizeof(tdst));      
+      std::memcpy(pdst + i * kUnit, &tdst, sizeof(tdst));      
     }
   }
   // function handle
@@ -135,38 +146,47 @@ class Reducer {
  *   (1) Save(IStream &fs)  (2) Load(IStream &fs) (3) Reduce(const DType &d);
  */
 template<typename DType>
-class ComplexReducer {
+class SerializeReducer {
  public:
-  ComplexReducer(void) {
-    handle.Init(ReduceInner);
+  SerializeReducer(void) {
+    handle.Init(ReduceInner, 0);
   }
   /*!
-   * \brief customized in-place all reduce operation 
+   * \brief customized in-place all reduce operation
    * \param sendrecvobj pointer to the object to be reduced
    * \param max_n4byte maximum amount of memory needed in 4byte
    * \param reducer the reducer function
    */
-  inline void AllReduce(DType *sendrecvobj, size_t max_n4byte) {
-    buffer.resize(max_n4byte);
-    utils::MemoryFixSizeBuffer fs(BeginPtr(buffer), max_n4byte * 4);
-    sendrecvobj->Save(fs);
-    handle.AllReduce(BeginPtr(buffer), max_n4byte);
-    fs.Seek(0);
-    sendrecvobj->Load(fs);
+  inline void AllReduce(DType *sendrecvobj, size_t max_n4byte, size_t count) {
+    buffer.resize(max_n4byte * count);
+    for (size_t i = 0; i < count; ++i) {
+      utils::MemoryFixSizeBuffer fs(BeginPtr(buffer) + i * max_n4byte * 4, max_n4byte * 4);
+      sendrecvobj[i]->Save(fs);
+    }
+    handle.AllReduce(BeginPtr(buffer), max_n4byte, count);
+    for (size_t i = 0; i < count; ++i) {
+      utils::MemoryFixSizeBuffer fs(BeginPtr(buffer) + i * max_n4byte * 4, max_n4byte * 4);
+      sendrecvobj[i]->Load(fs);
+    }
   }
 
  private:
   // unit size
   // inner implementation of reducer
-  inline static void ReduceInner(const void *src_, void *dst_, int len_) {
-    utils::MemoryFixSizeBuffer fsrc((void*)(src_), len_);
-    utils::MemoryFixSizeBuffer fdst(dst_, len_);
+  inline static void ReduceInner(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) {
+    int nbytes = ReduceHandle::TypeSize(dtype);
     // temp space
     DType tsrc, tdst;
-    tsrc.Load(fsrc); tdst.Load(fdst);
-    // govern const check
-    tdst.Reduce(static_cast<const DType &>(tsrc));
-    tdst.Save(fdst);
+    for (int i = 0; i < len_; ++i) {
+      utils::MemoryFixSizeBuffer fsrc((void*)(src_) + i * nbytes, nbytes);
+      utils::MemoryFixSizeBuffer fdst(dst_ + i * nbytes, nbytes);
+      tsrc.Load(fsrc);
+      tdst.Load(fdst);
+      // govern const check
+      tdst.Reduce(static_cast<const DType &>(tsrc));
+      fdst.Seek(0);
+      tdst.Save(fdst);
+    }
   }
   // function handle
   ReduceHandle handle;
diff --git a/src/sync/sync_empty.cpp b/src/sync/sync_empty.cpp
index a86707d61e3b..d11d164cd82b 100644
--- a/src/sync/sync_empty.cpp
+++ b/src/sync/sync_empty.cpp
@@ -38,8 +38,8 @@ void Bcast(std::string *sendrecv_data, int root) {
 
 ReduceHandle::ReduceHandle(void) : handle(NULL) {}
 ReduceHandle::~ReduceHandle(void) {}
-void ReduceHandle::Init(ReduceFunction redfunc, bool commute) {}
-void ReduceHandle::AllReduce(void *sendrecvbuf, size_t n4byte) {}
+void ReduceHandle::Init(ReduceFunction redfunc, size_t type_n4bytes, bool commute) {}
+void ReduceHandle::AllReduce(void *sendrecvbuf, size_t type_n4bytes, size_t n4byte) {}
 }  // namespace sync
 }  // namespace xgboost
 
diff --git a/src/sync/sync_mpi.cpp b/src/sync/sync_mpi.cpp
index 45f6c3d75259..b96a509a0b89 100644
--- a/src/sync/sync_mpi.cpp
+++ b/src/sync/sync_mpi.cpp
@@ -1,6 +1,7 @@
 #include "./sync.h"
 #include "../utils/utils.h"
-#include "mpi.h"
+#include <mpi.h>
+
 // use MPI to implement sync
 namespace xgboost {
 namespace sync {
@@ -60,7 +61,7 @@ void Bcast(std::string *sendrecv_data, int root) {
 }
 
 // code for reduce handle
-ReduceHandle::ReduceHandle(void) : handle(NULL) {
+ReduceHandle::ReduceHandle(void) : handle(NULL), htype(NULL) {
 }
 ReduceHandle::~ReduceHandle(void) {
   if (handle != NULL) {
@@ -68,19 +69,42 @@ ReduceHandle::~ReduceHandle(void) {
     op->Free();
     delete op;
   }
+  if (htype != NULL) {
+    MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype);
+    dtype->Free();
+    delete dtype;
+  }
 }
-void ReduceHandle::Init(ReduceFunction redfunc, bool commute) {
+int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
+  return dtype.Get_size();
+}
+void ReduceHandle::Init(ReduceFunction redfunc, size_t type_n4bytes, bool commute) {
   utils::Assert(handle == NULL, "cannot initialize reduce handle twice");
+  if (type_n4bytes != 0) {
+    MPI::Datatype *dtype = new MPI::Datatype();
+    *dtype = MPI::INT.Create_contiguous(type_n4bytes);
+    dtype->Commit();
+    created_type_n4bytes = type_n4bytes;
+    htype = dtype;
+  }
+  
   MPI::Op *op = new MPI::Op();
-  MPI::User_function *pf = reinterpret_cast<MPI::User_function*>(redfunc);
+  MPI::User_function *pf = redfunc;
   op->Init(pf, commute);
   handle = op;
 }
-void ReduceHandle::AllReduce(void *sendrecvbuf, size_t n4byte) {
-  utils::Assert(handle != NULL, "must intialize handle to call AllReduce");  
+void ReduceHandle::AllReduce(void *sendrecvbuf, size_t type_n4bytes, size_t count) {
+  utils::Assert(handle != NULL, "must intialize handle to call AllReduce");
   MPI::Op *op = reinterpret_cast<MPI::Op*>(handle);
-  MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, n4byte, MPI_INT, *op);
-}
+  MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype);
 
+  if (created_type_n4bytes != type_n4bytes || htype == NULL) {
+    dtype->Free();
+    *dtype = MPI::INT.Create_contiguous(type_n4bytes);
+    dtype->Commit();
+    created_type_n4bytes = type_n4bytes;
+  }
+  MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, *dtype, *op);
+}
 }  // namespace sync
 }  // namespace xgboost
diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index a087bf9ed5ca..a4cd65de0bdc 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -7,7 +7,7 @@
 #include "./updater_refresh-inl.hpp"
 #include "./updater_colmaker-inl.hpp"
 #include "./updater_distcol-inl.hpp"
-#include "./updater_skmaker-inl.hpp"
+//#include "./updater_skmaker-inl.hpp"
 #include "./updater_histmaker-inl.hpp"
 
 namespace xgboost {
@@ -18,8 +18,8 @@ IUpdater* CreateUpdater(const char *name) {
   if (!strcmp(name, "sync")) return new TreeSyncher();
   if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
   if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
-  if (!strcmp(name, "grow_histmaker")) return new CQHistMaker<GradStats>();
-  if (!strcmp(name, "grow_skmaker")) return new SketchMaker();
+  //if (!strcmp(name, "grow_histmaker")) return new CQHistMaker<GradStats>();
+  //if (!strcmp(name, "grow_skmaker")) return new SketchMaker();
   if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
 
   utils::Error("unknown updater:%s", name);
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 4c0136ac8809..f05308ce22eb 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -306,6 +306,7 @@ class CQHistMaker: public HistMaker<TStats> {
       hist.data[istart].Add(gpair, info, ridx);
     }
   };
+  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
   virtual void CreateHist(const std::vector<bst_gpair> &gpair,
                           IFMatrix *p_fmat,
                           const BoosterInfo &info,
@@ -371,21 +372,22 @@ class CQHistMaker: public HistMaker<TStats> {
     // setup maximum size
     unsigned max_size = this->param.max_sketch_size();
     // synchronize sketch
-    summary_array.Init(sketchs.size(), max_size);
+    summary_array.resize(sketchs.size());
     for (size_t i = 0; i < sketchs.size(); ++i) {
       utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
       sketchs[i].GetSummary(&out);
-      summary_array.Set(i, out);
+      summary_array[i].Reserve(max_size);
+      summary_array[i].SetPrune(out, max_size);
     }
-    size_t n4bytes = (summary_array.MemSize() + 3) / 4;
-    sreducer.AllReduce(&summary_array, n4bytes);
+    size_t n4bytes = (WXQSketch::SummaryContainer::CalcMemCost(max_size) + 3) / 4;
+    sreducer.AllReduce(BeginPtr(summary_array), n4bytes, summary_array.size());
     // now we get the final result of sketch, setup the cut
     this->wspace.cut.clear();
     this->wspace.rptr.clear();
     this->wspace.rptr.push_back(0);
     for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
       for (int fid = 0; fid < tree.param.num_feature; ++fid) {
-        const WXQSketch::Summary a = summary_array[wid * tree.param.num_feature + fid];
+        const WXQSketch::Summary &a = summary_array[wid * tree.param.num_feature + fid];
         for (size_t i = 1; i < a.size; ++i) {
           bst_float cpt = a.data[i].value - rt_eps;
           if (i == 1 || cpt > this->wspace.cut.back()) {
@@ -407,7 +409,7 @@ class CQHistMaker: public HistMaker<TStats> {
     }
     utils::Assert(this->wspace.rptr.size() ==
                   (tree.param.num_feature + 1) * this->qexpand.size() + 1,
-                  "cut space inconsistent");    
+                  "cut space inconsistent");
   }
   
  private:
@@ -496,7 +498,6 @@ class CQHistMaker: public HistMaker<TStats> {
     }
   }
   
-  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
   // thread temp data
   std::vector< std::vector<BaseMaker::SketchEntry> > thread_sketch;
   // used to hold statistics
@@ -506,9 +507,9 @@ class CQHistMaker: public HistMaker<TStats> {
   // node statistics
   std::vector<TStats> node_stats;
   // summary array
-  WXQSketch::SummaryArray summary_array;
+  std::vector< WXQSketch::SummaryContainer> summary_array;
   // reducer for summary
-  sync::ComplexReducer<WXQSketch::SummaryArray> sreducer;
+  sync::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
   // per node, per feature sketch
   std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;  
 };
@@ -580,23 +581,24 @@ class QuantileHistMaker: public HistMaker<TStats> {
       }
     }
     // setup maximum size
-    size_t max_size = static_cast<size_t>(this->param.sketch_ratio / this->param.sketch_eps);
+    unsigned max_size = this->param.max_sketch_size();
     // synchronize sketch
-    summary_array.Init(sketchs.size(), max_size);
+    summary_array.resize(sketchs.size());
     for (size_t i = 0; i < sketchs.size(); ++i) {
       utils::WQuantileSketch<bst_float, bst_float>::SummaryContainer out;
       sketchs[i].GetSummary(&out);
-      summary_array.Set(i, out);
+      summary_array[i].Reserve(max_size);
+      summary_array[i].SetPrune(out, max_size);
     }
-    size_t n4bytes = (summary_array.MemSize() + 3) / 4;
-    sreducer.AllReduce(&summary_array, n4bytes);
+    size_t n4bytes = (WXQSketch::SummaryContainer::CalcMemCost(max_size) + 3) / 4;
+    sreducer.AllReduce(BeginPtr(summary_array), n4bytes, summary_array.size());
     // now we get the final result of sketch, setup the cut
     this->wspace.cut.clear();
     this->wspace.rptr.clear();
     this->wspace.rptr.push_back(0);
     for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
       for (int fid = 0; fid < tree.param.num_feature; ++fid) {
-        const WXQSketch::Summary a = summary_array[wid * tree.param.num_feature + fid];
+        const WXQSketch::Summary &a = summary_array[wid * tree.param.num_feature + fid];
         for (size_t i = 1; i < a.size; ++i) {
           bst_float cpt = a.data[i].value - rt_eps;
           if (i == 1 || cpt > this->wspace.cut.back()) {
@@ -624,9 +626,9 @@ class QuantileHistMaker: public HistMaker<TStats> {
  private:
   typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
   // summary array
-  WXQSketch::SummaryArray summary_array;
+  std::vector< WXQSketch::SummaryContainer> summary_array;
   // reducer for summary
-  sync::ComplexReducer<WXQSketch::SummaryArray> sreducer;
+  sync::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
   // local temp column data structure
   std::vector<size_t> col_ptr;
   // local storage of column data
diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index c3cdb86c2c26..6727c86751fa 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -224,6 +224,12 @@ struct WQSummary {
    */
   inline void SetCombine(const WQSummary &sa,
                          const WQSummary &sb) {
+    if (sa.size == 0) {
+      this->CopyFrom(sb); return;
+    }
+    if (sb.size == 0) {
+      this->CopyFrom(sa); return;
+    }
     utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); 
     const Entry *a = sa.data, *a_end = sa.data + sa.size;
     const Entry *b = sb.data, *b_end = sb.data + sb.size;
@@ -453,6 +459,12 @@ struct GKSummary {
   }
   inline void SetCombine(const GKSummary &sa,
                          const GKSummary &sb) {
+    if (sa.size == 0) {
+      this->CopyFrom(sb); return;
+    }
+    if (sb.size == 0) {
+      this->CopyFrom(sa); return;
+    }    
     utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); 
     const Entry *a = sa.data, *a_end = sa.data + sa.size;
     const Entry *b = sb.data, *b_end = sb.data + sb.size;
@@ -537,96 +549,41 @@ class QuantileSketchTemplate {
         this->SetMerge(begin[0], begin[1]);
       } else {
         // recursive merge
-        SummaryContainer lhs, rhs;        
+        SummaryContainer lhs, rhs;
         lhs.SetCombine(begin, begin + len / 2);
         rhs.SetCombine(begin + len / 2, end);
         this->Reserve(lhs.size + rhs.size);
         this->SetCombine(lhs, rhs);
       }
     }
-  };
-  /*!
-   * \brief represent an array of summary
-   *  each contains fixed maximum size summary
-   */
-  class SummaryArray {
-   public:
-    /*!
-     * \brief intialize the SummaryArray 
-     * \param num_summary number of summary in the array
-     * \param max_size maximum number of elements in each summary
-     */
-    inline void Init(unsigned num_summary, unsigned max_size) {
-      this->num_summary = num_summary;
-      this->max_size = max_size;
-      sizes.resize(num_summary);
-      data.resize(num_summary * max_size);
-    }
-    /*!
-     * \brief set i-th element of array to be the src summary,
-     *   the summary can be pruned if it does not fit into max_size
-     * \param the index in the array
-     * \param src the source summary
-     * \tparam the type if source summary
-     */
-    template<typename TSrc>
-    inline void Set(size_t i, const TSrc &src) {
-      Summary dst = (*this)[i];
-      dst.SetPrune(src, max_size);
-      this->sizes[i] = dst.size;
-    }
-    /*! 
-     * \brief get i-th summary of the array, only use this for read purpose
-     */
-    inline const Summary operator[](size_t i) const {
-      return Summary((Entry*)BeginPtr(data) + i * max_size, sizes[i]);
-    }
     /*!
      * \brief do elementwise combination of summary array
      *        this[i] = combine(this[i], src[i]) for each i
      * \param src the source summary
+     * \param max_nbyte maximum number of bytes the serialized summary is allowed to occupy
      */
-    inline void Reduce(const SummaryArray &src) {
-      utils::Check(num_summary == src.num_summary &&
-                   max_size == src.max_size, "array shape mismatch in reduce");
+    inline void Reduce(const Summary &src, size_t max_nbyte) {
+      this->Reserve((max_nbyte - sizeof(this->size)) / sizeof(Entry));
       SummaryContainer temp;
-      temp.Reserve(max_size * 2);
-      for (unsigned i = 0; i < num_summary; ++i) {
-        temp.SetCombine((*this)[i], src[i]);
-        this->Set(i, temp);
-      }
+      temp.Reserve(this->size + src.size);
+      temp.SetCombine(*this, src);
+      this->SetPrune(temp, space.size());
     }
     /*! \brief return the number of bytes this data structure cost in serialization */
-    inline size_t MemSize(void) const {
-      return sizeof(num_summary) + sizeof(max_size) 
-          + data.size() * sizeof(Entry) + sizes.size() * sizeof(unsigned);
+    inline static size_t CalcMemCost(size_t nentry) {
+      return sizeof(size_t) + sizeof(Entry) * nentry;
     }
     /*! \brief save the data structure into stream */
     inline void Save(IStream &fo) const {
-      fo.Write(&num_summary, sizeof(num_summary));
-      fo.Write(&max_size, sizeof(max_size));
-      fo.Write(BeginPtr(sizes), sizes.size() * sizeof(unsigned));
-      fo.Write(BeginPtr(data), data.size() * sizeof(Entry));
+      fo.Write(&(this->size), sizeof(this->size));
+      fo.Write(data, this->size * sizeof(Entry));
     }
     /*! \brief load data structure from input stream */
     inline void Load(IStream &fi) {
-      utils::Check(fi.Read(&num_summary, sizeof(num_summary)) != 0, "invalid SummaryArray");
-      utils::Check(fi.Read(&max_size, sizeof(max_size)) != 0, "invalid SummaryArray");
-      sizes.resize(num_summary);
-      data.resize(num_summary * max_size);
-      utils::Check(fi.Read(BeginPtr(sizes), sizes.size() * sizeof(unsigned)) != 0, "invalid SummaryArray");
-      utils::Check(fi.Read(BeginPtr(data), data.size() * sizeof(Entry)) != 0, "invalid SummaryArray");
+      utils::Check(fi.Read(&this->size, sizeof(this->size)) != 0, "invalid SummaryArray 1");
+      this->Reserve(this->size);      
+      utils::Check(fi.Read(data, this->size * sizeof(Entry)) != 0, "invalid SummaryArray 2");
     }
-
-   private:
-    /*! \brief number of summaries in the group */
-    unsigned num_summary;
-    /*! \brief maximum size of each summary */
-    unsigned max_size;
-    /*! \brief the current size of each summary */
-    std::vector<unsigned> sizes;
-    /*! \brief the data content */
-    std::vector<Entry> data;
   };
   /*! 
    * \brief intialize the quantile sketch, given the performance specification

From c42ba8d2811ee5e3d55670a395215998db28cdde Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 19 Nov 2014 19:19:53 -0800
Subject: [PATCH 084/166] get multinode in

---
 multi-node/README.md                   |  3 ++-
 multi-node/col-split/README.md         |  2 +-
 multi-node/row-split/README.md         | 17 +++++++++++++
 multi-node/row-split/machine-row.conf  | 31 +++++++++++++++++++++++
 multi-node/row-split/machine-row.sh    | 21 ++++++++++++++++
 multi-node/row-split/mushroom-row.conf | 35 ++++++++++++++++++++++++++
 multi-node/row-split/mushroom-row.sh   | 19 ++++++++++++++
 src/sync/sync.h                        | 14 +++++------
 src/sync/sync_empty.cpp                |  3 +++
 src/sync/sync_mpi.cpp                  |  9 ++++---
 src/tree/updater.cpp                   |  2 +-
 src/tree/updater_histmaker-inl.hpp     |  6 ++---
 src/utils/io.h                         |  2 +-
 src/utils/quantile.h                   | 16 +++++++-----
 14 files changed, 157 insertions(+), 23 deletions(-)
 create mode 100644 multi-node/row-split/README.md
 create mode 100644 multi-node/row-split/machine-row.conf
 create mode 100755 multi-node/row-split/machine-row.sh
 create mode 100644 multi-node/row-split/mushroom-row.conf
 create mode 100755 multi-node/row-split/mushroom-row.sh

diff --git a/multi-node/README.md b/multi-node/README.md
index fab7472e7470..ba445da125d6 100644
--- a/multi-node/README.md
+++ b/multi-node/README.md
@@ -28,6 +28,7 @@ Design Choice
     this will reduce the communication overhead and improve the performance.
   - One way to do that is limit mpi slot in each machine to be 1, or reserve nthread processors for each process.
   
-Examples
+Usage
 ====
 * [Column-based version](col-split)
+* [Row-based version](row-split)
diff --git a/multi-node/col-split/README.md b/multi-node/col-split/README.md
index c0b9fef7c68a..bdafb2e32385 100644
--- a/multi-node/col-split/README.md
+++ b/multi-node/col-split/README.md
@@ -1,6 +1,6 @@
 Distributed XGBoost: Column Split Version
 ====
-* run ```bash run-mushroom.sh```
+* run ```bash mushroom-col.sh <n-mpi-process>```
 
 How to Use
 ====
diff --git a/multi-node/row-split/README.md b/multi-node/row-split/README.md
new file mode 100644
index 000000000000..6c007888366d
--- /dev/null
+++ b/multi-node/row-split/README.md
@@ -0,0 +1,17 @@
+Distributed XGBoost: Row Split Version
+====
+* Mushroom: run ```bash mushroom-row.sh <n-mpi-process>```
+* Machine: run ```bash machine-row.sh <n-mpi-process>```
+
+How to Use
+====
+* First split the data by rows
+* In the config, specify the data file with a wildcard %d, where %d is the rank of the node; each node will load its own part of the data
+* Enable row split mode by ```dsplit=row```
+
+Notes
+====
+* The code is multi-threaded, so you want to run one xgboost-mpi per node
+* The row-based solver splits data by row: each node works on a subset of rows. It uses an approximate histogram count algorithm
+  and will only examine a subset of potential split points, as opposed to all split points.
+* ```colsample_bytree``` is not enabled in row split mode so far
diff --git a/multi-node/row-split/machine-row.conf b/multi-node/row-split/machine-row.conf
new file mode 100644
index 000000000000..ac816ab454b3
--- /dev/null
+++ b/multi-node/row-split/machine-row.conf
@@ -0,0 +1,31 @@
+# General Parameters, see comment for each definition
+# choose the tree booster, can also change to gblinear
+booster = gbtree
+# this is the only difference from classification: use reg:linear to do linear regression
+# when labels are in [0,1] we can also use reg:logistic
+objective = reg:linear
+
+# Tree Booster Parameters
+# step size shrinkage
+eta = 1.0 
+# minimum loss reduction required to make a further partition
+gamma = 1.0 
+# minimum sum of instance weight(hessian) needed in a child
+min_child_weight = 1 
+# maximum depth of a tree
+max_depth = 3 
+
+# Task parameters
+# the number of round to do boosting
+num_round = 2
+# 0 means do not save any model except the final round model
+save_period = 0 
+use_buffer = 0
+
+# The path of training data
+data = "train-machine.row%d" 
+# The path of validation data, used to monitor training process, here [test] sets name of the validation set
+eval[test] = "../../demo/regression/machine.txt.test" 
+# The path of test data 
+test:data = "../../demo/regression/machine.txt.test" 
+
diff --git a/multi-node/row-split/machine-row.sh b/multi-node/row-split/machine-row.sh
new file mode 100755
index 000000000000..41b8e8634652
--- /dev/null
+++ b/multi-node/row-split/machine-row.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: nprocess"
+    exit -1
+fi
+
+rm -rf train-machine.row* *.model
+k=$1
+# make machine data
+cd ../../demo/regression/
+python mapfeat.py
+python mknfold.py machine.txt 1
+cd -
+
+# split the lib svm file into k subfiles
+python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
+
+# run xgboost mpi
+mpirun -n $k ../../xgboost-mpi machine-row.conf dsplit=row 
+
diff --git a/multi-node/row-split/mushroom-row.conf b/multi-node/row-split/mushroom-row.conf
new file mode 100644
index 000000000000..4cc2e8b11e47
--- /dev/null
+++ b/multi-node/row-split/mushroom-row.conf
@@ -0,0 +1,35 @@
+# General Parameters, see comment for each definition
+# choose the booster, can be gbtree or gblinear
+booster = gbtree
+# choose logistic regression loss function for binary classification
+objective = binary:logistic
+
+# Tree Booster Parameters
+# step size shrinkage
+eta = 1.0 
+# minimum loss reduction required to make a further partition
+gamma = 1.0 
+# minimum sum of instance weight(hessian) needed in a child
+min_child_weight = 1 
+# maximum depth of a tree
+max_depth = 3 
+
+# Task Parameters
+# the number of round to do boosting
+num_round = 2
+# 0 means do not save any model except the final round model
+save_period = 0 
+use_buffer = 0
+
+# The path of training data; %d is the wildcard for the rank of the node
+# The idea is that each process takes a feature matrix with a subset of the rows
+#
+data = "train.row%d" 
+
+# The path of validation data, used to monitor training process, here [test] sets name of the validation set
+eval[test] = "../../demo/data/agaricus.txt.test" 
+# evaluate on training data as well each round
+eval_train = 1
+
+# The path of test data, need to use full data of test, try not use it, or keep an subsampled version
+test:data = "../../demo/data/agaricus.txt.test"      
diff --git a/multi-node/row-split/mushroom-row.sh b/multi-node/row-split/mushroom-row.sh
new file mode 100755
index 000000000000..a98fb6b0d137
--- /dev/null
+++ b/multi-node/row-split/mushroom-row.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: nprocess"
+    exit -1
+fi
+
+rm -rf train.row* *.model
+k=$1
+
+# split the lib svm file into k subfiles
+python splitrows.py ../../demo/data/agaricus.txt.train train $k
+
+# run xgboost mpi
+mpirun -n $k ../../xgboost-mpi mushroom-row.conf dsplit=row nthread=1
+
+# the model can be directly loaded by single machine xgboost solver, as usual
+../../xgboost mushroom-row.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
+cat dump.nice.$k.txt
diff --git a/src/sync/sync.h b/src/sync/sync.h
index e728932e0887..0619c3ea3641 100644
--- a/src/sync/sync.h
+++ b/src/sync/sync.h
@@ -160,13 +160,13 @@ class SerializeReducer {
   inline void AllReduce(DType *sendrecvobj, size_t max_n4byte, size_t count) {
     buffer.resize(max_n4byte * count);
     for (size_t i = 0; i < count; ++i) {
-      utils::MemoryFixSizeBuffer fs(BeginPtr(buffer) + i * max_n4byte * 4, max_n4byte * 4);
-      sendrecvobj[i]->Save(fs);
+      utils::MemoryFixSizeBuffer fs(BeginPtr(buffer) + i * max_n4byte, max_n4byte * 4);
+      sendrecvobj[i].Save(fs);
     }
     handle.AllReduce(BeginPtr(buffer), max_n4byte, count);
     for (size_t i = 0; i < count; ++i) {
-      utils::MemoryFixSizeBuffer fs(BeginPtr(buffer) + i * max_n4byte * 4, max_n4byte * 4);
-      sendrecvobj[i]->Load(fs);
+      utils::MemoryFixSizeBuffer fs(BeginPtr(buffer) + i * max_n4byte, max_n4byte * 4);
+      sendrecvobj[i].Load(fs);
     }
   }
 
@@ -178,12 +178,12 @@ class SerializeReducer {
     // temp space
     DType tsrc, tdst;
     for (int i = 0; i < len_; ++i) {
-      utils::MemoryFixSizeBuffer fsrc((void*)(src_) + i * nbytes, nbytes);
-      utils::MemoryFixSizeBuffer fdst(dst_ + i * nbytes, nbytes);
+      utils::MemoryFixSizeBuffer fsrc((char*)(src_) + i * nbytes, nbytes);
+      utils::MemoryFixSizeBuffer fdst((char*)(dst_) + i * nbytes, nbytes);
       tsrc.Load(fsrc);
       tdst.Load(fdst);
       // govern const check
-      tdst.Reduce(static_cast<const DType &>(tsrc));
+      tdst.Reduce(static_cast<const DType &>(tsrc), nbytes);
       fdst.Seek(0);
       tdst.Save(fdst);
     }
diff --git a/src/sync/sync_empty.cpp b/src/sync/sync_empty.cpp
index d11d164cd82b..c0f956db3cb3 100644
--- a/src/sync/sync_empty.cpp
+++ b/src/sync/sync_empty.cpp
@@ -38,6 +38,9 @@ void Bcast(std::string *sendrecv_data, int root) {
 
 ReduceHandle::ReduceHandle(void) : handle(NULL) {}
 ReduceHandle::~ReduceHandle(void) {}
+int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
+  return 0;  // stub of the empty sync backend: dtype is ignored and 0 is reported
+}
 void ReduceHandle::Init(ReduceFunction redfunc, size_t type_n4bytes, bool commute) {}
 void ReduceHandle::AllReduce(void *sendrecvbuf, size_t type_n4bytes, size_t n4byte) {}
 }  // namespace sync
diff --git a/src/sync/sync_mpi.cpp b/src/sync/sync_mpi.cpp
index b96a509a0b89..42b7c7ba6bbe 100644
--- a/src/sync/sync_mpi.cpp
+++ b/src/sync/sync_mpi.cpp
@@ -97,9 +97,12 @@ void ReduceHandle::AllReduce(void *sendrecvbuf, size_t type_n4bytes, size_t coun
   utils::Assert(handle != NULL, "must intialize handle to call AllReduce");
   MPI::Op *op = reinterpret_cast<MPI::Op*>(handle);
   MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype);
-
-  if (created_type_n4bytes != type_n4bytes || htype == NULL) {
-    dtype->Free();
+  if (created_type_n4bytes != type_n4bytes || dtype == NULL) {
+    if (dtype == NULL) {
+      dtype = new MPI::Datatype();
+    } else {
+      dtype->Free();
+    }
     *dtype = MPI::INT.Create_contiguous(type_n4bytes);
     dtype->Commit();
     created_type_n4bytes = type_n4bytes;
diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index a4cd65de0bdc..a1349b806968 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -18,7 +18,7 @@ IUpdater* CreateUpdater(const char *name) {
   if (!strcmp(name, "sync")) return new TreeSyncher();
   if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
   if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
-  //if (!strcmp(name, "grow_histmaker")) return new CQHistMaker<GradStats>();
+  if (!strcmp(name, "grow_histmaker")) return new CQHistMaker<GradStats>();
   //if (!strcmp(name, "grow_skmaker")) return new SketchMaker();
   if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
 
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index f05308ce22eb..76f8ccf315a7 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -507,7 +507,7 @@ class CQHistMaker: public HistMaker<TStats> {
   // node statistics
   std::vector<TStats> node_stats;
   // summary array
-  std::vector< WXQSketch::SummaryContainer> summary_array;
+  std::vector<WXQSketch::SummaryContainer> summary_array;
   // reducer for summary
   sync::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
   // per node, per feature sketch
@@ -517,6 +517,7 @@ class CQHistMaker: public HistMaker<TStats> {
 template<typename TStats>
 class QuantileHistMaker: public HistMaker<TStats> {  
  protected:
+  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
   virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
                                   IFMatrix *p_fmat,
                                   const BoosterInfo &info,
@@ -624,9 +625,8 @@ class QuantileHistMaker: public HistMaker<TStats> {
   }
 
  private:
-  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
   // summary array
-  std::vector< WXQSketch::SummaryContainer> summary_array;
+  std::vector<WXQSketch::SummaryContainer> summary_array;
   // reducer for summary
   sync::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
   // local temp column data structure
diff --git a/src/utils/io.h b/src/utils/io.h
index 1a748feabdbd..97aaa94b2a44 100644
--- a/src/utils/io.h
+++ b/src/utils/io.h
@@ -106,7 +106,7 @@ struct MemoryFixSizeBuffer : public ISeekStream {
   }
   virtual ~MemoryFixSizeBuffer(void) {}
   virtual size_t Read(void *ptr, size_t size) {
-    utils::Assert(curr_ptr_ <= buffer_size_,
+    utils::Assert(curr_ptr_ + size <= buffer_size_,
                   "read can not have position excceed buffer length");
     size_t nread = std::min(buffer_size_ - curr_ptr_, size);
     if (nread != 0) memcpy(ptr, p_buffer_ + curr_ptr_, nread);
diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index 6727c86751fa..8d49afc98ed9 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -519,12 +519,12 @@ class QuantileSketchTemplate {
   /*! \brief same as summary, but use STL to backup the space */
   struct SummaryContainer : public Summary {
     std::vector<Entry> space;
-    explicit SummaryContainer(void) : Summary(NULL, 0) { 
-    }
-    explicit SummaryContainer(const SummaryContainer &src) : Summary(NULL, src.size) { 
+    SummaryContainer(const SummaryContainer &src) : Summary(NULL, src.size) { 
       this->space = src.space;
       this->data = BeginPtr(this->space);
     }
+    SummaryContainer(void) : Summary(NULL, 0) { 
+    }
     /*! \brief reserve space for summary */
     inline void Reserve(size_t size) {
       if (size > space.size()) {
@@ -576,13 +576,17 @@ class QuantileSketchTemplate {
     /*! \brief save the data structure into stream */
     inline void Save(IStream &fo) const {
       fo.Write(&(this->size), sizeof(this->size));
-      fo.Write(data, this->size * sizeof(Entry));
+      if (this->size != 0) {
+        fo.Write(this->data, this->size * sizeof(Entry));
+      }
     }
     /*! \brief load data structure from input stream */
     inline void Load(IStream &fi) {
       utils::Check(fi.Read(&this->size, sizeof(this->size)) != 0, "invalid SummaryArray 1");
-      this->Reserve(this->size);      
-      utils::Check(fi.Read(data, this->size * sizeof(Entry)) != 0, "invalid SummaryArray 2");
+      this->Reserve(this->size);
+      if (this->size != 0) {
+        utils::Check(fi.Read(this->data, this->size * sizeof(Entry)) != 0, "invalid SummaryArray 2");
+      }
     }
   };
   /*! 

From 3b48a9f35901944b65252a3b813065f73ed16450 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 19 Nov 2014 19:21:56 -0800
Subject: [PATCH 085/166] checkin split row

---
 multi-node/row-split/splitrows.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 multi-node/row-split/splitrows.py

diff --git a/multi-node/row-split/splitrows.py b/multi-node/row-split/splitrows.py
new file mode 100644
index 000000000000..2e9d1184d44f
--- /dev/null
+++ b/multi-node/row-split/splitrows.py
@@ -0,0 +1,24 @@
+#!/usr/bin/python
+"""Randomly split a libsvm file by rows into k files named <fo>.row0 .. <fo>.row(k-1)."""
+import sys
+import random
+
+# a usage error exits non-zero so that calling scripts can detect it
+if len(sys.argv) < 4:
+    print ('Usage:<fin> <fo> k')
+    sys.exit(1)
+
+# fixed seed keeps the row assignment reproducible across runs
+random.seed(10)
+
+k = int(sys.argv[3])
+fos = [open(sys.argv[2] + '.row%d' % i, 'w') for i in range(k)]
+
+try:
+    # the input is opened once and closed by the with-block; each row goes to a random part
+    with open(sys.argv[1]) as fin:
+        for l in fin:
+            fos[random.randint(0, k - 1)].write(l)
+finally:
+    for f in fos:
+        f.close()

From a0342cb196d5d89fcec80c1c5bdf0975de6340b6 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 19 Nov 2014 19:22:36 -0800
Subject: [PATCH 086/166] small change

---
 multi-node/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/multi-node/README.md b/multi-node/README.md
index ba445da125d6..a3f5fb107bd9 100644
--- a/multi-node/README.md
+++ b/multi-node/README.md
@@ -10,7 +10,7 @@ Build
 Design Choice
 =====
 * Does distributed xgboost reply on MPI?
-  - Yes, but the dependency is isolated in [sync](../src/sync/sync.h) module
+  - Yes, but the dependency is isolated in [sync module](../src/sync/sync.h)
   - Specificially, xgboost reply on MPI protocol that provide Broadcast and AllReduce,
      if there are platform/framework that implements these protocol, xgboost should naturally extends to these platform
 * How is the data distributed?

From 338117867bf0c379dce142bf4c99e16b44247bc0 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 19 Nov 2014 19:24:20 -0800
Subject: [PATCH 087/166] small change

---
 multi-node/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/multi-node/README.md b/multi-node/README.md
index a3f5fb107bd9..47edbc654228 100644
--- a/multi-node/README.md
+++ b/multi-node/README.md
@@ -11,6 +11,8 @@ Design Choice
 =====
 * Does distributed xgboost reply on MPI?
   - Yes, but the dependency is isolated in [sync module](../src/sync/sync.h)
+  - All other parts of the code use the interface defined in sync.h
+  - sync_mpi.cpp is an implementation of the sync interface
   - Specificially, xgboost reply on MPI protocol that provide Broadcast and AllReduce,
      if there are platform/framework that implements these protocol, xgboost should naturally extends to these platform
 * How is the data distributed?

From 41eac089c823e8993dba6d8881243febabf21f8c Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 19 Nov 2014 19:25:49 -0800
Subject: [PATCH 088/166] chg

---
 multi-node/README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/multi-node/README.md b/multi-node/README.md
index 47edbc654228..dc2eadc6e829 100644
--- a/multi-node/README.md
+++ b/multi-node/README.md
@@ -22,10 +22,12 @@ Design Choice
   - Row-based solver split data by row, each node work on subset of rows,
     it uses an approximate histogram count algorithm, and will only examine subset of 
     potential split points as opposed to all split points.
-* How to run the distributed version
+
+Run the distributed version
+====
   - The current code run in MPI enviroment, you will need to have a network filesystem,
     or copy data to local file system before running the code
-  - The distributed version is still multi-threading optimized.
+  - ***Note*** The distributed version is still multi-threading optimized.
     You should run one xgboost-mpi per node that takes most available CPU,
     this will reduce the communication overhead and improve the performance.
   - One way to do that is limit mpi slot in each machine to be 1, or reserve nthread processors for each process.

From 26e5eae6f228d5693b66a6c8332c69b01ed75bac Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 19 Nov 2014 19:27:04 -0800
Subject: [PATCH 089/166] ok

---
 multi-node/README.md | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/multi-node/README.md b/multi-node/README.md
index dc2eadc6e829..717b40042e4d 100644
--- a/multi-node/README.md
+++ b/multi-node/README.md
@@ -23,16 +23,15 @@ Design Choice
     it uses an approximate histogram count algorithm, and will only examine subset of 
     potential split points as opposed to all split points.
 
-Run the distributed version
+
+Usage
 ====
-  - The current code run in MPI enviroment, you will need to have a network filesystem,
+* The current code runs in an MPI environment, you will need to have a network filesystem,
     or copy data to local file system before running the code
-  - ***Note*** The distributed version is still multi-threading optimized.
+* ***Note*** The distributed version is still multi-threading optimized.
     You should run one xgboost-mpi per node that takes most available CPU,
     this will reduce the communication overhead and improve the performance.
-  - One way to do that is limit mpi slot in each machine to be 1, or reserve nthread processors for each process.
-  
-Usage
-====
-* [Column-based version](col-split)
-* [Row-based version](row-split)
+   - One way to do that is limit mpi slot in each machine to be 1, or reserve nthread processors for each process.
+* Examples:
+  - [Column-based version](col-split)
+  - [Row-based version](row-split)

From 970dd58dc2d42f54e3a99309dee610aa13167abb Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 19 Nov 2014 20:06:08 -0800
Subject: [PATCH 090/166] checkin continue training

---
 multi-node/row-split/machine-row.sh |  5 ++++-
 src/gbm/gbm.h                       |  6 ++++++
 src/gbm/gbtree-inl.hpp              |  6 ++++++
 src/learner/learner-inl.hpp         | 12 ++++++++++--
 src/xgboost_main.cpp                |  2 +-
 5 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/multi-node/row-split/machine-row.sh b/multi-node/row-split/machine-row.sh
index 41b8e8634652..fdb1f1d6b554 100755
--- a/multi-node/row-split/machine-row.sh
+++ b/multi-node/row-split/machine-row.sh
@@ -17,5 +17,8 @@ cd -
 python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
 
 # run xgboost mpi
-mpirun -n $k ../../xgboost-mpi machine-row.conf dsplit=row 
+mpirun -n $k ../../xgboost-mpi machine-row.conf dsplit=row num_round=3
 
+# run one round to save model 0001, then continue training from that existing model
+mpirun -n $k ../../xgboost-mpi machine-row.conf dsplit=row num_round=1
+mpirun -n $k ../../xgboost-mpi machine-row.conf dsplit=row num_round=2 model_in=0001.model
diff --git a/src/gbm/gbm.h b/src/gbm/gbm.h
index 00d0bc4445fe..28b370c48238 100644
--- a/src/gbm/gbm.h
+++ b/src/gbm/gbm.h
@@ -38,6 +38,12 @@ class IGradBooster {
    * \brief initialize the model
    */
   virtual void InitModel(void) = 0;
+  /*!
+   * \brief reset the predict buffer; this will invalidate all the previous
+   *  cached results and recalculate from scratch
+   * \param num_pbuffer new predict buffer size; ignored by this default no-op
+   */
+  virtual void ResetPredBuffer(size_t num_pbuffer) {}
   /*!
    * \brief peform update to the model(boosting)
    * \param p_fmat feature matrix that provide access to features
diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
index eb526e43ca41..d334296c8816 100644
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -84,6 +84,12 @@ class GBTree : public IGradBooster {
     utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
     utils::Assert(trees.size() == 0, "GBTree: model already initialized");
   }
+  virtual void ResetPredBuffer(size_t num_pbuffer) {
+    // adopt the new size, then rebuild both caches from zeroed storage
+    mparam.num_pbuffer = static_cast<int64_t>(num_pbuffer);
+    pred_buffer.assign(mparam.PredBufferSize(), 0.0f);
+    pred_counter.assign(mparam.PredBufferSize(), 0);
+  }
   virtual void DoBoost(IFMatrix *p_fmat,
                        int64_t buffer_offset,
                        const BoosterInfo &info,
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 70e71cf57894..6b74402394fc 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -33,6 +33,7 @@ class BoostLearner {
     prob_buffer_row = 1.0f;
     part_load_col = 0;
     distributed_mode = 0;
+    pred_buffer_size = 0;
   }
   ~BoostLearner(void) {
     if (obj_ != NULL) delete obj_;
@@ -76,6 +77,7 @@ class BoostLearner {
     if (!silent) {
       utils::Printf("buffer_size=%ld\n", static_cast<long>(buffer_size));
     }
+    this->pred_buffer_size = buffer_size;
   }
   /*!
    * \brief set parameters from outside
@@ -139,8 +141,9 @@ class BoostLearner {
   /*!
    * \brief load model from stream
    * \param fi input stream
+   * \param keep_predbuffer whether to keep predict buffer
    */
-  inline void LoadModel(utils::IStream &fi) {
+  inline void LoadModel(utils::IStream &fi, bool keep_predbuffer = true) {
     utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
                  "BoostLearner: wrong model format");
     utils::Check(fi.Read(&name_obj_), "BoostLearner: wrong model format");
@@ -150,6 +153,9 @@ class BoostLearner {
     if (gbm_ != NULL) delete gbm_;
     this->InitObjGBM();
     gbm_->LoadModel(fi);
+    if (keep_predbuffer && distributed_mode == 2 && sync::GetRank() != 0) {
+      gbm_->ResetPredBuffer(pred_buffer_size);
+    }
   }
   /*!
    * \brief load model from file
@@ -370,12 +376,14 @@ class BoostLearner {
   int distributed_mode;
   // randomly load part of data
   int part_load_col;
+  // cached size of predict buffer
+  size_t pred_buffer_size;
   // maximum buffred row value
   float prob_buffer_row;
   // evaluation set
   EvalSet evaluator_;
   // model parameter
-  ModelParam   mparam;
+  ModelParam  mparam;
   // gbm model that back everything
   gbm::IGradBooster *gbm_;
   // name of gbm model used for training
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index 1b596ebfb615..4a459f2d7ab3 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -142,7 +142,7 @@ class BoostLearnTask {
     }
   }
   inline void InitLearner(void) {
-    if (model_in != "NULL"){
+    if (model_in != "NULL") {
       utils::FileStream fi(utils::FopenCheck(model_in.c_str(), "rb"));
       learner.LoadModel(fi);
       fi.Close();

From b595854e8c60cecb65008e99c2c494ae6a8fe8f3 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 19 Nov 2014 20:08:11 -0800
Subject: [PATCH 091/166] Update README.md

---
 multi-node/row-split/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/multi-node/row-split/README.md b/multi-node/row-split/README.md
index 6c007888366d..b01e52271801 100644
--- a/multi-node/row-split/README.md
+++ b/multi-node/row-split/README.md
@@ -2,6 +2,7 @@ Distributed XGBoost: Row Split Version
 ====
 * Mushroom: run ```bash mushroom-row.sh <n-mpi-process>```
 * Machine: run ```bash machine-row.sh <n-mpi-process>```
+  - The Machine case also includes an example of continuing training from an existing model
 
 How to Use
 ====

From 9af464303aec12fc820163d23d7e2af7a55b4c89 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 19 Nov 2014 20:09:26 -0800
Subject: [PATCH 092/166] checkin row continue training

---
 multi-node/col-split/mushroom-col.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/multi-node/col-split/mushroom-col.sh b/multi-node/col-split/mushroom-col.sh
index 63b5092ba2e1..4d7de9892c3f 100755
--- a/multi-node/col-split/mushroom-col.sh
+++ b/multi-node/col-split/mushroom-col.sh
@@ -16,4 +16,9 @@ mpirun -n $k ../../xgboost-mpi mushroom-col.conf dsplit=col
 
 # the model can be directly loaded by single machine xgboost solver, as usuall
 ../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
-cat dump.nice.$k.txt
+
+# run for one round, and continue training
+mpirun -n $k ../../xgboost-mpi mushroom-col.conf dsplit=col num_round=1
+mpirun -n $k ../../xgboost-mpi mushroom-col.conf dsplit=col model_in=0001.model
+
+cat dump.nice.$k.txt
\ No newline at end of file

From 974202eb55d816a320323585c2790fc60e18a0f3 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 20 Nov 2014 11:22:09 -0800
Subject: [PATCH 093/166] check pipe, commit optimization for hist

---
 multi-node/row-split/README.md        |   2 +-
 multi-node/row-split/machine-row.conf |   1 -
 multi-node/row-split/mushroom-row.sh  |   2 +-
 src/tree/updater_basemaker-inl.hpp    |  68 ++++++++-
 src/tree/updater_histmaker-inl.hpp    | 194 +++++++++++++++-----------
 5 files changed, 182 insertions(+), 85 deletions(-)

diff --git a/multi-node/row-split/README.md b/multi-node/row-split/README.md
index b01e52271801..4c427f3ec3bd 100644
--- a/multi-node/row-split/README.md
+++ b/multi-node/row-split/README.md
@@ -15,4 +15,4 @@ Notes
 * The code is multi-threaded, so you want to run one xgboost-mpi per node
 * Row-based solver split data by row, each node work on subset of rows, it uses an approximate histogram count algorithm,
   and will only examine subset of potential split points as opposed to all split points.
-* ```colsample_bytree``` is not enabled in row split mode so far
+
diff --git a/multi-node/row-split/machine-row.conf b/multi-node/row-split/machine-row.conf
index ac816ab454b3..c0cba3da8cf9 100644
--- a/multi-node/row-split/machine-row.conf
+++ b/multi-node/row-split/machine-row.conf
@@ -14,7 +14,6 @@ gamma = 1.0
 min_child_weight = 1 
 # maximum depth of a tree
 max_depth = 3 
-
 # Task parameters
 # the number of round to do boosting
 num_round = 2
diff --git a/multi-node/row-split/mushroom-row.sh b/multi-node/row-split/mushroom-row.sh
index a98fb6b0d137..eb65799b695f 100755
--- a/multi-node/row-split/mushroom-row.sh
+++ b/multi-node/row-split/mushroom-row.sh
@@ -12,7 +12,7 @@ k=$1
 python splitrows.py ../../demo/data/agaricus.txt.train train $k
 
 # run xgboost mpi
-mpirun -n $k ../../xgboost-mpi mushroom-row.conf dsplit=row nthread=1
+mpirun -n $k ../../xgboost-mpi mushroom-row.conf dsplit=row nthread=1 
 
 # the model can be directly loaded by single machine xgboost solver, as usuall
 ../../xgboost mushroom-row.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
diff --git a/src/tree/updater_basemaker-inl.hpp b/src/tree/updater_basemaker-inl.hpp
index f414752d9652..e5cfd17fabb7 100644
--- a/src/tree/updater_basemaker-inl.hpp
+++ b/src/tree/updater_basemaker-inl.hpp
@@ -7,6 +7,7 @@
  */
 #include <vector>
 #include <algorithm>
+#include <limits>
 #include "../utils/random.h"
 #include "../utils/quantile.h"
 
@@ -24,8 +25,73 @@ class BaseMaker: public IUpdater {
   virtual void SetParam(const char *name, const char *val) {
     param.SetParam(name, val);
   }
-  
+   
  protected:
+  // helper to collect and query feature meta information
+  struct FMetaHelper {
+   public:
+    /*!
+     * \brief find type of each feature, use column format: records the first and
+     *  last entry of every column and max-reduces the result via sync::AllReduce
+     */
+    inline void InitByCol(IFMatrix *p_fmat, const RegTree &tree) {
+      // slot 2*fid keeps the negated first value and slot 2*fid+1 the last value, so a
+      // single kMax reduce merges both; NOTE(review): assumes value-sorted columns - confirm
+      fminmax.assign(tree.param.num_feature * 2, -std::numeric_limits<bst_float>::max());
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        for (bst_uint i = 0; i < batch.size; ++i) {
+          const bst_uint fid = batch.col_index[i];
+          const ColBatch::Inst &c = batch[i];
+          if (c.length != 0) {
+            fminmax[fid * 2 + 0] = std::max(-c[0].fvalue, fminmax[fid * 2 + 0]);
+            fminmax[fid * 2 + 1] = std::max(c[c.length - 1].fvalue, fminmax[fid * 2 + 1]);
+          }
+        }
+      }
+      sync::AllReduce(BeginPtr(fminmax), fminmax.size(), sync::kMax);
+    }
+    /*! \brief get feature type, 0:empty 1:binary 2:real */
+    inline int Type(bst_uint fid) const {
+      utils::Assert(fid * 2 + 1 < fminmax.size(), "FeatHelper fid exceed query bound ");
+      const bst_float a = fminmax[fid * 2];
+      const bst_float b = fminmax[fid * 2 + 1];
+      // a slot still holding the initial fill value means no entry was seen
+      if (a == -std::numeric_limits<bst_float>::max()) return 0;
+      return (-a == b) ? 1 : 2;
+    }
+    /*! \brief value stored in the upper slot of feature fid, no bound check */
+    inline bst_float MaxValue(bst_uint fid) const { return fminmax[fid * 2 + 1]; }
+    /*! \brief select a random subset of the non-empty features; a strict subset is sent from rank 0
+     *  \param p fraction of the non-empty features to keep
+     *  \param p_findex output, overwritten with the selected feature indices */
+    inline void SampleCol(float p, std::vector<bst_uint> *p_findex) const {
+      std::vector<bst_uint> &findex = *p_findex;
+      findex.clear();
+      for (size_t i = 0; i < fminmax.size(); i += 2) {
+        const bst_uint fid = static_cast<bst_uint>(i / 2);
+        if (this->Type(fid) != 0) findex.push_back(fid);
+      }
+      // size before truncation; compared after resize it could never differ
+      const size_t n_total = findex.size();
+      random::Shuffle(findex);
+      findex.resize(static_cast<unsigned>(p * n_total));
+      if (findex.size() != n_total) {
+        // sync the findex if it is subsample: rank 0 sends, the others receive
+        std::string s_cache;
+        utils::MemoryBufferStream fc(&s_cache);
+        utils::IStream &fs = fc;
+        if (sync::GetRank() == 0) fs.Write(findex);
+        sync::Bcast(&s_cache, 0);
+        if (sync::GetRank() != 0) fs.Read(&findex);
+      }
+    }
+
+   private:
+    std::vector<bst_float> fminmax;
+  };
   // ------static helper functions ------
   // helper function to get to next level of the tree
   /*! \brief this is  helper function for row based data*/
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 76f8ccf315a7..06febf47a3ed 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -118,19 +118,22 @@ class HistMaker: public BaseMaker {
   ThreadWSpace wspace;
   // reducer for histogram
   sync::Reducer<TStats> histred;
+  // set of working features
+  std::vector<bst_uint> fwork_set;
   // update function implementation
   virtual void Update(const std::vector<bst_gpair> &gpair,
                       IFMatrix *p_fmat,
                       const BoosterInfo &info,
                       RegTree *p_tree) {
     this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
+    this->InitWorkSet(p_fmat, *p_tree, &fwork_set);
     for (int depth = 0; depth < param.max_depth; ++depth) {
       // reset and propose candidate split
-      this->ResetPosAndPropose(gpair, p_fmat, info, *p_tree);
+      this->ResetPosAndPropose(gpair, p_fmat, info, fwork_set, *p_tree);
       // create histogram
-      this->CreateHist(gpair, p_fmat, info, *p_tree);
+      this->CreateHist(gpair, p_fmat, info, fwork_set, *p_tree);
       // find split based on histogram statistics
-      this->FindSplit(depth, gpair, p_fmat, info, p_tree);
+      this->FindSplit(depth, gpair, p_fmat, info, fwork_set, p_tree);
       // reset position after split
       this->ResetPositionAfterSplit(p_fmat, *p_tree);
       this->UpdateQueueExpand(*p_tree);
@@ -148,7 +151,17 @@ class HistMaker: public BaseMaker {
   virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
                                   IFMatrix *p_fmat,
                                   const BoosterInfo &info,
-                                  const RegTree &tree)  = 0;  
+                                  const std::vector <bst_uint> &fset,
+                                  const RegTree &tree)  = 0;
+  // default working set for this round: every feature index below num_feature, in order
+  virtual void InitWorkSet(IFMatrix *p_fmat,
+                           const RegTree &tree,
+                           std::vector<bst_uint> *p_fset) {
+    std::vector<bst_uint> &fset = *p_fset;
+    fset.resize(tree.param.num_feature);
+    const size_t nfeat = fset.size();
+    for (size_t fid = 0; fid != nfeat; ++fid) fset[fid] = fid;
+  }
   // reset position after split, this is not a must, depending on implementation
   virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
                                        const RegTree &tree) {
@@ -156,45 +169,8 @@ class HistMaker: public BaseMaker {
   virtual void CreateHist(const std::vector<bst_gpair> &gpair,
                           IFMatrix *p_fmat,
                           const BoosterInfo &info,
-                          const RegTree &tree) {
-    bst_uint num_feature = tree.param.num_feature;
-    // intialize work space
-    wspace.Init(param, this->get_nthread());
-    // start accumulating statistics
-    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
-                   "too large batch size ");
-      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nbatch; ++i) {
-        RowBatch::Inst inst = batch[i];
-        const int tid = omp_get_thread_num();
-        HistSet &hset = wspace.hset[tid];
-        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        int nid = position[ridx];
-        if (nid >= 0) {
-          const int wid = this->node2workindex[nid];
-          for (bst_uint i = 0; i < inst.length; ++i) {
-            utils::Assert(inst[i].index < num_feature, "feature index exceed bound");
-            // feature histogram
-            hset[inst[i].index + wid * (num_feature+1)]
-                .Add(inst[i].fvalue, gpair, info, ridx);
-          }
-          // node histogram, use num_feature to borrow space
-          hset[num_feature + wid * (num_feature + 1)]
-              .data[0].Add(gpair, info, ridx);
-        }
-      }
-    }
-    // accumulating statistics together
-    wspace.Aggregate();
-    // sync the histogram
-    histred.AllReduce(BeginPtr(wspace.hset[0].data), wspace.hset[0].data.size());
-  }
-
+                          const std::vector <bst_uint> &fset,
+                          const RegTree &tree)  = 0;
  private:
   inline void EnumerateSplit(const HistUnit &hist, 
                              const TStats &node_sum,
@@ -235,8 +211,9 @@ class HistMaker: public BaseMaker {
                         const std::vector<bst_gpair> &gpair,
                         IFMatrix *p_fmat,
                         const BoosterInfo &info,
+                        const std::vector <bst_uint> &fset,
                         RegTree *p_tree) {
-    const bst_uint num_feature = p_tree->param.num_feature;
+    const size_t num_feature = fset.size();
     // get the best split condition for each node
     std::vector<SplitEntry> sol(qexpand.size());
     std::vector<TStats> left_sum(qexpand.size());    
@@ -248,9 +225,9 @@ class HistMaker: public BaseMaker {
                     "node2workindex inconsistent");
       SplitEntry &best = sol[wid];
       TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
-      for (bst_uint fid = 0; fid < num_feature; ++ fid) {
-        EnumerateSplit(this->wspace.hset[0][fid + wid * (num_feature+1)],
-                       node_sum, fid, &best, &left_sum[wid]);
+      for (size_t i = 0; i < fset.size(); ++ i) {
+        EnumerateSplit(this->wspace.hset[0][i + wid * (num_feature+1)],
+                       node_sum, fset[i], &best, &left_sum[wid]);
       }
     }
     // get the best result, we can synchronize the solution
@@ -306,15 +283,32 @@ class CQHistMaker: public HistMaker<TStats> {
       hist.data[istart].Add(gpair, info, ridx);
     }
   };
+  // sketch type used for this
   typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
+  // initialize the work set of tree: run feat_helper.InitByCol, then sample columns by colsample_bytree
+  virtual void InitWorkSet(IFMatrix *p_fmat,
+                           const RegTree &tree,
+                           std::vector<bst_uint> *p_fset) {
+    feat_helper.InitByCol(p_fmat, tree);
+    feat_helper.SampleCol(this->param.colsample_bytree, p_fset);
+  }
+  // code to create histogram  
   virtual void CreateHist(const std::vector<bst_gpair> &gpair,
                           IFMatrix *p_fmat,
                           const BoosterInfo &info,
+                          const std::vector<bst_uint> &fset,
                           const RegTree &tree) {
+    // fill in reverse map
+    feat2workindex.resize(tree.param.num_feature);
+    std::fill(feat2workindex.begin(), feat2workindex.end(), -1);
+    for (size_t i = 0; i < fset.size(); ++i) {
+      feat2workindex[fset[i]] = static_cast<int>(i);
+    } 
+    // start to work
     this->wspace.Init(this->param, 1);
     thread_hist.resize(this->get_nthread());
     // start accumulating statistics
-    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fset);
     iter->BeforeFirst();
     while (iter->Next()) {
       const ColBatch &batch = iter->Value();
@@ -322,15 +316,18 @@ class CQHistMaker: public HistMaker<TStats> {
       const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
       #pragma omp parallel for schedule(dynamic, 1)
       for (bst_omp_uint i = 0; i < nsize; ++i) {
-        this->UpdateHistCol(gpair, batch[i], info, tree,
-                            batch.col_index[i],                        
-                            &thread_hist[omp_get_thread_num()]);       
+        int offset = feat2workindex[batch.col_index[i]];
+        if (offset >= 0) {
+          this->UpdateHistCol(gpair, batch[i], info, tree,
+                              fset, offset,
+                              &thread_hist[omp_get_thread_num()]);
+        }
       }
     }
     for (size_t i = 0; i < this->qexpand.size(); ++i) {
       const int nid = this->qexpand[i];
       const int wid = this->node2workindex[nid];
-      this->wspace.hset[0][tree.param.num_feature + wid * (tree.param.num_feature+1)]
+      this->wspace.hset[0][fset.size() + wid * (fset.size()+1)]
           .data[0] = node_stats[nid];
     }
     // sync the histogram
@@ -343,10 +340,24 @@ class CQHistMaker: public HistMaker<TStats> {
   virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
                                   IFMatrix *p_fmat,
                                   const BoosterInfo &info,
+                                  const std::vector<bst_uint> &fset,
                                   const RegTree &tree) {
+    // fill in reverse map
+    feat2workindex.resize(tree.param.num_feature);
+    std::fill(feat2workindex.begin(), feat2workindex.end(), -1);
+    freal_set.clear();
+    for (size_t i = 0; i < fset.size(); ++i) {
+      if (feat_helper.Type(fset[i]) == 2) {
+        feat2workindex[fset[i]] = static_cast<int>(freal_set.size());
+        freal_set.push_back(fset[i]);
+      } else {
+        feat2workindex[fset[i]] = -2;  
+      }
+    }
+        
     this->GetNodeStats(gpair, *p_fmat, tree, info,
                        &thread_stats, &node_stats);
-    sketchs.resize(this->qexpand.size() * tree.param.num_feature);
+    sketchs.resize(this->qexpand.size() * freal_set.size());
     for (size_t i = 0; i < sketchs.size(); ++i) {
       sketchs[i].Init(info.num_row, this->param.sketch_eps);
     }
@@ -354,7 +365,7 @@ class CQHistMaker: public HistMaker<TStats> {
     // number of rows in
     const size_t nrows = p_fmat->buffered_rowset().size();
     // start accumulating statistics
-    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(freal_set);
     iter->BeforeFirst();
     while (iter->Next()) {
       const ColBatch &batch = iter->Value();
@@ -362,11 +373,14 @@ class CQHistMaker: public HistMaker<TStats> {
       const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
       #pragma omp parallel for schedule(dynamic, 1)
       for (bst_omp_uint i = 0; i < nsize; ++i) {
-        this->UpdateSketchCol(gpair, batch[i], tree,
-                              node_stats,
-                              batch.col_index[i],
-                              batch[i].length == nrows,                              
-                              &thread_sketch[omp_get_thread_num()]);       
+        int offset = feat2workindex[batch.col_index[i]];
+        if (offset >= 0) {
+          this->UpdateSketchCol(gpair, batch[i], tree,
+                                node_stats,
+                                freal_set, offset,
+                                batch[i].length == nrows,
+                                &thread_sketch[omp_get_thread_num()]);
+        }
       }
     }
     // setup maximum size
@@ -379,36 +393,46 @@ class CQHistMaker: public HistMaker<TStats> {
       summary_array[i].Reserve(max_size);
       summary_array[i].SetPrune(out, max_size);
     }
-    size_t n4bytes = (WXQSketch::SummaryContainer::CalcMemCost(max_size) + 3) / 4;
-    sreducer.AllReduce(BeginPtr(summary_array), n4bytes, summary_array.size());
+    if (summary_array.size() != 0) {
+      size_t n4bytes = (WXQSketch::SummaryContainer::CalcMemCost(max_size) + 3) / 4;
+      sreducer.AllReduce(BeginPtr(summary_array), n4bytes, summary_array.size());
+    }
     // now we get the final result of sketch, setup the cut
     this->wspace.cut.clear();
     this->wspace.rptr.clear();
     this->wspace.rptr.push_back(0);
     for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
-      for (int fid = 0; fid < tree.param.num_feature; ++fid) {
-        const WXQSketch::Summary &a = summary_array[wid * tree.param.num_feature + fid];
-        for (size_t i = 1; i < a.size; ++i) {
-          bst_float cpt = a.data[i].value - rt_eps;
-          if (i == 1 || cpt > this->wspace.cut.back()) {
-            this->wspace.cut.push_back(cpt);
+      for (size_t i = 0; i < fset.size(); ++i) {
+        int offset = feat2workindex[fset[i]];
+        if (offset >= 0) {
+          const WXQSketch::Summary &a = summary_array[wid * freal_set.size() + offset];
+          for (size_t i = 1; i < a.size; ++i) {
+            bst_float cpt = a.data[i].value - rt_eps;
+            if (i == 1 || cpt > this->wspace.cut.back()) {
+              this->wspace.cut.push_back(cpt);
+            }
           }
+          // push a value that is greater than anything
+          if (a.size != 0) {
+            bst_float cpt = a.data[a.size - 1].value;
+            // this must be bigger than last value in a scale
+            bst_float last = cpt + fabs(cpt) + rt_eps;
+            this->wspace.cut.push_back(last);
+          }
+          this->wspace.rptr.push_back(this->wspace.cut.size());
+        } else {
+          utils::Assert(offset == -2, "BUG in mark");
+          bst_float cpt = feat_helper.MaxValue(fset[i]);        
+          this->wspace.cut.push_back(cpt + fabs(cpt) + rt_eps);
+          this->wspace.rptr.push_back(this->wspace.cut.size());        
         }
-        // push a value that is greater than anything
-        if (a.size != 0) {
-          bst_float cpt = a.data[a.size - 1].value;
-          // this must be bigger than last value in a scale
-          bst_float last = cpt + fabs(cpt) + rt_eps;
-          this->wspace.cut.push_back(last);
-        }
-        this->wspace.rptr.push_back(this->wspace.cut.size());
       }
       // reserve last value for global statistics
       this->wspace.cut.push_back(0.0f);
       this->wspace.rptr.push_back(this->wspace.cut.size());
     }
     utils::Assert(this->wspace.rptr.size() ==
-                  (tree.param.num_feature + 1) * this->qexpand.size() + 1,
+                  (fset.size() + 1) * this->qexpand.size() + 1,
                   "cut space inconsistent");
   }
   
@@ -417,7 +441,8 @@ class CQHistMaker: public HistMaker<TStats> {
                             const ColBatch::Inst &c,
                             const BoosterInfo &info,
                             const RegTree &tree,
-                            bst_uint fid,
+                            const std::vector<bst_uint> &fset,
+                            bst_uint fid_offset,
                             std::vector<HistEntry> *p_temp) {
     if (c.length == 0) return;
     // initialize sbuilder for use
@@ -427,7 +452,7 @@ class CQHistMaker: public HistMaker<TStats> {
       const unsigned nid = this->qexpand[i];
       const unsigned wid = this->node2workindex[nid];
       hbuilder[nid].istart = 0;
-      hbuilder[nid].hist = this->wspace.hset[0][fid + wid * (tree.param.num_feature+1)];
+      hbuilder[nid].hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)];
     }
     for (bst_uint j = 0; j < c.length; ++j) {
       const bst_uint ridx = c[j].index;
@@ -441,7 +466,8 @@ class CQHistMaker: public HistMaker<TStats> {
                               const ColBatch::Inst &c,
                               const RegTree &tree,
                               const std::vector<TStats> &nstats,
-                              bst_uint fid,
+                              const std::vector<bst_uint> &frealset,
+                              bst_uint offset,
                               bool col_full,
                               std::vector<BaseMaker::SketchEntry> *p_temp) {
     if (c.length == 0) return;
@@ -452,7 +478,7 @@ class CQHistMaker: public HistMaker<TStats> {
       const unsigned nid = this->qexpand[i];
       const unsigned wid = this->node2workindex[nid];
       sbuilder[nid].sum_total = 0.0f;
-      sbuilder[nid].sketch = &sketchs[wid * tree.param.num_feature + fid];
+      sbuilder[nid].sketch = &sketchs[wid * frealset.size() + offset];
     }
 
     if (!col_full) {
@@ -497,7 +523,12 @@ class CQHistMaker: public HistMaker<TStats> {
       sbuilder[nid].Finalize(max_size);
     }
   }
-  
+  // feature helper
+  BaseMaker::FMetaHelper feat_helper;
+  // temp space to map feature id to working index
+  std::vector<int> feat2workindex;
+  // set of index from fset that are real
+  std::vector<bst_uint> freal_set; 
   // thread temp data
   std::vector< std::vector<BaseMaker::SketchEntry> > thread_sketch;
   // used to hold statistics
@@ -521,6 +552,7 @@ class QuantileHistMaker: public HistMaker<TStats> {
   virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
                                   IFMatrix *p_fmat,
                                   const BoosterInfo &info,
+                                  const std::vector <bst_uint> &fset,
                                   const RegTree &tree) {
     // initialize the data structure
     int nthread = BaseMaker::get_nthread();

From 23fbf079b93871ebbb8b899ab3c5ea3732fd4fad Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 20 Nov 2014 12:56:30 -0800
Subject: [PATCH 094/166] fix bug in row

---
 src/io/page_fmatrix-inl.hpp   | 8 ++++----
 src/io/simple_fmatrix-inl.hpp | 6 +++++-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
index 971abbb0ed97..91d24cf2d104 100644
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -114,11 +114,11 @@ class CSCMatrixManager {
   }
   inline void SetColSet(const std::vector<bst_uint> &cset, bool setall) {
     if (!setall) {
-      col_todo_.resize(cset.size());
+      col_todo_.resize(0);
       for (size_t i = 0; i < cset.size(); ++i) {
-        col_todo_[i] = cset[i];
-        utils::Assert(col_todo_[i] < static_cast<bst_uint>(col_ptr_.size() - 1),
-                      "CSCMatrixManager: column index exceed bound");
+        if (cset[i] < static_cast<bst_uint>(col_ptr_.size() - 1)) {
+          col_todo_.push_back(cset[i]);  // keep only columns that exist in this matrix
+        }
       }
       std::sort(col_todo_.begin(), col_todo_.end());
     } else {
diff --git a/src/io/simple_fmatrix-inl.hpp b/src/io/simple_fmatrix-inl.hpp
index 88bc69019eb5..08e25e28b781 100644
--- a/src/io/simple_fmatrix-inl.hpp
+++ b/src/io/simple_fmatrix-inl.hpp
@@ -76,7 +76,11 @@ class FMatrixS : public IFMatrix{
    * \brief colmun based iterator
    */
   virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
-    col_iter_.col_index_ = fset;
+    size_t ncol = this->NumCol();
+    col_iter_.col_index_.resize(0);
+    for (size_t i = 0; i < fset.size(); ++i) {
+      if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]); 
+    }
     col_iter_.SetBatch(col_ptr_, col_data_);
     return &col_iter_;
   }

From d4103ea7ea82169af0e10c979c82323217a47db2 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 20 Nov 2014 22:01:26 -0800
Subject: [PATCH 095/166] Update README.md

---
 multi-node/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/multi-node/README.md b/multi-node/README.md
index 717b40042e4d..6f1008514481 100644
--- a/multi-node/README.md
+++ b/multi-node/README.md
@@ -12,7 +12,7 @@ Design Choice
 * Does distributed xgboost reply on MPI?
   - Yes, but the dependency is isolated in [sync module](../src/sync/sync.h)
   - All other parts of code uses interface defined in sync.h
-  - sync_mpi.cpp is a implementation of sync interface
+  - sync_mpi.cpp is an implementation of the sync interface using the standard MPI library
   - Specificially, xgboost reply on MPI protocol that provide Broadcast and AllReduce,
      if there are platform/framework that implements these protocol, xgboost should naturally extends to these platform
 * How is the data distributed?

From 168bb0d0c9bf6a5251d5a60de37ff9993e22789b Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 21 Nov 2014 09:32:09 -0800
Subject: [PATCH 096/166] add predict leaf indices

---
 demo/README.md                            |  2 ++
 demo/guide-python/README.md               |  1 +
 demo/guide-python/predict_leaf_indices.py | 22 +++++++++++++
 demo/guide-python/runall.sh               |  1 +
 src/gbm/gblinear-inl.hpp                  |  6 ++++
 src/gbm/gbm.h                             | 14 +++++++++
 src/gbm/gbtree-inl.hpp                    | 38 +++++++++++++++--------
 src/learner/learner-inl.hpp               | 14 ++++++---
 wrapper/xgboost.py                        | 27 ++++++++++++----
 wrapper/xgboost_wrapper.cpp               |  8 ++---
 wrapper/xgboost_wrapper.h                 | 10 ++++--
 11 files changed, 114 insertions(+), 29 deletions(-)
 create mode 100755 demo/guide-python/predict_leaf_indices.py

diff --git a/demo/README.md b/demo/README.md
index bcc356712133..56915a32e008 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -32,6 +32,8 @@ This is a list of short codes introducing different functionalities of xgboost a
   [python](guide-python/cross_validation.py)
   [R](../R-package/demo/cross_validation.R)
   [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl)  
+* Predicting leaf indices
+  [python](guide-python/predict_leaf_indices.py)
 
 Basic Examples by Tasks
 ====
diff --git a/demo/guide-python/README.md b/demo/guide-python/README.md
index 3625c40f561f..bc1c219d0b1d 100644
--- a/demo/guide-python/README.md
+++ b/demo/guide-python/README.md
@@ -6,3 +6,4 @@ XGBoost Python Feature Walkthrough
 * [Predicting using first n trees](predict_first_ntree.py)
 * [Generalized Linear Model](generalized_linear_model.py)
 * [Cross validation](cross_validation.py)
+* [Predicting leaf indices](predict_leaf_indices.py)
diff --git a/demo/guide-python/predict_leaf_indices.py b/demo/guide-python/predict_leaf_indices.py
new file mode 100755
index 000000000000..291ad1ee7831
--- /dev/null
+++ b/demo/guide-python/predict_leaf_indices.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+import sys
+import numpy as np
+sys.path.append('../../wrapper')
+import xgboost as xgb
+
+### load data and do training
+dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+dtest = xgb.DMatrix('../data/agaricus.txt.test')
+param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
+watchlist  = [(dtest,'eval'), (dtrain,'train')]
+num_round = 3
+bst = xgb.train(param, dtrain, num_round, watchlist)
+
+print ('start testing predict the leaf indices')
+### predict leaf indices using the first 2 trees; result has one column per tree
+leafindex = bst.predict(dtest, ntree_limit=2, pred_leaf = True)
+print (leafindex.shape)
+print (leafindex)
+### predict leaf indices using all trees
+leafindex = bst.predict(dtest, pred_leaf = True)
+print (leafindex.shape)
diff --git a/demo/guide-python/runall.sh b/demo/guide-python/runall.sh
index 2dd2c20b04fe..5317186d5045 100755
--- a/demo/guide-python/runall.sh
+++ b/demo/guide-python/runall.sh
@@ -4,4 +4,5 @@ python custom_objective.py
 python boost_from_prediction.py
 python generalized_linear_model.py
 python cross_validation.py
+python predict_leaf_indices.py
 rm -rf *~ *.model *.buffer 
\ No newline at end of file
diff --git a/src/gbm/gblinear-inl.hpp b/src/gbm/gblinear-inl.hpp
index cae5cf4f3f7f..6d507ac6ed66 100644
--- a/src/gbm/gblinear-inl.hpp
+++ b/src/gbm/gblinear-inl.hpp
@@ -135,6 +135,12 @@ class GBLinear : public IGradBooster {
       }
     }
   }
+  virtual void PredictLeaf(IFMatrix *p_fmat,
+                           const BoosterInfo &info,
+                           std::vector<float> *out_preds,
+                           unsigned ntree_limit = 0) {
+    utils::Error("gblinear does not support predict leaf index");
+  }
   virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
     utils::Error("gblinear does not support dump model");
     return std::vector<std::string>();
diff --git a/src/gbm/gbm.h b/src/gbm/gbm.h
index 28b370c48238..f8eae6dbb4cd 100644
--- a/src/gbm/gbm.h
+++ b/src/gbm/gbm.h
@@ -74,6 +74,20 @@ class IGradBooster {
                        const BoosterInfo &info,
                        std::vector<float> *out_preds,
                        unsigned ntree_limit = 0) = 0;
+  
+  /*!
+   * \brief predict the leaf index of each tree, the output will be nsample * ntree vector
+   *        this is only valid in gbtree predictor
+   * \param p_fmat feature matrix
+   * \param info extra side information that may be needed for prediction
+   * \param out_preds output vector to hold the predictions
+   * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means 
+   *    we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
+   */
+  virtual void PredictLeaf(IFMatrix *p_fmat,
+                           const BoosterInfo &info,
+                           std::vector<float> *out_preds,
+                           unsigned ntree_limit = 0) = 0;
   /*!
    * \brief dump the model in text format
    * \param fmap feature map that may help give interpretations of feature
diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
index d334296c8816..b20acd48e917 100644
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -126,11 +126,6 @@ class GBTree : public IGradBooster {
     for (int i = 0; i < nthread; ++i) {
       thread_temp[i].Init(mparam.num_feature);
     }
-    if (tparam.pred_path != 0) {
-      this->PredPath(p_fmat, info, out_preds);
-      return;
-    }
-
     std::vector<float> &preds = *out_preds;
     const size_t stride = info.num_row * mparam.num_output_group;
     preds.resize(stride * (mparam.size_leaf_vector+1));
@@ -158,6 +153,22 @@ class GBTree : public IGradBooster {
       }
     }
   }  
+  virtual void PredictLeaf(IFMatrix *p_fmat,
+                           const BoosterInfo &info,
+                           std::vector<float> *out_preds,
+                           unsigned ntree_limit) {
+    int nthread;
+    #pragma omp parallel
+    {
+      nthread = omp_get_num_threads();
+    }
+    thread_temp.resize(nthread, tree::RegTree::FVec());
+    for (int i = 0; i < nthread; ++i) {
+      thread_temp[i].Init(mparam.num_feature);
+    }
+    this->PredPath(p_fmat, info, out_preds, ntree_limit);
+    
+  }
   virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
     std::vector<std::string> dump;
     for (size_t i = 0; i < trees.size(); i++) {
@@ -309,9 +320,14 @@ class GBTree : public IGradBooster {
   // predict independent leaf index
   inline void PredPath(IFMatrix *p_fmat,
                        const BoosterInfo &info,
-                       std::vector<float> *out_preds) {
+                       std::vector<float> *out_preds,
+                       unsigned ntree_limit) {
+    // number of valid trees
+    if (ntree_limit == 0 || ntree_limit > trees.size()) {
+      ntree_limit = trees.size();
+    } 
     std::vector<float> &preds = *out_preds;
-    preds.resize(info.num_row * mparam.num_trees);
+    preds.resize(info.num_row * ntree_limit);
     // start collecting the prediction
     utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
     iter->BeforeFirst();
@@ -325,9 +341,9 @@ class GBTree : public IGradBooster {
         int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
         tree::RegTree::FVec &feats = thread_temp[tid];
         feats.Fill(batch[i]);
-        for (size_t j = 0; j < trees.size(); ++j) {
+        for (unsigned j = 0; j < ntree_limit; ++j) {
           int tid = trees[j]->GetLeafIndex(feats, info.GetRoot(ridx));
-          preds[ridx * mparam.num_trees + j] = static_cast<float>(tid);
+          preds[ridx * ntree_limit + j] = static_cast<float>(tid);
         }
         feats.Drop(batch[i]);
       }
@@ -344,8 +360,6 @@ class GBTree : public IGradBooster {
      *  use this option to support boosted random forest
      */
     int num_parallel_tree;
-    /*! \brief predict path in prediction */
-    int pred_path;
     /*! \brief whether updater is already initialized */
     int updater_initialized;
     /*! \brief tree updater sequence */
@@ -356,7 +370,6 @@ class GBTree : public IGradBooster {
       updater_seq = "grow_colmaker,prune";
       num_parallel_tree = 1;
       updater_initialized = 0;
-      pred_path = 0;
     }
     inline void SetParam(const char *name, const char *val){
       using namespace std;
@@ -371,7 +384,6 @@ class GBTree : public IGradBooster {
       if (!strcmp(name, "num_parallel_tree")) {
         num_parallel_tree = atoi(val);
       }
-      if (!strcmp(name, "pred_path")) pred_path = atoi(val);
     }
   };
   /*! \brief model parameters */
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 6b74402394fc..d16986e838a0 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -280,10 +280,16 @@ class BoostLearner {
   inline void Predict(const DMatrix &data,
                       bool output_margin,
                       std::vector<float> *out_preds,
-                      unsigned ntree_limit = 0) const {
-    this->PredictRaw(data, out_preds, ntree_limit);
-    if (!output_margin) {
-      obj_->PredTransform(out_preds);
+                      unsigned ntree_limit = 0,
+                      bool pred_leaf = false
+                      ) const {
+    if (pred_leaf) {
+      gbm_->PredictLeaf(data.fmat(), data.info.info, out_preds, ntree_limit);      
+    } else {
+      this->PredictRaw(data, out_preds, ntree_limit);
+      if (!output_margin) {
+        obj_->PredTransform(out_preds);
+      }
     }
   }
   /*! \brief dump model out */
diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index b549ddd8b230..08aacb90eab7 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -333,23 +333,38 @@ def eval_set(self, evals, it = 0, feval = None):
             return res
     def eval(self, mat, name = 'eval', it = 0):
         return self.eval_set( [(mat,name)], it)
-    def predict(self, data, output_margin=False, ntree_limit=0):
+    def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False):
         """
         predict with data
             Args:
                 data: DMatrix
-                      the dmatrix storing the input
+                    the dmatrix storing the input
                 output_margin: bool
-                               whether output raw margin value that is untransformed
+                    whether to output the raw, untransformed margin value
                 ntree_limit: int
-                             limit number of trees in prediction, default to 0, 0 means using all the trees
+                    limit number of trees in prediction, default to 0, 0 means using all the trees
+                pred_leaf: bool
+                    when this option is on, the output will be a matrix of (nsample, ntrees)
+                    with each record indicating the predicted leaf index of each sample in each tree
+                    Note that a leaf index is only unique within its tree, so leaf 1 may appear in both tree 0 and tree 1
             Returns:
                 numpy array of prediction
         """
+        option_mask = 0
+        if output_margin:
+            option_mask += 1
+        if pred_leaf:
+            option_mask += 2
         length = ctypes.c_ulong()
         preds = xglib.XGBoosterPredict(self.handle, data.handle,
-                                       int(output_margin), ntree_limit, ctypes.byref(length))
-        return ctypes2numpy(preds, length.value, 'float32')
+                                       option_mask, ntree_limit, ctypes.byref(length))
+        preds = ctypes2numpy(preds, length.value, 'float32')
+        if pred_leaf:
+            preds = preds.astype('int32')
+        nrow = data.num_row()
+        if preds.size != nrow and preds.size % nrow == 0:
+            preds = preds.reshape(nrow, preds.size // nrow)  # '//' keeps the dimension an int on python 3
+        return preds
     def save_model(self, fname):
         """ save model to file
             Args:
diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp
index ac054090cf99..d0efc4bd0873 100644
--- a/wrapper/xgboost_wrapper.cpp
+++ b/wrapper/xgboost_wrapper.cpp
@@ -30,9 +30,9 @@ class Booster: public learner::BoostLearner {
     this->init_model = false;
     this->SetCacheData(mats);
   }
-  inline const float *Pred(const DataMatrix &dmat, int output_margin, unsigned ntree_limit, bst_ulong *len) {
+  inline const float *Pred(const DataMatrix &dmat, int option_mask, unsigned ntree_limit, bst_ulong *len) {
     this->CheckInitModel();
-    this->Predict(dmat, output_margin != 0, &this->preds_, ntree_limit);
+    this->Predict(dmat, (option_mask&1) != 0, &this->preds_, ntree_limit, (option_mask&2) != 0);
     *len = static_cast<bst_ulong>(this->preds_.size());
     return BeginPtr(this->preds_);
   }
@@ -284,8 +284,8 @@ extern "C"{
     bst->eval_str = bst->EvalOneIter(iter, mats, names);
     return bst->eval_str.c_str();
   }
-  const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, unsigned ntree_limit, bst_ulong *len) {
-    return static_cast<Booster*>(handle)->Pred(*static_cast<DataMatrix*>(dmat), output_margin, ntree_limit, len);
+  const float *XGBoosterPredict(void *handle, void *dmat, int option_mask, unsigned ntree_limit, bst_ulong *len) {
+    return static_cast<Booster*>(handle)->Pred(*static_cast<DataMatrix*>(dmat), option_mask, ntree_limit, len);
   }
   void XGBoosterLoadModel(void *handle, const char *fname) {
     static_cast<Booster*>(handle)->LoadModel(fname);
diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h
index 2ae70f026fe2..16d54f62b545 100644
--- a/wrapper/xgboost_wrapper.h
+++ b/wrapper/xgboost_wrapper.h
@@ -178,12 +178,18 @@ extern "C" {
    * \brief make prediction based on dmat
    * \param handle handle
    * \param dmat data matrix
-   * \param output_margin whether only output raw margin value
+   * \param option_mask bit-mask of options taken in prediction, possible values
+   *          0:normal prediction
+   *          1:output margin instead of transformed value
+   *          2:output leaf index of trees instead of leaf value, note leaf index is unique per tree
    * \param ntree_limit limit number of trees used for prediction, this is only valid for boosted trees
    *    when the parameter is set to 0, we will use all the trees
    * \param len used to store length of returning result
    */
-  XGB_DLL const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, unsigned ntree_limit, bst_ulong *len);
+  XGB_DLL const float *XGBoosterPredict(void *handle, void *dmat, 
+                                        int option_mask, 
+                                        unsigned ntree_limit,
+                                        bst_ulong *len);
   /*!
    * \brief load model from existing file
    * \param handle handle

From 84dcab67951e5df055f6b6b40a707d0e2f0e5c9c Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 21 Nov 2014 16:09:26 -0800
Subject: [PATCH 097/166] checkin socket module

---
 src/utils/socket.h | 212 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 212 insertions(+)
 create mode 100644 src/utils/socket.h

diff --git a/src/utils/socket.h b/src/utils/socket.h
new file mode 100644
index 000000000000..5b216980e7d1
--- /dev/null
+++ b/src/utils/socket.h
@@ -0,0 +1,212 @@
+#ifndef XGBOOST_UTILS_SOCKET_H
+#define XGBOOST_UTILS_SOCKET_H
+/*!
+ * \file socket.h
+ * \brief this file aims to provide a platform independent wrapper 
+ *        of socket
+ * \author Tianqi Chen
+ */
+#include <fcntl.h>
+#include <netdb.h>
+#include <errno.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <string>
+#include <cstring>
+#include "./utils.h"
+
+namespace xgboost {
+namespace utils {
+
+/*! \brief data structure for network address */
+struct SockAddr {
+  sockaddr_in addr;
+  // default constructor, addr is not filled in until Set is called
+  SockAddr(void) {}
+  SockAddr(const char *url, int port) {
+    this->Set(url, port);
+  }
+  /*!
+   * \brief set the address, the host is resolved through gethostbyname
+   * \param url the url of the address
+   * \param port the port of address
+   */
+  inline void Set(const char *url, int port) {
+    hostent *hp = gethostbyname(url);
+    Check(hp != NULL, "cannot obtain address of %s", url);
+    memset(&addr, 0, sizeof(addr));
+    addr.sin_family = AF_INET;
+    addr.sin_port = htons(port);
+    memcpy(&addr.sin_addr, hp->h_addr_list[0], hp->h_length);
+  }
+  /*! \return a string representation of the address, in the form ip:port */
+  inline std::string ToString(void) const {
+    std::string buf; buf.resize(256);
+    const char *s = inet_ntop(AF_INET, &addr.sin_addr, &buf[0], buf.length());
+    Assert(s != NULL, "cannot decode address");
+    std::string res = s;
+    sprintf(&buf[0], "%u", ntohs(addr.sin_port));
+    res += ":"; res += buf.c_str();  // c_str() stops at the NUL written by sprintf
+    return res;
+  }
+};
+/*! 
+ * \brief a wrapper of TCP socket that hopefully be cross platform
+ */
+class TCPSocket {
+ public:
+  /*! \brief the file descriptor of socket */
+  int sockfd;
+  // constructor, no descriptor is opened here; sockfd stays -1 until the caller assigns one
+  TCPSocket(void) : sockfd(-1) {}
+  // call operator, returns the underlying file descriptor
+  inline int operator()() const {
+    return sockfd;
+  }
+  /*!
+   * \brief start up the socket module
+   *   call this before using the sockets
+   */
+  inline static void Startup(void) {
+  }
+  /*!
+   * \brief shutdown the socket module after use, all sockets need to be closed
+   */
+  inline static void Finalize(void) {
+  }
+  /*! \brief put this socket into non-blocking mode, reports an error on failure */
+  inline void SetAsync(void) {
+    // F_SETFL is required to store the flags; F_GETFL alone only reads them
+    const int flag = fcntl(sockfd, F_GETFL, 0);
+    if (flag == -1 || fcntl(sockfd, F_SETFL, flag | O_NONBLOCK) == -1) {
+      SockError("SetAsync", errno);
+    }
+  }
+  /*!
+   * \brief perform listen of the socket, reports an error on failure
+   * \param backlog backlog parameter
+   */
+  inline void Listen(int backlog = 16) {
+    if (listen(sockfd, backlog) == -1) SockError("Listen", errno);
+  }
+  /*!
+   * \brief bind the socket to an address
+   * \param addr the address to bind to
+   */
+  inline void Bind(const SockAddr &addr) {
+    if (bind(sockfd, (sockaddr*)&addr.addr, sizeof(addr.addr)) == -1) {
+      SockError("Bind", errno);
+    }
+  }
+  /*!
+   * \brief connect to an address
+   * \param addr the address to connect to
+   */
+  inline void Connect(const SockAddr &addr) {
+    if (connect(sockfd, (sockaddr*)&addr.addr, sizeof(addr.addr)) == -1) {
+      SockError("Connect", errno);
+    }
+  }
+  /*! \brief close the connection */
+  inline void Close(void) {
+    close(sockfd);
+  }
+  /*!
+   * \brief send data using the socket
+   * \param buf the pointer to the buffer
+   * \param len the size of the buffer
+   * \param flag extra flags passed to send
+   * \return size of data actually sent, can be smaller than len
+   */
+  inline size_t Send(const void *buf, size_t len, int flag = 0) {
+    ssize_t ret = send(sockfd, buf, len, flag);
+    if (ret == -1) SockError("Send", errno);
+    return ret;
+  }
+  /*!
+   * \brief receive data using the socket
+   * \param buf the pointer to the buffer
+   * \param len the size of the buffer
+   * \param flags extra flags passed to recv
+   * \return size of data actually received
+   */
+  inline size_t Recv(void *buf, size_t len, int flags = 0) {
+    ssize_t ret = recv(sockfd, buf, len, flags);
+    if (ret == -1) SockError("Recv", errno);
+    return ret;
+  }
+ private:
+  // report a socket error, msg names the failed operation and errsv is the saved errno
+  inline static void SockError(const char *msg, int errsv) {
+    char buf[256];
+    Error("Socket %s Error:%s", msg, strerror_r(errsv, buf, sizeof(buf)));
+  }
+};
+/*! \brief helper data structure to perform select */
+struct SelectHelper {
+ public:
+  SelectHelper(void) { this->Clear(); }  // start from empty sets, FD_SET needs initialized fd_set
+  /*!
+   * \brief add file descriptor to watch for read
+   * \param fd file descriptor to be watched
+   */
+  inline void WatchRead(int fd) {
+    FD_SET(fd, &read_set);
+    if (fd > maxfd) maxfd = fd;
+  }
+  /*!
+   * \brief add file descriptor to watch for write
+   * \param fd file descriptor to be watched
+   */
+  inline void WatchWrite(int fd) {
+    FD_SET(fd, &write_set);
+    if (fd > maxfd) maxfd = fd;
+  }
+  /*!
+   * \brief Check if the descriptor is ready for read, CheckWrite is the write counterpart
+   * \param fd file descriptor to check, meaningful after a call to Select
+   */
+  inline bool CheckRead(int fd) const {
+    return FD_ISSET(fd, &read_set);
+  }
+  inline bool CheckWrite(int fd) const {
+    return FD_ISSET(fd, &write_set);
+  }
+  inline void Clear(void) {
+    FD_ZERO(&read_set);
+    FD_ZERO(&write_set);
+    maxfd = 0;
+  }
+  /*!
+   * \brief perform select on the set defined
+   * \param timeout specify timeout in milliseconds(ms) if equals 0, means select will always block
+   * \return number of active descriptors selected
+   */
+  inline int Select(long timeout = 0) {
+    int ret;
+    if (timeout == 0) {
+      ret = select(maxfd + 1, &read_set, &write_set, NULL, NULL);
+    } else {
+      timeval tm;
+      tm.tv_usec = (timeout % 1000) * 1000;
+      tm.tv_sec = timeout / 1000;
+      ret = select(maxfd + 1, &read_set, &write_set, NULL, &tm);
+    }
+    if (ret == -1) {
+      int errsv = errno;
+      char buf[256];
+      Error("Select Error:%s", strerror_r(errsv, buf, sizeof(buf)));
+    }
+    return ret;
+  }
+
+ private:
+  int maxfd;
+  fd_set read_set, write_set;
+};
+}
+}
+#endif

From b6e1b1920597edcee1dffd7412b7c7809572f375 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 21 Nov 2014 16:09:28 -0800
Subject: [PATCH 098/166] checkin socket module

---
 src/sync/sync.h | 3 ++-
 test/Makefile   | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/sync/sync.h b/src/sync/sync.h
index 0619c3ea3641..c69755b14918 100644
--- a/src/sync/sync.h
+++ b/src/sync/sync.h
@@ -7,9 +7,10 @@
  */
 #include <cstdio>
 #include <cstring>
+#include <string>
+
 #include "../utils/utils.h"
 #include "../utils/io.h"
-#include <string>
 
 namespace MPI {
 // forward delcaration of MPI::Datatype, but not include content
diff --git a/test/Makefile b/test/Makefile
index 6d135e3171cd..6c943e155aaa 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -11,7 +11,7 @@ else
 endif
 
 # specify tensor path
-BIN = test_group_data test_quantile
+BIN = test_group_data test_quantile test_sock
 
 .PHONY: clean all
 
@@ -19,6 +19,7 @@ all: $(BIN) $(MPIBIN)
 
 test_group_data: test_group_data.cpp ../src/utils/*.h
 test_quantile: test_quantile.cpp ../src/utils/*.h
+test_sock: test_sock.cpp ../src/utils/*.h
 
 $(BIN) : 
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)

From 7ec3fc936a32635d443360fac04d5a0b46871e45 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 21 Nov 2014 22:54:11 -0800
Subject: [PATCH 099/166] check in allreduce tcp, check if there could be more
 concise form

---
 Makefile              |   6 +-
 src/sync/sync_tcp.cpp | 280 ++++++++++++++++++++++++++++++++++++++++++
 src/utils/socket.h    |   7 +-
 3 files changed, 287 insertions(+), 6 deletions(-)
 create mode 100644 src/sync/sync_tcp.cpp

diff --git a/Makefile b/Makefile
index d5fd7c394069..172a7607b4e9 100644
--- a/Makefile
+++ b/Makefile
@@ -12,7 +12,7 @@ endif
 
 # specify tensor path
 BIN = xgboost
-OBJ = updater.o gbm.o io.o main.o sync_empty.o
+OBJ = updater.o gbm.o io.o main.o sync_empty.o sync_tcp.o
 MPIOBJ = sync_mpi.o
 MPIBIN = xgboost-mpi
 SLIB = wrapper/libxgboostwrapper.so 
@@ -24,11 +24,11 @@ mpi: $(MPIBIN)
 
 python: wrapper/libxgboostwrapper.so
 # now the wrapper takes in two files. io and wrapper part
-wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
 updater.o: src/tree/updater.cpp  src/tree/*.hpp src/*.h src/tree/*.h src/utils/*.h
 gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h 
 io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
-sync_mpi.o: src/sync/sync_mpi.cpp 
+sync_mpi.o: src/sync/sync_mpi.cpp
+sync_tcp.o: src/sync/sync_tcp.cpp
 sync_empty.o: src/sync/sync_empty.cpp 
 main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h 
 xgboost-mpi:  updater.o gbm.o io.o main.o sync_mpi.o 
diff --git a/src/sync/sync_tcp.cpp b/src/sync/sync_tcp.cpp
new file mode 100644
index 000000000000..2aa85ea9a989
--- /dev/null
+++ b/src/sync/sync_tcp.cpp
@@ -0,0 +1,280 @@
+/*!
+ * \file sync_tcp.cpp
+ * \brief implementation of sync AllReduce using TCP sockets
+ *   with use async socket and tree-shape reduction
+ * \author Tianqi Chen
+ */
+#include "./sync.h"
+#include "../utils/socket.h"
+
+namespace MPI {
+struct Datatype {
+  size_t type_size;
+  Datatype(size_t type_size) : type_size(type_size) {}
+};
+}
+namespace xgboost {
+namespace sync {
+/*! \brief implementation of sync goes to here */
+class SyncManager {  
+ public:
+  // initialize the manager
+  inline void Init(int argc, char *argv[]) {    
+  }
+  /*!
+   * \brief perform in-place allreduce, on sendrecvbuf 
+   *        this function is not thread-safe
+   * \param sendrecvbuf buffer for both sending and recving data
+   * \param type_n4bytes the unit number of bytes the type have
+   * \param count number of elements to be reduced
+   * \param reducer reduce function
+   */
+  inline void AllReduce(void *sendrecvbuf_,
+                        size_t type_nbytes,
+                        size_t count,
+                        ReduceHandle::ReduceFunction reducer) {
+    if (parent.size() == 0 && childs.size() == 0) return;
+    char *sendrecvbuf = reinterpret_cast<char*>(sendrecvbuf_);
+    // total size of message
+    const size_t total_size = type_nbytes * count;
+    // size of space that we already performs reduce in up pass
+    size_t size_up_reduce = 0;
+    // size of space that we have already passed to parent
+    size_t size_up_out = 0;
+    // size of message we received, and send in the down pass
+    size_t size_down_in = 0;
+    // initialize the send buffer
+    for (size_t i = 0; i < childs.size(); ++i) {
+      childs[i].Init(type_nbytes, count);
+    }
+    // if no childs, no need to reduce
+    if (childs.size() == 0) size_up_reduce = total_size;    
+    // while we have not passed the messages out
+    while(true) {
+      selecter.Select();
+      // read data from childs
+      for (size_t i = 0; i < childs.size(); ++i) {
+        if (selecter.CheckRead(childs[i].sock)) {
+          childs[i].Read(size_up_out);
+        }
+      }
+      // peform reduce
+      if (childs.size() != 0) {
+        const size_t buffer_size = childs[0].buffer_size;
+        // do upstream reduce
+        size_t min_read = childs[0].size_read;
+        for (size_t i = 1; i < childs.size(); ++i) {
+          min_read = std::min(min_read, childs[i].size_read);
+        }
+        // align to type_nbytes
+        min_read = (min_read / type_nbytes * type_nbytes);
+        // start position
+        size_t start = size_up_reduce % buffer_size;
+        // peform read till end of buffer
+        if (start + min_read - size_up_reduce > buffer_size) {
+          const size_t nread = buffer_size - start;
+          utils::Assert(nread % type_nbytes == 0, "AllReduce: size check");
+          for (size_t i = 0; i < childs.size(); ++i) {
+            reducer(childs[i].buffer_head + start,
+                    sendrecvbuf + size_up_reduce,
+                    nread / type_nbytes,
+                    MPI::Datatype(type_nbytes));
+          }
+          size_up_reduce += nread;
+          start = 0;
+        }
+        // peform second phase of reduce
+        const size_t nread = min_read - size_up_reduce;
+        if (nread != 0) {
+          utils::Assert(nread % type_nbytes == 0, "AllReduce: size check");
+          for (size_t i = 0; i < childs.size(); ++i) {
+            reducer(childs[i].buffer_head + start,
+                    sendrecvbuf + size_up_reduce,
+                    nread / type_nbytes,
+                    MPI::Datatype(type_nbytes));
+          }
+        }
+        size_up_reduce += nread;
+      }
+      if (parent.size() != 0) {
+        // can pass message up to parent
+        if (selecter.CheckWrite(parent[0])) {
+          size_up_out += parent[0]
+              .Send(sendrecvbuf + size_up_out, size_up_reduce - size_up_out);
+        }
+        // read data from parent
+        if (selecter.CheckRead(parent[0])) {
+          size_down_in +=  parent[0]
+              .Recv(sendrecvbuf + size_down_in, total_size - size_down_in);
+          utils::Assert(size_down_in <= size_up_out, "AllReduce: boundary error");
+        }
+      } else {
+        // this is root, can use reduce as most recent point
+        size_down_in = size_up_out = size_up_reduce;
+      }
+      // check if we finished the job of message passing
+      size_t nfinished = size_down_in;
+      // can pass message down to childs
+      for (size_t i = 0; i < childs.size(); ++i) {
+        if (selecter.CheckWrite(childs[i].sock)) {
+          childs[i].size_write += childs[i].sock
+              .Send(sendrecvbuf + childs[i].size_write, size_down_in - childs[i].size_write);
+        }
+        nfinished = std::min(childs[i].size_write, nfinished);
+      }
+      // check boundary condition
+      if (nfinished >= total_size) {
+        utils::Assert(nfinished == total_size, "AllReduce: nfinished check");
+        break;
+      }
+    }
+  }
+  inline void Bcast(std::string *sendrecv_data, int root) {
+    if (parent.size() == 0 && childs.size() == 0) return;
+    // message send to parent
+    size_t size_up_out = 0;
+    // all messages received
+    size_t size_in = 0;
+    // all headers received so far
+    size_t header_in = 0;
+    // total size of data
+    size_t total_size;
+    // input channel, -1 means parent, -2 means unknown yet
+    // otherwise its child index
+    int in_channel = -2;
+    // root already reads all data in
+    if (root == rank) {
+      in_channel = -3;
+      total_size = size_in = sendrecv_data->length();
+      header_in = sizeof(total_size);
+    }
+    // initialize write position
+    for (size_t i = 0; i < childs.size(); ++i) {
+      childs[i].size_write = 0;
+    }
+    const int nchilds = static_cast<int>(childs.size());
+
+    while (true) {
+      selecter.Select();
+      if (selecter.CheckRead(parent[0])) {
+        utils::Assert(in_channel == -2 || in_channel == -1, "invalid in channel");
+        this->BcastRecvData(parent[0], sendrecv_data,
+                            header_in, size_in, total_size);
+        if (header_in != 0) in_channel = -1;
+      }
+      for (int i = 0; i < nchilds; ++i) {
+        if (selecter.CheckRead(childs[i].sock)) {
+          utils::Assert(in_channel == -2 || in_channel == i, "invalid in channel");
+          this->BcastRecvData(parent[0], sendrecv_data,
+                              header_in, size_in, total_size);
+          if (header_in != 0) in_channel = i;
+        }
+      }
+      if (in_channel == -2) continue;
+      if (in_channel != -1) {
+        if (selecter.CheckWrite(parent[0])) {
+          size_t nsend = size_in - size_up_out;
+          if (nsend != 0) {
+            size_up_out += parent[0].Send(&(*sendrecv_data)[0] + size_up_out, nsend);
+          }
+        }
+      } else {
+        size_up_out = size_in;
+      }
+      size_t nfinished = size_up_out;
+      for (int i = 0; i < nchilds; ++i) {
+        if (in_channel != i) {
+          if (selecter.CheckWrite(childs[i].sock)) {
+            size_t nsend = size_in - childs[i].size_write;
+            if (nsend != 0) {
+              childs[i].size_write += childs[i].sock
+                  .Send(&(*sendrecv_data)[0] + childs[i].size_write, nsend);
+            }
+          }
+          nfinished = std::min(nfinished, childs[i].size_write);
+        }
+      }
+      // check boundary condition
+      if (nfinished >= total_size) {
+        utils::Assert(nfinished == total_size, "Bcast: nfinished check");
+        break;
+      }
+    }
+  }
+
+ private:
+  inline void BcastRecvData(utils::TCPSocket &sock,
+                            std::string *sendrecv_data,   
+                            size_t &header_in,
+                            size_t &size_in,
+                            size_t &total_size) {
+    if (header_in < sizeof(total_size)) {
+      char *p = reinterpret_cast<char*>(&total_size);
+      header_in += sock.Recv(p + size_in, sizeof(total_size) - header_in);
+      if (header_in == sizeof(total_size)) {
+        sendrecv_data->resize(total_size);
+      }
+    } else {
+      size_t nread  = total_size - size_in;
+      if (nread != 0) {
+        size_in += sock
+            .Recv(&(*sendrecv_data)[0] + size_in, nread);
+      }
+    }
+  }
+  
+  // 128 MB
+  const static size_t kBufferSize = 128;
+  // an independent child record
+  struct ChildRecord {
+   public:
+    // socket to get data from child
+    utils::TCPSocket sock;
+    // size of data readed from child
+    size_t size_read;
+    // size of data write into child
+    size_t size_write;
+    // pointer to buffer head
+    char *buffer_head;
+    // buffer size, in bytes
+    size_t buffer_size;
+    // initialize buffer
+    inline void Init(size_t type_nbytes, size_t count) {
+      utils::Assert(type_nbytes < kBufferSize, "too large type_nbytes");
+      size_t n = (type_nbytes * count + 7)/ 8;
+      buffer_.resize(std::min(kBufferSize, n));
+      // make sure align to type_nbytes
+      buffer_size = buffer_.size() * sizeof(uint64_t) / type_nbytes * type_nbytes;
+      // set buffer head
+      buffer_head = reinterpret_cast<char*>(BeginPtr(buffer_));
+      // set write head
+      size_write = size_read = 0;
+    }
+    // maximum number of bytes we are able to read
+    // currently without corrupt the data
+    inline void Read(size_t size_up_out) {
+      size_t ngap = size_read - size_up_out;
+      utils::Assert(ngap <= buffer_size, "AllReduce: boundary check");
+      size_t offset = size_read % buffer_size;      
+      size_t nmax = std::min(ngap, buffer_size - offset);
+      size_t len = sock.Recv(buffer_head + offset, nmax);
+      size_read += len;
+    }
+
+   private:
+    // recv buffer to get data from child
+    // aligned with 64 bits, will be able to perform 64 bits operations freely
+    std::vector<uint64_t> buffer_;
+  };
+  // current rank
+  int rank;                  
+  // parent socket, can be of size 0 or 1
+  std::vector<utils::TCPSocket> parent;
+  // sockets of all childs, can be of size 0, 1, 2 or more
+  std::vector<ChildRecord> childs;
+  // select helper
+  utils::SelectHelper selecter;
+};
+
+}  // namespace sync
+}  // namespace xgboost
diff --git a/src/utils/socket.h b/src/utils/socket.h
index 5b216980e7d1..5eebbd160188 100644
--- a/src/utils/socket.h
+++ b/src/utils/socket.h
@@ -2,8 +2,7 @@
 #define XGBOOST_UTILS_SOCKET_H
 /*!
  * \file socket.h
- * \brief this file aims to provide a platform independent wrapper 
- *        of socket
+ * \brief this file aims to provide a wrapper of sockets
  * \author Tianqi Chen
  */
 #include <fcntl.h>
@@ -63,7 +62,7 @@ class TCPSocket {
   // constructor
   TCPSocket(void) {}
   // default conversion to int
-  inline int operator()() const {
+  inline operator int() const {
     return sockfd;
   }
   /*!
@@ -122,6 +121,7 @@ class TCPSocket {
    * \return size of data actually sent
    */
   inline size_t Send(const void *buf, size_t len, int flag = 0) {
+    if (len == 0) return 0;
     ssize_t ret = send(sockfd, buf, len, flag);
     if (ret == -1) SockError("Send", errno);
     return ret;
@@ -134,6 +134,7 @@ class TCPSocket {
    * \return size of data actually received 
    */
   inline size_t Recv(void *buf, size_t len, int flags = 0) {
+    if (len == 0) return 0;
     ssize_t ret = recv(sockfd, buf, len, flags);
     if (ret == -1) SockError("Recv", errno);
     return ret;

From 48642207023492ac3e86b9aa5d26db60ffbdf660 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 22 Nov 2014 12:15:30 -0800
Subject: [PATCH 100/166] AllReduce and Bcast functions ready, still need initializer

---
 src/sync/sync_tcp.cpp | 313 +++++++++++++++++++++---------------------
 src/utils/socket.h    |  30 +++-
 2 files changed, 177 insertions(+), 166 deletions(-)

diff --git a/src/sync/sync_tcp.cpp b/src/sync/sync_tcp.cpp
index 2aa85ea9a989..5e46a711903e 100644
--- a/src/sync/sync_tcp.cpp
+++ b/src/sync/sync_tcp.cpp
@@ -4,6 +4,7 @@
  *   with use async socket and tree-shape reduction
  * \author Tianqi Chen
  */
+#include <vector>
 #include "./sync.h"
 #include "../utils/socket.h"
 
@@ -23,8 +24,8 @@ class SyncManager {
   }
   /*!
    * \brief perform in-place allreduce, on sendrecvbuf 
-   *        this function is not thread-safe
-   * \param sendrecvbuf buffer for both sending and recving data
+   *        this function is NOT thread-safe
+   * \param sendrecvbuf_ buffer for both sending and recving data
    * \param type_n4bytes the unit number of bytes the type have
    * \param count number of elements to be reduced
    * \param reducer reduce function
@@ -33,79 +34,83 @@ class SyncManager {
                         size_t type_nbytes,
                         size_t count,
                         ReduceHandle::ReduceFunction reducer) {
-    if (parent.size() == 0 && childs.size() == 0) return;
-    char *sendrecvbuf = reinterpret_cast<char*>(sendrecvbuf_);
+    if (links.size() == 0) return;
     // total size of message
     const size_t total_size = type_nbytes * count;
+    // number of links
+    const int nlink = static_cast<int>(links.size());
+    // send recv buffer
+    char *sendrecvbuf = reinterpret_cast<char*>(sendrecvbuf_);
     // size of space that we already performs reduce in up pass
     size_t size_up_reduce = 0;
     // size of space that we have already passed to parent
     size_t size_up_out = 0;
     // size of message we received, and send in the down pass
-    size_t size_down_in = 0;
-    // initialize the send buffer
-    for (size_t i = 0; i < childs.size(); ++i) {
-      childs[i].Init(type_nbytes, count);
+    size_t size_down_in = 0;    
+
+    // initialize the link ring-buffer and pointer
+    for (int i = 0; i < nlink; ++i) {
+      if (i != parent_index) links[i].InitBuffer(type_nbytes, count);
+      links[i].ResetSize();
     }
     // if no childs, no need to reduce
-    if (childs.size() == 0) size_up_reduce = total_size;    
+    if (nlink == static_cast<int>(parent_index != -1)) {
+      size_up_reduce = total_size;
+    }
+    
     // while we have not passed the messages out
     while(true) {
       selecter.Select();
       // read data from childs
-      for (size_t i = 0; i < childs.size(); ++i) {
-        if (selecter.CheckRead(childs[i].sock)) {
-          childs[i].Read(size_up_out);
+      for (int i = 0; i < nlink; ++i) {
+        if (i != parent_index && selecter.CheckRead(links[i].sock)) {
+          links[i].ReadToRingBuffer(size_up_out);
         }
       }
-      // peform reduce
-      if (childs.size() != 0) {
-        const size_t buffer_size = childs[0].buffer_size;
+      // this node have childs, peform reduce
+      if (nlink > static_cast<int>(parent_index != -1)) {
+        size_t buffer_size = 0;
         // do upstream reduce
-        size_t min_read = childs[0].size_read;
-        for (size_t i = 1; i < childs.size(); ++i) {
-          min_read = std::min(min_read, childs[i].size_read);
-        }
-        // align to type_nbytes
-        min_read = (min_read / type_nbytes * type_nbytes);
-        // start position
-        size_t start = size_up_reduce % buffer_size;
-        // peform read till end of buffer
-        if (start + min_read - size_up_reduce > buffer_size) {
-          const size_t nread = buffer_size - start;
-          utils::Assert(nread % type_nbytes == 0, "AllReduce: size check");
-          for (size_t i = 0; i < childs.size(); ++i) {
-            reducer(childs[i].buffer_head + start,
-                    sendrecvbuf + size_up_reduce,
-                    nread / type_nbytes,
-                    MPI::Datatype(type_nbytes));
+        size_t max_reduce = total_size;
+        for (int i = 0; i < nlink; ++i) {
+          if (i != parent_index) {
+            max_reduce= std::min(max_reduce, links[i].size_read);
+            utils::Assert(buffer_size == 0 || buffer_size == links[i].buffer_size,
+                          "buffer size inconsistent");
+            buffer_size = links[i].buffer_size;
           }
-          size_up_reduce += nread;
-          start = 0;
         }
-        // peform second phase of reduce
-        const size_t nread = min_read - size_up_reduce;
-        if (nread != 0) {
+        utils::Assert(buffer_size != 0, "must assign buffer_size");
+        // round to type_n4bytes
+        max_reduce = (max_reduce / type_nbytes * type_nbytes);
+        // peform reduce, can be at most two rounds
+        while (size_up_reduce < max_reduce) {
+          // start position
+          size_t start = size_up_reduce % buffer_size;
+          // peform read till end of buffer
+          size_t nread = std::min(buffer_size - start, max_reduce - size_up_reduce);          
           utils::Assert(nread % type_nbytes == 0, "AllReduce: size check");
-          for (size_t i = 0; i < childs.size(); ++i) {
-            reducer(childs[i].buffer_head + start,
-                    sendrecvbuf + size_up_reduce,
-                    nread / type_nbytes,
-                    MPI::Datatype(type_nbytes));
+          for (int i = 0; i < nlink; ++i) {
+            if (i != parent_index) {
+              reducer(links[i].buffer_head + start,
+                      sendrecvbuf + size_up_reduce,
+                      nread / type_nbytes,
+                      MPI::Datatype(type_nbytes));
+            }
           }
+          size_up_reduce += nread;
         }
-        size_up_reduce += nread;
       }
-      if (parent.size() != 0) {
-        // can pass message up to parent
-        if (selecter.CheckWrite(parent[0])) {
-          size_up_out += parent[0]
-              .Send(sendrecvbuf + size_up_out, size_up_reduce - size_up_out);
+      if (parent_index != -1) {
+        // pass message up to parent, can pass data that are already been reduced
+        if (selecter.CheckWrite(links[parent_index].sock)) {
+          size_up_out += links[parent_index].sock.
+              Send(sendrecvbuf + size_up_out, size_up_reduce - size_up_out);
         }
         // read data from parent
-        if (selecter.CheckRead(parent[0])) {
-          size_down_in +=  parent[0]
-              .Recv(sendrecvbuf + size_down_in, total_size - size_down_in);
+        if (selecter.CheckRead(links[parent_index].sock)) {
+          size_down_in +=  links[parent_index].sock.
+              Recv(sendrecvbuf + size_down_in, total_size - size_down_in);
           utils::Assert(size_down_in <= size_up_out, "AllReduce: boundary error");
         }
       } else {
@@ -115,131 +120,95 @@ class SyncManager {
       // check if we finished the job of message passing
       size_t nfinished = size_down_in;
       // can pass message down to childs
-      for (size_t i = 0; i < childs.size(); ++i) {
-        if (selecter.CheckWrite(childs[i].sock)) {
-          childs[i].size_write += childs[i].sock
-              .Send(sendrecvbuf + childs[i].size_write, size_down_in - childs[i].size_write);
+      for (int i = 0; i < nlink; ++i) {
+        if (i != parent_index && selecter.CheckWrite(links[i].sock)) {
+          links[i].WriteFromArray(sendrecvbuf, size_down_in);
+          nfinished = std::min(links[i].size_write, nfinished);
         }
-        nfinished = std::min(childs[i].size_write, nfinished);
       }
       // check boundary condition
-      if (nfinished >= total_size) {
-        utils::Assert(nfinished == total_size, "AllReduce: nfinished check");
-        break;
-      }
+      if (nfinished >= total_size) break;
     }
   }
-  inline void Bcast(std::string *sendrecv_data, int root) {
-    if (parent.size() == 0 && childs.size() == 0) return;
-    // message send to parent
-    size_t size_up_out = 0;
-    // all messages received
+  /*!
+   * \brief broadcast data from root to all nodes
+   * \param sendrecvbuf_ buffer for both sending and recving data
+   * \param type_n4bytes the unit number of bytes the type have
+   * \param count number of elements to be reduced
+   * \param reducer reduce function
+   */  
+  inline void Bcast(void *sendrecvbuf_,
+                    size_t total_size,
+                    int root) {
+    if (links.size() == 0) return;
+    // number of links
+    const int nlink = static_cast<int>(links.size());
+    // size of space already read from data
     size_t size_in = 0;
-    // all headers received so far
-    size_t header_in = 0;
-    // total size of data
-    size_t total_size;
-    // input channel, -1 means parent, -2 means unknown yet
-    // otherwise its child index
-    int in_channel = -2;
-    // root already reads all data in
-    if (root == rank) {
-      in_channel = -3;
-      total_size = size_in = sendrecv_data->length();
-      header_in = sizeof(total_size);
+    // input link, -2 means unknown yet, -1 means this is root
+    int in_link = -2;
+
+    // initialize the link statistics
+    for (int i = 0; i < nlink; ++i) {
+      links[i].ResetSize();
     }
-    // initialize write position
-    for (size_t i = 0; i < childs.size(); ++i) {
-      childs[i].size_write = 0;
+    // root have all the data
+    if (this->rank == root) {
+      size_in = total_size;
+      in_link = -1;
     }
-    const int nchilds = static_cast<int>(childs.size());
-
-    while (true) {
+    
+    // while we have not passed the messages out
+    while(true) {
       selecter.Select();
-      if (selecter.CheckRead(parent[0])) {
-        utils::Assert(in_channel == -2 || in_channel == -1, "invalid in channel");
-        this->BcastRecvData(parent[0], sendrecv_data,
-                            header_in, size_in, total_size);
-        if (header_in != 0) in_channel = -1;
-      }
-      for (int i = 0; i < nchilds; ++i) {
-        if (selecter.CheckRead(childs[i].sock)) {
-          utils::Assert(in_channel == -2 || in_channel == i, "invalid in channel");
-          this->BcastRecvData(parent[0], sendrecv_data,
-                              header_in, size_in, total_size);
-          if (header_in != 0) in_channel = i;
-        }
-      }
-      if (in_channel == -2) continue;
-      if (in_channel != -1) {
-        if (selecter.CheckWrite(parent[0])) {
-          size_t nsend = size_in - size_up_out;
-          if (nsend != 0) {
-            size_up_out += parent[0].Send(&(*sendrecv_data)[0] + size_up_out, nsend);
+      if (in_link == -2) {
+        // probe in-link
+        for (int i = 0; i < nlink; ++i) {
+          if (selecter.CheckRead(links[i].sock)) {
+            links[i].ReadToArray(sendrecvbuf_, total_size);
+            size_in = links[i].size_read;
+            if (size_in != 0) {
+              in_link = i; break;
+            }
           }
         }
       } else {
-        size_up_out = size_in;
+        // read from in link
+        if (in_link >= 0 && selecter.CheckRead(links[in_link].sock)) {
+          links[in_link].ReadToArray(sendrecvbuf_, total_size);
+          size_in = links[in_link].size_read;
+        }
       }
-      size_t nfinished = size_up_out;
-      for (int i = 0; i < nchilds; ++i) {
-        if (in_channel != i) {
-          if (selecter.CheckWrite(childs[i].sock)) {
-            size_t nsend = size_in - childs[i].size_write;
-            if (nsend != 0) {
-              childs[i].size_write += childs[i].sock
-                  .Send(&(*sendrecv_data)[0] + childs[i].size_write, nsend);
-            }
-          }
-          nfinished = std::min(nfinished, childs[i].size_write);
+      size_t nfinished = total_size;
+      // send data to all out-link
+      for (int i = 0; i < nlink; ++i) {
+        if (i != in_link && selecter.CheckWrite(links[i].sock)) {
+          links[i].WriteFromArray(sendrecvbuf_, size_in);
+          nfinished = std::min(nfinished, links[i].size_write);
         }
       }
       // check boundary condition
-      if (nfinished >= total_size) {
-        utils::Assert(nfinished == total_size, "Bcast: nfinished check");
-        break;
-      }
+      if (nfinished >= total_size) break;
     }
   }
-
- private:
-  inline void BcastRecvData(utils::TCPSocket &sock,
-                            std::string *sendrecv_data,   
-                            size_t &header_in,
-                            size_t &size_in,
-                            size_t &total_size) {
-    if (header_in < sizeof(total_size)) {
-      char *p = reinterpret_cast<char*>(&total_size);
-      header_in += sock.Recv(p + size_in, sizeof(total_size) - header_in);
-      if (header_in == sizeof(total_size)) {
-        sendrecv_data->resize(total_size);
-      }
-    } else {
-      size_t nread  = total_size - size_in;
-      if (nread != 0) {
-        size_in += sock
-            .Recv(&(*sendrecv_data)[0] + size_in, nread);
-      }
-    }
-  }
-  
+ private:  
   // 128 MB
   const static size_t kBufferSize = 128;
   // an independent child record
-  struct ChildRecord {
+  struct LinkRecord {
    public:
-    // socket to get data from child
+    // socket to get data from/to link
     utils::TCPSocket sock;
-    // size of data readed from child
+    // size of data readed from link
     size_t size_read;
-    // size of data write into child
+    // size of data sent to the link
     size_t size_write;
     // pointer to buffer head
     char *buffer_head;
     // buffer size, in bytes
     size_t buffer_size;
     // initialize buffer
-    inline void Init(size_t type_nbytes, size_t count) {
+    inline void InitBuffer(size_t type_nbytes, size_t count) {
       utils::Assert(type_nbytes < kBufferSize, "too large type_nbytes");
       size_t n = (type_nbytes * count + 7)/ 8;
       buffer_.resize(std::min(kBufferSize, n));
@@ -247,18 +216,42 @@ class SyncManager {
       buffer_size = buffer_.size() * sizeof(uint64_t) / type_nbytes * type_nbytes;
       // set buffer head
       buffer_head = reinterpret_cast<char*>(BeginPtr(buffer_));
-      // set write head
+    }
+    // reset the recv and sent size
+    inline void ResetSize(void) {
       size_write = size_read = 0;
     }
-    // maximum number of bytes we are able to read
-    // currently without corrupt the data
-    inline void Read(size_t size_up_out) {
-      size_t ngap = size_read - size_up_out;
+    /*! 
+     * \brief read data into ring-buffer, with care not to existing useful override data
+     *  position after protect_start
+     * \param protect_start all data start from protect_start is still needed in buffer
+     *                      read shall not override this 
+     */
+    inline void ReadToRingBuffer(size_t protect_start) {
+      size_t ngap = size_read - protect_start;
       utils::Assert(ngap <= buffer_size, "AllReduce: boundary check");
-      size_t offset = size_read % buffer_size;      
-      size_t nmax = std::min(ngap, buffer_size - offset);
-      size_t len = sock.Recv(buffer_head + offset, nmax);
-      size_read += len;
+      size_t offset = size_read % buffer_size;
+      size_t nmax = std::min(buffer_size - ngap, buffer_size - offset);
+      size_read += sock.Recv(buffer_head + offset, nmax);
+    }
+    /*!
+     * \brief read data into array,
+     * this function can not be used together with ReadToRingBuffer
+     * a link can either read into the ring buffer, or existing array
+     * \param max_size maximum size of array
+     */
+    inline void ReadToArray(void *recvbuf_, size_t max_size) {
+      char *p = static_cast<char*>(recvbuf_);
+      size_read += sock.Recv(p + size_read, max_size - size_read);
+    }
+    /*!
+     * \brief write data in array to sock
+     * \param sendbuf_ head of array
+     * \param max_size maximum size of array
+     */
+    inline void WriteFromArray(const void *sendbuf_, size_t max_size) {
+      const char *p = static_cast<const char*>(sendbuf_);
+      size_write += sock.Send(p + size_write, max_size - size_write);
     }
 
    private:
@@ -267,11 +260,11 @@ class SyncManager {
     std::vector<uint64_t> buffer_;
   };
   // current rank
-  int rank;                  
-  // parent socket, can be of size 0 or 1
-  std::vector<utils::TCPSocket> parent;
-  // sockets of all childs, can be of size 0, 1, 2 or more
-  std::vector<ChildRecord> childs;
+  int rank;
+  // index of parent link, can be -1, meaning this is root of the tree
+  int parent_index;
+  // sockets of all links
+  std::vector<LinkRecord> links;
   // select helper
   utils::SelectHelper selecter;
 };
diff --git a/src/utils/socket.h b/src/utils/socket.h
index 5eebbd160188..299c5468e2f0 100644
--- a/src/utils/socket.h
+++ b/src/utils/socket.h
@@ -149,13 +149,15 @@ class TCPSocket {
 /*! \brief helper data structure to perform select */
 struct SelectHelper {
  public:
-  SelectHelper(void) {}
+  SelectHelper(void) {
+    this->Clear();
+  }
   /*!
    * \brief add file descriptor to watch for read 
    * \param fd file descriptor to be watched
    */
   inline void WatchRead(int fd) {
-    FD_SET(fd, &read_set);
+    read_fds.push_back(fd);
     if (fd > maxfd) maxfd = fd;
   }
   /*!
@@ -163,22 +165,29 @@ struct SelectHelper {
    * \param fd file descriptor to be watched
    */
   inline void WatchWrite(int fd) {
-    FD_SET(fd, &write_set);
+    write_fds.push_back(fd);
     if (fd > maxfd) maxfd = fd;
   }
   /*!
    * \brief Check if the descriptor is ready for read
-   * \param 
+   * \param fd file descriptor to check status
    */
   inline bool CheckRead(int fd) const {
     return FD_ISSET(fd, &read_set);
   }
+  /*!
+   * \brief Check if the descriptor is ready for write
+   * \param fd file descriptor to check status
+   */
   inline bool CheckWrite(int fd) const {
     return FD_ISSET(fd, &write_set);
   }
+  /*!
+   * \brief clear all the monitored descriptors
+   */
   inline void Clear(void) {
-    FD_ZERO(&read_set);
-    FD_ZERO(&write_set);
+    read_fds.clear();
+    write_fds.clear();
     maxfd = 0;
   }
   /*!
@@ -187,6 +196,14 @@ struct SelectHelper {
    * \return number of active descriptors selected
    */
   inline int Select(long timeout = 0) {
+    FD_ZERO(&read_set);
+    FD_ZERO(&write_set);
+    for (size_t i = 0; i < read_fds.size(); ++i) {
+      FD_SET(read_fds[i], &read_set);
+    } 
+    for (size_t i = 0; i < write_fds.size(); ++i) {
+      FD_SET(write_fds[i], &write_set);
+    }
     int ret;
     if (timeout == 0) {
       ret = select(maxfd + 1, &read_set, &write_set, NULL, NULL);
@@ -207,6 +224,7 @@ struct SelectHelper {
  private:
   int maxfd; 
   fd_set read_set, write_set;
+  std::vector<int> read_fds, write_fds;
 };
 }
 }

From 67c5d8a2e6b0063c7cc7c586dfd64ea60cb1356c Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 22 Nov 2014 17:12:19 -0800
Subject: [PATCH 101/166] allreduce server side ok, need to add master

---
 src/sync/sync_tcp.cpp | 227 +++++++++++++++++++++++++++++++++++++++++-
 src/utils/socket.h    | 145 +++++++++++++++++++++++----
 2 files changed, 350 insertions(+), 22 deletions(-)

diff --git a/src/sync/sync_tcp.cpp b/src/sync/sync_tcp.cpp
index 5e46a711903e..cfd0d57cd77f 100644
--- a/src/sync/sync_tcp.cpp
+++ b/src/sync/sync_tcp.cpp
@@ -5,6 +5,8 @@
  * \author Tianqi Chen
  */
 #include <vector>
+#include <string>
+#include <cstring>
 #include "./sync.h"
 #include "../utils/socket.h"
 
@@ -19,8 +21,113 @@ namespace sync {
 /*! \brief implementation of sync goes to here */
 class SyncManager {  
  public:
+  const static int kMagic = 0xff99;
+  SyncManager(void) {
+    master_uri = "localhost";
+    master_port = 9000;
+    slave_port = 9010;
+    nport_trial = 1000;
+  }
+  ~SyncManager(void) {
+    this->Shutdown();
+  }
   // initialize the manager
-  inline void Init(int argc, char *argv[]) {    
+  inline void Init(void) {
+    utils::Assert(links.size() == 0, "can only call Init once");
+    int magic = kMagic;
+    int nchild = 0, nparent = 0;
+    this->host_uri = utils::SockAddr::GetHostName();
+    // get information from master
+    utils::TCPSocket master;
+    master.Create();
+    master.Connect(utils::SockAddr(master_uri.c_str(), master_port));
+    utils::Assert(master.SendAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure 1");
+    utils::Assert(master.RecvAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure 2");
+    utils::Check(magic == kMagic, "sync::Invalid master message, init failure");
+    utils::Assert(master.RecvAll(&rank, sizeof(rank)) == sizeof(rank), "sync::Init failure 3");
+    utils::Assert(master.RecvAll(&world_size, sizeof(world_size)) == sizeof(world_size), "sync::Init failure 4");
+    utils::Assert(master.RecvAll(&nparent, sizeof(nparent)) == sizeof(nparent), "sync::Init failure 5");
+    utils::Assert(master.RecvAll(&nchild, sizeof(nchild)) == sizeof(nchild), "sync::Init failure 6");
+    utils::Assert(nchild >= 0, "in correct number of childs");
+    utils::Assert(nparent == 1 || nparent == 0, "in correct number of parent");
+
+    // create listen
+    utils::TCPSocket sock_listen;
+    sock_listen.Create();
+    int port = sock_listen.TryBindHost(slave_port, slave_port + nport_trial);
+    utils::Check(port != -1, "sync::Init fail to bind the ports specified");
+    sock_listen.Listen();
+
+    if (nparent != 0) {
+      parent_index = 0;
+      links.push_back(LinkRecord());
+      int len, hport;
+      std::string hname;
+      utils::Assert(master.RecvAll(&len, sizeof(len)) == sizeof(len), "sync::Init failure 9");
+      hname.resize(len);
+      utils::Assert(len != 0, "string must not be empty");
+      utils::Assert(master.RecvAll(&hname[0], len) == static_cast<size_t>(len), "sync::Init failure 10");
+      utils::Assert(master.RecvAll(&hport, sizeof(hport)) == sizeof(hport), "sync::Init failure 11");
+      links[0].sock.Create();
+      links[0].sock.Connect(utils::SockAddr(hname.c_str(), hport));
+      utils::Assert(links[0].sock.SendAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure");
+      utils::Assert(links[0].sock.RecvAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure");
+      utils::Check(magic == kMagic, "sync::Init failure, parent magic number mismatch");
+      parent_index = 0;
+    } else {
+      parent_index = -1;
+    }
+    // send back socket listening port to master
+    utils::Assert(master.SendAll(&port, sizeof(port)) == sizeof(port), "sync::Init failure 12");
+    // close connection to master
+    master.Close();
+    // accept links from childs
+    for (int i = 0; i < nchild; ++i) {
+      LinkRecord r; 
+      while (true) {
+        r.sock = sock_listen.Accept();
+        if (links[0].sock.RecvAll(&magic, sizeof(magic)) == sizeof(magic) && magic == kMagic) {
+          utils::Assert(r.sock.SendAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure");
+          break;
+        } else {
+          // not a valid child
+          r.sock.Close();
+        }
+      }
+      links.push_back(r);
+    }
+    // close listening sockets
+    sock_listen.Close();
+    // setup selecter
+    selecter.Clear();
+    for (size_t i = 0; i < links.size(); ++i) {
+      selecter.WatchRead(links[i].sock);
+      selecter.WatchWrite(links[i].sock);
+    }
+    // done
+  }
+  inline void Shutdown(void) {
+    for (size_t i = 0; i < links.size(); ++i) {
+      links[i].sock.Close();
+    }
+    links.clear();
+  }
+  /*! \brief set parameters to the sync manager */
+  inline void SetParam(const char *name, const char *val) {
+    if (!strcmp(name, "master_uri")) master_uri = val;
+    if (!strcmp(name, "master_port")) master_port = atoi(val);
+  }
+  /*! \brief get rank */
+  inline int GetRank(void) const {
+    return rank;
+  }  
+  /*! \brief get rank */
+  inline int GetWorldSize(void) const {
+    return world_size;
+  }
+  /*! \brief get rank */
+  inline std::string GetHost(void) const {
+    return host_uri;
   }
   /*!
    * \brief perform in-place allreduce, on sendrecvbuf 
@@ -259,8 +366,19 @@ class SyncManager {
     // aligned with 64 bits, will be able to perform 64 bits operations freely
     std::vector<uint64_t> buffer_;
   };
+  //------------------
+  // uri of current host, to be set by Init
+  std::string host_uri;
+  // uri of master
+  std::string master_uri;
+  // port of master address
+  int master_port;
+  // port of slave process
+  int slave_port, nport_trial;
   // current rank
   int rank;
+  // world size
+  int world_size;
   // index of parent link, can be -1, meaning this is root of the tree
   int parent_index;
   // sockets of all links
@@ -269,5 +387,112 @@ class SyncManager {
   utils::SelectHelper selecter;
 };
 
+// singleton sync manager
+SyncManager manager;
+
+/*! \brief get rank of current process */
+int GetRank(void) {
+  return manager.GetRank();
+}
+/*! \brief get total number of process */
+int GetWorldSize(void) {
+  return manager.GetWorldSize();
+}
+
+/*! \brief get name of processor */
+std::string GetProcessorName(void) {
+  return manager.GetHost();
+}
+
+bool IsDistributed(void) {
+  return true;
+}
+/*! \brief intiialize the synchronization module */
+void Init(int argc, char *argv[]) {
+  for (int i = 1; i < argc; ++i) {
+    char name[256], val[256];
+    if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) {
+      manager.SetParam(name, val);
+    }
+  }
+  manager.Init();
+}
+
+/*! \brief finalize syncrhonization module */
+void Finalize(void) {
+  manager.Shutdown();
+}
+
+// this can only be used for data that was smaller than 64 bit
+template<typename DType>
+inline void ReduceSum(const void *src_, void *dst_, int len, const MPI::Datatype &dtype) {
+  const DType *src = (const DType*)src_;
+  DType *dst = (DType*)dst_;  
+  for (int i = 0; i < len; ++i) {
+    dst[i] += src[i];
+  }
+}
+template<typename DType>
+inline void ReduceMax(const void *src_, void *dst_, int len, const MPI::Datatype &dtype) {
+  const DType *src = (const DType*)src_;
+  DType *dst = (DType*)dst_;  
+  for (int i = 0; i < len; ++i) {
+    if (src[i] > dst[i]) dst[i] = src[i];
+  }
+}
+template<typename DType>
+inline void ReduceBitOR(const void *src_, void *dst_, int len, const MPI::Datatype &dtype) {
+  const DType *src = (const DType*)src_;
+  DType *dst = (DType*)dst_;  
+  for (int i = 0; i < len; ++i) {
+    dst[i] |= src[i];
+  }
+}
+
+template<>
+void AllReduce<uint32_t>(uint32_t *sendrecvbuf, int count, ReduceOp op) {
+  typedef uint32_t DType;
+  switch(op) {
+    case kBitwiseOR: manager.AllReduce(sendrecvbuf, sizeof(DType), count, ReduceBitOR<DType>); return;
+    default: utils::Error("reduce op not supported");
+  }
+}
+
+template<>
+void AllReduce<float>(float *sendrecvbuf, int count, ReduceOp op) {
+  typedef float DType;
+  switch(op) {
+    case kSum: manager.AllReduce(sendrecvbuf, sizeof(DType), count, ReduceSum<DType>); return;
+    case kMax: manager.AllReduce(sendrecvbuf, sizeof(DType), count, ReduceMax<DType>); return;
+    default: utils::Error("unknown ReduceOp");
+  }
+}
+
+void Bcast(std::string *sendrecv_data, int root) {
+  unsigned len = static_cast<unsigned>(sendrecv_data->length());
+  manager.Bcast(&len, sizeof(len), root);
+  sendrecv_data->resize(len);
+  if (len != 0) {
+    manager.Bcast(&(*sendrecv_data)[0], len, root);  
+  }
+}
+
+// code for reduce handle
+ReduceHandle::ReduceHandle(void) : handle(NULL), htype(NULL) {
+}
+ReduceHandle::~ReduceHandle(void) {}
+
+int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
+  return dtype.type_size;
+}
+void ReduceHandle::Init(ReduceFunction redfunc, size_t type_n4bytes, bool commute) {
+  utils::Assert(handle == NULL, "cannot initialize reduce handle twice");
+  handle = reinterpret_cast<void*>(redfunc);
+}
+void ReduceHandle::AllReduce(void *sendrecvbuf, size_t type_n4bytes, size_t count) {
+  utils::Assert(handle != NULL, "must intialize handle to call AllReduce");
+  manager.AllReduce(sendrecvbuf, type_n4bytes * 4, count, reinterpret_cast<ReduceFunction*>(handle));
+}
+
 }  // namespace sync
 }  // namespace xgboost
diff --git a/src/utils/socket.h b/src/utils/socket.h
index 299c5468e2f0..48b917d6ac70 100644
--- a/src/utils/socket.h
+++ b/src/utils/socket.h
@@ -28,28 +28,34 @@ struct SockAddr {
   SockAddr(const char *url, int port) {
     this->Set(url, port);
   }
+  inline static std::string GetHostName(void) {
+    std::string buf; buf.resize(256);
+    utils::Check(gethostname(&buf[0], 256) != -1, "fail to get host name");
+    return std::string(buf.c_str());
+  }
   /*! 
    * \brief set the address
    * \param url the url of the address
    * \param port the port of address
    */
-  inline void Set(const char *url, int port) {
-    hostent *hp = gethostbyname(url);
-    Check(hp != NULL, "cannot obtain address of %s", url);
+  inline void Set(const char *host, int port) {
+    hostent *hp = gethostbyname(host);
+    Check(hp != NULL, "cannot obtain address of %s", host);
     memset(&addr, 0, sizeof(addr));
     addr.sin_family = AF_INET;
     addr.sin_port = htons(port);
     memcpy(&addr.sin_addr, hp->h_addr_list[0], hp->h_length);
   }
+  /*! \brief return port of the address*/
+  inline int port(void) const {
+    return ntohs(addr.sin_port);
+  }
   /*! \return a string representation of the address */
-  inline std::string ToString(void) const {
+  inline std::string AddrStr(void) const {
     std::string buf; buf.resize(256);
-    const char *s = inet_ntop(AF_INET, &addr, &buf[0], buf.length());
+    const char *s = inet_ntop(AF_INET, &addr.sin_addr, &buf[0], buf.length());
     Assert(s != NULL, "cannot decode address");
-    std::string res = s;
-    sprintf(&buf[0], "%u", ntohs(addr.sin_port));
-    res += ":" + buf;
-    return res;
+    return std::string(s);
   }
 };
 /*! 
@@ -60,11 +66,27 @@ class TCPSocket {
   /*! \brief the file descriptor of socket */
   int sockfd;
   // constructor
-  TCPSocket(void) {}
+  TCPSocket(void) : sockfd(-1) {
+  }
+  explicit TCPSocket(int sockfd) : sockfd(sockfd) {
+  }
+  ~TCPSocket(void) {
+    if (sockfd != -1) this->Close();
+  }
   // default conversion to int
   inline operator int() const {
     return sockfd;
   }
+  /*!
+   * \brief create the socket, call this before using socket
+   * \param af domain
+   */
+  inline void Create(int af = PF_INET) {
+    sockfd = socket(PF_INET, SOCK_STREAM, 0);
+    if (sockfd == -1) {
+      SockError("Create", errno);
+    }
+  }
   /*!
    * \brief start up the socket module
    *   call this before using the sockets
@@ -79,9 +101,9 @@ class TCPSocket {
   /*! 
    * \brief set this socket to use async I/O 
    */
-  inline void SetAsync(void) {
+  inline void SetNonBlock(void) {
     if (fcntl(sockfd, fcntl(sockfd, F_GETFL) | O_NONBLOCK) == -1) {
-      SockError("SetAsync", errno);
+      SockError("SetNonBlock", errno);
     }
   }
   /*!
@@ -91,15 +113,42 @@ class TCPSocket {
   inline void Listen(int backlog = 16) {
     listen(sockfd, backlog);
   }
+  /*! \brief get a new connection */
+  TCPSocket Accept(void) {
+    int newfd = accept(sockfd, NULL, NULL);
+    if (newfd == -1) {
+      SockError("Accept", errno);
+    }
+    return TCPSocket(newfd);
+  }
   /*! 
    * \brief bind the socket to an address 
-   * \param 3
+   * \param addr
    */
   inline void Bind(const SockAddr &addr) {
     if (bind(sockfd, (sockaddr*)&addr.addr, sizeof(addr.addr)) == -1) {
       SockError("Bind", errno);
     }
   }
+  /*! 
+   * \brief try bind the socket to host, from start_port to end_port
+   * \param start_port starting port number to try
+   * \param end_port ending port number to try
+   * \param out_addr the binding address, if successful
+   * \return whether the binding is successful
+   */
+  inline int TryBindHost(int start_port, int end_port) {    
+    for (int port = start_port; port < end_port; ++port) {
+      SockAddr addr("0.0.0.0", port);
+      if (bind(sockfd, (sockaddr*)&addr.addr, sizeof(addr.addr)) == 0) {
+        return port;
+      }
+      if (errno != EADDRINUSE) {
+        SockError("TryBindHost", errno);
+      }
+    }
+    return -1;
+  }                      
   /*! 
    * \brief connect to an address 
    * \param addr the address to connect to
@@ -111,7 +160,11 @@ class TCPSocket {
   }
   /*! \brief close the connection */
   inline void Close(void) {
-    close(sockfd);
+    if (sockfd != -1) {
+      close(sockfd); sockfd = -1;
+    } else {
+      Error("TCPSocket::Close double close the socket or close without create");
+    }
   }
   /*!
    * \brief send data using the socket 
@@ -123,22 +176,72 @@ class TCPSocket {
   inline size_t Send(const void *buf, size_t len, int flag = 0) {
     if (len == 0) return 0;
     ssize_t ret = send(sockfd, buf, len, flag);
-    if (ret == -1) SockError("Send", errno);
+    if (ret == -1) {
+      if (errno == EAGAIN || errno == EWOULDBLOCK) return 0;
+      SockError("Send", errno);
+    }
     return ret;
-  }
+  }  
   /*! 
-   * \brief send data using the socket 
+   * \brief receive data using the socket 
    * \param buf the pointer to the buffer
    * \param len the size of the buffer
    * \param flags extra flags
    * \return size of data actually received 
    */
   inline size_t Recv(void *buf, size_t len, int flags = 0) {
-    if (len == 0) return 0;
+    if (len == 0) return 0;    
     ssize_t ret = recv(sockfd, buf, len, flags);
-    if (ret == -1) SockError("Recv", errno);
+    if (ret == -1) {
+      if (errno == EAGAIN || errno == EWOULDBLOCK) return 0;
+      SockError("Recv", errno);
+    }
     return ret;
-   }
+  } 
+  /*!
+   * \brief peform block write that will attempt to send all data out
+   *    can still return smaller than request when error occurs
+   * \param buf the pointer to the buffer
+   * \param len the size of the buffer
+   * \return size of data actually sent
+   */
+  inline size_t SendAll(const void *buf_, size_t len) {
+    const char *buf = reinterpret_cast<const char*>(buf_);
+    size_t ndone = 0;
+    while (ndone <  len) {
+      ssize_t ret = send(sockfd, buf, len, 0);
+      if (ret == -1) {
+        if (errno == EAGAIN || errno == EWOULDBLOCK) return ndone;
+        SockError("Recv", errno);
+      }
+      buf += ret;
+      ndone += ret;
+    }
+    return ndone;
+  }
+  /*!
+   * \brief peforma block read that will attempt to read all data
+   *    can still return smaller than request when error occurs
+   * \param buf_ the buffer pointer
+   * \param len length of data to recv
+   * \return size of data actually sent
+   */
+  inline size_t RecvAll(void *buf_, size_t len) {
+    char *buf = reinterpret_cast<char*>(buf_);
+    size_t ndone = 0;
+    while (ndone <  len) {
+      ssize_t ret = recv(sockfd, buf, len, MSG_WAITALL);
+      if (ret == -1) {
+        if (errno == EAGAIN || errno == EWOULDBLOCK) return ndone;
+        SockError("Recv", errno);
+      }
+      if (ret == 0) return ndone;
+      buf += ret;
+      ndone += ret;
+    }
+    return ndone;
+  }
+
  private:
   // report an socket error
   inline static void SockError(const char *msg, int errsv) {
@@ -216,7 +319,7 @@ struct SelectHelper {
     if (ret == -1) {
       int errsv = errno;
       char buf[256];
-      Error("Select Error:%s", strerror_r(errsv, buf, sizeof(buf)));      
+      Error("Select Error: %s", strerror_r(errsv, buf, sizeof(buf)));      
     }
     return ret;
   }

From cb1c34aef0199b58000cc525d9017da244c00cf8 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 22 Nov 2014 17:15:05 -0800
Subject: [PATCH 102/166] add nonblocking mode

---
 src/sync/sync_tcp.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/sync/sync_tcp.cpp b/src/sync/sync_tcp.cpp
index cfd0d57cd77f..dd851d05ae5b 100644
--- a/src/sync/sync_tcp.cpp
+++ b/src/sync/sync_tcp.cpp
@@ -101,6 +101,8 @@ class SyncManager {
     // setup selecter
     selecter.Clear();
     for (size_t i = 0; i < links.size(); ++i) {
+      // set the socket to non-blocking mode
+      links[i].sock.SetNonBlock();
       selecter.WatchRead(links[i].sock);
       selecter.WatchWrite(links[i].sock);
     }

From c499dd0f0c597a021ee3b607e7fe238ca8b905ee Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 22 Nov 2014 22:55:43 -0800
Subject: [PATCH 103/166] start testing allreduce

---
 src/sync/submit_tcp.py  | 108 ++++++++++++++++++++++++++++++++++++++++
 src/sync/sync_tcp.cpp   |  90 +++++++++++++++++++++------------
 test/Makefile           |  11 ++--
 test/test_allreduce.cpp |  22 ++++++++
 4 files changed, 196 insertions(+), 35 deletions(-)
 create mode 100755 src/sync/submit_tcp.py
 create mode 100644 test/test_allreduce.cpp

diff --git a/src/sync/submit_tcp.py b/src/sync/submit_tcp.py
new file mode 100755
index 000000000000..79f26edb8d9e
--- /dev/null
+++ b/src/sync/submit_tcp.py
@@ -0,0 +1,108 @@
+#!/usr/bin/python
+"""
+Master script for xgboost submit_tcp
+This script can be used to start jobs of multi-node xgboost using sync_tcp
+
+Tianqi Chen
+"""
+
+import sys
+import os
+import socket
+import struct
+import subprocess
+
+class ExSocket:
+    def __init__(self, sock):
+        self.sock = sock
+    def recvall(self, nbytes):
+        res = []
+        sock = self.sock
+        nread = 0    
+        while nread < nbytes:
+            chunk = self.sock.recv(min(nbytes - nread, 1024), socket.MSG_WAITALL)
+            nread += len(chunk)
+            res.append(chunk)
+        return ''.join(res)
+    def recvint(self):
+        return struct.unpack('!i', self.recvall(4))[0]
+    def sendint(self, n):
+        self.sock.sendall(struct.pack('!i', n))
+    def sendstr(self, s):
+        self.sendint(len(s))
+        self.sock.sendall(s)
+
+# magic number used to verify existence of data
+kMagic = 0xff99
+
+class Master:
+    def __init__(self, port = 9000, port_end = 9999):
+        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        for port in range(port, port_end):
+            try:
+                sock.bind(('', port))
+                self.port = port
+                break
+            except socket.error:
+                continue
+        sock.listen(16)
+        self.sock = sock
+        print 'start listen on %s:%d' % (socket.gethostname(), self.port)
+    def __del__(self):
+        self.sock.close()
+    def slave_args(self):
+        return ['master_uri=%s' % socket.gethostname(),
+                'master_port=%s' % self.port]    
+    def accept_slaves(self, nslave):        
+        slave_addrs = []
+        for rank in range(nslave):
+            while True:
+                fd, s_addr = self.sock.accept()
+                print 'accept connection from %s' % s_addr
+                slave = ExSocket(fd)
+                nparent = int(rank != 0)
+                nchild = 0
+                if (rank + 1) * 2 - 1 < nslave:
+                    nchild += 1
+                if (rank + 1) * 2 < nslave:
+                    nchild += 1                
+                try:
+                    magic = slave.readint()
+                    if magic != kMagic:
+                        slave.sock.close()
+                        continue
+                except socket.error:
+                    slave.sock.close()
+                    continue
+                slave.sendint(kMagic)
+                slave.sendint(rank)
+                slave.sendint(nslave)
+                slave.sendint(nparent)
+                slave.sendint(nchild)
+                if nparent != 0:
+                    parent_index = (rank + 1) / 2 - 1
+                    ptuple = slave_addrs[parent_index]
+                    slave.sendstr(ptuple[0])
+                    slave.sendint(ptuple[1])
+                s_port = slave.recvint()
+                assert rank == len(slave_addrs)
+                slave_addrs.append(s_addr, s_port)
+                break
+        print 'all slaves setup complete'
+        
+def mpi_submit(nslave, args):
+    cmd = ' '.join(['mpirun -n %d' % nslave] + args)
+    print cmd
+    os.system(cmd)
+    
+def submit(nslave, args, fun_submit = mpi_submit):
+    master = Master()
+    fun_submit(nslave, args + master.slave_args())
+    master.accept_slaves(nslave)
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print 'Usage: <nslave> <cmd>'
+        exit(0)
+    submit(int(sys.argv[1]), sys.argv[2:])
+
diff --git a/src/sync/sync_tcp.cpp b/src/sync/sync_tcp.cpp
index dd851d05ae5b..3ae1bf8a5dbb 100644
--- a/src/sync/sync_tcp.cpp
+++ b/src/sync/sync_tcp.cpp
@@ -23,16 +23,64 @@ class SyncManager {
  public:
   const static int kMagic = 0xff99;
   SyncManager(void) {
-    master_uri = "localhost";
+    master_uri = "NULL";
     master_port = 9000;
+    host_uri = "";
     slave_port = 9010;
     nport_trial = 1000;
+    rank = 0;
+    world_size = 1;
+    reduce_buffer_size = 128; 
   }
   ~SyncManager(void) {
     this->Shutdown();
   }
+  inline void Shutdown(void) {
+    for (size_t i = 0; i < links.size(); ++i) {
+      links[i].sock.Close();
+    }
+    links.clear();
+  }
+  /*! \brief set parameters to the sync manager */
+  inline void SetParam(const char *name, const char *val) {
+    if (!strcmp(name, "master_uri")) master_uri = val;
+    if (!strcmp(name, "master_port")) master_port = atoi(val);
+    if (!strcmp(name, "reduce_buffer")) {
+      char unit;
+      unsigned long amount;
+      if (sscanf(val, "%lu%c", &amount, &unit) == 2) {
+        switch (unit) {
+          case 'B': reduce_buffer_size = amount; break;
+          case 'K': reduce_buffer_size = amount << 10UL; break;
+          case 'M': reduce_buffer_size = amount << 20UL; break;
+          case 'G': reduce_buffer_size = amount << 30UL; break;
+          default: utils::Error("invalid format for reduce buffer");
+        }
+      } else {
+        utils::Error("invalid format for reduce_buffer, shhould be {integer}{unit}, unit can be {B, KB, MB, GB}");
+      }
+    }
+  }
+  /*! \brief get rank */
+  inline int GetRank(void) const {
+    return rank;
+  }
+  /*! \brief check whether its distributed mode */
+  inline bool IsDistributed(void) const {
+    return links.size() != 0;
+  }
+  /*! \brief get rank */
+  inline int GetWorldSize(void) const {
+    return world_size;
+  }
+  /*! \brief get rank */
+  inline std::string GetHost(void) const {
+    return host_uri;
+  }
   // initialize the manager
   inline void Init(void) {
+    // single node mode
+    if (master_uri == "NULL") return;
     utils::Assert(links.size() == 0, "can only call Init once");
     int magic = kMagic;
     int nchild = 0, nparent = 0;
@@ -108,29 +156,6 @@ class SyncManager {
     }
     // done
   }
-  inline void Shutdown(void) {
-    for (size_t i = 0; i < links.size(); ++i) {
-      links[i].sock.Close();
-    }
-    links.clear();
-  }
-  /*! \brief set parameters to the sync manager */
-  inline void SetParam(const char *name, const char *val) {
-    if (!strcmp(name, "master_uri")) master_uri = val;
-    if (!strcmp(name, "master_port")) master_port = atoi(val);
-  }
-  /*! \brief get rank */
-  inline int GetRank(void) const {
-    return rank;
-  }  
-  /*! \brief get rank */
-  inline int GetWorldSize(void) const {
-    return world_size;
-  }
-  /*! \brief get rank */
-  inline std::string GetHost(void) const {
-    return host_uri;
-  }
   /*!
    * \brief perform in-place allreduce, on sendrecvbuf 
    *        this function is NOT thread-safe
@@ -159,7 +184,9 @@ class SyncManager {
 
     // initialize the link ring-buffer and pointer
     for (int i = 0; i < nlink; ++i) {
-      if (i != parent_index) links[i].InitBuffer(type_nbytes, count);
+      if (i != parent_index) {
+        links[i].InitBuffer(type_nbytes, count, reduce_buffer_size);
+      }
       links[i].ResetSize();
     }
     // if no childs, no need to reduce
@@ -301,8 +328,6 @@ class SyncManager {
     }
   }
  private:  
-  // 128 MB
-  const static size_t kBufferSize = 128;
   // an independent child record
   struct LinkRecord {
    public:
@@ -317,10 +342,10 @@ class SyncManager {
     // buffer size, in bytes
     size_t buffer_size;
     // initialize buffer
-    inline void InitBuffer(size_t type_nbytes, size_t count) {
-      utils::Assert(type_nbytes < kBufferSize, "too large type_nbytes");
+    inline void InitBuffer(size_t type_nbytes, size_t count, size_t reduce_buffer_size) {
+      utils::Assert(type_nbytes < reduce_buffer_size, "too large type_nbytes");
       size_t n = (type_nbytes * count + 7)/ 8;
-      buffer_.resize(std::min(kBufferSize, n));
+      buffer_.resize(std::min(reduce_buffer_size, n));
       // make sure align to type_nbytes
       buffer_size = buffer_.size() * sizeof(uint64_t) / type_nbytes * type_nbytes;
       // set buffer head
@@ -377,6 +402,8 @@ class SyncManager {
   int master_port;
   // port of slave process
   int slave_port, nport_trial;
+  // reduce buffer size
+  size_t reduce_buffer_size;
   // current rank
   int rank;
   // world size
@@ -405,9 +432,8 @@ int GetWorldSize(void) {
 std::string GetProcessorName(void) {
   return manager.GetHost();
 }
-
 bool IsDistributed(void) {
-  return true;
+  return manager.IsDistributed();
 }
 /*! \brief intiialize the synchronization module */
 void Init(int argc, char *argv[]) {
diff --git a/test/Makefile b/test/Makefile
index 6c943e155aaa..b03c91720d94 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -11,19 +11,24 @@ else
 endif
 
 # specify tensor path
-BIN = test_group_data test_quantile test_sock
-
+BIN = test_group_data test_quantile test_allreduce
+OBJ = sync_tcp.o
 .PHONY: clean all
 
 all: $(BIN) $(MPIBIN)
 
+sync_tcp.o: ../src/sync/sync_tcp.cpp ../src/utils/*.h
+
 test_group_data: test_group_data.cpp ../src/utils/*.h
 test_quantile: test_quantile.cpp ../src/utils/*.h
-test_sock: test_sock.cpp ../src/utils/*.h
+test_allreduce: test_allreduce.cpp ../src/utils/*.h ../src/sync/sync.h sync_tcp.o
 
 $(BIN) : 
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
 
+$(OBJ) : 
+	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
+
 $(MPIBIN) : 
 	$(MPICXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
 
diff --git a/test/test_allreduce.cpp b/test/test_allreduce.cpp
new file mode 100644
index 000000000000..6df6ed0fab12
--- /dev/null
+++ b/test/test_allreduce.cpp
@@ -0,0 +1,22 @@
+#include <sync/sync.h>
+
+using namespace xgboost;
+
+int main(int argc, char *argv[]) {
+  sync::Init(argc, argv);
+  int rank = sync::GetRank();
+  std::string name = sync::GetProcessorName().c_str();
+  printf("start %s rank=%d\n", name.c_str(), rank);
+
+  std::vector<float> ndata(16);
+  for (size_t i = 0; i < ndata.size(); ++i) {
+    ndata[i] = i + rank;
+  }
+  sync::AllReduce(&ndata[0], ndata.size(), sync::kMax);
+  sync::Finalize();
+  for (size_t i = 0; i < ndata.size(); ++i) {
+    printf("%lu: %f\n", i, ndata[i]);
+  }
+  printf("all end\n");
+  return 0;
+}

From 115424826bc779674f3f139d382b573629b3170e Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 23 Nov 2014 11:15:48 -0800
Subject: [PATCH 104/166] basic test pass

---
 multi-node/submit_job_tcp.py              |  32 ++++++
 src/sync/sync.h                           |   3 +-
 src/sync/sync_tcp.cpp                     |  33 +++---
 src/sync/{submit_tcp.py => tcp_master.py} |  30 +++---
 src/utils/socket.h                        |  26 +++--
 test/Makefile                             |   1 -
 test/test_allreduce.cpp                   | 119 ++++++++++++++++++++--
 7 files changed, 193 insertions(+), 51 deletions(-)
 create mode 100755 multi-node/submit_job_tcp.py
 rename src/sync/{submit_tcp.py => tcp_master.py} (79%)
 mode change 100755 => 100644

diff --git a/multi-node/submit_job_tcp.py b/multi-node/submit_job_tcp.py
new file mode 100755
index 000000000000..069f5d577c68
--- /dev/null
+++ b/multi-node/submit_job_tcp.py
@@ -0,0 +1,32 @@
+#!/usr/bin/python
+"""
+This is an example script to create a customized job submit
+script using xgboost sync_tcp mode
+"""
+import sys
+import os
+import subprocess
+# import the tcp_master.py
+# add path to sync
+sys.path.append(os.path.dirname(__file__)+'/../src/sync/')
+import tcp_master as master
+
+def mpi_submit(nslave, args):
+    """
+      customized submit script, that submit nslave jobs, each must contain args as parameter
+      note this can be a lambda function containing additional parameters in input
+      Parameters
+         nslave number of slave process to start up
+         args arguments to launch each job
+              this usually includes the parameters of master_uri and parameters passed into submit
+    """
+    cmd = ' '.join(['mpirun -n %d' % nslave] + args)
+    print cmd
+    subprocess.check_call(cmd, shell = True)
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print 'Usage: <nslave> <cmd>'
+        exit(0)        
+    # call submit, with nslave, the commands to run each job and submit function
+    master.submit(int(sys.argv[1]), sys.argv[2:], fun_submit= mpi_submit)
diff --git a/src/sync/sync.h b/src/sync/sync.h
index c69755b14918..8d053faa04c4 100644
--- a/src/sync/sync.h
+++ b/src/sync/sync.h
@@ -85,7 +85,8 @@ class ReduceHandle {
   void AllReduce(void *sendrecvbuf, size_t type_n4bytes, size_t count);
   /*! \return the number of bytes occupied by the type */
   static int TypeSize(const MPI::Datatype &dtype);
- private:
+
+ protected:
   // handle data field
   void *handle;
   // handle to the type field
diff --git a/src/sync/sync_tcp.cpp b/src/sync/sync_tcp.cpp
index 3ae1bf8a5dbb..2cb4e598e66d 100644
--- a/src/sync/sync_tcp.cpp
+++ b/src/sync/sync_tcp.cpp
@@ -1,7 +1,7 @@
 /*!
  * \file sync_tcp.cpp
  * \brief implementation of sync AllReduce using TCP sockets
- *   with use async socket and tree-shape reduction
+ *   with use non-block socket and tree-shape reduction
  * \author Tianqi Chen
  */
 #include <vector>
@@ -11,7 +11,8 @@
 #include "../utils/socket.h"
 
 namespace MPI {
-struct Datatype {
+class Datatype {
+ public:
   size_t type_size;
   Datatype(size_t type_size) : type_size(type_size) {}
 };
@@ -30,7 +31,7 @@ class SyncManager {
     nport_trial = 1000;
     rank = 0;
     world_size = 1;
-    reduce_buffer_size = 128; 
+    this->SetParam("reduce_buffer", "256MB");
   }
   ~SyncManager(void) {
     this->Shutdown();
@@ -50,10 +51,10 @@ class SyncManager {
       unsigned long amount;
       if (sscanf(val, "%lu%c", &amount, &unit) == 2) {
         switch (unit) {
-          case 'B': reduce_buffer_size = amount; break;
-          case 'K': reduce_buffer_size = amount << 10UL; break;
-          case 'M': reduce_buffer_size = amount << 20UL; break;
-          case 'G': reduce_buffer_size = amount << 30UL; break;
+          case 'B': reduce_buffer_size = (amount + 7)/ 8; break;
+          case 'K': reduce_buffer_size = amount << 7UL; break;
+          case 'M': reduce_buffer_size = amount << 17UL; break;
+          case 'G': reduce_buffer_size = amount << 27UL; break;
           default: utils::Error("invalid format for reduce buffer");
         }
       } else {
@@ -117,16 +118,16 @@ class SyncManager {
       utils::Assert(master.RecvAll(&hname[0], len) == static_cast<size_t>(len), "sync::Init failure 10");
       utils::Assert(master.RecvAll(&hport, sizeof(hport)) == sizeof(hport), "sync::Init failure 11");
       links[0].sock.Create();
-      links[0].sock.Connect(utils::SockAddr(hname.c_str(), hport));
-      utils::Assert(links[0].sock.SendAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure");
-      utils::Assert(links[0].sock.RecvAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure");
+      links[0].sock.Connect(utils::SockAddr(hname.c_str(), hport));      
+      utils::Assert(links[0].sock.SendAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure 12");
+      utils::Assert(links[0].sock.RecvAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure 13");
       utils::Check(magic == kMagic, "sync::Init failure, parent magic number mismatch");
       parent_index = 0;
     } else {
       parent_index = -1;
     }
     // send back socket listening port to master
-    utils::Assert(master.SendAll(&port, sizeof(port)) == sizeof(port), "sync::Init failure 12");
+    utils::Assert(master.SendAll(&port, sizeof(port)) == sizeof(port), "sync::Init failure 14");
     // close connection to master
     master.Close();
     // accept links from childs
@@ -134,10 +135,10 @@ class SyncManager {
       LinkRecord r; 
       while (true) {
         r.sock = sock_listen.Accept();
-        if (links[0].sock.RecvAll(&magic, sizeof(magic)) == sizeof(magic) && magic == kMagic) {
-          utils::Assert(r.sock.SendAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure");
+        if (r.sock.RecvAll(&magic, sizeof(magic)) == sizeof(magic) && magic == kMagic) {
+          utils::Assert(r.sock.SendAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure 15");
           break;
-        } else {
+        } else {         
           // not a valid child
           r.sock.Close();
         }
@@ -150,7 +151,7 @@ class SyncManager {
     selecter.Clear();
     for (size_t i = 0; i < links.size(); ++i) {
       // set the socket to non-blocking mode
-      links[i].sock.SetNonBlock();
+      links[i].sock.SetNonBlock(true);
       selecter.WatchRead(links[i].sock);
       selecter.WatchWrite(links[i].sock);
     }
@@ -343,11 +344,11 @@ class SyncManager {
     size_t buffer_size;
     // initialize buffer
     inline void InitBuffer(size_t type_nbytes, size_t count, size_t reduce_buffer_size) {
-      utils::Assert(type_nbytes < reduce_buffer_size, "too large type_nbytes");
       size_t n = (type_nbytes * count + 7)/ 8;
       buffer_.resize(std::min(reduce_buffer_size, n));
       // make sure align to type_nbytes
       buffer_size = buffer_.size() * sizeof(uint64_t) / type_nbytes * type_nbytes;
+      utils::Assert(type_nbytes < buffer_size, "too large type_nbytes=%lu, buffer_size", type_nbytes, buffer_size);
       // set buffer head
       buffer_head = reinterpret_cast<char*>(BeginPtr(buffer_));
     }
diff --git a/src/sync/submit_tcp.py b/src/sync/tcp_master.py
old mode 100755
new mode 100644
similarity index 79%
rename from src/sync/submit_tcp.py
rename to src/sync/tcp_master.py
index 79f26edb8d9e..c0820f14b2f0
--- a/src/sync/submit_tcp.py
+++ b/src/sync/tcp_master.py
@@ -1,6 +1,5 @@
-#!/usr/bin/python
 """
-Master script for xgboost submit_tcp
+Master script for xgboost, tcp_master
 This script can be used to start jobs of multi-node xgboost using sync_tcp
 
 Tianqi Chen
@@ -11,6 +10,7 @@
 import socket
 import struct
 import subprocess
+from threading import Thread
 
 class ExSocket:
     def __init__(self, sock):
@@ -25,9 +25,9 @@ def recvall(self, nbytes):
             res.append(chunk)
         return ''.join(res)
     def recvint(self):
-        return struct.unpack('!i', self.recvall(4))[0]
+        return struct.unpack('@i', self.recvall(4))[0]
     def sendint(self, n):
-        self.sock.sendall(struct.pack('!i', n))
+        self.sock.sendall(struct.pack('@i', n))
     def sendstr(self, s):
         self.sendint(len(s))
         self.sock.sendall(s)
@@ -58,7 +58,6 @@ def accept_slaves(self, nslave):
         for rank in range(nslave):
             while True:
                 fd, s_addr = self.sock.accept()
-                print 'accept connection from %s' % s_addr
                 slave = ExSocket(fd)
                 nparent = int(rank != 0)
                 nchild = 0
@@ -67,11 +66,13 @@ def accept_slaves(self, nslave):
                 if (rank + 1) * 2 < nslave:
                     nchild += 1                
                 try:
-                    magic = slave.readint()
+                    magic = slave.recvint()
                     if magic != kMagic:
+                        print 'invalid magic number=%d from %s' % (magic, s_addr[0])                        
                         slave.sock.close()
                         continue
                 except socket.error:
+                    print 'sock error in %s' % (s_addr[0])
                     slave.sock.close()
                     continue
                 slave.sendint(kMagic)
@@ -86,23 +87,20 @@ def accept_slaves(self, nslave):
                     slave.sendint(ptuple[1])
                 s_port = slave.recvint()
                 assert rank == len(slave_addrs)
-                slave_addrs.append(s_addr, s_port)
+                slave_addrs.append((s_addr[0], s_port))
+                slave.sock.close()
+                print 'finish starting rank=%d at %s' % (rank, s_addr[0])
                 break
         print 'all slaves setup complete'
         
 def mpi_submit(nslave, args):
     cmd = ' '.join(['mpirun -n %d' % nslave] + args)
     print cmd
-    os.system(cmd)
+    return subprocess.check_call(cmd, shell = True)
     
 def submit(nslave, args, fun_submit = mpi_submit):
     master = Master()
-    fun_submit(nslave, args + master.slave_args())
+    submit_thread = Thread(target = fun_submit, args = (nslave, args + master.slave_args()))
+    submit_thread.start()
     master.accept_slaves(nslave)
-
-if __name__ == '__main__':
-    if len(sys.argv) < 2:
-        print 'Usage: <nslave> <cmd>'
-        exit(0)
-    submit(int(sys.argv[1]), sys.argv[2:])
-
+    submit_thread.join()
diff --git a/src/utils/socket.h b/src/utils/socket.h
index 48b917d6ac70..86d737f98a2f 100644
--- a/src/utils/socket.h
+++ b/src/utils/socket.h
@@ -71,7 +71,8 @@ class TCPSocket {
   explicit TCPSocket(int sockfd) : sockfd(sockfd) {
   }
   ~TCPSocket(void) {
-    if (sockfd != -1) this->Close();
+    // do nothing in the destructor:
+    // the user is responsible for calling Close() on the socket
   }
   // default conversion to int
   inline operator int() const {
@@ -99,11 +100,22 @@ class TCPSocket {
   inline static void Finalize(void) {
   }
   /*! 
-   * \brief set this socket to use async I/O 
+   * \brief set this socket to non-blocking (or back to blocking) mode
+   * \param non_block whether to put the socket in non-blocking mode; if false,
+   *        the socket is set back to blocking mode
    */
-  inline void SetNonBlock(void) {
-    if (fcntl(sockfd, fcntl(sockfd, F_GETFL) | O_NONBLOCK) == -1) {
-      SockError("SetNonBlock", errno);
+  inline void SetNonBlock(bool non_block) {
+    int flag = fcntl(sockfd, F_GETFL, 0);
+    if (flag == -1) {
+      SockError("SetNonBlock-1", errno);
+    }
+    if (non_block) {
+      flag |= O_NONBLOCK;
+    } else {
+      flag &= ~O_NONBLOCK;
+    }
+    if (fcntl(sockfd, F_SETFL, flag) == -1) {
+      SockError("SetNonBlock-2", errno);
     }
   }
   /*!
@@ -209,7 +221,7 @@ class TCPSocket {
     const char *buf = reinterpret_cast<const char*>(buf_);
     size_t ndone = 0;
     while (ndone <  len) {
-      ssize_t ret = send(sockfd, buf, len, 0);
+      ssize_t ret = send(sockfd, buf, len - ndone, 0);
       if (ret == -1) {
         if (errno == EAGAIN || errno == EWOULDBLOCK) return ndone;
         SockError("Recv", errno);
@@ -230,7 +242,7 @@ class TCPSocket {
     char *buf = reinterpret_cast<char*>(buf_);
     size_t ndone = 0;
     while (ndone <  len) {
-      ssize_t ret = recv(sockfd, buf, len, MSG_WAITALL);
+      ssize_t ret = recv(sockfd, buf, len - ndone, MSG_WAITALL);
       if (ret == -1) {
         if (errno == EAGAIN || errno == EWOULDBLOCK) return ndone;
         SockError("Recv", errno);
diff --git a/test/Makefile b/test/Makefile
index b03c91720d94..571d1189faf6 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -22,7 +22,6 @@ sync_tcp.o: ../src/sync/sync_tcp.cpp ../src/utils/*.h
 test_group_data: test_group_data.cpp ../src/utils/*.h
 test_quantile: test_quantile.cpp ../src/utils/*.h
 test_allreduce: test_allreduce.cpp ../src/utils/*.h ../src/sync/sync.h sync_tcp.o
-
 $(BIN) : 
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
 
diff --git a/test/test_allreduce.cpp b/test/test_allreduce.cpp
index 6df6ed0fab12..496039e28aea 100644
--- a/test/test_allreduce.cpp
+++ b/test/test_allreduce.cpp
@@ -1,22 +1,121 @@
 #include <sync/sync.h>
+#include <utils/utils.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
 
 using namespace xgboost;
 
-int main(int argc, char *argv[]) {
-  sync::Init(argc, argv);
+inline void TestMax(size_t n) {
+  int rank = sync::GetRank();
+  int nproc = sync::GetWorldSize();
+  
+  std::vector<float> ndata(n);
+  for (size_t i = 0; i < ndata.size(); ++i) {
+    ndata[i] = (i * (rank+1)) % 111;
+  }
+  sync::AllReduce(&ndata[0], ndata.size(), sync::kMax);  
+  for (size_t i = 0; i < ndata.size(); ++i) {
+    float rmax = (i * 1) % 111;
+    for (int r = 0; r < nproc; ++r) {
+      rmax = std::max(rmax, (float)((i * (r+1)) % 111));
+    }
+    utils::Check(rmax == ndata[i], "[%d] TestMax check failure", rank);
+  }
+}
+
+inline void TestSum(size_t n) {
   int rank = sync::GetRank();
-  std::string name = sync::GetProcessorName().c_str();
-  printf("start %s rank=%d\n", name.c_str(), rank);
+  int nproc = sync::GetWorldSize();
+  const int z = 131;
 
-  std::vector<float> ndata(16);
+  std::vector<float> ndata(n);
   for (size_t i = 0; i < ndata.size(); ++i) {
-    ndata[i] = i + rank;
+    ndata[i] = (i * (rank+1)) % z;
   }
-  sync::AllReduce(&ndata[0], ndata.size(), sync::kMax);
-  sync::Finalize();
+  sync::AllReduce(&ndata[0], ndata.size(), sync::kSum);  
   for (size_t i = 0; i < ndata.size(); ++i) {
-    printf("%lu: %f\n", i, ndata[i]);
+    float rsum = 0.0f;
+    for (int r = 0; r < nproc; ++r) {
+      rsum += (float)((i * (r+1)) % z);
+    }
+    utils::Check(fabsf(rsum - ndata[i]) < 1e-5 ,
+                 "[%d] TestSum check failure, local=%g, allreduce=%g", rank, rsum, ndata[i]);
+  }
+}
+
+struct Rec {  // test record reduced field by field: running max, running min and running sum
+  double rmax;  // largest value seen so far
+  double rmin;  // smallest value seen so far
+  double rsum;  // sum of all values seen so far
+  Rec() {}  // leaves the fields uninitialized; callers assign before use
+  Rec(double r) {  // start all three statistics from a single value r
+    rmax = rmin = rsum = r;
   }
-  printf("all end\n");
+  inline void Reduce(const Rec &b) {  // merge the statistics of b into this record
+    rmax = std::max(b.rmax, rmax);
+    rmin = std::min(b.rmin, rmin);  // rmin tracks the minimum, so it must combine with min, not max
+    rsum += b.rsum;
+  }
+  inline void CheckSameAs(const Rec &b) {  // report an error unless b matches this record (sum within 1e-6)
+    if (rmax != b.rmax || rmin != b.rmin || fabs(rsum - b.rsum) > 1e-6) {
+      utils::Error("[%d] TestReducer check failure", sync::GetRank());
+    }
+  }
+};
+
+inline void TestReducer(int n) {
+  int rank = sync::GetRank();
+  int nproc = sync::GetWorldSize();
+  const int z = 131;
+  sync::Reducer<Rec> red;
+  std::vector<Rec> ndata(n);
+  for (size_t i = 0; i < ndata.size(); ++i) {
+    ndata[i] = Rec((i * (rank+1)) % z);
+  }
+  red.AllReduce(&ndata[0], ndata.size());  
+                
+  for (size_t i = 0; i < ndata.size(); ++i) {
+    Rec rec((i * 1) % z);
+    for (int r = 1; r < nproc; ++r) {
+      rec.Reduce(Rec((i * (r+1)) % z));
+    }
+    rec.CheckSameAs(ndata[i]);
+  }  
+}
+
+
+inline void TestBcast(size_t n, int root) {
+  int rank = sync::GetRank();
+  std::string s; s.resize(n);      
+  for (size_t i = 0; i < n; ++i) {
+    s[i] = char(i % 126 + 1);
+  }
+  std::string res;
+  if (root == rank) {
+    res = s;
+    sync::Bcast(&res, root);
+  } else {
+    sync::Bcast(&res, root);
+  }
+  utils::Check(res == s, "[%d] TestBcast fail", rank);
+}
+
+int main(int argc, char *argv[]) {
+  if (argc < 2) {
+    printf("Usage: <ndata>\n");
+    return 0;
+  }
+  int n = atoi(argv[1]);
+  sync::Init(argc, argv);
+  int rank = sync::GetRank();
+  //int nproc = sync::GetWorldSize();
+  std::string name = sync::GetProcessorName();
+  printf("[%d] start at %s\n", rank, name.c_str());
+  TestMax(n);
+  TestSum(n);
+  TestReducer(n);
+  sync::Finalize();
+  printf("[%d] all check pass\n", rank);
   return 0;
 }

From 69b2f31098cfcd51eef77a8719e0a3b063da86ee Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 23 Nov 2014 11:31:34 -0800
Subject: [PATCH 105/166] bugfix in allreduce

---
 src/sync/sync_tcp.cpp   | 12 ++++++++----
 test/test_allreduce.cpp |  3 +++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/sync/sync_tcp.cpp b/src/sync/sync_tcp.cpp
index 2cb4e598e66d..b21451d1ca9c 100644
--- a/src/sync/sync_tcp.cpp
+++ b/src/sync/sync_tcp.cpp
@@ -258,8 +258,10 @@ class SyncManager {
       size_t nfinished = size_down_in;
       // can pass message down to childs
       for (int i = 0; i < nlink; ++i) {
-        if (i != parent_index && selecter.CheckWrite(links[i].sock)) {
-          links[i].WriteFromArray(sendrecvbuf, size_down_in);
+        if (i != parent_index) {
+          if (selecter.CheckWrite(links[i].sock)) {
+            links[i].WriteFromArray(sendrecvbuf, size_down_in);
+          }
           nfinished = std::min(links[i].size_write, nfinished);
         }
       }
@@ -319,8 +321,10 @@ class SyncManager {
       size_t nfinished = total_size;
       // send data to all out-link
       for (int i = 0; i < nlink; ++i) {
-        if (i != in_link && selecter.CheckWrite(links[i].sock)) {
-          links[i].WriteFromArray(sendrecvbuf_, size_in);
+        if (i != in_link) {
+          if (selecter.CheckWrite(links[i].sock)) {
+            links[i].WriteFromArray(sendrecvbuf_, size_in);
+          }
           nfinished = std::min(nfinished, links[i].size_write);
         }
       }
diff --git a/test/test_allreduce.cpp b/test/test_allreduce.cpp
index 496039e28aea..4a47d7f559bd 100644
--- a/test/test_allreduce.cpp
+++ b/test/test_allreduce.cpp
@@ -113,8 +113,11 @@ int main(int argc, char *argv[]) {
   std::string name = sync::GetProcessorName();
   printf("[%d] start at %s\n", rank, name.c_str());
   TestMax(n);
+  printf("[%d] TestMax pass\n", rank);
   TestSum(n);
+  printf("[%d] TestSum pass\n", rank);
   TestReducer(n);
+  printf("[%d] TestReducer pass\n", rank);
   sync::Finalize();
   printf("[%d] all check pass\n", rank);
   return 0;

From 5f08313cb232a1a4c85b0d465cb28d1aceca473e Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 23 Nov 2014 14:03:59 -0800
Subject: [PATCH 106/166] make wrapper ok

---
 Makefile                                    |  4 +--
 demo/binary_classification/runexp.sh        |  8 +++---
 demo/guide-python/runall.sh                 |  4 +--
 multi-node/README.md                        | 23 +++++++++-------
 multi-node/col-split/README.md              |  7 ++++-
 multi-node/col-split/mushroom-col-python.sh | 22 ++++++++++++++++
 multi-node/col-split/mushroom-col.py        | 29 +++++++++++++++++++++
 multi-node/submit_job_tcp.py                |  4 +++
 src/io/io.cpp                               |  5 ++++
 src/io/simple_dmatrix-inl.hpp               | 11 ++++++--
 src/sync/sync_tcp.cpp                       |  4 ++-
 test/Makefile                               |  2 +-
 wrapper/xgboost.py                          | 20 +++++++++++++-
 wrapper/xgboost_wrapper.cpp                 | 17 ++++++++++++
 wrapper/xgboost_wrapper.h                   | 24 ++++++++++++++++-
 15 files changed, 160 insertions(+), 24 deletions(-)
 create mode 100755 multi-node/col-split/mushroom-col-python.sh
 create mode 100644 multi-node/col-split/mushroom-col.py

diff --git a/Makefile b/Makefile
index 172a7607b4e9..f11c20e2168f 100644
--- a/Makefile
+++ b/Makefile
@@ -32,8 +32,8 @@ sync_tcp.o: src/sync/sync_tcp.cpp
 sync_empty.o: src/sync/sync_empty.cpp 
 main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h 
 xgboost-mpi:  updater.o gbm.o io.o main.o sync_mpi.o 
-xgboost:  updater.o gbm.o io.o main.o sync_empty.o
-wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h  updater.o gbm.o io.o sync_empty.o
+xgboost:  updater.o gbm.o io.o main.o sync_tcp.o
+wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h  updater.o gbm.o io.o sync_tcp.o
 
 $(BIN) : 
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
diff --git a/demo/binary_classification/runexp.sh b/demo/binary_classification/runexp.sh
index c1f191e61780..68c3e6fb90f9 100755
--- a/demo/binary_classification/runexp.sh
+++ b/demo/binary_classification/runexp.sh
@@ -4,12 +4,12 @@ python mapfeat.py
 # split train and test
 python mknfold.py agaricus.txt 1
 # training and output the models
-mpirun ../../xgboost mushroom.conf
+../../xgboost mushroom.conf
 # output prediction task=pred 
-mpirun ../../xgboost mushroom.conf task=pred model_in=0002.model
+../../xgboost mushroom.conf task=pred model_in=0002.model
 # print the boosters of 00002.model in dump.raw.txt
-mpirun ../../xgboost mushroom.conf task=dump model_in=0002.model name_dump=dump.raw.txt 
+../../xgboost mushroom.conf task=dump model_in=0002.model name_dump=dump.raw.txt 
 # use the feature map in printing for better visualization
-mpirun ../../xgboost mushroom.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
+../../xgboost mushroom.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
 cat dump.nice.txt
 
diff --git a/demo/guide-python/runall.sh b/demo/guide-python/runall.sh
index 5317186d5045..8f4f9832a6d4 100755
--- a/demo/guide-python/runall.sh
+++ b/demo/guide-python/runall.sh
@@ -4,5 +4,5 @@ python custom_objective.py
 python boost_from_prediction.py
 python generalized_linear_model.py
 python cross_validation.py
-python predict_leaf_index.py
-rm -rf *~ *.model *.buffer 
\ No newline at end of file
+python predict_leaf_indices.py
+rm -rf *~ *.model *.buffer 
diff --git a/multi-node/README.md b/multi-node/README.md
index 6f1008514481..d1e6418481a6 100644
--- a/multi-node/README.md
+++ b/multi-node/README.md
@@ -4,17 +4,21 @@ This folder contains information about experimental version of distributed xgboo
 
 Build
 =====
-* You will need to have MPI
 * In the root folder, run ```make mpi```, this will give you xgboost-mpi
+  - You will need to have MPI to build xgboost-mpi
+* Alternatively, you can run ```make```, this will give you xgboost, which uses a beta built-in allreduce
+  - You do not need MPI to build this; you can modify [submit_job_tcp.py](submit_job_tcp.py) to submit the job with any job scheduler you like
 
 Design Choice
 =====
-* Does distributed xgboost reply on MPI?
-  - Yes, but the dependency is isolated in [sync module](../src/sync/sync.h)
+* Must distributed xgboost rely on the MPI library?
+  - No, XGBoost relies on the MPI protocol that provides Broadcast and AllReduce
+  - The dependency is isolated in [sync module](../src/sync/sync.h)
   - All other parts of code uses interface defined in sync.h
-  - sync_mpi.cpp is a implementation of sync interface using standard MPI library
-  - Specificially, xgboost reply on MPI protocol that provide Broadcast and AllReduce,
-     if there are platform/framework that implements these protocol, xgboost should naturally extends to these platform
+  - [sync_mpi.cpp](../src/sync/sync_mpi.cpp) is an implementation of the sync interface using the standard MPI library; to use xgboost-mpi, you need an MPI library
+  - If a platform/framework implements these protocols, xgboost should naturally extend to that platform
+  - As an example, [sync_tcp.cpp](../src/sync/sync_tcp.cpp) is an implementation of the interface using TCP, and is linked with xgboost by default
+
 * How is the data distributed?
   - There are two solvers in distributed xgboost
   - Column-based solver split data by column, each node work on subset of columns, 
@@ -26,10 +30,11 @@ Design Choice
 
 Usage
 ====
-* The current code run in MPI enviroment, you will need to have a network filesystem,
-    or copy data to local file system before running the code
+* You will need a network filesystem, or you must copy the data to the local file system before running the code
+* xgboost-mpi runs in an MPI environment
+* xgboost can be used together with [submit_job_tcp.py](submit_job_tcp.py) on other types of job schedulers
 * ***Note*** The distributed version is still multi-threading optimized.
-    You should run one xgboost-mpi per node that takes most available CPU,
+    You should run one process per node that takes most available CPU,
     this will reduce the communication overhead and improve the performance.
    - One way to do that is limit mpi slot in each machine to be 1, or reserve nthread processors for each process.
 * Examples:
diff --git a/multi-node/col-split/README.md b/multi-node/col-split/README.md
index bdafb2e32385..9a9b8dd243b3 100644
--- a/multi-node/col-split/README.md
+++ b/multi-node/col-split/README.md
@@ -1,6 +1,11 @@
 Distributed XGBoost: Column Split Version
 ====
-* run ```bash mushroom-row.sh <n-mpi-process>```
+* run ```bash mushroom-col.sh <n-mpi-process>```
+* run ```bash mushroom-col-tcp.sh <n-process>```
+  - mushroom-col-tcp.sh starts xgboost job using xgboost's buildin allreduce 
+* run ```bash mushroom-col-python.sh <n-process>```
+  - mushroom-col-python.sh starts xgboost python job using xgboost's buildin all reduce
+  - see mushroom-col.py
 
 How to Use
 ====
diff --git a/multi-node/col-split/mushroom-col-python.sh b/multi-node/col-split/mushroom-col-python.sh
new file mode 100755
index 000000000000..45008a1b47dd
--- /dev/null
+++ b/multi-node/col-split/mushroom-col-python.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: nprocess"
+    exit -1
+fi
+
+#
+# This script is same as mushroom-col except that we will be using xgboost python module
+# 
+# xgboost used built in tcp-based allreduce module, and can be run on more enviroment, so long as we know how to start job by modifying ../submit_job_tcp.py
+#
+rm -rf train.col* *.model
+k=$1
+
+# split the lib svm file into k subfiles
+python splitsvm.py ../../demo/data/agaricus.txt.train train $k
+
+# run xgboost mpi
+../submit_job_tcp.py $k python mushroom-col.py
+
+cat dump.nice.$k.txt
diff --git a/multi-node/col-split/mushroom-col.py b/multi-node/col-split/mushroom-col.py
new file mode 100644
index 000000000000..3e24f5f2c9c7
--- /dev/null
+++ b/multi-node/col-split/mushroom-col.py
@@ -0,0 +1,29 @@
+import os
+import sys
+sys.path.append(os.path.dirname(__file__)+'/../wrapper')
+import xgboost as xgb
+# this is example script of running distributed xgboost using python
+
+# call this additional function to intialize the xgboost sync module
+# in distributed mode
+xgb.sync_init(sys.argv)
+rank = xgb.sync_get_rank()
+# read in dataset
+dtrain = xgb.DMatrix('train.col%d' % rank)
+param = {'max_depth':3, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
+param['dsplit'] = 'col'
+nround = 3
+
+if rank == 0:
+    dtest = xgb.DMatrix('../../demo/data/agaricus.txt.test')
+    model = xgb.train(param, dtrain, nround, [(dtrain, 'train') , (dtest, 'test')])
+else:
+    # if it is a slave node, do not run evaluation
+    model = xgb.train(param, dtrain, nround)
+
+if rank == 0:
+    model.save_model('%04d.model' % nround)
+    # dump model with feature map
+    model.dump_model('dump.nice.%d.txt' % xgb.sync_get_world_size(),'../../demo/data/featmap.txt')
+# shutdown the synchronization module
+xgb.sync_finalize()
diff --git a/multi-node/submit_job_tcp.py b/multi-node/submit_job_tcp.py
index 069f5d577c68..aa415d07a682 100755
--- a/multi-node/submit_job_tcp.py
+++ b/multi-node/submit_job_tcp.py
@@ -11,6 +11,10 @@
 sys.path.append(os.path.dirname(__file__)+'/../src/sync/')
 import tcp_master as master
 
+#
+#  Note: this submit script is only used for example purpose
+#  It does not have to be mpirun, it can be any job submission script that starts the job, qsub, hadoop streaming etc.
+#  
 def mpi_submit(nslave, args):
     """
       customized submit script, that submit nslave jobs, each must contain args as parameter
diff --git a/src/io/io.cpp b/src/io/io.cpp
index 8a4579ab81f3..0f9611e67f5c 100644
--- a/src/io/io.cpp
+++ b/src/io/io.cpp
@@ -13,6 +13,11 @@
 namespace xgboost {
 namespace io {
 DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
+  if (!strcmp(fname, "stdin")) {
+    DMatrixSimple *dmat = new DMatrixSimple();
+    dmat->LoadText(fname, silent);
+    return dmat;
+  }
   std::string tmp_fname;
   const char *fname_ext = NULL;
   if (strchr(fname, ';') != NULL) {
diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp
index 9165a5832e1c..f3cf6425e017 100644
--- a/src/io/simple_dmatrix-inl.hpp
+++ b/src/io/simple_dmatrix-inl.hpp
@@ -84,7 +84,12 @@ class DMatrixSimple : public DataMatrix {
   inline void LoadText(const char* fname, bool silent = false) {
     using namespace std;
     this->Clear();
-    FILE* file = utils::FopenCheck(fname, "r");
+    FILE* file;
+    if (!strcmp(fname, "stdin")) {
+      file = stdin;
+    } else {
+      file = utils::FopenCheck(fname, "r");      
+    }
     float label; bool init = true;
     char tmp[1024];
     std::vector<RowBatch::Entry> feats;
@@ -112,7 +117,9 @@ class DMatrixSimple : public DataMatrix {
                     static_cast<unsigned long>(info.num_col()),
                     static_cast<unsigned long>(row_data_.size()), fname);
     }
-    fclose(file);
+    if (file != stdin) {
+      fclose(file);
+    }
     // try to load in additional file
     std::string name = fname;
     std::string gname = name + ".group";
diff --git a/src/sync/sync_tcp.cpp b/src/sync/sync_tcp.cpp
index b21451d1ca9c..330b5318de06 100644
--- a/src/sync/sync_tcp.cpp
+++ b/src/sync/sync_tcp.cpp
@@ -352,7 +352,7 @@ class SyncManager {
       buffer_.resize(std::min(reduce_buffer_size, n));
       // make sure align to type_nbytes
       buffer_size = buffer_.size() * sizeof(uint64_t) / type_nbytes * type_nbytes;
-      utils::Assert(type_nbytes < buffer_size, "too large type_nbytes=%lu, buffer_size", type_nbytes, buffer_size);
+      utils::Assert(type_nbytes <= buffer_size, "too large type_nbytes=%lu, buffer_size=%lu", type_nbytes, buffer_size);
       // set buffer head
       buffer_head = reinterpret_cast<char*>(BeginPtr(buffer_));
     }
@@ -487,6 +487,8 @@ void AllReduce<uint32_t>(uint32_t *sendrecvbuf, int count, ReduceOp op) {
   typedef uint32_t DType;
   switch(op) {
     case kBitwiseOR: manager.AllReduce(sendrecvbuf, sizeof(DType), count, ReduceBitOR<DType>); return;
+    case kSum: manager.AllReduce(sendrecvbuf, sizeof(DType), count, ReduceSum<DType>); return;
+    case kMax: manager.AllReduce(sendrecvbuf, sizeof(DType), count, ReduceMax<DType>); return;
     default: utils::Error("reduce op not supported");
   }
 }
diff --git a/test/Makefile b/test/Makefile
index 571d1189faf6..a702d073fa1c 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -1,5 +1,5 @@
 export CC  = gcc
-export CXX = clang++
+export CXX = g++
 export MPICXX = mpicxx
 export LDFLAGS= -pthread -lm 
 export CFLAGS = -Wall -O3 -msse2  -Wno-unknown-pragmas -fPIC -I../src
diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index 08aacb90eab7..d351928dc0a5 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -33,7 +33,10 @@
 xglib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float)
 xglib.XGBoosterEvalOneIter.restype = ctypes.c_char_p
 xglib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p)
-
+# sync function
+xglib.XGSyncGetRank.restype = ctypes.c_int
+xglib.XGSyncGetWorldSize.restype = ctypes.c_int
+# initialize communication module
 
 def ctypes2numpy(cptr, length, dtype):
     """convert a ctypes pointer array to numpy array """
@@ -553,3 +556,18 @@ def cv(params, dtrain, num_boost_round = 10, nfold=3, metrics=[], \
         sys.stderr.write(res+'\n')
         results.append(res)
     return results
+
+# synchronization module
+def sync_init(args = sys.argv):
+    arr = (ctypes.c_char_p * len(args))()
+    arr[:] = args
+    xglib.XGSyncInit(len(args), arr)
+    
+def sync_finalize():
+    xglib.XGSyncFinalize()
+
+def sync_get_rank():
+    return int(xglib.XGSyncGetRank())
+
+def sync_get_world_size():
+    return int(xglib.XGSyncGetWorldSize())
diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp
index d0efc4bd0873..63fb310c6a95 100644
--- a/wrapper/xgboost_wrapper.cpp
+++ b/wrapper/xgboost_wrapper.cpp
@@ -80,6 +80,23 @@ class Booster: public learner::BoostLearner {
 using namespace xgboost::wrapper;
 
 extern "C"{
+  void XGSyncInit(int argc, char *argv[]) {
+    sync::Init(argc, argv);
+    if (sync::IsDistributed()) {
+      std::string pname = xgboost::sync::GetProcessorName();
+      utils::Printf("distributed job start %s:%d\n", pname.c_str(), xgboost::sync::GetRank());
+    }
+  }
+  void XGSyncFinalize(void) {
+    sync::Finalize();
+  }
+  int XGSyncGetRank(void) {
+    int rank = xgboost::sync::GetRank();
+    return rank;
+  }
+  int XGSyncGetWorldSize(void) {
+    return sync::GetWorldSize();
+  }
   void* XGDMatrixCreateFromFile(const char *fname, int silent) {
     return LoadDataMatrix(fname, silent != 0, false);
   }
diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h
index 16d54f62b545..c0379a35f35b 100644
--- a/wrapper/xgboost_wrapper.h
+++ b/wrapper/xgboost_wrapper.h
@@ -17,6 +17,28 @@ typedef unsigned long bst_ulong;
 #ifdef __cplusplus
 extern "C" {
 #endif
+  /*!
+   * \brief initialize the sync module; this is needed when running in distributed mode
+   *        normally, argv needs to contain master_uri and master_port
+   *        when the job is started by the submit_job_tcp script, passing the program arguments through is enough
+   * \param argc number of arguments
+   * \param argv the arguments to be passed to the sync module
+   */
+  XGB_DLL void XGSyncInit(int argc, char *argv[]);
+  /*!
+   * \brief finalize the sync module, call this when everything is done
+   */
+  XGB_DLL void XGSyncFinalize(void);
+  /*!
+   * \brief get the rank from the sync module
+   * \return the rank of this process as reported by the sync module
+   */
+  XGB_DLL int XGSyncGetRank(void);
+  /*!
+   * \brief get the world size from the sync module
+   * \return the number of distributed jobs running in the group
+   */
+  XGB_DLL int XGSyncGetWorldSize(void);
   /*!
    * \brief load a data matrix 
    * \return a loaded data matrix
@@ -41,7 +63,7 @@ extern "C" {
    * \param col_ptr pointer to col headers
    * \param indices findex
    * \param data fvalue
-   * \param nindptr number of rows in the matix + 1 
+   * \param nindptr number of rows in the matrix + 1
    * \param nelem number of nonzero elements in the matrix
    * \return created dmatrix
    */

From 373620503ab97aa3ec5d9f12eb34a5be1a839c2a Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 23 Nov 2014 14:08:34 -0800
Subject: [PATCH 107/166] ok

---
 multi-node/col-split/README.md           |  1 +
 multi-node/col-split/mushroom-col-tcp.sh | 28 ++++++++++++++++++++++++
 multi-node/row-split/README.md           |  2 ++
 multi-node/row-split/machine-row-tcp.sh  | 24 ++++++++++++++++++++
 4 files changed, 55 insertions(+)
 create mode 100755 multi-node/col-split/mushroom-col-tcp.sh
 create mode 100755 multi-node/row-split/machine-row-tcp.sh

diff --git a/multi-node/col-split/README.md b/multi-node/col-split/README.md
index 9a9b8dd243b3..cf6622b53131 100644
--- a/multi-node/col-split/README.md
+++ b/multi-node/col-split/README.md
@@ -1,6 +1,7 @@
 Distributed XGBoost: Column Split Version
 ====
 * run ```bash mushroom-col.sh <n-mpi-process>```
+  - mushroom-col.sh starts xgboost-mpi job
 * run ```bash mushroom-col-tcp.sh <n-process>```
   - mushroom-col-tcp.sh starts xgboost job using xgboost's buildin allreduce 
 * run ```bash mushroom-col-python.sh <n-process>```
diff --git a/multi-node/col-split/mushroom-col-tcp.sh b/multi-node/col-split/mushroom-col-tcp.sh
new file mode 100755
index 000000000000..7257f98907da
--- /dev/null
+++ b/multi-node/col-split/mushroom-col-tcp.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: nprocess"
+    exit -1
+fi
+
+#
+# This script is the same as mushroom-col.sh, except that it uses xgboost instead of xgboost-mpi.
+# xgboost uses a built-in TCP-based allreduce module and can run in more environments, as long as we know how to start the job by modifying ../submit_job_tcp.py
+#
+rm -rf train.col* *.model
+k=$1
+
+# split the lib svm file into k subfiles
+python splitsvm.py ../../demo/data/agaricus.txt.train train $k
+
+# run xgboost with the built-in TCP-based allreduce
+../submit_job_tcp.py $k ../../xgboost mushroom-col.conf dsplit=col
+
+# the model can be directly loaded by the single-machine xgboost solver, as usual
+../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
+
+# run for one round, and continue training
+../submit_job_tcp.py $k ../../xgboost mushroom-col.conf dsplit=col num_round=1
+../submit_job_tcp.py $k ../../xgboost mushroom-col.conf dsplit=col model_in=0001.model
+
+cat dump.nice.$k.txt
\ No newline at end of file
diff --git a/multi-node/row-split/README.md b/multi-node/row-split/README.md
index 4c427f3ec3bd..807b0608def6 100644
--- a/multi-node/row-split/README.md
+++ b/multi-node/row-split/README.md
@@ -3,6 +3,8 @@ Distributed XGBoost: Row Split Version
 * Mushroom: run ```bash mushroom-row.sh <n-mpi-process>```
 * Machine: run ```bash machine-row.sh <n-mpi-process>```
   - Machine case also include example to continue training from existing model
+* Machine TCP: run ```bash machine-row-tcp.sh <n-process>```
+  - machine-row-tcp.sh starts xgboost job using xgboost's built-in allreduce
 
 How to Use
 ====
diff --git a/multi-node/row-split/machine-row-tcp.sh b/multi-node/row-split/machine-row-tcp.sh
new file mode 100755
index 000000000000..c312eb3a52ce
--- /dev/null
+++ b/multi-node/row-split/machine-row-tcp.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: nprocess"
+    exit -1
+fi
+
+rm -rf train-machine.row* *.model
+k=$1
+# make machine data
+cd ../../demo/regression/
+python mapfeat.py
+python mknfold.py machine.txt 1
+cd -
+
+# split the lib svm file into k subfiles
+python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
+
+# run xgboost with the built-in TCP-based allreduce
+../submit_job_tcp.py $k ../../xgboost machine-row.conf dsplit=row num_round=3
+
+# run xgboost for one round to save model 0001, then continue training from the existing model
+../submit_job_tcp.py $k ../../xgboost machine-row.conf dsplit=row num_round=1
+../submit_job_tcp.py $k ../../xgboost machine-row.conf dsplit=row num_round=2 model_in=0001.model

From 372de9f968ff1feb778fc9468a1f16535168bd8a Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 23 Nov 2014 17:35:21 -0800
Subject: [PATCH 108/166] check in conf

---
 multi-node/col-split/mushroom-col.conf | 35 ++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 multi-node/col-split/mushroom-col.conf

diff --git a/multi-node/col-split/mushroom-col.conf b/multi-node/col-split/mushroom-col.conf
new file mode 100644
index 000000000000..2c779a44da53
--- /dev/null
+++ b/multi-node/col-split/mushroom-col.conf
@@ -0,0 +1,35 @@
+# General Parameters, see comment for each definition
+# choose the booster, can be gbtree or gblinear
+booster = gbtree
+# choose logistic regression loss function for binary classification
+objective = binary:logistic
+
+# Tree Booster Parameters
+# step size shrinkage
+eta = 1.0 
+# minimum loss reduction required to make a further partition
+gamma = 1.0 
+# minimum sum of instance weight(hessian) needed in a child
+min_child_weight = 1 
+# maximum depth of a tree
+max_depth = 3 
+
+# Task Parameters
+# the number of round to do boosting
+num_round = 2
+# 0 means do not save any model except the final round model
+save_period = 0 
+use_buffer = 0
+
+# The path of training data; %d is the wildcard for the rank of the data
+# The idea is that each process takes a feature matrix with a subset of columns
+#
+data = "train.col%d" 
+
+# The path of validation data, used to monitor training process, here [test] sets name of the validation set
+eval[test] = "../../demo/data/agaricus.txt.test" 
+# evaluate on training data as well each round
+eval_train = 1
+
+# The path of test data; this needs the full test data, so try not to use it, or keep a subsampled version
+test:data = "../../demo/data/agaricus.txt.test"      

From b55fe803503e494a05178705edab0e098330d2cf Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 23 Nov 2014 18:15:42 -0800
Subject: [PATCH 109/166] add row map example

---
 multi-node/row-split/machine-row-map.sh | 20 ++++++++++++++++++++
 multi-node/row-split/map.sh             |  3 +++
 2 files changed, 23 insertions(+)
 create mode 100755 multi-node/row-split/machine-row-map.sh
 create mode 100644 multi-node/row-split/map.sh

diff --git a/multi-node/row-split/machine-row-map.sh b/multi-node/row-split/machine-row-map.sh
new file mode 100755
index 000000000000..a1c5bfe0c012
--- /dev/null
+++ b/multi-node/row-split/machine-row-map.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: nprocess"
+    exit -1
+fi
+
+rm -rf train-machine.row* *.model
+k=$1
+# make machine data
+cd ../../demo/regression/
+python mapfeat.py
+python mknfold.py machine.txt 1
+cd -
+
+# split the lib svm file into k subfiles
+python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
+
+# run xgboost via submit_job_tcp.py, taking data from stdin
+../submit_job_tcp.py $k "bash map.sh train-machine.row ../../xgboost machine-row.conf dsplit=row num_round=3 data=stdin"
diff --git a/multi-node/row-split/map.sh b/multi-node/row-split/map.sh
new file mode 100644
index 000000000000..624192121e63
--- /dev/null
+++ b/multi-node/row-split/map.sh
@@ -0,0 +1,3 @@
+# a simple script to simulate mapreduce mapper
+echo "cat $1$OMPI_COMM_WORLD_RANK | ${@:2}"
+cat $1$OMPI_COMM_WORLD_RANK | ${@:2}

From 2e444f833864e7f944e77954d511ce1dff8b62fa Mon Sep 17 00:00:00 2001
From: Tianqi Chen <workcrow@gmail.com>
Date: Sun, 23 Nov 2014 20:52:13 -0800
Subject: [PATCH 110/166] remove warning from MSVC need another round of check

---
 src/gbm/gbtree-inl.hpp             |  2 +-
 src/io/io.cpp                      |  1 +
 src/io/page_dmatrix-inl.hpp        |  2 +-
 src/io/page_fmatrix-inl.hpp        |  4 ++--
 src/learner/learner-inl.hpp        |  2 +-
 src/sync/sync.h                    |  2 +-
 src/sync/sync_mpi.cpp              |  6 +++---
 src/sync/sync_tcp.cpp              |  4 ++--
 src/tree/updater_basemaker-inl.hpp |  3 ++-
 src/tree/updater_histmaker-inl.hpp | 18 +++++++++---------
 src/utils/io.h                     |  2 +-
 src/utils/quantile.h               | 14 ++++++++------
 12 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
index b20acd48e917..8d511f06e513 100644
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -324,7 +324,7 @@ class GBTree : public IGradBooster {
                        unsigned ntree_limit) {
     // number of valid trees
     if (ntree_limit == 0 || ntree_limit > trees.size()) {
-      ntree_limit = trees.size();
+      ntree_limit = static_cast<unsigned>(trees.size());
     } 
     std::vector<float> &preds = *out_preds;
     preds.resize(info.num_row * ntree_limit);
diff --git a/src/io/io.cpp b/src/io/io.cpp
index 0f9611e67f5c..0072618c65c6 100644
--- a/src/io/io.cpp
+++ b/src/io/io.cpp
@@ -1,5 +1,6 @@
 #define _CRT_SECURE_NO_WARNINGS
 #define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
 #include <string>
 #include "./io.h"
 #include "../utils/io.h"
diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
index 41ad19be5cc9..4f70ff2e9fb3 100644
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -165,7 +165,7 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
   // loader factory for page
   struct Factory {
    public:
-    long file_begin_;
+    size_t file_begin_;
     utils::FileStream fi;
     Factory(void) {}
     inline void SetFile(const utils::FileStream &fi) {
diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
index 91d24cf2d104..0527da827b3c 100644
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -138,8 +138,8 @@ class CSCMatrixManager {
     fi_->Seek(col_ptr_[cidx] * sizeof(ColBatch::Entry) + begin_data_);
     utils::Check(fi_->Read(p_data, sizeof(ColBatch::Entry) * len) != 0,
                  "invalid column buffer format");
-    p_page->col_data.push_back(ColBatch::Inst(p_data, len));
-    p_page->col_index.push_back(cidx);
+    p_page->col_data.push_back(ColBatch::Inst(p_data, static_cast<bst_uint>(len)));
+    p_page->col_index.push_back(static_cast<bst_uint>(cidx));
     return true;
   }
   // the following are in memory auxiliary data structure
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index d16986e838a0..05ab09f98f10 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -187,7 +187,7 @@ class BoostLearner {
    * \param p_train pointer to the matrix used by training
    */
   inline void CheckInit(DMatrix *p_train) {
-    int ncol = p_train->info.info.num_col;    
+    int ncol = static_cast<int>(p_train->info.info.num_col);    
     std::vector<bool> enabled(ncol, true);
     
     if (part_load_col != 0) {      
diff --git a/src/sync/sync.h b/src/sync/sync.h
index 8d053faa04c4..2e14f2807b7c 100644
--- a/src/sync/sync.h
+++ b/src/sync/sync.h
@@ -49,7 +49,7 @@ void Finalize(void);
  * \param op reduction function
  */
 template<typename DType>
-void AllReduce(DType *sendrecvbuf, int count, ReduceOp op);
+void AllReduce(DType *sendrecvbuf, size_t count, ReduceOp op);
 
 /*!
  * \brief broadcast an std::string to all others from root
diff --git a/src/sync/sync_mpi.cpp b/src/sync/sync_mpi.cpp
index 42b7c7ba6bbe..f867fa5d0064 100644
--- a/src/sync/sync_mpi.cpp
+++ b/src/sync/sync_mpi.cpp
@@ -33,7 +33,7 @@ void Finalize(void) {
   MPI::Finalize();
 }
 
-void AllReduce_(void *sendrecvbuf, int count, const MPI::Datatype &dtype, ReduceOp op) {
+void AllReduce_(void *sendrecvbuf, size_t count, const MPI::Datatype &dtype, ReduceOp op) {
   switch(op) {
     case kBitwiseOR: MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, dtype, MPI::BOR); return;
     case kSum: MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, dtype, MPI::SUM); return;
@@ -42,12 +42,12 @@ void AllReduce_(void *sendrecvbuf, int count, const MPI::Datatype &dtype, Reduce
 }
 
 template<>
-void AllReduce<uint32_t>(uint32_t *sendrecvbuf, int count, ReduceOp op) {
+void AllReduce<uint32_t>(uint32_t *sendrecvbuf, size_t count, ReduceOp op) {
   AllReduce_(sendrecvbuf, count, MPI::UNSIGNED, op);
 }
 
 template<>
-void AllReduce<float>(float *sendrecvbuf, int count, ReduceOp op) {
+void AllReduce<float>(float *sendrecvbuf, size_t count, ReduceOp op) {
   AllReduce_(sendrecvbuf, count, MPI::FLOAT, op);
 }
 
diff --git a/src/sync/sync_tcp.cpp b/src/sync/sync_tcp.cpp
index 330b5318de06..6ed24f51d658 100644
--- a/src/sync/sync_tcp.cpp
+++ b/src/sync/sync_tcp.cpp
@@ -483,7 +483,7 @@ inline void ReduceBitOR(const void *src_, void *dst_, int len, const MPI::Dataty
 }
 
 template<>
-void AllReduce<uint32_t>(uint32_t *sendrecvbuf, int count, ReduceOp op) {
+void AllReduce<uint32_t>(uint32_t *sendrecvbuf, size_t count, ReduceOp op) {
   typedef uint32_t DType;
   switch(op) {
     case kBitwiseOR: manager.AllReduce(sendrecvbuf, sizeof(DType), count, ReduceBitOR<DType>); return;
@@ -494,7 +494,7 @@ void AllReduce<uint32_t>(uint32_t *sendrecvbuf, int count, ReduceOp op) {
 }
 
 template<>
-void AllReduce<float>(float *sendrecvbuf, int count, ReduceOp op) {
+void AllReduce<float>(float *sendrecvbuf, size_t count, ReduceOp op) {
   typedef float DType;
   switch(op) {
     case kSum: manager.AllReduce(sendrecvbuf, sizeof(DType), count, ReduceSum<DType>); return;
diff --git a/src/tree/updater_basemaker-inl.hpp b/src/tree/updater_basemaker-inl.hpp
index e5cfd17fabb7..68bd9ede4c5e 100644
--- a/src/tree/updater_basemaker-inl.hpp
+++ b/src/tree/updater_basemaker-inl.hpp
@@ -69,7 +69,8 @@ class BaseMaker: public IUpdater {
       std::vector<bst_uint> &findex = *p_findex;
       findex.clear();
       for (size_t i = 0; i < fminmax.size(); i += 2) {
-        if (this->Type(i / 2) != 0) findex.push_back(i / 2);
+		const bst_uint fid = static_cast<bst_uint>(i / 2);
+        if (this->Type(fid) != 0) findex.push_back(fid);
       }
       unsigned n = static_cast<unsigned>(p * findex.size());
       random::Shuffle(findex);
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 06febf47a3ed..61d3008d5ce5 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -68,7 +68,7 @@ class HistMaker: public BaseMaker {
     /*! \brief data in different hist unit */
     std::vector<TStats> data;
     /*! \brief */
-    inline HistUnit operator[](bst_uint fid) {
+    inline HistUnit operator[](size_t fid) {
       return HistUnit(cut + rptr[fid],
                       &data[0] + rptr[fid],
                       rptr[fid+1] - rptr[fid]);
@@ -159,7 +159,7 @@ class HistMaker: public BaseMaker {
                            std::vector<bst_uint> *p_fset) {
     p_fset->resize(tree.param.num_feature);
     for (size_t i = 0; i < p_fset->size(); ++i) {
-      (*p_fset)[i] = i;
+      (*p_fset)[i] = static_cast<unsigned>(i);
     }
   }
   // reset position after split, this is not a must, depending on implementation
@@ -187,7 +187,7 @@ class HistMaker: public BaseMaker {
         c.SetSubstract(node_sum, s);
         if (c.sum_hess >= param.min_child_weight) {
           double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
-          if (best->Update(loss_chg, fid, hist.cut[i], false)) {
+          if (best->Update((float)loss_chg, fid, hist.cut[i], false)) {
             *left_sum = s;
           }
         }
@@ -200,7 +200,7 @@ class HistMaker: public BaseMaker {
         c.SetSubstract(node_sum, s);
         if (c.sum_hess >= param.min_child_weight) {
           double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
-          if (best->Update(loss_chg, fid, hist.cut[i-1], true)) {
+          if (best->Update((float)loss_chg, fid, hist.cut[i-1], true)) {
             *left_sum = c;
           }
         }
@@ -258,7 +258,7 @@ class HistMaker: public BaseMaker {
   }
   
   inline void SetStats(RegTree *p_tree, int nid, const TStats &node_sum) {
-    p_tree->stat(nid).base_weight = node_sum.CalcWeight(param);
+    p_tree->stat(nid).base_weight = static_cast<float>(node_sum.CalcWeight(param));
     p_tree->stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);
     node_sum.SetLeafVec(param, p_tree->leafvec(nid));    
   }
@@ -419,17 +419,17 @@ class CQHistMaker: public HistMaker<TStats> {
             bst_float last = cpt + fabs(cpt) + rt_eps;
             this->wspace.cut.push_back(last);
           }
-          this->wspace.rptr.push_back(this->wspace.cut.size());
+          this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
         } else {
           utils::Assert(offset == -2, "BUG in mark");
           bst_float cpt = feat_helper.MaxValue(fset[i]);        
           this->wspace.cut.push_back(cpt + fabs(cpt) + rt_eps);
-          this->wspace.rptr.push_back(this->wspace.cut.size());        
+          this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));        
         }
       }
       // reserve last value for global statistics
       this->wspace.cut.push_back(0.0f);
-      this->wspace.rptr.push_back(this->wspace.cut.size());
+      this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
     }
     utils::Assert(this->wspace.rptr.size() ==
                   (fset.size() + 1) * this->qexpand.size() + 1,
@@ -493,7 +493,7 @@ class CQHistMaker: public HistMaker<TStats> {
     } else {
       for (size_t i = 0; i < this->qexpand.size(); ++i) {
         const unsigned nid = this->qexpand[i];        
-        sbuilder[nid].sum_total = nstats[nid].sum_hess;
+        sbuilder[nid].sum_total = static_cast<bst_float>(nstats[nid].sum_hess);
       } 
     }
     // if only one value, no need to do second pass
diff --git a/src/utils/io.h b/src/utils/io.h
index 97aaa94b2a44..dff691ee0cb0 100644
--- a/src/utils/io.h
+++ b/src/utils/io.h
@@ -188,7 +188,7 @@ class FileStream : public ISeekStream {
     std::fwrite(ptr, size, 1, fp);
   }
   virtual void Seek(size_t pos) {
-    std::fseek(fp, pos, SEEK_SET);
+    std::fseek(fp, static_cast<long>(pos), SEEK_SET);
   }
   virtual size_t Tell(void) {
     return std::ftell(fp);
diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index 8d49afc98ed9..32495fb3b2dd 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -186,7 +186,7 @@ struct WQSummary {
    * \param maxsize size we can afford in the pruned sketch
    */
 
-  inline void SetPrune(const WQSummary &src, RType maxsize) {
+  inline void SetPrune(const WQSummary &src, size_t maxsize) {
     if (src.size <= maxsize) {
       this->CopyFrom(src); return;
     }
@@ -290,7 +290,7 @@ struct WXQSummary : public WQSummary<DType, RType> {
     return  e.rmin_next() > e.rmax_prev() + chunk;
   }
   // set prune
-  inline void SetPrune(const WQSummary<DType, RType> &src, RType maxsize) {
+  inline void SetPrune(const WQSummary<DType, RType> &src, size_t maxsize) {
     if (src.size <= maxsize) {
       this->CopyFrom(src); return;
     }
@@ -435,7 +435,7 @@ struct GKSummary {
    * \param src source summary
    * \param maxsize size we can afford in the pruned sketch
    */
-  inline void SetPrune(const GKSummary &src, RType maxsize) {
+  inline void SetPrune(const GKSummary &src, size_t maxsize) {
     if (src.size <= maxsize) {
       this->CopyFrom(src); return;
     }
@@ -597,12 +597,14 @@ class QuantileSketchTemplate {
   inline void Init(size_t maxn, double eps) {
     nlevel = 1;
     while (true) {
-      limit_size = ceil(nlevel / eps) + 1;
-      if ((1 << nlevel)  * limit_size >= maxn) break;
+      limit_size = static_cast<size_t>(ceil(nlevel / eps)) + 1;
+	  size_t n = (1UL << nlevel);
+      if (n * limit_size >= maxn) break;
       ++nlevel;
     }
     // check invariant
-    utils::Assert((1 << nlevel) * limit_size >= maxn, "invalid init parameter");
+	size_t n = (1UL << nlevel);
+    utils::Assert(n * limit_size >= maxn, "invalid init parameter");
     utils::Assert(nlevel <= limit_size * eps, "invalid init parameter");
     // lazy reserve the space, if there is only one value, no need to allocate space
     inqueue.queue.resize(1);

From db2adb688586eb91737e1d54119494f85e5245c8 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 23 Nov 2014 20:59:10 -0800
Subject: [PATCH 111/166] start check windows compatiblity

---
 src/gbm/gbm.cpp         | 1 +
 src/sync/sync_empty.cpp | 6 ++++--
 src/sync/sync_mpi.cpp   | 3 +++
 src/sync/sync_tcp.cpp   | 6 +++++-
 src/tree/updater.cpp    | 1 +
 src/utils/quantile.h    | 4 ++--
 src/xgboost_main.cpp    | 2 +-
 7 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/gbm/gbm.cpp b/src/gbm/gbm.cpp
index e280fdd4a525..fe8d778e4be0 100644
--- a/src/gbm/gbm.cpp
+++ b/src/gbm/gbm.cpp
@@ -1,5 +1,6 @@
 #define _CRT_SECURE_NO_WARNINGS
 #define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
 #include <cstring>
 #include "./gbm.h"
 #include "./gbtree-inl.hpp"
diff --git a/src/sync/sync_empty.cpp b/src/sync/sync_empty.cpp
index c0f956db3cb3..959a4b87a618 100644
--- a/src/sync/sync_empty.cpp
+++ b/src/sync/sync_empty.cpp
@@ -1,3 +1,5 @@
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
 #include "./sync.h"
 #include "../utils/utils.h"
 // no synchronization module, single thread mode does not need it anyway
@@ -26,11 +28,11 @@ std::string GetProcessorName(void) {
 }
 
 template<>
-void AllReduce<uint32_t>(uint32_t *sendrecvbuf, int count, ReduceOp op) {
+void AllReduce<uint32_t>(uint32_t *sendrecvbuf, size_t count, ReduceOp op) {
 }
 
 template<>
-void AllReduce<float>(float *sendrecvbuf, int count, ReduceOp op) {
+void AllReduce<float>(float *sendrecvbuf, size_t count, ReduceOp op) {
 }
 
 void Bcast(std::string *sendrecv_data, int root) {
diff --git a/src/sync/sync_mpi.cpp b/src/sync/sync_mpi.cpp
index f867fa5d0064..d4521b6d59d3 100644
--- a/src/sync/sync_mpi.cpp
+++ b/src/sync/sync_mpi.cpp
@@ -1,3 +1,6 @@
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
 #include "./sync.h"
 #include "../utils/utils.h"
 #include <mpi.h>
diff --git a/src/sync/sync_tcp.cpp b/src/sync/sync_tcp.cpp
index 6ed24f51d658..e59c45e31f15 100644
--- a/src/sync/sync_tcp.cpp
+++ b/src/sync/sync_tcp.cpp
@@ -4,6 +4,9 @@
  *   with use non-block socket and tree-shape reduction
  * \author Tianqi Chen
  */
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
 #include <vector>
 #include <string>
 #include <cstring>
@@ -34,13 +37,13 @@ class SyncManager {
     this->SetParam("reduce_buffer", "256MB");
   }
   ~SyncManager(void) {
-    this->Shutdown();
   }
   inline void Shutdown(void) {
     for (size_t i = 0; i < links.size(); ++i) {
       links[i].sock.Close();
     }
     links.clear();
+    TCPSocket::Finalize();
   }
   /*! \brief set parameters to the sync manager */
   inline void SetParam(const char *name, const char *val) {
@@ -80,6 +83,7 @@ class SyncManager {
   }
   // initialize the manager
   inline void Init(void) {
+    TCPSocket::Startup();
     // single node mode
     if (master_uri == "NULL") return;
     utils::Assert(links.size() == 0, "can only call Init once");
diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index a1349b806968..1b3bc46946c7 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -1,5 +1,6 @@
 #define _CRT_SECURE_NO_WARNINGS
 #define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
 #include <cstring>
 #include "./updater.h"
 #include "./updater_sync-inl.hpp"
diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index 32495fb3b2dd..f5e5f006cb98 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -598,12 +598,12 @@ class QuantileSketchTemplate {
     nlevel = 1;
     while (true) {
       limit_size = static_cast<size_t>(ceil(nlevel / eps)) + 1;
-	  size_t n = (1UL << nlevel);
+      size_t n = (1UL << nlevel);
       if (n * limit_size >= maxn) break;
       ++nlevel;
     }
     // check invariant
-	size_t n = (1UL << nlevel);
+    size_t n = (1UL << nlevel);
     utils::Assert(n * limit_size >= maxn, "invalid init parameter");
     utils::Assert(nlevel <= limit_size * eps, "invalid init parameter");
     // lazy reserve the space, if there is only one value, no need to allocate space
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index 4a459f2d7ab3..a2ce7ed48bb3 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -1,6 +1,6 @@
 #define _CRT_SECURE_NO_WARNINGS
 #define _CRT_SECURE_NO_DEPRECATE
-
+#define NOMINMAX
 #include <ctime>
 #include <string>
 #include <cstring>

From 7f3dc967cfeb403516a4d71ad5fbef7d30e41680 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <workcrow@gmail.com>
Date: Sun, 23 Nov 2014 21:21:52 -0800
Subject: [PATCH 112/166] changes in socket, a bit work in linux side first

---
 src/utils/socket.h              | 39 ++++++++++++++++++++++++---------
 windows/xgboost/xgboost.vcxproj |  1 +
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/src/utils/socket.h b/src/utils/socket.h
index 86d737f98a2f..8f99c6bff770 100644
--- a/src/utils/socket.h
+++ b/src/utils/socket.h
@@ -5,6 +5,9 @@
  * \brief this file aims to provide a wrapper of sockets
  * \author Tianqi Chen
  */
+#if defined(_WIN32)
+#include <winsock2.h>
+#else
 #include <fcntl.h>
 #include <netdb.h>
 #include <errno.h>
@@ -13,12 +16,16 @@
 #include <netinet/in.h>
 #include <sys/socket.h>
 #include <sys/select.h>
+#endif
 #include <string>
 #include <cstring>
 #include "./utils.h"
 
 namespace xgboost {
 namespace utils {
+#if defined(_WIN32)
+typedef int ssize_t;
+#endif
 
 /*! \brief data structure for network address */
 struct SockAddr {
@@ -64,18 +71,18 @@ struct SockAddr {
 class TCPSocket {
  public:
   /*! \brief the file descriptor of socket */
-  int sockfd;
+  SOCKET sockfd;
   // constructor
-  TCPSocket(void) : sockfd(-1) {
+  TCPSocket(void) : sockfd(INVALID_SOCKET) {
   }
-  explicit TCPSocket(int sockfd) : sockfd(sockfd) {
+  explicit TCPSocket(SOCKET sockfd) : sockfd(sockfd) {
   }
   ~TCPSocket(void) {
     // do nothing in destructor
     // user need to take care of close
   }
   // default conversion to int
-  inline operator int() const {
+  inline operator SOCKET() const {
     return sockfd;
   }
   /*!
@@ -84,7 +91,7 @@ class TCPSocket {
    */
   inline void Create(int af = PF_INET) {
     sockfd = socket(PF_INET, SOCK_STREAM, 0);
-    if (sockfd == -1) {
+    if (sockfd == INVALID_SOCKET) {
       SockError("Create", errno);
     }
   }
@@ -105,6 +112,12 @@ class TCPSocket {
    *        it will set it back to block mode
    */
   inline void SetNonBlock(bool non_block) {
+#ifdef _WIN32  
+	u_long mode = non_block ? 1 : 0;
+	if (ioctlsocket(sockfd, FIONBIO, &mode) != NO_ERROR) {
+      SockError("SetNonBlock", WSAGetLastError());
+	}
+#else
     int flag = fcntl(sockfd, F_GETFL, 0);
     if (flag == -1) {
       SockError("SetNonBlock-1", errno);
@@ -117,6 +130,7 @@ class TCPSocket {
     if (fcntl(sockfd, F_SETFL, flag) == -1) {
       SockError("SetNonBlock-2", errno);
     }
+#endif
   }
   /*!
    * \brief perform listen of the socket
@@ -127,9 +141,9 @@ class TCPSocket {
   }
   /*! \brief get a new connection */
   TCPSocket Accept(void) {
-    int newfd = accept(sockfd, NULL, NULL);
-    if (newfd == -1) {
-      SockError("Accept", errno);
+    SOCKET newfd = accept(sockfd, NULL, NULL);
+    if (newfd == INVALID_SOCKET) {
+      SockError("Accept");
     }
     return TCPSocket(newfd);
   }
@@ -173,7 +187,12 @@ class TCPSocket {
   /*! \brief close the connection */
   inline void Close(void) {
     if (sockfd != -1) {
-      close(sockfd); sockfd = -1;
+#ifdef _WIN32
+      closesocket(sockfd);
+#else
+	  close(sockfd);
+#endif
+	  sockfd = INVALID_SOCKET;
     } else {
       Error("TCPSocket::Close double close the socket or close without create");
     }
@@ -221,7 +240,7 @@ class TCPSocket {
     const char *buf = reinterpret_cast<const char*>(buf_);
     size_t ndone = 0;
     while (ndone <  len) {
-      ssize_t ret = send(sockfd, buf, len - ndone, 0);
+      ssize_t ret = send(sockfd, buf, static_cast<ssize_t>(len - ndone), 0);
       if (ret == -1) {
         if (errno == EAGAIN || errno == EWOULDBLOCK) return ndone;
         SockError("Recv", errno);
diff --git a/windows/xgboost/xgboost.vcxproj b/windows/xgboost/xgboost.vcxproj
index 3d303efc4da8..82270393fd51 100644
--- a/windows/xgboost/xgboost.vcxproj
+++ b/windows/xgboost/xgboost.vcxproj
@@ -21,6 +21,7 @@
   <ItemGroup>
     <ClCompile Include="..\..\src\gbm\gbm.cpp" />
     <ClCompile Include="..\..\src\io\io.cpp" />
+    <ClCompile Include="..\..\src\sync\sync_tcp.cpp" />
     <ClCompile Include="..\..\src\tree\updater.cpp" />
     <ClCompile Include="..\..\src\xgboost_main.cpp" />
   </ItemGroup>

From d2f151ef5adc3c504cac36543d99fdc22a8ffb84 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 23 Nov 2014 21:27:16 -0800
Subject: [PATCH 113/166] bring it back alive again

---
 src/sync/sync_tcp.cpp |  4 ++--
 src/utils/socket.h    | 26 +++++++++++++++-----------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/src/sync/sync_tcp.cpp b/src/sync/sync_tcp.cpp
index e59c45e31f15..3bfddeac5c29 100644
--- a/src/sync/sync_tcp.cpp
+++ b/src/sync/sync_tcp.cpp
@@ -43,7 +43,7 @@ class SyncManager {
       links[i].sock.Close();
     }
     links.clear();
-    TCPSocket::Finalize();
+    utils::TCPSocket::Finalize();
   }
   /*! \brief set parameters to the sync manager */
   inline void SetParam(const char *name, const char *val) {
@@ -83,7 +83,7 @@ class SyncManager {
   }
   // initialize the manager
   inline void Init(void) {
-    TCPSocket::Startup();
+    utils::TCPSocket::Startup();
     // single node mode
     if (master_uri == "NULL") return;
     utils::Assert(links.size() == 0, "can only call Init once");
diff --git a/src/utils/socket.h b/src/utils/socket.h
index 8f99c6bff770..eb23a5908785 100644
--- a/src/utils/socket.h
+++ b/src/utils/socket.h
@@ -25,6 +25,9 @@ namespace xgboost {
 namespace utils {
 #if defined(_WIN32)
 typedef int ssize_t;
+#else
+typedef int SOCKET;
+const int INVALID_SOCKET = -1;
 #endif
 
 /*! \brief data structure for network address */
@@ -92,7 +95,7 @@ class TCPSocket {
   inline void Create(int af = PF_INET) {
     sockfd = socket(PF_INET, SOCK_STREAM, 0);
     if (sockfd == INVALID_SOCKET) {
-      SockError("Create", errno);
+      SockError("Create");
     }
   }
   /*!
@@ -120,7 +123,7 @@ class TCPSocket {
 #else
     int flag = fcntl(sockfd, F_GETFL, 0);
     if (flag == -1) {
-      SockError("SetNonBlock-1", errno);
+      SockError("SetNonBlock-1");
     }
     if (non_block) {
       flag |= O_NONBLOCK;
@@ -128,7 +131,7 @@ class TCPSocket {
       flag &= ~O_NONBLOCK;
     }
     if (fcntl(sockfd, F_SETFL, flag) == -1) {
-      SockError("SetNonBlock-2", errno);
+      SockError("SetNonBlock-2");
     }
 #endif
   }
@@ -153,7 +156,7 @@ class TCPSocket {
    */
   inline void Bind(const SockAddr &addr) {
     if (bind(sockfd, (sockaddr*)&addr.addr, sizeof(addr.addr)) == -1) {
-      SockError("Bind", errno);
+      SockError("Bind");
     }
   }
   /*! 
@@ -170,7 +173,7 @@ class TCPSocket {
         return port;
       }
       if (errno != EADDRINUSE) {
-        SockError("TryBindHost", errno);
+        SockError("TryBindHost");
       }
     }
     return -1;
@@ -181,7 +184,7 @@ class TCPSocket {
    */
   inline void Connect(const SockAddr &addr) {
     if (connect(sockfd, (sockaddr*)&addr.addr, sizeof(addr.addr)) == -1) {
-      SockError("Connect", errno);
+      SockError("Connect");
     }
   }
   /*! \brief close the connection */
@@ -209,7 +212,7 @@ class TCPSocket {
     ssize_t ret = send(sockfd, buf, len, flag);
     if (ret == -1) {
       if (errno == EAGAIN || errno == EWOULDBLOCK) return 0;
-      SockError("Send", errno);
+      SockError("Send");
     }
     return ret;
   }  
@@ -225,7 +228,7 @@ class TCPSocket {
     ssize_t ret = recv(sockfd, buf, len, flags);
     if (ret == -1) {
       if (errno == EAGAIN || errno == EWOULDBLOCK) return 0;
-      SockError("Recv", errno);
+      SockError("Recv");
     }
     return ret;
   } 
@@ -243,7 +246,7 @@ class TCPSocket {
       ssize_t ret = send(sockfd, buf, static_cast<ssize_t>(len - ndone), 0);
       if (ret == -1) {
         if (errno == EAGAIN || errno == EWOULDBLOCK) return ndone;
-        SockError("Recv", errno);
+        SockError("Recv");
       }
       buf += ret;
       ndone += ret;
@@ -264,7 +267,7 @@ class TCPSocket {
       ssize_t ret = recv(sockfd, buf, len - ndone, MSG_WAITALL);
       if (ret == -1) {
         if (errno == EAGAIN || errno == EWOULDBLOCK) return ndone;
-        SockError("Recv", errno);
+        SockError("Recv");
       }
       if (ret == 0) return ndone;
       buf += ret;
@@ -275,7 +278,8 @@ class TCPSocket {
 
  private:
   // report an socket error
-  inline static void SockError(const char *msg, int errsv) {
+  inline static void SockError(const char *msg) {
+    int errsv = errno;
     char buf[256];    
     Error("Socket %s Error:%s", msg, strerror_r(errsv, buf, sizeof(buf)));
   }

From 78ca72b9c709cd7a9c68c8db4b11427616022293 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 23 Nov 2014 21:34:15 -0800
Subject: [PATCH 114/166] start work on win

---
 src/utils/socket.h | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/src/utils/socket.h b/src/utils/socket.h
index eb23a5908785..104ebaef2d15 100644
--- a/src/utils/socket.h
+++ b/src/utils/socket.h
@@ -26,8 +26,8 @@ namespace utils {
 #if defined(_WIN32)
 typedef int ssize_t;
 #else
-typedef int SOCKET;
-const int INVALID_SOCKET = -1;
+typedef int sock_t;
+const int INVALID_sock_t = -1;
 #endif
 
 /*! \brief data structure for network address */
@@ -74,18 +74,18 @@ struct SockAddr {
 class TCPSocket {
  public:
   /*! \brief the file descriptor of socket */
-  SOCKET sockfd;
+  sock_t sockfd;
   // constructor
-  TCPSocket(void) : sockfd(INVALID_SOCKET) {
+  TCPSocket(void) : sockfd(INVALID_sock_t) {
   }
-  explicit TCPSocket(SOCKET sockfd) : sockfd(sockfd) {
+  explicit TCPSocket(sock_t sockfd) : sockfd(sockfd) {
   }
   ~TCPSocket(void) {
     // do nothing in destructor
     // user need to take care of close
   }
   // default conversion to int
-  inline operator SOCKET() const {
+  inline operator sock_t() const {
     return sockfd;
   }
   /*!
@@ -94,7 +94,7 @@ class TCPSocket {
    */
   inline void Create(int af = PF_INET) {
     sockfd = socket(PF_INET, SOCK_STREAM, 0);
-    if (sockfd == INVALID_SOCKET) {
+    if (sockfd == INVALID_sock_t) {
       SockError("Create");
     }
   }
@@ -118,7 +118,7 @@ class TCPSocket {
 #ifdef _WIN32  
 	u_long mode = non_block ? 1 : 0;
 	if (ioctlsocket(sockfd, FIONBIO, &mode) != NO_ERROR) {
-      SockError("SetNonBlock", WSAGetLastError());
+      SockError("SetNonBlock");
 	}
 #else
     int flag = fcntl(sockfd, F_GETFL, 0);
@@ -144,8 +144,8 @@ class TCPSocket {
   }
   /*! \brief get a new connection */
   TCPSocket Accept(void) {
-    SOCKET newfd = accept(sockfd, NULL, NULL);
-    if (newfd == INVALID_SOCKET) {
+    sock_t newfd = accept(sockfd, NULL, NULL);
+    if (newfd == INVALID_sock_t) {
       SockError("Accept");
     }
     return TCPSocket(newfd);
@@ -195,7 +195,7 @@ class TCPSocket {
 #else
 	  close(sockfd);
 #endif
-	  sockfd = INVALID_SOCKET;
+	  sockfd = INVALID_sock_t;
     } else {
       Error("TCPSocket::Close double close the socket or close without create");
     }
@@ -280,8 +280,7 @@ class TCPSocket {
   // report an socket error
   inline static void SockError(const char *msg) {
     int errsv = errno;
-    char buf[256];    
-    Error("Socket %s Error:%s", msg, strerror_r(errsv, buf, sizeof(buf)));
+    Error("Socket %s Error:%s", msg, strerror(errsv));
   }
 };
 /*! \brief helper data structure to perform select */
@@ -294,7 +293,7 @@ struct SelectHelper {
    * \brief add file descriptor to watch for read 
    * \param fd file descriptor to be watched
    */
-  inline void WatchRead(int fd) {
+  inline void WatchRead(sock_t fd) {
     read_fds.push_back(fd);
     if (fd > maxfd) maxfd = fd;
   }
@@ -302,7 +301,7 @@ struct SelectHelper {
    * \brief add file descriptor to watch for write
    * \param fd file descriptor to be watched
    */
-  inline void WatchWrite(int fd) {
+  inline void WatchWrite(sock_t fd) {
     write_fds.push_back(fd);
     if (fd > maxfd) maxfd = fd;
   }
@@ -310,14 +309,14 @@ struct SelectHelper {
    * \brief Check if the descriptor is ready for read
    * \param fd file descriptor to check status
    */
-  inline bool CheckRead(int fd) const {
+  inline bool CheckRead(sock_t fd) const {
     return FD_ISSET(fd, &read_set);
   }
   /*!
    * \brief Check if the descriptor is ready for write
    * \param fd file descriptor to check status
    */
-  inline bool CheckWrite(int fd) const {
+  inline bool CheckWrite(sock_t fd) const {
     return FD_ISSET(fd, &write_set);
   }
   /*!
@@ -353,8 +352,7 @@ struct SelectHelper {
     }
     if (ret == -1) {
       int errsv = errno;
-      char buf[256];
-      Error("Select Error: %s", strerror_r(errsv, buf, sizeof(buf)));      
+      Error("Select Error: %s", strerror(errsv));
     }
     return ret;
   }

From 77ffd0465b6fe8d26a1dedcc03e8c6377425c87a Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 23 Nov 2014 21:36:22 -0800
Subject: [PATCH 115/166] ok

---
 src/utils/socket.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/utils/socket.h b/src/utils/socket.h
index 104ebaef2d15..054ba97e5e31 100644
--- a/src/utils/socket.h
+++ b/src/utils/socket.h
@@ -26,8 +26,8 @@ namespace utils {
 #if defined(_WIN32)
 typedef int ssize_t;
 #else
-typedef int sock_t;
-const int INVALID_sock_t = -1;
+typedef int SOCKET;
+const int INVALID_SOCKET = -1;
 #endif
 
 /*! \brief data structure for network address */
@@ -74,18 +74,18 @@ struct SockAddr {
 class TCPSocket {
  public:
   /*! \brief the file descriptor of socket */
-  sock_t sockfd;
+  SOCKET sockfd;
   // constructor
-  TCPSocket(void) : sockfd(INVALID_sock_t) {
+  TCPSocket(void) : sockfd(INVALID_SOCKET) {
   }
-  explicit TCPSocket(sock_t sockfd) : sockfd(sockfd) {
+  explicit TCPSocket(SOCKET sockfd) : sockfd(sockfd) {
   }
   ~TCPSocket(void) {
     // do nothing in destructor
     // user need to take care of close
   }
   // default conversion to int
-  inline operator sock_t() const {
+  inline operator SOCKET() const {
     return sockfd;
   }
   /*!
@@ -94,7 +94,7 @@ class TCPSocket {
    */
   inline void Create(int af = PF_INET) {
     sockfd = socket(PF_INET, SOCK_STREAM, 0);
-    if (sockfd == INVALID_sock_t) {
+    if (sockfd == INVALID_SOCKET) {
       SockError("Create");
     }
   }
@@ -144,8 +144,8 @@ class TCPSocket {
   }
   /*! \brief get a new connection */
   TCPSocket Accept(void) {
-    sock_t newfd = accept(sockfd, NULL, NULL);
-    if (newfd == INVALID_sock_t) {
+    SOCKET newfd = accept(sockfd, NULL, NULL);
+    if (newfd == INVALID_SOCKET) {
       SockError("Accept");
     }
     return TCPSocket(newfd);
@@ -195,7 +195,7 @@ class TCPSocket {
 #else
 	  close(sockfd);
 #endif
-	  sockfd = INVALID_sock_t;
+	  sockfd = INVALID_SOCKET;
     } else {
       Error("TCPSocket::Close double close the socket or close without create");
     }
@@ -293,7 +293,7 @@ struct SelectHelper {
    * \brief add file descriptor to watch for read 
    * \param fd file descriptor to be watched
    */
-  inline void WatchRead(sock_t fd) {
+  inline void WatchRead(SOCKET fd) {
     read_fds.push_back(fd);
     if (fd > maxfd) maxfd = fd;
   }
@@ -301,7 +301,7 @@ struct SelectHelper {
    * \brief add file descriptor to watch for write
    * \param fd file descriptor to be watched
    */
-  inline void WatchWrite(sock_t fd) {
+  inline void WatchWrite(SOCKET fd) {
     write_fds.push_back(fd);
     if (fd > maxfd) maxfd = fd;
   }
@@ -309,14 +309,14 @@ struct SelectHelper {
    * \brief Check if the descriptor is ready for read
    * \param fd file descriptor to check status
    */
-  inline bool CheckRead(sock_t fd) const {
+  inline bool CheckRead(SOCKET fd) const {
     return FD_ISSET(fd, &read_set);
   }
   /*!
    * \brief Check if the descriptor is ready for write
    * \param fd file descriptor to check status
    */
-  inline bool CheckWrite(sock_t fd) const {
+  inline bool CheckWrite(SOCKET fd) const {
     return FD_ISSET(fd, &write_set);
   }
   /*!

From fde580b08e73f5671e4251e4ab3e3bb7eb68503e Mon Sep 17 00:00:00 2001
From: Tianqi Chen <workcrow@gmail.com>
Date: Sun, 23 Nov 2014 22:12:55 -0800
Subject: [PATCH 116/166] fix windows run

---
 src/sync/sync_tcp.cpp                         |  4 +-
 src/utils/socket.h                            | 46 +++++++++++++------
 windows/xgboost/xgboost.vcxproj               |  1 +
 .../xgboost_wrapper/xgboost_wrapper.vcxproj   |  2 +
 wrapper/xgboost_wrapper.cpp                   |  8 ++--
 5 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/src/sync/sync_tcp.cpp b/src/sync/sync_tcp.cpp
index 3bfddeac5c29..01dce3dbf73f 100644
--- a/src/sync/sync_tcp.cpp
+++ b/src/sync/sync_tcp.cpp
@@ -235,7 +235,7 @@ class SyncManager {
             if (i != parent_index) {
               reducer(links[i].buffer_head + start,
                       sendrecvbuf + size_up_reduce,
-                      nread / type_nbytes,
+                      static_cast<int>(nread / type_nbytes),
                       MPI::Datatype(type_nbytes));
             }
           }
@@ -522,7 +522,7 @@ ReduceHandle::ReduceHandle(void) : handle(NULL), htype(NULL) {
 ReduceHandle::~ReduceHandle(void) {}
 
 int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
-  return dtype.type_size;
+  return static_cast<int>(dtype.type_size);
 }
 void ReduceHandle::Init(ReduceFunction redfunc, size_t type_n4bytes, bool commute) {
   utils::Assert(handle == NULL, "cannot initialize reduce handle twice");
diff --git a/src/utils/socket.h b/src/utils/socket.h
index 054ba97e5e31..156f455a0cc9 100644
--- a/src/utils/socket.h
+++ b/src/utils/socket.h
@@ -7,6 +7,7 @@
  */
 #if defined(_WIN32)
 #include <winsock2.h>
+#include <ws2tcpip.h>
 #else
 #include <fcntl.h>
 #include <netdb.h>
@@ -25,8 +26,10 @@ namespace xgboost {
 namespace utils {
 #if defined(_WIN32)
 typedef int ssize_t;
+typedef int sock_size_t;
 #else
 typedef int SOCKET;
+typedef size_t sock_size_t;
 const int INVALID_SOCKET = -1;
 #endif
 
@@ -63,7 +66,11 @@ struct SockAddr {
   /*! \return a string representation of the address */
   inline std::string AddrStr(void) const {
     std::string buf; buf.resize(256);
-    const char *s = inet_ntop(AF_INET, &addr.sin_addr, &buf[0], buf.length());
+#ifdef _WIN32
+    const char *s = inet_ntop(AF_INET, (PVOID)&addr.sin_addr, &buf[0], buf.length());
+#else
+	const char *s = inet_ntop(AF_INET, &addr.sin_addr, &buf[0], buf.length());
+#endif
     Assert(s != NULL, "cannot decode address");
     return std::string(s);
   }
@@ -103,11 +110,22 @@ class TCPSocket {
    *   call this before using the sockets
    */
   inline static void Startup(void) {
+#ifdef _WIN32
+	WSADATA wsa_data;
+    if (WSAStartup(MAKEWORD(2, 2), &wsa_data) != -1) {
+	  SockError("Startup");
+	}
+    if (LOBYTE(wsa_data.wVersion) != 2 || HIBYTE(wsa_data.wVersion) != 2) {
+	  WSACleanup();
+      utils::Error("Could not find a usable version of Winsock.dll\n");
+    }
+#endif
   }
   /*! 
    * \brief shutdown the socket module after use, all sockets need to be closed
    */  
   inline static void Finalize(void) {
+    WSACleanup();
   }
   /*! 
    * \brief set this socket to use non-blocking mode
@@ -207,9 +225,10 @@ class TCPSocket {
    * \param flags extra flags
    * \return size of data actually sent
    */
-  inline size_t Send(const void *buf, size_t len, int flag = 0) {
+  inline size_t Send(const void *buf_, size_t len, int flag = 0) {
+	const char *buf = reinterpret_cast<const char*>(buf_);
     if (len == 0) return 0;
-    ssize_t ret = send(sockfd, buf, len, flag);
+    ssize_t ret = send(sockfd, buf, static_cast<sock_size_t>(len), flag);
     if (ret == -1) {
       if (errno == EAGAIN || errno == EWOULDBLOCK) return 0;
       SockError("Send");
@@ -218,14 +237,15 @@ class TCPSocket {
   }  
   /*! 
    * \brief receive data using the socket 
-   * \param buf the pointer to the buffer
+   * \param buf_ the pointer to the buffer
    * \param len the size of the buffer
    * \param flags extra flags
    * \return size of data actually received 
    */
-  inline size_t Recv(void *buf, size_t len, int flags = 0) {
+  inline size_t Recv(void *buf_, size_t len, int flags = 0) {
+	char *buf = reinterpret_cast<char*>(buf_);
     if (len == 0) return 0;    
-    ssize_t ret = recv(sockfd, buf, len, flags);
+    ssize_t ret = recv(sockfd, buf, static_cast<sock_size_t>(len), flags);
     if (ret == -1) {
       if (errno == EAGAIN || errno == EWOULDBLOCK) return 0;
       SockError("Recv");
@@ -264,7 +284,7 @@ class TCPSocket {
     char *buf = reinterpret_cast<char*>(buf_);
     size_t ndone = 0;
     while (ndone <  len) {
-      ssize_t ret = recv(sockfd, buf, len - ndone, MSG_WAITALL);
+      ssize_t ret = recv(sockfd, buf, static_cast<sock_size_t>(len - ndone), MSG_WAITALL);
       if (ret == -1) {
         if (errno == EAGAIN || errno == EWOULDBLOCK) return ndone;
         SockError("Recv");
@@ -310,14 +330,14 @@ struct SelectHelper {
    * \param fd file descriptor to check status
    */
   inline bool CheckRead(SOCKET fd) const {
-    return FD_ISSET(fd, &read_set);
+    return FD_ISSET(fd, &read_set) != 0;
   }
   /*!
    * \brief Check if the descriptor is ready for write
    * \param fd file descriptor to check status
    */
   inline bool CheckWrite(SOCKET fd) const {
-    return FD_ISSET(fd, &write_set);
+    return FD_ISSET(fd, &write_set) != 0;
   }
   /*!
    * \brief clear all the monitored descriptors
@@ -343,12 +363,12 @@ struct SelectHelper {
     }
     int ret;
     if (timeout == 0) {
-      ret = select(maxfd + 1, &read_set, &write_set, NULL, NULL);
+      ret = select(static_cast<int>(maxfd + 1), &read_set, &write_set, NULL, NULL);
     } else {
       timeval tm;
       tm.tv_usec = (timeout % 1000) * 1000;
       tm.tv_sec = timeout / 1000;
-      ret = select(maxfd + 1, &read_set, &write_set, NULL, &tm);
+      ret = select(static_cast<int>(maxfd + 1), &read_set, &write_set, NULL, &tm);
     }
     if (ret == -1) {
       int errsv = errno;
@@ -358,9 +378,9 @@ struct SelectHelper {
   }
   
  private:
-  int maxfd; 
+  SOCKET maxfd; 
   fd_set read_set, write_set;
-  std::vector<int> read_fds, write_fds;
+  std::vector<SOCKET> read_fds, write_fds;
 };
 }
 }
diff --git a/windows/xgboost/xgboost.vcxproj b/windows/xgboost/xgboost.vcxproj
index 82270393fd51..ab6216d74fe9 100644
--- a/windows/xgboost/xgboost.vcxproj
+++ b/windows/xgboost/xgboost.vcxproj
@@ -112,6 +112,7 @@
       <GenerateDebugInformation>true</GenerateDebugInformation>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
+      <AdditionalDependencies>ws2_32.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
diff --git a/windows/xgboost_wrapper/xgboost_wrapper.vcxproj b/windows/xgboost_wrapper/xgboost_wrapper.vcxproj
index 6c73e3ceed36..e78c6e38c42c 100644
--- a/windows/xgboost_wrapper/xgboost_wrapper.vcxproj
+++ b/windows/xgboost_wrapper/xgboost_wrapper.vcxproj
@@ -21,6 +21,7 @@
   <ItemGroup>
     <ClCompile Include="..\..\src\gbm\gbm.cpp" />
     <ClCompile Include="..\..\src\io\io.cpp" />
+    <ClCompile Include="..\..\src\sync\sync_tcp.cpp" />
     <ClCompile Include="..\..\src\tree\updater.cpp" />
     <ClCompile Include="..\..\wrapper\xgboost_wrapper.cpp" />
   </ItemGroup>
@@ -112,6 +113,7 @@
       <GenerateDebugInformation>true</GenerateDebugInformation>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
+      <AdditionalDependencies>ws2_32.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp
index 63fb310c6a95..d7c824b8aa85 100644
--- a/wrapper/xgboost_wrapper.cpp
+++ b/wrapper/xgboost_wrapper.cpp
@@ -8,7 +8,9 @@
 #include <algorithm>
 // include all std functions
 using namespace std;
-
+#ifdef _MSC_VER
+#define isnan(x) (_isnan(x) != 0)
+#endif
 #include "./xgboost_wrapper.h"
 #include "../src/data.h"
 #include "../src/learner/learner-inl.hpp"
@@ -149,7 +151,7 @@ extern "C"{
                                bst_ulong nrow,
                                bst_ulong ncol,
                                float  missing) {
-    bool nan_missing = std::isnan(missing);
+    bool nan_missing = isnan(missing);
     DMatrixSimple *p_mat = new DMatrixSimple();
     DMatrixSimple &mat = *p_mat;
     mat.info.info.num_row = nrow;
@@ -157,7 +159,7 @@ extern "C"{
     for (bst_ulong i = 0; i < nrow; ++i, data += ncol) {
       bst_ulong nelem = 0;
       for (bst_ulong j = 0; j < ncol; ++j) {
-        if (std::isnan(data[j])) {
+        if (isnan(data[j])) {
           utils::Check(nan_missing, "There are NAN in the matrix, however, you did not set missing=NAN");          
         } else {
           if (nan_missing || data[j] != missing) {

From 35bf2101fe7a06d244835e5cd74b0be553e26c71 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 23 Nov 2014 22:18:28 -0800
Subject: [PATCH 117/166] seems a prob in win

---
 src/utils/socket.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/utils/socket.h b/src/utils/socket.h
index 156f455a0cc9..ac3e96665296 100644
--- a/src/utils/socket.h
+++ b/src/utils/socket.h
@@ -125,7 +125,9 @@ class TCPSocket {
    * \brief shutdown the socket module after use, all sockets need to be closed
    */  
   inline static void Finalize(void) {
+#ifdef _WIN32
     WSACleanup();
+#endif
   }
   /*! 
    * \brief set this socket to use non-blocking mode

From 3e162ceda6208bbaf37cc18599c9f909cf36391e Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 23 Nov 2014 22:21:15 -0800
Subject: [PATCH 118/166] windows strange

---
 src/tree/model.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/tree/model.h b/src/tree/model.h
index a330e2960e20..621edf1140ae 100644
--- a/src/tree/model.h
+++ b/src/tree/model.h
@@ -304,11 +304,12 @@ class TreeModel {
     }
     // chg deleted nodes
     deleted_nodes.resize(0);
-    for (int i = param.num_roots; i < param.num_nodes; i ++) {
+    for (int i = param.num_roots; i < param.num_nodes; ++i) {
       if (nodes[i].is_deleted()) deleted_nodes.push_back(i);
     }
     utils::Assert(static_cast<int>(deleted_nodes.size()) == param.num_deleted,
-                  "number of deleted nodes do not match");
+                  "number of deleted nodes do not match, num_deleted=%d, dnsize=%lu, num_nodes=%d",
+                  param.num_deleted, deleted_nodes.size(), param.num_nodes);
   }
   /*! 
    * \brief save model to stream

From f805ecb5f3164b7b2851933a1e751c074d8cec5f Mon Sep 17 00:00:00 2001
From: Tianqi Chen <workcrow@gmail.com>
Date: Sun, 23 Nov 2014 22:35:30 -0800
Subject: [PATCH 119/166] fix a bug in node sindex set

---
 src/tree/model.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/tree/model.h b/src/tree/model.h
index 621edf1140ae..bc7035145d7f 100644
--- a/src/tree/model.h
+++ b/src/tree/model.h
@@ -70,6 +70,7 @@ class TreeModel {
   /*! \brief tree node */
   class Node {
    public:
+	Node(void) : sindex_(0) {}
     /*! \brief index of left child */
     inline int cleft(void) const {
       return this->cleft_;

From f53be2884a720f2f26a06027981b6c038729624f Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 23 Nov 2014 22:42:44 -0800
Subject: [PATCH 120/166] ok

---
 R-package/src/Makevars     | 2 +-
 R-package/src/Makevars.win | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R-package/src/Makevars b/R-package/src/Makevars
index 289f1a15a435..5cea1438c886 100644
--- a/R-package/src/Makevars
+++ b/R-package/src/Makevars
@@ -4,4 +4,4 @@ PKGROOT=../../
 PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
 PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
 PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
-OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
+OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/src/sync/sync_empty.o
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index 289f1a15a435..5cea1438c886 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -4,4 +4,4 @@ PKGROOT=../../
 PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
 PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
 PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
-OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
+OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/src/sync/sync_empty.o

From cdcfa5687aca97028d0a440f3dee371adbdbdb28 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sun, 23 Nov 2014 22:46:57 -0800
Subject: [PATCH 121/166] Update socket.h

---
 src/utils/socket.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/utils/socket.h b/src/utils/socket.h
index ac3e96665296..73caa2feabb1 100644
--- a/src/utils/socket.h
+++ b/src/utils/socket.h
@@ -69,7 +69,7 @@ struct SockAddr {
 #ifdef _WIN32
     const char *s = inet_ntop(AF_INET, (PVOID)&addr.sin_addr, &buf[0], buf.length());
 #else
-	const char *s = inet_ntop(AF_INET, &addr.sin_addr, &buf[0], buf.length());
+    const char *s = inet_ntop(AF_INET, &addr.sin_addr, &buf[0], buf.length());
 #endif
     Assert(s != NULL, "cannot decode address");
     return std::string(s);
@@ -111,12 +111,12 @@ class TCPSocket {
    */
   inline static void Startup(void) {
 #ifdef _WIN32
-	WSADATA wsa_data;
+    WSADATA wsa_data;
     if (WSAStartup(MAKEWORD(2, 2), &wsa_data) != -1) {
-	  SockError("Startup");
-	}
+      SockError("Startup");
+    }
     if (LOBYTE(wsa_data.wVersion) != 2 || HIBYTE(wsa_data.wVersion) != 2) {
-	  WSACleanup();
+      WSACleanup();
       utils::Error("Could not find a usable version of Winsock.dll\n");
     }
 #endif
@@ -136,10 +136,10 @@ class TCPSocket {
    */
   inline void SetNonBlock(bool non_block) {
 #ifdef _WIN32  
-	u_long mode = non_block ? 1 : 0;
-	if (ioctlsocket(sockfd, FIONBIO, &mode) != NO_ERROR) {
+    u_long mode = non_block ? 1 : 0;
+    if (ioctlsocket(sockfd, FIONBIO, &mode) != NO_ERROR) {
       SockError("SetNonBlock");
-	}
+    }
 #else
     int flag = fcntl(sockfd, F_GETFL, 0);
     if (flag == -1) {

From 5ae99372d66870d33b9c21c7249cdd9774fcbbec Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 26 Nov 2014 09:13:49 -0800
Subject: [PATCH 122/166] Update simple_dmatrix-inl.hpp

---
 src/io/simple_dmatrix-inl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp
index f3cf6425e017..a793c779fb28 100644
--- a/src/io/simple_dmatrix-inl.hpp
+++ b/src/io/simple_dmatrix-inl.hpp
@@ -159,7 +159,7 @@ class DMatrixSimple : public DataMatrix {
   inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) {
     int tmagic;
     utils::Check(fs.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
-    utils::Check(tmagic == kMagic, "invalid format,magic number mismatch");
+    utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch", fname == NULL ? "" : fname);
 
     info.LoadBinary(fs);
     FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_);

From 8e16cc4617ca9ea1eea0b8e61479a52a063b959d Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 20 Dec 2014 00:17:09 -0800
Subject: [PATCH 123/166] change allreduce lib to rabit library, xgboost now
 run with rabit

---
 Makefile                                      |  34 +-
 multi-node/README.md                          |  20 +-
 multi-node/col-split/README.md                |  11 +-
 .../{mushroom-col.sh => mushroom-col-mpi.sh}  |   0
 multi-node/col-split/mushroom-col-python.sh   |   2 +-
 ...hroom-col-tcp.sh => mushroom-col-rabit.sh} |   8 +-
 multi-node/col-split/mushroom-col.py          |   6 +-
 multi-node/row-split/README.md                |   8 +-
 .../{machine-row.sh => machine-row-mpi.sh}    |   0
 multi-node/row-split/machine-row-tcp.sh       |  24 -
 .../{mushroom-row.sh => mushroom-row-mpi.sh}  |   0
 multi-node/submit_job_tcp.py                  |  36 --
 src/learner/learner-inl.hpp                   |  44 +-
 src/sync/sync.h                               | 201 -------
 src/sync/sync_empty.cpp                       |  50 --
 src/sync/sync_mpi.cpp                         | 116 ----
 src/sync/sync_tcp.cpp                         | 537 ------------------
 src/sync/tcp_master.py                        | 106 ----
 src/tree/updater.cpp                          |   2 +-
 src/tree/updater_basemaker-inl.hpp            |   9 +-
 src/tree/updater_distcol-inl.hpp              |   9 +-
 src/tree/updater_histmaker-inl.hpp            |  18 +-
 src/tree/updater_refresh-inl.hpp              |   6 +-
 src/tree/updater_skmaker-inl.hpp              |  12 +-
 src/tree/updater_sync-inl.hpp                 |  12 +-
 src/utils/quantile.h                          |   6 +-
 src/xgboost_main.cpp                          |  20 +-
 wrapper/xgboost_wrapper.cpp                   |  14 +-
 28 files changed, 105 insertions(+), 1206 deletions(-)
 rename multi-node/col-split/{mushroom-col.sh => mushroom-col-mpi.sh} (100%)
 rename multi-node/col-split/{mushroom-col-tcp.sh => mushroom-col-rabit.sh} (69%)
 rename multi-node/row-split/{machine-row.sh => machine-row-mpi.sh} (100%)
 delete mode 100755 multi-node/row-split/machine-row-tcp.sh
 rename multi-node/row-split/{mushroom-row.sh => mushroom-row-mpi.sh} (100%)
 delete mode 100755 multi-node/submit_job_tcp.py
 delete mode 100644 src/sync/sync.h
 delete mode 100644 src/sync/sync_empty.cpp
 delete mode 100644 src/sync/sync_mpi.cpp
 delete mode 100644 src/sync/sync_tcp.cpp
 delete mode 100644 src/sync/tcp_master.py

diff --git a/Makefile b/Makefile
index f11c20e2168f..12d2507e291b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
 export CC  = gcc
 export CXX = g++
 export MPICXX = mpicxx
-export LDFLAGS= -pthread -lm 
-export CFLAGS = -Wall -O3 -msse2  -Wno-unknown-pragmas -fPIC 
+export LDFLAGS= -Lrabit/lib -pthread -lm 
+export CFLAGS = -Wall -O3 -msse2  -Wno-unknown-pragmas -fPIC  -Irabit/src
 
 ifeq ($(no_omp),1)
 	CFLAGS += -DDISABLE_OPENMP 
@@ -12,43 +12,47 @@ endif
 
 # specify tensor path
 BIN = xgboost
-OBJ = updater.o gbm.o io.o main.o sync_empty.o sync_tcp.o
-MPIOBJ = sync_mpi.o
+OBJ = updater.o gbm.o io.o main.o 
 MPIBIN = xgboost-mpi
 SLIB = wrapper/libxgboostwrapper.so 
 
-.PHONY: clean all mpi python Rpack
+.PHONY: clean all mpi python Rpack librabit librabit_mpi
 
 all: $(BIN) $(OBJ) $(SLIB) mpi
 mpi: $(MPIBIN)
 
+# rules to get rabit library
+librabit:
+	if [ ! -d rabit ]; then git clone https://github.com/tqchen/rabit.git; fi
+	cd rabit;make lib/librabit.a; cd -
+librabit_mpi:
+	if [ ! -d rabit ]; then git clone https://github.com/tqchen/rabit.git; fi
+	cd rabit;make lib/librabit_mpi.a; cd -
+
 python: wrapper/libxgboostwrapper.so
 # now the wrapper takes in two files. io and wrapper part
 updater.o: src/tree/updater.cpp  src/tree/*.hpp src/*.h src/tree/*.h src/utils/*.h
 gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h 
 io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
-sync_mpi.o: src/sync/sync_mpi.cpp
-sync_tcp.o: src/sync/sync_tcp.cpp
-sync_empty.o: src/sync/sync_empty.cpp 
 main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h 
-xgboost-mpi:  updater.o gbm.o io.o main.o sync_mpi.o 
-xgboost:  updater.o gbm.o io.o main.o sync_tcp.o
-wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h  updater.o gbm.o io.o sync_tcp.o
+xgboost-mpi:  updater.o gbm.o io.o main.o librabit_mpi
+xgboost:  updater.o gbm.o io.o main.o  librabit
+wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h  updater.o gbm.o io.o librabit
 
 $(BIN) : 
-	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
+	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS)  -lrabit
 
 $(SLIB) :
-	$(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)
+	$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS)  -lrabit
 
 $(OBJ) : 
 	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
 
 $(MPIOBJ) : 
-	$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
+	$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) 
 
 $(MPIBIN) : 
-	$(MPICXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
+	$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) -lrabit_mpi
 
 install:
 	cp -f -r $(BIN)  $(INSTALL_PATH)
diff --git a/multi-node/README.md b/multi-node/README.md
index d1e6418481a6..31067af5d6eb 100644
--- a/multi-node/README.md
+++ b/multi-node/README.md
@@ -4,20 +4,16 @@ This folder contains information about experimental version of distributed xgboo
 
 Build
 =====
-* In the root folder, run ```make mpi```, this will give you xgboost-mpi
+* In the root folder, run ```make```, this will give you xgboost, which uses rabit allreduce
+  - this version of xgboost should be fault tolerant eventually
+* Alternatively, run ```make mpi```, this will give you xgboost-mpi
   - You will need to have MPI to build xgboost-mpi
-* Alternatively, you can run ```make```, this will give you xgboost, which uses a beta buildin allreduce
-  - You do not need MPI to build this, you can modify [submit_job_tcp.py](submit_job_tcp.py) to use any job scheduler you like to submit the job
 
 Design Choice
 =====
-* Does distributed xgboost must reply on MPI library?
-  - No, XGBoost replies on MPI protocol that provide Broadcast and AllReduce,
-  - The dependency is isolated in [sync module](../src/sync/sync.h)
-  - All other parts of code uses interface defined in sync.h
-  - [sync_mpi.cpp](../src/sync/sync_mpi.cpp) is a implementation of sync interface using standard MPI library, to use xgboost-mpi, you need an MPI library
-  - If there are platform/framework that implements these protocol, xgboost should naturally extends to these platform
-  - As an example, [sync_tcp.cpp](../src/sync/sync_tcp.cpp) is an implementation of interface using TCP, and is linked with xgboost by default
+* XGBoost relies on the [Rabit Library](https://github.com/tqchen/rabit)
+* Rabit is a fault-tolerant and portable allreduce library that provides Allreduce and Broadcast
+* Since rabit is compatible with MPI, xgboost can be compiled using MPI backend
 
 * How is the data distributed?
   - There are two solvers in distributed xgboost
@@ -27,12 +23,10 @@ Design Choice
     it uses an approximate histogram count algorithm, and will only examine subset of 
     potential split points as opposed to all split points.
 
-
 Usage
 ====
 * You will need a network filesystem, or copy data to local file system before running the code
-* xgboost-mpi run in MPI enviroment, 
-* xgboost can be used together with [submit_job_tcp.py](submit_job_tcp.py) on other types of job schedulers
+* xgboost can be used together with the submission scripts provided in Rabit on different types of job schedulers
 * ***Note*** The distributed version is still multi-threading optimized.
     You should run one process per node that takes most available CPU,
     this will reduce the communication overhead and improve the performance.
diff --git a/multi-node/col-split/README.md b/multi-node/col-split/README.md
index cf6622b53131..04227d1eb88a 100644
--- a/multi-node/col-split/README.md
+++ b/multi-node/col-split/README.md
@@ -1,12 +1,9 @@
 Distributed XGBoost: Column Split Version
 ====
-* run ```bash mushroom-col.sh <n-mpi-process>```
+* run ```bash mushroom-col-rabit.sh <n-process>```
+  - mushroom-col-rabit.sh starts an xgboost job using rabit's allreduce
+* run ```bash mushroom-col-mpi.sh <n-mpi-process>```
   - mushroom-col.sh starts xgboost-mpi job
-* run ```bash mushroom-col-tcp.sh <n-process>```
-  - mushroom-col-tcp.sh starts xgboost job using xgboost's buildin allreduce 
-* run ```bash mushroom-col-python.sh <n-process>```
-  - mushroom-col-python.sh starts xgboost python job using xgboost's buildin all reduce
-  - see mushroom-col.py
 
 How to Use
 ====
@@ -16,7 +13,7 @@ How to Use
 
 Notes
 ====
-* The code is multi-threaded, so you want to run one xgboost-mpi per node
+* The code is multi-threaded, so you want to run one process per node
 * The code will work correctly as long as union of each column subset is all the columns we are interested in.
   - The column subset can overlap with each other.
 * It uses exactly the same algorithm as single node version, to examine all potential split points.
diff --git a/multi-node/col-split/mushroom-col.sh b/multi-node/col-split/mushroom-col-mpi.sh
similarity index 100%
rename from multi-node/col-split/mushroom-col.sh
rename to multi-node/col-split/mushroom-col-mpi.sh
diff --git a/multi-node/col-split/mushroom-col-python.sh b/multi-node/col-split/mushroom-col-python.sh
index 45008a1b47dd..8551ee4653a9 100755
--- a/multi-node/col-split/mushroom-col-python.sh
+++ b/multi-node/col-split/mushroom-col-python.sh
@@ -17,6 +17,6 @@ k=$1
 python splitsvm.py ../../demo/data/agaricus.txt.train train $k
 
 # run xgboost mpi
-../submit_job_tcp.py $k python mushroom-col.py
+../../rabit/tracker/rabit_mpi.py $k local python mushroom-col.py
 
 cat dump.nice.$k.txt
diff --git a/multi-node/col-split/mushroom-col-tcp.sh b/multi-node/col-split/mushroom-col-rabit.sh
similarity index 69%
rename from multi-node/col-split/mushroom-col-tcp.sh
rename to multi-node/col-split/mushroom-col-rabit.sh
index 7257f98907da..b9595e5b7612 100755
--- a/multi-node/col-split/mushroom-col-tcp.sh
+++ b/multi-node/col-split/mushroom-col-rabit.sh
@@ -16,13 +16,13 @@ k=$1
 python splitsvm.py ../../demo/data/agaricus.txt.train train $k
 
 # run xgboost mpi
-../submit_job_tcp.py $k ../../xgboost mushroom-col.conf dsplit=col
+../../rabit/tracker/rabit_mpi.py $k local ../../xgboost mushroom-col.conf dsplit=col
 
 # the model can be directly loaded by single machine xgboost solver, as usuall
 ../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
 
 # run for one round, and continue training
-../submit_job_tcp.py $k ../../xgboost mushroom-col.conf dsplit=col num_round=1
-../submit_job_tcp.py $k ../../xgboost mushroom-col.conf dsplit=col model_in=0001.model
+../../rabit/tracker/rabit_mpi.py $k local  ../../xgboost mushroom-col.conf dsplit=col num_round=1
+../../rabit/tracker/rabit_mpi.py $k local  ../../xgboost mushroom-col.conf dsplit=col model_in=0001.model
 
-cat dump.nice.$k.txt
\ No newline at end of file
+cat dump.nice.$k.txt
diff --git a/multi-node/col-split/mushroom-col.py b/multi-node/col-split/mushroom-col.py
index 3e24f5f2c9c7..a905aff5c08b 100644
--- a/multi-node/col-split/mushroom-col.py
+++ b/multi-node/col-split/mushroom-col.py
@@ -1,6 +1,10 @@
 import os
 import sys
-sys.path.append(os.path.dirname(__file__)+'/../wrapper')
+path = os.path.dirname(__file__)
+if path == '':
+    path = '.'
+sys.path.append(path+'/../../wrapper')
+
 import xgboost as xgb
 # this is example script of running distributed xgboost using python
 
diff --git a/multi-node/row-split/README.md b/multi-node/row-split/README.md
index 807b0608def6..46656644dbd3 100644
--- a/multi-node/row-split/README.md
+++ b/multi-node/row-split/README.md
@@ -1,10 +1,10 @@
 Distributed XGBoost: Row Split Version
 ====
-* Mushroom: run ```bash mushroom-row.sh <n-mpi-process>```
-* Machine: run ```bash machine-row.sh <n-mpi-process>```
+* Machine Rabit: run ```bash machine-row-rabit.sh <n-mpi-process>```
+  - machine-row-rabit.sh starts an xgboost job using rabit
+* Mushroom: run ```bash mushroom-row-mpi.sh <n-mpi-process>```
+* Machine: run ```bash machine-row-mpi.sh <n-mpi-process>```
   - Machine case also include example to continue training from existing model
-* Machine TCP: run ```bash machine-row-tcp.sh <n-mpi-process>```
-  - machine-col-tcp.sh starts xgboost job using xgboost's buildin allreduce 
 
 How to Use
 ====
diff --git a/multi-node/row-split/machine-row.sh b/multi-node/row-split/machine-row-mpi.sh
similarity index 100%
rename from multi-node/row-split/machine-row.sh
rename to multi-node/row-split/machine-row-mpi.sh
diff --git a/multi-node/row-split/machine-row-tcp.sh b/multi-node/row-split/machine-row-tcp.sh
deleted file mode 100755
index c312eb3a52ce..000000000000
--- a/multi-node/row-split/machine-row-tcp.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-if [[ $# -ne 1 ]]
-then
-    echo "Usage: nprocess"
-    exit -1
-fi
-
-rm -rf train-machine.row* *.model
-k=$1
-# make machine data
-cd ../../demo/regression/
-python mapfeat.py
-python mknfold.py machine.txt 1
-cd -
-
-# split the lib svm file into k subfiles
-python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
-
-# run xgboost mpi
-../submit_job_tcp.py $k ../../xgboost machine-row.conf dsplit=row num_round=3
-
-# run xgboost-mpi save model 0001, continue to run from existing model
-../submit_job_tcp.py $k ../../xgboost machine-row.conf dsplit=row num_round=1
-../submit_job_tcp.py $k ../../xgboost machine-row.conf dsplit=row num_round=2 model_in=0001.model
diff --git a/multi-node/row-split/mushroom-row.sh b/multi-node/row-split/mushroom-row-mpi.sh
similarity index 100%
rename from multi-node/row-split/mushroom-row.sh
rename to multi-node/row-split/mushroom-row-mpi.sh
diff --git a/multi-node/submit_job_tcp.py b/multi-node/submit_job_tcp.py
deleted file mode 100755
index aa415d07a682..000000000000
--- a/multi-node/submit_job_tcp.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/python
-"""
-This is an example script to create a customized job submit
-script using xgboost sync_tcp mode
-"""
-import sys
-import os
-import subprocess
-# import the tcp_master.py
-# add path to sync
-sys.path.append(os.path.dirname(__file__)+'/../src/sync/')
-import tcp_master as master
-
-#
-#  Note: this submit script is only used for example purpose
-#  It does not have to be mpirun, it can be any job submission script that starts the job, qsub, hadoop streaming etc.
-#  
-def mpi_submit(nslave, args):
-    """
-      customized submit script, that submit nslave jobs, each must contain args as parameter
-      note this can be a lambda function containing additional parameters in input
-      Parameters
-         nslave number of slave process to start up
-         args arguments to launch each job
-              this usually includes the parameters of master_uri and parameters passed into submit
-    """
-    cmd = ' '.join(['mpirun -n %d' % nslave] + args)
-    print cmd
-    subprocess.check_call(cmd, shell = True)
-
-if __name__ == '__main__':
-    if len(sys.argv) < 2:
-        print 'Usage: <nslave> <cmd>'
-        exit(0)        
-    # call submit, with nslave, the commands to run each job and submit function
-    master.submit(int(sys.argv[1]), sys.argv[2:], fun_submit= mpi_submit)
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 05ab09f98f10..6ca3b7c7aaef 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -10,7 +10,8 @@
 #include <utility>
 #include <string>
 #include <limits>
-#include "../sync/sync.h"
+// rabit library for synchronization
+#include <rabit.h>
 #include "./objective.h"
 #include "./evaluation.h"
 #include "../gbm/gbm.h"
@@ -31,7 +32,6 @@ class BoostLearner {
     name_gbm_ = "gbtree";
     silent= 0;
     prob_buffer_row = 1.0f;
-    part_load_col = 0;
     distributed_mode = 0;
     pred_buffer_size = 0;
   }
@@ -65,7 +65,7 @@ class BoostLearner {
       buffer_size += mats[i]->info.num_row();
       num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->info.num_col()));
     }
-    sync::AllReduce(&num_feature, 1, sync::kMax);
+    rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
     char str_temp[25];
     if (num_feature > mparam.num_feature) {
       utils::SPrintf(str_temp, sizeof(str_temp), "%u", num_feature);
@@ -103,7 +103,6 @@ class BoostLearner {
         utils::Error("%s is invalid value for dsplit, should be row or col", val);
       }
     }
-    if (!strcmp(name, "part_load_col")) part_load_col = atoi(val);
     if (!strcmp(name, "prob_buffer_row")) {
       prob_buffer_row = static_cast<float>(atof(val));
       utils::Check(distributed_mode == 0,
@@ -153,7 +152,7 @@ class BoostLearner {
     if (gbm_ != NULL) delete gbm_;
     this->InitObjGBM();
     gbm_->LoadModel(fi);
-    if (keep_predbuffer && distributed_mode == 2 && sync::GetRank() != 0) {
+    if (keep_predbuffer && distributed_mode == 2 && rabit::GetRank() != 0) {
       gbm_->ResetPredBuffer(pred_buffer_size);
     }
   }
@@ -188,38 +187,7 @@ class BoostLearner {
    */
   inline void CheckInit(DMatrix *p_train) {
     int ncol = static_cast<int>(p_train->info.info.num_col);    
-    std::vector<bool> enabled(ncol, true);
-    
-    if (part_load_col != 0) {      
-      std::vector<unsigned> col_index;
-      for (int i = 0; i < ncol; ++i) {
-        col_index.push_back(i);
-      }
-      random::Shuffle(col_index);
-      std::string s_model;
-      utils::MemoryBufferStream ms(&s_model);
-      utils::IStream &fs = ms;
-      if (sync::GetRank() == 0) {
-        fs.Write(col_index);
-        sync::Bcast(&s_model, 0);
-      } else {
-        sync::Bcast(&s_model, 0);
-        fs.Read(&col_index);
-      }
-      int nsize = sync::GetWorldSize();
-      int step = (ncol + nsize -1) / nsize;
-      int pid = sync::GetRank();
-      std::fill(enabled.begin(), enabled.end(), false);
-      int start = step * pid;
-      int end = std::min(step * (pid + 1), ncol);
-      std::string name = sync::GetProcessorName();
-      utils::Printf("rank %d of %s idset:", pid, name.c_str());
-      for (int i = start; i < end; ++i) {
-        enabled[col_index[i]] = true;
-        utils::Printf(" %u", col_index[i]);
-      }
-      utils::Printf("\n");
-    }
+    std::vector<bool> enabled(ncol, true);    
     // initialize column access
     p_train->fmat()->InitColAccess(enabled, prob_buffer_row);    
   }
@@ -380,8 +348,6 @@ class BoostLearner {
   int silent;
   // distributed learning mode, if any, 0:none, 1:col, 2:row
   int distributed_mode;
-  // randomly load part of data
-  int part_load_col;
   // cached size of predict buffer
   size_t pred_buffer_size;
   // maximum buffred row value
diff --git a/src/sync/sync.h b/src/sync/sync.h
deleted file mode 100644
index 2e14f2807b7c..000000000000
--- a/src/sync/sync.h
+++ /dev/null
@@ -1,201 +0,0 @@
-#ifndef XGBOOST_SYNC_SYNC_H_
-#define XGBOOST_SYNC_SYNC_H_
-/*!
- * \file sync.h
- * \brief interface to do synchronization
- * \author Tianqi Chen
- */
-#include <cstdio>
-#include <cstring>
-#include <string>
-
-#include "../utils/utils.h"
-#include "../utils/io.h"
-
-namespace MPI {
-// forward delcaration of MPI::Datatype, but not include content
-class Datatype;
-};
-namespace xgboost {
-/*! \brief syncrhonizer module that minimumly wraps interface of MPI */
-namespace sync {
-/*! \brief reduce operator supported */
-enum ReduceOp {
-  kSum,
-  kMax,
-  kBitwiseOR
-};
-
-/*! \brief get rank of current process */
-int GetRank(void);
-/*! \brief get total number of process */
-int GetWorldSize(void);
-/*! \brief get name of processor */
-std::string GetProcessorName(void);
-
-/*! 
- * \brief this is used to check if sync module is a true distributed implementation, or simply a dummpy
- */
-bool IsDistributed(void);
-/*! \brief intiialize the synchronization module */
-void Init(int argc, char *argv[]);
-/*! \brief finalize syncrhonization module */
-void Finalize(void);
-
-/*!
- * \brief in-place all reduce operation 
- * \param sendrecvbuf the in place send-recv buffer
- * \param count count of data
- * \param op reduction function
- */
-template<typename DType>
-void AllReduce(DType *sendrecvbuf, size_t count, ReduceOp op);
-
-/*!
- * \brief broadcast an std::string to all others from root
- * \param sendrecv_data the pointer to send or recive buffer,
- *                      receive buffer does not need to be pre-allocated
- *                      and string will be resized to correct length
- * \param root the root of process
- */
-void Bcast(std::string *sendrecv_data, int root);
-
-/*! 
- * \brief handle for customized reducer 
- * user do not need to use this, used Reducer instead
- */
-class ReduceHandle {
- public:
-  // reduce function
-  typedef void (ReduceFunction) (const void *src, void *dst, int len, const MPI::Datatype &dtype);
-  // constructor
-  ReduceHandle(void);
-  // destructor
-  ~ReduceHandle(void);
-  /*!
-   * \brief initialize the reduce function, with the type the reduce function need to deal with   
-   */
-  void Init(ReduceFunction redfunc, size_t type_n4bytes, bool commute = true);
-  /*!
-   * \brief customized in-place all reduce operation 
-   * \param sendrecvbuf the in place send-recv buffer
-   * \param type_n4bytes unit size of the type, in terms of 4bytes
-   * \param count number of elements to send
-   */
-  void AllReduce(void *sendrecvbuf, size_t type_n4bytes, size_t count);
-  /*! \return the number of bytes occupied by the type */
-  static int TypeSize(const MPI::Datatype &dtype);
-
- protected:
-  // handle data field
-  void *handle;
-  // handle to the type field
-  void *htype;
-  // the created type in 4 bytes
-  size_t created_type_n4bytes;
-};
-
-// ----- extensions for ease of use ------
-/*!
- * \brief template class to make customized reduce and all reduce easy  
- * Do not use reducer directly in the function you call Finalize, because the destructor can happen after Finalize
- * \tparam DType data type that to be reduced
- *   DType must be a struct, with no pointer, and contains a function Reduce(const DType &d);
- */
-template<typename DType>
-class Reducer {
- public:
-  Reducer(void) {
-    handle.Init(ReduceInner, kUnit);
-    utils::Assert(sizeof(DType) % sizeof(int) == 0, "struct must be multiple of int");
-  }
-  /*!
-   * \brief customized in-place all reduce operation 
-   * \param sendrecvbuf the in place send-recv buffer
-   * \param bytes number of 4bytes send through all reduce
-   * \param reducer the reducer function
-   */
-  inline void AllReduce(DType *sendrecvbuf, size_t count) {
-    handle.AllReduce(sendrecvbuf, kUnit, count);
-  }
-
- private:
-  // unit size 
-  static const size_t kUnit = sizeof(DType) / sizeof(int);
-  // inner implementation of reducer
-  inline static void ReduceInner(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) {
-    const int *psrc = reinterpret_cast<const int*>(src_);
-    int *pdst = reinterpret_cast<int*>(dst_);
-    DType tdst, tsrc;
-    for (size_t i = 0; i < len_; ++i) {
-      // use memcpy to avoid alignment issue
-      std::memcpy(&tdst, pdst + i * kUnit, sizeof(tdst));
-      std::memcpy(&tsrc, psrc + i * kUnit, sizeof(tsrc));
-      tdst.Reduce(tsrc);
-      std::memcpy(pdst + i * kUnit, &tdst, sizeof(tdst));      
-    }
-  }
-  // function handle
-  ReduceHandle handle;
-};
-
-/*!
- * \brief template class to make customized reduce, complex reducer handles all the data structure that can be 
- *        serialized/deserialzed into fixed size buffer
- * Do not use reducer directly in the function you call Finalize, because the destructor can happen after Finalize
- * 
- * \tparam DType data type that to be reduced, DType must contain following functions:
- *   (1) Save(IStream &fs)  (2) Load(IStream &fs) (3) Reduce(const DType &d);
- */
-template<typename DType>
-class SerializeReducer {
- public:
-  SerializeReducer(void) {
-    handle.Init(ReduceInner, 0);
-  }
-  /*!
-   * \brief customized in-place all reduce operation
-   * \param sendrecvobj pointer to the object to be reduced
-   * \param max_n4byte maximum amount of memory needed in 4byte
-   * \param reducer the reducer function
-   */
-  inline void AllReduce(DType *sendrecvobj, size_t max_n4byte, size_t count) {
-    buffer.resize(max_n4byte * count);
-    for (size_t i = 0; i < count; ++i) {
-      utils::MemoryFixSizeBuffer fs(BeginPtr(buffer) + i * max_n4byte, max_n4byte * 4);
-      sendrecvobj[i].Save(fs);
-    }
-    handle.AllReduce(BeginPtr(buffer), max_n4byte, count);
-    for (size_t i = 0; i < count; ++i) {
-      utils::MemoryFixSizeBuffer fs(BeginPtr(buffer) + i * max_n4byte, max_n4byte * 4);
-      sendrecvobj[i].Load(fs);
-    }
-  }
-
- private:
-  // unit size
-  // inner implementation of reducer
-  inline static void ReduceInner(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) {
-    int nbytes = ReduceHandle::TypeSize(dtype);
-    // temp space
-    DType tsrc, tdst;
-    for (int i = 0; i < len_; ++i) {
-      utils::MemoryFixSizeBuffer fsrc((char*)(src_) + i * nbytes, nbytes);
-      utils::MemoryFixSizeBuffer fdst((char*)(dst_) + i * nbytes, nbytes);
-      tsrc.Load(fsrc);
-      tdst.Load(fdst);
-      // govern const check
-      tdst.Reduce(static_cast<const DType &>(tsrc), nbytes);
-      fdst.Seek(0);
-      tdst.Save(fdst);
-    }
-  }
-  // function handle
-  ReduceHandle handle;
-  // reduce buffer
-  std::vector<int> buffer;
-};
-
-}  // namespace sync
-}  // namespace xgboost
-#endif
diff --git a/src/sync/sync_empty.cpp b/src/sync/sync_empty.cpp
deleted file mode 100644
index 959a4b87a618..000000000000
--- a/src/sync/sync_empty.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#include "./sync.h"
-#include "../utils/utils.h"
-// no synchronization module, single thread mode does not need it anyway
-namespace xgboost {
-namespace sync {
-int GetRank(void) {
-  return 0;
-}
-
-void Init(int argc, char *argv[]) {
-}
-
-void Finalize(void) {
-}
-
-bool IsDistributed(void) {
-  return false;
-}
-
-int GetWorldSize(void) {
-  return 1;
-}
-
-std::string GetProcessorName(void) {
-  return std::string("");
-}
-
-template<>
-void AllReduce<uint32_t>(uint32_t *sendrecvbuf, size_t count, ReduceOp op) {
-}
-
-template<>
-void AllReduce<float>(float *sendrecvbuf, size_t count, ReduceOp op) {
-}
-
-void Bcast(std::string *sendrecv_data, int root) {
-}
-
-ReduceHandle::ReduceHandle(void) : handle(NULL) {}
-ReduceHandle::~ReduceHandle(void) {}
-int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
-  return 0;
-}
-void ReduceHandle::Init(ReduceFunction redfunc, size_t type_n4bytes, bool commute) {}
-void ReduceHandle::AllReduce(void *sendrecvbuf, size_t type_n4bytes, size_t n4byte) {}
-}  // namespace sync
-}  // namespace xgboost
-
diff --git a/src/sync/sync_mpi.cpp b/src/sync/sync_mpi.cpp
deleted file mode 100644
index d4521b6d59d3..000000000000
--- a/src/sync/sync_mpi.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-#include "./sync.h"
-#include "../utils/utils.h"
-#include <mpi.h>
-
-// use MPI to implement sync
-namespace xgboost {
-namespace sync {
-int GetRank(void) {
-  return MPI::COMM_WORLD.Get_rank();
-}
-
-int GetWorldSize(void) {
-  return MPI::COMM_WORLD.Get_size();
-}
-
-void Init(int argc, char *argv[]) {
-  MPI::Init(argc, argv);
-}
-
-bool IsDistributed(void) {
-  return true;
-}
-
-std::string GetProcessorName(void) {
-  int len;
-  char name[MPI_MAX_PROCESSOR_NAME];
-  MPI::Get_processor_name(name, len);
-  name[len] = '\0';
-  return std::string(name);
-}
-
-void Finalize(void) {
-  MPI::Finalize();
-}
-
-void AllReduce_(void *sendrecvbuf, size_t count, const MPI::Datatype &dtype, ReduceOp op) {
-  switch(op) {
-    case kBitwiseOR: MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, dtype, MPI::BOR); return;
-    case kSum: MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, dtype, MPI::SUM); return;
-    case kMax: MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, dtype, MPI::MAX); return;
-  }
-}
-
-template<>
-void AllReduce<uint32_t>(uint32_t *sendrecvbuf, size_t count, ReduceOp op) {
-  AllReduce_(sendrecvbuf, count, MPI::UNSIGNED, op);
-}
-
-template<>
-void AllReduce<float>(float *sendrecvbuf, size_t count, ReduceOp op) {
-  AllReduce_(sendrecvbuf, count, MPI::FLOAT, op);
-}
-
-void Bcast(std::string *sendrecv_data, int root) {
-  unsigned len = static_cast<unsigned>(sendrecv_data->length());
-  MPI::COMM_WORLD.Bcast(&len, 1, MPI::UNSIGNED, root);
-  sendrecv_data->resize(len);
-  if (len != 0) {
-    MPI::COMM_WORLD.Bcast(&(*sendrecv_data)[0], len, MPI::CHAR, root);  
-  }
-}
-
-// code for reduce handle
-ReduceHandle::ReduceHandle(void) : handle(NULL), htype(NULL) {
-}
-ReduceHandle::~ReduceHandle(void) {
-  if (handle != NULL) {
-    MPI::Op *op = reinterpret_cast<MPI::Op*>(handle);
-    op->Free();
-    delete op;
-  }
-  if (htype != NULL) {
-    MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype);
-    dtype->Free();
-    delete dtype;
-  }
-}
-int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
-  return dtype.Get_size();
-}
-void ReduceHandle::Init(ReduceFunction redfunc, size_t type_n4bytes, bool commute) {
-  utils::Assert(handle == NULL, "cannot initialize reduce handle twice");
-  if (type_n4bytes != 0) {
-    MPI::Datatype *dtype = new MPI::Datatype();
-    *dtype = MPI::INT.Create_contiguous(type_n4bytes);
-    dtype->Commit();
-    created_type_n4bytes = type_n4bytes;
-    htype = dtype;
-  }
-  
-  MPI::Op *op = new MPI::Op();
-  MPI::User_function *pf = redfunc;
-  op->Init(pf, commute);
-  handle = op;
-}
-void ReduceHandle::AllReduce(void *sendrecvbuf, size_t type_n4bytes, size_t count) {
-  utils::Assert(handle != NULL, "must intialize handle to call AllReduce");
-  MPI::Op *op = reinterpret_cast<MPI::Op*>(handle);
-  MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype);
-  if (created_type_n4bytes != type_n4bytes || dtype == NULL) {
-    if (dtype == NULL) {
-      dtype = new MPI::Datatype();
-    } else {
-      dtype->Free();
-    }
-    *dtype = MPI::INT.Create_contiguous(type_n4bytes);
-    dtype->Commit();
-    created_type_n4bytes = type_n4bytes;
-  }
-  MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, *dtype, *op);
-}
-}  // namespace sync
-}  // namespace xgboost
diff --git a/src/sync/sync_tcp.cpp b/src/sync/sync_tcp.cpp
deleted file mode 100644
index 01dce3dbf73f..000000000000
--- a/src/sync/sync_tcp.cpp
+++ /dev/null
@@ -1,537 +0,0 @@
-/*!
- * \file sync_tcp.cpp
- * \brief implementation of sync AllReduce using TCP sockets
- *   with use non-block socket and tree-shape reduction
- * \author Tianqi Chen
- */
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-#include <vector>
-#include <string>
-#include <cstring>
-#include "./sync.h"
-#include "../utils/socket.h"
-
-namespace MPI {
-class Datatype {
- public:
-  size_t type_size;
-  Datatype(size_t type_size) : type_size(type_size) {}
-};
-}
-namespace xgboost {
-namespace sync {
-/*! \brief implementation of sync goes to here */
-class SyncManager {  
- public:
-  const static int kMagic = 0xff99;
-  SyncManager(void) {
-    master_uri = "NULL";
-    master_port = 9000;
-    host_uri = "";
-    slave_port = 9010;
-    nport_trial = 1000;
-    rank = 0;
-    world_size = 1;
-    this->SetParam("reduce_buffer", "256MB");
-  }
-  ~SyncManager(void) {
-  }
-  inline void Shutdown(void) {
-    for (size_t i = 0; i < links.size(); ++i) {
-      links[i].sock.Close();
-    }
-    links.clear();
-    utils::TCPSocket::Finalize();
-  }
-  /*! \brief set parameters to the sync manager */
-  inline void SetParam(const char *name, const char *val) {
-    if (!strcmp(name, "master_uri")) master_uri = val;
-    if (!strcmp(name, "master_port")) master_port = atoi(val);
-    if (!strcmp(name, "reduce_buffer")) {
-      char unit;
-      unsigned long amount;
-      if (sscanf(val, "%lu%c", &amount, &unit) == 2) {
-        switch (unit) {
-          case 'B': reduce_buffer_size = (amount + 7)/ 8; break;
-          case 'K': reduce_buffer_size = amount << 7UL; break;
-          case 'M': reduce_buffer_size = amount << 17UL; break;
-          case 'G': reduce_buffer_size = amount << 27UL; break;
-          default: utils::Error("invalid format for reduce buffer");
-        }
-      } else {
-        utils::Error("invalid format for reduce_buffer, shhould be {integer}{unit}, unit can be {B, KB, MB, GB}");
-      }
-    }
-  }
-  /*! \brief get rank */
-  inline int GetRank(void) const {
-    return rank;
-  }
-  /*! \brief check whether its distributed mode */
-  inline bool IsDistributed(void) const {
-    return links.size() != 0;
-  }
-  /*! \brief get rank */
-  inline int GetWorldSize(void) const {
-    return world_size;
-  }
-  /*! \brief get rank */
-  inline std::string GetHost(void) const {
-    return host_uri;
-  }
-  // initialize the manager
-  inline void Init(void) {
-    utils::TCPSocket::Startup();
-    // single node mode
-    if (master_uri == "NULL") return;
-    utils::Assert(links.size() == 0, "can only call Init once");
-    int magic = kMagic;
-    int nchild = 0, nparent = 0;
-    this->host_uri = utils::SockAddr::GetHostName();
-    // get information from master
-    utils::TCPSocket master;
-    master.Create();
-    master.Connect(utils::SockAddr(master_uri.c_str(), master_port));
-    utils::Assert(master.SendAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure 1");
-    utils::Assert(master.RecvAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure 2");
-    utils::Check(magic == kMagic, "sync::Invalid master message, init failure");
-    utils::Assert(master.RecvAll(&rank, sizeof(rank)) == sizeof(rank), "sync::Init failure 3");
-    utils::Assert(master.RecvAll(&world_size, sizeof(world_size)) == sizeof(world_size), "sync::Init failure 4");
-    utils::Assert(master.RecvAll(&nparent, sizeof(nparent)) == sizeof(nparent), "sync::Init failure 5");
-    utils::Assert(master.RecvAll(&nchild, sizeof(nchild)) == sizeof(nchild), "sync::Init failure 6");
-    utils::Assert(nchild >= 0, "in correct number of childs");
-    utils::Assert(nparent == 1 || nparent == 0, "in correct number of parent");
-
-    // create listen
-    utils::TCPSocket sock_listen;
-    sock_listen.Create();
-    int port = sock_listen.TryBindHost(slave_port, slave_port + nport_trial);
-    utils::Check(port != -1, "sync::Init fail to bind the ports specified");
-    sock_listen.Listen();
-
-    if (nparent != 0) {
-      parent_index = 0;
-      links.push_back(LinkRecord());
-      int len, hport;
-      std::string hname;
-      utils::Assert(master.RecvAll(&len, sizeof(len)) == sizeof(len), "sync::Init failure 9");
-      hname.resize(len);
-      utils::Assert(len != 0, "string must not be empty");
-      utils::Assert(master.RecvAll(&hname[0], len) == static_cast<size_t>(len), "sync::Init failure 10");
-      utils::Assert(master.RecvAll(&hport, sizeof(hport)) == sizeof(hport), "sync::Init failure 11");
-      links[0].sock.Create();
-      links[0].sock.Connect(utils::SockAddr(hname.c_str(), hport));      
-      utils::Assert(links[0].sock.SendAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure 12");
-      utils::Assert(links[0].sock.RecvAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure 13");
-      utils::Check(magic == kMagic, "sync::Init failure, parent magic number mismatch");
-      parent_index = 0;
-    } else {
-      parent_index = -1;
-    }
-    // send back socket listening port to master
-    utils::Assert(master.SendAll(&port, sizeof(port)) == sizeof(port), "sync::Init failure 14");
-    // close connection to master
-    master.Close();
-    // accept links from childs
-    for (int i = 0; i < nchild; ++i) {
-      LinkRecord r; 
-      while (true) {
-        r.sock = sock_listen.Accept();
-        if (r.sock.RecvAll(&magic, sizeof(magic)) == sizeof(magic) && magic == kMagic) {
-          utils::Assert(r.sock.SendAll(&magic, sizeof(magic)) == sizeof(magic), "sync::Init failure 15");
-          break;
-        } else {         
-          // not a valid child
-          r.sock.Close();
-        }
-      }
-      links.push_back(r);
-    }
-    // close listening sockets
-    sock_listen.Close();
-    // setup selecter
-    selecter.Clear();
-    for (size_t i = 0; i < links.size(); ++i) {
-      // set the socket to non-blocking mode
-      links[i].sock.SetNonBlock(true);
-      selecter.WatchRead(links[i].sock);
-      selecter.WatchWrite(links[i].sock);
-    }
-    // done
-  }
-  /*!
-   * \brief perform in-place allreduce, on sendrecvbuf 
-   *        this function is NOT thread-safe
-   * \param sendrecvbuf_ buffer for both sending and recving data
-   * \param type_n4bytes the unit number of bytes the type have
-   * \param count number of elements to be reduced
-   * \param reducer reduce function
-   */
-  inline void AllReduce(void *sendrecvbuf_,
-                        size_t type_nbytes,
-                        size_t count,
-                        ReduceHandle::ReduceFunction reducer) {
-    if (links.size() == 0) return;
-    // total size of message
-    const size_t total_size = type_nbytes * count;
-    // number of links
-    const int nlink = static_cast<int>(links.size());
-    // send recv buffer
-    char *sendrecvbuf = reinterpret_cast<char*>(sendrecvbuf_);
-    // size of space that we already performs reduce in up pass
-    size_t size_up_reduce = 0;
-    // size of space that we have already passed to parent
-    size_t size_up_out = 0;
-    // size of message we received, and send in the down pass
-    size_t size_down_in = 0;    
-
-    // initialize the link ring-buffer and pointer
-    for (int i = 0; i < nlink; ++i) {
-      if (i != parent_index) {
-        links[i].InitBuffer(type_nbytes, count, reduce_buffer_size);
-      }
-      links[i].ResetSize();
-    }
-    // if no childs, no need to reduce
-    if (nlink == static_cast<int>(parent_index != -1)) {
-      size_up_reduce = total_size;
-    }
-    
-    // while we have not passed the messages out
-    while(true) {
-      selecter.Select();
-      // read data from childs
-      for (int i = 0; i < nlink; ++i) {
-        if (i != parent_index && selecter.CheckRead(links[i].sock)) {
-          links[i].ReadToRingBuffer(size_up_out);
-        }
-      }
-      // this node have childs, peform reduce
-      if (nlink > static_cast<int>(parent_index != -1)) {
-        size_t buffer_size = 0;
-        // do upstream reduce
-        size_t max_reduce = total_size;
-        for (int i = 0; i < nlink; ++i) {
-          if (i != parent_index) {
-            max_reduce= std::min(max_reduce, links[i].size_read);
-            utils::Assert(buffer_size == 0 || buffer_size == links[i].buffer_size,
-                          "buffer size inconsistent");
-            buffer_size = links[i].buffer_size;
-          }
-        }
-        utils::Assert(buffer_size != 0, "must assign buffer_size");
-        // round to type_n4bytes
-        max_reduce = (max_reduce / type_nbytes * type_nbytes);
-        // peform reduce, can be at most two rounds
-        while (size_up_reduce < max_reduce) {
-          // start position
-          size_t start = size_up_reduce % buffer_size;
-          // peform read till end of buffer
-          size_t nread = std::min(buffer_size - start, max_reduce - size_up_reduce);          
-          utils::Assert(nread % type_nbytes == 0, "AllReduce: size check");
-          for (int i = 0; i < nlink; ++i) {
-            if (i != parent_index) {
-              reducer(links[i].buffer_head + start,
-                      sendrecvbuf + size_up_reduce,
-                      static_cast<int>(nread / type_nbytes),
-                      MPI::Datatype(type_nbytes));
-            }
-          }
-          size_up_reduce += nread;
-        }
-      }
-      if (parent_index != -1) {
-        // pass message up to parent, can pass data that are already been reduced
-        if (selecter.CheckWrite(links[parent_index].sock)) {
-          size_up_out += links[parent_index].sock.
-              Send(sendrecvbuf + size_up_out, size_up_reduce - size_up_out);
-        }
-        // read data from parent
-        if (selecter.CheckRead(links[parent_index].sock)) {
-          size_down_in +=  links[parent_index].sock.
-              Recv(sendrecvbuf + size_down_in, total_size - size_down_in);
-          utils::Assert(size_down_in <= size_up_out, "AllReduce: boundary error");
-        }
-      } else {
-        // this is root, can use reduce as most recent point
-        size_down_in = size_up_out = size_up_reduce;
-      }
-      // check if we finished the job of message passing
-      size_t nfinished = size_down_in;
-      // can pass message down to childs
-      for (int i = 0; i < nlink; ++i) {
-        if (i != parent_index) {
-          if (selecter.CheckWrite(links[i].sock)) {
-            links[i].WriteFromArray(sendrecvbuf, size_down_in);
-          }
-          nfinished = std::min(links[i].size_write, nfinished);
-        }
-      }
-      // check boundary condition
-      if (nfinished >= total_size) break;
-    }
-  }
-  /*!
-   * \brief broadcast data from root to all nodes
-   * \param sendrecvbuf_ buffer for both sending and recving data
-   * \param type_n4bytes the unit number of bytes the type have
-   * \param count number of elements to be reduced
-   * \param reducer reduce function
-   */  
-  inline void Bcast(void *sendrecvbuf_,
-                    size_t total_size,
-                    int root) {
-    if (links.size() == 0) return;
-    // number of links
-    const int nlink = static_cast<int>(links.size());
-    // size of space already read from data
-    size_t size_in = 0;
-    // input link, -2 means unknown yet, -1 means this is root
-    int in_link = -2;
-
-    // initialize the link statistics
-    for (int i = 0; i < nlink; ++i) {
-      links[i].ResetSize();
-    }
-    // root have all the data
-    if (this->rank == root) {
-      size_in = total_size;
-      in_link = -1;
-    }
-    
-    // while we have not passed the messages out
-    while(true) {
-      selecter.Select();
-      if (in_link == -2) {
-        // probe in-link
-        for (int i = 0; i < nlink; ++i) {
-          if (selecter.CheckRead(links[i].sock)) {
-            links[i].ReadToArray(sendrecvbuf_, total_size);
-            size_in = links[i].size_read;
-            if (size_in != 0) {
-              in_link = i; break;
-            }
-          }
-        }
-      } else {
-        // read from in link
-        if (in_link >= 0 && selecter.CheckRead(links[in_link].sock)) {
-          links[in_link].ReadToArray(sendrecvbuf_, total_size);
-          size_in = links[in_link].size_read;
-        }
-      }
-      size_t nfinished = total_size;
-      // send data to all out-link
-      for (int i = 0; i < nlink; ++i) {
-        if (i != in_link) {
-          if (selecter.CheckWrite(links[i].sock)) {
-            links[i].WriteFromArray(sendrecvbuf_, size_in);
-          }
-          nfinished = std::min(nfinished, links[i].size_write);
-        }
-      }
-      // check boundary condition
-      if (nfinished >= total_size) break;
-    }
-  }
- private:  
-  // an independent child record
-  struct LinkRecord {
-   public:
-    // socket to get data from/to link
-    utils::TCPSocket sock;
-    // size of data readed from link
-    size_t size_read;
-    // size of data sent to the link
-    size_t size_write;
-    // pointer to buffer head
-    char *buffer_head;
-    // buffer size, in bytes
-    size_t buffer_size;
-    // initialize buffer
-    inline void InitBuffer(size_t type_nbytes, size_t count, size_t reduce_buffer_size) {
-      size_t n = (type_nbytes * count + 7)/ 8;
-      buffer_.resize(std::min(reduce_buffer_size, n));
-      // make sure align to type_nbytes
-      buffer_size = buffer_.size() * sizeof(uint64_t) / type_nbytes * type_nbytes;
-      utils::Assert(type_nbytes <= buffer_size, "too large type_nbytes=%lu, buffer_size=%lu", type_nbytes, buffer_size);
-      // set buffer head
-      buffer_head = reinterpret_cast<char*>(BeginPtr(buffer_));
-    }
-    // reset the recv and sent size
-    inline void ResetSize(void) {
-      size_write = size_read = 0;
-    }
-    /*! 
-     * \brief read data into ring-buffer, with care not to existing useful override data
-     *  position after protect_start
-     * \param protect_start all data start from protect_start is still needed in buffer
-     *                      read shall not override this 
-     */
-    inline void ReadToRingBuffer(size_t protect_start) {
-      size_t ngap = size_read - protect_start;
-      utils::Assert(ngap <= buffer_size, "AllReduce: boundary check");
-      size_t offset = size_read % buffer_size;
-      size_t nmax = std::min(buffer_size - ngap, buffer_size - offset);
-      size_read += sock.Recv(buffer_head + offset, nmax);
-    }
-    /*!
-     * \brief read data into array,
-     * this function can not be used together with ReadToRingBuffer
-     * a link can either read into the ring buffer, or existing array
-     * \param max_size maximum size of array
-     */
-    inline void ReadToArray(void *recvbuf_, size_t max_size) {
-      char *p = static_cast<char*>(recvbuf_);
-      size_read += sock.Recv(p + size_read, max_size - size_read);
-    }
-    /*!
-     * \brief write data in array to sock
-     * \param sendbuf_ head of array
-     * \param max_size maximum size of array
-     */
-    inline void WriteFromArray(const void *sendbuf_, size_t max_size) {
-      const char *p = static_cast<const char*>(sendbuf_);
-      size_write += sock.Send(p + size_write, max_size - size_write);
-    }
-
-   private:
-    // recv buffer to get data from child
-    // aligned with 64 bits, will be able to perform 64 bits operations freely
-    std::vector<uint64_t> buffer_;
-  };
-  //------------------
-  // uri of current host, to be set by Init
-  std::string host_uri;
-  // uri of master
-  std::string master_uri;
-  // port of master address
-  int master_port;
-  // port of slave process
-  int slave_port, nport_trial;
-  // reduce buffer size
-  size_t reduce_buffer_size;
-  // current rank
-  int rank;
-  // world size
-  int world_size;
-  // index of parent link, can be -1, meaning this is root of the tree
-  int parent_index;
-  // sockets of all links
-  std::vector<LinkRecord> links;
-  // select helper
-  utils::SelectHelper selecter;
-};
-
-// singleton sync manager
-SyncManager manager;
-
-/*! \brief get rank of current process */
-int GetRank(void) {
-  return manager.GetRank();
-}
-/*! \brief get total number of process */
-int GetWorldSize(void) {
-  return manager.GetWorldSize();
-}
-
-/*! \brief get name of processor */
-std::string GetProcessorName(void) {
-  return manager.GetHost();
-}
-bool IsDistributed(void) {
-  return manager.IsDistributed();
-}
-/*! \brief intiialize the synchronization module */
-void Init(int argc, char *argv[]) {
-  for (int i = 1; i < argc; ++i) {
-    char name[256], val[256];
-    if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) {
-      manager.SetParam(name, val);
-    }
-  }
-  manager.Init();
-}
-
-/*! \brief finalize syncrhonization module */
-void Finalize(void) {
-  manager.Shutdown();
-}
-
-// this can only be used for data that was smaller than 64 bit
-template<typename DType>
-inline void ReduceSum(const void *src_, void *dst_, int len, const MPI::Datatype &dtype) {
-  const DType *src = (const DType*)src_;
-  DType *dst = (DType*)dst_;  
-  for (int i = 0; i < len; ++i) {
-    dst[i] += src[i];
-  }
-}
-template<typename DType>
-inline void ReduceMax(const void *src_, void *dst_, int len, const MPI::Datatype &dtype) {
-  const DType *src = (const DType*)src_;
-  DType *dst = (DType*)dst_;  
-  for (int i = 0; i < len; ++i) {
-    if (src[i] > dst[i]) dst[i] = src[i];
-  }
-}
-template<typename DType>
-inline void ReduceBitOR(const void *src_, void *dst_, int len, const MPI::Datatype &dtype) {
-  const DType *src = (const DType*)src_;
-  DType *dst = (DType*)dst_;  
-  for (int i = 0; i < len; ++i) {
-    dst[i] |= src[i];
-  }
-}
-
-template<>
-void AllReduce<uint32_t>(uint32_t *sendrecvbuf, size_t count, ReduceOp op) {
-  typedef uint32_t DType;
-  switch(op) {
-    case kBitwiseOR: manager.AllReduce(sendrecvbuf, sizeof(DType), count, ReduceBitOR<DType>); return;
-    case kSum: manager.AllReduce(sendrecvbuf, sizeof(DType), count, ReduceSum<DType>); return;
-    case kMax: manager.AllReduce(sendrecvbuf, sizeof(DType), count, ReduceMax<DType>); return;
-    default: utils::Error("reduce op not supported");
-  }
-}
-
-template<>
-void AllReduce<float>(float *sendrecvbuf, size_t count, ReduceOp op) {
-  typedef float DType;
-  switch(op) {
-    case kSum: manager.AllReduce(sendrecvbuf, sizeof(DType), count, ReduceSum<DType>); return;
-    case kMax: manager.AllReduce(sendrecvbuf, sizeof(DType), count, ReduceMax<DType>); return;
-    default: utils::Error("unknown ReduceOp");
-  }
-}
-
-void Bcast(std::string *sendrecv_data, int root) {
-  unsigned len = static_cast<unsigned>(sendrecv_data->length());
-  manager.Bcast(&len, sizeof(len), root);
-  sendrecv_data->resize(len);
-  if (len != 0) {
-    manager.Bcast(&(*sendrecv_data)[0], len, root);  
-  }
-}
-
-// code for reduce handle
-ReduceHandle::ReduceHandle(void) : handle(NULL), htype(NULL) {
-}
-ReduceHandle::~ReduceHandle(void) {}
-
-int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
-  return static_cast<int>(dtype.type_size);
-}
-void ReduceHandle::Init(ReduceFunction redfunc, size_t type_n4bytes, bool commute) {
-  utils::Assert(handle == NULL, "cannot initialize reduce handle twice");
-  handle = reinterpret_cast<void*>(redfunc);
-}
-void ReduceHandle::AllReduce(void *sendrecvbuf, size_t type_n4bytes, size_t count) {
-  utils::Assert(handle != NULL, "must intialize handle to call AllReduce");
-  manager.AllReduce(sendrecvbuf, type_n4bytes * 4, count, reinterpret_cast<ReduceFunction*>(handle));
-}
-
-}  // namespace sync
-}  // namespace xgboost
diff --git a/src/sync/tcp_master.py b/src/sync/tcp_master.py
deleted file mode 100644
index c0820f14b2f0..000000000000
--- a/src/sync/tcp_master.py
+++ /dev/null
@@ -1,106 +0,0 @@
-"""
-Master script for xgboost, tcp_master
-This script can be used to start jobs of multi-node xgboost using sync_tcp
-
-Tianqi Chen
-"""
-
-import sys
-import os
-import socket
-import struct
-import subprocess
-from threading import Thread
-
-class ExSocket:
-    def __init__(self, sock):
-        self.sock = sock
-    def recvall(self, nbytes):
-        res = []
-        sock = self.sock
-        nread = 0    
-        while nread < nbytes:
-            chunk = self.sock.recv(min(nbytes - nread, 1024), socket.MSG_WAITALL)
-            nread += len(chunk)
-            res.append(chunk)
-        return ''.join(res)
-    def recvint(self):
-        return struct.unpack('@i', self.recvall(4))[0]
-    def sendint(self, n):
-        self.sock.sendall(struct.pack('@i', n))
-    def sendstr(self, s):
-        self.sendint(len(s))
-        self.sock.sendall(s)
-
-# magic number used to verify existence of data
-kMagic = 0xff99
-
-class Master:
-    def __init__(self, port = 9000, port_end = 9999):
-        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-        for port in range(port, port_end):
-            try:
-                sock.bind(('', port))
-                self.port = port
-                break
-            except socket.error:
-                continue
-        sock.listen(16)
-        self.sock = sock
-        print 'start listen on %s:%d' % (socket.gethostname(), self.port)
-    def __del__(self):
-        self.sock.close()
-    def slave_args(self):
-        return ['master_uri=%s' % socket.gethostname(),
-                'master_port=%s' % self.port]    
-    def accept_slaves(self, nslave):        
-        slave_addrs = []
-        for rank in range(nslave):
-            while True:
-                fd, s_addr = self.sock.accept()
-                slave = ExSocket(fd)
-                nparent = int(rank != 0)
-                nchild = 0
-                if (rank + 1) * 2 - 1 < nslave:
-                    nchild += 1
-                if (rank + 1) * 2 < nslave:
-                    nchild += 1                
-                try:
-                    magic = slave.recvint()
-                    if magic != kMagic:
-                        print 'invalid magic number=%d from %s' % (magic, s_addr[0])                        
-                        slave.sock.close()
-                        continue
-                except socket.error:
-                    print 'sock error in %s' % (s_addr[0])
-                    slave.sock.close()
-                    continue
-                slave.sendint(kMagic)
-                slave.sendint(rank)
-                slave.sendint(nslave)
-                slave.sendint(nparent)
-                slave.sendint(nchild)
-                if nparent != 0:
-                    parent_index = (rank + 1) / 2 - 1
-                    ptuple = slave_addrs[parent_index]
-                    slave.sendstr(ptuple[0])
-                    slave.sendint(ptuple[1])
-                s_port = slave.recvint()
-                assert rank == len(slave_addrs)
-                slave_addrs.append((s_addr[0], s_port))
-                slave.sock.close()
-                print 'finish starting rank=%d at %s' % (rank, s_addr[0])
-                break
-        print 'all slaves setup complete'
-        
-def mpi_submit(nslave, args):
-    cmd = ' '.join(['mpirun -n %d' % nslave] + args)
-    print cmd
-    return subprocess.check_call(cmd, shell = True)
-    
-def submit(nslave, args, fun_submit = mpi_submit):
-    master = Master()
-    submit_thread = Thread(target = fun_submit, args = (nslave, args + master.slave_args()))
-    submit_thread.start()
-    master.accept_slaves(nslave)
-    submit_thread.join()
diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index 1b3bc46946c7..ca19b93b35a5 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -8,8 +8,8 @@
 #include "./updater_refresh-inl.hpp"
 #include "./updater_colmaker-inl.hpp"
 #include "./updater_distcol-inl.hpp"
-//#include "./updater_skmaker-inl.hpp"
 #include "./updater_histmaker-inl.hpp"
+//#include "./updater_skmaker-inl.hpp"
 
 namespace xgboost {
 namespace tree {
diff --git a/src/tree/updater_basemaker-inl.hpp b/src/tree/updater_basemaker-inl.hpp
index 68bd9ede4c5e..9b7c38b00762 100644
--- a/src/tree/updater_basemaker-inl.hpp
+++ b/src/tree/updater_basemaker-inl.hpp
@@ -8,6 +8,7 @@
 #include <vector>
 #include <algorithm>
 #include <limits>
+#include <rabit.h>
 #include "../utils/random.h"
 #include "../utils/quantile.h"
 
@@ -50,7 +51,7 @@ class BaseMaker: public IUpdater {
           }
         }
       }      
-      sync::AllReduce(BeginPtr(fminmax), fminmax.size(), sync::kMax);
+      rabit::Allreduce<rabit::op::Max>(BeginPtr(fminmax), fminmax.size());
     }
     // get feature type, 0:empty 1:binary 2:real
     inline int Type(bst_uint fid) const {
@@ -80,11 +81,11 @@ class BaseMaker: public IUpdater {
         std::string s_cache;
         utils::MemoryBufferStream fc(&s_cache);
         utils::IStream &fs = fc;
-        if (sync::GetRank() == 0) {
+        if (rabit::GetRank() == 0) {
           fs.Write(findex);
-          sync::Bcast(&s_cache, 0);
+          rabit::Broadcast(&s_cache, 0);
         } else {
-          sync::Bcast(&s_cache, 0);
+          rabit::Broadcast(&s_cache, 0);
           fs.Read(&findex);
         }
       }
diff --git a/src/tree/updater_distcol-inl.hpp b/src/tree/updater_distcol-inl.hpp
index bce947fe850d..658fbe2b169e 100644
--- a/src/tree/updater_distcol-inl.hpp
+++ b/src/tree/updater_distcol-inl.hpp
@@ -6,9 +6,9 @@
  *        and construct a tree
  * \author Tianqi Chen
  */
+#include <rabit.h>
 #include "../utils/bitmap.h"
 #include "../utils/io.h"
-#include "../sync/sync.h"
 #include "./updater_colmaker-inl.hpp"
 #include "./updater_prune-inl.hpp"
 
@@ -114,7 +114,7 @@ class DistColMaker : public ColMaker<TStats> {
       
       bitmap.InitFromBool(boolmap);
       // communicate bitmap
-      sync::AllReduce(BeginPtr(bitmap.data), bitmap.data.size(), sync::kBitwiseOR);
+      rabit::Allreduce<rabit::op::BitOR>(BeginPtr(bitmap.data), bitmap.data.size());
       const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
       // get the new position
       const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
@@ -142,8 +142,9 @@ class DistColMaker : public ColMaker<TStats> {
         }
         vec.push_back(this->snode[nid].best);
       }
+      // TODO, lazy version
       // communicate best solution
-      reducer.AllReduce(BeginPtr(vec), vec.size());
+      reducer.Allreduce(BeginPtr(vec), vec.size());
       // assign solution back
       for (size_t i = 0; i < qexpand.size(); ++i) {
         const int nid = qexpand[i];
@@ -154,7 +155,7 @@ class DistColMaker : public ColMaker<TStats> {
    private:
     utils::BitMap bitmap;
     std::vector<int> boolmap;
-    sync::Reducer<SplitEntry> reducer;
+    rabit::Reducer<SplitEntry> reducer;
   };
   // we directly introduce pruner here
   TreePruner pruner;
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index 61d3008d5ce5..ab1c5ef1c60a 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -7,7 +7,7 @@
  */
 #include <vector>
 #include <algorithm>
-#include "../sync/sync.h"
+#include <rabit.h>
 #include "../utils/quantile.h"
 #include "../utils/group_data.h"
 #include "./updater_basemaker-inl.hpp"
@@ -117,7 +117,7 @@ class HistMaker: public BaseMaker {
   // workspace of thread
   ThreadWSpace wspace;
   // reducer for histogram
-  sync::Reducer<TStats> histred;
+  rabit::Reducer<TStats> histred;
   // set of working features
   std::vector<bst_uint> fwork_set;
   // update function implementation
@@ -331,7 +331,7 @@ class CQHistMaker: public HistMaker<TStats> {
           .data[0] = node_stats[nid];
     }
     // sync the histogram
-    this->histred.AllReduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size());    
+    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size());    
   }
   virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
                                        const RegTree &tree) {
@@ -394,8 +394,8 @@ class CQHistMaker: public HistMaker<TStats> {
       summary_array[i].SetPrune(out, max_size);
     }
     if (summary_array.size() != 0) {
-      size_t n4bytes = (WXQSketch::SummaryContainer::CalcMemCost(max_size) + 3) / 4;
-      sreducer.AllReduce(BeginPtr(summary_array), n4bytes, summary_array.size());
+      size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
+      sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
     }
     // now we get the final result of sketch, setup the cut
     this->wspace.cut.clear();
@@ -540,7 +540,7 @@ class CQHistMaker: public HistMaker<TStats> {
   // summary array
   std::vector<WXQSketch::SummaryContainer> summary_array;
   // reducer for summary
-  sync::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
+  rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
   // per node, per feature sketch
   std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;  
 };
@@ -623,8 +623,8 @@ class QuantileHistMaker: public HistMaker<TStats> {
       summary_array[i].Reserve(max_size);
       summary_array[i].SetPrune(out, max_size);
     }
-    size_t n4bytes = (WXQSketch::SummaryContainer::CalcMemCost(max_size) + 3) / 4;
-    sreducer.AllReduce(BeginPtr(summary_array), n4bytes, summary_array.size());
+    size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
+    sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
     // now we get the final result of sketch, setup the cut
     this->wspace.cut.clear();
     this->wspace.rptr.clear();
@@ -660,7 +660,7 @@ class QuantileHistMaker: public HistMaker<TStats> {
   // summary array
   std::vector<WXQSketch::SummaryContainer> summary_array;
   // reducer for summary
-  sync::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
+  rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
   // local temp column data structure
   std::vector<size_t> col_ptr;
   // local storage of column data
diff --git a/src/tree/updater_refresh-inl.hpp b/src/tree/updater_refresh-inl.hpp
index 579ff2bc323a..83a81615c4f1 100644
--- a/src/tree/updater_refresh-inl.hpp
+++ b/src/tree/updater_refresh-inl.hpp
@@ -7,10 +7,10 @@
  */
 #include <vector>
 #include <limits>
+#include <rabit.h>
 #include "./param.h"
 #include "./updater.h"
 #include "../utils/omp.h"
-#include "../sync/sync.h"
 
 namespace xgboost {
 namespace tree {
@@ -85,7 +85,7 @@ class TreeRefresher: public IUpdater {
       }
     }
     // AllReduce, add statistics up
-    reducer.AllReduce(BeginPtr(stemp[0]), stemp[0].size());
+    reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size());
     // rescale learning rate according to size of trees
     float lr = param.learning_rate;
     param.learning_rate = lr / trees.size();
@@ -137,7 +137,7 @@ class TreeRefresher: public IUpdater {
   // training parameter
   TrainParam param;
   // reducer
-  sync::Reducer<TStats> reducer;  
+  rabit::Reducer<TStats> reducer;  
 };
 
 }  // namespace tree
diff --git a/src/tree/updater_skmaker-inl.hpp b/src/tree/updater_skmaker-inl.hpp
index dd23b22c146b..45202273a796 100644
--- a/src/tree/updater_skmaker-inl.hpp
+++ b/src/tree/updater_skmaker-inl.hpp
@@ -8,7 +8,7 @@
  */
 #include <vector>
 #include <algorithm>
-#include "../sync/sync.h"
+#include <rabit.h>
 #include "../utils/quantile.h"
 #include "./updater_basemaker-inl.hpp"
 
@@ -166,8 +166,8 @@ class SketchMaker: public BaseMaker {
       sketchs[i].GetSummary(&out);
       summary_array.Set(i, out);
     }
-    size_t n4bytes = (summary_array.MemSize() + 3) / 4;
-    sketch_reducer.AllReduce(&summary_array, n4bytes);    
+    size_t nbytes = summary_array.MemSize();;
+    sketch_reducer.Allreduce(&summary_array, nbytes);    
   }
   // update sketch information in column fid
   inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
@@ -256,7 +256,7 @@ class SketchMaker: public BaseMaker {
     for (size_t i = 0; i < qexpand.size(); ++i) {
       tmp[i] = node_stats[qexpand[i]];
     }
-    stats_reducer.AllReduce(BeginPtr(tmp), tmp.size());
+    stats_reducer.Allreduce(BeginPtr(tmp), tmp.size());
     for (size_t i = 0; i < qexpand.size(); ++i) {
       node_stats[qexpand[i]] = tmp[i];
     }
@@ -382,9 +382,9 @@ class SketchMaker: public BaseMaker {
   // summary array
   WXQSketch::SummaryArray summary_array;
   // reducer for summary
-  sync::Reducer<SKStats> stats_reducer;
+  rabit::Reducer<SKStats> stats_reducer;
   // reducer for summary
-  sync::ComplexReducer<WXQSketch::SummaryArray> sketch_reducer;
+  rabit::SerializeReducer<WXQSketch::SummaryArray> sketch_reducer;
   // per node, per feature sketch
   std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;
 };
diff --git a/src/tree/updater_sync-inl.hpp b/src/tree/updater_sync-inl.hpp
index 68a6096168d3..d29743bf3529 100644
--- a/src/tree/updater_sync-inl.hpp
+++ b/src/tree/updater_sync-inl.hpp
@@ -7,8 +7,8 @@
  */
 #include <vector>
 #include <limits>
+#include <rabit.h>
 #include "./updater.h"
-#include "../sync/sync.h"
 
 namespace xgboost {
 namespace tree {
@@ -32,22 +32,22 @@ class TreeSyncher: public IUpdater {
  private:
   // synchronize the trees in different nodes, take tree from rank 0
   inline void SyncTrees(const std::vector<RegTree *> &trees) {
-    if (sync::GetWorldSize() == 1) return;
+    if (rabit::GetWorldSize() == 1) return;
     std::string s_model;
     utils::MemoryBufferStream fs(&s_model);
-    int rank = sync::GetRank();
+    int rank = rabit::GetRank();
     if (rank == 0) {
       for (size_t i = 0; i < trees.size(); ++i) {
         trees[i]->SaveModel(fs);
       }
-      sync::Bcast(&s_model, 0);
+      rabit::Broadcast(&s_model, 0);
     } else {
-      sync::Bcast(&s_model, 0);
+      rabit::Broadcast(&s_model, 0);
       for (size_t i = 0; i < trees.size(); ++i) {      
         trees[i]->LoadModel(fs);
       }
     }
-  }    
+  }
 };
 }  // namespace tree
 }  // namespace xgboost
diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index f5e5f006cb98..bc76f40171ab 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -574,14 +574,16 @@ class QuantileSketchTemplate {
       return sizeof(size_t) + sizeof(Entry) * nentry;
     }
     /*! \brief save the data structure into stream */
-    inline void Save(IStream &fo) const {
+    template<typename TStream>
+    inline void Save(TStream &fo) const {
       fo.Write(&(this->size), sizeof(this->size));
       if (this->size != 0) {
         fo.Write(this->data, this->size * sizeof(Entry));
       }
     }
     /*! \brief load data structure from input stream */
-    inline void Load(IStream &fi) {
+    template<typename TStream>
+    inline void Load(TStream &fi) {
       utils::Check(fi.Read(&this->size, sizeof(this->size)) != 0, "invalid SummaryArray 1");
       this->Reserve(this->size);
       if (this->size != 0) {
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index a2ce7ed48bb3..9583a2278c39 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -4,8 +4,8 @@
 #include <ctime>
 #include <string>
 #include <cstring>
+#include <rabit.h>
 #include "io/io.h"
-#include "sync/sync.h"
 #include "utils/utils.h"
 #include "utils/config.h"
 #include "learner/learner-inl.hpp"
@@ -31,10 +31,10 @@ class BoostLearnTask {
         this->SetParam(name, val);
       }
     }
-    if (sync::IsDistributed()) {
+    if (rabit::IsDistributed()) {
       this->SetParam("data_split", "col");
     }
-    if (sync::GetRank() != 0) {
+    if (rabit::GetRank() != 0) {
       this->SetParam("silent", "2");
     }
     this->InitData();
@@ -109,7 +109,7 @@ class BoostLearnTask {
   inline void InitData(void) {
     if (strchr(train_path.c_str(), '%') != NULL) {
       char s_tmp[256];
-      utils::SPrintf(s_tmp, sizeof(s_tmp), train_path.c_str(), sync::GetRank());
+      utils::SPrintf(s_tmp, sizeof(s_tmp), train_path.c_str(), rabit::GetRank());
       train_path = s_tmp;
       load_part = 1;
     }
@@ -193,7 +193,7 @@ class BoostLearnTask {
     fclose(fo);
   }
   inline void SaveModel(const char *fname) const {
-    if (sync::GetRank() != 0) return;
+    if (rabit::GetRank() != 0) return;
     utils::FileStream fo(utils::FopenCheck(fname, "wb"));
     learner.SaveModel(fo);
     fo.Close();
@@ -263,14 +263,14 @@ class BoostLearnTask {
 }
 
 int main(int argc, char *argv[]){
-  xgboost::sync::Init(argc, argv);
-  if (xgboost::sync::IsDistributed()) {
-    std::string pname = xgboost::sync::GetProcessorName();
-    printf("start %s:%d\n", pname.c_str(), xgboost::sync::GetRank());
+  rabit::Init(argc, argv);
+  if (rabit::IsDistributed()) {
+    std::string pname = rabit::GetProcessorName();
+    printf("start %s:%d\n", pname.c_str(), rabit::GetRank());
   }
   xgboost::random::Seed(0);
   xgboost::BoostLearnTask tsk;
   int ret = tsk.Run(argc, argv);
-  xgboost::sync::Finalize();
+  rabit::Finalize();
   return ret;
 }
diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp
index d7c824b8aa85..700356ade780 100644
--- a/wrapper/xgboost_wrapper.cpp
+++ b/wrapper/xgboost_wrapper.cpp
@@ -83,21 +83,21 @@ using namespace xgboost::wrapper;
 
 extern "C"{
   void XGSyncInit(int argc, char *argv[]) {
-    sync::Init(argc, argv);
-    if (sync::IsDistributed()) {
-      std::string pname = xgboost::sync::GetProcessorName();
-      utils::Printf("distributed job start %s:%d\n", pname.c_str(), xgboost::sync::GetRank());
+    rabit::Init(argc, argv);
+    if (rabit::GetWorldSize() != 1) {
+      std::string pname = rabit::GetProcessorName();
+      utils::Printf("distributed job start %s:%d\n", pname.c_str(), rabit::GetRank());
     }
   }
   void XGSyncFinalize(void) {
-    sync::Finalize();
+    rabit::Finalize();
   }
   int XGSyncGetRank(void) {
-    int rank = xgboost::sync::GetRank();
+    int rank = rabit::GetRank();
     return rank;
   }
   int XGSyncGetWorldSize(void) {
-    return sync::GetWorldSize();
+    return rabit::GetWorldSize();
   }
   void* XGDMatrixCreateFromFile(const char *fname, int silent) {
     return LoadDataMatrix(fname, silent != 0, false);

From deb21351b976aec90454aa2cef8c0d62dca5edf8 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 20 Dec 2014 01:05:40 -0800
Subject: [PATCH 124/166] add rabit checkpoint to xgb

---
 src/gbm/gblinear-inl.hpp    |  4 ++--
 src/gbm/gbm.h               |  6 +++--
 src/gbm/gbtree-inl.hpp      | 20 +++++++++++------
 src/learner/learner-inl.hpp | 45 ++++++++++++++++++++++++++++++-------
 src/xgboost_main.cpp        | 37 +++++++++++++++++++++---------
 5 files changed, 83 insertions(+), 29 deletions(-)

diff --git a/src/gbm/gblinear-inl.hpp b/src/gbm/gblinear-inl.hpp
index 6d507ac6ed66..005eada55d24 100644
--- a/src/gbm/gblinear-inl.hpp
+++ b/src/gbm/gblinear-inl.hpp
@@ -32,10 +32,10 @@ class GBLinear : public IGradBooster {
       model.param.SetParam(name, val);
     }
   }
-  virtual void LoadModel(utils::IStream &fi) {
+  virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) {
     model.LoadModel(fi);
   }
-  virtual void SaveModel(utils::IStream &fo) const {
+  virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const {
     model.SaveModel(fo);
   }
   virtual void InitModel(void) {
diff --git a/src/gbm/gbm.h b/src/gbm/gbm.h
index f8eae6dbb4cd..8799a7af0465 100644
--- a/src/gbm/gbm.h
+++ b/src/gbm/gbm.h
@@ -27,13 +27,15 @@ class IGradBooster {
   /*!
    * \brief load model from stream
    * \param fi input stream
+   * \param with_pbuffer whether the incoming data contains pbuffer
    */
-  virtual void LoadModel(utils::IStream &fi) = 0;
+  virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) = 0;
   /*!
    * \brief save model to stream
    * \param fo output stream
+   * \param with_pbuffer whether save out pbuffer
    */
-  virtual void SaveModel(utils::IStream &fo) const = 0;
+  virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const = 0;
   /*!
    * \brief initialize the model
    */
diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
index 8d511f06e513..e63ea42fa3ee 100644
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -39,7 +39,7 @@ class GBTree : public IGradBooster {
     tparam.SetParam(name, val);
     if (trees.size() == 0) mparam.SetParam(name, val);
   }
-  virtual void LoadModel(utils::IStream &fi) {
+  virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) {
     this->Clear();
     utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
                  "GBTree: invalid model file");
@@ -56,13 +56,19 @@ class GBTree : public IGradBooster {
     if (mparam.num_pbuffer != 0) {
       pred_buffer.resize(mparam.PredBufferSize());
       pred_counter.resize(mparam.PredBufferSize());
-      utils::Check(fi.Read(&pred_buffer[0], pred_buffer.size() * sizeof(float)) != 0,
-                   "GBTree: invalid model file");
-      utils::Check(fi.Read(&pred_counter[0], pred_counter.size() * sizeof(unsigned)) != 0,
-                   "GBTree: invalid model file");
+      if (with_pbuffer) {
+        utils::Check(fi.Read(&pred_buffer[0], pred_buffer.size() * sizeof(float)) != 0,
+                     "GBTree: invalid model file");
+        utils::Check(fi.Read(&pred_counter[0], pred_counter.size() * sizeof(unsigned)) != 0,
+                     "GBTree: invalid model file");
+      } else {
+        // reset predict buffer if the input do not have them
+        std::fill(pred_buffer.begin(), pred_buffer.end(), 0.0f);
+        std::fill(pred_counter.begin(), pred_counter.end(), 0);
+      }
     }
   }
-  virtual void SaveModel(utils::IStream &fo) const {
+  virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const {
     utils::Assert(mparam.num_trees == static_cast<int>(trees.size()), "GBTree");
     fo.Write(&mparam, sizeof(ModelParam));
     for (size_t i = 0; i < trees.size(); ++i) {
@@ -71,7 +77,7 @@ class GBTree : public IGradBooster {
     if (tree_info.size() != 0) {
       fo.Write(&tree_info[0], sizeof(int) * tree_info.size());
     }
-    if (mparam.num_pbuffer != 0) {
+    if (mparam.num_pbuffer != 0 && with_pbuffer) {      
       fo.Write(&pred_buffer[0], pred_buffer.size() * sizeof(float));
       fo.Write(&pred_counter[0], pred_counter.size() * sizeof(unsigned));
     }
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 6ca3b7c7aaef..1640071b692c 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -23,7 +23,7 @@ namespace learner {
  * \brief learner that takes do gradient boosting on specific objective functions
  *  and do training and prediction
  */
-class BoostLearner {
+class BoostLearner : public rabit::ISerializable {
  public:
   BoostLearner(void) {
     obj_ = NULL;
@@ -35,7 +35,7 @@ class BoostLearner {
     distributed_mode = 0;
     pred_buffer_size = 0;
   }
-  ~BoostLearner(void) {
+  virtual ~BoostLearner(void) {
     if (obj_ != NULL) delete obj_;
     if (gbm_ != NULL) delete gbm_;
   }
@@ -140,9 +140,9 @@ class BoostLearner {
   /*!
    * \brief load model from stream
    * \param fi input stream
-   * \param keep_predbuffer whether to keep predict buffer
+   * \param with_pbuffer whether to load with predict buffer
    */
-  inline void LoadModel(utils::IStream &fi, bool keep_predbuffer = true) {
+  inline void LoadModel(utils::IStream &fi, bool with_pbuffer = true) {
     utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
                  "BoostLearner: wrong model format");
     utils::Check(fi.Read(&name_obj_), "BoostLearner: wrong model format");
@@ -151,11 +151,23 @@ class BoostLearner {
     if (obj_ != NULL) delete obj_;
     if (gbm_ != NULL) delete gbm_;
     this->InitObjGBM();
-    gbm_->LoadModel(fi);
-    if (keep_predbuffer && distributed_mode == 2 && rabit::GetRank() != 0) {
+    gbm_->LoadModel(fi, with_pbuffer);
+    if (with_pbuffer && distributed_mode == 2 && rabit::GetRank() != 0) {
       gbm_->ResetPredBuffer(pred_buffer_size);
     }
   }
+  // rabit load model from rabit checkpoint
+  virtual void Load(rabit::IStream &fi) {
+    RabitStreamAdapter fs(fi);
+    // for row split, we should not keep pbuffer
+    this->LoadModel(fs, distributed_mode != 2);
+  }
+  // rabit save model to rabit checkpoint
+  virtual void Save(rabit::IStream &fo) const {
+    RabitStreamAdapter fs(fo);
+    // for row split, we should not keep pbuffer
+    this->SaveModel(fs, distributed_mode != 2);
+  }
   /*!
    * \brief load model from file
    * \param fname file name
@@ -165,11 +177,11 @@ class BoostLearner {
     this->LoadModel(fi);
     fi.Close();
   }
-  inline void SaveModel(utils::IStream &fo) const {
+  inline void SaveModel(utils::IStream &fo, bool with_pbuffer = true) const {
     fo.Write(&mparam, sizeof(ModelParam));
     fo.Write(name_obj_);
     fo.Write(name_gbm_);
-    gbm_->SaveModel(fo);
+    gbm_->SaveModel(fo, with_pbuffer);
   }
   /*!
    * \brief save model into file
@@ -394,6 +406,23 @@ class BoostLearner {
   // data structure field
   /*! \brief the entries indicates that we have internal prediction cache */
   std::vector<CacheEntry> cache_;
+
+ private:
+  // adapt rabit stream to utils stream
+  struct RabitStreamAdapter : public utils::IStream {
+    // rabit stream
+    rabit::IStream &fs;
+    // constructor
+    RabitStreamAdapter(rabit::IStream &fs) : fs(fs) {}
+    // destructor
+    virtual ~RabitStreamAdapter(void){}
+    virtual size_t Read(void *ptr, size_t size) {
+      return fs.Read(ptr, size);
+    }
+    virtual void Write(const void *ptr, size_t size) {
+      fs.Write(ptr, size);
+    }
+  };
 };
 }  // namespace learner
 }  // namespace xgboost
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index 9583a2278c39..d25140461b87 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -31,14 +31,32 @@ class BoostLearnTask {
         this->SetParam(name, val);
       }
     }
+    // whether need data rank
+    bool need_data_rank = strchr(train_path.c_str(), '%') != NULL;
+    // if need data rank in loading, initialize rabit engine before load data
+    // otherwise, initialize rabit engine after loading data
+    // lazy initialization of rabit engine can be helpful in speculative execution
+    if (need_data_rank) rabit::Init(argc, argv);
+    this->InitData();
+    if (!need_data_rank) rabit::Init(argc, argv);
+    if (rabit::IsDistributed()) {
+      std::string pname = rabit::GetProcessorName();
+      printf("start %s:%d\n", pname.c_str(), rabit::GetRank());
+    }
     if (rabit::IsDistributed()) {
       this->SetParam("data_split", "col");
     }
     if (rabit::GetRank() != 0) {
       this->SetParam("silent", "2");
     }
-    this->InitData();
-    this->InitLearner();
+    
+    if (task == "train") {
+      // if task is training, will try recover from checkpoint
+      this->TaskTrain();
+      return 0;
+    } else {
+      this->InitLearner();
+    }
     if (task == "dump") {
       this->TaskDump(); return 0;
     }
@@ -47,8 +65,6 @@ class BoostLearnTask {
     }
     if (task == "pred") {
       this->TaskPred();
-    } else {
-      this->TaskTrain();
     }
     return 0;
   }
@@ -152,10 +168,13 @@ class BoostLearnTask {
     }
   }
   inline void TaskTrain(void) {
+    int version = rabit::LoadCheckPoint(&learner);
+    if (version == 0) this->InitLearner();
+
     const time_t start = time(NULL);
     unsigned long elapsed = 0;
     learner.CheckInit(data);
-    for (int i = 0; i < num_round; ++i) {
+    for (int i = version; i < num_round; ++i) {
       elapsed = (unsigned long)(time(NULL) - start);
       if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
       learner.UpdateOneIter(i, *data); 
@@ -166,6 +185,9 @@ class BoostLearnTask {
       if (save_period != 0 && (i + 1) % save_period == 0) {
         this->SaveModel(i);
       }
+      utils::Assert(rabit::VersionNumber() == i, "incorrect version number");
+      // checkpoint the model
+      rabit::CheckPoint(&learner);
       elapsed = (unsigned long)(time(NULL) - start);
     }
     // always save final round
@@ -263,11 +285,6 @@ class BoostLearnTask {
 }
 
 int main(int argc, char *argv[]){
-  rabit::Init(argc, argv);
-  if (rabit::IsDistributed()) {
-    std::string pname = rabit::GetProcessorName();
-    printf("start %s:%d\n", pname.c_str(), rabit::GetRank());
-  }
   xgboost::random::Seed(0);
   xgboost::BoostLearnTask tsk;
   int ret = tsk.Run(argc, argv);

From 7a35e1a906634ae9b86d7b90bc56c478d25c2678 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 20 Dec 2014 05:02:38 -0800
Subject: [PATCH 125/166] change hist update to lazy

---
 Makefile                           |   6 ++
 src/tree/updater_histmaker-inl.hpp | 136 +++++++++++++++++------------
 src/tree/updater_refresh-inl.hpp   |  70 ++++++++-------
 3 files changed, 127 insertions(+), 85 deletions(-)

diff --git a/Makefile b/Makefile
index 12d2507e291b..583f12038faf 100644
--- a/Makefile
+++ b/Makefile
@@ -10,6 +10,12 @@ else
 	CFLAGS += -fopenmp
 endif
 
+# by default use c++11
+ifeq ($(no_cxx11),1)
+else 
+	CFLAGS += -std=c++11
+endif
+
 # specify tensor path
 BIN = xgboost
 OBJ = updater.o gbm.o io.o main.o 
diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp
index ab1c5ef1c60a..9f35d57318c4 100644
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -152,7 +152,7 @@ class HistMaker: public BaseMaker {
                                   IFMatrix *p_fmat,
                                   const BoosterInfo &info,
                                   const std::vector <bst_uint> &fset,
-                                  const RegTree &tree)  = 0;
+                                  const RegTree &tree) = 0;
   // initialize the current working set of features in this round
   virtual void InitWorkSet(IFMatrix *p_fmat,
                            const RegTree &tree,
@@ -306,32 +306,45 @@ class CQHistMaker: public HistMaker<TStats> {
     } 
     // start to work
     this->wspace.Init(this->param, 1);
-    thread_hist.resize(this->get_nthread());
-    // start accumulating statistics
-    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fset);
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const ColBatch &batch = iter->Value();
-      // start enumeration
-      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(dynamic, 1)
-      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        int offset = feat2workindex[batch.col_index[i]];
-        if (offset >= 0) {
-          this->UpdateHistCol(gpair, batch[i], info, tree,
-                              fset, offset,
-                              &thread_hist[omp_get_thread_num()]);
+    // if it is C++11, use lazy evaluation for Allreduce,
+    // to gain speedup in recovery
+#if __cplusplus >= 201103L
+    auto lazy_get_hist = [&]()
+#endif
+    {
+      thread_hist.resize(this->get_nthread());
+      // start accumulating statistics
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fset);
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        // start enumeration
+        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+        #pragma omp parallel for schedule(dynamic, 1)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          int offset = feat2workindex[batch.col_index[i]];
+          if (offset >= 0) {
+            this->UpdateHistCol(gpair, batch[i], info, tree,
+                                fset, offset,
+                                &thread_hist[omp_get_thread_num()]);
+          }
         }
       }
-    }
-    for (size_t i = 0; i < this->qexpand.size(); ++i) {
-      const int nid = this->qexpand[i];
-      const int wid = this->node2workindex[nid];
-      this->wspace.hset[0][fset.size() + wid * (fset.size()+1)]
-          .data[0] = node_stats[nid];
-    }
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {
+        const int nid = this->qexpand[i];
+        const int wid = this->node2workindex[nid];
+        this->wspace.hset[0][fset.size() + wid * (fset.size()+1)]
+            .data[0] = node_stats[nid];
+      }
+    };
     // sync the histogram
-    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size());    
+    // if it is C++11, use lazy evaluation for Allreduce
+#if __cplusplus >= 201103L
+    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), 
+                            this->wspace.hset[0].data.size(), lazy_get_hist);
+#else
+    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size());   
+#endif    
   }
   virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
                                        const RegTree &tree) {
@@ -353,49 +366,61 @@ class CQHistMaker: public HistMaker<TStats> {
       } else {
         feat2workindex[fset[i]] = -2;  
       }
-    }
-        
+    }      
     this->GetNodeStats(gpair, *p_fmat, tree, info,
-                       &thread_stats, &node_stats);
+                       &thread_stats, &node_stats);       
     sketchs.resize(this->qexpand.size() * freal_set.size());
     for (size_t i = 0; i < sketchs.size(); ++i) {
       sketchs[i].Init(info.num_row, this->param.sketch_eps);
     }
-    thread_sketch.resize(this->get_nthread());
-    // number of rows in
-    const size_t nrows = p_fmat->buffered_rowset().size();
-    // start accumulating statistics
-    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(freal_set);
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const ColBatch &batch = iter->Value();
-      // start enumeration
-      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(dynamic, 1)
-      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        int offset = feat2workindex[batch.col_index[i]];
-        if (offset >= 0) {
-          this->UpdateSketchCol(gpair, batch[i], tree,
-                                node_stats,
-                                freal_set, offset,
-                                batch[i].length == nrows,
-                                &thread_sketch[omp_get_thread_num()]);
-        }
-      }
-    }
+    // initialize the summary array
+    summary_array.resize(sketchs.size());
     // setup maximum size
     unsigned max_size = this->param.max_sketch_size();
-    // synchronize sketch
-    summary_array.resize(sketchs.size());
     for (size_t i = 0; i < sketchs.size(); ++i) {
-      utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
-      sketchs[i].GetSummary(&out);
       summary_array[i].Reserve(max_size);
-      summary_array[i].SetPrune(out, max_size);
     }
+    // if it is C++11, use lazy evaluation for Allreduce
+#if __cplusplus >= 201103L
+    auto lazy_get_summary = [&]()
+#endif
+    {// get smmary
+      thread_sketch.resize(this->get_nthread());
+      // number of rows in
+      const size_t nrows = p_fmat->buffered_rowset().size();
+      // start accumulating statistics
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(freal_set);
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        // start enumeration
+        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+        #pragma omp parallel for schedule(dynamic, 1)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          int offset = feat2workindex[batch.col_index[i]];
+          if (offset >= 0) {
+            this->UpdateSketchCol(gpair, batch[i], tree,
+                                  node_stats,
+                                  freal_set, offset,
+                                  batch[i].length == nrows,
+                                  &thread_sketch[omp_get_thread_num()]);
+          }
+        }
+      }
+      for (size_t i = 0; i < sketchs.size(); ++i) {
+        utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
+        sketchs[i].GetSummary(&out);
+        summary_array[i].SetPrune(out, max_size);
+      }
+      utils::Assert(summary_array.size() == sketchs.size(), "shape mismatch");
+    };
     if (summary_array.size() != 0) {
       size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
+#if __cplusplus >= 201103L
+      sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size(), lazy_get_summary);
+#else
       sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
+#endif
     }
     // now we get the final result of sketch, setup the cut
     this->wspace.cut.clear();
@@ -623,7 +648,8 @@ class QuantileHistMaker: public HistMaker<TStats> {
       summary_array[i].Reserve(max_size);
       summary_array[i].SetPrune(out, max_size);
     }
-    size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
+    
+    size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);    
     sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
     // now we get the final result of sketch, setup the cut
     this->wspace.cut.clear();
diff --git a/src/tree/updater_refresh-inl.hpp b/src/tree/updater_refresh-inl.hpp
index 83a81615c4f1..0285f07713c0 100644
--- a/src/tree/updater_refresh-inl.hpp
+++ b/src/tree/updater_refresh-inl.hpp
@@ -52,40 +52,50 @@ class TreeRefresher: public IUpdater {
       std::fill(stemp[tid].begin(), stemp[tid].end(), TStats(param));
       fvec_temp[tid].Init(trees[0]->param.num_feature);
     }
-    // start accumulating statistics
-    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
-                   "too large batch size ");
-      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nbatch; ++i) {
-        RowBatch::Inst inst = batch[i];
-        const int tid = omp_get_thread_num();
-        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        RegTree::FVec &feats = fvec_temp[tid];
-        feats.Fill(inst);
-        int offset = 0;
-        for (size_t j = 0; j < trees.size(); ++j) {
-          AddStats(*trees[j], feats, gpair, info, ridx,
-                   BeginPtr(stemp[tid]) + offset);
-          offset += trees[j]->param.num_nodes;
+    // if it is C++11, use lazy evaluation for Allreduce,
+    // to gain speedup in recovery
+#if __cplusplus >= 201103L
+    auto lazy_get_stats = [&]()
+#endif
+    {
+      // start accumulating statistics
+      utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const RowBatch &batch = iter->Value();
+        utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
+                     "too large batch size ");
+        const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint i = 0; i < nbatch; ++i) {
+          RowBatch::Inst inst = batch[i];
+          const int tid = omp_get_thread_num();
+          const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+          RegTree::FVec &feats = fvec_temp[tid];
+          feats.Fill(inst);
+          int offset = 0;
+          for (size_t j = 0; j < trees.size(); ++j) {
+            AddStats(*trees[j], feats, gpair, info, ridx,
+                     BeginPtr(stemp[tid]) + offset);
+            offset += trees[j]->param.num_nodes;
+          }
+          feats.Drop(inst);
         }
-        feats.Drop(inst);
       }
-    }
-    // aggregate the statistics
-    int num_nodes = static_cast<int>(stemp[0].size());
-    #pragma omp parallel for schedule(static)
-    for (int nid = 0; nid < num_nodes; ++nid) {
-      for (int tid = 1; tid < nthread; ++tid) {
-        stemp[0][nid].Add(stemp[tid][nid]);
+      // aggregate the statistics
+      int num_nodes = static_cast<int>(stemp[0].size());
+      #pragma omp parallel for schedule(static)
+      for (int nid = 0; nid < num_nodes; ++nid) {
+        for (int tid = 1; tid < nthread; ++tid) {
+          stemp[0][nid].Add(stemp[tid][nid]);
+        }
       }
-    }
-    // AllReduce, add statistics up
+    };
+#if __cplusplus >= 201103L
+    reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
+#else
     reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size());
+#endif
     // rescale learning rate according to size of trees
     float lr = param.learning_rate;
     param.learning_rate = lr / trees.size();

From b07866398249734c84db7072c956ef39848cd28e Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 20 Dec 2014 16:39:39 -0800
Subject: [PATCH 126/166] ok

---
 Makefile                                  |  2 +-
 multi-node/row-split/machine-row-rabit.sh | 24 +++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100755 multi-node/row-split/machine-row-rabit.sh

diff --git a/Makefile b/Makefile
index 583f12038faf..a78c28e56916 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ export CC  = gcc
 export CXX = g++
 export MPICXX = mpicxx
 export LDFLAGS= -Lrabit/lib -pthread -lm 
-export CFLAGS = -Wall -O3 -msse2  -Wno-unknown-pragmas -fPIC  -Irabit/src
+export CFLAGS = -Wall -O3 -msse2  -Wno-unknown-pragmas -fPIC  -Irabit/include
 
 ifeq ($(no_omp),1)
 	CFLAGS += -DDISABLE_OPENMP 
diff --git a/multi-node/row-split/machine-row-rabit.sh b/multi-node/row-split/machine-row-rabit.sh
new file mode 100755
index 000000000000..4a526ff94c5c
--- /dev/null
+++ b/multi-node/row-split/machine-row-rabit.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: nprocess"
+    exit -1
+fi
+
+rm -rf train-machine.row* *.model
+k=$1
+# make machine data
+cd ../../demo/regression/
+python mapfeat.py
+python mknfold.py machine.txt 1
+cd -
+
+# split the lib svm file into k subfiles
+python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
+
+# run xgboost mpi
+../../rabit/tracker/rabit_mpi.py $k local ../../xgboost machine-row.conf dsplit=row num_round=3
+
+# run xgboost-mpi save model 0001, continue to run from existing model
+../../rabit/tracker/rabit_mpi.py $k local ../../xgboost machine-row.conf dsplit=row num_round=1
+../../rabit/tracker/rabit_mpi.py $k local ../../xgboost machine-row.conf dsplit=row num_round=2 model_in=0001.model

From 31eedfea59af49e0ee23d5c3aff38808f4f6859f Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 21 Dec 2014 00:14:00 -0800
Subject: [PATCH 127/166] pas mock, need to fix rabit lib for not
 initialization

---
 .gitignore                                    |  1 +
 Makefile                                      |  2 +-
 multi-node/col-split/README.md                |  4 +-
 .../col-split/mushroom-col-rabit-mock.sh      | 25 ++++++++++
 src/gbm/gbtree-inl.hpp                        | 18 +++-----
 src/learner/learner-inl.hpp                   | 46 +++++++++++++------
 src/xgboost_main.cpp                          |  5 +-
 7 files changed, 69 insertions(+), 32 deletions(-)
 create mode 100755 multi-node/col-split/mushroom-col-rabit-mock.sh

diff --git a/.gitignore b/.gitignore
index cb017114b0b3..8bb1ead7fa71 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,3 +49,4 @@ Debug
 xgboost
 xgboost-mpi
 train*
+rabit
diff --git a/Makefile b/Makefile
index a78c28e56916..3f0b3c7cfb62 100644
--- a/Makefile
+++ b/Makefile
@@ -46,7 +46,7 @@ xgboost:  updater.o gbm.o io.o main.o  librabit
 wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h  updater.o gbm.o io.o librabit
 
 $(BIN) : 
-	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS)  -lrabit
+	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS)  -lrabit_mock
 
 $(SLIB) :
 	$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS)  -lrabit
diff --git a/multi-node/col-split/README.md b/multi-node/col-split/README.md
index 04227d1eb88a..4f0d07b27117 100644
--- a/multi-node/col-split/README.md
+++ b/multi-node/col-split/README.md
@@ -1,7 +1,9 @@
 Distributed XGBoost: Column Split Version
 ====
 * run ```bash mushroom-col-rabit.sh <n-process>```
-  - mushroom-col-tcp.sh starts xgboost job using rabit's allreduce
+  - mushroom-col-rabit.sh starts xgboost job using rabit's allreduce
+* run ```bash mushroom-col-rabit-mock.sh <n-process>```
+  - mushroom-col-rabit-mock.sh starts xgboost job using rabit's allreduce, inserts a suicide signal at a certain point and tests recovery
 * run ```bash mushroom-col-mpi.sh <n-mpi-process>```
   - mushroom-col.sh starts xgboost-mpi job
 
diff --git a/multi-node/col-split/mushroom-col-rabit-mock.sh b/multi-node/col-split/mushroom-col-rabit-mock.sh
new file mode 100755
index 000000000000..5148a0b61721
--- /dev/null
+++ b/multi-node/col-split/mushroom-col-rabit-mock.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: nprocess"
+    exit -1
+fi
+
+#
+# This script is same as mushroom-col except that we will be using xgboost instead of xgboost-mpi
+# xgboost uses a built-in tcp-based allreduce module, and can be run in more environments, so long as we know how to start the job by modifying ../submit_job_tcp.py
+#
+rm -rf train.col* *.model
+k=$1
+
+# split the lib svm file into k subfiles
+python splitsvm.py ../../demo/data/agaricus.txt.train train $k
+
+# run xgboost mpi
+../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost mushroom-col.conf dsplit=col mock=0,0,1,0 mock=1,1,0,0
+
+# the model can be directly loaded by single machine xgboost solver, as usual
+#../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
+
+
+#cat dump.nice.$k.txt
diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
index e63ea42fa3ee..e8f1b1933294 100644
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -53,19 +53,13 @@ class GBTree : public IGradBooster {
       utils::Check(fi.Read(&tree_info[0], sizeof(int) * mparam.num_trees) != 0,
                    "GBTree: invalid model file");
     }
-    if (mparam.num_pbuffer != 0) {
+    if (mparam.num_pbuffer != 0 && with_pbuffer) {
       pred_buffer.resize(mparam.PredBufferSize());
       pred_counter.resize(mparam.PredBufferSize());
-      if (with_pbuffer) {
-        utils::Check(fi.Read(&pred_buffer[0], pred_buffer.size() * sizeof(float)) != 0,
-                     "GBTree: invalid model file");
-        utils::Check(fi.Read(&pred_counter[0], pred_counter.size() * sizeof(unsigned)) != 0,
-                     "GBTree: invalid model file");
-      } else {
-        // reset predict buffer if the input do not have them
-        std::fill(pred_buffer.begin(), pred_buffer.end(), 0.0f);
-        std::fill(pred_counter.begin(), pred_counter.end(), 0);
-      }
+      utils::Check(fi.Read(&pred_buffer[0], pred_buffer.size() * sizeof(float)) != 0,
+                   "GBTree: invalid model file");
+      utils::Check(fi.Read(&pred_counter[0], pred_counter.size() * sizeof(unsigned)) != 0,
+                   "GBTree: invalid model file");
     }
   }
   virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const {
@@ -77,7 +71,7 @@ class GBTree : public IGradBooster {
     if (tree_info.size() != 0) {
       fo.Write(&tree_info[0], sizeof(int) * tree_info.size());
     }
-    if (mparam.num_pbuffer != 0 && with_pbuffer) {      
+    if (mparam.num_pbuffer != 0 && with_pbuffer) {
       fo.Write(&pred_buffer[0], pred_buffer.size() * sizeof(float));
       fo.Write(&pred_counter[0], pred_counter.size() * sizeof(unsigned));
     }
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 1640071b692c..2f6c3f0b3390 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -48,11 +48,9 @@ class BoostLearner : public rabit::ISerializable {
    * \param mats array of pointers to matrix whose prediction result need to be cached
    */          
   inline void SetCacheData(const std::vector<DMatrix*>& mats) {
-    // estimate feature bound
-    unsigned num_feature = 0;
+    utils::Assert(cache_.size() == 0, "can only call cache data once");
     // assign buffer index
     size_t buffer_size = 0;
-    utils::Assert(cache_.size() == 0, "can only call cache data once");
     for (size_t i = 0; i < mats.size(); ++i) {
       bool dupilicate = false;
       for (size_t j = 0; j < i; ++j) {
@@ -63,16 +61,10 @@ class BoostLearner : public rabit::ISerializable {
       mats[i]->cache_learner_ptr_ = this;
       cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->info.num_row()));
       buffer_size += mats[i]->info.num_row();
-      num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->info.num_col()));
     }
-    rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
     char str_temp[25];
-    if (num_feature > mparam.num_feature) {
-      utils::SPrintf(str_temp, sizeof(str_temp), "%u", num_feature);
-      this->SetParam("bst:num_feature", str_temp);
-    }
-    utils::SPrintf(str_temp, sizeof(str_temp), "%lu",
-			 static_cast<unsigned long>(buffer_size));
+    utils::SPrintf(str_temp, sizeof(str_temp), "%lu", 
+                   static_cast<unsigned long>(buffer_size));
     this->SetParam("num_pbuffer", str_temp);
     if (!silent) {
       utils::Printf("buffer_size=%ld\n", static_cast<long>(buffer_size));
@@ -126,10 +118,29 @@ class BoostLearner : public rabit::ISerializable {
       cfg_.push_back(std::make_pair(std::string(name), std::string(val)));
     }
   }
+  // this is an internal function
+  // initialize the trainer, called at InitModel and LoadModel
+  inline void InitTrainer(bool calc_num_feature = true) {
+    if (calc_num_feature) {
+      // estimate feature bound
+      unsigned num_feature = 0;
+      for (size_t i = 0; i < cache_.size(); ++i) {
+        num_feature = std::max(num_feature, 
+                               static_cast<unsigned>(cache_[i].mat_->info.num_col()));
+      }
+      // run allreduce on num_feature to find the maximum value
+      rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
+      if (num_feature > mparam.num_feature) mparam.num_feature = num_feature;
+    } 
+    char str_temp[25];
+    utils::SPrintf(str_temp, sizeof(str_temp), "%d", mparam.num_feature);
+    this->SetParam("bst:num_feature", str_temp);   
+  }
   /*!
    * \brief initialize the model
    */
   inline void InitModel(void) {
+    this->InitTrainer();
     // initialize model
     this->InitObjGBM();
     // reset the base score
@@ -141,8 +152,9 @@ class BoostLearner : public rabit::ISerializable {
    * \brief load model from stream
    * \param fi input stream
    * \param with_pbuffer whether to load with predict buffer
+   * \param calc_num_feature whether call InitTrainer with calc_num_feature
    */
-  inline void LoadModel(utils::IStream &fi, bool with_pbuffer = true) {
+  inline void LoadModel(utils::IStream &fi, bool with_pbuffer = true, bool calc_num_feature = true) {
     utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
                  "BoostLearner: wrong model format");
     utils::Check(fi.Read(&name_obj_), "BoostLearner: wrong model format");
@@ -150,9 +162,10 @@ class BoostLearner : public rabit::ISerializable {
     // delete existing gbm if any
     if (obj_ != NULL) delete obj_;
     if (gbm_ != NULL) delete gbm_;
+    this->InitTrainer(calc_num_feature);
     this->InitObjGBM();
     gbm_->LoadModel(fi, with_pbuffer);
-    if (with_pbuffer && distributed_mode == 2 && rabit::GetRank() != 0) {
+    if (!with_pbuffer || distributed_mode == 2) {
       gbm_->ResetPredBuffer(pred_buffer_size);
     }
   }
@@ -160,7 +173,7 @@ class BoostLearner : public rabit::ISerializable {
   virtual void Load(rabit::IStream &fi) {
     RabitStreamAdapter fs(fi);
     // for row split, we should not keep pbuffer
-    this->LoadModel(fs, distributed_mode != 2);
+    this->LoadModel(fs, distributed_mode != 2, false);
   }
   // rabit save model to rabit checkpoint
   virtual void Save(rabit::IStream &fo) const {
@@ -209,9 +222,12 @@ class BoostLearner : public rabit::ISerializable {
    * \param p_train pointer to the data matrix
    */
   inline void UpdateOneIter(int iter, const DMatrix &train) {
+    printf("!!UpdateOneIter\n");
     this->PredictRaw(train, &preds_);
     obj_->GetGradient(preds_, train.info, iter, &gpair_);
+    printf("!!UpdateOneDoboost\n");
     gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
+    printf("!!UpdateOneIter finish\n");
   }
   /*!
    * \brief evaluate the model for specific iteration
@@ -335,7 +351,7 @@ class BoostLearner : public rabit::ISerializable {
     /* \brief number of class, if it is multi-class classification  */
     int num_class;
     /*! \brief reserved field */
-    int reserved[32];
+    int reserved[31];
     /*! \brief constructor */
     ModelParam(void) {
       base_score = 0.5f;
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index d25140461b87..89cc1b77da4a 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -48,8 +48,7 @@ class BoostLearnTask {
     }
     if (rabit::GetRank() != 0) {
       this->SetParam("silent", "2");
-    }
-    
+    }    
     if (task == "train") {
       // if task is training, will try recover from checkpoint
       this->TaskTrain();
@@ -151,7 +150,7 @@ class BoostLearnTask {
       learner.SetCacheData(dcache);
       
       // add training set to evaluation set if needed
-      if( eval_train != 0 ) {
+      if (eval_train != 0) {
         devalall.push_back(data);
         eval_data_names.push_back(std::string("train"));
       }

From d603852828ac40388f21a28eee86098d3f8c9065 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 21 Dec 2014 00:17:27 -0800
Subject: [PATCH 128/166] rm boost str

---
 src/learner/learner-inl.hpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 2f6c3f0b3390..c1a4bc3705c9 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -222,12 +222,9 @@ class BoostLearner : public rabit::ISerializable {
    * \param p_train pointer to the data matrix
    */
   inline void UpdateOneIter(int iter, const DMatrix &train) {
-    printf("!!UpdateOneIter\n");
     this->PredictRaw(train, &preds_);
     obj_->GetGradient(preds_, train.info, iter, &gpair_);
-    printf("!!UpdateOneDoboost\n");
     gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
-    printf("!!UpdateOneIter finish\n");
   }
   /*!
    * \brief evaluate the model for specific iteration

From eff5c6baa86a3a33a9763e69376ef1e72dbf630a Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 21 Dec 2014 04:36:18 -0800
Subject: [PATCH 129/166] push in row mock file

---
 .../row-split/machine-row-rabit-mock.sh       | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100755 multi-node/row-split/machine-row-rabit-mock.sh

diff --git a/multi-node/row-split/machine-row-rabit-mock.sh b/multi-node/row-split/machine-row-rabit-mock.sh
new file mode 100755
index 000000000000..b2e04c9c7830
--- /dev/null
+++ b/multi-node/row-split/machine-row-rabit-mock.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: nprocess"
+    exit -1
+fi
+
+rm -rf train-machine.row* *.model
+k=$1
+# make machine data
+cd ../../demo/regression/
+python mapfeat.py
+python mknfold.py machine.txt 1
+cd -
+
+# split the lib svm file into k subfiles
+python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
+
+# run xgboost mpi
+../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost machine-row.conf dsplit=row num_round=3 mock=1,1,1,0

From 677475529f448b87e42e6f7d61cac34809a4ed98 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 21 Dec 2014 17:31:42 -0800
Subject: [PATCH 130/166] fix the row split recovery, add per iteration random
 number seed

---
 Makefile                                      |  4 ++--
 .../col-split/mushroom-col-rabit-mock.sh      |  2 +-
 src/learner/learner-inl.hpp                   | 18 ++++++++++++++++-
 src/tree/updater_basemaker-inl.hpp            | 20 ++++++++-----------
 src/tree/updater_sync-inl.hpp                 | 11 +++++-----
 src/xgboost_main.cpp                          |  2 +-
 6 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/Makefile b/Makefile
index 3f0b3c7cfb62..5738f2573706 100644
--- a/Makefile
+++ b/Makefile
@@ -13,7 +13,7 @@ endif
 # by default use c++11
 ifeq ($(no_cxx11),1)
 else 
-	CFLAGS += -std=c++11
+	CFLAGS += 
 endif
 
 # specify tensor path
@@ -30,7 +30,7 @@ mpi: $(MPIBIN)
 # rules to get rabit library
 librabit:
 	if [ ! -d rabit ]; then git clone https://github.com/tqchen/rabit.git; fi
-	cd rabit;make lib/librabit.a; cd -
+	cd rabit;make lib/librabit.a lib/librabit_mock.a; cd -
 librabit_mpi:
 	if [ ! -d rabit ]; then git clone https://github.com/tqchen/rabit.git; fi
 	cd rabit;make lib/librabit_mpi.a; cd -
diff --git a/multi-node/col-split/mushroom-col-rabit-mock.sh b/multi-node/col-split/mushroom-col-rabit-mock.sh
index 5148a0b61721..148e629a287f 100755
--- a/multi-node/col-split/mushroom-col-rabit-mock.sh
+++ b/multi-node/col-split/mushroom-col-rabit-mock.sh
@@ -16,7 +16,7 @@ k=$1
 python splitsvm.py ../../demo/data/agaricus.txt.train train $k
 
 # run xgboost mpi
-../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost mushroom-col.conf dsplit=col mock=0,0,1,0 mock=1,1,0,0
+../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost mushroom-col.conf dsplit=col mock=0,1,0,0 mock=1,1,0,0
 
 # the model can be directly loaded by single machine xgboost solver, as usuall
 #../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index c1a4bc3705c9..201100a6ff31 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -34,6 +34,8 @@ class BoostLearner : public rabit::ISerializable {
     prob_buffer_row = 1.0f;
     distributed_mode = 0;
     pred_buffer_size = 0;
+    seed_per_iteration = 0;
+    seed = 0;
   }
   virtual ~BoostLearner(void) {
     if (obj_ != NULL) delete obj_;
@@ -102,7 +104,10 @@ class BoostLearner : public rabit::ISerializable {
       this->SetParam("updater", "grow_colmaker,refresh,prune");
     }
     if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
-    if (!strcmp("seed", name)) random::Seed(atoi(val));
+    if (!strcmp("seed", name)) {
+      this->seed = seed; random::Seed(atoi(val));
+    }
+    if (!strcmp("seed_per_iter", name)) seed_per_iteration = atoi(val);
     if (!strcmp(name, "num_class")) this->SetParam("num_output_group", val);
     if (!strcmp(name, "nthread")) {
       omp_set_num_threads(atoi(val));
@@ -222,6 +227,9 @@ class BoostLearner : public rabit::ISerializable {
    * \param p_train pointer to the data matrix
    */
   inline void UpdateOneIter(int iter, const DMatrix &train) {
+    if (seed_per_iteration || rabit::IsDistributed()) {
+      random::Seed(this->seed * kRandSeedMagic + iter);  // mix in iter so each round gets its own seed
+    }
     this->PredictRaw(train, &preds_);
     obj_->GetGradient(preds_, train.info, iter, &gpair_);
     gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
@@ -369,6 +377,12 @@ class BoostLearner : public rabit::ISerializable {
     }
   };
   // data fields
+  // stored random seed
+  int seed;
+  // whether seed the PRNG each iteration
+  // this is important for restart from existing iterations
+  // default set to no, but will auto switch on in distributed mode
+  int seed_per_iteration;
   // silent during training
   int silent;
   // distributed learning mode, if any, 0:none, 1:col, 2:row
@@ -397,6 +411,8 @@ class BoostLearner : public rabit::ISerializable {
   std::vector<bst_gpair> gpair_;
 
  protected:
+  // magic number to transform random seed
+  const static int kRandSeedMagic = 127;
   // cache entry object that helps handle feature caching
   struct CacheEntry {
     const DMatrix *mat_;
diff --git a/src/tree/updater_basemaker-inl.hpp b/src/tree/updater_basemaker-inl.hpp
index 9b7c38b00762..851811eaee80 100644
--- a/src/tree/updater_basemaker-inl.hpp
+++ b/src/tree/updater_basemaker-inl.hpp
@@ -76,19 +76,15 @@ class BaseMaker: public IUpdater {
       unsigned n = static_cast<unsigned>(p * findex.size());
       random::Shuffle(findex);
       findex.resize(n);
-      if (n != findex.size()) {
-        // sync the findex if it is subsample
-        std::string s_cache;
-        utils::MemoryBufferStream fc(&s_cache);
-        utils::IStream &fs = fc;
-        if (rabit::GetRank() == 0) {
-          fs.Write(findex);
-          rabit::Broadcast(&s_cache, 0);
-        } else {
-          rabit::Broadcast(&s_cache, 0);
-          fs.Read(&findex);
-        }
+      // sync the findex if it is subsample
+      std::string s_cache;
+      utils::MemoryBufferStream fc(&s_cache);
+      utils::IStream &fs = fc;
+      if (rabit::GetRank() == 0) {
+        fs.Write(findex);
       }
+      rabit::Broadcast(&s_cache, 0);
+      fs.Read(&findex);
     }
     
    private:
diff --git a/src/tree/updater_sync-inl.hpp b/src/tree/updater_sync-inl.hpp
index d29743bf3529..0cbbb4eede68 100644
--- a/src/tree/updater_sync-inl.hpp
+++ b/src/tree/updater_sync-inl.hpp
@@ -40,12 +40,11 @@ class TreeSyncher: public IUpdater {
       for (size_t i = 0; i < trees.size(); ++i) {
         trees[i]->SaveModel(fs);
       }
-      rabit::Broadcast(&s_model, 0);
-    } else {
-      rabit::Broadcast(&s_model, 0);
-      for (size_t i = 0; i < trees.size(); ++i) {      
-        trees[i]->LoadModel(fs);
-      }
+    }
+    fs.Seek(0);
+    rabit::Broadcast(&s_model, 0);
+    for (size_t i = 0; i < trees.size(); ++i) {      
+      trees[i]->LoadModel(fs);
     }
   }
 };
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index 89cc1b77da4a..a3f838131522 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -284,8 +284,8 @@ class BoostLearnTask {
 }
 
 int main(int argc, char *argv[]){
-  xgboost::random::Seed(0);
   xgboost::BoostLearnTask tsk;
+  tsk.SetParam("seed", "0");
   int ret = tsk.Run(argc, argv);
   rabit::Finalize();
   return ret;

From c8396ca24eb82994038574373af95126afcd0a63 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 21 Dec 2014 18:47:56 -0800
Subject: [PATCH 131/166] add mock exec

---
 Makefile                                        | 9 +++++++--
 multi-node/col-split/mushroom-col-rabit-mock.sh | 2 +-
 multi-node/row-split/machine-row-rabit-mock.sh  | 2 +-
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 5738f2573706..7c89d24da4f3 100644
--- a/Makefile
+++ b/Makefile
@@ -17,14 +17,15 @@ else
 endif
 
 # specify tensor path
-BIN = xgboost
+BIN = xgboost 
+MOCKBIN = xgboost-mock
 OBJ = updater.o gbm.o io.o main.o 
 MPIBIN = xgboost-mpi
 SLIB = wrapper/libxgboostwrapper.so 
 
 .PHONY: clean all mpi python Rpack librabit librabit_mpi
 
-all: $(BIN) $(OBJ) $(SLIB) mpi
+all: $(BIN) $(OBJ) $(SLIB) $(MOCKBIN)
 mpi: $(MPIBIN)
 
 # rules to get rabit library
@@ -42,10 +43,14 @@ gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
 io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
 main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h 
 xgboost-mpi:  updater.o gbm.o io.o main.o librabit_mpi
+xgboost-mock: updater.o gbm.o io.o main.o librabit
 xgboost:  updater.o gbm.o io.o main.o  librabit
 wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h  updater.o gbm.o io.o librabit
 
 $(BIN) : 
+	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS)  -lrabit
+
+$(MOCKBIN) : 
 	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS)  -lrabit_mock
 
 $(SLIB) :
diff --git a/multi-node/col-split/mushroom-col-rabit-mock.sh b/multi-node/col-split/mushroom-col-rabit-mock.sh
index 148e629a287f..269967419db8 100755
--- a/multi-node/col-split/mushroom-col-rabit-mock.sh
+++ b/multi-node/col-split/mushroom-col-rabit-mock.sh
@@ -16,7 +16,7 @@ k=$1
 python splitsvm.py ../../demo/data/agaricus.txt.train train $k
 
 # run xgboost mpi
-../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost mushroom-col.conf dsplit=col mock=0,1,0,0 mock=1,1,0,0
+../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost-mock mushroom-col.conf dsplit=col mock=0,1,0,0 mock=1,1,0,0
 
 # the model can be directly loaded by single machine xgboost solver, as usuall
 #../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
diff --git a/multi-node/row-split/machine-row-rabit-mock.sh b/multi-node/row-split/machine-row-rabit-mock.sh
index b2e04c9c7830..f61ef2152c18 100755
--- a/multi-node/row-split/machine-row-rabit-mock.sh
+++ b/multi-node/row-split/machine-row-rabit-mock.sh
@@ -17,4 +17,4 @@ cd -
 python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
 
 # run xgboost mpi
-../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost machine-row.conf dsplit=row num_round=3 mock=1,1,1,0
+../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost-mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0

From 6d7ef172ef52e8ab0e10deac06ae4ec620253c72 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 24 Dec 2014 02:33:50 -0800
Subject: [PATCH 132/166] add base64 model format

---
 src/learner/learner-inl.hpp |  54 ++++++++--
 src/utils/base64.h          | 205 ++++++++++++++++++++++++++++++++++++
 src/utils/thread_buffer.h   |   4 +-
 src/xgboost_main.cpp        |  15 +--
 4 files changed, 261 insertions(+), 17 deletions(-)
 create mode 100644 src/utils/base64.h

diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 201100a6ff31..cb02ee075b6c 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -12,6 +12,8 @@
 #include <limits>
 // rabit library for synchronization
 #include <rabit.h>
+#include "../utils/io.h"
+#include "../utils/base64.h"
 #include "./objective.h"
 #include "./evaluation.h"
 #include "../gbm/gbm.h"
@@ -36,6 +38,7 @@ class BoostLearner : public rabit::ISerializable {
     pred_buffer_size = 0;
     seed_per_iteration = 0;
     seed = 0;
+    save_base64 = 0;
   }
   virtual ~BoostLearner(void) {
     if (obj_ != NULL) delete obj_;
@@ -68,9 +71,6 @@ class BoostLearner : public rabit::ISerializable {
     utils::SPrintf(str_temp, sizeof(str_temp), "%lu", 
                    static_cast<unsigned long>(buffer_size));
     this->SetParam("num_pbuffer", str_temp);
-    if (!silent) {
-      utils::Printf("buffer_size=%ld\n", static_cast<long>(buffer_size));
-    }
     this->pred_buffer_size = buffer_size;
   }
   /*!
@@ -108,6 +108,7 @@ class BoostLearner : public rabit::ISerializable {
       this->seed = seed; random::Seed(atoi(val));
     }
     if (!strcmp("seed_per_iter", name)) seed_per_iteration = atoi(val);
+    if (!strcmp("save_base64", name)) save_base64 = atoi(val);
     if (!strcmp(name, "num_class")) this->SetParam("num_output_group", val);
     if (!strcmp(name, "nthread")) {
       omp_set_num_threads(atoi(val));
@@ -191,9 +192,29 @@ class BoostLearner : public rabit::ISerializable {
    * \param fname file name
    */
   inline void LoadModel(const char *fname) {
-    utils::FileStream fi(utils::FopenCheck(fname, "rb"));
+    FILE *fp = utils::FopenCheck(fname, "rb");
+    std::string header; header.resize(4);
+    utils::FileStream fi(fp);
+    // check header for different binary encode
+    // can be base64 or binary
+    if (fi.Read(&header[0], 4) != 0) {
+      // base64 format
+      if (header == "bs64") {
+        utils::Base64InStream bsin(fp);
+        bsin.InitPosition();
+        this->LoadModel(bsin);
+        fclose(fp);
+        return;
+      }
+      if (header == "binf") {
+        this->LoadModel(fi);
+        fclose(fp);
+        return;
+      }
+    }
+    fi.Seek(0);
     this->LoadModel(fi);
-    fi.Close();
+    fclose(fp);
   }
   inline void SaveModel(utils::IStream &fo, bool with_pbuffer = true) const {
     fo.Write(&mparam, sizeof(ModelParam));
@@ -206,9 +227,24 @@ class BoostLearner : public rabit::ISerializable {
    * \param fname file name
    */
   inline void SaveModel(const char *fname) const {
-    utils::FileStream fo(utils::FopenCheck(fname, "wb"));
-    this->SaveModel(fo);
-    fo.Close();
+    FILE *fp;
+    if (!strcmp(fname, "stdout")) {
+      fp = stdout;
+    } else {
+      fp = utils::FopenCheck(fname, "wb");      
+    }
+    utils::FileStream fo(fp);
+    std::string header;
+    if (save_base64 != 0|| fp == stdout) {
+      fo.Write("bs64\t", 5);
+      utils::Base64OutStream bout(fp);
+      this->SaveModel(bout);
+      bout.Finish('\n');
+    } else {
+      fo.Write("binf", 4);
+      this->SaveModel(fo);      
+    }
+    if (fp != stdout) fclose(fp);
   }
   /*!
    * \brief check if data matrix is ready to be used by training,
@@ -383,6 +419,8 @@ class BoostLearner : public rabit::ISerializable {
   // this is important for restart from existing iterations
   // default set to no, but will auto switch on in distributed mode
   int seed_per_iteration;
+  // save model in base64 encoding
+  int save_base64;
   // silent during training
   int silent;
   // distributed learning mode, if any, 0:none, 1:col, 2:row
diff --git a/src/utils/base64.h b/src/utils/base64.h
new file mode 100644
index 000000000000..36699199f7d6
--- /dev/null
+++ b/src/utils/base64.h
@@ -0,0 +1,205 @@
+#ifndef XGBOOST_UTILS_BASE64_H_
+#define XGBOOST_UTILS_BASE64_H_
+/*!
+ * \file base64.h
+ * \brief data stream support to input and output from/to base64 stream
+ * base64 is easier to store and pass as text format in mapreduce
+ * \author Tianqi Chen
+ */
+#include <cctype>
+#include <cstdio>
+#include "./utils.h"
+#include "./io.h"
+
+namespace xgboost {
+namespace utils {
+/*! \brief namespace of base64 decoding and encoding table */
+namespace base64 {
+const char DecodeTable[] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  62,  // '+'
+  0, 0, 0,
+  63,  // '/'
+  52, 53, 54, 55, 56, 57, 58, 59, 60, 61,  // '0'-'9'
+  0, 0, 0, 0, 0, 0, 0,
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+  13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  // 'A'-'Z'
+  0, 0, 0, 0, 0, 0,
+  26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+  39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,  // 'a'-'z'
+};
+static const char EncodeTable[] =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+} // namespace base64
+/*! \brief the stream that reads from base64; the FILE pointer is borrowed and never closed here */
+class Base64InStream: public IStream {
+ public:
+  explicit Base64InStream(FILE *fp) : fp(fp) {
+    num_prev = 0; tmp_ch = 0;
+  }
+  /*!
+   * \brief initialize the stream position to beginning of next base64 stream
+   * call this function before actually start read
+   */
+  inline void InitPosition(void) {
+    // skip leading whitespace and load the first character of the stream
+    do {
+      tmp_ch = fgetc(fp);
+    } while (isspace(tmp_ch));
+  }
+  /*! \brief whether current position is end of a base64 stream */
+  inline bool IsEOF(void) const {
+    return num_prev == 0 && (tmp_ch == EOF || isspace(tmp_ch));
+  }
+  /*!
+   * \brief decode up to size bytes into ptr
+   * \return number of bytes decoded; less than size if the stream ends first
+   */
+  virtual size_t Read(void *ptr, size_t size) {
+    if (size == 0) return 0;
+    // use tlen to record left size
+    size_t tlen = size;
+    unsigned char *cptr = static_cast<unsigned char*>(ptr);
+    // first hand out bytes that were decoded but not consumed by the previous call
+    while (num_prev != 0 && tlen != 0) {
+      *cptr++ = buf_prev[0]; --tlen;
+      if (--num_prev != 0) buf_prev[0] = buf_prev[1];
+    }
+    if (tlen == 0) return size;
+    // 24-bit accumulator for the 4-character unit being decoded
+    int nvalue;
+    // note: every 4 characters of base64 carry 3 bytes,
+    // so we process 4 characters as a unit
+    while (tlen && tmp_ch != EOF && !isspace(tmp_ch)) {
+      // first character
+      nvalue = Decode(tmp_ch) << 18;
+      {
+        // second character, completes the first output byte
+        Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
+              "invalid base64 format");
+        nvalue |= Decode(tmp_ch) << 12;
+        *cptr++ = (nvalue >> 16) & 0xFF; --tlen;
+      }
+      {
+        // third character
+        Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
+              "invalid base64 format");
+        // handle termination: "==" means only one byte was left
+        if (tmp_ch == '=') {
+          Check((tmp_ch = fgetc(fp), tmp_ch == '='), "invalid base64 format");
+          Check((tmp_ch = fgetc(fp), tmp_ch == EOF || isspace(tmp_ch)),
+                "invalid base64 format");
+          break;
+        }
+        nvalue |= Decode(tmp_ch) << 6;
+        if (tlen) {
+          *cptr++ = (nvalue >> 8) & 0xFF; --tlen;
+        } else {
+          // output buffer is full, keep the byte for the next Read
+          buf_prev[num_prev++] = (nvalue >> 8) & 0xFF;
+        }
+      }
+      {
+        // fourth character, a single '=' means only two bytes were left
+        Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
+              "invalid base64 format");
+        if (tmp_ch == '=') {
+          Check((tmp_ch = fgetc(fp), tmp_ch == EOF || isspace(tmp_ch)),
+                "invalid base64 format");
+          break;
+        }
+        nvalue |= Decode(tmp_ch);
+        if (tlen) {
+          *cptr++ = nvalue & 0xFF; --tlen;
+        } else {
+          buf_prev[num_prev++] = nvalue & 0xFF;
+        }
+      }
+      // get next char
+      tmp_ch = fgetc(fp);
+    }
+    if (kStrictCheck) {
+      Check(tlen == 0, "Base64InStream: read incomplete");
+    }
+    return size - tlen;
+  }
+  virtual void Write(const void *ptr, size_t size) {
+    utils::Error("Base64InStream do not support write");
+  }
+
+ private:
+  /*! \brief 6-bit value of a base64 character; rejects characters outside the decode table */
+  inline static int Decode(int ch) {
+    Check(ch >= 0 && ch < static_cast<int>(sizeof(base64::DecodeTable)),
+          "invalid base64 format");
+    return base64::DecodeTable[ch];
+  }
+  FILE *fp;
+  // lookahead character; int (not unsigned char) so EOF from fgetc stays detectable
+  int tmp_ch;
+  // number of valid bytes in buf_prev
+  int num_prev;
+  // decoded bytes that did not fit into the previous Read request
+  unsigned char buf_prev[2];
+  // whether we need to do strict check
+  static const bool kStrictCheck = false;
+};
+/*! \brief the stream that writes base64 text to a FILE pointer; the FILE is borrowed, never closed here */
+class Base64OutStream: public IStream {
+ public:
+  explicit Base64OutStream(FILE *fp) : fp(fp) {
+    buf_top = 0;
+  }
+  virtual void Write(const void *ptr, size_t size) {
+    using base64::EncodeTable;
+    size_t tlen = size;
+    const unsigned char *cptr = static_cast<const unsigned char*>(ptr);
+    while (tlen) {
+      while (buf_top < 3  && tlen != 0) {
+        buf[++buf_top] = *cptr++; --tlen;
+      }
+      if (buf_top == 3) {
+        // three input bytes are buffered: flush them as 4 base64 characters
+        fputc(EncodeTable[buf[1] >> 2], fp);
+        fputc(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F], fp);
+        fputc(EncodeTable[((buf[2] << 2) | (buf[3] >> 6)) & 0x3F], fp);
+        fputc(EncodeTable[buf[3] & 0x3F], fp);
+        buf_top = 0;
+      }
+    }
+  }
+  virtual size_t Read(void *ptr, size_t size) {
+    Error("Base64OutStream do not support read");
+    return 0;
+  }
+  /*!
+   * \brief finish writing of all current base64 stream: emit the buffered tail with '=' padding
+   * \param endch character to put to end of stream, if it is EOF, then nothing will be done
+   */
+  inline void Finish(int endch = EOF) {
+    using base64::EncodeTable;
+    if (buf_top == 1) {
+      fputc(EncodeTable[buf[1] >> 2], fp);
+      fputc(EncodeTable[(buf[1] << 4) & 0x3F], fp);
+      fputc('=', fp);
+      fputc('=', fp);
+    }
+    if (buf_top == 2) {
+      fputc(EncodeTable[buf[1] >> 2], fp);
+      fputc(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F], fp);
+      fputc(EncodeTable[(buf[2] << 2) & 0x3F], fp);
+      fputc('=', fp);
+    }
+    buf_top = 0;
+    if (endch != EOF) fputc(endch, fp);  // endch is int so the EOF sentinel survives where char is unsigned
+  }
+
+ private:
+  FILE *fp;
+  int buf_top;  // number of pending input bytes, stored in buf[1..buf_top]
+  unsigned char buf[4];  // slot 0 is unused, pending bytes live in slots 1..3
+};
+}  // namespace utils
+}  // namespace xgboost
+#endif  // XGBOOST_UTILS_BASE64_H_
diff --git a/src/utils/thread_buffer.h b/src/utils/thread_buffer.h
index ace50c4b8903..ed36e1b43551 100644
--- a/src/utils/thread_buffer.h
+++ b/src/utils/thread_buffer.h
@@ -1,5 +1,5 @@
-#ifndef XGBOOST_UTILS_THREAD_BUFFER_H
-#define XGBOOST_UTILS_THREAD_BUFFER_H
+#ifndef XGBOOST_UTILS_THREAD_BUFFER_H_
+#define XGBOOST_UTILS_THREAD_BUFFER_H_
 /*!
  * \file thread_buffer.h
  * \brief  multi-thread buffer, iterator, can be used to create parallel pipeline
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index a3f838131522..7816fbfd262e 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -31,6 +31,11 @@ class BoostLearnTask {
         this->SetParam(name, val);
       }
     }
+    // do not save anything when save to stdout
+    if (model_out == "stdout") {
+      this->SetParam("silent", "1");
+      save_period = 0;
+    }
     // whether need data rank
     bool need_data_rank = strchr(train_path.c_str(), '%') != NULL;
     // if need data rank in loading, initialize rabit engine before load data
@@ -41,7 +46,7 @@ class BoostLearnTask {
     if (!need_data_rank) rabit::Init(argc, argv);
     if (rabit::IsDistributed()) {
       std::string pname = rabit::GetProcessorName();
-      printf("start %s:%d\n", pname.c_str(), rabit::GetRank());
+      fprintf(stderr, "start %s:%d\n", pname.c_str(), rabit::GetRank());
     }
     if (rabit::IsDistributed()) {
       this->SetParam("data_split", "col");
@@ -158,9 +163,7 @@ class BoostLearnTask {
   }
   inline void InitLearner(void) {
     if (model_in != "NULL") {
-      utils::FileStream fi(utils::FopenCheck(model_in.c_str(), "rb"));
-      learner.LoadModel(fi);
-      fi.Close();
+      learner.LoadModel(model_in.c_str());
     } else {
       utils::Assert(task == "train", "model_in not specified");
       learner.InitModel();
@@ -215,9 +218,7 @@ class BoostLearnTask {
   }
   inline void SaveModel(const char *fname) const {
     if (rabit::GetRank() != 0) return;
-    utils::FileStream fo(utils::FopenCheck(fname, "wb"));
-    learner.SaveModel(fo);
-    fo.Close();
+    learner.SaveModel(fname);
   }
   inline void SaveModel(int i) const {
     char fname[256];

From c8f422b3b9b15f3c93be90a00112f5ac738a8704 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 24 Dec 2014 02:56:32 -0800
Subject: [PATCH 133/166] add dump to linear model

---
 src/gbm/gblinear-inl.hpp | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/gbm/gblinear-inl.hpp b/src/gbm/gblinear-inl.hpp
index 005eada55d24..8cbe8becf437 100644
--- a/src/gbm/gblinear-inl.hpp
+++ b/src/gbm/gblinear-inl.hpp
@@ -8,6 +8,7 @@
  */
 #include <vector>
 #include <string>
+#include <sstream>
 #include <algorithm>
 #include "./gbm.h"
 #include "../tree/updater.h"
@@ -142,8 +143,20 @@ class GBLinear : public IGradBooster {
     utils::Error("gblinear does not support predict leaf index");
   }
   virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
-    utils::Error("gblinear does not support dump model");
-    return std::vector<std::string>();
+    std::stringstream fo("");
+    fo << "bias:\n";
+    for (int i = 0; i < model.param.num_output_group; ++i) {
+      fo << model.bias()[i] << std::endl;
+    }
+    fo << "weight:\n";
+    for (int i = 0; i < model.param.num_output_group; ++i) {
+      for (int j = 0; j <model.param.num_feature; ++j) {
+        fo << model[i][j] << std::endl;
+      }
+    }
+    std::vector<std::string> v;
+    v.push_back(fo.str());
+    return v;
   }
 
  protected:

From c395c5bed3109f834a63384901321e04148450a2 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 29 Dec 2014 17:41:47 -0800
Subject: [PATCH 134/166] update build script

---
 build.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/build.sh b/build.sh
index 35a566cccf1c..f6fbcb74b808 100755
--- a/build.sh
+++ b/build.sh
@@ -3,6 +3,13 @@
 # basically, it first try to make with OpenMP, if fails, disable OpenMP and make again
 # This will automatically make xgboost for MAC users who do not have openmp support
 # In most cases, type make will give what you want
+
+# download rabit
+if [ ! -d rabit ]; then
+    git clone https://github.com/tqchen/rabit.git
+else
+    cd rabit; git pull; cd ..
+fi
 if make; then
     echo "Successfully build multi-thread xgboost"
 else

From 5ad100b5a3c59545d2d3694dab9b8ca9a1f0fa7c Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 29 Dec 2014 19:24:08 -0800
Subject: [PATCH 135/166] now support distributed evaluation

---
 .../row-split/machine-row-rabit-mock.sh       |  2 +-
 multi-node/row-split/machine-row-rabit.sh     |  6 +--
 src/learner/evaluation-inl.hpp                | 48 +++++++++++++++----
 src/learner/evaluation.h                      | 11 +++--
 src/learner/learner-inl.hpp                   |  2 +-
 src/xgboost_main.cpp                          | 10 +++-
 6 files changed, 59 insertions(+), 20 deletions(-)

diff --git a/multi-node/row-split/machine-row-rabit-mock.sh b/multi-node/row-split/machine-row-rabit-mock.sh
index f61ef2152c18..b08e7d4e69fc 100755
--- a/multi-node/row-split/machine-row-rabit-mock.sh
+++ b/multi-node/row-split/machine-row-rabit-mock.sh
@@ -17,4 +17,4 @@ cd -
 python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
 
 # run xgboost mpi
-../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost-mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0
+../../rabit/tracker/rabit_mpi.py -n $k ../../rabit/test/keepalive.sh ../../xgboost-mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0 
diff --git a/multi-node/row-split/machine-row-rabit.sh b/multi-node/row-split/machine-row-rabit.sh
index 4a526ff94c5c..69f94b9d1442 100755
--- a/multi-node/row-split/machine-row-rabit.sh
+++ b/multi-node/row-split/machine-row-rabit.sh
@@ -17,8 +17,8 @@ cd -
 python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
 
 # run xgboost mpi
-../../rabit/tracker/rabit_mpi.py $k local ../../xgboost machine-row.conf dsplit=row num_round=3
+../../rabit/tracker/rabit_mpi.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=3 eval_train=1
 
 # run xgboost-mpi save model 0001, continue to run from existing model
-../../rabit/tracker/rabit_mpi.py $k local ../../xgboost machine-row.conf dsplit=row num_round=1
-../../rabit/tracker/rabit_mpi.py $k local ../../xgboost machine-row.conf dsplit=row num_round=2 model_in=0001.model
+../../rabit/tracker/rabit_mpi.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=1
+../../rabit/tracker/rabit_mpi.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=2 model_in=0001.model
diff --git a/src/learner/evaluation-inl.hpp b/src/learner/evaluation-inl.hpp
index fb0b8953daab..7334ad6891c7 100644
--- a/src/learner/evaluation-inl.hpp
+++ b/src/learner/evaluation-inl.hpp
@@ -11,6 +11,8 @@
 #include <cmath>
 #include <climits>
 #include <algorithm>
+// rabit library for synchronization
+#include <rabit.h>
 #include "./evaluation.h"
 #include "./helper_utils.h"
 
@@ -23,7 +25,8 @@ namespace learner {
 template<typename Derived>
 struct EvalEWiseBase : public IEvaluator {
   virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info) const {
+                     const MetaInfo &info,
+                     bool distributed) const {
     utils::Check(info.labels.size() != 0, "label set cannot be empty");
     utils::Check(preds.size() % info.labels.size() == 0,
                  "label and prediction size not match");
@@ -37,7 +40,11 @@ struct EvalEWiseBase : public IEvaluator {
       sum += Derived::EvalRow(info.labels[i], preds[i]) * wt;
       wsum += wt;
     }
-    return Derived::GetFinal(sum, wsum);
+    float dat[2]; dat[0] = sum, dat[1] = wsum;
+    if (distributed) {
+      rabit::Allreduce<rabit::op::Sum>(dat, 2);
+    }
+    return Derived::GetFinal(dat[0], dat[1]);
   }
   /*! 
    * \brief to be implemented by subclass, 
@@ -113,7 +120,9 @@ struct EvalCTest: public IEvaluator {
     return name_.c_str();
   }
   virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info) const {
+                     const MetaInfo &info,
+                     bool distributed) const {
+    utils::Check(!distributed, "metric %s do not support distributed evaluation", name_.c_str());
     utils::Check(preds.size() % info.labels.size() == 0,
                  "label and prediction size not match");
     size_t ngroup = preds.size() / info.labels.size() - 1;
@@ -150,7 +159,9 @@ struct EvalAMS : public IEvaluator {
     utils::Check(std::sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format");
   }
   virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info) const {
+                     const MetaInfo &info,
+                     bool distributed) const {
+    utils::Check(!distributed, "metric AMS do not support distributed evaluation");
     using namespace std;
     const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
 
@@ -212,7 +223,9 @@ struct EvalPrecisionRatio : public IEvaluator{
     }
   }
   virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info) const {
+                     const MetaInfo &info,
+                     bool distributed) const {
+    utils::Check(!distributed, "metric %s do not support distributed evaluation", Name());
     utils::Check(info.labels.size() != 0, "label set cannot be empty");    
     utils::Assert(preds.size() % info.labels.size() == 0,
                   "label size predict size not match");
@@ -252,7 +265,8 @@ struct EvalPrecisionRatio : public IEvaluator{
 /*! \brief Area under curve, for both classification and rank */
 struct EvalAuc : public IEvaluator {
   virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info) const {
+                     const MetaInfo &info,
+                     bool distributed) const {
     utils::Check(info.labels.size() != 0, "label set cannot be empty");
     utils::Check(preds.size() % info.labels.size() == 0,
                  "label size predict size not match");
@@ -299,8 +313,14 @@ struct EvalAuc : public IEvaluator {
         sum_auc += sum_pospair / (sum_npos*sum_nneg);
       }
     }
-    // return average AUC over list
-    return static_cast<float>(sum_auc) / ngroup;
+    if (distributed) {
+      float dat[2]; dat[0] = sum_auc; dat[1] = ngroup;      
+      // approximately estimate auc using mean
+      rabit::Allreduce<rabit::op::Sum>(dat, 2);
+      return dat[0] / dat[1];
+    } else {
+      return static_cast<float>(sum_auc) / ngroup;
+    }
   }
   virtual const char *Name(void) const {
     return "auc";
@@ -311,7 +331,8 @@ struct EvalAuc : public IEvaluator {
 struct EvalRankList : public IEvaluator {
  public:
   virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info) const {
+                     const MetaInfo &info,
+                     bool distributed) const {
     utils::Check(preds.size() == info.labels.size(),
                   "label size predict size not match");
     // quick consistency when group is not available
@@ -336,7 +357,14 @@ struct EvalRankList : public IEvaluator {
         sum_metric += this->EvalMetric(rec);
       }
     }
-    return static_cast<float>(sum_metric) / ngroup;
+    if (distributed) {
+      float dat[2]; dat[0] = sum_metric; dat[1] = ngroup;      
+      // approximately estimate the ranking metric using the mean over groups
+      rabit::Allreduce<rabit::op::Sum>(dat, 2);
+      return dat[0] / dat[1];
+    } else {
+      return static_cast<float>(sum_metric) / ngroup;
+    }
   }
   virtual const char *Name(void) const {
     return name_.c_str();
diff --git a/src/learner/evaluation.h b/src/learner/evaluation.h
index 33370e706f3a..4d59e270aba5 100644
--- a/src/learner/evaluation.h
+++ b/src/learner/evaluation.h
@@ -19,9 +19,13 @@ struct IEvaluator{
    * \brief evaluate a specific metric
    * \param preds prediction
    * \param info information, including label etc.
+   * \param distributed whether a call to Allreduce is needed to gather 
+   *        the average statistics across all the node,
+   *        this is only supported by some metrics
    */
   virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info) const = 0;
+                     const MetaInfo &info,
+                     bool distributed = false) const = 0;
   /*! \return name of metric */
   virtual const char *Name(void) const = 0;
   /*! \brief virtual destructor */
@@ -70,10 +74,11 @@ class EvalSet{
   }
   inline std::string Eval(const char *evname,
                           const std::vector<float> &preds,
-                          const MetaInfo &info) const {
+                          const MetaInfo &info,
+                          bool distributed = false) {
     std::string result = "";
     for (size_t i = 0; i < evals_.size(); ++i) {
-      float res = evals_[i]->Eval(preds, info);
+      float res = evals_[i]->Eval(preds, info, distributed);
       char tmp[1024];
       utils::SPrintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res);
       result += tmp;
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index cb02ee075b6c..ae0967ce8bd0 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -287,7 +287,7 @@ class BoostLearner : public rabit::ISerializable {
     for (size_t i = 0; i < evals.size(); ++i) {
       this->PredictRaw(*evals[i], &preds_);
       obj_->EvalTransform(&preds_);
-      res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info);
+      res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info, distributed_mode == 2);
     }
     return res;
   }
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index 7816fbfd262e..52ce4e58b4f3 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -181,8 +181,14 @@ class BoostLearnTask {
       if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
       learner.UpdateOneIter(i, *data); 
       std::string res = learner.EvalOneIter(i, devalall, eval_data_names);
-      if (silent < 2) {
-        fprintf(stderr, "%s\n", res.c_str());
+      if (rabit::IsDistributed()){
+        if (rabit::GetRank() == 0) {
+          rabit::TrackerPrintf("%s\n", res.c_str());
+        }
+      } else {
+        if (silent < 2) {
+          fprintf(stderr, "%s\n", res.c_str());
+        }
       }
       if (save_period != 0 && (i + 1) % save_period == 0) {
         this->SaveModel(i);

From f82732a3629221674a3729e5f6a9a35dd928f01c Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Tue, 6 Jan 2015 17:09:15 +0800
Subject: [PATCH 136/166] add hadoop folder

---
 multi-node/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/multi-node/README.md b/multi-node/README.md
index 31067af5d6eb..02d6fc82074e 100644
--- a/multi-node/README.md
+++ b/multi-node/README.md
@@ -22,6 +22,8 @@ Design Choice
   - Row-based solver split data by row, each node work on subset of rows,
     it uses an approximate histogram count algorithm, and will only examine subset of 
     potential split points as opposed to all split points.
+  - Hadoop version can run on an existing Hadoop platform;
+    it uses Rabit to submit jobs as map-reduce tasks.
 
 Usage
 ====

From e20d4f43870b092fcc5ab1e85f8e48fbb54baa95 Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sat, 10 Jan 2015 12:26:43 +0800
Subject: [PATCH 137/166] comment some parameters not supported by hadoop
 version of xgboost

---
 .../mushroom.hadoop.conf                      | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 demo/binary_classification/mushroom.hadoop.conf

diff --git a/demo/binary_classification/mushroom.hadoop.conf b/demo/binary_classification/mushroom.hadoop.conf
new file mode 100644
index 000000000000..1dffe4f8d61e
--- /dev/null
+++ b/demo/binary_classification/mushroom.hadoop.conf
@@ -0,0 +1,31 @@
+# General Parameters, see comment for each definition
+# choose the booster, can be gbtree or gblinear
+booster = gbtree
+# choose logistic regression loss function for binary classification
+objective = binary:logistic
+
+# Tree Booster Parameters
+# step size shrinkage
+eta = 1.0 
+# minimum loss reduction required to make a further partition
+gamma = 1.0 
+# minimum sum of instance weight(hessian) needed in a child
+min_child_weight = 1 
+# maximum depth of a tree
+max_depth = 3 
+
+# Task Parameters
+# the number of round to do boosting
+num_round = 2
+# 0 means do not save any model except the final round model
+save_period = 0 
+# The path of training data
+data = "agaricus.txt.train" 
+
+# The following parameters are not supported by xgboost running in hadoop yet!
+# The path of validation data, used to monitor training process, here [test] sets name of the validation set
+#eval[test] = "agaricus.txt.test" 
+# evaluate on training data as well each round
+#eval_train = 1
+# The path of test data 
+#test:data = "agaricus.txt.test"      

From 61a43111a7d3cf533264084152105c7240aa6867 Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sat, 10 Jan 2015 12:30:00 +0800
Subject: [PATCH 138/166] hadoop version of xgboost binary classification
 script

---
 .../run_binary_classification_on_hadoop.sh    | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100755 multi-node/hadoop/run_binary_classification_on_hadoop.sh

diff --git a/multi-node/hadoop/run_binary_classification_on_hadoop.sh b/multi-node/hadoop/run_binary_classification_on_hadoop.sh
new file mode 100755
index 000000000000..c194fcea4ae6
--- /dev/null
+++ b/multi-node/hadoop/run_binary_classification_on_hadoop.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+if [ "$#" -lt 2 ];
+then
+    echo "Usage: <nworkers> <path_in_HDFS>"
+    exit -1
+fi
+
+curDir=`pwd`
+dataDir=../../demo/binary_classification
+trainFile=$dataDir/agaricus.txt.train
+input=$2
+output=$2/model
+
+# generate the training file if it doesnot exist
+if [ ! -f "$trainFile" ];
+then 
+  echo "Generating training file:"
+  cd $dataDir
+  # map feature using indicator encoding, also produce featmap.txt
+  python mapfeat.py
+  # split train and test
+  python mknfold.py agaricus.txt 1
+  cd $curDir
+fi
+
+hadoop fs -mkdir $input
+hadoop fs -put $trainFile $input
+#hadoop fs -rm -skipTrash -r $output
+
+# training and output the models
+python ../../rabit/tracker/rabit_hadoop.py -n 3 -i $input/agaricus.txt.train -o $output -f $dataDir/mushroom.hadoop.conf \
+    --jobname xgboost_hadoop ../../xgboost mushroom.hadoop.conf dsplit=row num_round=3 data=stdin model_out=stdout
+
+# get the final model file
+hadoop fs -get $output/part-00000 ./final.model
+# output prediction task=pred 
+../../xgboost $dataDir/mushroom.hadoop.conf task=pred model_in=./final.model
+# print the boosters of 00002.model in dump.raw.txt
+../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+# use the feature map in printing for better visualization
+../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model fmap=$dataDir/featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt

From 24f99220cbd7955f51e047e2feff030d5e7423a3 Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sat, 10 Jan 2015 23:59:25 +0800
Subject: [PATCH 139/166] fix bugs

---
 .../hadoop/run_binary_classification.sh       | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100755 multi-node/hadoop/run_binary_classification.sh

diff --git a/multi-node/hadoop/run_binary_classification.sh b/multi-node/hadoop/run_binary_classification.sh
new file mode 100755
index 000000000000..740a468cf668
--- /dev/null
+++ b/multi-node/hadoop/run_binary_classification.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+if [ "$#" -lt 2 ];
+then
+    echo "Usage: <nworkers> <path_in_HDFS>"
+    exit -1
+fi
+
+curDir=`pwd`
+dataDir=../../demo/binary_classification
+trainFile=$dataDir/agaricus.txt.train
+input=$2
+output=$2/model
+
+# generate the training file if it doesnot exist
+if [ ! -f "$trainFile" ];
+then 
+  echo "Generating training file:"
+  cd $dataDir
+  # map feature using indicator encoding, also produce featmap.txt
+  python mapfeat.py
+  # split train and test
+  python mknfold.py agaricus.txt 1
+  cd $curDir
+fi
+
+hadoop fs -mkdir $input
+hadoop fs -put $trainFile $input
+#hadoop fs -rm -skipTrash -r $output
+
+# training and output the final model file
+python ../../rabit/tracker/rabit_hadoop.py -n $1 -i $input/agaricus.txt.train -o $output -f $dataDir/mushroom.hadoop.conf \
+    --jobname xgboost_hadoop ../../xgboost mushroom.hadoop.conf data=stdin model_out=stdout
+
+# get the final model file
+hadoop fs -get $output/part-00000 ./final.model
+# output prediction task=pred 
+../../xgboost $dataDir/mushroom.hadoop.conf task=pred model_in=final.model
+# print the boosters of 00002.model in dump.raw.txt
+../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+# use the feature map in printing for better visualization
+../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model fmap=$dataDir/featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt

From 74348c8001e8eab706fae8c91f21ac8a9e022e92 Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sun, 11 Jan 2015 00:00:03 +0800
Subject: [PATCH 140/166] initialize

---
 multi-node/hadoop/README.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 multi-node/hadoop/README.md

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
new file mode 100644
index 000000000000..adfacdb8b9a9
--- /dev/null
+++ b/multi-node/hadoop/README.md
@@ -0,0 +1,15 @@
+Distributed XGBoost: Hadoop Version
+====
+* Hadoop version: run ```bash run_binary_classification.sh <n_hadoop_workers> <path_in_HDFS>```
+  - This is the hadoop version of binary classification example in the demo folder.
+
+How to Use
+====
+* Check whether environment variable $HADOOP_HOME exists (e.g. run ```echo $HADOOP_HOME```). If not, plz set up hadoop-streaming.jar path in rabit_hadoop.py. 
+
+Notes
+====
+* The code has been tested on MapReduce 1 (MRv1), it should be ok to run on MapReduce 2 (MRv2, YARN).
+* The code is multi-threaded, so you want to run one xgboost per node/worker, which means the parameter <n_workers> should be less than the number of slaves/workers. 
+* The hadoop version now can only save the final model and evaluate test data locally after the training process.
+

From 7665dd1ed26abf246caaeb820e6ae83a42546ceb Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sun, 11 Jan 2015 00:04:47 +0800
Subject: [PATCH 141/166] rename

---
 .../run_binary_classification_on_hadoop.sh    | 43 -------------------
 1 file changed, 43 deletions(-)
 delete mode 100755 multi-node/hadoop/run_binary_classification_on_hadoop.sh

diff --git a/multi-node/hadoop/run_binary_classification_on_hadoop.sh b/multi-node/hadoop/run_binary_classification_on_hadoop.sh
deleted file mode 100755
index c194fcea4ae6..000000000000
--- a/multi-node/hadoop/run_binary_classification_on_hadoop.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-if [ "$#" -lt 2 ];
-then
-    echo "Usage: <nworkers> <path_in_HDFS>"
-    exit -1
-fi
-
-curDir=`pwd`
-dataDir=../../demo/binary_classification
-trainFile=$dataDir/agaricus.txt.train
-input=$2
-output=$2/model
-
-# generate the training file if it doesnot exist
-if [ ! -f "$trainFile" ];
-then 
-  echo "Generating training file:"
-  cd $dataDir
-  # map feature using indicator encoding, also produce featmap.txt
-  python mapfeat.py
-  # split train and test
-  python mknfold.py agaricus.txt 1
-  cd $curDir
-fi
-
-hadoop fs -mkdir $input
-hadoop fs -put $trainFile $input
-#hadoop fs -rm -skipTrash -r $output
-
-# training and output the models
-python ../../rabit/tracker/rabit_hadoop.py -n 3 -i $input/agaricus.txt.train -o $output -f $dataDir/mushroom.hadoop.conf \
-    --jobname xgboost_hadoop ../../xgboost mushroom.hadoop.conf dsplit=row num_round=3 data=stdin model_out=stdout
-
-# get the final model file
-hadoop fs -get $output/part-00000 ./final.model
-# output prediction task=pred 
-../../xgboost $dataDir/mushroom.hadoop.conf task=pred model_in=./final.model
-# print the boosters of 00002.model in dump.raw.txt
-../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
-# use the feature map in printing for better visualization
-../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model fmap=$dataDir/featmap.txt name_dump=dump.nice.txt
-cat dump.nice.txt

From d5e9b1d4eadf9831cba51b3ef52106696c3082f1 Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sun, 11 Jan 2015 13:08:52 +0800
Subject: [PATCH 142/166] delete hadoop conf

---
 .../mushroom.hadoop.conf                      | 31 -------------------
 1 file changed, 31 deletions(-)
 delete mode 100644 demo/binary_classification/mushroom.hadoop.conf

diff --git a/demo/binary_classification/mushroom.hadoop.conf b/demo/binary_classification/mushroom.hadoop.conf
deleted file mode 100644
index 1dffe4f8d61e..000000000000
--- a/demo/binary_classification/mushroom.hadoop.conf
+++ /dev/null
@@ -1,31 +0,0 @@
-# General Parameters, see comment for each definition
-# choose the booster, can be gbtree or gblinear
-booster = gbtree
-# choose logistic regression loss function for binary classification
-objective = binary:logistic
-
-# Tree Booster Parameters
-# step size shrinkage
-eta = 1.0 
-# minimum loss reduction required to make a further partition
-gamma = 1.0 
-# minimum sum of instance weight(hessian) needed in a child
-min_child_weight = 1 
-# maximum depth of a tree
-max_depth = 3 
-
-# Task Parameters
-# the number of round to do boosting
-num_round = 2
-# 0 means do not save any model except the final round model
-save_period = 0 
-# The path of training data
-data = "agaricus.txt.train" 
-
-# The following parameters are not supported by xgboost running in hadoop yet!
-# The path of validation data, used to monitor training process, here [test] sets name of the validation set
-#eval[test] = "agaricus.txt.test" 
-# evaluate on training data as well each round
-#eval_train = 1
-# The path of test data 
-#test:data = "agaricus.txt.test"      

From 9eaf073e3c8344f677fd5233041ad8dfc2ec9cb7 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 10 Jan 2015 21:33:07 -0800
Subject: [PATCH 143/166] change default distributed mode to row

---
 src/xgboost_main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index 52ce4e58b4f3..9440c791a9d8 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -49,7 +49,7 @@ class BoostLearnTask {
       fprintf(stderr, "start %s:%d\n", pname.c_str(), rabit::GetRank());
     }
     if (rabit::IsDistributed()) {
-      this->SetParam("data_split", "col");
+      this->SetParam("dsplit", "row");
     }
     if (rabit::GetRank() != 0) {
       this->SetParam("silent", "2");

From 2f95968a1ca46e4ade45765a612e473cb917a45f Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sun, 11 Jan 2015 15:34:55 +0800
Subject: [PATCH 144/166] ok

---
 multi-node/hadoop/run_hadoop_mushroom.sh | 29 ++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100755 multi-node/hadoop/run_hadoop_mushroom.sh

diff --git a/multi-node/hadoop/run_hadoop_mushroom.sh b/multi-node/hadoop/run_hadoop_mushroom.sh
new file mode 100755
index 000000000000..2f095ff2554a
--- /dev/null
+++ b/multi-node/hadoop/run_hadoop_mushroom.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+if [ "$#" -lt 2 ];
+then
+    echo "Usage: <num_of_slave_nodes> <path_in_HDFS>"
+    exit -1
+fi
+
+# put the local training file to HDFS
+hadoop fs -mkdir $2/data
+hadoop fs -put ../../demo/data/agaricus.txt.train $2/data
+
+# training and output the final model file
+../../rabit/tracker/rabit_hadoop.py -n $1 -i $2/data/agaricus.txt.train \
+    -o $2/model -f ../../demo/data/agaricus.txt.test \
+    ../../xgboost mushroom.hadoop.conf dsplit=row 
+
+# get the final model file
+hadoop fs -get $2/model/part-00000 ./final.model
+
+# output prediction task=pred 
+../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
+    test:data=../../demo/data/agaricus.txt.test
+# print the boosters of final.model in dump.raw.txt
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+# use the feature map in printing for better visualization
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
+    fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt

From fb65356dd421662d4e6608c940f123cce9f87d03 Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sun, 11 Jan 2015 15:41:46 +0800
Subject: [PATCH 145/166] change file name

---
 .../hadoop/run_binary_classification.sh       | 43 -------------------
 1 file changed, 43 deletions(-)
 delete mode 100755 multi-node/hadoop/run_binary_classification.sh

diff --git a/multi-node/hadoop/run_binary_classification.sh b/multi-node/hadoop/run_binary_classification.sh
deleted file mode 100755
index 740a468cf668..000000000000
--- a/multi-node/hadoop/run_binary_classification.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-if [ "$#" -lt 2 ];
-then
-    echo "Usage: <nworkers> <path_in_HDFS>"
-    exit -1
-fi
-
-curDir=`pwd`
-dataDir=../../demo/binary_classification
-trainFile=$dataDir/agaricus.txt.train
-input=$2
-output=$2/model
-
-# generate the training file if it doesnot exist
-if [ ! -f "$trainFile" ];
-then 
-  echo "Generating training file:"
-  cd $dataDir
-  # map feature using indicator encoding, also produce featmap.txt
-  python mapfeat.py
-  # split train and test
-  python mknfold.py agaricus.txt 1
-  cd $curDir
-fi
-
-hadoop fs -mkdir $input
-hadoop fs -put $trainFile $input
-#hadoop fs -rm -skipTrash -r $output
-
-# training and output the final model file
-python ../../rabit/tracker/rabit_hadoop.py -n $1 -i $input/agaricus.txt.train -o $output -f $dataDir/mushroom.hadoop.conf \
-    --jobname xgboost_hadoop ../../xgboost mushroom.hadoop.conf data=stdin model_out=stdout
-
-# get the final model file
-hadoop fs -get $output/part-00000 ./final.model
-# output prediction task=pred 
-../../xgboost $dataDir/mushroom.hadoop.conf task=pred model_in=final.model
-# print the boosters of 00002.model in dump.raw.txt
-../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
-# use the feature map in printing for better visualization
-../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model fmap=$dataDir/featmap.txt name_dump=dump.nice.txt
-cat dump.nice.txt

From ceabf5755f447c40a22adcaaef510aceff2723e5 Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sun, 11 Jan 2015 15:44:16 +0800
Subject: [PATCH 146/166] hadoop version conf

---
 multi-node/hadoop/mushroom.hadoop.conf | 30 ++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 multi-node/hadoop/mushroom.hadoop.conf

diff --git a/multi-node/hadoop/mushroom.hadoop.conf b/multi-node/hadoop/mushroom.hadoop.conf
new file mode 100644
index 000000000000..305b82dd3ac4
--- /dev/null
+++ b/multi-node/hadoop/mushroom.hadoop.conf
@@ -0,0 +1,30 @@
+# General Parameters, see comment for each definition
+# choose the booster, can be gbtree or gblinear
+booster = gbtree
+# choose logistic regression loss function for binary classification
+objective = binary:logistic
+
+# Tree Booster Parameters
+# step size shrinkage
+eta = 1.0 
+# minimum loss reduction required to make a further partition
+gamma = 1.0 
+# minimum sum of instance weight(hessian) needed in a child
+min_child_weight = 1 
+# maximum depth of a tree
+max_depth = 3 
+
+# Task Parameters
+# the number of round to do boosting
+num_round = 2
+# 0 means do not save any model except the final round model
+save_period = 0 
+# The path of training data
+data = stdin
+# The path of model file
+model_out = stdout 
+
+# The path of validation data, used to monitor training process, here [test] sets name of the validation set
+eval[test] = "agaricus.txt.test" 
+# evaluate on training data as well each round
+eval_train = 1

From 69e079941e2612cb6379a97126b3d33caef66af8 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 10 Jan 2015 23:46:29 -0800
Subject: [PATCH 147/166] allow pred to stdout

---
 src/xgboost_main.cpp | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index 9440c791a9d8..db37cbd1db57 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -32,7 +32,7 @@ class BoostLearnTask {
       }
     }
     // do not save anything when save to stdout
-    if (model_out == "stdout") {
+    if (model_out == "stdout" || name_pred == "stdout") {
       this->SetParam("silent", "1");
       save_period = 0;
     }
@@ -235,12 +235,17 @@ class BoostLearnTask {
     std::vector<float> preds;
     if (!silent) printf("start prediction...\n");
     learner.Predict(*data, pred_margin != 0, &preds, ntree_limit);
-    if (!silent) printf("writing prediction to %s\n", name_pred.c_str());
-    FILE *fo = utils::FopenCheck(name_pred.c_str(), "w");
-    for (size_t i = 0; i < preds.size(); i++) {
-      fprintf(fo, "%f\n", preds[i]);
+    if (!silent) printf("writing prediction to %s\n", name_pred.c_str());    
+    FILE *fo;
+    if (name_pred != "stdout") {
+      fo = utils::FopenCheck(name_pred.c_str(), "w");
+    } else {
+      fo = stdout;
     }
-    fclose(fo);
+    for (size_t i = 0; i < preds.size(); ++i) {
+      fprintf(fo, "%g\n", preds[i]);
+    }
+    if (fo != stdout) fclose(fo);
   }
  private:
   /*! \brief whether silent */

From 525c1594e5f7130d0baa34cce7b8347cca19f8f4 Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sun, 11 Jan 2015 16:06:19 +0800
Subject: [PATCH 148/166] revise the script

---
 multi-node/hadoop/run_hadoop_mushroom.sh | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/multi-node/hadoop/run_hadoop_mushroom.sh b/multi-node/hadoop/run_hadoop_mushroom.sh
index 2f095ff2554a..1e7c9a1d0012 100755
--- a/multi-node/hadoop/run_hadoop_mushroom.sh
+++ b/multi-node/hadoop/run_hadoop_mushroom.sh
@@ -11,19 +11,15 @@ hadoop fs -mkdir $2/data
 hadoop fs -put ../../demo/data/agaricus.txt.train $2/data
 
 # training and output the final model file
-../../rabit/tracker/rabit_hadoop.py -n $1 -i $2/data/agaricus.txt.train \
-    -o $2/model -f ../../demo/data/agaricus.txt.test \
-    ../../xgboost mushroom.hadoop.conf dsplit=row 
+../../rabit/tracker/rabit_hadoop.py -n $1 -i $2/data/agaricus.txt.train -o $2/mushroom.final.model ../../xgboost mushroom.hadoop.conf
 
 # get the final model file
-hadoop fs -get $2/model/part-00000 ./final.model
+hadoop fs -get $2/mushroom.final.model/part-00000 ./mushroom.final.model
 
-# output prediction task=pred 
-../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
-    test:data=../../demo/data/agaricus.txt.test
+# output prediction task=pred of test:data
+../../xgboost mushroom.hadoop.conf task=pred model_in=mushroom.final.model test:data=../../demo/data/agaricus.txt.test
 # print the boosters of final.model in dump.raw.txt
-../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+../../xgboost mushroom.hadoop.conf task=dump model_in=mushroom.final.model name_dump=dump.raw.txt
 # use the feature map in printing for better visualization
-../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
-    fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
+../../xgboost mushroom.hadoop.conf task=dump model_in=mushroom.final.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
 cat dump.nice.txt

From ef2518364c62a15362628d43aee4566f06267336 Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sun, 11 Jan 2015 16:07:00 +0800
Subject: [PATCH 149/166] change to minimal setting

---
 multi-node/hadoop/mushroom.hadoop.conf | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/multi-node/hadoop/mushroom.hadoop.conf b/multi-node/hadoop/mushroom.hadoop.conf
index 305b82dd3ac4..15e05f2da945 100644
--- a/multi-node/hadoop/mushroom.hadoop.conf
+++ b/multi-node/hadoop/mushroom.hadoop.conf
@@ -19,12 +19,16 @@ max_depth = 3
 num_round = 2
 # 0 means do not save any model except the final round model
 save_period = 0 
+# evaluate on training data as well each round
+# eval_train = 1
+# The path of validation data, used to monitor training process, here [test] sets name of the validation set
+# eval[test] = "agaricus.txt.test"
+
+# Please do not modify the following parameters
 # The path of training data
 data = stdin
 # The path of model file
 model_out = stdout 
+# split pattern of xgboost
+dsplit = row
 
-# The path of validation data, used to monitor training process, here [test] sets name of the validation set
-eval[test] = "agaricus.txt.test" 
-# evaluate on training data as well each round
-eval_train = 1

From fdbca6013d7ae2d8417a2c95f149f4213dfcda07 Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sun, 11 Jan 2015 17:57:41 +0800
Subject: [PATCH 150/166] modify

---
 multi-node/hadoop/README.md | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index adfacdb8b9a9..aecee38e03f8 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -1,15 +1,41 @@
 Distributed XGBoost: Hadoop Version
 ====
-* Hadoop version: run ```bash run_binary_classification.sh <n_hadoop_workers> <path_in_HDFS>```
+*  The script in this fold shows an example of how to run distributed xgboost on hadoop platform.
+*  It relies on [Rabit Library](https://github.com/tqchen/rabit) and Hadoop Streaming. 
+*  Quick start: run ```bash run_binary_classification.sh <n_hadoop_workers> <path_in_HDFS>```
   - This is the hadoop version of binary classification example in the demo folder.
+  - More info of the binary classification task can be refered to https://github.com/tqchen/xgboost/wiki/Binary-Classification.
 
+Before you run the script
+====
+* Make sure you have set up the hadoop environment. Otherwise you should run single machine examples in the demo fold.
+* Build: run ```bash build.sh``` in the root folder, it will automatically download rabit and build xgboost.
+* Check whether the environment variable $HADOOP_HOME exists (e.g. run ```echo $HADOOP_HOME```). If not, plz set up hadoop-streaming.jar path in rabit_hadoop.py. 
+ 
 How to Use
 ====
-* Check whether environment variable $HADOOP_HOME exists (e.g. run ```echo $HADOOP_HOME```). If not, plz set up hadoop-streaming.jar path in rabit_hadoop.py. 
+* Input data format: LIBSVM format. The example here uses generated data in demo/data folder.
+* Put the training data in HDFS (hadoop distributed file system).
+* Use rabit ```rabit_hadoop.py``` to submit training task to hadoop, and save the final model file.
+* Get the final model file from HDFS, and locally do prediction as well as visualization of model.
+
+XGBoost: Single machine verison VS Hadoop version
+====
+If you have used xgboost (single machine version) before, this section will show you how to run xgboost on hadoop with a slight modification on conf file.
+* IO: instead of reading and writing file locally, hadoop version use "stdin" to read training file and use "stdout" to store the final model file. Therefore, you should change the parameters "data" and "model_out" in conf file to ```data = stdin; model_out = stdout```.
+* File cache: ```rabit_hadoop.py``` also provide several ways to cache necesary files, including binary file (xgboost), conf file, small size of dataset which used for eveluation during the training process, and so on.
+  - Any file used in config file, excluding stdin, should be cached in the script. ```rabit_hadoop.py``` will automatically cache files in the command line. For example, ```rabit_hadoop.py -n 3 -i $hdfsPath/agaricus.txt.train -o $hdfsPath/mushroom.final.model $localPath/xgboost mushroom.hadoop.conf``` will cache "xgboost" and "mushroom.hadoop.conf".
+  - You could also use "-f" to manually cache one or more files, like ```-f file1 -f file2``` or ```-f file1#file2```.
+* Test locally
+* 
+* 
+
+Usage of rabit_hadoop.py
+====
 
 Notes
 ====
 * The code has been tested on MapReduce 1 (MRv1), it should be ok to run on MapReduce 2 (MRv2, YARN).
 * The code is multi-threaded, so you want to run one xgboost per node/worker, which means the parameter <n_workers> should be less than the number of slaves/workers. 
-* The hadoop version now can only save the final model and evaluate test data locally after the training process.
+* The hadoop version now can only save the final model.
 

From df3f87c182cc12ccc9ac1f9cafbe01ea7ebf0ac4 Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sun, 11 Jan 2015 18:20:16 +0800
Subject: [PATCH 151/166] add more details

---
 multi-node/hadoop/README.md | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index aecee38e03f8..7ff7c5da7e41 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -22,20 +22,21 @@ How to Use
 XGBoost: Single machine verison VS Hadoop version
 ====
 If you have used xgboost (single machine version) before, this section will show you how to run xgboost on hadoop with a slight modification on conf file.
+* Hadoop version needs to set up how many slave nodes/machines/workers you would like to use at first. 
 * IO: instead of reading and writing file locally, hadoop version use "stdin" to read training file and use "stdout" to store the final model file. Therefore, you should change the parameters "data" and "model_out" in conf file to ```data = stdin; model_out = stdout```.
 * File cache: ```rabit_hadoop.py``` also provide several ways to cache necesary files, including binary file (xgboost), conf file, small size of dataset which used for eveluation during the training process, and so on.
   - Any file used in config file, excluding stdin, should be cached in the script. ```rabit_hadoop.py``` will automatically cache files in the command line. For example, ```rabit_hadoop.py -n 3 -i $hdfsPath/agaricus.txt.train -o $hdfsPath/mushroom.final.model $localPath/xgboost mushroom.hadoop.conf``` will cache "xgboost" and "mushroom.hadoop.conf".
-  - You could also use "-f" to manually cache one or more files, like ```-f file1 -f file2``` or ```-f file1#file2```.
-* Test locally
-* 
-* 
-
-Usage of rabit_hadoop.py
-====
+  - You could also use "-f" to manually cache one or more files, like ```-f file1 -f file2``` or ```-f file1#file2``` (use "#" to spilt file names).
+  - The local path of cached files in command is "./".
+  - Since the cached files will be packaged and delivered to hadoop slave nodes, the cached file should not be large. For instance, trying to cache files of GB size may reduce the performance.
+* Hadoop version also support evaluting each training round. You just need to modify parameters "eval_train" and "eval[test]" in conf file and cache the evaluation file.
+* Hadoop version now can only save the final model.
+* Predict locally. Althought the hadoop version supports training process, you should do prediction locally, just the same as single machine version.
+* The hadoop version now can only save the final model. 
+* More details of hadoop version can be referred to the usage of ```rabit_hadoop.py```.  
 
 Notes
 ====
 * The code has been tested on MapReduce 1 (MRv1), it should be ok to run on MapReduce 2 (MRv2, YARN).
 * The code is multi-threaded, so you want to run one xgboost per node/worker, which means the parameter <n_workers> should be less than the number of slaves/workers. 
-* The hadoop version now can only save the final model.
 

From 0111a14aef2ecd2bbc98ec8e0b4111b01c8b52d6 Mon Sep 17 00:00:00 2001
From: chenshuaihua <albertcool@126.com>
Date: Sun, 11 Jan 2015 23:57:52 +0800
Subject: [PATCH 152/166] yarn script

---
 multi-node/hadoop/run_yarn_mushroom.sh | 29 ++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 multi-node/hadoop/run_yarn_mushroom.sh

diff --git a/multi-node/hadoop/run_yarn_mushroom.sh b/multi-node/hadoop/run_yarn_mushroom.sh
new file mode 100644
index 000000000000..07ac291d1624
--- /dev/null
+++ b/multi-node/hadoop/run_yarn_mushroom.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+if [ "$#" -lt 3 ];
+then
+	echo "Usage: <nworkers> <nthreads> <path_in_HDFS>"
+	exit -1
+fi
+
+# put the local training file to HDFS
+hadoop fs -mkdir $3/data
+hadoop fs -put ../../demo/data/agaricus.txt.train $3/data
+
+
+python ../../rabit/tracker/rabit_yarn.py  -nw $1 -nt $2 -f ../../demo/data/agaricus.txt.test \
+	-i $3/data/agaricus.txt.train -o $3/model  ../../xgboost mushroom.hadoop.conf  nthread=$2 dsplit=row
+
+
+
+# get the final model file
+hadoop fs -get $3/model/part-00000 ./final.model
+
+# output prediction task=pred 
+../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
+    test:data=../../demo/data/agaricus.txt.test
+# print the boosters of final.model in dump.raw.txt
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+# use the feature map in printing for better visualization
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
+fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt

From 62a108a7c2906ca6aebe3c3a41f359781edfbbd0 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 11 Jan 2015 21:02:38 -0800
Subject: [PATCH 153/166] chg of hadoop script

---
 multi-node/hadoop/README.md                   | 10 +++----
 multi-node/hadoop/run_hadoop_mushroom.sh      | 29 -------------------
 .../{run_yarn_mushroom.sh => run_mushroom.sh} | 12 ++------
 3 files changed, 8 insertions(+), 43 deletions(-)
 delete mode 100755 multi-node/hadoop/run_hadoop_mushroom.sh
 rename multi-node/hadoop/{run_yarn_mushroom.sh => run_mushroom.sh} (68%)
 mode change 100644 => 100755

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index adfacdb8b9a9..a3411fee4467 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -1,6 +1,6 @@
 Distributed XGBoost: Hadoop Version
 ====
-* Hadoop version: run ```bash run_binary_classification.sh <n_hadoop_workers> <path_in_HDFS>```
+* Hadoop version: run ```bash run_binary_classification.sh <n_hadoop_workers> <n_thread_per_worker> <path_in_HDFS>```
   - This is the hadoop version of binary classification example in the demo folder.
 
 How to Use
@@ -9,7 +9,7 @@ How to Use
 
 Notes
 ====
-* The code has been tested on MapReduce 1 (MRv1), it should be ok to run on MapReduce 2 (MRv2, YARN).
-* The code is multi-threaded, so you want to run one xgboost per node/worker, which means the parameter <n_workers> should be less than the number of slaves/workers. 
-* The hadoop version now can only save the final model and evaluate test data locally after the training process.
-
+* The code has been tested on MapReduce 1 (MRv1) and YARN, it recommended run on MapReduce 2 (MRv2, YARN).
+* The code is multi-threaded, so you want to run one xgboost per node/worker, which means you want to set <n_thread_per_worker> to be number of cores you have on each machine.
+  - You will need YARN to set specify number of cores of each worker
+* The hadoop version save the final model into HDFS
diff --git a/multi-node/hadoop/run_hadoop_mushroom.sh b/multi-node/hadoop/run_hadoop_mushroom.sh
deleted file mode 100755
index 2f095ff2554a..000000000000
--- a/multi-node/hadoop/run_hadoop_mushroom.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-
-if [ "$#" -lt 2 ];
-then
-    echo "Usage: <num_of_slave_nodes> <path_in_HDFS>"
-    exit -1
-fi
-
-# put the local training file to HDFS
-hadoop fs -mkdir $2/data
-hadoop fs -put ../../demo/data/agaricus.txt.train $2/data
-
-# training and output the final model file
-../../rabit/tracker/rabit_hadoop.py -n $1 -i $2/data/agaricus.txt.train \
-    -o $2/model -f ../../demo/data/agaricus.txt.test \
-    ../../xgboost mushroom.hadoop.conf dsplit=row 
-
-# get the final model file
-hadoop fs -get $2/model/part-00000 ./final.model
-
-# output prediction task=pred 
-../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
-    test:data=../../demo/data/agaricus.txt.test
-# print the boosters of final.model in dump.raw.txt
-../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
-# use the feature map in printing for better visualization
-../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
-    fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
-cat dump.nice.txt
diff --git a/multi-node/hadoop/run_yarn_mushroom.sh b/multi-node/hadoop/run_mushroom.sh
old mode 100644
new mode 100755
similarity index 68%
rename from multi-node/hadoop/run_yarn_mushroom.sh
rename to multi-node/hadoop/run_mushroom.sh
index 07ac291d1624..1e647047f7b0
--- a/multi-node/hadoop/run_yarn_mushroom.sh
+++ b/multi-node/hadoop/run_mushroom.sh
@@ -9,21 +9,15 @@ fi
 hadoop fs -mkdir $3/data
 hadoop fs -put ../../demo/data/agaricus.txt.train $3/data
 
-
-python ../../rabit/tracker/rabit_yarn.py  -nw $1 -nt $2 -f ../../demo/data/agaricus.txt.test \
-	-i $3/data/agaricus.txt.train -o $3/model  ../../xgboost mushroom.hadoop.conf  nthread=$2 dsplit=row
-
-
+../../rabit/tracker/rabit_hadoop.py  -n $1 -nt $2 -i $3/data/agaricus.txt.train -o $3/model ../../xgboost mushroom.hadoop.conf  nthread=$2
 
 # get the final model file
 hadoop fs -get $3/model/part-00000 ./final.model
 
 # output prediction task=pred 
-../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
-    test:data=../../demo/data/agaricus.txt.test
+../../xgboost mushroom.hadoop.conf task=pred model_in=final.model test:data=../../demo/data/agaricus.txt.test
 # print the boosters of final.model in dump.raw.txt
 ../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
 # use the feature map in printing for better visualization
-../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
-fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
 cat dump.nice.txt

From d57cb4f17b1770c7ed29524012bb1762dc7b6323 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 12 Jan 2015 09:02:53 -0800
Subject: [PATCH 154/166] Update mushroom.hadoop.conf

---
 multi-node/hadoop/mushroom.hadoop.conf | 2 --
 1 file changed, 2 deletions(-)

diff --git a/multi-node/hadoop/mushroom.hadoop.conf b/multi-node/hadoop/mushroom.hadoop.conf
index 305b82dd3ac4..a40c950a7c47 100644
--- a/multi-node/hadoop/mushroom.hadoop.conf
+++ b/multi-node/hadoop/mushroom.hadoop.conf
@@ -24,7 +24,5 @@ data = stdin
 # The path of model file
 model_out = stdout 
 
-# The path of validation data, used to monitor training process, here [test] sets name of the validation set
-eval[test] = "agaricus.txt.test" 
 # evaluate on training data as well each round
 eval_train = 1

From 5e0e8a5ff7da9320656299449df74ec535e78fce Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 12 Jan 2015 11:47:46 -0800
Subject: [PATCH 155/166] changes

---
 multi-node/hadoop/README.md | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index ce514be2e856..f2afc6a1d5e5 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -2,16 +2,17 @@ Distributed XGBoost: Hadoop Version
 ====
 *  The script in this fold shows an example of how to run distributed xgboost on hadoop platform.
 *  It relies on [Rabit Library](https://github.com/tqchen/rabit) and Hadoop Streaming. 
-*  Quick start: run ```bash run_binary_classification.sh <n_hadoop_workers> <path_in_HDFS>```
+*  Quick start: run ```bash run_binary_classification.sh <n_hadoop_workers> <n_thread_per_worker> <path_in_HDFS>```
   - This is the hadoop version of binary classification example in the demo folder.
-  - More info of the binary classification task can be refered to https://github.com/tqchen/xgboost/wiki/Binary-Classification.
+  - More info of the usage of xgboost can be refered to [wiki page](https://github.com/tqchen/xgboost/wiki)
 
 Before you run the script
 ====
-* Make sure you have set up the hadoop environment. Otherwise you should run single machine examples in the demo fold.
+* Make sure you have set up the hadoop environment.
+* If you want to only use single machine multi-threading, tryout single machine examples in the [demo folder](../../demo).
 * Build: run ```bash build.sh``` in the root folder, it will automatically download rabit and build xgboost.
-* Check whether the environment variable $HADOOP_HOME exists (e.g. run ```echo $HADOOP_HOME```). If not, plz set up hadoop-streaming.jar path in rabit_hadoop.py. 
- 
+* Check whether the environment variable $HADOOP_HOME exists (e.g. run ```echo $HADOOP_HOME```). If not, please set up hadoop-streaming.jar path in rabit_hadoop.py.
+
 How to Use
 ====
 * Input data format: LIBSVM format. The example here uses generated data in demo/data folder.
@@ -19,25 +20,22 @@ How to Use
 * Use rabit ```rabit_hadoop.py``` to submit training task to hadoop, and save the final model file.
 * Get the final model file from HDFS, and locally do prediction as well as visualization of model.
 
-XGBoost: Single machine verison VS Hadoop version
+XGBoost: Single machine verison vs Hadoop version
 ====
 If you have used xgboost (single machine version) before, this section will show you how to run xgboost on hadoop with a slight modification on conf file.
-* Hadoop version needs to set up how many slave nodes/machines/workers you would like to use at first. 
-* IO: instead of reading and writing file locally, hadoop version use "stdin" to read training file and use "stdout" to store the final model file. Therefore, you should change the parameters "data" and "model_out" in conf file to ```data = stdin; model_out = stdout```.
+* Hadoop version needs to set up how many slave nodes/machines/workers you would like to use at first.
+* IO: instead of reading and writing files locally, the hadoop version uses "stdin" to read the training file and "stdout" to store the final model file. Therefore, you should change the parameters "data" and "model_out" in the conf file to ```data=stdin``` and ```model_out=stdout```.
 * File cache: ```rabit_hadoop.py``` also provide several ways to cache necesary files, including binary file (xgboost), conf file, small size of dataset which used for eveluation during the training process, and so on.
   - Any file used in config file, excluding stdin, should be cached in the script. ```rabit_hadoop.py``` will automatically cache files in the command line. For example, ```rabit_hadoop.py -n 3 -i $hdfsPath/agaricus.txt.train -o $hdfsPath/mushroom.final.model $localPath/xgboost mushroom.hadoop.conf``` will cache "xgboost" and "mushroom.hadoop.conf".
   - You could also use "-f" to manually cache one or more files, like ```-f file1 -f file2``` or ```-f file1#file2``` (use "#" to spilt file names).
   - The local path of cached files in command is "./".
   - Since the cached files will be packaged and delivered to hadoop slave nodes, the cached file should not be large. For instance, trying to cache files of GB size may reduce the performance.
-* Hadoop version also support evaluting each training round. You just need to modify parameters "eval_train" and "eval[test]" in conf file and cache the evaluation file.
-* Hadoop version now can only save the final model.
-* Predict locally. Althought the hadoop version supports training process, you should do prediction locally, just the same as single machine version.
-* The hadoop version now can only save the final model. 
-* More details of hadoop version can be referred to the usage of ```rabit_hadoop.py```.  
+* Hadoop version also support evaluting each training round. You just need to modify parameters "eval_train".
+* Hadoop version now only saves the final model.
+* More details of submission can be referred to the usage of ```rabit_hadoop.py```.  
 
 Notes
 ====       
 * The code has been tested on MapReduce 1 (MRv1) and YARN, it recommended run on MapReduce 2 (MRv2, YARN).
 * The code is multi-threaded, so you want to run one xgboost per node/worker, which means you want to set <n_thread_per_worker> to be number of cores you have on each machine.
   - You will need YARN to set specify number of cores of each worker
-

From 6b7f20c002a0ed8b5b826d6c9f8f481e6208a508 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 12 Jan 2015 11:49:42 -0800
Subject: [PATCH 156/166] chgs

---
 multi-node/hadoop/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index f2afc6a1d5e5..13e68c4f02b3 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -20,7 +20,7 @@ How to Use
 * Use rabit ```rabit_hadoop.py``` to submit training task to hadoop, and save the final model file.
 * Get the final model file from HDFS, and locally do prediction as well as visualization of model.
 
-XGBoost: Single machine verison vs Hadoop version
+Single machine vs Hadoop version
 ====
 If you have used xgboost (single machine version) before, this section will show you how to run xgboost on hadoop with a slight modification on conf file.
 * Hadoop version needs to set up how many slave nodes/machines/workers you would like to use at first.
@@ -31,8 +31,8 @@ If you have used xgboost (single machine version) before, this section will show
   - The local path of cached files in command is "./".
   - Since the cached files will be packaged and delivered to hadoop slave nodes, the cached file should not be large. For instance, trying to cache files of GB size may reduce the performance.
 * Hadoop version also support evaluting each training round. You just need to modify parameters "eval_train".
-* Hadoop version now only saves the final model.
-* More details of submission can be referred to the usage of ```rabit_hadoop.py```.  
+* For more details on job submission, refer to the usage of ```rabit_hadoop.py```.
+* The model saved by hadoop version is compatible with single machine version.
 
 Notes
 ====       

From 2a9a864b11ec6bd350f7e7b667d26c39a92777d6 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 12 Jan 2015 11:50:18 -0800
Subject: [PATCH 157/166] ok

---
 multi-node/hadoop/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index 13e68c4f02b3..d98068fe989f 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -9,7 +9,7 @@ Distributed XGBoost: Hadoop Version
 Before you run the script
 ====
 * Make sure you have set up the hadoop environment.
-* If you want to only use single machine multi-threading, tryout single machine examples in the [demo folder](../../demo).
+* If you want to only use single machine multi-threading, try single machine examples in the [demo folder](../../demo).
 * Build: run ```bash build.sh``` in the root folder, it will automatically download rabit and build xgboost.
 * Check whether the environment variable $HADOOP_HOME exists (e.g. run ```echo $HADOOP_HOME```). If not, please set up hadoop-streaming.jar path in rabit_hadoop.py.
 

From 9346c328cb721e94e1fa399765a47740a8fe96b8 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 12 Jan 2015 11:53:40 -0800
Subject: [PATCH 158/166] chg

---
 multi-node/hadoop/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index d98068fe989f..e03f3a592b7b 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -2,7 +2,7 @@ Distributed XGBoost: Hadoop Version
 ====
 *  The script in this fold shows an example of how to run distributed xgboost on hadoop platform.
 *  It relies on [Rabit Library](https://github.com/tqchen/rabit) and Hadoop Streaming. 
-*  Quick start: run ```bash run_binary_classification.sh <n_hadoop_workers> <n_thread_per_worker> <path_in_HDFS>```
+*  Quick start: run ```bash run_mushroom.sh <n_hadoop_workers> <n_thread_per_worker> <path_in_HDFS>```
   - This is the hadoop version of binary classification example in the demo folder.
   - More info of the usage of xgboost can be refered to [wiki page](https://github.com/tqchen/xgboost/wiki)
 

From a53f0cd9bf9ea5ed78a8ec28ec19f5ca5b8702ea Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 12 Jan 2015 11:55:42 -0800
Subject: [PATCH 159/166] doc chg

---
 multi-node/hadoop/README.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index e03f3a592b7b..e513c59cfb3b 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -35,7 +35,9 @@ If you have used xgboost (single machine version) before, this section will show
 * The model saved by hadoop version is compatible with single machine version.
 
 Notes
-====       
-* The code has been tested on MapReduce 1 (MRv1) and YARN, it recommended run on MapReduce 2 (MRv2, YARN).
-* The code is multi-threaded, so you want to run one xgboost per node/worker, which means you want to set <n_thread_per_worker> to be number of cores you have on each machine.
+====
+* The code has been tested on MapReduce 1 (MRv1) and YARN.
+  - We recommend running it on MapReduce 2 (MRv2, YARN) so that multi-threading can be enabled.
+* The code is optimized with multi-threading, so you will want to run one xgboost per node/worker for best performance.
+  - You will want to set <n_thread_per_worker> to the number of cores you have on each machine.
   - You will need YARN to set specify number of cores of each worker

From ede1222b02824c2306b6b4ee3797cfc00c3fca21 Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Wed, 14 Jan 2015 22:15:31 +0800
Subject: [PATCH 160/166] modify doc

---
 multi-node/hadoop/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index 7ff7c5da7e41..a403af47418b 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -1,7 +1,7 @@
 Distributed XGBoost: Hadoop Version
 ====
 *  The script in this fold shows an example of how to run distributed xgboost on hadoop platform.
-*  It relies on [Rabit Library](https://github.com/tqchen/rabit) and Hadoop Streaming. 
+*  It relies on [Rabit Library](https://github.com/tqchen/rabit) (Reliable Allreduce and Broadcast Interface) and Hadoop Streaming. Rabit provides an interface to aggregate gradient values and split statistics, which allows xgboost to run reliably on hadoop. You do not need to care about how the model is updated in each iteration; just use the script ```rabit_hadoop.py```. For those who want to know exactly how it works, please refer to the main page of [Rabit](https://github.com/tqchen/rabit).
 *  Quick start: run ```bash run_binary_classification.sh <n_hadoop_workers> <path_in_HDFS>```
   - This is the hadoop version of binary classification example in the demo folder.
   - More info of the binary classification task can be refered to https://github.com/tqchen/xgboost/wiki/Binary-Classification.
@@ -37,6 +37,6 @@ If you have used xgboost (single machine version) before, this section will show
 
 Notes
 ====
-* The code has been tested on MapReduce 1 (MRv1), it should be ok to run on MapReduce 2 (MRv2, YARN).
+* The code has been tested on MapReduce 1 (MRv1); it should also work on MapReduce 2 (MRv2, YARN), which is the recommended platform.
 * The code is multi-threaded, so you want to run one xgboost per node/worker, which means the parameter <n_workers> should be less than the number of slaves/workers. 
 

From b762231b0280feecedff132ff9f96c2ae4f340ed Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 15 Jan 2015 21:32:31 -0800
Subject: [PATCH 161/166] change makefile to lazy checkpt, fix col splt code

---
 Makefile                                      |  8 ++--
 .../col-split/mushroom-col-rabit-mock.sh      |  2 +-
 multi-node/col-split/mushroom-col-rabit.sh    |  8 ++--
 .../row-split/machine-row-rabit-mock.sh       |  2 +-
 src/gbm/gbm.h                                 |  9 +++-
 src/gbm/gbtree-inl.hpp                        | 48 +++++++++++++------
 src/learner/learner-inl.hpp                   |  6 +++
 src/xgboost_main.cpp                          | 34 +++++++++----
 8 files changed, 84 insertions(+), 33 deletions(-)

diff --git a/Makefile b/Makefile
index 7c89d24da4f3..9716f8149146 100644
--- a/Makefile
+++ b/Makefile
@@ -18,9 +18,9 @@ endif
 
 # specify tensor path
 BIN = xgboost 
-MOCKBIN = xgboost-mock
+MOCKBIN = xgboost.mock
 OBJ = updater.o gbm.o io.o main.o 
-MPIBIN = xgboost-mpi
+MPIBIN = xgboost.mpi
 SLIB = wrapper/libxgboostwrapper.so 
 
 .PHONY: clean all mpi python Rpack librabit librabit_mpi
@@ -42,8 +42,8 @@ updater.o: src/tree/updater.cpp  src/tree/*.hpp src/*.h src/tree/*.h src/utils/*
 gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h 
 io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
 main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h 
-xgboost-mpi:  updater.o gbm.o io.o main.o librabit_mpi
-xgboost-mock: updater.o gbm.o io.o main.o librabit
+xgboost.mpi:  updater.o gbm.o io.o main.o librabit_mpi
+xgboost.mock: updater.o gbm.o io.o main.o librabit
 xgboost:  updater.o gbm.o io.o main.o  librabit
 wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h  updater.o gbm.o io.o librabit
 
diff --git a/multi-node/col-split/mushroom-col-rabit-mock.sh b/multi-node/col-split/mushroom-col-rabit-mock.sh
index 269967419db8..65e62309af38 100755
--- a/multi-node/col-split/mushroom-col-rabit-mock.sh
+++ b/multi-node/col-split/mushroom-col-rabit-mock.sh
@@ -16,7 +16,7 @@ k=$1
 python splitsvm.py ../../demo/data/agaricus.txt.train train $k
 
 # run xgboost mpi
-../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost-mock mushroom-col.conf dsplit=col mock=0,1,0,0 mock=1,1,0,0
+../../rabit/tracker/rabit_demo.py -n $k  ../../xgboost.mock mushroom-col.conf dsplit=col mock=0,2,0,0 mock=1,2,0,0 mock=2,2,8,0 mock=2,3,0,0
 
 # the model can be directly loaded by single machine xgboost solver, as usuall
 #../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
diff --git a/multi-node/col-split/mushroom-col-rabit.sh b/multi-node/col-split/mushroom-col-rabit.sh
index b9595e5b7612..f958305aa50d 100755
--- a/multi-node/col-split/mushroom-col-rabit.sh
+++ b/multi-node/col-split/mushroom-col-rabit.sh
@@ -16,13 +16,13 @@ k=$1
 python splitsvm.py ../../demo/data/agaricus.txt.train train $k
 
 # run xgboost mpi
-../../rabit/tracker/rabit_mpi.py $k local ../../xgboost mushroom-col.conf dsplit=col
+../../rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col
 
 # the model can be directly loaded by single machine xgboost solver, as usuall
-../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
+../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt 
 
 # run for one round, and continue training
-../../rabit/tracker/rabit_mpi.py $k local  ../../xgboost mushroom-col.conf dsplit=col num_round=1
-../../rabit/tracker/rabit_mpi.py $k local  ../../xgboost mushroom-col.conf  mushroom-col.conf dsplit=col model_in=0001.model
+../../rabit/tracker/rabit_demo.py -n $k  ../../xgboost mushroom-col.conf dsplit=col num_round=1
+../../rabit/tracker/rabit_demo.py -n $k  ../../xgboost mushroom-col.conf  mushroom-col.conf dsplit=col model_in=0001.model
 
 cat dump.nice.$k.txt
diff --git a/multi-node/row-split/machine-row-rabit-mock.sh b/multi-node/row-split/machine-row-rabit-mock.sh
index b08e7d4e69fc..b8ef10b2d219 100755
--- a/multi-node/row-split/machine-row-rabit-mock.sh
+++ b/multi-node/row-split/machine-row-rabit-mock.sh
@@ -17,4 +17,4 @@ cd -
 python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
 
 # run xgboost mpi
-../../rabit/tracker/rabit_mpi.py -n $k ../../rabit/test/keepalive.sh ../../xgboost-mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0 
+../../rabit/tracker/rabit_demo.py -n $k  ../../xgboost.mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0  mock=0,0,3,0 mock=2,2,3,0
diff --git a/src/gbm/gbm.h b/src/gbm/gbm.h
index 8799a7af0465..57b8c05732e1 100644
--- a/src/gbm/gbm.h
+++ b/src/gbm/gbm.h
@@ -46,6 +46,14 @@ class IGradBooster {
    * and recalculate from scratch
    */
   virtual void ResetPredBuffer(size_t num_pbuffer) {}
+  /*!
+   * \brief whether the model allows lazy checkpointing
+   * \return true if the model is only updated in DoBoost
+   *   after all Allreduce calls; this default returns false
+   */
+  virtual bool AllowLazyCheckPoint(void) const {
+    return false;
+  }
   /*!
    * \brief peform update to the model(boosting)
    * \param p_fmat feature matrix that provide access to features
@@ -76,7 +84,6 @@ class IGradBooster {
                        const BoosterInfo &info,
                        std::vector<float> *out_preds,
                        unsigned ntree_limit = 0) = 0;
-  
   /*!
    * \brief predict the leaf index of each tree, the output will be nsample * ntree vector
    *        this is only valid in gbtree predictor
diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
index e8f1b1933294..c08d15dd72c7 100644
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -90,13 +90,17 @@ class GBTree : public IGradBooster {
     pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
     pred_counter.resize(mparam.PredBufferSize(), 0);
   }
+  virtual bool AllowLazyCheckPoint(void) const {
+    return !(tparam.distcol_mode != 0  && mparam.num_output_group != 1);
+  }
   virtual void DoBoost(IFMatrix *p_fmat,
                        int64_t buffer_offset,
                        const BoosterInfo &info,
                        std::vector<bst_gpair> *in_gpair) {
     const std::vector<bst_gpair> &gpair = *in_gpair;
-    if (mparam.num_output_group == 1) {
-      this->BoostNewTrees(gpair, p_fmat, buffer_offset, info, 0);
+    std::vector<std::vector<tree::RegTree*> > new_trees;
+    if (mparam.num_output_group == 1) {      
+      new_trees.push_back(BoostNewTrees(gpair, p_fmat, buffer_offset, info, 0));
     } else {
       const int ngroup = mparam.num_output_group;
       utils::Check(gpair.size() % ngroup == 0,
@@ -108,9 +112,12 @@ class GBTree : public IGradBooster {
         for (bst_omp_uint i = 0; i < nsize; ++i) {
           tmp[i] = gpair[i * ngroup + gid];
         }
-        this->BoostNewTrees(tmp, p_fmat, buffer_offset, info, gid);
+        new_trees.push_back(BoostNewTrees(tmp, p_fmat, buffer_offset, info, gid));
       }
     }
+    for (int gid = 0; gid < mparam.num_output_group; ++gid) {
+      this->CommitModel(new_trees[gid], gid);
+    }
   }
   virtual void Predict(IFMatrix *p_fmat,
                        int64_t buffer_offset,
@@ -208,14 +215,15 @@ class GBTree : public IGradBooster {
     tparam.updater_initialized = 1;
   }
   // do group specific group
-  inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
-                            IFMatrix *p_fmat,
-                            int64_t buffer_offset,
-                            const BoosterInfo &info,
-                            int bst_group) {
+  inline std::vector<tree::RegTree*>
+  BoostNewTrees(const std::vector<bst_gpair> &gpair,
+                IFMatrix *p_fmat,
+                int64_t buffer_offset,
+                const BoosterInfo &info,
+                int bst_group) {
+    std::vector<tree::RegTree *> new_trees;
     this->InitUpdater();
     // create the trees
-    std::vector<tree::RegTree *> new_trees;
     for (int i = 0; i < tparam.num_parallel_tree; ++i) {
       new_trees.push_back(new tree::RegTree());
       for (size_t j = 0; j < cfg.size(); ++j) {
@@ -226,9 +234,12 @@ class GBTree : public IGradBooster {
     // update the trees
     for (size_t i = 0; i < updaters.size(); ++i) {
       updaters[i]->Update(gpair, p_fmat, info, new_trees);
-    }
+    }    
     // optimization, update buffer, if possible
-    if (buffer_offset >= 0 &&
+    // this is only under distributed column mode
+    // for safety check of lazy checkpoint
+    if (
+        buffer_offset >= 0 &&
         new_trees.size() == 1 && updaters.size() > 0 &&
         updaters.back()->GetLeafPosition() != NULL) {
       utils::Check(info.num_row == p_fmat->buffered_rowset().size(),
@@ -238,12 +249,15 @@ class GBTree : public IGradBooster {
                                    *new_trees[0],
                                    updaters.back()->GetLeafPosition());
     }
-    // push back to model
+    return new_trees;
+  }
+  // commit new trees all at once
+  inline void CommitModel(const std::vector<tree::RegTree*> &new_trees, int bst_group) {
     for (size_t i = 0; i < new_trees.size(); ++i) {
       trees.push_back(new_trees[i]);
       tree_info.push_back(bst_group);
     }
-    mparam.num_trees += tparam.num_parallel_tree;
+    mparam.num_trees += static_cast<int>(new_trees.size());
   }
   // update buffer by pre-cached position
   inline void UpdateBufferByPosition(IFMatrix *p_fmat,
@@ -264,7 +278,7 @@ class GBTree : public IGradBooster {
       for (int i = 0; i < mparam.size_leaf_vector; ++i) {
         pred_buffer[bid + i + 1] += new_tree.leafvec(tid)[i];
       }
-      pred_counter[bid] += 1;
+      pred_counter[bid] += tparam.num_parallel_tree;
     }
   }
   // make a prediction for a single instance
@@ -362,6 +376,8 @@ class GBTree : public IGradBooster {
     int num_parallel_tree;
     /*! \brief whether updater is already initialized */
     int updater_initialized;
+    /*! \brief distributed column mode */
+    int distcol_mode;
     /*! \brief tree updater sequence */
     std::string updater_seq;
     // construction
@@ -370,6 +386,7 @@ class GBTree : public IGradBooster {
       updater_seq = "grow_colmaker,prune";
       num_parallel_tree = 1;
       updater_initialized = 0;
+      distcol_mode = 0;
     }
     inline void SetParam(const char *name, const char *val){
       using namespace std;
@@ -378,6 +395,9 @@ class GBTree : public IGradBooster {
         updater_seq = val;
         updater_initialized = 0;
       }
+      if (!strcmp(name, "dsplit") && !strcmp(val, "col")) {
+        distcol_mode = 1;
+      }
       if (!strcmp(name, "nthread")) {
         omp_set_num_threads(nthread = atoi(val));
       }
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index ae0967ce8bd0..5e3622e4dc30 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -270,6 +270,12 @@ class BoostLearner : public rabit::ISerializable {
     obj_->GetGradient(preds_, train.info, iter, &gpair_);
     gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
   }
+  /*!
+   * \brief whether the model allows lazy checkpoint
+   */
+  inline bool AllowLazyCheckPoint(void) const {
+    return gbm_->AllowLazyCheckPoint();
+  }
   /*!
    * \brief evaluate the model for specific iteration
    * \param iter iteration number
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index db37cbd1db57..94e6d6bc181b 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -48,7 +48,7 @@ class BoostLearnTask {
       std::string pname = rabit::GetProcessorName();
       fprintf(stderr, "start %s:%d\n", pname.c_str(), rabit::GetRank());
     }
-    if (rabit::IsDistributed()) {
+    if (rabit::IsDistributed() && data_split == "NONE") {
       this->SetParam("dsplit", "row");
     }
     if (rabit::GetRank() != 0) {
@@ -89,6 +89,7 @@ class BoostLearnTask {
     if (!strcmp("fmap", name)) name_fmap = val;
     if (!strcmp("name_dump", name)) name_dump = val;
     if (!strcmp("name_pred", name)) name_pred = val;
+    if (!strcmp("dsplit", name)) data_split = val;
     if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val);
     if (!strncmp("eval[", name, 5)) {
       char evname[256];
@@ -116,6 +117,7 @@ class BoostLearnTask {
     name_pred = "pred.txt";
     name_dump = "dump.txt";
     model_dir_path = "./";
+    data_split = "NONE";
     load_part = 0;
     data = NULL;
   }
@@ -172,14 +174,24 @@ class BoostLearnTask {
   inline void TaskTrain(void) {
     int version = rabit::LoadCheckPoint(&learner);
     if (version == 0) this->InitLearner();
-
     const time_t start = time(NULL);
     unsigned long elapsed = 0;
     learner.CheckInit(data);
-    for (int i = version; i < num_round; ++i) {
+
+    bool allow_lazy = learner.AllowLazyCheckPoint();
+    for (int i = version / 2; i < num_round; ++i) {
       elapsed = (unsigned long)(time(NULL) - start);
-      if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
-      learner.UpdateOneIter(i, *data); 
+      if (version % 2 == 0) { 
+        if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
+        learner.UpdateOneIter(i, *data);
+        if (allow_lazy) {
+          rabit::LazyCheckPoint(&learner);
+        } else {
+          rabit::CheckPoint(&learner);
+        }
+        version += 1;
+      }
+      utils::Assert(version == rabit::VersionNumber(), "consistent check");
       std::string res = learner.EvalOneIter(i, devalall, eval_data_names);
       if (rabit::IsDistributed()){
         if (rabit::GetRank() == 0) {
@@ -193,9 +205,13 @@ class BoostLearnTask {
       if (save_period != 0 && (i + 1) % save_period == 0) {
         this->SaveModel(i);
       }
-      utils::Assert(rabit::VersionNumber() == i, "incorrect version number");
-      // checkpoint the model
-      rabit::CheckPoint(&learner);
+      if (allow_lazy) {
+        rabit::LazyCheckPoint(&learner);
+      } else {
+        rabit::CheckPoint(&learner);
+      }
+      version += 1;
+      utils::Assert(version == rabit::VersionNumber(), "consistent check");
       elapsed = (unsigned long)(time(NULL) - start);
     }
     // always save final round
@@ -272,6 +288,8 @@ class BoostLearnTask {
   std::string task;
   /*! \brief name of predict file */
   std::string name_pred;
+  /*! \brief data split mode */
+  std::string data_split;
   /*!\brief limit number of trees in prediction */
   int ntree_limit;
   /*!\brief whether to directly output margin value */

From b1f89f29b8241a5e6741c57fd56d6b5fdde265bb Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 15 Jan 2015 21:55:56 -0800
Subject: [PATCH 162/166] cleanup multi-node

---
 multi-node/README.md                        | 56 ++++++++++-----------
 multi-node/col-split/README.md              |  2 -
 multi-node/col-split/mushroom-col-mpi.sh    | 24 ---------
 multi-node/col-split/mushroom-col-python.sh | 22 --------
 multi-node/col-split/mushroom-col.py        | 33 ------------
 multi-node/row-split/README.md              |  4 +-
 multi-node/row-split/machine-row-map.sh     | 20 --------
 multi-node/row-split/machine-row-mpi.sh     | 24 ---------
 multi-node/row-split/map.sh                 |  3 --
 multi-node/row-split/mushroom-row-mpi.sh    | 19 -------
 multi-node/row-split/mushroom-row.conf      | 35 -------------
 wrapper/xgboost.py                          | 18 -------
 wrapper/xgboost_wrapper.cpp                 | 17 -------
 13 files changed, 28 insertions(+), 249 deletions(-)
 delete mode 100755 multi-node/col-split/mushroom-col-mpi.sh
 delete mode 100755 multi-node/col-split/mushroom-col-python.sh
 delete mode 100644 multi-node/col-split/mushroom-col.py
 delete mode 100755 multi-node/row-split/machine-row-map.sh
 delete mode 100755 multi-node/row-split/machine-row-mpi.sh
 delete mode 100644 multi-node/row-split/map.sh
 delete mode 100755 multi-node/row-split/mushroom-row-mpi.sh
 delete mode 100644 multi-node/row-split/mushroom-row.conf

diff --git a/multi-node/README.md b/multi-node/README.md
index 02d6fc82074e..1c388d8bc772 100644
--- a/multi-node/README.md
+++ b/multi-node/README.md
@@ -1,38 +1,36 @@
 Distributed XGBoost
 ======
-This folder contains information about experimental version of distributed xgboost.
+This folder contains information of Distributed XGBoost.
 
-Build
-=====
-* In the root folder, run ```make```, this will give you xgboost, which uses rabit allreduce
-  - this version of xgboost should be fault tolerant eventually
-* Alterniatively, run ```make mpi```, this will give you xgboost-mpi
-  - You will need to have MPI to build xgboost-mpi
+* The distributed version is built on Rabit:[Reliable Allreduce and Broadcast Library](https://github.com/tqchen/rabit)
+  - Rabit is a portable library that provides fault-tolerance for Allreduce calls for distributed machine learning  
+  - This makes xgboost portable and fault-tolerant against node failures
+* You can run Distributed XGBoost on common platforms that rabit can port to,
+  including Hadoop(see [hadoop folder](hadoop)) and MPI
 
-Design Choice
+Usage
 =====
-* XGBoost replies on [Rabit Library](https://github.com/tqchen/rabit)
-* Rabit is an fault tolerant and portable allreduce library that provides Allreduce and Broadcast
-* Since rabit is compatible with MPI, xgboost can be compiled using MPI backend
+* In the root folder, run ```./build.sh```, this will give you xgboost, which uses rabit allreduce
+
+Notes
+====
+* Rabit handles all the fault tolerant and communications efficiently, we only use platform specific command to start programs
+  - The Hadoop version does not rely on Mapreduce to do iterations
+  - You can expect xgboost not to suffer from the drawbacks of iterative MapReduce programs
+* The design choice was made because Allreduce is very natural and efficient for distributed tree building
+  - In the current version of xgboost, the distributed version only adds several lines of Allreduce synchronization code
+* The multi-threading nature of xgboost is inherited in distributed mode
+  - This means xgboost efficiently uses all the threads in one machine, and communicates only between machines
+  - Remember to run one xgboost process per machine and this will give you maximum speedup
+* For more information about rabit and how it works, see the [tutorial](https://github.com/tqchen/rabit/tree/master/guide)
 
-* How is the data distributed?
-  - There are two solvers in distributed xgboost
-  - Column-based solver split data by column, each node work on subset of columns, 
+Solvers
+=====
+There are two solvers in distributed xgboost. For local demos of the two solvers, see [row-split](row-split) and [col-split](col-split)
+  * Column-based solver split data by column, each node work on subset of columns, 
     it uses exactly the same algorithm as single node version.
-  - Row-based solver split data by row, each node work on subset of rows,
+  * Row-based solver split data by row, each node work on subset of rows,
     it uses an approximate histogram count algorithm, and will only examine subset of 
     potential split points as opposed to all split points.
-  - Hadoop version can run on the existing hadoop platform,
-    it use Rabit to submit jobs as map-reduce tasks.
-
-Usage
-====
-* You will need a network filesystem, or copy data to local file system before running the code
-* xgboost can be used together with submission script provided in Rabit on different possible types of job scheduler
-* ***Note*** The distributed version is still multi-threading optimized.
-    You should run one process per node that takes most available CPU,
-    this will reduce the communication overhead and improve the performance.
-   - One way to do that is limit mpi slot in each machine to be 1, or reserve nthread processors for each process.
-* Examples:
-  - [Column-based version](col-split)
-  - [Row-based version](row-split)
+    - This is the mode used by the current hadoop version, since data is usually stored by rows in many industry systems
+    
diff --git a/multi-node/col-split/README.md b/multi-node/col-split/README.md
index 4f0d07b27117..3ea0799fead0 100644
--- a/multi-node/col-split/README.md
+++ b/multi-node/col-split/README.md
@@ -4,8 +4,6 @@ Distributed XGBoost: Column Split Version
   - mushroom-col-rabit.sh starts xgboost job using rabit's allreduce
 * run ```bash mushroom-col-rabit-mock.sh <n-process>```
   - mushroom-col-rabit-mock.sh starts xgboost job using rabit's allreduce, inserts suicide signal at certain point and test recovery
-* run ```bash mushroom-col-mpi.sh <n-mpi-process>```
-  - mushroom-col.sh starts xgboost-mpi job
 
 How to Use
 ====
diff --git a/multi-node/col-split/mushroom-col-mpi.sh b/multi-node/col-split/mushroom-col-mpi.sh
deleted file mode 100755
index 4d7de9892c3f..000000000000
--- a/multi-node/col-split/mushroom-col-mpi.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-if [[ $# -ne 1 ]]
-then
-    echo "Usage: nprocess"
-    exit -1
-fi
-
-rm -rf train.col* *.model
-k=$1
-
-# split the lib svm file into k subfiles
-python splitsvm.py ../../demo/data/agaricus.txt.train train $k
-
-# run xgboost mpi
-mpirun -n $k ../../xgboost-mpi mushroom-col.conf dsplit=col
-
-# the model can be directly loaded by single machine xgboost solver, as usuall
-../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
-
-# run for one round, and continue training
-mpirun -n $k ../../xgboost-mpi mushroom-col.conf dsplit=col num_round=1
-mpirun -n $k ../../xgboost-mpi mushroom-col.conf dsplit=col model_in=0001.model
-
-cat dump.nice.$k.txt
\ No newline at end of file
diff --git a/multi-node/col-split/mushroom-col-python.sh b/multi-node/col-split/mushroom-col-python.sh
deleted file mode 100755
index 8551ee4653a9..000000000000
--- a/multi-node/col-split/mushroom-col-python.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-if [[ $# -ne 1 ]]
-then
-    echo "Usage: nprocess"
-    exit -1
-fi
-
-#
-# This script is same as mushroom-col except that we will be using xgboost python module
-# 
-# xgboost used built in tcp-based allreduce module, and can be run on more enviroment, so long as we know how to start job by modifying ../submit_job_tcp.py
-#
-rm -rf train.col* *.model
-k=$1
-
-# split the lib svm file into k subfiles
-python splitsvm.py ../../demo/data/agaricus.txt.train train $k
-
-# run xgboost mpi
-../../rabit/tracker/rabit_mpi.py $k local python mushroom-col.py
-
-cat dump.nice.$k.txt
diff --git a/multi-node/col-split/mushroom-col.py b/multi-node/col-split/mushroom-col.py
deleted file mode 100644
index a905aff5c08b..000000000000
--- a/multi-node/col-split/mushroom-col.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import os
-import sys
-path = os.path.dirname(__file__)
-if path == '':
-    path = '.'
-sys.path.append(path+'/../../wrapper')
-
-import xgboost as xgb
-# this is example script of running distributed xgboost using python
-
-# call this additional function to intialize the xgboost sync module
-# in distributed mode
-xgb.sync_init(sys.argv)
-rank = xgb.sync_get_rank()
-# read in dataset
-dtrain = xgb.DMatrix('train.col%d' % rank)
-param = {'max_depth':3, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
-param['dsplit'] = 'col'
-nround = 3
-
-if rank == 0:
-    dtest = xgb.DMatrix('../../demo/data/agaricus.txt.test')
-    model = xgb.train(param, dtrain, nround, [(dtrain, 'train') , (dtest, 'test')])
-else:
-    # if it is a slave node, do not run evaluation
-    model = xgb.train(param, dtrain, nround)
-
-if rank == 0:
-    model.save_model('%04d.model' % nround)
-    # dump model with feature map
-    model.dump_model('dump.nice.%d.txt' % xgb.sync_get_world_size(),'../../demo/data/featmap.txt')
-# shutdown the synchronization module
-xgb.sync_finalize()
diff --git a/multi-node/row-split/README.md b/multi-node/row-split/README.md
index 46656644dbd3..30e2528d33f0 100644
--- a/multi-node/row-split/README.md
+++ b/multi-node/row-split/README.md
@@ -1,10 +1,8 @@
 Distributed XGBoost: Row Split Version
 ====
+* You might be interested in checking out the [Hadoop example](../hadoop)
 * Machine Rabit: run ```bash machine-row-rabit.sh <n-mpi-process>```
   - machine-col-rabit.sh starts xgboost job using rabit
-* Mushroom: run ```bash mushroom-row-mpi.sh <n-mpi-process>```
-* Machine: run ```bash machine-row-mpi.sh <n-mpi-process>```
-  - Machine case also include example to continue training from existing model
 
 How to Use
 ====
diff --git a/multi-node/row-split/machine-row-map.sh b/multi-node/row-split/machine-row-map.sh
deleted file mode 100755
index a1c5bfe0c012..000000000000
--- a/multi-node/row-split/machine-row-map.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-if [[ $# -ne 1 ]]
-then
-    echo "Usage: nprocess"
-    exit -1
-fi
-
-rm -rf train-machine.row* *.model
-k=$1
-# make machine data
-cd ../../demo/regression/
-python mapfeat.py
-python mknfold.py machine.txt 1
-cd -
-
-# split the lib svm file into k subfiles
-python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
-
-# run xgboost mpi, take data from stdin
-../submit_job_tcp.py $k "bash map.sh train-machine.row ../../xgboost machine-row.conf dsplit=row num_round=3 data=stdin"
diff --git a/multi-node/row-split/machine-row-mpi.sh b/multi-node/row-split/machine-row-mpi.sh
deleted file mode 100755
index fdb1f1d6b554..000000000000
--- a/multi-node/row-split/machine-row-mpi.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-if [[ $# -ne 1 ]]
-then
-    echo "Usage: nprocess"
-    exit -1
-fi
-
-rm -rf train-machine.row* *.model
-k=$1
-# make machine data
-cd ../../demo/regression/
-python mapfeat.py
-python mknfold.py machine.txt 1
-cd -
-
-# split the lib svm file into k subfiles
-python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
-
-# run xgboost mpi
-mpirun -n $k ../../xgboost-mpi machine-row.conf dsplit=row num_round=3
-
-# run xgboost-mpi save model 0001, continue to run from existing model
-mpirun -n $k ../../xgboost-mpi machine-row.conf dsplit=row num_round=1
-mpirun -n $k ../../xgboost-mpi machine-row.conf dsplit=row num_round=2 model_in=0001.model
diff --git a/multi-node/row-split/map.sh b/multi-node/row-split/map.sh
deleted file mode 100644
index 624192121e63..000000000000
--- a/multi-node/row-split/map.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-# a simple script to simulate mapreduce mapper
-echo "cat $1$OMPI_COMM_WORLD_RANK | ${@:2}"
-cat $1$OMPI_COMM_WORLD_RANK | ${@:2}
diff --git a/multi-node/row-split/mushroom-row-mpi.sh b/multi-node/row-split/mushroom-row-mpi.sh
deleted file mode 100755
index eb65799b695f..000000000000
--- a/multi-node/row-split/mushroom-row-mpi.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-if [[ $# -ne 1 ]]
-then
-    echo "Usage: nprocess"
-    exit -1
-fi
-
-rm -rf train.row* *.model
-k=$1
-
-# split the lib svm file into k subfiles
-python splitrows.py ../../demo/data/agaricus.txt.train train $k
-
-# run xgboost mpi
-mpirun -n $k ../../xgboost-mpi mushroom-row.conf dsplit=row nthread=1 
-
-# the model can be directly loaded by single machine xgboost solver, as usuall
-../../xgboost mushroom-row.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
-cat dump.nice.$k.txt
diff --git a/multi-node/row-split/mushroom-row.conf b/multi-node/row-split/mushroom-row.conf
deleted file mode 100644
index 4cc2e8b11e47..000000000000
--- a/multi-node/row-split/mushroom-row.conf
+++ /dev/null
@@ -1,35 +0,0 @@
-# General Parameters, see comment for each definition
-# choose the booster, can be gbtree or gblinear
-booster = gbtree
-# choose logistic regression loss function for binary classification
-objective = binary:logistic
-
-# Tree Booster Parameters
-# step size shrinkage
-eta = 1.0 
-# minimum loss reduction required to make a further partition
-gamma = 1.0 
-# minimum sum of instance weight(hessian) needed in a child
-min_child_weight = 1 
-# maximum depth of a tree
-max_depth = 3 
-
-# Task Parameters
-# the number of round to do boosting
-num_round = 2
-# 0 means do not save any model except the final round model
-save_period = 0 
-use_buffer = 0
-
-# The path of training data %d is the wildcard for the rank of the data
-# The idea is each process take a feature matrix with subset of columns
-#
-data = "train.row%d" 
-
-# The path of validation data, used to monitor training process, here [test] sets name of the validation set
-eval[test] = "../../demo/data/agaricus.txt.test" 
-# evaluate on training data as well each round
-eval_train = 1
-
-# The path of test data, need to use full data of test, try not use it, or keep an subsampled version
-test:data = "../../demo/data/agaricus.txt.test"      
diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index d351928dc0a5..cf442a61fbc2 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -33,10 +33,6 @@
 xglib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float)
 xglib.XGBoosterEvalOneIter.restype = ctypes.c_char_p
 xglib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p)
-# sync function
-xglib.XGSyncGetRank.restype = ctypes.c_int
-xglib.XGSyncGetWorldSize.restype = ctypes.c_int
-# initialize communication module
 
 def ctypes2numpy(cptr, length, dtype):
     """convert a ctypes pointer array to numpy array """
@@ -557,17 +553,3 @@ def cv(params, dtrain, num_boost_round = 10, nfold=3, metrics=[], \
         results.append(res)
     return results
 
-# synchronization module
-def sync_init(args = sys.argv):
-    arr = (ctypes.c_char_p * len(args))()
-    arr[:] = args
-    xglib.XGSyncInit(len(args), arr)
-    
-def sync_finalize():
-    xglib.XGSyncFinalize()
-
-def sync_get_rank():
-    return int(xglib.XGSyncGetRank())
-
-def sync_get_world_size():
-    return int(xglib.XGSyncGetWorldSize())
diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp
index 700356ade780..432ae0bf28af 100644
--- a/wrapper/xgboost_wrapper.cpp
+++ b/wrapper/xgboost_wrapper.cpp
@@ -82,23 +82,6 @@ class Booster: public learner::BoostLearner {
 using namespace xgboost::wrapper;
 
 extern "C"{
-  void XGSyncInit(int argc, char *argv[]) {
-    rabit::Init(argc, argv);
-    if (rabit::GetWorldSize() != 1) {
-      std::string pname = rabit::GetProcessorName();
-      utils::Printf("distributed job start %s:%d\n", pname.c_str(), rabit::GetRank());
-    }
-  }
-  void XGSyncFinalize(void) {
-    rabit::Finalize();
-  }
-  int XGSyncGetRank(void) {
-    int rank = rabit::GetRank();
-    return rank;
-  }
-  int XGSyncGetWorldSize(void) {
-    return rabit::GetWorldSize();
-  }
   void* XGDMatrixCreateFromFile(const char *fname, int silent) {
     return LoadDataMatrix(fname, silent != 0, false);
   }

From b1df8039a03a6e7d25c88c8f630ea6e2f85f46de Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 15 Jan 2015 21:56:39 -0800
Subject: [PATCH 163/166] ignore

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 8bb1ead7fa71..5070aab7f057 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,6 +47,7 @@ Debug
 *.cpage.col
 *.cpage
 xgboost
-xgboost-mpi
+xgboost.mpi
+xgboost.mock
 train*
 rabit

From 4715672d76e888c290cf5ac46b4a1e95f89b15f2 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 15 Jan 2015 22:01:29 -0800
Subject: [PATCH 164/166] chg

---
 multi-node/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/multi-node/README.md b/multi-node/README.md
index 1c388d8bc772..ee4c24620a1f 100644
--- a/multi-node/README.md
+++ b/multi-node/README.md
@@ -5,8 +5,8 @@ This folder contains information of Distributed XGBoost.
 * The distributed version is built on Rabit:[Reliable Allreduce and Broadcast Library](https://github.com/tqchen/rabit)
   - Rabit is a portable library that provides fault-tolerance for Allreduce calls for distributed machine learning  
   - This makes xgboost portable and fault-tolerant against node failures
-* You can run Distributed XGBoost on common platforms that rabit can port to,
-  including Hadoop(see [hadoop folder](hadoop)) and MPI
+* You can run Distributed XGBoost on platforms including Hadoop(see [hadoop folder](hadoop)) and MPI
+  - Rabit only replies a platform to start the programs, so it should be easy to port xgboost to most platforms
 
 Usage
 =====

From 90ec783e65f9b884735b6d7d66d9f24f30b7da74 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 15 Jan 2015 22:01:55 -0800
Subject: [PATCH 165/166] remove build

---
 multi-node/README.md | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/multi-node/README.md b/multi-node/README.md
index ee4c24620a1f..62165da4a84b 100644
--- a/multi-node/README.md
+++ b/multi-node/README.md
@@ -8,10 +8,6 @@ This folder contains information of Distributed XGBoost.
 * You can run Distributed XGBoost on platforms including Hadoop(see [hadoop folder](hadoop)) and MPI
   - Rabit only replies a platform to start the programs, so it should be easy to port xgboost to most platforms
 
-Usage
-=====
-* In the root folder, run ```./build.sh```, this will give you xgboost, which uses rabit allreduce
-
 Notes
 ====
 * Rabit handles all the fault tolerant and communications efficiently, we only use platform specific command to start programs

From b898672753c50d08758f4c1332b3dd1f9983d7d7 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 15 Jan 2015 22:03:32 -0800
Subject: [PATCH 166/166] ok

---
 multi-node/README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/multi-node/README.md b/multi-node/README.md
index 62165da4a84b..ce37daeab15d 100644
--- a/multi-node/README.md
+++ b/multi-node/README.md
@@ -8,6 +8,10 @@ This folder contains information of Distributed XGBoost.
 * You can run Distributed XGBoost on platforms including Hadoop(see [hadoop folder](hadoop)) and MPI
   - Rabit only replies a platform to start the programs, so it should be easy to port xgboost to most platforms
 
+Build
+=====
+* In the root folder, run ```./build.sh```, this will give you xgboost, which uses rabit allreduce
+
 Notes
 ====
 * Rabit handles all the fault tolerant and communications efficiently, we only use platform specific command to start programs