Merge branch 'unity'
Conflicts:
	.gitignore
	R-package/src/xgboost_R.cpp
	src/gbm/gblinear-inl.hpp
	tools/xgcombine_buffer.cpp
tqchen committed Jan 19, 2015
2 parents d50079f + b898672 commit f49fd88
Showing 70 changed files with 6,398 additions and 262 deletions.
10 changes: 9 additions & 1 deletion .gitignore
@@ -2,7 +2,7 @@
*.slo
*.lo
*.o

*.page
# Compiled Dynamic libraries
*.so
*.dylib
@@ -45,3 +45,11 @@ Debug
*save
*csv
.Rproj.user
*.cpage.col
*.cpage
xgboost
xgboost.mpi
xgboost.mock
train*
rabit

58 changes: 43 additions & 15 deletions Makefile
@@ -1,42 +1,70 @@
export CC = gcc
export CXX = g++
export LDFLAGS= -pthread -lm

export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -pedantic
export MPICXX = mpicxx
export LDFLAGS= -Lrabit/lib -pthread -lm
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -Irabit/include

ifeq ($(no_omp),1)
CFLAGS += -DDISABLE_OPENMP
else
CFLAGS += -fopenmp
endif

# by default use c++11
ifeq ($(no_cxx11),1)
else
CFLAGS +=
endif

# specify tensor path
BIN = xgboost
OBJ = updater.o gbm.o io.o
BIN = xgboost
MOCKBIN = xgboost.mock
OBJ = updater.o gbm.o io.o main.o
MPIBIN = xgboost.mpi
SLIB = wrapper/libxgboostwrapper.so

.PHONY: clean all python Rpack
.PHONY: clean all mpi python Rpack librabit librabit_mpi

all: $(BIN) $(OBJ) $(SLIB)
all: $(BIN) $(OBJ) $(SLIB) $(MOCKBIN)
mpi: $(MPIBIN)

# rules to get rabit library
librabit:
if [ ! -d rabit ]; then git clone https://github.com/tqchen/rabit.git; fi
cd rabit;make lib/librabit.a lib/librabit_mock.a; cd -
librabit_mpi:
if [ ! -d rabit ]; then git clone https://github.com/tqchen/rabit.git; fi
cd rabit;make lib/librabit_mpi.a; cd -

python: wrapper/libxgboostwrapper.so
# now the wrapper takes in two files. io and wrapper part
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h
gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h src/utils/*.h
gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
xgboost: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h
xgboost.mpi: updater.o gbm.o io.o main.o librabit_mpi
xgboost.mock: updater.o gbm.o io.o main.o librabit
xgboost: updater.o gbm.o io.o main.o librabit
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o librabit

$(BIN) :
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) -lrabit

$(MOCKBIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) -lrabit_mock

$(SLIB) :
$(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)
$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) -lrabit

$(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )

$(MPIOBJ) :
$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )

$(MPIBIN) :
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) -lrabit_mpi

install:
cp -f -r $(BIN) $(INSTALL_PATH)

@@ -62,4 +90,4 @@ Rpack:
R CMD check --as-cran xgboost*.tar.gz

clean:
$(RM) $(OBJ) $(BIN) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
4 changes: 1 addition & 3 deletions R-package/src/Makevars
@@ -4,6 +4,4 @@ PKGROOT=../../
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o


OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/src/sync/sync_empty.o
2 changes: 1 addition & 1 deletion R-package/src/Makevars.win
@@ -4,4 +4,4 @@ PKGROOT=../../
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/src/sync/sync_empty.o
5 changes: 3 additions & 2 deletions R-package/src/xgboost_R.cpp
@@ -4,10 +4,11 @@
#include <cstring>
#include <cstdio>
#include <sstream>
#include "xgboost_R.h"
#include "wrapper/xgboost_wrapper.h"
#include "src/utils/utils.h"
#include "src/utils/omp.h"
#include "xgboost_R.h"

using namespace std;
using namespace xgboost;

@@ -290,4 +291,4 @@ extern "C" {
UNPROTECT(1);
return out;
}
}
}
7 changes: 7 additions & 0 deletions build.sh
@@ -3,6 +3,13 @@
# basically, it first tries to make with OpenMP; if that fails, it disables OpenMP and makes again
# This will automatically make xgboost for Mac users who do not have OpenMP support
# In most cases, typing make will give you what you want

# download rabit
if [ ! -d rabit ]; then
git clone https://github.com/tqchen/rabit.git
else
cd rabit; git pull; cd ..
fi
if make; then
echo "Successfully build multi-thread xgboost"
else
2 changes: 2 additions & 0 deletions demo/README.md
@@ -32,6 +32,8 @@ This is a list of short codes introducing different functionalities of xgboost a
[python](guide-python/cross_validation.py)
[R](../R-package/demo/cross_validation.R)
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl)
* Predicting leaf indices
[python](guide-python/predict_leaf_indices.py)

Basic Examples by Tasks
====
1 change: 1 addition & 0 deletions demo/guide-python/README.md
@@ -6,3 +6,4 @@ XGBoost Python Feature Walkthrough
* [Predicting using first n trees](predict_first_ntree.py)
* [Generalized Linear Model](generalized_linear_model.py)
* [Cross validation](cross_validation.py)
* [Predicting leaf indices](predict_leaf_indices.py)
22 changes: 22 additions & 0 deletions demo/guide-python/predict_leaf_indices.py
@@ -0,0 +1,22 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb

### load data and do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 3
bst = xgb.train(param, dtrain, num_round, watchlist)

print ('start testing: predict the leaf indices')
### predict using the first 2 trees
leafindex = bst.predict(dtest, ntree_limit=2, pred_leaf = True)
print leafindex.shape
print leafindex
### predict all trees
leafindex = bst.predict(dtest, pred_leaf = True)
print leafindex.shape
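
The matrix returned with ```pred_leaf = True``` has one row per test example and one column per tree; entry (i, j) is the index of the leaf (tree node) that example i falls into in tree j. Below is a sketch of one common downstream use (not part of the demo itself): one-hot encoding the leaf indices into features.

```
# Sketch only: assumes leafindex and np from the demo above; the encoding
# scheme is illustrative, not part of the demo or the xgboost API.
n_sample, n_tree = leafindex.shape
n_leaf = int(leafindex.max()) + 1        # upper bound on node index per tree
onehot = np.zeros((n_sample, n_tree * n_leaf))
rows = np.arange(n_sample)
for j in range(n_tree):
    onehot[rows, j * n_leaf + leafindex[:, j].astype(int)] = 1.0
print (onehot.shape)
```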
3 changes: 2 additions & 1 deletion demo/guide-python/runall.sh
@@ -4,4 +4,5 @@ python custom_objective.py
python boost_from_prediction.py
python generalized_linear_model.py
python cross_validation.py
rm -rf *~ *.model *.buffer
python predict_leaf_indices.py
rm -rf *~ *.model *.buffer
36 changes: 36 additions & 0 deletions multi-node/README.md
@@ -0,0 +1,36 @@
Distributed XGBoost
======
This folder contains information about Distributed XGBoost.

* The distributed version is built on Rabit: [Reliable Allreduce and Broadcast Library](https://github.com/tqchen/rabit)
  - Rabit is a portable library that provides fault-tolerant Allreduce calls for distributed machine learning
  - This makes xgboost portable and fault-tolerant against node failures
* You can run Distributed XGBoost on platforms including Hadoop (see the [hadoop folder](hadoop)) and MPI
  - Rabit only relies on the platform to start the programs, so it should be easy to port xgboost to most platforms

Build
=====
* In the root folder, run ```./build.sh```; this will give you xgboost, built with rabit's Allreduce

Notes
====
* Rabit handles all the fault tolerance and communication efficiently; we only use platform-specific commands to start the programs
  - The Hadoop version does not rely on MapReduce to do iterations
  - So xgboost does not suffer the drawbacks of iterative MapReduce programs
* The design choice was made because Allreduce is very natural and efficient for distributed tree building
  - In the current version, the distributed code only adds several lines of Allreduce synchronization code to the single-machine version
* The multi-threading nature of xgboost is inherited in distributed mode
  - This means xgboost efficiently uses all the threads in one machine, and communicates only between machines
  - Remember to run one xgboost process per machine; this gives you the maximum speedup
* For more information about rabit and how it works, see the [tutorial](https://github.com/tqchen/rabit/tree/master/guide)

Solvers
=====
There are two solvers in distributed xgboost; for local demos of the two solvers, see [row-split](row-split) and [col-split](col-split)
* The column-based solver splits data by column; each node works on a subset of columns,
  using exactly the same algorithm as the single-node version.
* The row-based solver splits data by row; each node works on a subset of rows,
  using an approximate histogram count algorithm that examines only a subset of
  potential split points, as opposed to all of them (see the toy sketch below).
  - This is the mode used by the current Hadoop version, since data is usually stored by rows in many industry systems
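
The following toy sketch (illustrative only; not xgboost's actual implementation) shows the idea behind the row-split solver and why Allreduce fits it: each worker histograms gradient statistics over its row shard at fixed candidate split points, and summing the histograms across workers, which is exactly the job Allreduce does, yields the global statistics needed to score candidate splits.

```
# Toy sketch of the row-split idea; NOT xgboost's actual code.
import numpy as np

def local_histogram(feature, grad, edges):
    # per-worker sum of gradient statistics in each candidate bin
    bins = np.digitize(feature, edges)
    return np.bincount(bins, weights=grad, minlength=len(edges) + 1)

edges = np.array([0.25, 0.5, 0.75])   # fixed candidate split points
# pretend these are two workers' row shards
h1 = local_histogram(np.random.rand(100), np.random.randn(100), edges)
h2 = local_histogram(np.random.rand(100), np.random.randn(100), edges)
# in the distributed version this elementwise sum is a single Allreduce
global_hist = h1 + h2
```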

19 changes: 19 additions & 0 deletions multi-node/col-split/README.md
@@ -0,0 +1,19 @@
Distributed XGBoost: Column Split Version
====
* run ```bash mushroom-col-rabit.sh <n-process>```
- mushroom-col-rabit.sh starts an xgboost job using rabit's Allreduce
* run ```bash mushroom-col-rabit-mock.sh <n-process>```
- mushroom-col-rabit-mock.sh starts an xgboost job using rabit's Allreduce, inserting suicide signals at certain points to test recovery

How to Use
====
* First, split the data by column
* In the config, specify the data file with a wildcard %d, where %d is the rank of the node; each node will load its own part of the data (see the mushroom-col.conf file below)
* Enable column split mode by ```dsplit=col```

Notes
====
* The code is multi-threaded, so you want to run one process per node
* The code will work correctly as long as the union of the column subsets covers all the columns we are interested in
  - The column subsets can overlap with each other
* It uses exactly the same algorithm as the single-node version, examining all potential split points
25 changes: 25 additions & 0 deletions multi-node/col-split/mushroom-col-rabit-mock.sh
@@ -0,0 +1,25 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
echo "Usage: nprocess"
exit -1
fi

#
# This script is the same as mushroom-col except that it uses xgboost.mock instead of xgboost-mpi
# xgboost uses the built-in TCP-based Allreduce module, and can run in more environments, as long as we know how to start the job by modifying ../submit_job_tcp.py
#
rm -rf train.col* *.model
k=$1

# split the libsvm file into k subfiles
python splitsvm.py ../../demo/data/agaricus.txt.train train $k

# run xgboost.mock via rabit's demo tracker; the mock= arguments inject failures to test recovery
../../rabit/tracker/rabit_demo.py -n $k ../../xgboost.mock mushroom-col.conf dsplit=col mock=0,2,0,0 mock=1,2,0,0 mock=2,2,8,0 mock=2,3,0,0

# the model can be directly loaded by the single-machine xgboost solver, as usual
#../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt


#cat dump.nice.$k.txt
28 changes: 28 additions & 0 deletions multi-node/col-split/mushroom-col-rabit.sh
@@ -0,0 +1,28 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
echo "Usage: nprocess"
exit -1
fi

#
# This script is the same as mushroom-col except that it uses xgboost instead of xgboost-mpi
# xgboost uses the built-in TCP-based Allreduce module, and can run in more environments, as long as we know how to start the job by modifying ../submit_job_tcp.py
#
rm -rf train.col* *.model
k=$1

# split the libsvm file into k subfiles
python splitsvm.py ../../demo/data/agaricus.txt.train train $k

# run xgboost via rabit's demo tracker
../../rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col

# the model can be directly loaded by the single-machine xgboost solver, as usual
../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt

# run for one round, and continue training
../../rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col num_round=1
../../rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col model_in=0001.model

cat dump.nice.$k.txt
35 changes: 35 additions & 0 deletions multi-node/col-split/mushroom-col.conf
@@ -0,0 +1,35 @@
# General Parameters, see comment for each definition
# choose the booster, can be gbtree or gblinear
booster = gbtree
# choose logistic regression loss function for binary classification
objective = binary:logistic

# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3

# Task Parameters
# the number of rounds of boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
use_buffer = 0

# The path of training data; %d is the wildcard for the rank of the process
# The idea is that each process takes a feature matrix with a subset of columns
#
data = "train.col%d"

# The path of validation data, used to monitor the training process; here [test] sets the name of the validation set
eval[test] = "../../demo/data/agaricus.txt.test"
# evaluate on training data as well each round
eval_train = 1

# The path of test data; this needs the full test data, so try not to use it, or keep a subsampled version
test:data = "../../demo/data/agaricus.txt.test"
32 changes: 32 additions & 0 deletions multi-node/col-split/splitsvm.py
@@ -0,0 +1,32 @@
#!/usr/bin/python
import sys
import random

# split a libsvm file into k files, each holding a random subset of the columns
if len(sys.argv) < 4:
    print ('Usage: <fin> <fo> <k>')
    exit(0)

random.seed(10)
fmap = {}

k = int(sys.argv[3])
fi = open( sys.argv[1], 'r' )
fos = []

for i in range(k):
    fos.append(open(sys.argv[2] + '.col%d' % i, 'w'))

for l in fi:
    arr = l.split()
    # every output file keeps the label
    for f in fos:
        f.write(arr[0])
    # each feature goes to the output file its feature id was assigned to
    for it in arr[1:]:
        fid = int(it.split(':')[0])
        if fid not in fmap:
            fmap[fid] = random.randint(0, k - 1)
        fos[fmap[fid]].write(' ' + it)
    for f in fos:
        f.write('\n')
for f in fos:
    f.close()
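
A quick sanity check of the splitter (file names here are hypothetical, not part of the commit): every output file keeps the label column, and the union of the feature columns across the files reconstructs each input line.

```
# Hypothetical sanity check for splitsvm.py; file names are illustrative.
import subprocess

with open('tiny.svm', 'w') as f:
    f.write('1 3:1 10:1 11:1\n0 5:1 10:1\n')

subprocess.check_call(['python', 'splitsvm.py', 'tiny.svm', 'tiny', '2'])
for i in range(2):
    print(open('tiny.col%d' % i).read())
```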