Merge branch 'unity'
Conflicts:
	.gitignore
	R-package/src/xgboost_R.cpp
	src/gbm/gblinear-inl.hpp
	tools/xgcombine_buffer.cpp
tqchen committed Jan 19, 2015
2 parents d50079f + b898672 commit f49fd88
Showing 70 changed files with 6,398 additions and 262 deletions.
10 changes: 9 additions & 1 deletion .gitignore
@@ -2,7 +2,7 @@
*.slo
*.lo
*.o

*.page
# Compiled Dynamic libraries
*.so
*.dylib
@@ -45,3 +45,11 @@ Debug
*save
*csv
.Rproj.user
*.cpage.col
*.cpage
xgboost
xgboost.mpi
xgboost.mock
train*
rabit

58 changes: 43 additions & 15 deletions Makefile
@@ -1,42 +1,70 @@
export CC = gcc
export CXX = g++
export LDFLAGS= -pthread -lm

export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -pedantic
export MPICXX = mpicxx
export LDFLAGS= -Lrabit/lib -pthread -lm
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -Irabit/include

ifeq ($(no_omp),1)
CFLAGS += -DDISABLE_OPENMP
else
CFLAGS += -fopenmp
endif

# by default use c++11
ifeq ($(no_cxx11),1)
else
CFLAGS +=
endif

# specify tensor path
BIN = xgboost
OBJ = updater.o gbm.o io.o
BIN = xgboost
MOCKBIN = xgboost.mock
OBJ = updater.o gbm.o io.o main.o
MPIBIN = xgboost.mpi
SLIB = wrapper/libxgboostwrapper.so

.PHONY: clean all python Rpack
.PHONY: clean all mpi python Rpack librabit librabit_mpi

all: $(BIN) $(OBJ) $(SLIB)
all: $(BIN) $(OBJ) $(SLIB) $(MOCKBIN)
mpi: $(MPIBIN)

# rules to get rabit library
librabit:
if [ ! -d rabit ]; then git clone https://github.com/tqchen/rabit.git; fi
cd rabit;make lib/librabit.a lib/librabit_mock.a; cd -
librabit_mpi:
if [ ! -d rabit ]; then git clone https://github.com/tqchen/rabit.git; fi
cd rabit;make lib/librabit_mpi.a; cd -

python: wrapper/libxgboostwrapper.so
# now the wrapper takes in two files. io and wrapper part
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h
gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h src/utils/*.h
gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
xgboost: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h
xgboost.mpi: updater.o gbm.o io.o main.o librabit_mpi
xgboost.mock: updater.o gbm.o io.o main.o librabit
xgboost: updater.o gbm.o io.o main.o librabit
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o librabit

$(BIN) :
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) -lrabit

$(MOCKBIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) -lrabit_mock

$(SLIB) :
$(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)
$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) -lrabit

$(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )

$(MPIOBJ) :
$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )

$(MPIBIN) :
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) -lrabit_mpi

install:
cp -f -r $(BIN) $(INSTALL_PATH)

@@ -62,4 +90,4 @@ Rpack:
R CMD check --as-cran xgboost*.tar.gz

clean:
$(RM) $(OBJ) $(BIN) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
4 changes: 1 addition & 3 deletions R-package/src/Makevars
@@ -4,6 +4,4 @@ PKGROOT=../../
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o


OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/src/sync/sync_empty.o
2 changes: 1 addition & 1 deletion R-package/src/Makevars.win
@@ -4,4 +4,4 @@ PKGROOT=../../
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/src/sync/sync_empty.o
5 changes: 3 additions & 2 deletions R-package/src/xgboost_R.cpp
@@ -4,10 +4,11 @@
#include <cstring>
#include <cstdio>
#include <sstream>
#include "xgboost_R.h"
#include "wrapper/xgboost_wrapper.h"
#include "src/utils/utils.h"
#include "src/utils/omp.h"
#include "xgboost_R.h"

using namespace std;
using namespace xgboost;

@@ -290,4 +291,4 @@ extern "C" {
UNPROTECT(1);
return out;
}
}
}
7 changes: 7 additions & 0 deletions build.sh
@@ -3,6 +3,13 @@
# basically, it first tries to make with OpenMP; if that fails, it disables OpenMP and makes again
# This will automatically make xgboost for Mac users who do not have OpenMP support
# In most cases, typing make will give you what you want

# download rabit
if [ ! -d rabit ]; then
git clone https://github.com/tqchen/rabit.git
else
cd rabit; git pull; cd ..
fi
if make; then
echo "Successfully build multi-thread xgboost"
else
2 changes: 2 additions & 0 deletions demo/README.md
@@ -32,6 +32,8 @@ This is a list of short codes introducing different functionalities of xgboost a
[python](guide-python/cross_validation.py)
[R](../R-package/demo/cross_validation.R)
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl)
* Predicting leaf indices
[python](guide-python/predict_leaf_indices.py)

Basic Examples by Tasks
====
1 change: 1 addition & 0 deletions demo/guide-python/README.md
@@ -6,3 +6,4 @@ XGBoost Python Feature Walkthrough
* [Predicting using first n trees](predict_first_ntree.py)
* [Generalized Linear Model](generalized_linear_model.py)
* [Cross validation](cross_validation.py)
* [Predicting leaf indices](predict_leaf_indices.py)
22 changes: 22 additions & 0 deletions demo/guide-python/predict_leaf_indices.py
@@ -0,0 +1,22 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb

### load data and do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 3
bst = xgb.train(param, dtrain, num_round, watchlist)

print ('start testing: predict the leaf indices')
### predict using the first 2 trees
leafindex = bst.predict(dtest, ntree_limit=2, pred_leaf = True)
print leafindex.shape
print leafindex
### predict all trees
leafindex = bst.predict(dtest, pred_leaf = True)
print leafindex.shape
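
The matrix returned with ```pred_leaf = True``` has one row per test example and one column per tree; entry (i, j) is the index of the leaf (tree node) that example i falls into in tree j. Below is a sketch of one common downstream use (not part of the demo itself): one-hot encoding the leaf indices into features.

```
# Sketch only: assumes leafindex and np from the demo above; the encoding
# scheme is illustrative, not part of the demo or the xgboost API.
n_sample, n_tree = leafindex.shape
n_leaf = int(leafindex.max()) + 1        # upper bound on node index per tree
onehot = np.zeros((n_sample, n_tree * n_leaf))
rows = np.arange(n_sample)
for j in range(n_tree):
    onehot[rows, j * n_leaf + leafindex[:, j].astype(int)] = 1.0
print (onehot.shape)
```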
3 changes: 2 additions & 1 deletion demo/guide-python/runall.sh
@@ -4,4 +4,5 @@ python custom_objective.py
python boost_from_prediction.py
python generalized_linear_model.py
python cross_validation.py
rm -rf *~ *.model *.buffer
python predict_leaf_indices.py
rm -rf *~ *.model *.buffer
36 changes: 36 additions & 0 deletions multi-node/README.md
@@ -0,0 +1,36 @@
Distributed XGBoost
======
This folder contains information about Distributed XGBoost.

* The distributed version is built on Rabit: [Reliable Allreduce and Broadcast Library](https://github.com/tqchen/rabit)
  - Rabit is a portable library that provides fault-tolerant Allreduce calls for distributed machine learning
  - This makes xgboost portable and fault-tolerant against node failures
* You can run Distributed XGBoost on platforms including Hadoop (see the [hadoop folder](hadoop)) and MPI
  - Rabit only relies on the platform to start the programs, so it should be easy to port xgboost to most platforms

Build
=====
* In the root folder, run ```./build.sh```; this will give you xgboost, built with rabit's Allreduce

Notes
====
* Rabit handles all the fault tolerance and communication efficiently; we only use platform-specific commands to start the programs
  - The Hadoop version does not rely on MapReduce to do iterations
  - So xgboost does not suffer the drawbacks of iterative MapReduce programs
* The design choice was made because Allreduce is very natural and efficient for distributed tree building
  - In the current version, the distributed code only adds several lines of Allreduce synchronization code to the single-machine version
* The multi-threading nature of xgboost is inherited in distributed mode
  - This means xgboost efficiently uses all the threads in one machine, and communicates only between machines
  - Remember to run one xgboost process per machine; this gives you the maximum speedup
* For more information about rabit and how it works, see the [tutorial](https://github.com/tqchen/rabit/tree/master/guide)

Solvers
=====
There are two solvers in distributed xgboost; for local demos of the two solvers, see [row-split](row-split) and [col-split](col-split)
* The column-based solver splits data by column; each node works on a subset of columns,
  using exactly the same algorithm as the single-node version.
* The row-based solver splits data by row; each node works on a subset of rows,
  using an approximate histogram count algorithm that examines only a subset of
  potential split points, as opposed to all of them (see the toy sketch below).
  - This is the mode used by the current Hadoop version, since data is usually stored by rows in many industry systems
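
The following toy sketch (illustrative only; not xgboost's actual implementation) shows the idea behind the row-split solver and why Allreduce fits it: each worker histograms gradient statistics over its row shard at fixed candidate split points, and summing the histograms across workers, which is exactly the job Allreduce does, yields the global statistics needed to score candidate splits.

```
# Toy sketch of the row-split idea; NOT xgboost's actual code.
import numpy as np

def local_histogram(feature, grad, edges):
    # per-worker sum of gradient statistics in each candidate bin
    bins = np.digitize(feature, edges)
    return np.bincount(bins, weights=grad, minlength=len(edges) + 1)

edges = np.array([0.25, 0.5, 0.75])   # fixed candidate split points
# pretend these are two workers' row shards
h1 = local_histogram(np.random.rand(100), np.random.randn(100), edges)
h2 = local_histogram(np.random.rand(100), np.random.randn(100), edges)
# in the distributed version this elementwise sum is a single Allreduce
global_hist = h1 + h2
```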

19 changes: 19 additions & 0 deletions multi-node/col-split/README.md
@@ -0,0 +1,19 @@
Distributed XGBoost: Column Split Version
====
* run ```bash mushroom-col-rabit.sh <n-process>```
- mushroom-col-rabit.sh starts an xgboost job using rabit's Allreduce
* run ```bash mushroom-col-rabit-mock.sh <n-process>```
- mushroom-col-rabit-mock.sh starts an xgboost job using rabit's Allreduce, inserting suicide signals at certain points to test recovery

How to Use
====
* First, split the data by column
* In the config, specify the data file with a wildcard %d, where %d is the rank of the node; each node will load its own part of the data (see the mushroom-col.conf file below)
* Enable column split mode by ```dsplit=col```

Notes
====
* The code is multi-threaded, so you want to run one process per node
* The code will work correctly as long as the union of the column subsets covers all the columns we are interested in
  - The column subsets can overlap with each other
* It uses exactly the same algorithm as the single-node version, examining all potential split points
25 changes: 25 additions & 0 deletions multi-node/col-split/mushroom-col-rabit-mock.sh
@@ -0,0 +1,25 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
echo "Usage: nprocess"
exit -1
fi

#
# This script is the same as mushroom-col except that it uses xgboost.mock instead of xgboost-mpi
# xgboost uses the built-in TCP-based Allreduce module, and can run in more environments, as long as we know how to start the job by modifying ../submit_job_tcp.py
#
rm -rf train.col* *.model
k=$1

# split the libsvm file into k subfiles
python splitsvm.py ../../demo/data/agaricus.txt.train train $k

# run xgboost.mock via rabit's demo tracker; the mock= arguments inject failures to test recovery
../../rabit/tracker/rabit_demo.py -n $k ../../xgboost.mock mushroom-col.conf dsplit=col mock=0,2,0,0 mock=1,2,0,0 mock=2,2,8,0 mock=2,3,0,0

# the model can be directly loaded by the single-machine xgboost solver, as usual
#../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt


#cat dump.nice.$k.txt
28 changes: 28 additions & 0 deletions multi-node/col-split/mushroom-col-rabit.sh
@@ -0,0 +1,28 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
echo "Usage: nprocess"
exit -1
fi

#
# This script is the same as mushroom-col except that it uses xgboost instead of xgboost-mpi
# xgboost uses the built-in TCP-based Allreduce module, and can run in more environments, as long as we know how to start the job by modifying ../submit_job_tcp.py
#
rm -rf train.col* *.model
k=$1

# split the libsvm file into k subfiles
python splitsvm.py ../../demo/data/agaricus.txt.train train $k

# run xgboost via rabit's demo tracker
../../rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col

# the model can be directly loaded by the single-machine xgboost solver, as usual
../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt

# run for one round, and continue training
../../rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col num_round=1
../../rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col model_in=0001.model

cat dump.nice.$k.txt
35 changes: 35 additions & 0 deletions multi-node/col-split/mushroom-col.conf
@@ -0,0 +1,35 @@
# General Parameters, see comment for each definition
# choose the booster, can be gbtree or gblinear
booster = gbtree
# choose logistic regression loss function for binary classification
objective = binary:logistic

# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3

# Task Parameters
# the number of rounds of boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
use_buffer = 0

# The path of training data; %d is the wildcard for the rank of the process
# The idea is that each process takes a feature matrix with a subset of columns
#
data = "train.col%d"

# The path of validation data, used to monitor the training process; here [test] sets the name of the validation set
eval[test] = "../../demo/data/agaricus.txt.test"
# evaluate on training data as well each round
eval_train = 1

# The path of test data; this needs the full test data, so try not to use it, or keep a subsampled version
test:data = "../../demo/data/agaricus.txt.test"
32 changes: 32 additions & 0 deletions multi-node/col-split/splitsvm.py
@@ -0,0 +1,32 @@
#!/usr/bin/python
import sys
import random

# split a libsvm file into k files, each holding a random subset of the columns
if len(sys.argv) < 4:
    print ('Usage: <fin> <fo> <k>')
    exit(0)

random.seed(10)
fmap = {}

k = int(sys.argv[3])
fi = open( sys.argv[1], 'r' )
fos = []

for i in range(k):
    fos.append(open(sys.argv[2] + '.col%d' % i, 'w'))

for l in fi:
    arr = l.split()
    # every output file keeps the label
    for f in fos:
        f.write(arr[0])
    # each feature goes to the output file its feature id was assigned to
    for it in arr[1:]:
        fid = int(it.split(':')[0])
        if fid not in fmap:
            fmap[fid] = random.randint(0, k - 1)
        fos[fmap[fid]].write(' ' + it)
    for f in fos:
        f.write('\n')
for f in fos:
    f.close()
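
A quick sanity check of the splitter (file names here are hypothetical, not part of the commit): every output file keeps the label column, and the union of the feature columns across the files reconstructs each input line.

```
# Hypothetical sanity check for splitsvm.py; file names are illustrative.
import subprocess

with open('tiny.svm', 'w') as f:
    f.write('1 3:1 10:1 11:1\n0 5:1 10:1\n')

subprocess.check_call(['python', 'splitsvm.py', 'tiny.svm', 'tiny', '2'])
for i in range(2):
    print(open('tiny.col%d' % i).read())
```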