diff --git a/.gitignore b/.gitignore index 94a8c105..517cce2b 100644 --- a/.gitignore +++ b/.gitignore @@ -37,8 +37,12 @@ recommonmark recom _* +#mpi lib +mpich/ +mpich-3.2/ + # Jetbrain .idea cmake-build-debug/ - +.vscode/ diff --git a/.travis.yml b/.travis.yml index ba01f423..a2eff4dc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,12 @@ -# disable sudo to use container based build -sudo: false +sudo: true + +os: + - linux + - osx + +osx_image: xcode10.2 + +dist: xenial # Use Build Matrix to do lint and build seperately env: @@ -10,15 +17,17 @@ env: - TASK=build - TASK=mpi-build - TASK=cmake-build - - TASK=test CXX=g++-4.8 + - TASK=test CXX=g++ # dependent apt packages -dist: xenial addons: apt: + sources: + - llvm-toolchain-trusty-5.0 + - ubuntu-toolchain-r-test + - george-edison55-precise-backports packages: - doxygen - - libopenmpi-dev - wget - git - libcurl4-openssl-dev @@ -26,30 +35,43 @@ addons: - python-numpy - gcc-4.8 - g++-4.8 - - openmpi-bin - - openmpi-common - openssh-client - openssh-server - - libopenmpi-dev + - python3 + - python3-setuptools + homebrew: + packages: + - gcc49 + - openssl + - libgit2 + - python3 + update: true before_install: - export TRAVIS=dmlc-core/scripts/travis/ - source ${TRAVIS}/travis_setup_env.sh + - ${TRAVIS}/travis_osx_install.sh install: - - pip install --user cpplint pylint kubernetes urllib3 + - if [[ ${TRAVIS_OS_NAME} == "linux" ]]; then sudo apt-get install python3-pip; fi + - if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then brew install python3; fi + - pip3 install cpplint pylint urllib3 numpy + - pip3 install websocket-client kubernetes script: scripts/travis_script.sh - -before_cache: - - ${TRAVIS}/travis_before_cache.sh - - cache: directories: - ${HOME}/.cache/usr + - ${HOME}/.cache/pip + - mpich +before_cache: + - ${TRAVIS}/travis_before_cache.sh + +after_success: + - tree build + - bash <(curl -s https://codecov.io/bash) -a '-o src/ src/*.c' notifications: # Emails are sent to the committer's git-configured email address by default, @@ -57,4 +79,3 @@ notifications: on_success: change on_failure: always - diff --git a/Makefile b/Makefile index 2af5b9b2..5599d4f3 100644 --- a/Makefile +++ b/Makefile @@ -1,23 +1,26 @@ OS := $(shell uname) -export MPICXX = mpicxx -export LDFLAGS= -Llib +export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11 +export CFLAGS = -O3 $(WARNFLAGS) -I $(DMLC)/include -I include/ +export LDFLAGS =-Llib -OS := $(shell uname) +#download mpi +#echo $(shell scripts/mpi.sh) + +MPICXX=./mpich/bin/mpicxx ifeq ($(OS), Darwin) ifndef CC - export CC = $(if $(shell which clang), clang, gcc) + export CC = gcc-4.9 endif ifndef CXX - export CXX = $(if $(shell which clang++), clang++, g++) + export CXX = g++-4.9 endif else ifeq ($(OS), FreeBSD) ifndef CXX export CXX = g++6 endif - export MPICXX = /usr/local/mpi/bin/mpicxx export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6 else # linux defaults @@ -27,13 +30,10 @@ else ifndef CXX export CXX = g++ endif - LDFLAGS += -lrt + LDFLAGS +=-lrt endif endif -export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11 -export CFLAGS = -O3 $(WARNFLAGS) - #---------------------------- # Settings for power and arm arch #---------------------------- @@ -69,8 +69,10 @@ BPATH=. MPIOBJ= $(BPATH)/engine_mpi.o OBJ= $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o $(BPATH)/engine_empty.o $(BPATH)/engine_mock.o\ $(BPATH)/c_api.o $(BPATH)/engine_base.o -SLIB= lib/librabit.so lib/librabit_mpi.so lib/librabit_mock.so lib/librabit_base.so -ALIB= lib/librabit.a lib/librabit_mpi.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a +SLIB= lib/librabit.so lib/librabit_mock.so lib/librabit_base.so +ALIB= lib/librabit.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a +MPISLIB= lib/librabit_mpi.so +MPIALIB= lib/librabit_mpi.a HEADERS=src/*.h include/rabit/*.h include/rabit/internal/*.h DMLC=dmlc-core @@ -95,10 +97,7 @@ lib/librabit_empty.a: $(BPATH)/engine_empty.o $(BPATH)/c_api.o lib/librabit_mpi.a lib/librabit_mpi.so: $(MPIOBJ) $(OBJ) : - $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) -I include/ -I $(DMLC)/include - -$(MPIOBJ) : - $(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) -I $(DMLC)/include + $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) $(ALIB): ar cr $@ $+ @@ -106,6 +105,16 @@ $(ALIB): $(SLIB) : $(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) +$(MPIOBJ) : + $(MPICXX) -c $(CFLAGS) -I./mpich/include -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) + +$(MPIALIB): + ar cr $@ $+ + +$(MPISLIB) : + $(MPICXX) $(CFLAGS) -I./mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) \ + $(LDFLAGS) -L./mpich/lib -Wl,-rpath,./mpich/lib -lmpi + lint: $(DMLC)/scripts/lint.py rabit $(LINT_LANG) src include @@ -113,4 +122,4 @@ doc doxygen: cd include; doxygen ../doc/Doxyfile; cd - clean: - $(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~ + $(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~ diff --git a/dmlc-core b/dmlc-core index 15362f8f..13d5acb8 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 15362f8fcc7345d60de13a676a2cbd3ffdc3f064 +Subproject commit 13d5acb8ba7e79550bbf2f730f1a3944ff0fa68b diff --git a/python/rabit.py b/python/rabit.py index d57587ba..dccb464c 100644 --- a/python/rabit.py +++ b/python/rabit.py @@ -4,7 +4,7 @@ Author: Tianqi Chen """ # pylint: disable=unused-argument,invalid-name,global-statement,dangerous-default-value, -import cPickle as pickle +import pickle import ctypes import os import sys @@ -99,9 +99,10 @@ def init(args=None, lib='standard', lib_dll=None): When this is presented argument lib will be ignored. """ if args is None: - args = sys.argv + args = [] _loadlib(lib, lib_dll) arr = (ctypes.c_char_p * len(args))() + arr[:] = args _LIB.RabitInit(len(args), arr) diff --git a/scripts/mpi_build.sh b/scripts/mpi_build.sh new file mode 100755 index 00000000..b1e70be9 --- /dev/null +++ b/scripts/mpi_build.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +if [ -f mpich/lib/libmpich.so ]; then + echo "libmpich.so found -- nothing to build." +else + echo "Downloading mpich source." + wget http://www.mpich.org/static/downloads/3.2/mpich-3.2.tar.gz + tar xfz mpich-3.2.tar.gz + rm mpich-3.2.tar.gz* + echo "configuring and building mpich." + cd mpich-3.2 + #CC=gcc CXX=g++ CFLAGS=-m64 CXXFLAGS=-m64 FFLAGS=-m64 + ./configure \ + --prefix=`pwd`/../mpich \ + --enable-static=false \ + --enable-alloca=true \ + --disable-long-double \ + --enable-threads=single \ + --enable-fortran=no \ + --enable-fast=all \ + --enable-g=none \ + --enable-timing=none \ + --enable-cxx + make -j4 + make install + cd - +fi \ No newline at end of file diff --git a/scripts/travis_runtest.sh b/scripts/travis_runtest.sh index 4f14ad17..37fc9953 100755 --- a/scripts/travis_runtest.sh +++ b/scripts/travis_runtest.sh @@ -7,3 +7,4 @@ make -f test.mk local_recover_10_10k || exit -1 make -f test.mk lazy_recover_10_10k_die_hard || exit -1 make -f test.mk lazy_recover_10_10k_die_same || exit -1 make -f test.mk ringallreduce_10_10k || exit -1 +make -f test.mk pylocal_recover_10_10k || exit -1 diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index 12e78f65..ae150487 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -15,6 +15,7 @@ if [ ${TASK} == "build" ]; then fi if [ ${TASK} == "mpi-build" ]; then + ./scripts/mpi_build.sh cd test make mpi && make speed_test.mpi || exit -1 fi diff --git a/src/allreduce_base.cc b/src/allreduce_base.cc index 143db6e1..60351699 100644 --- a/src/allreduce_base.cc +++ b/src/allreduce_base.cc @@ -131,6 +131,8 @@ void AllreduceBase::Shutdown(void) { utils::TCPSocket tracker = this->ConnectTracker(); tracker.SendStr(std::string("shutdown")); tracker.Close(); + // close listening sockets + sock_listen.Close(); utils::TCPSocket::Finalize(); } void AllreduceBase::TrackerPrint(const std::string &msg) { @@ -271,12 +273,26 @@ void AllreduceBase::ReConnectLinks(const char *cmd) { "ReConnectLink failure 4"); Assert(tracker.RecvAll(&next_rank, sizeof(next_rank)) == sizeof(next_rank), "ReConnectLink failure 4"); - // create listening socket - utils::TCPSocket sock_listen; - sock_listen.Create(); - int port = sock_listen.TryBindHost(slave_port, slave_port + nport_trial); - utils::Check(port != -1, "ReConnectLink fail to bind the ports specified"); - sock_listen.Listen(); + + if (sock_listen == INVALID_SOCKET || sock_listen.AtMark()) { + if (!sock_listen.IsClosed()) { + sock_listen.Close(); + } + // create listening socket + sock_listen.Create(); + sock_listen.SetKeepAlive(true); + // http://deepix.github.io/2016/10/21/tcprst.html + sock_listen.SetLinger(0); + // [slave_port, slave_port+1 .... slave_port + newrank ...slave_port + nport_trial) + // work around processes bind to same port without set reuse option, + // start explore from slave_port + newrank towards end + port = sock_listen.TryBindHost(slave_port+ newrank%nport_trial, slave_port + nport_trial); + // if no port bindable, explore first half of range + if (port == -1) sock_listen.TryBindHost(slave_port, newrank% nport_trial + slave_port); + + utils::Check(port != -1, "ReConnectLink fail to bind the ports specified"); + sock_listen.Listen(); + } // get number of to connect and number of to accept nodes from tracker int num_conn, num_accept, num_error = 1; @@ -311,6 +327,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) { "ReConnectLink failure 9"); Assert(tracker.RecvAll(&hrank, sizeof(hrank)) == sizeof(hrank), "ReConnectLink failure 10"); + r.sock.Create(); if (!r.sock.Connect(utils::SockAddr(hname.c_str(), hport))) { num_error += 1; r.sock.Close(); continue; @@ -357,8 +374,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) { } if (!match) all_links.push_back(r); } - // close listening sockets - sock_listen.Close(); + this->parent_index = -1; // setup tree links and ring structure tree_links.plinks.clear(); diff --git a/src/allreduce_base.h b/src/allreduce_base.h index 02c76d6f..b83cb0d0 100644 --- a/src/allreduce_base.h +++ b/src/allreduce_base.h @@ -521,6 +521,10 @@ class AllreduceBase : public IEngine { int world_size; // connect retry time int connect_retry; + // backdoor listening peer connection + utils::TCPSocket sock_listen; + // backdoor port + int port = 0; }; } // namespace engine } // namespace rabit diff --git a/src/allreduce_robust.cc b/src/allreduce_robust.cc index 43871e01..ce5a5616 100644 --- a/src/allreduce_robust.cc +++ b/src/allreduce_robust.cc @@ -51,12 +51,10 @@ void AllreduceRobust::Shutdown(void) { utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp), "Shutdown: check ack must return true"); - // one worker shutdowns and closes sockets while rest still run kCheckAck, - // seems has something to do with time-wait state in tcp connection, - // this cause rest workers checkandrecover and hang inf, - // https://github.com/dmlc/xgboost/pull/3818 - // TODO(Chen Qin): a fundamental fix for this - sleep(1); +#if defined (__APPLE__) + sleep(1); +#endif + AllreduceBase::Shutdown(); } /*! diff --git a/src/socket.h b/src/socket.h index d127113a..d6e6f597 100644 --- a/src/socket.h +++ b/src/socket.h @@ -276,13 +276,21 @@ class TCPSocket : public Socket{ * \brief enable/disable TCP keepalive * \param keepalive whether to set the keep alive option on */ - inline void SetKeepAlive(bool keepalive) { + void SetKeepAlive(bool keepalive) { int opt = static_cast(keepalive); if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE, reinterpret_cast(&opt), sizeof(opt)) < 0) { Socket::Error("SetKeepAlive"); } } + inline void SetLinger(int timeout = 0) { + struct linger sl; + sl.l_onoff = 1; /* non-zero value enables linger option in kernel */ + sl.l_linger = timeout; /* timeout interval in seconds */ + if (setsockopt(sockfd, SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)) == -1) { + Socket::Error("SO_LINGER"); + } + } /*! * \brief create the socket, call this before using socket * \param af domain diff --git a/test/Makefile b/test/Makefile index 5a6ba2aa..b1c329d3 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,4 +1,4 @@ -export MPICXX = mpicxx +MPICXX=../mpich/bin/mpicxx export LDFLAGS= -L../lib -pthread -lm export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -I ../dmlc-core/include -std=c++11 @@ -27,11 +27,17 @@ OBJ = $(RABIT_OBJ) speed_test.o model_recover.o local_recover.o lazy_recover.o MPIBIN = speed_test.mpi .PHONY: clean all lib mpi +.PHONY: lib all + all: $(BIN) + lib: - cd ..;make;cd - + cd ..;make clean;make;cd - + +.PHONY: mpi mpi: cd ..;make mpi;cd - + # programs speed_test.o: speed_test.cc ../include/rabit/*.h lib mpi model_recover.o: model_recover.cc ../include/rabit/*.h lib @@ -52,7 +58,8 @@ $(OBJ) : $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) $(MPIBIN) : - $(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) ../lib/librabit_mpi.so $(LDFLAGS) + $(MPICXX) $(CFLAGS) -I../mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc, $^) \ + ../lib/librabit_mpi.so $(LDFLAGS) -L../mpich/lib -Wl,-rpath,../mpich/lib -lmpi clean: $(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) *~ ../src/*~ diff --git a/test/lazy_recover.cc b/test/lazy_recover.cc index dd64294b..180e2e4b 100644 --- a/test/lazy_recover.cc +++ b/test/lazy_recover.cc @@ -118,7 +118,7 @@ int main(int argc, char *argv[]) { TestSum(&model, ntrial, r); printf("[%d] !!!TestSum pass, iter=%d\n", rank, r); rabit::LazyCheckPoint(&model); - printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r); + printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r); } rabit::Finalize(); return 0; diff --git a/test/local_recover.cc b/test/local_recover.cc index a63bd2f8..1f0b28b3 100644 --- a/test/local_recover.cc +++ b/test/local_recover.cc @@ -130,7 +130,7 @@ int main(int argc, char *argv[]) { TestSum(&model, &local, ntrial, r); printf("[%d] !!!TestSum pass, iter=%d\n", rank, r); rabit::CheckPoint(&model, &local); - printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r); + printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r); } rabit::Finalize(); return 0; diff --git a/test/local_recover.py b/test/local_recover.py index 315acbf0..6f7fae84 100755 --- a/test/local_recover.py +++ b/test/local_recover.py @@ -1,6 +1,11 @@ -#!/usr/bin/python +#!/usr/bin/env python3 + from __future__ import print_function from builtins import range + +import sys +sys.path.append('../python') + import rabit import numpy as np diff --git a/test/test.mk b/test/test.mk index 7b999507..b4b9afc4 100644 --- a/test/test.mk +++ b/test/test.mk @@ -1,29 +1,29 @@ # this is a makefile used to show testcases of rabit .PHONY: all -all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k +all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k lazy_recover_10_10k_die_hard lazy_recover_10_10k_die_same ringallreduce_10_10k pylocal_recover_10_10k # this experiment test recovery with actually process exit, use keepalive to keep program alive model_recover_10_10k: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 model_recover_10_10k_die_same: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 model_recover_10_10k_die_hard: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 local_recover_10_10k: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 pylocal_recover_10_10k: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 lazy_recover_10_10k_die_hard: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 lazy_recover_10_10k_die_same: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 ringallreduce_10_10k: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 100 rabit_reduce_ring_mincount=10 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 rabit_reduce_ring_mincount=10