Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[rabit harden] include osx in tests, address time_wait on port assignment, enable all tests #90

Merged
merged 21 commits into from
Apr 24, 2019
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,12 @@ recommonmark
recom
_*

#mpi lib
mpich/
mpich-3.2/

# Jetbrain
.idea
cmake-build-debug/

.vscode/

3 changes: 2 additions & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[submodule "dmlc-core"]
path = dmlc-core
url = https://github.com/dmlc/dmlc-core
url = https://github.com/chenqin/dmlc-core
branch = py3
51 changes: 36 additions & 15 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# disable sudo to use container based build
sudo: false
sudo: true

os:
- linux
- osx

osx_image: xcode9.3
chenqin marked this conversation as resolved.
Show resolved Hide resolved

dist: xenial

# Use Build Matrix to do lint and build seperately
env:
Expand All @@ -10,51 +17,65 @@ env:
- TASK=build
- TASK=mpi-build
- TASK=cmake-build
- TASK=test CXX=g++-4.8
- TASK=test CXX=g++

# dependent apt packages
dist: xenial
addons:
apt:
sources:
chenqin marked this conversation as resolved.
Show resolved Hide resolved
- llvm-toolchain-trusty-5.0
- ubuntu-toolchain-r-test
- george-edison55-precise-backports
packages:
- doxygen
- libopenmpi-dev
- wget
- git
- libcurl4-openssl-dev
- unzip
- python-numpy
- gcc-4.8
- g++-4.8
- openmpi-bin
- openmpi-common
- openssh-client
- openssh-server
- libopenmpi-dev
- python3
- python3-setuptools
homebrew:
packages:
- gcc49
- openssl
- libgit2
- python3
update: true

before_install:
- export TRAVIS=dmlc-core/scripts/travis/
- source ${TRAVIS}/travis_setup_env.sh
- ${TRAVIS}/travis_osx_install.sh

install:
- pip install --user cpplint pylint kubernetes urllib3
- if [[ ${TRAVIS_OS_NAME} == "linux" ]]; then sudo apt-get install python3-pip; fi
- if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then brew install python3; fi
- pip3 install cpplint pylint urllib3
- pip3 install websocket-client kubernetes

script: scripts/travis_script.sh


before_cache:
- ${TRAVIS}/travis_before_cache.sh


cache:
directories:
- ${HOME}/.cache/usr
- ${HOME}/.cache/pip
- mpich

before_cache:
- ${TRAVIS}/travis_before_cache.sh

after_success:
- tree build
- bash <(curl -s https://codecov.io/bash) -a '-o src/ src/*.c'

notifications:
# Emails are sent to the committer's git-configured email address by default,
email:
on_success: change
on_failure: always


43 changes: 26 additions & 17 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
OS := $(shell uname)

export MPICXX = mpicxx
export LDFLAGS= -Llib
export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11
export CFLAGS = -O3 $(WARNFLAGS) -I $(DMLC)/include -I include/
export LDFLAGS =-Llib

OS := $(shell uname)
#download mpi
#echo $(shell scripts/mpi.sh)

MPICXX=./mpich/bin/mpicxx

ifeq ($(OS), Darwin)
ifndef CC
export CC = $(if $(shell which clang), clang, gcc)
export CC = gcc-4.9
endif
ifndef CXX
export CXX = $(if $(shell which clang++), clang++, g++)
export CXX = g++-4.9
endif
else
ifeq ($(OS), FreeBSD)
ifndef CXX
export CXX = g++6
endif
export MPICXX = /usr/local/mpi/bin/mpicxx
export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6
else
# linux defaults
Expand All @@ -27,13 +30,10 @@ else
ifndef CXX
export CXX = g++
endif
LDFLAGS += -lrt
LDFLAGS +=-lrt
endif
endif

export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11
export CFLAGS = -O3 $(WARNFLAGS)

#----------------------------
# Settings for power and arm arch
#----------------------------
Expand Down Expand Up @@ -69,8 +69,10 @@ BPATH=.
MPIOBJ= $(BPATH)/engine_mpi.o
OBJ= $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o $(BPATH)/engine_empty.o $(BPATH)/engine_mock.o\
$(BPATH)/c_api.o $(BPATH)/engine_base.o
SLIB= lib/librabit.so lib/librabit_mpi.so lib/librabit_mock.so lib/librabit_base.so
ALIB= lib/librabit.a lib/librabit_mpi.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a
SLIB= lib/librabit.so lib/librabit_mock.so lib/librabit_base.so
ALIB= lib/librabit.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a
MPISLIB= lib/librabit_mpi.so
MPIALIB= lib/librabit_mpi.a
HEADERS=src/*.h include/rabit/*.h include/rabit/internal/*.h
DMLC=dmlc-core

Expand All @@ -95,22 +97,29 @@ lib/librabit_empty.a: $(BPATH)/engine_empty.o $(BPATH)/c_api.o
lib/librabit_mpi.a lib/librabit_mpi.so: $(MPIOBJ)

$(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) -I include/ -I $(DMLC)/include

$(MPIOBJ) :
$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) -I $(DMLC)/include
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )

$(ALIB):
ar cr $@ $+

$(SLIB) :
$(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)

$(MPIOBJ) :
$(MPICXX) -c $(CFLAGS) -I./mpich/include -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )

$(MPIALIB):
ar cr $@ $+

$(MPISLIB) :
$(MPICXX) $(CFLAGS) -I./mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) \
$(LDFLAGS) -L./mpich/lib -Wl,-rpath,./mpich/lib -lmpi

lint:
$(DMLC)/scripts/lint.py rabit $(LINT_LANG) src include

doc doxygen:
cd include; doxygen ../doc/Doxyfile; cd -

clean:
$(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~
$(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~
2 changes: 1 addition & 1 deletion dmlc-core
27 changes: 27 additions & 0 deletions scripts/mpi.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env bash
chenqin marked this conversation as resolved.
Show resolved Hide resolved

if [ -f mpich/lib/libmpich.so ]; then
echo "libmpich.so found -- nothing to build."
else
echo "Downloading mpich source."
wget http://www.mpich.org/static/downloads/3.2/mpich-3.2.tar.gz
tar xfz mpich-3.2.tar.gz
rm mpich-3.2.tar.gz*
echo "configuring and building mpich."
cd mpich-3.2
#CC=gcc CXX=g++ CFLAGS=-m64 CXXFLAGS=-m64 FFLAGS=-m64
./configure \
--prefix=`pwd`/../mpich \
--enable-static=false \
--enable-alloca=true \
--disable-long-double \
--enable-threads=single \
--enable-fortran=no \
--enable-fast=all \
--enable-g=none \
--enable-timing=none \
--enable-cxx
make -j4
make install
cd -
fi
1 change: 1 addition & 0 deletions scripts/travis_script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ if [ ${TASK} == "build" ]; then
fi

if [ ${TASK} == "mpi-build" ]; then
./scripts/mpi.sh
cd test
make mpi && make speed_test.mpi || exit -1
fi
Expand Down
32 changes: 24 additions & 8 deletions src/allreduce_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,8 @@ void AllreduceBase::Shutdown(void) {
utils::TCPSocket tracker = this->ConnectTracker();
tracker.SendStr(std::string("shutdown"));
tracker.Close();
// close listening sockets
sock_listen.Close();
utils::TCPSocket::Finalize();
}
void AllreduceBase::TrackerPrint(const std::string &msg) {
Expand Down Expand Up @@ -271,12 +273,26 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
"ReConnectLink failure 4");
Assert(tracker.RecvAll(&next_rank, sizeof(next_rank)) == sizeof(next_rank),
"ReConnectLink failure 4");
// create listening socket
utils::TCPSocket sock_listen;
sock_listen.Create();
int port = sock_listen.TryBindHost(slave_port, slave_port + nport_trial);
utils::Check(port != -1, "ReConnectLink fail to bind the ports specified");
sock_listen.Listen();

if (sock_listen == INVALID_SOCKET || sock_listen.AtMark()) {
if (!sock_listen.IsClosed()) {
sock_listen.Close();
}
// create listening socket
sock_listen.Create();
sock_listen.SetKeepAlive(true);
// http://deepix.github.io/2016/10/21/tcprst.html
sock_listen.SetLinger(0);
// [slave_port, slave_port+1 .... slave_port + newrank ...slave_port + nport_trial)
// work around processes bind to same port without set reuse option,
// start explore from slave_port + newrank towards end
port = sock_listen.TryBindHost(slave_port+ newrank%nport_trial, slave_port + nport_trial);
// if no port bindable, explore first half of range
if (port == -1) sock_listen.TryBindHost(slave_port, newrank% nport_trial + slave_port);

utils::Check(port != -1, "ReConnectLink fail to bind the ports specified");
sock_listen.Listen();
}

// get number of to connect and number of to accept nodes from tracker
int num_conn, num_accept, num_error = 1;
Expand Down Expand Up @@ -311,6 +327,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
"ReConnectLink failure 9");
Assert(tracker.RecvAll(&hrank, sizeof(hrank)) == sizeof(hrank),
"ReConnectLink failure 10");

r.sock.Create();
if (!r.sock.Connect(utils::SockAddr(hname.c_str(), hport))) {
num_error += 1; r.sock.Close(); continue;
Expand Down Expand Up @@ -357,8 +374,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
}
if (!match) all_links.push_back(r);
}
// close listening sockets
sock_listen.Close();

this->parent_index = -1;
// setup tree links and ring structure
tree_links.plinks.clear();
Expand Down
4 changes: 4 additions & 0 deletions src/allreduce_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,10 @@ class AllreduceBase : public IEngine {
int world_size;
// connect retry time
int connect_retry;
// backdoor listening peer connection
utils::TCPSocket sock_listen;
// backdoor port
int port = 0;
};
} // namespace engine
} // namespace rabit
Expand Down
10 changes: 4 additions & 6 deletions src/allreduce_robust.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,10 @@ void AllreduceRobust::Shutdown(void) {
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp),
"Shutdown: check ack must return true");

// one worker shutdowns and closes sockets while rest still run kCheckAck,
// seems has something to do with time-wait state in tcp connection,
// this cause rest workers checkandrecover and hang inf,
// https://github.com/dmlc/xgboost/pull/3818
// TODO(Chen Qin): a fundamental fix for this
sleep(1);
#if defined (__APPLE__)
sleep(1);
#endif

AllreduceBase::Shutdown();
}
/*!
Expand Down
10 changes: 9 additions & 1 deletion src/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -276,13 +276,21 @@ class TCPSocket : public Socket{
* \brief enable/disable TCP keepalive
* \param keepalive whether to set the keep alive option on
*/
inline void SetKeepAlive(bool keepalive) {
void SetKeepAlive(bool keepalive) {
int opt = static_cast<int>(keepalive);
if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE,
reinterpret_cast<char*>(&opt), sizeof(opt)) < 0) {
Socket::Error("SetKeepAlive");
}
}
inline void SetLinger(int timeout = 0) {
struct linger sl;
sl.l_onoff = 1; /* non-zero value enables linger option in kernel */
sl.l_linger = timeout; /* timeout interval in seconds */
if (setsockopt(sockfd, SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)) == -1) {
Socket::Error("SO_LINGER");
}
}
/*!
* \brief create the socket, call this before using socket
* \param af domain
Expand Down
13 changes: 10 additions & 3 deletions test/Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
export MPICXX = mpicxx
MPICXX=../mpich/bin/mpicxx
export LDFLAGS= -L../lib -pthread -lm
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -I ../dmlc-core/include -std=c++11

Expand Down Expand Up @@ -27,11 +27,17 @@ OBJ = $(RABIT_OBJ) speed_test.o model_recover.o local_recover.o lazy_recover.o
MPIBIN = speed_test.mpi
.PHONY: clean all lib mpi

.PHONY: lib all

all: $(BIN)

lib:
cd ..;make;cd -
cd ..;make clean;make;cd -

.PHONY: mpi
mpi:
cd ..;make mpi;cd -

# programs
speed_test.o: speed_test.cc ../include/rabit/*.h lib mpi
model_recover.o: model_recover.cc ../include/rabit/*.h lib
Expand All @@ -52,7 +58,8 @@ $(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )

$(MPIBIN) :
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) ../lib/librabit_mpi.so $(LDFLAGS)
$(MPICXX) $(CFLAGS) -I../mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc, $^) \
../lib/librabit_mpi.so $(LDFLAGS) -L../mpich/lib -Wl,-rpath,../mpich/lib -lmpi

clean:
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) *~ ../src/*~
Loading