From 0ecc511e193235195d0000c645e8cc94b907e133 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Sat, 6 Apr 2019 20:47:39 -0700 Subject: [PATCH 01/21] [cleanup] disabled include tests against xgb master --- .gitignore | 2 +- .travis.yml | 4 +++- scripts/travis_script.sh | 40 +++++++++++++++++++++++++++++++++++++++- src/allreduce_base.cc | 1 + src/allreduce_robust.cc | 7 +++++-- 5 files changed, 49 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 94a8c105..3a862372 100644 --- a/.gitignore +++ b/.gitignore @@ -40,5 +40,5 @@ _* # Jetbrain .idea cmake-build-debug/ - +.vscode/ diff --git a/.travis.yml b/.travis.yml index ba01f423..93d763a2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ # disable sudo to use container based build -sudo: false +sudo: true # Use Build Matrix to do lint and build seperately env: @@ -11,6 +11,8 @@ env: - TASK=mpi-build - TASK=cmake-build - TASK=test CXX=g++-4.8 + #- TASK=xgb-cmake + #- TASK=xgb-java-tests # dependent apt packages dist: xenial diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index 12e78f65..8bc4102a 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -30,4 +30,42 @@ if [ ${TASK} == "cmake-build" ]; then cd build cmake .. make all || exit -1 -fi \ No newline at end of file +fi + +if [ ${TASK} == "xgb-cmake" ]; then + git clone --recursive https://github.com/dmlc/xgboost ../xgboost + rm -rf ../xgboost/rabit + cp -r ../rabit ../xgboost/rabit + + set -e + # Build gtest via cmake + wget -nc https://github.com/google/googletest/archive/release-1.7.0.zip + unzip -n release-1.7.0.zip + mv googletest-release-1.7.0 gtest && cd gtest + cmake . && make + mkdir lib && mv libgtest.a lib + cd .. + rm -rf release-1.7.0.zip + + # Build/test + rm -rf build + mkdir build && cd build + PLUGINS="-DPLUGIN_LZ4=ON -DPLUGIN_DENSE_PARSER=ON" + cmake .. -DGOOGLE_TEST=ON -DGTEST_ROOT=$PWD/../gtest/ ${PLUGINS} + make + ./testxgboost + cd .. 
+ rm -rf build +fi + +if [ ${TASK} == "xgb-java-tests" ]; then + git clone --recursive https://github.com/dmlc/xgboost ../xgboost + rm -rf ../xgboost/rabit + cp -r ../rabit ../xgboost/rabit + echo "MAVEN_OPTS='-Xmx2g -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=512m -Dorg.slf4j.simpleLogger.defaultLogLevel=error'" > ~/.mavenrc + cd ../xgboost + set -e + cd jvm-packages + mvn -q clean install -DskipTests -Dmaven.test.skip + mvn -q test || exit -1 +fi diff --git a/src/allreduce_base.cc b/src/allreduce_base.cc index 143db6e1..09ca58a3 100644 --- a/src/allreduce_base.cc +++ b/src/allreduce_base.cc @@ -207,6 +207,7 @@ utils::TCPSocket AllreduceBase::ConnectTracker(void) const { utils::Socket::Error("Connect"); } else { fprintf(stderr, "retry connect to ip(retry time %d): [%s]\n", retry, tracker_uri.c_str()); + #if defined(_MSC_VER) || defined (__MINGW32__) Sleep(retry << 1); #else diff --git a/src/allreduce_robust.cc b/src/allreduce_robust.cc index 43871e01..342e19a4 100644 --- a/src/allreduce_robust.cc +++ b/src/allreduce_robust.cc @@ -50,13 +50,16 @@ void AllreduceRobust::Shutdown(void) { // execute check ack step, load happens here utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp), "Shutdown: check ack must return true"); - // one worker shutdowns and closes sockets while rest still run kCheckAck, // seems has something to do with time-wait state in tcp connection, // this cause rest workers checkandrecover and hang inf, // https://github.com/dmlc/xgboost/pull/3818 // TODO(Chen Qin): a fundamental fix for this - sleep(1); +#if defined(_MSC_VER) || defined (__MINGW32__) + Sleep(1); +#else + sleep(1); +#endif AllreduceBase::Shutdown(); } /*! 
From a694b83c11fdcbd6ee16c7028b84df7c96d39a8a Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Tue, 9 Apr 2019 11:41:14 -0700 Subject: [PATCH 02/21] enable xgb-tests use chenqin/xgboost:master with updated path --- .travis.yml | 6 +++--- scripts/travis_script.sh | 10 +++++----- src/allreduce_base.cc | 1 - 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index 93d763a2..8f38817a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ # disable sudo to use container based build -sudo: true +sudo: false # Use Build Matrix to do lint and build seperately env: @@ -11,8 +11,8 @@ env: - TASK=mpi-build - TASK=cmake-build - TASK=test CXX=g++-4.8 - #- TASK=xgb-cmake - #- TASK=xgb-java-tests + - TASK=xgb-cmake + - TASK=xgb-java-tests # dependent apt packages dist: xenial diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index 8bc4102a..8e655840 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -33,10 +33,10 @@ if [ ${TASK} == "cmake-build" ]; then fi if [ ${TASK} == "xgb-cmake" ]; then - git clone --recursive https://github.com/dmlc/xgboost ../xgboost + git clone --recursive https://github.com/chenqin/xgboost ../xgboost rm -rf ../xgboost/rabit - cp -r ../rabit ../xgboost/rabit - + ln -s ../rabit ../xgboost/rabit + cd ../xgboost set -e # Build gtest via cmake wget -nc https://github.com/google/googletest/archive/release-1.7.0.zip @@ -59,9 +59,9 @@ if [ ${TASK} == "xgb-cmake" ]; then fi if [ ${TASK} == "xgb-java-tests" ]; then - git clone --recursive https://github.com/dmlc/xgboost ../xgboost + git clone --recursive https://github.com/chenqin/xgboost ../xgboost rm -rf ../xgboost/rabit - cp -r ../rabit ../xgboost/rabit + ln -s ../rabit ../xgboost/rabit echo "MAVEN_OPTS='-Xmx2g -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=512m -Dorg.slf4j.simpleLogger.defaultLogLevel=error'" > ~/.mavenrc cd ../xgboost set -e diff --git a/src/allreduce_base.cc b/src/allreduce_base.cc index 09ca58a3..143db6e1 100644 --- 
a/src/allreduce_base.cc +++ b/src/allreduce_base.cc @@ -207,7 +207,6 @@ utils::TCPSocket AllreduceBase::ConnectTracker(void) const { utils::Socket::Error("Connect"); } else { fprintf(stderr, "retry connect to ip(retry time %d): [%s]\n", retry, tracker_uri.c_str()); - #if defined(_MSC_VER) || defined (__MINGW32__) Sleep(retry << 1); #else From 17a31e4e3890a01ae5de63c6a35979cd5fe413fe Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Tue, 9 Apr 2019 13:08:26 -0700 Subject: [PATCH 03/21] include env setup on xgb related travis tests --- .travis.yml | 1 + scripts/travis_script.sh | 3 +++ 2 files changed, 4 insertions(+) diff --git a/.travis.yml b/.travis.yml index 8f38817a..fd20babe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -36,6 +36,7 @@ addons: before_install: - export TRAVIS=dmlc-core/scripts/travis/ + - export PYTHONPATH=${PYTHONPATH}:${PWD}/python-package - source ${TRAVIS}/travis_setup_env.sh install: diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index 8e655840..49a99d15 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -37,6 +37,7 @@ if [ ${TASK} == "xgb-cmake" ]; then rm -rf ../xgboost/rabit ln -s ../rabit ../xgboost/rabit cd ../xgboost + source tests/travis/setup.sh set -e # Build gtest via cmake wget -nc https://github.com/google/googletest/archive/release-1.7.0.zip @@ -64,6 +65,8 @@ if [ ${TASK} == "xgb-java-tests" ]; then ln -s ../rabit ../xgboost/rabit echo "MAVEN_OPTS='-Xmx2g -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=512m -Dorg.slf4j.simpleLogger.defaultLogLevel=error'" > ~/.mavenrc cd ../xgboost + source tests/travis/setup.sh + set -e cd jvm-packages mvn -q clean install -DskipTests -Dmaven.test.skip From f19d62ecb249a5f875c9946f93f0b3b6395045e2 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Tue, 9 Apr 2019 13:35:25 -0700 Subject: [PATCH 04/21] port packages from xgb --- .travis.yml | 51 +++++++++++++++++++++++++++++++++------- scripts/travis_script.sh | 28 ---------------------- 2 files changed, 43 
insertions(+), 36 deletions(-) diff --git a/.travis.yml b/.travis.yml index fd20babe..06be656b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,11 @@ # disable sudo to use container based build -sudo: false +sudo: true + +os: + - linux + - osx + +osx_image: xcode9.3 # Use Build Matrix to do lint and build seperately env: @@ -11,13 +17,28 @@ env: - TASK=mpi-build - TASK=cmake-build - TASK=test CXX=g++-4.8 - - TASK=xgb-cmake - TASK=xgb-java-tests +matrix: + exclude: + - os: osx + env: TASK=lint LINT_LANG=cpp + - os: osx + env: TASK=lint LINT_LANG=python + - os: osx + env: TASK=doc + - os: osx + env: TASK=test CXX=g++-4.8 + - os: osx + env: TASK=mpi-build + # dependent apt packages -dist: xenial addons: apt: + sources: + - llvm-toolchain-trusty-5.0 + - ubuntu-toolchain-r-test + - george-edison55-precise-backports packages: - doxygen - libopenmpi-dev @@ -33,10 +54,22 @@ addons: - openssh-client - openssh-server - libopenmpi-dev + - clang + - clang-tidy-5.0 + - cmake-data + - graphviz + homebrew: + packages: + - gcc@7 + - graphviz + - openssl + - libgit2 + update: true before_install: - export TRAVIS=dmlc-core/scripts/travis/ - export PYTHONPATH=${PYTHONPATH}:${PWD}/python-package + - echo "MAVEN_OPTS='-Xmx2g -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=512m -Dorg.slf4j.simpleLogger.defaultLogLevel=error'" > ~/.mavenrc - source ${TRAVIS}/travis_setup_env.sh install: @@ -44,15 +77,17 @@ install: script: scripts/travis_script.sh - -before_cache: - - ${TRAVIS}/travis_before_cache.sh - - cache: directories: - ${HOME}/.cache/usr + - ${HOME}/.cache/pip + +before_cache: + - ${TRAVIS}/travis_before_cache.sh +after_success: + - tree build + - bash <(curl -s https://codecov.io/bash) -a '-o src/ src/*.c' notifications: # Emails are sent to the committer's git-configured email address by default, diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index 49a99d15..7ec7fac9 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -32,38 +32,10 @@ 
if [ ${TASK} == "cmake-build" ]; then make all || exit -1 fi -if [ ${TASK} == "xgb-cmake" ]; then - git clone --recursive https://github.com/chenqin/xgboost ../xgboost - rm -rf ../xgboost/rabit - ln -s ../rabit ../xgboost/rabit - cd ../xgboost - source tests/travis/setup.sh - set -e - # Build gtest via cmake - wget -nc https://github.com/google/googletest/archive/release-1.7.0.zip - unzip -n release-1.7.0.zip - mv googletest-release-1.7.0 gtest && cd gtest - cmake . && make - mkdir lib && mv libgtest.a lib - cd .. - rm -rf release-1.7.0.zip - - # Build/test - rm -rf build - mkdir build && cd build - PLUGINS="-DPLUGIN_LZ4=ON -DPLUGIN_DENSE_PARSER=ON" - cmake .. -DGOOGLE_TEST=ON -DGTEST_ROOT=$PWD/../gtest/ ${PLUGINS} - make - ./testxgboost - cd .. - rm -rf build -fi - if [ ${TASK} == "xgb-java-tests" ]; then git clone --recursive https://github.com/chenqin/xgboost ../xgboost rm -rf ../xgboost/rabit ln -s ../rabit ../xgboost/rabit - echo "MAVEN_OPTS='-Xmx2g -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=512m -Dorg.slf4j.simpleLogger.defaultLogLevel=error'" > ~/.mavenrc cd ../xgboost source tests/travis/setup.sh From e6ae16856ac49a90859084cc196af3cc50f4267e Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Sat, 6 Apr 2019 20:47:39 -0700 Subject: [PATCH 05/21] [cleanup] include java regression tests against xgb master enable xgb-tests use chenqin/xgboost:master with updated path port packages from xgb enable test on osx --- .travis.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 06be656b..d7c0ed4b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,3 @@ -# disable sudo to use container based build sudo: true os: @@ -16,7 +15,7 @@ env: - TASK=build - TASK=mpi-build - TASK=cmake-build - - TASK=test CXX=g++-4.8 + - TASK=test CXX=g++ - TASK=xgb-java-tests matrix: @@ -27,8 +26,6 @@ matrix: env: TASK=lint LINT_LANG=python - os: osx env: TASK=doc - - os: osx - env: TASK=test CXX=g++-4.8 - os: osx env: TASK=mpi-build @@ 
-95,4 +92,3 @@ notifications: on_success: change on_failure: always - From 42553e3f930370cbd748b9a9a4e5f73bc2e6315e Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Tue, 9 Apr 2019 17:06:41 -0700 Subject: [PATCH 06/21] per feedback, clean up packages remove xgb java tests --- .travis.yml | 16 +--------------- scripts/travis_script.sh | 12 ------------ 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/.travis.yml b/.travis.yml index d7c0ed4b..70c41be8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,16 +16,9 @@ env: - TASK=mpi-build - TASK=cmake-build - TASK=test CXX=g++ - - TASK=xgb-java-tests matrix: exclude: - - os: osx - env: TASK=lint LINT_LANG=cpp - - os: osx - env: TASK=lint LINT_LANG=python - - os: osx - env: TASK=doc - os: osx env: TASK=mpi-build @@ -50,15 +43,9 @@ addons: - openmpi-common - openssh-client - openssh-server - - libopenmpi-dev - - clang - - clang-tidy-5.0 - - cmake-data - - graphviz homebrew: packages: - - gcc@7 - - graphviz + - gcc48 - openssl - libgit2 update: true @@ -66,7 +53,6 @@ addons: before_install: - export TRAVIS=dmlc-core/scripts/travis/ - export PYTHONPATH=${PYTHONPATH}:${PWD}/python-package - - echo "MAVEN_OPTS='-Xmx2g -XX:MaxPermSize=1024m -XX:ReservedCodeCacheSize=512m -Dorg.slf4j.simpleLogger.defaultLogLevel=error'" > ~/.mavenrc - source ${TRAVIS}/travis_setup_env.sh install: diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index 7ec7fac9..d4dde11a 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -32,15 +32,3 @@ if [ ${TASK} == "cmake-build" ]; then make all || exit -1 fi -if [ ${TASK} == "xgb-java-tests" ]; then - git clone --recursive https://github.com/chenqin/xgboost ../xgboost - rm -rf ../xgboost/rabit - ln -s ../rabit ../xgboost/rabit - cd ../xgboost - source tests/travis/setup.sh - - set -e - cd jvm-packages - mvn -q clean install -DskipTests -Dmaven.test.skip - mvn -q test || exit -1 -fi From 9d0e23543ab46d41e8d7aefcce56f84bfea8e7d3 Mon Sep 17 00:00:00 2001 From: Chen 
Qin Date: Thu, 11 Apr 2019 14:37:39 -0700 Subject: [PATCH 07/21] address port binding issue --- src/allreduce_base.cc | 10 +++++++++- test/lazy_recover.cc | 2 +- test/local_recover.cc | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/allreduce_base.cc b/src/allreduce_base.cc index 143db6e1..4a8345b9 100644 --- a/src/allreduce_base.cc +++ b/src/allreduce_base.cc @@ -274,7 +274,14 @@ void AllreduceBase::ReConnectLinks(const char *cmd) { // create listening socket utils::TCPSocket sock_listen; sock_listen.Create(); - int port = sock_listen.TryBindHost(slave_port, slave_port + nport_trial); + + // [slave_port, slave_port+1 .... slave_port + newrank ...slave_port + nport_trial) + // work around processes bind to same port without set reuse option, + // start explore from slave_port + newrank towards end + int port = sock_listen.TryBindHost(slave_port+ newrank%nport_trial, slave_port + nport_trial); + // if no port bindable, explore first half of range + if (port == -1) sock_listen.TryBindHost(slave_port, newrank% nport_trial + slave_port); + utils::Check(port != -1, "ReConnectLink fail to bind the ports specified"); sock_listen.Listen(); @@ -311,6 +318,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) { "ReConnectLink failure 9"); Assert(tracker.RecvAll(&hrank, sizeof(hrank)) == sizeof(hrank), "ReConnectLink failure 10"); + r.sock.Create(); if (!r.sock.Connect(utils::SockAddr(hname.c_str(), hport))) { num_error += 1; r.sock.Close(); continue; diff --git a/test/lazy_recover.cc b/test/lazy_recover.cc index dd64294b..180e2e4b 100644 --- a/test/lazy_recover.cc +++ b/test/lazy_recover.cc @@ -118,7 +118,7 @@ int main(int argc, char *argv[]) { TestSum(&model, ntrial, r); printf("[%d] !!!TestSum pass, iter=%d\n", rank, r); rabit::LazyCheckPoint(&model); - printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r); + printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r); } rabit::Finalize(); return 0; diff --git a/test/local_recover.cc 
b/test/local_recover.cc index a63bd2f8..1f0b28b3 100644 --- a/test/local_recover.cc +++ b/test/local_recover.cc @@ -130,7 +130,7 @@ int main(int argc, char *argv[]) { TestSum(&model, &local, ntrial, r); printf("[%d] !!!TestSum pass, iter=%d\n", rank, r); rabit::CheckPoint(&model, &local); - printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r); + printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r); } rabit::Finalize(); return 0; From 6829dabacf0122e18b847d56639b97d2fd816ac3 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Thu, 11 Apr 2019 22:42:16 -0700 Subject: [PATCH 08/21] brew can't find gcc48 use gcc49, enable mpi test on osx --- .travis.yml | 8 ++------ Makefile | 15 ++++++++------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/.travis.yml b/.travis.yml index 70c41be8..a84e4009 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,11 +17,6 @@ env: - TASK=cmake-build - TASK=test CXX=g++ -matrix: - exclude: - - os: osx - env: TASK=mpi-build - # dependent apt packages addons: apt: @@ -45,9 +40,10 @@ addons: - openssh-server homebrew: packages: - - gcc48 + - gcc49 - openssl - libgit2 + - openmpi update: true before_install: diff --git a/Makefile b/Makefile index 2af5b9b2..24d59c5b 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ OS := $(shell uname) -export MPICXX = mpicxx -export LDFLAGS= -Llib - +export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11 +export CFLAGS = -O3 $(WARNFLAGS) +export LDFLAGS =-Llib OS := $(shell uname) ifeq ($(OS), Darwin) @@ -12,6 +12,9 @@ ifeq ($(OS), Darwin) ifndef CXX export CXX = $(if $(shell which clang++), clang++, g++) endif + export MPICXX=/usr/local/opt/openmpi/bin/mpicxx + export LDFLAGS+=-L/usr/local/opt/openmpi/lib + export CFLAGS += -I/usr/local/opt/openmpi/include else ifeq ($(OS), FreeBSD) ifndef CXX @@ -20,6 +23,7 @@ else export MPICXX = /usr/local/mpi/bin/mpicxx export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6 else + export MPICXX = mpicxx # linux defaults ifndef CC 
export CC = gcc @@ -27,13 +31,10 @@ else ifndef CXX export CXX = g++ endif - LDFLAGS += -lrt + LDFLAGS +=-lrt endif endif -export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11 -export CFLAGS = -O3 $(WARNFLAGS) - #---------------------------- # Settings for power and arm arch #---------------------------- From e1768dea779ca39f2d7b0716d3f4c82b74ceec75 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Fri, 12 Apr 2019 16:02:36 -0700 Subject: [PATCH 09/21] split mpi build from reset, add mpi download for osx env --- .gitignore | 4 ++++ .travis.yml | 7 ++----- Makefile | 40 ++++++++++++++++++++++++---------------- scripts/mpi.sh | 27 +++++++++++++++++++++++++++ scripts/travis_script.sh | 28 ++++++++++++++++++++++++++-- 5 files changed, 83 insertions(+), 23 deletions(-) create mode 100755 scripts/mpi.sh diff --git a/.gitignore b/.gitignore index 3a862372..517cce2b 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,10 @@ recommonmark recom _* +#mpi lib +mpich/ +mpich-3.2/ + # Jetbrain .idea cmake-build-debug/ diff --git a/.travis.yml b/.travis.yml index a84e4009..9f5f2ed4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,4 @@ -sudo: true +sudo: false os: - linux @@ -26,7 +26,6 @@ addons: - george-edison55-precise-backports packages: - doxygen - - libopenmpi-dev - wget - git - libcurl4-openssl-dev @@ -34,8 +33,6 @@ addons: - python-numpy - gcc-4.8 - g++-4.8 - - openmpi-bin - - openmpi-common - openssh-client - openssh-server homebrew: @@ -43,7 +40,6 @@ addons: - gcc49 - openssl - libgit2 - - openmpi update: true before_install: @@ -60,6 +56,7 @@ cache: directories: - ${HOME}/.cache/usr - ${HOME}/.cache/pip + - mpich before_cache: - ${TRAVIS}/travis_before_cache.sh diff --git a/Makefile b/Makefile index 24d59c5b..5599d4f3 100644 --- a/Makefile +++ b/Makefile @@ -1,29 +1,28 @@ OS := $(shell uname) export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11 -export CFLAGS = -O3 $(WARNFLAGS) +export CFLAGS = -O3 
$(WARNFLAGS) -I $(DMLC)/include -I include/ export LDFLAGS =-Llib -OS := $(shell uname) + +#download mpi +#echo $(shell scripts/mpi.sh) + +MPICXX=./mpich/bin/mpicxx ifeq ($(OS), Darwin) ifndef CC - export CC = $(if $(shell which clang), clang, gcc) + export CC = gcc-4.9 endif ifndef CXX - export CXX = $(if $(shell which clang++), clang++, g++) + export CXX = g++-4.9 endif - export MPICXX=/usr/local/opt/openmpi/bin/mpicxx - export LDFLAGS+=-L/usr/local/opt/openmpi/lib - export CFLAGS += -I/usr/local/opt/openmpi/include else ifeq ($(OS), FreeBSD) ifndef CXX export CXX = g++6 endif - export MPICXX = /usr/local/mpi/bin/mpicxx export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6 else - export MPICXX = mpicxx # linux defaults ifndef CC export CC = gcc @@ -70,8 +69,10 @@ BPATH=. MPIOBJ= $(BPATH)/engine_mpi.o OBJ= $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o $(BPATH)/engine_empty.o $(BPATH)/engine_mock.o\ $(BPATH)/c_api.o $(BPATH)/engine_base.o -SLIB= lib/librabit.so lib/librabit_mpi.so lib/librabit_mock.so lib/librabit_base.so -ALIB= lib/librabit.a lib/librabit_mpi.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a +SLIB= lib/librabit.so lib/librabit_mock.so lib/librabit_base.so +ALIB= lib/librabit.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a +MPISLIB= lib/librabit_mpi.so +MPIALIB= lib/librabit_mpi.a HEADERS=src/*.h include/rabit/*.h include/rabit/internal/*.h DMLC=dmlc-core @@ -96,10 +97,7 @@ lib/librabit_empty.a: $(BPATH)/engine_empty.o $(BPATH)/c_api.o lib/librabit_mpi.a lib/librabit_mpi.so: $(MPIOBJ) $(OBJ) : - $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) -I include/ -I $(DMLC)/include - -$(MPIOBJ) : - $(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) -I $(DMLC)/include + $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) $(ALIB): ar cr $@ $+ @@ -107,6 +105,16 @@ $(ALIB): $(SLIB) : $(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) 
$(LDFLAGS) +$(MPIOBJ) : + $(MPICXX) -c $(CFLAGS) -I./mpich/include -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) + +$(MPIALIB): + ar cr $@ $+ + +$(MPISLIB) : + $(MPICXX) $(CFLAGS) -I./mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) \ + $(LDFLAGS) -L./mpich/lib -Wl,-rpath,./mpich/lib -lmpi + lint: $(DMLC)/scripts/lint.py rabit $(LINT_LANG) src include @@ -114,4 +122,4 @@ doc doxygen: cd include; doxygen ../doc/Doxyfile; cd - clean: - $(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~ + $(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~ diff --git a/scripts/mpi.sh b/scripts/mpi.sh new file mode 100755 index 00000000..b1e70be9 --- /dev/null +++ b/scripts/mpi.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +if [ -f mpich/lib/libmpich.so ]; then + echo "libmpich.so found -- nothing to build." +else + echo "Downloading mpich source." + wget http://www.mpich.org/static/downloads/3.2/mpich-3.2.tar.gz + tar xfz mpich-3.2.tar.gz + rm mpich-3.2.tar.gz* + echo "configuring and building mpich." + cd mpich-3.2 + #CC=gcc CXX=g++ CFLAGS=-m64 CXXFLAGS=-m64 FFLAGS=-m64 + ./configure \ + --prefix=`pwd`/../mpich \ + --enable-static=false \ + --enable-alloca=true \ + --disable-long-double \ + --enable-threads=single \ + --enable-fortran=no \ + --enable-fast=all \ + --enable-g=none \ + --enable-timing=none \ + --enable-cxx + make -j4 + make install + cd - +fi \ No newline at end of file diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index d4dde11a..cc7ab6cf 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -15,6 +15,31 @@ if [ ${TASK} == "build" ]; then fi if [ ${TASK} == "mpi-build" ]; then + if [ -f mpich/lib/libmpich.so ]; then + echo "libmpich.so found -- nothing to build." + else + echo "Downloading mpich source." 
+ wget http://www.mpich.org/static/downloads/3.2/mpich-3.2.tar.gz + tar xfz mpich-3.2.tar.gz + rm mpich-3.2.tar.gz* + echo "configuring and building mpich." + cd mpich-3.2 + #CC=gcc CXX=g++ CFLAGS=-m64 CXXFLAGS=-m64 FFLAGS=-m64 + ./configure \ + --prefix=`pwd`/../mpich \ + --enable-static=false \ + --enable-alloca=true \ + --disable-long-double \ + --enable-threads=single \ + --enable-fortran=no \ + --enable-fast=all \ + --enable-g=none \ + --enable-timing=none \ + --enable-cxx + make -j4 + make install + cd - + fi cd test make mpi && make speed_test.mpi || exit -1 fi @@ -30,5 +55,4 @@ if [ ${TASK} == "cmake-build" ]; then cd build cmake .. make all || exit -1 -fi - +fi \ No newline at end of file From ad6f9464ab3f1fcf72d21fd307753e9c03aa3ee6 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Sat, 13 Apr 2019 18:43:57 -0700 Subject: [PATCH 10/21] clean up mpi test --- scripts/travis_script.sh | 26 +------------------------- test/Makefile | 5 +++-- 2 files changed, 4 insertions(+), 27 deletions(-) diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index cc7ab6cf..338e3781 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -15,31 +15,7 @@ if [ ${TASK} == "build" ]; then fi if [ ${TASK} == "mpi-build" ]; then - if [ -f mpich/lib/libmpich.so ]; then - echo "libmpich.so found -- nothing to build." - else - echo "Downloading mpich source." - wget http://www.mpich.org/static/downloads/3.2/mpich-3.2.tar.gz - tar xfz mpich-3.2.tar.gz - rm mpich-3.2.tar.gz* - echo "configuring and building mpich." 
- cd mpich-3.2 - #CC=gcc CXX=g++ CFLAGS=-m64 CXXFLAGS=-m64 FFLAGS=-m64 - ./configure \ - --prefix=`pwd`/../mpich \ - --enable-static=false \ - --enable-alloca=true \ - --disable-long-double \ - --enable-threads=single \ - --enable-fortran=no \ - --enable-fast=all \ - --enable-g=none \ - --enable-timing=none \ - --enable-cxx - make -j4 - make install - cd - - fi + ./scripts/mpi.sh cd test make mpi && make speed_test.mpi || exit -1 fi diff --git a/test/Makefile b/test/Makefile index 5a6ba2aa..55452d1f 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,4 +1,4 @@ -export MPICXX = mpicxx +MPICXX=../mpich/bin/mpicxx export LDFLAGS= -L../lib -pthread -lm export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -I ../dmlc-core/include -std=c++11 @@ -52,7 +52,8 @@ $(OBJ) : $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) $(MPIBIN) : - $(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) ../lib/librabit_mpi.so $(LDFLAGS) + $(MPICXX) $(CFLAGS) -I../mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc, $^) \ + ../lib/librabit_mpi.so $(LDFLAGS) -L../mpich/lib -Wl,-rpath,../mpich/lib -lmpi clean: $(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) *~ ../src/*~ From 112f0462d42b9739d534d779737e90b5a59cd5aa Mon Sep 17 00:00:00 2001 From: unknown Date: Sun, 14 Apr 2019 09:26:40 -0700 Subject: [PATCH 11/21] keep sock_listen alive, force close not time_wait --- src/allreduce_base.cc | 34 +++++++++++++++++++++------------- src/allreduce_base.h | 4 ++++ src/allreduce_robust.cc | 10 ---------- src/socket.h | 10 +++++++++- 4 files changed, 34 insertions(+), 24 deletions(-) diff --git a/src/allreduce_base.cc b/src/allreduce_base.cc index 4a8345b9..60351699 100644 --- a/src/allreduce_base.cc +++ b/src/allreduce_base.cc @@ -131,6 +131,8 @@ void AllreduceBase::Shutdown(void) { utils::TCPSocket tracker = this->ConnectTracker(); tracker.SendStr(std::string("shutdown")); tracker.Close(); + // close listening sockets + sock_listen.Close(); 
utils::TCPSocket::Finalize(); } void AllreduceBase::TrackerPrint(const std::string &msg) { @@ -271,19 +273,26 @@ void AllreduceBase::ReConnectLinks(const char *cmd) { "ReConnectLink failure 4"); Assert(tracker.RecvAll(&next_rank, sizeof(next_rank)) == sizeof(next_rank), "ReConnectLink failure 4"); - // create listening socket - utils::TCPSocket sock_listen; - sock_listen.Create(); - // [slave_port, slave_port+1 .... slave_port + newrank ...slave_port + nport_trial) - // work around processes bind to same port without set reuse option, - // start explore from slave_port + newrank towards end - int port = sock_listen.TryBindHost(slave_port+ newrank%nport_trial, slave_port + nport_trial); - // if no port bindable, explore first half of range - if (port == -1) sock_listen.TryBindHost(slave_port, newrank% nport_trial + slave_port); + if (sock_listen == INVALID_SOCKET || sock_listen.AtMark()) { + if (!sock_listen.IsClosed()) { + sock_listen.Close(); + } + // create listening socket + sock_listen.Create(); + sock_listen.SetKeepAlive(true); + // http://deepix.github.io/2016/10/21/tcprst.html + sock_listen.SetLinger(0); + // [slave_port, slave_port+1 .... 
slave_port + newrank ...slave_port + nport_trial) + // work around processes bind to same port without set reuse option, + // start explore from slave_port + newrank towards end + port = sock_listen.TryBindHost(slave_port+ newrank%nport_trial, slave_port + nport_trial); + // if no port bindable, explore first half of range + if (port == -1) sock_listen.TryBindHost(slave_port, newrank% nport_trial + slave_port); - utils::Check(port != -1, "ReConnectLink fail to bind the ports specified"); - sock_listen.Listen(); + utils::Check(port != -1, "ReConnectLink fail to bind the ports specified"); + sock_listen.Listen(); + } // get number of to connect and number of to accept nodes from tracker int num_conn, num_accept, num_error = 1; @@ -365,8 +374,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) { } if (!match) all_links.push_back(r); } - // close listening sockets - sock_listen.Close(); + this->parent_index = -1; // setup tree links and ring structure tree_links.plinks.clear(); diff --git a/src/allreduce_base.h b/src/allreduce_base.h index 02c76d6f..b83cb0d0 100644 --- a/src/allreduce_base.h +++ b/src/allreduce_base.h @@ -521,6 +521,10 @@ class AllreduceBase : public IEngine { int world_size; // connect retry time int connect_retry; + // backdoor listening peer connection + utils::TCPSocket sock_listen; + // backdoor port + int port = 0; }; } // namespace engine } // namespace rabit diff --git a/src/allreduce_robust.cc b/src/allreduce_robust.cc index 342e19a4..fa6726e8 100644 --- a/src/allreduce_robust.cc +++ b/src/allreduce_robust.cc @@ -50,16 +50,6 @@ void AllreduceRobust::Shutdown(void) { // execute check ack step, load happens here utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp), "Shutdown: check ack must return true"); - // one worker shutdowns and closes sockets while rest still run kCheckAck, - // seems has something to do with time-wait state in tcp connection, - // this cause rest workers checkandrecover and hang 
inf, - // https://github.com/dmlc/xgboost/pull/3818 - // TODO(Chen Qin): a fundamental fix for this -#if defined(_MSC_VER) || defined (__MINGW32__) - Sleep(1); -#else - sleep(1); -#endif AllreduceBase::Shutdown(); } /*! diff --git a/src/socket.h b/src/socket.h index d127113a..d6e6f597 100644 --- a/src/socket.h +++ b/src/socket.h @@ -276,13 +276,21 @@ class TCPSocket : public Socket{ * \brief enable/disable TCP keepalive * \param keepalive whether to set the keep alive option on */ - inline void SetKeepAlive(bool keepalive) { + void SetKeepAlive(bool keepalive) { int opt = static_cast<int>(keepalive); if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE, reinterpret_cast<char*>(&opt), sizeof(opt)) < 0) { Socket::Error("SetKeepAlive"); } } + inline void SetLinger(int timeout = 0) { + struct linger sl; + sl.l_onoff = 1; /* non-zero value enables linger option in kernel */ + sl.l_linger = timeout; /* timeout interval in seconds */ + if (setsockopt(sockfd, SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)) == -1) { + Socket::Error("SO_LINGER"); + } + } /*! 
* \brief create the socket, call this before using socket * \param af domain From 10897372cd6d04448f4b77c380b3d1eb6ea526a8 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Mon, 15 Apr 2019 11:20:51 -0700 Subject: [PATCH 12/21] update test makefile --- test/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/Makefile b/test/Makefile index 55452d1f..b1c329d3 100644 --- a/test/Makefile +++ b/test/Makefile @@ -27,11 +27,17 @@ OBJ = $(RABIT_OBJ) speed_test.o model_recover.o local_recover.o lazy_recover.o MPIBIN = speed_test.mpi .PHONY: clean all lib mpi +.PHONY: lib all + all: $(BIN) + lib: - cd ..;make;cd - + cd ..;make clean;make;cd - + +.PHONY: mpi mpi: cd ..;make mpi;cd - + # programs speed_test.o: speed_test.cc ../include/rabit/*.h lib mpi model_recover.o: model_recover.cc ../include/rabit/*.h lib From 1a93313ce2617d75c2d1a1efa993248764a2f5e8 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Mon, 15 Apr 2019 16:26:16 -0700 Subject: [PATCH 13/21] add retry count --- test/test.mk | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/test.mk b/test/test.mk index 7b999507..bebcbcce 100644 --- a/test/test.mk +++ b/test/test.mk @@ -5,25 +5,25 @@ all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_di # this experiment test recovery with actually process exit, use keepalive to keep program alive model_recover_10_10k: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 model_recover_10_10k_die_same: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 
model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 model_recover_10_10k_die_hard: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 local_recover_10_10k: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 pylocal_recover_10_10k: ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 lazy_recover_10_10k_die_hard: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 lazy_recover_10_10k_die_same: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 ringallreduce_10_10k: - 
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 100 rabit_reduce_ring_mincount=10 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 rabit_reduce_ring_mincount=10 From b7f7e0b3f5da57593d7b60f3a3b39d760f7d7687 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Tue, 16 Apr 2019 21:52:05 -0700 Subject: [PATCH 14/21] enable all tests other than python --- test/test.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test.mk b/test/test.mk index bebcbcce..94485f17 100644 --- a/test/test.mk +++ b/test/test.mk @@ -1,7 +1,7 @@ # this is a makefile used to show testcases of rabit .PHONY: all -all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k +all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k lazy_recover_10_10k_die_hard lazy_recover_10_10k_die_same ringallreduce_10_10k # this experiment test recovery with actually process exit, use keepalive to keep program alive model_recover_10_10k: @@ -17,7 +17,7 @@ local_recover_10_10k: ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 pylocal_recover_10_10k: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 lazy_recover_10_10k_die_hard: ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 From dfe163c7c6bf59fec42b9723af1c22832dc38e19 Mon 
Sep 17 00:00:00 2001 From: Chen Qin Date: Thu, 18 Apr 2019 11:36:55 -0700 Subject: [PATCH 15/21] point to private branch with python3 dmlc tracker --- .gitmodules | 3 ++- .travis.yml | 6 +++++- dmlc-core | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.gitmodules b/.gitmodules index 8e517101..d5647adc 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ [submodule "dmlc-core"] path = dmlc-core - url = https://github.com/dmlc/dmlc-core + url = https://github.com/chenqin/dmlc-core + branch = py3 diff --git a/.travis.yml b/.travis.yml index 9f5f2ed4..a873ede4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -40,15 +40,19 @@ addons: - gcc49 - openssl - libgit2 + - python3 update: true before_install: - export TRAVIS=dmlc-core/scripts/travis/ - export PYTHONPATH=${PYTHONPATH}:${PWD}/python-package - source ${TRAVIS}/travis_setup_env.sh + - ${TRAVIS}/travis_osx_install.sh install: - - pip install --user cpplint pylint kubernetes urllib3 + - if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then brew install python; fi + - python3 --version ; python3 -m pip --version + - python3 -m pip install cpplint pylint kubernetes urllib3 script: scripts/travis_script.sh diff --git a/dmlc-core b/dmlc-core index 15362f8f..0ef2c038 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 15362f8fcc7345d60de13a676a2cbd3ffdc3f064 +Subproject commit 0ef2c038656569eb7a662c2c2b33c7f7f4c2a6eb From 686e6c2182c907604b53426afd5ab6502a51404d Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Thu, 18 Apr 2019 13:10:25 -0700 Subject: [PATCH 16/21] pip3 upgrade --- .travis.yml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index a873ede4..97ffb64c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,10 +1,12 @@ -sudo: false +sudo: true os: - linux - osx -osx_image: xcode9.3 +osx_image: xcode10.2 + +dist: xenial # Use Build Matrix to do lint and build seperately env: @@ -35,6 +37,8 @@ addons: - g++-4.8 - openssh-client - 
openssh-server + - python3 + - python3-setuptools homebrew: packages: - gcc49 @@ -45,14 +49,14 @@ addons: before_install: - export TRAVIS=dmlc-core/scripts/travis/ - - export PYTHONPATH=${PYTHONPATH}:${PWD}/python-package - source ${TRAVIS}/travis_setup_env.sh - ${TRAVIS}/travis_osx_install.sh install: - - if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then brew install python; fi - - python3 --version ; python3 -m pip --version - - python3 -m pip install cpplint pylint kubernetes urllib3 + - if [[ ${TRAVIS_OS_NAME} == "linux" ]]; then sudo apt-get install python3-pip; fi + - if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then brew install python3; fi + - pip3 install cpplint pylint urllib3 + - pip3 install websocket-client kubernetes script: scripts/travis_script.sh From 9c2451a3df655a0a85626d4afdafbd01738307b2 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Thu, 18 Apr 2019 17:54:17 -0700 Subject: [PATCH 17/21] try apply https://github.com/dmlc/dmlc-core/pull/524 and compare with issue in https://travis-ci.org/chenqin/rabit/jobs/521959640 --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index d5647adc..328d00ce 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ [submodule "dmlc-core"] path = dmlc-core url = https://github.com/chenqin/dmlc-core - branch = py3 + branch = master From 843a089059426893ed4a8e8704d71d2702faca4a Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Thu, 18 Apr 2019 19:45:39 -0700 Subject: [PATCH 18/21] Revert "try apply https://github.com/dmlc/dmlc-core/pull/524 and compare with issue in" This reverts commit 9c2451a3df655a0a85626d4afdafbd01738307b2. 
--- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 328d00ce..d5647adc 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ [submodule "dmlc-core"] path = dmlc-core url = https://github.com/chenqin/dmlc-core - branch = master + branch = py3 From 03a64a9d96d9fc1fd67421671304592eeb642b9f Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Thu, 18 Apr 2019 19:46:02 -0700 Subject: [PATCH 19/21] revert xcode9.3 and apple sleep(1) --- .travis.yml | 2 +- src/allreduce_robust.cc | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 97ffb64c..d75c9c39 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,7 @@ os: - linux - osx -osx_image: xcode10.2 +osx_image: xcode9.3 dist: xenial diff --git a/src/allreduce_robust.cc b/src/allreduce_robust.cc index fa6726e8..ce5a5616 100644 --- a/src/allreduce_robust.cc +++ b/src/allreduce_robust.cc @@ -50,6 +50,11 @@ void AllreduceRobust::Shutdown(void) { // execute check ack step, load happens here utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp), "Shutdown: check ack must return true"); + +#if defined (__APPLE__) + sleep(1); +#endif + AllreduceBase::Shutdown(); } /*! 
From d080d88ec32d7b999c05941c73e83c38f8ea7c75 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Fri, 19 Apr 2019 13:41:33 -0700 Subject: [PATCH 20/21] dmlc-core point back to https://github.com/dmlc/dmlc-core master --- .gitmodules | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index d5647adc..8e517101 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,3 @@ [submodule "dmlc-core"] path = dmlc-core - url = https://github.com/chenqin/dmlc-core - branch = py3 + url = https://github.com/dmlc/dmlc-core From 5e6e0717fa98f763e431921d66434c293702c205 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Fri, 19 Apr 2019 14:02:59 -0700 Subject: [PATCH 21/21] fix python test --- .travis.yml | 4 ++-- dmlc-core | 2 +- python/rabit.py | 5 +++-- scripts/{mpi.sh => mpi_build.sh} | 0 scripts/travis_runtest.sh | 1 + scripts/travis_script.sh | 2 +- test/local_recover.py | 7 ++++++- test/test.mk | 2 +- 8 files changed, 15 insertions(+), 8 deletions(-) rename scripts/{mpi.sh => mpi_build.sh} (100%) diff --git a/.travis.yml b/.travis.yml index d75c9c39..a2eff4dc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,7 @@ os: - linux - osx -osx_image: xcode9.3 +osx_image: xcode10.2 dist: xenial @@ -55,7 +55,7 @@ before_install: install: - if [[ ${TRAVIS_OS_NAME} == "linux" ]]; then sudo apt-get install python3-pip; fi - if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then brew install python3; fi - - pip3 install cpplint pylint urllib3 + - pip3 install cpplint pylint urllib3 numpy - pip3 install websocket-client kubernetes script: scripts/travis_script.sh diff --git a/dmlc-core b/dmlc-core index 0ef2c038..13d5acb8 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 0ef2c038656569eb7a662c2c2b33c7f7f4c2a6eb +Subproject commit 13d5acb8ba7e79550bbf2f730f1a3944ff0fa68b diff --git a/python/rabit.py b/python/rabit.py index d57587ba..dccb464c 100644 --- a/python/rabit.py +++ b/python/rabit.py @@ -4,7 +4,7 @@ Author: Tianqi Chen """ # pylint: 
disable=unused-argument,invalid-name,global-statement,dangerous-default-value, -import cPickle as pickle +import pickle import ctypes import os import sys @@ -99,9 +99,10 @@ def init(args=None, lib='standard', lib_dll=None): When this is presented argument lib will be ignored. """ if args is None: - args = sys.argv + args = [] _loadlib(lib, lib_dll) arr = (ctypes.c_char_p * len(args))() + arr[:] = args _LIB.RabitInit(len(args), arr) diff --git a/scripts/mpi.sh b/scripts/mpi_build.sh similarity index 100% rename from scripts/mpi.sh rename to scripts/mpi_build.sh diff --git a/scripts/travis_runtest.sh b/scripts/travis_runtest.sh index 4f14ad17..37fc9953 100755 --- a/scripts/travis_runtest.sh +++ b/scripts/travis_runtest.sh @@ -7,3 +7,4 @@ make -f test.mk local_recover_10_10k || exit -1 make -f test.mk lazy_recover_10_10k_die_hard || exit -1 make -f test.mk lazy_recover_10_10k_die_same || exit -1 make -f test.mk ringallreduce_10_10k || exit -1 +make -f test.mk pylocal_recover_10_10k || exit -1 diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index 338e3781..ae150487 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -15,7 +15,7 @@ if [ ${TASK} == "build" ]; then fi if [ ${TASK} == "mpi-build" ]; then - ./scripts/mpi.sh + ./scripts/mpi_build.sh cd test make mpi && make speed_test.mpi || exit -1 fi diff --git a/test/local_recover.py b/test/local_recover.py index 315acbf0..6f7fae84 100755 --- a/test/local_recover.py +++ b/test/local_recover.py @@ -1,6 +1,11 @@ -#!/usr/bin/python +#!/usr/bin/env python3 + from __future__ import print_function from builtins import range + +import sys +sys.path.append('../python') + import rabit import numpy as np diff --git a/test/test.mk b/test/test.mk index 94485f17..b4b9afc4 100644 --- a/test/test.mk +++ b/test/test.mk @@ -1,7 +1,7 @@ # this is a makefile used to show testcases of rabit .PHONY: all -all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard 
local_recover_10_10k lazy_recover_10_10k_die_hard lazy_recover_10_10k_die_same ringallreduce_10_10k +all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k lazy_recover_10_10k_die_hard lazy_recover_10_10k_die_same ringallreduce_10_10k pylocal_recover_10_10k # this experiment test recovery with actually process exit, use keepalive to keep program alive model_recover_10_10k: