Codebase list rabit / upstream/0.0_git20200805.4acdd7c
New upstream version 0.0~git20200805.4acdd7c Mo Zhou 3 years ago
22 changed file(s) with 212 addition(s) and 142 deletion(s). Raw diff Collapse all Expand all
6262
6363 before_install:
6464 - git clone https://github.com/dmlc/dmlc-core
65 - export TRAVIS=dmlc-core/scripts/travis/
65 - export TRAVIS=./scripts/
6666 - source ${TRAVIS}/travis_setup_env.sh
6767 - ${TRAVIS}/travis_osx_install.sh
6868 - source ./scripts/travis_setup.sh
0 /*!
1 * Copyright (c) 2020 by Contributors
2 * \file base.h
3 * \brief Macros common to all headers
4 *
5 * \author Hyunsu Cho
6 */
7
8 #ifndef RABIT_BASE_H_
9 #define RABIT_BASE_H_
10
11 #ifndef _CRT_SECURE_NO_WARNINGS
12 #define _CRT_SECURE_NO_WARNINGS
13 #endif // _CRT_SECURE_NO_WARNINGS
14 #ifndef _CRT_SECURE_NO_DEPRECATE
15 #define _CRT_SECURE_NO_DEPRECATE
16 #endif // _CRT_SECURE_NO_DEPRECATE
17
18 #endif // RABIT_BASE_H_
1717 #if defined(_MSC_VER) || defined(_WIN32)
1818 #define RABIT_DLL RABIT_EXTERN_C __declspec(dllexport)
1919 #else
20 #define RABIT_DLL RABIT_EXTERN_C
20 #define RABIT_DLL RABIT_EXTERN_C __attribute__ ((visibility ("default")))
2121 #endif // defined(_MSC_VER) || defined(_WIN32)
2222
2323 /*! \brief rabit unsigned long type */
55 */
66 #ifndef RABIT_INTERNAL_UTILS_H_
77 #define RABIT_INTERNAL_UTILS_H_
8 #define _CRT_SECURE_NO_WARNINGS
8
9 #include <rabit/base.h>
910 #include <string.h>
1011 #include <cstdio>
1112 #include <string>
6364 /*! \brief error message buffer length */
6465 const int kPrintBuffer = 1 << 12;
6566
66 /*! \brief we may want to keep the process alive when there are multiple workers
67 * co-locate in the same process */
68 extern bool STOP_PROCESS_ON_ERROR;
69
7067 /* \brief Case-insensitive string comparison */
7168 inline int CompareStringsCaseInsensitive(const char* s1, const char* s2) {
7269 #ifdef _MSC_VER
8784 * \param msg error message
8885 */
8986 inline void HandleAssertError(const char *msg) {
90 if (STOP_PROCESS_ON_ERROR) {
91 fprintf(stderr, "AssertError:%s, shutting down process\n", msg);
92 exit(-1);
93 } else {
94 fprintf(stderr, "AssertError:%s, rabit is configured to keep process running\n", msg);
95 throw dmlc::Error(msg);
96 }
87 fprintf(stderr,
88 "AssertError:%s, rabit is configured to keep process running\n", msg);
89 throw dmlc::Error(msg);
9790 }
9891 /*!
9992 * \brief handling of Check error, caused by inappropriate input
10093 * \param msg error message
10194 */
10295 inline void HandleCheckError(const char *msg) {
103 if (STOP_PROCESS_ON_ERROR) {
104 fprintf(stderr, "%s, shutting down process\n", msg);
105 exit(-1);
106 } else {
107 fprintf(stderr, "%s, rabit is configured to keep process running\n", msg);
108 throw dmlc::Error(msg);
109 }
96 fprintf(stderr, "%s, rabit is configured to keep process running\n", msg);
97 throw dmlc::Error(msg);
11098 }
11199 inline void HandlePrint(const char *msg) {
112100 printf("%s", msg);
0 #!/bin/bash
1 set -e
2 set -x
3
4 if [ ${TRAVIS_OS_NAME} != "osx" ]; then
5 exit 0
6 fi
00 #!/bin/bash
1
2 conda activate python3
3 conda --version
4 python --version
15
26 make -f test.mk RABIT_BUILD_DMLC=1 model_recover_10_10k || exit -1
37 make -f test.mk RABIT_BUILD_DMLC=1 model_recover_10_10k_die_same || exit -1
11
22 echo "Testing on: ${TRAVIS_OS_NAME}, Home directory: ${HOME}"
33
4 pip3 install cpplint pylint urllib3 numpy cpplint
5 pip3 install websocket-client kubernetes
6
4 # Install Miniconda
5 if [ ${TRAVIS_OS_NAME} == "osx" ]; then
6 wget -O conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
7 else
8 wget -O conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
9 fi
10 bash conda.sh -b -p $HOME/miniconda
11 source $HOME/miniconda/bin/activate
12 conda config --set always_yes yes --set changeps1 no
13 conda update -q conda
14 conda info -a
15 conda create -n python3 python=3.7
16 conda activate python3
17 conda --version
18 python --version
19 # Install Python packages
20 conda install -c conda-forge numpy scipy urllib3 websocket-client
21 python -m pip install cpplint pylint kubernetes
722
823 # Install googletest under home directory
924 GTEST_VERSION=1.8.1
2641 popd
2742
2843 if [ ${TRAVIS_OS_NAME} == "linux" ]; then
29 sudo apt-get install python3-pip tree
44 sudo apt-get install tree
3045 fi
3146
3247 if [ ${TRAVIS_OS_NAME} == "osx" ]; then
0 # script to be sourced in travis yml
1 # setup all enviroment variables
2
3 export CACHE_PREFIX=${HOME}/.cache/usr
4 export PATH=${HOME}/.local/bin:${PATH}
5 export PATH=${PATH}:${CACHE_PREFIX}/bin
6 export CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH}:${CACHE_PREFIX}/include
7 export C_INCLUDE_PATH=${C_INCLUDE_PATH}:${CACHE_PREFIX}/include
8 export LIBRARY_PATH=${LIBRARY_PATH}:${CACHE_PREFIX}/lib
9 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${CACHE_PREFIX}/lib
10 export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${CACHE_PREFIX}/lib
11
12 alias make="make -j4"
13
14 # setup the cache prefix folder
15 if [ ! -d ${HOME}/.cache ]; then
16 mkdir ${HOME}/.cache
17 fi
18
19 if [ ! -d ${CACHE_PREFIX} ]; then
20 mkdir ${CACHE_PREFIX}
21 fi
22 if [ ! -d ${CACHE_PREFIX}/include ]; then
23 mkdir ${CACHE_PREFIX}/include
24 fi
25 if [ ! -d ${CACHE_PREFIX}/lib ]; then
26 mkdir ${CACHE_PREFIX}/lib
27 fi
28 if [ ! -d ${CACHE_PREFIX}/bin ]; then
29 mkdir ${CACHE_PREFIX}/bin
30 fi
31
32 # setup CUDA path if NVCC_PREFIX exists
33 if [ ! -z "$NVCC_PREFIX" ]; then
34 export PATH=${PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/bin
35 export CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/include
36 export C_INCLUDE_PATH=${C_INCLUDE_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/include
37 export LIBRARY_PATH=${LIBRARY_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/lib64:${NVCC_PREFIX}/usr/lib/x86_64-linux-gnu
38 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/lib64:${NVCC_PREFIX}/usr/lib/x86_64-linux-gnu
39 fi
44 *
55 * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
66 */
7 #define _CRT_SECURE_NO_WARNINGS
8 #define _CRT_SECURE_NO_DEPRECATE
97 #define NOMINMAX
8 #include "allreduce_base.h"
9 #include <rabit/base.h>
1010 #include <netinet/tcp.h>
1111 #include <cstring>
1212 #include <map>
13 #include "allreduce_base.h"
1413
1514 namespace rabit {
16
17 namespace utils {
18 bool STOP_PROCESS_ON_ERROR = true;
19 }
20
2115 namespace engine {
2216 // constructor
2317 AllreduceBase::AllreduceBase(void) {
3327 version_number = 0;
3428 // 32 K items
3529 reduce_ring_mincount = 32 << 10;
30 // 1M reducer size each time
31 tree_reduce_minsize = 1 << 20;
3632 // tracker URL
3733 task_id = "NULL";
3834 err_link = NULL;
4642 env_vars.push_back("DMLC_TRACKER_URI");
4743 env_vars.push_back("DMLC_TRACKER_PORT");
4844 env_vars.push_back("DMLC_WORKER_CONNECT_RETRY");
49 env_vars.push_back("DMLC_WORKER_STOP_PROCESS_ON_ERROR");
5045 }
5146
5247 // initialization function
187182 if (!strcmp(name, "DMLC_ROLE")) dmlc_role = val;
188183 if (!strcmp(name, "rabit_world_size")) world_size = atoi(val);
189184 if (!strcmp(name, "rabit_hadoop_mode")) hadoop_mode = utils::StringToBool(val);
185 if (!strcmp(name, "rabit_tree_reduce_minsize")) tree_reduce_minsize = atoi(val);
190186 if (!strcmp(name, "rabit_reduce_ring_mincount")) {
191187 reduce_ring_mincount = atoi(val);
192188 utils::Assert(reduce_ring_mincount > 0, "rabit_reduce_ring_mincount should be greater than 0");
196192 }
197193 if (!strcmp(name, "DMLC_WORKER_CONNECT_RETRY")) {
198194 connect_retry = atoi(val);
199 }
200 if (!strcmp(name, "DMLC_WORKER_STOP_PROCESS_ON_ERROR")) {
201 if (!strcmp(val, "true")) {
202 rabit::utils::STOP_PROCESS_ON_ERROR = true;
203 } else if (!strcmp(val, "false")) {
204 rabit::utils::STOP_PROCESS_ON_ERROR = false;
205 } else {
206 throw std::runtime_error("invalid value of DMLC_WORKER_STOP_PROCESS_ON_ERROR");
207 }
208195 }
209196 if (!strcmp(name, "rabit_bootstrap_cache")) {
210197 rabit_bootstrap_cache = utils::StringToBool(val);
504491 size_t size_up_out = 0;
505492 // size of message we received, and send in the down pass
506493 size_t size_down_in = 0;
494 // minimal size of each reducer
495 const size_t eachreduce = (tree_reduce_minsize / type_nbytes * type_nbytes);
496
507497 // initialize the link ring-buffer and pointer
508498 for (int i = 0; i < nlink; ++i) {
509499 if (i != parent_index) {
560550 // read data from childs
561551 for (int i = 0; i < nlink; ++i) {
562552 if (i != parent_index && watcher.CheckRead(links[i].sock)) {
563 ReturnType ret = links[i].ReadToRingBuffer(size_up_out, total_size);
564 if (ret != kSuccess) {
565 return ReportError(&links[i], ret);
553 // make sure to receive minimal reducer size
554 // since each child reduce and sends the minimal reducer size
555 while (links[i].size_read < total_size
556 && links[i].size_read - size_up_reduce < eachreduce) {
557 ReturnType ret = links[i].ReadToRingBuffer(size_up_out, total_size);
558 if (ret != kSuccess) {
559 return ReportError(&links[i], ret);
560 }
566561 }
567562 }
568563 }
582577 utils::Assert(buffer_size != 0, "must assign buffer_size");
583578 // round to type_n4bytes
584579 max_reduce = (max_reduce / type_nbytes * type_nbytes);
580
581 // if max reduce is less than total size, we reduce multiple times of
582 // eachreduce size
583 if (max_reduce < total_size)
584 max_reduce = max_reduce - max_reduce % eachreduce;
585
585586 // peform reduce, can be at most two rounds
586587 while (size_up_reduce < max_reduce) {
587588 // start position
605606 // pass message up to parent, can pass data that are already been reduced
606607 if (size_up_out < size_up_reduce) {
607608 ssize_t len = links[parent_index].sock.
608 Send(sendrecvbuf + size_up_out, size_up_reduce - size_up_out);
609 Send(sendrecvbuf + size_up_out, size_up_reduce - size_up_out);
609610 if (len != -1) {
610611 size_up_out += static_cast<size_t>(len);
611612 } else {
618619 // read data from parent
619620 if (watcher.CheckRead(links[parent_index].sock) &&
620621 total_size > size_down_in) {
621 ssize_t len = links[parent_index].sock.
622 Recv(sendrecvbuf + size_down_in, total_size - size_down_in);
623 if (len == 0) {
624 links[parent_index].sock.Close();
625 return ReportError(&links[parent_index], kRecvZeroLen);
626 }
627 if (len != -1) {
628 size_down_in += static_cast<size_t>(len);
629 utils::Assert(size_down_in <= size_up_out,
630 "Allreduce: boundary error");
631 } else {
632 ReturnType ret = Errno2Return();
633 if (ret != kSuccess) {
634 return ReportError(&links[parent_index], ret);
622 size_t left_size = total_size-size_down_in;
623 size_t reduce_size_min = std::min(left_size, eachreduce);
624 size_t recved = 0;
625 while (recved < reduce_size_min) {
626 ssize_t len = links[parent_index].sock.
627 Recv(sendrecvbuf + size_down_in, total_size - size_down_in);
628
629 if (len == 0) {
630 links[parent_index].sock.Close();
631 return ReportError(&links[parent_index], kRecvZeroLen);
632 }
633 if (len != -1) {
634 size_down_in += static_cast<size_t>(len);
635 utils::Assert(size_down_in <= size_up_out,
636 "Allreduce: boundary error");
637 recved+=len;
638
639 // if it receives more data than each reduce, it means the next block is sent.
640 // we double the reduce_size_min or add to left_size
641 while (recved > reduce_size_min) {
642 reduce_size_min += std::min(left_size-reduce_size_min, eachreduce);
643 }
644 } else {
645 ReturnType ret = Errno2Return();
646 if (ret != kSuccess) {
647 return ReportError(&links[parent_index], ret);
648 }
635649 }
636650 }
637651 }
564564 int reduce_method;
565565 // mininum count of cells to use ring based method
566566 size_t reduce_ring_mincount;
567 // minimul block size per tree reduce
568 size_t tree_reduce_minsize;
567569 // current rank
568570 int rank;
569571 // world size
44 *
55 * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
66 */
7 #define _CRT_SECURE_NO_WARNINGS
8 #define _CRT_SECURE_NO_DEPRECATE
97 #define NOMINMAX
8 #include <rabit/base.h>
109 #include <chrono>
1110 #include <thread>
1211 #include <limits>
167166 * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
168167 * \param _file caller file name used to generate unique cache key
169168 * \param _line caller line number used to generate unique cache key
170 * \param _caller caller function name used to generate unique cache key
171 */
169 * \param _caller caller function name used to generate unique cache key
170 */
172171 void AllreduceRobust::Allgather(void *sendrecvbuf,
173172 size_t total_size,
174173 size_t slice_begin,
518517 }
519518 // execute checkpoint, note: when checkpoint existing, load will not happen
520519 _assert(RecoverExec(NULL, 0, ActionSummary::kCheckPoint,
521 ActionSummary::kSpecialOp, cur_cache_seq),
522 "check point must return true");
520 ActionSummary::kSpecialOp, cur_cache_seq),
521 "check point must return true");
523522 // this is the critical region where we will change all the stored models
524523 // increase version number
525524 version_number += 1;
550549 delta = utils::GetTime() - start;
551550 // log checkpoint ack latency
552551 if (rabit_debug) {
553 utils::HandleLogInfo("[%d] checkpoint ack finished version %d, take %f seconds\n",
554 rank, version_number, delta);
552 utils::HandleLogInfo(
553 "[%d] checkpoint ack finished version %d, take %f seconds\n", rank,
554 version_number, delta);
555555 }
556556 }
557557 /*!
00 // Copyright by Contributors
11 // implementations in ctypes
2 #define _CRT_SECURE_NO_WARNINGS
3 #define _CRT_SECURE_NO_DEPRECATE
4
2 #include <rabit/base.h>
53 #include <cstring>
64 #include <string>
75 #include "rabit/rabit.h"
219217 } // namespace c_api
220218 } // namespace rabit
221219
222 bool RabitInit(int argc, char *argv[]) {
220 RABIT_DLL bool RabitInit(int argc, char *argv[]) {
223221 return rabit::Init(argc, argv);
224222 }
225223
226 bool RabitFinalize() {
224 RABIT_DLL bool RabitFinalize() {
227225 return rabit::Finalize();
228226 }
229227
230 int RabitGetRingPrevRank() {
228 RABIT_DLL int RabitGetRingPrevRank() {
231229 return rabit::GetRingPrevRank();
232230 }
233231
234 int RabitGetRank() {
232 RABIT_DLL int RabitGetRank() {
235233 return rabit::GetRank();
236234 }
237235
238 int RabitGetWorldSize() {
236 RABIT_DLL int RabitGetWorldSize() {
239237 return rabit::GetWorldSize();
240238 }
241239
242 int RabitIsDistributed() {
240 RABIT_DLL int RabitIsDistributed() {
243241 return rabit::IsDistributed();
244242 }
245243
246 void RabitTrackerPrint(const char *msg) {
244 RABIT_DLL void RabitTrackerPrint(const char *msg) {
247245 std::string m(msg);
248246 rabit::TrackerPrint(m);
249247 }
250248
251 void RabitGetProcessorName(char *out_name,
252 rbt_ulong *out_len,
253 rbt_ulong max_len) {
249 RABIT_DLL void RabitGetProcessorName(char *out_name,
250 rbt_ulong *out_len,
251 rbt_ulong max_len) {
254252 std::string s = rabit::GetProcessorName();
255253 if (s.length() > max_len) {
256254 s.resize(max_len - 1);
259257 *out_len = static_cast<rbt_ulong>(s.length());
260258 }
261259
262 void RabitBroadcast(void *sendrecv_data,
263 rbt_ulong size, int root) {
260 RABIT_DLL void RabitBroadcast(void *sendrecv_data,
261 rbt_ulong size, int root) {
264262 rabit::Broadcast(sendrecv_data, size, root);
265263 }
266264
267 void RabitAllgather(void *sendrecvbuf_,
268 size_t total_size,
269 size_t beginIndex,
270 size_t size_node_slice,
271 size_t size_prev_slice,
272 int enum_dtype) {
265 RABIT_DLL void RabitAllgather(void *sendrecvbuf_, size_t total_size,
266 size_t beginIndex, size_t size_node_slice,
267 size_t size_prev_slice, int enum_dtype) {
273268 rabit::c_api::Allgather(sendrecvbuf_,
274269 total_size,
275270 beginIndex,
278273 static_cast<rabit::engine::mpi::DataType>(enum_dtype));
279274 }
280275
281
282 void RabitAllreduce(void *sendrecvbuf,
283 size_t count,
284 int enum_dtype,
285 int enum_op,
286 void (*prepare_fun)(void *arg),
287 void *prepare_arg) {
276 RABIT_DLL void RabitAllreduce(void *sendrecvbuf, size_t count, int enum_dtype,
277 int enum_op, void (*prepare_fun)(void *arg),
278 void *prepare_arg) {
288279 rabit::c_api::Allreduce
289280 (sendrecvbuf, count,
290281 static_cast<rabit::engine::mpi::DataType>(enum_dtype),
292283 prepare_fun, prepare_arg);
293284 }
294285
295 int RabitLoadCheckPoint(char **out_global_model,
296 rbt_ulong *out_global_len,
297 char **out_local_model,
298 rbt_ulong *out_local_len) {
286 RABIT_DLL int RabitLoadCheckPoint(char **out_global_model,
287 rbt_ulong *out_global_len,
288 char **out_local_model,
289 rbt_ulong *out_local_len) {
299290 // NOTE: this function is not thread-safe
300291 using rabit::BeginPtr;
301292 using namespace rabit::c_api; // NOLINT(*)
320311 return version;
321312 }
322313
323 void RabitCheckPoint(const char *global_model,
324 rbt_ulong global_len,
325 const char *local_model,
326 rbt_ulong local_len) {
314 RABIT_DLL void RabitCheckPoint(const char *global_model, rbt_ulong global_len,
315 const char *local_model, rbt_ulong local_len) {
327316 using namespace rabit::c_api; // NOLINT(*)
328317 WriteWrapper sg(global_model, global_len);
329318 WriteWrapper sl(local_model, local_len);
334323 }
335324 }
336325
337 int RabitVersionNumber() {
326 RABIT_DLL int RabitVersionNumber() {
338327 return rabit::VersionNumber();
339328 }
340329
341 int RabitLinkTag() {
330 RABIT_DLL int RabitLinkTag() {
342331 return 0;
343332 }
55 *
66 * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
77 */
8 #define _CRT_SECURE_NO_WARNINGS
9 #define _CRT_SECURE_NO_DEPRECATE
10 #define NOMINMAX
11
8 #include <rabit/base.h>
129 #include <memory>
1310 #include "rabit/internal/engine.h"
1411 #include "allreduce_base.h"
55 * \author Tianqi Chen
66 */
77 // define use MOCK, os we will use mock Manager
8 #define _CRT_SECURE_NO_WARNINGS
9 #define _CRT_SECURE_NO_DEPRECATE
108 #define NOMINMAX
9 #include <rabit/base.h>
1110 // switch engine to AllreduceMock
1211 #define RABIT_USE_BASE
1312 #include "engine.cc"
55 * This is usually NOT needed, use engine_mpi or engine for real distributed version
66 * \author Tianqi Chen
77 */
8 #define _CRT_SECURE_NO_WARNINGS
9 #define _CRT_SECURE_NO_DEPRECATE
108 #define NOMINMAX
119
10 #include <rabit/base.h>
1211 #include "rabit/internal/engine.h"
1312
1413 namespace rabit {
15
16 namespace utils {
17 bool STOP_PROCESS_ON_ERROR = true;
18 }
19
2014 namespace engine {
2115 /*! \brief EmptyEngine */
2216 class EmptyEngine : public IEngine {
55 * \author Tianqi Chen
66 */
77 // define use MOCK, os we will use mock Manager
8 #define _CRT_SECURE_NO_WARNINGS
9 #define _CRT_SECURE_NO_DEPRECATE
108 #define NOMINMAX
119 // switch engine to AllreduceMock
1210 #define RABIT_USE_MOCK
11 #include <rabit/base.h>
1312 #include "allreduce_mock.h"
1413 #include "engine.cc"
1514
55 *
66 * \author Tianqi Chen
77 */
8 #define _CRT_SECURE_NO_WARNINGS
9 #define _CRT_SECURE_NO_DEPRECATE
108 #define NOMINMAX
119 #include <mpi.h>
10 #include <rabit/base.h>
1211 #include <cstdio>
12 #include <string>
1313 #include "rabit/internal/engine.h"
1414 #include "rabit/internal/utils.h"
1515
1616 namespace rabit {
17
18 namespace utils {
19 bool STOP_PROCESS_ON_ERROR = true;
20 }
21
2217 namespace engine {
2318 /*! \brief implementation of engine using MPI */
2419 class MPIEngine : public IEngine {
22 add_executable(
33 unit_tests
44 test_io.cc
5 test_utils.cc
56 allreduce_robust_test.cc
67 allreduce_base_test.cc
78 allreduce_mock_test.cc
1616 char* argv[] = {cmd};
1717 m.Init(1, argv);
1818 m.rank = 0;
19 EXPECT_EXIT(m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr), ::testing::ExitedWithCode(255), "");
19 EXPECT_THROW(m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr), dmlc::Error);
2020 }
2121
2222 TEST(allreduce_mock, mock_broadcast)
3131 m.rank = 0;
3232 m.version_number=1;
3333 m.seq_counter=2;
34 EXPECT_EXIT(m.Broadcast(nullptr,0,0), ::testing::ExitedWithCode(255), "");
34 EXPECT_THROW(m.Broadcast(nullptr,0,0), dmlc::Error);
3535 }
22
33 #include <string>
44 #include <iostream>
5 #include <dmlc/logging.h>
56 #include "../../src/allreduce_mock.h"
67
78 TEST(allreduce_mock, mock_allreduce)
1617 char* argv[] = {cmd};
1718 m.Init(1, argv);
1819 m.rank = 0;
19 EXPECT_EXIT(m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr), ::testing::ExitedWithCode(255), "");
20 EXPECT_THROW({m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr);}, dmlc::Error);
2021 }
2122
2223 TEST(allreduce_mock, mock_broadcast)
3132 m.rank = 0;
3233 m.version_number=1;
3334 m.seq_counter=2;
34 EXPECT_EXIT(m.Broadcast(nullptr,0,0), ::testing::ExitedWithCode(255), "");
35 EXPECT_THROW({m.Broadcast(nullptr,0,0);}, dmlc::Error);
3536 }
3637
3738 TEST(allreduce_mock, mock_gather)
4647 m.rank = 3;
4748 m.version_number=13;
4849 m.seq_counter=22;
49 EXPECT_EXIT(m.Allgather(nullptr,0,0,0,0), ::testing::ExitedWithCode(255), "");
50 EXPECT_THROW({m.Allgather(nullptr,0,0,0,0);}, dmlc::Error);
5051 }
0 #include <gtest/gtest.h>
1 #include <rabit/internal/utils.h>
2
3 TEST(Utils, Assert) {
4 EXPECT_THROW({rabit::utils::Assert(false, "foo");}, dmlc::Error);
5 }
1212
1313 # this experiment test recovery with actually process exit, use keepalive to keep program alive
1414 model_recover_10_10k:
15 $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 rabit_bootstrap_cache=true rabit_debug=true rabit_reduce_ring_mincount=1 rabit_timeout=true rabit_timeout_sec=5
15 python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 rabit_bootstrap_cache=true rabit_debug=true rabit_reduce_ring_mincount=1 rabit_timeout=true rabit_timeout_sec=5
1616
1717 model_recover_10_10k_die_same:
18 $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 rabit_bootstrap_cache=1
18 python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 rabit_bootstrap_cache=1
1919
2020 model_recover_10_10k_die_hard:
21 $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 rabit_bootstrap_cache=1
21 python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 rabit_bootstrap_cache=1
2222
2323 local_recover_10_10k:
24 $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
24 python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
2525
2626 pylocal_recover_10_10k:
27 $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
27 python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
2828
2929 lazy_recover_10_10k_die_hard:
30 $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
30 python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
3131
3232 lazy_recover_10_10k_die_same:
33 $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
33 python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
3434
3535 ringallreduce_10_10k:
36 $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 rabit_reduce_ring_mincount=10
36 python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 rabit_reduce_ring_mincount=10