New upstream version 0.0~git20200805.4acdd7c
Mo Zhou
3 years ago
62 | 62 | |
63 | 63 | before_install: |
64 | 64 | - git clone https://github.com/dmlc/dmlc-core |
65 | - export TRAVIS=dmlc-core/scripts/travis/ | |
65 | - export TRAVIS=./scripts/ | |
66 | 66 | - source ${TRAVIS}/travis_setup_env.sh |
67 | 67 | - ${TRAVIS}/travis_osx_install.sh |
68 | 68 | - source ./scripts/travis_setup.sh |
0 | /*! | |
1 | * Copyright (c) 2020 by Contributors | |
2 | * \file base.h | |
3 | * \brief Macros common to all headers | |
4 | * | |
5 | * \author Hyunsu Cho | |
6 | */ | |
7 | ||
8 | #ifndef RABIT_BASE_H_ | |
9 | #define RABIT_BASE_H_ | |
10 | ||
11 | #ifndef _CRT_SECURE_NO_WARNINGS | |
12 | #define _CRT_SECURE_NO_WARNINGS | |
13 | #endif // _CRT_SECURE_NO_WARNINGS | |
14 | #ifndef _CRT_SECURE_NO_DEPRECATE | |
15 | #define _CRT_SECURE_NO_DEPRECATE | |
16 | #endif // _CRT_SECURE_NO_DEPRECATE | |
17 | ||
18 | #endif // RABIT_BASE_H_ |
17 | 17 | #if defined(_MSC_VER) || defined(_WIN32) |
18 | 18 | #define RABIT_DLL RABIT_EXTERN_C __declspec(dllexport) |
19 | 19 | #else |
20 | #define RABIT_DLL RABIT_EXTERN_C | |
20 | #define RABIT_DLL RABIT_EXTERN_C __attribute__ ((visibility ("default"))) | |
21 | 21 | #endif // defined(_MSC_VER) || defined(_WIN32) |
22 | 22 | |
23 | 23 | /*! \brief rabit unsigned long type */ |
5 | 5 | */ |
6 | 6 | #ifndef RABIT_INTERNAL_UTILS_H_ |
7 | 7 | #define RABIT_INTERNAL_UTILS_H_ |
8 | #define _CRT_SECURE_NO_WARNINGS | |
8 | ||
9 | #include <rabit/base.h> | |
9 | 10 | #include <string.h> |
10 | 11 | #include <cstdio> |
11 | 12 | #include <string> |
63 | 64 | /*! \brief error message buffer length */ |
64 | 65 | const int kPrintBuffer = 1 << 12; |
65 | 66 | |
66 | /*! \brief we may want to keep the process alive when there are multiple workers | |
67 | * co-locate in the same process */ | |
68 | extern bool STOP_PROCESS_ON_ERROR; | |
69 | ||
70 | 67 | /* \brief Case-insensitive string comparison */ |
71 | 68 | inline int CompareStringsCaseInsensitive(const char* s1, const char* s2) { |
72 | 69 | #ifdef _MSC_VER |
87 | 84 | * \param msg error message |
88 | 85 | */ |
89 | 86 | inline void HandleAssertError(const char *msg) { |
90 | if (STOP_PROCESS_ON_ERROR) { | |
91 | fprintf(stderr, "AssertError:%s, shutting down process\n", msg); | |
92 | exit(-1); | |
93 | } else { | |
94 | fprintf(stderr, "AssertError:%s, rabit is configured to keep process running\n", msg); | |
95 | throw dmlc::Error(msg); | |
96 | } | |
87 | fprintf(stderr, | |
88 | "AssertError:%s, rabit is configured to keep process running\n", msg); | |
89 | throw dmlc::Error(msg); | |
97 | 90 | } |
98 | 91 | /*! |
99 | 92 | * \brief handling of Check error, caused by inappropriate input |
100 | 93 | * \param msg error message |
101 | 94 | */ |
102 | 95 | inline void HandleCheckError(const char *msg) { |
103 | if (STOP_PROCESS_ON_ERROR) { | |
104 | fprintf(stderr, "%s, shutting down process\n", msg); | |
105 | exit(-1); | |
106 | } else { | |
107 | fprintf(stderr, "%s, rabit is configured to keep process running\n", msg); | |
108 | throw dmlc::Error(msg); | |
109 | } | |
96 | fprintf(stderr, "%s, rabit is configured to keep process running\n", msg); | |
97 | throw dmlc::Error(msg); | |
110 | 98 | } |
111 | 99 | inline void HandlePrint(const char *msg) { |
112 | 100 | printf("%s", msg); |
0 | 0 | #!/bin/bash |
1 | ||
2 | conda activate python3 | |
3 | conda --version | |
4 | python --version | |
1 | 5 | |
2 | 6 | make -f test.mk RABIT_BUILD_DMLC=1 model_recover_10_10k || exit -1 |
3 | 7 | make -f test.mk RABIT_BUILD_DMLC=1 model_recover_10_10k_die_same || exit -1 |
1 | 1 | |
2 | 2 | echo "Testing on: ${TRAVIS_OS_NAME}, Home directory: ${HOME}" |
3 | 3 | |
4 | pip3 install cpplint pylint urllib3 numpy cpplint | |
5 | pip3 install websocket-client kubernetes | |
6 | ||
4 | # Install Miniconda | |
5 | if [ ${TRAVIS_OS_NAME} == "osx" ]; then | |
6 | wget -O conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh | |
7 | else | |
8 | wget -O conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh | |
9 | fi | |
10 | bash conda.sh -b -p $HOME/miniconda | |
11 | source $HOME/miniconda/bin/activate | |
12 | conda config --set always_yes yes --set changeps1 no | |
13 | conda update -q conda | |
14 | conda info -a | |
15 | conda create -n python3 python=3.7 | |
16 | conda activate python3 | |
17 | conda --version | |
18 | python --version | |
19 | # Install Python packages | |
20 | conda install -c conda-forge numpy scipy urllib3 websocket-client | |
21 | python -m pip install cpplint pylint kubernetes | |
7 | 22 | |
8 | 23 | # Install googletest under home directory |
9 | 24 | GTEST_VERSION=1.8.1 |
26 | 41 | popd |
27 | 42 | |
28 | 43 | if [ ${TRAVIS_OS_NAME} == "linux" ]; then |
29 | sudo apt-get install python3-pip tree | |
44 | sudo apt-get install tree | |
30 | 45 | fi |
31 | 46 | |
32 | 47 | if [ ${TRAVIS_OS_NAME} == "osx" ]; then |
0 | # script to be sourced in travis yml | |
1 | # setup all enviroment variables | |
2 | ||
3 | export CACHE_PREFIX=${HOME}/.cache/usr | |
4 | export PATH=${HOME}/.local/bin:${PATH} | |
5 | export PATH=${PATH}:${CACHE_PREFIX}/bin | |
6 | export CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH}:${CACHE_PREFIX}/include | |
7 | export C_INCLUDE_PATH=${C_INCLUDE_PATH}:${CACHE_PREFIX}/include | |
8 | export LIBRARY_PATH=${LIBRARY_PATH}:${CACHE_PREFIX}/lib | |
9 | export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${CACHE_PREFIX}/lib | |
10 | export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${CACHE_PREFIX}/lib | |
11 | ||
12 | alias make="make -j4" | |
13 | ||
14 | # setup the cache prefix folder | |
15 | if [ ! -d ${HOME}/.cache ]; then | |
16 | mkdir ${HOME}/.cache | |
17 | fi | |
18 | ||
19 | if [ ! -d ${CACHE_PREFIX} ]; then | |
20 | mkdir ${CACHE_PREFIX} | |
21 | fi | |
22 | if [ ! -d ${CACHE_PREFIX}/include ]; then | |
23 | mkdir ${CACHE_PREFIX}/include | |
24 | fi | |
25 | if [ ! -d ${CACHE_PREFIX}/lib ]; then | |
26 | mkdir ${CACHE_PREFIX}/lib | |
27 | fi | |
28 | if [ ! -d ${CACHE_PREFIX}/bin ]; then | |
29 | mkdir ${CACHE_PREFIX}/bin | |
30 | fi | |
31 | ||
32 | # setup CUDA path if NVCC_PREFIX exists | |
33 | if [ ! -z "$NVCC_PREFIX" ]; then | |
34 | export PATH=${PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/bin | |
35 | export CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/include | |
36 | export C_INCLUDE_PATH=${C_INCLUDE_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/include | |
37 | export LIBRARY_PATH=${LIBRARY_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/lib64:${NVCC_PREFIX}/usr/lib/x86_64-linux-gnu | |
38 | export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/lib64:${NVCC_PREFIX}/usr/lib/x86_64-linux-gnu | |
39 | fi |
4 | 4 | * |
5 | 5 | * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou |
6 | 6 | */ |
7 | #define _CRT_SECURE_NO_WARNINGS | |
8 | #define _CRT_SECURE_NO_DEPRECATE | |
9 | 7 | #define NOMINMAX |
8 | #include "allreduce_base.h" | |
9 | #include <rabit/base.h> | |
10 | 10 | #include <netinet/tcp.h> |
11 | 11 | #include <cstring> |
12 | 12 | #include <map> |
13 | #include "allreduce_base.h" | |
14 | 13 | |
15 | 14 | namespace rabit { |
16 | ||
17 | namespace utils { | |
18 | bool STOP_PROCESS_ON_ERROR = true; | |
19 | } | |
20 | ||
21 | 15 | namespace engine { |
22 | 16 | // constructor |
23 | 17 | AllreduceBase::AllreduceBase(void) { |
33 | 27 | version_number = 0; |
34 | 28 | // 32 K items |
35 | 29 | reduce_ring_mincount = 32 << 10; |
30 | // 1M reducer size each time | |
31 | tree_reduce_minsize = 1 << 20; | |
36 | 32 | // tracker URL |
37 | 33 | task_id = "NULL"; |
38 | 34 | err_link = NULL; |
46 | 42 | env_vars.push_back("DMLC_TRACKER_URI"); |
47 | 43 | env_vars.push_back("DMLC_TRACKER_PORT"); |
48 | 44 | env_vars.push_back("DMLC_WORKER_CONNECT_RETRY"); |
49 | env_vars.push_back("DMLC_WORKER_STOP_PROCESS_ON_ERROR"); | |
50 | 45 | } |
51 | 46 | |
52 | 47 | // initialization function |
187 | 182 | if (!strcmp(name, "DMLC_ROLE")) dmlc_role = val; |
188 | 183 | if (!strcmp(name, "rabit_world_size")) world_size = atoi(val); |
189 | 184 | if (!strcmp(name, "rabit_hadoop_mode")) hadoop_mode = utils::StringToBool(val); |
185 | if (!strcmp(name, "rabit_tree_reduce_minsize")) tree_reduce_minsize = atoi(val); | |
190 | 186 | if (!strcmp(name, "rabit_reduce_ring_mincount")) { |
191 | 187 | reduce_ring_mincount = atoi(val); |
192 | 188 | utils::Assert(reduce_ring_mincount > 0, "rabit_reduce_ring_mincount should be greater than 0"); |
196 | 192 | } |
197 | 193 | if (!strcmp(name, "DMLC_WORKER_CONNECT_RETRY")) { |
198 | 194 | connect_retry = atoi(val); |
199 | } | |
200 | if (!strcmp(name, "DMLC_WORKER_STOP_PROCESS_ON_ERROR")) { | |
201 | if (!strcmp(val, "true")) { | |
202 | rabit::utils::STOP_PROCESS_ON_ERROR = true; | |
203 | } else if (!strcmp(val, "false")) { | |
204 | rabit::utils::STOP_PROCESS_ON_ERROR = false; | |
205 | } else { | |
206 | throw std::runtime_error("invalid value of DMLC_WORKER_STOP_PROCESS_ON_ERROR"); | |
207 | } | |
208 | 195 | } |
209 | 196 | if (!strcmp(name, "rabit_bootstrap_cache")) { |
210 | 197 | rabit_bootstrap_cache = utils::StringToBool(val); |
504 | 491 | size_t size_up_out = 0; |
505 | 492 | // size of message we received, and send in the down pass |
506 | 493 | size_t size_down_in = 0; |
494 | // minimal size of each reducer | |
495 | const size_t eachreduce = (tree_reduce_minsize / type_nbytes * type_nbytes); | |
496 | ||
507 | 497 | // initialize the link ring-buffer and pointer |
508 | 498 | for (int i = 0; i < nlink; ++i) { |
509 | 499 | if (i != parent_index) { |
560 | 550 | // read data from childs |
561 | 551 | for (int i = 0; i < nlink; ++i) { |
562 | 552 | if (i != parent_index && watcher.CheckRead(links[i].sock)) { |
563 | ReturnType ret = links[i].ReadToRingBuffer(size_up_out, total_size); | |
564 | if (ret != kSuccess) { | |
565 | return ReportError(&links[i], ret); | |
553 | // make sure to receive minimal reducer size | |
554 | // since each child reduce and sends the minimal reducer size | |
555 | while (links[i].size_read < total_size | |
556 | && links[i].size_read - size_up_reduce < eachreduce) { | |
557 | ReturnType ret = links[i].ReadToRingBuffer(size_up_out, total_size); | |
558 | if (ret != kSuccess) { | |
559 | return ReportError(&links[i], ret); | |
560 | } | |
566 | 561 | } |
567 | 562 | } |
568 | 563 | } |
582 | 577 | utils::Assert(buffer_size != 0, "must assign buffer_size"); |
583 | 578 | // round to type_n4bytes |
584 | 579 | max_reduce = (max_reduce / type_nbytes * type_nbytes); |
580 | ||
581 | // if max reduce is less than total size, we reduce multiple times of | |
582 | // eachreduce size | |
583 | if (max_reduce < total_size) | |
584 | max_reduce = max_reduce - max_reduce % eachreduce; | |
585 | ||
585 | 586 | // peform reduce, can be at most two rounds |
586 | 587 | while (size_up_reduce < max_reduce) { |
587 | 588 | // start position |
605 | 606 | // pass message up to parent, can pass data that are already been reduced |
606 | 607 | if (size_up_out < size_up_reduce) { |
607 | 608 | ssize_t len = links[parent_index].sock. |
608 | Send(sendrecvbuf + size_up_out, size_up_reduce - size_up_out); | |
609 | Send(sendrecvbuf + size_up_out, size_up_reduce - size_up_out); | |
609 | 610 | if (len != -1) { |
610 | 611 | size_up_out += static_cast<size_t>(len); |
611 | 612 | } else { |
618 | 619 | // read data from parent |
619 | 620 | if (watcher.CheckRead(links[parent_index].sock) && |
620 | 621 | total_size > size_down_in) { |
621 | ssize_t len = links[parent_index].sock. | |
622 | Recv(sendrecvbuf + size_down_in, total_size - size_down_in); | |
623 | if (len == 0) { | |
624 | links[parent_index].sock.Close(); | |
625 | return ReportError(&links[parent_index], kRecvZeroLen); | |
626 | } | |
627 | if (len != -1) { | |
628 | size_down_in += static_cast<size_t>(len); | |
629 | utils::Assert(size_down_in <= size_up_out, | |
630 | "Allreduce: boundary error"); | |
631 | } else { | |
632 | ReturnType ret = Errno2Return(); | |
633 | if (ret != kSuccess) { | |
634 | return ReportError(&links[parent_index], ret); | |
622 | size_t left_size = total_size-size_down_in; | |
623 | size_t reduce_size_min = std::min(left_size, eachreduce); | |
624 | size_t recved = 0; | |
625 | while (recved < reduce_size_min) { | |
626 | ssize_t len = links[parent_index].sock. | |
627 | Recv(sendrecvbuf + size_down_in, total_size - size_down_in); | |
628 | ||
629 | if (len == 0) { | |
630 | links[parent_index].sock.Close(); | |
631 | return ReportError(&links[parent_index], kRecvZeroLen); | |
632 | } | |
633 | if (len != -1) { | |
634 | size_down_in += static_cast<size_t>(len); | |
635 | utils::Assert(size_down_in <= size_up_out, | |
636 | "Allreduce: boundary error"); | |
637 | recved+=len; | |
638 | ||
639 | // if it receives more data than each reduce, it means the next block is sent. | |
640 | // we double the reduce_size_min or add to left_size | |
641 | while (recved > reduce_size_min) { | |
642 | reduce_size_min += std::min(left_size-reduce_size_min, eachreduce); | |
643 | } | |
644 | } else { | |
645 | ReturnType ret = Errno2Return(); | |
646 | if (ret != kSuccess) { | |
647 | return ReportError(&links[parent_index], ret); | |
648 | } | |
635 | 649 | } |
636 | 650 | } |
637 | 651 | } |
564 | 564 | int reduce_method; |
565 | 565 | // mininum count of cells to use ring based method |
566 | 566 | size_t reduce_ring_mincount; |
567 | // minimul block size per tree reduce | |
568 | size_t tree_reduce_minsize; | |
567 | 569 | // current rank |
568 | 570 | int rank; |
569 | 571 | // world size |
4 | 4 | * |
5 | 5 | * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou |
6 | 6 | */ |
7 | #define _CRT_SECURE_NO_WARNINGS | |
8 | #define _CRT_SECURE_NO_DEPRECATE | |
9 | 7 | #define NOMINMAX |
8 | #include <rabit/base.h> | |
10 | 9 | #include <chrono> |
11 | 10 | #include <thread> |
12 | 11 | #include <limits> |
167 | 166 | * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size |
168 | 167 | * \param _file caller file name used to generate unique cache key |
169 | 168 | * \param _line caller line number used to generate unique cache key |
170 | * \param _caller caller function name used to generate unique cache key | |
171 | */ | |
169 | * \param _caller caller function name used to generate unique cache key | |
170 | */ | |
172 | 171 | void AllreduceRobust::Allgather(void *sendrecvbuf, |
173 | 172 | size_t total_size, |
174 | 173 | size_t slice_begin, |
518 | 517 | } |
519 | 518 | // execute checkpoint, note: when checkpoint existing, load will not happen |
520 | 519 | _assert(RecoverExec(NULL, 0, ActionSummary::kCheckPoint, |
521 | ActionSummary::kSpecialOp, cur_cache_seq), | |
522 | "check point must return true"); | |
520 | ActionSummary::kSpecialOp, cur_cache_seq), | |
521 | "check point must return true"); | |
523 | 522 | // this is the critical region where we will change all the stored models |
524 | 523 | // increase version number |
525 | 524 | version_number += 1; |
550 | 549 | delta = utils::GetTime() - start; |
551 | 550 | // log checkpoint ack latency |
552 | 551 | if (rabit_debug) { |
553 | utils::HandleLogInfo("[%d] checkpoint ack finished version %d, take %f seconds\n", | |
554 | rank, version_number, delta); | |
552 | utils::HandleLogInfo( | |
553 | "[%d] checkpoint ack finished version %d, take %f seconds\n", rank, | |
554 | version_number, delta); | |
555 | 555 | } |
556 | 556 | } |
557 | 557 | /*! |
0 | 0 | // Copyright by Contributors |
1 | 1 | // implementations in ctypes |
2 | #define _CRT_SECURE_NO_WARNINGS | |
3 | #define _CRT_SECURE_NO_DEPRECATE | |
4 | ||
2 | #include <rabit/base.h> | |
5 | 3 | #include <cstring> |
6 | 4 | #include <string> |
7 | 5 | #include "rabit/rabit.h" |
219 | 217 | } // namespace c_api |
220 | 218 | } // namespace rabit |
221 | 219 | |
222 | bool RabitInit(int argc, char *argv[]) { | |
220 | RABIT_DLL bool RabitInit(int argc, char *argv[]) { | |
223 | 221 | return rabit::Init(argc, argv); |
224 | 222 | } |
225 | 223 | |
226 | bool RabitFinalize() { | |
224 | RABIT_DLL bool RabitFinalize() { | |
227 | 225 | return rabit::Finalize(); |
228 | 226 | } |
229 | 227 | |
230 | int RabitGetRingPrevRank() { | |
228 | RABIT_DLL int RabitGetRingPrevRank() { | |
231 | 229 | return rabit::GetRingPrevRank(); |
232 | 230 | } |
233 | 231 | |
234 | int RabitGetRank() { | |
232 | RABIT_DLL int RabitGetRank() { | |
235 | 233 | return rabit::GetRank(); |
236 | 234 | } |
237 | 235 | |
238 | int RabitGetWorldSize() { | |
236 | RABIT_DLL int RabitGetWorldSize() { | |
239 | 237 | return rabit::GetWorldSize(); |
240 | 238 | } |
241 | 239 | |
242 | int RabitIsDistributed() { | |
240 | RABIT_DLL int RabitIsDistributed() { | |
243 | 241 | return rabit::IsDistributed(); |
244 | 242 | } |
245 | 243 | |
246 | void RabitTrackerPrint(const char *msg) { | |
244 | RABIT_DLL void RabitTrackerPrint(const char *msg) { | |
247 | 245 | std::string m(msg); |
248 | 246 | rabit::TrackerPrint(m); |
249 | 247 | } |
250 | 248 | |
251 | void RabitGetProcessorName(char *out_name, | |
252 | rbt_ulong *out_len, | |
253 | rbt_ulong max_len) { | |
249 | RABIT_DLL void RabitGetProcessorName(char *out_name, | |
250 | rbt_ulong *out_len, | |
251 | rbt_ulong max_len) { | |
254 | 252 | std::string s = rabit::GetProcessorName(); |
255 | 253 | if (s.length() > max_len) { |
256 | 254 | s.resize(max_len - 1); |
259 | 257 | *out_len = static_cast<rbt_ulong>(s.length()); |
260 | 258 | } |
261 | 259 | |
262 | void RabitBroadcast(void *sendrecv_data, | |
263 | rbt_ulong size, int root) { | |
260 | RABIT_DLL void RabitBroadcast(void *sendrecv_data, | |
261 | rbt_ulong size, int root) { | |
264 | 262 | rabit::Broadcast(sendrecv_data, size, root); |
265 | 263 | } |
266 | 264 | |
267 | void RabitAllgather(void *sendrecvbuf_, | |
268 | size_t total_size, | |
269 | size_t beginIndex, | |
270 | size_t size_node_slice, | |
271 | size_t size_prev_slice, | |
272 | int enum_dtype) { | |
265 | RABIT_DLL void RabitAllgather(void *sendrecvbuf_, size_t total_size, | |
266 | size_t beginIndex, size_t size_node_slice, | |
267 | size_t size_prev_slice, int enum_dtype) { | |
273 | 268 | rabit::c_api::Allgather(sendrecvbuf_, |
274 | 269 | total_size, |
275 | 270 | beginIndex, |
278 | 273 | static_cast<rabit::engine::mpi::DataType>(enum_dtype)); |
279 | 274 | } |
280 | 275 | |
281 | ||
282 | void RabitAllreduce(void *sendrecvbuf, | |
283 | size_t count, | |
284 | int enum_dtype, | |
285 | int enum_op, | |
286 | void (*prepare_fun)(void *arg), | |
287 | void *prepare_arg) { | |
276 | RABIT_DLL void RabitAllreduce(void *sendrecvbuf, size_t count, int enum_dtype, | |
277 | int enum_op, void (*prepare_fun)(void *arg), | |
278 | void *prepare_arg) { | |
288 | 279 | rabit::c_api::Allreduce |
289 | 280 | (sendrecvbuf, count, |
290 | 281 | static_cast<rabit::engine::mpi::DataType>(enum_dtype), |
292 | 283 | prepare_fun, prepare_arg); |
293 | 284 | } |
294 | 285 | |
295 | int RabitLoadCheckPoint(char **out_global_model, | |
296 | rbt_ulong *out_global_len, | |
297 | char **out_local_model, | |
298 | rbt_ulong *out_local_len) { | |
286 | RABIT_DLL int RabitLoadCheckPoint(char **out_global_model, | |
287 | rbt_ulong *out_global_len, | |
288 | char **out_local_model, | |
289 | rbt_ulong *out_local_len) { | |
299 | 290 | // NOTE: this function is not thread-safe |
300 | 291 | using rabit::BeginPtr; |
301 | 292 | using namespace rabit::c_api; // NOLINT(*) |
320 | 311 | return version; |
321 | 312 | } |
322 | 313 | |
323 | void RabitCheckPoint(const char *global_model, | |
324 | rbt_ulong global_len, | |
325 | const char *local_model, | |
326 | rbt_ulong local_len) { | |
314 | RABIT_DLL void RabitCheckPoint(const char *global_model, rbt_ulong global_len, | |
315 | const char *local_model, rbt_ulong local_len) { | |
327 | 316 | using namespace rabit::c_api; // NOLINT(*) |
328 | 317 | WriteWrapper sg(global_model, global_len); |
329 | 318 | WriteWrapper sl(local_model, local_len); |
334 | 323 | } |
335 | 324 | } |
336 | 325 | |
337 | int RabitVersionNumber() { | |
326 | RABIT_DLL int RabitVersionNumber() { | |
338 | 327 | return rabit::VersionNumber(); |
339 | 328 | } |
340 | 329 | |
341 | int RabitLinkTag() { | |
330 | RABIT_DLL int RabitLinkTag() { | |
342 | 331 | return 0; |
343 | 332 | } |
5 | 5 | * |
6 | 6 | * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou |
7 | 7 | */ |
8 | #define _CRT_SECURE_NO_WARNINGS | |
9 | #define _CRT_SECURE_NO_DEPRECATE | |
10 | #define NOMINMAX | |
11 | ||
8 | #include <rabit/base.h> | |
12 | 9 | #include <memory> |
13 | 10 | #include "rabit/internal/engine.h" |
14 | 11 | #include "allreduce_base.h" |
5 | 5 | * \author Tianqi Chen |
6 | 6 | */ |
7 | 7 | // define use MOCK, os we will use mock Manager |
8 | #define _CRT_SECURE_NO_WARNINGS | |
9 | #define _CRT_SECURE_NO_DEPRECATE | |
10 | 8 | #define NOMINMAX |
9 | #include <rabit/base.h> | |
11 | 10 | // switch engine to AllreduceMock |
12 | 11 | #define RABIT_USE_BASE |
13 | 12 | #include "engine.cc" |
5 | 5 | * This is usually NOT needed, use engine_mpi or engine for real distributed version |
6 | 6 | * \author Tianqi Chen |
7 | 7 | */ |
8 | #define _CRT_SECURE_NO_WARNINGS | |
9 | #define _CRT_SECURE_NO_DEPRECATE | |
10 | 8 | #define NOMINMAX |
11 | 9 | |
10 | #include <rabit/base.h> | |
12 | 11 | #include "rabit/internal/engine.h" |
13 | 12 | |
14 | 13 | namespace rabit { |
15 | ||
16 | namespace utils { | |
17 | bool STOP_PROCESS_ON_ERROR = true; | |
18 | } | |
19 | ||
20 | 14 | namespace engine { |
21 | 15 | /*! \brief EmptyEngine */ |
22 | 16 | class EmptyEngine : public IEngine { |
5 | 5 | * \author Tianqi Chen |
6 | 6 | */ |
7 | 7 | // define use MOCK, os we will use mock Manager |
8 | #define _CRT_SECURE_NO_WARNINGS | |
9 | #define _CRT_SECURE_NO_DEPRECATE | |
10 | 8 | #define NOMINMAX |
11 | 9 | // switch engine to AllreduceMock |
12 | 10 | #define RABIT_USE_MOCK |
11 | #include <rabit/base.h> | |
13 | 12 | #include "allreduce_mock.h" |
14 | 13 | #include "engine.cc" |
15 | 14 |
5 | 5 | * |
6 | 6 | * \author Tianqi Chen |
7 | 7 | */ |
8 | #define _CRT_SECURE_NO_WARNINGS | |
9 | #define _CRT_SECURE_NO_DEPRECATE | |
10 | 8 | #define NOMINMAX |
11 | 9 | #include <mpi.h> |
10 | #include <rabit/base.h> | |
12 | 11 | #include <cstdio> |
12 | #include <string> | |
13 | 13 | #include "rabit/internal/engine.h" |
14 | 14 | #include "rabit/internal/utils.h" |
15 | 15 | |
16 | 16 | namespace rabit { |
17 | ||
18 | namespace utils { | |
19 | bool STOP_PROCESS_ON_ERROR = true; | |
20 | } | |
21 | ||
22 | 17 | namespace engine { |
23 | 18 | /*! \brief implementation of engine using MPI */ |
24 | 19 | class MPIEngine : public IEngine { |
2 | 2 | add_executable( |
3 | 3 | unit_tests |
4 | 4 | test_io.cc |
5 | test_utils.cc | |
5 | 6 | allreduce_robust_test.cc |
6 | 7 | allreduce_base_test.cc |
7 | 8 | allreduce_mock_test.cc |
16 | 16 | char* argv[] = {cmd}; |
17 | 17 | m.Init(1, argv); |
18 | 18 | m.rank = 0; |
19 | EXPECT_EXIT(m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr), ::testing::ExitedWithCode(255), ""); | |
19 | EXPECT_THROW(m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr), dmlc::Error); | |
20 | 20 | } |
21 | 21 | |
22 | 22 | TEST(allreduce_mock, mock_broadcast) |
31 | 31 | m.rank = 0; |
32 | 32 | m.version_number=1; |
33 | 33 | m.seq_counter=2; |
34 | EXPECT_EXIT(m.Broadcast(nullptr,0,0), ::testing::ExitedWithCode(255), ""); | |
34 | EXPECT_THROW(m.Broadcast(nullptr,0,0), dmlc::Error); | |
35 | 35 | } |
2 | 2 | |
3 | 3 | #include <string> |
4 | 4 | #include <iostream> |
5 | #include <dmlc/logging.h> | |
5 | 6 | #include "../../src/allreduce_mock.h" |
6 | 7 | |
7 | 8 | TEST(allreduce_mock, mock_allreduce) |
16 | 17 | char* argv[] = {cmd}; |
17 | 18 | m.Init(1, argv); |
18 | 19 | m.rank = 0; |
19 | EXPECT_EXIT(m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr), ::testing::ExitedWithCode(255), ""); | |
20 | EXPECT_THROW({m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr);}, dmlc::Error); | |
20 | 21 | } |
21 | 22 | |
22 | 23 | TEST(allreduce_mock, mock_broadcast) |
31 | 32 | m.rank = 0; |
32 | 33 | m.version_number=1; |
33 | 34 | m.seq_counter=2; |
34 | EXPECT_EXIT(m.Broadcast(nullptr,0,0), ::testing::ExitedWithCode(255), ""); | |
35 | EXPECT_THROW({m.Broadcast(nullptr,0,0);}, dmlc::Error); | |
35 | 36 | } |
36 | 37 | |
37 | 38 | TEST(allreduce_mock, mock_gather) |
46 | 47 | m.rank = 3; |
47 | 48 | m.version_number=13; |
48 | 49 | m.seq_counter=22; |
49 | EXPECT_EXIT(m.Allgather(nullptr,0,0,0,0), ::testing::ExitedWithCode(255), ""); | |
50 | EXPECT_THROW({m.Allgather(nullptr,0,0,0,0);}, dmlc::Error); | |
50 | 51 | } |
0 | #include <gtest/gtest.h> | |
1 | #include <rabit/internal/utils.h> | |
2 | ||
3 | TEST(Utils, Assert) { | |
4 | EXPECT_THROW({rabit::utils::Assert(false, "foo");}, dmlc::Error); | |
5 | } |
12 | 12 | |
13 | 13 | # this experiment test recovery with actually process exit, use keepalive to keep program alive |
14 | 14 | model_recover_10_10k: |
15 | $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 rabit_bootstrap_cache=true rabit_debug=true rabit_reduce_ring_mincount=1 rabit_timeout=true rabit_timeout_sec=5 | |
15 | python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 rabit_bootstrap_cache=true rabit_debug=true rabit_reduce_ring_mincount=1 rabit_timeout=true rabit_timeout_sec=5 | |
16 | 16 | |
17 | 17 | model_recover_10_10k_die_same: |
18 | $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 rabit_bootstrap_cache=1 | |
18 | python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 rabit_bootstrap_cache=1 | |
19 | 19 | |
20 | 20 | model_recover_10_10k_die_hard: |
21 | $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 rabit_bootstrap_cache=1 | |
21 | python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 rabit_bootstrap_cache=1 | |
22 | 22 | |
23 | 23 | local_recover_10_10k: |
24 | $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 | |
24 | python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 | |
25 | 25 | |
26 | 26 | pylocal_recover_10_10k: |
27 | $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 | |
27 | python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 | |
28 | 28 | |
29 | 29 | lazy_recover_10_10k_die_hard: |
30 | $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 | |
30 | python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 | |
31 | 31 | |
32 | 32 | lazy_recover_10_10k_die_same: |
33 | $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 | |
33 | python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 | |
34 | 34 | |
35 | 35 | ringallreduce_10_10k: |
36 | $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 rabit_reduce_ring_mincount=10 | |
36 | python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 rabit_reduce_ring_mincount=10 |