New upstream snapshot.
Debian Janitor
2 years ago
0 | 0 | # Rabit: Reliable Allreduce and Broadcast Interface |
1 | 1 | [![Build Status](https://travis-ci.org/dmlc/rabit.svg?branch=master)](https://travis-ci.org/dmlc/rabit) |
2 | 2 | [![Documentation Status](https://readthedocs.org/projects/rabit/badge/?version=latest)](http://rabit.readthedocs.org/) |
3 | ||
4 | ## Recent developments of Rabit have been moved into [dmlc/xgboost](https://github.com/dmlc/xgboost). See discussion in [dmlc/xgboost#5995](https://github.com/dmlc/xgboost/issues/5995). | |
3 | 5 | |
4 | 6 | rabit is a light weight library that provides a fault tolerant interface of Allreduce and Broadcast. It is designed to support easy implementations of distributed machine learning programs, many of which fall naturally under the Allreduce abstraction. The goal of rabit is to support ***portable*** , ***scalable*** and ***reliable*** distributed machine learning programs. |
5 | 7 |
0 | rabit (0.0~git20201105.f307ace-1) UNRELEASED; urgency=low | |
1 | ||
2 | * New upstream snapshot. | |
3 | ||
4 | -- Debian Janitor <janitor@jelmer.uk> Thu, 19 Aug 2021 08:20:21 -0000 | |
5 | ||
0 | 6 | rabit (0.0~git20200628.74bf00a-2) unstable; urgency=medium |
1 | 7 | |
2 | 8 | * Stop tracking C++ symbols. |
64 | 64 | /*! \brief error message buffer length */ |
65 | 65 | const int kPrintBuffer = 1 << 12; |
66 | 66 | |
67 | /*! \brief we may want to keep the process alive when there are multiple workers | |
68 | * co-locate in the same process */ | |
69 | extern bool STOP_PROCESS_ON_ERROR; | |
70 | ||
71 | 67 | /* \brief Case-insensitive string comparison */ |
72 | 68 | inline int CompareStringsCaseInsensitive(const char* s1, const char* s2) { |
73 | 69 | #ifdef _MSC_VER |
88 | 84 | * \param msg error message |
89 | 85 | */ |
90 | 86 | inline void HandleAssertError(const char *msg) { |
91 | if (STOP_PROCESS_ON_ERROR) { | |
92 | fprintf(stderr, "AssertError:%s, shutting down process\n", msg); | |
93 | exit(-1); | |
94 | } else { | |
95 | fprintf(stderr, "AssertError:%s, rabit is configured to keep process running\n", msg); | |
96 | throw dmlc::Error(msg); | |
97 | } | |
87 | fprintf(stderr, | |
88 | "AssertError:%s, rabit is configured to keep process running\n", msg); | |
89 | throw dmlc::Error(msg); | |
98 | 90 | } |
99 | 91 | /*! |
100 | 92 | * \brief handling of Check error, caused by inappropriate input |
101 | 93 | * \param msg error message |
102 | 94 | */ |
103 | 95 | inline void HandleCheckError(const char *msg) { |
104 | if (STOP_PROCESS_ON_ERROR) { | |
105 | fprintf(stderr, "%s, shutting down process\n", msg); | |
106 | exit(-1); | |
107 | } else { | |
108 | fprintf(stderr, "%s, rabit is configured to keep process running\n", msg); | |
109 | throw dmlc::Error(msg); | |
110 | } | |
96 | fprintf(stderr, "%s, rabit is configured to keep process running\n", msg); | |
97 | throw dmlc::Error(msg); | |
111 | 98 | } |
112 | 99 | inline void HandlePrint(const char *msg) { |
113 | 100 | printf("%s", msg); |
5 | 5 | * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou |
6 | 6 | */ |
7 | 7 | #define NOMINMAX |
8 | #include "allreduce_base.h" | |
8 | 9 | #include <rabit/base.h> |
9 | 10 | #include <netinet/tcp.h> |
10 | 11 | #include <cstring> |
11 | 12 | #include <map> |
12 | #include "allreduce_base.h" | |
13 | 13 | |
14 | 14 | namespace rabit { |
15 | ||
16 | namespace utils { | |
17 | bool STOP_PROCESS_ON_ERROR = true; | |
18 | } | |
19 | ||
20 | 15 | namespace engine { |
21 | 16 | // constructor |
22 | 17 | AllreduceBase::AllreduceBase(void) { |
32 | 27 | version_number = 0; |
33 | 28 | // 32 K items |
34 | 29 | reduce_ring_mincount = 32 << 10; |
30 | // 1M reducer size each time | |
31 | tree_reduce_minsize = 1 << 20; | |
35 | 32 | // tracker URL |
36 | 33 | task_id = "NULL"; |
37 | 34 | err_link = NULL; |
45 | 42 | env_vars.push_back("DMLC_TRACKER_URI"); |
46 | 43 | env_vars.push_back("DMLC_TRACKER_PORT"); |
47 | 44 | env_vars.push_back("DMLC_WORKER_CONNECT_RETRY"); |
48 | env_vars.push_back("DMLC_WORKER_STOP_PROCESS_ON_ERROR"); | |
49 | 45 | } |
50 | 46 | |
51 | 47 | // initialization function |
186 | 182 | if (!strcmp(name, "DMLC_ROLE")) dmlc_role = val; |
187 | 183 | if (!strcmp(name, "rabit_world_size")) world_size = atoi(val); |
188 | 184 | if (!strcmp(name, "rabit_hadoop_mode")) hadoop_mode = utils::StringToBool(val); |
185 | if (!strcmp(name, "rabit_tree_reduce_minsize")) tree_reduce_minsize = atoi(val); | |
189 | 186 | if (!strcmp(name, "rabit_reduce_ring_mincount")) { |
190 | 187 | reduce_ring_mincount = atoi(val); |
191 | 188 | utils::Assert(reduce_ring_mincount > 0, "rabit_reduce_ring_mincount should be greater than 0"); |
195 | 192 | } |
196 | 193 | if (!strcmp(name, "DMLC_WORKER_CONNECT_RETRY")) { |
197 | 194 | connect_retry = atoi(val); |
198 | } | |
199 | if (!strcmp(name, "DMLC_WORKER_STOP_PROCESS_ON_ERROR")) { | |
200 | if (!strcmp(val, "true")) { | |
201 | rabit::utils::STOP_PROCESS_ON_ERROR = true; | |
202 | } else if (!strcmp(val, "false")) { | |
203 | rabit::utils::STOP_PROCESS_ON_ERROR = false; | |
204 | } else { | |
205 | throw std::runtime_error("invalid value of DMLC_WORKER_STOP_PROCESS_ON_ERROR"); | |
206 | } | |
207 | 195 | } |
208 | 196 | if (!strcmp(name, "rabit_bootstrap_cache")) { |
209 | 197 | rabit_bootstrap_cache = utils::StringToBool(val); |
503 | 491 | size_t size_up_out = 0; |
504 | 492 | // size of message we received, and send in the down pass |
505 | 493 | size_t size_down_in = 0; |
494 | // minimal size of each reducer | |
495 | const size_t eachreduce = (tree_reduce_minsize / type_nbytes * type_nbytes); | |
496 | ||
506 | 497 | // initialize the link ring-buffer and pointer |
507 | 498 | for (int i = 0; i < nlink; ++i) { |
508 | 499 | if (i != parent_index) { |
559 | 550 | // read data from childs |
560 | 551 | for (int i = 0; i < nlink; ++i) { |
561 | 552 | if (i != parent_index && watcher.CheckRead(links[i].sock)) { |
562 | ReturnType ret = links[i].ReadToRingBuffer(size_up_out, total_size); | |
563 | if (ret != kSuccess) { | |
564 | return ReportError(&links[i], ret); | |
553 | // make sure to receive minimal reducer size | |
554 | // since each child reduce and sends the minimal reducer size | |
555 | while (links[i].size_read < total_size | |
556 | && links[i].size_read - size_up_reduce < eachreduce) { | |
557 | ReturnType ret = links[i].ReadToRingBuffer(size_up_out, total_size); | |
558 | if (ret != kSuccess) { | |
559 | return ReportError(&links[i], ret); | |
560 | } | |
565 | 561 | } |
566 | 562 | } |
567 | 563 | } |
581 | 577 | utils::Assert(buffer_size != 0, "must assign buffer_size"); |
582 | 578 | // round to type_n4bytes |
583 | 579 | max_reduce = (max_reduce / type_nbytes * type_nbytes); |
580 | ||
581 | // if max reduce is less than total size, we reduce multiple times of | |
582 | // eachreduce size | |
583 | if (max_reduce < total_size) | |
584 | max_reduce = max_reduce - max_reduce % eachreduce; | |
585 | ||
584 | 586 | // peform reduce, can be at most two rounds |
585 | 587 | while (size_up_reduce < max_reduce) { |
586 | 588 | // start position |
604 | 606 | // pass message up to parent, can pass data that are already been reduced |
605 | 607 | if (size_up_out < size_up_reduce) { |
606 | 608 | ssize_t len = links[parent_index].sock. |
607 | Send(sendrecvbuf + size_up_out, size_up_reduce - size_up_out); | |
609 | Send(sendrecvbuf + size_up_out, size_up_reduce - size_up_out); | |
608 | 610 | if (len != -1) { |
609 | 611 | size_up_out += static_cast<size_t>(len); |
610 | 612 | } else { |
617 | 619 | // read data from parent |
618 | 620 | if (watcher.CheckRead(links[parent_index].sock) && |
619 | 621 | total_size > size_down_in) { |
620 | ssize_t len = links[parent_index].sock. | |
621 | Recv(sendrecvbuf + size_down_in, total_size - size_down_in); | |
622 | if (len == 0) { | |
623 | links[parent_index].sock.Close(); | |
624 | return ReportError(&links[parent_index], kRecvZeroLen); | |
625 | } | |
626 | if (len != -1) { | |
627 | size_down_in += static_cast<size_t>(len); | |
628 | utils::Assert(size_down_in <= size_up_out, | |
629 | "Allreduce: boundary error"); | |
630 | } else { | |
631 | ReturnType ret = Errno2Return(); | |
632 | if (ret != kSuccess) { | |
633 | return ReportError(&links[parent_index], ret); | |
622 | size_t left_size = total_size-size_down_in; | |
623 | size_t reduce_size_min = std::min(left_size, eachreduce); | |
624 | size_t recved = 0; | |
625 | while (recved < reduce_size_min) { | |
626 | ssize_t len = links[parent_index].sock. | |
627 | Recv(sendrecvbuf + size_down_in, total_size - size_down_in); | |
628 | ||
629 | if (len == 0) { | |
630 | links[parent_index].sock.Close(); | |
631 | return ReportError(&links[parent_index], kRecvZeroLen); | |
632 | } | |
633 | if (len != -1) { | |
634 | size_down_in += static_cast<size_t>(len); | |
635 | utils::Assert(size_down_in <= size_up_out, | |
636 | "Allreduce: boundary error"); | |
637 | recved+=len; | |
638 | ||
639 | // if it receives more data than each reduce, it means the next block is sent. | |
640 | // we double the reduce_size_min or add to left_size | |
641 | while (recved > reduce_size_min) { | |
642 | reduce_size_min += std::min(left_size-reduce_size_min, eachreduce); | |
643 | } | |
644 | } else { | |
645 | ReturnType ret = Errno2Return(); | |
646 | if (ret != kSuccess) { | |
647 | return ReportError(&links[parent_index], ret); | |
648 | } | |
634 | 649 | } |
635 | 650 | } |
636 | 651 | } |
564 | 564 | int reduce_method; |
565 | 565 | // mininum count of cells to use ring based method |
566 | 566 | size_t reduce_ring_mincount; |
567 | // minimul block size per tree reduce | |
568 | size_t tree_reduce_minsize; | |
567 | 569 | // current rank |
568 | 570 | int rank; |
569 | 571 | // world size |
166 | 166 | * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size |
167 | 167 | * \param _file caller file name used to generate unique cache key |
168 | 168 | * \param _line caller line number used to generate unique cache key |
169 | * \param _caller caller function name used to generate unique cache key | |
170 | */ | |
169 | * \param _caller caller function name used to generate unique cache key | |
170 | */ | |
171 | 171 | void AllreduceRobust::Allgather(void *sendrecvbuf, |
172 | 172 | size_t total_size, |
173 | 173 | size_t slice_begin, |
517 | 517 | } |
518 | 518 | // execute checkpoint, note: when checkpoint existing, load will not happen |
519 | 519 | _assert(RecoverExec(NULL, 0, ActionSummary::kCheckPoint, |
520 | ActionSummary::kSpecialOp, cur_cache_seq), | |
521 | "check point must return true"); | |
520 | ActionSummary::kSpecialOp, cur_cache_seq), | |
521 | "check point must return true"); | |
522 | 522 | // this is the critical region where we will change all the stored models |
523 | 523 | // increase version number |
524 | 524 | version_number += 1; |
549 | 549 | delta = utils::GetTime() - start; |
550 | 550 | // log checkpoint ack latency |
551 | 551 | if (rabit_debug) { |
552 | utils::HandleLogInfo("[%d] checkpoint ack finished version %d, take %f seconds\n", | |
553 | rank, version_number, delta); | |
552 | utils::HandleLogInfo( | |
553 | "[%d] checkpoint ack finished version %d, take %f seconds\n", rank, | |
554 | version_number, delta); | |
554 | 555 | } |
555 | 556 | } |
556 | 557 | /*! |
11 | 11 | #include "rabit/internal/engine.h" |
12 | 12 | |
13 | 13 | namespace rabit { |
14 | ||
15 | namespace utils { | |
16 | bool STOP_PROCESS_ON_ERROR = true; | |
17 | } | |
18 | ||
19 | 14 | namespace engine { |
20 | 15 | /*! \brief EmptyEngine */ |
21 | 16 | class EmptyEngine : public IEngine { |
6 | 6 | * \author Tianqi Chen |
7 | 7 | */ |
8 | 8 | #define NOMINMAX |
9 | #include <mpi.h> | |
9 | 10 | #include <rabit/base.h> |
10 | #include <mpi.h> | |
11 | 11 | #include <cstdio> |
12 | #include <string> | |
12 | 13 | #include "rabit/internal/engine.h" |
13 | 14 | #include "rabit/internal/utils.h" |
14 | 15 | |
15 | 16 | namespace rabit { |
16 | ||
17 | namespace utils { | |
18 | bool STOP_PROCESS_ON_ERROR = true; | |
19 | } | |
20 | ||
21 | 17 | namespace engine { |
22 | 18 | /*! \brief implementation of engine using MPI */ |
23 | 19 | class MPIEngine : public IEngine { |
2 | 2 | add_executable( |
3 | 3 | unit_tests |
4 | 4 | test_io.cc |
5 | test_utils.cc | |
5 | 6 | allreduce_robust_test.cc |
6 | 7 | allreduce_base_test.cc |
7 | 8 | allreduce_mock_test.cc |
16 | 16 | char* argv[] = {cmd}; |
17 | 17 | m.Init(1, argv); |
18 | 18 | m.rank = 0; |
19 | EXPECT_EXIT(m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr), ::testing::ExitedWithCode(255), ""); | |
19 | EXPECT_THROW(m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr), dmlc::Error); | |
20 | 20 | } |
21 | 21 | |
22 | 22 | TEST(allreduce_mock, mock_broadcast) |
31 | 31 | m.rank = 0; |
32 | 32 | m.version_number=1; |
33 | 33 | m.seq_counter=2; |
34 | EXPECT_EXIT(m.Broadcast(nullptr,0,0), ::testing::ExitedWithCode(255), ""); | |
34 | EXPECT_THROW(m.Broadcast(nullptr,0,0), dmlc::Error); | |
35 | 35 | } |
2 | 2 | |
3 | 3 | #include <string> |
4 | 4 | #include <iostream> |
5 | #include <dmlc/logging.h> | |
5 | 6 | #include "../../src/allreduce_mock.h" |
6 | 7 | |
7 | 8 | TEST(allreduce_mock, mock_allreduce) |
16 | 17 | char* argv[] = {cmd}; |
17 | 18 | m.Init(1, argv); |
18 | 19 | m.rank = 0; |
19 | EXPECT_EXIT(m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr), ::testing::ExitedWithCode(255), ""); | |
20 | EXPECT_THROW({m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr);}, dmlc::Error); | |
20 | 21 | } |
21 | 22 | |
22 | 23 | TEST(allreduce_mock, mock_broadcast) |
31 | 32 | m.rank = 0; |
32 | 33 | m.version_number=1; |
33 | 34 | m.seq_counter=2; |
34 | EXPECT_EXIT(m.Broadcast(nullptr,0,0), ::testing::ExitedWithCode(255), ""); | |
35 | EXPECT_THROW({m.Broadcast(nullptr,0,0);}, dmlc::Error); | |
35 | 36 | } |
36 | 37 | |
37 | 38 | TEST(allreduce_mock, mock_gather) |
46 | 47 | m.rank = 3; |
47 | 48 | m.version_number=13; |
48 | 49 | m.seq_counter=22; |
49 | EXPECT_EXIT(m.Allgather(nullptr,0,0,0,0), ::testing::ExitedWithCode(255), ""); | |
50 | EXPECT_THROW({m.Allgather(nullptr,0,0,0,0);}, dmlc::Error); | |
50 | 51 | } |