New upstream version 0.0~git20200628.74bf00a
Mo Zhou
3 years ago
64 | 64 | /*! \brief error message buffer length */ |
65 | 65 | const int kPrintBuffer = 1 << 12; |
66 | 66 | |
67 | /*! \brief we may want to keep the process alive when there are multiple workers | |
68 | * co-locate in the same process */ | |
69 | extern bool STOP_PROCESS_ON_ERROR; | |
70 | ||
67 | 71 | /* \brief Case-insensitive string comparison */ |
68 | 72 | inline int CompareStringsCaseInsensitive(const char* s1, const char* s2) { |
69 | 73 | #ifdef _MSC_VER |
84 | 88 | * \param msg error message |
85 | 89 | */ |
86 | 90 | inline void HandleAssertError(const char *msg) { |
87 | fprintf(stderr, | |
88 | "AssertError:%s, rabit is configured to keep process running\n", msg); | |
89 | throw dmlc::Error(msg); | |
91 | if (STOP_PROCESS_ON_ERROR) { | |
92 | fprintf(stderr, "AssertError:%s, shutting down process\n", msg); | |
93 | exit(-1); | |
94 | } else { | |
95 | fprintf(stderr, "AssertError:%s, rabit is configured to keep process running\n", msg); | |
96 | throw dmlc::Error(msg); | |
97 | } | |
90 | 98 | } |
91 | 99 | /*! |
92 | 100 | * \brief handling of Check error, caused by inappropriate input |
93 | 101 | * \param msg error message |
94 | 102 | */ |
95 | 103 | inline void HandleCheckError(const char *msg) { |
96 | fprintf(stderr, "%s, rabit is configured to keep process running\n", msg); | |
97 | throw dmlc::Error(msg); | |
104 | if (STOP_PROCESS_ON_ERROR) { | |
105 | fprintf(stderr, "%s, shutting down process\n", msg); | |
106 | exit(-1); | |
107 | } else { | |
108 | fprintf(stderr, "%s, rabit is configured to keep process running\n", msg); | |
109 | throw dmlc::Error(msg); | |
110 | } | |
98 | 111 | } |
99 | 112 | inline void HandlePrint(const char *msg) { |
100 | 113 | printf("%s", msg); |
5 | 5 | * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou |
6 | 6 | */ |
7 | 7 | #define NOMINMAX |
8 | #include "allreduce_base.h" | |
9 | 8 | #include <rabit/base.h> |
10 | 9 | #include <netinet/tcp.h> |
11 | 10 | #include <cstring> |
12 | 11 | #include <map> |
12 | #include "allreduce_base.h" | |
13 | 13 | |
14 | 14 | namespace rabit { |
15 | ||
16 | namespace utils { | |
17 | bool STOP_PROCESS_ON_ERROR = true; | |
18 | } | |
19 | ||
15 | 20 | namespace engine { |
16 | 21 | // constructor |
17 | 22 | AllreduceBase::AllreduceBase(void) { |
27 | 32 | version_number = 0; |
28 | 33 | // 32 K items |
29 | 34 | reduce_ring_mincount = 32 << 10; |
30 | // 1M reducer size each time | |
31 | tree_reduce_minsize = 1 << 20; | |
32 | 35 | // tracker URL |
33 | 36 | task_id = "NULL"; |
34 | 37 | err_link = NULL; |
42 | 45 | env_vars.push_back("DMLC_TRACKER_URI"); |
43 | 46 | env_vars.push_back("DMLC_TRACKER_PORT"); |
44 | 47 | env_vars.push_back("DMLC_WORKER_CONNECT_RETRY"); |
48 | env_vars.push_back("DMLC_WORKER_STOP_PROCESS_ON_ERROR"); | |
45 | 49 | } |
46 | 50 | |
47 | 51 | // initialization function |
182 | 186 | if (!strcmp(name, "DMLC_ROLE")) dmlc_role = val; |
183 | 187 | if (!strcmp(name, "rabit_world_size")) world_size = atoi(val); |
184 | 188 | if (!strcmp(name, "rabit_hadoop_mode")) hadoop_mode = utils::StringToBool(val); |
185 | if (!strcmp(name, "rabit_tree_reduce_minsize")) tree_reduce_minsize = atoi(val); | |
186 | 189 | if (!strcmp(name, "rabit_reduce_ring_mincount")) { |
187 | 190 | reduce_ring_mincount = atoi(val); |
188 | 191 | utils::Assert(reduce_ring_mincount > 0, "rabit_reduce_ring_mincount should be greater than 0"); |
192 | 195 | } |
193 | 196 | if (!strcmp(name, "DMLC_WORKER_CONNECT_RETRY")) { |
194 | 197 | connect_retry = atoi(val); |
198 | } | |
199 | if (!strcmp(name, "DMLC_WORKER_STOP_PROCESS_ON_ERROR")) { | |
200 | if (!strcmp(val, "true")) { | |
201 | rabit::utils::STOP_PROCESS_ON_ERROR = true; | |
202 | } else if (!strcmp(val, "false")) { | |
203 | rabit::utils::STOP_PROCESS_ON_ERROR = false; | |
204 | } else { | |
205 | throw std::runtime_error("invalid value of DMLC_WORKER_STOP_PROCESS_ON_ERROR"); | |
206 | } | |
195 | 207 | } |
196 | 208 | if (!strcmp(name, "rabit_bootstrap_cache")) { |
197 | 209 | rabit_bootstrap_cache = utils::StringToBool(val); |
491 | 503 | size_t size_up_out = 0; |
492 | 504 | // size of message we received, and send in the down pass |
493 | 505 | size_t size_down_in = 0; |
494 | // minimal size of each reducer | |
495 | const size_t eachreduce = (tree_reduce_minsize / type_nbytes * type_nbytes); | |
496 | ||
497 | 506 | // initialize the link ring-buffer and pointer |
498 | 507 | for (int i = 0; i < nlink; ++i) { |
499 | 508 | if (i != parent_index) { |
550 | 559 | // read data from childs |
551 | 560 | for (int i = 0; i < nlink; ++i) { |
552 | 561 | if (i != parent_index && watcher.CheckRead(links[i].sock)) { |
553 | // make sure to receive minimal reducer size | |
554 | // since each child reduce and sends the minimal reducer size | |
555 | while (links[i].size_read < total_size | |
556 | && links[i].size_read - size_up_reduce < eachreduce) { | |
557 | ReturnType ret = links[i].ReadToRingBuffer(size_up_out, total_size); | |
558 | if (ret != kSuccess) { | |
559 | return ReportError(&links[i], ret); | |
560 | } | |
562 | ReturnType ret = links[i].ReadToRingBuffer(size_up_out, total_size); | |
563 | if (ret != kSuccess) { | |
564 | return ReportError(&links[i], ret); | |
561 | 565 | } |
562 | 566 | } |
563 | 567 | } |
577 | 581 | utils::Assert(buffer_size != 0, "must assign buffer_size"); |
578 | 582 | // round to type_n4bytes |
579 | 583 | max_reduce = (max_reduce / type_nbytes * type_nbytes); |
580 | ||
581 | // if max reduce is less than total size, we reduce multiple times of | |
582 | // eachreduce size | |
583 | if (max_reduce < total_size) | |
584 | max_reduce = max_reduce - max_reduce % eachreduce; | |
585 | ||
586 | 584 | // peform reduce, can be at most two rounds |
587 | 585 | while (size_up_reduce < max_reduce) { |
588 | 586 | // start position |
606 | 604 | // pass message up to parent, can pass data that are already been reduced |
607 | 605 | if (size_up_out < size_up_reduce) { |
608 | 606 | ssize_t len = links[parent_index].sock. |
609 | Send(sendrecvbuf + size_up_out, size_up_reduce - size_up_out); | |
607 | Send(sendrecvbuf + size_up_out, size_up_reduce - size_up_out); | |
610 | 608 | if (len != -1) { |
611 | 609 | size_up_out += static_cast<size_t>(len); |
612 | 610 | } else { |
619 | 617 | // read data from parent |
620 | 618 | if (watcher.CheckRead(links[parent_index].sock) && |
621 | 619 | total_size > size_down_in) { |
622 | size_t left_size = total_size-size_down_in; | |
623 | size_t reduce_size_min = std::min(left_size, eachreduce); | |
624 | size_t recved = 0; | |
625 | while (recved < reduce_size_min) { | |
626 | ssize_t len = links[parent_index].sock. | |
627 | Recv(sendrecvbuf + size_down_in, total_size - size_down_in); | |
628 | ||
629 | if (len == 0) { | |
630 | links[parent_index].sock.Close(); | |
631 | return ReportError(&links[parent_index], kRecvZeroLen); | |
632 | } | |
633 | if (len != -1) { | |
634 | size_down_in += static_cast<size_t>(len); | |
635 | utils::Assert(size_down_in <= size_up_out, | |
636 | "Allreduce: boundary error"); | |
637 | recved+=len; | |
638 | ||
639 | // if it receives more data than each reduce, it means the next block is sent. | |
640 | // we double the reduce_size_min or add to left_size | |
641 | while (recved > reduce_size_min) { | |
642 | reduce_size_min += std::min(left_size-reduce_size_min, eachreduce); | |
643 | } | |
644 | } else { | |
645 | ReturnType ret = Errno2Return(); | |
646 | if (ret != kSuccess) { | |
647 | return ReportError(&links[parent_index], ret); | |
648 | } | |
620 | ssize_t len = links[parent_index].sock. | |
621 | Recv(sendrecvbuf + size_down_in, total_size - size_down_in); | |
622 | if (len == 0) { | |
623 | links[parent_index].sock.Close(); | |
624 | return ReportError(&links[parent_index], kRecvZeroLen); | |
625 | } | |
626 | if (len != -1) { | |
627 | size_down_in += static_cast<size_t>(len); | |
628 | utils::Assert(size_down_in <= size_up_out, | |
629 | "Allreduce: boundary error"); | |
630 | } else { | |
631 | ReturnType ret = Errno2Return(); | |
632 | if (ret != kSuccess) { | |
633 | return ReportError(&links[parent_index], ret); | |
649 | 634 | } |
650 | 635 | } |
651 | 636 | } |
564 | 564 | int reduce_method; |
565 | 565 | // mininum count of cells to use ring based method |
566 | 566 | size_t reduce_ring_mincount; |
567 | // minimul block size per tree reduce | |
568 | size_t tree_reduce_minsize; | |
569 | 567 | // current rank |
570 | 568 | int rank; |
571 | 569 | // world size |
166 | 166 | * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size |
167 | 167 | * \param _file caller file name used to generate unique cache key |
168 | 168 | * \param _line caller line number used to generate unique cache key |
169 | * \param _caller caller function name used to generate unique cache key | |
170 | */ | |
169 | * \param _caller caller function name used to generate unique cache key | |
170 | */ | |
171 | 171 | void AllreduceRobust::Allgather(void *sendrecvbuf, |
172 | 172 | size_t total_size, |
173 | 173 | size_t slice_begin, |
517 | 517 | } |
518 | 518 | // execute checkpoint, note: when checkpoint existing, load will not happen |
519 | 519 | _assert(RecoverExec(NULL, 0, ActionSummary::kCheckPoint, |
520 | ActionSummary::kSpecialOp, cur_cache_seq), | |
521 | "check point must return true"); | |
520 | ActionSummary::kSpecialOp, cur_cache_seq), | |
521 | "check point must return true"); | |
522 | 522 | // this is the critical region where we will change all the stored models |
523 | 523 | // increase version number |
524 | 524 | version_number += 1; |
549 | 549 | delta = utils::GetTime() - start; |
550 | 550 | // log checkpoint ack latency |
551 | 551 | if (rabit_debug) { |
552 | utils::HandleLogInfo( | |
553 | "[%d] checkpoint ack finished version %d, take %f seconds\n", rank, | |
554 | version_number, delta); | |
552 | utils::HandleLogInfo("[%d] checkpoint ack finished version %d, take %f seconds\n", | |
553 | rank, version_number, delta); | |
555 | 554 | } |
556 | 555 | } |
557 | 556 | /*! |
11 | 11 | #include "rabit/internal/engine.h" |
12 | 12 | |
13 | 13 | namespace rabit { |
14 | ||
15 | namespace utils { | |
16 | bool STOP_PROCESS_ON_ERROR = true; | |
17 | } | |
18 | ||
14 | 19 | namespace engine { |
15 | 20 | /*! \brief EmptyEngine */ |
16 | 21 | class EmptyEngine : public IEngine { |
6 | 6 | * \author Tianqi Chen |
7 | 7 | */ |
8 | 8 | #define NOMINMAX |
9 | #include <rabit/base.h> | |
9 | 10 | #include <mpi.h> |
10 | #include <rabit/base.h> | |
11 | 11 | #include <cstdio> |
12 | #include <string> | |
13 | 12 | #include "rabit/internal/engine.h" |
14 | 13 | #include "rabit/internal/utils.h" |
15 | 14 | |
16 | 15 | namespace rabit { |
16 | ||
17 | namespace utils { | |
18 | bool STOP_PROCESS_ON_ERROR = true; | |
19 | } | |
20 | ||
17 | 21 | namespace engine { |
18 | 22 | /*! \brief implementation of engine using MPI */ |
19 | 23 | class MPIEngine : public IEngine { |
2 | 2 | add_executable( |
3 | 3 | unit_tests |
4 | 4 | test_io.cc |
5 | test_utils.cc | |
6 | 5 | allreduce_robust_test.cc |
7 | 6 | allreduce_base_test.cc |
8 | 7 | allreduce_mock_test.cc |
16 | 16 | char* argv[] = {cmd}; |
17 | 17 | m.Init(1, argv); |
18 | 18 | m.rank = 0; |
19 | EXPECT_THROW(m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr), dmlc::Error); | |
19 | EXPECT_EXIT(m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr), ::testing::ExitedWithCode(255), ""); | |
20 | 20 | } |
21 | 21 | |
22 | 22 | TEST(allreduce_mock, mock_broadcast) |
31 | 31 | m.rank = 0; |
32 | 32 | m.version_number=1; |
33 | 33 | m.seq_counter=2; |
34 | EXPECT_THROW(m.Broadcast(nullptr,0,0), dmlc::Error); | |
34 | EXPECT_EXIT(m.Broadcast(nullptr,0,0), ::testing::ExitedWithCode(255), ""); | |
35 | 35 | } |
2 | 2 | |
3 | 3 | #include <string> |
4 | 4 | #include <iostream> |
5 | #include <dmlc/logging.h> | |
6 | 5 | #include "../../src/allreduce_mock.h" |
7 | 6 | |
8 | 7 | TEST(allreduce_mock, mock_allreduce) |
17 | 16 | char* argv[] = {cmd}; |
18 | 17 | m.Init(1, argv); |
19 | 18 | m.rank = 0; |
20 | EXPECT_THROW({m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr);}, dmlc::Error); | |
19 | EXPECT_EXIT(m.Allreduce(nullptr,0,0,nullptr,nullptr,nullptr), ::testing::ExitedWithCode(255), ""); | |
21 | 20 | } |
22 | 21 | |
23 | 22 | TEST(allreduce_mock, mock_broadcast) |
32 | 31 | m.rank = 0; |
33 | 32 | m.version_number=1; |
34 | 33 | m.seq_counter=2; |
35 | EXPECT_THROW({m.Broadcast(nullptr,0,0);}, dmlc::Error); | |
34 | EXPECT_EXIT(m.Broadcast(nullptr,0,0), ::testing::ExitedWithCode(255), ""); | |
36 | 35 | } |
37 | 36 | |
38 | 37 | TEST(allreduce_mock, mock_gather) |
47 | 46 | m.rank = 3; |
48 | 47 | m.version_number=13; |
49 | 48 | m.seq_counter=22; |
50 | EXPECT_THROW({m.Allgather(nullptr,0,0,0,0);}, dmlc::Error); | |
49 | EXPECT_EXIT(m.Allgather(nullptr,0,0,0,0), ::testing::ExitedWithCode(255), ""); | |
51 | 50 | } |