diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ad9fedf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,52 @@
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+*.lnk
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.miss
+*.exe
+*.out
+*.app
+*~
+*.pyc
+*.mpi
+*.exe
+*tmp*
+*.rabit
+*.mock
+recommonmark
+recom
+_*
+
+#mpi lib
+mpich/
+mpich-3.2/
+
+# Jetbrain
+.idea
+cmake-build-debug/
+.vscode/
+
+# cmake
+build/
+compile_commands.json
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..f0a7a99
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,90 @@
+sudo: true
+
+os:
+  - linux
+  - osx
+
+osx_image: xcode10.2
+
+dist: xenial
+
+language: cpp
+
+# Use the build matrix to run lint and build separately
+env:
+  matrix:
+    - TASK=lint LINT_LANG=cpp
+    - TASK=lint LINT_LANG=python
+    - TASK=doc
+    # - TASK=build
+    - TASK=mpi-build
+    - TASK=cmake-test
+
+matrix:
+  exclude:
+    - os: osx
+      env: TASK=lint LINT_LANG=cpp
+    - os: osx
+      env: TASK=lint LINT_LANG=python
+    - os: osx
+      env: TASK=doc
+    - os: osx
+      env: TASK=build
+
+# dependent apt packages
+addons:
+  apt:
+    sources:
+      - llvm-toolchain-trusty-5.0
+      - ubuntu-toolchain-r-test
+      - george-edison55-precise-backports
+    packages:
+      - doxygen
+      - wget
+      - git
+      - libcurl4-openssl-dev
+      - unzip
+      - python-numpy
+      - gcc-4.8
+      - g++-4.8
+      - openssh-client
+      - openssh-server
+      - python3
+      - python3-setuptools
+      - python3-pip
+      - tree
+  homebrew:
+    packages:
+      - gcc49
+      - openssl
+      - libgit2
+      - python3
+    update: true
+
+before_install:
+  - git clone https://github.com/dmlc/dmlc-core
+  - export TRAVIS=dmlc-core/scripts/travis/
+  - source ${TRAVIS}/travis_setup_env.sh
+  - ${TRAVIS}/travis_osx_install.sh
+  - source ./scripts/travis_setup.sh
+
+script: scripts/travis_script.sh
+
+cache:
+  directories:
+    - ${HOME}/.cache/usr
+    - ${HOME}/.cache/pip
+    - mpich
+
+before_cache:
+  - ${TRAVIS}/travis_before_cache.sh
+
+after_success:
+  - tree build
+  - bash <(curl -s https://codecov.io/bash) -a '-o src/ src/*.c'
+
+notifications:
+# Emails are sent to the committer's git-configured email address by default,
+  email:
+    on_success: change
+    on_failure: always
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..541b990
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,182 @@
+cmake_minimum_required(VERSION 3.3)
+
+project(rabit VERSION 0.3.0 LANGUAGES CXX)
+
+if (POLICY CMP0077)
+  # CMP0077 (CMake >= 3.13) lets `RABIT_BUILD_DMLC` and the other options be preset as normal CMake variables.
+  cmake_policy(SET CMP0077 NEW)
+endif ()
+
+option(RABIT_BUILD_TESTS "Build rabit tests" OFF)
+option(RABIT_BUILD_MPI "Build MPI" OFF)
+option(RABIT_BUILD_DMLC "Include DMLC_CORE in build" OFF)
+option(RABIT_WITH_R_LIB "Fit the strict environment of R" OFF)
+
+if (NOT DMLC_ROOT)  # keep a root given via -DDMLC_ROOT=<path> or set by a parent project
+  set(DMLC_ROOT ${CMAKE_CURRENT_LIST_DIR}/../dmlc-core)  # by default point to xgboost/dmlc-core
+endif ()
+
+# moved from xgboost build
+if(R_LIB OR MINGW OR WIN32)
+  # R and Windows builds: a single `rabit` library built on engine_empty.cc.
+  set(rabit_libs rabit)
+  add_library(rabit src/engine_empty.cc src/c_api.cc)
+else()
+  # Build every engine flavour; rabit and both mock variants link the thread library.
+  find_package(Threads REQUIRED)
+  set(rabit_libs rabit rabit_base rabit_empty rabit_mock rabit_mock_static)
+
+  add_library(rabit_empty src/engine_empty.cc src/c_api.cc)
+  add_library(rabit_base src/allreduce_base.cc src/engine_base.cc src/c_api.cc)
+  add_library(rabit src/allreduce_base.cc src/allreduce_robust.cc src/engine.cc src/c_api.cc)
+  add_library(rabit_mock_static src/allreduce_base.cc src/allreduce_robust.cc src/engine_mock.cc src/c_api.cc)
+  add_library(rabit_mock SHARED src/allreduce_base.cc src/allreduce_robust.cc src/engine_mock.cc src/c_api.cc)
+
+  foreach(threaded_lib rabit rabit_mock_static rabit_mock)
+    target_link_libraries(${threaded_lib} Threads::Threads)
+  endforeach()
+endif()
+
+# Settings shared by whichever set of libraries was created above.
+set_target_properties(${rabit_libs}
+  PROPERTIES CXX_STANDARD 11
+  CXX_STANDARD_REQUIRED ON
+  POSITION_INDEPENDENT_CODE ON)
+
+if(RABIT_BUILD_MPI)
+  find_package(MPI REQUIRED)
+  if (NOT MPI_CXX_FOUND)
+    message(FATAL_ERROR "CXX Interface for MPI is required for building MPI backend.")
+  endif ()
+  add_library(rabit_mpi src/engine_mpi.cc)
+  target_include_directories(rabit_mpi PRIVATE ${MPI_CXX_INCLUDE_PATH})  # header dirs are include paths, not source files
+  target_link_libraries(rabit_mpi ${MPI_CXX_LIBRARIES})
+  list(APPEND rabit_libs rabit_mpi)
+endif()
+# Place binaries and libraries according to GNU standards. NOTE(review): these variables only seed targets created after this point, so the libraries defined above look unaffected - confirm this is intended.
+include(GNUInstallDirs)
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR})
+
+# we use this to get code coverage
+if ((CMAKE_CONFIGURATION_TYPES STREQUAL "Debug") AND (CMAKE_CXX_COMPILER_ID MATCHES GNU))
+  foreach(lib ${rabit_libs})
+    # target_compile_options() requires a scope keyword before the flags.
+    target_compile_options(${lib} PRIVATE -fprofile-arcs -ftest-coverage)
+    target_link_libraries(${lib} --coverage)  # instrumented objects need the gcov runtime
+  endforeach()
+endif()
+
+# With RABIT_BUILD_DMLC the dmlc-core directory inside this source tree is used.
+if(RABIT_BUILD_DMLC)
+  set(DMLC_ROOT ${CMAKE_CURRENT_LIST_DIR}/dmlc-core)
+endif()
+if(DMLC_ROOT)
+  message("DMLC_ROOT point to " ${DMLC_ROOT})
+endif()
+
+# Expose rabit's and dmlc-core's headers to the libraries and their in-tree consumers.
+foreach(rabit_lib IN LISTS rabit_libs)
+  target_include_directories(${rabit_lib} PUBLIC "$<BUILD_INTERFACE:${rabit_SOURCE_DIR}/include>")
+  target_include_directories(${rabit_lib} PUBLIC "$<BUILD_INTERFACE:${DMLC_ROOT}/include>")
+endforeach()
+
+if (RABIT_BUILD_TESTS)
+  enable_testing()
+  add_subdirectory(${rabit_SOURCE_DIR}/test/cpp)
+
+  # Integration tests that run against the mock engine. The static mock library
+  # is dropped from rabit_libs so that the install rules further down skip it.
+  list(REMOVE_ITEM rabit_libs "rabit_mock_static")
+  foreach(test lazy_recover local_recover model_recover)
+    add_executable(${test} test/${test}.cc)
+    set_target_properties(${test} PROPERTIES CXX_STANDARD 11 CXX_STANDARD_REQUIRED ON)
+    target_link_libraries(${test} rabit_mock_static)
+    install(TARGETS ${test} DESTINATION test)  # NOTE: the test binaries are installed as well
+  endforeach()
+
+  # MPI speed test; only available when the MPI backend is built.
+  if(RABIT_BUILD_MPI)
+    add_executable(speed_test_mpi test/speed_test.cc)
+    target_link_libraries(speed_test_mpi rabit_mpi)
+    install(TARGETS speed_test_mpi DESTINATION test)
+  endif()
+endif ()
+
+# Installation (https://github.com/forexample/package-example) {
+
+# Layout (<prefix> is forced to the rabit source tree below). This works for all platforms:
+#   * <prefix>/lib/cmake/<PROJECT-NAME>
+#   * <prefix>/lib/
+#   * <prefix>/include/
+set(CMAKE_INSTALL_PREFIX "${rabit_SOURCE_DIR}")
+set(config_install_dir "lib/cmake/${PROJECT_NAME}")
+set(include_install_dir "include")
+
+set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")  # generated package files live in the build tree
+
+# Configuration: paths of the generated package files and the names used by the target export
+set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
+set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
+set(TARGETS_EXPORT_NAME "${PROJECT_NAME}Targets")
+set(namespace "${PROJECT_NAME}::")
+
+# CMakePackageConfigHelpers supplies the two package helper functions used
+# below: 'write_basic_package_version_file' and 'configure_package_config_file'.
+include(CMakePackageConfigHelpers)
+
+# Generate '<PROJECT-NAME>ConfigVersion.cmake'.
+# No VERSION argument is given, so the helper uses PROJECT_VERSION.
+write_basic_package_version_file("${version_config}"
+    COMPATIBILITY SameMajorVersion)
+
+# Generate '<PROJECT-NAME>Config.cmake' from the template in cmake/.
+# The template substitutes:
+#   * TARGETS_EXPORT_NAME
+#   * PROJECT_NAME
+configure_package_config_file(
+    "cmake/Config.cmake.in"
+    "${project_config}"
+    INSTALL_DESTINATION
+    "${config_install_dir}"
+)
+
+# Targets (every library still listed in rabit_libs), for example:
+#   * <prefix>/lib/librabit.a
+#   * <prefix>/lib/librabit_base
+#   * <prefix>/lib/librabit_empty
+#   * header location after install: <prefix>/include/rabit/rabit.h
+#   * headers can be included by C++ code `#include <rabit/rabit.h>`
+install(
+    TARGETS ${rabit_libs}
+    EXPORT "${TARGETS_EXPORT_NAME}"
+    LIBRARY DESTINATION "lib"
+    ARCHIVE DESTINATION "lib"
+    RUNTIME DESTINATION "bin"
+    INCLUDES DESTINATION "${include_install_dir}"
+)
+
+# Headers: every *.h below include/ is copied, keeping the directory structure.
+install(
+    DIRECTORY "include/"
+    DESTINATION "${include_install_dir}"
+    FILES_MATCHING PATTERN "*.h"
+)
+
+# Package config files generated above:
+#   * <prefix>/lib/cmake/rabit/rabitConfig.cmake
+#   * <prefix>/lib/cmake/rabit/rabitConfigVersion.cmake
+install(
+    FILES "${project_config}" "${version_config}"
+    DESTINATION "${config_install_dir}"
+)
+
+# Exported targets file (targets are prefixed with the `rabit::` namespace):
+#   * <prefix>/lib/cmake/rabit/rabitTargets.cmake
+install(
+    EXPORT "${TARGETS_EXPORT_NAME}"
+    NAMESPACE "${namespace}"
+    DESTINATION "${config_install_dir}"
+)
+# }
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..2485f4e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2014 by Contributors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of rabit nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..dfbdc0b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,104 @@
+OS := $(shell uname)
+
+RABIT_BUILD_DMLC = 0
+
+export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11
+export CFLAGS = -O3 $(WARNFLAGS)
+export LDFLAGS =-Llib
+
+#download mpi
+#echo $(shell scripts/mpi.sh)
+
+MPICXX=./mpich/bin/mpicxx
+
+export CXX = g++
+
+
+#----------------------------
+# Settings for power and arm arch
+#----------------------------
+ARCH := $(shell uname -a)
+ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+	CFLAGS += -march=native
+else
+	CFLAGS += -msse2
+endif
+
+ifndef WITH_FPIC
+	WITH_FPIC = 1
+endif
+ifeq ($(WITH_FPIC), 1)
+	CFLAGS += -fPIC
+endif
+
+ifndef LINT_LANG
+	LINT_LANG="all"
+endif
+
+ifeq ($(RABIT_BUILD_DMLC),1)
+    DMLC=dmlc-core
+else
+    DMLC=../dmlc-core
+endif
+
+CFLAGS += -I $(DMLC)/include -I include/
+
+# build path
+BPATH=.
+# object files that make up the rabit library
+MPIOBJ= $(BPATH)/engine_mpi.o
+OBJ= $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o $(BPATH)/engine_empty.o $(BPATH)/engine_mock.o\
+	$(BPATH)/c_api.o $(BPATH)/engine_base.o
+SLIB= lib/librabit.so lib/librabit_mock.so lib/librabit_base.so
+ALIB= lib/librabit.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a
+MPISLIB= lib/librabit_mpi.so
+MPIALIB= lib/librabit_mpi.a
+HEADERS=src/*.h include/rabit/*.h include/rabit/internal/*.h
+
+.PHONY: clean all install mpi python lint doc doxygen
+
+all: lib/librabit.a lib/librabit_mock.a  lib/librabit.so lib/librabit_base.a lib/librabit_mock.so
+mpi: lib/librabit_mpi.a lib/librabit_mpi.so
+
+$(BPATH)/allreduce_base.o: src/allreduce_base.cc $(HEADERS)
+$(BPATH)/engine.o: src/engine.cc $(HEADERS)
+$(BPATH)/allreduce_robust.o: src/allreduce_robust.cc $(HEADERS)
+$(BPATH)/engine_mpi.o: src/engine_mpi.cc $(HEADERS)
+$(BPATH)/engine_empty.o: src/engine_empty.cc $(HEADERS)
+$(BPATH)/engine_mock.o: src/engine_mock.cc $(HEADERS)
+$(BPATH)/engine_base.o: src/engine_base.cc $(HEADERS)
+$(BPATH)/c_api.o: src/c_api.cc $(HEADERS)
+
+lib/librabit.a lib/librabit.so: $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o $(BPATH)/c_api.o
+lib/librabit_base.a lib/librabit_base.so: $(BPATH)/allreduce_base.o $(BPATH)/engine_base.o $(BPATH)/c_api.o
+lib/librabit_mock.a lib/librabit_mock.so: $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine_mock.o $(BPATH)/c_api.o
+lib/librabit_empty.a: $(BPATH)/engine_empty.o $(BPATH)/c_api.o
+lib/librabit_mpi.a lib/librabit_mpi.so: $(MPIOBJ)
+
+$(OBJ) :
+	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
+
+$(ALIB):
+	ar cr $@ $+
+
+$(SLIB) :
+	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
+
+$(MPIOBJ) :
+	$(MPICXX) -c $(CFLAGS) -I./mpich/include -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
+
+$(MPIALIB):
+	ar cr $@ $+
+
+$(MPISLIB) :
+	$(MPICXX) $(CFLAGS) -I./mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) \
+	$(LDFLAGS) -L./mpich/lib -Wl,-rpath,./mpich/lib -lmpi
+
+lint:
+	$(DMLC)/scripts/lint.py rabit $(LINT_LANG) src include
+
+doc doxygen:
+	cd include; doxygen ../doc/Doxyfile; cd -
+
+clean:
+	$(RM)  $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..eb0ce71
--- /dev/null
+++ b/README.md
@@ -0,0 +1,40 @@
+# Rabit: Reliable Allreduce and Broadcast Interface
+[![Build Status](https://travis-ci.org/dmlc/rabit.svg?branch=master)](https://travis-ci.org/dmlc/rabit)
+[![Documentation Status](https://readthedocs.org/projects/rabit/badge/?version=latest)](http://rabit.readthedocs.org/)
+
+rabit is a lightweight library that provides a fault-tolerant interface for Allreduce and Broadcast. It is designed to support easy implementations of distributed machine learning programs, many of which fall naturally under the Allreduce abstraction. The goal of rabit is to support ***portable***, ***scalable*** and ***reliable*** distributed machine learning programs.
+
+* [Tutorial](guide)
+* [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc)
+* You can also directly read the [interface header](include/rabit/rabit.h)
+* [XGBoost](https://github.com/dmlc/xgboost)
+  - Rabit is one of the backbone libraries supporting distributed XGBoost
+
+## Features
+All these features come from facts about the small rabbit :)
+* Portable: rabit is lightweight and runs everywhere
+  - Rabit is a library instead of a framework, a program only needs to link the library to run
+  - Rabit only relies on a mechanism to start programs, which is provided by most frameworks
+  - You can run rabit programs on many platforms, including Yarn(Hadoop), MPI using the same code
+* Scalable and Flexible: rabit runs fast
+  - Rabit programs use Allreduce to communicate, and do not suffer the cost between iterations of the MapReduce abstraction.
+  - Programs can call rabit functions in any order, as opposed to frameworks where callbacks are offered and called by the framework, i.e. inversion of control principle.
+  - Programs persist over all the iterations, unless they fail and recover.
+* Reliable: rabit digs burrows to avoid disasters
+  - Rabit programs can recover the model and results using synchronous function calls.
+  - Rabit programs can set rabit_boostrap_cache=1 to support allreduce/broadcast operations before loadcheckpoint
+  ```
+    rabit::Init(); -> rabit::AllReduce(); -> rabit::loadCheckpoint(); -> for () { rabit::AllReduce(); rabit::Checkpoint();} -> rabit::Shutdown();
+  ```
+
+## Use Rabit
+* Typing `make` in the root folder compiles the rabit library into the `lib` folder
+* Add lib to the library path and include to the include path of compiler
+* Languages: You can use rabit in C++ and python
+  - It is also possible to port the library to other languages
+
+## Contributing
+Rabit is an open-source library, contributions are welcomed, including:
+* The rabit core library.
+* Customized tracker script for new platforms and interface of new languages.
+* Tutorial and examples about the library.
diff --git a/cmake/Config.cmake.in b/cmake/Config.cmake.in
new file mode 100644
index 0000000..38bbde7
--- /dev/null
+++ b/cmake/Config.cmake.in
@@ -0,0 +1,4 @@
+@PACKAGE_INIT@
+
+include("${CMAKE_CURRENT_LIST_DIR}/@TARGETS_EXPORT_NAME@.cmake")
+check_required_components("@PROJECT_NAME@")
diff --git a/cmake/googletest-download.cmake b/cmake/googletest-download.cmake
new file mode 100644
index 0000000..7cf367c
--- /dev/null
+++ b/cmake/googletest-download.cmake
@@ -0,0 +1,20 @@
+# code copied from https://crascit.com/2015/07/25/cmake-gtest/
+cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
+
+project(googletest-download NONE)
+
+include(ExternalProject)
+
+# Only fetch the sources here: the including project adds googletest-src via
+# add_subdirectory(), so every step after the checkout is a deliberate no-op.
+ExternalProject_Add(googletest
+        GIT_REPOSITORY https://github.com/google/googletest.git
+        GIT_TAG release-1.8.0
+        SOURCE_DIR "@GOOGLETEST_DOWNLOAD_ROOT@/googletest-src"
+        BINARY_DIR "@GOOGLETEST_DOWNLOAD_ROOT@/googletest-build"
+        # all remaining steps are intentionally empty
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND ""
+        INSTALL_COMMAND ""
+        TEST_COMMAND ""
+)
\ No newline at end of file
diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake
new file mode 100644
index 0000000..0a0b1a5
--- /dev/null
+++ b/cmake/googletest.cmake
@@ -0,0 +1,32 @@
+# the following code to fetch googletest
+# is inspired by and adapted after https://crascit.com/2015/07/25/cmake-gtest/
+# download and unpack googletest at configure time
+
+macro(fetch_googletest _download_module_path _download_root)
+    # Instantiate the download project in _download_root from the template in _download_module_path.
+    set(GOOGLETEST_DOWNLOAD_ROOT ${_download_root})
+    configure_file(
+            ${_download_module_path}/googletest-download.cmake
+            ${_download_root}/CMakeLists.txt
+            @ONLY)
+    unset(GOOGLETEST_DOWNLOAD_ROOT)
+
+    # Configure, then build, the download project; stop at the first failing step.
+    execute_process(
+            COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" .
+            WORKING_DIRECTORY ${_download_root}
+            RESULT_VARIABLE _googletest_fetch_result)
+    if(NOT _googletest_fetch_result)
+        execute_process(
+                COMMAND "${CMAKE_COMMAND}" --build .
+                WORKING_DIRECTORY ${_download_root}
+                RESULT_VARIABLE _googletest_fetch_result)
+    endif()
+    if(_googletest_fetch_result)
+        message(FATAL_ERROR "Fetching googletest failed: ${_googletest_fetch_result}")
+    endif()
+    unset(_googletest_fetch_result)
+
+    # adds the targets: gtest, gtest_main, gmock, gmock_main
+    add_subdirectory(${_download_root}/googletest-src ${_download_root}/googletest-build)
+endmacro()
\ No newline at end of file
diff --git a/doc/.gitignore b/doc/.gitignore
new file mode 100644
index 0000000..95f88be
--- /dev/null
+++ b/doc/.gitignore
@@ -0,0 +1,5 @@
+html
+latex
+*.sh
+_*
+doxygen
diff --git a/doc/Doxyfile b/doc/Doxyfile
new file mode 100644
index 0000000..3e64641
--- /dev/null
+++ b/doc/Doxyfile
@@ -0,0 +1,281 @@
+# Doxyfile 1.7.6.1
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+DOXYFILE_ENCODING      = UTF-8
+PROJECT_NAME           = "rabit"
+PROJECT_NUMBER         =
+PROJECT_BRIEF          =
+PROJECT_LOGO           =
+OUTPUT_DIRECTORY       = ../doc/doxygen
+CREATE_SUBDIRS         = NO
+OUTPUT_LANGUAGE        = English
+BRIEF_MEMBER_DESC      = YES
+REPEAT_BRIEF           = YES
+ABBREVIATE_BRIEF       =
+ALWAYS_DETAILED_SEC    = NO
+INLINE_INHERITED_MEMB  = NO
+FULL_PATH_NAMES        = YES
+STRIP_FROM_PATH        =
+STRIP_FROM_INC_PATH    =
+SHORT_NAMES            = NO
+JAVADOC_AUTOBRIEF      = NO
+QT_AUTOBRIEF           = NO
+MULTILINE_CPP_IS_BRIEF = NO
+INHERIT_DOCS           = YES
+SEPARATE_MEMBER_PAGES  = NO
+TAB_SIZE               = 8
+ALIASES                =
+TCL_SUBST              =
+OPTIMIZE_OUTPUT_FOR_C  = YES
+OPTIMIZE_OUTPUT_JAVA   = NO
+OPTIMIZE_FOR_FORTRAN   = NO
+OPTIMIZE_OUTPUT_VHDL   = NO
+EXTENSION_MAPPING      =
+BUILTIN_STL_SUPPORT    = NO
+CPP_CLI_SUPPORT        = NO
+SIP_SUPPORT            = NO
+IDL_PROPERTY_SUPPORT   = YES
+DISTRIBUTE_GROUP_DOC   = NO
+SUBGROUPING            = YES
+INLINE_GROUPED_CLASSES = NO
+INLINE_SIMPLE_STRUCTS  = NO
+TYPEDEF_HIDES_STRUCT   = NO
+LOOKUP_CACHE_SIZE      = 0
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+EXTRACT_ALL            = NO
+EXTRACT_PRIVATE        = NO
+EXTRACT_STATIC         = NO
+EXTRACT_LOCAL_CLASSES  = YES
+EXTRACT_LOCAL_METHODS  = NO
+EXTRACT_ANON_NSPACES   = NO
+HIDE_UNDOC_MEMBERS     = NO
+HIDE_UNDOC_CLASSES     = YES
+HIDE_FRIEND_COMPOUNDS  = NO
+HIDE_IN_BODY_DOCS      = NO
+INTERNAL_DOCS          = NO
+CASE_SENSE_NAMES       = YES
+HIDE_SCOPE_NAMES       = NO
+SHOW_INCLUDE_FILES     = YES
+FORCE_LOCAL_INCLUDES   = NO
+INLINE_INFO            = YES
+SORT_MEMBER_DOCS       = YES
+SORT_BRIEF_DOCS        = NO
+SORT_MEMBERS_CTORS_1ST = NO
+SORT_GROUP_NAMES       = NO
+SORT_BY_SCOPE_NAME     = NO
+STRICT_PROTO_MATCHING  = NO
+GENERATE_TODOLIST      = YES
+GENERATE_TESTLIST      = YES
+GENERATE_BUGLIST       = YES
+GENERATE_DEPRECATEDLIST= YES
+ENABLED_SECTIONS       =
+MAX_INITIALIZER_LINES  = 30
+SHOW_USED_FILES        = YES
+SHOW_FILES             = YES
+SHOW_NAMESPACES        = YES
+FILE_VERSION_FILTER    =
+LAYOUT_FILE            =
+CITE_BIB_FILES         =
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+QUIET                  = NO
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = YES
+WARN_IF_DOC_ERROR      = YES
+WARN_NO_PARAMDOC       = YES
+WARN_FORMAT            = "$file:$line: $text"
+WARN_LOGFILE           =
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+INPUT                  = rabit
+INPUT_ENCODING         = UTF-8
+FILE_PATTERNS          =
+RECURSIVE              = NO
+EXCLUDE                =
+EXCLUDE_SYMLINKS       = NO
+EXCLUDE_PATTERNS       = *-inl.hpp
+EXCLUDE_SYMBOLS        =
+EXAMPLE_PATH           =
+EXAMPLE_PATTERNS       =
+EXAMPLE_RECURSIVE      = NO
+IMAGE_PATH             =
+INPUT_FILTER           =
+FILTER_PATTERNS        =
+FILTER_SOURCE_FILES    = NO
+FILTER_SOURCE_PATTERNS =
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+SOURCE_BROWSER         = NO
+INLINE_SOURCES         = NO
+STRIP_CODE_COMMENTS    = YES
+REFERENCED_BY_RELATION = NO
+REFERENCES_RELATION    = NO
+REFERENCES_LINK_SOURCE = YES
+USE_HTAGS              = NO
+VERBATIM_HEADERS       = YES
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+ALPHABETICAL_INDEX     = YES
+COLS_IN_ALPHA_INDEX    = 5
+IGNORE_PREFIX          =
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+GENERATE_HTML          = YES
+HTML_OUTPUT            = html
+HTML_FILE_EXTENSION    = .html
+HTML_HEADER            =
+HTML_FOOTER            =
+HTML_STYLESHEET        =
+HTML_EXTRA_FILES       =
+HTML_COLORSTYLE_HUE    = 220
+HTML_COLORSTYLE_SAT    = 100
+HTML_COLORSTYLE_GAMMA  = 80
+HTML_TIMESTAMP         = YES
+HTML_DYNAMIC_SECTIONS  = NO
+GENERATE_DOCSET        = NO
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+DOCSET_PUBLISHER_NAME  = Publisher
+GENERATE_HTMLHELP      = NO
+CHM_FILE               =
+HHC_LOCATION           =
+GENERATE_CHI           = NO
+CHM_INDEX_ENCODING     =
+BINARY_TOC             = NO
+TOC_EXPAND             = NO
+GENERATE_QHP           = NO
+QCH_FILE               =
+QHP_NAMESPACE          = org.doxygen.Project
+QHP_VIRTUAL_FOLDER     = doc
+QHP_CUST_FILTER_NAME   =
+QHP_CUST_FILTER_ATTRS  =
+QHP_SECT_FILTER_ATTRS  =
+QHG_LOCATION           =
+GENERATE_ECLIPSEHELP   = NO
+ECLIPSE_DOC_ID         = org.doxygen.Project
+DISABLE_INDEX          = NO
+GENERATE_TREEVIEW      = NO
+ENUM_VALUES_PER_LINE   = 4
+TREEVIEW_WIDTH         = 250
+EXT_LINKS_IN_WINDOW    = NO
+FORMULA_FONTSIZE       = 10
+FORMULA_TRANSPARENT    = YES
+USE_MATHJAX            = NO
+MATHJAX_RELPATH        = http://www.mathjax.org/mathjax
+MATHJAX_EXTENSIONS     =
+SEARCHENGINE           = YES
+SERVER_BASED_SEARCH    = NO
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+GENERATE_LATEX         = YES
+LATEX_OUTPUT           = latex
+LATEX_CMD_NAME         = latex
+MAKEINDEX_CMD_NAME     = makeindex
+COMPACT_LATEX          = NO
+PAPER_TYPE             = a4
+EXTRA_PACKAGES         =
+LATEX_HEADER           =
+LATEX_FOOTER           =
+PDF_HYPERLINKS         = YES
+USE_PDFLATEX           = YES
+LATEX_BATCHMODE        = NO
+LATEX_HIDE_INDICES     = NO
+LATEX_SOURCE_CODE      = NO
+LATEX_BIB_STYLE        = plain
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+GENERATE_RTF           = NO
+RTF_OUTPUT             = rtf
+COMPACT_RTF            = NO
+RTF_HYPERLINKS         = NO
+RTF_STYLESHEET_FILE    =
+RTF_EXTENSIONS_FILE    =
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+GENERATE_MAN           = NO
+MAN_OUTPUT             = man
+MAN_EXTENSION          = .3
+MAN_LINKS              = NO
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+GENERATE_XML           = YES
+XML_OUTPUT             = xml
+XML_PROGRAMLISTING     = YES
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+GENERATE_AUTOGEN_DEF   = NO
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+GENERATE_PERLMOD       = NO
+PERLMOD_LATEX          = NO
+PERLMOD_PRETTY         = YES
+PERLMOD_MAKEVAR_PREFIX =
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+ENABLE_PREPROCESSING   = NO
+MACRO_EXPANSION        = NO
+EXPAND_ONLY_PREDEF     = NO
+SEARCH_INCLUDES        = YES
+INCLUDE_PATH           =
+INCLUDE_FILE_PATTERNS  =
+PREDEFINED             =
+EXPAND_AS_DEFINED      =
+SKIP_FUNCTION_MACROS   = YES
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+TAGFILES               =
+GENERATE_TAGFILE       =
+ALLEXTERNALS           = NO
+EXTERNAL_GROUPS        = YES
+PERL_PATH              = /usr/bin/perl
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+CLASS_DIAGRAMS         = YES
+MSCGEN_PATH            =
+HIDE_UNDOC_RELATIONS   = YES
+HAVE_DOT               = NO
+DOT_NUM_THREADS        = 0
+DOT_FONTNAME           = Helvetica
+DOT_FONTSIZE           = 10
+DOT_FONTPATH           =
+CLASS_GRAPH            = YES
+COLLABORATION_GRAPH    = YES
+GROUP_GRAPHS           = YES
+UML_LOOK               = NO
+TEMPLATE_RELATIONS     = NO
+INCLUDE_GRAPH          = YES
+INCLUDED_BY_GRAPH      = YES
+CALL_GRAPH             = NO
+CALLER_GRAPH           = NO
+GRAPHICAL_HIERARCHY    = YES
+DIRECTORY_GRAPH        = YES
+DOT_IMAGE_FORMAT       = png
+INTERACTIVE_SVG        = NO
+DOT_PATH               =
+DOTFILE_DIRS           =
+MSCFILE_DIRS           =
+DOT_GRAPH_MAX_NODES    = 50
+MAX_DOT_GRAPH_DEPTH    = 0
+DOT_TRANSPARENT        = NO
+DOT_MULTI_TARGETS      = YES
+GENERATE_LEGEND        = YES
+DOT_CLEANUP            = YES
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000..40bba2a
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,192 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp applehelp devhelp epub latex latexpdf latexpdfja text man texinfo info changes linkcheck doctest coverage gettext xml pseudoxml
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  applehelp  to make an Apple Help Book"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  xml        to make Docutils-native XML files"
+	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+	@echo "  coverage   to run coverage check of the documentation (if enabled)"
+
+clean:
+	rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/rabit.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/rabit.qhc"
+
+applehelp:
+	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
+	@echo
+	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
+	@echo "N.B. You won't be able to view it unless you put it in" \
+	      "~/Library/Documentation/Help or install it in your application" \
+	      "bundle."
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/rabit"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/rabit"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through platex and dvipdfmx..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	$(MAKE) -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
+
+coverage:
+	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
+	@echo "Testing of coverage in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/coverage/python.txt."
+
+xml:
+	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+	@echo
+	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+	@echo
+	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000..ef89de4
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,184 @@
+# -*- coding: utf-8 -*-
+#
+# documentation build configuration file, created by
+# sphinx-quickstart on Thu Jul 23 19:40:08 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+import sys
+import os, subprocess
+import shlex
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+libpath = os.path.join(curr_path, '../wrapper/')
+sys.path.insert(0, libpath)
+sys.path.insert(0, curr_path)
+from sphinx_util import MarkdownParser, AutoStructify
+
+# -- General configuration ------------------------------------------------
+
+# General information about the project.
+project = u'rabit'
+copyright = u'2015, rabit developers'
+author = u'rabit developers'
+github_doc_root = 'https://github.com/dmlc/rabit/tree/master/doc/'
+
+# add markdown parser
+MarkdownParser.github_doc_root = github_doc_root
+source_parsers = {
+    '.md': MarkdownParser,
+}
+# Version information.
+import rabit
+
+version = rabit.__version__
+release = rabit.__version__
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.napoleon',
+    'sphinx.ext.mathjax',
+    'breathe',
+]
+
+# Use breathe to include doxygen documents
+breathe_projects = {'rabit' : 'doxygen/xml/'}
+breathe_default_project = 'rabit'
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+# source_suffix = ['.rst', '.md']
+source_suffix = ['.rst', '.md']
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+# html_theme = 'alabaster'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = project + 'doc'
+
+# -- Options for LaTeX output ---------------------------------------------
+latex_elements = {
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+  (master_doc, 'rabit.tex', project,
+   author, 'manual'),
+]
+
+# hook for doxygen
+def run_doxygen(folder):
+    """Run ``make doxygen`` inside `folder`, reporting failures on stderr."""
+    try:
+        retcode = subprocess.call(["make", "doxygen"], cwd=folder)
+        if retcode < 0:
+            sys.stderr.write("doxygen terminated by signal %s\n" % (-retcode))
+    except OSError as e:
+        sys.stderr.write("doxygen execution failed: %s\n" % e)
+
+
+def run_build_lib(folder):
+    """Run ``make`` in `folder`, then copy doxygen/html to _build/html/doxygen."""
+    try:
+        for cmd in ("cd %s && make" % folder,
+                    "rm -rf _build/html/doxygen",
+                    "mkdir -p _build/html",
+                    "cp -rf doxygen/html _build/html/doxygen"):
+            retcode = subprocess.call(cmd, shell=True)
+            if retcode < 0:  # check every step, not only the last one
+                sys.stderr.write("build terminated by signal %s\n" % (-retcode))
+    except OSError as e:
+        sys.stderr.write("build execution failed: %s\n" % e)
+
+
+def generate_doxygen_xml(app):
+    """Sphinx hook: on ReadTheDocs run doxygen and the library build, then load rabit."""
+    if os.environ.get('READTHEDOCS') == 'True':
+        run_doxygen('..')
+        sys.stderr.write('Check if shared lib exists\n')
+        run_build_lib('..')
+    wrapper_listing = str(os.listdir('../wrapper'))
+    sys.stderr.write('The wrapper path: %s\n' % wrapper_listing)
+    rabit._loadlib()
+
+
+def setup(app):
+    """Sphinx extension entry point for this documentation build."""
+    # Hook the doxygen XML generation onto the "builder-inited" event.
+    app.connect("builder-inited", generate_doxygen_xml)
+    resolver = {'url_resolver': lambda url: github_doc_root + url}
+    app.add_config_value('recommonmark_config', resolver, True)
+    app.add_transform(AutoStructify)
diff --git a/doc/cpp_api.md b/doc/cpp_api.md
new file mode 100644
index 0000000..c6184aa
--- /dev/null
+++ b/doc/cpp_api.md
@@ -0,0 +1,9 @@
+C++ Library API of Rabit
+========================
+This page contains the documentation for the C++ library API of rabit.
+
+```eval_rst
+.. toctree::
+
+.. doxygennamespace:: rabit
+```
diff --git a/doc/guide.md b/doc/guide.md
new file mode 100644
index 0000000..7bf50b0
--- /dev/null
+++ b/doc/guide.md
@@ -0,0 +1,383 @@
+Tutorial
+========
+This is the tutorial for rabit, a ***Reliable Allreduce and Broadcast Interface***.
+All the example codes are in the [guide](https://github.com/dmlc/rabit/blob/master/guide/) folder of the project.
+To run the examples locally, you will need to build them with ```make```.
+
+**List of Topics**
+* [What is Allreduce](#what-is-allreduce)
+* [Common Use Case](#common-use-case)
+* [Use Rabit API](#use-rabit-api)
+  - [Structure of a Rabit Program](#structure-of-a-rabit-program)
+  - [Allreduce and Lazy Preparation](#allreduce-and-lazy-preparation)
+  - [Checkpoint and LazyCheckpoint](#checkpoint-and-lazycheckpoint)
+* [Compile Programs with Rabit](#compile-programs-with-rabit)
+* [Running Rabit Jobs](#running-rabit-jobs)
+* [Fault Tolerance](#fault-tolerance)
+
+What is Allreduce
+-----------------
+The main methods provided by rabit are Allreduce and Broadcast. Allreduce performs reduction across different computation nodes,
+and returns the result to every node. To understand the behavior of the function, consider the following example in [basic.cc](../guide/basic.cc) (there is a python example right after this if you are more familiar with python).
+```c++
+#include <rabit.h>
+using namespace rabit;
+const int N = 3;
+int main(int argc, char *argv[]) {
+  int a[N];
+  rabit::Init(argc, argv);
+  for (int i = 0; i < N; ++i) {
+    a[i] = rabit::GetRank() + i;
+  }
+  printf("@node[%d] before-allreduce: a={%d, %d, %d}\n",
+         rabit::GetRank(), a[0], a[1], a[2]);
+  // allreduce take max of each elements in all processes
+  Allreduce<op::Max>(&a[0], N);
+  printf("@node[%d] after-allreduce-max: a={%d, %d, %d}\n",
+         rabit::GetRank(), a[0], a[1], a[2]);
+  // second allreduce that sums everything up
+  Allreduce<op::Sum>(&a[0], N);
+  printf("@node[%d] after-allreduce-sum: a={%d, %d, %d}\n",
+         rabit::GetRank(), a[0], a[1], a[2]);
+  rabit::Finalize();
+  return 0;
+}
+```
+You can run the example using the rabit_demo.py script. The following command
+starts the rabit program with two worker processes.
+```bash
+../tracker/rabit_demo.py -n 2 basic.rabit
+```
+This will start two processes, one process with rank 0 and the other with rank 1, both processes run the same code.
+The ```rabit::GetRank()``` function returns the rank of current process.
+
+Before the call to Allreduce, process 0 contains the array ```a = {0, 1, 2}```, while process 1 has the array
+```a = {1, 2, 3}```. After the call to Allreduce, the array contents in all processes are replaced by the
+reduction result (in this case, the maximum value in each position across all the processes). So, after the
+Allreduce call, the result will become ```a = {1, 2, 3}```.
+Rabit provides different reduction operators, for example,  if you change ```op::Max``` to ```op::Sum```,
+the reduction operation will be a summation, and the result will become ```a = {1, 3, 5}```.
+You can also run the example with different processes by setting -n to different values.
+
+If you are more familiar with python, you can also use rabit in python. The same example as before can be found in [basic.py](../guide/basic.py):
+
+```python
+import numpy as np
+import rabit
+
+rabit.init()
+n = 3
+rank = rabit.get_rank()
+a = np.zeros(n)
+for i in xrange(n):
+    a[i] = rank + i
+
+print '@node[%d] before-allreduce: a=%s' % (rank, str(a))
+a = rabit.allreduce(a, rabit.MAX)
+print '@node[%d] after-allreduce-max: a=%s' % (rank, str(a))
+a = rabit.allreduce(a, rabit.SUM)
+print '@node[%d] after-allreduce-sum: a=%s' % (rank, str(a))
+rabit.finalize()
+```
+You can run the program using the following command
+```bash
+../tracker/rabit_demo.py -n 2 basic.py
+```
+
+Broadcast is another method provided by rabit besides Allreduce. This function allows one node to broadcast its
+local data to all other nodes. The following code in [broadcast.cc](../guide/broadcast.cc) broadcasts a string from
+node 0 to all other nodes.
+```c++
+#include <rabit.h>
+using namespace rabit;
+const int N = 3;
+int main(int argc, char *argv[]) {
+  rabit::Init(argc, argv);
+  std::string s;
+  if (rabit::GetRank() == 0) s = "hello world";
+  printf("@node[%d] before-broadcast: s=\"%s\"\n",
+         rabit::GetRank(), s.c_str());
+  // broadcast s from node 0 to all other nodes
+  rabit::Broadcast(&s, 0);
+  printf("@node[%d] after-broadcast: s=\"%s\"\n",
+         rabit::GetRank(), s.c_str());
+  rabit::Finalize();
+  return 0;
+}
+```
+The following command starts the program with three worker processes.
+```bash
+../tracker/rabit_demo.py -n 3 broadcast.rabit
+```
+Besides strings, rabit also allows broadcasting constant-size arrays and vectors.
+
+The counterpart in python can be found in [broadcast.py](../guide/broadcast.py). Here is a snippet so that you can get a better sense of how simple it is to use the python library:
+
+```python
+import rabit
+rabit.init()
+n = 3
+rank = rabit.get_rank()
+s = None
+if rank == 0:
+    s = {'hello world':100, 2:3}
+print '@node[%d] before-broadcast: s=\"%s\"' % (rank, str(s))
+s = rabit.broadcast(s, 0)
+print '@node[%d] after-broadcast: s=\"%s\"' % (rank, str(s))
+rabit.finalize()
+```
+
+Common Use Case
+---------------
+Many distributed machine learning algorithms involve splitting the data into different nodes,
+computing statistics locally, and finally aggregating them. Such workflow is usually done repetitively through many iterations before the algorithm converges. Allreduce naturally meets the structure of such programs,
+common use cases include:
+
+* Aggregation of gradient values, which can be used in optimization methods such as L-BFGS.
+* Aggregation of other statistics, which can be used in KMeans and Gaussian Mixture Models.
+* Find the best split candidate and aggregation of split statistics, used for tree based models.
+
+Rabit is a reliable and portable library for distributed machine learning programs that allows programs to run reliably on different platforms.
+
+Use Rabit API
+-------------
+This section introduces topics about how to use rabit API.
+You can always refer to the [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc) for the definition of each function.
+This section tries to give examples of different aspects of the rabit API.
+
+#### Structure of a Rabit Program
+The following code illustrates the common structure of a rabit program. This is an abstract example,
+you can also refer to [wormhole](https://github.com/dmlc/wormhole/blob/master/learn/kmeans/kmeans.cc) for an example implementation of kmeans algorithm.
+
+```c++
+#include <rabit.h>
+int main(int argc, char *argv[]) {
+  ...
+  rabit::Init(argc, argv);
+  // sync on expected model size before load checkpoint, if we pass rabit_bootstrap_cache=true
+  rabit::Allreduce<rabit::op::Max>(&model.size(), 1);
+  // load the latest checked model
+  int version = rabit::LoadCheckPoint(&model);
+  // initialize the model if it is the first version
+  if (version == 0) model.InitModel();
+  // the version number marks the iteration to resume
+  for (int iter = version; iter < max_iter; ++iter) {
+    // at this point, the model object should allow us to recover the program state
+    ...
+    // each iteration can contain multiple calls of allreduce/broadcast
+    rabit::Allreduce<rabit::op::Max>(&data[0], n);
+    ...
+    // checkpoint model after one iteration finishes
+    rabit::CheckPoint(&model);
+  }
+  rabit::Finalize();
+  return 0;
+}
+```
+
+Besides the common Allreduce and Broadcast functions, there are two additional functions: ```LoadCheckPoint```
+and ```CheckPoint```. These two functions are used for fault-tolerance purposes.
+As mentioned before, traditional machine learning programs involve several iterations. In each iteration, we start with a model, make some calls
+to Allreduce or Broadcast and update the model. The calling sequence in each iteration does not need to be the same.
+
+* When the nodes start from the beginning (i.e. iteration 0), ```LoadCheckPoint``` returns 0, so we can initialize the model.
+* ```CheckPoint``` saves the model after each iteration.
+  - Efficiency Note: the model is only kept in local memory and no save to disk is performed when calling Checkpoint
+* When a node goes down and restarts, ```LoadCheckPoint``` will recover the latest saved model.
+* When a node goes down, the rest of the nodes will block in the call of Allreduce/Broadcast and wait for
+  the recovery of the failed node until it catches up.
+
+Please see the [Fault Tolerance](#fault-tolerance) section to understand the recovery procedure executed by rabit.
+
+#### Allreduce and Lazy Preparation
+Allreduce is one of the most important functions provided by rabit. You can call Allreduce by specifying the
+reduction operator, pointer to the data and size of the buffer, as follows
+```c++
+Allreduce<operator>(pointer_of_data, size_of_data);
+```
+This is the basic use case of the Allreduce function. It is common that the user writes code to prepare the data needed
+into the data buffer, passes the data to the Allreduce function, and gets the reduced result. However, when a node restarts
+from failure, we can directly recover the result from other nodes (see also [Fault Tolerance](#fault-tolerance)) and
+the data preparation procedure is no longer necessary. Rabit Allreduce adds an optional preparation function parameter
+to support such a scenario. The user can pass in a function that corresponds to the data preparation procedure to Allreduce
+calls, and the data preparation function will only be called when necessary. We use [lazy_allreduce.cc](../guide/lazy_allreduce.cc)
+as an example to demonstrate this feature. It is modified from [basic.cc](../guide/basic.cc), and you can compare the two codes.
+```c++
+#include <rabit.h>
+using namespace rabit;
+const int N = 3;
+int main(int argc, char *argv[]) {
+  int a[N] = {0};
+  rabit::Init(argc, argv);
+  // lazy preparation function
+  auto prepare = [&]() {
+    printf("@node[%d] run prepare function\n", rabit::GetRank());
+    for (int i = 0; i < N; ++i) {
+      a[i] = rabit::GetRank() + i;
+    }
+  };
+  printf("@node[%d] before-allreduce: a={%d, %d, %d}\n",
+         rabit::GetRank(), a[0], a[1], a[2]);
+  // allreduce takes the max of each element across all processes
+  Allreduce<op::Max>(&a[0], N, prepare);
+  printf("@node[%d] after-allreduce-max: a={%d, %d, %d}\n",
+         rabit::GetRank(), a[0], a[1], a[2]);
+  // run second allreduce, which sums the values
+  Allreduce<op::Sum>(&a[0], N);
+  printf("@node[%d] after-allreduce-sum: a={%d, %d, %d}\n",
+         rabit::GetRank(), a[0], a[1], a[2]);
+  rabit::Finalize();
+  return 0;
+}
+```
+Here we use features of C++11 because the lambda function makes things much shorter.
+There is also C++ compatible callback interface provided in the [API](http://homes.cs.washington.edu/~tqchen/rabit/doc).
+You can compile the program by typing ```make lazy_allreduce.mock```. We link against the mock library so that we can see
+the effect when a process goes down. You can run the program using the following command
+```bash
+../tracker/rabit_demo.py -n 2 lazy_allreduce.mock mock=0,0,1,0
+```
+The additional arguments ```mock=0,0,1,0``` will cause node 0 to kill itself before second call of Allreduce (see also [mock test](#link-against-mock-test-rabit-library)).
+You will find that the prepare function's print is only executed once and node 0 will no longer execute the preparation function when it restarts from failure.
+
+You can also find the python version of the example in [lazy_allreduce.py](../guide/lazy_allreduce.py), and run it using the following command
+```bash
+../tracker/rabit_demo.py -n 2 lazy_allreduce.py mock=0,0,1,0
+
+```
+
+Since the lazy preparation function may not be called during execution, the user should be careful when using this feature. For example, a possible mistake
+could be putting some memory allocation code in the lazy preparation function, so that the computing memory is not allocated when the lazy preparation function is not called.
+The example in [lazy_allreduce.cc](../guide/lazy_allreduce.cc) provides a simple way to migrate normal preparation code ([basic.cc](../guide/basic.cc)) to the lazy version: wrap the preparation
+code with a lambda function, and pass it to allreduce.
+
+#### Checkpoint and LazyCheckpoint
+Common machine learning algorithms usually involve iterative computation. As mentioned in the section ([Structure of a Rabit Program](#structure-of-a-rabit-program)),
+user can and should use Checkpoint to ```save``` the progress so far, so that when a node fails, the latest checkpointed model can be loaded.
+
+There are two model arguments you can pass to Checkpoint and LoadCheckpoint: ```global_model``` and ```local_model```:
+* ```global_model``` refers to the model that is commonly shared across all the nodes
+  - For example, the centroids of clusters in kmeans are shared across all nodes
+* ```local_model``` refers to the model that is specifically tied to the current node
+  - For example, in topic modeling, the topic assignments of the subset of documents in the current node are a local model
+
+Because of the different nature of the two types of models, different strategies are used for them.
+```global_model``` is simply saved in the local memory of each node, while ```local_model``` will be replicated to some other
+nodes (selected using a ring replication strategy). The checkpoint is only saved in memory without touching the disk, which makes rabit programs more efficient.
+For better efficiency, the user is encouraged to use only ```global_model``` when it is sufficient.
+
+To enable a model class to be checkpointed, the user can implement a [serialization interface](../include/rabit_serialization.h). The serialization interface already
+provides serialization functions for STL vector and string. For the python API, the user can checkpoint any python object that can be pickled.
+
+There is a special Checkpoint function called [LazyCheckpoint](http://homes.cs.washington.edu/~tqchen/rabit/doc/namespacerabit.html#a99f74c357afa5fba2c80cc0363e4e459),
+which can be used for ```global_model```-only cases under certain conditions.
+When LazyCheckpoint is called, no action is taken and the rabit engine only remembers the pointer to the model.
+The serialization will only happen when another node fails and the recovery starts. So the user basically pays no extra cost calling LazyCheckpoint.
+To use this function, the user needs to ensure the model remains unchanged until the last call of Allreduce/Broadcast in the current version finishes,
+so that when the recovery procedure happens in these function calls, the serialized model will be the same.
+
+For example, consider the following calling sequence
+```
+LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint
+```
+The user must only change the model in code3. Such a condition can usually be satisfied in many scenarios, and the user can use LazyCheckpoint to further
+improve the efficiency of the program.
+
+
+Compile Programs with Rabit
+---------------------------
+Rabit is a portable library, to use it, you only need to include the rabit header file.
+* You will need to add the path to [../include](../include) to the header search path of the compiler
+  - Solution 1: add ```-I/path/to/rabit/include``` to the compiler flag in gcc or clang
+  - Solution 2: add the path to the environment variable CPLUS_INCLUDE_PATH
+* You will need to add the path to [../lib](../lib) to the library search path of the compiler
+  - Solution 1: add ```-L/path/to/rabit/lib``` to the linker flag
+  - Solution 2: add the path to environment variable LIBRARY_PATH AND LD_LIBRARY_PATH
+* Link against lib/rabit.a
+  - Add ```-lrabit``` to the linker flag
+
+The procedure above allows you to compile a program with rabit. The following two sections contain additional
+options you can use to link against different backends other than the normal one.
+
+#### Link against MPI Allreduce
+You can link against ```rabit_mpi.a``` to use MPI Allreduce instead; however, the resulting program is backed by MPI and
+is not fault tolerant anymore.
+* Simply change the linker flag from ```-lrabit``` to ```-lrabit_mpi```
+* The final linking needs to be done by mpi wrapper compiler ```mpicxx```
+
+#### Link against Mock Test Rabit Library
+If you want to use a mock to test the program in order to see the behavior of the code when some nodes go down, you can link against ```rabit_mock.a``` .
+* Simply change the linker flag from ```-lrabit``` to ```-lrabit_mock```
+
+The resulting rabit mock program can take in additional arguments in the following format
+```
+mock=rank,version,seq,ndeath
+```
+
+The four integers specify an event that will cause the program to ```commit suicide```(exit with -2)
+* rank specifies the rank of the node to kill
+* version specifies the version (iteration) of the model where you want the process to die
+* seq specifies the sequence number of the Allreduce/Broadcast call since last checkpoint, where the process will be killed
+* ndeath specifies how many times this node died already
+
+For example, consider the following script in the test case
+```bash
+../tracker/rabit_demo.py -n 10 test_model_recover 10000\
+                         mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1
+```
+* The first mock will cause node 0 to exit when calling the second Allreduce/Broadcast (seq = 1) in iteration 0
+* The second mock will cause node 1 to exit when calling the second Allreduce/Broadcast (seq = 1) in iteration 1
+* The third mock will cause node 1 to exit again when calling second Allreduce/Broadcast (seq = 1) in iteration 1
+  - Note that ndeath = 1 means this will happen only if node 1 died once, which is our case
+
+Running Rabit Jobs
+------------------
+Rabit is a portable library that can run on multiple platforms.
+All the rabit jobs can be submitted using [dmlc-tracker](https://github.com/dmlc/dmlc-core/tree/master/tracker)
+
+Fault Tolerance
+---------------
+This section introduces how fault tolerance works in rabit.
+The following figure shows how rabit deals with failures.
+
+![](http://homes.cs.washington.edu/~tqchen/rabit/fig/fault-tol.png)
+
+The scenario is as follows:
+* Node 1 fails between the first and second call of Allreduce after the second checkpoint
+* The other nodes wait in the call of the second Allreduce in order to help node 1 to recover.
+* When node 1 restarts, it will call ```LoadCheckPoint```, and get the latest checkpoint from one of the existing nodes.
+* Then node 1 can start from the latest checkpoint and continue running.
+* When node 1 calls the first Allreduce again, as the other nodes already know the result, node 1 can get it from one of them.
+* When node 1 reaches the second Allreduce, the other nodes find out that node 1 has caught up and they can continue the program normally.
+
+This fault tolerance model is based on a key property of Allreduce and
+Broadcast: All the nodes get the same result after calling Allreduce/Broadcast.
+Because of this property, any node can record the results of history
+Allreduce/Broadcast calls.  When a node is recovered, it can fetch the lost
+results from some alive nodes and rebuild its model.
+
+The checkpoint is introduced so that we can discard the history results of
+Allreduce/Broadcast calls before the latest checkpoint. This saves memory
+consumption used for backup.  The checkpoint of each node is a model defined by
+users and can be split into 2 parts: a global model and a local model. The
+global model is shared by all nodes and can be backed up by any nodes. The
+local model of a node is replicated to some other nodes (selected using a ring
+replication strategy).  The checkpoint is only saved in the memory without
+touching the disk which makes rabit programs more efficient.  The strategy of
+rabit is different from the fail-restart strategy where all the nodes restart
+from the same checkpoint when any of them fail.  In rabit, all the alive nodes
+will block in the Allreduce call and help the recovery.  To catch up, the
+recovered node fetches its latest checkpoint and the results of
+Allreduce/Broadcast calls after the checkpoint from some alive nodes.
+
+This is just a conceptual introduction to rabit's fault tolerance model. The actual implementation is more sophisticated,
+and can deal with more complicated cases such as multiple nodes failure and node failure during recovery phase.
+
+Rabit Timeout
+---------------
+
+In certain cases, a rabit cluster may lack the resources to retry failed workers.
+Because the fault tolerance model assumes infinite retries, this might cause the entire cluster to hang indefinitely.
+We introduce a sidecar thread which runs when the rabit fault tolerant runtime observes allreduce/broadcast errors.
+By default, it will wait for 30 minutes before all worker programs exit.
+The user can opt in to this feature and change the threshold by passing rabit_timeout=true and rabit_timeout_sec=x (in seconds).
diff --git a/doc/index.md b/doc/index.md
new file mode 100644
index 0000000..d209d95
--- /dev/null
+++ b/doc/index.md
@@ -0,0 +1,24 @@
+Rabit Documentation
+=====================
+rabit is a lightweight library that provides a fault tolerant interface of Allreduce and Broadcast. It is designed to support easy implementations of distributed machine learning programs, many of which fall naturally under the Allreduce abstraction. The goal of rabit is to support **portable**, **scalable** and **reliable** distributed machine learning programs.
+
+API Documents
+-------------
+```eval_rst
+
+.. toctree::
+   :maxdepth: 2
+
+   python_api.md
+   cpp_api.md
+   parameters.md
+   guide.md
+```
+Indices and tables
+------------------
+
+```eval_rst
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+```
\ No newline at end of file
diff --git a/doc/parameters.md b/doc/parameters.md
new file mode 100644
index 0000000..37580d5
--- /dev/null
+++ b/doc/parameters.md
@@ -0,0 +1,21 @@
+Parameters
+==========
+This section lists all the parameters that can be passed to the rabit::Init function as argv.
+All the parameters are passed in as strings in the format ``parameter-name=parameter-value``.
+In most settings these parameters have default values or are detected automatically,
+and do not need to be manually configured.
+
+* rabit_tracker_uri [passed in automatically by tracker]
+  - The uri/ip of rabit tracker
+* rabit_tracker_port [passed in automatically by tracker]
+  - The port of rabit tracker
+* rabit_task_id [automatically detected]
+  - The unique identifier of computing process
+  - When running on hadoop, this is automatically extracted from an environment variable
+* rabit_reduce_buffer [default = 256MB]
+  - The memory buffer used to store intermediate result of reduction
+  - Format "digits + unit", can be 128M, 1G
+* rabit_global_replica [default = 5]
+  - Number of replication copies of result kept for each Allreduce/Broadcast call
+* rabit_local_replica [default = 2]
+  - Number of replicas of the local model kept in a checkpoint
diff --git a/doc/python-requirements.txt b/doc/python-requirements.txt
new file mode 100644
index 0000000..5970c43
--- /dev/null
+++ b/doc/python-requirements.txt
@@ -0,0 +1,4 @@
+numpy
+breathe
+commonmark
+
diff --git a/doc/python_api.md b/doc/python_api.md
new file mode 100644
index 0000000..8a0eda9
--- /dev/null
+++ b/doc/python_api.md
@@ -0,0 +1,11 @@
+Python API of Rabit
+===================
+This page contains the documentation of the Python API of rabit.
+
+```eval_rst
+.. toctree::
+
+.. automodule:: rabit
+    :members:
+    :show-inheritance:
+```
diff --git a/doc/sphinx_util.py b/doc/sphinx_util.py
new file mode 100644
index 0000000..f6a33ff
--- /dev/null
+++ b/doc/sphinx_util.py
@@ -0,0 +1,16 @@
+# -*- coding: utf-8 -*-
+"""Helper utility for doc customization: fetches recommonmark and exposes its parser classes."""
+import sys
+import os
+import docutils
+import subprocess
+
+if os.environ.get('READTHEDOCS', None) == 'True':  # clone only when this variable is exactly 'True'
+    subprocess.call('cd ..; rm -rf recommonmark;' +  # removes any previous checkout in the parent dir
+                    'git clone https://github.com/tqchen/recommonmark', shell=True)
+
+sys.path.insert(0, os.path.abspath('../recommonmark/'))  # searched first, ahead of other sys.path entries
+from recommonmark import parser, transform
+
+MarkdownParser = parser.CommonMarkParser  # module-level aliases for the recommonmark classes
+AutoStructify = transform.AutoStructify
diff --git a/guide/Makefile b/guide/Makefile
new file mode 100644
index 0000000..8028890
--- /dev/null
+++ b/guide/Makefile
@@ -0,0 +1,26 @@
+export CC  = gcc
+export CXX = g++
+export MPICXX = mpicxx
+export LDFLAGS= -pthread -lm -L../lib
+export CFLAGS = -Wall -O3 -msse2 -std=c++11 -Wno-unknown-pragmas -fPIC -fopenmp -I../include
+
+.PHONY: clean all lib libmpi
+BIN = basic.rabit broadcast.rabit
+MOCKBIN= lazy_allreduce.mock
+
+all: $(BIN) # default goal: only the BIN examples, MOCKBIN has to be requested by name
+basic.rabit: basic.cc lib ../lib/librabit.a
+broadcast.rabit: broadcast.cc lib ../lib/librabit.a
+lazy_allreduce.mock: lazy_allreduce.cc lib ../lib/librabit.a # NOTE(review): link step uses -lrabit_mock, confirm this prerequisite
+
+$(BIN) : # compiles and links all source, object and archive prerequisites
+	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a,  $^) $(LDFLAGS)
+
+$(MOCKBIN) : # archive prerequisites are filtered out here, -lrabit_mock is linked instead
+	$(CXX) $(CFLAGS) -std=c++11 -o $@ $(filter %.cpp %.o %.c %.cc,  $^) $(LDFLAGS) -lrabit_mock
+
+$(OBJ) : # NOTE(review): OBJ is never assigned in this Makefile, confirm this rule is still needed
+	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
+
+clean:
+	$(RM) $(OBJ) $(BIN) $(MOCKBIN) *~ ../src/*~
diff --git a/guide/README b/guide/README
new file mode 100644
index 0000000..2483d68
--- /dev/null
+++ b/guide/README
@@ -0,0 +1 @@
+See tutorial at ../doc/guide.md
\ No newline at end of file
diff --git a/guide/basic.cc b/guide/basic.cc
new file mode 100644
index 0000000..d08397b
--- /dev/null
+++ b/guide/basic.cc
@@ -0,0 +1,35 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file basic.cc
+ * \brief This is an example demonstrating what is Allreduce
+ *
+ * \author Tianqi Chen
+ */
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#include <vector>
+#include <rabit/rabit.h>
+using namespace rabit;
+int main(int argc, char *argv[]) {
+  int N = 3;  // element count, may be raised via the first argument
+  if (argc > 1) N = atoi(argv[1]);
+  // every printf below reads a[0..2]: clamp so those reads stay in bounds
+  if (N < 3) N = 3;
+  std::vector<int> a(N);
+  rabit::Init(argc, argv);
+  for (int i = 0; i < N; ++i) {
+    a[i] = rabit::GetRank() + i;
+  }
+  printf("@node[%d] before-allreduce: a={%d, %d, %d}\n",
+         rabit::GetRank(), a[0], a[1], a[2]);
+  // allreduce takes the max of each element over all processes
+  Allreduce<op::Max>(&a[0], N);
+  printf("@node[%d] after-allreduce-max: a={%d, %d, %d}\n",
+         rabit::GetRank(), a[0], a[1], a[2]);
+  // second allreduce that sums everything up
+  Allreduce<op::Sum>(&a[0], N);
+  printf("@node[%d] after-allreduce-sum: a={%d, %d, %d}\n",
+         rabit::GetRank(), a[0], a[1], a[2]);
+  rabit::Finalize();
+  return 0;
+}
diff --git a/guide/basic.py b/guide/basic.py
new file mode 100755
index 0000000..363150b
--- /dev/null
+++ b/guide/basic.py
@@ -0,0 +1,27 @@
+#!/usr/bin/python
+"""
+Demo python script of rabit: MAX allreduce followed by SUM allreduce on a small array.
+"""
+from __future__ import print_function
+from builtins import range  # NOTE(review): presumably needs the "future" package on Python 2, confirm
+import os
+import sys
+import numpy as np
+# import rabit, the tracker script will setup the lib path correctly
+# for normal run without tracker script, add following line
+# sys.path.append(os.path.dirname(__file__) + '/../python')
+import rabit
+
+rabit.init()
+n = 3  # number of elements in the demo array
+rank = rabit.get_rank()
+a = np.zeros(n)  # local buffer, filled with rank + i below
+for i in range(n):
+    a[i] = rank + i
+
+print('@node[%d] before-allreduce: a=%s' % (rank, str(a)))
+a = rabit.allreduce(a, rabit.MAX)  # first reduction uses the MAX operator
+print('@node[%d] after-allreduce-max: a=%s' % (rank, str(a)))
+a = rabit.allreduce(a, rabit.SUM)  # second reduction sums the result of the first
+print('@node[%d] after-allreduce-sum: a=%s' % (rank, str(a)))
+rabit.finalize()
diff --git a/guide/broadcast.cc b/guide/broadcast.cc
new file mode 100644
index 0000000..9e360d8
--- /dev/null
+++ b/guide/broadcast.cc
@@ -0,0 +1,16 @@
+#include <rabit/rabit.h>
+using namespace rabit;
+const int N = 3;
+int main(int argc, char *argv[]) {  // demo: broadcast a std::string from rank 0
+  rabit::Init(argc, argv);
+  std::string s;  // left empty on every node except rank 0
+  if (rabit::GetRank() == 0) s = "hello world";
+  printf("@node[%d] before-broadcast: s=\"%s\"\n",
+         rabit::GetRank(), s.c_str());
+  // broadcast s from node 0 to all other nodes
+  rabit::Broadcast(&s, 0);  // second argument is the root rank
+  printf("@node[%d] after-broadcast: s=\"%s\"\n",
+         rabit::GetRank(), s.c_str());
+  rabit::Finalize();
+  return 0;
+}
diff --git a/guide/broadcast.py b/guide/broadcast.py
new file mode 100755
index 0000000..8b81692
--- /dev/null
+++ b/guide/broadcast.py
@@ -0,0 +1,23 @@
+#!/usr/bin/python
+"""
+Demo python script of rabit: broadcast a python dict from rank 0.
+"""
+from __future__ import print_function
+import os
+import sys
+# add path to wrapper
+# for normal run without tracker script, add following line
+# sys.path.append(os.path.dirname(__file__) + '/../wrapper')
+import rabit
+
+rabit.init()
+n = 3  # NOTE(review): not used anywhere in this script
+rank = rabit.get_rank()
+s = None  # only rank 0 assigns a payload before the broadcast
+if rank == 0:
+    s = {'hello world':100, 2:3}
+print('@node[%d] before-broadcast: s=\"%s\"' % (rank, str(s)))
+s = rabit.broadcast(s, 0)  # 0 is the root rank passed to broadcast
+
+print('@node[%d] after-broadcast: s=\"%s\"' % (rank, str(s)))
+rabit.finalize()
diff --git a/guide/lazy_allreduce.cc b/guide/lazy_allreduce.cc
new file mode 100644
index 0000000..b4b816f
--- /dev/null
+++ b/guide/lazy_allreduce.cc
@@ -0,0 +1,34 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file lazy_allreduce.cc
+ * \brief This is an example demonstrating Allreduce with a lazy preparation function
+ *
+ * \author Tianqi Chen
+ */
+#include <rabit/rabit.h>
+
+using namespace rabit;
+const int N = 3;
+int main(int argc, char *argv[]) {
+  int a[N] = {0};
+  rabit::Init(argc, argv);
+  // lazy preparation function: fills a[] when Allreduce decides to call it
+  auto prepare = [&]() {
+    printf("@node[%d] run prepare function\n", rabit::GetRank());
+    for (int i = 0; i < N; ++i) {
+      a[i] = rabit::GetRank() + i;
+    }
+  };
+  printf("@node[%d] before-allreduce: a={%d, %d, %d}\n",
+         rabit::GetRank(), a[0], a[1], a[2]);
+  // allreduce takes the max of each element over all processes
+  Allreduce<op::Max>(&a[0], N, prepare);
+  printf("@node[%d] after-allreduce-max: a={%d, %d, %d}\n",
+         rabit::GetRank(), a[0], a[1], a[2]);
+  // run second allreduce, this one sums everything up
+  Allreduce<op::Sum>(&a[0], N);
+  printf("@node[%d] after-allreduce-sum: a={%d, %d, %d}\n",
+         rabit::GetRank(), a[0], a[1], a[2]);
+  rabit::Finalize();
+  return 0;
+}
diff --git a/guide/lazy_allreduce.py b/guide/lazy_allreduce.py
new file mode 100755
index 0000000..2b60a8c
--- /dev/null
+++ b/guide/lazy_allreduce.py
@@ -0,0 +1,31 @@
+#!/usr/bin/python
+"""
+demo python script of rabit: Lazy preparation function
+"""
+import os
+import sys
+import numpy as np
+# import rabit, the tracker script will setup the lib path correctly
+# for normal run without tracker script, add following line
+# sys.path.append(os.path.dirname(__file__) + '/../wrapper')
+import rabit
+
+
+# use mock library so that we can run failure test
+rabit.init(lib = 'mock')
+n = 3
+rank = rabit.get_rank()
+a = np.zeros(n)  # stays all zeros until prepare() fills it
+
+def prepare(a):  # lazy initializer, handed to allreduce as prepare_fun
+    print('@node[%d] run prepare function' % rank)
+    # must take in reference and modify the reference
+    for i in range(n):  # range, not xrange: xrange does not exist on Python 3
+        a[i] = rank + i
+
+print('@node[%d] before-allreduce: a=%s' % (rank, str(a)))
+a = rabit.allreduce(a, rabit.MAX, prepare_fun = prepare)
+print('@node[%d] after-allreduce-max: a=%s' % (rank, str(a)))
+a = rabit.allreduce(a, rabit.SUM)
+print('@node[%d] after-allreduce-sum: a=%s' % (rank, str(a)))
+rabit.finalize()
diff --git a/include/rabit/c_api.h b/include/rabit/c_api.h
new file mode 100644
index 0000000..0a96ef7
--- /dev/null
+++ b/include/rabit/c_api.h
@@ -0,0 +1,196 @@
+/*!
+ * Copyright by Contributors
+ * \file c_api.h
+ * \author Tianqi Chen
+ * \brief a C style API of rabit.
+ */
+#ifndef RABIT_C_API_H_
+#define RABIT_C_API_H_
+
+#ifdef __cplusplus
+#define RABIT_EXTERN_C extern "C"
+#include <cstdio>
+#else
+#define RABIT_EXTERN_C
+#include <stdbool.h>  /* C has no built-in bool; RabitInit/RabitFinalize return bool */
+#include <stdio.h>
+#endif  // __cplusplus
+#if defined(_MSC_VER) || defined(_WIN32)
+#define RABIT_DLL RABIT_EXTERN_C __declspec(dllexport)
+#else
+#define RABIT_DLL RABIT_EXTERN_C
+#endif  // defined(_MSC_VER) || defined(_WIN32)
+
+/*! \brief rabit unsigned long type */
+typedef unsigned long rbt_ulong;  // NOLINT(*)
+
+/*!
+ * \brief initialize the rabit module,
+ *  call this once before using anything
+ *  The additional arguments is not necessary.
+ *  Usually rabit will detect settings
+ *  from environment variables.
+ * \param argc number of arguments in argv
+ * \param argv the array of input arguments
+ * \return true if rabit is initialized successfully otherwise false
+ */
+RABIT_DLL bool RabitInit(int argc, char *argv[]);
+
+/*!
+ * \brief finalize the rabit engine,
+ * call this function after you finished all jobs.
+ * \return true if rabit is finalized successfully otherwise false
+ */
+RABIT_DLL bool RabitFinalize(void);
+
+/*!
+ * \brief get rank of previous process in ring topology
+ * \return rank number of worker
+ * */
+RABIT_DLL int RabitGetRingPrevRank(void);
+
+/*!
+ * \brief get rank of current process
+ * \return rank number of worker
+ * */
+RABIT_DLL int RabitGetRank(void);
+
+/*!
+ * \brief get total number of process
+ * \return total world size
+ * */
+RABIT_DLL int RabitGetWorldSize(void);
+
+/*!
+ * \brief check whether rabit is running in distributed mode
+ * \return if rabit is distributed
+ * */
+RABIT_DLL int RabitIsDistributed(void);
+
+/*!
+ * \brief print the msg to the tracker,
+ *    this function can be used to communicate the information of the progress to
+ *    the user who monitors the tracker
+ * \param msg the message to be printed
+ */
+RABIT_DLL void RabitTrackerPrint(const char *msg);
+/*!
+ * \brief get name of processor
+ * \param out_name hold output string
+ * \param out_len hold length of output string
+ * \param max_len maximum buffer length of input
+ */
+RABIT_DLL void RabitGetProcessorName(char *out_name,
+                                     rbt_ulong *out_len,
+                                     rbt_ulong max_len);
+/*!
+ * \brief broadcast an memory region to all others from root
+ *
+ *     Example: int a = 1; Broadcast(&a, sizeof(a), root);
+ * \param sendrecv_data the pointer to send or receive buffer,
+ * \param size the size of the data
+ * \param root the root of process
+ */
+RABIT_DLL void RabitBroadcast(void *sendrecv_data,
+                              rbt_ulong size, int root);
+
+/*!
+ * \brief Allgather function, each node have a segment of data in the ring of sendrecvbuf,
+ *  the data provided by current node k is [slice_begin, slice_end),
+ *  the next node's segment must start with slice_end
+ *  after the call of Allgather, sendrecvbuf_ contains all the contents including all segments
+ *  use a ring based algorithm
+ *
+ * \param sendrecvbuf buffer for both sending and receiving data, it is a ring conceptually
+ * \param total_size total size of data to be gathered
+ * \param beginIndex beginning of the current slice in sendrecvbuf of type enum_dtype
+ * \param size_node_slice size of the current node slice
+ * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
+ * \param enum_dtype the enumeration of data type, see rabit::engine::mpi::DataType in engine.h of rabit include
+ * \note this C API function is declared void, so no status code is returned to the caller
+ * \sa ReturnType
+ */
+RABIT_DLL void RabitAllgather(void *sendrecvbuf,
+                                  size_t total_size,
+                                  size_t beginIndex,
+                                  size_t size_node_slice,
+                                  size_t size_prev_slice,
+                                  int enum_dtype);
+
+/*!
+ * \brief perform in-place allreduce, on sendrecvbuf
+ *        this function is NOT thread-safe
+ *
+ * Example Usage: the following code gives sum of the result
+ *     vector<int> data(10);
+ *     ...
+ *     Allreduce<op::Sum>(&data[0], data.size());
+ *     ...
+ * \param sendrecvbuf buffer for both sending and recving data
+ * \param count number of elements to be reduced
+ * \param enum_dtype the enumeration of data type, see rabit::engine::mpi::DataType in engine.h of rabit include
+ * \param enum_op the enumeration of operation type, see rabit::engine::mpi::OpType in engine.h of rabit
+ * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
+ *                    will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf.
+ *                    If the result of Allreduce can be recovered directly, then prepare_fun will NOT be called
+ * \param prepare_arg argument passed into the lazy preprocessing function
+ */
+RABIT_DLL void RabitAllreduce(void *sendrecvbuf,
+                              size_t count,
+                              int enum_dtype,
+                              int enum_op,
+                              void (*prepare_fun)(void *arg),
+                              void *prepare_arg);
+
+/*!
+ * \brief load latest check point
+ * \param out_global_model hold output of serialized global_model
+ * \param out_global_len the output length of serialized global model
+ * \param out_local_model hold output of serialized local_model, can be NULL
+ * \param out_local_len the output length of serialized local model, can be NULL
+ *
+ * \return the version number of check point loaded
+ *     if returned version == 0, this means no model has been CheckPointed
+ *     nothing will be touched
+ */
+RABIT_DLL int RabitLoadCheckPoint(char **out_global_model,
+                                  rbt_ulong *out_global_len,
+                                  char **out_local_model,
+                                  rbt_ulong *out_local_len);
+/*!
+ * \brief checkpoint the model, meaning we finished a stage of execution
+ *  every time we call check point, there is a version number which will increase by one
+ *
+ * \param global_model hold content of serialized global_model
+ * \param global_len the content length of serialized global model
+ * \param local_model hold content of serialized local_model, can be NULL
+ * \param local_len the content length of serialized local model, can be NULL
+ *
+ * NOTE: local_model requires explicit replication of the model for fault-tolerance, which will
+ *       bring replication cost in CheckPoint function. global_model do not need explicit replication.
+ *       So only CheckPoint with global_model if possible
+ */
+RABIT_DLL void RabitCheckPoint(const char *global_model,
+                               rbt_ulong global_len,
+                               const char *local_model,
+                               rbt_ulong local_len);
+/*!
+ * \return version number of current stored model,
+ * which means how many calls to CheckPoint we made so far
+ * \return rabit version number
+ */
+RABIT_DLL int RabitVersionNumber(void);
+
+
+/*!
+ * \brief a Dummy function,
+ *  used to cause force link of C API  into the  DLL.
+ * \code
+ * \/\/force link rabit C API library.
+ * static int must_link_rabit_ = RabitLinkTag();
+ * \endcode
+ * \return a dummy integer.
+ */
+RABIT_DLL int RabitLinkTag(void);
+
+#endif  // RABIT_C_API_H_
diff --git a/include/rabit/internal/engine.h b/include/rabit/internal/engine.h
new file mode 100644
index 0000000..0db10e7
--- /dev/null
+++ b/include/rabit/internal/engine.h
@@ -0,0 +1,346 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file engine.h
+ * \brief This file defines the core interface of rabit library
+ * \author Tianqi Chen, Nacho, Tianyi
+ */
+#ifndef RABIT_INTERNAL_ENGINE_H_
+#define RABIT_INTERNAL_ENGINE_H_
+#include <string>
+#include "rabit/serializable.h"
+
+#if (defined(__GNUC__) && !defined(__clang__))
+#define _FILE  __builtin_FILE()
+#define _LINE  __builtin_LINE()
+#define _CALLER  __builtin_FUNCTION()
+#else
+#define _FILE  "N/A"
+#define _LINE  -1
+#define _CALLER  "N/A"
+#endif  // (defined(__GNUC__) && !defined(__clang__))
+
+namespace MPI {
+/*! \brief MPI data type just to be compatible with MPI reduce function*/
+class Datatype;
+}
+
+/*! \brief namespace of rabit */
+namespace rabit {
+/*! \brief core interface of the engine */
+namespace engine {
+/*! \brief interface of core Allreduce engine */
+class IEngine {
+ public:
+  /*!
+   * \brief Preprocessing function, that is called before AllReduce,
+   *        used to prepare the data used by AllReduce
+   * \param arg additional possible argument used to invoke the preprocessor
+   */
+  typedef void (PreprocFunction) (void *arg);
+  /*!
+   * \brief reduce function, the same form of MPI reduce function is used,
+   *        to be compatible with MPI interface
+   *        In all the functions, the memory is ensured to aligned to 64-bit
+   *        which means it is OK to cast src,dst to double* int* etc
+   * \param src pointer to source space
+   * \param dst pointer to destination reduction
+   * \param count total number of elements to be reduced (note this is total number of elements instead of bytes)
+   *              the definition of the reduce function should be type aware
+   * \param dtype the data type object, to be compatible with MPI reduce
+   */
+  typedef void (ReduceFunction) (const void *src,
+                                 void *dst, int count,
+                                 const MPI::Datatype &dtype);
+  /*! \brief virtual destructor */
+  virtual ~IEngine() {}
+  /*!
+   * \brief Allgather function, each node have a segment of data in the ring of sendrecvbuf,
+   *  the data provided by current node k is [slice_begin, slice_end),
+   *  the next node's segment must start with slice_end
+   *  after the call of Allgather, sendrecvbuf_ contains all the contents including all segments
+   *  use a ring based algorithm
+   *
+   * \param sendrecvbuf buffer for both sending and receiving data, it is a ring conceptually
+   * \param total_size total size of data to be gathered
+   * \param slice_begin beginning of the current slice
+   * \param slice_end end of the current slice
+   * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
+   * \param _file caller file name used to generate unique cache key
+   * \param _line caller line number used to generate unique cache key
+   * \param _caller caller function name used to generate unique cache key
+   */
+  virtual void Allgather(void *sendrecvbuf,
+                         size_t total_size,
+                         size_t slice_begin,
+                         size_t slice_end,
+                         size_t size_prev_slice,
+                         const char* _file = _FILE,
+                         const int _line = _LINE,
+                         const char* _caller = _CALLER) = 0;
+  /*!
+   * \brief performs in-place Allreduce, on sendrecvbuf
+   *        this function is NOT thread-safe
+   * \param sendrecvbuf_ buffer for both sending and receiving data
+   * \param type_nbytes the number of bytes the type has
+   * \param count number of elements to be reduced
+   * \param reducer reduce function
+   * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
+   *                    will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf_.
+   *                    If the result of Allreduce can be recovered directly, then prepare_fun will NOT be called
+   * \param prepare_arg argument used to pass into the lazy preprocessing function
+   * \param _file caller file name used to generate unique cache key
+   * \param _line caller line number used to generate unique cache key
+   * \param _caller caller function name used to generate unique cache key
+   */
+  virtual void Allreduce(void *sendrecvbuf_,
+                         size_t type_nbytes,
+                         size_t count,
+                         ReduceFunction reducer,
+                         PreprocFunction prepare_fun = NULL,
+                         void *prepare_arg = NULL,
+                         const char* _file = _FILE,
+                         const int _line = _LINE,
+                         const char* _caller = _CALLER) = 0;
+  /*!
+   * \brief broadcasts data from root to every other node
+   * \param sendrecvbuf_ buffer for both sending and receiving data
+   * \param size the size of the data to be broadcasted
+   * \param root the root worker id to broadcast the data
+   * \param _file caller file name used to generate unique cache key
+   * \param _line caller line number used to generate unique cache key
+   * \param _caller caller function name used to generate unique cache key
+   */
+  virtual void Broadcast(void *sendrecvbuf_, size_t size, int root,
+                         const char* _file = _FILE,
+                         const int _line = _LINE,
+                         const char* _caller = _CALLER) = 0;
+  /*!
+   * \brief explicitly re-initialize everything before calling LoadCheckPoint
+   *    call this function when IEngine throws an exception,
+   *    this function should only be used for test purposes
+   */
+  virtual void InitAfterException(void) = 0;
+  /*!
+   * \brief loads the latest check point
+   * \param global_model pointer to the globally shared model/state
+   *   when calling this function, the caller needs to guarantee that the global_model
+   *   is the same in all nodes
+   * \param local_model pointer to the local model that is specific to current node/rank
+   *   this can be NULL when no local model is needed
+   *
+   * \return the version number of the model loaded
+   *     if returned version == 0, this means no model has been CheckPointed
+   *     the p_model is not touched, users should do necessary initialization by themselves
+   *
+   *   Common usage example:
+   *      int iter = rabit::LoadCheckPoint(&model);
+   *      if (iter == 0) model.InitParameters();
+   *      for (i = iter; i < max_iter; ++i) {
+   *        do many things, include allreduce
+   *        rabit::CheckPoint(model);
+   *      }
+   *
+   * \sa CheckPoint, VersionNumber
+   */
+  virtual int LoadCheckPoint(Serializable *global_model,
+                             Serializable *local_model = NULL) = 0;
+  /*!
+   * \brief checkpoints the model, meaning a stage of execution was finished
+   *  every time we call check point, a version number increases by one
+   *
+   * \param global_model pointer to the globally shared model/state
+   *   when calling this function, the caller needs to guarantee that the global_model
+   *   is the same in every node
+   * \param local_model pointer to the local model that is specific to current node/rank
+   *   this can be NULL when no local state is needed
+   *
+   * NOTE: local_model requires explicit replication of the model for fault-tolerance, which will
+   *       bring replication cost in CheckPoint function. global_model does not need explicit replication.
+   *       So, only CheckPoint with global_model if possible
+   *
+   * \sa LoadCheckPoint, VersionNumber
+   */
+  virtual void CheckPoint(const Serializable *global_model,
+                          const Serializable *local_model = NULL) = 0;
+  /*!
+   * \brief This function can be used to replace CheckPoint for global_model only,
+   *   when certain condition is met (see detailed explanation).
+   *
+   *   This is a "lazy" checkpoint such that only the pointer to global_model is
+   *   remembered and no memory copy is taken. To use this function, the user MUST ensure that:
+   *   The global_model must remain unchanged until the last call of Allreduce/Broadcast in the current version finishes.
+   *   In other words, global_model can be changed only between the last call of
+   *   Allreduce/Broadcast and LazyCheckPoint in the current version
+   *
+   *   For example, suppose the calling sequence is:
+   *   LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint
+   *
+   *   If the user can only change global_model in code3, then LazyCheckPoint can be used to
+   *   improve the efficiency of the program.
+   * \param global_model pointer to the globally shared model/state
+   *   when calling this function, the caller needs to guarantee that global_model
+   *   is the same in every node
+   * \sa LoadCheckPoint, CheckPoint, VersionNumber
+   */
+  virtual void LazyCheckPoint(const Serializable *global_model) = 0;
+  /*!
+   * \return version number of the current stored model,
+   *         which means how many calls to CheckPoint we made so far
+   * \sa LoadCheckPoint, CheckPoint
+   */
+  virtual int VersionNumber(void) const = 0;
+  /*! \brief gets rank of previous node in ring topology */
+  virtual int GetRingPrevRank(void) const = 0;
+  /*! \brief gets rank of current node */
+  virtual int GetRank(void) const = 0;
+  /*! \brief gets total number of nodes */
+  virtual int GetWorldSize(void) const = 0;
+  /*! \brief whether we run in distributed mode */
+  virtual bool IsDistributed(void) const = 0;
+  /*! \brief gets the host name of the current node */
+  virtual std::string GetHost(void) const = 0;
+  /*!
+   * \brief prints the msg in the tracker,
+   *    this function can be used to communicate progress information to
+   *    the user who monitors the tracker
+   * \param msg message to be printed in the tracker
+   */
+  virtual void TrackerPrint(const std::string &msg) = 0;
+};
+
+/*! \brief initializes the engine module */
+bool Init(int argc, char *argv[]);
+/*! \brief finalizes the engine module */
+bool Finalize(void);
+/*! \brief singleton method to get engine */
+IEngine *GetEngine(void);
+
+/*! \brief namespace that contains stubs to be compatible with MPI */
+namespace mpi {
+/*!\brief enum of all operators */
+enum OpType {
+  kMax = 0,
+  kMin = 1,
+  kSum = 2,
+  kBitwiseOR = 3
+};
+/*!\brief enum of supported data types */
+enum DataType {
+  kChar = 0,
+  kUChar = 1,
+  kInt = 2,
+  kUInt = 3,
+  kLong = 4,
+  kULong = 5,
+  kFloat = 6,
+  kDouble = 7,
+  kLongLong = 8,
+  kULongLong = 9
+};
+}  // namespace mpi
+/*!
+ * \brief Allgather function, each node have a segment of data in the ring of sendrecvbuf,
+ *  the data provided by current node k is [slice_begin, slice_end),
+ *  the next node's segment must start with slice_end
+ *  after the call of Allgather, sendrecvbuf_ contains all the contents including all segments
+ *  use a ring based algorithm
+ *
+ * \param sendrecvbuf buffer for both sending and receiving data, it is a ring conceptually
+ * \param total_size total size of data to be gathered
+ * \param slice_begin beginning of the current slice
+ * \param slice_end end of the current slice
+ * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
+ * \param _file caller file name used to generate unique cache key
+ * \param _line caller line number used to generate unique cache key
+ * \param _caller caller function name used to generate unique cache key
+ */
+void Allgather(void* sendrecvbuf,
+                   size_t total_size,
+                   size_t slice_begin,
+                   size_t slice_end,
+                   size_t size_prev_slice,
+                   const char* _file = _FILE,
+                   const int _line = _LINE,
+                   const char* _caller = _CALLER);
+/*!
+ * \brief perform in-place Allreduce, on sendrecvbuf
+ *   this is an internal function used by rabit to be able to compile with MPI
+ *   do not use this function directly
+ * \param sendrecvbuf buffer for both sending and receiving data
+ * \param type_nbytes the number of bytes the type has
+ * \param count number of elements to be reduced
+ * \param red reduce function
+ * \param dtype the data type
+ * \param op the reduce operator type
+ * \param prepare_fun Lazy preprocessing function, lazy prepare_fun(prepare_arg)
+ *                    will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf.
+ *                    If the result of Allreduce can be recovered directly, then prepare_fun will NOT be called
+ * \param prepare_arg argument used to pass into the lazy preprocessing function.
+ * \param _file caller file name used to generate unique cache key
+ * \param _line caller line number used to generate unique cache key
+ * \param _caller caller function name used to generate unique cache key
+ */
+void Allreduce_(void *sendrecvbuf,
+                size_t type_nbytes,
+                size_t count,
+                IEngine::ReduceFunction red,
+                mpi::DataType dtype,
+                mpi::OpType op,
+                IEngine::PreprocFunction prepare_fun = NULL,
+                void *prepare_arg = NULL,
+                const char* _file = _FILE,
+                const int _line = _LINE,
+                const char* _caller = _CALLER);
+/*!
+ * \brief handle for customized reducer, used to handle customized reduce
+ *  this class is mainly created for compatibility issues with MPI's customized reduce
+ */
+class ReduceHandle {
+ public:
+  // constructor
+  ReduceHandle(void);
+  // destructor
+  ~ReduceHandle(void);
+  /*!
+   * \brief initialize the reduce function,
+   *   with the type the reduce function needs to deal with
+   *   the reduce function MUST be commutative
+   */
+  void Init(IEngine::ReduceFunction redfunc, size_t type_nbytes);
+  /*!
+   * \brief customized in-place all reduce operation
+   * \param sendrecvbuf the in place send-recv buffer
+   * \param type_nbytes size of the type in bytes
+   * \param count number of elements to send
+   * \param prepare_fun Lazy preprocessing function, lazy prepare_fun(prepare_arg)
+   *                    will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
+   *                    If the result of Allreduce can be recovered directly, then prepare_fun will NOT be called
+   * \param prepare_arg argument used to pass into the lazy preprocessing function
+   * \param _file caller file name used to generate unique cache key
+   * \param _line caller line number used to generate unique cache key
+   * \param _caller caller function name used to generate unique cache key
+   */
+  void Allreduce(void *sendrecvbuf,
+                 size_t type_nbytes,
+                 size_t count,
+                 IEngine::PreprocFunction prepare_fun = NULL,
+                 void *prepare_arg = NULL,
+                 const char* _file = _FILE,
+                 const int _line = _LINE,
+                 const char* _caller = _CALLER);
+  /*! \return the number of bytes occupied by the type */
+  static int TypeSize(const MPI::Datatype &dtype);
+
+ protected:
+  // handle function field
+  void *handle_;
+  // reduce function of the reducer
+  IEngine::ReduceFunction *redfunc_;
+  // handle to the type field
+  void *htype_;
+  // the created type in 4 bytes
+  size_t created_type_nbytes_;
+};
+}  // namespace engine
+}  // namespace rabit
+#endif  // RABIT_INTERNAL_ENGINE_H_
diff --git a/include/rabit/internal/io.h b/include/rabit/internal/io.h
new file mode 100644
index 0000000..f7e255b
--- /dev/null
+++ b/include/rabit/internal/io.h
@@ -0,0 +1,114 @@
+/*!
+ *  Copyright (c) 2014-2019 by Contributors
+ * \file io.h
+ * \brief utilities with different serializable implementations
+ * \author Tianqi Chen
+ */
+#ifndef RABIT_INTERNAL_IO_H_
+#define RABIT_INTERNAL_IO_H_
+#include <cstdio>
+#include <vector>
+#include <cstring>
+#include <string>
+#include <algorithm>
+#include <numeric>
+#include <limits>
+#include "rabit/internal/utils.h"
+#include "rabit/serializable.h"
+
+namespace rabit {
+namespace utils {
+/*! \brief re-use definition of dmlc::SeekStream */
+typedef dmlc::SeekStream SeekStream;
+/*! \brief fixed size memory buffer exposed through the SeekStream interface */
+struct MemoryFixSizeBuffer : public SeekStream {
+ public:
+  // similar to SEEK_END in libc: Seek(SeekEnd) moves the cursor to the end of the buffer
+  static size_t constexpr SeekEnd = std::numeric_limits<size_t>::max();
+  /*! \brief wrap buffer_size bytes starting at p_buffer; the bytes are not copied */
+  MemoryFixSizeBuffer(void *p_buffer, size_t buffer_size)
+      : p_buffer_(reinterpret_cast<char*>(p_buffer)),
+        buffer_size_(buffer_size), curr_ptr_(0) {}
+  virtual ~MemoryFixSizeBuffer(void) {}
+  /*! \brief copy at most size bytes from the cursor into ptr, \return bytes copied */
+  virtual size_t Read(void *ptr, size_t size) {
+    // Seek accepts any offset, so guard the unsigned subtraction below against wrap-around
+    utils::Assert(curr_ptr_ <= buffer_size_,
+                  "read can not have position exceed buffer length");
+    size_t nread = std::min(buffer_size_ - curr_ptr_, size);
+    if (nread != 0) std::memcpy(ptr, p_buffer_ + curr_ptr_, nread);
+    curr_ptr_ += nread;
+    return nread;
+  }
+  /*! \brief copy size bytes from ptr to the cursor; asserts that they fit in the buffer */
+  virtual void Write(const void *ptr, size_t size) {
+    if (size == 0) return;
+    // compare against the remaining space so that curr_ptr_ + size cannot overflow
+    utils::Assert(curr_ptr_ <= buffer_size_ && size <= buffer_size_ - curr_ptr_,
+                  "write position exceed fixed buffer size");
+    std::memcpy(p_buffer_ + curr_ptr_, ptr, size);
+    curr_ptr_ += size;
+  }
+  /*! \brief move the cursor to pos, or to the end of the buffer when pos == SeekEnd */
+  virtual void Seek(size_t pos) {
+    curr_ptr_ = (pos == SeekEnd) ? buffer_size_ : pos;
+  }
+  virtual size_t Tell(void) {
+    return curr_ptr_;
+  }
+  virtual bool AtEnd(void) const {
+    return curr_ptr_ == buffer_size_;
+  }
+
+ private:
+  /*! \brief in memory buffer */
+  char *p_buffer_;
+  /*! \brief total size of the buffer in bytes */
+  size_t buffer_size_;
+  /*! \brief current read/write offset into the buffer */
+  size_t curr_ptr_;
+};  // class MemoryFixSizeBuffer
+
+/*! \brief growable in-memory stream backed by a caller-provided std::string */
+struct MemoryBufferStream : public SeekStream {
+ public:
+  /*! \brief attach to p_buffer with the cursor at offset zero; the string is not copied */
+  explicit MemoryBufferStream(std::string *p_buffer)
+      : p_buffer_(p_buffer), curr_ptr_(0) {}
+  virtual ~MemoryBufferStream(void) {}
+  /*! \brief copy at most size bytes from the cursor into ptr, \return bytes copied */
+  virtual size_t Read(void *ptr, size_t size) {
+    const size_t total = p_buffer_->length();
+    utils::Assert(curr_ptr_ <= total,
+                  "read can not have position excceed buffer length");
+    const size_t ncopy = std::min(total - curr_ptr_, size);
+    if (ncopy != 0) {
+      std::memcpy(ptr, &(*p_buffer_)[0] + curr_ptr_, ncopy);
+    }
+    curr_ptr_ += ncopy;
+    return ncopy;
+  }
+  /*! \brief copy size bytes from ptr to the cursor, growing the string when needed */
+  virtual void Write(const void *ptr, size_t size) {
+    if (size == 0) return;
+    const size_t end_pos = curr_ptr_ + size;
+    if (end_pos > p_buffer_->length()) p_buffer_->resize(end_pos);
+    std::memcpy(&(*p_buffer_)[0] + curr_ptr_, ptr, size);
+    curr_ptr_ = end_pos;
+  }
+  /*! \brief move the cursor to pos; the position is not range checked here */
+  virtual void Seek(size_t pos) { curr_ptr_ = pos; }
+  /*! \return current cursor offset */
+  virtual size_t Tell(void) { return curr_ptr_; }
+  /*! \return whether the cursor equals the current string length */
+  virtual bool AtEnd(void) const { return curr_ptr_ == p_buffer_->length(); }
+
+ private:
+  /*! \brief string that backs the stream */
+  std::string *p_buffer_;
+  /*! \brief current read/write offset into the string */
+  size_t curr_ptr_;
+};  // class MemoryBufferStream
+}  // namespace utils
+}  // namespace rabit
+#endif  // RABIT_INTERNAL_IO_H_
diff --git a/include/rabit/internal/rabit-inl.h b/include/rabit/internal/rabit-inl.h
new file mode 100644
index 0000000..8ae604c
--- /dev/null
+++ b/include/rabit/internal/rabit-inl.h
@@ -0,0 +1,386 @@
+/*!
+ * Copyright (c) 2014-2019 by Contributors
+ * \file rabit-inl.h
+ * \brief implementation of inline template function for rabit interface
+ *
+ * \author Tianqi Chen
+ */
+#ifndef RABIT_INTERNAL_RABIT_INL_H_
+#define RABIT_INTERNAL_RABIT_INL_H_
+// use engine for implementation
+#include <vector>
+#include <string>
+#include "rabit/internal/io.h"
+#include "rabit/internal/utils.h"
+#include "rabit/rabit.h"
+
+namespace rabit {
+namespace engine {
+namespace mpi {
+// map a C++ arithmetic type to its DataType enum tag; only the specializations below are defined
+template<typename DType>
+inline DataType GetType();
+template<>
+inline DataType GetType<char>() {
+  return kChar;
+}
+template<>
+inline DataType GetType<unsigned char>() {
+  return kUChar;
+}
+template<>
+inline DataType GetType<int>() {
+  return kInt;
+}
+template<>
+inline DataType GetType<unsigned int>() {  // NOLINT(*)
+  return kUInt;
+}
+template<>
+inline DataType GetType<long>() {  // NOLINT(*)
+  return kLong;
+}
+template<>
+inline DataType GetType<unsigned long>() {  // NOLINT(*)
+  return kULong;
+}
+template<>
+inline DataType GetType<float>() {
+  return kFloat;
+}
+template<>
+inline DataType GetType<double>() {
+  return kDouble;
+}
+template<>
+inline DataType GetType<long long>() {  // NOLINT(*)
+  return kLongLong;
+}
+template<>
+inline DataType GetType<unsigned long long>() {  // NOLINT(*)
+  return kULongLong;
+}
+}  // namespace mpi
+}  // namespace engine
+
+namespace op {
+/*! \brief maximum reduction: dst becomes the larger of dst and src */
+struct Max {
+  static const engine::mpi::OpType kType = engine::mpi::kMax;
+  template<typename DType> inline static void Reduce(DType &dst, const DType &src) {  // NOLINT(*)
+    if (dst < src) dst = src;
+  }
+};
+/*! \brief minimum reduction: dst becomes the smaller of dst and src */
+struct Min {
+  static const engine::mpi::OpType kType = engine::mpi::kMin;
+  template<typename DType> inline static void Reduce(DType &dst, const DType &src) {  // NOLINT(*)
+    if (dst > src) dst = src;
+  }
+};
+/*! \brief sum reduction: src is added onto dst */
+struct Sum {
+  static const engine::mpi::OpType kType = engine::mpi::kSum;
+  template<typename DType> inline static void Reduce(DType &dst, const DType &src) {  // NOLINT(*)
+    dst += src;
+  }
+};
+/*! \brief bitwise OR reduction: the bits of src are OR-ed into dst */
+struct BitOR {
+  static const engine::mpi::OpType kType = engine::mpi::kBitwiseOR;
+  template<typename DType> inline static void Reduce(DType &dst, const DType &src) {  // NOLINT(*)
+    dst |= src;
+  }
+};
+/*! \brief reduce callback applying OP::Reduce(dst_[i], src_[i]) to each of the len elements */
+template<typename OP, typename DType>
+inline void Reducer(const void *src_, void *dst_, int len, const MPI::Datatype &dtype) {
+  // dtype is part of the callback signature but is not consulted here
+  const DType *in = static_cast<const DType*>(src_);
+  DType *out = static_cast<DType*>(dst_);
+  for (int idx = 0; idx < len; ++idx) OP::Reduce(out[idx], in[idx]);
+}
+}  // namespace op
+
+// initialize the rabit engine; forwards argc/argv to engine::Init and returns its result
+inline bool Init(int argc, char *argv[]) {
+  return engine::Init(argc, argv);
+}
+// finalize the rabit engine; returns the result of engine::Finalize
+inline bool Finalize(void) {
+  return engine::Finalize();
+}
+// get the rank of the previous worker in ring topology, as reported by the engine
+inline int GetRingPrevRank(void) {
+  return engine::GetEngine()->GetRingPrevRank();
+}
+// get the rank of current process, as reported by the engine
+inline int GetRank(void) {
+  return engine::GetEngine()->GetRank();
+}
+// get the size of the world, as reported by the engine
+inline int GetWorldSize(void) {
+  return engine::GetEngine()->GetWorldSize();
+}
+// whether rabit is distributed, as reported by the engine
+inline bool IsDistributed(void) {
+  return engine::GetEngine()->IsDistributed();
+}
+// get the name of current processor; this is the engine's GetHost() value
+inline std::string GetProcessorName(void) {
+  return engine::GetEngine()->GetHost();
+}
+// broadcast size bytes at sendrecv_data to all other nodes from root (forwards to the engine)
+inline void Broadcast(void *sendrecv_data, size_t size, int root,
+                      const char* _file,
+                      const int _line,
+                      const char* _caller) {
+  engine::GetEngine()->Broadcast(sendrecv_data, size, root,
+    _file, _line, _caller);
+}
+// broadcast a vector from root: the element count goes first, then the raw element bytes
+template<typename DType>
+inline void Broadcast(std::vector<DType> *sendrecv_data, int root,
+                      const char* _file, const int _line, const char* _caller) {
+  // step 1: share the element count, starting from the local size
+  size_t nelem = sendrecv_data->size();
+  Broadcast(&nelem, sizeof(nelem), root, _file, _line, _caller);
+  // step 2: size the local vector so the payload can be received in place
+  if (sendrecv_data->size() != nelem) sendrecv_data->resize(nelem);
+  // step 3: transfer the payload; skipped when there are no elements to address
+  if (nelem == 0) return;
+  DType *payload = &(*sendrecv_data)[0];
+  Broadcast(payload, nelem * sizeof(DType), root,
+            _file, _line, _caller);
+}
+// broadcast a string from root: the length goes first, then the characters
+inline void Broadcast(std::string *sendrecv_data, int root,
+                      const char* _file, const int _line, const char* _caller) {
+  // step 1: share the length, starting from the local one
+  size_t nchar = sendrecv_data->length();
+  Broadcast(&nchar, sizeof(nchar), root, _file, _line, _caller);
+  // step 2: size the local string so the payload can be received in place
+  if (sendrecv_data->length() != nchar) sendrecv_data->resize(nchar);
+  // step 3: transfer the characters; skipped for an empty string
+  if (nchar == 0) return;
+  char *payload = &(*sendrecv_data)[0];
+  Broadcast(payload, nchar * sizeof(char), root,
+            _file, _line, _caller);
+}
+
+// perform inplace Allreduce of count elements with OP; all arguments go to engine::Allreduce_
+template<typename OP, typename DType>
+inline void Allreduce(DType *sendrecvbuf, size_t count,
+                      void (*prepare_fun)(void *arg),
+                      void *prepare_arg,
+                      const char* _file,
+                      const int _line,
+                      const char* _caller) {
+  engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer<OP, DType>,
+                     engine::mpi::GetType<DType>(), OP::kType, prepare_fun, prepare_arg,
+                     _file, _line, _caller);
+}
+
+// C++11 support for lambda prepare function
+#if DMLC_USE_CXX11
+inline void InvokeLambda_(void *fun) {  // fun must point to a std::function<void()>
+  (*static_cast<std::function<void()>*>(fun))();
+}
+// in-place Allreduce overload that takes the lazy prepare step as a std::function
+template<typename OP, typename DType>
+inline void Allreduce(DType *sendrecvbuf, size_t count,
+                      std::function<void()> prepare_fun,
+                      const char* _file, const int _line, const char* _caller) {
+  // the callable is passed by address and invoked through the InvokeLambda_ trampoline
+  engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer<OP, DType>,
+                     engine::mpi::GetType<DType>(), OP::kType, InvokeLambda_, &prepare_fun,
+                     _file, _line, _caller);
+}
+
+// Performs inplace Allgather; element counts are converted to byte offsets for the engine
+template<typename DType>
+inline void Allgather(DType *sendrecvbuf,
+                      size_t totalSize, size_t beginIndex,
+                      size_t sizeNodeSlice, size_t sizePrevSlice,
+                      const char* _file, const int _line, const char* _caller) {
+  const size_t kUnit = sizeof(DType);
+  const size_t total_bytes = totalSize * kUnit;
+  const size_t slice_begin = beginIndex * kUnit;
+  const size_t slice_end = (beginIndex + sizeNodeSlice) * kUnit;
+  const size_t prev_bytes = sizePrevSlice * kUnit;
+  engine::GetEngine()->Allgather(sendrecvbuf, total_bytes, slice_begin, slice_end, prev_bytes,
+                                 _file, _line, _caller);
+}
+#endif  // C++11
+
+// print message to the tracker by forwarding it to the engine's TrackerPrint
+inline void TrackerPrint(const std::string &msg) {
+  engine::GetEngine()->TrackerPrint(msg);
+}
+#ifndef RABIT_STRICT_CXX98_
+inline void TrackerPrintf(const char *fmt, ...) {
+  const int kPrintBuffer = 1 << 10;  // fixed capacity; vsnprintf truncates longer output
+  std::string text(kPrintBuffer, '\0');
+  va_list vargs;
+  va_start(vargs, fmt);
+  vsnprintf(&text[0], kPrintBuffer, fmt, vargs);
+  va_end(vargs);
+  text.resize(strlen(text.c_str()));  // keep only the characters before the first NUL
+  TrackerPrint(text);
+}
+
+#endif  // RABIT_STRICT_CXX98_
+// load latest check point into the given models; returns the engine's LoadCheckPoint result
+inline int LoadCheckPoint(Serializable *global_model,
+                          Serializable *local_model) {
+  return engine::GetEngine()->LoadCheckPoint(global_model, local_model);
+}
+// checkpoint the model, meaning we finished a stage of execution (forwards to the engine)
+inline void CheckPoint(const Serializable *global_model,
+                       const Serializable *local_model) {
+  engine::GetEngine()->CheckPoint(global_model, local_model);
+}
+// lazy checkpoint the model, only remember the pointer to global_model (forwards to the engine)
+inline void LazyCheckPoint(const Serializable *global_model) {
+  engine::GetEngine()->LazyCheckPoint(global_model);
+}
+// return the version number of currently stored model, as reported by the engine
+inline int VersionNumber(void) {
+  return engine::GetEngine()->VersionNumber();
+}
+// ---------------------------------
+// Code to handle customized Reduce
+// ---------------------------------
+// reduce callback for Reducer that works on buffers which may not be aligned for DType
+template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
+inline void ReducerSafe_(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) {
+  const char *in_bytes = static_cast<const char*>(src_);
+  char *out_bytes = static_cast<char*>(dst_);
+  for (int idx = 0; idx < len_; ++idx) {
+    const size_t offset = idx * sizeof(DType);
+    // stage both operands in local DType objects via memcpy to avoid alignment issues
+    DType acc, rhs;
+    std::memcpy(&acc, out_bytes + offset, sizeof(DType));
+    std::memcpy(&rhs, in_bytes + offset, sizeof(DType));
+    freduce(acc, rhs);
+    // write the combined value back over the destination element
+    std::memcpy(out_bytes + offset, &acc, sizeof(DType));
+  }
+}
+// reduce callback for Reducer that addresses both buffers directly as arrays of DType
+template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
+inline void ReducerAlign_(const void *src_, void *dst_,
+                          int len_, const MPI::Datatype &dtype) {
+  const DType *in = static_cast<const DType*>(src_);
+  DType *out = static_cast<DType*>(dst_);
+  for (int idx = 0; idx < len_; ++idx) {
+    freduce(out[idx], in[idx]);
+  }
+}
+template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
+inline Reducer<DType, freduce>::Reducer(void) {
+  const size_t kUnit = sizeof(DType);
+  if (kUnit == 1 || kUnit == 4 || kUnit == 8) {  // sizes treated as safe for direct access
+    this->handle_.Init(ReducerAlign_<DType, freduce>, kUnit);
+  } else {
+    // any other element size goes through the memcpy-based callback
+    this->handle_.Init(ReducerSafe_<DType, freduce>, kUnit);
+  }
+}
+// in-place Allreduce with the customized reducer: delegates to the reduce handle
+template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
+inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
+                                               void (*prepare_fun)(void *arg),
+                                               void *prepare_arg,
+                                               const char* _file, const int _line,
+                                               const char* _caller) {
+  const size_t kUnit = sizeof(DType);  // element size handed to the handle, in bytes
+  handle_.Allreduce(sendrecvbuf, kUnit, count, prepare_fun, prepare_arg, _file, _line, _caller);
+}
+// reduce callback for SerializeReducer: load each fixed-size slot, reduce, save back into dst_
+template<typename DType>
+inline void SerializeReducerFunc_(const void *src_, void *dst_,
+                                  int len_, const MPI::Datatype &dtype) {
+  int nbytes = engine::ReduceHandle::TypeSize(dtype);
+  for (int i = 0; i < len_; ++i) {
+    const size_t offset = static_cast<size_t>(i) * nbytes;  // size_t: i * nbytes may overflow int
+    DType tsrc, tdst;
+    utils::MemoryFixSizeBuffer fsrc((char*)(src_) + offset, nbytes); // NOLINT(*)
+    utils::MemoryFixSizeBuffer fdst((char*)(dst_) + offset, nbytes); // NOLINT(*)
+    tsrc.Load(fsrc);
+    tdst.Load(fdst);
+    // govern const check
+    tdst.Reduce(static_cast<const DType &>(tsrc), nbytes);
+    fdst.Seek(0);
+    tdst.Save(fdst);
+  }
+}
+template<typename DType>
+inline SerializeReducer<DType>::SerializeReducer(void) {  // installs SerializeReducerFunc_
+  handle_.Init(SerializeReducerFunc_<DType>, sizeof(DType));
+}
+// lazy-prepare closure for SerializeReducer: runs the user hook, then serializes every object
+template<typename DType>
+struct SerializeReduceClosure {
+  DType *sendrecvobj;              // objects to serialize
+  size_t max_nbyte, count;         // bytes reserved per object, number of objects
+  void (*prepare_fun)(void *arg);  // optional user hook, skipped when NULL
+  void *prepare_arg;               // argument handed to prepare_fun
+  std::string *p_buffer;           // destination buffer, addressed in max_nbyte sized slots
+  // invoke the closure
+  inline void Run(void) {
+    if (prepare_fun != NULL) prepare_fun(prepare_arg);
+    for (size_t idx = 0; idx != count; ++idx) {
+      utils::MemoryFixSizeBuffer slot(BeginPtr(*p_buffer) + idx * max_nbyte, max_nbyte);
+      sendrecvobj[idx].Save(slot);
+    }
+  }
+  inline static void Invoke(void *c) {  // c must point to a SerializeReduceClosure<DType>
+    static_cast<SerializeReduceClosure<DType>*>(c)->Run();
+  }
+};
+// Allreduce over serializable objects: each one travels in a fixed slot of max_nbyte bytes
+template<typename DType>
+inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
+                                               size_t max_nbyte, size_t count,
+                                               void (*prepare_fun)(void *arg),
+                                               void *prepare_arg,
+                                               const char* _file, const int _line,
+                                               const char* _caller) {
+  buffer_.resize(max_nbyte * count);
+  // the closure carries the user prepare hook and serializes the objects into buffer_
+  SerializeReduceClosure<DType> closure;
+  closure.sendrecvobj = sendrecvobj; closure.max_nbyte = max_nbyte;
+  closure.count = count; closure.prepare_fun = prepare_fun;
+  closure.prepare_arg = prepare_arg; closure.p_buffer = &buffer_;
+  handle_.Allreduce(BeginPtr(buffer_), max_nbyte, count,
+                    SerializeReduceClosure<DType>::Invoke, &closure, _file, _line, _caller);
+  // unpack the reduced slots back into the caller's objects
+  for (size_t idx = 0; idx != count; ++idx) {
+    utils::MemoryFixSizeBuffer slot(BeginPtr(buffer_) + idx * max_nbyte, max_nbyte);
+    sendrecvobj[idx].Load(slot);
+  }
+}
+
+#if DMLC_USE_CXX11
+// std::function overload: the callable is passed by address through InvokeLambda_
+template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
+inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
+                                               std::function<void()> prepare_fun,
+                                               const char* _file, const int _line,
+                                               const char* _caller) {
+  void *callable = &prepare_fun;  // by-value parameter, alive for the whole call below
+  this->Allreduce(sendrecvbuf, count, InvokeLambda_, callable, _file, _line, _caller);
+}
+// std::function overload: the callable is passed by address through InvokeLambda_
+template<typename DType>
+inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
+                                               size_t max_nbytes, size_t count,
+                                               std::function<void()> prepare_fun,
+                                               const char* _file, const int _line,
+                                               const char* _caller) {
+  void *callable = &prepare_fun;  // by-value parameter, alive for the whole call below
+  this->Allreduce(sendrecvobj, max_nbytes, count, InvokeLambda_, callable, _file, _line, _caller);
+}
+#endif  // DMLC_USE_CXX11
+}  // namespace rabit
+#endif  // RABIT_INTERNAL_RABIT_INL_H_
diff --git a/include/rabit/internal/socket.h b/include/rabit/internal/socket.h
new file mode 100644
index 0000000..a80c4bc
--- /dev/null
+++ b/include/rabit/internal/socket.h
@@ -0,0 +1,536 @@
+/*!
+ *  Copyright (c) 2014-2019 by Contributors
+ * \file socket.h
+ * \brief this file aims to provide a wrapper of sockets
+ * \author Tianqi Chen
+ */
+#ifndef RABIT_INTERNAL_SOCKET_H_
+#define RABIT_INTERNAL_SOCKET_H_
+#if defined(_WIN32)
+#include <winsock2.h>
+#include <ws2tcpip.h>
+#ifdef _MSC_VER
+#pragma comment(lib, "Ws2_32.lib")
+#endif  // _MSC_VER
+#else
+#include <fcntl.h>
+#include <netdb.h>
+#include <errno.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#endif  // defined(_WIN32)
+#include <string>
+#include <cstring>
+#include <vector>
+#include <unordered_map>
+#include "utils.h"
+
+#if defined(_WIN32) || defined(__MINGW32__)
+typedef int ssize_t;
+#endif  // defined(_WIN32) || defined(__MINGW32__)
+
+#if defined(_WIN32)
+typedef int sock_size_t;
+
+static inline int poll(struct pollfd *pfd, int nfds,
+                       int timeout) { return WSAPoll(pfd, nfds, timeout); }
+#else
+#include <sys/poll.h>
+typedef int SOCKET;
+typedef size_t sock_size_t;
+const int INVALID_SOCKET = -1;
+#endif  // defined(_WIN32)
+
+namespace rabit {
+namespace utils {
+/*! \brief IPv4 network address (host and port) stored as a sockaddr_in */
+struct SockAddr {
+  sockaddr_in addr;
+  // constructors: the default one zero-fills addr, the other resolves url/port through Set
+  SockAddr(void) { memset(&addr, 0, sizeof(addr)); }
+  SockAddr(const char *url, int port) { this->Set(url, port); }
+  /*! \return the host name of this machine as reported by gethostname */
+  inline static std::string GetHostName(void) {
+    std::string buf; buf.resize(256);
+    utils::Check(gethostname(&buf[0], 256) != -1, "fail to get host name");
+    return std::string(buf.c_str());
+  }
+  /*!
+   * \brief resolve host to an IPv4 address and store it together with port
+   * \param host the host name or address string to resolve
+   * \param port the port of address
+   */
+  inline void Set(const char *host, int port) {
+    addrinfo hints;
+    memset(&hints, 0, sizeof(hints));
+    hints.ai_family = AF_INET;
+    // SOCK_STREAM is a socket type: it belongs in ai_socktype, not in ai_protocol
+    hints.ai_socktype = SOCK_STREAM;
+    addrinfo *res = NULL;
+    int sig = getaddrinfo(host, NULL, &hints, &res);
+    Check(sig == 0 && res != NULL, "cannot obtain address of %s", host);
+    Check(res->ai_family == AF_INET, "Does not support IPv6");
+    memcpy(&addr, res->ai_addr, res->ai_addrlen);
+    addr.sin_port = htons(port);
+    freeaddrinfo(res);
+  }
+  /*! \brief return port of the address*/
+  inline int port(void) const {
+    return ntohs(addr.sin_port);
+  }
+  /*! \return a string representation of the address */
+  inline std::string AddrStr(void) const {
+    std::string buf; buf.resize(256);
+#ifdef _WIN32
+    const char *s = inet_ntop(AF_INET, (PVOID)&addr.sin_addr,
+                    &buf[0], buf.length());
+#else
+    const char *s = inet_ntop(AF_INET, &addr.sin_addr,
+                              &buf[0], buf.length());
+#endif  // _WIN32
+    Assert(s != NULL, "cannot decode address");
+    return std::string(s);
+  }
+};
+
+/*!
+ * \brief base class containing common operations of TCP and UDP sockets
+ */
+class Socket {
+ public:
+  /*! \brief the file descriptor of socket */
+  SOCKET sockfd;
+  // implicit conversion to the underlying SOCKET handle
+  inline operator SOCKET() const {
+    return sockfd;
+  }
+  /*!
+   * \return last error of socket operation
+   */
+  inline static int GetLastError(void) {
+#ifdef _WIN32
+    return WSAGetLastError();
+#else
+    return errno;
+#endif  // _WIN32
+  }
+  /*! \return whether last error was would block */
+  inline static bool LastErrorWouldBlock(void) {
+    int errsv = GetLastError();
+#ifdef _WIN32
+    return errsv == WSAEWOULDBLOCK;
+#else
+    return errsv == EAGAIN || errsv == EWOULDBLOCK;
+#endif  // _WIN32
+  }
+  /*!
+   * \brief start up the socket module
+   *   call this before using the sockets
+   */
+  inline static void Startup(void) {
+#ifdef _WIN32
+    WSADATA wsa_data;
+    // WSAStartup reports failure through a non-zero return code, which is the error itself
+    const int startup_err = WSAStartup(MAKEWORD(2, 2), &wsa_data);
+    if (startup_err != 0) utils::Error("Socket Startup Error:WSAError-code=%d", startup_err);
+    if (LOBYTE(wsa_data.wVersion) != 2 || HIBYTE(wsa_data.wVersion) != 2) {
+      WSACleanup();
+      utils::Error("Could not find a usable version of Winsock.dll\n");
+    }
+#endif  // _WIN32
+  }
+  /*!
+   * \brief shutdown the socket module after use, all sockets need to be closed
+   */
+  inline static void Finalize(void) {
+#ifdef _WIN32
+    WSACleanup();
+#endif  // _WIN32
+  }
+  /*!
+   * \brief set this socket to use non-blocking mode
+   * \param non_block whether set it to be non-block, if it is false
+   *        it will set it back to block mode
+   */
+  inline void SetNonBlock(bool non_block) {
+#ifdef _WIN32
+    u_long mode = non_block ? 1 : 0;
+    if (ioctlsocket(sockfd, FIONBIO, &mode) != NO_ERROR) {
+      Socket::Error("SetNonBlock");
+    }
+#else
+    int flag = fcntl(sockfd, F_GETFL, 0);
+    if (flag == -1) {
+      Socket::Error("SetNonBlock-1");
+    }
+    if (non_block) {
+      flag |= O_NONBLOCK;
+    } else {
+      flag &= ~O_NONBLOCK;
+    }
+    if (fcntl(sockfd, F_SETFL, flag) == -1) {
+      Socket::Error("SetNonBlock-2");
+    }
+#endif  // _WIN32
+  }
+  /*!
+   * \brief bind the socket to an address
+   * \param addr the address to bind to
+   */
+  inline void Bind(const SockAddr &addr) {
+    if (bind(sockfd, reinterpret_cast<const sockaddr*>(&addr.addr),
+             sizeof(addr.addr)) == -1) {
+      Socket::Error("Bind");
+    }
+  }
+  /*!
+   * \brief try bind the socket to 0.0.0.0 on the first free port in [start_port, end_port)
+   * \param start_port starting port number to try
+   * \param end_port ending port number to try (exclusive)
+   * \return the port successfully bind to, return -1 if failed to bind any port
+   */
+  inline int TryBindHost(int start_port, int end_port) {
+    // TODO(tqchen) add prefix check
+    for (int port = start_port; port < end_port; ++port) {
+      SockAddr addr("0.0.0.0", port);
+      if (bind(sockfd, reinterpret_cast<sockaddr*>(&addr.addr),
+               sizeof(addr.addr)) == 0) {
+        return port;
+      }
+#if defined(_WIN32)
+      if (WSAGetLastError() != WSAEADDRINUSE) {
+        Socket::Error("TryBindHost");
+      }
+#else
+      if (errno != EADDRINUSE) {
+        Socket::Error("TryBindHost");
+      }
+#endif  // defined(_WIN32)
+    }
+
+    return -1;
+  }
+  /*! \brief get last error code if any (the SO_ERROR socket option) */
+  inline int GetSockError(void) const {
+    int error = 0;
+    socklen_t len = sizeof(error);
+    if (getsockopt(sockfd,  SOL_SOCKET, SO_ERROR,
+            reinterpret_cast<char*>(&error), &len) != 0) {
+      Error("GetSockError");
+    }
+    return error;
+  }
+  /*! \brief check if anything bad happens: closed, or SO_ERROR is EBADF or EINTR */
+  inline bool BadSocket(void) const {
+    if (IsClosed()) return true;
+    int err = GetSockError();
+    if (err == EBADF || err == EINTR) return true;
+    return false;
+  }
+  /*! \brief check if socket is already closed */
+  inline bool IsClosed(void) const {
+    return sockfd == INVALID_SOCKET;
+  }
+  /*! \brief close the socket; reports an error when the socket is not open */
+  inline void Close(void) {
+    if (sockfd != INVALID_SOCKET) {
+#ifdef _WIN32
+      closesocket(sockfd);
+#else
+      close(sockfd);
+#endif
+      sockfd = INVALID_SOCKET;
+    } else {
+      Error("Socket::Close double close the socket or close without create");
+    }
+  }
+  // report a socket error: msg plus the last error (code on Windows, strerror text elsewhere)
+  inline static void Error(const char *msg) {
+    int errsv = GetLastError();
+#ifdef _WIN32
+    utils::Error("Socket %s Error:WSAError-code=%d", msg, errsv);
+#else
+    utils::Error("Socket %s Error:%s", msg, strerror(errsv));
+#endif
+  }
+
+ protected:
+  explicit Socket(SOCKET sockfd) : sockfd(sockfd) {
+  }
+};
+
+/*!
+ * \brief a wrapper of TCP socket that hopefully be cross platform
+ */
+class TCPSocket : public Socket{
+ public:
+  // constructors: start without a descriptor, or adopt an existing one
+  TCPSocket(void) : Socket(INVALID_SOCKET) {}
+  explicit TCPSocket(SOCKET sockfd) : Socket(sockfd) {}
+  /*!
+   * \brief enable/disable TCP keepalive
+   * \param keepalive whether to set the keep alive option on
+   */
+  void SetKeepAlive(bool keepalive) {
+    int opt = static_cast<int>(keepalive);
+    if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE,
+                   reinterpret_cast<char*>(&opt), sizeof(opt)) < 0) {
+      Socket::Error("SetKeepAlive");
+    }
+  }
+  /*! \brief turn SO_LINGER on with the given timeout in seconds */
+  inline void SetLinger(int timeout = 0) {
+    struct linger sl;
+    sl.l_onoff = 1;    /* non-zero value enables linger option in kernel */
+    sl.l_linger = timeout;    /* timeout interval in seconds */
+    if (setsockopt(sockfd, SOL_SOCKET, SO_LINGER,
+                   reinterpret_cast<char*>(&sl), sizeof(sl)) == -1) {
+      Socket::Error("SO_LINGER");
+    }
+  }
+  /*!
+   * \brief create the socket, call this before using socket
+   * \param af domain passed to socket(), PF_INET by default
+   */
+  inline void Create(int af = PF_INET) {
+    sockfd = socket(af, SOCK_STREAM, 0);
+    if (sockfd == INVALID_SOCKET) {
+      Socket::Error("Create");
+    }
+  }
+  /*!
+   * \brief perform listen of the socket, reporting an error when listen fails
+   * \param backlog backlog parameter
+   */
+  inline void Listen(int backlog = 16) {
+    if (listen(sockfd, backlog) == -1) Socket::Error("Listen");
+  }
+  /*! \brief get a new connection */
+  TCPSocket Accept(void) {
+    SOCKET newfd = accept(sockfd, NULL, NULL);
+    if (newfd == INVALID_SOCKET) {
+      Socket::Error("Accept");
+    }
+    return TCPSocket(newfd);
+  }
+  /*!
+   * \brief decide whether the socket is at OOB mark
+   * \return 1 if at mark, 0 if not, -1 if an error occurred
+   */
+  inline int AtMark(void) const {
+#ifdef _WIN32
+    unsigned long atmark;  // NOLINT(*)
+    if (ioctlsocket(sockfd, SIOCATMARK, &atmark) != NO_ERROR) return -1;
+#else
+    int atmark;
+    if (ioctl(sockfd, SIOCATMARK, &atmark) == -1) return -1;
+#endif  // _WIN32
+    return static_cast<int>(atmark);
+  }
+  /*!
+   * \brief connect to an address
+   * \param addr the address to connect to
+   * \return whether connect is successful
+   */
+  inline bool Connect(const SockAddr &addr) {
+    return connect(sockfd, reinterpret_cast<const sockaddr*>(&addr.addr),
+                   sizeof(addr.addr)) == 0;
+  }
+  /*!
+   * \brief send data using the socket
+   * \param buf_ the pointer to the buffer
+   * \param len the size of the buffer
+   * \param flag extra flags
+   * \return size of data actually sent
+   *         return -1 if error occurs
+   */
+  inline ssize_t Send(const void *buf_, size_t len, int flag = 0) {
+    const char *buf = reinterpret_cast<const char*>(buf_);
+    return send(sockfd, buf, static_cast<sock_size_t>(len), flag);
+  }
+  /*!
+   * \brief receive data using the socket
+   * \param buf_ the pointer to the buffer
+   * \param len the size of the buffer
+   * \param flags extra flags
+   * \return size of data actually received
+   *         return -1 if error occurs
+   */
+  inline ssize_t Recv(void *buf_, size_t len, int flags = 0) {
+    char *buf = reinterpret_cast<char*>(buf_);
+    return recv(sockfd, buf, static_cast<sock_size_t>(len), flags);
+  }
+  /*!
+   * \brief perform block write that will attempt to send all data out
+   *    can still return smaller than request when the socket would block
+   * \param buf_ the pointer to the buffer
+   * \param len the size of the buffer
+   * \return size of data actually sent
+   */
+  inline size_t SendAll(const void *buf_, size_t len) {
+    const char *buf = reinterpret_cast<const char*>(buf_);
+    size_t ndone = 0;
+    while (ndone <  len) {
+      ssize_t ret = send(sockfd, buf, static_cast<sock_size_t>(len - ndone), 0);
+      if (ret == -1) {
+        if (LastErrorWouldBlock()) return ndone;
+        Socket::Error("SendAll");
+      }
+      buf += ret;
+      ndone += ret;
+    }
+    return ndone;
+  }
+  /*!
+   * \brief perform block read that will attempt to read all data
+   *    can still return smaller than request when the socket would block or recv returns 0
+   * \param buf_ the buffer pointer
+   * \param len length of data to recv
+   * \return size of data actually received
+   */
+  inline size_t RecvAll(void *buf_, size_t len) {
+    char *buf = reinterpret_cast<char*>(buf_);
+    size_t ndone = 0;
+    while (ndone <  len) {
+      ssize_t ret = recv(sockfd, buf,
+                         static_cast<sock_size_t>(len - ndone), MSG_WAITALL);
+      if (ret == -1) {
+        if (LastErrorWouldBlock()) return ndone;
+        Socket::Error("RecvAll");
+      }
+      if (ret == 0) return ndone;
+      buf += ret;
+      ndone += ret;
+    }
+    return ndone;
+  }
+  /*!
+   * \brief send a string over network as an int length followed by the characters
+   * \param str the string to be sent
+   */
+  inline void SendStr(const std::string &str) {
+    int len = static_cast<int>(str.length());
+    utils::Assert(this->SendAll(&len, sizeof(len)) == sizeof(len),
+                  "error during send SendStr");
+    if (len != 0) {
+      utils::Assert(this->SendAll(str.c_str(), str.length()) == str.length(),
+                    "error during send SendStr");
+    }
+  }
+  /*!
+   * \brief recv a string from network; a negative length prefix fails the assertion
+   * \param out_str the string to receive
+   */
+  inline void RecvStr(std::string *out_str) {
+    int len;
+    utils::Assert(this->RecvAll(&len, sizeof(len)) == sizeof(len) && len >= 0,
+                  "error during recv RecvStr");
+    out_str->resize(len);
+    if (len != 0) {
+      utils::Assert(this->RecvAll(&(*out_str)[0], len) == out_str->length(),
+                    "error during recv RecvStr");
+    }
+  }
+};
+
+/*! \brief helper that collects descriptors with their wanted events and polls them together */
+struct PollHelper {
+ public:
+  /*!
+   * \brief register interest in read readiness (POLLIN) for a descriptor
+   * \param fd file descriptor to be watched
+   */
+  inline void WatchRead(SOCKET fd) {
+    pollfd &entry = fds[fd];
+    entry.fd = fd;
+    entry.events |= POLLIN;
+  }
+  /*!
+   * \brief register interest in write readiness (POLLOUT) for a descriptor
+   * \param fd file descriptor to be watched
+   */
+  inline void WatchWrite(SOCKET fd) {
+    pollfd &entry = fds[fd];
+    entry.fd = fd;
+    entry.events |= POLLOUT;
+  }
+  /*!
+   * \brief register interest in exceptional conditions (POLLPRI) for a descriptor
+   * \param fd file descriptor to be watched
+   */
+  inline void WatchException(SOCKET fd) {
+    pollfd &entry = fds[fd];
+    entry.fd = fd;
+    entry.events |= POLLPRI;
+  }
+  /*!
+   * \brief Check if the descriptor is present with POLLIN set in its events
+   * \param fd file descriptor to check status
+   */
+  inline bool CheckRead(SOCKET fd) const {
+    auto it = fds.find(fd);
+    return it != fds.end() && (it->second.events & POLLIN) != 0;
+  }
+  /*!
+   * \brief Check if the descriptor is present with POLLOUT set in its events
+   * \param fd file descriptor to check status
+   */
+  inline bool CheckWrite(SOCKET fd) const {
+    auto it = fds.find(fd);
+    return it != fds.end() && (it->second.events & POLLOUT) != 0;
+  }
+  /*!
+   * \brief Check if the descriptor is present with POLLPRI set in its events
+   * \param fd file descriptor to check status
+   */
+  inline bool CheckExcept(SOCKET fd) const {
+    auto it = fds.find(fd);
+    return it != fds.end() && (it->second.events & POLLPRI) != 0;
+  }
+  /*!
+   * \brief wait for exception event on a single descriptor
+   * \param fd the file descriptor to wait the event for
+   * \param timeout the timeout counter, can be negative, which means wait until the event happen
+   * \return 1 if success, 0 if timeout, and -1 if error occurs
+   */
+  inline static int WaitExcept(SOCKET fd, long timeout = -1) { // NOLINT(*)
+    pollfd single;
+    single.fd = fd;
+    single.events = POLLPRI;
+    return poll(&single, 1, timeout);
+  }
+
+  /*!
+   * \brief poll every registered descriptor for the events it was registered with
+   * \param timeout specify timeout in milliseconds(ms) if negative, means poll will block
+   * \return nothing; the outcome is stored in fds and read through the Check* methods
+   */
+  inline void Poll(long timeout = -1) {  // NOLINT(*)
+    // poll() needs a contiguous array, so copy the registered entries out of the map
+    std::vector<pollfd> snapshot;
+    snapshot.reserve(fds.size());
+    for (const auto &entry : fds) snapshot.push_back(entry.second);
+    if (poll(snapshot.data(), snapshot.size(), timeout) == -1) {
+      Socket::Error("Poll");
+      return;
+    }
+    // keep only the descriptors on which a requested event fired; for those,
+    // events is overwritten with the fired subset so that the Check* methods can read it
+    for (const pollfd &item : snapshot) {
+      const int fired = item.revents & item.events;
+      if (fired != 0) {
+        fds[item.fd].events = fired;
+      } else {
+        fds.erase(item.fd);
+      }
+    }
+  }
+
+  std::unordered_map<SOCKET, pollfd> fds;
+};
+}  // namespace utils
+}  // namespace rabit
+#endif  // RABIT_INTERNAL_SOCKET_H_
diff --git a/include/rabit/internal/thread_local.h b/include/rabit/internal/thread_local.h
new file mode 100644
index 0000000..4eebd64
--- /dev/null
+++ b/include/rabit/internal/thread_local.h
@@ -0,0 +1,87 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file thread_local.h
+ * \brief Common utility for thread local storage.
+ */
+#ifndef RABIT_INTERNAL_THREAD_LOCAL_H_
+#define RABIT_INTERNAL_THREAD_LOCAL_H_
+
+#include "../include/dmlc/base.h"
+
+#if DMLC_ENABLE_STD_THREAD
+#include <mutex>
+#endif  // DMLC_ENABLE_STD_THREAD
+
+#include <memory>
+#include <vector>
+
+namespace rabit {
+
+// macro handling for threadlocal variables
+#ifdef __GNUC__
+  #define MX_TREAD_LOCAL __thread
+#elif __STDC_VERSION__ >= 201112L
+  #define  MX_TREAD_LOCAL _Thread_local
+#elif defined(_MSC_VER)
+  #define MX_TREAD_LOCAL __declspec(thread)
+#endif  // __GNUC__
+
+#ifndef MX_TREAD_LOCAL
+#pragma message("Warning: Threadlocal is not enabled")
+#endif  // MX_TREAD_LOCAL
+
+/*!
+ * \brief A threadlocal store to store threadlocal variables.
+ *  Will return a thread local singleton of type T
+ * \tparam T the type we like to store
+ */
+template<typename T>
+class ThreadLocalStore {
+ public:
+  /*! \return the instance of T cached in the MX_TREAD_LOCAL pointer, created on first use */
+  static T* Get() {
+    static MX_TREAD_LOCAL T* instance = nullptr;
+    if (instance != nullptr) return instance;
+    // first call through this pointer: allocate, then let the store own the object
+    instance = new T();
+    Singleton()->RegisterDelete(instance);
+    return instance;
+  }
+
+ private:
+  /*! \brief constructor, private so the store is only reachable via Singleton() */
+  ThreadLocalStore() {}
+  /*! \brief destructor, frees every object registered through RegisterDelete */
+  ~ThreadLocalStore() {
+    for (T *owned : data_) {
+      delete owned;
+    }
+  }
+  /*! \return the single store instance for type T (function-local static) */
+  static ThreadLocalStore<T> *Singleton() {
+    static ThreadLocalStore<T> store;
+    return &store;
+  }
+  /*!
+   * \brief remember a heap object so the destructor can delete it
+   * \param str pointer to the object whose deletion is delegated to the store
+   */
+  void RegisterDelete(T *str) {
+#if DMLC_ENABLE_STD_THREAD
+    // hold the mutex for the duration of the append;
+    // the guard releases it when the function returns
+    std::lock_guard<std::mutex> guard(mutex_);
+#endif  // DMLC_ENABLE_STD_THREAD
+    data_.push_back(str);
+  }
+
+#if DMLC_ENABLE_STD_THREAD
+  /*! \brief mutex taken by RegisterDelete while appending to data_ */
+  std::mutex mutex_;
+#endif  // DMLC_ENABLE_STD_THREAD
+  /*! \brief objects owned by the store, deleted in the destructor */
+  std::vector<T*> data_;
+};
+}  // namespace rabit
+#endif  // RABIT_INTERNAL_THREAD_LOCAL_H_
diff --git a/include/rabit/internal/timer.h b/include/rabit/internal/timer.h
new file mode 100644
index 0000000..3ce1bf8
--- /dev/null
+++ b/include/rabit/internal/timer.h
@@ -0,0 +1,41 @@
+/*!
+ * Copyright (c) 2014-2019 by Contributors
+ * \file timer.h
+ * \brief This file defines the utils for timing
+ * \author Tianqi Chen, Nacho, Tianyi
+ */
+#ifndef RABIT_INTERNAL_TIMER_H_
+#define RABIT_INTERNAL_TIMER_H_
+#include <time.h>
+#ifdef __MACH__
+#include <mach/clock.h>
+#include <mach/mach.h>
+#endif  // __MACH__
+#include "./utils.h"
+
+namespace rabit {
+namespace utils {
+/*!
+ * \brief wall-clock time in seconds; platform specific, avoid using this in most places
+ */
+inline double GetTime(void) {
+#if defined(__MACH__)
+  // Mach: read the calendar clock service, then release the port
+  clock_serv_t clock_service;
+  mach_timespec_t now;
+  host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &clock_service);
+  utils::Check(clock_get_time(clock_service, &now) == 0, "failed to get time");
+  mach_port_deallocate(mach_task_self(), clock_service);
+  return static_cast<double>(now.tv_sec) + static_cast<double>(now.tv_nsec) * 1e-9;
+#elif defined(__unix__) || defined(__linux__)
+  timespec now;
+  utils::Check(clock_gettime(CLOCK_REALTIME, &now) == 0, "failed to get time");
+  return static_cast<double>(now.tv_sec) + static_cast<double>(now.tv_nsec) * 1e-9;
+#else
+  // fallback: whole-second resolution from time()
+  return static_cast<double>(time(NULL));
+#endif  // __MACH__ / unix / other
+}
+}  // namespace utils
+}  // namespace rabit
+#endif  // RABIT_INTERNAL_TIMER_H_
diff --git a/include/rabit/internal/utils.h b/include/rabit/internal/utils.h
new file mode 100644
index 0000000..05f3555
--- /dev/null
+++ b/include/rabit/internal/utils.h
@@ -0,0 +1,231 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file utils.h
+ * \brief simple utils to support the code
+ * \author Tianqi Chen
+ */
+#ifndef RABIT_INTERNAL_UTILS_H_
+#define RABIT_INTERNAL_UTILS_H_
+#define _CRT_SECURE_NO_WARNINGS
+#include <string.h>
+#include <cstdio>
+#include <string>
+#include <cstdlib>
+#include <stdexcept>
+#include <vector>
+#include "dmlc/io.h"
+
+#ifndef RABIT_STRICT_CXX98_
+#include <cstdarg>
+#endif  // RABIT_STRICT_CXX98_
+
+#if !defined(__GNUC__) || defined(__FreeBSD__)
+#define fopen64 std::fopen
+#endif  // !defined(__GNUC__) || defined(__FreeBSD__)
+
+#ifdef _MSC_VER
+// NOTE: sprintf_s is not equivalent to snprintf,
+// they are equivalent when success, which is sufficient for our case
+#define snprintf sprintf_s
+#define vsnprintf vsprintf_s
+
+#else
+
+#ifdef _FILE_OFFSET_BITS
+#if _FILE_OFFSET_BITS == 32
+#pragma message("Warning: FILE OFFSET BITS defined to be 32 bit")
+#endif  // _FILE_OFFSET_BITS == 32
+#endif  // _FILE_OFFSET_BITS
+
+#ifdef __APPLE__
+#define off64_t off_t
+#define fopen64 std::fopen
+#endif  // __APPLE__
+
+extern "C" {
+#include <sys/types.h>
+}
+#endif  // _MSC_VER
+
+#ifdef _MSC_VER
+typedef unsigned char uint8_t;
+typedef unsigned __int16 uint16_t;
+typedef unsigned __int32 uint32_t;
+typedef unsigned __int64 uint64_t;
+typedef __int64 int64_t;
+#else
+#include <inttypes.h>
+#endif  // _MSC_VER
+
+namespace rabit {
+/*! \brief namespace for helper utils of the project */
+namespace utils {
+
+/*! \brief error message buffer length */
+const int kPrintBuffer = 1 << 12;
+
+/*! \brief we may want to keep the process alive when there are multiple workers
+ * co-locate in the same process */
+extern bool STOP_PROCESS_ON_ERROR;
+
+/*! \brief Case-insensitive comparison of two C strings; returns 0 when they match */
+inline int CompareStringsCaseInsensitive(const char* s1, const char* s2) {
+#if defined(_MSC_VER)
+  return _stricmp(s1, s2);
+#else  // !defined(_MSC_VER)
+  return strcasecmp(s1, s2);
+#endif  // defined(_MSC_VER)
+}
+
+/*! \brief parse config string to bool: "true" (any case) or a non-zero integer is true */
+inline bool StringToBool(const char* s) {
+  return atoi(s) != 0 || CompareStringsCaseInsensitive(s, "true") == 0;
+}
+
+#ifndef RABIT_CUSTOMIZE_MSG_
+/*!
+ * \brief report a failed Assert; exits or throws depending on STOP_PROCESS_ON_ERROR
+ * \param msg error message
+ */
+inline void HandleAssertError(const char *msg) {
+  if (!STOP_PROCESS_ON_ERROR) {
+    // keep the process alive: log, then surface the failure as dmlc::Error
+    fprintf(stderr, "AssertError:%s, rabit is configured to keep process running\n", msg);
+    throw dmlc::Error(msg);
+  }
+  fprintf(stderr, "AssertError:%s, shutting down process\n", msg);
+  exit(-1);
+}
+/*!
+ * \brief report a failed Check; exits or throws depending on STOP_PROCESS_ON_ERROR
+ * \param msg error message
+ */
+inline void HandleCheckError(const char *msg) {
+  if (!STOP_PROCESS_ON_ERROR) {
+    // keep the process alive: log, then surface the failure as dmlc::Error
+    fprintf(stderr, "%s, rabit is configured to keep process running\n", msg);
+    throw dmlc::Error(msg);
+  }
+  fprintf(stderr, "%s, shutting down process\n", msg);
+  exit(-1);
+}
+inline void HandlePrint(const char *msg) {
+  printf("%s", msg);  // fixed "%s" format: msg is printed verbatim to stdout
+}
+
+inline void HandleLogInfo(const char *fmt, ...) {  // printf-style log to stdout, flushed
+  std::string buffer(kPrintBuffer, '\0');
+  va_list ap;
+  va_start(ap, fmt);
+  vsnprintf(&buffer[0], kPrintBuffer, fmt, ap);
+  va_end(ap);
+  fprintf(stdout, "%s", buffer.c_str());
+  fflush(stdout);
+}
+#else
+#ifndef RABIT_STRICT_CXX98_
+// include declarations, some one must implement this
+void HandleAssertError(const char *msg);
+void HandleCheckError(const char *msg);
+void HandlePrint(const char *msg);
+#endif  // RABIT_STRICT_CXX98_
+#endif  // RABIT_CUSTOMIZE_MSG_
+#ifdef RABIT_STRICT_CXX98_
+// these function pointers are to be assigned
+extern "C" void (*Printf)(const char *fmt, ...);
+extern "C" int (*SPrintf)(char *buf, size_t size, const char *fmt, ...);
+extern "C" void (*Assert)(int exp, const char *fmt, ...);
+extern "C" void (*Check)(int exp, const char *fmt, ...);
+extern "C" void (*Error)(const char *fmt, ...);
+#else
+/*! \brief printf-style formatting into a fixed-size buffer, forwarded to HandlePrint */
+inline void Printf(const char *fmt, ...) {
+  std::string buffer(kPrintBuffer, '\0');
+  va_list ap;
+  va_start(ap, fmt);
+  vsnprintf(&buffer[0], kPrintBuffer, fmt, ap);
+  va_end(ap);
+  HandlePrint(buffer.c_str());
+}
+/*! \brief portable snprintf: formats into buf (at most size bytes), returns vsnprintf's result */
+inline int SPrintf(char *buf, size_t size, const char *fmt, ...) {
+  va_list ap;
+  va_start(ap, fmt);
+  const int written = vsnprintf(buf, size, fmt, ap);
+  va_end(ap);
+  return written;
+}
+
+/*! \brief assert a condition is true, use this to handle debug information */
+inline void Assert(bool exp, const char *fmt, ...) {
+  if (exp) return;
+  // condition failed: format the message and hand it to the assert handler
+  std::string buffer(kPrintBuffer, '\0');
+  va_list ap;
+  va_start(ap, fmt);
+  vsnprintf(&buffer[0], kPrintBuffer, fmt, ap);
+  va_end(ap);
+  HandleAssertError(buffer.c_str());
+}
+
+/*! \brief same as Assert, but the message is intended for users */
+inline void Check(bool exp, const char *fmt, ...) {
+  if (exp) return;
+  // condition failed: format the message and hand it to the check handler
+  std::string buffer(kPrintBuffer, '\0');
+  va_list ap;
+  va_start(ap, fmt);
+  vsnprintf(&buffer[0], kPrintBuffer, fmt, ap);
+  va_end(ap);
+  HandleCheckError(buffer.c_str());
+}
+
+/*! \brief report error message, same as check */
+inline void Error(const char *fmt, ...) {
+  {
+    std::string msg(kPrintBuffer, '\0');
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
+    va_end(args);
+    HandleCheckError(msg.c_str());
+  }
+}
+#endif  // RABIT_STRICT_CXX98_
+
+/*! \brief fopen wrapper that reports through Check when the file cannot be opened */
+inline std::FILE *FopenCheck(const char *fname, const char *flag) {
+  std::FILE *handle = fopen64(fname, flag);
+  Check(handle != NULL, "can not open file \"%s\"\n", fname);
+  return handle;
+}
+}  // namespace utils
+// easy utils that can be directly accessed in xgboost
+/*! \brief get the beginning address of a vector, NULL when the vector is empty */
+template<typename T>
+inline T *BeginPtr(std::vector<T> &vec) {  // NOLINT(*)
+  if (vec.empty()) {
+    return NULL;
+  }
+  // non-empty: element 0 is the start of the contiguous storage
+  return &vec[0];
+}
+/*! \brief get the beginning address of a const vector, NULL when the vector is empty */
+template<typename T>
+inline const T *BeginPtr(const std::vector<T> &vec) {  // NOLINT(*)
+  if (vec.empty()) {
+    return NULL;
+  }
+  // non-empty: element 0 is the start of the contiguous storage
+  return &vec[0];
+}
+inline char* BeginPtr(std::string &str) {  // NOLINT(*)
+  // an empty string yields NULL rather than a pointer to its terminator
+  return str.empty() ? NULL : &str[0];
+}
+inline const char* BeginPtr(const std::string &str) {
+  // an empty string yields NULL rather than a pointer to its terminator
+  return str.empty() ? NULL : &str[0];
+}
+}  // namespace rabit
+#endif  // RABIT_INTERNAL_UTILS_H_
diff --git a/include/rabit/rabit.h b/include/rabit/rabit.h
new file mode 100644
index 0000000..396354e
--- /dev/null
+++ b/include/rabit/rabit.h
@@ -0,0 +1,460 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file rabit.h
+ * \brief This file defines rabit's Allreduce/Broadcast interface
+ *   The rabit engine contains the actual implementation
+ *   Code that only uses this header can also be compiled with MPI Allreduce (non fault-tolerant),
+ *
+ *   rabit.h and serializable.h is all what the user needs to use the rabit interface
+ * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
+ */
+#ifndef RABIT_RABIT_H_  // NOLINT(*)
+#define RABIT_RABIT_H_  // NOLINT(*)
+#include <string>
+#include <vector>
+
+// whether or not use c++11 support
+#ifndef DMLC_USE_CXX11
+#if defined(__GXX_EXPERIMENTAL_CXX0X__) || defined(_MSC_VER)
+#define DMLC_USE_CXX11 1
+#else
+#define DMLC_USE_CXX11 (__cplusplus >= 201103L)
+#endif  // defined(__GXX_EXPERIMENTAL_CXX0X__) || defined(_MSC_VER)
+#endif  // DMLC_USE_CXX11
+
+// keeps rabit api caller signature
+#ifndef RABIT_API_CALLER_SIGNATURE
+#define RABIT_API_CALLER_SIGNATURE
+
+#if (defined(__GNUC__) && !defined(__clang__))
+#define _FILE  __builtin_FILE()
+#define _LINE  __builtin_LINE()
+#define _CALLER  __builtin_FUNCTION()
+#else
+#define _FILE  "N/A"
+#define _LINE  -1
+#define _CALLER  "N/A"
+#endif  // (defined(__GNUC__) && !defined(__clang__))
+
+#endif  // RABIT_API_CALLER_SIGNATURE
+
+// optionally support of lambda functions in C++11, if available
+#if DMLC_USE_CXX11
+#include <functional>
+#endif  // C++11
+// engine definition of rabit, defines internal implementation
+// to use rabit interface, there is no need to read engine.h
+// rabit.h and serializable.h are enough to use the interface
+#include "./internal/engine.h"
+
+/*! \brief rabit namespace */
+namespace rabit {
+/*!
+ * \brief defines stream used in rabit
+ * see definition of Stream in dmlc/io.h
+ */
+typedef dmlc::Stream Stream;
+/*!
+ * \brief defines serializable objects used in rabit
+ * see definition of Serializable in dmlc/io.h
+ */
+typedef dmlc::Serializable Serializable;
+
+/*!
+ * \brief reduction operators namespace
+ */
+namespace op {
+/*!
+ * \class rabit::op::Max
+ * \brief maximum reduction operator
+ */
+struct Max;
+/*!
+ * \class rabit::op::Min
+ * \brief minimum reduction operator
+ */
+struct Min;
+/*!
+ * \class rabit::op::Sum
+ * \brief sum reduction operator
+ */
+struct Sum;
+/*!
+ * \class rabit::op::BitOR
+ * \brief bitwise OR reduction operator
+ */
+struct BitOR;
+}  // namespace op
+/*!
+ * \brief initializes rabit, call this once at the beginning of your program
+ * \param argc number of arguments in argv
+ * \param argv the array of input arguments
+ * \return true if initialized successfully, otherwise false
+ */
+inline bool Init(int argc, char *argv[]);
+/*!
+ * \brief finalizes the rabit engine, call this function after you finished with all the jobs
+ * \return true if finalized successfully, otherwise false
+ */
+inline bool Finalize();
+/*! \brief gets rank of the current process
+ * \return rank number of worker*/
+inline int GetRank();
+/*! \brief gets total number of processes
+ * \return total world size*/
+inline int GetWorldSize();
+/*! \brief whether rabit env is in distributed mode
+ * \return is distributed*/
+inline bool IsDistributed();
+
+/*! \brief gets processor's name
+ * \return processor name*/
+inline std::string GetProcessorName();
+/*!
+ * \brief prints the msg to the tracker,
+ *    this function can be used to communicate progress information to
+ *    the user who monitors the tracker
+ * \param msg the message to be printed
+ */
+inline void TrackerPrint(const std::string &msg);
+
+#ifndef RABIT_STRICT_CXX98_
+/*!
+ * \brief prints the msg to the tracker, this function may not be available
+ *    in very strict c++98 compilers, though it usually is.
+ *    this function can be used to communicate progress information to
+ *    the user who monitors the tracker
+ * \param fmt the format string
+ */
+inline void TrackerPrintf(const char *fmt, ...);
+#endif  // RABIT_STRICT_CXX98_
+/*!
+ * \brief broadcasts a memory region to every node from the root
+ *
+ *     Example: int a = 1; Broadcast(&a, sizeof(a), root);
+ * \param sendrecv_data the pointer to the send/receive buffer,
+ * \param size the data size
+ * \param root the process root
+ * \param _file caller file name used to generate unique cache key
+ * \param _line caller line number used to generate unique cache key
+ * \param _caller caller function name used to generate unique cache key
+ */
+inline void Broadcast(void *sendrecv_data, size_t size, int root,
+                      const char* _file = _FILE,
+                      const int _line = _LINE,
+                      const char* _caller = _CALLER);
+
+/*!
+ * \brief broadcasts an std::vector<DType> to every node from root
+ * \param sendrecv_data the pointer to send/receive vector,
+ *        for the receiver, the vector does not need to be pre-allocated
+ * \param root the process root
+ * \param _file caller file name used to generate unique cache key
+ * \param _line caller line number used to generate unique cache key
+ * \param _caller caller function name used to generate unique cache key
+ * \tparam DType the data type stored in the vector, has to be a simple data type
+ *               that can be directly transmitted by sending the sizeof(DType)
+ */
+template<typename DType>
+inline void Broadcast(std::vector<DType> *sendrecv_data, int root,
+                      const char* _file = _FILE,
+                      const int _line = _LINE,
+                      const char* _caller = _CALLER);
+/*!
+ * \brief broadcasts a std::string to every node from the root
+ * \param sendrecv_data the pointer to the send/receive string,
+ *        for the receiver, the string does not need to be pre-allocated
+ * \param root the process root
+ * \param _file caller file name used to generate unique cache key
+ * \param _line caller line number used to generate unique cache key
+ * \param _caller caller function name used to generate unique cache key
+ */
+inline void Broadcast(std::string *sendrecv_data, int root,
+                      const char* _file = _FILE,
+                      const int _line = _LINE,
+                      const char* _caller = _CALLER);
+/*!
+ * \brief performs in-place Allreduce on sendrecvbuf
+ *        this function is NOT thread-safe
+ *
+ * Example Usage: the following code does an Allreduce and outputs the sum as the result
+ * \code{.cpp}
+ * vector<int> data(10);
+ * ...
+ * Allreduce<op::Sum>(&data[0], data.size());
+ * ...
+ * \endcode
+ *
+ * \param sendrecvbuf buffer for both sending and receiving data
+ * \param count number of elements to be reduced
+ * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
+ *                    will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
+ *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
+ * \param prepare_arg argument used to pass into the lazy preprocessing function
+ * \param _file caller file name used to generate unique cache key
+ * \param _line caller line number used to generate unique cache key
+ * \param _caller caller function name used to generate unique cache key
+ * \tparam OP see namespace op, reduce operator
+ * \tparam DType data type
+ */
+template<typename OP, typename DType>
+inline void Allreduce(DType *sendrecvbuf, size_t count,
+                      void (*prepare_fun)(void *) = NULL,
+                      void *prepare_arg = NULL,
+                      const char* _file = _FILE,
+                      const int _line = _LINE,
+                      const char* _caller = _CALLER);
+
+/*!
+* \brief Allgather function, each node have a segment of data in the ring of sendrecvbuf,
+*  the data provided by current node k is [slice_begin, slice_end),
+*  the next node's segment must start with slice_end
+*  after the call of Allgather, sendrecvbuf_ contains all the contents including all segments
+*  use a ring based algorithm
+*
+* \param sendrecvbuf_ buffer for both sending and receiving data, it is a ring conceptually
+* \param total_size total size of data to be gathered
+* \param slice_begin beginning of the current slice
+* \param slice_end end of the current slice
+* \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
+* \param _file caller file name used to generate unique cache key
+* \param _line caller line number used to generate unique cache key
+* \param _caller caller function name used to generate unique cache key
+*/
+template<typename DType>
+inline void Allgather(DType *sendrecvbuf_,
+                  size_t total_size,
+                  size_t slice_begin,
+                  size_t slice_end,
+                  size_t size_prev_slice,
+                  const char* _file = _FILE,
+                  const int _line = _LINE,
+                  const char* _caller = _CALLER);
+
+// C++11 support for lambda prepare function
+#if DMLC_USE_CXX11
+/*!
+ * \brief performs in-place Allreduce, on sendrecvbuf
+ *        with a prepare function specified by a lambda function
+ *
+ * Example Usage:
+ * \code{.cpp}
+ * // the following code does an Allreduce and outputs the sum as the result
+ * vector<int> data(10);
+ * ...
+ * Allreduce<op::Sum>(&data[0], data.size(), [&]() {
+ *                     for (int i = 0; i < 10; ++i) {
+ *                       data[i] = i;
+ *                     }
+ *                    });
+ *     ...
+ * \endcode
+ * \param sendrecvbuf buffer for both sending and receiving data
+ * \param count number of elements to be reduced
+ * \param prepare_fun  Lazy lambda preprocessing function, prepare_fun() will be invoked
+ *                     by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
+ *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
+ * \param _file caller file name used to generate unique cache key
+ * \param _line caller line number used to generate unique cache key
+ * \param _caller caller function name used to generate unique cache key
+ * \tparam OP see namespace op, reduce operator
+ * \tparam DType data type
+ */
+template<typename OP, typename DType>
+inline void Allreduce(DType *sendrecvbuf, size_t count,
+                      std::function<void()> prepare_fun,
+                      const char* _file = _FILE,
+                      const int _line = _LINE,
+                      const char* _caller = _CALLER);
+#endif  // C++11
+/*!
+ * \brief loads the latest check point
+ * \param global_model pointer to the globally shared model/state
+ *   when calling this function, the caller needs to guarantee that the global_model
+ *   is the same in every node
+ * \param local_model pointer to the local model that is specific to the current node/rank
+ *   this can be NULL when no local model is needed
+ *
+ * \return the version number of the check point loaded
+ *     if returned version == 0, this means no model has been CheckPointed
+ *     the model passed in is not touched, users should do the necessary initialization by themselves
+ *
+ * \code{.cpp}
+ * // Example usage code of LoadCheckPoint
+ * int iter = rabit::LoadCheckPoint(&model);
+ * if (iter == 0) model.InitParameters();
+ * for (i = iter; i < max_iter; ++i) {
+ *   // do many things, include allreduce
+ *   rabit::CheckPoint(model);
+ * }
+ * \endcode
+ * \sa CheckPoint, VersionNumber
+ */
+inline int LoadCheckPoint(Serializable *global_model,
+                          Serializable *local_model = NULL);
+/*!
+ * \brief checkpoints the model, meaning a stage of execution has finished.
+ *  every time we call check point, a version number will be increased by one
+ *
+ * \param global_model pointer to the globally shared model/state
+ *   when calling this function, the caller needs to guarantee that the global_model
+ *   is the same in every node
+ * \param local_model pointer to the local model that is specific to the current node/rank
+ *   this can be NULL when no local state is needed
+ * NOTE: local_model requires explicit replication of the model for fault-tolerance, which will
+ *       bring replication cost in the CheckPoint function. global_model does not need explicit replication.
+ *       So, only CheckPoint with the global_model if possible
+ * \sa LoadCheckPoint, VersionNumber
+ */
+inline void CheckPoint(const Serializable *global_model,
+                       const Serializable *local_model = NULL);
+/*!
+ * \brief This function can be used to replace CheckPoint for global_model only,
+ *   when certain condition is met (see detailed explanation).
+ *
+ *   This is a "lazy" checkpoint such that only the pointer to the global_model is
+ *   remembered and no memory copy is taken. To use this function, the user MUST ensure that:
+ *   The global_model must remain unchanged until the last call of Allreduce/Broadcast in the current version finishes.
+ *   In other words, the global_model model can be changed only between the last call of
+ *   Allreduce/Broadcast and LazyCheckPoint, both in the same version
+ *
+ *   For example, suppose the calling sequence is:
+ *   LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint/(or can be CheckPoint)
+ *
+ *   Then the user MUST only change the global_model in code3.
+ *
+ *   The use of LazyCheckPoint instead of CheckPoint will improve the efficiency of the program.
+ * \param global_model pointer to the globally shared model/state
+ *   when calling this function, the caller needs to guarantee that the global_model
+ *   is the same in every node
+ * \sa LoadCheckPoint, CheckPoint, VersionNumber
+ */
+inline void LazyCheckPoint(const Serializable *global_model);
+/*!
+ * \return version number of the current stored model,
+ *         which means how many calls to CheckPoint we made so far
+ * \sa LoadCheckPoint, CheckPoint
+ */
+inline int VersionNumber();
+// ----- extensions that allow customized reducer ------
+// helper class to do customized reduce, user do not need to know the type
+namespace engine {
+class ReduceHandle;
+}  // namespace engine
+/*!
+ * \brief template class to make customized reduce and all reduce easy
+ *  Do not use reducer directly in the function you call Finalize,
+ *   because the destructor can execute after Finalize
+ * \tparam DType data type that to be reduced
+ * \tparam freduce the customized reduction function
+ *  DType must be a struct, with no pointer
+ */
+template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
+class Reducer {
+ public:
+  Reducer();
+  /*!
+   * \brief customized in-place all reduce operation
+   * \param sendrecvbuf the in place send-recv buffer
+   * \param count number of elements to be reduced
+   * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
+   *                     will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf.
+   *                     If the result of Allreduce can be recovered directly, then prepare_fun will NOT be called
+   * \param prepare_arg argument passed to the lazy preprocessing function
+   * \param _file caller file name used to generate unique cache key
+   * \param _line caller line number used to generate unique cache key
+   * \param _caller caller function name used to generate unique cache key
+   */
+  inline void Allreduce(DType *sendrecvbuf, size_t count,
+                        void (*prepare_fun)(void *) = NULL,
+                        void *prepare_arg = NULL,
+                        const char* _file = _FILE,
+                        const int _line = _LINE,
+                        const char* _caller = _CALLER);
+#if DMLC_USE_CXX11
+  /*!
+   * \brief customized in-place all reduce operation, with lambda function as preprocessor
+   * \param sendrecvbuf pointer to the array of objects to be reduced
+   * \param count number of elements to be reduced
+   * \param prepare_fun callable (e.g. a lambda) executed to prepare the data, if necessary
+   * \param _file caller file name used to generate unique cache key
+   * \param _line caller line number used to generate unique cache key
+   * \param _caller caller function name used to generate unique cache key
+   */
+  inline void Allreduce(DType *sendrecvbuf, size_t count,
+                        std::function<void()> prepare_fun,
+                        const char* _file = _FILE,
+                        const int _line = _LINE,
+                        const char* _caller = _CALLER);
+#endif  // DMLC_USE_CXX11
+
+ private:
+  /*! \brief function handle to do reduce */
+  engine::ReduceHandle handle_;
+};
+/*!
+ * \brief template class to make customized reduce,
+ *  this class defines complex reducer handles all the data structure that can be
+ *  serialized/deserialized into fixed size buffer
+ *  Do not use reducer directly in the function you call Finalize, because the destructor can execute after Finalize
+ *
+ * \tparam DType data type to be reduced, DType must contain the following functions:
+ *   (1) Save(IStream &fs)  (2) Load(IStream &fs) (3) Reduce(const DType &src, size_t max_nbyte)
+ *   Unlike Reducer, this template takes no freduce parameter.
+ */
+template<typename DType>
+class SerializeReducer {
+ public:
+  SerializeReducer();
+  /*!
+   * \brief customized in-place all reduce operation
+   * \param sendrecvobj pointer to the array of objects to be reduced
+   * \param max_nbyte maximum amount of memory needed to serialize each object
+   *        this includes budget limit for intermediate and final result
+   * \param count number of elements to be reduced
+   * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
+   *                     will be called by the function before performing Allreduce, to initialize the data in sendrecvobj.
+   *                     If the result of Allreduce can be recovered directly, then prepare_fun will NOT be called
+   * \param prepare_arg argument passed to the lazy preprocessing function
+   * \param _file caller file name used to generate unique cache key
+   * \param _line caller line number used to generate unique cache key
+   * \param _caller caller function name used to generate unique cache key
+   */
+  inline void Allreduce(DType *sendrecvobj,
+                        size_t max_nbyte, size_t count,
+                        void (*prepare_fun)(void *) = NULL,
+                        void *prepare_arg = NULL,
+                        const char* _file = _FILE,
+                        const int _line = _LINE,
+                        const char* _caller = _CALLER);
+// C++11 support for lambda prepare function
+#if DMLC_USE_CXX11
+  /*!
+   * \brief customized in-place all reduce operation, with lambda function as preprocessor
+   * \param sendrecvobj pointer to the array of objects to be reduced
+   * \param max_nbyte maximum amount of memory needed to serialize each object
+   *        this includes budget limit for intermediate and final result
+   * \param count number of elements to be reduced
+   * \param prepare_fun callable (e.g. a lambda) executed to prepare the data, if necessary
+   * \param _file caller file name used to generate unique cache key
+   * \param _line caller line number used to generate unique cache key
+   * \param _caller caller function name used to generate unique cache key
+   */
+  inline void Allreduce(DType *sendrecvobj,
+                        size_t max_nbyte, size_t count,
+                        std::function<void()> prepare_fun,
+                        const char* _file = _FILE,
+                        const int _line = _LINE,
+                        const char* _caller = _CALLER);
+#endif  // DMLC_USE_CXX11
+
+ private:
+  /*! \brief function handle to do reduce */
+  engine::ReduceHandle handle_;
+  /*! \brief temporary buffer used to do reduce */
+  std::string buffer_;
+};
+}  // namespace rabit
+// implementation of template functions
+#include "./internal/rabit-inl.h"
+#endif  // RABIT_RABIT_H_ // NOLINT(*)
diff --git a/include/rabit/serializable.h b/include/rabit/serializable.h
new file mode 100644
index 0000000..581262f
--- /dev/null
+++ b/include/rabit/serializable.h
@@ -0,0 +1,26 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file serializable.h
+ * \brief defines serializable interface of rabit
+ * \author Tianqi Chen
+ */
+#ifndef RABIT_SERIALIZABLE_H_
+#define RABIT_SERIALIZABLE_H_
+#include <vector>
+#include <string>
+#include "rabit/internal/utils.h"
+
+namespace rabit {
+/*!
+ * \brief defines stream used in rabit
+ * see definition of Stream in dmlc/io.h
+ */
+typedef dmlc::Stream Stream;
+/*!
+ * \brief defines serializable objects used in rabit
+ * see definition of Serializable in dmlc/io.h
+ */
+typedef dmlc::Serializable Serializable;
+
+}  // namespace rabit
+#endif  // RABIT_SERIALIZABLE_H_
diff --git a/lib/README.md b/lib/README.md
new file mode 100644
index 0000000..b6a5aa8
--- /dev/null
+++ b/lib/README.md
@@ -0,0 +1,15 @@
+Rabit Library
+=====
+This folder holds the library files generated by the compiler. To generate them, type ```make``` in the project root folder. If you want the MPI-compatible library, type ```make mpi```.
+
+***List of Files***
+* rabit.a The rabit package library
+  - Normally you need to link with this one
+* rabit_mock.a The rabit package library with mock test
+  - This library allows additional mock-test
+* rabit_mpi.a The MPI backed library
+  - Linking against this library makes the program use MPI Allreduce
+  - This library is not fault-tolerant
+* rabit_empty.a Dummy package implementation
+  - This is an empty library that does not provide anything
+  - Only introduced to minimize code dependency for projects that only need single machine code
diff --git a/python/rabit.py b/python/rabit.py
new file mode 100644
index 0000000..a56cfcc
--- /dev/null
+++ b/python/rabit.py
@@ -0,0 +1,364 @@
+"""
+Reliable Allreduce and Broadcast Library.
+
+Author: Tianqi Chen
+"""
+# pylint: disable=unused-argument,invalid-name,global-statement,dangerous-default-value,
+import pickle
+import ctypes
+import os
+import platform
+import sys
+import warnings
+import numpy as np
+
+# version information about the doc
+__version__ = '1.0'
+
+_LIB = None
+
+def _find_lib_path(dll_name):
+    """Locate candidate copies of the rabit shared library.
+
+    Parameters
+    ----------
+    dll_name: str
+        File name of the library to look for.
+
+    Returns
+    -------
+    lib_path: list(string)
+        Existing files among this directory, ``../lib/`` and ``./lib/``.
+    """
+    here = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    # the same three folders are probed on every platform
+    search_dirs = (here, os.path.join(here, '../lib/'), os.path.join(here, './lib/'))
+    candidates = [os.path.join(folder, dll_name) for folder in search_dirs]
+    found = [path for path in candidates if os.path.isfile(path)]
+    # a missing library is tolerated only when XGBOOST_BUILD_DOC is set
+    if not found and not os.environ.get('XGBOOST_BUILD_DOC', False):
+        raise RuntimeError(
+            'Cannot find Rabit Libarary in the candicate path, ' +
+            'did you install compilers and run build.sh in root path?\n'
+            'List of candidates:\n' + ('\n'.join(candidates)))
+    return found
+
+# load the rabit shared library
+def _loadlib(lib='standard', lib_dll=None):
+    """Load the rabit shared library into the module-level ``_LIB`` handle.
+
+    A repeated call while ``_LIB`` is set only emits a warning. When ``lib_dll``
+    is given it is used as-is; otherwise ``librabit[_<lib>]`` is searched for.
+    """
+    global _LIB
+    if _LIB is not None:
+        # warnings.warn() takes ``stacklevel``; the former ``level`` raised TypeError
+        warnings.warn('rabit.init call was ignored because it has'\
+                          ' already been initialized', stacklevel=2)
+        return
+    if lib_dll is not None:
+        _LIB = lib_dll
+        return
+
+    dll_name = 'librabit' if lib == 'standard' else 'librabit_' + lib
+    if os.name == 'nt':
+        dll_name += '.dll'
+    elif platform.system() == 'Darwin':
+        dll_name += '.dylib'
+    else:
+        dll_name += '.so'
+
+    _LIB = ctypes.cdll.LoadLibrary(_find_lib_path(dll_name)[0])
+    _LIB.RabitGetRank.restype = ctypes.c_int
+    _LIB.RabitGetWorldSize.restype = ctypes.c_int
+    _LIB.RabitVersionNumber.restype = ctypes.c_int
+
+def _unloadlib():
+    """Drop the module-level handle of the rabit library."""
+    global _LIB
+    # rebinding releases the previous handle exactly like ``del`` + assignment
+    _LIB = None
+
+# reduction operators
+MAX = 0
+MIN = 1
+SUM = 2
+BITOR = 3
+
+def init(args=None, lib='standard', lib_dll=None):
+    """Initialize the rabit module, call this once before using anything.
+
+    Parameters
+    ----------
+    args: list of str or bytes, optional
+        The list of arguments used to initialize rabit,
+        usually you need to pass in sys.argv.
+        Defaults to an empty list when it is None.
+    lib: {'standard', 'mock', 'mpi'}, optional
+        Type of library we want to load
+        when lib_dll is not specified.
+    lib_dll: ctypes.DLL, optional
+        The DLL object used as lib.
+        When this is presented argument lib will be ignored.
+    """
+    if args is None:
+        args = []
+    _loadlib(lib, lib_dll)
+    # c_char_p only accepts bytes, so text arguments are encoded first
+    args = [a.encode('utf-8') if isinstance(a, type(u'')) else a for a in args]
+    arr = (ctypes.c_char_p * len(args))(*args)
+    _LIB.RabitInit(len(args), arr)
+
+def finalize():
+    """Finalize the rabit engine and drop the loaded library handle.
+
+    Call this function after you finished all jobs.
+    """
+    _LIB.RabitFinalize()
+    _unloadlib()
+
+def get_rank():
+    """Query the rank of the calling process.
+
+    Returns
+    -------
+    rank : int
+        Rank of current process.
+    """
+    # forward the integer reported by the C API
+    return _LIB.RabitGetRank()
+
+def get_world_size():
+    """Query the total number of workers.
+
+    Returns
+    -------
+    n : int
+        Total number of process.
+    """
+    # forward the integer reported by the C API
+    return _LIB.RabitGetWorldSize()
+
+def tracker_print(msg):
+    """Print message to the tracker.
+
+    This function can be used to communicate progress information to the tracker.
+
+    Parameters
+    ----------
+    msg : str
+        The message to be printed to tracker; other objects go through str().
+    """
+    if not isinstance(msg, str):
+        msg = str(msg)
+    # encode first: c_char_p takes bytes and has no ``encode`` method itself
+    _LIB.RabitTrackerPrint(ctypes.c_char_p(msg.encode('utf-8')))
+
+def get_processor_name():
+    """Get the processor name.
+
+    Returns
+    -------
+    name : str
+        the name of processor(host); NOTE(review): ``buf.value`` is bytes on Python 3
+    """
+    mxlen = 256  # capacity in bytes of the buffer handed to the C call
+    length = ctypes.c_ulong()  # passed by reference; not read again afterwards
+    buf = ctypes.create_string_buffer(mxlen)
+    _LIB.RabitGetProcessorName(buf, ctypes.byref(length), mxlen)
+    return buf.value
+
+def broadcast(data, root):
+    """Broadcast object from one node to all other nodes.
+
+    Parameters
+    ----------
+    data : any type that can be pickled
+        Input data, if current rank does not equal root, this can be None
+    root : int
+        Rank of the node to broadcast data from.
+
+    Returns
+    -------
+    object : any type that can be pickled
+        data itself on the root rank, the unpickled copy on every other rank.
+    """
+    rank = get_rank()
+    length = ctypes.c_ulong()
+    if root == rank:
+        assert data is not None, 'need to pass in data when broadcasting'
+        s = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
+        length.value = len(s)
+    # first broadcast: share the size of the pickled payload
+    _LIB.RabitBroadcast(ctypes.byref(length),
+                        ctypes.sizeof(ctypes.c_ulong), root)
+    if root != rank:
+        dptr = (ctypes.c_char * length.value)()
+        # second broadcast: receive the pickled payload into dptr
+        _LIB.RabitBroadcast(ctypes.cast(dptr, ctypes.c_void_p),
+                            length.value, root)
+        data = pickle.loads(dptr.raw)  # NOTE: unpickles bytes received via broadcast
+        del dptr
+    else:
+        _LIB.RabitBroadcast(ctypes.cast(ctypes.c_char_p(s), ctypes.c_void_p),
+                            length.value, root)
+        del s
+    return data
+
+# numpy dtype -> integer type code passed to RabitAllreduce by allreduce()
+DTYPE_ENUM__ = {
+    np.dtype('int8') : 0,
+    np.dtype('uint8') : 1,
+    np.dtype('int32') : 2,
+    np.dtype('uint32') : 3,
+    np.dtype('int64') : 4,
+    np.dtype('uint64') : 5,
+    np.dtype('float32') : 6,
+    np.dtype('float64') : 7
+}
+
+def allreduce(data, op, prepare_fun=None):
+    """Perform allreduce, return the result.
+
+    Parameters
+    ----------
+    data: numpy array
+        Input data.
+    op: int
+        Reduction operators, can be MIN, MAX, SUM, BITOR
+    prepare_fun: function
+        Lazy preprocessing function, if it is not None, prepare_fun(data)
+        will be called by the function before performing allreduce, to initialize the data
+        If the result of Allreduce can be recovered directly,
+        then prepare_fun will NOT be called
+
+    Returns
+    -------
+    result : array_like
+        The result of allreduce as a flattened (1-d) array with data.size elements
+
+    Notes
+    -----
+    This function is not thread-safe.
+    """
+    if not isinstance(data, np.ndarray):
+        raise Exception('allreduce only takes in numpy.ndarray')
+    buf = data.ravel()
+    if buf.base is data.base:  # NOTE(review): no copy when data owns its memory — confirm
+        buf = buf.copy()  # NOTE(review): copy is taken before prepare_fun(data) runs
+    if buf.dtype not in DTYPE_ENUM__:
+        raise Exception('data type %s not supported' % str(buf.dtype))
+    if prepare_fun is None:
+        _LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
+                            buf.size, DTYPE_ENUM__[buf.dtype],
+                            op, None, None)
+    else:
+        func_ptr = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
+        def pfunc(args):
+            """prepare function."""
+            prepare_fun(data)  # the C-side argument is ignored; data is passed instead
+        _LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
+                            buf.size, DTYPE_ENUM__[buf.dtype],
+                            op, func_ptr(pfunc), None)
+    return buf
+
+
+def _load_model(ptr, length):
+    """
+    Internal helper: unpickle a model stored in a raw C buffer.
+    Arguments:
+        ptr: ctypes.POINTER(ctypes.c_char)
+            pointer to the first byte of the buffer
+        length: int
+            number of bytes in the buffer
+    Returns the object produced by pickle.loads on those bytes.
+    """
+    view = (ctypes.c_char * length).from_address(ctypes.addressof(ptr.contents))
+    return pickle.loads(view.raw)
+
+def load_checkpoint(with_local=False):
+    """Load latest check point.
+
+    Parameters
+    ----------
+    with_local: bool, optional
+        whether the checkpoint contains local model
+
+    Returns
+    -------
+    tuple : tuple
+        if with_local: return (version, global_model, local_model)
+        else return (version, global_model)
+        if returned version == 0, this means no model has been CheckPointed
+        and global_model, local_model returned will be None
+    """
+    # output slots filled in by RabitLoadCheckPoint
+    global_ptr = ctypes.POINTER(ctypes.c_char)()
+    global_len = ctypes.c_ulong()
+    if not with_local:
+        # only the global model is requested: the local slots are passed as None
+        version = _LIB.RabitLoadCheckPoint(
+            ctypes.byref(global_ptr), ctypes.byref(global_len), None, None)
+        if version == 0:
+            return (version, None)
+        return (version, _load_model(global_ptr, global_len.value))
+
+    # global and local model are requested together
+    local_ptr = ctypes.POINTER(ctypes.c_char)()
+    local_len = ctypes.c_ulong()
+    version = _LIB.RabitLoadCheckPoint(
+        ctypes.byref(global_ptr), ctypes.byref(global_len),
+        ctypes.byref(local_ptr), ctypes.byref(local_len))
+    if version == 0:
+        # nothing has been checkpointed yet
+        return (version, None, None)
+    # unpickle in the same order as the returned tuple: global first, then local
+    global_model = _load_model(global_ptr, global_len.value)
+    local_model = _load_model(local_ptr, local_len.value)
+    return (version, global_model, local_model)
+
+def checkpoint(global_model, local_model=None):
+    """Checkpoint the model.
+
+    This means we finished a stage of execution.
+    Every time we call check point, there is a version number which will increase by one.
+
+    Parameters
+    ----------
+    global_model: anytype that can be pickled
+        globally shared model/state when calling this function,
+        the caller needs to guarantee that global_model is the same in all nodes
+
+    local_model: anytype that can be pickled
+       Local model, that is specific to current node/rank.
+       This can be None when no local state is needed.
+
+    Notes
+    -----
+    local_model requires explicit replication of the model for fault-tolerance.
+    This will bring replication cost in checkpoint function.
+    while global_model does not need explicit replication.
+    It is recommended to use global_model if possible.
+    """
+    # both models are pickled with the default protocol before being handed to C
+    packed_global = pickle.dumps(global_model)
+    if local_model is not None:
+        packed_local = pickle.dumps(local_model)
+        _LIB.RabitCheckPoint(packed_global, len(packed_global),
+                             packed_local, len(packed_local))
+    else:
+        # no local state: a None pointer with length 0 is passed instead
+        _LIB.RabitCheckPoint(packed_global, len(packed_global), None, 0)
+
+def version_number():
+    """Return the version number of the currently stored model.
+
+    This equals how many calls to CheckPoint we made so far.
+
+    Returns
+    -------
+    version : int
+        Version number of currently stored model
+    """
+    # forward the integer reported by the C API
+    return _LIB.RabitVersionNumber()
diff --git a/scripts/mpi_build.sh b/scripts/mpi_build.sh
new file mode 100755
index 0000000..b1e70be
--- /dev/null
+++ b/scripts/mpi_build.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+# Download and build MPICH 3.2 into ./mpich unless libmpich.so is already there.
+if [ -f mpich/lib/libmpich.so ]; then
+  echo "libmpich.so found -- nothing to build."
+else
+  echo "Downloading mpich source."
+  wget http://www.mpich.org/static/downloads/3.2/mpich-3.2.tar.gz || exit 1
+  tar xfz mpich-3.2.tar.gz || exit 1
+  rm mpich-3.2.tar.gz*
+  echo "configuring and building mpich."
+  cd mpich-3.2 || exit 1
+  # every step below aborts on failure so a broken build is not reported as success
+  ./configure \
+          --prefix="$(pwd)/../mpich" \
+          --enable-static=false \
+          --enable-alloca=true \
+          --disable-long-double \
+          --enable-threads=single \
+          --enable-fortran=no \
+          --enable-fast=all \
+          --enable-g=none \
+          --enable-timing=none \
+          --enable-cxx || exit 1
+  make -j4 || exit 1
+  make install || exit 1
+  cd -
+fi
\ No newline at end of file
diff --git a/scripts/travis_runtest.sh b/scripts/travis_runtest.sh
new file mode 100755
index 0000000..1ec04bb
--- /dev/null
+++ b/scripts/travis_runtest.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Run the recovery test targets of test.mk one by one;
+# the first failing target aborts the script with `exit -1`.
+for target in model_recover_10_10k model_recover_10_10k_die_same \
+              model_recover_10_10k_die_hard local_recover_10_10k \
+              lazy_recover_10_10k_die_hard lazy_recover_10_10k_die_same \
+              ringallreduce_10_10k pylocal_recover_10_10k
+do
+  make -f test.mk RABIT_BUILD_DMLC=1 "${target}" || exit -1
+done
diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh
new file mode 100755
index 0000000..1912822
--- /dev/null
+++ b/scripts/travis_script.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# main script of travis: runs the job selected by the TASK environment variable
+if [ "${TASK}" == "lint" ]; then
+    make lint RABIT_BUILD_DMLC=1 || exit -1
+fi
+
+if [ "${TASK}" == "doc" ]; then
+    make doc 2>log.txt
+    (cat log.txt| grep -v ENABLE_PREPROCESSING |grep -v "unsupported tag" |grep warning) && exit -1
+fi
+
+# we should deprecate Makefile based build
+if [ "${TASK}" == "build" ]; then
+    make all RABIT_BUILD_DMLC=1 || exit -1
+fi
+
+if [ "${TASK}" == "mpi-build" ]; then
+    ./scripts/mpi_build.sh
+    cd test || exit -1
+    make mpi RABIT_BUILD_DMLC=1 && make speed_test.mpi RABIT_BUILD_DMLC=1 || exit -1
+fi
+# cmake build: every step is checked so that build or test failures fail the job
+if [ "${TASK}" == "cmake-test" ]; then
+    mkdir build
+    cd build || exit -1
+    cmake -DRABIT_BUILD_TESTS=ON -DRABIT_BUILD_DMLC=ON -DGTEST_ROOT="${HOME}/.local" .. || exit -1
+    # known osx gtest 1.8 issue
+    cp "${HOME}"/.local/lib/*.dylib .
+    make -j"$(nproc 2>/dev/null || sysctl -n hw.ncpu)" || exit -1
+    make test || exit -1
+    make install || exit -1
+    cd ../test || exit -1
+    ../scripts/travis_runtest.sh || exit -1
+    rm -rf ../build
+fi
diff --git a/scripts/travis_setup.sh b/scripts/travis_setup.sh
new file mode 100755
index 0000000..d0b82be
--- /dev/null
+++ b/scripts/travis_setup.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Install the lint tools and googletest (under ${HOME}/.local) needed by the CI jobs.
+echo "Testing on: ${TRAVIS_OS_NAME}, Home directory: ${HOME}"
+
+pip3 install cpplint pylint urllib3 numpy
+pip3 install websocket-client kubernetes
+
+
+# Install googletest under home directory
+GTEST_VERSION=1.8.1
+GTEST_RELEASE=release-${GTEST_VERSION}.tar.gz
+GTEST_TAR_BALL=googletest_${GTEST_RELEASE}
+
+wget "https://github.com/google/googletest/archive/${GTEST_RELEASE}" -O "${GTEST_TAR_BALL}"
+echo "152b849610d91a9dfa1401293f43230c2e0c33f8 ${GTEST_TAR_BALL}" | sha1sum -c
+tar -xf "${GTEST_TAR_BALL}"
+pushd .
+
+cd "googletest-release-${GTEST_VERSION}"
+mkdir build
+cd build
+echo "Installing to ${HOME}/.local"
+cmake .. -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX="${HOME}/.local"
+make -j"$(nproc 2>/dev/null || sysctl -n hw.ncpu)"
+make install
+
+popd
+
+if [ "${TRAVIS_OS_NAME}" == "linux" ]; then
+    sudo apt-get install python3-pip tree
+fi
+
+if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
+    brew install python3
+fi
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..a7aa5d0
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,31 @@
+# Root of an external dmlc-core checkout. option() is BOOL-only and defaulted this
+# to OFF (yielding "OFF/include"), so it is declared as a PATH cache entry instead.
+if(NOT DEFINED DMLC_ROOT)
+  set(DMLC_ROOT "" CACHE PATH "Specify root of external dmlc core.")
+endif()
+
+add_library(allreduce_base "")
+add_library(allreduce_mock "")
+
+target_sources(
+  allreduce_base
+  PRIVATE allreduce_base.cc
+  PUBLIC ${CMAKE_CURRENT_LIST_DIR}/allreduce_base.h)
+target_sources(
+  allreduce_mock
+  PRIVATE allreduce_robust.cc
+  PUBLIC ${CMAKE_CURRENT_LIST_DIR}/allreduce_mock.h)
+
+# The dmlc include directory is exported only when DMLC_ROOT holds a usable path.
+# NOTE(review): ../../include resolves two levels above src/ -- confirm the layout.
+target_include_directories(
+  allreduce_base
+  PUBLIC
+  "$<$<BOOL:${DMLC_ROOT}>:${DMLC_ROOT}/include>"
+  ${CMAKE_CURRENT_LIST_DIR}/../../include)
+
+target_include_directories(
+  allreduce_mock
+  PUBLIC
+  "$<$<BOOL:${DMLC_ROOT}>:${DMLC_ROOT}/include>"
+  ${CMAKE_CURRENT_LIST_DIR}/../../include)
diff --git a/src/README.md b/src/README.md
new file mode 100644
index 0000000..5e55d92
--- /dev/null
+++ b/src/README.md
@@ -0,0 +1,6 @@
+Source Files of Rabit
+====
+* This folder contains the source files of rabit library
+* The library headers are in folder [include](../include)
+* The .h files in this folder are internal header files that are only used by rabit and will not be seen by users
+
diff --git a/src/allreduce_base.cc b/src/allreduce_base.cc
new file mode 100644
index 0000000..20f9318
--- /dev/null
+++ b/src/allreduce_base.cc
@@ -0,0 +1,965 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file allreduce_base.cc
+ * \brief Basic implementation of AllReduce
+ *
+ * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
+ */
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
+#include <netinet/tcp.h>
+#include <cstring>
+#include <map>
+#include "allreduce_base.h"
+
+namespace rabit {
+
+namespace utils {
+  bool STOP_PROCESS_ON_ERROR = true;
+}
+
+namespace engine {
+// constructor: fills in the default configuration of the engine
+AllreduceBase::AllreduceBase(void) {
+  tracker_uri = "NULL";  // "NULL" means no tracker, i.e. single node mode
+  tracker_port = 9000;
+  host_uri = "";
+  slave_port = 9010;
+  nport_trial = 1000;
+  rank = 0;
+  world_size = -1;
+  connect_retry = 5;
+  hadoop_mode = 0;
+  version_number = 0;
+  // 32 K items
+  reduce_ring_mincount = 32 << 10;
+  // task id, "NULL" until one is supplied through SetParam
+  task_id = "NULL";
+  err_link = NULL;
+  dmlc_role = "worker";
+  this->SetParam("rabit_reduce_buffer", "256MB");
+  // names of the environment variables that Init forwards to SetParam,
+  // these are the variables directly supported by dmlc
+  env_vars.push_back("DMLC_TASK_ID");
+  env_vars.push_back("DMLC_ROLE");
+  env_vars.push_back("DMLC_NUM_ATTEMPT");
+  env_vars.push_back("DMLC_TRACKER_URI");
+  env_vars.push_back("DMLC_TRACKER_PORT");
+  env_vars.push_back("DMLC_WORKER_CONNECT_RETRY");
+  env_vars.push_back("DMLC_WORKER_STOP_PROCESS_ON_ERROR");
+}
+
+// initialization function: reads env variables, name=value arguments and hadoop settings
+bool AllreduceBase::Init(int argc, char* argv[]) {
+  // setup from enviroment variables
+  // handler to get variables from env
+  for (size_t i = 0; i < env_vars.size(); ++i) {
+    const char *value = getenv(env_vars[i].c_str());
+    if (value != NULL) {
+      this->SetParam(env_vars[i].c_str(), value);
+    }
+  }
+  // pass in arguments override env variable.
+  for (int i = 0; i < argc; ++i) {
+    char name[256], val[256];  // field widths below leave room for the NUL
+    if (sscanf(argv[i], "%255[^=]=%255s", name, val) == 2) {
+      this->SetParam(name, val);
+    }
+  }
+
+  {
+    // handling for hadoop
+    const char *task_id = getenv("mapred_tip_id");
+    if (task_id == NULL) {
+      task_id = getenv("mapreduce_task_id");
+    }
+    if (hadoop_mode) {
+      utils::Check(task_id != NULL,
+                   "hadoop_mode is set but cannot find mapred_task_id");
+    }
+    if (task_id != NULL) {
+      this->SetParam("rabit_task_id", task_id);
+      this->SetParam("rabit_hadoop_mode", "1");
+    }
+    const char *attempt_id = getenv("mapred_task_id");
+    if (attempt_id != NULL) {
+      const char *att = strrchr(attempt_id, '_');
+      int num_trial;  // only used to check that an integer follows the last '_'
+      if (att != NULL && sscanf(att + 1, "%d", &num_trial) == 1) {
+        this->SetParam("rabit_num_trial", att + 1);
+      }
+    }
+    // handling for hadoop
+    const char *num_task = getenv("mapred_map_tasks");
+    if (num_task == NULL) {
+      num_task = getenv("mapreduce_job_maps");
+    }
+    if (hadoop_mode) {
+      utils::Check(num_task != NULL,
+                   "hadoop_mode is set but cannot find mapred_map_tasks");
+    }
+    if (num_task != NULL) {
+      this->SetParam("rabit_world_size", num_task);
+    }
+  }
+  if (dmlc_role != "worker") {
+    fprintf(stderr, "Rabit Module currently only work with dmlc worker"\
+            ", quit this program by exit 0\n");
+    exit(0);
+  }
+
+  // clear the setting before start reconnection
+  this->rank = -1;
+  //---------------------
+  // start socket
+  utils::Socket::Startup();
+  utils::Assert(all_links.size() == 0, "can only call Init once");
+  this->host_uri = utils::SockAddr::GetHostName();
+  // get information from tracker
+  return this->ReConnectLinks();
+}
+
+bool AllreduceBase::Shutdown(void) {
+  try {
+    for (size_t i = 0; i < all_links.size(); ++i) {
+      all_links[i].sock.Close();
+    }
+    all_links.clear();
+    tree_links.plinks.clear();
+
+    if (tracker_uri == "NULL") return true;  // no tracker: returns before Finalize() below
+    // notify the tracker that this rank has shut down
+    utils::TCPSocket tracker = this->ConnectTracker();
+    tracker.SendStr(std::string("shutdown"));
+    tracker.Close();
+    utils::TCPSocket::Finalize();
+    return true;
+  } catch (const std::exception& e) {
+    fprintf(stderr, "failed to shutdown due to %s\n", e.what());
+    return false;  // failure is reported to the caller instead of being rethrown
+  }
+}
+
+void AllreduceBase::TrackerPrint(const std::string &msg) {
+  if (tracker_uri == "NULL") {  // no tracker configured: print locally instead
+    utils::Printf("%s", msg.c_str()); return;
+  }
+  utils::TCPSocket tracker = this->ConnectTracker();
+  tracker.SendStr(std::string("print"));  // command string, followed by the message
+  tracker.SendStr(msg);
+  tracker.Close();
+}
+
+// util to parse data with unit suffix, e.g. "256MB"; only the first unit character is read
+inline size_t ParseUnit(const char *name, const char *val) {
+  char unit = '\0';  // initialized: sscanf leaves outputs untouched on a failed match
+  unsigned long amt = 0;  // NOLINT(*)
+  int n = sscanf(val, "%lu%c", &amt, &unit);
+  size_t amount = amt;
+  if (n == 2) {
+    switch (unit) {
+      case 'B': return amount;
+      case 'K': return amount << 10UL;
+      case 'M': return amount << 20UL;
+      case 'G': return amount << 30UL;
+      default: utils::Error("invalid format for %s", name); return 0;
+    }
+  } else if (n == 1) {
+    return amount;
+  } else {
+    utils::Error("invalid format for %s,"
+                 "should be {integer}{unit}, unit can be {B, KB, MB, GB}", name);
+    return 0;
+  }
+}
+/*!
+ * \brief set one engine parameter; names that match no entry below are ignored
+ * \param name parameter name (rabit_* option or DMLC_* environment variable)
+ * \param val parameter value, parsed according to the matched name
+ */
+void AllreduceBase::SetParam(const char *name, const char *val) {
+  if (!strcmp(name, "rabit_tracker_uri")) tracker_uri = val;
+  if (!strcmp(name, "rabit_tracker_port")) tracker_port = atoi(val);
+  if (!strcmp(name, "rabit_task_id")) task_id = val;
+  if (!strcmp(name, "DMLC_TRACKER_URI")) tracker_uri = val;
+  if (!strcmp(name, "DMLC_TRACKER_PORT")) tracker_port = atoi(val);
+  if (!strcmp(name, "DMLC_TASK_ID")) task_id = val;
+  if (!strcmp(name, "DMLC_ROLE")) dmlc_role = val;
+  if (!strcmp(name, "rabit_world_size")) world_size = atoi(val);
+  if (!strcmp(name, "rabit_hadoop_mode")) hadoop_mode = utils::StringToBool(val);
+  if (!strcmp(name, "rabit_reduce_ring_mincount")) {
+    reduce_ring_mincount = atoi(val);
+    utils::Assert(reduce_ring_mincount > 0, "rabit_reduce_ring_mincount should be greater than 0");
+  }
+  if (!strcmp(name, "rabit_reduce_buffer")) {
+    reduce_buffer_size = (ParseUnit(name, val) + 7) >> 3;  // bytes / 8, rounded up
+  }
+  if (!strcmp(name, "DMLC_WORKER_CONNECT_RETRY")) {
+    connect_retry = atoi(val);
+  }
+  if (!strcmp(name, "DMLC_WORKER_STOP_PROCESS_ON_ERROR")) {
+    if (!strcmp(val, "true")) {
+      rabit::utils::STOP_PROCESS_ON_ERROR = true;
+    } else if (!strcmp(val, "false")) {
+      rabit::utils::STOP_PROCESS_ON_ERROR = false;
+    } else {
+      throw std::runtime_error("invalid value of DMLC_WORKER_STOP_PROCESS_ON_ERROR");
+    }
+  }
+  if (!strcmp(name, "rabit_bootstrap_cache")) {
+    rabit_bootstrap_cache = utils::StringToBool(val);
+  }
+  if (!strcmp(name, "rabit_debug")) {
+    rabit_debug = utils::StringToBool(val);
+  }
+  if (!strcmp(name, "rabit_timeout")) {
+    rabit_timeout = utils::StringToBool(val);
+  }
+  if (!strcmp(name, "rabit_timeout_sec")) {
+    timeout_sec = atoi(val);
+    utils::Assert(timeout_sec >= 0, "rabit_timeout_sec should be non negative second");
+  }
+  if (!strcmp(name, "rabit_enable_tcp_no_delay")) {
+    if (!strcmp(val, "true"))  // only the exact string "true" enables the option
+      rabit_enable_tcp_no_delay = true;
+    else
+      rabit_enable_tcp_no_delay = false;
+  }
+}
+/*!
+ * \brief initialize connection to the tracker: connect with retries, then send the handshake
+ * \return a socket that initializes the connection
+ */
+utils::TCPSocket AllreduceBase::ConnectTracker(void) const {
+  int magic = kMagic;
+  // get information from tracker
+  utils::TCPSocket tracker;
+  tracker.Create();
+
+  int retry = 0;
+  do {
+    if (!tracker.Connect(utils::SockAddr(tracker_uri.c_str(), tracker_port))) {
+      if (++retry >= connect_retry) {
+        fprintf(stderr, "connect to (failed): [%s]\n", tracker_uri.c_str());
+        utils::Socket::Error("Connect");
+      } else {
+        fprintf(stderr, "retry connect to ip(retry time %d): [%s]\n", retry, tracker_uri.c_str());
+#if defined(_MSC_VER) || defined (__MINGW32__)
+        Sleep(retry << 1);  // NOTE(review): Win32 Sleep takes milliseconds, sleep() seconds
+#else
+        sleep(retry << 1);
+#endif
+        continue;
+      }
+    }
+    break;
+  } while (1);
+
+  using utils::Assert;
+  Assert(tracker.SendAll(&magic, sizeof(magic)) == sizeof(magic),
+         "ReConnectLink failure 1");
+  Assert(tracker.RecvAll(&magic, sizeof(magic)) == sizeof(magic),
+         "ReConnectLink failure 2");
+  utils::Check(magic == kMagic, "sync::Invalid tracker message, init failure");
+  Assert(tracker.SendAll(&rank, sizeof(rank)) == sizeof(rank),
+                "ReConnectLink failure 3");
+  Assert(tracker.SendAll(&world_size, sizeof(world_size)) == sizeof(world_size),
+         "ReConnectLink failure 3");
+  tracker.SendStr(task_id);
+  return tracker;
+}
+/*!
+ * \brief connect to the tracker to fix the missing links,
+ *   also used when the engine starts up; cmd is the command string sent to the tracker
+ */
+bool AllreduceBase::ReConnectLinks(const char *cmd) {
+  // single node mode
+  if (tracker_uri == "NULL") {
+    rank = 0; world_size = 1; return true;
+  }
+  try {
+    utils::TCPSocket tracker = this->ConnectTracker();
+    fprintf(stdout, "task %s connected to the tracker\n", task_id.c_str());
+    tracker.SendStr(std::string(cmd));
+
+    // the rank of previous link, next link in ring
+    int prev_rank, next_rank;
+    // the rank of neighbors
+    std::map<int, int> tree_neighbors;
+    using utils::Assert;
+    // get new ranks
+    int newrank, num_neighbors;
+    Assert(tracker.RecvAll(&newrank, sizeof(newrank)) == sizeof(newrank),
+           "ReConnectLink failure 4");
+    Assert(tracker.RecvAll(&parent_rank, sizeof(parent_rank)) == \
+         sizeof(parent_rank), "ReConnectLink failure 4");
+    Assert(tracker.RecvAll(&world_size, sizeof(world_size)) == sizeof(world_size),
+           "ReConnectLink failure 4");
+    Assert(rank == -1 || newrank == rank,
+           "must keep rank to same if the node already have one");
+    rank = newrank;
+
+    // tracker got overwhelmed and was not able to assign a correct rank
+    if (rank == -1) exit(-1);
+
+    fprintf(stdout, "task %s got new rank %d\n", task_id.c_str(), rank);
+
+    Assert(tracker.RecvAll(&num_neighbors, sizeof(num_neighbors)) == \
+         sizeof(num_neighbors), "ReConnectLink failure 4");
+    for (int i = 0; i < num_neighbors; ++i) {
+      int nrank;
+      Assert(tracker.RecvAll(&nrank, sizeof(nrank)) == sizeof(nrank),
+             "ReConnectLink failure 4");
+      tree_neighbors[nrank] = 1;
+    }
+    Assert(tracker.RecvAll(&prev_rank, sizeof(prev_rank)) == sizeof(prev_rank),
+           "ReConnectLink failure 4");
+    Assert(tracker.RecvAll(&next_rank, sizeof(next_rank)) == sizeof(next_rank),
+           "ReConnectLink failure 4");
+
+    utils::TCPSocket sock_listen;
+    if (!sock_listen.IsClosed()) {
+      sock_listen.Close();
+    }
+    // create listening socket
+    sock_listen.Create();
+    int port = sock_listen.TryBindHost(slave_port, slave_port + nport_trial);
+    utils::Check(port != -1, "ReConnectLink fail to bind the ports specified");
+    sock_listen.Listen();
+
+    // get number of to connect and number of to accept nodes from tracker
+    int num_conn, num_accept, num_error = 1;
+    do {
+      // send over good links
+      std::vector<int> good_link;
+      for (size_t i = 0; i < all_links.size(); ++i) {
+        if (!all_links[i].sock.BadSocket()) {
+          good_link.push_back(static_cast<int>(all_links[i].rank));
+        } else {
+          if (!all_links[i].sock.IsClosed()) all_links[i].sock.Close();
+        }
+      }
+      int ngood = static_cast<int>(good_link.size());
+      Assert(tracker.SendAll(&ngood, sizeof(ngood)) == sizeof(ngood),
+             "ReConnectLink failure 5");
+      for (size_t i = 0; i < good_link.size(); ++i) {
+        Assert(tracker.SendAll(&good_link[i], sizeof(good_link[i])) == \
+             sizeof(good_link[i]), "ReConnectLink failure 6");
+      }
+      Assert(tracker.RecvAll(&num_conn, sizeof(num_conn)) == sizeof(num_conn),
+             "ReConnectLink failure 7");
+      Assert(tracker.RecvAll(&num_accept, sizeof(num_accept)) == \
+           sizeof(num_accept), "ReConnectLink failure 8");
+      num_error = 0;
+      for (int i = 0; i < num_conn; ++i) {
+        LinkRecord r;
+        int hport, hrank;
+        std::string hname;
+        tracker.RecvStr(&hname);
+        Assert(tracker.RecvAll(&hport, sizeof(hport)) == sizeof(hport),
+               "ReConnectLink failure 9");
+        Assert(tracker.RecvAll(&hrank, sizeof(hrank)) == sizeof(hrank),
+               "ReConnectLink failure 10");
+
+        r.sock.Create();
+        if (!r.sock.Connect(utils::SockAddr(hname.c_str(), hport))) {
+          num_error += 1;
+          r.sock.Close();
+          continue;
+        }
+        Assert(r.sock.SendAll(&rank, sizeof(rank)) == sizeof(rank),
+               "ReConnectLink failure 12");
+        Assert(r.sock.RecvAll(&r.rank, sizeof(r.rank)) == sizeof(r.rank),
+               "ReConnectLink failure 13");
+        utils::Check(hrank == r.rank,
+                     "ReConnectLink failure, link rank inconsistent");
+        bool match = false;
+        for (size_t i = 0; i < all_links.size(); ++i) {  // this i shadows the outer loop index
+          if (all_links[i].rank == hrank) {
+            Assert(all_links[i].sock.IsClosed(),
+                   "Override a link that is active");
+            all_links[i].sock = r.sock;
+            match = true;
+            break;
+          }
+        }
+        if (!match) all_links.push_back(r);
+      }
+      Assert(tracker.SendAll(&num_error, sizeof(num_error)) == sizeof(num_error),
+             "ReConnectLink failure 14");
+    } while (num_error != 0);
+    // send back socket listening port to tracker
+    Assert(tracker.SendAll(&port, sizeof(port)) == sizeof(port),
+           "ReConnectLink failure 14");
+    // close connection to tracker
+    tracker.Close();
+    // listen to incoming links
+    for (int i = 0; i < num_accept; ++i) {
+      LinkRecord r;
+      r.sock = sock_listen.Accept();
+      Assert(r.sock.SendAll(&rank, sizeof(rank)) == sizeof(rank),
+             "ReConnectLink failure 15");
+      Assert(r.sock.RecvAll(&r.rank, sizeof(r.rank)) == sizeof(r.rank),
+             "ReConnectLink failure 15");
+      bool match = false;
+      for (size_t i = 0; i < all_links.size(); ++i) {  // this i shadows the outer loop index
+        if (all_links[i].rank == r.rank) {
+          utils::Assert(all_links[i].sock.IsClosed(),
+                        "Override a link that is active");
+          all_links[i].sock = r.sock;
+          match = true;
+          break;
+        }
+      }
+      if (!match) all_links.push_back(r);
+    }
+    sock_listen.Close();
+    this->parent_index = -1;
+    // setup tree links and ring structure
+    tree_links.plinks.clear();
+    int tcpNoDelay = 1;
+    for (size_t i = 0; i < all_links.size(); ++i) {
+      utils::Assert(!all_links[i].sock.BadSocket(), "ReConnectLink: bad socket");
+      // set the socket to non-blocking mode, enable TCP keepalive
+      all_links[i].sock.SetNonBlock(true);
+      all_links[i].sock.SetKeepAlive(true);
+      if (rabit_enable_tcp_no_delay) {
+        setsockopt(all_links[i].sock, IPPROTO_TCP,
+                   TCP_NODELAY, reinterpret_cast<void *>(&tcpNoDelay), sizeof(tcpNoDelay));
+      }
+      if (tree_neighbors.count(all_links[i].rank) != 0) {
+        if (all_links[i].rank == parent_rank) {
+          parent_index = static_cast<int>(tree_links.plinks.size());
+        }
+        tree_links.plinks.push_back(&all_links[i]);
+      }
+      if (all_links[i].rank == prev_rank) ring_prev = &all_links[i];
+      if (all_links[i].rank == next_rank) ring_next = &all_links[i];
+    }
+    Assert(parent_rank == -1 || parent_index != -1,
+           "cannot find parent in the link");
+    Assert(prev_rank == -1 || ring_prev != NULL,
+           "cannot find prev ring in the link");
+    Assert(next_rank == -1 || ring_next != NULL,
+           "cannot find next ring in the link");
+    return true;
+  } catch (const std::exception& e) {
+    fprintf(stderr, "failed in ReconnectLink %s\n", e.what());
+    return false;
+  }
+}
+/*!
+ * \brief perform in-place allreduce, on sendrecvbuf, this function can fail, and will return the cause of failure
+ *
+ * NOTE on Allreduce:
+ *    A kSuccess return from TryAllreduce does NOT mean every node has successfully finished TryAllreduce.
+ *    It only means the current node got the correct result of Allreduce.
+ *    However, it means every node finished the LAST call (instead of this one) of Allreduce/Bcast
+ *
+ * \param sendrecvbuf_ buffer for both sending and recving data
+ * \param type_nbytes the unit number of bytes the type have
+ * \param count number of elements to be reduced
+ * \param reducer reduce function
+ * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
+ * \sa ReturnType
+ */
+AllreduceBase::ReturnType
+AllreduceBase::TryAllreduce(void *sendrecvbuf_,
+                            size_t type_nbytes,
+                            size_t count,
+                            ReduceFunction reducer) {
+  // at most reduce_ring_mincount elements: use the tree-shaped reduction
+  if (count <= reduce_ring_mincount) {
+    return this->TryAllreduceTree(sendrecvbuf_, type_nbytes, count, reducer);
+  }
+  return this->TryAllreduceRing(sendrecvbuf_, type_nbytes, count, reducer);
+}
+/*!
+ * \brief perform in-place allreduce, on sendrecvbuf,
+ * this function implements tree-shape reduction
+ *
+ * \param sendrecvbuf_ buffer for both sending and recving data
+ * \param type_nbytes the unit number of bytes the type have
+ * \param count number of elements to be reduced
+ * \param reducer reduce function
+ * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
+ * \sa ReturnType
+ */
+AllreduceBase::ReturnType
+AllreduceBase::TryAllreduceTree(void *sendrecvbuf_,
+                                size_t type_nbytes,
+                                size_t count,
+                                ReduceFunction reducer) {
+  RefLinkVector &links = tree_links;
+  if (links.size() == 0 || count == 0) return kSuccess;
+  // total size of the message in bytes
+  const size_t total_size = type_nbytes * count;
+  // number of links
+  const int nlink = static_cast<int>(links.size());
+  // send recv buffer
+  char *sendrecvbuf = reinterpret_cast<char*>(sendrecvbuf_);
+  // number of bytes already reduced in the up pass
+  size_t size_up_reduce = 0;
+  // number of bytes already passed up to the parent
+  size_t size_up_out = 0;
+  // number of bytes received from the parent, available for the down pass
+  size_t size_down_in = 0;
+  // initialize the ring buffers of the child links and reset all size counters
+  for (int i = 0; i < nlink; ++i) {
+    if (i != parent_index) {
+      links[i].InitBuffer(type_nbytes, count, reduce_buffer_size);
+    }
+    links[i].ResetSize();
+  }
+  // no children (the only link is the parent): nothing to reduce
+  if (nlink == static_cast<int>(parent_index != -1)) {
+    size_up_reduce = total_size;
+  }
+  // while we have not passed the messages out
+  while (true) {
+    // select helper
+    bool finished = true;
+    utils::PollHelper watcher;
+    for (int i = 0; i < nlink; ++i) {
+      if (i == parent_index) {
+        if (size_down_in != total_size) {
+          watcher.WatchRead(links[i].sock);
+          // only watch for exception in live channels
+          watcher.WatchException(links[i].sock);
+          finished = false;
+        }
+        if (size_up_out != total_size && size_up_out < size_up_reduce) {
+          watcher.WatchWrite(links[i].sock);
+        }
+      } else {
+        if (links[i].size_read != total_size) {
+          watcher.WatchRead(links[i].sock);
+        }
+        // size_write <= size_read
+        if (links[i].size_write != total_size) {
+          if (links[i].size_write < size_down_in) {
+            watcher.WatchWrite(links[i].sock);
+          }
+          // only watch for exception in live channels
+          watcher.WatchException(links[i].sock);
+          finished = false;
+        }
+      }
+    }
+    // finished running allreduce
+    if (finished) break;
+    // select must return
+    watcher.Poll();
+    // exception handling
+    for (int i = 0; i < nlink; ++i) {
+      // received OOB message from some link
+      if (watcher.CheckExcept(links[i].sock)) {
+        return ReportError(&links[i], kGetExcept);
+      }
+    }
+    // read data from children
+    for (int i = 0; i < nlink; ++i) {
+      if (i != parent_index && watcher.CheckRead(links[i].sock)) {
+        ReturnType ret = links[i].ReadToRingBuffer(size_up_out, total_size);
+        if (ret != kSuccess) {
+          return ReportError(&links[i], ret);
+        }
+      }
+    }
+    // this node has children, perform reduce
+    if (nlink > static_cast<int>(parent_index != -1)) {
+      size_t buffer_size = 0;
+      // do upstream reduce: limited by the child that has delivered the least data
+      size_t max_reduce = total_size;
+      for (int i = 0; i < nlink; ++i) {
+        if (i != parent_index) {
+          max_reduce = std::min(max_reduce, links[i].size_read);
+          utils::Assert(buffer_size == 0 || buffer_size == links[i].buffer_size,
+                        "buffer size inconsistent");
+          buffer_size = links[i].buffer_size;
+        }
+      }
+      utils::Assert(buffer_size != 0, "must assign buffer_size");
+      // round down to a multiple of type_nbytes
+      max_reduce = (max_reduce / type_nbytes * type_nbytes);
+      // perform reduce, can be at most two rounds
+      while (size_up_reduce < max_reduce) {
+        // start position inside the ring buffer
+        size_t start = size_up_reduce % buffer_size;
+        // read at most up to the end of the ring buffer
+        size_t nread = std::min(buffer_size - start,
+                                max_reduce - size_up_reduce);
+        utils::Assert(nread % type_nbytes == 0, "Allreduce: size check");
+        for (int i = 0; i < nlink; ++i) {
+          if (i != parent_index) {
+            reducer(links[i].buffer_head + start,
+                    sendrecvbuf + size_up_reduce,
+                    static_cast<int>(nread / type_nbytes),
+                    MPI::Datatype(type_nbytes));
+          }
+        }
+        size_up_reduce += nread;
+      }
+    }
+    if (parent_index != -1) {
+      // pass message up to parent, can pass data that has already been reduced
+      if (size_up_out < size_up_reduce) {
+        ssize_t len = links[parent_index].sock.
+            Send(sendrecvbuf + size_up_out, size_up_reduce - size_up_out);
+        if (len != -1) {
+          size_up_out += static_cast<size_t>(len);
+        } else {
+          ReturnType ret = Errno2Return();
+          if (ret != kSuccess) {
+            return ReportError(&links[parent_index], ret);
+          }
+        }
+      }
+      // read data from parent
+      if (watcher.CheckRead(links[parent_index].sock) &&
+          total_size > size_down_in) {
+        ssize_t len = links[parent_index].sock.
+            Recv(sendrecvbuf + size_down_in, total_size - size_down_in);
+        if (len == 0) {
+          links[parent_index].sock.Close();
+          return ReportError(&links[parent_index], kRecvZeroLen);
+        }
+        if (len != -1) {
+          size_down_in += static_cast<size_t>(len);
+          utils::Assert(size_down_in <= size_up_out,
+                        "Allreduce: boundary error");
+        } else {
+          ReturnType ret = Errno2Return();
+          if (ret != kSuccess) {
+            return ReportError(&links[parent_index], ret);
+          }
+        }
+      }
+    } else {
+      // this is root, can use reduce as most recent point
+      size_down_in = size_up_out = size_up_reduce;
+    }
+    // can pass message down to children
+    for (int i = 0; i < nlink; ++i) {
+      if (i != parent_index && links[i].size_write < size_down_in) {
+        ReturnType ret = links[i].WriteFromArray(sendrecvbuf, size_down_in);
+        if (ret != kSuccess) {
+          return ReportError(&links[i], ret);
+        }
+      }
+    }
+  }
+  return kSuccess;
+}
+/*!
+ * \brief broadcast data from root to all nodes, this function can fail,and will return the cause of failure
+ * \param sendrecvbuf_ buffer for both sending and recving data
+ * \param total_size the size of the data to be broadcasted
+ * \param root the root worker id to broadcast the data
+ * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
+ * \sa ReturnType
+ */
+AllreduceBase::ReturnType
+AllreduceBase::TryBroadcast(void *sendrecvbuf_, size_t total_size, int root) {
+  RefLinkVector &links = tree_links;
+  if (links.size() == 0 || total_size == 0) return kSuccess;
+  utils::Check(root < world_size,
+               "Broadcast: root should be smaller than world size");
+  // number of links
+  const int nlink = static_cast<int>(links.size());
+  // number of bytes already available in sendrecvbuf_
+  size_t size_in = 0;
+  // input link, -2 means unknown yet, -1 means this is root
+  int in_link = -2;
+
+  // initialize the link statistics
+  for (int i = 0; i < nlink; ++i) {
+    links[i].ResetSize();
+  }
+  // root already has all the data
+  if (this->rank == root) {
+    size_in = total_size;
+    in_link = -1;
+  }
+  // while we have not passed the messages out
+  while (true) {
+    bool finished = true;
+    // select helper
+    utils::PollHelper watcher;
+    for (int i = 0; i < nlink; ++i) {
+      if (in_link == -2) {
+        watcher.WatchRead(links[i].sock); finished = false;
+      }
+      if (i == in_link && links[i].size_read != total_size) {
+        watcher.WatchRead(links[i].sock); finished = false;
+      }
+      if (in_link != -2 && i != in_link && links[i].size_write != total_size) {
+        if (links[i].size_write < size_in) {
+          watcher.WatchWrite(links[i].sock);
+        }
+        finished = false;
+      }
+      watcher.WatchException(links[i].sock);
+    }
+    // finish running
+    if (finished) break;
+    // select
+    watcher.Poll();
+    // exception handling
+    for (int i = 0; i < nlink; ++i) {
+      // received OOB message from some link
+      if (watcher.CheckExcept(links[i].sock)) {
+        return ReportError(&links[i], kGetExcept);
+      }
+    }
+    if (in_link == -2) {
+      // probe in-link: the first link that delivers data becomes the input link
+      for (int i = 0; i < nlink; ++i) {
+        if (watcher.CheckRead(links[i].sock)) {
+          ReturnType ret = links[i].ReadToArray(sendrecvbuf_, total_size);
+          if (ret != kSuccess) {
+            return ReportError(&links[i], ret);
+          }
+          size_in = links[i].size_read;
+          if (size_in != 0) {
+            in_link = i; break;
+          }
+        }
+      }
+    } else {
+      // read from in link
+      if (in_link >= 0 && watcher.CheckRead(links[in_link].sock)) {
+        ReturnType ret = links[in_link].ReadToArray(sendrecvbuf_, total_size);
+        if (ret != kSuccess) {
+          return ReportError(&links[in_link], ret);
+        }
+        size_in = links[in_link].size_read;
+      }
+    }
+    // send data to all out-links
+    for (int i = 0; i < nlink; ++i) {
+      if (i != in_link && links[i].size_write < size_in) {
+        ReturnType ret = links[i].WriteFromArray(sendrecvbuf_, size_in);
+        if (ret != kSuccess) {
+          return ReportError(&links[i], ret);
+        }
+      }
+    }
+  }
+  return kSuccess;
+}
+/*!
+ * \brief internal Allgather function, each node have a segment of data in the ring of sendrecvbuf,
+ *  the data provided by current node k is [slice_begin, slice_end),
+ *  the next node's segment must start with slice_end
+ *  after the call of Allgather, sendrecvbuf_ contains all the contents including all segments
+ *  use a ring based algorithm
+ *
+ * \param sendrecvbuf_ buffer for both sending and receiving data, it is a ring conceptually
+ * \param total_size total size of data to be gathered
+ * \param slice_begin beginning of the current slice
+ * \param slice_end end of the current slice
+ * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
+ */
+AllreduceBase::ReturnType
+AllreduceBase::TryAllgatherRing(void *sendrecvbuf_, size_t total_size,
+                                size_t slice_begin,
+                                size_t slice_end,
+                                size_t size_prev_slice) {
+  // read from next link and send to prev one
+  LinkRecord &prev = *ring_prev, &next = *ring_next;
+  // need to rely on special rank structure
+  utils::Assert(next.rank == (rank + 1) % world_size &&
+                rank == (prev.rank + 1) % world_size,
+                "need to assume rank structure");
+  // send recv buffer, addressed as a ring: positions are taken modulo total_size
+  char *sendrecvbuf = reinterpret_cast<char*>(sendrecvbuf_);
+  const size_t stop_read = total_size + slice_begin;
+  const size_t stop_write = total_size + slice_begin - size_prev_slice;
+  size_t write_ptr = slice_begin;
+  size_t read_ptr = slice_end;
+  while (true) {
+    // select helper
+    bool finished = true;
+    utils::PollHelper watcher;
+    if (read_ptr != stop_read) {
+      watcher.WatchRead(next.sock);
+      finished = false;
+    }
+    if (write_ptr != stop_write) {
+      if (write_ptr < read_ptr) {
+        watcher.WatchWrite(prev.sock);
+      }
+      finished  = false;
+    }
+    if (finished) break;
+    watcher.Poll();
+    if (read_ptr != stop_read && watcher.CheckRead(next.sock)) {
+      size_t start = read_ptr % total_size;
+      // clip the read at the physical end of the buffer
+      size_t size = std::min(stop_read - read_ptr, total_size - start);
+      ssize_t len = next.sock.Recv(sendrecvbuf + start, size);
+      if (len == 0) {
+        // size > 0 here, so a zero-length read means the peer closed the link
+        next.sock.Close();
+        return ReportError(&next, kRecvZeroLen);
+      }
+      if (len != -1) {
+        read_ptr += static_cast<size_t>(len);
+      } else {
+        ReturnType ret = Errno2Return();
+        if (ret != kSuccess) return ReportError(&next, ret);
+      }
+    }
+    if (write_ptr < read_ptr && write_ptr != stop_write) {
+      size_t start = write_ptr % total_size;
+      size_t size = std::min(std::min(read_ptr, stop_write) - write_ptr,
+                             total_size - start);
+      ssize_t len = prev.sock.Send(sendrecvbuf + start, size);
+      if (len != -1) {
+        write_ptr += static_cast<size_t>(len);
+      } else {
+        ReturnType ret = Errno2Return();
+        if (ret != kSuccess) return ReportError(&prev, ret);
+      }
+    }
+  }
+  return kSuccess;
+}
+/*!
+ * \brief perform in-place reduce-scatter on sendrecvbuf, this function can fail,
+ *  and will return the cause of failure
+ *
+ *  Ring-based algorithm
+ *
+ * \param sendrecvbuf_ buffer for both sending and recving data
+ * \param type_nbytes the unit number of bytes the type have
+ * \param count number of elements to be reduced
+ * \param reducer reduce function
+ * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
+ * \sa ReturnType, TryAllreduce
+ */
+AllreduceBase::ReturnType
+AllreduceBase::TryReduceScatterRing(void *sendrecvbuf_,
+                                    size_t type_nbytes,
+                                    size_t count,
+                                    ReduceFunction reducer) {
+  // read from next link and send to prev one
+  LinkRecord &prev = *ring_prev, &next = *ring_next;
+  // need to rely on special rank structure
+  utils::Assert(next.rank == (rank + 1) % world_size &&
+                rank == (prev.rank + 1) % world_size,
+                "need to assume rank structure");
+  // total size of the message in bytes
+  const size_t total_size = type_nbytes * count;
+  size_t n = static_cast<size_t>(world_size);
+  size_t step = (count + n - 1) / n;
+  size_t r = static_cast<size_t>(next.rank);
+  size_t write_ptr = std::min(r * step, count) * type_nbytes;
+  size_t read_ptr = std::min((r + 1) * step, count) * type_nbytes;
+  size_t reduce_ptr = read_ptr;
+  // send recv buffer
+  char *sendrecvbuf = reinterpret_cast<char*>(sendrecvbuf_);
+  // position to stop reading
+  const size_t stop_read = total_size + write_ptr;
+  // position to stop writing
+  size_t stop_write = total_size + std::min(rank * step, count) * type_nbytes;
+  if (stop_write > stop_read) {
+    stop_write -= total_size;
+    utils::Assert(write_ptr <= stop_write, "write ptr boundary check");
+  }
+  // use ring buffer in next position
+  next.InitBuffer(type_nbytes, step, reduce_buffer_size);
+  // set size_read to read pointer for ring buffer to work properly
+  next.size_read = read_ptr;
+
+  while (true) {
+    // select helper
+    bool finished = true;
+    utils::PollHelper watcher;
+    if (read_ptr != stop_read) {
+      watcher.WatchRead(next.sock);
+      finished = false;
+    }
+    if (write_ptr != stop_write) {
+      if (write_ptr < reduce_ptr) {
+        watcher.WatchWrite(prev.sock);
+      }
+      finished = false;
+    }
+    if (finished) break;
+    watcher.Poll();
+    if (read_ptr != stop_read && watcher.CheckRead(next.sock)) {
+      ReturnType ret = next.ReadToRingBuffer(reduce_ptr, stop_read);
+      if (ret != kSuccess) {
+        return ReportError(&next, ret);
+      }
+      // sync the local read pointer with the link's read counter
+      read_ptr = next.size_read;
+      utils::Assert(read_ptr <= stop_read, "[%d] read_ptr boundary check", rank);
+      const size_t buffer_size = next.buffer_size;
+      size_t max_reduce = (read_ptr  / type_nbytes) * type_nbytes;
+      while (reduce_ptr < max_reduce) {
+        size_t bstart = reduce_ptr % buffer_size;
+        size_t nread = std::min(buffer_size - bstart,
+                                max_reduce - reduce_ptr);
+        size_t rstart = reduce_ptr % total_size;
+        nread = std::min(nread, total_size - rstart);
+        reducer(next.buffer_head + bstart,
+                sendrecvbuf + rstart,
+                static_cast<int>(nread / type_nbytes),
+                MPI::Datatype(type_nbytes));
+        reduce_ptr += nread;
+      }
+    }
+    if (write_ptr < reduce_ptr && write_ptr != stop_write) {
+      size_t size = std::min(reduce_ptr, stop_write) - write_ptr;
+      size_t start = write_ptr % total_size;
+      if (start + size > total_size) {
+        size = total_size - start;
+      }
+      ssize_t len = prev.sock.Send(sendrecvbuf + start, size);
+      if (len != -1) {
+        write_ptr += static_cast<size_t>(len);
+      } else {
+        ReturnType ret = Errno2Return();
+        if (ret != kSuccess) return ReportError(&prev, ret);
+      }
+    }
+  }
+  return kSuccess;
+}
+/*!
+ * \brief perform in-place allreduce, on sendrecvbuf
+ *  use a ring based algorithm
+ *
+ * \param sendrecvbuf_ buffer for both sending and recving data
+ * \param type_nbytes the unit number of bytes the type have
+ * \param count number of elements to be reduced
+ * \param reducer reduce function
+ * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
+ * \sa ReturnType
+ */
+AllreduceBase::ReturnType
+AllreduceBase::TryAllreduceRing(void *sendrecvbuf_,
+                                size_t type_nbytes,
+                                size_t count,
+                                ReduceFunction reducer) {
+  // reduce-scatter first; stop right away when it reports a failure
+  const ReturnType scatter_ret =
+      TryReduceScatterRing(sendrecvbuf_, type_nbytes, count, reducer);
+  if (scatter_ret != kSuccess) return scatter_ret;
+  // each rank owns ceil(count / world_size) elements, clipped to count
+  const size_t nworker = static_cast<size_t>(world_size);
+  const size_t per_rank = (count + nworker - 1) / nworker;
+  const size_t my_begin = std::min(rank * per_rank, count) * type_nbytes;
+  const size_t my_end = std::min((rank + 1) * per_rank, count) * type_nbytes;
+  const int prev = ring_prev->rank;
+  const size_t prev_bytes = (std::min((prev + 1) * per_rank, count) -
+                             std::min(prev * per_rank, count)) * type_nbytes;
+  // then allgather the slices around the ring
+  return TryAllgatherRing(sendrecvbuf_, type_nbytes * count, my_begin, my_end, prev_bytes);
+}
+}  // namespace engine
+}  // namespace rabit
diff --git a/src/allreduce_base.h b/src/allreduce_base.h
new file mode 100644
index 0000000..d1d3333
--- /dev/null
+++ b/src/allreduce_base.h
@@ -0,0 +1,587 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file allreduce_base.h
+ * \brief Basic implementation of AllReduce
+ *   using TCP non-block socket and tree-shape reduction.
+ *
+ *   This implementation provides basic utility of AllReduce and Broadcast
+ *   without considering node failure
+ *
+ * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
+ */
+#ifndef RABIT_ALLREDUCE_BASE_H_
+#define RABIT_ALLREDUCE_BASE_H_
+
+#include <vector>
+#include <string>
+#include <algorithm>
+#include "rabit/internal/utils.h"
+#include "rabit/internal/engine.h"
+#include "rabit/internal/socket.h"
+
+#ifdef RABIT_CXXTESTDEFS_H
+#define private   public
+#define protected public
+#endif  // RABIT_CXXTESTDEFS_H
+
+
+namespace MPI {
+// minimal stand-in for the MPI data type, kept compatible with the MPI interface
+class Datatype {
+ public:
+  size_t type_size;
+  explicit Datatype(size_t nbytes) : type_size(nbytes) {}
+};
+}  // namespace MPI
+namespace rabit {
+namespace engine {
+/*! \brief implementation of basic Allreduce engine */
+class AllreduceBase : public IEngine {
+ public:
+  // magic number to verify server
+  static const int kMagic = 0xff99;
+  // constant one byte out of band message to indicate error happening
+  AllreduceBase(void);
+  virtual ~AllreduceBase(void) {}
+  // initialize the manager
+  virtual bool Init(int argc, char* argv[]);
+  // shutdown the engine
+  virtual bool Shutdown(void);
+  /*!
+   * \brief set parameters to the engine
+   * \param name parameter name
+   * \param val parameter value
+   */
+  virtual void SetParam(const char *name, const char *val);
+  /*!
+   * \brief print the msg in the tracker,
+   *    this function can be used to communicate the information of the progress to
+   *    the user who monitors the tracker
+   * \param msg message to be printed in the tracker
+   */
+  virtual void TrackerPrint(const std::string &msg);
+
+  /*! \brief get rank of previous node in ring topology */
+  virtual int GetRingPrevRank(void) const {
+    return ring_prev->rank;
+  }
+  /*! \brief get rank of the current node */
+  virtual int GetRank(void) const {
+    return rank;
+  }
+  /*! \brief get total number of nodes, a world_size of -1 is reported as 1 */
+  virtual int GetWorldSize(void) const {
+    // -1 is mapped to a single worker
+    return world_size == -1 ? 1 : world_size;
+  }
+  /*! \brief whether the engine is distributed, i.e. tracker_uri is not "NULL" */
+  virtual bool IsDistributed(void) const {
+    return tracker_uri != "NULL";
+  }
+  /*! \brief get the host uri (host_uri) */
+  virtual std::string GetHost(void) const {
+    return host_uri;
+  }
+
+  /*!
+  * \brief internal Allgather function, each node have a segment of data in the ring of sendrecvbuf,
+  *  the data provided by current node k is [slice_begin, slice_end),
+  *  the next node's segment must start with slice_end
+  *  after the call of Allgather, sendrecvbuf_ contains all the contents including all segments
+  *  use a ring based algorithm
+  *
+  * \param sendrecvbuf_ buffer for both sending and receiving data, it is a ring conceptually
+  * \param total_size total size of data to be gathered
+  * \param slice_begin beginning of the current slice
+  * \param slice_end end of the current slice
+  * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
+  * \param _file caller file name used to generate unique cache key
+  * \param _line caller line number used to generate unique cache key
+  * \param _caller caller function name used to generate unique cache key
+  */
+  virtual void Allgather(void *sendrecvbuf_, size_t total_size,
+                         size_t slice_begin,
+                         size_t slice_end,
+                         size_t size_prev_slice,
+                         const char* _file = _FILE,
+                         const int _line = _LINE,
+                         const char* _caller = _CALLER) {
+    if (world_size == 1 || world_size == -1) return;
+    const ReturnType ret = TryAllgatherRing(sendrecvbuf_, total_size,
+                                            slice_begin, slice_end, size_prev_slice);
+    utils::Assert(ret == kSuccess, "AllgatherRing failed");
+  }
+  /*!
+   * \brief perform in-place allreduce, on sendrecvbuf
+   *        this function is NOT thread-safe
+   * \param sendrecvbuf_ buffer for both sending and recving data
+   * \param type_nbytes the unit number of bytes the type have
+   * \param count number of elements to be reduced
+   * \param reducer reduce function
+   * \param prepare_fun Lazy preprocessing function, lazy prepare_fun(prepare_arg)
+   *                    will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf_.
+   *                    If the result of Allreduce can be recovered directly, then prepare_fun will NOT be called
+   * \param prepare_arg argument used to passed into the lazy preprocessing function
+   * \param _file caller file name used to generate unique cache key
+   * \param _line caller line number used to generate unique cache key
+   * \param _caller caller function name used to generate unique cache key
+   */
+  virtual void Allreduce(void *sendrecvbuf_,
+                         size_t type_nbytes,
+                         size_t count,
+                         ReduceFunction reducer,
+                         PreprocFunction prepare_fun = NULL,
+                         void *prepare_arg = NULL,
+                         const char* _file = _FILE,
+                         const int _line = _LINE,
+                         const char* _caller = _CALLER) {
+    if (prepare_fun != NULL) prepare_fun(prepare_arg);
+    if (world_size == 1 || world_size == -1) return;
+    const ReturnType ret = TryAllreduce(sendrecvbuf_,
+                                        type_nbytes, count, reducer);
+    utils::Assert(ret == kSuccess, "Allreduce failed");
+  }
+  /*!
+   * \brief broadcast data from root to all nodes
+   * \param sendrecvbuf_ buffer for both sending and recving data
+   * \param total_size the size of the data to be broadcast
+   * \param root the root worker id to broadcast the data
+   * \param _file caller file name used to generate unique cache key
+   * \param _line caller line number used to generate unique cache key
+   * \param _caller caller function name used to generate unique cache key
+   */
+  virtual void Broadcast(void *sendrecvbuf_, size_t total_size, int root,
+    const char* _file = _FILE, const int _line = _LINE, const char* _caller = _CALLER) {
+    if (world_size == 1 || world_size == -1) return;
+    const ReturnType ret = TryBroadcast(sendrecvbuf_, total_size, root);
+    utils::Assert(ret == kSuccess, "Broadcast failed");
+  }
+  /*!
+   * \brief load latest check point
+   * \param global_model pointer to the globally shared model/state
+   *   when calling this function, the caller needs to guarantee that global_model
+   *   is the same in all nodes
+   * \param local_model pointer to local model, that is specific to current node/rank
+   *   this can be NULL when no local model is needed
+   *
+   * \return the version number of check point loaded
+   *     if returned version == 0, this means no model has been CheckPointed
+   *     the p_model is not touched, user should do necessary initialization by themselves
+   *
+   *   Common usage example:
+   *      int iter = rabit::LoadCheckPoint(&model);
+   *      if (iter == 0) model.InitParameters();
+   *      for (i = iter; i < max_iter; ++i) {
+   *        do many things, include allreduce
+   *        rabit::CheckPoint(model);
+   *      }
+   *
+   * \sa CheckPoint, VersionNumber
+   */
+  virtual int LoadCheckPoint(Serializable *global_model,
+                             Serializable *local_model = NULL) {
+    return 0;  // this implementation never restores a model, both arguments are ignored
+  }
+  /*!
+   * \brief checkpoint the model, meaning we finished a stage of execution
+   *  every time we call check point, there is a version number which will increase by one
+   *
+   * \param global_model pointer to the globally shared model/state
+   *   when calling this function, the caller needs to guarantee that global_model
+   *   is the same in all nodes
+   * \param local_model pointer to local model, that is specific to current node/rank
+   *   this can be NULL when no local state is needed
+   *
+   * NOTE: local_model requires explicit replication of the model for fault-tolerance, which will
+   *       bring replication cost in CheckPoint function. global_model do not need explicit replication.
+   *       So only CheckPoint with global_model if possible
+   *
+   * \sa LoadCheckPoint, VersionNumber
+   */
+  virtual void CheckPoint(const Serializable *global_model,
+                          const Serializable *local_model = NULL) {
+    version_number += 1;  // only the version counter advances, the models are not read
+  }
+  /*!
+   * \brief This function can be used to replace CheckPoint for global_model only,
+   *   when certain conditions are met (see detailed explanation).
+   *
+   *   This is a "lazy" checkpoint such that only the pointer to global_model is
+   *   remembered and no memory copy is taken. To use this function, the user MUST ensure that:
+   *   The global_model must remain unchanged until the last call of Allreduce/Broadcast in the current version finishes.
+   *   In other words, global_model can be changed only between the last call of
+   *   Allreduce/Broadcast and LazyCheckPoint in current version
+   *
+   *   For example, suppose the calling sequence is:
+   *   LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint
+   *
+   *   If user can only changes global_model in code3, then LazyCheckPoint can be used to
+   *   improve efficiency of the program.
+   * \param global_model pointer to the globally shared model/state
+   *   when calling this function, the caller needs to guarantee that global_model
+   *   is the same in all nodes
+   * \sa LoadCheckPoint, CheckPoint, VersionNumber
+   */
+  virtual void LazyCheckPoint(const Serializable *global_model) {
+    version_number += 1;  // only the version counter advances, global_model is not read
+  }
+  /*!
+   * \return version number of current stored model,
+   *         i.e. how many calls to CheckPoint/LazyCheckPoint we made so far
+   * \sa LoadCheckPoint, CheckPoint
+   */
+  virtual int VersionNumber(void) const {
+    return version_number;
+  }
+  /*!
+   * \brief explicitly re-init everything before calling LoadCheckPoint
+   *    call this function when IEngine throws an exception,
+   *    only used for test purposes; this implementation just reports "not implemented"
+   */
+  virtual void InitAfterException(void) {
+    utils::Error("InitAfterException: not implemented");
+  }
+  /*!
+   * \brief report current status to the job tracker
+   * depending on the job tracker we are in
+   */
+  inline void ReportStatus(void) const {
+    // the reporter:status line is only written when hadoop_mode is set
+    if (hadoop_mode == 0) return;
+    fprintf(stderr, "reporter:status:Rabit Phase[%03d] Operation %03d\n",
+            version_number, seq_counter);
+  }
+
+ protected:
+  /*! \brief enumeration of possible returning results from Try functions */
+  enum ReturnTypeEnum {
+    /*! \brief execution is successful */
+    kSuccess,
+    /*! \brief a link was reset by peer */
+    kConnReset,
+    /*! \brief received a zero length message */
+    kRecvZeroLen,
+    /*! \brief a neighbor node went down, the connection is dropped */
+    kSockError,
+    /*!
+     * \brief another node which is not my neighbor went down,
+     *   get Out-of-Band exception notification from my neighbor
+     */
+    kGetExcept
+  };
+  /*! \brief struct return type to avoid implicit conversion to int/bool */
+  struct ReturnType {
+    /*! \brief internal return type */
+    ReturnTypeEnum value;
+    // default to kSuccess so a default-constructed value is never indeterminate
+    ReturnType() : value(kSuccess) {}
+    ReturnType(ReturnTypeEnum value) : value(value) {}  // NOLINT(*)
+    inline bool operator==(const ReturnTypeEnum &v) const {
+      return value == v;
+    }
+    inline bool operator!=(const ReturnTypeEnum &v) const {
+      return value != v;
+    }
+  };
+  /*! \brief translate errno to return type */
+  inline static ReturnType Errno2Return() {
+    const int err = utils::Socket::GetLastError();
+    // no error at all, or the non-blocking call simply could not proceed yet
+    if (err == 0 || err == EAGAIN || err == EWOULDBLOCK) return kSuccess;
+#ifdef _WIN32
+    if (err == WSAEWOULDBLOCK) return kSuccess;
+    if (err == WSAECONNRESET) return kConnReset;
+#endif  // _WIN32
+    return (err == ECONNRESET) ? kConnReset : kSockError;
+  }
+  // link record to a neighbor
+  struct LinkRecord {
+   public:
+    // socket to get data from/to link
+    utils::TCPSocket sock;
+    // rank of the node in this link
+    int rank;
+    // size of data read from link
+    size_t size_read;
+    // size of data sent to the link
+    size_t size_write;
+    // pointer to buffer head
+    char *buffer_head;
+    // buffer size, in bytes
+    size_t buffer_size;
+    // constructor: start every scalar field in a defined state (rank -1 = unassigned)
+    LinkRecord(void)
+        : rank(-1), size_read(0), size_write(0), buffer_head(NULL), buffer_size(0) {
+    }
+    // initialize buffer
+    inline void InitBuffer(size_t type_nbytes, size_t count,
+                           size_t reduce_buffer_size) {
+      // number of 64-bit words needed to hold all elements, rounded up
+      const size_t nwords = (type_nbytes * count + 7) / 8;
+      buffer_.resize(std::min(reduce_buffer_size, nwords));
+      // usable bytes: round the capacity down to a multiple of type_nbytes
+      const size_t capacity = buffer_.size() * sizeof(uint64_t);
+      buffer_size = capacity / type_nbytes * type_nbytes;
+      utils::Assert(type_nbytes <= buffer_size,
+                    "too large type_nbytes=%lu, buffer_size=%lu",
+                    type_nbytes, buffer_size);
+      buffer_head = reinterpret_cast<char*>(BeginPtr(buffer_));
+    }
+    // reset the recv and sent size
+    inline void ResetSize(void) {
+      size_read = size_write = 0;
+    }
+    /*!
+     * \brief read data into ring-buffer, taking care not to override existing useful data
+     *  at positions after protect_start
+     * \param protect_start all data start from protect_start is still needed in buffer
+     *                      read shall not override this
+     * \param max_size_read maximum logical amount we can read, size_read cannot exceed this value
+     * \return the type of reading
+     */
+    inline ReturnType ReadToRingBuffer(size_t protect_start, size_t max_size_read) {
+      utils::Assert(buffer_head != NULL, "ReadToRingBuffer: buffer not allocated");
+      utils::Assert(size_read <= max_size_read, "ReadToRingBuffer: max_size_read check");
+      const size_t in_use = size_read - protect_start;
+      utils::Assert(in_use <= buffer_size, "Allreduce: boundary check");
+      const size_t pos = size_read % buffer_size;
+      // cap by: bytes still expected, free ring space, contiguous space before wrap
+      const size_t room = std::min(std::min(max_size_read - size_read,
+                                            buffer_size - in_use), buffer_size - pos);
+      if (room == 0) return kSuccess;
+      const ssize_t nrecv = sock.Recv(buffer_head + pos, room);
+      if (nrecv == -1) return Errno2Return();
+      // a zero-length read means the remote end disconnected
+      if (nrecv == 0) {
+        sock.Close(); return kRecvZeroLen;
+      }
+      size_read += static_cast<size_t>(nrecv);
+      return kSuccess;
+    }
+    /*!
+     * \brief read data into array,
+     * this function can not be used together with ReadToRingBuffer
+     * a link can either read into the ring buffer, or existing array
+     * \param max_size maximum size of array
+     * \return kSuccess if the read succeeded or would block, kRecvZeroLen or an error otherwise
+     */
+    inline ReturnType ReadToArray(void *recvbuf_, size_t max_size) {
+      const size_t remaining = max_size - size_read;
+      if (remaining == 0) return kSuccess;
+      const ssize_t nrecv = sock.Recv(static_cast<char*>(recvbuf_) + size_read, remaining);
+      if (nrecv == -1) return Errno2Return();
+      // a zero-length read means the remote end disconnected
+      if (nrecv == 0) {
+        sock.Close(); return kRecvZeroLen;
+      }
+      size_read += static_cast<size_t>(nrecv);
+      return kSuccess;
+    }
+    /*!
+     * \brief write data in array to sock
+     * \param sendbuf_ head of array
+     * \param max_size maximum size of array
+     * \return kSuccess if the write succeeded or would block, otherwise the error from Errno2Return
+     */
+    inline ReturnType WriteFromArray(const void *sendbuf_, size_t max_size) {
+      const char *src = static_cast<const char*>(sendbuf_) + size_write;
+      const ssize_t nsent = sock.Send(src, max_size - size_write);
+      if (nsent == -1) return Errno2Return();
+      size_write += static_cast<size_t>(nsent);
+      return kSuccess;
+    }
+
+   private:
+    // recv buffer to get data from child
+    // aligned with 64 bits, will be able to perform 64 bits operations freely
+    std::vector<uint64_t> buffer_;
+  };
+  /*!
+   * \brief simple data structure that works like a vector
+   *  but takes reference instead of space
+   */
+  struct RefLinkVector {
+    std::vector<LinkRecord*> plinks;
+    inline LinkRecord &operator[](size_t i) {
+      return *plinks[i];
+    }
+    inline size_t size(void) const {
+      return plinks.size();
+    }
+  };
+  /*!
+   * \brief initialize connection to the tracker
+   * \return a socket that initializes the connection
+   */
+  utils::TCPSocket ConnectTracker(void) const;
+  /*!
+   * \brief connect to the tracker to fix the missing links
+   *   this function is also used when the engine starts up
+   * \param cmd possible command to send to tracker
+   */
+  bool ReConnectLinks(const char *cmd = "start");
+  /*!
+   * \brief perform in-place allreduce, on sendrecvbuf, this function can fail, and will return the cause of failure
+   *
+   * NOTE on Allreduce:
+   *    A kSuccess return from TryAllreduce does NOT mean every node has successfully finished TryAllreduce.
+   *    It only means the current node got the correct result of Allreduce.
+   *    However, it does mean every node has finished the LAST call (instead of this one) of Allreduce/Bcast
+   *
+   * \param sendrecvbuf_ buffer for both sending and recving data
+   * \param type_nbytes the unit number of bytes the type have
+   * \param count number of elements to be reduced
+   * \param reducer reduce function
+   * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
+   * \sa ReturnType
+   */
+  ReturnType TryAllreduce(void *sendrecvbuf_,
+                          size_t type_nbytes,
+                          size_t count,
+                          ReduceFunction reducer);
+  /*!
+   * \brief broadcast data from root to all nodes, this function can fail,and will return the cause of failure
+   * \param sendrecvbuf_ buffer for both sending and receiving data
+   * \param size the size of the data to be broadcasted
+   * \param root the root worker id to broadcast the data
+   * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
+   * \sa ReturnType
+   */
+  ReturnType TryBroadcast(void *sendrecvbuf_, size_t size, int root);
+  /*!
+   * \brief perform in-place allreduce, on sendrecvbuf,
+   * this function implements tree-shape reduction
+   *
+   * \param sendrecvbuf_ buffer for both sending and recving data
+   * \param type_nbytes the unit number of bytes the type have
+   * \param count number of elements to be reduced
+   * \param reducer reduce function
+   * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
+   * \sa ReturnType
+   */
+  ReturnType TryAllreduceTree(void *sendrecvbuf_,
+                              size_t type_nbytes,
+                              size_t count,
+                              ReduceFunction reducer);
+  /*!
+   * \brief internal Allgather function, each node have a segment of data in the ring of sendrecvbuf,
+   *  the data provided by current node k is [slice_begin, slice_end),
+   *  the next node's segment must start with slice_end
+   *  after the call of Allgather, sendrecvbuf_ contains all the contents including all segments
+   *  use a ring based algorithm
+   *
+   * \param sendrecvbuf_ buffer for both sending and receiving data, it is a ring conceptually
+   * \param total_size total size of data to be gathered
+   * \param slice_begin beginning of the current slice
+   * \param slice_end end of the current slice
+   * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
+   * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
+   * \sa ReturnType
+   */
+  ReturnType TryAllgatherRing(void *sendrecvbuf_, size_t total_size,
+                              size_t slice_begin, size_t slice_end,
+                              size_t size_prev_slice);
+  /*!
+   * \brief perform in-place allreduce, reduce on the sendrecvbuf,
+   *
+   *  after the function, node k get k-th segment of the reduction result
+   *  the k-th segment is defined by [k * step, min((k + 1) * step,count) )
+   *  where step = ceil(count / world_size)
+   *
+   * \param sendrecvbuf_ buffer for both sending and recving data
+   * \param type_nbytes the unit number of bytes the type have
+   * \param count number of elements to be reduced
+   * \param reducer reduce function
+   * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
+   * \sa ReturnType, TryAllreduce
+   */
+  ReturnType TryReduceScatterRing(void *sendrecvbuf_,
+                                  size_t type_nbytes,
+                                  size_t count,
+                                  ReduceFunction reducer);
+  /*!
+   * \brief perform in-place allreduce, on sendrecvbuf
+   *  use a ring based algorithm, reduce-scatter + allgather
+   *
+   * \param sendrecvbuf_ buffer for both sending and recving data
+   * \param type_nbytes the unit number of bytes the type have
+   * \param count number of elements to be reduced
+   * \param reducer reduce function
+   * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
+   * \sa ReturnType
+   */
+  ReturnType TryAllreduceRing(void *sendrecvbuf_,
+                              size_t type_nbytes,
+                              size_t count,
+                              ReduceFunction reducer);
+  /*!
+   * \brief function used to report error when a link goes wrong
+   * \param link the pointer to the link who causes the error
+   * \param err the error type
+   */
+  inline ReturnType ReportError(LinkRecord *link, ReturnType err) {
+    err_link = link; return err;
+  }
+  //---- data structure related to model ----
+  // call sequence counter, records how many calls we made so far
+  // from last call to CheckPoint, LoadCheckPoint
+  int seq_counter;
+  // version number of model
+  int version_number;
+  // whether the job is running in hadoop
+  bool hadoop_mode;
+  //---- local data related to link ----
+  // index of parent link, can be -1, meaning this is root of the tree
+  int parent_index;
+  // rank of parent node, can be -1
+  int parent_rank;
+  // sockets of all links this connects to
+  std::vector<LinkRecord> all_links;
+  // used to record the link where things goes wrong
+  LinkRecord *err_link;
+  // all the links in the reduction tree connection
+  RefLinkVector tree_links;
+  // pointer to links in the ring
+  LinkRecord *ring_prev, *ring_next;
+  //----- meta information-----
+  // list of environment variables that are of possible interest
+  std::vector<std::string> env_vars;
+  // unique identifier of the possible job this process is doing
+  // used to assign ranks, optional, default to NULL
+  std::string task_id;
+  // uri of current host, to be set by Init
+  std::string host_uri;
+  // uri of tracker
+  std::string tracker_uri;
+  // role in dmlc jobs
+  std::string dmlc_role;
+  // port of tracker address
+  int tracker_port;
+  // port of slave process
+  int slave_port, nport_trial;
+  // reduce buffer size
+  size_t reduce_buffer_size;
+  // reduction method
+  int reduce_method;
+  // minimum count of cells to use ring based method
+  size_t reduce_ring_mincount;
+  // current rank
+  int rank;
+  // world size
+  int world_size;
+  // connect retry time
+  int connect_retry;
+  // enable bootstrap cache 0 false 1 true
+  bool rabit_bootstrap_cache = false;
+  // enable detailed logging
+  bool rabit_debug = false;
+  // by default, exit if a rabit worker does not recover within half an hour (1800 sec)
+  int timeout_sec = 1800;
+  // flag to enable rabit_timeout
+  bool rabit_timeout = false;
+  // enable TCP no-delay
+  bool rabit_enable_tcp_no_delay = false;
+};
+}  // namespace engine
+}  // namespace rabit
+#endif  // RABIT_ALLREDUCE_BASE_H_
diff --git a/src/allreduce_mock.h b/src/allreduce_mock.h
new file mode 100644
index 0000000..ab9f0e0
--- /dev/null
+++ b/src/allreduce_mock.h
@@ -0,0 +1,206 @@
+/*!
+ * Copyright by Contributors
+ * \file allreduce_mock.h
+ * \brief Mock test module of AllReduce engine,
+ * insert failures in certain call point, to test if the engine is robust to failure
+ *
+ * \author Ignacio Cano, Tianqi Chen
+ */
+#ifndef RABIT_ALLREDUCE_MOCK_H_
+#define RABIT_ALLREDUCE_MOCK_H_
+#include <vector>
+#include <map>
+#include <sstream>
+#include "rabit/internal/engine.h"
+#include "rabit/internal/timer.h"
+#include "allreduce_robust.h"
+
+namespace rabit {
+namespace engine {
+class AllreduceMock : public AllreduceRobust {
+ public:
+  // constructor; time_checkpoint is set here because CheckPoint reads it unconditionally
+  AllreduceMock(void) {
+    num_trial = 0;
+    force_local = 0;
+    report_stats = 0;
+    tsum_allreduce = tsum_allgather = 0.0;
+    time_checkpoint = utils::GetTime();
+  }
+  // destructor
+  virtual ~AllreduceMock(void) {}
+  virtual void SetParam(const char *name, const char *val) {
+    AllreduceRobust::SetParam(name, val);
+    // mock-only parameters; rabit_num_trial and DMLC_NUM_ATTEMPT set the same field
+    if (strcmp(name, "rabit_num_trial") == 0) num_trial = atoi(val);
+    if (strcmp(name, "DMLC_NUM_ATTEMPT") == 0) num_trial = atoi(val);
+    if (strcmp(name, "report_stats") == 0) report_stats = atoi(val);
+    if (strcmp(name, "force_local") == 0) force_local = atoi(val);
+    if (strcmp(name, "mock") == 0) {
+      MockKey key;
+      const int nfield = sscanf(val, "%d,%d,%d,%d",
+                                &key.rank, &key.version, &key.seqno, &key.ntrial);
+      utils::Check(nfield == 4, "invalid mock parameter");
+      mock_map[key] = 1;
+    }
+  }
+  virtual void Allreduce(void *sendrecvbuf_,
+                         size_t type_nbytes,
+                         size_t count,
+                         ReduceFunction reducer,
+                         PreprocFunction prepare_fun,
+                         void *prepare_arg,
+                         const char* _file = _FILE,
+                         const int _line = _LINE,
+                         const char* _caller = _CALLER) {
+    const MockKey key(rank, version_number, seq_counter, num_trial);
+    this->Verify(key, "AllReduce");
+    const double begin = utils::GetTime();
+    AllreduceRobust::Allreduce(sendrecvbuf_, type_nbytes, count, reducer,
+                               prepare_fun, prepare_arg, _file, _line, _caller);
+    tsum_allreduce += utils::GetTime() - begin;
+  }
+  virtual void Allgather(void *sendrecvbuf,
+                         size_t total_size,
+                         size_t slice_begin,
+                         size_t slice_end,
+                         size_t size_prev_slice,
+                         const char* _file = _FILE,
+                         const int _line = _LINE,
+                         const char* _caller = _CALLER) {
+    const MockKey key(rank, version_number, seq_counter, num_trial);
+    this->Verify(key, "Allgather");
+    const double begin = utils::GetTime();
+    AllreduceRobust::Allgather(sendrecvbuf, total_size, slice_begin, slice_end,
+                               size_prev_slice, _file, _line, _caller);
+    tsum_allgather += utils::GetTime() - begin;
+  }
+  virtual void Broadcast(void *sendrecvbuf_, size_t total_size, int root,
+                         const char* _file = _FILE, const int _line = _LINE,
+                         const char* _caller = _CALLER) {
+    const MockKey key(rank, version_number, seq_counter, num_trial);
+    this->Verify(key, "Broadcast");
+    AllreduceRobust::Broadcast(sendrecvbuf_, total_size, root, _file, _line, _caller);
+  }
+  virtual int LoadCheckPoint(Serializable *global_model,
+                             Serializable *local_model) {
+    tsum_allreduce = 0.0;
+    tsum_allgather = 0.0;
+    time_checkpoint = utils::GetTime();
+    if (force_local != 0) {
+      // both models travel in the local-model argument, the global one is a no-op
+      DummySerializer dum;
+      ComboSerializer com(global_model, local_model);
+      return AllreduceRobust::LoadCheckPoint(&dum, &com);
+    }
+    return AllreduceRobust::LoadCheckPoint(global_model, local_model);
+  }
+  virtual void CheckPoint(const Serializable *global_model,
+                          const Serializable *local_model) {
+    this->Verify(MockKey(rank, version_number, seq_counter, num_trial), "CheckPoint");
+    const double begin = utils::GetTime();
+    const double since_last = begin - time_checkpoint;
+    if (force_local != 0) {
+      DummySerializer dum;
+      ComboSerializer com(global_model, local_model);
+      AllreduceRobust::CheckPoint(&dum, &com);
+    } else {
+      AllreduceRobust::CheckPoint(global_model, local_model);
+    }
+    time_checkpoint = utils::GetTime();
+    const double elapsed = utils::GetTime() - begin;
+    if (report_stats != 0 && rank == 0) {
+      std::stringstream msg;
+      msg << "[v" << version_number << "] global_size=" << global_checkpoint.length()
+          << ",local_size=" << (local_chkpt[0].length() + local_chkpt[1].length())
+          << ",check_tcost="<< elapsed <<" sec"
+          << ",allreduce_tcost=" << tsum_allreduce << " sec"
+          << ",allgather_tcost=" << tsum_allgather << " sec"
+          << ",between_chpt=" << since_last << "sec\n";
+      this->TrackerPrint(msg.str());
+    }
+    tsum_allreduce = 0.0;
+    tsum_allgather = 0.0;
+  }
+
+  virtual void LazyCheckPoint(const Serializable *global_model) {
+    this->Verify(MockKey(rank, version_number, seq_counter, num_trial), "LazyCheckPoint");
+    AllreduceRobust::LazyCheckPoint(global_model);
+  }
+
+ protected:
+  // force checkpoint to local
+  int force_local;
+  // whether report statistics
+  int report_stats;
+  // sum of allreduce
+  double tsum_allreduce;
+  // sum of allgather
+  double tsum_allgather;
+  double time_checkpoint;
+
+ private:
+  struct DummySerializer : public Serializable {
+    virtual void Load(Stream *fi) {
+    }
+    virtual void Save(Stream *fo) const {
+    }
+  };
+  struct ComboSerializer : public Serializable {
+    Serializable *lhs;
+    Serializable *rhs;
+    const Serializable *c_lhs;
+    const Serializable *c_rhs;
+    ComboSerializer(Serializable *lhs, Serializable *rhs)
+        : lhs(lhs), rhs(rhs), c_lhs(lhs), c_rhs(rhs) {
+    }
+    ComboSerializer(const Serializable *lhs, const Serializable *rhs)
+        : lhs(NULL), rhs(NULL), c_lhs(lhs), c_rhs(rhs) {
+    }
+    virtual void Load(Stream *fi) {
+      if (lhs != NULL) lhs->Load(fi);
+      if (rhs != NULL) rhs->Load(fi);
+    }
+    virtual void Save(Stream *fo) const {
+      if (c_lhs != NULL) c_lhs->Save(fo);
+      if (c_rhs != NULL) c_rhs->Save(fo);
+    }
+  };
+  // key to identify the mock stage
+  struct MockKey {
+    int rank;
+    int version;
+    int seqno;
+    int ntrial;
+    MockKey(void) {}
+    MockKey(int rank, int version, int seqno, int ntrial)
+        : rank(rank), version(version), seqno(seqno), ntrial(ntrial) {}
+    inline bool operator==(const MockKey &other) const {
+      if (rank != other.rank) return false;
+      if (version != other.version) return false;
+      if (seqno != other.seqno) return false;
+      return ntrial == other.ntrial;
+    }
+    inline bool operator<(const MockKey &other) const {
+      if (rank != other.rank) return rank < other.rank;
+      if (version != other.version) return version < other.version;
+      if (seqno != other.seqno) return seqno < other.seqno;
+      return ntrial < other.ntrial;
+    }
+  };
+  // number of failure trials
+  int num_trial;
+  // record all mock actions
+  std::map<MockKey, int> mock_map;
+  // used to generate all kinds of exceptions
+  inline void Verify(const MockKey &key, const char *name) {
+    // only keys present in mock_map trigger an injected failure
+    if (mock_map.find(key) == mock_map.end()) return;
+    num_trial += 1;
+    // data processing frameworks run on a shared process
+    _error("[%d]@@@Hit Mock Error:%s ", rank, name);
+  }
+};
+}  // namespace engine
+}  // namespace rabit
+#endif  // RABIT_ALLREDUCE_MOCK_H_
diff --git a/src/allreduce_robust-inl.h b/src/allreduce_robust-inl.h
new file mode 100644
index 0000000..7baa14b
--- /dev/null
+++ b/src/allreduce_robust-inl.h
@@ -0,0 +1,169 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file allreduce_robust-inl.h
+ * \brief implementation of inline template function in AllreduceRobust
+ *
+ * \author Tianqi Chen
+ */
+#ifndef RABIT_ALLREDUCE_ROBUST_INL_H_
+#define RABIT_ALLREDUCE_ROBUST_INL_H_
+#include <vector>
+
+namespace rabit {
+namespace engine {
+/*!
+ * \brief run message passing algorithm on the allreduce tree
+ *        the result is edge message stored in p_edge_in and p_edge_out
+ * \param node_value the value associated with current node
+ * \param p_edge_in used to store input message from each of the edge
+ * \param p_edge_out used to store output message from each of the edge
+ * \param func a function that defines the message passing rule
+ *        Parameters of func:
+ *           - node_value same as node_value in the main function
+ *           - edge_in the array of input messages from each edge,
+ *                     this includes the output edge, which should be excluded
+ *           - out_index the index of the output edge, the function should
+ *                       exclude the output edge when computing the message passing value
+ *        Return of func:
+ *           the function returns the output message based on the input message and node_value
+ *
+ * \tparam EdgeType type of edge message, must be simple struct
+ * \tparam NodeType type of node value
+ */
+template<typename NodeType, typename EdgeType>
+inline AllreduceRobust::ReturnType
+AllreduceRobust::MsgPassing(const NodeType &node_value,
+                            std::vector<EdgeType> *p_edge_in,
+                            std::vector<EdgeType> *p_edge_out,
+                            EdgeType(*func)
+                            (const NodeType &node_value,
+                             const std::vector<EdgeType> &edge_in,
+                             size_t out_index)) {
+  RefLinkVector &links = tree_links;
+  if (links.size() == 0) return kSuccess;
+  // number of links
+  const int nlink = static_cast<int>(links.size());
+  // reset the read/write byte counters of every link
+  for (int i = 0; i < nlink; ++i) {
+    links[i].ResetSize();
+  }
+  std::vector<EdgeType> &edge_in = *p_edge_in;
+  std::vector<EdgeType> &edge_out = *p_edge_out;
+  edge_in.resize(nlink);
+  edge_out.resize(nlink);
+  // stages in the process
+  // 0: recv messages from children
+  // 1: send message to parent
+  // 2: recv message from parent
+  // 3: send message to children
+  int stage = 0;
+  // leaf node: the parent is the only link, so skip stage 0 and go to stage 1
+  if (nlink == static_cast<int>(parent_index != -1)) {
+    utils::Assert(parent_index == 0, "parent must be 0");
+    edge_out[parent_index] = func(node_value, edge_in, parent_index);
+    stage = 1;
+  }
+  // loop until every outgoing message has been written
+  while (true) {
+    // a node with no parent must never be in stage 1 or 2
+    if (parent_index == -1) {
+      utils::Assert(stage != 2 && stage != 1, "invalie stage id");
+    }
+    // poll helper, rebuilt each iteration to watch only what the stage needs
+    utils::PollHelper watcher;
+    bool done = (stage == 3);
+    for (int i = 0; i < nlink; ++i) {
+      watcher.WatchException(links[i].sock);
+      switch (stage) {
+        case 0:
+          if (i != parent_index && links[i].size_read != sizeof(EdgeType)) {
+            watcher.WatchRead(links[i].sock);
+          }
+          break;
+        case 1:
+          if (i == parent_index) {
+            watcher.WatchWrite(links[i].sock);
+          }
+          break;
+        case 2:
+          if (i == parent_index) {
+            watcher.WatchRead(links[i].sock);
+          }
+          break;
+        case 3:
+          if (i != parent_index && links[i].size_write != sizeof(EdgeType)) {
+            watcher.WatchWrite(links[i].sock);
+            done = false;
+          }
+          break;
+        default: utils::Error("invalid stage");
+      }
+    }
+    // in stage 3 with nothing left to write: all messages are out
+    if (done) break;
+    watcher.Poll();
+    // exception handling
+    for (int i = 0; i < nlink; ++i) {
+      // receive OOB message from some link
+      if (watcher.CheckExcept(links[i].sock)) {
+        return ReportError(&links[i], kGetExcept);
+      }
+    }
+    if (stage == 0) {
+      bool finished = true;
+      // read data from children
+      for (int i = 0; i < nlink; ++i) {
+        if (i != parent_index) {
+          if (watcher.CheckRead(links[i].sock)) {
+            ReturnType ret = links[i].ReadToArray(&edge_in[i], sizeof(EdgeType));
+            if (ret != kSuccess) return ReportError(&links[i], ret);
+          }
+          if (links[i].size_read != sizeof(EdgeType)) finished = false;
+        }
+      }
+      // if no parent, jump to stage 3, otherwise do stage 1
+      if (finished) {
+        if (parent_index != -1) {
+          edge_out[parent_index] = func(node_value, edge_in, parent_index);
+          stage = 1;
+        } else {
+          for (int i = 0; i < nlink; ++i) {
+            edge_out[i] = func(node_value, edge_in, i);
+          }
+          stage = 3;
+        }
+      }
+    }
+    if (stage == 1) {
+      const int pid = this->parent_index;
+      utils::Assert(pid != -1, "MsgPassing invalid stage");
+      ReturnType ret = links[pid].WriteFromArray(&edge_out[pid], sizeof(EdgeType));
+      if (ret != kSuccess) return ReportError(&links[pid], ret);
+      if (links[pid].size_write == sizeof(EdgeType)) stage = 2;
+    }
+    if (stage == 2) {
+      const int pid = this->parent_index;
+      utils::Assert(pid != -1, "MsgPassing invalid stage");
+      ReturnType ret = links[pid].ReadToArray(&edge_in[pid], sizeof(EdgeType));
+      if (ret != kSuccess) return ReportError(&links[pid], ret);
+      if (links[pid].size_read == sizeof(EdgeType)) {
+        for (int i = 0; i < nlink; ++i) {
+          if (i != pid) edge_out[i] = func(node_value, edge_in, i);
+        }
+        stage = 3;
+      }
+    }
+    if (stage == 3) {
+      for (int i = 0; i < nlink; ++i) {
+        if (i != parent_index && links[i].size_write != sizeof(EdgeType)) {
+          ReturnType ret = links[i].WriteFromArray(&edge_out[i], sizeof(EdgeType));
+          if (ret != kSuccess) return ReportError(&links[i], ret);
+        }
+      }
+    }
+  }
+  return kSuccess;
+}
+}  // namespace engine
+}  // namespace rabit
+#endif  // RABIT_ALLREDUCE_ROBUST_INL_H_
diff --git a/src/allreduce_robust.cc b/src/allreduce_robust.cc
new file mode 100644
index 0000000..8ae1acc
--- /dev/null
+++ b/src/allreduce_robust.cc
@@ -0,0 +1,1589 @@
+/*!
+ *  Copyright (c) 2014-2019 by Contributors
+ * \file allreduce_robust.cc
+ * \brief Robust implementation of Allreduce
+ *
+ * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
+ */
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
+#include <chrono>
+#include <thread>
+#include <limits>
+#include <utility>
+#include "rabit/internal/io.h"
+#include "rabit/internal/timer.h"
+#include "rabit/internal/utils.h"
+#include "rabit/internal/engine.h"
+#include "rabit/internal/rabit-inl.h"
+#include "allreduce_robust.h"
+
+#undef _assert
+
+namespace rabit {
+namespace engine {
+
+AllreduceRobust::AllreduceRobust(void) {
+  // replica configuration
+  num_global_replica = 5;
+  num_local_replica = 0;
+  default_local_replica = 2;
+  // -1 marks the local model mode as not decided yet, see LocalModelCheck
+  use_local_model = -1;
+  // sequence counters, checkpoint version and recovery count start from zero
+  seq_counter = cur_cache_seq = local_chkpt_version = recover_counter = 0;
+  result_buffer_round = 1;
+  global_lazycheck = NULL;
+  checkpoint_loaded = false;
+  env_vars.push_back("rabit_global_replica");
+  env_vars.push_back("rabit_local_replica");
+}
+// bring up the base engine, announce the experimental bootstrap cache if enabled
+// and derive result_buffer_round; returns false when the base engine fails to init
+bool AllreduceRobust::Init(int argc, char* argv[]) {
+  if (!AllreduceBase::Init(argc, argv)) return false;
+  // chenqin: alert user opted in experimental feature.
+  if (rabit_bootstrap_cache) {
+    utils::HandleLogInfo("[EXPERIMENTAL] bootstrap cache has been enabled\n");
+  }
+  checkpoint_loaded = false;
+  // zero global replicas is encoded as round -1,
+  // otherwise the round is world_size / num_global_replica, but never below 1
+  result_buffer_round = (num_global_replica == 0)
+    ? -1
+    : std::max(world_size / num_global_replica, 1);
+  return true;
+}
+/*! \brief shutdown the engine, \return false if a std::exception is caught on the way */
+bool AllreduceRobust::Shutdown(void) {
+  try {
+    // need to sync the exec before we shutdown, do a pseudo check point
+    // execute checkpoint, note: when checkpoint existing, load will not happen
+    _assert(RecoverExec(NULL, 0, ActionSummary::kCheckPoint, ActionSummary::kSpecialOp,
+      cur_cache_seq), "Shutdown: check point must return true");
+    // reset result buffer, bootstrap cache and lookup buffer together with their counters
+    resbuf.Clear(); seq_counter = 0;
+    cachebuf.Clear(); cur_cache_seq = 0;
+    lookupbuf.Clear();
+    // execute check ack step, load happens here
+    _assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck,
+      ActionSummary::kSpecialOp, cur_cache_seq), "Shutdown: check ack must return true");
+// travis ci workaround: only the osx test hangs here
+#if defined (__APPLE__)
+    sleep(1);
+#endif
+    shutdown_timeout = true;  // lets a running timeout task return, see CheckAndRecover
+    if (rabit_timeout_task.valid()) {
+      rabit_timeout_task.wait();
+      _assert(rabit_timeout_task.get(), "expect timeout task return\n");
+    }
+    return AllreduceBase::Shutdown();
+  } catch (const std::exception& e) {
+    fprintf(stderr, "%s\n", e.what());
+    return false;
+  }
+}
+
+/*!
+ * \brief set parameters to the engine; the base engine sees every parameter first
+ * \param name parameter name
+ * \param val parameter value, replica counts are converted with atoi
+ */
+void AllreduceRobust::SetParam(const char *name, const char *val) {
+  AllreduceBase::SetParam(name, val);
+  const bool is_global_replica = (strcmp(name, "rabit_global_replica") == 0);
+  const bool is_local_replica = (strcmp(name, "rabit_local_replica") == 0);
+  if (is_global_replica) num_global_replica = atoi(val);
+  if (is_local_replica) num_local_replica = atoi(val);
+}
+
+/*!
+ * \brief append a result to the bootstrap cache and record its key in the lookup buffer
+ *  the payload goes to cachebuf and the NUL-terminated key to lookupbuf,
+ *  both stored under the current cur_cache_seq, which is then advanced by one
+ * \param key signature identifying the cached collective call
+ * \param buf data to be cached
+ * \param type_nbytes size in bytes of one element
+ * \param count number of elements, type_nbytes * count must be positive
+ * \return always 0
+ */
+int AllreduceRobust::SetBootstrapCache(const std::string &key, const void *buf,
+  const size_t type_nbytes, const size_t count) {
+  // we should consider way to support duplicated signatures
+  // https://github.com/dmlc/xgboost/issues/5012
+  const size_t total_nbytes = type_nbytes * count;
+  _assert(total_nbytes > 0, "can't set empty cache");
+  void* temp = cachebuf.AllocTemp(type_nbytes, count);
+  cachebuf.PushTemp(cur_cache_seq, type_nbytes, count);
+  std::memcpy(temp, buf, total_nbytes);
+  // store the key together with its terminating NUL byte
+  const size_t key_nbytes = strlen(key.c_str()) + 1;
+  void* name = lookupbuf.AllocTemp(key_nbytes, 1);
+  lookupbuf.PushTemp(cur_cache_seq, key_nbytes, 1);
+  std::memcpy(name, key.c_str(), key_nbytes);
+  cur_cache_seq += 1;
+  return 0;
+}
+
+// fetch a bootstrap cache entry by key into buf; returns 0 on a hit, -1 otherwise
+int AllreduceRobust::GetBootstrapCache(const std::string &key, void* buf,
+  const size_t type_nbytes, const size_t count) {
+  // as requester sync with rest of nodes on latest cache content
+  const bool synced = RecoverExec(NULL, 0, ActionSummary::kLoadBootstrapCache,
+    seq_counter, cur_cache_seq);
+  if (!synced) return -1;
+  const char* wanted = key.c_str();
+  const size_t wanted_nbytes = strlen(wanted) + 1;
+  int index = -1;
+  for (int seq = 0; seq < cur_cache_seq && index == -1; ++seq) {
+    size_t nsize = 0;
+    void* name = lookupbuf.Query(seq, &nsize);
+    if (nsize != wanted_nbytes) continue;
+    if (strcmp(static_cast<const char*>(name), wanted) == 0) index = seq;
+  }
+  // no entry carries this key
+  if (index == -1) return -1;
+  const size_t expected_nbytes = type_nbytes * count;
+  size_t siz = 0;
+  void* temp = cachebuf.Query(index, &siz);
+  utils::Assert(cur_cache_seq > index, "cur_cache_seq is smaller than lookup cache seq index");
+  utils::Assert(siz == expected_nbytes, "cache size stored expected to be same as requested");
+  utils::Assert(siz > 0, "cache size should be greater than 0");
+  std::memcpy(buf, temp, expected_nbytes);
+  return 0;
+}
+
+/*!
+ * \brief Allgather function, each node has a segment of data in the ring of sendrecvbuf,
+ *  the data provided by current node k is [slice_begin, slice_end),
+ *  the next node's segment must start with slice_end
+ *  after the call of Allgather, sendrecvbuf contains all the contents including all segments
+ *  use a ring based algorithm
+ *
+ * \param sendrecvbuf buffer for both sending and receiving data, it is a ring conceptually
+ * \param total_size total size of data to be gathered
+ * \param slice_begin beginning of the current slice
+ * \param slice_end end of the current slice
+ * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
+ * \param _file caller file name used to generate unique cache key
+ * \param _line caller line number used to generate unique cache key
+ * \param _caller caller function name used to generate unique cache key
+ */
+void AllreduceRobust::Allgather(void *sendrecvbuf,
+                                    size_t total_size,
+                                    size_t slice_begin,
+                                    size_t slice_end,
+                                    size_t size_prev_slice,
+                                    const char* _file,
+                                    const int _line,
+                                    const char* _caller) {
+  if (world_size == 1 || world_size == -1) return;  // skip action in single node
+  // generate unique allgather signature: file::line::caller#total_size
+  std::string key = std::string(_file) + "::" + std::to_string(_line) + "::"
+    + std::string(_caller) + "#" +std::to_string(total_size);
+
+  // try fetch bootstrap allgather results from cache
+  if (!checkpoint_loaded && rabit_bootstrap_cache &&
+    GetBootstrapCache(key, sendrecvbuf, total_size, 1) != -1) return;
+
+  double start = utils::GetTime();
+  bool recovered = RecoverExec(sendrecvbuf, total_size, 0, seq_counter, cur_cache_seq);
+  // the last stored result is kept only if LastSeqNo % round equals rank % round
+  if (resbuf.LastSeqNo() != -1 &&
+    (result_buffer_round == -1 ||
+      resbuf.LastSeqNo() % result_buffer_round != rank % result_buffer_round)) {
+    resbuf.DropLast();
+  }
+  // the ring gather runs on a temp slot of resbuf and is copied back on success
+  void *temp = resbuf.AllocTemp(total_size, 1);
+  while (true) {
+    if (recovered) {
+      std::memcpy(temp, sendrecvbuf, total_size); break;
+    } else {
+      std::memcpy(temp, sendrecvbuf, total_size);
+      if (CheckAndRecover(TryAllgatherRing(temp, total_size,
+                                           slice_begin, slice_end, size_prev_slice))) {
+        std::memcpy(sendrecvbuf, temp, total_size); break;
+      } else {
+        recovered = RecoverExec(sendrecvbuf, total_size, 0, seq_counter, cur_cache_seq);
+      }
+    }
+  }
+  double delta = utils::GetTime() - start;
+  // log allgather latency
+  if (rabit_debug) {
+    utils::HandleLogInfo("[%d] allgather (%s) finished version %d, seq %d, take %f seconds\n",
+      rank, key.c_str(), version_number, seq_counter, delta);
+  }
+
+  // bootstrap results (checkpoint not loaded yet, cache enabled) go to the cache, others to resbuf
+  if (checkpoint_loaded || !rabit_bootstrap_cache) {
+    resbuf.PushTemp(seq_counter, total_size, 1);
+    seq_counter += 1;
+  } else {
+    SetBootstrapCache(key, sendrecvbuf, total_size, 1);
+  }
+}
+
+/*!
+ * \brief perform in-place allreduce, on sendrecvbuf_; this function is NOT thread-safe
+ * \param sendrecvbuf_ buffer for both sending and receiving data
+ * \param type_nbytes size in bytes of one element
+ * \param count number of elements to be reduced
+ * \param reducer reduce function
+ * \param prepare_fun Lazy preprocessing function, lazy prepare_fun(prepare_arg) will be called
+ *                    by the function before performing Allreduce, to initialize the data in
+ *                    sendrecvbuf_. If the result of Allreduce can be recovered directly,
+ *                    then prepare_fun will NOT be called
+ * \param prepare_arg argument passed into the lazy preprocessing function
+ * \param _file caller file name used to generate unique cache key
+ * \param _line caller line number used to generate unique cache key
+ * \param _caller caller function name used to generate unique cache key
+ */
+void AllreduceRobust::Allreduce(void *sendrecvbuf_,
+                                size_t type_nbytes,
+                                size_t count,
+                                ReduceFunction reducer,
+                                PreprocFunction prepare_fun,
+                                void *prepare_arg,
+                                const char* _file,
+                                const int _line,
+                                const char* _caller) {
+  // skip action in single node, the lazy preprocessing still runs
+  if (world_size == 1 || world_size == -1) {
+    if (prepare_fun != NULL) prepare_fun(prepare_arg);
+    return;
+  }
+
+  // generate unique allreduce signature: file::line::caller#type_nbytesxcount
+  std::string key = std::string(_file) + "::" + std::to_string(_line) + "::"
+    + std::string(_caller) + "#" +std::to_string(type_nbytes) + "x" + std::to_string(count);
+
+  // try fetch bootstrap allreduce results from cache
+  if (!checkpoint_loaded && rabit_bootstrap_cache &&
+    GetBootstrapCache(key, sendrecvbuf_, type_nbytes, count) != -1) return;
+
+  double start = utils::GetTime();
+  bool recovered = RecoverExec(sendrecvbuf_, type_nbytes * count, 0, seq_counter, cur_cache_seq);
+  // the last stored result is kept only if LastSeqNo % round equals rank % round
+  if (resbuf.LastSeqNo() != -1 &&
+    (result_buffer_round == -1 ||
+      resbuf.LastSeqNo() % result_buffer_round != rank % result_buffer_round)) {
+    resbuf.DropLast();
+  }
+  // the input only needs to be prepared when the result was not recovered
+  if (!recovered && prepare_fun != NULL) prepare_fun(prepare_arg);
+  void *temp = resbuf.AllocTemp(type_nbytes, count);
+  while (true) {
+    if (recovered) {
+      std::memcpy(temp, sendrecvbuf_, type_nbytes * count); break;
+    } else {
+      std::memcpy(temp, sendrecvbuf_, type_nbytes * count);
+      if (CheckAndRecover(TryAllreduce(temp, type_nbytes, count, reducer))) {
+        std::memcpy(sendrecvbuf_, temp, type_nbytes * count); break;
+      } else {
+        recovered = RecoverExec(sendrecvbuf_, type_nbytes * count, 0, seq_counter, cur_cache_seq);
+      }
+    }
+  }
+  double delta = utils::GetTime() - start;
+  // log allreduce latency
+  if (rabit_debug) {
+    utils::HandleLogInfo("[%d] allreduce (%s) finished version %d, seq %d, take %f seconds\n",
+      rank, key.c_str(), version_number, seq_counter, delta);
+  }
+
+  // bootstrap results (checkpoint not loaded yet, cache enabled) go to the cache, others to resbuf
+  if (checkpoint_loaded || !rabit_bootstrap_cache) {
+    resbuf.PushTemp(seq_counter, type_nbytes, count);
+    seq_counter += 1;
+  } else {
+    SetBootstrapCache(key, sendrecvbuf_, type_nbytes, count);
+  }
+}
+/*!
+ * \brief broadcast data from root to all nodes
+ * \param sendrecvbuf_ buffer for both sending and receiving data
+ * \param total_size the size of the data to be broadcasted
+ * \param root the root worker id to broadcast the data
+ * \param _file caller file name used to generate unique cache key
+ * \param _line caller line number used to generate unique cache key
+ * \param _caller caller function name used to generate unique cache key
+ */
+void AllreduceRobust::Broadcast(void *sendrecvbuf_, size_t total_size, int root,
+                                const char* _file,
+                                const int _line,
+                                const char* _caller) {
+  // skip action in single node
+  if (world_size == 1 || world_size == -1) return;
+  // generate unique cache signature: file::line::caller#total_size@root
+  std::string key = std::string(_file) + "::" + std::to_string(_line) + "::"
+    + std::string(_caller) + "#" +std::to_string(total_size) + "@" + std::to_string(root);
+  // try fetch bootstrap broadcast results from cache
+  if (!checkpoint_loaded && rabit_bootstrap_cache &&
+  GetBootstrapCache(key, sendrecvbuf_, total_size, 1) != -1) return;
+  double start = utils::GetTime();
+  bool recovered = RecoverExec(sendrecvbuf_, total_size, 0, seq_counter, cur_cache_seq);
+  // now we are free to remove the last result, if any
+  if (resbuf.LastSeqNo() != -1 &&
+      (result_buffer_round == -1 ||
+       resbuf.LastSeqNo() % result_buffer_round != rank % result_buffer_round)) {
+    resbuf.DropLast();
+  }
+  void *temp = resbuf.AllocTemp(1, total_size);
+  while (true) {
+    if (recovered) {
+      std::memcpy(temp, sendrecvbuf_, total_size); break;
+    } else {
+      if (CheckAndRecover(TryBroadcast(sendrecvbuf_, total_size, root))) {
+        std::memcpy(temp, sendrecvbuf_, total_size); break;
+      } else {
+        recovered = RecoverExec(sendrecvbuf_, total_size, 0, seq_counter, cur_cache_seq);
+      }
+    }
+  }
+
+  double delta = utils::GetTime() - start;
+  // log broadcast latency
+  if (rabit_debug) {
+    utils::HandleLogInfo(
+      "[%d] broadcast (%s) root %d finished version %d,seq %d, take %f seconds\n",
+      rank, key.c_str(), root, version_number, seq_counter, delta);
+  }
+  // bootstrap results (checkpoint not loaded yet, cache enabled) go to the cache, others to resbuf
+  if (checkpoint_loaded || !rabit_bootstrap_cache) {
+    resbuf.PushTemp(seq_counter, 1, total_size);
+    seq_counter += 1;
+  } else {
+    SetBootstrapCache(key, sendrecvbuf_, total_size, 1);
+  }
+}
+/*!
+ * \brief load latest check point
+ * \param global_model pointer to the globally shared model/state
+ *   when calling this function, the caller needs to guarantee that global_model
+ *   is the same in all nodes
+ * \param local_model pointer to local model, that is specific to current node/rank
+ *   this can be NULL when no local model is needed
+ *
+ * \return the version number of check point loaded
+ *     if returned version == 0, this means no model has been CheckPointed
+ *     the p_model is not touched, user should do necessary initialization by themselves
+ *
+ *   Common usage example:
+ *      int iter = rabit::LoadCheckPoint(&model);
+ *      if (iter == 0) model.InitParameters();
+ *      for (i = iter; i < max_iter; ++i) {
+ *        do many things, include allreduce
+ *        rabit::CheckPoint(model);
+ *      }
+ *
+ * \sa CheckPoint, VersionNumber
+ */
+int AllreduceRobust::LoadCheckPoint(Serializable *global_model,
+                                    Serializable *local_model) {
+  // from now on collectives are no longer treated as bootstrap calls
+  checkpoint_loaded = true;
+  // skip action in single node
+  if (world_size == 1) return 0;
+  this->LocalModelCheck(local_model != NULL);
+  if (num_local_replica == 0) {
+    utils::Check(local_model == NULL,
+                 "need to set rabit_local_replica larger than 1 to checkpoint local_model");
+  }
+  double start = utils::GetTime();
+  // check if we succeed
+  if (RecoverExec(NULL, 0, ActionSummary::kLoadCheck, ActionSummary::kSpecialOp, cur_cache_seq)) {
+    int nlocal = std::max(static_cast<int>(local_rptr[local_chkpt_version].size()) - 1, 0);
+    if (local_model != NULL) {
+      if (nlocal == num_local_replica + 1) {
+        // load in local model
+        utils::MemoryFixSizeBuffer fs(BeginPtr(local_chkpt[local_chkpt_version]),
+                                      local_rptr[local_chkpt_version][1]);
+        local_model->Load(&fs);
+      } else {
+        _assert(nlocal == 0, "[%d] local model inconsistent, nlocal=%d", rank, nlocal);
+      }
+    }
+    // reset result buffer
+    resbuf.Clear(); seq_counter = 0;
+    // load from buffer
+    utils::MemoryBufferStream fs(&global_checkpoint);
+    if (global_checkpoint.length() == 0) {
+      version_number = 0;
+    } else {
+      _assert(fs.Read(&version_number, sizeof(version_number)) != 0,
+                    "read in version number");
+      global_model->Load(&fs);
+      _assert(local_model == NULL || nlocal == num_local_replica + 1,
+                    "local model inconsistent, nlocal=%d", nlocal);
+    }
+    // run another phase of check ack, if recovered from data
+    _assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck,
+      ActionSummary::kSpecialOp, cur_cache_seq), "check ack must return true");
+
+    if (!RecoverExec(NULL, 0, ActionSummary::kLoadBootstrapCache, seq_counter, cur_cache_seq)) {
+      utils::Printf("no need to load cache\n");
+    }
+    double delta = utils::GetTime() - start;
+
+    // log loadcheckpoint latency, length() is a size_t and therefore printed with %zu
+    if (rabit_debug) {
+      utils::HandleLogInfo("[%d] loadcheckpoint size %zu finished version %d, "
+                         "seq %d, take %f seconds\n",
+                         rank, global_checkpoint.length(),
+                         version_number, seq_counter, delta);
+    }
+    return version_number;
+  } else {
+    // log job fresh start
+    if (rabit_debug) utils::HandleLogInfo("[%d] loadcheckpoint reset\n", rank);
+
+    // reset result buffer
+    resbuf.Clear(); seq_counter = 0; version_number = 0;
+    // nothing loaded, a fresh start, everyone init model
+    return version_number;
+  }
+}
+/*!
+ * \brief internal consistency check function,
+ *  use check to ensure user always call CheckPoint/LoadCheckPoint
+ *  with or without local but not both, this function will set the appropriate settings
+ *  in the first call of LoadCheckPoint/CheckPoint
+ *
+ * \param with_local whether the user calls CheckPoint with local model
+ */
+void AllreduceRobust::LocalModelCheck(bool with_local) {
+  if (use_local_model != -1) {
+    // mode was fixed by an earlier call, later calls must agree with it
+    utils::Check(use_local_model == static_cast<int>(with_local),
+                 "Can only call Checkpoint/LoadCheckPoint always with "
+                 "or without local_model, but not mixed case");
+    return;
+  }
+  // first call: remember the mode and settle the local replica count
+  use_local_model = with_local ? 1 : 0;
+  if (!with_local) {
+    num_local_replica = 0;
+  } else if (num_local_replica == 0) {
+    // fall back to the default when no replica count was configured
+    num_local_replica = default_local_replica;
+  }
+}
+/*!
+ * \brief internal implementation of checkpoint, support both lazy and normal way
+ *
+ * \param global_model pointer to the globally shared model/state
+ *   when calling this function, the caller needs to guarantee that global_model
+ *   is the same in all nodes
+ * \param local_model pointer to local model, that is specific to current node/rank
+ *   this can be NULL when no local state is needed
+ * \param lazy_checkpt whether the action is lazy checkpoint
+ *
+ * \sa CheckPoint, LazyCheckPoint
+ */
+void AllreduceRobust::CheckPoint_(const Serializable *global_model,
+                                  const Serializable *local_model,
+                                  bool lazy_checkpt) {
+  // never do check point in single machine mode, only the version number advances
+  if (world_size == 1) {
+    version_number += 1; return;
+  }
+  double start = utils::GetTime();
+  this->LocalModelCheck(local_model != NULL);
+  if (num_local_replica == 0) {
+    utils::Check(local_model == NULL,
+                 "need to set rabit_local_replica larger than 1 to checkpoint local_model");
+  }
+  if (num_local_replica != 0) {
+    while (true) {
+      if (RecoverExec(NULL, 0, 0, ActionSummary::kLocalCheckPoint)) break;
+      // save model to new version place, the two local slots alternate between 0 and 1
+      int new_version = !local_chkpt_version;
+
+      local_chkpt[new_version].clear();
+      utils::MemoryBufferStream fs(&local_chkpt[new_version]);
+      if (local_model != NULL) {
+        local_model->Save(&fs);
+      }
+      local_rptr[new_version].clear();
+      local_rptr[new_version].push_back(0);
+      local_rptr[new_version].push_back(local_chkpt[new_version].length());
+      if (CheckAndRecover(TryCheckinLocalState(&local_rptr[new_version],
+                                               &local_chkpt[new_version]))) break;
+    }
+    // run the ack phase, can be true or false
+    RecoverExec(NULL, 0, 0, ActionSummary::kLocalCheckAck);
+    // switch pointer to new version
+    local_chkpt_version = !local_chkpt_version;
+  }
+  // execute checkpoint, note: when checkpoint existing, load will not happen
+  _assert(RecoverExec(NULL, 0, ActionSummary::kCheckPoint,
+    ActionSummary::kSpecialOp, cur_cache_seq),
+                "check point must return true");
+  // this is the critical region where we will change all the stored models
+  // increase version number
+  version_number += 1;
+  // save model
+  if (lazy_checkpt) {
+    global_lazycheck = global_model;  // lazy: only the pointer is kept, nothing is saved here
+  } else {
+    global_checkpoint.resize(0);
+    utils::MemoryBufferStream fs(&global_checkpoint);
+    fs.Write(&version_number, sizeof(version_number));
+    global_model->Save(&fs);
+    global_lazycheck = NULL;
+  }
+  double delta = utils::GetTime() - start;
+  // log checkpoint latency
+  if (rabit_debug) {
+    utils::HandleLogInfo(
+      "[%d] checkpoint finished version %d,seq %d, take %f seconds\n",
+      rank, version_number, seq_counter, delta);
+  }
+  start = utils::GetTime();
+  // reset result buffer, mark bootstrap phase complete
+  resbuf.Clear(); seq_counter = 0;
+  // execute check ack step, load happens here
+  _assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck,
+    ActionSummary::kSpecialOp, cur_cache_seq), "check ack must return true");
+
+  delta = utils::GetTime() - start;
+  // log checkpoint ack latency
+  if (rabit_debug) {
+    utils::HandleLogInfo("[%d] checkpoint ack finished version %d, take %f seconds\n",
+    rank, version_number, delta);
+  }
+}
+/*!
+ * \brief reset all the existing links by sending Out-of-Band message marker
+ *  after this function finishes, all the messages received and sent before
+ *  in all live links are discarded, this allows us to get a fresh start after error has happened
+ *  during the reset size_write/size_read of each link are reused as progress markers
+ * \return this function can return kSuccess or kSockError
+ *         when kSockError is returned, it simply means there are bad sockets in the links,
+ *         and some link recovery procedure is needed
+ */
+AllreduceRobust::ReturnType AllreduceRobust::TryResetLinks(void) {
+  // number of links
+  const int nlink = static_cast<int>(all_links.size());
+  for (int i = 0; i < nlink; ++i) {
+    all_links[i].InitBuffer(sizeof(int), 1 << 10, reduce_buffer_size);
+    all_links[i].ResetSize();
+  }
+  // send the OOB reset signal (size_write 0 -> 1) and then the reset mark (1 -> 2) on live links
+  while (true) {
+    for (int i = 0; i < nlink; ++i) {
+      if (all_links[i].sock.BadSocket()) continue;
+      if (all_links[i].size_write == 0) {
+        char sig = kOOBReset;
+        ssize_t len = all_links[i].sock.Send(&sig, sizeof(sig), MSG_OOB);
+        // error will be filtered in next loop
+        if (len == sizeof(sig)) all_links[i].size_write = 1;
+      }
+      if (all_links[i].size_write == 1) {
+        char sig = kResetMark;
+        ssize_t len = all_links[i].sock.Send(&sig, sizeof(sig));
+        if (len == sizeof(sig)) all_links[i].size_write = 2;
+      }
+    }
+    utils::PollHelper rsel;
+    bool finished = true;
+    for (int i = 0; i < nlink; ++i) {
+      if (all_links[i].size_write != 2 && !all_links[i].sock.BadSocket()) {
+        rsel.WatchWrite(all_links[i].sock); finished = false;
+      }
+    }
+    if (finished) break;
+    // wait until the links that are not done yet can be written again
+    rsel.Poll();
+  }
+  for (int i = 0; i < nlink; ++i) {
+    if (!all_links[i].sock.BadSocket()) {
+      utils::PollHelper::WaitExcept(all_links[i].sock);
+    }
+  }
+  while (true) {  // read and discard data from all channels until pass mark
+    utils::PollHelper rsel;
+    bool finished = true;
+    for (int i = 0; i < nlink; ++i) {
+      if (all_links[i].size_read == 0 && !all_links[i].sock.BadSocket()) {
+        rsel.WatchRead(all_links[i].sock); finished = false;
+      }
+    }
+    if (finished) break;
+    rsel.Poll();
+    for (int i = 0; i < nlink; ++i) {
+      if (all_links[i].sock.BadSocket()) continue;
+      if (all_links[i].size_read == 0) {
+        int atmark = all_links[i].sock.AtMark();
+        if (atmark < 0) {
+          _assert(all_links[i].sock.BadSocket(), "must already gone bad");
+        } else if (atmark > 0) {
+          all_links[i].size_read = 1;
+        } else {
+          // no at mark, read and discard data
+          ssize_t len = all_links[i].sock.Recv(all_links[i].buffer_head, all_links[i].buffer_size);
+          if (all_links[i].sock.AtMark()) all_links[i].size_read = 1;
+          // zero length, remote closed the connection, close socket
+          if (len == 0) all_links[i].sock.Close();
+        }
+      }
+    }
+  }
+  // start synchronization, use blocking I/O to avoid select
+  for (int i = 0; i < nlink; ++i) {
+    if (!all_links[i].sock.BadSocket()) {
+      char oob_mark;
+      all_links[i].sock.SetNonBlock(false);
+      ssize_t len = all_links[i].sock.Recv(&oob_mark, sizeof(oob_mark), MSG_WAITALL);
+      if (len == 0) {
+        all_links[i].sock.Close(); continue;
+      } else if (len > 0) {
+        _assert(oob_mark == kResetMark, "wrong oob msg");
+        _assert(all_links[i].sock.AtMark() != 1, "should already read past mark");
+      } else {
+        // a blocking recv must fail with neither of the two would-block codes
+        _assert(errno != EAGAIN && errno != EWOULDBLOCK, "BUG");
+      }
+      // send out ack
+      char ack = kResetAck;
+      while (true) {
+        len = all_links[i].sock.Send(&ack, sizeof(ack));
+        if (len == sizeof(ack)) break;
+        if (len == -1) {
+          if (errno != EAGAIN && errno != EWOULDBLOCK) break;
+        }
+      }
+    }
+  }
+  // wait all ack, the sockets are still in blocking mode here
+  for (int i = 0; i < nlink; ++i) {
+    if (!all_links[i].sock.BadSocket()) {
+      char ack;
+      ssize_t len = all_links[i].sock.Recv(&ack, sizeof(ack), MSG_WAITALL);
+      if (len == 0) {
+        all_links[i].sock.Close(); continue;
+      } else if (len > 0) {
+        _assert(ack == kResetAck, "wrong Ack MSG");
+      } else {
+        _assert(errno != EAGAIN && errno != EWOULDBLOCK, "BUG");
+      }
+      // set back to nonblock mode
+      all_links[i].sock.SetNonBlock(true);
+    }
+  }
+  for (int i = 0; i < nlink; ++i) {
+    if (all_links[i].sock.BadSocket()) return kSockError;
+  }
+  return kSuccess;
+}
+/*!
+ * \brief if err_type indicates an error
+ *        close all links and reconnect them, a timeout task is started first when enabled
+ *        if there is no error, return true
+ * \param err_type the type of error happening in the system
+ * \return true if err_type is kSuccess, false otherwise
+ */
+bool AllreduceRobust::CheckAndRecover(ReturnType err_type) {
+  shutdown_timeout = err_type == kSuccess;
+  if (err_type == kSuccess) return true;
+
+  _assert(err_link != NULL, "must know the error link");
+  recover_counter += 1;
+  // async launch timeout task if rabit_timeout is set and no task is pending yet
+  if (rabit_timeout && !rabit_timeout_task.valid()) {
+    utils::Printf("[EXPERIMENTAL] timeout thread expires in %d second(s)\n", timeout_sec);
+    rabit_timeout_task = std::async(std::launch::async, [=]() {
+      // std::thread::id is a class type, log its hash instead of passing it through varargs
+      const size_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
+      if (rabit_debug) {
+        utils::Printf("[%d] timeout thread %zu starts\n", rank, tid);
+      }
+      int time = 0;
+      // check if rabit recovered every 100ms
+      while (time++ < 10 * timeout_sec) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+        if (shutdown_timeout.load()) {
+          if (rabit_debug) {
+            utils::Printf("[%d] timeout task thread %zu exits\n", rank, tid);
+          }
+          return true;
+        }
+      }
+      _error("[%d] exit due to time out %d s\n", rank, timeout_sec);
+      return false;
+    });
+  }
+  // simple way, shutdown all links
+  for (size_t i = 0; i < all_links.size(); ++i) {
+    if (!all_links[i].sock.BadSocket()) all_links[i].sock.Close();
+  }
+  // smooth out traffic to tracker
+  std::this_thread::sleep_for(std::chrono::milliseconds(10*rank));
+  ReConnectLinks("recover");
+  return false;
+}
+/*!
+ * \brief message passing function, used to decide the
+ *        shortest distance to the possible source of data
+ * \param node_value a pair of have_data and size
+ *           have_data whether current node have data
+ *           size gives the size of data, if current node is kHaveData
+ * \param dist_in the shortest distance to any data source in each direction
+ * \param out_index the edge index of output link
+ * \return the shortest distance result of out edge specified by out_index
+ */
+inline std::pair<int, size_t>
+ShortestDist(const std::pair<bool, size_t> &node_value,
+             const std::vector< std::pair<int, size_t> > &dist_in,
+             size_t out_index) {
+  // distance value marking an edge that leads to no data source
+  const int kUnreachable = std::numeric_limits<int>::max();
+  // a node that holds the data reports distance 1 together with its size
+  if (node_value.first) return std::make_pair(1, node_value.second);
+
+  std::pair<int, size_t> best(kUnreachable, 0);
+  for (size_t edge = 0; edge < dist_in.size(); ++edge) {
+    const std::pair<int, size_t> &cand = dist_in[edge];
+    // never route back through the out edge, skip edges without a source
+    if (edge == out_index || cand.first == kUnreachable) continue;
+    // add one hop
+    const int hops = cand.first + 1;
+    // strict comparison keeps the first edge among equally short ones
+    if (hops < best.first) best = std::make_pair(hops, cand.second);
+  }
+  return best;
+}
+/*!
+ * \brief message passing function, used to decide the
+ *    data request from each edge, whether need to request data from certain edge
+ * \param node_value a pair of request_data and best_link
+ *           request_data stores whether current node need to request data
+ *           best_link gives the best edge index to fetch the data
+ * \param req_in the data request from incoming edges
+ * \param out_index the edge index of output link
+ * \return the request to the output edge, 1 to request data and 0 otherwise
+ */
+inline char DataRequest(const std::pair<bool, int> &node_value,
+                        const std::vector<char> &req_in,
+                        size_t out_index) {
+  // which edge index is the best link to request data
+  // can be -1, which means current node contains data
+  const int best_link = node_value.second;
+  // a request is only ever forwarded along the best link
+  if (static_cast<int>(out_index) != best_link) return 0;
+  // the current node itself needs the data
+  if (node_value.first) return 1;
+  // otherwise relay a request that arrived on any other edge
+  for (size_t edge = 0; edge < req_in.size(); ++edge) {
+    if (edge != out_index && req_in[edge] != 0) return 1;
+  }
+  // no request came in on the other edges
+  return 0;
+}
+/*!
+ * \brief try to decide the recovery message passing request
+ * \param role the current role of the node
+ * \param p_size used to store the size of the message, for node in state kHaveData,
+ *               this size must be set correctly before calling the function
+ *               for others, this serves as output parameter
+ *
+ * \param p_recvlink used to store the link current node should recv data from, if necessary
+ *          this can be -1, which means current node have the data
+ * \param p_req_in used to store the resulting vector, indicating which link we should send the data to
+ *
+ * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
+ * \sa ReturnType
+ */
+AllreduceRobust::ReturnType
+AllreduceRobust::TryDecideRouting(AllreduceRobust::RecoverType role,
+                                  size_t *p_size,
+                                  int *p_recvlink,
+                                  std::vector<bool> *p_req_in) {
+  // -2 means no usable link was found yet, -1 means the node itself holds the data
+  int best_link = -2;
+  {
+    // get the shortest distance to the request point
+    std::vector<std::pair<int, size_t> > dist_in, dist_out;
+    ReturnType succ = MsgPassing(std::make_pair(role == kHaveData, *p_size),
+                                 &dist_in, &dist_out, ShortestDist);
+    if (succ != kSuccess) return succ;
+    if (role != kHaveData) {
+      for (size_t i = 0; i < dist_in.size(); ++i) {
+        if (dist_in[i].first != std::numeric_limits<int>::max()) {
+          // the distance is an int and the sizes are size_t, the format must match them
+          utils::Check(best_link == -2 || *p_size == dist_in[i].second,
+                       "[%d] Allreduce size inconsistent, distin=%d, size=%zu, reporting=%zu\n",
+                       rank, dist_in[i].first, *p_size, dist_in[i].second);
+          if (best_link == -2 || dist_in[i].first < dist_in[best_link].first) {
+            best_link = static_cast<int>(i);
+            *p_size = dist_in[i].second;
+          }
+        }
+      }
+      utils::Check(best_link != -2, "Too many nodes went down and we cannot recover..");
+    } else {
+      best_link = -1;
+    }
+  }
+  // get the node request
+  std::vector<char> req_in, req_out;
+  ReturnType succ = MsgPassing(std::make_pair(role == kRequestData, best_link),
+                               &req_in, &req_out, DataRequest);
+  if (succ != kSuccess) return succ;
+  // set p_req_in
+  p_req_in->resize(req_in.size());
+  for (size_t i = 0; i < req_in.size(); ++i) {
+    // a link we have to serve is one whose incoming request is non-zero
+    (*p_req_in)[i] = (req_in[i] != 0);
+    if (req_out[i] != 0) {
+      _assert(req_in[i] == 0, "cannot get and receive request");
+      _assert(static_cast<int>(i) == best_link, "request result inconsistent");
+    }
+  }
+  *p_recvlink = best_link;
+  return kSuccess;
+}
+/*!
+ * \brief try to finish the data recovery request,
+ *        this function is used together with TryDecideRouting
+ * \param role the current role of the node
+ * \param sendrecvbuf_ the buffer to store the data to be sent/received
+ *          - if the role is kHaveData, this stores the data to be sent
+ *          - if the role is kRequestData, this is the buffer to store the result
+ *          - if the role is kPassData, this will not be used, and can be NULL
+ * \param size the size of the data, obtained from TryDecideRouting
+ * \param recv_link the link index to receive data, if necessary, obtained from TryDecideRouting
+ * \param req_in the request of each link to send data, obtained from TryDecideRouting
+ *
+ * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
+ * \sa ReturnType, TryDecideRouting
+ */
+AllreduceRobust::ReturnType
+AllreduceRobust::TryRecoverData(RecoverType role,
+                                void *sendrecvbuf_,
+                                size_t size,
+                                int recv_link,
+                                const std::vector<bool> &req_in) {
+  RefLinkVector &links = tree_links;
+  // no need to run recovery for zero size messages
+  if (links.size() == 0 || size == 0) return kSuccess;
+  _assert(req_in.size() == links.size(), "TryRecoverData");
+  const int nlink = static_cast<int>(links.size());
+  {
+    // req_data ends up true when this node requests the data itself
+    // or when at least one link asked this node to send it
+    bool req_data = role == kRequestData;
+    for (int i = 0; i < nlink; ++i) {
+      if (req_in[i]) {
+        // a link we send to must never be the link we receive from
+        _assert(i != recv_link, "TryDecideRouting");
+        req_data = true;
+      }
+    }
+    // do not need to provide data or receive data, directly exit
+    if (!req_data) return kSuccess;
+  }
+  _assert(recv_link >= 0 || role == kHaveData, "recv_link must be active");
+  if (role == kPassData) {
+    // a pass-through node has no user buffer, so it relays the data
+    // through the buffer of the receiving link
+    links[recv_link].InitBuffer(1, size, reduce_buffer_size);
+  }
+  for (int i = 0; i < nlink; ++i) {
+    links[i].ResetSize();
+  }
+  while (true) {
+    // finished stays true only when nothing is left to read or to write
+    bool finished = true;
+    utils::PollHelper watcher;
+    for (int i = 0; i < nlink; ++i) {
+      if (i == recv_link && links[i].size_read != size) {
+        watcher.WatchRead(links[i].sock);
+        finished = false;
+      }
+      if (req_in[i] && links[i].size_write != size) {
+        // only watch for writability when there is something new to send:
+        // either we own the data, or we have read more than we have forwarded
+        if (role == kHaveData ||
+            (links[recv_link].size_read != links[i].size_write)) {
+          watcher.WatchWrite(links[i].sock);
+        }
+        finished = false;
+      }
+      watcher.WatchException(links[i].sock);
+    }
+    if (finished) break;
+    watcher.Poll();
+    // exception handling
+    for (int i = 0; i < nlink; ++i) {
+      if (watcher.CheckExcept(links[i].sock)) {
+        return ReportError(&links[i], kGetExcept);
+      }
+    }
+    if (role == kRequestData) {
+      // receive into the user buffer, then forward what has arrived so far
+      // to every link that requested the data from this node
+      const int pid = recv_link;
+      if (watcher.CheckRead(links[pid].sock)) {
+        ReturnType ret = links[pid].ReadToArray(sendrecvbuf_, size);
+        if (ret != kSuccess) {
+          return ReportError(&links[pid], ret);
+        }
+      }
+      for (int i = 0; i < nlink; ++i) {
+        if (req_in[i] && links[i].size_write != links[pid].size_read) {
+          ReturnType ret = links[i].WriteFromArray(sendrecvbuf_, links[pid].size_read);
+          if (ret != kSuccess) {
+            return ReportError(&links[i], ret);
+          }
+        }
+      }
+    }
+    if (role == kHaveData) {
+      // the whole message is already in sendrecvbuf_, write it to all requesters
+      for (int i = 0; i < nlink; ++i) {
+        if (req_in[i] && links[i].size_write != size) {
+          ReturnType ret = links[i].WriteFromArray(sendrecvbuf_, size);
+          if (ret != kSuccess) {
+            return ReportError(&links[i], ret);
+          }
+        }
+      }
+    }
+    if (role == kPassData) {
+      const int pid = recv_link;
+      const size_t buffer_size = links[pid].buffer_size;
+      if (watcher.CheckRead(links[pid].sock)) {
+        // min_write is the smallest amount already forwarded to any requester;
+        // it is handed to ReadToRingBuffer together with the total size
+        size_t min_write = size;
+        for (int i = 0; i < nlink; ++i) {
+          if (req_in[i]) min_write = std::min(links[i].size_write, min_write);
+        }
+        _assert(min_write <= links[pid].size_read, "boundary check");
+        ReturnType ret = links[pid].ReadToRingBuffer(min_write, size);
+        if (ret != kSuccess) {
+          return ReportError(&links[pid], ret);
+        }
+      }
+      for (int i = 0; i < nlink; ++i) {
+        if (req_in[i] && links[pid].size_read != links[i].size_write) {
+          // position inside the ring buffer where the unsent data starts
+          size_t start = links[i].size_write % buffer_size;
+          // send out data from ring buffer
+          // nwrite is capped so the send never runs past the end of the buffer
+          size_t nwrite = std::min(buffer_size - start, links[pid].size_read - links[i].size_write);
+          ssize_t len = links[i].sock.Send(links[pid].buffer_head + start, nwrite);
+          if (len != -1) {
+            links[i].size_write += len;
+          } else {
+            // Errno2Return decides whether the failed send is a real error
+            ReturnType ret = Errno2Return();
+            if (ret != kSuccess) return ReportError(&links[i], ret);
+          }
+        }
+      }
+    }
+  }
+  return kSuccess;
+}
+/*!
+ * \brief try to fetch allreduce/broadcast results from rest of nodes
+ *  as collaborative function called by all nodes, only requester node
+ *  will pass seqno to rest of nodes and reconstruct/backfill sendrecvbuf_
+ *  of specific seqno from other nodes.
+ */
+AllreduceRobust::ReturnType AllreduceRobust::TryRestoreCache(bool requester,
+  const int min_seq, const int max_seq) {
+  // note: min_seq is currently not used by this function; every entry in
+  // [0, max_seq) is exchanged
+  // clear requester and rebuild from those with most cache entries
+  if (requester) {
+    _assert(cur_cache_seq <= max_seq, "requester is expected to have fewer cache entries");
+    cachebuf.Clear();
+    lookupbuf.Clear();
+    cur_cache_seq = 0;
+  }
+  RecoverType role = requester ? kRequestData : kHaveData;
+  size_t size = 1;
+  // initialized defensively; TryDecideRouting overwrites it on success
+  int recv_link = -1;
+  std::vector<bool> req_in;
+  ReturnType ret = TryDecideRouting(role, &size, &recv_link, &req_in);
+  if (ret != kSuccess) return ret;
+  // only recover missing cache entries in requester
+  // as tryrecoverdata is collective call, need to go through entire cache
+  // and only work on those missing
+  for (int i = 0; i < max_seq; i++) {
+    // restore lookup map: first exchange the entry size, then the entry itself
+    size_t cache_size = 0;
+    void* key = lookupbuf.Query(i, &cache_size);
+    ret = TryRecoverData(role, &cache_size, sizeof(size_t), recv_link, req_in);
+    if (ret != kSuccess) return ret;
+    if (requester) {
+      // cache_size now holds the received size, allocate room for the payload
+      key = lookupbuf.AllocTemp(cache_size, 1);
+      lookupbuf.PushTemp(i, cache_size, 1);
+    }
+    ret = TryRecoverData(role, key, cache_size, recv_link, req_in);
+    if (ret != kSuccess) return ret;
+    // restore cache content, same two-step size/payload exchange
+    cache_size = 0;
+    void* buf = cachebuf.Query(i, &cache_size);
+    ret = TryRecoverData(role, &cache_size, sizeof(size_t), recv_link, req_in);
+    // the size exchange must succeed before cache_size can be trusted
+    // for the allocation below
+    if (ret != kSuccess) return ret;
+    if (requester) {
+      buf = cachebuf.AllocTemp(cache_size, 1);
+      cachebuf.PushTemp(i, cache_size, 1);
+      cur_cache_seq +=1;
+    }
+    ret = TryRecoverData(role, buf, cache_size, recv_link, req_in);
+    if (ret != kSuccess) return ret;
+  }
+
+  return kSuccess;
+}
+
+/*!
+ * \brief try to load check point
+ *
+ *        This is a collaborative function called by all nodes
+ *        only the nodes with requester set to true really needs to load the check point
+ *        other nodes acts as collaborative roles to complete this request
+ *
+ * \param requester whether current node is the requester
+ * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
+ * \sa ReturnType
+ */
+AllreduceRobust::ReturnType AllreduceRobust::TryLoadCheckPoint(bool requester) {
+  // requesters pull the checkpoint, every other node serves it
+  const RecoverType role = requester ? kRequestData : kHaveData;
+  ReturnType status;
+  if (num_local_replica != 0) {
+    if (requester) {
+      // drop any existing history before it is loaded again
+      local_chkpt[local_chkpt_version].clear();
+      local_rptr[local_chkpt_version].clear();
+    }
+    // recover local checkpoint
+    status = TryRecoverLocalState(&local_rptr[local_chkpt_version],
+                                  &local_chkpt[local_chkpt_version]);
+    if (status != kSuccess) return status;
+    const int nlocal =
+        std::max(static_cast<int>(local_rptr[local_chkpt_version].size()) - 1, 0);
+    // outcome of this node, encoded as a bit so it can be OR-ed across nodes:
+    //   1 = complete recovery, 2 = got nothing, 4 = partially complete
+    unsigned state = 4;
+    if (nlocal == num_local_replica + 1) {
+      state = 1;
+    } else if (nlocal == 0) {
+      state = 2;
+    }
+    status = TryAllreduce(&state, sizeof(state), 1, op::Reducer<op::BitOR, unsigned>);
+    if (status != kSuccess) return status;
+    // after the reduction only a single, non-partial outcome is acceptable
+    utils::Check(state == 1 || state == 2,
+                 "LoadCheckPoint: too many nodes fails, cannot recover local state");
+  }
+  // a lazily registered checkpoint is serialized now, before it is served
+  if (!requester && global_lazycheck != NULL) {
+    global_checkpoint.resize(0);
+    utils::MemoryBufferStream fs(&global_checkpoint);
+    fs.Write(&version_number, sizeof(version_number));
+    global_lazycheck->Save(&fs);
+    global_lazycheck = NULL;
+  }
+  // recover global checkpoint
+  size_t size = this->global_checkpoint.length();
+  std::vector<bool> req_in;
+  int recv_link;
+  status = TryDecideRouting(role, &size, &recv_link, &req_in);
+  if (status != kSuccess) return status;
+  if (requester) {
+    // make room for the incoming checkpoint
+    global_checkpoint.resize(size);
+  }
+  if (size != 0) {
+    return TryRecoverData(role, BeginPtr(global_checkpoint), size, recv_link, req_in);
+  }
+  return kSuccess;
+}
+/*!
+ * \brief try to get the result of operation specified by seqno
+ *
+ *        This is a collaborative function called by all nodes
+ *        only the nodes with requester set to true really needs to get the result
+ *        other nodes acts as collaborative roles to complete this request
+ *
+ * \param buf the buffer to store the result, this parameter is only used when current node is requester
+ * \param size the total size of the buffer, this parameter is only used when current node is requester
+ * \param seqno sequence number of the operation, this is unique index of a operation in current iteration
+ * \param requester whether current node is the requester
+ * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
+ * \sa ReturnType
+ */
+AllreduceRobust::ReturnType
+AllreduceRobust::TryGetResult(void *sendrecvbuf, size_t size, int seqno, bool requester) {
+  // a local check point ack as the minimum requested sequence means all
+  // nodes have finished the local check point: nothing to do
+  if (seqno == ActionSummary::kLocalCheckAck) return kSuccess;
+  if (seqno == ActionSummary::kLocalCheckPoint) {
+    // the local model is recovered into the version slot that is not current
+    const int new_version = !local_chkpt_version;
+    const int nlocal = std::max(static_cast<int>(local_rptr[new_version].size()) - 1, 0);
+    // reaching this point implies the state was already set up once
+    _assert(nlocal == 1 || nlocal == num_local_replica + 1, "TryGetResult::Checkpoint");
+    return TryRecoverLocalState(&local_rptr[new_version], &local_chkpt[new_version]);
+  }
+
+  // normal data recovery: a non-requester serves the result when resbuf
+  // has it for seqno, and merely passes data along otherwise
+  RecoverType role = kRequestData;
+  if (!requester) {
+    sendrecvbuf = resbuf.Query(seqno, &size);
+    role = (sendrecvbuf != NULL) ? kHaveData : kPassData;
+  }
+  std::vector<bool> req_in;
+  int recv_link;
+  // size of data agreed on by the routing step
+  size_t data_size = size;
+  const ReturnType succ = TryDecideRouting(role, &data_size, &recv_link, &req_in);
+  if (succ != kSuccess) return succ;
+  utils::Check(data_size != 0, "zero size check point is not allowed");
+  // nodes that only pass data along have no size expectation of their own
+  if (role != kPassData) {
+    utils::Check(data_size == size,
+                 "Allreduce Recovered data size do not match the specification of function call.\n"
+                 "Please check if calling sequence of recovered program is the "
+                 "same the original one in current VersionNumber");
+  }
+  return TryRecoverData(role, sendrecvbuf, data_size, recv_link, req_in);
+}
+/*!
+ * \brief try to run recover execution for a request action described by flag and seqno,
+ *        the function will keep blocking to run possible recovery operations before the specified action,
+ *        until the requested result is received by a recovering procedure,
+ *        or the function discovers that the requested action is not yet executed, and return false
+ *
+ * \param buf the buffer to store the result
+ * \param size the total size of the buffer
+ * \param flag flag information about the action \sa ActionSummary
+ * \param seqno sequence number of the action, if it is special action with flag set,
+ *              seqno needs to be set to ActionSummary::kSpecialOp
+ *
+ * \return if this function can return true or false
+ *    - true means buf already set to the
+ *           result by recovering procedure, the action is complete, no further action is needed
+ *    - false means this is the latest action that has not yet been executed, need to execute the action
+ */
+bool AllreduceRobust::RecoverExec(void *buf, size_t size, int flag, int seqno,
+                                  int cache_seqno, const char* caller) {
+  // cache_seqno is forwarded to the ActionSummary request built below;
+  // caller is only used to build the message of the negative seqno assertion
+  // kLoadBootstrapCache should be treated similar as allreduce
+  // when loadcheck/check/checkack runs in other nodes
+  if (flag != 0 && flag != ActionSummary::kLoadBootstrapCache) {
+    _assert(seqno == ActionSummary::kSpecialOp, "must only set seqno for normal operations");
+  }
+
+  std::string msg = std::string(caller) + " pass negative seqno "
+    + std::to_string(seqno) + " flag " + std::to_string(flag)
+    + " version " + std::to_string(version_number);
+  _assert(seqno >=0, msg.c_str());
+
+  // req is what this node asks for; it stays unchanged across rounds
+  ActionSummary req(flag, flag, seqno, cache_seqno);
+
+  // each round reduces the requests of all nodes into act and then
+  // executes whichever action the reduced summary selects
+  while (true) {
+    this->ReportStatus();
+    // copy to action and send to allreduce with other nodes
+    ActionSummary act = req;
+    // get the reduced action
+    if (!CheckAndRecover(TryAllreduce(&act, sizeof(act), 1, ActionSummary::Reducer))) continue;
+
+    if (act.check_ack()) {
+      if (act.check_point()) {
+        // if we also have check_point, do check point first
+        _assert(!act.diff_seq(),
+                      "check ack & check pt  cannot occur together with normal ops");
+        // if we requested checkpoint, we are free to go
+        if (req.check_point()) return true;
+      } else if (act.load_check()) {
+        // if there is only check_ack and load_check, do load_check
+        if (!CheckAndRecover(TryLoadCheckPoint(req.load_check()))) continue;
+        // if requested load check, then misson complete
+        if (req.load_check()) return true;
+      } else {
+        // there is no check point and no load check, execute check ack
+        if (req.check_ack()) return true;
+      }
+      // if execute to this point
+      // this means the action requested has not been completed
+      // try next round
+    } else {
+      if (act.check_point()) {
+        if (act.diff_seq()) {
+          _assert(act.seqno() != ActionSummary::kSpecialOp, "min seq bug");
+          // print checkpoint consensus flag if user turn on debug
+          if (rabit_debug) {
+            req.print_flags(rank, "checkpoint req");
+            act.print_flags(rank, "checkpoint act");
+          }
+          /*
+           * Chen Qin
+           * at least one hit checkpoint_ code & at least one not hitting
+           * compare with version_number of req.check_point() set true with rest
+           * expect to be equal, means rest fall behind in sequence
+           * use resbuf resbuf to recover
+           * worker-0           worker-1
+           * checkpoint(n-1)    checkpoint(n-1)
+           * allreduce          allreduce (requester) |
+           * broadcast                                V
+           * checkpoint(n req)
+           * after catch up to checkpoint n, diff_seq will be false
+           * */
+          // assume requester is falling behind
+          bool requester = req.seqno() == act.seqno();
+          // if not load cache
+          if (!act.load_cache()) {
+            if (act.seqno() > 0) {
+              if (!requester) {
+                // the non-requester serves the stored result of act.seqno(),
+                // so buf and size are replaced by the resbuf entry
+                _assert(req.check_point(), "checkpoint node should be KHaveData role");
+                buf = resbuf.Query(act.seqno(), &size);
+                _assert(buf != NULL, "buf should have data from resbuf");
+                _assert(size > 0, "buf size should be greater than 0");
+              }
+              if (!CheckAndRecover(TryGetResult(buf, size, act.seqno(), requester))) continue;
+            }
+          } else {
+            // cache seq no should be smaller than kSpecialOp
+            _assert(act.seqno(SeqType::kCache) != ActionSummary::kSpecialOp,
+              "checkpoint with kSpecialOp");
+            // agree on the largest number of cache entries held by any node
+            int max_cache_seq = cur_cache_seq;
+            if (TryAllreduce(&max_cache_seq, sizeof(max_cache_seq), 1,
+              op::Reducer<op::Max, unsigned>) != kSuccess) continue;
+
+            if (TryRestoreCache(req.load_cache(), act.seqno(), max_cache_seq)
+              != kSuccess) continue;
+          }
+          if (requester) return true;
+        } else  {
+          // no difference in seq no, means we are free to check point
+          if (req.check_point()) return true;
+        }
+      } else {
+        // no check point
+        if (act.load_check()) {
+          // all the nodes called load_check, this is an incomplete action
+          if (!act.diff_seq()) return false;
+          // load check have higher priority, do load_check
+          if (!CheckAndRecover(TryLoadCheckPoint(req.load_check()))) continue;
+          // if requested load check, then misson complete
+          if (req.load_check()) return true;
+        } else {
+          // run all nodes in a isolated cache restore logic
+          if (act.load_cache()) {
+            // print checkpoint consensus flag if user turn on debug
+            if (rabit_debug) {
+              req.print_flags(rank, "loadcache req");
+              act.print_flags(rank, "loadcache act");
+            }
+            // load cache should not running in parralel with other states
+            _assert(!act.load_check(),
+              "load cache state expect no nodes doing load checkpoint");
+            _assert(!act.check_point() ,
+              "load cache state expect no nodes doing checkpoint");
+            _assert(!act.check_ack(),
+              "load cache state expect no nodes doing checkpoint ack");
+
+            // if all nodes are requester in load cache, skip
+            if (act.load_cache(SeqType::kCache)) return false;
+
+            // bootstrap cache always restore before loadcheckpoint
+            // requester always have seq diff with non requester
+            if (act.diff_seq()) {
+              // restore cache failed, retry from what's left
+              if (TryRestoreCache(req.load_cache(), act.seqno(), act.seqno(SeqType::kCache))
+                != kSuccess) continue;
+            }
+            // if requested load cache, then mission complete
+            if (req.load_cache()) return true;
+            continue;
+          }
+
+          // assert no req with load cache set goes into seq catch up
+          _assert(!req.load_cache(), "load cache not interacte with rest states");
+
+          // no special flags, no checkpoint, check ack, load_check
+          _assert(act.seqno() != ActionSummary::kSpecialOp, "min seq bug");
+          if (act.diff_seq()) {
+            // the node whose seqno equals the reduced seqno is the one
+            // that receives the result in this round
+            bool requester = req.seqno() == act.seqno();
+            if (!CheckAndRecover(TryGetResult(buf, size, act.seqno(), requester))) continue;
+            if (requester) return true;
+          } else {
+            // all the request is same,
+            // this is most recent command that is yet to be executed
+            return false;
+          }
+        }
+      }
+      // something is still incomplete try next round
+    }
+  }
+  // the loop above only exits through return statements
+  _assert(false, "RecoverExec: should not reach here");
+  return true;
+}
+/*!
+ * \brief try to recover the local state, making each local state to be the result of itself
+ *        plus replication of states in previous num_local_replica hops in the ring
+ *
+ * The input parameters must contain the valid local states available in current nodes,
+ * This function tries its best to "complete" the missing parts of local_rptr and local_chkpt
+ * If there is sufficient information in the ring, when the function returns, local_chkpt will
+ * contain num_local_replica + 1 checkpoints (including the chkpt of this node)
+ * If there is no sufficient information in the ring, the number of checkpoints
+ * will be less than the specified value
+ *
+ * \param p_local_rptr the pointer to the segment pointers in the states array
+ * \param p_local_chkpt the pointer to the storage of local check points
+ * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
+ * \sa ReturnType
+ */
+AllreduceRobust::ReturnType
+AllreduceRobust::TryRecoverLocalState(std::vector<size_t> *p_local_rptr,
+                                      std::string *p_local_chkpt) {
+  // if there is no local replica, we can do nothing
+  if (num_local_replica == 0) return kSuccess;
+  std::vector<size_t> &rptr = *p_local_rptr;
+  std::string &chkpt = *p_local_chkpt;
+  if (rptr.size() == 0) {
+    // an empty rptr is normalized to the single offset 0 (zero states)
+    rptr.push_back(0);
+    _assert(chkpt.length() == 0, "local chkpt space inconsistent");
+  }
+  const int n = num_local_replica;
+  {
+    // backward passing, passing state in backward direction of the ring
+    // (data is read from ring_next and written to ring_prev)
+    const int nlocal = static_cast<int>(rptr.size() - 1);
+    _assert(nlocal <= n + 1, "invalid local replica");
+    std::vector<int> msg_back(n + 1);
+    msg_back[0] = nlocal;
+    // backward passing one hop the request
+    ReturnType succ;
+    succ = RingPassing(BeginPtr(msg_back),
+                       1 * sizeof(int), (n+1) * sizeof(int),
+                       0 * sizeof(int), n * sizeof(int),
+                       ring_next, ring_prev);
+    if (succ != kSuccess) return succ;
+    // msg_forward[1] receives the nlocal value sent by ring_prev
+    int msg_forward[2];
+    msg_forward[0] = nlocal;
+    succ = RingPassing(msg_forward,
+                       1 * sizeof(int), 2 * sizeof(int),
+                       0 * sizeof(int), 1 * sizeof(int),
+                       ring_prev, ring_next);
+    if (succ != kSuccess) return succ;
+    // calculate the number of things we can read from next link
+    int nread_end = nlocal;
+    for (int i = 1; i <= n; ++i) {
+      nread_end = std::max(nread_end, msg_back[i] - i);
+    }
+    // gives the size of forward
+    int nwrite_start = std::min(msg_forward[1] + 1, nread_end);
+    // get the size of each segments
+    std::vector<size_t> sizes(nread_end);
+    for (int i = 0; i < nlocal; ++i) {
+      sizes[i] = rptr[i + 1] - rptr[i];
+    }
+    // pass size through the link
+    succ = RingPassing(BeginPtr(sizes),
+                       nlocal * sizeof(size_t),
+                       nread_end * sizeof(size_t),
+                       nwrite_start * sizeof(size_t),
+                       nread_end * sizeof(size_t),
+                       ring_next, ring_prev);
+    if (succ != kSuccess) return succ;
+    // update rptr
+    rptr.resize(nread_end + 1);
+    for (int i = nlocal; i < nread_end; ++i) {
+      rptr[i + 1] = rptr[i] + sizes[i];
+    }
+    chkpt.resize(rptr.back());
+    // pass data through the link
+    succ = RingPassing(BeginPtr(chkpt), rptr[nlocal], rptr[nread_end],
+                       rptr[nwrite_start], rptr[nread_end],
+                       ring_next, ring_prev);
+    if (succ != kSuccess) {
+      // roll back to the nlocal states held before this pass
+      rptr.resize(nlocal + 1); chkpt.resize(rptr.back()); return succ;
+    }
+  }
+  {
+    // forward passing, passing state in forward direction of the ring
+    // (data is read from ring_prev and written to ring_next)
+    const int nlocal = static_cast<int>(rptr.size() - 1);
+    _assert(nlocal <= n + 1, "invalid local replica");
+    std::vector<int> msg_forward(n + 1);
+    msg_forward[0] = nlocal;
+    // forward passing of the request
+    ReturnType succ;
+    succ = RingPassing(BeginPtr(msg_forward),
+                       1 * sizeof(int), (n+1) * sizeof(int),
+                       0 * sizeof(int), n * sizeof(int),
+                       ring_prev, ring_next);
+    if (succ != kSuccess) return succ;
+    // msg_back[1] receives the nlocal value sent by ring_next
+    int msg_back[2];
+    msg_back[0] = nlocal;
+    succ = RingPassing(msg_back,
+                       1 * sizeof(int), 2 * sizeof(int),
+                       0 * sizeof(int), 1 * sizeof(int),
+                       ring_next, ring_prev);
+    if (succ != kSuccess) return succ;
+    // calculate the number of things we can read from prev link
+    int nread_end = nlocal, nwrite_end = 1;
+    // have to have itself in order to get other data from prev link
+    if (nlocal != 0) {
+      for (int i = 1; i <= n; ++i) {
+        // stop at the first preceding entry that reports zero states
+        if (msg_forward[i] == 0) break;
+        nread_end = std::max(nread_end, i + 1);
+        nwrite_end = i + 1;
+      }
+      if (nwrite_end > n) nwrite_end = n;
+    } else  {
+      nread_end = 0; nwrite_end = 0;
+    }
+    // gives the size of forward
+    int nwrite_start = std::min(msg_back[1] - 1, nwrite_end);
+    // next node miss the state of itself, cannot recover
+    if (nwrite_start < 0) nwrite_start = nwrite_end = 0;
+    // get the size of each segments
+    std::vector<size_t> sizes(nread_end);
+    for (int i = 0; i < nlocal; ++i) {
+      sizes[i] = rptr[i + 1] - rptr[i];
+    }
+    // pass size through the link, check consistency
+    succ = RingPassing(BeginPtr(sizes),
+                       nlocal * sizeof(size_t),
+                       nread_end * sizeof(size_t),
+                       nwrite_start * sizeof(size_t),
+                       nwrite_end * sizeof(size_t),
+                       ring_prev, ring_next);
+    if (succ != kSuccess) return succ;
+    // update rptr
+    rptr.resize(nread_end + 1);
+    for (int i = nlocal; i < nread_end; ++i) {
+      rptr[i + 1] = rptr[i] + sizes[i];
+    }
+    chkpt.resize(rptr.back());
+    // pass data through the link
+    succ = RingPassing(BeginPtr(chkpt), rptr[nlocal], rptr[nread_end],
+                       rptr[nwrite_start], rptr[nwrite_end],
+                       ring_prev, ring_next);
+    if (succ != kSuccess) {
+      // roll back to the nlocal states held before this pass
+      rptr.resize(nlocal + 1); chkpt.resize(rptr.back()); return succ;
+    }
+  }
+  return kSuccess;
+}
+/*!
+ * \brief try to checkpoint local state, this function is called in normal execution phase
+ *    of checkpoint that contains local state
+ *  the input state must contain exactly one saved state (local state of current node),
+ *  after complete, this function will get local state from previous num_local_replica nodes and put them
+ *  into local_chkpt and local_rptr
+ *
+ *  It is also OK to call TryRecoverLocalState instead,
+ *  TryRecoverLocalState makes less assumption about the input, and requires more communications
+ *
+ * \param p_local_rptr the pointer to the segment pointers in the states array
+ * \param p_local_chkpt the pointer to the storage of local check points
+ * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
+ * \sa ReturnType, TryRecoverLocalState
+ */
+AllreduceRobust::ReturnType
+AllreduceRobust::TryCheckinLocalState(std::vector<size_t> *p_local_rptr,
+                                      std::string *p_local_chkpt) {
+  // without local replicas there is nothing to exchange
+  if (num_local_replica == 0) return kSuccess;
+  std::vector<size_t> &offsets = *p_local_rptr;
+  std::string &states = *p_local_chkpt;
+  _assert(offsets.size() == 2, "TryCheckinLocalState must have exactly 1 state");
+  const int nreplica = num_local_replica;
+  const size_t elem = sizeof(size_t);
+  // slot 0 holds the size of our own state, slots 1..nreplica are
+  // filled in by the ring pass below
+  std::vector<size_t> seg_len(nreplica + 1);
+  seg_len[0] = offsets[1] - offsets[0];
+  // stream the segment sizes along the ring (read from prev, write to next)
+  ReturnType status = RingPassing(BeginPtr(seg_len),
+                                  1 * elem, (nreplica + 1) * elem,
+                                  0 * elem, nreplica * elem,
+                                  ring_prev, ring_next);
+  if (status != kSuccess) return status;
+  // turn the received sizes into segment offsets
+  offsets.resize(nreplica + 2);
+  for (int k = 1; k <= nreplica; ++k) {
+    offsets[k + 1] = offsets[k] + seg_len[k];
+  }
+  states.resize(offsets.back());
+  // stream the state bytes themselves along the same direction
+  status = RingPassing(BeginPtr(states),
+                       offsets[1], offsets[nreplica + 1],
+                       offsets[0], offsets[nreplica],
+                       ring_prev, ring_next);
+  if (status == kSuccess) return kSuccess;
+  // on failure shrink back to the single state that was passed in
+  offsets.resize(2);
+  states.resize(offsets.back());
+  return status;
+}
+/*!
+ * \brief perform a ring passing to receive data from prev link, and sent data to next link
+ *  this allows data to stream over a ring structure
+ *  sendrecvbuf[0:read_ptr] are already provided by current node
+ *  current node will recv sendrecvbuf[read_ptr:read_end] from prev link
+ *  current node will send sendrecvbuf[write_ptr:write_end] to next link
+ *  write_ptr will wait till the data is read before sending the data
+ *  this function requires read_end >= write_end
+ *
+ * \param sendrecvbuf_ the place to hold the incoming and outgoing data
+ * \param read_ptr the initial read pointer
+ * \param read_end the ending position to read
+ * \param write_ptr the initial write pointer
+ * \param write_end the ending position to write
+ * \param read_link pointer to link to previous position in ring
+ * \param write_link pointer to link of next position in ring
+ */
+AllreduceRobust::ReturnType
+AllreduceRobust::RingPassing(void *sendrecvbuf_,
+                             size_t read_ptr,
+                             size_t read_end,
+                             size_t write_ptr,
+                             size_t write_end,
+                             LinkRecord *read_link,
+                             LinkRecord *write_link) {
+  // nothing to do without both links or when there is nothing to read
+  if (read_link == NULL || write_link == NULL || read_end == 0) return kSuccess;
+  _assert(write_end <= read_end,
+                "RingPassing: boundary check1");
+  _assert(read_ptr <= read_end, "RingPassing: boundary check2");
+  _assert(write_ptr <= write_end, "RingPassing: boundary check3");
+  // take reference
+  LinkRecord &prev = *read_link, &next = *write_link;
+  // send recv buffer
+  char *buf = reinterpret_cast<char*>(sendrecvbuf_);
+  while (true) {
+    bool finished = true;
+    utils::PollHelper watcher;
+    if (read_ptr != read_end) {
+      watcher.WatchRead(prev.sock);
+      finished = false;
+    }
+    // only bytes below read_ptr are available, so sending has to wait
+    // until the read position is ahead of the write position
+    if (write_ptr < read_ptr && write_ptr != write_end) {
+      watcher.WatchWrite(next.sock);
+      finished = false;
+    }
+    watcher.WatchException(prev.sock);
+    watcher.WatchException(next.sock);
+    if (finished) break;
+    watcher.Poll();
+    if (watcher.CheckExcept(prev.sock)) return ReportError(&prev, kGetExcept);
+    if (watcher.CheckExcept(next.sock)) return ReportError(&next, kGetExcept);
+    if (read_ptr != read_end && watcher.CheckRead(prev.sock)) {
+      ssize_t len = prev.sock.Recv(buf + read_ptr, read_end - read_ptr);
+      // a zero length read is reported as kRecvZeroLen after closing the socket
+      if (len == 0) {
+        prev.sock.Close(); return ReportError(&prev, kRecvZeroLen);
+      }
+      if (len != -1) {
+        read_ptr += static_cast<size_t>(len);
+      } else {
+        ReturnType ret = Errno2Return();
+        if (ret != kSuccess) return ReportError(&prev, ret);
+      }
+    }
+    if (write_ptr != write_end && write_ptr < read_ptr) {
+      // never send past write_end nor past what has been made available
+      size_t nsend = std::min(write_end - write_ptr, read_ptr - write_ptr);
+      ssize_t len = next.sock.Send(buf + write_ptr, nsend);
+      if (len != -1) {
+        write_ptr += static_cast<size_t>(len);
+      } else {
+        ReturnType ret = Errno2Return();
+        // the send failed on the outgoing link, so the error is reported
+        // against next rather than prev
+        if (ret != kSuccess) return ReportError(&next, ret);
+      }
+    }
+  }
+  return kSuccess;
+}
+}  // namespace engine
+}  // namespace rabit
diff --git a/src/allreduce_robust.h b/src/allreduce_robust.h
new file mode 100644
index 0000000..a4bee7c
--- /dev/null
+++ b/src/allreduce_robust.h
@@ -0,0 +1,672 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file allreduce_robust.h
+ * \brief Robust implementation of Allreduce
+ *   using TCP non-block socket and tree-shape reduction.
+ *
+ *   This implementation considers the failure of nodes
+ *
+ * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
+ */
+#ifndef RABIT_ALLREDUCE_ROBUST_H_
+#define RABIT_ALLREDUCE_ROBUST_H_
+#include <future>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include "rabit/internal/engine.h"
+#include "allreduce_base.h"
+
+namespace rabit {
+namespace engine {
+/*! \brief implementation of fault tolerant all reduce engine */
+class AllreduceRobust : public AllreduceBase {
+ public:
+  AllreduceRobust(void);
+  virtual ~AllreduceRobust(void) {}
+  // initialize the manager
+  virtual bool Init(int argc, char* argv[]);
+  /*! \brief shutdown the engine */
+  virtual bool Shutdown(void);
+  /*!
+   * \brief set parameters to the engine
+   * \param name parameter name
+   * \param val parameter value
+   */
+  virtual void SetParam(const char *name, const char *val);
+  /*!
+   * \brief perform immutable local bootstrap cache insertion
+   * \param key unique cache key
+   * \param buf buffer of allreduce/robust payload to copy
+   * \param type_nbytes,count number of bytes per element and number of elements in buf
+   * \return -1 if no recovery cache fetched otherwise 0
+   */
+  int SetBootstrapCache(const std::string &key, const void *buf,
+    const size_t type_nbytes, const size_t count);
+  /*!
+   * \brief perform bootstrap cache lookup if nodes in fault recovery
+   * \param key unique cache key
+   * \param buf buffer for recv allreduce/robust payload
+   * \param type_nbytes,count number of bytes per element and number of elements in buf
+   */
+  int GetBootstrapCache(const std::string &key, void *buf, const size_t type_nbytes,
+    const size_t count);
+  /*!
+   * \brief internal Allgather function, each node has a segment of data in the ring of sendrecvbuf,
+   *  the data provided by current node k is [slice_begin, slice_end),
+   *  the next node's segment must start with slice_end
+   *  after the call of Allgather, sendrecvbuf_ contains all the contents including all segments
+   *  use a ring based algorithm
+   *
+   * \param sendrecvbuf_ buffer for both sending and receiving data, it is a ring conceptually
+   * \param total_size total size of data to be gathered
+   * \param slice_begin beginning of the current slice
+   * \param slice_end end of the current slice
+   * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
+   * \param _file caller file name used to generate unique cache key
+   * \param _line caller line number used to generate unique cache key
+   * \param _caller caller function name used to generate unique cache key
+   */
+  virtual void Allgather(void *sendrecvbuf_, size_t total_size,
+                              size_t slice_begin,
+                              size_t slice_end,
+                              size_t size_prev_slice,
+                              const char* _file = _FILE,
+                              const int _line = _LINE,
+                              const char* _caller = _CALLER);
+  /*!
+   * \brief perform in-place allreduce, on sendrecvbuf
+   *        this function is NOT thread-safe
+   * \param sendrecvbuf_ buffer for both sending and recving data
+   * \param type_nbytes the unit number of bytes the type have
+   * \param count number of elements to be reduced
+   * \param reducer reduce function
+   * \param prepare_fun Lazy preprocessing function; prepare_fun(prepare_arg)
+   *                    will be called by the function before performing Allreduce,
+   *                    to initialize the data in sendrecvbuf_.
+   *                    If the result of Allreduce can be recovered directly, then prepare_fun will NOT be called
+   * \param prepare_arg argument passed into the lazy preprocessing function
+   * \param _file caller file name used to generate unique cache key
+   * \param _line caller line number used to generate unique cache key
+   * \param _caller caller function name used to generate unique cache key
+   */
+  virtual void Allreduce(void *sendrecvbuf_,
+                         size_t type_nbytes,
+                         size_t count,
+                         ReduceFunction reducer,
+                         PreprocFunction prepare_fun = NULL,
+                         void *prepare_arg = NULL,
+                         const char* _file = _FILE,
+                         const int _line = _LINE,
+                         const char* _caller = _CALLER);
+  /*!
+   * \brief broadcast data from root to all nodes
+   * \param sendrecvbuf_ buffer for both sending and recving data
+   * \param total_size the size of the data to be broadcasted
+   * \param root the root worker id to broadcast the data
+   * \param _file caller file name used to generate unique cache key
+   * \param _line caller line number used to generate unique cache key
+   * \param _caller caller function name used to generate unique cache key
+   */
+  virtual void Broadcast(void *sendrecvbuf_, size_t total_size, int root,
+                         const char* _file = _FILE,
+                         const int _line = _LINE,
+                         const char* _caller = _CALLER);
+  /*!
+   * \brief load latest check point
+   * \param global_model pointer to the globally shared model/state
+   *   when calling this function, the caller needs to guarantee that global_model
+   *   is the same in all nodes
+   * \param local_model pointer to local model, that is specific to current node/rank
+   *   this can be NULL when no local model is needed
+   *
+   * \return the version number of check point loaded
+   *     if returned version == 0, this means no model has been CheckPointed
+   *     the p_model is not touched, user should do necessary initialization by themselves
+   *
+   *   Common usage example:
+   *      int iter = rabit::LoadCheckPoint(&model);
+   *      if (iter == 0) model.InitParameters();
+   *      for (i = iter; i < max_iter; ++i) {
+   *        do many things, include allreduce
+   *        rabit::CheckPoint(model);
+   *      }
+   *
+   * \sa CheckPoint, VersionNumber
+   */
+  virtual int LoadCheckPoint(Serializable *global_model,
+                             Serializable *local_model = NULL);
+  /*!
+   * \brief checkpoint the model, marking the end of a stage of execution;
+   *  each call to CheckPoint increases the version number by one
+   *
+   * \param global_model pointer to the globally shared model/state
+   *   when calling this function, the caller needs to guarantee that global_model
+   *   is the same in all nodes
+   * \param local_model pointer to local model, that is specific to current node/rank
+   *   this can be NULL when no local state is needed
+   *
+   * NOTE: local_model requires explicit replication of the model for fault-tolerance, which
+   *       adds replication cost to CheckPoint. global_model does not need explicit replication,
+   *       so only CheckPoint with global_model if possible
+   *
+   * \sa LoadCheckPoint, VersionNumber
+   */
+  virtual void CheckPoint(const Serializable *global_model,
+                          const Serializable *local_model = NULL) {
+    CheckPoint_(global_model, local_model, false);  // false: not a lazy checkpoint
+  }
+  /*!
+   * \brief This function can be used to replace CheckPoint for global_model only,
+   *   when a certain condition is met (see the detailed explanation below).
+   *
+   *   This is a "lazy" checkpoint such that only the pointer to global_model is
+   *   remembered and no memory copy is taken. To use this function, the user MUST ensure that:
+   *   The global_model must remain unchanged until the last call of Allreduce/Broadcast in current version finishes.
+   *   In other words, global_model can be changed only between the last call of
+   *   Allreduce/Broadcast and LazyCheckPoint in current version
+   *
+   *   For example, suppose the calling sequence is:
+   *   LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint
+   *
+   *   If the user only changes global_model in code3, then LazyCheckPoint can be used to
+   *   improve efficiency of the program.
+   * \param global_model pointer to the globally shared model/state
+   *   when calling this function, the caller needs to guarantee that global_model
+   *   is the same in all nodes
+   * \sa LoadCheckPoint, CheckPoint, VersionNumber
+   */
+  virtual void LazyCheckPoint(const Serializable *global_model) {
+    CheckPoint_(global_model, NULL, true);  // no local model, lazy mode on
+  }
+  /*!
+   * \brief explicitly re-init everything before calling LoadCheckPoint;
+   *    call this after IEngine throws an exception; only used for test purposes
+   */
+  virtual void InitAfterException(void) {
+    // simple approach: close every link whose socket is not reported bad, then reconnect
+    for (size_t k = 0; k != all_links.size(); ++k) {
+      auto &link = all_links[k];
+      if (!link.sock.BadSocket()) link.sock.Close();
+    }
+    ReConnectLinks("recover");
+  }
+
+ protected:
+  // constant one byte out of band message to indicate error happening
+  // and mark for channel cleanup
+  static const char kOOBReset = 95;
+  // and mark for channel cleanup, after OOB signal
+  static const char kResetMark = 97;
+  // and mark for channel cleanup
+  static const char kResetAck = 97;
+  /*! \brief type of roles each node can play during recovery */
+  enum RecoverType {
+    /*! \brief current node have data */
+    kHaveData = 0,
+    /*! \brief current node request data */
+    kRequestData = 1,
+    /*! \brief current node only helps to pass data around */
+    kPassData = 2
+  };
+  /*! \brief selects which of the two sequence codes of ActionSummary is queried */
+  enum SeqType {
+    /*! \brief apply to rabit seq code */
+    kSeq = 0,
+    /*! \brief apply to rabit cache seq code */
+    kCache = 1
+  };
+  /*!
+   * \brief summary of actions proposed in all nodes
+   *  this data structure is used to make consensus decision
+   *  about next action to take in the recovery mode
+   */
+  struct ActionSummary {
+    // maximum allowed sequence id
+    static const u_int32_t kSpecialOp = (1 << 26);
+    // special sequence number for local state checkpoint
+    static const u_int32_t kLocalCheckPoint = (1 << 26) - 2;
+    // special sequence number for local state checkpoint ack signal
+    static const u_int32_t kLocalCheckAck = (1 << 26) - 1;
+    //---------------------------------------------
+    // The following are bit masks of the flags kept in the low 5 bits of a code
+    //----------------------------------------------
+    // some node want to load check point
+    static const int kLoadCheck = 1;
+    // some node want to do check point
+    static const int kCheckPoint = 2;
+    // check point Ack, we use a two phase message in check point,
+    // this is the second phase of check pointing
+    static const int kCheckAck = 4;
+    // the nodes proposed different sequence numbers,
+    // this means we want to do recover execution of the lower sequence
+    // action instead of normal execution
+    static const int kDiffSeq = 8;
+    // there are nodes request load cache
+    static const int kLoadBootstrapCache = 16;
+    // default constructor: sequence number 0 and no flags in both codes
+    ActionSummary(void) : seqcode(0), maxseqcode(0) {}
+    // constructor of action: each code packs (sequence number << 5) | flags
+    explicit ActionSummary(int seqno_flag, int cache_flag = 0,
+      u_int32_t minseqno = kSpecialOp, u_int32_t maxseqno = kSpecialOp) {
+      seqcode = (minseqno << 5) | seqno_flag;
+      maxseqcode = (maxseqno << 5) | cache_flag;
+    }
+    // minimum number of all operations by default
+    // maximum number of all cache operations otherwise
+    inline u_int32_t seqno(SeqType t = SeqType::kSeq) const {
+      // shift the unsigned code: kSpecialOp << 5 sets bit 31, a signed shift would sign-extend
+      return code(t) >> 5;
+    }
+    // whether the operation set contains a load_check
+    inline bool load_check(SeqType t = SeqType::kSeq) const {
+      return (code(t) & kLoadCheck) != 0;
+    }
+    // whether the operation set contains a load_cache
+    inline bool load_cache(SeqType t = SeqType::kSeq) const {
+      return (code(t) & kLoadBootstrapCache) != 0;
+    }
+    // whether the operation set contains a check point
+    inline bool check_point(SeqType t = SeqType::kSeq) const {
+      return (code(t) & kCheckPoint) != 0;
+    }
+    // whether the operation set contains a check ack
+    inline bool check_ack(SeqType t = SeqType::kSeq) const {
+      return (code(t) & kCheckAck) != 0;
+    }
+    // whether the operation set contains different sequence number
+    inline bool diff_seq() const {
+      return (seqcode & kDiffSeq) != 0;
+    }
+    // returns the operation flag of the result
+    inline int flag(SeqType t = SeqType::kSeq) const {
+      return static_cast<int>(code(t) & 31);
+    }
+    // print flags in user friendly way
+    inline void print_flags(int rank, std::string prefix ) {
+      // %lu expects unsigned long, so widen the 32-bit sequence numbers before printing
+      const unsigned long seq = seqno(), cache_seq = seqno(SeqType::kCache);  // NOLINT(*)
+      utils::HandleLogInfo("[%d] %s - |%lu|%d|%d|%d|%d| - |%lu|%d|\n",
+                    rank, prefix.c_str(),
+                    seq, check_point(), check_ack(), load_cache(),
+                    diff_seq(), cache_seq, load_cache(SeqType::kCache));
+    }
+    // reducer for Allreduce, get the result ActionSummary from all nodes
+    inline static void Reducer(const void *src_, void *dst_,
+                               int len, const MPI::Datatype &dtype) {
+      const ActionSummary *src = static_cast<const ActionSummary*>(src_);
+      ActionSummary *dst = static_cast<ActionSummary*>(dst_);
+      for (int i = 0; i < len; ++i) {
+        u_int32_t min_seqno = std::min(src[i].seqno(), dst[i].seqno());
+        u_int32_t max_seqno = std::max(src[i].seqno(SeqType::kCache),
+          dst[i].seqno(SeqType::kCache));
+        int action_flag = src[i].flag() | dst[i].flag();
+        // if any node is not requester set to 0 otherwise 1
+        int role_flag = src[i].flag(SeqType::kCache) & dst[i].flag(SeqType::kCache);
+        // if seqno is different in src and destination
+        int seq_diff_flag = src[i].seqno() != dst[i].seqno() ? kDiffSeq : 0;
+        // merge the seq diff flag into the action flags and repack both codes
+        dst[i] = ActionSummary(action_flag | seq_diff_flag, role_flag, min_seqno, max_seqno);
+      }
+    }
+
+   private:
+    // the packed code selected by t: seqcode for kSeq, maxseqcode otherwise
+    inline u_int32_t code(SeqType t) const {
+      return t == SeqType::kSeq ? seqcode : maxseqcode;
+    }
+    // internal sequence code min of rabit seqno
+    u_int32_t seqcode;
+    // internal sequence code max of cache seqno
+    u_int32_t maxseqcode;
+  };
+  /*! \brief buffer that remembers the results of Bcast and Allreduce calls, keyed by seqid */
+  class ResultBuffer{
+   public:
+    // constructor, starts from the empty state
+    ResultBuffer(void) { Clear(); }
+    // forget every stored record; rptr_ keeps its leading 0 offset
+    inline void Clear(void) {
+      seqno_.clear();
+      size_.clear();
+      data_.clear();
+      rptr_.assign(1, static_cast<size_t>(0));
+    }
+    // reserve temporary space past the last stored record and return its address
+    inline void *AllocTemp(size_t type_nbytes, size_t count) {
+      const size_t nword = NumWords(type_nbytes * count);
+      utils::Assert(nword != 0, "cannot allocate 0 size memory");
+      // grow the storage by nword 64-bit words
+      data_.resize(rptr_.back() + nword);
+      return BeginPtr(data_) + rptr_.back();
+    }
+    // record the result currently in the temporary space under seqid
+    inline void PushTemp(int seqid, size_t type_nbytes, size_t count) {
+      const size_t nbyte = type_nbytes * count;
+      if (!seqno_.empty()) {
+        utils::Assert(seqno_.back() < seqid, "PushTemp seqid inconsistent");
+      }
+      seqno_.push_back(seqid);
+      rptr_.push_back(rptr_.back() + NumWords(nbyte));
+      size_.push_back(nbyte);
+      utils::Assert(data_.size() == rptr_.back(), "PushTemp inconsistent");
+    }
+    // return the stored result of seqid and set *p_size to its byte size, NULL if absent
+    inline void* Query(int seqid, size_t *p_size) {
+      auto hit = std::lower_bound(seqno_.begin(), seqno_.end(), seqid);
+      if (hit == seqno_.end() || *hit != seqid) return NULL;
+      const size_t idx = hit - seqno_.begin();
+      *p_size = size_[idx];
+      return BeginPtr(data_) + rptr_[idx];
+    }
+    // drop last stored result
+    inline void DropLast(void) {
+      utils::Assert(!seqno_.empty(), "there is nothing to be dropped");
+      seqno_.pop_back();
+      size_.pop_back();
+      rptr_.pop_back();
+      data_.resize(rptr_.back());
+    }
+    // the sequence number of last stored result, -1 when nothing is stored
+    inline int LastSeqNo(void) const {
+      return seqno_.empty() ? -1 : seqno_.back();
+    }
+
+   private:
+    // number of 64-bit words needed to hold nbyte bytes (rounded up)
+    inline static size_t NumWords(size_t nbyte) {
+      return (nbyte + sizeof(uint64_t) - 1) / sizeof(uint64_t);
+    }
+    // sequence number of each stored record, kept in increasing order
+    std::vector<int> seqno_;
+    // offsets into data_: record i occupies data_[rptr_[i]:rptr_[i+1]]
+    std::vector<size_t> rptr_;
+    // actual size in bytes of each record
+    std::vector<size_t> size_;
+    // content of the buffer
+    std::vector<uint64_t> data_;
+  };
+  /*!
+   * \brief internal consistency check function,
+   *  use check to ensure user always call CheckPoint/LoadCheckPoint
+   *  with or without local but not both, this function will set the approperiate settings
+   *  in the first call of LoadCheckPoint/CheckPoint
+   *
+   * \param with_local whether the user calls CheckPoint with local model
+   */
+  void LocalModelCheck(bool with_local);
+  /*!
+   * \brief internal implementation of checkpoint, support both lazy and normal way
+   *
+   * \param global_model pointer to the globally shared model/state
+   *   when calling this function, the caller needs to guarantee that global_model
+   *   is the same in all nodes
+   * \param local_model pointer to local model, that is specific to current node/rank
+   *   this can be NULL when no local state is needed
+   * \param lazy_checkpt whether the action is lazy checkpoint
+   *
+   * \sa CheckPoint, LazyCheckPoint
+   */
+  void CheckPoint_(const Serializable *global_model,
+                   const Serializable *local_model,
+                   bool lazy_checkpt);
+  /*!
+   * \brief reset the all the existing links by sending Out-of-Band message marker
+   *  after this function finishes, all the messages received and sent
+   *  before in all live links are discarded,
+   *  This allows us to get a fresh start after error has happened
+   *
+   *  TODO(tqchen): this function is not yet functional and is not used by the engine;
+   *   a simple reset-link and reconnect strategy is used instead
+   *
+   * \return this function can return kSuccess or kSockError
+   *         when kSockError is returned, it simply means there are bad sockets in the links,
+   *         and some link recovery procedure is needed
+   */
+  ReturnType TryResetLinks(void);
+  /*!
+   * \brief if err_type indicates an error
+   *         recover links according to the error type reported
+   *        if there is no error, return true
+   * \param err_type the type of error happening in the system
+   * \return true if err_type is kSuccess, false otherwise
+   */
+  bool CheckAndRecover(ReturnType err_type);
+  /*!
+   * \brief try to run recover execution for a request action described by flag and seqno,
+   *        the function will keep blocking to run possible recovery operations before the specified action,
+   *        until the requested result is received by a recovering procedure,
+   *        or the function discovers that the requested action is not yet executed, and return false
+   *
+   * \param buf the buffer to store the result
+   * \param size the total size of the buffer
+   * \param flag flag information about the action \sa ActionSummary
+   * \param seqno sequence number of the action, if it is special action with flag set,
+   *        seqno needs to be set to ActionSummary::kSpecialOp
+   *
+   * \return this function can return true or false
+   *    - true means buf has already been set to the
+   *           result by the recovery procedure, the action is complete, no further action is needed
+   *    - false means this is the latest action that has not yet been executed, need to execute the action
+   */
+  bool RecoverExec(void *buf, size_t size, int flag,
+    int seqno = ActionSummary::kSpecialOp,
+    int cacheseqno = ActionSummary::kSpecialOp,
+    const char* caller = _CALLER);
+  /*!
+   * \brief try to load check point
+   *
+   *        This is a collaborative function called by all nodes
+   *        only the nodes with requester set to true really needs to load the check point
+   *        other nodes acts as collaborative roles to complete this request
+   *
+   * \param requester whether current node is the requester
+   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
+   * \sa ReturnType
+   */
+  ReturnType TryLoadCheckPoint(bool requester);
+
+  /*!
+   * \brief try to load cache
+   *
+   *        This is a collaborative function called by all nodes
+   *        only the nodes with requester set to true really needs to load the check point
+   *        other nodes acts as collaborative roles to complete this request
+   * \param requester whether current node is the requester
+   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
+   * \sa ReturnType
+   */
+  ReturnType TryRestoreCache(bool requester, const int min_seq = ActionSummary::kSpecialOp,
+    const int max_seq = ActionSummary::kSpecialOp);
+  /*!
+   * \brief try to get the result of operation specified by seqno
+   *
+   *        This is a collaborative function called by all nodes
+   *        only the nodes with requester set to true really needs to get the result
+   *        other nodes acts as collaborative roles to complete this request
+   *
+   * \param buf the buffer to store the result, this parameter is only used when current node is requester
+   * \param size the total size of the buffer, this parameter is only used when current node is requester
+   * \param seqno sequence number of the operation, this is unique index of a operation in current iteration
+   * \param requester whether current node is the requester
+   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
+   * \sa ReturnType
+   */
+  ReturnType TryGetResult(void *buf, size_t size, int seqno, bool requester);
+  /*!
+   * \brief try to decide the routing strategy for recovery
+   * \param role the current role of the node
+   * \param p_size used to store the size of the message, for node in state kHaveData,
+   *               this size must be set correctly before calling the function
+   *               for others, this serves as output parameter
+   *
+   * \param p_recvlink used to store the link current node should recv data from, if necessary
+   *          this can be -1, which means current node have the data
+   * \param p_req_in used to store the resulting vector, indicating which link we should send the data to
+   *
+   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
+   * \sa ReturnType, TryRecoverData
+   */
+  ReturnType TryDecideRouting(RecoverType role,
+                              size_t *p_size,
+                              int *p_recvlink,
+                              std::vector<bool> *p_req_in);
+  /*!
+   * \brief try to finish the data recovery request,
+   *        this function is used together with TryDecideRouting
+   * \param role the current role of the node
+   * \param sendrecvbuf_ the buffer to store the data to be sent/recived
+   *          - if the role is kHaveData, this stores the data to be sent
+   *          - if the role is kRequestData, this is the buffer to store the result
+   *          - if the role is kPassData, this will not be used, and can be NULL
+   * \param size the size of the data, obtained from TryDecideRouting
+   * \param recv_link the link index to receive data, if necessary, obtained from TryDecideRouting
+   * \param req_in the request of each link to send data, obtained from TryDecideRouting
+   *
+   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
+   * \sa ReturnType, TryDecideRouting
+   */
+  ReturnType TryRecoverData(RecoverType role,
+                            void *sendrecvbuf_,
+                            size_t size,
+                            int recv_link,
+                            const std::vector<bool> &req_in);
+  /*!
+   * \brief try to recover the local state, making each local state to be the result of itself
+   *        plus replication of states in previous num_local_replica hops in the ring
+   *
+   * The input parameters must contain the valid local states available in current nodes,
+   * This function tries its best to "complete" the missing parts of local_rptr and local_chkpt
+   * If there is sufficient information in the ring, when the function returns, local_chkpt will
+   * contain num_local_replica + 1 checkpoints (including the chkpt of this node)
+   * If there is not sufficient information in the ring, the number of checkpoints
+   * will be less than the specified value
+   *
+   * \param p_local_rptr the pointer to the segment pointers in the states array
+   * \param p_local_chkpt the pointer to the storage of local check points
+   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
+   * \sa ReturnType
+   */
+  ReturnType TryRecoverLocalState(std::vector<size_t> *p_local_rptr,
+                                  std::string *p_local_chkpt);
+  /*!
+   * \brief try to checkpoint local state, this function is called in normal execution phase
+   *    of checkpoint that contains local state
+   *  the input state must contain exactly one saved state (local state of current node),
+   *  after complete, this function will get local state from previous num_local_replica nodes and put them
+   *  into local_chkpt and local_rptr
+   *
+   *  It is also OK to call TryRecoverLocalState instead,
+   *  TryRecoverLocalState makes less assumption about the input, and requires more communications
+   *
+   * \param p_local_rptr the pointer to the segment pointers in the states array
+   * \param p_local_chkpt the pointer to the storage of local check points
+   * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details
+   * \sa ReturnType, TryRecoverLocalState
+   */
+  ReturnType TryCheckinLocalState(std::vector<size_t> *p_local_rptr,
+                                  std::string *p_local_chkpt);
+  /*!
+   * \brief perform a ring passing to receive data from prev link, and sent data to next link
+   *  this allows data to stream over a ring structure
+   *  sendrecvbuf[0:read_ptr] are already provided by current node
+   *  current node will recv sendrecvbuf[read_ptr:read_end] from prev link
+   *  current node will send sendrecvbuf[write_ptr:write_end] to next link
+   *  write_ptr will wait till the data is read before sending the data
+   *  this function requires read_end >= write_end
+   *
+   * \param sendrecvbuf_ the place to hold the incoming and outgoing data
+   * \param read_ptr the initial read pointer
+   * \param read_end the ending position to read
+   * \param write_ptr the initial write pointer
+   * \param write_end the ending position to write
+   * \param read_link pointer to link to previous position in ring
+   * \param write_link pointer to link of next position in ring
+   */
+  ReturnType RingPassing(void *senrecvbuf_,
+                         size_t read_ptr,
+                         size_t read_end,
+                         size_t write_ptr,
+                         size_t write_end,
+                         LinkRecord *read_link,
+                         LinkRecord *write_link);
+  /*!
+   * \brief run message passing algorithm on the allreduce tree
+   *        the result is edge message stored in p_edge_in and p_edge_out
+   * \param node_value the value associated with current node
+   * \param p_edge_in used to store input message from each of the edge
+   * \param p_edge_out used to store output message from each of the edge
+   * \param func a function that defines the message passing rule
+   *        Parameters of func:
+   *           - node_value same as node_value in the main function
+   *           - edge_in the array of input messages from each edge,
+   *                     this includes the output edge, which should be excluded
+   *           - out_index array the index of output edge, the function should
+   *                       exclude the output edge when compute the message passing value
+   *        Return of func:
+   *           the function returns the output message based on the input message and node_value
+   *
+   * \tparam EdgeType type of edge message, must be simple struct
+   * \tparam NodeType type of node value
+   */
+  template<typename NodeType, typename EdgeType>
+  inline ReturnType MsgPassing(const NodeType &node_value,
+                               std::vector<EdgeType> *p_edge_in,
+                               std::vector<EdgeType> *p_edge_out,
+                               EdgeType(*func)
+                               (const NodeType &node_value,
+                                const std::vector<EdgeType> &edge_in,
+                                size_t out_index));
+  //---- recovery data structure ----
+  // the round of result buffer, used to mode the result
+  int result_buffer_round;
+  // result buffer of all reduce
+  ResultBuffer resbuf;
+  // current cached allreduce/broadcast sequence number
+  int cur_cache_seq;
+  // result buffer of cached all reduce
+  ResultBuffer cachebuf;
+  // key of each cache entry
+  ResultBuffer lookupbuf;
+  // last check point global model
+  std::string global_checkpoint;
+  // lazy checkpoint of global model
+  const Serializable *global_lazycheck;
+  // number of replica for local state/model
+  int num_local_replica;
+  // number of default local replica
+  int default_local_replica;
+  // flag to decide whether local model is used, -1: unknown, 0: no, 1:yes
+  int use_local_model;
+  // number of replica for global state/model
+  int num_global_replica;
+  // number of times recovery happens
+  int recover_counter;
+  // --- recovery data structure for local checkpoint
+  // there are two versions of the data structure,
+  // at one time one version is valid and another is used as temp memory
+  // pointer to memory position in the local model
+  // local model is stored in CSR format(like a sparse matrices)
+  // local_model[rptr[0]:rptr[1]] stores the model of current node
+  // local_model[rptr[k]:rptr[k+1]] stores the model of node in previous k hops
+  std::vector<size_t> local_rptr[2];
+  // storage for local model replicas
+  std::string local_chkpt[2];
+  // version of local checkpoint can be 1 or 0
+  int local_chkpt_version;
+  // if checkpoint was loaded, used to distinguish results of bootstrap cache from seqno cache
+  bool checkpoint_loaded;
+  // sidecar executing timeout task
+  std::future<bool> rabit_timeout_task;
+  // flag to shutdown rabit_timeout_task before timeout
+  std::atomic<bool> shutdown_timeout{false};
+  // error handler
+  void (* _error)(const char *fmt, ...) = utils::Error;
+  // assert handler
+  void (* _assert)(bool exp, const char *fmt, ...) = utils::Assert;
+};
+}  // namespace engine
+}  // namespace rabit
+// implementation of inline template function
+#include "./allreduce_robust-inl.h"
+#endif  // RABIT_ALLREDUCE_ROBUST_H_
diff --git a/src/c_api.cc b/src/c_api.cc
new file mode 100644
index 0000000..d853319
--- /dev/null
+++ b/src/c_api.cc
@@ -0,0 +1,344 @@
+// Copyright by Contributors
+// implementations in ctypes
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+
+#include <cstring>
+#include <string>
+#include "rabit/rabit.h"
+#include "rabit/c_api.h"
+
+namespace rabit {
+namespace c_api {
+// Type-dispatch helper used by Allreduce_ for floating point buffers.
+// The primary template simply forwards to rabit::Allreduce<OP>; the
+// op::BitOR specialization below reports an error instead of instantiating
+// rabit::Allreduce<op::BitOR> for DType.
+template<typename OP, typename DType>
+struct FHelper {
+  static void Allreduce(DType *senrecvbuf_, size_t count,
+                        void (*prepare_fun)(void *arg),
+                        void *prepare_arg) {
+    rabit::Allreduce<OP>(senrecvbuf_, count, prepare_fun, prepare_arg);
+  }
+};
+
+// Specialization selected when the reduction is op::BitOR: no reduction is
+// performed, the call only raises utils::Error.
+template<typename DType>
+struct FHelper<op::BitOR, DType> {
+  static void Allreduce(DType *senrecvbuf_, size_t count,
+                        void (*prepare_fun)(void *arg),
+                        void *prepare_arg) {
+    utils::Error("DataType does not support bitwise or operation");
+  }
+};
+
+// Typed dispatch for one reduction operator OP.
+// sendrecvbuf_ is cast to the element type selected by enum_dtype and
+// handed to rabit::Allreduce<OP>; count, prepare_fun and prepare_arg are
+// forwarded unchanged.
+// A type tag that matches none of the cases below is reported through
+// utils::Error.
+template<typename OP>
+void Allreduce_(void *sendrecvbuf_,
+                size_t count,
+                engine::mpi::DataType enum_dtype,
+                void (*prepare_fun)(void *arg),
+                void *prepare_arg) {
+  using namespace engine::mpi;
+  switch (enum_dtype) {
+    case kChar:
+      rabit::Allreduce<OP>(static_cast<char*>(sendrecvbuf_),
+                           count, prepare_fun, prepare_arg);
+      break;
+    case kUChar:
+      rabit::Allreduce<OP>(static_cast<unsigned char*>(sendrecvbuf_),
+                           count, prepare_fun, prepare_arg);
+      break;
+    case kInt:
+      rabit::Allreduce<OP>(static_cast<int*>(sendrecvbuf_),
+                           count, prepare_fun, prepare_arg);
+      break;
+    case kUInt:
+      rabit::Allreduce<OP>(static_cast<unsigned*>(sendrecvbuf_),
+                           count, prepare_fun, prepare_arg);
+      break;
+    case kLong:
+      rabit::Allreduce<OP>(static_cast<long*>(sendrecvbuf_),  // NOLINT(*)
+                           count, prepare_fun, prepare_arg);
+      break;
+    case kULong:
+      rabit::Allreduce<OP>(static_cast<unsigned long*>(sendrecvbuf_),  // NOLINT(*)
+                           count, prepare_fun, prepare_arg);
+      break;
+    // float and double go through FHelper, which has a dedicated
+    // specialization for op::BitOR
+    case kFloat:
+      FHelper<OP, float>::Allreduce(static_cast<float*>(sendrecvbuf_),
+                                    count, prepare_fun, prepare_arg);
+      break;
+    case kDouble:
+      FHelper<OP, double>::Allreduce(static_cast<double*>(sendrecvbuf_),
+                                     count, prepare_fun, prepare_arg);
+      break;
+    default: utils::Error("unknown data_type");
+  }
+}
+// Untyped allreduce entry used by the C API: enum_op selects the reduction
+// operator and the call is passed on to the matching Allreduce_<OP>
+// instantiation, which in turn dispatches on enum_dtype.
+// An operator tag that matches no case is reported through utils::Error.
+void Allreduce(void *sendrecvbuf,
+               size_t count,
+               engine::mpi::DataType enum_dtype,
+               engine::mpi::OpType enum_op,
+               void (*prepare_fun)(void *arg),
+               void *prepare_arg) {
+  using namespace engine::mpi;
+  switch (enum_op) {
+    case kMax:
+      Allreduce_<op::Max>(
+          sendrecvbuf, count, enum_dtype,
+          prepare_fun, prepare_arg);
+      break;
+    case kMin:
+      Allreduce_<op::Min>(
+          sendrecvbuf, count, enum_dtype,
+          prepare_fun, prepare_arg);
+      break;
+    case kSum:
+      Allreduce_<op::Sum>(
+          sendrecvbuf, count, enum_dtype,
+          prepare_fun, prepare_arg);
+      break;
+    case kBitwiseOR:
+      Allreduce_<op::BitOR>(
+          sendrecvbuf, count, enum_dtype,
+          prepare_fun, prepare_arg);
+      break;
+    default: utils::Error("unknown enum_op");
+  }
+}
+// Helper for Allgather below: views the buffer as DType, scales the
+// element-based sizes and offsets by sizeof(DType) and forwards them to
+// rabit::Allgather.
+template<typename DType>
+void AllgatherAs(void *sendrecvbuf_, size_t total_size, size_t beginIndex,
+                 size_t size_node_slice, size_t size_prev_slice) {
+  const size_t type_size = sizeof(DType);
+  rabit::Allgather(static_cast<DType*>(sendrecvbuf_),
+                   total_size * type_size,
+                   beginIndex * type_size,
+                   (beginIndex + size_node_slice) * type_size,
+                   size_prev_slice * type_size);
+}
+
+// Allgather on an untyped buffer: enum_dtype selects the element type used
+// to scale the slice description; a tag that matches no case below is
+// reported through utils::Error.
+void Allgather(void *sendrecvbuf_,
+               size_t total_size,
+               size_t beginIndex,
+               size_t size_node_slice,
+               size_t size_prev_slice,
+               int enum_dtype) {
+  using namespace engine::mpi;
+  switch (enum_dtype) {
+    case kChar:
+      AllgatherAs<char>(sendrecvbuf_, total_size, beginIndex,
+                        size_node_slice, size_prev_slice);
+      break;
+    case kUChar:
+      AllgatherAs<unsigned char>(sendrecvbuf_, total_size, beginIndex,
+                                 size_node_slice, size_prev_slice);
+      break;
+    case kInt:
+      AllgatherAs<int>(sendrecvbuf_, total_size, beginIndex,
+                       size_node_slice, size_prev_slice);
+      break;
+    case kUInt:
+      AllgatherAs<unsigned>(sendrecvbuf_, total_size, beginIndex,
+                            size_node_slice, size_prev_slice);
+      break;
+    case kLong:
+      AllgatherAs<int64_t>(sendrecvbuf_, total_size, beginIndex,
+                           size_node_slice, size_prev_slice);
+      break;
+    case kULong:
+      AllgatherAs<uint64_t>(sendrecvbuf_, total_size, beginIndex,
+                            size_node_slice, size_prev_slice);
+      break;
+    case kFloat:
+      AllgatherAs<float>(sendrecvbuf_, total_size, beginIndex,
+                         size_node_slice, size_prev_slice);
+      break;
+    case kDouble:
+      AllgatherAs<double>(sendrecvbuf_, total_size, beginIndex,
+                          size_node_slice, size_prev_slice);
+      break;
+    default: utils::Error("unknown data_type");
+  }
+}
+
+// Serializable adapter that loads a length-prefixed blob into a std::string
+struct ReadWrapper : public Serializable {
+  std::string *p_str;
+  explicit ReadWrapper(std::string *p_str)
+      : p_str(p_str) {}
+  // read a uint64_t byte count, then that many bytes into *p_str
+  virtual void Load(Stream *fi) {
+    uint64_t nbytes;
+    utils::Assert(fi->Read(&nbytes, sizeof(nbytes)) != 0,
+                  "Read pickle string");
+    p_str->resize(nbytes);
+    if (nbytes == 0) return;
+    utils::Assert(fi->Read(&(*p_str)[0], sizeof(char) * nbytes) != 0,
+                  "Read pickle string");
+  }
+  virtual void Save(Stream *fo) const {
+    utils::Error("not implemented");
+  }
+};
+
+struct WriteWrapper : public Serializable {
+  const char *data;
+  size_t length;
+  explicit WriteWrapper(const char *data, size_t length)
+      : data(data), length(length) {}
+  virtual void Load(Stream *fi) {
+    utils::Error("not implemented");
+  }
+  virtual void Save(Stream *fo) const {
+    // write the length as a full 64-bit prefix followed by the raw bytes; a
+    // narrower cast would make the prefix disagree with the bytes written
+    uint64_t sz = static_cast<uint64_t>(length);
+    fo->Write(&sz, sizeof(sz));
+    fo->Write(data, length * sizeof(char));
+  }
+};
+}  // namespace c_api
+}  // namespace rabit
+
+bool RabitInit(int argc, char *argv[]) {
+  return rabit::Init(argc, argv);
+}
+
+bool RabitFinalize() {
+  return rabit::Finalize();
+}
+
+int RabitGetRingPrevRank() {
+  return rabit::GetRingPrevRank();
+}
+
+int RabitGetRank() {
+  return rabit::GetRank();
+}
+
+int RabitGetWorldSize() {
+  return rabit::GetWorldSize();
+}
+
+int RabitIsDistributed() {
+  return rabit::IsDistributed();
+}
+
+void RabitTrackerPrint(const char *msg) {
+  // wrap the C string in a std::string before passing it on
+  rabit::TrackerPrint(std::string(msg));
+}
+
+void RabitGetProcessorName(char *out_name,
+                           rbt_ulong *out_len,
+                           rbt_ulong max_len) {
+  std::string s = rabit::GetProcessorName();
+  // out_name has room for max_len bytes including the '\0' strcpy appends
+  if (max_len == 0) { *out_len = 0; return; }
+  if (s.length() >= max_len) s.resize(max_len - 1);
+  strcpy(out_name, s.c_str()); // NOLINT(*)
+  *out_len = static_cast<rbt_ulong>(s.length());
+}
+
+void RabitBroadcast(void *sendrecv_data,
+                    rbt_ulong size, int root) {
+  rabit::Broadcast(sendrecv_data, size, root);
+}
+
+void RabitAllgather(void *sendrecvbuf_,
+                    size_t total_size,
+                    size_t beginIndex,
+                    size_t size_node_slice,
+                    size_t size_prev_slice,
+                    int enum_dtype) {
+  // the integer tag is routed through the mpi::DataType enum before it is
+  // handed to the c_api helper; all other arguments are forwarded as-is
+  const rabit::engine::mpi::DataType dtype =
+      static_cast<rabit::engine::mpi::DataType>(enum_dtype);
+  rabit::c_api::Allgather(sendrecvbuf_, total_size, beginIndex,
+                          size_node_slice, size_prev_slice, dtype);
+}
+
+
+void RabitAllreduce(void *sendrecvbuf,
+                    size_t count,
+                    int enum_dtype,
+                    int enum_op,
+                    void (*prepare_fun)(void *arg),
+                    void *prepare_arg) {
+  using namespace rabit::engine::mpi; // NOLINT(*)
+  const DataType dtype = static_cast<DataType>(enum_dtype);
+  const OpType op = static_cast<OpType>(enum_op);
+  rabit::c_api::Allreduce(sendrecvbuf, count, dtype, op,
+                          prepare_fun, prepare_arg);
+}
+
+// Loads the latest checkpoint and exposes it through raw pointer/length
+// pairs; returns the value reported by rabit::LoadCheckPoint.
+int RabitLoadCheckPoint(char **out_global_model,
+                        rbt_ulong *out_global_len,
+                        char **out_local_model,
+                        rbt_ulong *out_local_len) {
+  // NOTE: this function is not thread-safe, the returned pointers refer to
+  // function-local static buffers that are overwritten by the next call
+  using rabit::BeginPtr;
+  using namespace rabit::c_api; // NOLINT(*)
+  static std::string global_buffer;
+  static std::string local_buffer;
+
+  ReadWrapper sg(&global_buffer);
+  ReadWrapper sl(&local_buffer);
+  // the local model is only loaded when the caller supplies an output slot
+  const bool want_local = (out_local_model != NULL);
+  const int version = rabit::LoadCheckPoint(&sg, want_local ? &sl : NULL);
+
+  *out_global_model = BeginPtr(global_buffer);
+  *out_global_len = static_cast<rbt_ulong>(global_buffer.length());
+  if (want_local) {
+    *out_local_model = BeginPtr(local_buffer);
+    *out_local_len = static_cast<rbt_ulong>(local_buffer.length());
+  }
+  return version;
+}
+
+// Checkpoints the given byte ranges: both models are wrapped in
+// WriteWrapper objects and passed to rabit::CheckPoint.
+void RabitCheckPoint(const char *global_model,
+                     rbt_ulong global_len,
+                     const char *local_model,
+                     rbt_ulong local_len) {
+  using namespace rabit::c_api; // NOLINT(*)
+  WriteWrapper sg(global_model, global_len);
+  WriteWrapper sl(local_model, local_len);
+  // a NULL local_model means that only the global model is checkpointed
+  const bool has_local = (local_model != NULL);
+  rabit::CheckPoint(&sg, has_local ? &sl : NULL);
+}
+
+int RabitVersionNumber() {
+  return rabit::VersionNumber();
+}
+
+int RabitLinkTag() {
+  return 0;
+}
diff --git a/src/engine.cc b/src/engine.cc
new file mode 100644
index 0000000..4701d2f
--- /dev/null
+++ b/src/engine.cc
@@ -0,0 +1,145 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file engine.cc
+ * \brief this file governs which implementation of engine we are actually using
+ *  provides an singleton of engine interface
+ *
+ * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
+ */
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
+
+#include <memory>
+#include "rabit/internal/engine.h"
+#include "allreduce_base.h"
+#include "allreduce_robust.h"
+#include "rabit/internal/thread_local.h"
+
+namespace rabit {
+namespace engine {
+// singleton sync manager
+#ifndef RABIT_USE_BASE
+#ifndef RABIT_USE_MOCK
+typedef AllreduceRobust Manager;
+#else
+typedef AllreduceMock Manager;
+#endif  // RABIT_USE_MOCK
+#else
+typedef AllreduceBase Manager;
+#endif  // RABIT_USE_BASE
+
+/*! \brief entry to easily hold returning information */
+struct ThreadLocalEntry {
+  /*! \brief stores the current engine */
+  std::unique_ptr<Manager> engine;
+  /*! \brief whether init has been called */
+  bool initialized;
+  /*! \brief constructor */
+  ThreadLocalEntry() : initialized(false) {}
+};
+
+// define the threadlocal store.
+typedef ThreadLocalStore<ThreadLocalEntry> EngineThreadLocal;
+
+/*!
+ * \brief initialize the synchronization module
+ * \return true if an engine already exists, otherwise the result of its Init
+ */
+bool Init(int argc, char *argv[]) {
+  ThreadLocalEntry* entry = EngineThreadLocal::Get();
+  if (entry->engine.get() != nullptr) return true;
+  entry->initialized = true;
+  entry->engine.reset(new Manager());
+  return entry->engine->Init(argc, argv);
+}
+
+/*!
+ * \brief finalize synchronization module
+ * \return true when no engine exists or Shutdown succeeded, false when
+ *  Shutdown failed (the engine is then kept)
+ */
+bool Finalize() {
+  ThreadLocalEntry* entry = EngineThreadLocal::Get();
+  // no engine was created, nothing to shut down
+  if (entry->engine.get() == nullptr) return true;
+  // leave the entry untouched when Shutdown reports failure
+  if (!entry->engine->Shutdown()) return false;
+  entry->engine.reset(nullptr);
+  entry->initialized = false;
+  return true;
+}
+
+/*! \brief singleton method to get engine */
+IEngine *GetEngine() {
+  // un-initialized default manager.
+  static AllreduceBase default_manager;
+  ThreadLocalEntry* entry = EngineThreadLocal::Get();
+  IEngine* engine = entry->engine.get();
+  if (engine != nullptr) return engine;
+  // no engine is stored in this entry: fall back to the shared default
+  // manager; the check fails when the entry is nevertheless flagged as
+  // initialized
+  utils::Check(!entry->initialized, "the rabit has not been initialized");
+  return &default_manager;
+}
+
+// perform in-place allgather, on sendrecvbuf
+void Allgather(void *sendrecvbuf_, size_t total_size,
+                   size_t slice_begin,
+                   size_t slice_end,
+                   size_t size_prev_slice,
+                   const char* _file,
+                   const int _line,
+                   const char* _caller) {
+  GetEngine()->Allgather(sendrecvbuf_, total_size, slice_begin,
+    slice_end, size_prev_slice, _file, _line, _caller);
+}
+
+
+// perform in-place allreduce on sendrecvbuf; dtype and op are not used here
+void Allreduce_(void *sendrecvbuf,
+                size_t type_nbytes,
+                size_t count,
+                IEngine::ReduceFunction red,
+                mpi::DataType dtype,
+                mpi::OpType op,
+                IEngine::PreprocFunction prepare_fun,
+                void *prepare_arg,
+                const char* _file,
+                const int _line,
+                const char* _caller) {
+  GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count, red, prepare_fun,
+    prepare_arg, _file, _line, _caller);
+}
+
+// code for reduce handle
+ReduceHandle::ReduceHandle(void)
+  : handle_(NULL), redfunc_(NULL), htype_(NULL) {
+}
+
+ReduceHandle::~ReduceHandle(void) {}
+
+int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
+  return static_cast<int>(dtype.type_size);
+}
+
+void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {
+  utils::Assert(redfunc_ == NULL, "cannot initialize reduce handle twice");
+  redfunc_ = redfunc;
+}
+
+void ReduceHandle::Allreduce(void *sendrecvbuf,
+                             size_t type_nbytes, size_t count,
+                             IEngine::PreprocFunction prepare_fun,
+                             void *prepare_arg,
+                             const char* _file,
+                             const int _line,
+                             const char* _caller) {
+  utils::Assert(redfunc_ != NULL, "must intialize handle to call AllReduce");
+  GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count,
+                         redfunc_, prepare_fun, prepare_arg,
+                         _file, _line, _caller);
+}
+}  // namespace engine
+}  // namespace rabit
diff --git a/src/engine_base.cc b/src/engine_base.cc
new file mode 100644
index 0000000..39da566
--- /dev/null
+++ b/src/engine_base.cc
@@ -0,0 +1,15 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file engine_base.cc
+ * \brief this is an engine implementation that compiles engine.cc with
+ * RABIT_USE_BASE defined, so that AllreduceBase is used as the Manager
+ * \author Tianqi Chen
+ */
+// define RABIT_USE_BASE, so we will use the base Manager
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
+// switch engine to AllreduceBase
+#define RABIT_USE_BASE
+#include "engine.cc"
+
diff --git a/src/engine_empty.cc b/src/engine_empty.cc
new file mode 100644
index 0000000..5cecc03
--- /dev/null
+++ b/src/engine_empty.cc
@@ -0,0 +1,149 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file engine_empty.cc
+ * \brief this file provides a dummy implementation of engine that does nothing
+ *  this file provides a way to fall back to single node program without causing too many dependencies
+ *  This is usually NOT needed, use engine_mpi or engine for real distributed version
+ * \author Tianqi Chen
+ */
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
+
+#include "rabit/internal/engine.h"
+
+namespace rabit {
+
+namespace utils {
+  bool STOP_PROCESS_ON_ERROR = true;
+}
+
+namespace engine {
+/*! \brief EmptyEngine */
+class EmptyEngine : public IEngine {
+ public:
+  EmptyEngine(void) {
+    version_number = 0;
+  }
+  virtual void Allgather(void *sendrecvbuf_,
+                         size_t total_size,
+                         size_t slice_begin,
+                         size_t slice_end,
+                         size_t size_prev_slice,
+                         const char* _file,
+                         const int _line,
+                         const char* _caller) {
+    utils::Error("EmptyEngine:: Allgather is not supported");
+  }
+  virtual int GetRingPrevRank(void) const {
+    utils::Error("EmptyEngine:: GetRingPrevRank is not supported");
+    return -1;
+  }
+  virtual void Allreduce(void *sendrecvbuf_,
+                         size_t type_nbytes,
+                         size_t count,
+                         ReduceFunction reducer,
+                         PreprocFunction prepare_fun,
+                         void *prepare_arg,
+                         const char* _file,
+                         const int _line,
+                         const char* _caller) {
+    utils::Error("EmptyEngine:: Allreduce is not supported,"\
+                 "use Allreduce_ instead");
+  }
+  virtual void Broadcast(void *sendrecvbuf_, size_t size, int root,
+                          const char* _file, const int _line, const char* _caller) {
+  }
+  virtual void InitAfterException(void) {
+    utils::Error("EmptyEngine is not fault tolerant");
+  }
+  virtual int LoadCheckPoint(Serializable *global_model,
+                             Serializable *local_model = NULL) {
+    return 0;
+  }
+  virtual void CheckPoint(const Serializable *global_model,
+                          const Serializable *local_model = NULL) {
+    version_number += 1;
+  }
+  virtual void LazyCheckPoint(const Serializable *global_model) {
+    version_number += 1;
+  }
+  virtual int VersionNumber(void) const {
+    return version_number;
+  }
+  /*! \brief get rank of current node */
+  virtual int GetRank(void) const {
+    return 0;
+  }
+  /*! \brief get total number of nodes */
+  virtual int GetWorldSize(void) const {
+    return 1;
+  }
+  /*! \brief whether it is distributed */
+  virtual bool IsDistributed(void) const {
+    return false;
+  }
+  /*! \brief get the host name of current node */
+  virtual std::string GetHost(void) const {
+    return std::string("");
+  }
+  virtual void TrackerPrint(const std::string &msg) {
+    // simply print information into the tracker
+    utils::Printf("%s", msg.c_str());
+  }
+
+ private:
+  int version_number;
+};
+
+// singleton sync manager
+EmptyEngine manager;
+
+/*! \brief initialize the synchronization module */
+bool Init(int argc, char *argv[]) {
+  return true;
+}
+/*! \brief finalize synchronization module */
+bool Finalize(void) {
+  return true;
+}
+
+/*! \brief singleton method to get engine */
+IEngine *GetEngine(void) {
+  return &manager;
+}
+// perform in-place allreduce, on sendrecvbuf
+void Allreduce_(void *sendrecvbuf,
+                size_t type_nbytes,
+                size_t count,
+                IEngine::ReduceFunction red,
+                mpi::DataType dtype,
+                mpi::OpType op,
+                IEngine::PreprocFunction prepare_fun,
+                void *prepare_arg,
+                const char* _file,
+                const int _line,
+                const char* _caller) {
+  if (prepare_fun != NULL) prepare_fun(prepare_arg);
+}
+
+// code for reduce handle; every pointer member starts out as NULL
+ReduceHandle::ReduceHandle(void)
+    : handle_(NULL), redfunc_(NULL), htype_(NULL) {}
+ReduceHandle::~ReduceHandle(void) {}
+
+int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
+  return 0;
+}
+void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {}
+void ReduceHandle::Allreduce(void *sendrecvbuf,
+                             size_t type_nbytes, size_t count,
+                             IEngine::PreprocFunction prepare_fun,
+                             void *prepare_arg,
+                             const char* _file,
+                             const int _line,
+                             const char* _caller) {
+  if (prepare_fun != NULL) prepare_fun(prepare_arg);
+}
+}  // namespace engine
+}  // namespace rabit
diff --git a/src/engine_mock.cc b/src/engine_mock.cc
new file mode 100644
index 0000000..aab012d
--- /dev/null
+++ b/src/engine_mock.cc
@@ -0,0 +1,16 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file engine_mock.cc
+ * \brief this is an engine implementation that will 
+ * insert failures in certain call point, to test if the engine is robust to failure
+ * \author Tianqi Chen
+ */
+// define use MOCK, so we will use mock Manager
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
+// switch engine to AllreduceMock
+#define RABIT_USE_MOCK
+#include "allreduce_mock.h"
+#include "engine.cc"
+
diff --git a/src/engine_mpi.cc b/src/engine_mpi.cc
new file mode 100644
index 0000000..f617708
--- /dev/null
+++ b/src/engine_mpi.cc
@@ -0,0 +1,252 @@
+/*!
+ *  Copyright (c) 2014 by Contributors
+ * \file engine_mpi.cc
+ * \brief this file gives an implementation of engine interface using MPI,
+ *   this will allow rabit program to run with MPI, but do not comes with fault tolerant
+ *
+ * \author Tianqi Chen
+ */
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
+#include <mpi.h>
+#include <cstdio>
+#include "rabit/internal/engine.h"
+#include "rabit/internal/utils.h"
+
+namespace rabit {
+
+namespace utils {
+    bool STOP_PROCESS_ON_ERROR = true;
+}
+
+namespace engine {
+/*! \brief implementation of engine using MPI */
+class MPIEngine : public IEngine {
+ public:
+  MPIEngine(void) : version_number(0) {}
+  virtual void Allgather(void *sendrecvbuf_,
+                         size_t total_size,
+                         size_t slice_begin,
+                         size_t slice_end,
+                         size_t size_prev_slice,
+                         const char* _file,
+                         const int _line,
+                         const char* _caller) {
+    utils::Error("MPIEngine:: Allgather is not supported");
+  }
+  virtual void Allreduce(void *sendrecvbuf_,
+                         size_t type_nbytes,
+                         size_t count,
+                         ReduceFunction reducer,
+                         PreprocFunction prepare_fun,
+                         void *prepare_arg,
+                         const char* _file,
+                         const int _line,
+                         const char* _caller) {
+    utils::Error("MPIEngine:: Allreduce is not supported,"\
+                 "use Allreduce_ instead");
+  }
+  virtual int GetRingPrevRank(void) const {
+    utils::Error("MPIEngine:: GetRingPrevRank is not supported");
+    // every path of a non-void function must return a value
+    return -1;
+  }
+  virtual void Broadcast(void *sendrecvbuf_, size_t size, int root,
+    const char* _file, const int _line,
+    const char* _caller) {
+    MPI::COMM_WORLD.Bcast(sendrecvbuf_, size, MPI::CHAR, root);
+  }
+  virtual void InitAfterException(void) {
+    utils::Error("MPI is not fault tolerant");
+  }
+  virtual int LoadCheckPoint(Serializable *global_model,
+                             Serializable *local_model = NULL) {
+    return 0;
+  }
+  virtual void CheckPoint(const Serializable *global_model,
+                          const Serializable *local_model = NULL) {
+    version_number += 1;
+  }
+  virtual void LazyCheckPoint(const Serializable *global_model) {
+    version_number += 1;
+  }
+  virtual int VersionNumber(void) const {
+    return version_number;
+  }
+  /*! \brief get rank of current node */
+  virtual int GetRank(void) const {
+    return MPI::COMM_WORLD.Get_rank();
+  }
+  /*! \brief get total number of processes in MPI::COMM_WORLD */
+  virtual int GetWorldSize(void) const {
+    return MPI::COMM_WORLD.Get_size();
+  }
+  /*! \brief whether it is distributed */
+  virtual bool IsDistributed(void) const {
+    return true;
+  }
+  /*! \brief get the host name of current node */
+  virtual std::string GetHost(void) const {
+    int len;
+    char name[MPI_MAX_PROCESSOR_NAME];
+    MPI::Get_processor_name(name, len);
+    name[len] = '\0';
+    return std::string(name);
+  }
+  virtual void TrackerPrint(const std::string &msg) {
+    // simply print information into the tracker
+    if (GetRank() == 0) {
+      utils::Printf("%s", msg.c_str());
+    }
+  }
+
+ private:
+  int version_number;
+};
+
+// singleton sync manager
+MPIEngine manager;
+
+/*! \brief initialize the synchronization module */
+bool Init(int argc, char *argv[]) {
+  try {
+    MPI::Init(argc, argv);
+    return true;
+  } catch (const std::exception& e) {
+    fprintf(stderr, " failed in MPI Init %s\n", e.what());
+    return false;
+  }
+}
+/*! \brief finalize synchronization module */
+bool Finalize(void) {
+  try {
+    MPI::Finalize();
+    return true;
+  } catch (const std::exception& e) {
+    fprintf(stderr, "failed in MPI shutdown %s\n", e.what());
+    return false;
+  }
+}
+
+/*! \brief singleton method to get engine */
+IEngine *GetEngine(void) {
+  return &manager;
+}
+// transform enum to MPI data type
+inline MPI::Datatype GetType(mpi::DataType dtype) {
+  using namespace mpi;
+  switch (dtype) {
+    case kChar: return MPI::CHAR;
+    case kUChar: return MPI::BYTE;
+    case kInt: return MPI::INT;
+    case kUInt: return MPI::UNSIGNED;
+    case kLong: return MPI::LONG;
+    case kULong: return MPI::UNSIGNED_LONG;
+    case kFloat: return MPI::FLOAT;
+    case kDouble: return MPI::DOUBLE;
+    case kLongLong: return MPI::LONG_LONG;
+    case kULongLong: return MPI::UNSIGNED_LONG_LONG;
+  }
+  utils::Error("unknown mpi::DataType");
+  return MPI::CHAR;
+}
+// transform enum to MPI OP
+inline MPI::Op GetOp(mpi::OpType otype) {
+  using namespace mpi;
+  switch (otype) {
+    case kMax: return MPI::MAX;
+    case kMin: return MPI::MIN;
+    case kSum: return MPI::SUM;
+    case kBitwiseOR: return MPI::BOR;
+  }
+  utils::Error("unknown mpi::OpType");
+  return MPI::MAX;
+}
+// perform in-place allreduce, on sendrecvbuf
+void Allreduce_(void *sendrecvbuf,
+                size_t type_nbytes,
+                size_t count,
+                IEngine::ReduceFunction red,
+                mpi::DataType dtype,
+                mpi::OpType op,
+                IEngine::PreprocFunction prepare_fun,
+                void *prepare_arg,
+                const char* _file,
+                const int _line,
+                const char* _caller) {
+  if (prepare_fun != NULL) prepare_fun(prepare_arg);
+  MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf,
+                            count, GetType(dtype), GetOp(op));
+}
+
+// code for reduce handle
+ReduceHandle::ReduceHandle(void)
+    : handle_(NULL), redfunc_(NULL), htype_(NULL) {
+}
+ReduceHandle::~ReduceHandle(void) {
+  if (handle_ != NULL) {
+    MPI::Op *op = reinterpret_cast<MPI::Op*>(handle_);
+    op->Free();
+    delete op;
+  }
+  if (htype_ != NULL) {
+    MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype_);
+    dtype->Free();
+    delete dtype;
+  }
+}
+int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
+  return dtype.Get_size();
+}
+void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {
+  utils::Assert(handle_ == NULL, "cannot initialize reduce handle twice");
+  if (type_nbytes != 0) {
+    MPI::Datatype *dtype = new MPI::Datatype();
+    if (type_nbytes % 8 == 0) {
+      *dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long));  // NOLINT(*)
+    } else if (type_nbytes % 4 == 0) {
+      *dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int));
+    } else {
+      *dtype = MPI::CHAR.Create_contiguous(type_nbytes);
+    }
+    dtype->Commit();
+    created_type_nbytes_ = type_nbytes;
+    htype_ = dtype;
+  }
+  MPI::Op *op = new MPI::Op();
+  MPI::User_function *pf = redfunc;
+  op->Init(pf, true);
+  handle_ = op;
+}
+void ReduceHandle::Allreduce(void *sendrecvbuf, size_t type_nbytes,
+                             size_t count, IEngine::PreprocFunction prepare_fun,
+                             void *prepare_arg,
+                             const char* _file, const int _line,
+                             const char* _caller) {
+  utils::Assert(handle_ != NULL, "must intialize handle to call AllReduce");
+  MPI::Op *op = reinterpret_cast<MPI::Op*>(handle_);
+  MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype_);
+  // (re)build the MPI datatype when none exists yet or the size changed
+  if (created_type_nbytes_ != type_nbytes || dtype == NULL) {
+    if (dtype == NULL) {
+      dtype = new MPI::Datatype();
+      htype_ = dtype;  // store it so it is reused and freed by the destructor
+    } else {
+      dtype->Free();
+    }
+    if (type_nbytes % 8 == 0) {
+      *dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long));  // NOLINT(*)
+    } else if (type_nbytes % 4 == 0) {
+      *dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int));
+    } else {
+      *dtype = MPI::CHAR.Create_contiguous(type_nbytes);
+    }
+    dtype->Commit();
+    created_type_nbytes_ = type_nbytes;
+  }
+  if (prepare_fun != NULL) prepare_fun(prepare_arg);
+  MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, *dtype, *op);
+}
+}  // namespace engine
+}  // namespace rabit
diff --git a/test/.gitignore b/test/.gitignore
new file mode 100644
index 0000000..8e1ff37
--- /dev/null
+++ b/test/.gitignore
@@ -0,0 +1,3 @@
+*.mpi
+*_test
+*_recover
diff --git a/test/Makefile b/test/Makefile
new file mode 100644
index 0000000..1a8e207
--- /dev/null
+++ b/test/Makefile
@@ -0,0 +1,77 @@
+RABIT_BUILD_DMLC = 0
+
+ifeq ($(RABIT_BUILD_DMLC),1)
+    DMLC=../dmlc-core
+else
+    DMLC=../../dmlc-core
+endif
+
+MPICXX=../mpich/bin/mpicxx
+export LDFLAGS=  -L../lib -pthread -lm
+export CFLAGS = -Wall -O3 -Wno-unknown-pragmas
+
+export CC = gcc
+export CXX = g++
+
+
+#----------------------------
+# Settings for power and arm arch
+#----------------------------
+ARCH := $(shell uname -a)
+ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+	CFLAGS += -march=native
+else
+	CFLAGS += -msse2
+endif
+
+ifndef WITH_FPIC
+	WITH_FPIC = 1
+endif
+ifeq ($(WITH_FPIC), 1)
+	CFLAGS += -fPIC
+endif
+
+CFLAGS += -I../include -I $(DMLC)/include -std=c++11
+
+# test programs and the object files they are built from
+BIN = speed_test model_recover local_recover lazy_recover
+OBJ = $(RABIT_OBJ) speed_test.o model_recover.o local_recover.o lazy_recover.o
+MPIBIN = speed_test.mpi
+.PHONY: clean all lib mpi
+
+.PHONY: lib all
+
+all: $(BIN)
+
+lib:
+	cd ..;make clean;make;cd -
+
+.PHONY: mpi
+mpi:
+	cd ..;make mpi;cd -
+
+# programs
+speed_test.o: speed_test.cc ../include/rabit/*.h lib mpi
+model_recover.o: model_recover.cc ../include/rabit/*.h lib
+local_recover.o: local_recover.cc ../include/rabit/*.h lib
+lazy_recover.o: lazy_recover.cc ../include/rabit/*.h lib
+
+# we can link against MPI version to get use MPI
+speed_test: speed_test.o  $(RABIT_OBJ)
+speed_test.mpi: speed_test.o $(MPIOBJ)
+model_recover: model_recover.o  $(RABIT_OBJ)
+local_recover: local_recover.o  $(RABIT_OBJ)
+lazy_recover: lazy_recover.o  $(RABIT_OBJ)
+
+$(BIN) :
+	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) ../lib/librabit_mock.a $(LDFLAGS)
+
+$(OBJ) :
+	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
+
+$(MPIBIN) :
+	$(MPICXX) $(CFLAGS) -I../mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc, $^) \
+    ../lib/librabit_mpi.so $(LDFLAGS) -L../mpich/lib -Wl,-rpath,../mpich/lib -lmpi
+
+clean:
+	$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) *~ ../src/*~
diff --git a/test/README.md b/test/README.md
new file mode 100644
index 0000000..fb68112
--- /dev/null
+++ b/test/README.md
@@ -0,0 +1,18 @@
+Testcases of Rabit
+====
+This folder contains internal test cases that check the correctness and efficiency of the rabit API.
+
+Example commands for running the test cases are given in test.mk
+* type ```make -f test.mk testcasename``` to run a particular test case
+
+
+Helper Scripts
+====
+* test.mk: a Makefile that documents how to run each test case
+* keepalive.sh: a helper bash script that restarts a program when it dies abnormally
+
+List of Programs
+====
+* speed_test: test the running speed of rabit API
+* test_local_recover: test recovery of local state when error happens
+* test_model_recover: test recovery of global state when error happens
diff --git a/test/cpp/CMakeLists.txt b/test/cpp/CMakeLists.txt
new file mode 100644
index 0000000..979c059
--- /dev/null
+++ b/test/cpp/CMakeLists.txt
@@ -0,0 +1,34 @@
+find_package(GTest REQUIRED)
+
+# All unit tests are built into a single gtest binary.
+add_executable(
+  unit_tests
+  test_io.cc
+  allreduce_robust_test.cc
+  allreduce_base_test.cc
+  allreduce_mock_test.cc
+  test_main.cpp)
+
+# test_main.cpp defines main(), so GTest::Main (gtest_main) is not linked.
+target_link_libraries(
+  unit_tests PRIVATE
+  GTest::GTest
+  rabit_base rabit_mock rabit)
+
+# Nothing links against an executable, so its include paths are PRIVATE.
+target_include_directories(unit_tests PRIVATE
+  "${rabit_SOURCE_DIR}/include"
+  "${DMLC_ROOT}/include")
+
+set_target_properties(unit_tests
+  PROPERTIES
+  CXX_STANDARD 11
+  CXX_STANDARD_REQUIRED ON
+  RUNTIME_OUTPUT_DIRECTORY "${rabit_BINARY_DIR}"
+  RUNTIME_OUTPUT_DIRECTORY_DEBUG "${rabit_BINARY_DIR}"
+  RUNTIME_OUTPUT_DIRECTORY_RELEASE "${rabit_BINARY_DIR}")
+
+add_test(
+  NAME TestRabitLib
+  COMMAND unit_tests
+  WORKING_DIRECTORY "${rabit_BINARY_DIR}")
diff --git a/test/cpp/README.md b/test/cpp/README.md
new file mode 100644
index 0000000..9962980
--- /dev/null
+++ b/test/cpp/README.md
@@ -0,0 +1 @@
+Unittests for Rabit
diff --git a/test/cpp/allreduce_base_test.cc b/test/cpp/allreduce_base_test.cc
new file mode 100644
index 0000000..65a3dd5
--- /dev/null
+++ b/test/cpp/allreduce_base_test.cc
@@ -0,0 +1,66 @@
+#define RABIT_CXXTESTDEFS_H
+#include <gtest/gtest.h>
+
+#include <string>
+#include <iostream>
+#include "../../src/allreduce_base.h"
+
+// Init() must pick up the task id from a "rabit_task_id=<id>" argument.
+TEST(allreduce_base, init_task)
+{
+  rabit::engine::AllreduceBase base;
+
+  // Init() takes mutable char* arguments. A C++11 std::string is contiguous
+  // and null-terminated, so &s[0] can be passed directly; no copy into a
+  // variable-length array (a non-standard compiler extension) is needed.
+  std::string task_id_arg = "rabit_task_id=1";
+  char* argv[] = {&task_id_arg[0]};
+  base.Init(1, argv);
+  EXPECT_EQ(base.task_id, "1");
+}
+
+// Init() must parse the bootstrap-cache and debug switches together with the
+// task id.
+TEST(allreduce_base, init_with_cache_on)
+{
+  rabit::engine::AllreduceBase base;
+
+  // Init() takes mutable char* arguments. A C++11 std::string is contiguous
+  // and null-terminated, so &s[0] can be passed directly; no copy into a
+  // variable-length array (a non-standard compiler extension) is needed.
+  std::string task_id_arg = "rabit_task_id=1";
+  std::string bootstrap_cache_arg = "rabit_bootstrap_cache=1";
+  std::string debug_arg = "rabit_debug=1";
+
+  char* argv[] = {
+    &task_id_arg[0],
+    &bootstrap_cache_arg[0],
+    &debug_arg[0],
+  };
+  base.Init(3, argv);
+
+  // Every key=value pair should be reflected in the matching member.
+  EXPECT_EQ(base.task_id, "1");
+  EXPECT_EQ(base.rabit_bootstrap_cache, 1);
+  EXPECT_EQ(base.rabit_debug, 1);
+}
+
+// Init() must parse the "rabit_reduce_ring_mincount" setting together with the
+// task id.
+TEST(allreduce_base, init_with_ring_reduce)
+{
+  rabit::engine::AllreduceBase base;
+
+  // Init() takes mutable char* arguments. A C++11 std::string is contiguous
+  // and null-terminated, so &s[0] can be passed directly; no copy into a
+  // variable-length array (a non-standard compiler extension) is needed.
+  std::string task_id_arg = "rabit_task_id=1";
+  std::string ring_mincount_arg = "rabit_reduce_ring_mincount=1";
+
+  char* argv[] = {&task_id_arg[0], &ring_mincount_arg[0]};
+  base.Init(2, argv);
+
+  // Both settings should be reflected in the matching members.
+  EXPECT_EQ(base.task_id, "1");
+  EXPECT_EQ(base.reduce_ring_mincount, 1);
+}
diff --git a/test/cpp/allreduce_base_test.cpp b/test/cpp/allreduce_base_test.cpp
new file mode 100644
index 0000000..65a3dd5
--- /dev/null
+++ b/test/cpp/allreduce_base_test.cpp
@@ -0,0 +1,66 @@
+#define RABIT_CXXTESTDEFS_H
+#include <gtest/gtest.h>
+
+#include <string>
+#include <iostream>
+#include "../../src/allreduce_base.h"
+
+// Init() must pick up the task id from a "rabit_task_id=<id>" argument.
+TEST(allreduce_base, init_task)
+{
+  rabit::engine::AllreduceBase base;
+
+  // Init() takes mutable char* arguments. A C++11 std::string is contiguous
+  // and null-terminated, so &s[0] can be passed directly; no copy into a
+  // variable-length array (a non-standard compiler extension) is needed.
+  std::string task_id_arg = "rabit_task_id=1";
+  char* argv[] = {&task_id_arg[0]};
+  base.Init(1, argv);
+  EXPECT_EQ(base.task_id, "1");
+}
+
+// Init() must parse the bootstrap-cache and debug switches together with the
+// task id.
+TEST(allreduce_base, init_with_cache_on)
+{
+  rabit::engine::AllreduceBase base;
+
+  // Init() takes mutable char* arguments. A C++11 std::string is contiguous
+  // and null-terminated, so &s[0] can be passed directly; no copy into a
+  // variable-length array (a non-standard compiler extension) is needed.
+  std::string task_id_arg = "rabit_task_id=1";
+  std::string bootstrap_cache_arg = "rabit_bootstrap_cache=1";
+  std::string debug_arg = "rabit_debug=1";
+
+  char* argv[] = {
+    &task_id_arg[0],
+    &bootstrap_cache_arg[0],
+    &debug_arg[0],
+  };
+  base.Init(3, argv);
+
+  // Every key=value pair should be reflected in the matching member.
+  EXPECT_EQ(base.task_id, "1");
+  EXPECT_EQ(base.rabit_bootstrap_cache, 1);
+  EXPECT_EQ(base.rabit_debug, 1);
+}
+
+// Init() must parse the "rabit_reduce_ring_mincount" setting together with the
+// task id.
+TEST(allreduce_base, init_with_ring_reduce)
+{
+  rabit::engine::AllreduceBase base;
+
+  // Init() takes mutable char* arguments. A C++11 std::string is contiguous
+  // and null-terminated, so &s[0] can be passed directly; no copy into a
+  // variable-length array (a non-standard compiler extension) is needed.
+  std::string task_id_arg = "rabit_task_id=1";
+  std::string ring_mincount_arg = "rabit_reduce_ring_mincount=1";
+
+  char* argv[] = {&task_id_arg[0], &ring_mincount_arg[0]};
+  base.Init(2, argv);
+
+  // Both settings should be reflected in the matching members.
+  EXPECT_EQ(base.task_id, "1");
+  EXPECT_EQ(base.reduce_ring_mincount, 1);
+}
diff --git a/test/cpp/allreduce_mock_test.cc b/test/cpp/allreduce_mock_test.cc
new file mode 100644
index 0000000..e659d8e
--- /dev/null
+++ b/test/cpp/allreduce_mock_test.cc
@@ -0,0 +1,36 @@
+#define RABIT_CXXTESTDEFS_H
+#include <gtest/gtest.h>
+
+#include <string>
+#include <iostream>
+#include "../../src/allreduce_mock.h"
+
+// Rank 0 configured with "mock=0,0,0,0" must exit with code 255 in Allreduce().
+TEST(allreduce_mock, mock_allreduce)
+{
+  rabit::engine::AllreduceMock m;
+
+  // A C++11 std::string is contiguous and null-terminated, so &s[0] gives
+  // Init() its mutable char* without a non-standard variable-length array.
+  std::string mock_arg = "mock=0,0,0,0";
+  char* argv[] = {&mock_arg[0]};
+  m.Init(1, argv);
+  m.rank = 0;
+  EXPECT_EXIT(m.Allreduce(nullptr, 0, 0, nullptr, nullptr, nullptr),
+              ::testing::ExitedWithCode(255), "");
+}
+
+// Rank 0 at version 1, sequence 2 must exit with code 255 in Broadcast().
+TEST(allreduce_mock, mock_broadcast)
+{
+  rabit::engine::AllreduceMock m;
+  // A C++11 std::string is contiguous and null-terminated, so &s[0] gives
+  // Init() its mutable char* without a non-standard variable-length array.
+  std::string mock_arg = "mock=0,1,2,0";
+  char* argv[] = {&mock_arg[0]};
+  m.Init(1, argv);
+  m.rank = 0;
+  m.version_number = 1;
+  m.seq_counter = 2;
+  EXPECT_EXIT(m.Broadcast(nullptr, 0, 0), ::testing::ExitedWithCode(255), "");
+}
diff --git a/test/cpp/allreduce_mock_test.cpp b/test/cpp/allreduce_mock_test.cpp
new file mode 100644
index 0000000..ec3190c
--- /dev/null
+++ b/test/cpp/allreduce_mock_test.cpp
@@ -0,0 +1,51 @@
+#define RABIT_CXXTESTDEFS_H
+#include <gtest/gtest.h>
+
+#include <string>
+#include <iostream>
+#include "../../src/allreduce_mock.h"
+
+// Rank 0 configured with "mock=0,0,0,0" must exit with code 255 in Allreduce().
+TEST(allreduce_mock, mock_allreduce)
+{
+  rabit::engine::AllreduceMock m;
+
+  // A C++11 std::string is contiguous and null-terminated, so &s[0] gives
+  // Init() its mutable char* without a non-standard variable-length array.
+  std::string mock_arg = "mock=0,0,0,0";
+  char* argv[] = {&mock_arg[0]};
+  m.Init(1, argv);
+  m.rank = 0;
+  EXPECT_EXIT(m.Allreduce(nullptr, 0, 0, nullptr, nullptr, nullptr),
+              ::testing::ExitedWithCode(255), "");
+}
+
+// Rank 0 at version 1, sequence 2 must exit with code 255 in Broadcast().
+TEST(allreduce_mock, mock_broadcast)
+{
+  rabit::engine::AllreduceMock m;
+  // A C++11 std::string is contiguous and null-terminated, so &s[0] gives
+  // Init() its mutable char* without a non-standard variable-length array.
+  std::string mock_arg = "mock=0,1,2,0";
+  char* argv[] = {&mock_arg[0]};
+  m.Init(1, argv);
+  m.rank = 0;
+  m.version_number = 1;
+  m.seq_counter = 2;
+  EXPECT_EXIT(m.Broadcast(nullptr, 0, 0), ::testing::ExitedWithCode(255), "");
+}
+
+// Rank 3 at version 13, sequence 22 must exit with code 255 in Allgather().
+TEST(allreduce_mock, mock_gather)
+{
+  rabit::engine::AllreduceMock m;
+  // A C++11 std::string is contiguous and null-terminated, so &s[0] gives
+  // Init() its mutable char* without a non-standard variable-length array.
+  std::string mock_arg = "mock=3,13,22,0";
+  char* argv[] = {&mock_arg[0]};
+  m.Init(1, argv);
+  m.rank = 3;
+  m.version_number = 13;
+  m.seq_counter = 22;
+  EXPECT_EXIT(m.Allgather(nullptr, 0, 0, 0, 0), ::testing::ExitedWithCode(255), "");
+}
diff --git a/test/cpp/allreduce_robust_test.cc b/test/cpp/allreduce_robust_test.cc
new file mode 100644
index 0000000..c86f762
--- /dev/null
+++ b/test/cpp/allreduce_robust_test.cc
@@ -0,0 +1,237 @@
+#define RABIT_CXXTESTDEFS_H
+#include <gtest/gtest.h>
+
+#include <chrono>
+#include <iostream>
+#include <string>
+#include <thread>  // std::this_thread::sleep_for
+#include "../../src/allreduce_robust.h"
+
+// Test doubles that the tests assign to the engine's _error / _assert members.
+// mockerr checks that the format string is the timeout message; mockassert ignores its arguments.
+inline void mockerr(const char *fmt, ...) {EXPECT_STRCASEEQ(fmt, "[%d] exit due to time out %d s\n");}
+inline void mockassert(bool val, const char *fmt, ...) {}
+// Return values handed to CheckAndRecover(): a socket error and a success.
+rabit::engine::AllreduceRobust::ReturnType err_type(rabit::engine::AllreduceRobust::ReturnTypeEnum::kSockError);
+rabit::engine::AllreduceRobust::ReturnType succ_type(rabit::engine::AllreduceRobust::ReturnTypeEnum::kSuccess);
+
+// An error with no later success: after sleeping past the 1 s timeout the
+// timeout task must report false.
+TEST(allreduce_robust, sync_error_timeout)
+{
+  rabit::engine::AllreduceRobust m;
+
+  // A C++11 std::string is contiguous and null-terminated, so &s[0] gives
+  // Init() its mutable char* arguments; no copy into a variable-length array
+  // (a non-standard compiler extension) is needed.
+  std::string timeout_arg = "rabit_timeout=1";
+  std::string timeout_sec_arg = "rabit_timeout_sec=1";
+
+  char* argv[] = {&timeout_arg[0], &timeout_sec_arg[0]};
+  m.Init(2, argv);
+  m.rank = 0;
+  m.rabit_bootstrap_cache = 1;
+  // Route the engine's error/assert hooks to the test doubles.
+  m._error = mockerr;
+  m._assert = mockassert;
+  EXPECT_EQ(m.CheckAndRecover(err_type), false);
+  // Sleep past the 1 s timeout before collecting the task's result.
+  std::this_thread::sleep_for(std::chrono::milliseconds(1500));
+  EXPECT_EQ(m.rabit_timeout_task.get(), false);
+}
+
+// An error followed by a success inside the timeout window: the timeout task
+// must report true.
+TEST(allreduce_robust, sync_error_reset)
+{
+  rabit::engine::AllreduceRobust m;
+
+  // A C++11 std::string is contiguous and null-terminated, so &s[0] gives
+  // Init() its mutable char* arguments; no copy into a variable-length array
+  // (a non-standard compiler extension) is needed.
+  std::string timeout_arg = "rabit_timeout=1";
+  std::string timeout_sec_arg = "rabit_timeout_sec=1";
+  std::string debug_arg = "rabit_debug=1";
+
+  char* argv[] = {
+    &timeout_arg[0],
+    &timeout_sec_arg[0],
+    &debug_arg[0],
+  };
+  m.Init(3, argv);
+  m.rank = 0;
+  m._assert = mockassert;
+
+  EXPECT_EQ(m.CheckAndRecover(err_type), false);
+  // Report success 100 ms later, well before the 1 s timeout configured above.
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  EXPECT_EQ(m.CheckAndRecover(succ_type), true);
+  EXPECT_EQ(m.rabit_timeout_task.get(), true);
+  m.Shutdown();
+}
+
+// A success followed by an error that is never recovered: after sleeping past
+// the 1 s timeout the timeout task must report false.
+TEST(allreduce_robust, sync_success_error_timeout)
+{
+  rabit::engine::AllreduceRobust m;
+
+  // A C++11 std::string is contiguous and null-terminated, so &s[0] gives
+  // Init() its mutable char* arguments; no copy into a variable-length array
+  // (a non-standard compiler extension) is needed.
+  std::string timeout_arg = "rabit_timeout=1";
+  std::string timeout_sec_arg = "rabit_timeout_sec=1";
+  std::string debug_arg = "rabit_debug=1";
+
+  char* argv[] = {
+    &timeout_arg[0],
+    &timeout_sec_arg[0],
+    &debug_arg[0],
+  };
+  m.Init(3, argv);
+  m.rank = 0;
+  m.rabit_bootstrap_cache = 1;
+  // Route the engine's error/assert hooks to the test doubles.
+  m._assert = mockassert;
+  m._error = mockerr;
+  EXPECT_EQ(m.CheckAndRecover(succ_type), true);
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  EXPECT_EQ(m.CheckAndRecover(err_type), false);
+  // Sleep past the 1 s timeout before collecting the task's result.
+  std::this_thread::sleep_for(std::chrono::milliseconds(1500));
+  EXPECT_EQ(m.rabit_timeout_task.get(), false);
+}
+
+// Success, error, success in quick succession: even after the 1 s timeout
+// window has passed, the timeout task must report true.
+TEST(allreduce_robust, sync_success_error_success)
+{
+  rabit::engine::AllreduceRobust m;
+
+  // A C++11 std::string is contiguous and null-terminated, so &s[0] gives
+  // Init() its mutable char* arguments; no copy into a variable-length array
+  // (a non-standard compiler extension) is needed.
+  std::string timeout_arg = "rabit_timeout=1";
+  std::string timeout_sec_arg = "rabit_timeout_sec=1";
+  std::string debug_arg = "rabit_debug=1";
+
+  char* argv[] = {
+    &timeout_arg[0],
+    &timeout_sec_arg[0],
+    &debug_arg[0],
+  };
+  m.Init(3, argv);
+  m.rank = 0;
+  m.rabit_bootstrap_cache = 1;
+  m._assert = mockassert;
+
+  EXPECT_EQ(m.CheckAndRecover(succ_type), true);
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  // The error is followed by a success 10 ms later, inside the timeout window.
+  EXPECT_EQ(m.CheckAndRecover(err_type), false);
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  EXPECT_EQ(m.CheckAndRecover(succ_type), true);
+  // Wait past the 1 s timeout before collecting the task's result.
+  std::this_thread::sleep_for(std::chrono::milliseconds(1100));
+  EXPECT_EQ(m.rabit_timeout_task.get(), true);
+  m.Shutdown();
+}
+
+// Two errors with no success in between: the timeout task must report false
+// less than 2 s after the first error.
+TEST(allreduce_robust, sync_error_no_reset_timeout)
+{
+  rabit::engine::AllreduceRobust m;
+
+  // A C++11 std::string is contiguous and null-terminated, so &s[0] gives
+  // Init() its mutable char* arguments; no copy into a variable-length array
+  // (a non-standard compiler extension) is needed.
+  std::string timeout_arg = "rabit_timeout=1";
+  std::string timeout_sec_arg = "rabit_timeout_sec=1";
+  std::string debug_arg = "rabit_debug=1";
+
+  char* argv[] = {
+    &timeout_arg[0],
+    &timeout_sec_arg[0],
+    &debug_arg[0],
+  };
+  m.Init(3, argv);
+  m.rank = 0;
+  m.rabit_bootstrap_cache = 1;
+  m._assert = mockassert;
+  m._error = mockerr;
+
+  // steady_clock: the measured interval must not depend on wall-clock adjustments
+  auto start = std::chrono::steady_clock::now();
+
+  EXPECT_EQ(m.CheckAndRecover(err_type), false);
+  std::this_thread::sleep_for(std::chrono::milliseconds(1100));
+
+  EXPECT_EQ(m.CheckAndRecover(err_type), false);
+
+  m.rabit_timeout_task.wait();
+  auto end = std::chrono::steady_clock::now();
+  std::chrono::duration<double> diff = end-start;
+
+  EXPECT_EQ(m.rabit_timeout_task.get(), false);
+  // the second error is expected not to overwrite/reset the timeout task
+  EXPECT_LT(diff.count(), 2);
+}
+
+// A success report followed by Shutdown(); apart from the CheckAndRecover()
+// result the test only requires that Shutdown() returns.
+TEST(allreduce_robust, no_timeout_shut_down)
+{
+  rabit::engine::AllreduceRobust m;
+
+  // A C++11 std::string is contiguous and null-terminated, so &s[0] gives
+  // Init() its mutable char* arguments; no copy into a variable-length array
+  // (a non-standard compiler extension) is needed.
+  std::string timeout_arg = "rabit_timeout=1";
+  std::string timeout_sec_arg = "rabit_timeout_sec=1";
+  std::string debug_arg = "rabit_debug=1";
+
+  char* argv[] = {
+    &timeout_arg[0],
+    &timeout_sec_arg[0],
+    &debug_arg[0],
+  };
+  m.Init(3, argv);
+  m.rank = 0;
+
+  EXPECT_EQ(m.CheckAndRecover(succ_type), true);
+  // Shut down 10 ms later, long before the configured 1 s timeout could
+  // elapse.
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  m.Shutdown();
+}
+
+// An error is reported and Shutdown() is called 10 ms later, while the 1 s
+// timeout is still pending.
+TEST(allreduce_robust, shut_down_before_timeout)
+{
+  rabit::engine::AllreduceRobust m;
+
+  // A C++11 std::string is contiguous and null-terminated, so &s[0] gives
+  // Init() its mutable char* arguments; no copy into a variable-length array
+  // (a non-standard compiler extension) is needed.
+  std::string timeout_arg = "rabit_timeout=1";
+  std::string timeout_sec_arg = "rabit_timeout_sec=1";
+  std::string debug_arg = "rabit_debug=1";
+
+  char* argv[] = {
+    &timeout_arg[0],
+    &timeout_sec_arg[0],
+    &debug_arg[0],
+  };
+  m.Init(3, argv);
+  m.rank = 0;
+  // give the engine a LinkRecord to reference as the failing link
+  rabit::engine::AllreduceRobust::LinkRecord a;
+  m.err_link = &a;
+
+  EXPECT_EQ(m.CheckAndRecover(err_type), false);
+  // Shut down 10 ms into the 1 s timeout window.
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  m.Shutdown();
+}
diff --git a/test/cpp/test_io.cc b/test/cpp/test_io.cc
new file mode 100644
index 0000000..0e4b70b
--- /dev/null
+++ b/test/cpp/test_io.cc
@@ -0,0 +1,18 @@
+/*!
+ *  Copyright (c) 2019 by Contributors
+ */
+#include <gtest/gtest.h>
+#include <rabit/internal/io.h>
+
+#include <vector>
+
+namespace rabit {
+// After Seek(SeekEnd), Tell() must equal the size the buffer was constructed with.
+TEST(MemoryFixSizeBuffer, Seek) {
+  constexpr size_t kSize = 64;
+  std::vector<int32_t> storage(kSize);
+  utils::MemoryFixSizeBuffer buffer(storage.data(), storage.size());
+  buffer.Seek(utils::MemoryFixSizeBuffer::SeekEnd);
+  ASSERT_EQ(static_cast<size_t>(buffer.Tell()), kSize);
+}
+}  // namespace rabit
diff --git a/test/cpp/test_main.cpp b/test/cpp/test_main.cpp
new file mode 100644
index 0000000..6eb025a
--- /dev/null
+++ b/test/cpp/test_main.cpp
@@ -0,0 +1,8 @@
+#include "gtest/gtest.h"
+
+int main(int argc, char** argv) {
+    ::testing::InitGoogleTest(&argc, argv);
+    // assigned after InitGoogleTest(), i.e. after the command line was handed to gtest
+    ::testing::GTEST_FLAG(death_test_style) = "threadsafe";
+    return RUN_ALL_TESTS();
+}
diff --git a/test/lazy_recover.cc b/test/lazy_recover.cc
new file mode 100644
index 0000000..180e2e4
--- /dev/null
+++ b/test/lazy_recover.cc
@@ -0,0 +1,125 @@
+// this is a test case to test whether rabit can recover model when
+// facing an exception
+#include <rabit/rabit.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+using namespace rabit;
+
+// Minimal Serializable used as checkpoint payload: a single vector of floats.
+class Model : public rabit::Serializable {
+ public:
+  // the values that get reduced and checkpointed
+  std::vector<float> data;
+  /*! \brief read the vector from the stream */
+  virtual void Load(rabit::Stream *fi) {
+    fi->Read(&data);
+  }
+  /*! \brief write the vector to the stream */
+  virtual void Save(rabit::Stream *fo) const {
+    fo->Write(data);
+  }
+  /*! \brief reset the vector to n entries, each 1.0f */
+  virtual void InitModel(size_t n) {
+    data.assign(n, 1.0f);
+  }
+};
+
+// Max-allreduce (i * (rank+1)) % z + model->data[i] across ranks and compare
+// each element with the maximum recomputed locally for every rank.
+inline void TestMax(Model *model, int ntrial, int iter) {
+  int rank = rabit::GetRank();
+  int nproc = rabit::GetWorldSize();
+  const int z = iter + 111;
+  std::vector<float> ndata(model->data.size());
+  for (size_t i = 0; i < ndata.size(); ++i) ndata[i] = (i * (rank+1)) % z  + model->data[i];
+  rabit::Allreduce<op::Max>(&ndata[0], ndata.size());
+  for (size_t i = 0; i < ndata.size(); ++i) {
+    float rmax = (i * 1) % z + model->data[i];
+    for (int r = 0; r < nproc; ++r) {
+      rmax = std::max(rmax, (float)((i * (r+1)) % z) + model->data[i]);
+    }
+    // i is a size_t: cast it so the argument really matches the %lu conversion
+    utils::Check(rmax == ndata[i], "[%d] TestMax check failure, i=%lu, rmax=%f, ndata=%f",
+                 rank, static_cast<unsigned long>(i), rmax, ndata[i]);
+  }
+}
+
+// Sum-allreduce the per-rank contributions, check every element against the
+// locally recomputed total, then make the reduced vector the new model.
+inline void TestSum(Model *model, int ntrial, int iter) {
+  const int my_rank = rabit::GetRank();
+  const int world = rabit::GetWorldSize();
+  const int z = 131 + iter;
+  const std::vector<float> &base = model->data;
+
+  std::vector<float> reduced(base.size());
+  for (size_t k = 0; k < reduced.size(); ++k) {
+    reduced[k] = (k * (my_rank+1)) % z + base[k];
+  }
+  Allreduce<op::Sum>(&reduced[0], reduced.size());
+  for (size_t k = 0; k < reduced.size(); ++k) {
+    float expected = base[k] * world;
+    for (int r = 0; r < world; ++r) expected += (float)((k * (r+1)) % z);
+    utils::Check(fabsf(expected - reduced[k]) < 1e-5 ,
+                 "[%d] TestSum check failure, local=%g, allreduce=%g", my_rank, expected, reduced[k]);
+  }
+  model->data = reduced;
+}
+
+// Broadcast a deterministic n-byte string from `root` and check that this rank
+// ends up with the same bytes.
+inline void TestBcast(size_t n, int root, int ntrial, int iter) {
+  const int my_rank = rabit::GetRank();
+  // reference payload: every rank builds it locally
+  std::string expected(n, '\0');
+  for (size_t k = 0; k < n; ++k) {
+    expected[k] = char(k % 126 + 1);
+  }
+  // only the root starts with the payload; the other ranks start empty
+  std::string received;
+  if (my_rank == root) received = expected;
+  rabit::Broadcast(&received, root);
+  utils::Check(received == expected, "[%d] TestBcast fail", my_rank);
+}
+
+// Driver: run Max / Bcast / Sum for the iterations left after the last
+// checkpoint and record a lazy checkpoint after each one.
+int main(int argc, char *argv[]) {
+  if (argc < 3) {
+    printf("Usage: <ndata> <config>\n");
+    return 0;
+  }
+  const int n = atoi(argv[1]);
+  rabit::Init(argc, argv);
+  const int rank = rabit::GetRank();
+  const int nproc = rabit::GetWorldSize();
+  std::string name = rabit::GetProcessorName();
+  Model model;
+  srand(0);
+  // an optional rabit_num_trial=<k> argument is passed to the tests and logged
+  int ntrial = 0;
+  for (int a = 1; a < argc; ++a) {
+    int parsed;
+    if (sscanf(argv[a], "rabit_num_trial=%d", &parsed) == 1) ntrial = parsed;
+  }
+  const int first_iter = rabit::LoadCheckPoint(&model);
+  if (first_iter == 0) model.InitModel(n);  // 0 is treated as "start from scratch"
+  printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, first_iter);
+  const int step = std::max(nproc / 3, 1);
+  for (int r = first_iter; r < 3; ++r) {
+    TestMax(&model, ntrial, r);
+    printf("[%d] !!!TestMax pass, iter=%d\n",  rank, r);
+    // broadcast from every step-th rank
+    for (int root = 0; root < nproc; root += step) {
+      TestBcast(n, root, ntrial, r);
+    }
+    printf("[%d] !!!TestBcast pass, iter=%d\n", rank, r);
+    TestSum(&model, ntrial, r);
+    printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
+    rabit::LazyCheckPoint(&model);
+    printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r);
+  }
+  rabit::Finalize();
+  return 0;
+}
diff --git a/test/local_recover.cc b/test/local_recover.cc
new file mode 100644
index 0000000..1f0b28b
--- /dev/null
+++ b/test/local_recover.cc
@@ -0,0 +1,137 @@
+// this is a test case to test whether rabit can recover model when
+// facing an exception
+#include <rabit/rabit.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+
+using namespace rabit;
+
+// Minimal Serializable used as checkpoint payload: a single vector of floats.
+class Model : public rabit::Serializable {
+ public:
+  // the values that get reduced and checkpointed
+  std::vector<float> data;
+  /*! \brief read the vector from the stream */
+  virtual void Load(rabit::Stream *fi) {
+    fi->Read(&data);
+  }
+  /*! \brief write the vector to the stream */
+  virtual void Save(rabit::Stream *fo) const {
+    fo->Write(data);
+  }
+  /*! \brief reset the vector to n copies of v */
+  virtual void InitModel(size_t n, float v) {
+    data.assign(n, v);
+  }
+};
+
+// Max-allreduce values derived from the local model, verify them against the
+// global model, then refresh both the global and the local model.
+inline void TestMax(Model *model, Model *local, int ntrial, int iter) {
+  const int my_rank = rabit::GetRank();
+  const int world = rabit::GetWorldSize();
+  const int z = iter + 111;
+  std::vector<float> reduced(model->data.size());
+  // the lambda fills the buffer; it is handed to Allreduce as its third argument
+  auto fill_input = [&]() {
+    for (size_t k = 0; k < reduced.size(); ++k) {
+      reduced[k] = (k * (my_rank+1)) % z  + local->data[k];
+    }
+  };
+  rabit::Allreduce<op::Max>(&reduced[0], reduced.size(), fill_input);
+  for (size_t k = 0; k < reduced.size(); ++k) {
+    float expected = (k * 1) % z + model->data[k];
+    for (int r = 0; r < world; ++r) {
+      expected = std::max(expected, (float)((k * (r+1)) % z) + model->data[k] + r);
+    }
+    utils::Check(expected == reduced[k], "[%d] TestMax check failure", my_rank);
+  }
+  model->data = reduced;
+  local->data = reduced;
+  // the local model is the reduced vector shifted by this rank's id
+  for (size_t k = 0; k < reduced.size(); ++k) local->data[k] = reduced[k] + my_rank;
+}
+
+// Sum-allreduce values derived from the local model, verify the totals, then
+// refresh the global model and this rank's local model.
+inline void TestSum(Model *model, Model *local, int ntrial, int iter) {
+  const int my_rank = rabit::GetRank();
+  const int world = rabit::GetWorldSize();
+  const int z = 131 + iter;
+
+  std::vector<float> reduced(model->data.size());
+  for (size_t k = 0; k < reduced.size(); ++k) {
+    reduced[k] = (k * (my_rank+1)) % z + local->data[k];
+  }
+  Allreduce<op::Sum>(&reduced[0], reduced.size());
+
+  for (size_t k = 0; k < reduced.size(); ++k) {
+    // rank r is expected to have contributed its term plus the model value plus r
+    float expected = 0.0f;
+    for (int r = 0; r < world; ++r) expected += (float)((k * (r+1)) % z) + model->data[k] + r;
+    utils::Check(fabsf(expected - reduced[k]) < 1e-5 ,
+                 "[%d] TestSum check failure, local=%g, allreduce=%g", my_rank, expected, reduced[k]);
+  }
+  model->data = reduced;
+  // local->data is written element-wise: the reduced value plus the rank
+  for (size_t k = 0; k < reduced.size(); ++k) local->data[k] = reduced[k] + my_rank;
+}
+
+// Broadcast a deterministic n-byte string from `root` and check that this rank
+// ends up with the same bytes.
+inline void TestBcast(size_t n, int root, int ntrial, int iter) {
+  const int my_rank = rabit::GetRank();
+  // reference payload: every rank builds it locally
+  std::string expected(n, '\0');
+  for (size_t k = 0; k < n; ++k) {
+    expected[k] = char(k % 126 + 1);
+  }
+  // only the root starts with the payload; the other ranks start empty
+  std::string received;
+  if (my_rank == root) received = expected;
+  rabit::Broadcast(&received, root);
+  utils::Check(received == expected, "[%d] TestBcast fail", my_rank);
+}
+
+// Driver: needs <ndata> plus at least one more argument; runs Max / Bcast / Sum
+// for the iterations left after the last checkpoint and checkpoints both models.
+int main(int argc, char *argv[]) {
+  if (argc < 3) {
+    printf("Usage: <ndata> <config>\n");
+    return 0;
+  }
+  const int n = atoi(argv[1]);
+  rabit::Init(argc, argv);
+  const int rank = rabit::GetRank();
+  const int nproc = rabit::GetWorldSize();
+  std::string name = rabit::GetProcessorName();
+  Model model, local;
+  srand(0);
+  int ntrial = 0;
+  for (int a = 1; a < argc; ++a) {
+    int parsed;  // kept distinct from the element count n
+    if (sscanf(argv[a], "repeat=%d", &parsed) == 1) ntrial = parsed;
+  }
+  const int first_iter = rabit::LoadCheckPoint(&model, &local);
+  if (first_iter == 0) {
+    model.InitModel(n, 1.0f);
+    local.InitModel(n, 1.0f + rank);
+  }
+  printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, first_iter);
+  const int step = std::max(nproc / 3, 1);
+  for (int r = first_iter; r < 3; ++r) {
+    TestMax(&model, &local, ntrial, r);
+    printf("[%d] !!!TestMax pass, iter=%d\n",  rank, r);
+    for (int root = 0; root < nproc; root += step) {
+      TestBcast(n, root, ntrial, r);
+    }
+    printf("[%d] !!!TestBcast pass, iter=%d\n", rank, r);
+    TestSum(&model, &local, ntrial, r);
+    printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
+    rabit::CheckPoint(&model, &local);
+    printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r);
+  }
+  rabit::Finalize();
+  return 0;
+}
diff --git a/test/local_recover.py b/test/local_recover.py
new file mode 100755
index 0000000..6f7fae8
--- /dev/null
+++ b/test/local_recover.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+
+from __future__ import print_function
+from builtins import range
+
+import sys
+sys.path.append('../python')
+
+import rabit
+import numpy as np
+
+rabit.init(lib='mock')  # select the 'mock' flavour of the rabit library
+rank = rabit.get_rank()
+num_elems = 10
+num_rounds = 3
+contribution = np.ones(num_elems) * rank
+# resume from the last checkpoint; version 0 is treated as a fresh start
+version, model, local = rabit.load_checkpoint(True)
+if version != 0:
+    print('[%d] restart from version %d' % (rank, version))
+else:
+    model = np.zeros(num_elems)
+    local = np.ones(num_elems)
+# one allreduce + checkpoint per remaining round
+for round_idx in range(version, num_rounds):
+    reduced = rabit.allreduce(contribution + model + local, rabit.SUM)
+    print('[%d] iter=%d: %s' % (rank, round_idx, str(reduced)))
+    model = reduced
+    local[:] = round_idx
+    rabit.checkpoint(model, local)
+
+rabit.finalize()
diff --git a/test/model_recover.cc b/test/model_recover.cc
new file mode 100644
index 0000000..181638c
--- /dev/null
+++ b/test/model_recover.cc
@@ -0,0 +1,157 @@
+// this is a test case to test whether rabit can recover model when
+// facing an exception
+#include <rabit/rabit.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+
+using namespace rabit;
+
+// Minimal Serializable used as checkpoint payload: a single vector of floats.
+class Model : public rabit::Serializable {
+ public:
+  // the values that get reduced and checkpointed
+  std::vector<float> data;
+  /*! \brief read the vector from the stream */
+  virtual void Load(rabit::Stream *fi) {
+    fi->Read(&data);
+  }
+  /*! \brief write the vector to the stream */
+  virtual void Save(rabit::Stream *fo) const {
+    fo->Write(data);
+  }
+  /*! \brief reset the vector to n entries, each 1.0f */
+  virtual void InitModel(size_t n) {
+    data.assign(n, 1.0f);
+  }
+};
+
+// Max-allreduce (i * (rank+1)) % z + model->data[i] across ranks, compare each
+// element with the locally recomputed maximum, then store the reduced vector.
+inline void TestMax(Model *model, int iter) {
+  int rank = rabit::GetRank();
+  int nproc = rabit::GetWorldSize();
+  const int z = iter + 111;
+  std::vector<float> ndata(model->data.size());
+  for (size_t i = 0; i < ndata.size(); ++i) ndata[i] = (i * (rank+1)) % z  + model->data[i];
+  rabit::Allreduce<op::Max>(&ndata[0], ndata.size());
+  for (size_t i = 0; i < ndata.size(); ++i) {
+    float rmax = (i * 1) % z + model->data[i];
+    for (int r = 0; r < nproc; ++r) {
+      rmax = std::max(rmax, (float)((i * (r+1)) % z) + model->data[i]);
+    }
+    // i is a size_t: cast it so the argument really matches the %lu conversion
+    utils::Check(rmax == ndata[i], "[%d] TestMax check failure, i=%lu, rmax=%f, ndata=%f",
+                 rank, static_cast<unsigned long>(i), rmax, ndata[i]);
+  }
+  model->data = ndata;
+}
+
+// Sum-allreduce the per-rank contributions, check every element against the
+// locally recomputed total, then make the reduced vector the new model.
+inline void TestSum(Model *model, int iter) {
+  const int my_rank = rabit::GetRank();
+  const int world = rabit::GetWorldSize();
+  const int z = 131 + iter;
+  const std::vector<float> &base = model->data;
+
+  std::vector<float> reduced(base.size());
+  for (size_t k = 0; k < reduced.size(); ++k) {
+    reduced[k] = (k * (my_rank+1)) % z + base[k];
+  }
+  Allreduce<op::Sum>(&reduced[0], reduced.size());
+  for (size_t k = 0; k < reduced.size(); ++k) {
+    float expected = base[k] * world;
+    for (int r = 0; r < world; ++r) expected += (float)((k * (r+1)) % z);
+    utils::Check(fabsf(expected - reduced[k]) < 1e-5 ,
+                 "[%d] TestSum check failure, local=%g, allreduce=%g", my_rank, expected, reduced[k]);
+  }
+  model->data = reduced;
+}
+
+// Allgather one slice per rank and verify every slice. NOTE(review): the gathered
+// buffer replaces model->data, so it grows nproc-fold per call -- confirm intended.
+inline void TestAllgather(Model *model, int iter) {
+  const int my_rank = rabit::GetRank();
+  const int world = rabit::GetWorldSize();
+  const int z = 131 + iter;
+  const size_t slice_len = model->data.size();
+  const size_t slice_begin = my_rank * slice_len;
+  std::vector<float> gathered(slice_len * world);
+  for (size_t k = 0; k < slice_len; ++k) {
+    gathered[slice_begin + k] = (k * (my_rank+1)) % z + model->data[k];
+  }
+  Allgather(&gathered[0], gathered.size(), slice_begin, slice_len, slice_len);
+  for (size_t pos = 0; pos < gathered.size(); ++pos) {
+    const int owner = pos / slice_len;
+    const int offset = pos % slice_len;
+    const float expected = (offset * (owner+1)) % z + model->data[offset];
+    utils::Check(fabsf(expected - gathered[pos]) < 1e-5 ,
+                 "[%d] TestAllgather check failure, local=%g, allgatherring=%g", my_rank, expected, gathered[pos]);
+  }
+  model->data = gathered;
+}
+
+// Broadcast a deterministic n-byte string from `root`; every rank must end up
+// holding exactly those bytes.
+inline void TestBcast(size_t n, int root) {
+  const int my_rank = rabit::GetRank();
+  std::string expected(n, '\0');
+  for (size_t k = 0; k < n; ++k) {
+    expected[k] = char(k % 126 + 1);
+  }
+  // only the root starts with the payload; the other ranks start empty
+  std::string received = (my_rank == root) ? expected : std::string();
+  rabit::Broadcast(&received, root);
+
+  utils::Check(received == expected, "[%d] TestBcast fail", my_rank);
+}
+
+// Driver: sanity-check the rank numbering, then run Max / Bcast / Sum /
+// Allgather for the iterations left after the last checkpoint.
+int main(int argc, char *argv[]) {
+  if (argc < 3) {
+    printf("Usage: <ndata> <config>\n");
+    return 0;
+  }
+  const int n = atoi(argv[1]);
+  rabit::Init(argc, argv);
+  const int rank = rabit::GetRank();
+  const int nproc = rabit::GetWorldSize();
+  std::string name = rabit::GetProcessorName();
+
+  // the largest rank over all workers has to be world size - 1
+  int max_rank = rank;
+  rabit::Allreduce<op::Max>(&max_rank, 1);
+  utils::Check(max_rank == nproc - 1, "max rank is world size-1");
+
+  Model model;
+  srand(0);
+  int ntrial = 0;
+  for (int a = 1; a < argc; ++a) {
+    int parsed;
+    if (sscanf(argv[a], "rabit_num_trial=%d", &parsed) == 1) ntrial = parsed;
+  }
+  const int first_iter = rabit::LoadCheckPoint(&model);
+  if (first_iter == 0) model.InitModel(n);  // 0 is treated as "start from scratch"
+  printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, first_iter);
+
+  const int step = std::max(nproc / 3, 1);
+  for (int r = first_iter; r < 3; ++r) {
+    TestMax(&model, r);
+    printf("[%d] !!!TestMax pass, iter=%d\n",  rank, r);
+    for (int root = 0; root < nproc; root += step) {
+      TestBcast(n, root);
+    }
+    printf("[%d] !!!TestBcast pass, iter=%d\n", rank, r);
+    TestSum(&model, r);
+    printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
+    TestAllgather(&model, r);
+    printf("[%d] !!!TestAllgather pass, iter=%d\n", rank, r);
+    rabit::CheckPoint(&model);
+    printf("[%d] !!!Checkpoint pass, iter=%d\n", rank, r);
+  }
+  rabit::Finalize();
+  return 0;
+}
+
diff --git a/test/speed_runner.py b/test/speed_runner.py
new file mode 100644
index 0000000..1644bfe
--- /dev/null
+++ b/test/speed_runner.py
@@ -0,0 +1,34 @@
+import os
+import argparse
+import sys
+
+def main():
+  parser = argparse.ArgumentParser(description='TODO')
+  parser.add_argument('-ho', '--host_dir', required=True)
+  parser.add_argument('-s', '--submit_script', required=True)
+  parser.add_argument('-rex', '--rabit_exec', required=True)
+  parser.add_argument('-mpi', '--mpi_exec', required=True)
+  args = parser.parse_args()
+
+  ndata = [10**4, 10**5, 10**6, 10**7]
+  nrepeat = [10**4, 10**3, 10**2, 10]
+
+  machines = [2,4,8,16,31]
+
+  executables = [args.rabit_exec, args.mpi_exec]
+
+  for executable in executables:
+    sys.stderr.write('Executable %s' % executable)
+    sys.stderr.flush()
+    for i, data in enumerate(ndata):
+      for machine in machines:
+        host_file = os.path.join(args.host_dir, 'hosts%d' % machine)
+        cmd = 'python %s %d %s %s %d %d' % (args.submit_script, machine, host_file, executable, data, nrepeat[i])
+        sys.stderr.write('data=%d, repeat=%d, machine=%d\n' % (data, nrepeat[i], machine))
+        sys.stderr.flush()
+        os.system(cmd)
+    sys.stderr.write('\n')
+    sys.stderr.flush()
+
+if __name__ == "__main__":
+  main()
diff --git a/test/speed_test.cc b/test/speed_test.cc
new file mode 100644
index 0000000..8eb543d
--- /dev/null
+++ b/test/speed_test.cc
@@ -0,0 +1,99 @@
+// This program is used to test the speed of rabit API
+#include <rabit/rabit.h>
+#include <rabit/internal/timer.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <time.h>
+
+using namespace rabit;
+
+double max_tdiff, sum_tdiff, bcast_tdiff, tot_tdiff;  // accumulated utils::GetTime() differences, per collective and overall
+
+// Runs one max-Allreduce over n floats and adds the elapsed time to max_tdiff.
+inline void TestMax(size_t n) {
+  const int scale = rabit::GetRank() + 1;
+  std::vector<float> buf(n);
+  for (size_t k = 0; k < n; ++k) buf[k] = (k * scale) % 111;
+  // only the collective itself is timed, not the buffer fill
+  const double t0 = utils::GetTime();
+  rabit::Allreduce<op::Max>(&buf[0], buf.size());
+  max_tdiff += utils::GetTime() - t0;
+}
+
+// Runs one sum-Allreduce over n floats and adds the elapsed time to sum_tdiff.
+inline void TestSum(size_t n) {
+  const int kMod = 131;
+  const int scale = rabit::GetRank() + 1;
+  std::vector<float> buf(n);
+  for (size_t k = 0; k < n; ++k) buf[k] = (k * scale) % kMod;
+  // only the collective itself is timed, not the buffer fill
+  const double t0 = utils::GetTime();
+  rabit::Allreduce<op::Sum>(&buf[0], buf.size());
+  sum_tdiff += utils::GetTime() - t0;
+}
+
+// Runs one raw-buffer Broadcast of n bytes from `root`; elapsed time is added to bcast_tdiff.
+inline void TestBcast(size_t n, int root) {
+  const bool is_root = (rabit::GetRank() == root);
+  // payload byte k holds (k % 126 + 1)
+  std::string payload(n, '\0');
+  for (size_t k = 0; k < n; ++k) {
+    payload[k] = static_cast<char>(k % 126 + 1);
+  }
+  // non-root ranks start from an n-byte zero-filled receive buffer
+  std::string recv_buf = is_root ? payload : std::string(n, '\0');
+  // only the collective itself is timed
+  const double t0 = utils::GetTime();
+  rabit::Broadcast(&recv_buf[0], recv_buf.length(), root);
+  bcast_tdiff += utils::GetTime() - t0;
+}
+
+// Combines one per-rank timing over all workers; rank 0 reports mean/std and, if n != 0, MB/sec.
+inline void PrintStats(const char *name, double tdiff, int n, int nrep, size_t size) {
+  const int world_size = rabit::GetWorldSize();
+  double time_sum = tdiff;
+  rabit::Allreduce<op::Sum>(&time_sum, 1);
+  const double mean = time_sum / world_size;
+  // squared deviation of this rank, summed over all ranks, then divided by the rank count
+  double sq_dev = (tdiff - mean) * (tdiff - mean);
+  rabit::Allreduce<op::Sum>(&sq_dev, 1);
+  const double stddev = sqrt(sq_dev / world_size);
+  if (rabit::GetRank() != 0) return;
+  // from here on only rank 0 is running
+  rabit::TrackerPrintf("%s: mean=%g, std=%g sec\n", name, mean, stddev);
+  if (n == 0) return;
+  // total volume: n elements of `size` bytes, times nrep repetitions
+  const double total_bytes = static_cast<double>(n) * (nrep * size);
+  rabit::TrackerPrintf("%s-speed: %g MB/sec\n", name, (total_bytes / mean) / 1024 / 1024 );
+}
+
+// Benchmark driver: argv[1] is the element count, argv[2] the number of repetitions.
+int main(int argc, char *argv[]) {
+  if (argc < 3) {
+    printf("Usage: <ndata> <nrepeat>\n");
+    return 0;
+  }
+  srand(0);  // same seed on every rank, so rand() picks the same broadcast root everywhere
+  int n = atoi(argv[1]);
+  int nrep = atoi(argv[2]);
+  utils::Check(n >= 0, "ndata must be non-negative");  // a negative n would wrap as size_t
+  utils::Check(nrep >= 1, "need to at least repeat running once");
+  rabit::Init(argc, argv);
+  int nproc = rabit::GetWorldSize();
+  max_tdiff = sum_tdiff = bcast_tdiff = 0;
+  double tstart = utils::GetTime();
+  for (int i = 0; i < nrep; ++i) {
+    TestMax(n);
+    TestSum(n);
+    TestBcast(n, rand() % nproc);
+  }
+  tot_tdiff = utils::GetTime() - tstart;
+  // use allreduce to get the mean and std of each timing; n=0 below skips the MB/sec line
+  PrintStats("max_tdiff", max_tdiff, n, nrep, sizeof(float));
+  PrintStats("sum_tdiff", sum_tdiff, n, nrep, sizeof(float));
+  PrintStats("bcast_tdiff", bcast_tdiff, n, nrep, sizeof(char));
+  PrintStats("tot_tdiff", tot_tdiff, 0, nrep, sizeof(float));
+  rabit::Finalize();
+  return 0;
+}
diff --git a/test/test.mk b/test/test.mk
new file mode 100644
index 0000000..1028ec5
--- /dev/null
+++ b/test/test.mk
@@ -0,0 +1,37 @@
+RABIT_BUILD_DMLC = 0
+# RABIT_BUILD_DMLC=1 points DMLC at ../dmlc-core; any other value uses ../../dmlc-core
+ifeq ($(RABIT_BUILD_DMLC),1)
+    DMLC=../dmlc-core
+else
+    DMLC=../../dmlc-core
+endif
+
+# this makefile launches the rabit test cases through $(DMLC)/tracker/dmlc-submit
+.PHONY: all
+# `all` runs every test case defined below
+all: model_recover_10_10k  model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k lazy_recover_10_10k_die_hard lazy_recover_10_10k_die_same ringallreduce_10_10k pylocal_recover_10_10k
+# NOTE(review): each mock=a,b,c,d presumably describes one injected failure -- confirm the field meaning
+# this experiment tests recovery from an actual process exit; keepalive is used to keep the program alive
+model_recover_10_10k:
+	$(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 rabit_bootstrap_cache=true rabit_debug=true rabit_reduce_ring_mincount=1 rabit_timeout=true rabit_timeout_sec=5
+
+model_recover_10_10k_die_same:
+	$(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 rabit_bootstrap_cache=1
+
+model_recover_10_10k_die_hard:
+	$(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 rabit_bootstrap_cache=1
+
+local_recover_10_10k:
+	$(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
+# same arguments as local_recover_10_10k, but the worker program is local_recover.py
+pylocal_recover_10_10k:
+	$(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
+
+lazy_recover_10_10k_die_hard:
+	$(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
+
+lazy_recover_10_10k_die_same:
+	$(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
+# no mock= entries and no --local-num-attempt here; only rabit_reduce_ring_mincount=10 is passed
+ringallreduce_10_10k:
+	$(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 rabit_reduce_ring_mincount=10