New Upstream Release - libpsm2
Ready changes
Summary
Merged new upstream version: 11.2.230 (was: 11.2.185).
Resulting package
Built on 2022-10-20T05:10 (took 13m0s)
The resulting binary packages can be installed (if you have the apt repository enabled) by running one of:
apt install -t fresh-releases libpsm2-2-compat-dbgsym
apt install -t fresh-releases libpsm2-2-compat
apt install -t fresh-releases libpsm2-2-dbgsym
apt install -t fresh-releases libpsm2-2
apt install -t fresh-releases libpsm2-dev
Diff
diff --git a/COMMIT b/COMMIT
index b94efbd..e4fb92c 100644
--- a/COMMIT
+++ b/COMMIT
@@ -1 +1 @@
-30c52a0fd155774e18cc06328a1ba83c2a6a8104
\ No newline at end of file
+3f7c29811e820bc5645cbcad6a4c9d61133f3156
\ No newline at end of file
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 7571183..631a5b1 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -13,3 +13,5 @@ Dmitry (dmitrygx on github.com)
Florian Weimer (fweimer on github.com)
Jonas Hahnfeld (hahnjo on github.com)
Tom Stellard (tstellar on github.com)
+Chuck Cranor (chuchcranor on github.com)
+Rémi Lacroix (RemiLacroix-IDRIS on github.com)
diff --git a/Makefile b/Makefile
index 5a31d64..9faeb73 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,7 @@
#
# GPL LICENSE SUMMARY
#
+# Copyright(c) 2021 Cornelis Networks.
# Copyright(c) 2017 Intel Corporation.
#
# This program is free software; you can redistribute it and/or modify
@@ -16,10 +17,11 @@
# General Public License for more details.
#
# Contact Information:
-# Intel Corporation, www.intel.com
+# Cornelis Networks, www.cornelisnetworks.com
#
# BSD LICENSE
#
+# Copyright(c) 2021 Cornelis Networks.
# Copyright(c) 2017 Intel Corporation.
#
# Redistribution and use in source and binary forms, with or without
@@ -129,7 +131,7 @@ INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips -I$(OUTDIR)
ifneq (x86_64,$(arch))
ifneq (i386,$(arch))
- $(error Unsupported architecture $(arch))
+ anerr := $(error Unsupported architecture $(arch))
endif
endif
@@ -164,7 +166,7 @@ nthreads := $(shell echo $$(( `nproc` * 2 )) )
# The DISTRO variable is used subsequently for variable
# behaviors of the 3 distros.
-DISTRO := $(shell . /etc/os-release; if [[ "$$ID" == "sle_hpc" ]]; then ID="sles"; fi; echo $$ID)
+DISTRO := $(shell . /etc/os-release; if [ "$$ID" = "sle_hpc" ]; then ID="sles"; fi; echo $$ID)
# By default the following two variables have the following values:
LIBPSM2_COMPAT_CONF_DIR := /etc
@@ -374,8 +376,8 @@ debug:
$(MAKE) OUTDIR=$(OUTDIR) PSM_DEBUG=1
test_clean:
- if [ -d ./test ]; then \
- $(MAKE) -C test clean; \
+ if [ -d ./test && -e ./test/Makefile ]; then \
+ $(MAKE) -f ./test/Makefile -C test clean; \
fi
specfile_clean:
diff --git a/README b/README
index 7990555..2961534 100644
--- a/README
+++ b/README
@@ -67,7 +67,6 @@ Contains the following sections:
- INSTALLING
* INSTALLING USING MAKEFILE
* INSTALLING USING EITHER YUM OR DNF
-- TESTING
- RELATED SOFTWARE TO PSM2
- SUPPORTING DOCUMENTATION
@@ -251,6 +250,24 @@ libraries available on them. Open MPI provides a standard configure, make and
make install mechanism which will detect and build the relevant PSM2 network
modules for Open MPI once the header and runtime files are detected.
+Open MPI 4.1.x, OFI BTL, and high PPN jobs
+----------------
+Open MPI added the OFI BTL for one-sided communication. On an OPA fabric, the
+OFI BTL may use the PSM2 OFI provider underneath. If PSM2 is in-use as both
+the MTL (directly or via OFI) and the BTL (via OFI), then each rank in the
+Open MPI job will require two PSM2 endpoints and PSM2 context-sharing will
+be disabled.
+
+In this case, total number of PSM2 ranks on a node can be no more than:
+ (num_hfi * num_user_contexts)/2
+Where num_user_contexts is typically equal to the number of physical CPU
+cores on that node.
+
+If your job does not require an inter-node BTL (e.g. OFI), then you can
+disable the OFI BTL in one of two ways:
+ 1. When building Open MPI, specify '--with-ofi=no' when you run 'configure'.
+ 2. When running your Open MPI job, add '-mca btl self,vader'.
+
MVAPICH2 support
----------------
MVAPICH2 supports PSM2 transport for optimized communication on HFI hardware.
diff --git a/buildflags.mak b/buildflags.mak
index 7c3cda0..0ce15aa 100644
--- a/buildflags.mak
+++ b/buildflags.mak
@@ -4,6 +4,7 @@
#
# GPL LICENSE SUMMARY
#
+# Copyright(c) 2021 Cornelis Networks.
# Copyright(c) 2016 Intel Corporation.
#
# This program is free software; you can redistribute it and/or modify
@@ -16,10 +17,11 @@
# General Public License for more details.
#
# Contact Information:
-# Intel Corporation, www.intel.com
+# Cornelis Networks, www.cornelisnetworks.com
#
# BSD LICENSE
#
+# Copyright(c) 2021 Cornelis Networks.
# Copyright(c) 2016 Intel Corporation.
#
# Redistribution and use in source and binary forms, with or without
@@ -118,13 +120,13 @@ ifneq (icc,${CC})
RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX2 ; echo $$?)
else
RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX ; echo $$?)
- $(warning ***NOTE TO USER**** Disabling AVX2 will harm performance)
+ anerr := $(warning ***NOTE TO USER**** Disabling AVX2 will harm performance)
endif
ifeq (0,${RET})
BASECFLAGS += ${MAVX2}
else
- $(error Compiler does not support ${MAVX2} )
+ anerr := $(error Compiler does not support ${MAVX2} )
endif
else
BASECFLAGS += ${MAVX2}
@@ -138,7 +140,7 @@ ifneq (,${PSM_AVX512})
ifeq (0,${RET})
BASECFLAGS += -mavx512f
else
- $(error Compiler does not support AVX512 )
+ anerr := $(error Compiler does not support AVX512 )
endif
BASECFLAGS += -DPSM_AVX512
endif
@@ -203,7 +205,7 @@ else
BASECFLAGS += -funwind-tables -Wno-strict-aliasing -Wformat-security
else
ifneq (${CCARCH},gcc4)
- $(error Unknown compiler arch "${CCARCH}")
+ anerr := $(error Unknown compiler arch "${CCARCH}")
endif # gcc4
endif # gcc
endif # icc
diff --git a/debian/changelog b/debian/changelog
index dd6f458..2c4dcf7 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,10 @@
+libpsm2 (11.2.230-1) UNRELEASED; urgency=low
+
+ * New upstream release.
+ * Drop patch remove_makefile_bashisms.patch, present upstream.
+
+ -- Debian Janitor <janitor@jelmer.uk> Thu, 20 Oct 2022 04:58:46 -0000
+
libpsm2 (11.2.185-2) unstable; urgency=medium
* Team upload.
diff --git a/debian/patches/disable_makefile_git_versioning.patch b/debian/patches/disable_makefile_git_versioning.patch
index 9ed9733..9d21b5e 100644
--- a/debian/patches/disable_makefile_git_versioning.patch
+++ b/debian/patches/disable_makefile_git_versioning.patch
@@ -6,9 +6,11 @@ Forwarded: not-needed
Last-Update: 2020-07-30
---
This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- a/Makefile
-+++ b/Makefile
-@@ -195,7 +195,7 @@
+Index: libpsm2.git/Makefile
+===================================================================
+--- libpsm2.git.orig/Makefile
++++ libpsm2.git/Makefile
+@@ -197,7 +197,7 @@ endif
export LIBPSM2_COMPAT_CONF_DIR
# The desired version number comes from the most recent tag starting with "v"
diff --git a/debian/patches/fortify_source.patch b/debian/patches/fortify_source.patch
index 0bc52b6..d17c1d3 100644
--- a/debian/patches/fortify_source.patch
+++ b/debian/patches/fortify_source.patch
@@ -5,9 +5,11 @@ Forwarded: not-needed
Last-Update: 2020-07-30
---
This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- a/buildflags.mak
-+++ b/buildflags.mak
-@@ -158,7 +158,7 @@
+Index: libpsm2.git/buildflags.mak
+===================================================================
+--- libpsm2.git.orig/buildflags.mak
++++ libpsm2.git/buildflags.mak
+@@ -160,7 +160,7 @@ endif
ifneq (,${PSM_DEBUG})
BASECFLAGS += -O -g3 -DPSM_DEBUG -D_HFI_DEBUGGING -funit-at-a-time -Wp,-D_FORTIFY_SOURCE=2
else
@@ -16,9 +18,11 @@ This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
endif
ifneq (,${PSM_COVERAGE}) # This check must come after PSM_DEBUG to override optimization setting
BASECFLAGS += -O -fprofile-arcs -ftest-coverage
---- a/compat/buildflags.mak
-+++ b/compat/buildflags.mak
-@@ -72,7 +72,7 @@
+Index: libpsm2.git/compat/buildflags.mak
+===================================================================
+--- libpsm2.git.orig/compat/buildflags.mak
++++ libpsm2.git/compat/buildflags.mak
+@@ -72,7 +72,7 @@ LINKER_SCRIPT := -Wl,--version-script $(
WERROR := -Werror
INCLUDES := -I$(top_srcdir)/include -I$(top_srcdir)/include/$(os)-$(arch) -I$(top_srcdir)/mpspawn
diff --git a/debian/patches/remove_makefile_bashisms.patch b/debian/patches/remove_makefile_bashisms.patch
deleted file mode 100644
index a2f7784..0000000
--- a/debian/patches/remove_makefile_bashisms.patch
+++ /dev/null
@@ -1,17 +0,0 @@
-Description: Correct bashishsms in Makefile
-Author: Brian T. Smith <bsmith@systemfabricworks.com>
-Forwarded: https://github.com/intel/opa-psm2/issues/55
-Last-Update: 2020-08-06
----
-This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- a/Makefile
-+++ b/Makefile
-@@ -164,7 +164,7 @@
- # The DISTRO variable is used subsequently for variable
- # behaviors of the 3 distros.
-
--DISTRO := $(shell . /etc/os-release; if [[ "$$ID" == "sle_hpc" ]]; then ID="sles"; fi; echo $$ID)
-+DISTRO := $(shell . /etc/os-release; if [ "$$ID" = "sle_hpc" ]; then ID="sles"; fi; echo $$ID)
-
- # By default the following two variables have the following values:
- LIBPSM2_COMPAT_CONF_DIR := /etc
diff --git a/debian/patches/series b/debian/patches/series
index 059c7a0..7a83a45 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,3 +1,2 @@
disable_makefile_git_versioning.patch
fortify_source.patch
-remove_makefile_bashisms.patch
diff --git a/include/linux-i386/sysdep.h b/include/linux-i386/sysdep.h
index bfd5746..2c48c1e 100644
--- a/include/linux-i386/sysdep.h
+++ b/include/linux-i386/sysdep.h
@@ -139,12 +139,9 @@ static __inline__ uint32_t ips_cmpxchg(volatile uint32_t *ptr,
uint32_t old_val, uint32_t new_val)
{
uint32_t prev;
- struct xchg_dummy {
- uint32_t a[100];
- };
asm volatile (LOCK_PREFIX "cmpxchgl %1,%2" : "=a"(prev)
- : "q"(new_val), "m"(*(struct xchg_dummy *)ptr), "0"(old_val)
+ : "q"(new_val), "m"(*ptr), "0"(old_val)
: "memory");
return prev;
diff --git a/include/opa_debug.h b/include/opa_debug.h
index d5d8ff2..36e53a3 100644
--- a/include/opa_debug.h
+++ b/include/opa_debug.h
@@ -83,6 +83,7 @@
#define __HFI_ENVDBG 0x400
#define __HFI_EPKTDBG 0x800 /* print error packet data */
#define __HFI_CCADBG 0x1000 /* print CCA related events */
+#define __HFI_CUDADBG 0x2000 /* print CUDA calls, events */
#else /* _HFI_DEBUGGING */
/*
@@ -100,6 +101,7 @@
/* print mmap/nopage stuff, not using VDBG any more */
#define __HFI_MMDBG 0x0
#define __HFI_CCADBG 0x0 /* print CCA related events */
+#define __HFI_CUDADBG 0x0
#endif /* _HFI_DEBUGGING */
diff --git a/include/opa_udebug.h b/include/opa_udebug.h
index 9fd59cb..b25da1f 100644
--- a/include/opa_udebug.h
+++ b/include/opa_udebug.h
@@ -124,6 +124,7 @@ extern FILE *__hfi_dbgout;
"env " fmt, ##__VA_ARGS__)
#define _HFI_MMDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_MMDBG, fmt, ##__VA_ARGS__)
#define _HFI_CCADBG(fmt, ...) __HFI_DBG_WHICH(__HFI_CCADBG, fmt, ##__VA_ARGS__)
+#define _HFI_CUDADBG(fmt, ...) __HFI_DBG_WHICH(__HFI_CUDADBG, fmt, ##__VA_ARGS__)
/*
* Use these macros (_HFI_DBG_ON and _HFI_DBG_ALWAYS) together
@@ -150,6 +151,9 @@ extern FILE *__hfi_dbgout;
#define _HFI_CCADBG_ON unlikely(hfi_debug & __HFI_CCADBG)
#define _HFI_CCADBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+#define _HFI_CUDADBG_ON unlikely(hfi_debug & __HFI_CUDADBG)
+#define _HFI_CUDADBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+
#define _HFI_INFO_ON unlikely(hfi_debug & __HFI_INFO)
#define _HFI_INFO_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
@@ -177,6 +181,7 @@ extern FILE *__hfi_dbgout;
#define _HFI_VDBG(fmt, ...)
#define _HFI_MMDBG(fmt, ...)
#define _HFI_CCADBG(fmt, ...)
+#define _HFI_CUDADBG(fmt, ...)
#define _HFI_DBG_ON 0
#define _HFI_DBG_ALWAYS(fmt, ...)
@@ -186,6 +191,8 @@ extern FILE *__hfi_dbgout;
#define _HFI_PRDBG_ALWAYS(fmt, ...)
#define _HFI_CCADBG_ON 0
#define _HFI_CCADBG_ALWAYS(fmt, ...)
+#define _HFI_CUDADBG_ON 0
+#define _HFI_CUDADBG_ALWAYS(fmt, ...)
#define _HFI_INFO_ON 0
#define _HFI_INFO_ALWAYS(fmt, ...)
diff --git a/psm.c b/psm.c
index 7f929ce..ec1fd88 100644
--- a/psm.c
+++ b/psm.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -92,8 +94,8 @@ uint32_t psmi_cpu_model;
#ifdef PSM_CUDA
int is_cuda_enabled;
int is_gdr_copy_enabled;
-int device_support_gpudirect;
-int gpu_p2p_supported = 0;
+int _device_support_gpudirect = -1; // -1 indicates "unset". See device_support_gpudirect().
+int _gpu_p2p_supported = -1; // -1 indicates "unset". see gpu_p2p_supported().
int my_gpu_device = 0;
int cuda_lib_version;
int is_driver_gpudirect_enabled;
@@ -116,6 +118,7 @@ CUresult (*psmi_cuDriverGetVersion)(int* driverVersion);
CUresult (*psmi_cuDeviceGetCount)(int* count);
CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags);
CUresult (*psmi_cuStreamDestroy)(CUstream phStream);
+CUresult (*psmi_cuStreamSynchronize)(CUstream phStream);
CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags);
CUresult (*psmi_cuEventDestroy)(CUevent hEvent);
CUresult (*psmi_cuEventQuery)(CUevent hEvent);
@@ -217,6 +220,7 @@ int psmi_cuda_lib_load()
PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetCount);
PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamCreate);
PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamDestroy);
+ PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamSynchronize);
PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventCreate);
PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventDestroy);
PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventQuery);
@@ -251,7 +255,6 @@ fail:
int psmi_cuda_initialize()
{
psm2_error_t err = PSM2_OK;
- int num_devices, dev;
PSM2_LOG_MSG("entering");
_HFI_VDBG("Enabling CUDA support.\n");
@@ -262,77 +265,6 @@ int psmi_cuda_initialize()
PSMI_CUDA_CALL(cuInit, 0);
- /* Check if CUDA context is available. If not, we are not allowed to
- * launch any CUDA API calls */
- PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt);
- if (ctxt == NULL) {
- _HFI_INFO("Unable to find active CUDA context\n");
- is_cuda_enabled = 0;
- err = PSM2_OK;
- return err;
- }
-
- CUdevice current_device;
- CUcontext primary_ctx;
- PSMI_CUDA_CALL(cuCtxGetDevice, &current_device);
- int is_ctx_active;
- unsigned ctx_flags;
- PSMI_CUDA_CALL(cuDevicePrimaryCtxGetState, current_device, &ctx_flags,
- &is_ctx_active);
- if (!is_ctx_active) {
- /* There is an issue where certain CUDA API calls create
- * contexts but does not make it active which cause the
- * driver API call to fail with error 709 */
- PSMI_CUDA_CALL(cuDevicePrimaryCtxRetain, &primary_ctx,
- current_device);
- is_cuda_primary_context_retain = 1;
- }
-
- /* Check if all devices support Unified Virtual Addressing. */
- PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices);
-
- device_support_gpudirect = 1;
-
- for (dev = 0; dev < num_devices; dev++) {
- CUdevice device;
- PSMI_CUDA_CALL(cuDeviceGet, &device, dev);
- int unifiedAddressing;
- PSMI_CUDA_CALL(cuDeviceGetAttribute,
- &unifiedAddressing,
- CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING,
- device);
-
- if (unifiedAddressing !=1) {
- _HFI_ERROR("CUDA device %d does not support Unified Virtual Addressing.\n", dev);
- goto fail;
- }
-
- int major;
- PSMI_CUDA_CALL(cuDeviceGetAttribute,
- &major,
- CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
- device);
- if (major < 3) {
- device_support_gpudirect = 0;
- _HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev);
- }
-
- if (device != current_device) {
- int canAccessPeer = 0;
- PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer,
- current_device, device);
-
- if (canAccessPeer != 1)
- _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev);
- else
- gpu_p2p_supported |= (1 << device);
- } else {
- /* Always support p2p on the same GPU */
- my_gpu_device = device;
- gpu_p2p_supported |= (1 << device);
- }
- }
-
union psmi_envvar_val env_enable_gdr_copy;
psmi_getenv("PSM2_GDRCOPY",
"Enable (set envvar to 1) for gdr copy support in PSM (Enabled by default)",
@@ -384,6 +316,8 @@ psm2_error_t __psm2_init(int *major, int *minor)
{
psm2_error_t err = PSM2_OK;
union psmi_envvar_val env_tmask;
+ union psmi_envvar_val devs;
+ int devid_enabled[PTL_MAX_INIT];
psmi_log_initialize();
@@ -538,12 +472,26 @@ psm2_error_t __psm2_init(int *major, int *minor)
psmi_epid_init();
- int rc = psmi_hal_initialize();
+ psmi_getenv("PSM2_DEVICES",
+ "Ordered list of PSM-level devices",
+ PSMI_ENVVAR_LEVEL_USER,
+ PSMI_ENVVAR_TYPE_STR,
+ (union psmi_envvar_val)PSMI_DEVICES_DEFAULT, &devs);
- if (rc)
- {
- err = PSM2_INTERNAL_ERR;
+ if ((err = psmi_parse_devices(devid_enabled, devs.e_str)))
goto fail;
+
+ /* setup a dummy (null) hal if we are not using PTL_DEVID_IPS */
+ if (!psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
+ psmi_hal_initialize_null();
+ } else {
+ int rc = psmi_hal_initialize();
+
+ if (rc)
+ {
+ err = PSM2_INTERNAL_ERR;
+ goto fail;
+ }
}
#ifdef PSM_CUDA
@@ -782,10 +730,10 @@ psm2_error_t __psm2_info_query(psm2_info_query_t q, void *out,
{
char *networkType = (char*)out;
size_t networkTypeLength = args[0].length;
- const char *const intelopa = "Intel(R) OPA";
- if (networkTypeLength >= strlen(intelopa)+1)
+ const char *const cornelisopx = "Cornelis(TM) OPX";
+ if (networkTypeLength >= strlen(cornelisopx)+1)
{
- strcpy(networkType,intelopa);
+ strcpy(networkType,cornelisopx);
rv = PSM2_OK;
}
}
diff --git a/psm2_hal.c b/psm2_hal.c
index b4b9d9a..09aa251 100644
--- a/psm2_hal.c
+++ b/psm2_hal.c
@@ -379,6 +379,20 @@ int psmi_hal_initialize(void)
return -PSM_HAL_ERROR_INIT_FAILED;
}
+/* psmi_hal_initialize_null */
+void psmi_hal_initialize_null(void)
+{
+ static struct _psmi_hal_instance nullhal = {
+ .type = PSM_HAL_INSTANCE_NULL,
+ .description = "NULL HAL (hardware disabled)",
+ .hfi_name = "null",
+ .hfi_sys_class_path = "/dev/null",
+ .params = {0}
+ };
+
+ psmi_hal_current_hal_instance = &nullhal;
+}
+
int psmi_hal_finalize(void)
{
struct _psmi_hal_instance *p = psmi_hal_current_hal_instance;
diff --git a/psm2_hal.h b/psm2_hal.h
index 1bec596..9e94da8 100644
--- a/psm2_hal.h
+++ b/psm2_hal.h
@@ -75,6 +75,7 @@ typedef enum
PSM_HAL_INSTANCE_GEN2 = 2,
PSM_HAL_INSTANCE_GEN3 = 3,
+ PSM_HAL_INSTANCE_NULL = 98,
#ifdef PSM2_MOCK_TESTING
PSM_HAL_INSTANCE_MOCK = 99,
#endif
@@ -701,6 +702,7 @@ void psmi_hal_register_instance(psmi_hal_instance_t *);
INSTANCES are registered, or PSM_HAL_ERROR_INIT_FAILED when
another failure has occured during initialization. */
int psmi_hal_initialize(void);
+void psmi_hal_initialize_null(void);
int psmi_hal_finalize(void);
diff --git a/psm2_mq.h b/psm2_mq.h
index 7b63608..98047d3 100644
--- a/psm2_mq.h
+++ b/psm2_mq.h
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2017 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2017 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -519,6 +521,8 @@ psm2_error_t psm2_mq_setopt(psm2_mq_t mq, int option, const void *value);
#define PSM2_MQ_FLAG_SENDSYNC 0x01
/**< MQ Send Force synchronous send */
+#define PSM2_MQ_FLAG_GDRCPY_ONLY 0x02
+ /**< only GDRCOPY is allowed for GPU */
#define PSM2_MQ_REQINVALID ((psm2_mq_req_t)(NULL))
/**< MQ request completion value */
diff --git a/psm_ep.c b/psm_ep.c
index 8c4fe5e..8a092a8 100644
--- a/psm_ep.c
+++ b/psm_ep.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -89,9 +91,6 @@ static psm2_error_t psmi_ep_open_device(const psm2_ep_t ep,
* hfi.
*/
-static psm2_error_t psmi_parse_devices(int devices[PTL_MAX_INIT],
- const char *devstr);
-static int psmi_device_is_enabled(const int devices[PTL_MAX_INIT], int devid);
int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
psm2_error_t __psm2_ep_num_devunits(uint32_t *num_units_o)
@@ -1034,6 +1033,9 @@ __psm2_ep_open(psm2_uuid_t const unique_job_key,
char uvalue[6], pvalue[6];
int devid_enabled[PTL_MAX_INIT];
union psmi_envvar_val devs;
+#ifdef PSM_CUDA
+ int release_gdr = 0;
+#endif
PSM2_LOG_MSG("entering");
PSMI_ERR_UNLESS_INITIALIZED(NULL);
@@ -1089,8 +1091,10 @@ __psm2_ep_open(psm2_uuid_t const unique_job_key,
}
#ifdef PSM_CUDA
- if (PSMI_IS_GDR_COPY_ENABLED)
+ if (PSMI_IS_GDR_COPY_ENABLED) {
hfi_gdr_open();
+ release_gdr = 1;
+ }
#endif
err = __psm2_ep_open_internal(unique_job_key,
@@ -1144,6 +1148,10 @@ __psm2_ep_open(psm2_uuid_t const unique_job_key,
_HFI_VDBG("psm2_ep_open() OK....\n");
fail:
+#ifdef PSM_CUDA
+ if (err && release_gdr)
+ hfi_gdr_close();
+#endif
PSMI_UNLOCK(psmi_creation_lock);
PSM2_LOG_MSG("leaving");
return err;
@@ -1160,16 +1168,6 @@ psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in)
}
#endif
-#ifdef PSM_CUDA
- /*
- * The close on the gdr fd needs to be called before the
- * close on the hfi fd as the the gdr device will hold
- * reference count on the hfi device which will make the close
- * on the hfi fd return without actually closing the fd.
- */
- if (PSMI_IS_GDR_COPY_ENABLED)
- hfi_gdr_close();
-#endif
union psmi_envvar_val timeout_intval;
psm2_ep_t tmp;
psm2_mq_t mmq;
@@ -1350,6 +1348,11 @@ psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in)
PSMI_UNLOCK(psmi_creation_lock);
+#ifdef PSM_CUDA
+ if (PSMI_IS_GDR_COPY_ENABLED)
+ hfi_gdr_close();
+#endif
+
if (_HFI_PRDBG_ON) {
_HFI_PRDBG_ALWAYS("Closed endpoint in %.3f secs\n",
(double)cycles_to_nanosecs(get_cycles() -
@@ -1491,7 +1494,6 @@ fail:
/* Get a list of PTLs we want to use. The order is important, it affects
* whether node-local processes use shm or ips */
-static
psm2_error_t
psmi_parse_devices(int devices[PTL_MAX_INIT], const char *devstring)
{
@@ -1562,7 +1564,6 @@ fail:
}
-static
int psmi_device_is_enabled(const int devid_enabled[PTL_MAX_INIT], int devid)
{
int i;
diff --git a/psm_ep.h b/psm_ep.h
index b526fa0..e4e3708 100644
--- a/psm_ep.h
+++ b/psm_ep.h
@@ -236,4 +236,9 @@ struct psm2_epaddr {
PSMI_PROFILE_UNBLOCK(); \
} while (0)
+
+psm2_error_t psmi_parse_devices(int devices[PTL_MAX_INIT],
+ const char *devstr);
+int psmi_device_is_enabled(const int devices[PTL_MAX_INIT], int devid);
+
#endif /* _PSMI_EP_H */
diff --git a/psm_gdrcpy.h b/psm_gdrcpy.h
index 2773454..c10062d 100644
--- a/psm_gdrcpy.h
+++ b/psm_gdrcpy.h
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2018 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2018 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -60,18 +62,17 @@
#define GDR_FD get_gdr_fd()
-int get_gdr_fd();
+int get_gdr_fd(void);
-void hfi_gdr_open();
+void hfi_gdr_open(void);
-void hfi_gdr_close();
+void hfi_gdr_close(void);
void *
gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf,
size_t size, int flags,
struct ips_proto* proto);
-uint64_t
-gdr_cache_evict();
+uint64_t gdr_cache_evict(void);
#endif
#endif
diff --git a/psm_hal_gen1/opa_proto_gen1.c b/psm_hal_gen1/opa_proto_gen1.c
index eb8bce9..868f06e 100644
--- a/psm_hal_gen1/opa_proto_gen1.c
+++ b/psm_hal_gen1/opa_proto_gen1.c
@@ -310,7 +310,7 @@ struct _hfi_ctrl *hfi_userinit_internal(int fd, bool skip_affinity,
__hfi_pg_sz = sysconf(_SC_PAGESIZE);
if (!(spctrl = calloc(1, sizeof(struct _hfi_ctrl)))) {
- _HFI_INFO("can't allocate memory for hfi_ctrl: %s\n",
+ _HFI_INFO("Warning: can't allocate memory for hfi_ctrl: %s\n",
strerror(errno));
goto err_calloc_hfi_ctrl;
}
@@ -357,12 +357,12 @@ struct _hfi_ctrl *hfi_userinit_internal(int fd, bool skip_affinity,
#endif
if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) {
if (errno == ENODEV) {
- _HFI_INFO("PSM2 and driver version mismatch\n");
+ _HFI_INFO("Warning: PSM2 and driver version mismatch\n");
/* Overwrite errno. One would wish that the driver
* didn't return ENODEV for a version mismatch */
errno = EPROTONOSUPPORT;
} else {
- _HFI_INFO("assign_context command failed: %s\n",
+ _HFI_INFO("Warning: assign_context command failed: %s\n",
strerror(errno));
}
goto err_hfi_cmd_assign_ctxt;
@@ -387,36 +387,36 @@ struct _hfi_ctrl *hfi_userinit_internal(int fd, bool skip_affinity,
c.addr = (__u64) cinfo;
if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) {
- _HFI_INFO("CTXT_INFO command failed: %s\n", strerror(errno));
+ _HFI_ERROR("CTXT_INFO command failed: %s\n", strerror(errno));
goto err_hfi_cmd_ctxt_info;
}
/* sanity checking... */
if (cinfo->rcvtids%8) {
- _HFI_INFO("rcvtids not 8 multiple: %d\n", cinfo->rcvtids);
+ _HFI_ERROR("rcvtids not 8 multiple: %d\n", cinfo->rcvtids);
goto err_sanity_check;
}
if (cinfo->egrtids%8) {
- _HFI_INFO("egrtids not 8 multiple: %d\n", cinfo->egrtids);
+ _HFI_ERROR("egrtids not 8 multiple: %d\n", cinfo->egrtids);
goto err_sanity_check;
}
if (cinfo->rcvtids < cinfo->egrtids) {
- _HFI_INFO("rcvtids(%d) < egrtids(%d)\n",
+ _HFI_ERROR("rcvtids(%d) < egrtids(%d)\n",
cinfo->rcvtids, cinfo->egrtids);
goto err_sanity_check;
}
if (cinfo->rcvhdrq_cnt%32) {
- _HFI_INFO("rcvhdrq_cnt not 32 multiple: %d\n",
+ _HFI_ERROR("rcvhdrq_cnt not 32 multiple: %d\n",
cinfo->rcvhdrq_cnt);
goto err_sanity_check;
}
if (cinfo->rcvhdrq_entsize%64) {
- _HFI_INFO("rcvhdrq_entsize not 64 multiple: %d\n",
+ _HFI_ERROR("rcvhdrq_entsize not 64 multiple: %d\n",
cinfo->rcvhdrq_entsize);
goto err_sanity_check;
}
if (cinfo->rcvegr_size%__hfi_pg_sz) {
- _HFI_INFO("rcvegr_size not page multiple: %d\n",
+ _HFI_ERROR("rcvegr_size not page multiple: %d\n",
cinfo->rcvegr_size);
goto err_sanity_check;
}
@@ -443,7 +443,7 @@ struct _hfi_ctrl *hfi_userinit_internal(int fd, bool skip_affinity,
CPU_ZERO(&cpuset);
CPU_SET(cinfo->rec_cpu, &cpuset);
if (sched_setaffinity(0, sizeof(cpuset), &cpuset)) {
- _HFI_INFO("Couldn't set runon processor %u "
+ _HFI_INFO("Warning: Couldn't set runon processor %u "
"(unit:context %u:%u) (%u active chips): %s\n",
cinfo->rec_cpu, cinfo->unit, cinfo->ctxt,
cinfo->num_active, strerror(errno));
@@ -456,7 +456,7 @@ struct _hfi_ctrl *hfi_userinit_internal(int fd, bool skip_affinity,
c.addr = (__u64) binfo;
if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) {
- _HFI_INFO("BASE_INFO command failed: %s\n", strerror(errno));
+ _HFI_ERROR("BASE_INFO command failed: %s\n", strerror(errno));
goto err_hfi_cmd_user_info;
}
@@ -481,7 +481,7 @@ struct _hfi_ctrl *hfi_userinit_internal(int fd, bool skip_affinity,
* this is different from PSM API version.
*/
if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) != hfi_get_user_major_version()) {
- _HFI_INFO
+ _HFI_ERROR
("User major version 0x%x not same as driver major 0x%x\n",
hfi_get_user_major_version(), binfo->sw_version >> HFI1_SWMAJOR_SHIFT);
if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) < hfi_get_user_major_version())
diff --git a/psm_hal_gen1/psm_gdrcpy.c b/psm_hal_gen1/psm_gdrcpy.c
index 1896f9e..f5f2b77 100644
--- a/psm_hal_gen1/psm_gdrcpy.c
+++ b/psm_hal_gen1/psm_gdrcpy.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2018 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2018 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -61,9 +63,11 @@
#include "ptl_ips/ips_expected_proto.h"
#include "opa_user_gen1.h"
-static int gdr_fd;
+static int gdr_refcount;
+static int gdr_fd = -1;
-int get_gdr_fd(){
+int get_gdr_fd(void)
+{
return gdr_fd;
}
@@ -71,7 +75,8 @@ int get_gdr_fd(){
#define GPU_PAGE_MASK ~GPU_PAGE_OFFSET_MASK
uint64_t
-gdr_cache_evict() {
+gdr_cache_evict(void)
+{
int ret;
struct hfi1_gdr_cache_evict_params params;
params.evict_params_in.version = HFI1_GDR_VERSION;
@@ -90,8 +95,9 @@ gdr_cache_evict() {
}
-uint64_t
-ips_sdma_gpu_cache_evict(int fd) {
+static uint64_t
+ips_sdma_gpu_cache_evict(int fd)
+{
int ret;
struct hfi1_sdma_gpu_cache_evict_params params;
params.evict_params_in.version = HFI1_GDR_VERSION;
@@ -117,7 +123,7 @@ ips_sdma_gpu_cache_evict(int fd) {
* which we bail out. If successful we retry to PIN/MMAP once
* again
*/
-uint64_t
+static uint64_t
handle_out_of_bar_space(struct ips_proto *proto)
{
time_t lastEvictTime = 0;
@@ -158,27 +164,40 @@ gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf,
size_t size, int flags,
struct ips_proto* proto)
{
- struct hfi1_gdr_query_params query_params;
- void *host_addr_buf;
- int ret;
+ _HFI_VDBG("(gdrcopy) buf=%p size=%zu flags=0x%x proto=%p\n",
+ (void*)buf, size, flags, proto);
+ if (!size) {
+ // Attempting 0-length pin results in error from driver.
+ // Just return NULL. Caller has to figure out what to do in this
+ // case.
+ return NULL;
+ }
- query_params.query_params_in.version = HFI1_GDR_VERSION;
uintptr_t pageaddr = buf & GPU_PAGE_MASK;
- /* As size is guarenteed to be in the range of 0-8kB
- * there is a guarentee that buf+size-1 does not overflow
- * 64 bits.
- */
- uint32_t pagelen = (uint32_t) (PSMI_GPU_PAGESIZE +
- ((buf + size - 1) & GPU_PAGE_MASK) -
- pageaddr);
+ uintptr_t pageend = PSMI_GPU_PAGESIZE + ((buf + size - 1) & GPU_PAGE_MASK);
+
+ // Validate pointer arithmetic
+ if (pageend < pageaddr) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "pageend < pageaddr, wraparound; pageend=%p pageaddr=%p",
+ (void*)pageend, (void*)pageaddr);
+ } else if ((pageend - pageaddr) > UINT32_MAX) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "pageend - pageaddr > UINT32_MAX; pageend=%p pageaddr=%p difference=%zu",
+ (void*)pageend, (void*)pageaddr, (pageend - pageaddr));
+ }
- _HFI_VDBG("(gpudirect) buf=%p size=%zu pageaddr=%p pagelen=%u flags=0x%x proto=%p\n",
- (void *)buf, size, (void *)pageaddr, pagelen, flags, proto);
+ uint32_t pagelen = pageend - pageaddr;
+ _HFI_VDBG("(gdrcopy) pageaddr=%p pagelen=%u pageend=%p\n",
+ (void *)pageaddr, pagelen, (void*)pageend);
+ struct hfi1_gdr_query_params query_params;
+ query_params.query_params_in.version = HFI1_GDR_VERSION;
query_params.query_params_in.gpu_buf_addr = pageaddr;
query_params.query_params_in.gpu_buf_size = pagelen;
- retry:
+ int ret;
+ retry:
ret = ioctl(gdr_fd, HFI1_IOCTL_GDR_GPU_PIN_MMAP, &query_params);
if (ret) {
@@ -199,29 +218,40 @@ gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf,
return NULL;
}
}
- host_addr_buf = (void *)query_params.query_params_out.host_buf_addr;
+ void *host_addr_buf = (void *)query_params.query_params_out.host_buf_addr;
return host_addr_buf + (buf & GPU_PAGE_OFFSET_MASK);
}
-void hfi_gdr_open(){
- gdr_fd = open(GDR_DEVICE_PATH, O_RDWR);
- if (-1 == gdr_fd ) {
- /* Non-Fatal error. If device cannot be found we assume
- * that the driver does not support GDR Copy and we fallback
- * to sending all GPU messages using rndv protocol
- */
- _HFI_INFO(" Warning: The HFI1 driver installed does not support GPUDirect RDMA"
- " fast copy. Turning off GDR fast copy in PSM \n");
- is_gdr_copy_enabled = 0;
- return;
+void hfi_gdr_open(void)
+{
+ if (gdr_fd < 0) {
+ psmi_assert(!gdr_refcount);
+ gdr_fd = open(GDR_DEVICE_PATH, O_RDWR);
+ if (-1 == gdr_fd ) {
+ /* Non-Fatal error. If device cannot be found we assume
+ * that the driver does not support GDR Copy and we fallback
+ * to sending all GPU messages using rndv protocol
+ */
+ _HFI_INFO(" Warning: The HFI1 driver installed does not support GPUDirect RDMA"
+ " fast copy. Turning off GDR fast copy in PSM \n");
+ is_gdr_copy_enabled = 0;
+ return;
+ }
}
- return;
+ gdr_refcount++;
}
-void hfi_gdr_close()
+void hfi_gdr_close(void)
{
- close(GDR_FD);
+ if (gdr_fd > -1) {
+ psmi_assert(gdr_refcount);
+ gdr_refcount--;
+ if (!gdr_refcount) {
+ close(gdr_fd);
+ gdr_fd = -1;
+ }
+ }
}
#endif
diff --git a/psm_hal_gen1/psm_hal_gen1_spio.c b/psm_hal_gen1/psm_hal_gen1_spio.c
index eb9d5aa..5444897 100644
--- a/psm_hal_gen1/psm_hal_gen1_spio.c
+++ b/psm_hal_gen1/psm_hal_gen1_spio.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2017 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2017 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -181,10 +183,7 @@ ips_spio_init(const struct psmi_context *context, struct ptl *ptl,
#ifdef PSM_CUDA
- if (PSMI_IS_CUDA_ENABLED) {
- PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &ctrl->cuda_pio_buffer,
- MAX_CUDA_MTU, CU_MEMHOSTALLOC_PORTABLE);
- }
+ ctrl->cuda_pio_buffer = NULL;
#endif
_HFI_PRDBG("ips_spio_init() done\n");
@@ -195,7 +194,7 @@ ips_spio_init(const struct psmi_context *context, struct ptl *ptl,
static PSMI_HAL_INLINE psm2_error_t ips_spio_fini(struct ips_spio *ctrl)
{
#ifdef PSM_CUDA
- if (PSMI_IS_CUDA_ENABLED)
+ if (PSMI_IS_CUDA_ENABLED && ctrl->cuda_pio_buffer != NULL)
PSMI_CUDA_CALL(cuMemFreeHost, (void *) ctrl->cuda_pio_buffer);
#endif
spio_report_stall(ctrl, get_cycles(), 0ULL);
@@ -810,6 +809,10 @@ fi_busy:
/* Write to PIO: other blocks of payload */
#ifdef PSM_CUDA
if (is_cuda_payload) {
+ if (ctrl->cuda_pio_buffer == NULL) {
+ PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &ctrl->cuda_pio_buffer,
+ MAX_CUDA_MTU, CU_MEMHOSTALLOC_PORTABLE);
+ }
/* Since the implementation of cuMemcpy is unknown,
and the HFI specifies several conditions for how PIO
writes must occur, for safety reasons we should not assume
diff --git a/psm_mpool.c b/psm_mpool.c
index 1f2a365..e36e917 100644
--- a/psm_mpool.c
+++ b/psm_mpool.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -101,7 +103,6 @@ struct mpool {
#ifdef PSM_CUDA
alloc_dealloc_callback_fn_t mp_alloc_dealloc_cb;
- void *mp_alloc_dealloc_cb_context;
#endif
};
@@ -230,7 +231,7 @@ psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
uint32_t num_obj_max_total, int flags,
psmi_memtype_t statstype,
non_empty_callback_fn_t cb, void *context,
- alloc_dealloc_callback_fn_t ad_cb, void *ad_context)
+ alloc_dealloc_callback_fn_t ad_cb)
{
mpool_t mp;
@@ -242,7 +243,6 @@ psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
return NULL;
mp->mp_alloc_dealloc_cb = ad_cb;
- mp->mp_alloc_dealloc_cb_context = ad_context;
if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) {
psmi_mpool_destroy(mp);
@@ -418,7 +418,6 @@ void psmi_mpool_chunk_dealloc(mpool_t mp, int idx)
int j;
for (j = 0; j < mp->mp_num_obj_per_chunk; j++)
mp->mp_alloc_dealloc_cb(0 /* is not alloc */,
- mp->mp_alloc_dealloc_cb_context,
((void *) mp->mp_elm_vector[idx]) +
j * mp->mp_elm_size +
sizeof(struct mpool_element));
@@ -509,7 +508,6 @@ static int psmi_mpool_allocate_chunk(mpool_t mp)
#ifdef PSM_CUDA
if (mp->mp_alloc_dealloc_cb)
mp->mp_alloc_dealloc_cb(1 /* is alloc */,
- mp->mp_alloc_dealloc_cb_context,
chunk + i * mp->mp_elm_size +
sizeof(struct mpool_element));
#endif
diff --git a/psm_mpool.h b/psm_mpool.h
index 8098f60..97f95ab 100644
--- a/psm_mpool.h
+++ b/psm_mpool.h
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -70,8 +72,7 @@
typedef struct mpool *mpool_t;
typedef void (*non_empty_callback_fn_t) (void *context);
-typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *context,
- void *chunk);
+typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *chunk);
mpool_t
MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk,
@@ -84,8 +85,7 @@ mpool_t psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
uint32_t num_obj_max_total, int flags,
psmi_memtype_t statstype,
non_empty_callback_fn_t cb, void *context,
- alloc_dealloc_callback_fn_t ad_cb,
- void *ad_context);
+ alloc_dealloc_callback_fn_t ad_cb);
void psmi_mpool_destroy(mpool_t mp);
diff --git a/psm_mq.c b/psm_mq.c
index a25a581..9f25461 100644
--- a/psm_mq.c
+++ b/psm_mq.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -767,6 +769,7 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len)
PSM2_LOG_MSG("entering");
psmi_assert(MQE_TYPE_IS_RECV(req->type));
psmi_mtucpy_fn_t psmi_mtucpy_fn = psmi_mq_mtucpy;
+
#ifdef PSM_CUDA
if (!req->is_buf_gpu_mem)
psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
@@ -785,6 +788,9 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len)
len, 1,
mq->ep->epaddr->proto);
psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
+ } else if ((req->flags_user & PSM2_MQ_FLAG_GDRCPY_ONLY) && req->is_buf_gpu_mem) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "CUDA memcpy not permitted for this operation.");
}
#endif
psmi_mtucpy_fn(ubuf, (const void *)req->req_data.buf, copysz);
@@ -808,6 +814,9 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len)
req->req_data.send_msglen, 1,
mq->ep->epaddr->proto);
psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
+ } else if ((req->flags_user & PSM2_MQ_FLAG_GDRCPY_ONLY) && req->is_buf_gpu_mem) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "CUDA memcpy not permitted for this operation.");
}
#endif
@@ -912,6 +921,7 @@ __psm2_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *
recv_req->req_data.recv_msglen = len;
recv_req->recv_msgoff = 0;
recv_req->req_data.context = context;
+ recv_req->flags_user = flags;
#ifdef PSM_CUDA
recv_req->is_buf_gpu_mem = gpu_mem;
@@ -935,6 +945,7 @@ __psm2_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *
#endif
recv_req->req_data.context = context;
+ recv_req->flags_user = flags;
psm2_mq_irecv_inner(mq, recv_req, buf, len);
}
@@ -995,6 +1006,7 @@ __psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src,
req->req_data.recv_msglen = len;
req->recv_msgoff = 0;
req->req_data.context = context;
+ req->flags_user = flags;
#ifdef PSM_CUDA
req->is_buf_gpu_mem = gpu_mem;
@@ -1023,6 +1035,7 @@ __psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src,
#endif
req->req_data.context = context;
+ req->flags_user |= flags;
psm2_mq_irecv_inner(mq, req, buf, len);
}
diff --git a/psm_mq_internal.h b/psm_mq_internal.h
index a1afaf8..0a12058 100644
--- a/psm_mq_internal.h
+++ b/psm_mq_internal.h
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -249,7 +251,6 @@ struct psm2_mq_req {
uint32_t prefetch_send_msgoff;
int cuda_hostbuf_used;
CUipcMemHandle cuda_ipc_handle;
- CUevent cuda_ipc_event;
uint8_t cuda_ipc_handle_attached;
uint32_t cuda_ipc_offset;
/*
@@ -555,7 +556,14 @@ int psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
uint32_t msglen, uint32_t offset,
const void *payload, uint32_t paylen, int msgorder,
uint32_t opcode, psm2_mq_req_t *req_o);
+
+#ifdef PSM_CUDA
+// GDRCopy code requires ips_proto*, so CUDA-aware implementation must accept
+// ips_proto*
+int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t req, struct ips_proto *proto);
+#else
int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t req);
+#endif
void psmi_mq_stats_register(psm2_mq_t mq, mpspawn_stats_add_fn add_fn);
diff --git a/psm_mq_recv.c b/psm_mq_recv.c
index 642fbc1..e35cbe3 100644
--- a/psm_mq_recv.c
+++ b/psm_mq_recv.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2015 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2015 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -366,6 +368,9 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
(unsigned long)req->req_data.buf,
msglen, 1, src->proto);
+ } else if ((req->flags_user & PSM2_MQ_FLAG_GDRCPY_ONLY) && req->is_buf_gpu_mem) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "CUDA memcpy not permitted for this operation.");
}
#endif
mq_copy_tiny((uint32_t *) user_buffer, (uint32_t *) payload, msglen);
@@ -383,6 +388,9 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
(unsigned long)req->req_data.buf,
msglen, 1, src->proto);
psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
+ } else if ((req->flags_user & PSM2_MQ_FLAG_GDRCPY_ONLY) && req->is_buf_gpu_mem) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "CUDA memcpy not permitted for this operation.");
}
#endif
if (msglen <= paylen) {
@@ -414,6 +422,9 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
req->req_data.buf = gdr_convert_gpu_to_host_addr(GDR_FD,
(unsigned long)req->user_gpu_buffer,
req->req_data.send_msglen, 1, src->proto);
+ } else if ((req->flags_user & PSM2_MQ_FLAG_GDRCPY_ONLY) && req->is_buf_gpu_mem) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "CUDA memcpy not permitted for this operation.");
}
#endif
if (paylen > 0)
@@ -540,7 +551,11 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
return MQ_RET_UNEXP_OK;
}
-int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq)
+int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq
+#ifdef PSM_CUDA
+ , struct ips_proto *proto
+#endif
+)
{
psm2_mq_req_t ereq;
uint32_t msglen;
@@ -555,12 +570,18 @@ int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq)
ereq->req_data.peer = ureq->req_data.peer;
ereq->req_data.tag = ureq->req_data.tag;
msglen = mq_set_msglen(ereq, ereq->req_data.buf_len, ureq->req_data.send_msglen);
+ uint8_t *dest = ereq->req_data.buf;
+
+#ifdef PSM_CUDA
+ if (proto && PSMI_USE_GDR_COPY(ereq, msglen)) {
+ dest = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)dest, msglen, 0, proto);
+ }
+#endif
switch (ureq->state) {
case MQ_STATE_COMPLETE:
if (ureq->req_data.buf != NULL) { /* 0-byte don't alloc a sysreq_data.buf */
- psmi_mq_mtucpy(ereq->req_data.buf, (const void *)ureq->req_data.buf,
- msglen);
+ psmi_mq_mtucpy(dest, (const void *)ureq->req_data.buf, msglen);
psmi_mq_sysbuf_free(mq, ureq->req_data.buf);
}
ereq->state = MQ_STATE_COMPLETE;
@@ -574,7 +595,7 @@ int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq)
ereq->send_msgoff = ureq->send_msgoff;
ereq->recv_msgoff = min(ureq->recv_msgoff, msglen);
if (ereq->recv_msgoff) {
- psmi_mq_mtucpy(ereq->req_data.buf,
+ psmi_mq_mtucpy(dest,
(const void *)ureq->req_data.buf,
ereq->recv_msgoff);
}
@@ -590,7 +611,7 @@ int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq)
ereq->send_msgoff = ureq->send_msgoff;
ereq->recv_msgoff = min(ureq->recv_msgoff, msglen);
if (ereq->recv_msgoff) {
- psmi_mq_mtucpy(ereq->req_data.buf,
+ psmi_mq_mtucpy(dest,
(const void *)ureq->req_data.buf,
ereq->recv_msgoff);
}
diff --git a/psm_mq_utils.c b/psm_mq_utils.c
index a0409db..3032776 100644
--- a/psm_mq_utils.c
+++ b/psm_mq_utils.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2015 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2015 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -93,6 +95,7 @@ psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type)
#ifdef PSM_CUDA
req->is_buf_gpu_mem = 0;
req->user_gpu_buffer = NULL;
+ req->cuda_ipc_handle_attached = 0;
#endif
req->flags_user = 0;
req->flags_internal = 0;
@@ -114,19 +117,6 @@ psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type)
}
MOCK_DEF_EPILOGUE(psmi_mq_req_alloc);
-#ifdef PSM_CUDA
-void psmi_cuda_recvreq_alloc_func(int is_alloc, void* context, void* obj) {
- psm2_mq_req_t recvreq = (psm2_mq_req_t)obj;
- if (PSMI_IS_CUDA_ENABLED) {
- if (is_alloc)
- PSMI_CUDA_CALL(cuEventCreate, &recvreq->cuda_ipc_event, CU_EVENT_DEFAULT);
- else
- PSMI_CUDA_CALL(cuEventDestroy, recvreq->cuda_ipc_event);
- }
- return;
-}
-#endif
-
psm2_error_t psmi_mq_req_init(psm2_mq_t mq)
{
psm2_mq_req_t warmup_req;
@@ -165,29 +155,6 @@ psm2_error_t psmi_mq_req_init(psm2_mq_t mq)
if ((err =
psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz)))
goto fail;
- /* Have a callback function for receive req mpool which creates
- * and destroy events.
- */
-#ifdef PSM_CUDA
- if (PSMI_IS_CUDA_ENABLED) {
- if ((mq->rreq_pool =
- psmi_mpool_create_for_cuda(sizeof(struct psm2_mq_req), chunksz,
- maxsz, 0, DESCRIPTORS, NULL,
- NULL, psmi_cuda_recvreq_alloc_func, NULL)) == NULL) {
- err = PSM2_NO_MEMORY;
- goto fail;
- }
- }
- else {
- if ((mq->rreq_pool =
- psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
- maxsz, 0, DESCRIPTORS, NULL,
- NULL)) == NULL) {
- err = PSM2_NO_MEMORY;
- goto fail;
- }
- }
-#else
if ((mq->rreq_pool =
psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
maxsz, 0, DESCRIPTORS, NULL,
@@ -195,7 +162,6 @@ psm2_error_t psmi_mq_req_init(psm2_mq_t mq)
err = PSM2_NO_MEMORY;
goto fail;
}
-#endif
}
/* Warm up the allocators */
diff --git a/psm_user.h b/psm_user.h
index 09477c5..523b9fb 100644
--- a/psm_user.h
+++ b/psm_user.h
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -296,6 +298,7 @@ void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak));
#endif
#ifdef PSM_CUDA
+
#include <cuda.h>
#include <driver_types.h>
@@ -305,12 +308,12 @@ void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak));
extern int is_cuda_enabled;
extern int is_gdr_copy_enabled;
-extern int device_support_gpudirect;
-extern int gpu_p2p_supported;
+extern int _device_support_gpudirect;
+extern int _gpu_p2p_supported;
extern int my_gpu_device;
extern int cuda_lib_version;
-extern CUcontext ctxt;
+extern CUcontext cu_ctxt;
extern void *psmi_cuda_lib;
extern CUresult (*psmi_cuInit)(unsigned int Flags );
@@ -326,6 +329,7 @@ extern CUresult (*psmi_cuDriverGetVersion)(int* driverVersion);
extern CUresult (*psmi_cuDeviceGetCount)(int* count);
extern CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags);
extern CUresult (*psmi_cuStreamDestroy)(CUstream phStream);
+extern CUresult (*psmi_cuStreamSynchronize)(CUstream phStream);
extern CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags);
extern CUresult (*psmi_cuEventDestroy)(CUevent hEvent);
extern CUresult (*psmi_cuEventQuery)(CUevent hEvent);
@@ -348,14 +352,35 @@ extern CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev);
extern CUresult (*psmi_cuCtxGetDevice)(CUdevice* device);
extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device);
+static int check_set_cuda_ctxt(void)
+{
+ CUresult err;
+ CUcontext tmpctxt = {0};
+
+ if (!psmi_cuCtxGetCurrent || !psmi_cuCtxSetCurrent)
+ return 0;
+
+ err = psmi_cuCtxGetCurrent(&tmpctxt);
+ if (!err) {
+ if (!tmpctxt && cu_ctxt) {
+ err = psmi_cuCtxSetCurrent(cu_ctxt);
+ return !!err;
+ } else if (tmpctxt && !cu_ctxt) {
+ cu_ctxt = tmpctxt;
+ }
+ }
+ return 0;
+}
+
#define PSMI_CUDA_CALL(func, args...) do { \
+ _HFI_CUDADBG("func=psmi_"#func"\n"); \
CUresult cudaerr; \
+ if (check_set_cuda_ctxt()) { \
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
+ "Failed to set/synchronize CUDA context.\n"); \
+ } \
cudaerr = psmi_##func(args); \
if (cudaerr != CUDA_SUCCESS) { \
- if (ctxt == NULL) \
- _HFI_ERROR( \
- "Check if CUDA is initialized" \
- "before psm2_ep_open call \n"); \
_HFI_ERROR( \
"CUDA failure: %s() (at %s:%d)" \
"returned %d\n", \
@@ -366,6 +391,92 @@ extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device);
} \
} while (0)
+PSMI_ALWAYS_INLINE(
+int device_support_gpudirect())
+{
+ if (_device_support_gpudirect > -1) return _device_support_gpudirect;
+
+ int num_devices, dev;
+
+ /* Check if all devices support Unified Virtual Addressing. */
+ PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices);
+
+ _device_support_gpudirect = 1;
+
+ for (dev = 0; dev < num_devices; dev++) {
+ CUdevice device;
+ PSMI_CUDA_CALL(cuDeviceGet, &device, dev);
+ int unifiedAddressing;
+ PSMI_CUDA_CALL(cuDeviceGetAttribute,
+ &unifiedAddressing,
+ CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING,
+ device);
+
+ if (unifiedAddressing !=1) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_EP_DEVICE_FAILURE,
+ "CUDA device %d does not support Unified Virtual Addressing.\n",
+ dev);
+ }
+
+ int major;
+ PSMI_CUDA_CALL(cuDeviceGetAttribute,
+ &major,
+ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+ device);
+ if (major < 3) {
+ _device_support_gpudirect = 0;
+ _HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev);
+ }
+ }
+
+ return _device_support_gpudirect;
+}
+
+#define PSMI_IS_CUDA_ENABLED likely(is_cuda_enabled)
+#define PSMI_IS_CUDA_DISABLED unlikely(!is_cuda_enabled)
+
+PSMI_ALWAYS_INLINE(
+int gpu_p2p_supported())
+{
+ if (likely(_gpu_p2p_supported > -1)) return _gpu_p2p_supported;
+
+ if (PSMI_IS_CUDA_DISABLED) {
+ _gpu_p2p_supported=0;
+ return 0;
+ }
+
+ int num_devices, dev;
+
+ /* Check which devices the current device has p2p access to. */
+ CUdevice current_device;
+ PSMI_CUDA_CALL(cuCtxGetDevice, ¤t_device);
+ PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices);
+
+ _gpu_p2p_supported = 0;
+
+ for (dev = 0; dev < num_devices; dev++) {
+ CUdevice device;
+ PSMI_CUDA_CALL(cuDeviceGet, &device, dev);
+
+ if (device != current_device) {
+ int canAccessPeer = 0;
+ PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer,
+ current_device, device);
+
+ if (canAccessPeer != 1)
+ _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev);
+ else
+ _gpu_p2p_supported |= (1 << device);
+ } else {
+ /* Always support p2p on the same GPU */
+ my_gpu_device = device;
+ _gpu_p2p_supported |= (1 << device);
+ }
+ }
+
+ return _gpu_p2p_supported;
+}
+
/**
* Similar to PSMI_CUDA_CALL() except does not error out
* if func(args) returns CUDA_SUCCESS or except_err
@@ -378,9 +489,14 @@ extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device);
* DBG level.
*/
#define PSMI_CUDA_CALL_EXCEPT(except_err, func, args...) do { \
+ _HFI_CUDADBG("func=psmi_"#func",except_err=%d\n", except_err); \
+ if (check_set_cuda_ctxt()) { \
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
+ "Failed to set/synchronize CUDA context.\n"); \
+ } \
cudaerr = psmi_##func(args); \
if (cudaerr != CUDA_SUCCESS && cudaerr != except_err) { \
- if (ctxt == NULL) \
+ if (cu_ctxt == NULL) \
_HFI_ERROR( \
"Check if CUDA is initialized" \
"before psm2_ep_open call \n"); \
@@ -400,6 +516,7 @@ extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device);
} while (0)
#define PSMI_CUDA_CHECK_EVENT(event, cudaerr) do { \
+ _HFI_CUDADBG("event=%p\n", (void*)(event)); \
cudaerr = psmi_cuEventQuery(event); \
if ((cudaerr != CUDA_SUCCESS) && \
(cudaerr != CUDA_ERROR_NOT_READY)) { \
@@ -442,9 +559,6 @@ _psmi_is_cuda_mem(const void *ptr))
return 0;
}
-#define PSMI_IS_CUDA_ENABLED likely(is_cuda_enabled)
-#define PSMI_IS_CUDA_DISABLED unlikely(!is_cuda_enabled)
-
PSMI_ALWAYS_INLINE(
int
_psmi_is_gdr_copy_enabled())
@@ -473,7 +587,7 @@ struct ips_cuda_hostbuf {
struct ips_cuda_hostbuf_mpool_cb_context {
unsigned bufsz;
};
-void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj);
+void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *obj);
#define CUDA_HOSTBUFFER_LIMITS { \
.env = "PSM_CUDA_BOUNCEBUFFERS_MAX", \
@@ -500,9 +614,10 @@ extern uint32_t gdr_copy_threshold_send;
*/
extern uint32_t gdr_copy_threshold_recv;
-#define PSMI_USE_GDR_COPY(req, len) req->is_buf_gpu_mem && \
- PSMI_IS_GDR_COPY_ENABLED && \
- len >=1 && len <= gdr_copy_threshold_recv
+#define PSMI_USE_GDR_COPY(req, len) \
+ req->is_buf_gpu_mem && \
+ PSMI_IS_GDR_COPY_ENABLED && \
+ (req->flags_user & PSM2_MQ_FLAG_GDRCPY_ONLY || (len >=1 && len <= gdr_copy_threshold_recv))
enum psm2_chb_match_type {
/* Complete data found in a single chb */
diff --git a/ptl_am/am_cuda_memhandle_cache.c b/ptl_am/am_cuda_memhandle_cache.c
index 730562d..1f34ac2 100644
--- a/ptl_am/am_cuda_memhandle_cache.c
+++ b/ptl_am/am_cuda_memhandle_cache.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -168,7 +170,7 @@ static void print_cuda_memhandle_cache_stats(void)
* which helps in closing all memhandles.
*/
static void
-psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj)
+psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* obj)
{
cl_map_item_t* memcache_item = (cl_map_item_t*)obj;
if (!is_alloc) {
@@ -196,8 +198,7 @@ am_cuda_memhandle_mpool_init(uint32_t memcache_size)
cuda_memhandle_cache_size,
cuda_memhandle_cache_size, 0,
UNDEFINED, NULL, NULL,
- psmi_cuda_memhandle_cache_alloc_func,
- NULL);
+ psmi_cuda_memhandle_cache_alloc_func);
if (cuda_memhandle_mpool == NULL) {
err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY,
"Couldn't allocate CUDA host receive buffer pool");
diff --git a/ptl_am/am_reqrep_shmem.c b/ptl_am/am_reqrep_shmem.c
index 9be72f9..384fb50 100644
--- a/ptl_am/am_reqrep_shmem.c
+++ b/ptl_am/am_reqrep_shmem.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -2099,11 +2101,12 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr,
#ifdef PSM_CUDA
int gpu_mem = 0;
- int ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & gpu_p2p_supported;
+ int ep_supports_p2p;
if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)) {
gpu_mem = 1;
+ ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & gpu_p2p_supported();
/* All sends from a gpu buffer use the rendezvous protocol if p2p is supported */
if (ep_supports_p2p) {
goto do_rendezvous;
diff --git a/ptl_am/ptl.c b/ptl_am/ptl.c
index 2e42c1b..d43fde6 100644
--- a/ptl_am/ptl.c
+++ b/ptl_am/ptl.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -96,8 +98,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted,
if (req->is_buf_gpu_mem) {
PSMI_CUDA_CALL(cuMemcpyDtoD, (CUdeviceptr)req->req_data.buf, cuda_ipc_dev_ptr,
req->req_data.recv_msglen);
- PSMI_CUDA_CALL(cuEventRecord, req->cuda_ipc_event, 0);
- PSMI_CUDA_CALL(cuEventSynchronize, req->cuda_ipc_event);
+ PSMI_CUDA_CALL(cuStreamSynchronize, 0);
} else
PSMI_CUDA_CALL(cuMemcpyDtoH, req->req_data.buf, cuda_ipc_dev_ptr,
req->req_data.recv_msglen);
@@ -129,8 +130,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted,
* copies for msg sizes less than 64k. The event record
* and synchronize calls are to guarentee completion.
*/
- PSMI_CUDA_CALL(cuEventRecord, req->cuda_ipc_event, 0);
- PSMI_CUDA_CALL(cuEventSynchronize, req->cuda_ipc_event);
+ PSMI_CUDA_CALL(cuStreamSynchronize, 0);
psmi_free(cuda_ipc_bounce_buf);
} else {
/* cma can be done in handler context or not. */
@@ -206,7 +206,7 @@ psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
tag.tag[1] = args[1].u32w0;
tag.tag[2] = args[2].u32w1;
psmi_assert(toki != NULL);
- _HFI_VDBG("mq=%p opcode=%d, len=%d, msglen=%d\n",
+ _HFI_VDBG("mq=%p opcode=0x%x, len=%d, msglen=%d\n",
tok->mq, opcode, (int)len, msglen);
switch (opcode) {
diff --git a/ptl_ips/ips_proto.c b/ptl_ips/ips_proto.c
index dfd03e6..d1489e6 100644
--- a/ptl_ips/ips_proto.c
+++ b/ptl_ips/ips_proto.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -95,22 +97,17 @@ static psm2_error_t proto_sdma_init(struct ips_proto *proto,
const psmi_context_t *context);
#ifdef PSM_CUDA
-void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj)
+void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *obj)
{
- struct ips_cuda_hostbuf *icb;
- struct ips_cuda_hostbuf_mpool_cb_context *ctxt =
- (struct ips_cuda_hostbuf_mpool_cb_context *) context;
-
- icb = (struct ips_cuda_hostbuf *)obj;
+ struct ips_cuda_hostbuf *icb = (struct ips_cuda_hostbuf *)obj;
if (is_alloc) {
- PSMI_CUDA_CALL(cuMemHostAlloc,
- (void **) &icb->host_buf,
- ctxt->bufsz,
- CU_MEMHOSTALLOC_PORTABLE);
- PSMI_CUDA_CALL(cuEventCreate, &icb->copy_status, CU_EVENT_DEFAULT);
+ icb->host_buf = NULL;
+ icb->copy_status = NULL;
} else {
- if (icb->host_buf) {
+ if (icb->host_buf != NULL) {
PSMI_CUDA_CALL(cuMemFreeHost, icb->host_buf);
+ }
+ if (icb->copy_status != NULL) {
PSMI_CUDA_CALL(cuEventDestroy, icb->copy_status);
}
}
@@ -520,10 +517,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) {
#ifdef PSM_CUDA
- if (PSMI_IS_CUDA_ENABLED) {
- PSMI_CUDA_CALL(cuStreamCreate,
- &proto->cudastream_send, CU_STREAM_NON_BLOCKING);
- }
+ proto->cudastream_send = NULL;
#endif
proto->scbc_rv = NULL;
if ((err = ips_protoexp_init(context, proto, protoexp_flags,
@@ -635,14 +629,34 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
(union psmi_envvar_val)0, /* Disabled by default */
&env_gpudirect_rdma);
+ /* Use GPUDirect RDMA for SDMA send? */
+ union psmi_envvar_val env_gpudirect_rdma_send;
+ psmi_getenv("PSM2_GPUDIRECT_RDMA_SEND",
+ "Use GPUDirect RDMA support to allow the HFI to directly"
+ " read from the GPU for SDMA. Requires driver"
+ " support.(default is disabled i.e. 0)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)0, /* Disabled by default */
+ &env_gpudirect_rdma_send);
+
+ /* Use GPUDirect RDMA for recv? */
+ union psmi_envvar_val env_gpudirect_rdma_recv;
+ psmi_getenv("PSM2_GPUDIRECT_RDMA_RECV",
+ "Use GPUDirect RDMA support to allow the HFI to directly"
+ " write into GPU. Requires driver support.(default is"
+ " disabled i.e. 0)",
+ PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+ (union psmi_envvar_val)0, /* Disabled by default */
+ &env_gpudirect_rdma_recv);
+
/* The following cases need to be handled:
* 1) GPU DIRECT is turned off but GDR COPY is turned on by the user or
* by default - Turn off GDR COPY
* 2) GPU DIRECT is on but GDR COPY is turned off by the user - Leave
*. this config as it is.
*/
- if (!env_gpudirect_rdma.e_uint)
- is_gdr_copy_enabled = 0;
+ if (!env_gpudirect_rdma.e_uint && !env_gpudirect_rdma_send.e_uint && !env_gpudirect_rdma_recv.e_uint)
+ is_gdr_copy_enabled = 0;
/* Default Send threshold for Gpu-direct set to 30000 */
union psmi_envvar_val env_gpudirect_send_thresh;
@@ -659,7 +673,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
(union psmi_envvar_val)UINT_MAX, &env_gpudirect_recv_thresh);
gpudirect_recv_threshold = env_gpudirect_recv_thresh.e_uint;
- if (env_gpudirect_rdma.e_uint && device_support_gpudirect) {
+ if (env_gpudirect_rdma.e_uint && device_support_gpudirect()) {
if (PSMI_IS_CUDA_DISABLED ||
/* All pio, No SDMA*/
(proto->flags & IPS_PROTO_FLAG_SPIO) ||
@@ -675,16 +689,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
* experimentation and will not be documented for any customers.
*/
/* Use GPUDirect RDMA for SDMA send? */
- union psmi_envvar_val env_gpudirect_rdma_send;
- psmi_getenv("PSM2_GPUDIRECT_RDMA_SEND",
- "Use GPUDirect RDMA support to allow the HFI to directly"
- " read from the GPU for SDMA. Requires driver"
- " support.(default is disabled i.e. 0)",
- PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
- (union psmi_envvar_val)0, /* Disabled by default */
- &env_gpudirect_rdma_send);
-
- if (env_gpudirect_rdma_send.e_uint && device_support_gpudirect) {
+ if (env_gpudirect_rdma_send.e_uint && device_support_gpudirect()) {
if (PSMI_IS_CUDA_DISABLED ||
/* All pio, No SDMA*/
(proto->flags & IPS_PROTO_FLAG_SPIO))
@@ -695,16 +700,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND;
}
/* Use GPUDirect RDMA for recv? */
- union psmi_envvar_val env_gpudirect_rdma_recv;
- psmi_getenv("PSM2_GPUDIRECT_RDMA_RECV",
- "Use GPUDirect RDMA support to allow the HFI to directly"
- " write into GPU. Requires driver support.(default is"
- " disabled i.e. 0)",
- PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
- (union psmi_envvar_val)0, /* Disabled by default */
- &env_gpudirect_rdma_recv);
-
- if (env_gpudirect_rdma_recv.e_uint && device_support_gpudirect) {
+ if (env_gpudirect_rdma_recv.e_uint && device_support_gpudirect()) {
if (PSMI_IS_CUDA_DISABLED ||
!(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED))
err = psmi_handle_error(PSMI_EP_NORETURN,
@@ -734,9 +730,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
chunksz, max_elements, 0,
UNDEFINED, NULL, NULL,
- psmi_cuda_hostbuf_alloc_func,
- (void *)
- &proto->cuda_hostbuf_send_cfg);
+ psmi_cuda_hostbuf_alloc_func);
if (proto->cuda_hostbuf_pool_send == NULL) {
err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
@@ -750,9 +744,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
chunksz, max_elements, 0,
UNDEFINED, NULL, NULL,
- psmi_cuda_hostbuf_alloc_func,
- (void *)
- &proto->cuda_hostbuf_small_send_cfg);
+ psmi_cuda_hostbuf_alloc_func);
if (proto->cuda_hostbuf_pool_small_send == NULL) {
err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
@@ -928,7 +920,7 @@ ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in)
#endif
#ifdef PSM_CUDA
- if (PSMI_IS_CUDA_ENABLED) {
+ if (PSMI_IS_CUDA_ENABLED && proto->cudastream_send) {
PSMI_CUDA_CALL(cuStreamDestroy, proto->cudastream_send);
}
#endif
diff --git a/ptl_ips/ips_proto.h b/ptl_ips/ips_proto.h
index dc8e7d4..221edd6 100644
--- a/ptl_ips/ips_proto.h
+++ b/ptl_ips/ips_proto.h
@@ -664,7 +664,6 @@ int ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev);
int ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev);
int ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev);
int ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev);
-void ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl);
int ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev);
psm2_error_t ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr,
diff --git a/ptl_ips/ips_proto_connect.c b/ptl_ips/ips_proto_connect.c
index a608760..f650a36 100644
--- a/ptl_ips/ips_proto_connect.c
+++ b/ptl_ips/ips_proto_connect.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2015 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2015 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -905,7 +907,7 @@ ptl_handle_connect_req(struct ips_proto *proto, psm2_epaddr_t epaddr,
if (!proto->done_warning) {
psmi_syslog(proto->ep, 1, LOG_INFO,
"Non-fatal connection problem: Received an out-of-context "
- "connection message from host %s LID=0x%x context=%d. (Ignoring)",
+ "connection message from host %s LID=0x%x context=%"PRId64". (Ignoring)",
req->hostname,
(int)psm2_epid_nid(req->epid),
psm2_epid_context(req->epid));
diff --git a/ptl_ips/ips_proto_expected.c b/ptl_ips/ips_proto_expected.c
index 7e7e997..b25b2a3 100644
--- a/ptl_ips/ips_proto_expected.c
+++ b/ptl_ips/ips_proto_expected.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -370,9 +372,7 @@ MOCKABLE(ips_protoexp_init)(const psmi_context_t *context,
psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
chunksz, max_elements, 0,
UNDEFINED, NULL, NULL,
- psmi_cuda_hostbuf_alloc_func,
- (void *)
- &protoexp->cuda_hostbuf_recv_cfg);
+ psmi_cuda_hostbuf_alloc_func);
if (protoexp->cuda_hostbuf_pool_recv == NULL) {
err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
@@ -386,9 +386,7 @@ MOCKABLE(ips_protoexp_init)(const psmi_context_t *context,
psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
chunksz, max_elements, 0,
UNDEFINED, NULL, NULL,
- psmi_cuda_hostbuf_alloc_func,
- (void *)
- &protoexp->cuda_hostbuf_small_recv_cfg);
+ psmi_cuda_hostbuf_alloc_func);
if (protoexp->cuda_hostbuf_pool_small_recv == NULL) {
err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
@@ -396,9 +394,7 @@ MOCKABLE(ips_protoexp_init)(const psmi_context_t *context,
goto fail;
}
- PSMI_CUDA_CALL(cuStreamCreate,
- &protoexp->cudastream_recv,
- CU_STREAM_NON_BLOCKING);
+ protoexp->cudastream_recv = NULL;
STAILQ_INIT(&protoexp->cudapend_getreqsq);
} else {
protoexp->cuda_hostbuf_pool_recv = NULL;
@@ -437,7 +433,9 @@ psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp)
!(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) {
psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv);
psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv);
- PSMI_CUDA_CALL(cuStreamDestroy, protoexp->cudastream_recv);
+ if (protoexp->cudastream_recv != NULL) {
+ PSMI_CUDA_CALL(cuStreamDestroy, protoexp->cudastream_recv);
+ }
}
#endif
psmi_mpool_destroy(protoexp->tid_getreq_pool);
@@ -922,11 +920,15 @@ int ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev)
* assert(0 < paylen < MTU).
*/
if (psmi_hal_has_cap(PSM_HAL_CAP_RSM_FECN_SUPP) &&
- ips_recvhdrq_event_payload(rcv_ev) &&
- ips_recvhdrq_event_paylen(rcv_ev))
- psmi_mq_mtucpy(tidrecvc->buffer + p_hdr->exp_offset,
- ips_recvhdrq_event_payload(rcv_ev),
- ips_recvhdrq_event_paylen(rcv_ev));
+ ips_recvhdrq_event_payload(rcv_ev) &&
+ ips_recvhdrq_event_paylen(rcv_ev)) {
+
+ psmi_assert(p_hdr->exp_offset + ips_recvhdrq_event_paylen(rcv_ev) <= tidrecvc->recv_tidbytes);
+
+ psmi_mq_mtucpy(tidrecvc->buffer + tidrecvc->tid_list.tsess_unaligned_start + p_hdr->exp_offset,
+ ips_recvhdrq_event_payload(rcv_ev),
+ ips_recvhdrq_event_paylen(rcv_ev));
+ }
/* If last packet then we are done. We send a tid transfer completion
* packet back to sender, free all tids and close the current tidflow
@@ -1094,12 +1096,17 @@ void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp,
window_len =
ips_cuda_next_window(tidsendc->ipsaddr->window_rv,
offset, req->req_data.buf_len);
- if (window_len <= CUDA_SMALLHOSTBUF_SZ)
+ unsigned bufsz;
+ if (window_len <= CUDA_SMALLHOSTBUF_SZ) {
chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
proto->cuda_hostbuf_pool_small_send);
- if (chb == NULL)
+ bufsz = proto->cuda_hostbuf_small_send_cfg.bufsz;
+ }
+ if (chb == NULL) {
chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
proto->cuda_hostbuf_pool_send);
+ bufsz = proto->cuda_hostbuf_send_cfg.bufsz;
+ }
/* were any buffers available for the prefetcher? */
if (chb == NULL)
return;
@@ -1109,6 +1116,20 @@ void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp,
chb->req = req;
chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset;
chb->bytes_read = 0;
+
+ if (proto->cudastream_send == NULL) {
+ PSMI_CUDA_CALL(cuStreamCreate,
+ &proto->cudastream_send, CU_STREAM_NON_BLOCKING);
+ }
+ if (chb->host_buf == NULL) {
+ PSMI_CUDA_CALL(cuMemHostAlloc,
+ (void **) &chb->host_buf,
+ bufsz,
+ CU_MEMHOSTALLOC_PORTABLE);
+ }
+ if (chb->copy_status == NULL) {
+ PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT);
+ }
PSMI_CUDA_CALL(cuMemcpyDtoHAsync,
chb->host_buf, chb->gpu_buf,
window_len,
@@ -1143,12 +1164,17 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp,
window_len =
ips_cuda_next_window(tidsendc->ipsaddr->window_rv,
offset, req->req_data.buf_len);
- if (window_len <= CUDA_SMALLHOSTBUF_SZ)
+ unsigned bufsz;
+ if (window_len <= CUDA_SMALLHOSTBUF_SZ) {
chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
proto->cuda_hostbuf_pool_small_send);
- if (chb == NULL)
+ bufsz = proto->cuda_hostbuf_small_send_cfg.bufsz;
+ }
+ if (chb == NULL) {
chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
proto->cuda_hostbuf_pool_send);
+ bufsz = proto->cuda_hostbuf_send_cfg.bufsz;
+ }
/* were any buffers available? If not force allocate */
if (chb == NULL) {
@@ -1162,6 +1188,19 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp,
chb->req = req;
chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset;
chb->bytes_read = 0;
+ if (proto->cudastream_send == NULL) {
+ PSMI_CUDA_CALL(cuStreamCreate,
+ &proto->cudastream_send, CU_STREAM_NON_BLOCKING);
+ }
+ if (chb->host_buf == NULL) {
+ PSMI_CUDA_CALL(cuMemHostAlloc,
+ (void **) &chb->host_buf,
+ bufsz,
+ CU_MEMHOSTALLOC_PORTABLE);
+ }
+ if (chb->copy_status == NULL) {
+ PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT);
+ }
PSMI_CUDA_CALL(cuMemcpyDtoHAsync,
chb->host_buf, chb->gpu_buf,
window_len,
@@ -2047,14 +2086,19 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp,
/* 4. allocate a cuda bounce buffer, if required */
struct ips_cuda_hostbuf *chb = NULL;
if (getreq->cuda_hostbuf_used) {
- if (nbytes_this <= CUDA_SMALLHOSTBUF_SZ)
+ unsigned bufsz;
+ if (nbytes_this <= CUDA_SMALLHOSTBUF_SZ) {
chb = (struct ips_cuda_hostbuf *)
psmi_mpool_get(
protoexp->cuda_hostbuf_pool_small_recv);
- if (chb == NULL)
+ bufsz = protoexp->cuda_hostbuf_small_recv_cfg.bufsz;
+ }
+ if (chb == NULL) {
chb = (struct ips_cuda_hostbuf *)
psmi_mpool_get(
protoexp->cuda_hostbuf_pool_recv);
+ bufsz = protoexp->cuda_hostbuf_recv_cfg.bufsz;
+ }
if (chb == NULL) {
/* Unable to get a cudahostbuf for TID.
* Release the resources we're holding and reschedule.*/
@@ -2069,6 +2113,12 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp,
return PSM2_EP_NO_RESOURCES;
}
+ if (chb->host_buf == NULL) {
+ PSMI_CUDA_CALL(cuMemHostAlloc,
+ (void **) &chb->host_buf,
+ bufsz,
+ CU_MEMHOSTALLOC_PORTABLE);
+ }
tidrecvc->cuda_hostbuf = chb;
tidrecvc->buffer = chb->host_buf;
chb->size = 0;
@@ -2423,11 +2473,20 @@ void psmi_cudamemcpy_tid_to_device(struct ips_tid_recv_desc *tidrecvc)
chb->size += tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start +
tidrecvc->tid_list.tsess_unaligned_end;
+ if (protoexp->cudastream_recv == NULL) {
+ PSMI_CUDA_CALL(cuStreamCreate,
+ &protoexp->cudastream_recv,
+ CU_STREAM_NON_BLOCKING);
+ }
+
PSMI_CUDA_CALL(cuMemcpyHtoDAsync,
chb->gpu_buf, chb->host_buf,
tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start +
tidrecvc->tid_list.tsess_unaligned_end,
protoexp->cudastream_recv);
+ if (chb->copy_status == NULL) {
+ PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT);
+ }
PSMI_CUDA_CALL(cuEventRecord, chb->copy_status,
protoexp->cudastream_recv);
diff --git a/ptl_ips/ips_proto_mq.c b/ptl_ips/ips_proto_mq.c
index 8a047c6..032aa33 100644
--- a/ptl_ips/ips_proto_mq.c
+++ b/ptl_ips/ips_proto_mq.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2016 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -486,14 +488,19 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req,
ips_cuda_next_window(ipsaddr->window_rv,
offset, len);
- if (window_len <= CUDA_SMALLHOSTBUF_SZ)
+ unsigned bufsz;
+ if (window_len <= CUDA_SMALLHOSTBUF_SZ) {
chb = (struct ips_cuda_hostbuf *)
psmi_mpool_get(
proto->cuda_hostbuf_pool_small_send);
- if (chb == NULL)
+ bufsz = proto->cuda_hostbuf_small_send_cfg.bufsz;
+ }
+ if (chb == NULL) {
chb = (struct ips_cuda_hostbuf *)
psmi_mpool_get(
proto->cuda_hostbuf_pool_send);
+ bufsz = proto->cuda_hostbuf_send_cfg.bufsz;
+ }
/* any buffers available? */
if (chb == NULL)
@@ -507,6 +514,19 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req,
chb->gpu_buf = (CUdeviceptr) buf + offset;
chb->bytes_read = 0;
+ if (proto->cudastream_send == NULL) {
+ PSMI_CUDA_CALL(cuStreamCreate,
+ &proto->cudastream_send, CU_STREAM_NON_BLOCKING);
+ }
+ if (chb->host_buf == NULL) {
+ PSMI_CUDA_CALL(cuMemHostAlloc,
+ (void **) &chb->host_buf,
+ bufsz,
+ CU_MEMHOSTALLOC_PORTABLE);
+ }
+ if (chb->copy_status == NULL) {
+ PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT);
+ }
PSMI_CUDA_CALL(cuMemcpyDtoHAsync,
chb->host_buf, chb->gpu_buf,
window_len,
@@ -1411,6 +1431,9 @@ ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev)
return IPS_RECVHDRQ_CONTINUE;
}
+static void ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl,
+ struct ips_proto *proto);
+
int
ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev)
{
@@ -1515,7 +1538,7 @@ ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev)
/* XXX if blocking, break out of progress loop */
if (msgctl->outoforder_count)
- ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+ ips_proto_mq_handle_outoforder_queue(mq, msgctl, rcv_ev->proto);
if (rc == MQ_RET_UNEXP_OK)
ret = IPS_RECVHDRQ_BREAK;
@@ -1569,7 +1592,7 @@ ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev)
*/
psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
- _HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n",
+ _HFI_VDBG("tag=%08x.%08x.%08x opcode=0x%x, msglen=%d\n",
p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2],
OPCODE_TINY, p_hdr->hdr_data.u32w1);
@@ -1602,7 +1625,7 @@ ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev)
ipsaddr->msg_toggle = 0;
if (msgctl->outoforder_count)
- ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+ ips_proto_mq_handle_outoforder_queue(mq, msgctl, rcv_ev->proto);
if (rc == MQ_RET_UNEXP_OK)
ret = IPS_RECVHDRQ_BREAK;
@@ -1655,7 +1678,7 @@ ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev)
*/
psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
- _HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n",
+ _HFI_VDBG("tag=%08x.%08x.%08x opcode=0x%x, msglen=%d\n",
p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2],
OPCODE_SHORT, p_hdr->hdr_data.u32w1);
@@ -1689,7 +1712,7 @@ ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev)
ipsaddr->msg_toggle = 0;
if (msgctl->outoforder_count)
- ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+ ips_proto_mq_handle_outoforder_queue(mq, msgctl, rcv_ev->proto);
if (rc == MQ_RET_UNEXP_OK)
ret = IPS_RECVHDRQ_BREAK;
@@ -1752,6 +1775,9 @@ ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev)
req->req_data.buf = gdr_convert_gpu_to_host_addr(GDR_FD,
(unsigned long)req->user_gpu_buffer,
req->req_data.send_msglen, 1, rcv_ev->proto);
+ } else if ((req->flags_user & PSM2_MQ_FLAG_GDRCPY_ONLY) && req->is_buf_gpu_mem) {
+ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+ "CUDA memcpy not permitted for this operation.");
}
#endif
psmi_mq_handle_data(mq, req,
@@ -1788,7 +1814,7 @@ ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev)
*/
psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
- _HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n",
+ _HFI_VDBG("tag=%08x.%08x.%08x opcode=0x%x, msglen=%d\n",
p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2],
OPCODE_EAGER, p_hdr->hdr_data.u32w1);
@@ -1822,7 +1848,7 @@ ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev)
ipsaddr->msg_toggle = 0;
if (msgctl->outoforder_count)
- ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+ ips_proto_mq_handle_outoforder_queue(mq, msgctl, rcv_ev->proto);
if (rc == MQ_RET_UNEXP_OK)
ret = IPS_RECVHDRQ_BREAK;
@@ -1841,22 +1867,26 @@ ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev)
* Progress the out of order queue to see if any message matches
* current receiving sequence number.
*/
-void
-ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl)
+static void
+ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl,
+ struct ips_proto *proto)
{
psm2_mq_req_t req;
do {
- req =
- mq_ooo_match(&mq->outoforder_q, msgctl,
- msgctl->mq_recv_seqnum);
+ req = mq_ooo_match(&mq->outoforder_q, msgctl,
+ msgctl->mq_recv_seqnum);
if (req == NULL)
return;
msgctl->outoforder_count--;
msgctl->mq_recv_seqnum++;
+#ifdef PSM_CUDA
+ psmi_mq_handle_outoforder(mq, req, proto);
+#else
psmi_mq_handle_outoforder(mq, req);
+#endif
} while (msgctl->outoforder_count > 0);
diff --git a/ptl_ips/ips_proto_recv.c b/ptl_ips/ips_proto_recv.c
index bf363bb..aa5b727 100644
--- a/ptl_ips/ips_proto_recv.c
+++ b/ptl_ips/ips_proto_recv.c
@@ -1116,7 +1116,7 @@ ips_proto_process_becn(struct ips_recvhdrq_event *rcv_ev)
static void ips_bad_opcode(uint8_t op_code, struct ips_message_header *proto)
{
- _HFI_DBG("Discarding message with bad opcode 0x%x\n", op_code);
+ _HFI_INFO("Discarding message with bad opcode 0x%x\n", op_code);
if (hfi_debug & __HFI_DBG) {
ips_proto_show_header(proto, "received bad opcode");
diff --git a/ptl_ips/ips_recvhdrq.c b/ptl_ips/ips_recvhdrq.c
index 6c5fd07..dc9a699 100644
--- a/ptl_ips/ips_recvhdrq.c
+++ b/ptl_ips/ips_recvhdrq.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2015 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2015 Intel Corporation.
Redistribution and use in source and binary forms, with or without
@@ -162,8 +164,11 @@ static __inline__ void _dump_invalid_pkt(struct ips_recvhdrq_event *rcv_ev)
if (hfi_debug & __HFI_PKTDBG) {
ips_proto_dump_frame(rcv_ev->p_hdr, HFI_MESSAGE_HDR_SIZE,
"header");
- if (paylen)
+ if (!payload) {
+ _HFI_DBG("Cannot dump frame; payload is NULL\n");
+ } else if (paylen) {
ips_proto_dump_frame(payload, paylen, "data");
+ }
}
}
@@ -508,7 +513,7 @@ psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq)
rcv_ev.recvq->context->psm_hw_ctxt);
_HFI_VDBG_ALWAYS(
- "hdrq_head %d, p_hdr: %p, opcode %x, payload %p paylen %d; "
+ "hdrq_head %d, p_hdr: %p, opcode 0x%x, payload %p paylen %d; "
"egrhead %x egrtail %x; "
"useegrbit %x egrindex %x, egroffset %x, egrindexhead %x\n",
state->hdrq_head,
diff --git a/ptl_ips/ptl_rcvthread.c b/ptl_ips/ptl_rcvthread.c
index 4adb65a..f3352a6 100644
--- a/ptl_ips/ptl_rcvthread.c
+++ b/ptl_ips/ptl_rcvthread.c
@@ -5,6 +5,7 @@
GPL LICENSE SUMMARY
+ Copyright(c) 2021 Cornelis Networks.
Copyright(c) 2015 Intel Corporation.
This program is free software; you can redistribute it and/or modify
@@ -17,11 +18,11 @@
General Public License for more details.
Contact Information:
- Intel Corporation, www.intel.com
+ Cornelis Networks, www.cornelisnetworks.com
BSD LICENSE
- Copyright(c) 2015 Intel Corporation.
+ Copyright(c) 2021 Cornelis Networks.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -96,7 +97,7 @@ struct ptl_rcvthread {
* stored to provide hints during a cuda failure
* due to a null cuda context.
*/
- CUcontext ctxt;
+ CUcontext cu_ctxt;
#endif
/*
@@ -124,7 +125,7 @@ psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl_gen, struct ips_recvhdrq *recvq)
#ifdef PSM_CUDA
if (PSMI_IS_CUDA_ENABLED)
- PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt);
+ PSMI_CUDA_CALL(cuCtxGetCurrent, &cu_ctxt);
#endif
if (psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD) &&
@@ -347,8 +348,8 @@ void *ips_ptl_pollintr(void *rcvthreadc)
psm2_error_t err;
#ifdef PSM_CUDA
- if (PSMI_IS_CUDA_ENABLED && ctxt != NULL)
- PSMI_CUDA_CALL(cuCtxSetCurrent, ctxt);
+ if (PSMI_IS_CUDA_ENABLED && cu_ctxt != NULL)
+ PSMI_CUDA_CALL(cuCtxSetCurrent, cu_ctxt);
#endif
PSM2_LOG_MSG("entering");
diff --git a/rpm_release_extension b/rpm_release_extension
index 725a5ba..dcb6b5b 100644
--- a/rpm_release_extension
+++ b/rpm_release_extension
@@ -1 +1 @@
-185
+230
Debdiff
[The following lists of changes regard files as different if they have different names, permissions or owners.]
Files in second set of .debs but not in first
-rw-r--r-- root/root /usr/lib/debug/.build-id/34/f1ab7acbd640b8be6d546885b44a1c385977ed.debug
-rw-r--r-- root/root /usr/lib/debug/.build-id/fe/49f24b1fad832ecfe459d71ab1a4ca66523298.debug
Files in first set of .debs but not in second
-rw-r--r-- root/root /usr/lib/debug/.build-id/5c/121cc74d7467923b171aec9d0754081e64d381.debug
-rw-r--r-- root/root /usr/lib/debug/.build-id/ca/a3d2f2b6bc4f476801798bafbf4f0a8e2fafbb.debug
No differences were encountered between the control files of package libpsm2-2
No differences were encountered between the control files of package libpsm2-2-compat
Control files of package libpsm2-2-compat-dbgsym: lines which differ (wdiff format)
Build-Ids: [-5c121cc74d7467923b171aec9d0754081e64d381-] {+fe49f24b1fad832ecfe459d71ab1a4ca66523298+}
Control files of package libpsm2-2-dbgsym: lines which differ (wdiff format)
Build-Ids: [-caa3d2f2b6bc4f476801798bafbf4f0a8e2fafbb-] {+34f1ab7acbd640b8be6d546885b44a1c385977ed+}
No differences were encountered between the control files of package libpsm2-dev