New Upstream Release - libpsm2

Ready changes

Summary

Merged new upstream version: 11.2.230 (was: 11.2.185).

Resulting package

Built on 2022-10-20T05:10 (took 13m0s)

The resulting binary packages can be installed (if you have the apt repository enabled) by running one of:

apt install -t fresh-releases libpsm2-2-compat-dbgsym
apt install -t fresh-releases libpsm2-2-compat
apt install -t fresh-releases libpsm2-2-dbgsym
apt install -t fresh-releases libpsm2-2
apt install -t fresh-releases libpsm2-dev

Diff

diff --git a/COMMIT b/COMMIT
index b94efbd..e4fb92c 100644
--- a/COMMIT
+++ b/COMMIT
@@ -1 +1 @@
-30c52a0fd155774e18cc06328a1ba83c2a6a8104
\ No newline at end of file
+3f7c29811e820bc5645cbcad6a4c9d61133f3156
\ No newline at end of file
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 7571183..631a5b1 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -13,3 +13,5 @@ Dmitry (dmitrygx on github.com)
 Florian Weimer (fweimer on github.com)
 Jonas Hahnfeld (hahnjo on github.com)
 Tom Stellard (tstellar on github.com)
+Chuck Cranor (chuchcranor on github.com)
+Rémi Lacroix (RemiLacroix-IDRIS on github.com)
diff --git a/Makefile b/Makefile
index 5a31d64..9faeb73 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,7 @@
 #
 #  GPL LICENSE SUMMARY
 #
+#  Copyright(c) 2021 Cornelis Networks.
 #  Copyright(c) 2017 Intel Corporation.
 #
 #  This program is free software; you can redistribute it and/or modify
@@ -16,10 +17,11 @@
 #  General Public License for more details.
 #
 #  Contact Information:
-#  Intel Corporation, www.intel.com
+#  Cornelis Networks, www.cornelisnetworks.com
 #
 #  BSD LICENSE
 #
+#  Copyright(c) 2021 Cornelis Networks.
 #  Copyright(c) 2017 Intel Corporation.
 #
 #  Redistribution and use in source and binary forms, with or without
@@ -129,7 +131,7 @@ INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips -I$(OUTDIR)
 
 ifneq (x86_64,$(arch))
    ifneq (i386,$(arch))
-      $(error Unsupported architecture $(arch))
+      anerr := $(error Unsupported architecture $(arch))
    endif
 endif
 
@@ -164,7 +166,7 @@ nthreads := $(shell echo $$(( `nproc` * 2 )) )
 # The DISTRO variable is used subsequently for variable
 # behaviors of the 3 distros.
 
-DISTRO := $(shell . /etc/os-release; if [[ "$$ID" == "sle_hpc" ]]; then ID="sles"; fi; echo $$ID)
+DISTRO := $(shell . /etc/os-release; if [ "$$ID" = "sle_hpc" ]; then ID="sles"; fi; echo $$ID)
 
 # By default the following two variables have the following values:
 LIBPSM2_COMPAT_CONF_DIR := /etc
@@ -374,8 +376,8 @@ debug:
 	$(MAKE) OUTDIR=$(OUTDIR) PSM_DEBUG=1
 
 test_clean:
-	if [ -d ./test ]; then \
-		$(MAKE) -C test clean; \
+	if [ -d ./test && -e ./test/Makefile ]; then \
+		$(MAKE) -f ./test/Makefile -C test clean; \
 	fi
 
 specfile_clean:
diff --git a/README b/README
index 7990555..2961534 100644
--- a/README
+++ b/README
@@ -67,7 +67,6 @@ Contains the following sections:
 - INSTALLING
   * INSTALLING USING MAKEFILE
   * INSTALLING USING EITHER YUM OR DNF
-- TESTING
 - RELATED SOFTWARE TO PSM2
 - SUPPORTING DOCUMENTATION
 
@@ -251,6 +250,24 @@ libraries available on them. Open MPI provides a standard configure, make and
 make install mechanism which will detect and build the relevant PSM2 network
 modules for Open MPI once the header and runtime files are detected.
 
+Open MPI 4.1.x, OFI BTL, and high PPN jobs
+----------------
+Open MPI added the OFI BTL for one-sided communication. On an OPA fabric, the
+OFI BTL may use the PSM2 OFI provider underneath. If PSM2 is in-use as both
+the MTL (directly or via OFI) and the BTL (via OFI), then each rank in the
+Open MPI job will require two PSM2 endpoints and PSM2 context-sharing will
+be disabled.
+
+In this case, total number of PSM2 ranks on a node can be no more than:
+  (num_hfi * num_user_contexts)/2
+Where num_user_contexts is typically equal to the number of physical CPU
+cores on that node.
+
+If your job does not require an inter-node BTL (e.g. OFI), then you can
+disable the OFI BTL in one of two ways:
+  1. When building Open MPI, specify '--with-ofi=no' when you run 'configure'.
+  2. When running your Open MPI job, add '-mca btl self,vader'.
+
 MVAPICH2 support
 ----------------
 MVAPICH2 supports PSM2 transport for optimized communication on HFI hardware.
diff --git a/buildflags.mak b/buildflags.mak
index 7c3cda0..0ce15aa 100644
--- a/buildflags.mak
+++ b/buildflags.mak
@@ -4,6 +4,7 @@
 #
 #  GPL LICENSE SUMMARY
 #
+#  Copyright(c) 2021 Cornelis Networks.
 #  Copyright(c) 2016 Intel Corporation.
 #
 #  This program is free software; you can redistribute it and/or modify
@@ -16,10 +17,11 @@
 #  General Public License for more details.
 #
 #  Contact Information:
-#  Intel Corporation, www.intel.com
+#  Cornelis Networks, www.cornelisnetworks.com
 #
 #  BSD LICENSE
 #
+#  Copyright(c) 2021 Cornelis Networks.
 #  Copyright(c) 2016 Intel Corporation.
 #
 #  Redistribution and use in source and binary forms, with or without
@@ -118,13 +120,13 @@ ifneq (icc,${CC})
 		RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX2 ; echo $$?)
 	else
 		RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX ; echo $$?)
-		$(warning ***NOTE TO USER**** Disabling AVX2 will harm performance)
+                anerr := $(warning ***NOTE TO USER**** Disabling AVX2 will harm performance)
 	endif
 
 	ifeq (0,${RET})
 		BASECFLAGS += ${MAVX2}
 	else
-		$(error Compiler does not support ${MAVX2} )
+		anerr := $(error Compiler does not support ${MAVX2} )
 	endif
 else
 		BASECFLAGS += ${MAVX2}
@@ -138,7 +140,7 @@ ifneq (,${PSM_AVX512})
 		ifeq (0,${RET})
 			BASECFLAGS += -mavx512f
 		else
-			$(error Compiler does not support AVX512 )
+			anerr := $(error Compiler does not support AVX512 )
 		endif
 		BASECFLAGS += -DPSM_AVX512
 	endif
@@ -203,7 +205,7 @@ else
 		BASECFLAGS += -funwind-tables -Wno-strict-aliasing -Wformat-security
 	else
 		ifneq (${CCARCH},gcc4)
-			$(error Unknown compiler arch "${CCARCH}")
+			anerr := $(error Unknown compiler arch "${CCARCH}")
 		endif # gcc4
 	endif # gcc
 endif # icc
diff --git a/debian/changelog b/debian/changelog
index dd6f458..2c4dcf7 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,10 @@
+libpsm2 (11.2.230-1) UNRELEASED; urgency=low
+
+  * New upstream release.
+  * Drop patch remove_makefile_bashisms.patch, present upstream.
+
+ -- Debian Janitor <janitor@jelmer.uk>  Thu, 20 Oct 2022 04:58:46 -0000
+
 libpsm2 (11.2.185-2) unstable; urgency=medium
 
   * Team upload.
diff --git a/debian/patches/disable_makefile_git_versioning.patch b/debian/patches/disable_makefile_git_versioning.patch
index 9ed9733..9d21b5e 100644
--- a/debian/patches/disable_makefile_git_versioning.patch
+++ b/debian/patches/disable_makefile_git_versioning.patch
@@ -6,9 +6,11 @@ Forwarded: not-needed
 Last-Update: 2020-07-30
 ---
 This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- a/Makefile
-+++ b/Makefile
-@@ -195,7 +195,7 @@
+Index: libpsm2.git/Makefile
+===================================================================
+--- libpsm2.git.orig/Makefile
++++ libpsm2.git/Makefile
+@@ -197,7 +197,7 @@ endif
  export 	LIBPSM2_COMPAT_CONF_DIR
  
  # The desired version number comes from the most recent tag starting with "v"
diff --git a/debian/patches/fortify_source.patch b/debian/patches/fortify_source.patch
index 0bc52b6..d17c1d3 100644
--- a/debian/patches/fortify_source.patch
+++ b/debian/patches/fortify_source.patch
@@ -5,9 +5,11 @@ Forwarded: not-needed
 Last-Update: 2020-07-30
 ---
 This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- a/buildflags.mak
-+++ b/buildflags.mak
-@@ -158,7 +158,7 @@
+Index: libpsm2.git/buildflags.mak
+===================================================================
+--- libpsm2.git.orig/buildflags.mak
++++ libpsm2.git/buildflags.mak
+@@ -160,7 +160,7 @@ endif
  ifneq (,${PSM_DEBUG})
  	BASECFLAGS += -O -g3 -DPSM_DEBUG -D_HFI_DEBUGGING -funit-at-a-time -Wp,-D_FORTIFY_SOURCE=2
  else
@@ -16,9 +18,11 @@ This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
  endif
  ifneq (,${PSM_COVERAGE}) # This check must come after PSM_DEBUG to override optimization setting
  	BASECFLAGS += -O -fprofile-arcs -ftest-coverage
---- a/compat/buildflags.mak
-+++ b/compat/buildflags.mak
-@@ -72,7 +72,7 @@
+Index: libpsm2.git/compat/buildflags.mak
+===================================================================
+--- libpsm2.git.orig/compat/buildflags.mak
++++ libpsm2.git/compat/buildflags.mak
+@@ -72,7 +72,7 @@ LINKER_SCRIPT := -Wl,--version-script $(
  WERROR := -Werror
  INCLUDES := -I$(top_srcdir)/include -I$(top_srcdir)/include/$(os)-$(arch) -I$(top_srcdir)/mpspawn
  
diff --git a/debian/patches/remove_makefile_bashisms.patch b/debian/patches/remove_makefile_bashisms.patch
deleted file mode 100644
index a2f7784..0000000
--- a/debian/patches/remove_makefile_bashisms.patch
+++ /dev/null
@@ -1,17 +0,0 @@
-Description: Correct bashishsms in Makefile
-Author: Brian T. Smith <bsmith@systemfabricworks.com>
-Forwarded: https://github.com/intel/opa-psm2/issues/55
-Last-Update: 2020-08-06
----
-This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- a/Makefile
-+++ b/Makefile
-@@ -164,7 +164,7 @@
- # The DISTRO variable is used subsequently for variable
- # behaviors of the 3 distros.
- 
--DISTRO := $(shell . /etc/os-release; if [[ "$$ID" == "sle_hpc" ]]; then ID="sles"; fi; echo $$ID)
-+DISTRO := $(shell . /etc/os-release; if [ "$$ID" = "sle_hpc" ]; then ID="sles"; fi; echo $$ID)
- 
- # By default the following two variables have the following values:
- LIBPSM2_COMPAT_CONF_DIR := /etc
diff --git a/debian/patches/series b/debian/patches/series
index 059c7a0..7a83a45 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,3 +1,2 @@
 disable_makefile_git_versioning.patch
 fortify_source.patch
-remove_makefile_bashisms.patch
diff --git a/include/linux-i386/sysdep.h b/include/linux-i386/sysdep.h
index bfd5746..2c48c1e 100644
--- a/include/linux-i386/sysdep.h
+++ b/include/linux-i386/sysdep.h
@@ -139,12 +139,9 @@ static __inline__ uint32_t ips_cmpxchg(volatile uint32_t *ptr,
 				       uint32_t old_val, uint32_t new_val)
 {
 	uint32_t prev;
-	struct xchg_dummy {
-		uint32_t a[100];
-	};
 
 	asm volatile (LOCK_PREFIX "cmpxchgl %1,%2" : "=a"(prev)
-		      : "q"(new_val), "m"(*(struct xchg_dummy *)ptr), "0"(old_val)
+		      : "q"(new_val), "m"(*ptr), "0"(old_val)
 		      : "memory");
 
 	return prev;
diff --git a/include/opa_debug.h b/include/opa_debug.h
index d5d8ff2..36e53a3 100644
--- a/include/opa_debug.h
+++ b/include/opa_debug.h
@@ -83,6 +83,7 @@
 #define __HFI_ENVDBG	    0x400
 #define __HFI_EPKTDBG     0x800	/* print error packet data */
 #define __HFI_CCADBG      0x1000	/* print CCA related events */
+#define __HFI_CUDADBG     0x2000	/* print CUDA calls, events */
 #else /* _HFI_DEBUGGING */
 
 /*
@@ -100,6 +101,7 @@
 /* print mmap/nopage stuff, not using VDBG any more */
 #define __HFI_MMDBG     0x0
 #define __HFI_CCADBG    0x0	/* print CCA related events */
+#define __HFI_CUDADBG   0x0
 
 #endif /* _HFI_DEBUGGING */
 
diff --git a/include/opa_udebug.h b/include/opa_udebug.h
index 9fd59cb..b25da1f 100644
--- a/include/opa_udebug.h
+++ b/include/opa_udebug.h
@@ -124,6 +124,7 @@ extern FILE *__hfi_dbgout;
 		"env " fmt, ##__VA_ARGS__)
 #define _HFI_MMDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_MMDBG, fmt, ##__VA_ARGS__)
 #define _HFI_CCADBG(fmt, ...) __HFI_DBG_WHICH(__HFI_CCADBG, fmt, ##__VA_ARGS__)
+#define _HFI_CUDADBG(fmt, ...) __HFI_DBG_WHICH(__HFI_CUDADBG, fmt, ##__VA_ARGS__)
 
 /*
  * Use these macros (_HFI_DBG_ON and _HFI_DBG_ALWAYS) together
@@ -150,6 +151,9 @@ extern FILE *__hfi_dbgout;
 #define _HFI_CCADBG_ON unlikely(hfi_debug & __HFI_CCADBG)
 #define _HFI_CCADBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
 
+#define _HFI_CUDADBG_ON unlikely(hfi_debug & __HFI_CUDADBG)
+#define _HFI_CUDADBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
+
 #define _HFI_INFO_ON unlikely(hfi_debug & __HFI_INFO)
 #define _HFI_INFO_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__)
 
@@ -177,6 +181,7 @@ extern FILE *__hfi_dbgout;
 #define _HFI_VDBG(fmt, ...)
 #define _HFI_MMDBG(fmt, ...)
 #define _HFI_CCADBG(fmt, ...)
+#define _HFI_CUDADBG(fmt, ...)
 
 #define _HFI_DBG_ON 0
 #define _HFI_DBG_ALWAYS(fmt, ...)
@@ -186,6 +191,8 @@ extern FILE *__hfi_dbgout;
 #define _HFI_PRDBG_ALWAYS(fmt, ...)
 #define _HFI_CCADBG_ON 0
 #define _HFI_CCADBG_ALWAYS(fmt, ...)
+#define _HFI_CUDADBG_ON 0
+#define _HFI_CUDADBG_ALWAYS(fmt, ...)
 #define _HFI_INFO_ON 0
 #define _HFI_INFO_ALWAYS(fmt, ...)
 
diff --git a/psm.c b/psm.c
index 7f929ce..ec1fd88 100644
--- a/psm.c
+++ b/psm.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -92,8 +94,8 @@ uint32_t psmi_cpu_model;
 #ifdef PSM_CUDA
 int is_cuda_enabled;
 int is_gdr_copy_enabled;
-int device_support_gpudirect;
-int gpu_p2p_supported = 0;
+int _device_support_gpudirect = -1; // -1 indicates "unset". See device_support_gpudirect().
+int _gpu_p2p_supported = -1; // -1 indicates "unset". see gpu_p2p_supported().
 int my_gpu_device = 0;
 int cuda_lib_version;
 int is_driver_gpudirect_enabled;
@@ -116,6 +118,7 @@ CUresult (*psmi_cuDriverGetVersion)(int* driverVersion);
 CUresult (*psmi_cuDeviceGetCount)(int* count);
 CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags);
 CUresult (*psmi_cuStreamDestroy)(CUstream phStream);
+CUresult (*psmi_cuStreamSynchronize)(CUstream phStream);
 CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags);
 CUresult (*psmi_cuEventDestroy)(CUevent hEvent);
 CUresult (*psmi_cuEventQuery)(CUevent hEvent);
@@ -217,6 +220,7 @@ int psmi_cuda_lib_load()
 	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetCount);
 	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamCreate);
 	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamDestroy);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamSynchronize);
 	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventCreate);
 	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventDestroy);
 	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventQuery);
@@ -251,7 +255,6 @@ fail:
 int psmi_cuda_initialize()
 {
 	psm2_error_t err = PSM2_OK;
-	int num_devices, dev;
 
 	PSM2_LOG_MSG("entering");
 	_HFI_VDBG("Enabling CUDA support.\n");
@@ -262,77 +265,6 @@ int psmi_cuda_initialize()
 
 	PSMI_CUDA_CALL(cuInit, 0);
 
-	/* Check if CUDA context is available. If not, we are not allowed to
-	 * launch any CUDA API calls */
-	PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt);
-	if (ctxt == NULL) {
-		_HFI_INFO("Unable to find active CUDA context\n");
-		is_cuda_enabled = 0;
-		err = PSM2_OK;
-		return err;
-	}
-
-	CUdevice current_device;
-	CUcontext primary_ctx;
-	PSMI_CUDA_CALL(cuCtxGetDevice, &current_device);
-	int is_ctx_active;
-	unsigned ctx_flags;
-	PSMI_CUDA_CALL(cuDevicePrimaryCtxGetState, current_device, &ctx_flags,
-			&is_ctx_active);
-	if (!is_ctx_active) {
-		/* There is an issue where certain CUDA API calls create
-		 * contexts but does not make it active which cause the
-		 * driver API call to fail with error 709 */
-		PSMI_CUDA_CALL(cuDevicePrimaryCtxRetain, &primary_ctx,
-				current_device);
-		is_cuda_primary_context_retain = 1;
-	}
-
-	/* Check if all devices support Unified Virtual Addressing. */
-	PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices);
-
-	device_support_gpudirect = 1;
-
-	for (dev = 0; dev < num_devices; dev++) {
-		CUdevice device;
-		PSMI_CUDA_CALL(cuDeviceGet, &device, dev);
-		int unifiedAddressing;
-		PSMI_CUDA_CALL(cuDeviceGetAttribute,
-				&unifiedAddressing,
-				CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING,
-				device);
-
-		if (unifiedAddressing !=1) {
-			_HFI_ERROR("CUDA device %d does not support Unified Virtual Addressing.\n", dev);
-			goto fail;
-		}
-
-		int major;
-		PSMI_CUDA_CALL(cuDeviceGetAttribute,
-				&major,
-				CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-				device);
-		if (major < 3) {
-			device_support_gpudirect = 0;
-			_HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev);
-		}
-
-		if (device != current_device) {
-			int canAccessPeer = 0;
-			PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer,
-					current_device, device);
-
-			if (canAccessPeer != 1)
-				_HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev);
-			else
-				gpu_p2p_supported |= (1 << device);
-		} else {
-			/* Always support p2p on the same GPU */
-			my_gpu_device = device;
-			gpu_p2p_supported |= (1 << device);
-		}
-	}
-
 	union psmi_envvar_val env_enable_gdr_copy;
 	psmi_getenv("PSM2_GDRCOPY",
 				"Enable (set envvar to 1) for gdr copy support in PSM (Enabled by default)",
@@ -384,6 +316,8 @@ psm2_error_t __psm2_init(int *major, int *minor)
 {
 	psm2_error_t err = PSM2_OK;
 	union psmi_envvar_val env_tmask;
+	union psmi_envvar_val devs;
+	int devid_enabled[PTL_MAX_INIT];
 
 	psmi_log_initialize();
 
@@ -538,12 +472,26 @@ psm2_error_t __psm2_init(int *major, int *minor)
 
 	psmi_epid_init();
 
-	int rc = psmi_hal_initialize();
+	psmi_getenv("PSM2_DEVICES",
+		    "Ordered list of PSM-level devices",
+		    PSMI_ENVVAR_LEVEL_USER,
+		    PSMI_ENVVAR_TYPE_STR,
+		    (union psmi_envvar_val)PSMI_DEVICES_DEFAULT, &devs);
 
-	if (rc)
-	{
-		err = PSM2_INTERNAL_ERR;
+	if ((err = psmi_parse_devices(devid_enabled, devs.e_str)))
 		goto fail;
+
+	/* setup a dummy (null) hal if we are not using PTL_DEVID_IPS */
+	if (!psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
+		psmi_hal_initialize_null();
+	} else {
+		int rc = psmi_hal_initialize();
+
+		if (rc)
+		{
+			err = PSM2_INTERNAL_ERR;
+			goto fail;
+		}
 	}
 
 #ifdef PSM_CUDA
@@ -782,10 +730,10 @@ psm2_error_t __psm2_info_query(psm2_info_query_t q, void *out,
 		{
 			char              *networkType      = (char*)out;
 			size_t            networkTypeLength = args[0].length;
-			const char *const intelopa          = "Intel(R) OPA";
-			if (networkTypeLength >= strlen(intelopa)+1)
+			const char *const cornelisopx          = "Cornelis(TM) OPX";
+			if (networkTypeLength >= strlen(cornelisopx)+1)
 			{
-				strcpy(networkType,intelopa);
+				strcpy(networkType,cornelisopx);
 				rv = PSM2_OK;
 			}
 		}
diff --git a/psm2_hal.c b/psm2_hal.c
index b4b9d9a..09aa251 100644
--- a/psm2_hal.c
+++ b/psm2_hal.c
@@ -379,6 +379,20 @@ int psmi_hal_initialize(void)
 	return -PSM_HAL_ERROR_INIT_FAILED;
 }
 
+/* psmi_hal_initialize_null */
+void psmi_hal_initialize_null(void)
+{
+	static struct _psmi_hal_instance nullhal = {
+		.type = PSM_HAL_INSTANCE_NULL,
+		.description = "NULL HAL (hardware disabled)",
+		.hfi_name = "null",
+		.hfi_sys_class_path = "/dev/null",
+		.params = {0}
+	};
+
+	psmi_hal_current_hal_instance = &nullhal;
+}
+
 int psmi_hal_finalize(void)
 {
 	struct _psmi_hal_instance *p = psmi_hal_current_hal_instance;
diff --git a/psm2_hal.h b/psm2_hal.h
index 1bec596..9e94da8 100644
--- a/psm2_hal.h
+++ b/psm2_hal.h
@@ -75,6 +75,7 @@ typedef enum
 	PSM_HAL_INSTANCE_GEN2    =  2,
 	PSM_HAL_INSTANCE_GEN3    =  3,
 
+	PSM_HAL_INSTANCE_NULL	 = 98,
 #ifdef PSM2_MOCK_TESTING
 	PSM_HAL_INSTANCE_MOCK    = 99,
 #endif
@@ -701,6 +702,7 @@ void psmi_hal_register_instance(psmi_hal_instance_t *);
     INSTANCES are registered, or PSM_HAL_ERROR_INIT_FAILED when
     another failure has occured during initialization. */
 int psmi_hal_initialize(void);
+void psmi_hal_initialize_null(void);
 
 int psmi_hal_finalize(void);
 
diff --git a/psm2_mq.h b/psm2_mq.h
index 7b63608..98047d3 100644
--- a/psm2_mq.h
+++ b/psm2_mq.h
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2017 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2017 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -519,6 +521,8 @@ psm2_error_t psm2_mq_setopt(psm2_mq_t mq, int option, const void *value);
 #define PSM2_MQ_FLAG_SENDSYNC	0x01
 				/**< MQ Send Force synchronous send */
 
+#define PSM2_MQ_FLAG_GDRCPY_ONLY	0x02
+				/**< only GDRCOPY is allowed for GPU */
 #define PSM2_MQ_REQINVALID	((psm2_mq_req_t)(NULL))
 				/**< MQ request completion value */
 
diff --git a/psm_ep.c b/psm_ep.c
index 8c4fe5e..8a092a8 100644
--- a/psm_ep.c
+++ b/psm_ep.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -89,9 +91,6 @@ static psm2_error_t psmi_ep_open_device(const psm2_ep_t ep,
  * hfi.
  */
 
-static psm2_error_t psmi_parse_devices(int devices[PTL_MAX_INIT],
-				      const char *devstr);
-static int psmi_device_is_enabled(const int devices[PTL_MAX_INIT], int devid);
 int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
 
 psm2_error_t __psm2_ep_num_devunits(uint32_t *num_units_o)
@@ -1034,6 +1033,9 @@ __psm2_ep_open(psm2_uuid_t const unique_job_key,
 	char uvalue[6], pvalue[6];
 	int devid_enabled[PTL_MAX_INIT];
 	union psmi_envvar_val devs;
+#ifdef PSM_CUDA
+	int release_gdr = 0;
+#endif
 
 	PSM2_LOG_MSG("entering");
 	PSMI_ERR_UNLESS_INITIALIZED(NULL);
@@ -1089,8 +1091,10 @@ __psm2_ep_open(psm2_uuid_t const unique_job_key,
 	}
 
 #ifdef PSM_CUDA
-	if (PSMI_IS_GDR_COPY_ENABLED)
+	if (PSMI_IS_GDR_COPY_ENABLED) {
 		hfi_gdr_open();
+		release_gdr = 1;
+	}
 #endif
 
 	err = __psm2_ep_open_internal(unique_job_key,
@@ -1144,6 +1148,10 @@ __psm2_ep_open(psm2_uuid_t const unique_job_key,
 	_HFI_VDBG("psm2_ep_open() OK....\n");
 
 fail:
+#ifdef PSM_CUDA
+	if (err && release_gdr)
+		hfi_gdr_close();
+#endif
 	PSMI_UNLOCK(psmi_creation_lock);
 	PSM2_LOG_MSG("leaving");
 	return err;
@@ -1160,16 +1168,6 @@ psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in)
 	}
 #endif
 
-#ifdef PSM_CUDA
-	/*
-	 * The close on the gdr fd needs to be called before the
-	 * close on the hfi fd as the the gdr device will hold
-	 * reference count on the hfi device which will make the close
-	 * on the hfi fd return without actually closing the fd.
-	 */
-	if (PSMI_IS_GDR_COPY_ENABLED)
-		hfi_gdr_close();
-#endif
 	union psmi_envvar_val timeout_intval;
 	psm2_ep_t tmp;
 	psm2_mq_t mmq;
@@ -1350,6 +1348,11 @@ psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in)
 
 	PSMI_UNLOCK(psmi_creation_lock);
 
+#ifdef PSM_CUDA
+	if (PSMI_IS_GDR_COPY_ENABLED)
+		hfi_gdr_close();
+#endif
+
 	if (_HFI_PRDBG_ON) {
 		_HFI_PRDBG_ALWAYS("Closed endpoint in %.3f secs\n",
 				 (double)cycles_to_nanosecs(get_cycles() -
@@ -1491,7 +1494,6 @@ fail:
 
 /* Get a list of PTLs we want to use.  The order is important, it affects
  * whether node-local processes use shm or ips */
-static
 psm2_error_t
 psmi_parse_devices(int devices[PTL_MAX_INIT], const char *devstring)
 {
@@ -1562,7 +1564,6 @@ fail:
 
 }
 
-static
 int psmi_device_is_enabled(const int devid_enabled[PTL_MAX_INIT], int devid)
 {
 	int i;
diff --git a/psm_ep.h b/psm_ep.h
index b526fa0..e4e3708 100644
--- a/psm_ep.h
+++ b/psm_ep.h
@@ -236,4 +236,9 @@ struct psm2_epaddr {
 	PSMI_PROFILE_UNBLOCK();						\
 } while (0)
 
+
+psm2_error_t psmi_parse_devices(int devices[PTL_MAX_INIT],
+				      const char *devstr);
+int psmi_device_is_enabled(const int devices[PTL_MAX_INIT], int devid);
+
 #endif /* _PSMI_EP_H */
diff --git a/psm_gdrcpy.h b/psm_gdrcpy.h
index 2773454..c10062d 100644
--- a/psm_gdrcpy.h
+++ b/psm_gdrcpy.h
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2018 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2018 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -60,18 +62,17 @@
 
 #define GDR_FD get_gdr_fd()
 
-int get_gdr_fd();
+int get_gdr_fd(void);
 
-void hfi_gdr_open();
+void hfi_gdr_open(void);
 
-void hfi_gdr_close();
+void hfi_gdr_close(void);
 
 void *
 gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf,
 				size_t size, int flags,
 				struct ips_proto* proto);
 
-uint64_t
-gdr_cache_evict();
+uint64_t gdr_cache_evict(void);
 #endif
 #endif
diff --git a/psm_hal_gen1/opa_proto_gen1.c b/psm_hal_gen1/opa_proto_gen1.c
index eb8bce9..868f06e 100644
--- a/psm_hal_gen1/opa_proto_gen1.c
+++ b/psm_hal_gen1/opa_proto_gen1.c
@@ -310,7 +310,7 @@ struct _hfi_ctrl *hfi_userinit_internal(int fd, bool skip_affinity,
 	__hfi_pg_sz = sysconf(_SC_PAGESIZE);
 
 	if (!(spctrl = calloc(1, sizeof(struct _hfi_ctrl)))) {
-		_HFI_INFO("can't allocate memory for hfi_ctrl: %s\n",
+		_HFI_INFO("Warning: can't allocate memory for hfi_ctrl: %s\n",
 			  strerror(errno));
 		goto err_calloc_hfi_ctrl;
 	}
@@ -357,12 +357,12 @@ struct _hfi_ctrl *hfi_userinit_internal(int fd, bool skip_affinity,
 #endif
 	if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) {
 		if (errno == ENODEV) {
-			_HFI_INFO("PSM2 and driver version mismatch\n");
+			_HFI_INFO("Warning: PSM2 and driver version mismatch\n");
 			/* Overwrite errno. One would wish that the driver
 			 * didn't return ENODEV for a version mismatch */
 			errno = EPROTONOSUPPORT;
 		} else {
-			_HFI_INFO("assign_context command failed: %s\n",
+			_HFI_INFO("Warning: assign_context command failed: %s\n",
 				  strerror(errno));
 		}
 		goto err_hfi_cmd_assign_ctxt;
@@ -387,36 +387,36 @@ struct _hfi_ctrl *hfi_userinit_internal(int fd, bool skip_affinity,
 	c.addr = (__u64) cinfo;
 
 	if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) {
-		_HFI_INFO("CTXT_INFO command failed: %s\n", strerror(errno));
+		_HFI_ERROR("CTXT_INFO command failed: %s\n", strerror(errno));
 		goto err_hfi_cmd_ctxt_info;
 	}
 
 	/* sanity checking... */
 	if (cinfo->rcvtids%8) {
-		_HFI_INFO("rcvtids not 8 multiple: %d\n", cinfo->rcvtids);
+		_HFI_ERROR("rcvtids not 8 multiple: %d\n", cinfo->rcvtids);
 		goto err_sanity_check;
 	}
 	if (cinfo->egrtids%8) {
-		_HFI_INFO("egrtids not 8 multiple: %d\n", cinfo->egrtids);
+		_HFI_ERROR("egrtids not 8 multiple: %d\n", cinfo->egrtids);
 		goto err_sanity_check;
 	}
 	if (cinfo->rcvtids < cinfo->egrtids) {
-		_HFI_INFO("rcvtids(%d) < egrtids(%d)\n",
+		_HFI_ERROR("rcvtids(%d) < egrtids(%d)\n",
 				cinfo->rcvtids, cinfo->egrtids);
 		goto err_sanity_check;
 	}
 	if (cinfo->rcvhdrq_cnt%32) {
-		_HFI_INFO("rcvhdrq_cnt not 32 multiple: %d\n",
+		_HFI_ERROR("rcvhdrq_cnt not 32 multiple: %d\n",
 				cinfo->rcvhdrq_cnt);
 		goto err_sanity_check;
 	}
 	if (cinfo->rcvhdrq_entsize%64) {
-		_HFI_INFO("rcvhdrq_entsize not 64 multiple: %d\n",
+		_HFI_ERROR("rcvhdrq_entsize not 64 multiple: %d\n",
 				cinfo->rcvhdrq_entsize);
 		goto err_sanity_check;
 	}
 	if (cinfo->rcvegr_size%__hfi_pg_sz) {
-		_HFI_INFO("rcvegr_size not page multiple: %d\n",
+		_HFI_ERROR("rcvegr_size not page multiple: %d\n",
 				cinfo->rcvegr_size);
 		goto err_sanity_check;
 	}
@@ -443,7 +443,7 @@ struct _hfi_ctrl *hfi_userinit_internal(int fd, bool skip_affinity,
 		CPU_ZERO(&cpuset);
 		CPU_SET(cinfo->rec_cpu, &cpuset);
 		if (sched_setaffinity(0, sizeof(cpuset), &cpuset)) {
-			_HFI_INFO("Couldn't set runon processor %u "
+			_HFI_INFO("Warning: Couldn't set runon processor %u "
 				  "(unit:context %u:%u) (%u active chips): %s\n",
 				  cinfo->rec_cpu, cinfo->unit, cinfo->ctxt,
 				  cinfo->num_active, strerror(errno));
@@ -456,7 +456,7 @@ struct _hfi_ctrl *hfi_userinit_internal(int fd, bool skip_affinity,
 	c.addr = (__u64) binfo;
 
 	if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) {
-		_HFI_INFO("BASE_INFO command failed: %s\n", strerror(errno));
+		_HFI_ERROR("BASE_INFO command failed: %s\n", strerror(errno));
 		goto err_hfi_cmd_user_info;
 	}
 
@@ -481,7 +481,7 @@ struct _hfi_ctrl *hfi_userinit_internal(int fd, bool skip_affinity,
 	 * this is different from PSM API version.
 	 */
 	if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) != hfi_get_user_major_version()) {
-		_HFI_INFO
+		_HFI_ERROR
 		    ("User major version 0x%x not same as driver major 0x%x\n",
 		     hfi_get_user_major_version(), binfo->sw_version >> HFI1_SWMAJOR_SHIFT);
 		if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) < hfi_get_user_major_version())
diff --git a/psm_hal_gen1/psm_gdrcpy.c b/psm_hal_gen1/psm_gdrcpy.c
index 1896f9e..f5f2b77 100644
--- a/psm_hal_gen1/psm_gdrcpy.c
+++ b/psm_hal_gen1/psm_gdrcpy.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2018 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2018 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -61,9 +63,11 @@
 #include "ptl_ips/ips_expected_proto.h"
 #include "opa_user_gen1.h"
 
-static int gdr_fd;
+static int gdr_refcount;
+static int gdr_fd = -1;
 
-int get_gdr_fd(){
+int get_gdr_fd(void)
+{
 	return gdr_fd;
 }
 
@@ -71,7 +75,8 @@ int get_gdr_fd(){
 #define GPU_PAGE_MASK ~GPU_PAGE_OFFSET_MASK
 
 uint64_t
-gdr_cache_evict() {
+gdr_cache_evict(void)
+{
 	int ret;
 	struct hfi1_gdr_cache_evict_params params;
 	params.evict_params_in.version = HFI1_GDR_VERSION;
@@ -90,8 +95,9 @@ gdr_cache_evict() {
 }
 
 
-uint64_t
-ips_sdma_gpu_cache_evict(int fd) {
+static uint64_t
+ips_sdma_gpu_cache_evict(int fd)
+{
 	int ret;
 	struct hfi1_sdma_gpu_cache_evict_params params;
 	params.evict_params_in.version = HFI1_GDR_VERSION;
@@ -117,7 +123,7 @@ ips_sdma_gpu_cache_evict(int fd) {
  * which we bail out. If successful we retry to PIN/MMAP once
  * again
  */
-uint64_t
+static uint64_t
 handle_out_of_bar_space(struct ips_proto *proto)
 {
 	time_t lastEvictTime = 0;
@@ -158,27 +164,40 @@ gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf,
 							 size_t size, int flags,
 							 struct ips_proto* proto)
 {
-	struct hfi1_gdr_query_params query_params;
-	void *host_addr_buf;
-	int ret;
+	_HFI_VDBG("(gdrcopy) buf=%p size=%zu flags=0x%x proto=%p\n",
+	  (void*)buf, size, flags, proto);
+	if (!size) {
+		// Attempting 0-length pin results in error from driver.
+		// Just return NULL. Caller has to figure out what to do in this
+		// case.
+		return NULL;
+	}
 
-	query_params.query_params_in.version = HFI1_GDR_VERSION;
 	uintptr_t pageaddr = buf & GPU_PAGE_MASK;
-	/* As size is guarenteed to be in the range of 0-8kB
-	 * there is a guarentee that buf+size-1 does not overflow
-	 * 64 bits.
-	 */
-	uint32_t pagelen = (uint32_t) (PSMI_GPU_PAGESIZE +
-					   ((buf + size - 1) & GPU_PAGE_MASK) -
-					   pageaddr);
+	uintptr_t pageend = PSMI_GPU_PAGESIZE + ((buf + size - 1) & GPU_PAGE_MASK);
+
+	// Validate pointer arithmetic
+	if (pageend < pageaddr) {
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+		  "pageend < pageaddr, wraparound; pageend=%p pageaddr=%p",
+		  (void*)pageend, (void*)pageaddr);
+	} else if ((pageend - pageaddr) > UINT32_MAX) {
+		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+		  "pageend - pageaddr > UINT32_MAX; pageend=%p pageaddr=%p difference=%zu",
+		  (void*)pageend, (void*)pageaddr, (pageend - pageaddr));
+	}
 
-	_HFI_VDBG("(gpudirect) buf=%p size=%zu pageaddr=%p pagelen=%u flags=0x%x proto=%p\n",
-		(void *)buf, size, (void *)pageaddr, pagelen, flags, proto);
+	uint32_t pagelen = pageend - pageaddr;
+	_HFI_VDBG("(gdrcopy) pageaddr=%p pagelen=%u pageend=%p\n",
+	  (void *)pageaddr, pagelen, (void*)pageend);
 
+	struct hfi1_gdr_query_params query_params;
+	query_params.query_params_in.version = HFI1_GDR_VERSION;
 	query_params.query_params_in.gpu_buf_addr = pageaddr;
 	query_params.query_params_in.gpu_buf_size = pagelen;
- retry:
 
+	int ret;
+ retry:
 	ret = ioctl(gdr_fd, HFI1_IOCTL_GDR_GPU_PIN_MMAP, &query_params);
 
 	if (ret) {
@@ -199,29 +218,40 @@ gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf,
 			return NULL;
 		}
 	}
-	host_addr_buf = (void *)query_params.query_params_out.host_buf_addr;
+	void *host_addr_buf = (void *)query_params.query_params_out.host_buf_addr;
 	return host_addr_buf + (buf & GPU_PAGE_OFFSET_MASK);
 }
 
 
-void hfi_gdr_open(){
-	gdr_fd = open(GDR_DEVICE_PATH, O_RDWR);
-	if (-1 == gdr_fd ) {
-		/* Non-Fatal error. If device cannot be found we assume
-		 * that the driver does not support GDR Copy and we fallback
-		 * to sending all GPU messages using rndv protocol
-		 */
-		_HFI_INFO(" Warning: The HFI1 driver installed does not support GPUDirect RDMA"
-				  " fast copy. Turning off GDR fast copy in PSM \n");
-		is_gdr_copy_enabled = 0;
-		return;
+void hfi_gdr_open(void)
+{
+	if (gdr_fd < 0) {
+		psmi_assert(!gdr_refcount);
+		gdr_fd = open(GDR_DEVICE_PATH, O_RDWR);
+		if (-1 == gdr_fd ) {
+			/* Non-Fatal error. If device cannot be found we assume
+			 * that the driver does not support GDR Copy and we fallback
+			 * to sending all GPU messages using rndv protocol
+			 */
+			_HFI_INFO(" Warning: The HFI1 driver installed does not support GPUDirect RDMA"
+					  " fast copy. Turning off GDR fast copy in PSM \n");
+			is_gdr_copy_enabled = 0;
+			return;
+		}
 	}
-	return;
+	gdr_refcount++;
 }
 
-void hfi_gdr_close()
+void hfi_gdr_close(void)
 {
-	close(GDR_FD);
+	if (gdr_fd > -1) {
+		psmi_assert(gdr_refcount);
+		gdr_refcount--;
+		if (!gdr_refcount) {
+			close(gdr_fd);
+			gdr_fd = -1;
+		}
+	}
 }
 
 #endif
diff --git a/psm_hal_gen1/psm_hal_gen1_spio.c b/psm_hal_gen1/psm_hal_gen1_spio.c
index eb9d5aa..5444897 100644
--- a/psm_hal_gen1/psm_hal_gen1_spio.c
+++ b/psm_hal_gen1/psm_hal_gen1_spio.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2017 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2017 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -181,10 +183,7 @@ ips_spio_init(const struct psmi_context *context, struct ptl *ptl,
 
 
 #ifdef PSM_CUDA
-	if (PSMI_IS_CUDA_ENABLED) {
-		PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &ctrl->cuda_pio_buffer,
-				MAX_CUDA_MTU, CU_MEMHOSTALLOC_PORTABLE);
-	}
+	ctrl->cuda_pio_buffer = NULL;
 #endif
 
 	_HFI_PRDBG("ips_spio_init() done\n");
@@ -195,7 +194,7 @@ ips_spio_init(const struct psmi_context *context, struct ptl *ptl,
 static PSMI_HAL_INLINE psm2_error_t ips_spio_fini(struct ips_spio *ctrl)
 {
 #ifdef PSM_CUDA
-	if (PSMI_IS_CUDA_ENABLED)
+	if (PSMI_IS_CUDA_ENABLED && ctrl->cuda_pio_buffer != NULL)
 		PSMI_CUDA_CALL(cuMemFreeHost, (void *) ctrl->cuda_pio_buffer);
 #endif
 	spio_report_stall(ctrl, get_cycles(), 0ULL);
@@ -810,6 +809,10 @@ fi_busy:
 	/* Write to PIO: other blocks of payload */
 #ifdef PSM_CUDA
 	if (is_cuda_payload) {
+		if (ctrl->cuda_pio_buffer == NULL) {
+			PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &ctrl->cuda_pio_buffer,
+							MAX_CUDA_MTU, CU_MEMHOSTALLOC_PORTABLE);
+		}
 		/* Since the implementation of cuMemcpy is unknown,
 		   and the HFI specifies several conditions for how PIO
 		   writes must occur, for safety reasons we should not assume
diff --git a/psm_mpool.c b/psm_mpool.c
index 1f2a365..e36e917 100644
--- a/psm_mpool.c
+++ b/psm_mpool.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -101,7 +103,6 @@ struct mpool {
 
 #ifdef PSM_CUDA
 	alloc_dealloc_callback_fn_t mp_alloc_dealloc_cb;
-	void *mp_alloc_dealloc_cb_context;
 #endif
 };
 
@@ -230,7 +231,7 @@ psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
 			   uint32_t num_obj_max_total, int flags,
 			   psmi_memtype_t statstype,
 			   non_empty_callback_fn_t cb, void *context,
-			   alloc_dealloc_callback_fn_t ad_cb, void *ad_context)
+			   alloc_dealloc_callback_fn_t ad_cb)
 {
 	mpool_t mp;
 
@@ -242,7 +243,6 @@ psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
 		return NULL;
 
 	mp->mp_alloc_dealloc_cb = ad_cb;
-	mp->mp_alloc_dealloc_cb_context = ad_context;
 
 	if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) {
 		psmi_mpool_destroy(mp);
@@ -418,7 +418,6 @@ void psmi_mpool_chunk_dealloc(mpool_t mp, int idx)
 	int j;
 	for (j = 0; j < mp->mp_num_obj_per_chunk; j++)
 		mp->mp_alloc_dealloc_cb(0 /* is not alloc */,
-					mp->mp_alloc_dealloc_cb_context,
 					((void *) mp->mp_elm_vector[idx]) +
 					j * mp->mp_elm_size +
 					sizeof(struct mpool_element));
@@ -509,7 +508,6 @@ static int psmi_mpool_allocate_chunk(mpool_t mp)
 #ifdef PSM_CUDA
 		if (mp->mp_alloc_dealloc_cb)
 			mp->mp_alloc_dealloc_cb(1 /* is alloc */,
-						mp->mp_alloc_dealloc_cb_context,
 						chunk + i * mp->mp_elm_size +
 						sizeof(struct mpool_element));
 #endif
diff --git a/psm_mpool.h b/psm_mpool.h
index 8098f60..97f95ab 100644
--- a/psm_mpool.h
+++ b/psm_mpool.h
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -70,8 +72,7 @@
 
 typedef struct mpool *mpool_t;
 typedef void (*non_empty_callback_fn_t) (void *context);
-typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *context,
-					     void *chunk);
+typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *chunk);
 
 mpool_t
 MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk,
@@ -84,8 +85,7 @@ mpool_t psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
 				   uint32_t num_obj_max_total, int flags,
 				   psmi_memtype_t statstype,
 				   non_empty_callback_fn_t cb, void *context,
-				   alloc_dealloc_callback_fn_t ad_cb,
-				   void *ad_context);
+				   alloc_dealloc_callback_fn_t ad_cb);
 
 void psmi_mpool_destroy(mpool_t mp);
 
diff --git a/psm_mq.c b/psm_mq.c
index a25a581..9f25461 100644
--- a/psm_mq.c
+++ b/psm_mq.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -767,6 +769,7 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len)
 	PSM2_LOG_MSG("entering");
 	psmi_assert(MQE_TYPE_IS_RECV(req->type));
 	psmi_mtucpy_fn_t psmi_mtucpy_fn = psmi_mq_mtucpy;
+
 #ifdef PSM_CUDA
 	if (!req->is_buf_gpu_mem)
 		psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
@@ -785,6 +788,9 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len)
 								    len, 1,
 								    mq->ep->epaddr->proto);
 				psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
+			} else if ((req->flags_user & PSM2_MQ_FLAG_GDRCPY_ONLY) && req->is_buf_gpu_mem) {
+				psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				  "CUDA memcpy not permitted for this operation.");
 			}
 #endif
 			psmi_mtucpy_fn(ubuf, (const void *)req->req_data.buf, copysz);
@@ -808,6 +814,9 @@ psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len)
 							   req->req_data.send_msglen, 1,
 							   mq->ep->epaddr->proto);
 			psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
+		} else if ((req->flags_user & PSM2_MQ_FLAG_GDRCPY_ONLY) && req->is_buf_gpu_mem) {
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+			  "CUDA memcpy not permitted for this operation.");
 		}
 #endif
 
@@ -912,6 +921,7 @@ __psm2_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *
 			recv_req->req_data.recv_msglen = len;
 			recv_req->recv_msgoff = 0;
 			recv_req->req_data.context = context;
+			recv_req->flags_user = flags;
 
 #ifdef PSM_CUDA
 			recv_req->is_buf_gpu_mem = gpu_mem;
@@ -935,6 +945,7 @@ __psm2_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *
 #endif
 
 			recv_req->req_data.context = context;
+			recv_req->flags_user = flags;
 
 			psm2_mq_irecv_inner(mq, recv_req, buf, len);
 		}
@@ -995,6 +1006,7 @@ __psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src,
 		req->req_data.recv_msglen = len;
 		req->recv_msgoff = 0;
 		req->req_data.context = context;
+		req->flags_user = flags;
 
 #ifdef PSM_CUDA
 		req->is_buf_gpu_mem = gpu_mem;
@@ -1023,6 +1035,7 @@ __psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src,
 #endif
 
 		req->req_data.context = context;
+		req->flags_user |= flags;
 
 		psm2_mq_irecv_inner(mq, req, buf, len);
 	}
diff --git a/psm_mq_internal.h b/psm_mq_internal.h
index a1afaf8..0a12058 100644
--- a/psm_mq_internal.h
+++ b/psm_mq_internal.h
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -249,7 +251,6 @@ struct psm2_mq_req {
 	uint32_t prefetch_send_msgoff;
 	int cuda_hostbuf_used;
 	CUipcMemHandle cuda_ipc_handle;
-	CUevent cuda_ipc_event;
 	uint8_t cuda_ipc_handle_attached;
 	uint32_t cuda_ipc_offset;
 	/*
@@ -555,7 +556,14 @@ int psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
 			    uint32_t msglen, uint32_t offset,
 			    const void *payload, uint32_t paylen, int msgorder,
 			    uint32_t opcode, psm2_mq_req_t *req_o);
+
+#ifdef PSM_CUDA
+// GDRCopy code requires ips_proto*, so CUDA-aware implementation must accept
+// ips_proto*
+int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t req, struct ips_proto *proto);
+#else
 int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t req);
+#endif
 
 void psmi_mq_stats_register(psm2_mq_t mq, mpspawn_stats_add_fn add_fn);
 
diff --git a/psm_mq_recv.c b/psm_mq_recv.c
index 642fbc1..e35cbe3 100644
--- a/psm_mq_recv.c
+++ b/psm_mq_recv.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2015 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2015 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -366,6 +368,9 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
 				user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD,
 								(unsigned long)req->req_data.buf,
 								msglen, 1, src->proto);
+			} else if ((req->flags_user & PSM2_MQ_FLAG_GDRCPY_ONLY) && req->is_buf_gpu_mem) {
+				psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				  "CUDA memcpy not permitted for this operation.");
 			}
 #endif
 			mq_copy_tiny((uint32_t *) user_buffer, (uint32_t *) payload, msglen);
@@ -383,6 +388,9 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
 							(unsigned long)req->req_data.buf,
 							msglen, 1, src->proto);
 				psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem;
+			} else if ((req->flags_user & PSM2_MQ_FLAG_GDRCPY_ONLY) && req->is_buf_gpu_mem) {
+				psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				  "CUDA memcpy not permitted for this operation.");
 			}
 #endif
 			if (msglen <= paylen) {
@@ -414,6 +422,9 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
 				req->req_data.buf = gdr_convert_gpu_to_host_addr(GDR_FD,
 						(unsigned long)req->user_gpu_buffer,
 						req->req_data.send_msglen, 1, src->proto);
+			} else if ((req->flags_user & PSM2_MQ_FLAG_GDRCPY_ONLY) && req->is_buf_gpu_mem) {
+				psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				  "CUDA memcpy not permitted for this operation.");
 			}
 #endif
 			if (paylen > 0)
@@ -540,7 +551,11 @@ psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag,
 	return MQ_RET_UNEXP_OK;
 }
 
-int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq)
+int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq
+#ifdef PSM_CUDA
+  , struct ips_proto *proto
+#endif
+)
 {
 	psm2_mq_req_t ereq;
 	uint32_t msglen;
@@ -555,12 +570,18 @@ int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq)
 	ereq->req_data.peer = ureq->req_data.peer;
 	ereq->req_data.tag = ureq->req_data.tag;
 	msglen = mq_set_msglen(ereq, ereq->req_data.buf_len, ureq->req_data.send_msglen);
+	uint8_t *dest = ereq->req_data.buf;
+
+#ifdef PSM_CUDA
+	if (proto && PSMI_USE_GDR_COPY(ereq, msglen)) {
+		dest = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)dest, msglen, 0, proto);
+	}
+#endif
 
 	switch (ureq->state) {
 	case MQ_STATE_COMPLETE:
 		if (ureq->req_data.buf != NULL) {	/* 0-byte don't alloc a sysreq_data.buf */
-			psmi_mq_mtucpy(ereq->req_data.buf, (const void *)ureq->req_data.buf,
-				       msglen);
+			psmi_mq_mtucpy(dest, (const void *)ureq->req_data.buf, msglen);
 			psmi_mq_sysbuf_free(mq, ureq->req_data.buf);
 		}
 		ereq->state = MQ_STATE_COMPLETE;
@@ -574,7 +595,7 @@ int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq)
 		ereq->send_msgoff = ureq->send_msgoff;
 		ereq->recv_msgoff = min(ureq->recv_msgoff, msglen);
 		if (ereq->recv_msgoff) {
-			psmi_mq_mtucpy(ereq->req_data.buf,
+			psmi_mq_mtucpy(dest,
 				       (const void *)ureq->req_data.buf,
 				       ereq->recv_msgoff);
 		}
@@ -590,7 +611,7 @@ int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq)
 		ereq->send_msgoff = ureq->send_msgoff;
 		ereq->recv_msgoff = min(ureq->recv_msgoff, msglen);
 		if (ereq->recv_msgoff) {
-			psmi_mq_mtucpy(ereq->req_data.buf,
+			psmi_mq_mtucpy(dest,
 				       (const void *)ureq->req_data.buf,
 				       ereq->recv_msgoff);
 		}
diff --git a/psm_mq_utils.c b/psm_mq_utils.c
index a0409db..3032776 100644
--- a/psm_mq_utils.c
+++ b/psm_mq_utils.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2015 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2015 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -93,6 +95,7 @@ psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type)
 #ifdef PSM_CUDA
 		req->is_buf_gpu_mem = 0;
 		req->user_gpu_buffer = NULL;
+		req->cuda_ipc_handle_attached = 0;
 #endif
 		req->flags_user = 0;
 		req->flags_internal = 0;
@@ -114,19 +117,6 @@ psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type)
 }
 MOCK_DEF_EPILOGUE(psmi_mq_req_alloc);
 
-#ifdef PSM_CUDA
-void psmi_cuda_recvreq_alloc_func(int is_alloc, void* context, void* obj) {
-	psm2_mq_req_t recvreq = (psm2_mq_req_t)obj;
-	if (PSMI_IS_CUDA_ENABLED) {
-		if (is_alloc)
-			PSMI_CUDA_CALL(cuEventCreate, &recvreq->cuda_ipc_event, CU_EVENT_DEFAULT);
-		else
-			PSMI_CUDA_CALL(cuEventDestroy, recvreq->cuda_ipc_event);
-	}
-	return;
-}
-#endif
-
 psm2_error_t psmi_mq_req_init(psm2_mq_t mq)
 {
 	psm2_mq_req_t warmup_req;
@@ -165,29 +155,6 @@ psm2_error_t psmi_mq_req_init(psm2_mq_t mq)
 		if ((err =
 		     psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz)))
 			goto fail;
-		/* Have a callback function for receive req mpool which creates
-		 * and destroy events.
-		 */
-#ifdef PSM_CUDA
-		if (PSMI_IS_CUDA_ENABLED) {
-			if ((mq->rreq_pool =
-	                     psmi_mpool_create_for_cuda(sizeof(struct psm2_mq_req), chunksz,
-                                       maxsz, 0, DESCRIPTORS, NULL,
-                                       NULL, psmi_cuda_recvreq_alloc_func, NULL)) == NULL) {
-				err = PSM2_NO_MEMORY;
-				goto fail;
-			}
-		}
-		else {
-			if ((mq->rreq_pool =
-				psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
-                                       maxsz, 0, DESCRIPTORS, NULL,
-                                       NULL)) == NULL) {
-				err = PSM2_NO_MEMORY;
-				goto fail;
-			}
-		}
-#else
 		if ((mq->rreq_pool =
 			psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
 				       maxsz, 0, DESCRIPTORS, NULL,
@@ -195,7 +162,6 @@ psm2_error_t psmi_mq_req_init(psm2_mq_t mq)
 			err = PSM2_NO_MEMORY;
 			goto fail;
 		}
-#endif
 	}
 
 	/* Warm up the allocators */
diff --git a/psm_user.h b/psm_user.h
index 09477c5..523b9fb 100644
--- a/psm_user.h
+++ b/psm_user.h
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -296,6 +298,7 @@ void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak));
 #endif
 
 #ifdef PSM_CUDA
+
 #include <cuda.h>
 #include <driver_types.h>
 
@@ -305,12 +308,12 @@ void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak));
 
 extern int is_cuda_enabled;
 extern int is_gdr_copy_enabled;
-extern int device_support_gpudirect;
-extern int gpu_p2p_supported;
+extern int _device_support_gpudirect;
+extern int _gpu_p2p_supported;
 extern int my_gpu_device;
 extern int cuda_lib_version;
 
-extern CUcontext ctxt;
+extern CUcontext cu_ctxt;
 extern void *psmi_cuda_lib;
 
 extern CUresult (*psmi_cuInit)(unsigned int  Flags );
@@ -326,6 +329,7 @@ extern CUresult (*psmi_cuDriverGetVersion)(int* driverVersion);
 extern CUresult (*psmi_cuDeviceGetCount)(int* count);
 extern CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags);
 extern CUresult (*psmi_cuStreamDestroy)(CUstream phStream);
+extern CUresult (*psmi_cuStreamSynchronize)(CUstream phStream);
 extern CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags);
 extern CUresult (*psmi_cuEventDestroy)(CUevent hEvent);
 extern CUresult (*psmi_cuEventQuery)(CUevent hEvent);
@@ -348,14 +352,35 @@ extern CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev);
 extern CUresult (*psmi_cuCtxGetDevice)(CUdevice* device);
 extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device);
 
+static int check_set_cuda_ctxt(void)
+{
+	CUresult err;
+	CUcontext tmpctxt = {0};
+
+	if (!psmi_cuCtxGetCurrent || !psmi_cuCtxSetCurrent)
+		return 0;
+
+	err = psmi_cuCtxGetCurrent(&tmpctxt);
+	if (!err) {
+		if (!tmpctxt && cu_ctxt) {
+			err = psmi_cuCtxSetCurrent(cu_ctxt);
+			return !!err;
+		} else if (tmpctxt && !cu_ctxt) {
+			cu_ctxt = tmpctxt;
+		}
+	}
+	return 0;
+}
+
 #define PSMI_CUDA_CALL(func, args...) do {				\
+		_HFI_CUDADBG("func=psmi_"#func"\n"); \
 		CUresult cudaerr;					\
+		if (check_set_cuda_ctxt()) { \
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
+				"Failed to set/synchronize CUDA context.\n"); \
+		} \
 		cudaerr = psmi_##func(args);				\
 		if (cudaerr != CUDA_SUCCESS) {				\
-			if (ctxt == NULL)				\
-				_HFI_ERROR(				\
-				"Check if CUDA is initialized"	\
-				"before psm2_ep_open call \n");		\
 			_HFI_ERROR(					\
 				"CUDA failure: %s() (at %s:%d)"		\
 				"returned %d\n",			\
@@ -366,6 +391,92 @@ extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device);
 		}							\
 	} while (0)
 
+PSMI_ALWAYS_INLINE(
+int device_support_gpudirect())
+{
+	if (_device_support_gpudirect > -1) return _device_support_gpudirect;
+
+	int num_devices, dev;
+	
+	/* Check if all devices support Unified Virtual Addressing. */
+	PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices);
+
+	_device_support_gpudirect = 1;
+
+	for (dev = 0; dev < num_devices; dev++) {
+		CUdevice device;
+		PSMI_CUDA_CALL(cuDeviceGet, &device, dev);
+		int unifiedAddressing;
+		PSMI_CUDA_CALL(cuDeviceGetAttribute,
+				&unifiedAddressing,
+				CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING,
+				device);
+
+		if (unifiedAddressing !=1) {
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_EP_DEVICE_FAILURE,
+				"CUDA device %d does not support Unified Virtual Addressing.\n",
+				dev);
+		}
+
+		int major;
+		PSMI_CUDA_CALL(cuDeviceGetAttribute,
+				&major,
+				CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+				device);
+		if (major < 3) {
+			_device_support_gpudirect = 0;
+			_HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev);
+		}
+	}
+
+	return _device_support_gpudirect;
+}
+
+#define PSMI_IS_CUDA_ENABLED  likely(is_cuda_enabled)
+#define PSMI_IS_CUDA_DISABLED unlikely(!is_cuda_enabled)
+
+PSMI_ALWAYS_INLINE(
+int gpu_p2p_supported())
+{
+	if (likely(_gpu_p2p_supported > -1)) return _gpu_p2p_supported;
+
+	if (PSMI_IS_CUDA_DISABLED) {
+		_gpu_p2p_supported=0; 
+		return 0;
+	}
+
+	int num_devices, dev;
+	
+	/* Check which devices the current device has p2p access to. */
+	CUdevice current_device;
+	PSMI_CUDA_CALL(cuCtxGetDevice, &current_device);
+	PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices);
+
+	_gpu_p2p_supported = 0;
+
+	for (dev = 0; dev < num_devices; dev++) {
+		CUdevice device;
+		PSMI_CUDA_CALL(cuDeviceGet, &device, dev);
+
+		if (device != current_device) {
+			int canAccessPeer = 0;
+			PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer,
+					current_device, device);
+
+			if (canAccessPeer != 1)
+				_HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev);
+			else
+				_gpu_p2p_supported |= (1 << device);
+		} else {
+			/* Always support p2p on the same GPU */
+			my_gpu_device = device;
+			_gpu_p2p_supported |= (1 << device);
+		}
+	}
+
+	return _gpu_p2p_supported;
+}
+
 /**
  * Similar to PSMI_CUDA_CALL() except does not error out
  * if func(args) returns CUDA_SUCCESS or except_err
@@ -378,9 +489,14 @@ extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device);
  * DBG level.
  */
 #define PSMI_CUDA_CALL_EXCEPT(except_err, func, args...) do { \
+		_HFI_CUDADBG("func=psmi_"#func",except_err=%d\n", except_err); \
+		if (check_set_cuda_ctxt()) { \
+			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
+				"Failed to set/synchronize CUDA context.\n"); \
+		} \
 		cudaerr = psmi_##func(args);				\
 		if (cudaerr != CUDA_SUCCESS && cudaerr != except_err) {	\
-			if (ctxt == NULL)				\
+			if (cu_ctxt == NULL)				\
 				_HFI_ERROR(				\
 				"Check if CUDA is initialized"	\
 				"before psm2_ep_open call \n");		\
@@ -400,6 +516,7 @@ extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device);
 	} while (0)
 
 #define PSMI_CUDA_CHECK_EVENT(event, cudaerr) do {			\
+		_HFI_CUDADBG("event=%p\n", (void*)(event)); \
 		cudaerr = psmi_cuEventQuery(event);			\
 		if ((cudaerr != CUDA_SUCCESS) &&			\
 		    (cudaerr != CUDA_ERROR_NOT_READY)) {		\
@@ -442,9 +559,6 @@ _psmi_is_cuda_mem(const void *ptr))
 		return 0;
 }
 
-#define PSMI_IS_CUDA_ENABLED  likely(is_cuda_enabled)
-#define PSMI_IS_CUDA_DISABLED unlikely(!is_cuda_enabled)
-
 PSMI_ALWAYS_INLINE(
 int
 _psmi_is_gdr_copy_enabled())
@@ -473,7 +587,7 @@ struct ips_cuda_hostbuf {
 struct ips_cuda_hostbuf_mpool_cb_context {
 	unsigned bufsz;
 };
-void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj);
+void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *obj);
 
 #define CUDA_HOSTBUFFER_LIMITS {				\
 	    .env = "PSM_CUDA_BOUNCEBUFFERS_MAX",		\
@@ -500,9 +614,10 @@ extern uint32_t gdr_copy_threshold_send;
  */
 extern uint32_t gdr_copy_threshold_recv;
 
-#define PSMI_USE_GDR_COPY(req, len) req->is_buf_gpu_mem &&       \
-				    PSMI_IS_GDR_COPY_ENABLED  && \
-				    len >=1 && len <= gdr_copy_threshold_recv
+#define PSMI_USE_GDR_COPY(req, len) \
+	req->is_buf_gpu_mem && \
+	PSMI_IS_GDR_COPY_ENABLED  && \
+	(req->flags_user & PSM2_MQ_FLAG_GDRCPY_ONLY || (len >=1 && len <= gdr_copy_threshold_recv))
 
 enum psm2_chb_match_type {
 	/* Complete data found in a single chb */
diff --git a/ptl_am/am_cuda_memhandle_cache.c b/ptl_am/am_cuda_memhandle_cache.c
index 730562d..1f34ac2 100644
--- a/ptl_am/am_cuda_memhandle_cache.c
+++ b/ptl_am/am_cuda_memhandle_cache.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -168,7 +170,7 @@ static void print_cuda_memhandle_cache_stats(void)
  * which helps in closing all memhandles.
  */
 static void
-psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj)
+psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* obj)
 {
 	cl_map_item_t* memcache_item = (cl_map_item_t*)obj;
 	if (!is_alloc) {
@@ -196,8 +198,7 @@ am_cuda_memhandle_mpool_init(uint32_t memcache_size)
 					cuda_memhandle_cache_size,
 					cuda_memhandle_cache_size, 0,
 					UNDEFINED, NULL, NULL,
-					psmi_cuda_memhandle_cache_alloc_func,
-					NULL);
+					psmi_cuda_memhandle_cache_alloc_func);
 	if (cuda_memhandle_mpool == NULL) {
 		err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY,
 				"Couldn't allocate CUDA host receive buffer pool");
diff --git a/ptl_am/am_reqrep_shmem.c b/ptl_am/am_reqrep_shmem.c
index 9be72f9..384fb50 100644
--- a/ptl_am/am_reqrep_shmem.c
+++ b/ptl_am/am_reqrep_shmem.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -2099,11 +2101,12 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr,
 
 #ifdef PSM_CUDA
 	int gpu_mem = 0;
-	int ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & gpu_p2p_supported;
+	int ep_supports_p2p;
 
 	if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)) {
 		gpu_mem = 1;
 
+		ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & gpu_p2p_supported();
 		/* All sends from a gpu buffer use the rendezvous protocol if p2p is supported */
 		if (ep_supports_p2p) {
 			goto do_rendezvous;
diff --git a/ptl_am/ptl.c b/ptl_am/ptl.c
index 2e42c1b..d43fde6 100644
--- a/ptl_am/ptl.c
+++ b/ptl_am/ptl.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -96,8 +98,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted,
 		if (req->is_buf_gpu_mem) {
 			PSMI_CUDA_CALL(cuMemcpyDtoD, (CUdeviceptr)req->req_data.buf, cuda_ipc_dev_ptr,
 				       req->req_data.recv_msglen);
-			PSMI_CUDA_CALL(cuEventRecord, req->cuda_ipc_event, 0);
-			PSMI_CUDA_CALL(cuEventSynchronize, req->cuda_ipc_event);
+			PSMI_CUDA_CALL(cuStreamSynchronize, 0);
 		} else
 			PSMI_CUDA_CALL(cuMemcpyDtoH, req->req_data.buf, cuda_ipc_dev_ptr,
 				       req->req_data.recv_msglen);
@@ -129,8 +130,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted,
 			 * copies for msg sizes less than 64k. The event record
 			 * and synchronize calls are to guarentee completion.
 			 */
-			PSMI_CUDA_CALL(cuEventRecord, req->cuda_ipc_event, 0);
-			PSMI_CUDA_CALL(cuEventSynchronize, req->cuda_ipc_event);
+			PSMI_CUDA_CALL(cuStreamSynchronize, 0);
 			psmi_free(cuda_ipc_bounce_buf);
 		} else {
 			/* cma can be done in handler context or not. */
@@ -206,7 +206,7 @@ psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf,
 	tag.tag[1] = args[1].u32w0;
 	tag.tag[2] = args[2].u32w1;
 	psmi_assert(toki != NULL);
-	_HFI_VDBG("mq=%p opcode=%d, len=%d, msglen=%d\n",
+	_HFI_VDBG("mq=%p opcode=0x%x, len=%d, msglen=%d\n",
 		  tok->mq, opcode, (int)len, msglen);
 
 	switch (opcode) {
diff --git a/ptl_ips/ips_proto.c b/ptl_ips/ips_proto.c
index dfd03e6..d1489e6 100644
--- a/ptl_ips/ips_proto.c
+++ b/ptl_ips/ips_proto.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -95,22 +97,17 @@ static psm2_error_t proto_sdma_init(struct ips_proto *proto,
 				   const psmi_context_t *context);
 
 #ifdef PSM_CUDA
-void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj)
+void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *obj)
 {
-	struct ips_cuda_hostbuf *icb;
-	struct ips_cuda_hostbuf_mpool_cb_context *ctxt =
-		(struct ips_cuda_hostbuf_mpool_cb_context *) context;
-
-	icb = (struct ips_cuda_hostbuf *)obj;
+	struct ips_cuda_hostbuf *icb = (struct ips_cuda_hostbuf *)obj;
 	if (is_alloc) {
-		PSMI_CUDA_CALL(cuMemHostAlloc,
-			       (void **) &icb->host_buf,
-			       ctxt->bufsz,
-			       CU_MEMHOSTALLOC_PORTABLE);
-		PSMI_CUDA_CALL(cuEventCreate, &icb->copy_status, CU_EVENT_DEFAULT);
+		icb->host_buf = NULL;
+		icb->copy_status = NULL;
 	} else {
-		if (icb->host_buf) {
+		if (icb->host_buf != NULL) {
 			PSMI_CUDA_CALL(cuMemFreeHost, icb->host_buf);
+		}
+		if (icb->copy_status != NULL) {
 			PSMI_CUDA_CALL(cuEventDestroy, icb->copy_status);
 		}
 	}
@@ -520,10 +517,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
 
 	if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) {
 #ifdef PSM_CUDA
-		if (PSMI_IS_CUDA_ENABLED) {
-			PSMI_CUDA_CALL(cuStreamCreate,
-				   &proto->cudastream_send, CU_STREAM_NON_BLOCKING);
-		}
+		proto->cudastream_send = NULL;
 #endif
 		proto->scbc_rv = NULL;
 		if ((err = ips_protoexp_init(context, proto, protoexp_flags,
@@ -635,14 +629,34 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
 				PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
 				(union psmi_envvar_val)0, /* Disabled by default */
 				&env_gpudirect_rdma);
+        /* Use GPUDirect RDMA for SDMA send? */
+        union psmi_envvar_val env_gpudirect_rdma_send;
+        psmi_getenv("PSM2_GPUDIRECT_RDMA_SEND",
+                                "Use GPUDirect RDMA support to allow the HFI to directly"
+                                " read from the GPU for SDMA.  Requires driver"
+                                " support.(default is disabled i.e. 0)",
+                                PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+                                (union psmi_envvar_val)0, /* Disabled by default */
+                                &env_gpudirect_rdma_send);
+ 
+        /* Use GPUDirect RDMA for recv? */
+        union psmi_envvar_val env_gpudirect_rdma_recv;
+        psmi_getenv("PSM2_GPUDIRECT_RDMA_RECV",
+                                "Use GPUDirect RDMA support to allow the HFI to directly"
+                                " write into GPU.  Requires driver support.(default is"
+                                " disabled i.e. 0)",
+                                PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+                                (union psmi_envvar_val)0, /* Disabled by default */
+                                &env_gpudirect_rdma_recv);
+
 	/* The following cases need to be handled:
 	 * 1) GPU DIRECT is turned off but GDR COPY is turned on by the user or
 	 *    by default - Turn off GDR COPY
 	 * 2) GPU DIRECT is on but GDR COPY is turned off by the user - Leave
 	 *.   this config as it is.
 	 */
-	if (!env_gpudirect_rdma.e_uint)
-		is_gdr_copy_enabled = 0;
+        if (!env_gpudirect_rdma.e_uint && !env_gpudirect_rdma_send.e_uint && !env_gpudirect_rdma_recv.e_uint)		
+                is_gdr_copy_enabled = 0;
 
 	/* Default Send threshold for Gpu-direct set to 30000 */
 	union psmi_envvar_val env_gpudirect_send_thresh;
@@ -659,7 +673,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
 		    (union psmi_envvar_val)UINT_MAX, &env_gpudirect_recv_thresh);
 	gpudirect_recv_threshold = env_gpudirect_recv_thresh.e_uint;
 
-	if (env_gpudirect_rdma.e_uint && device_support_gpudirect) {
+	if (env_gpudirect_rdma.e_uint && device_support_gpudirect()) {
 		if (PSMI_IS_CUDA_DISABLED ||
 			/* All pio, No SDMA*/
 			(proto->flags & IPS_PROTO_FLAG_SPIO) ||
@@ -675,16 +689,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
 		 * experimentation and will not be documented for any customers.
 		 */
 		/* Use GPUDirect RDMA for SDMA send? */
-		union psmi_envvar_val env_gpudirect_rdma_send;
-		psmi_getenv("PSM2_GPUDIRECT_RDMA_SEND",
-					"Use GPUDirect RDMA support to allow the HFI to directly"
-					" read from the GPU for SDMA.  Requires driver"
-					" support.(default is disabled i.e. 0)",
-					PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
-					(union psmi_envvar_val)0, /* Disabled by default */
-					&env_gpudirect_rdma_send);
-
-		if (env_gpudirect_rdma_send.e_uint && device_support_gpudirect) {
+		if (env_gpudirect_rdma_send.e_uint && device_support_gpudirect()) {
 			if (PSMI_IS_CUDA_DISABLED ||
 				/* All pio, No SDMA*/
 				(proto->flags & IPS_PROTO_FLAG_SPIO))
@@ -695,16 +700,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
 			proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND;
 		}
 		/* Use GPUDirect RDMA for recv? */
-		union psmi_envvar_val env_gpudirect_rdma_recv;
-		psmi_getenv("PSM2_GPUDIRECT_RDMA_RECV",
-					"Use GPUDirect RDMA support to allow the HFI to directly"
-					" write into GPU.  Requires driver support.(default is"
-					" disabled i.e. 0)",
-					PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
-					(union psmi_envvar_val)0, /* Disabled by default */
-					&env_gpudirect_rdma_recv);
-
-		if (env_gpudirect_rdma_recv.e_uint && device_support_gpudirect) {
+		if (env_gpudirect_rdma_recv.e_uint && device_support_gpudirect()) {
 			if (PSMI_IS_CUDA_DISABLED ||
 				!(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED))
 					err = psmi_handle_error(PSMI_EP_NORETURN,
@@ -734,9 +730,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
 			psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
 						   chunksz, max_elements, 0,
 						   UNDEFINED, NULL, NULL,
-						   psmi_cuda_hostbuf_alloc_func,
-						   (void *)
-						   &proto->cuda_hostbuf_send_cfg);
+						   psmi_cuda_hostbuf_alloc_func);
 
 		if (proto->cuda_hostbuf_pool_send == NULL) {
 			err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
@@ -750,9 +744,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl,
 			psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
 						   chunksz, max_elements, 0,
 						   UNDEFINED, NULL, NULL,
-						   psmi_cuda_hostbuf_alloc_func,
-						   (void *)
-						   &proto->cuda_hostbuf_small_send_cfg);
+						   psmi_cuda_hostbuf_alloc_func);
 
 		if (proto->cuda_hostbuf_pool_small_send == NULL) {
 			err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
@@ -928,7 +920,7 @@ ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in)
 #endif
 
 #ifdef PSM_CUDA
-	if (PSMI_IS_CUDA_ENABLED) {
+	if (PSMI_IS_CUDA_ENABLED && proto->cudastream_send) {
 		PSMI_CUDA_CALL(cuStreamDestroy, proto->cudastream_send);
 	}
 #endif
diff --git a/ptl_ips/ips_proto.h b/ptl_ips/ips_proto.h
index dc8e7d4..221edd6 100644
--- a/ptl_ips/ips_proto.h
+++ b/ptl_ips/ips_proto.h
@@ -664,7 +664,6 @@ int ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev);
 int ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev);
 int ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev);
 int ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev);
-void ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl);
 int ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev);
 
 psm2_error_t ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr,
diff --git a/ptl_ips/ips_proto_connect.c b/ptl_ips/ips_proto_connect.c
index a608760..f650a36 100644
--- a/ptl_ips/ips_proto_connect.c
+++ b/ptl_ips/ips_proto_connect.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2015 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2015 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -905,7 +907,7 @@ ptl_handle_connect_req(struct ips_proto *proto, psm2_epaddr_t epaddr,
 			if (!proto->done_warning) {
 				psmi_syslog(proto->ep, 1, LOG_INFO,
 					    "Non-fatal connection problem: Received an out-of-context "
-					    "connection message from host %s LID=0x%x context=%d. (Ignoring)",
+					    "connection message from host %s LID=0x%x context=%"PRId64". (Ignoring)",
 					    req->hostname,
 					    (int)psm2_epid_nid(req->epid),
 					    psm2_epid_context(req->epid));
diff --git a/ptl_ips/ips_proto_expected.c b/ptl_ips/ips_proto_expected.c
index 7e7e997..b25b2a3 100644
--- a/ptl_ips/ips_proto_expected.c
+++ b/ptl_ips/ips_proto_expected.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -370,9 +372,7 @@ MOCKABLE(ips_protoexp_init)(const psmi_context_t *context,
 				psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
 							   chunksz, max_elements, 0,
 							   UNDEFINED, NULL, NULL,
-							   psmi_cuda_hostbuf_alloc_func,
-							   (void *)
-							   &protoexp->cuda_hostbuf_recv_cfg);
+							   psmi_cuda_hostbuf_alloc_func);
 
 			if (protoexp->cuda_hostbuf_pool_recv == NULL) {
 				err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
@@ -386,9 +386,7 @@ MOCKABLE(ips_protoexp_init)(const psmi_context_t *context,
 				psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf),
 							   chunksz, max_elements, 0,
 							   UNDEFINED, NULL, NULL,
-							   psmi_cuda_hostbuf_alloc_func,
-							   (void *)
-							   &protoexp->cuda_hostbuf_small_recv_cfg);
+							   psmi_cuda_hostbuf_alloc_func);
 
 			if (protoexp->cuda_hostbuf_pool_small_recv == NULL) {
 				err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY,
@@ -396,9 +394,7 @@ MOCKABLE(ips_protoexp_init)(const psmi_context_t *context,
 				goto fail;
 			}
 
-			PSMI_CUDA_CALL(cuStreamCreate,
-				&protoexp->cudastream_recv,
-				CU_STREAM_NON_BLOCKING);
+			protoexp->cudastream_recv = NULL;
 			STAILQ_INIT(&protoexp->cudapend_getreqsq);
 		} else {
 			protoexp->cuda_hostbuf_pool_recv = NULL;
@@ -437,7 +433,9 @@ psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp)
 		 !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) {
 		psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv);
 		psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv);
-		PSMI_CUDA_CALL(cuStreamDestroy, protoexp->cudastream_recv);
+		if (protoexp->cudastream_recv != NULL) {
+			PSMI_CUDA_CALL(cuStreamDestroy, protoexp->cudastream_recv);
+		}
 	}
 #endif
 	psmi_mpool_destroy(protoexp->tid_getreq_pool);
@@ -922,11 +920,15 @@ int ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev)
 	 * assert(0 < paylen < MTU).
 	 */
 	if (psmi_hal_has_cap(PSM_HAL_CAP_RSM_FECN_SUPP) &&
-	    ips_recvhdrq_event_payload(rcv_ev) &&
-	    ips_recvhdrq_event_paylen(rcv_ev))
-		psmi_mq_mtucpy(tidrecvc->buffer + p_hdr->exp_offset,
-			       ips_recvhdrq_event_payload(rcv_ev),
-			       ips_recvhdrq_event_paylen(rcv_ev));
+			ips_recvhdrq_event_payload(rcv_ev) &&
+			ips_recvhdrq_event_paylen(rcv_ev)) {
+
+		psmi_assert(p_hdr->exp_offset + ips_recvhdrq_event_paylen(rcv_ev) <= tidrecvc->recv_tidbytes);
+
+		psmi_mq_mtucpy(tidrecvc->buffer + tidrecvc->tid_list.tsess_unaligned_start + p_hdr->exp_offset,
+			ips_recvhdrq_event_payload(rcv_ev),
+			ips_recvhdrq_event_paylen(rcv_ev));
+	}
 
 	/* If last packet then we are done. We send a tid transfer completion
 	 * packet back to sender, free all tids and close the current tidflow
@@ -1094,12 +1096,17 @@ void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp,
 		window_len =
 			ips_cuda_next_window(tidsendc->ipsaddr->window_rv,
 					     offset, req->req_data.buf_len);
-		if (window_len <= CUDA_SMALLHOSTBUF_SZ)
+		unsigned bufsz;
+		if (window_len <= CUDA_SMALLHOSTBUF_SZ) {
 			chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
 				proto->cuda_hostbuf_pool_small_send);
-		if (chb == NULL)
+			bufsz = proto->cuda_hostbuf_small_send_cfg.bufsz;
+		}
+		if (chb == NULL) {
 			chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
 				proto->cuda_hostbuf_pool_send);
+			bufsz = proto->cuda_hostbuf_send_cfg.bufsz;
+		}
 		/* were any buffers available for the prefetcher? */
 		if (chb == NULL)
 			return;
@@ -1109,6 +1116,20 @@ void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp,
 		chb->req = req;
 		chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset;
 		chb->bytes_read = 0;
+
+		if (proto->cudastream_send == NULL) {
+			PSMI_CUDA_CALL(cuStreamCreate,
+				   &proto->cudastream_send, CU_STREAM_NON_BLOCKING);
+		}
+		if (chb->host_buf == NULL) {
+			PSMI_CUDA_CALL(cuMemHostAlloc,
+				       (void **) &chb->host_buf,
+				       bufsz,
+				       CU_MEMHOSTALLOC_PORTABLE);
+		}
+		if (chb->copy_status == NULL) {
+			PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT);
+		}
 		PSMI_CUDA_CALL(cuMemcpyDtoHAsync,
 			       chb->host_buf, chb->gpu_buf,
 			       window_len,
@@ -1143,12 +1164,17 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp,
 		window_len =
 			ips_cuda_next_window(tidsendc->ipsaddr->window_rv,
 					     offset, req->req_data.buf_len);
-		if (window_len <= CUDA_SMALLHOSTBUF_SZ)
+		unsigned bufsz;
+		if (window_len <= CUDA_SMALLHOSTBUF_SZ) {
 			chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
 				proto->cuda_hostbuf_pool_small_send);
-		if (chb == NULL)
+			bufsz = proto->cuda_hostbuf_small_send_cfg.bufsz;
+		}
+		if (chb == NULL) {
 			chb = (struct ips_cuda_hostbuf *) psmi_mpool_get(
 				proto->cuda_hostbuf_pool_send);
+			bufsz = proto->cuda_hostbuf_send_cfg.bufsz;
+		}
 
 		/* were any buffers available? If not force allocate */
 		if (chb == NULL) {
@@ -1162,6 +1188,19 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp,
 		chb->req = req;
 		chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset;
 		chb->bytes_read = 0;
+		if (proto->cudastream_send == NULL) {
+			PSMI_CUDA_CALL(cuStreamCreate,
+				   &proto->cudastream_send, CU_STREAM_NON_BLOCKING);
+		}
+		if (chb->host_buf == NULL) {
+			PSMI_CUDA_CALL(cuMemHostAlloc,
+				       (void **) &chb->host_buf,
+				       bufsz,
+				       CU_MEMHOSTALLOC_PORTABLE);
+		}
+		if (chb->copy_status == NULL) {
+			PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT);
+		}
 		PSMI_CUDA_CALL(cuMemcpyDtoHAsync,
 			       chb->host_buf, chb->gpu_buf,
 			       window_len,
@@ -2047,14 +2086,19 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp,
 	/* 4. allocate a cuda bounce buffer, if required */
 	struct ips_cuda_hostbuf *chb = NULL;
 	if (getreq->cuda_hostbuf_used) {
-		if (nbytes_this <= CUDA_SMALLHOSTBUF_SZ)
+		unsigned bufsz;
+		if (nbytes_this <= CUDA_SMALLHOSTBUF_SZ) {
 			chb = (struct ips_cuda_hostbuf *)
 				psmi_mpool_get(
 					protoexp->cuda_hostbuf_pool_small_recv);
-		if (chb == NULL)
+			bufsz = protoexp->cuda_hostbuf_small_recv_cfg.bufsz;
+		}
+		if (chb == NULL) {
 			chb = (struct ips_cuda_hostbuf *)
 				psmi_mpool_get(
 					protoexp->cuda_hostbuf_pool_recv);
+			bufsz = protoexp->cuda_hostbuf_recv_cfg.bufsz;
+		}
 		if (chb == NULL) {
 			/* Unable to get a cudahostbuf for TID.
 			 * Release the resources we're holding and reschedule.*/
@@ -2069,6 +2113,12 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp,
 			return PSM2_EP_NO_RESOURCES;
 		}
 
+		if (chb->host_buf == NULL) {
+			PSMI_CUDA_CALL(cuMemHostAlloc,
+				       (void **) &chb->host_buf,
+				       bufsz,
+				       CU_MEMHOSTALLOC_PORTABLE);
+		}
 		tidrecvc->cuda_hostbuf = chb;
 		tidrecvc->buffer = chb->host_buf;
 		chb->size = 0;
@@ -2423,11 +2473,20 @@ void psmi_cudamemcpy_tid_to_device(struct ips_tid_recv_desc *tidrecvc)
 	chb->size += tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start +
 			tidrecvc->tid_list.tsess_unaligned_end;
 
+	if (protoexp->cudastream_recv == NULL) {
+		PSMI_CUDA_CALL(cuStreamCreate,
+			&protoexp->cudastream_recv,
+			CU_STREAM_NON_BLOCKING);
+	}
+
 	PSMI_CUDA_CALL(cuMemcpyHtoDAsync,
 		       chb->gpu_buf, chb->host_buf,
 		       tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start +
 							tidrecvc->tid_list.tsess_unaligned_end,
 		       protoexp->cudastream_recv);
+	if (chb->copy_status == NULL) {
+		PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT);
+	}
 	PSMI_CUDA_CALL(cuEventRecord, chb->copy_status,
 		       protoexp->cudastream_recv);
 
diff --git a/ptl_ips/ips_proto_mq.c b/ptl_ips/ips_proto_mq.c
index 8a047c6..032aa33 100644
--- a/ptl_ips/ips_proto_mq.c
+++ b/ptl_ips/ips_proto_mq.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2016 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -486,14 +488,19 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req,
 				ips_cuda_next_window(ipsaddr->window_rv,
 						     offset, len);
 
-			if (window_len <= CUDA_SMALLHOSTBUF_SZ)
+			unsigned bufsz;
+			if (window_len <= CUDA_SMALLHOSTBUF_SZ) {
 				chb = (struct ips_cuda_hostbuf *)
 					psmi_mpool_get(
 					proto->cuda_hostbuf_pool_small_send);
-			if (chb == NULL)
+				bufsz = proto->cuda_hostbuf_small_send_cfg.bufsz;
+			}
+			if (chb == NULL) {
 				chb = (struct ips_cuda_hostbuf *)
 					psmi_mpool_get(
 					proto->cuda_hostbuf_pool_send);
+				bufsz = proto->cuda_hostbuf_send_cfg.bufsz;
+			}
 
 			/* any buffers available? */
 			if (chb == NULL)
@@ -507,6 +514,19 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req,
 			chb->gpu_buf = (CUdeviceptr) buf + offset;
 			chb->bytes_read = 0;
 
+			if (proto->cudastream_send == NULL) {
+				PSMI_CUDA_CALL(cuStreamCreate,
+					   &proto->cudastream_send, CU_STREAM_NON_BLOCKING);
+			}
+			if (chb->host_buf == NULL) {
+				PSMI_CUDA_CALL(cuMemHostAlloc,
+					       (void **) &chb->host_buf,
+					       bufsz,
+					       CU_MEMHOSTALLOC_PORTABLE);
+			}
+			if (chb->copy_status == NULL) {
+				PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT);
+			}
 			PSMI_CUDA_CALL(cuMemcpyDtoHAsync,
 				       chb->host_buf, chb->gpu_buf,
 				       window_len,
@@ -1411,6 +1431,9 @@ ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev)
 	return IPS_RECVHDRQ_CONTINUE;
 }
 
+static void ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl,
+  struct ips_proto *proto);
+
 int
 ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev)
 {
@@ -1515,7 +1538,7 @@ ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev)
 		/* XXX if blocking, break out of progress loop */
 
 		if (msgctl->outoforder_count)
-			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl, rcv_ev->proto);
 
 		if (rc == MQ_RET_UNEXP_OK)
 			ret = IPS_RECVHDRQ_BREAK;
@@ -1569,7 +1592,7 @@ ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev)
 	 */
 	psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
 
-	_HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n",
+	_HFI_VDBG("tag=%08x.%08x.%08x opcode=0x%x, msglen=%d\n",
 		  p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2],
 		  OPCODE_TINY, p_hdr->hdr_data.u32w1);
 
@@ -1602,7 +1625,7 @@ ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev)
 		ipsaddr->msg_toggle = 0;
 
 		if (msgctl->outoforder_count)
-			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl, rcv_ev->proto);
 
 		if (rc == MQ_RET_UNEXP_OK)
 			ret = IPS_RECVHDRQ_BREAK;
@@ -1655,7 +1678,7 @@ ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev)
 	 */
 	psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
 
-	_HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n",
+	_HFI_VDBG("tag=%08x.%08x.%08x opcode=0x%x, msglen=%d\n",
 		  p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2],
 		  OPCODE_SHORT, p_hdr->hdr_data.u32w1);
 
@@ -1689,7 +1712,7 @@ ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev)
 		ipsaddr->msg_toggle = 0;
 
 		if (msgctl->outoforder_count)
-			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl, rcv_ev->proto);
 
 		if (rc == MQ_RET_UNEXP_OK)
 			ret = IPS_RECVHDRQ_BREAK;
@@ -1752,6 +1775,9 @@ ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev)
 				req->req_data.buf = gdr_convert_gpu_to_host_addr(GDR_FD,
 							(unsigned long)req->user_gpu_buffer,
 							req->req_data.send_msglen, 1, rcv_ev->proto);
+			} else if ((req->flags_user & PSM2_MQ_FLAG_GDRCPY_ONLY) && req->is_buf_gpu_mem) {
+				psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
+				  "CUDA memcpy not permitted for this operation.");
 			}
 #endif
 			psmi_mq_handle_data(mq, req,
@@ -1788,7 +1814,7 @@ ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev)
 	 */
 	psmi_assert(msgorder != IPS_MSG_ORDER_PAST);
 
-	_HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n",
+	_HFI_VDBG("tag=%08x.%08x.%08x opcode=0x%x, msglen=%d\n",
 		p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2],
 		OPCODE_EAGER, p_hdr->hdr_data.u32w1);
 
@@ -1822,7 +1848,7 @@ ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev)
 		ipsaddr->msg_toggle = 0;
 
 		if (msgctl->outoforder_count)
-			ips_proto_mq_handle_outoforder_queue(mq, msgctl);
+			ips_proto_mq_handle_outoforder_queue(mq, msgctl, rcv_ev->proto);
 
 		if (rc == MQ_RET_UNEXP_OK)
 			ret = IPS_RECVHDRQ_BREAK;
@@ -1841,22 +1867,26 @@ ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev)
  * Progress the out of order queue to see if any message matches
  * current receiving sequence number.
  */
-void
-ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl)
+static void
+ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl,
+  struct ips_proto *proto)
 {
 	psm2_mq_req_t req;
 
 	do {
-		req =
-		    mq_ooo_match(&mq->outoforder_q, msgctl,
-				 msgctl->mq_recv_seqnum);
+		req = mq_ooo_match(&mq->outoforder_q, msgctl,
+		   msgctl->mq_recv_seqnum);
 		if (req == NULL)
 			return;
 
 		msgctl->outoforder_count--;
 		msgctl->mq_recv_seqnum++;
 
+#ifdef PSM_CUDA
+		psmi_mq_handle_outoforder(mq, req, proto);
+#else
 		psmi_mq_handle_outoforder(mq, req);
+#endif
 
 	} while (msgctl->outoforder_count > 0);
 
diff --git a/ptl_ips/ips_proto_recv.c b/ptl_ips/ips_proto_recv.c
index bf363bb..aa5b727 100644
--- a/ptl_ips/ips_proto_recv.c
+++ b/ptl_ips/ips_proto_recv.c
@@ -1116,7 +1116,7 @@ ips_proto_process_becn(struct ips_recvhdrq_event *rcv_ev)
 
 static void ips_bad_opcode(uint8_t op_code, struct ips_message_header *proto)
 {
-	_HFI_DBG("Discarding message with bad opcode 0x%x\n", op_code);
+	_HFI_INFO("Discarding message with bad opcode 0x%x\n", op_code);
 
 	if (hfi_debug & __HFI_DBG) {
 		ips_proto_show_header(proto, "received bad opcode");
diff --git a/ptl_ips/ips_recvhdrq.c b/ptl_ips/ips_recvhdrq.c
index 6c5fd07..dc9a699 100644
--- a/ptl_ips/ips_recvhdrq.c
+++ b/ptl_ips/ips_recvhdrq.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2015 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,10 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2015 Intel Corporation.
 
   Redistribution and use in source and binary forms, with or without
@@ -162,8 +164,11 @@ static __inline__ void _dump_invalid_pkt(struct ips_recvhdrq_event *rcv_ev)
 	if (hfi_debug & __HFI_PKTDBG) {
 		ips_proto_dump_frame(rcv_ev->p_hdr, HFI_MESSAGE_HDR_SIZE,
 				     "header");
-		if (paylen)
+		if (!payload) {
+			_HFI_DBG("Cannot dump frame; payload is NULL\n");
+		} else if (paylen) {
 			ips_proto_dump_frame(payload, paylen, "data");
+		}
 	}
 
 }
@@ -508,7 +513,7 @@ psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq)
 					    rcv_ev.recvq->context->psm_hw_ctxt);
 
 			_HFI_VDBG_ALWAYS(
-				"hdrq_head %d, p_hdr: %p, opcode %x, payload %p paylen %d; "
+				"hdrq_head %d, p_hdr: %p, opcode 0x%x, payload %p paylen %d; "
 				"egrhead %x egrtail %x; "
 				"useegrbit %x egrindex %x, egroffset %x, egrindexhead %x\n",
 				state->hdrq_head,
diff --git a/ptl_ips/ptl_rcvthread.c b/ptl_ips/ptl_rcvthread.c
index 4adb65a..f3352a6 100644
--- a/ptl_ips/ptl_rcvthread.c
+++ b/ptl_ips/ptl_rcvthread.c
@@ -5,6 +5,7 @@
 
   GPL LICENSE SUMMARY
 
+  Copyright(c) 2021 Cornelis Networks.
   Copyright(c) 2015 Intel Corporation.
 
   This program is free software; you can redistribute it and/or modify
@@ -17,11 +18,11 @@
   General Public License for more details.
 
   Contact Information:
-  Intel Corporation, www.intel.com
+  Cornelis Networks, www.cornelisnetworks.com
 
   BSD LICENSE
 
-  Copyright(c) 2015 Intel Corporation.
+  Copyright(c) 2021 Cornelis Networks.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -96,7 +97,7 @@ struct ptl_rcvthread {
          * stored to provide hints during a cuda failure
          * due to a null cuda context.
          */
-	CUcontext ctxt;
+	CUcontext cu_ctxt;
 #endif
 
 /*
@@ -124,7 +125,7 @@ psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl_gen, struct ips_recvhdrq *recvq)
 
 #ifdef PSM_CUDA
 	if (PSMI_IS_CUDA_ENABLED)
-		PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt);
+		PSMI_CUDA_CALL(cuCtxGetCurrent, &cu_ctxt);
 #endif
 
 	if (psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD) &&
@@ -347,8 +348,8 @@ void *ips_ptl_pollintr(void *rcvthreadc)
 	psm2_error_t err;
 
 #ifdef PSM_CUDA
-	if (PSMI_IS_CUDA_ENABLED && ctxt != NULL)
-		PSMI_CUDA_CALL(cuCtxSetCurrent, ctxt);
+	if (PSMI_IS_CUDA_ENABLED && cu_ctxt != NULL)
+		PSMI_CUDA_CALL(cuCtxSetCurrent, cu_ctxt);
 #endif
 
 	PSM2_LOG_MSG("entering");
diff --git a/rpm_release_extension b/rpm_release_extension
index 725a5ba..dcb6b5b 100644
--- a/rpm_release_extension
+++ b/rpm_release_extension
@@ -1 +1 @@
-185
+230

Debdiff

[The following lists of changes regard files as different if they have different names, permissions or owners.]

Files in second set of .debs but not in first

-rw-r--r--  root/root   /usr/lib/debug/.build-id/34/f1ab7acbd640b8be6d546885b44a1c385977ed.debug
-rw-r--r--  root/root   /usr/lib/debug/.build-id/fe/49f24b1fad832ecfe459d71ab1a4ca66523298.debug

Files in first set of .debs but not in second

-rw-r--r--  root/root   /usr/lib/debug/.build-id/5c/121cc74d7467923b171aec9d0754081e64d381.debug
-rw-r--r--  root/root   /usr/lib/debug/.build-id/ca/a3d2f2b6bc4f476801798bafbf4f0a8e2fafbb.debug

No differences were encountered between the control files of package libpsm2-2

No differences were encountered between the control files of package libpsm2-2-compat

Control files of package libpsm2-2-compat-dbgsym: lines which differ (wdiff format)

  • Build-Ids: 5c121cc74d7467923b171aec9d0754081e64d381 fe49f24b1fad832ecfe459d71ab1a4ca66523298

Control files of package libpsm2-2-dbgsym: lines which differ (wdiff format)

  • Build-Ids: caa3d2f2b6bc4f476801798bafbf4f0a8e2fafbb 34f1ab7acbd640b8be6d546885b44a1c385977ed

No differences were encountered between the control files of package libpsm2-dev

More details

Full run details