diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0bfe9e1..a3a4d99 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,19 +23,7 @@
 ##
 ################################################################################
 
-cmake_minimum_required ( VERSION 3.5.0 )
-
-## Cosmetic Cmake version warnings.
-if(NOT EXISTS VERSION_WARNED)
-  set (VERSION_WARNED FALSE CACHE BOOL "")
-endif()
-if(${CMAKE_VERSION} VERSION_LESS "3.6.0" AND NOT ${VERSION_WARNED})
-  message("Your CMake version is too old for full functionality.
-Generated package file names may be incorrect.
-Please update to CMake 3.6 or newer to generate correct package file names")
-  set( VERSION_WARNED TRUE CACHE BOOL "Suppress cosmetic build errors due to CMake version after first warning." FORCE )
-  mark_as_advanced( FORCE VERSION_WARNED )
-endif()
+cmake_minimum_required ( VERSION 3.6.3 )
 
 set ( HSAKMT "hsakmt" )
 set ( HSAKMT_PACKAGE "hsakmt-roct" )
@@ -106,7 +94,24 @@ set ( HSAKMT_LINKER_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/src/libhsakmt.ver" )
 
 ## Linker Flags
 ## Add --enable-new-dtags to generate DT_RUNPATH
-set (HSAKMT_LINK_FLAGS "-Wl,--enable-new-dtags -Wl,--version-script=${HSAKMT_LINKER_SCRIPT} -Wl,-soname=${HSAKMT_COMPONENT}.so.${LIB_VERSION_MAJOR} -Wl,-z,nodelete -Wl,-no-undefined" )
+set (HSAKMT_LINK_FLAGS "${HSAKMT_LINK_FLAGS} -Wl,--enable-new-dtags -Wl,--version-script=${HSAKMT_LINKER_SCRIPT} -Wl,-soname=${HSAKMT_COMPONENT}.so.${LIB_VERSION_MAJOR} -Wl,-z,nodelete")
+
+## Address Sanitizer flags; link flags are appended inside the quotes so HSAKMT_LINK_FLAGS stays one space-separated string
+if ( ADDRESS_SANITIZER )
+    set ( HSAKMT_C_FLAGS "${HSAKMT_C_FLAGS}" -fsanitize=address )
+    set ( HSAKMT_LINK_FLAGS "${HSAKMT_LINK_FLAGS} -fsanitize=address" )
+    if ( BUILD_SHARED_LIBS )
+        set ( HSAKMT_LINK_FLAGS "${HSAKMT_LINK_FLAGS} -shared-libsan" )
+    else ()
+        set ( HSAKMT_LINK_FLAGS "${HSAKMT_LINK_FLAGS} -static-libsan" )
+    endif ()
+else ()
+    if ( CMAKE_COMPILER_IS_GNUCC )
+        set ( HSAKMT_LINK_FLAGS "${HSAKMT_LINK_FLAGS} -Wl,-no-undefined" )
+    else ()
+        set ( HSAKMT_LINK_FLAGS "${HSAKMT_LINK_FLAGS} -Wl,-undefined,error" )
+    endif ()
+endif ()
 
 ## Source files
 set ( HSAKMT_SRC "src/debug.c"
@@ -123,6 +128,7 @@ set ( HSAKMT_SRC "src/debug.c"
                  "src/time.c"
                  "src/topology.c"
                  "src/rbtree.c"
+                 "src/spm.c"
                  "src/version.c")
 
 ## Declare the library target name
@@ -226,74 +232,89 @@ configure_file ( libhsakmt.pc.in libhsakmt.pc @ONLY )
 
 install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/libhsakmt.pc DESTINATION ${CMAKE_INSTALL_DATADIR}/pkgconfig COMPONENT devel)
 
+###########################
+# Packaging directives
+###########################
 # Use component packaging
-set ( CPACK_COMPONENTS_GROUPING IGNORE )
-set ( CPACK_COMPONENTS_ALL binary devel )
-set ( CPACK_DEB_COMPONENT_INSTALL ON )
-set ( CPACK_RPM_COMPONENT_INSTALL ON )
-
-## Add the packaging directives for the runtime library.
-set ( CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc." )
-set ( CPACK_PACKAGE_VERSION_MAJOR ${BUILD_VERSION_MAJOR} )
-set ( CPACK_PACKAGE_VERSION_MINOR ${BUILD_VERSION_MINOR} )
-set ( CPACK_PACKAGE_VERSION_PATCH ${BUILD_VERSION_PATCH} )
-set ( CPACK_PACKAGE_CONTACT "Advanced Micro Devices, Inc." )
-set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md" )
-
-## Component packaging, BINARY
-set ( CPACK_COMPONENT_BINARY_DESCRIPTION "HSAKMT library for AMD KFD support" )
-set ( CPACK_DEBIAN_BINARY_PACKAGE_NAME ${HSAKMT_PACKAGE} )
-set ( CPACK_DEBIAN_BINARY_FILE_NAME "${CPACK_DEBIAN_BINARY_PACKAGE_NAME}_${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}_amd64.deb" )
-set ( CPACK_RPM_BINARY_PACKAGE_NAME ${HSAKMT_PACKAGE} )
-set ( CPACK_RPM_BINARY_FILE_NAME "${CPACK_RPM_BINARY_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}.x86_64.rpm" )
-
-#checking for OS-version
-if( EXISTS "/etc/os-release" )
-    file( STRINGS "/etc/os-release" DISTRO_ID REGEX "^ID=" )
-    file( STRINGS "/etc/os-release" DISTRO_RELEASE REGEX "^VERSION_ID=" )
-    string( REPLACE "ID=" "" DISTRO_ID "${DISTRO_ID}" )
-    string( REPLACE "VERSION_ID=" "" DISTRO_RELEASE "${DISTRO_RELEASE}" )
-    message( STATUS "Detected distribution: ${DISTRO_ID}:${DISTRO_RELEASE}" )
-elseif( EXISTS "/etc/centos-release" )
-    # Example: CentOS release 6.10 (Final)
-    file( STRINGS "/etc/centos-release" DISTRO_FULL_STR REGEX "release" )
-    string( REGEX MATCH "^[a-zA-Z]+" DISTRO_ID "${DISTRO_FULL_STR}" )
-    string( TOLOWER "${DISTRO_ID}" DISTRO_ID )
-    string( REGEX MATCH "[0-9]+" DISTRO_RELEASE "${DISTRO_FULL_STR}" )
-    message( STATUS "Detected distribution: ${DISTRO_ID}:${DISTRO_RELEASE}" )
-else()
-     message( STATUS "Not able to detect OS" )
+set(CPACK_COMPONENTS_GROUPING IGNORE)
+set(CPACK_DEB_COMPONENT_INSTALL ON)
+set(CPACK_RPM_COMPONENT_INSTALL ON)
+set(CPACK_DEBIAN_BINARY_PACKAGE_NAME ${HSAKMT_PACKAGE})
+set(CPACK_RPM_BINARY_PACKAGE_NAME ${HSAKMT_PACKAGE})
+set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
+set(CPACK_PACKAGE_VERSION_MAJOR ${VERSION_MAJOR})
+set(CPACK_PACKAGE_VERSION_MINOR ${VERSION_MINOR})
+set(CPACK_PACKAGE_VERSION_PATCH ${VERSION_PATCH})
+set(CPACK_PACKAGE_CONTACT "AMD GFX mailing list <amd-gfx@lists.freedesktop.org>")
+set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
+set(CPACK_COMPONENT_BINARY_DESCRIPTION "HSAKMT library for AMD KFD support\n This repository includes the user-mode API interfaces\n used to interact with the ROCk driver.")
+
+## Component packaging, DEVEL (description text shown by the package manager)
+set(CPACK_COMPONENT_DEVEL_DESCRIPTION "HSAKMT development package.\n This package contains the headers, pkgconfig and\n cmake files for the ROCT package")
+
+# Make proper version for appending
+# Default Value is 99999, setting it first
+set(ROCM_VERSION_FOR_PACKAGE "99999")
+if(DEFINED ENV{ROCM_LIBPATCH_VERSION})
+  set(ROCM_VERSION_FOR_PACKAGE $ENV{ROCM_LIBPATCH_VERSION})
 endif()
 
 # Debian package specific variables
-set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface" )
-set ( CPACK_DEBIAN_BINARY_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm" )
-set ( CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "libc6, libnuma1, zlib1g, libudev1" )
+set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface")
+set(CPACK_DEBIAN_BINARY_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm")
+set(CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "libc6, libnuma1, zlib1g, libudev1")
+set(CPACK_DEBIAN_DEVEL_PACKAGE_NAME "hsakmt-roct-dev")
+if (DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
+   set(CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
+else()
+   set(CPACK_DEBIAN_PACKAGE_RELEASE "local")
+endif()
 
 # RPM package specific variables
-if( DISTRO_ID MATCHES "sles" )
-    set ( CPACK_RPM_BINARY_PACKAGE_REQUIRES "glibc, libnuma-devel, libgcc_s1")
-elseif( DISTRO_ID MATCHES "centos" AND DISTRO_RELEASE MATCHES "6" )
-    set ( CPACK_RPM_BINARY_PACKAGE_REQUIRES "glibc, numactl, libgcc" )
+# get OS-info
+get_os_info()
+if(DISTRO_ID MATCHES "sles" )
+    set(CPACK_RPM_BINARY_PACKAGE_REQUIRES "glibc, libnuma-devel, libgcc_s1")
+elseif(DISTRO_ID MATCHES "centos" AND DISTRO_RELEASE MATCHES "6")
+    set(CPACK_RPM_BINARY_PACKAGE_REQUIRES "glibc, numactl, libgcc")
 else()
-    set ( CPACK_RPM_BINARY_PACKAGE_REQUIRES "glibc, numactl-libs, libgcc" )
+    set(CPACK_RPM_BINARY_PACKAGE_REQUIRES "glibc, numactl-libs, libgcc")
+endif()
+set(CPACK_RPM_BINARY_PRE_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post")
+set(CPACK_RPM_BINARY_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun")
+if(DEFINED CPACK_PACKAGING_INSTALL_PREFIX)  # CPack expects a list of paths here, one element per directory
+  set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "${CPACK_PACKAGING_INSTALL_PREFIX}" "${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}")
+endif()
+set(CPACK_RPM_DEVEL_PACKAGE_NAME "hsakmt-roct-devel")
+# Since we changed the package name to match RPM specs, take care of older builds that had -dev installed
+set(CPACK_RPM_DEVEL_PACKAGE_OBSOLETES "hsakmt-roct-dev")
+if(DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE})
+  set(CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE})
+else()
+  set(CPACK_RPM_PACKAGE_RELEASE "local")
 endif()
-set ( CPACK_RPM_BINARY_PRE_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" )
-set ( CPACK_RPM_BINARY_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" )
 
-## Component packaging, DEVEL
-set ( CPACK_COMPONENT_DEVEL_DESCRIPTION "HSAKMT development package." )
+# Probe rpm for the %{?dist} tag first: 'dist' breaks manual builds on debian systems due to empty Provides
+execute_process( COMMAND rpm --eval %{?dist}
+                 RESULT_VARIABLE PROC_RESULT
+                 OUTPUT_VARIABLE EVAL_RESULT
+                 OUTPUT_STRIP_TRAILING_WHITESPACE )
+message(STATUS "RESULT_VARIABLE ${PROC_RESULT} OUTPUT_VARIABLE: ${EVAL_RESULT}")
 
-# Debian package specific variables
-set ( CPACK_DEBIAN_DEVEL_PACKAGE_NAME "hsakmt-roct-dev" )
-set ( CPACK_DEBIAN_DEVEL_FILE_NAME "${CPACK_DEBIAN_DEVEL_PACKAGE_NAME}_${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}_amd64.deb" )
-set ( CPACK_DEBIAN_DEVEL_PACKAGE_DEPENDS "${HSAKMT_PACKAGE} (=${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH})" )
+if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" )
+  string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" )
+endif()
 
-# RPM package specific variables
-set ( CPACK_RPM_DEVEL_PACKAGE_NAME "hsakmt-roct-devel" )
-set ( CPACK_RPM_DEVEL_FILE_NAME "${CPACK_RPM_DEVEL_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}.x86_64.rpm" )
-set ( CPACK_RPM_DEVEL_PACKAGE_DEPENDS "${HSAKMT_PACKAGE} = ${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" )
-# Since we changed the package name to match RPM specs, take care of older builds that had -dev installed
-set ( CPACK_RPM_DEVEL_PACKAGE_OBSOLETES "hsakmt-roct-dev" )
+# Prepare final version for the CPACK use
+set(PACKAGE_VERSION_STR "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}.${ROCM_VERSION_FOR_PACKAGE}")
+set(CPACK_PACKAGE_VERSION "${PACKAGE_VERSION_STR}")
+
+# Set the exact runtime package version that the devel package depends on
+set(CPACK_DEBIAN_DEVEL_PACKAGE_DEPENDS "${HSAKMT_PACKAGE} (=${PACKAGE_VERSION_STR}-${CPACK_DEBIAN_PACKAGE_RELEASE})")
+set(CPACK_RPM_DEVEL_PACKAGE_REQUIRES "${HSAKMT_PACKAGE} = ${PACKAGE_VERSION_STR}-${CPACK_RPM_PACKAGE_RELEASE}")
+
+# Set the names now using CPACK utility
+set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
+set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
 
-include ( CPack )
+include(CPack)
diff --git a/README.md b/README.md
index 2d1d4ea..b274bb7 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,9 @@
 ### ROCt Library
 
-This repository includes the user-mode API interfaces used to interact with the ROCk driver. Currently supported agents include only the AMD/ATI Fiji family of discrete GPUs.
+This repository includes the user-mode API interfaces used to interact with the ROCk driver.
 
 Starting at 1.7 release, ROCt uses drm render device. This requires the user to belong to video group. Add the user account to video group with "sudo usermod -a -G video _username_" command if the user if not part of video group yet.
+NOTE: Users of Ubuntu 20.04 will need to add the user to the new "render" group, as Ubuntu has changed the owner:group of /dev/kfd to render:render as of that release.
 
 #### ROCk Driver
 
@@ -28,8 +29,13 @@ If the hsakmt-roct and hsakmt-roct-dev packages are desired:
     cd build
     cmake ..
     make package
-    make package-dev
 ```
+If you choose not to build and install packages, manual installation of the binaries and header files can be done via:
+```bash
+    make install
+```
+
+NOTE: For older versions of the thunk where hsakmt-dev.txt is present, "make package-dev" and "make install-dev" are required to generate/install the developer packages. Currently, these are created via the "make package" and "make install" commands.
 
 #### Disclaimer
 
@@ -37,4 +43,4 @@ The information contained herein is for informational purposes only, and is subj
 
 AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies.
 
-Copyright (c) 2014-2017 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2014-2020 Advanced Micro Devices, Inc. All rights reserved.
diff --git a/cmake_modules/utils.cmake b/cmake_modules/utils.cmake
index 096b3de..28dafeb 100644
--- a/cmake_modules/utils.cmake
+++ b/cmake_modules/utils.cmake
@@ -114,3 +114,26 @@ function ( get_version DEFAULT_VERSION_STRING )
     set( VERSION_BUILD  "${VERSION_BUILD}" PARENT_SCOPE )
 
 endfunction()
+
+# Detect the Linux distribution and hand DISTRO_ID / DISTRO_RELEASE back to the caller's scope.
+function(get_os_info)
+    if(EXISTS "/etc/os-release")
+        file(STRINGS "/etc/os-release" DISTRO_ID REGEX "^ID=")
+        string(REPLACE "ID=" "" DISTRO_ID "${DISTRO_ID}")
+        file(STRINGS "/etc/os-release" DISTRO_RELEASE REGEX "^VERSION_ID=")
+        string(REPLACE "VERSION_ID=" "" DISTRO_RELEASE "${DISTRO_RELEASE}")
+        message(STATUS "Detected distribution: ${DISTRO_ID}:${DISTRO_RELEASE}")
+    elseif(EXISTS "/etc/centos-release")
+        # Used only when os-release is absent; sample line: "CentOS release 6.10 (Final)"
+        file(STRINGS "/etc/centos-release" centos_release_line REGEX "release")
+        string(REGEX MATCH "^[a-zA-Z]+" DISTRO_ID "${centos_release_line}")
+        string(TOLOWER "${DISTRO_ID}" DISTRO_ID)
+        string(REGEX MATCH "[0-9]+" DISTRO_RELEASE "${centos_release_line}")
+        message(STATUS "Detected distribution: ${DISTRO_ID}:${DISTRO_RELEASE}")
+    else()
+        message(STATUS "Not able to detect OS")
+    endif()
+    # Variables set inside a function are local, so publish the results explicitly.
+    set(DISTRO_ID "${DISTRO_ID}" PARENT_SCOPE)
+    set(DISTRO_RELEASE "${DISTRO_RELEASE}" PARENT_SCOPE)
+endfunction()
diff --git a/include/hsakmt.h b/include/hsakmt.h
index 237f80b..83f41ca 100644
--- a/include/hsakmt.h
+++ b/include/hsakmt.h
@@ -1209,6 +1209,41 @@ hsaKmtSetMemoryUserData(
     void *          UserData    //IN
     );
 
+/**
+  Acquire exclusive use of SPM
+*/
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSPMAcquire(
+    HSAuint32	PreferredNode	//IN
+    );
+
+
+/**
+  Release exclusive use of SPM
+*/
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSPMRelease(
+    HSAuint32	PreferredNode	//IN
+    );
+
+/**
+   Set up the destination user mode buffer for stream performance
+   counter data.
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtSPMSetDestBuffer(
+	HSAuint32   PreferredNode,		//IN
+	HSAuint32   SizeInBytes,		//IN
+	HSAuint32   * timeout,			//IN/OUT
+	HSAuint32   * SizeCopied,		//OUT
+	void        *DestMemoryAddress,		//IN
+	bool        *isSPMDataLoss		//OUT
+    );
+
 #ifdef __cplusplus
 }   //extern "C"
 #endif
diff --git a/include/hsakmttypes.h b/include/hsakmttypes.h
index b4fd9bd..947a22b 100644
--- a/include/hsakmttypes.h
+++ b/include/hsakmttypes.h
@@ -209,11 +209,12 @@ typedef union
         unsigned int WaveLaunchTrapOverrideSupported: 1; // Indicates if Wave Launch Trap Override is supported on the node.
         unsigned int WaveLaunchModeSupported: 1; // Indicates if Wave Launch Mode is supported on the node.
         unsigned int PreciseMemoryOperationsSupported: 1; // Indicates if Precise Memory Operations are supported on the node.
-        unsigned int SRAM_EDCSupport: 1;         // Indicates if GFX internal SRAM EDC/ECC functionality is active
+        unsigned int DEPRECATED_SRAM_EDCSupport: 1; // Old buggy user mode depends on this being 0
         unsigned int Mem_EDCSupport: 1;          // Indicates if GFX internal DRAM/HBM EDC/ECC functionality is active
         unsigned int RASEventNotify: 1;          // Indicates if GFX extended RASFeatures and RAS EventNotify status is available
         unsigned int ASICRevision: 4;            // Indicates the ASIC revision of the chip on this node.
-        unsigned int Reserved            : 6;
+        unsigned int SRAM_EDCSupport: 1;         // Indicates if GFX internal SRAM EDC/ECC functionality is active
+        unsigned int Reserved            : 5;
     } ui32;
 } HSA_CAPABILITY;
 
@@ -652,15 +653,16 @@ typedef enum _HSA_QUEUE_TYPE
 } HSA_QUEUE_TYPE;
 
 /**
-  The user context save area starts at offset 0 with the
-  HsaUserContextSaveAreaHeader header followed by the space for a
-  user space copy of the control stack and the user space wave save
-  state. The area must be dword aligned. The context save area is
-  valid for the duration that the associated queue exists. When a
-  context save occurs, the HsaUserContextSaveAreaHeader header will
-  be updated with information about the context save. The context save
-  area is not modified by any other operation, including a context
-  resume.
+  The user context save area is page aligned. The HsaUserContextSaveAreaHeader
+  header starts at offset 0. Space for a user space copy of the control stack
+  comes next and is immediately followed by the user space wave save state. The
+  start of the user space wave save state is page aligned. The debugger reserved
+  area comes next and is 64 byte aligned.
+
+  The user context save area is valid for the duration that the associated
+  queue exists. When a context save occurs, the HsaUserContextSaveAreaHeader
+  header will be updated with information about the context save. The context
+  save area is not modified by any other operation, including a context resume.
  */
 
 typedef struct
@@ -676,6 +678,11 @@ typedef struct
                                  // of wave state data. Must be 4 byte aligned.
     HSAuint32 WaveStateSize;     // Byte size of the last saved wave state data.
                                  // Must be 4 byte aligned.
+    HSAuint32 DebugOffset;       // Byte offset from start of the user context
+                                 // save area to the memory reserved for the
+                                 // debugger. Must be 64 byte aligned.
+    HSAuint32 DebugSize;         // Byte size of the memory reserved for the
+                                 // debugger. Must be 64 byte aligned.
 } HsaUserContextSaveAreaHeader;
 
 
diff --git a/include/linux/kfd_ioctl.h b/include/linux/kfd_ioctl.h
index 6a7dd1f..62a84e4 100644
--- a/include/linux/kfd_ioctl.h
+++ b/include/linux/kfd_ioctl.h
@@ -574,6 +574,99 @@ struct kfd_ioctl_import_dmabuf_args {
 	__u32 dmabuf_fd;	/* to KFD */
 };
 
+/*
+ * KFD SMI(System Management Interface) events
+ */
+enum kfd_smi_event {
+	KFD_SMI_EVENT_NONE = 0, /* not used */
+	KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */
+	KFD_SMI_EVENT_THERMAL_THROTTLE = 2,
+	KFD_SMI_EVENT_GPU_PRE_RESET = 3,
+	KFD_SMI_EVENT_GPU_POST_RESET = 4,
+};
+
+#define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
+
+struct kfd_ioctl_smi_events_args {
+	__u32 gpuid;	/* to KFD */
+	__u32 anon_fd;	/* from KFD */
+};
+
+/**
+ * kfd_ioctl_spm_op - SPM ioctl operations
+ *
+ * @KFD_IOCTL_SPM_OP_ACQUIRE: acquire exclusive access to SPM
+ * @KFD_IOCTL_SPM_OP_RELEASE: release exclusive access to SPM
+ * @KFD_IOCTL_SPM_OP_SET_DEST_BUF: set or unset destination buffer for SPM streaming
+ */
+enum kfd_ioctl_spm_op {
+	KFD_IOCTL_SPM_OP_ACQUIRE,
+	KFD_IOCTL_SPM_OP_RELEASE,
+	KFD_IOCTL_SPM_OP_SET_DEST_BUF
+};
+
+/**
+ * kfd_ioctl_spm_args - Arguments for SPM ioctl
+ *
+ * @op[in]:            specifies the operation to perform
+ * @gpu_id[in]:        GPU ID of the GPU to profile
+ * @dest_buf[in]:      used for the address of the destination buffer
+ *                      in @KFD_IOCTL_SPM_OP_SET_DEST_BUF
+ * @buf_size[in]:      size of the destination buffer
+ * @timeout[in/out]:   [in]: timeout in milliseconds, [out]: amount of time left
+ *                      in the timeout window
+ * @bytes_copied[out]: amount of data that was copied to the previous dest_buf
+ * @has_data_loss:     boolean indicating whether data was lost
+ *                      (e.g. due to a ring-buffer overflow)
+ *
+ * This ioctl performs different functions depending on the @op parameter.
+ *
+ * KFD_IOCTL_SPM_OP_ACQUIRE
+ * ------------------------
+ *
+ * Acquires exclusive access of SPM on the specified @gpu_id for the calling process.
+ * This must be called before using KFD_IOCTL_SPM_OP_SET_DEST_BUF.
+ *
+ * KFD_IOCTL_SPM_OP_RELEASE
+ * ------------------------
+ *
+ * Releases exclusive access of SPM on the specified @gpu_id for the calling process,
+ * which allows another process to acquire it in the future.
+ *
+ * KFD_IOCTL_SPM_OP_SET_DEST_BUF
+ * -----------------------------
+ *
+ * If @dest_buf is NULL, the destination buffer address is unset and copying of counters
+ * is stopped.
+ *
+ * If @dest_buf is not NULL, it specifies the pointer to a new destination buffer.
+ * @buf_size specifies the size of the buffer.
+ *
+ * If @timeout is non-0, the call will wait for up to @timeout ms for the previous
+ * buffer to be filled. If the previous buffer is filled before the timeout expires,
+ * @timeout will be updated with the time remaining. If the timeout is exceeded, the function
+ * copies any partial data available into the previous user buffer and returns success.
+ * The amount of valid data in the previous user buffer is indicated by @bytes_copied.
+ *
+ * If @timeout is 0, the function immediately replaces the previous destination buffer
+ * without waiting for the previous buffer to be filled. That means the previous buffer
+ * may only be partially filled, and @bytes_copied will indicate how much data has been
+ * copied to it.
+ *
+ * If data was lost, e.g. due to a ring buffer overflow, @has_data_loss will be non-0.
+ *
+ * Returns negative error code on failure, 0 on success.
+ */
+struct kfd_ioctl_spm_args {
+	__u64 dest_buf;
+	__u32 buf_size;
+	__u32 op;
+	__u32 timeout;
+	__u32 gpu_id;
+	__u32 bytes_copied;
+	__u32 has_data_loss;
+};
+
 /* Register offset inside the remapped mmio page
  */
 enum kfd_mmio_remap {
@@ -741,7 +834,12 @@ struct kfd_ioctl_cross_memory_copy_args {
 #define AMDKFD_IOC_CROSS_MEMORY_COPY		\
 		AMDKFD_IOWR(0x83, struct kfd_ioctl_cross_memory_copy_args)
 
+#define AMDKFD_IOC_RLC_SPM		\
+		AMDKFD_IOWR(0x84, struct kfd_ioctl_spm_args)
+
+
+
 #define AMDKFD_COMMAND_START_2		0x80
-#define AMDKFD_COMMAND_END_2		0x84
+#define AMDKFD_COMMAND_END_2		0x85
 
 #endif
diff --git a/src/fmm.c b/src/fmm.c
index 6867c69..172c1dc 100644
--- a/src/fmm.c
+++ b/src/fmm.c
@@ -931,7 +931,7 @@ static vm_object_t *fmm_allocate_memory_object(uint32_t gpu_id, void *mem,
 	args.flags = flags |
 		KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE;
 	args.va_addr = (uint64_t)mem;
-	if (!topology_is_dgpu(get_device_id_by_gpu_id(gpu_id)) &&
+	if (!is_dgpu &&
 	    (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM))
 		args.va_addr = VOID_PTRS_SUB(mem, aperture->base);
 	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR)
@@ -1150,7 +1150,7 @@ static void fmm_release_scratch(uint32_t gpu_id)
 
 	size = VOID_PTRS_SUB(aperture->limit, aperture->base) + 1;
 
-	if (topology_is_dgpu(gpu_mem[gpu_mem_id].device_id)) {
+	if (is_dgpu) {
 		/* unmap and remove all remaining objects */
 		pthread_mutex_lock(&aperture->fmm_mutex);
 		while ((n = rbtree_node_any(&aperture->tree, MID))) {
@@ -1217,7 +1217,7 @@ void *fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t MemorySizeIn
 		return NULL;
 
 	/* Allocate address space for scratch backing, 64KB aligned */
-	if (topology_is_dgpu(gpu_mem[gpu_mem_id].device_id)) {
+	if (is_dgpu) {
 		pthread_mutex_lock(&svm.dgpu_aperture->fmm_mutex);
 		mem = aperture_allocate_area_aligned(
 			svm.dgpu_aperture, address,
@@ -1349,8 +1349,7 @@ void *fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t MemorySizeInB
 	}
 
 	if (mem) {
-		int map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd :
-					gpu_mem[gpu_mem_id].drm_render_fd;
+		int map_fd = gpu_mem[gpu_mem_id].drm_render_fd;
 		int prot = flags.ui32.HostAccess ? PROT_READ | PROT_WRITE :
 					PROT_NONE;
 		int flag = flags.ui32.HostAccess ? MAP_SHARED | MAP_FIXED :
@@ -1362,6 +1361,13 @@ void *fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t MemorySizeInB
 			__fmm_release(vm_obj, aperture);
 			return NULL;
 		}
+		/*
+		 * This madvise() call is needed to avoid additional references
+		 * to mapped BOs in child processes that can prevent freeing
+		 * memory in the parent process and lead to out-of-memory
+		 * conditions.
+		 */
+		madvise(mem, MemorySizeInBytes, MADV_DONTFORK);
 	}
 
 	return mem;
@@ -1588,7 +1594,7 @@ static void *fmm_allocate_host_gpu(uint32_t node_id, void *address,
 					     &mmap_offset, ioc_flags, &vm_obj);
 
 		if (mem && flags.ui32.HostAccess) {
-			int map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd : gpu_drm_fd;
+			int map_fd = gpu_drm_fd;
 			void *ret = mmap(mem, MemorySizeInBytes,
 					 PROT_READ | PROT_WRITE,
 					 MAP_SHARED | MAP_FIXED, map_fd, mmap_offset);
@@ -2167,12 +2173,17 @@ HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes)
 	 */
 
 	pacc = pci_ids_create();
+
+	is_dgpu = false;
+
 	for (i = 0; i < NumNodes; i++) {
 		memset(&props, 0, sizeof(props));
-		ret = topology_sysfs_get_node_props(i, &props, &gpu_id, pacc);
+		ret = topology_sysfs_get_node_props(i, &props, &gpu_id, pacc, NULL, NULL);
 		if (ret != HSAKMT_STATUS_SUCCESS)
 			goto sysfs_parse_failed;
 
+		topology_setup_is_dgpu_param(&props);
+
 		/* Skip non-GPU nodes */
 		if (gpu_id != 0) {
 			int fd = open_drm_render_device(props.DrmRenderMinor);
@@ -2203,6 +2214,7 @@ HSAKMT_STATUS fmm_init_process_apertures(unsigned int NumNodes)
 			gpu_mem_count++;
 		}
 	}
+
 	pci_ids_destroy(pacc);
 
 	/* The ioctl will also return Number of Nodes if
@@ -2571,6 +2583,11 @@ static int _fmm_map_to_gpu(manageable_aperture_t *aperture,
 	args.n_success = 0;
 
 	ret = kmtIoctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args);
+	if (ret) {	/* ioctl failed: log it and leave through the common unlock path */
+		pr_err("GPU mapping failed (%d) for obj at %p, userptr %p, size %lu\n",
+		       ret, object->start, object->userptr, object->size);
+		goto err_map_failed;
+	}
 
 	add_device_ids_to_mapped_array(object,
 				(uint32_t *)args.device_ids_array_ptr,
@@ -2589,6 +2606,7 @@ static int _fmm_map_to_gpu(manageable_aperture_t *aperture,
 
 exit_ok:
 err_object_not_found:
+err_map_failed:
 	if (!obj)
 		pthread_mutex_unlock(&aperture->fmm_mutex);
 
@@ -2611,7 +2629,7 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert
 	if (gpu_mem_id < 0)
 		return -1;
 
-	if (!topology_is_dgpu(gpu_mem[gpu_mem_id].device_id))
+	if (!is_dgpu)
 		return 0; /* Nothing to do on APU */
 
 	/* sanity check the address */
@@ -2629,8 +2647,7 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert
 		if (!obj)
 			return -1;
 		/* Create a CPU mapping for the debugger */
-		map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd :
-					gpu_mem[gpu_mem_id].drm_render_fd;
+		map_fd = gpu_mem[gpu_mem_id].drm_render_fd;
 		mmap_ret = mmap(address, size, PROT_NONE,
 				MAP_PRIVATE | MAP_FIXED, map_fd, mmap_offset);
 		if (mmap_ret == MAP_FAILED) {
@@ -2642,8 +2659,7 @@ static int _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *apert
 			gpu_id, address, size, aperture, &mmap_offset,
 			KFD_IOC_ALLOC_MEM_FLAGS_GTT |
 			KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE);
-		map_fd = mmap_offset >= (1ULL<<40) ? kfd_fd :
-					gpu_mem[gpu_mem_id].drm_render_fd;
+		map_fd = gpu_mem[gpu_mem_id].drm_render_fd;
 		mmap_ret = mmap(address, size,
 				PROT_READ | PROT_WRITE,
 				MAP_SHARED | MAP_FIXED, map_fd, mmap_offset);
@@ -2826,7 +2842,7 @@ static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
 	if (gpu_mem_id < 0)
 		return -1;
 
-	if (!topology_is_dgpu(gpu_mem[gpu_mem_id].device_id))
+	if (!is_dgpu)
 		return 0; /* Nothing to do on APU */
 
 	pthread_mutex_lock(&aperture->fmm_mutex);
@@ -3291,8 +3307,7 @@ HSAKMT_STATUS fmm_register_shared_memory(const HsaSharedMemoryHandle *SharedMemo
 			goto err_free_obj;
 		}
 		obj->node_id = gpu_mem[gpu_mem_id].node_id;
-		map_fd = importArgs.mmap_offset >= (1ULL<<40) ? kfd_fd :
-					gpu_mem[gpu_mem_id].drm_render_fd;
+		map_fd = gpu_mem[gpu_mem_id].drm_render_fd;
 		ret = mmap(reservedMem, (SizeInPages << PAGE_SHIFT),
 			   PROT_READ | PROT_WRITE,
 			   MAP_SHARED | MAP_FIXED, map_fd, importArgs.mmap_offset);
diff --git a/src/libhsakmt.h b/src/libhsakmt.h
index cb20026..34e30ec 100644
--- a/src/libhsakmt.h
+++ b/src/libhsakmt.h
@@ -129,6 +129,9 @@ enum asic_family_type {
 	CHIP_NAVI12,	/* 16 */
 	CHIP_NAVI14,	/* 17 */
 	CHIP_SIENNA_CICHLID,	/* 18 */
+	CHIP_NAVY_FLOUNDER,	/* 19 */
+	CHIP_DIMGREY_CAVEFISH,	/* 20 */
+	CHIP_VANGOGH,	/* 21 */
 	CHIP_LAST
 };
 
@@ -137,7 +140,6 @@ struct hsa_gfxip_table {
 	unsigned char major;		// GFXIP Major engine version
 	unsigned char minor;		// GFXIP Minor engine version
 	unsigned char stepping;		// GFXIP Stepping info
-	unsigned char is_dgpu;		// Predicate for dGPU devices
 	const char *amd_name;		// CALName of the device
 	enum asic_family_type asic_family;	// Device family id
 };
@@ -156,9 +158,10 @@ HSAKMT_STATUS validate_nodeid_array(uint32_t **gpu_id_array,
 		uint32_t NumberOfNodes, uint32_t *NodeArray);
 
 HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id, HsaNodeProperties *props,
-		uint32_t *gpu_id, struct pci_ids pacc);
+					uint32_t *gpu_id, struct pci_ids pacc,
+					bool *p2p_links, uint32_t *num_p2pLinks);
 HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties *props);
-bool topology_is_dgpu(uint16_t device_id);
+void topology_setup_is_dgpu_param(HsaNodeProperties *props);
 bool topology_is_svm_needed(uint16_t device_id);
 HSAKMT_STATUS topology_get_asic_family(uint16_t device_id,
 					enum asic_family_type *asic);
diff --git a/src/libhsakmt.ver b/src/libhsakmt.ver
index d47f68d..97e2cf6 100644
--- a/src/libhsakmt.ver
+++ b/src/libhsakmt.ver
@@ -71,6 +71,9 @@ hsaKmtGetKernelDebugTrapVersionInfo;
 hsaKmtGetThunkDebugTrapVersionInfo;
 hsaKmtSetAddressWatch;
 hsaKmtClearAddressWatch;
+hsaKmtSPMAcquire;
+hsaKmtSPMRelease;
+hsaKmtSPMSetDestBuffer;
 
 local: *;
 };
diff --git a/src/openclose.c b/src/openclose.c
index 72dfd66..784ee51 100644
--- a/src/openclose.c
+++ b/src/openclose.c
@@ -136,17 +136,17 @@ static HSAKMT_STATUS init_vars_from_env(void)
 		zfb_support = atoi(envvar);
 
 	/* Force all the GPUs to a certain type, use the below command:
-	 * export HSA_FORCE_ASIC_TYPE="10.1.0 1 Navi10 14"
-	 * meaning major.minor.step dgpu asic_name asic_id
+	 * export HSA_FORCE_ASIC_TYPE="10.1.0 Navi10 14"
+	 * meaning major.minor.step asic_name asic_id
 	 */
 	envvar = getenv("HSA_FORCE_ASIC_TYPE");
 	if (envvar) {
-		uint32_t major, minor, step, dgpu, asic_family;
+		uint32_t major, minor, step, asic_family;
 
-		if ((sscanf(envvar, "%u.%u.%u %u %63s %u", &major, &minor, &step,
-				&dgpu, force_asic_name, &asic_family) != 6)
+		if ((sscanf(envvar, "%u.%u.%u %63s %u", &major, &minor, &step,
+				force_asic_name, &asic_family) != 5)
 			|| (major > 63 || minor > 255 || step > 255)
-			|| dgpu > 1 || asic_family >= CHIP_LAST) {
+			|| asic_family >= CHIP_LAST) {
 			pr_err("HSA_FORCE_ASIC_TYPE %s is invalid\n", envvar);
 			return HSAKMT_STATUS_ERROR;
 		}
@@ -154,7 +154,6 @@ static HSAKMT_STATUS init_vars_from_env(void)
 		force_asic_entry.major = major;
 		force_asic_entry.minor = minor;
 		force_asic_entry.stepping = step;
-		force_asic_entry.is_dgpu = dgpu;
 
 		force_asic_entry.asic_family = asic_family;
 
diff --git a/src/pmc_table.c b/src/pmc_table.c
index 5d30391..6e76842 100644
--- a/src/pmc_table.c
+++ b/src/pmc_table.c
@@ -2132,6 +2132,7 @@ HSAKMT_STATUS get_block_properties(uint32_t node_id,
 		break;
 	case CHIP_NAVI10:
 	case CHIP_NAVI14:
+	case CHIP_VANGOGH:
 		*block = navi_blocks[block_id];
 		break;
 	default:
diff --git a/src/queues.c b/src/queues.c
index 206e66e..e24e74a 100644
--- a/src/queues.c
+++ b/src/queues.c
@@ -47,8 +47,9 @@
 #define LDS_SIZE_PER_CU		0x10000
 #define HWREG_SIZE_PER_CU	0x1000
 #define WG_CONTEXT_DATA_SIZE_PER_CU(asic_family)	(VGPR_SIZE_PER_CU(asic_family) + SGPR_SIZE_PER_CU + LDS_SIZE_PER_CU + HWREG_SIZE_PER_CU)
-#define WAVES_PER_CU		32
-#define CNTL_STACK_BYTES_PER_WAVE	8
+#define CNTL_STACK_BYTES_PER_WAVE(asic_family)	((asic_family) >= CHIP_NAVI10 ? 12 : 8)	/* 12 from CHIP_NAVI10 on, 8 before; argument parenthesized so any expression is safe */
+#define DEBUGGER_BYTES_ALIGN	64	/* alignment passed to ALIGN_UP() when sizing debug_memory_size */
+#define DEBUGGER_BYTES_PER_WAVE(asic_family)	32	/* asic_family is currently ignored: 32 for every ASIC */
 
 struct device_info {
 	enum asic_family_type asic_family;
@@ -170,6 +171,24 @@ const struct device_info sienna_cichlid_device_info = {
     .doorbell_size = DOORBELL_SIZE_GFX9,
 };
 
+const struct device_info navy_flounder_device_info = {	/* queue parameters looked up via dev_lookup_table[CHIP_NAVY_FLOUNDER] */
+    .asic_family = CHIP_NAVY_FLOUNDER,
+    .eop_buffer_size = 4096,
+    .doorbell_size = DOORBELL_SIZE_GFX9,	/* feeds DOORBELLS_PAGE_SIZE() in get_doorbell_map_info() */
+};
+
+const struct device_info dimgrey_cavefish_device_info = {	/* queue parameters looked up via dev_lookup_table[CHIP_DIMGREY_CAVEFISH] */
+    .asic_family = CHIP_DIMGREY_CAVEFISH,
+    .eop_buffer_size = 4096,
+    .doorbell_size = DOORBELL_SIZE_GFX9,	/* feeds DOORBELLS_PAGE_SIZE() in get_doorbell_map_info() */
+};
+
+const struct device_info vangogh_device_info = {	/* queue parameters looked up via dev_lookup_table[CHIP_VANGOGH] */
+    .asic_family = CHIP_VANGOGH,
+    .eop_buffer_size = 4096,
+    .doorbell_size = DOORBELL_SIZE_GFX9,	/* feeds DOORBELLS_PAGE_SIZE() in get_doorbell_map_info() */
+};
+
 static const struct device_info *dev_lookup_table[] = {
 	[CHIP_KAVERI] = &kaveri_device_info,
 	[CHIP_HAWAII] = &hawaii_device_info,
@@ -190,6 +209,9 @@ static const struct device_info *dev_lookup_table[] = {
 	[CHIP_NAVI12] = &navi12_device_info,
 	[CHIP_NAVI14] = &navi14_device_info,
 	[CHIP_SIENNA_CICHLID] = &sienna_cichlid_device_info,
+	[CHIP_NAVY_FLOUNDER] = &navy_flounder_device_info,
+	[CHIP_DIMGREY_CAVEFISH] = &dimgrey_cavefish_device_info,
+	[CHIP_VANGOGH] = &vangogh_device_info,
 };
 
 struct queue {
@@ -200,6 +222,7 @@ struct queue {
 	void *ctx_save_restore;
 	uint32_t ctx_save_restore_size;
 	uint32_t ctl_stack_size;
+	uint32_t debug_memory_size;
 	const struct device_info *dev_info;
 	bool use_ats;
 	/* This queue structure is allocated from GPU with page aligned size
@@ -265,7 +288,7 @@ static void get_doorbell_map_info(uint16_t dev_id,
 	 * GPUVM doorbell on Tonga requires a workaround for VM TLB ACTIVE bit
 	 * lookup bug. Remove ASIC check when this is implemented in amdgpu.
 	 */
-	doorbell->use_gpuvm = (topology_is_dgpu(dev_id) &&
+	doorbell->use_gpuvm = (is_dgpu &&
 			       dev_info->asic_family != CHIP_TONGA);
 	doorbell->size = DOORBELLS_PAGE_SIZE(dev_info->doorbell_size);
 }
@@ -417,14 +440,28 @@ static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q)
 	if (node.NumFComputeCores && node.NumSIMDPerCU) {
 		uint32_t ctl_stack_size, wg_data_size;
 		uint32_t cu_num = node.NumFComputeCores / node.NumSIMDPerCU;
+		uint32_t wave_num = (q->dev_info->asic_family < CHIP_NAVI10)
+			? MIN(cu_num * 40, node.NumShaderBanks / node.NumArrays * 512)
+			: cu_num * 32;
 
-		ctl_stack_size = cu_num * WAVES_PER_CU * CNTL_STACK_BYTES_PER_WAVE + 8;
+		ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(q->dev_info->asic_family) + 8;
 		wg_data_size = cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(q->dev_info->asic_family);
-		q->ctl_stack_size = PAGE_ALIGN_UP(ctl_stack_size
-					+ sizeof(HsaUserContextSaveAreaHeader));
+		q->ctl_stack_size = PAGE_ALIGN_UP(sizeof(HsaUserContextSaveAreaHeader)
+					+ ctl_stack_size);
+		if (q->dev_info->asic_family >= CHIP_NAVI10 &&
+			q->dev_info->asic_family <= CHIP_NAVY_FLOUNDER) {
+			/* HW design limits control stack size to 0x7000.
+			 * This is insufficient for theoretical PM4 cases
+			 * but sufficient for AQL, limited by SPI events.
+			 */
+			q->ctl_stack_size = MIN(q->ctl_stack_size, 0x7000);
+		}
+
+		q->debug_memory_size =
+			ALIGN_UP(wave_num * DEBUGGER_BYTES_PER_WAVE(q->dev_info->asic_family), DEBUGGER_BYTES_ALIGN);
 
 		q->ctx_save_restore_size = q->ctl_stack_size
-					+ PAGE_ALIGN_UP(wg_data_size);
+					+ PAGE_ALIGN_UP(wg_data_size + q->debug_memory_size);
 		return true;
 	}
 	return false;
@@ -552,6 +589,8 @@ static int handle_concrete_asic(struct queue *q,
 	ret = update_ctx_save_restore_size(NodeId, q);
 
 	if (ret) {
+		HsaUserContextSaveAreaHeader *header;
+
 		args->ctx_save_restore_size = q->ctx_save_restore_size;
 		args->ctl_stack_size = q->ctl_stack_size;
 		q->ctx_save_restore =
@@ -562,6 +601,10 @@ static int handle_concrete_asic(struct queue *q,
 			return HSAKMT_STATUS_NO_MEMORY;
 
 		args->ctx_save_restore_address = (uintptr_t)q->ctx_save_restore;
+
+		header = (HsaUserContextSaveAreaHeader *)q->ctx_save_restore;
+		header->DebugOffset = q->ctx_save_restore_size - q->debug_memory_size;
+		header->DebugSize = q->debug_memory_size;
 	}
 
 	return HSAKMT_STATUS_SUCCESS;
diff --git a/src/spm.c b/src/spm.c
new file mode 100644
index 0000000..c4eb33e
--- /dev/null
+++ b/src/spm.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright © 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including
+ * the next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "libhsakmt.h"
+#include "linux/kfd_ioctl.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+/*
+ * hsaKmtSPMAcquire - issue KFD_IOCTL_SPM_OP_ACQUIRE for a node.
+ *
+ * @PreferredNode: node ID; checked and mapped to gpu_id by validate_nodeid().
+ *
+ * Returns the validate_nodeid() status when the node ID is rejected,
+ * otherwise the result of the AMDKFD_IOC_RLC_SPM ioctl.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSPMAcquire(HSAuint32 PreferredNode)
+{
+	int ret;
+	struct kfd_ioctl_spm_args args = {0};
+	uint32_t gpu_id;
+
+	ret = validate_nodeid(PreferredNode, &gpu_id);
+	if (ret != HSAKMT_STATUS_SUCCESS) {
+		pr_err("[%s] invalid node ID: %d\n", __func__, PreferredNode);
+		return ret;
+	}
+
+	args.op = KFD_IOCTL_SPM_OP_ACQUIRE;
+	args.gpu_id = gpu_id;
+
+	ret = kmtIoctl(kfd_fd, AMDKFD_IOC_RLC_SPM, &args);
+
+	return ret;
+}
+
+/*
+ * hsaKmtSPMSetDestBuffer - issue KFD_IOCTL_SPM_OP_SET_DEST_BUF for a node.
+ *
+ * @PreferredNode:     node ID; checked and mapped to gpu_id by validate_nodeid().
+ * @SizeInBytes:       passed to the kernel as args.buf_size.
+ * @timeout:           in: passed as args.timeout; out: overwritten with the
+ *                     value found in args.timeout after the ioctl.
+ * @SizeCopied:        out: args.bytes_copied after the ioctl.
+ * @DestMemoryAddress: passed to the kernel as args.dest_buf.
+ * @isSPMDataLoss:     out: args.has_data_loss after the ioctl.
+ *
+ * timeout, SizeCopied and isSPMDataLoss are dereferenced unconditionally
+ * and must not be NULL.
+ *
+ * Returns the validate_nodeid() status when the node ID is rejected (the
+ * output parameters are left untouched in that case), otherwise the
+ * result of the AMDKFD_IOC_RLC_SPM ioctl.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSPMSetDestBuffer(HSAuint32 PreferredNode,
+						HSAuint32 SizeInBytes,
+						HSAuint32 * timeout,
+						HSAuint32 * SizeCopied,
+						void *DestMemoryAddress,
+						bool *isSPMDataLoss)
+{
+	int ret;
+	struct kfd_ioctl_spm_args args = {0};
+	uint32_t gpu_id;
+
+	/* gpu_id has no initializer, so it can only be trusted after
+	 * validate_nodeid() reports success; never forward it otherwise.
+	 */
+	ret = validate_nodeid(PreferredNode, &gpu_id);
+	if (ret != HSAKMT_STATUS_SUCCESS) {
+		pr_err("[%s] invalid node ID: %d\n", __func__, PreferredNode);
+		return ret;
+	}
+
+	args.timeout    = *timeout;
+	args.dest_buf   = (uint64_t)DestMemoryAddress;
+	args.buf_size   = SizeInBytes;
+	args.op         = KFD_IOCTL_SPM_OP_SET_DEST_BUF;
+	args.gpu_id     = gpu_id;
+
+	ret = kmtIoctl(kfd_fd, AMDKFD_IOC_RLC_SPM, &args);
+
+	*SizeCopied = args.bytes_copied;
+	*isSPMDataLoss = args.has_data_loss;
+	*timeout = args.timeout;
+
+	return ret;
+}
+
+/*
+ * hsaKmtSPMRelease - issue KFD_IOCTL_SPM_OP_RELEASE for a node.
+ *
+ * @PreferredNode: node ID; checked and mapped to gpu_id by validate_nodeid().
+ *
+ * Returns the validate_nodeid() status when the node ID is rejected,
+ * otherwise the result of the AMDKFD_IOC_RLC_SPM ioctl.
+ */
+HSAKMT_STATUS HSAKMTAPI hsaKmtSPMRelease(HSAuint32 PreferredNode)
+{
+	int ret;
+	struct kfd_ioctl_spm_args args = {0};
+	uint32_t gpu_id;
+
+	ret = validate_nodeid(PreferredNode, &gpu_id);
+	if (ret != HSAKMT_STATUS_SUCCESS) {
+		pr_err("[%s] invalid node ID: %d\n", __func__, PreferredNode);
+		return ret;
+	}
+
+	args.op = KFD_IOCTL_SPM_OP_RELEASE;
+	args.gpu_id = gpu_id;
+
+	ret = kmtIoctl(kfd_fd, AMDKFD_IOC_RLC_SPM, &args);
+
+	return ret;
+}
diff --git a/src/topology.c b/src/topology.c
index 502337d..7c3f47d 100644
--- a/src/topology.c
+++ b/src/topology.c
@@ -92,149 +92,161 @@ struct hsa_gfxip_table force_asic_entry = {
 
 static const struct hsa_gfxip_table gfxip_lookup_table[] = {
 	/* Kaveri Family */
-	{ 0x1304, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x1305, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x1306, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x1307, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x1309, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x130A, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x130B, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x130C, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x130D, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x130E, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x130F, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x1310, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x1311, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x1312, 7, 0, 0, 0, "Spooky", CHIP_KAVERI },
-	{ 0x1313, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x1315, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x1316, 7, 0, 0, 0, "Spooky", CHIP_KAVERI },
-	{ 0x1317, 7, 0, 0, 0, "Spooky", CHIP_KAVERI },
-	{ 0x1318, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x131B, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x131C, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
-	{ 0x131D, 7, 0, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x1304, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x1305, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x1306, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x1307, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x1309, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x130A, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x130B, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x130C, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x130D, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x130E, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x130F, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x1310, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x1311, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x1312, 7, 0, 0, "Spooky", CHIP_KAVERI },
+	{ 0x1313, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x1315, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x1316, 7, 0, 0, "Spooky", CHIP_KAVERI },
+	{ 0x1317, 7, 0, 0, "Spooky", CHIP_KAVERI },
+	{ 0x1318, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x131B, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x131C, 7, 0, 0, "Spectre", CHIP_KAVERI },
+	{ 0x131D, 7, 0, 0, "Spectre", CHIP_KAVERI },
 	/* Hawaii Family */
-	{ 0x67A0, 7, 0, 1, 1, "Hawaii", CHIP_HAWAII },
-	{ 0x67A1, 7, 0, 1, 1, "Hawaii", CHIP_HAWAII },
-	{ 0x67A2, 7, 0, 1, 1, "Hawaii", CHIP_HAWAII },
-	{ 0x67A8, 7, 0, 1, 1, "Hawaii", CHIP_HAWAII },
-	{ 0x67A9, 7, 0, 1, 1, "Hawaii", CHIP_HAWAII },
-	{ 0x67AA, 7, 0, 1, 1, "Hawaii", CHIP_HAWAII },
-	{ 0x67B0, 7, 0, 1, 1, "Hawaii", CHIP_HAWAII },
-	{ 0x67B1, 7, 0, 1, 1, "Hawaii", CHIP_HAWAII },
-	{ 0x67B8, 7, 0, 1, 1, "Hawaii", CHIP_HAWAII },
-	{ 0x67B9, 7, 0, 1, 1, "Hawaii", CHIP_HAWAII },
-	{ 0x67BA, 7, 0, 1, 1, "Hawaii", CHIP_HAWAII },
-	{ 0x67BE, 7, 0, 1, 1, "Hawaii", CHIP_HAWAII },
+	{ 0x67A0, 7, 0, 1, "Hawaii", CHIP_HAWAII },
+	{ 0x67A1, 7, 0, 1, "Hawaii", CHIP_HAWAII },
+	{ 0x67A2, 7, 0, 1, "Hawaii", CHIP_HAWAII },
+	{ 0x67A8, 7, 0, 1, "Hawaii", CHIP_HAWAII },
+	{ 0x67A9, 7, 0, 1, "Hawaii", CHIP_HAWAII },
+	{ 0x67AA, 7, 0, 1, "Hawaii", CHIP_HAWAII },
+	{ 0x67B0, 7, 0, 1, "Hawaii", CHIP_HAWAII },
+	{ 0x67B1, 7, 0, 1, "Hawaii", CHIP_HAWAII },
+	{ 0x67B8, 7, 0, 1, "Hawaii", CHIP_HAWAII },
+	{ 0x67B9, 7, 0, 1, "Hawaii", CHIP_HAWAII },
+	{ 0x67BA, 7, 0, 1, "Hawaii", CHIP_HAWAII },
+	{ 0x67BE, 7, 0, 1, "Hawaii", CHIP_HAWAII },
 	/* Carrizo Family */
-	{ 0x9870, 8, 0, 1, 0, "Carrizo", CHIP_CARRIZO },
-	{ 0x9874, 8, 0, 1, 0, "Carrizo", CHIP_CARRIZO },
-	{ 0x9875, 8, 0, 1, 0, "Carrizo", CHIP_CARRIZO },
-	{ 0x9876, 8, 0, 1, 0, "Carrizo", CHIP_CARRIZO },
-	{ 0x9877, 8, 0, 1, 0, "Carrizo", CHIP_CARRIZO },
+	{ 0x9870, 8, 0, 1, "Carrizo", CHIP_CARRIZO },
+	{ 0x9874, 8, 0, 1, "Carrizo", CHIP_CARRIZO },
+	{ 0x9875, 8, 0, 1, "Carrizo", CHIP_CARRIZO },
+	{ 0x9876, 8, 0, 1, "Carrizo", CHIP_CARRIZO },
+	{ 0x9877, 8, 0, 1, "Carrizo", CHIP_CARRIZO },
 	/* Tonga Family */
-	{ 0x6920, 8, 0, 2, 1, "Tonga", CHIP_TONGA },
-	{ 0x6921, 8, 0, 2, 1, "Tonga", CHIP_TONGA },
-	{ 0x6928, 8, 0, 2, 1, "Tonga", CHIP_TONGA },
-	{ 0x6929, 8, 0, 2, 1, "Tonga", CHIP_TONGA },
-	{ 0x692B, 8, 0, 2, 1, "Tonga", CHIP_TONGA },
-	{ 0x692F, 8, 0, 2, 1, "Tonga", CHIP_TONGA },
-	{ 0x6930, 8, 0, 2, 1, "Tonga", CHIP_TONGA },
-	{ 0x6938, 8, 0, 2, 1, "Tonga", CHIP_TONGA },
-	{ 0x6939, 8, 0, 2, 1, "Tonga", CHIP_TONGA },
+	{ 0x6920, 8, 0, 2, "Tonga", CHIP_TONGA },
+	{ 0x6921, 8, 0, 2, "Tonga", CHIP_TONGA },
+	{ 0x6928, 8, 0, 2, "Tonga", CHIP_TONGA },
+	{ 0x6929, 8, 0, 2, "Tonga", CHIP_TONGA },
+	{ 0x692B, 8, 0, 2, "Tonga", CHIP_TONGA },
+	{ 0x692F, 8, 0, 2, "Tonga", CHIP_TONGA },
+	{ 0x6930, 8, 0, 2, "Tonga", CHIP_TONGA },
+	{ 0x6938, 8, 0, 2, "Tonga", CHIP_TONGA },
+	{ 0x6939, 8, 0, 2, "Tonga", CHIP_TONGA },
 	/* Fiji */
-	{ 0x7300, 8, 0, 3, 1, "Fiji", CHIP_FIJI },
-	{ 0x730F, 8, 0, 3, 1, "Fiji", CHIP_FIJI },
+	{ 0x7300, 8, 0, 3, "Fiji", CHIP_FIJI },
+	{ 0x730F, 8, 0, 3, "Fiji", CHIP_FIJI },
 	/* Polaris10 */
-	{ 0x67C0, 8, 0, 3, 1, "Polaris10", CHIP_POLARIS10 },
-	{ 0x67C1, 8, 0, 3, 1, "Polaris10", CHIP_POLARIS10 },
-	{ 0x67C2, 8, 0, 3, 1, "Polaris10", CHIP_POLARIS10 },
-	{ 0x67C4, 8, 0, 3, 1, "Polaris10", CHIP_POLARIS10 },
-	{ 0x67C7, 8, 0, 3, 1, "Polaris10", CHIP_POLARIS10 },
-	{ 0x67C8, 8, 0, 3, 1, "Polaris10", CHIP_POLARIS10 },
-	{ 0x67C9, 8, 0, 3, 1, "Polaris10", CHIP_POLARIS10 },
-	{ 0x67CA, 8, 0, 3, 1, "Polaris10", CHIP_POLARIS10 },
-	{ 0x67CC, 8, 0, 3, 1, "Polaris10", CHIP_POLARIS10 },
-	{ 0x67CF, 8, 0, 3, 1, "Polaris10", CHIP_POLARIS10 },
-	{ 0x67D0, 8, 0, 3, 1, "Polaris10", CHIP_POLARIS10 },
-	{ 0x67DF, 8, 0, 3, 1, "Polaris10", CHIP_POLARIS10 },
-	{ 0x6FDF, 8, 0, 3, 1, "Polaris10", CHIP_POLARIS10 },
+	{ 0x67C0, 8, 0, 3, "Polaris10", CHIP_POLARIS10 },
+	{ 0x67C1, 8, 0, 3, "Polaris10", CHIP_POLARIS10 },
+	{ 0x67C2, 8, 0, 3, "Polaris10", CHIP_POLARIS10 },
+	{ 0x67C4, 8, 0, 3, "Polaris10", CHIP_POLARIS10 },
+	{ 0x67C7, 8, 0, 3, "Polaris10", CHIP_POLARIS10 },
+	{ 0x67C8, 8, 0, 3, "Polaris10", CHIP_POLARIS10 },
+	{ 0x67C9, 8, 0, 3, "Polaris10", CHIP_POLARIS10 },
+	{ 0x67CA, 8, 0, 3, "Polaris10", CHIP_POLARIS10 },
+	{ 0x67CC, 8, 0, 3, "Polaris10", CHIP_POLARIS10 },
+	{ 0x67CF, 8, 0, 3, "Polaris10", CHIP_POLARIS10 },
+	{ 0x67D0, 8, 0, 3, "Polaris10", CHIP_POLARIS10 },
+	{ 0x67DF, 8, 0, 3, "Polaris10", CHIP_POLARIS10 },
+	{ 0x6FDF, 8, 0, 3, "Polaris10", CHIP_POLARIS10 },
 	/* Polaris11 */
-	{ 0x67E0, 8, 0, 3, 1, "Polaris11", CHIP_POLARIS11 },
-	{ 0x67E1, 8, 0, 3, 1, "Polaris11", CHIP_POLARIS11 },
-	{ 0x67E3, 8, 0, 3, 1, "Polaris11", CHIP_POLARIS11 },
-	{ 0x67E7, 8, 0, 3, 1, "Polaris11", CHIP_POLARIS11 },
-	{ 0x67E8, 8, 0, 3, 1, "Polaris11", CHIP_POLARIS11 },
-	{ 0x67E9, 8, 0, 3, 1, "Polaris11", CHIP_POLARIS11 },
-	{ 0x67EB, 8, 0, 3, 1, "Polaris11", CHIP_POLARIS11 },
-	{ 0x67EF, 8, 0, 3, 1, "Polaris11", CHIP_POLARIS11 },
-	{ 0x67FF, 8, 0, 3, 1, "Polaris11", CHIP_POLARIS11 },
+	{ 0x67E0, 8, 0, 3, "Polaris11", CHIP_POLARIS11 },
+	{ 0x67E1, 8, 0, 3, "Polaris11", CHIP_POLARIS11 },
+	{ 0x67E3, 8, 0, 3, "Polaris11", CHIP_POLARIS11 },
+	{ 0x67E7, 8, 0, 3, "Polaris11", CHIP_POLARIS11 },
+	{ 0x67E8, 8, 0, 3, "Polaris11", CHIP_POLARIS11 },
+	{ 0x67E9, 8, 0, 3, "Polaris11", CHIP_POLARIS11 },
+	{ 0x67EB, 8, 0, 3, "Polaris11", CHIP_POLARIS11 },
+	{ 0x67EF, 8, 0, 3, "Polaris11", CHIP_POLARIS11 },
+	{ 0x67FF, 8, 0, 3, "Polaris11", CHIP_POLARIS11 },
 	/* Polaris12 */
-	{ 0x6980, 8, 0, 3, 1, "Polaris12", CHIP_POLARIS12 },
-	{ 0x6981, 8, 0, 3, 1, "Polaris12", CHIP_POLARIS12 },
-	{ 0x6985, 8, 0, 3, 1, "Polaris12", CHIP_POLARIS12 },
-	{ 0x6986, 8, 0, 3, 1, "Polaris12", CHIP_POLARIS12 },
-	{ 0x6987, 8, 0, 3, 1, "Polaris12", CHIP_POLARIS12 },
-	{ 0x6995, 8, 0, 3, 1, "Polaris12", CHIP_POLARIS12 },
-	{ 0x6997, 8, 0, 3, 1, "Polaris12", CHIP_POLARIS12 },
-	{ 0x699F, 8, 0, 3, 1, "Polaris12", CHIP_POLARIS12 },
+	{ 0x6980, 8, 0, 3, "Polaris12", CHIP_POLARIS12 },
+	{ 0x6981, 8, 0, 3, "Polaris12", CHIP_POLARIS12 },
+	{ 0x6985, 8, 0, 3, "Polaris12", CHIP_POLARIS12 },
+	{ 0x6986, 8, 0, 3, "Polaris12", CHIP_POLARIS12 },
+	{ 0x6987, 8, 0, 3, "Polaris12", CHIP_POLARIS12 },
+	{ 0x6995, 8, 0, 3, "Polaris12", CHIP_POLARIS12 },
+	{ 0x6997, 8, 0, 3, "Polaris12", CHIP_POLARIS12 },
+	{ 0x699F, 8, 0, 3, "Polaris12", CHIP_POLARIS12 },
 	/* VegaM */
-	{ 0x694C, 8, 0, 3, 1, "VegaM", CHIP_VEGAM },
-	{ 0x694E, 8, 0, 3, 1, "VegaM", CHIP_VEGAM },
-	{ 0x694F, 8, 0, 3, 1, "VegaM", CHIP_VEGAM },
+	{ 0x694C, 8, 0, 3, "VegaM", CHIP_VEGAM },
+	{ 0x694E, 8, 0, 3, "VegaM", CHIP_VEGAM },
+	{ 0x694F, 8, 0, 3, "VegaM", CHIP_VEGAM },
 	/* Vega10 */
-	{ 0x6860, 9, 0, 0, 1, "Vega10", CHIP_VEGA10 },
-	{ 0x6861, 9, 0, 0, 1, "Vega10", CHIP_VEGA10 },
-	{ 0x6862, 9, 0, 0, 1, "Vega10", CHIP_VEGA10 },
-	{ 0x6863, 9, 0, 0, 1, "Vega10", CHIP_VEGA10 },
-	{ 0x6864, 9, 0, 0, 1, "Vega10", CHIP_VEGA10 },
-	{ 0x6867, 9, 0, 0, 1, "Vega10", CHIP_VEGA10 },
-	{ 0x6868, 9, 0, 0, 1, "Vega10", CHIP_VEGA10 },
-	{ 0x6869, 9, 0, 0, 1, "Vega10", CHIP_VEGA10 },
-	{ 0x686A, 9, 0, 0, 1, "Vega10", CHIP_VEGA10 },
-	{ 0x686B, 9, 0, 0, 1, "Vega10", CHIP_VEGA10 },
-	{ 0x686C, 9, 0, 0, 1, "Vega10", CHIP_VEGA10 },
-	{ 0x686D, 9, 0, 0, 1, "Vega10", CHIP_VEGA10 },
-	{ 0x686E, 9, 0, 0, 1, "Vega10", CHIP_VEGA10 },
-	{ 0x687F, 9, 0, 0, 1, "Vega10", CHIP_VEGA10 },
+	{ 0x6860, 9, 0, 0, "Vega10", CHIP_VEGA10 },
+	{ 0x6861, 9, 0, 0, "Vega10", CHIP_VEGA10 },
+	{ 0x6862, 9, 0, 0, "Vega10", CHIP_VEGA10 },
+	{ 0x6863, 9, 0, 0, "Vega10", CHIP_VEGA10 },
+	{ 0x6864, 9, 0, 0, "Vega10", CHIP_VEGA10 },
+	{ 0x6867, 9, 0, 0, "Vega10", CHIP_VEGA10 },
+	{ 0x6868, 9, 0, 0, "Vega10", CHIP_VEGA10 },
+	{ 0x6869, 9, 0, 0, "Vega10", CHIP_VEGA10 },
+	{ 0x686A, 9, 0, 0, "Vega10", CHIP_VEGA10 },
+	{ 0x686B, 9, 0, 0, "Vega10", CHIP_VEGA10 },
+	{ 0x686C, 9, 0, 0, "Vega10", CHIP_VEGA10 },
+	{ 0x686D, 9, 0, 0, "Vega10", CHIP_VEGA10 },
+	{ 0x686E, 9, 0, 0, "Vega10", CHIP_VEGA10 },
+	{ 0x687F, 9, 0, 0, "Vega10", CHIP_VEGA10 },
 	/* Vega12 */
-	{ 0x69A0, 9, 0, 4, 1, "Vega12", CHIP_VEGA12 },
-	{ 0x69A1, 9, 0, 4, 1, "Vega12", CHIP_VEGA12 },
-	{ 0x69A2, 9, 0, 4, 1, "Vega12", CHIP_VEGA12 },
-	{ 0x69A3, 9, 0, 4, 1, "Vega12", CHIP_VEGA12 },
-	{ 0x69Af, 9, 0, 4, 1, "Vega12", CHIP_VEGA12 },
+	{ 0x69A0, 9, 0, 4, "Vega12", CHIP_VEGA12 },
+	{ 0x69A1, 9, 0, 4, "Vega12", CHIP_VEGA12 },
+	{ 0x69A2, 9, 0, 4, "Vega12", CHIP_VEGA12 },
+	{ 0x69A3, 9, 0, 4, "Vega12", CHIP_VEGA12 },
+	{ 0x69Af, 9, 0, 4, "Vega12", CHIP_VEGA12 },
 	/* Raven */
-	{ 0x15DD, 9, 0, 2, 0, "Raven", CHIP_RAVEN },
-	{ 0x15D8, 9, 0, 2, 0, "Raven", CHIP_RAVEN },
+	{ 0x15DD, 9, 0, 2, "Raven", CHIP_RAVEN },
+	{ 0x15D8, 9, 0, 2, "Raven", CHIP_RAVEN },
 	/* Renoir */
-	{ 0x1636, 9, 0, 0, 1, "Renoir", CHIP_RENOIR },
+	{ 0x1636, 9, 0, 2, "Renoir", CHIP_RENOIR },
+	{ 0x1638, 9, 0, 2, "Renoir", CHIP_RENOIR },
+	{ 0x164C, 9, 0, 2, "Renoir", CHIP_RENOIR },
 	/* Vega20 */
-	{ 0x66A0, 9, 0, 6, 1, "Vega20", CHIP_VEGA20 },
-	{ 0x66A1, 9, 0, 6, 1, "Vega20", CHIP_VEGA20 },
-	{ 0x66A2, 9, 0, 6, 1, "Vega20", CHIP_VEGA20 },
-	{ 0x66A3, 9, 0, 6, 1, "Vega20", CHIP_VEGA20 },
-	{ 0x66A4, 9, 0, 6, 1, "Vega20", CHIP_VEGA20 },
-	{ 0x66A7, 9, 0, 6, 1, "Vega20", CHIP_VEGA20 },
-	{ 0x66AF, 9, 0, 6, 1, "Vega20", CHIP_VEGA20 },
+	{ 0x66A0, 9, 0, 6, "Vega20", CHIP_VEGA20 },
+	{ 0x66A1, 9, 0, 6, "Vega20", CHIP_VEGA20 },
+	{ 0x66A2, 9, 0, 6, "Vega20", CHIP_VEGA20 },
+	{ 0x66A3, 9, 0, 6, "Vega20", CHIP_VEGA20 },
+	{ 0x66A4, 9, 0, 6, "Vega20", CHIP_VEGA20 },
+	{ 0x66A7, 9, 0, 6, "Vega20", CHIP_VEGA20 },
+	{ 0x66AF, 9, 0, 6, "Vega20", CHIP_VEGA20 },
 	/* Arcturus */
-	{ 0x7388, 9, 0, 8, 1, "Arcturus", CHIP_ARCTURUS },
-	{ 0x738C, 9, 0, 8, 1, "Arcturus", CHIP_ARCTURUS },
-	{ 0x738E, 9, 0, 8, 1, "Arcturus", CHIP_ARCTURUS },
-	{ 0x7390, 9, 0, 8, 1, "Arcturus", CHIP_ARCTURUS },
+	{ 0x7388, 9, 0, 8, "Arcturus", CHIP_ARCTURUS },
+	{ 0x738C, 9, 0, 8, "Arcturus", CHIP_ARCTURUS },
+	{ 0x738E, 9, 0, 8, "Arcturus", CHIP_ARCTURUS },
+	{ 0x7390, 9, 0, 8, "Arcturus", CHIP_ARCTURUS },
 	/* Navi10 */
-	{ 0x7310, 10, 1, 0, 1, "Navi10", CHIP_NAVI10 },
-	{ 0x7312, 10, 1, 0, 1, "Navi10", CHIP_NAVI10 },
-	{ 0x7318, 10, 1, 0, 1, "Navi10", CHIP_NAVI10 },
-	{ 0x731A, 10, 1, 0, 1, "Navi10", CHIP_NAVI10 },
-	{ 0x731F, 10, 1, 0, 1, "Navi10", CHIP_NAVI10 },
+	{ 0x7310, 10, 1, 0, "Navi10", CHIP_NAVI10 },
+	{ 0x7312, 10, 1, 0, "Navi10", CHIP_NAVI10 },
+	{ 0x7318, 10, 1, 0, "Navi10", CHIP_NAVI10 },
+	{ 0x731A, 10, 1, 0, "Navi10", CHIP_NAVI10 },
+	{ 0x731E, 10, 1, 0, "Navi10", CHIP_NAVI10 },
+	{ 0x731F, 10, 1, 0, "Navi10", CHIP_NAVI10 },
 	/* Navi14 */
-	{ 0x7340, 10, 1, 2, 1, "Navi14", CHIP_NAVI14 },
-	{ 0x7341, 10, 1, 2, 1, "Navi14", CHIP_NAVI14 },
-	{ 0x7347, 10, 1, 2, 1, "Navi14", CHIP_NAVI14 },
+	{ 0x7340, 10, 1, 2, "Navi14", CHIP_NAVI14 },
+	{ 0x7341, 10, 1, 2, "Navi14", CHIP_NAVI14 },
+	{ 0x7347, 10, 1, 2, "Navi14", CHIP_NAVI14 },
 	/* Navi12 */
-	{ 0x7360, 10, 1, 1, 1, "Navi12", CHIP_NAVI12 },
-	{ 0x7362, 10, 1, 1, 1, "Navi12", CHIP_NAVI12 },
+	{ 0x7360, 10, 1, 1, "Navi12", CHIP_NAVI12 },
+	{ 0x7362, 10, 1, 1, "Navi12", CHIP_NAVI12 },
+	/* SIENNA_CICHLID */
+	{ 0x73A0, 10, 3, 0, "SIENNA_CICHLID", CHIP_SIENNA_CICHLID },
+	{ 0x73A2, 10, 3, 0, "SIENNA_CICHLID", CHIP_SIENNA_CICHLID },
+	{ 0x73A3, 10, 3, 0, "SIENNA_CICHLID", CHIP_SIENNA_CICHLID },
+	{ 0x73AB, 10, 3, 0, "SIENNA_CICHLID", CHIP_SIENNA_CICHLID },
+	{ 0x73AE, 10, 3, 0, "SIENNA_CICHLID", CHIP_SIENNA_CICHLID },
+	{ 0x73BF, 10, 3, 0, "SIENNA_CICHLID", CHIP_SIENNA_CICHLID },
+	/* VanGogh */
+	{ 0x163F, 10, 3, 3, "VanGogh", CHIP_VANGOGH },
 };
 
 /* information from /proc/cpuinfo */
@@ -640,7 +652,6 @@ HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties *props)
 	bool is_node_supported = true;
 	uint32_t num_supported_nodes = 0;
 
-
 	assert(props);
 	fd = fopen(KFD_SYSFS_PATH_SYSTEM_PROPERTIES, "r");
 	if (!fd)
@@ -753,24 +764,19 @@ HSAKMT_STATUS topology_get_asic_family(uint16_t device_id,
 	return HSAKMT_STATUS_SUCCESS;
 }
 
-bool topology_is_dgpu(uint16_t device_id)
-{
-	const struct hsa_gfxip_table *hsa_gfxip =
-				find_hsa_gfxip_device(device_id);
 
-	if (hsa_gfxip && hsa_gfxip->is_dgpu) {
+void topology_setup_is_dgpu_param(HsaNodeProperties *props)
+{
+	/* if we found a dGPU node, then treat the whole system as dGPU */
+	if (!props->NumCPUCores && props->NumFComputeCores)
 		is_dgpu = true;
-		return true;
-	}
-	is_dgpu = false;
-	return false;
 }
 
 bool topology_is_svm_needed(uint16_t device_id)
 {
 	const struct hsa_gfxip_table *hsa_gfxip;
 
-	if (topology_is_dgpu(device_id))
+	if (is_dgpu)
 		return true;
 
 	hsa_gfxip = find_hsa_gfxip_device(device_id);
@@ -858,7 +864,7 @@ static HSAKMT_STATUS topology_parse_cpuinfo(struct proc_cpuinfo *cpuinfo,
 			p += 2; /* remove ": " */
 			proc = atoi(p);
 			if (proc >= num_procs) {
-				pr_warn("cpuinfo contains processor %d lager than %u\n",
+				pr_warn("cpuinfo contains processor %d larger than %u\n",
 					proc, num_procs);
 				ret = HSAKMT_STATUS_NO_MEMORY;
 				goto exit;
@@ -880,12 +886,10 @@ static HSAKMT_STATUS topology_parse_cpuinfo(struct proc_cpuinfo *cpuinfo,
 			p = strchr(read_buf, ':');
 			p += 2; /* remove ": " */
 			p_len = strlen(p);
-			if (p_len < HSA_PUBLIC_NAME_SIZE) {
-				/* -1 to remove \n from p */
-				strncpy(cpuinfo[proc].model_name, p, p_len - 1);
-				cpuinfo[proc].model_name[p_len - 1] = '\0';
-			} else
-				strncpy(cpuinfo[proc].model_name, p, HSA_PUBLIC_NAME_SIZE);
+			if (p_len > HSA_PUBLIC_NAME_SIZE)
+				p_len = HSA_PUBLIC_NAME_SIZE;
+			memcpy(cpuinfo[proc].model_name, p, p_len);
+			cpuinfo[proc].model_name[p_len - 1] = '\0';
 			continue;
 		}
 
@@ -911,7 +915,9 @@ exit:
 HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id,
 					    HsaNodeProperties *props,
 					    uint32_t *gpu_id,
-					    struct pci_ids pacc)
+					    struct pci_ids pacc,
+					    bool *p2p_links,
+					    uint32_t *num_p2pLinks)
 {
 	FILE *fd;
 	char *read_buf, *p, *envvar, dummy;
@@ -973,7 +979,13 @@ HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id,
 			props->NumCaches = (uint32_t)prop_val;
 		else if (strcmp(prop_name, "io_links_count") == 0)
 			props->NumIOLinks = (uint32_t)prop_val;
-		else if (strcmp(prop_name, "cpu_core_id_base") == 0)
+		else if (strcmp(prop_name, "p2p_links_count") == 0) {
+			props->NumIOLinks += (uint32_t)prop_val;
+			if (num_p2pLinks)
+				*num_p2pLinks = (uint32_t)prop_val;
+			if (p2p_links)
+				*p2p_links = true;
+		} else if (strcmp(prop_name, "cpu_core_id_base") == 0)
 			props->CComputeIdLo = (uint32_t)prop_val;
 		else if (strcmp(prop_name, "simd_id_base") == 0)
 			props->FComputeIdLo = (uint32_t)prop_val;
@@ -1410,7 +1422,7 @@ static HSAKMT_STATUS topology_map_sysfs_to_user_node_id(uint32_t sys_node_id, ui
  */
 static HSAKMT_STATUS topology_sysfs_get_iolink_props(uint32_t node_id,
 						     uint32_t iolink_id,
-						     HsaIoLinkProperties *props)
+						     HsaIoLinkProperties *props, bool p2pLink)
 {
 	FILE *fd;
 	char *read_buf, *p;
@@ -1427,7 +1439,11 @@ static HSAKMT_STATUS topology_sysfs_get_iolink_props(uint32_t node_id,
 	if (ret != HSAKMT_STATUS_SUCCESS)
 		return ret;
 
-	snprintf(path, 256, "%s/%d/io_links/%d/properties", KFD_SYSFS_PATH_NODES, sys_node_id, iolink_id);
+	if (p2pLink)
+		snprintf(path, 256, "%s/%d/p2p_links/%d/properties", KFD_SYSFS_PATH_NODES, sys_node_id, iolink_id);
+	else
+		snprintf(path, 256, "%s/%d/io_links/%d/properties", KFD_SYSFS_PATH_NODES, sys_node_id, iolink_id);
+
 	fd = fopen(path, "r");
 	if (!fd)
 		return HSAKMT_STATUS_ERROR;
@@ -1735,6 +1751,9 @@ HSAKMT_STATUS topology_take_snapshot(void)
 	struct pci_ids pacc;
 	struct proc_cpuinfo *cpuinfo;
 	const uint32_t num_procs = get_nprocs();
+	uint32_t num_ioLinks;
+	bool p2p_links = false;
+	uint32_t num_p2pLinks = 0;
 
 	cpuinfo = calloc(num_procs, sizeof(struct proc_cpuinfo));
 	if (!cpuinfo) {
@@ -1760,7 +1779,8 @@ retry:
 		for (i = 0; i < sys_props.NumNodes; i++) {
 			ret = topology_sysfs_get_node_props(i,
 					&temp_props[i].node,
-					&temp_props[i].gpu_id, pacc);
+					&temp_props[i].gpu_id,
+					pacc, &p2p_links, &num_p2pLinks);
 			if (ret != HSAKMT_STATUS_SUCCESS) {
 				free_properties(temp_props, i);
 				goto err;
@@ -1818,17 +1838,19 @@ retry:
 				free_properties(temp_props, i + 1);
 				goto err;
 			}
+			num_ioLinks = temp_props[i].node.NumIOLinks - num_p2pLinks;
+			uint32_t link_id = 0;
 
-			if (temp_props[i].node.NumIOLinks) {
-				uint32_t sys_link_id = 0, link_id = 0;
+			if (num_ioLinks) {
+				uint32_t sys_link_id = 0;
 
 				/* Parse all the sysfs specified io links. Skip the ones where the
 				 * remote node (node_to) is not accessible
 				 */
-				while (sys_link_id < temp_props[i].node.NumIOLinks &&
+				while (sys_link_id < num_ioLinks &&
 					link_id < sys_props.NumNodes - 1) {
 					ret = topology_sysfs_get_iolink_props(i, sys_link_id++,
-									      &temp_props[i].link[link_id]);
+								&temp_props[i].link[link_id], false);
 					if (ret == HSAKMT_STATUS_NOT_SUPPORTED) {
 						ret = HSAKMT_STATUS_SUCCESS;
 						continue;
@@ -1839,16 +1861,39 @@ retry:
 					link_id++;
 				}
 				/* sysfs specifies all the io links. Limit the number to valid ones */
+				num_ioLinks = link_id;
+			}
+
+			if (num_p2pLinks) {
+				uint32_t sys_link_id = 0;
+
+				/* Parse all the sysfs specified p2p links.
+				 */
+				while (sys_link_id < num_p2pLinks &&
+					link_id < sys_props.NumNodes - 1) {
+					ret = topology_sysfs_get_iolink_props(i, sys_link_id++,
+								&temp_props[i].link[link_id], true);
+					if (ret == HSAKMT_STATUS_NOT_SUPPORTED) {
+						ret = HSAKMT_STATUS_SUCCESS;
+						continue;
+					} else if (ret != HSAKMT_STATUS_SUCCESS) {
+						free_properties(temp_props, i + 1);
+						goto err;
+					}
+					link_id++;
+				}
 				temp_props[i].node.NumIOLinks = link_id;
 			}
 		}
 		pci_ids_destroy(pacc);
 	}
 
-	/* All direct IO links are created in the kernel. Here we need to
-	 * connect GPU<->GPU or GPU<->CPU indirect IO links.
-	 */
-	topology_create_indirect_gpu_links(&sys_props, temp_props);
+	if (!p2p_links) {
+		/* All direct IO links are created in the kernel. Here we need to
+		 * connect GPU<->GPU or GPU<->CPU indirect IO links.
+		 */
+		topology_create_indirect_gpu_links(&sys_props, temp_props);
+	}
 
 	ret = topology_sysfs_get_generation(&gen_end);
 	if (ret != HSAKMT_STATUS_SUCCESS) {
@@ -2007,7 +2052,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeProperties(HSAuint32 NodeId,
 	/* For CPU only node don't add any additional GPU memory banks. */
 	if (gpu_id) {
 		uint64_t base, limit;
-		if (topology_is_dgpu(get_device_id_by_gpu_id(gpu_id)))
+		if (is_dgpu)
 			NodeProperties->NumMemoryBanks += NUM_OF_DGPU_HEAPS;
 		else
 			NodeProperties->NumMemoryBanks += NUM_OF_IGPU_HEAPS;
diff --git a/tests/kfdtest/CMakeLists.txt b/tests/kfdtest/CMakeLists.txt
index d92a615..17d4b9b 100644
--- a/tests/kfdtest/CMakeLists.txt
+++ b/tests/kfdtest/CMakeLists.txt
@@ -24,7 +24,7 @@
 # If environment variable DRM_DIR or LIBHSAKMT_PATH is set, the script
 # will pick up the corresponding libraries from those pathes.
 
-cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
 
 project(KFDTest)
 
@@ -165,6 +165,12 @@ else ()
     set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g" )
 endif ()
 
+## Address Sanitizer: when ADDRESS_SANITIZER is true, append the flag to the existing compile and link flags
+if ( ADDRESS_SANITIZER )
+    set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address" )
+    set ( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address" )
+endif ()
+
 # link_directories() has to be put before add_executable()
 # The modules found by pkg_check_modules() in the default pkg config
 # path do not need to use link_directories() here.
@@ -173,7 +179,7 @@ link_directories(${SP3_DIR})
 
 add_executable(kfdtest ${SRC_FILES})
 
-target_link_libraries(kfdtest ${HSAKMT_LIBRARIES} ${DRM_LIBRARIES} ${DRM_AMDGPU_LIBRARIES} pthread m stdc++ rt amdsp3 numa)
+target_link_libraries(kfdtest ${HSAKMT_LIBRARIES} ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread m stdc++ rt amdsp3 numa)
 
 configure_file ( scripts/kfdtest.exclude kfdtest.exclude COPYONLY )
 configure_file ( scripts/run_kfdtest.sh run_kfdtest.sh COPYONLY )
diff --git a/tests/kfdtest/scripts/kfdtest.exclude b/tests/kfdtest/scripts/kfdtest.exclude
index 7d71f69..6a8d6e9 100644
--- a/tests/kfdtest/scripts/kfdtest.exclude
+++ b/tests/kfdtest/scripts/kfdtest.exclude
@@ -173,15 +173,25 @@ FILTER[vega20]=\
 "KFDEvictTest.BurstyTest:"\
 "KFDQMTest.GPUDoorbellWrite"
 
+FILTER[raven_dgpuFallback]=\
+"$BLACKLIST_ALL_ASICS:"\
+"KFDMemoryTest.MMBench:"\
+"KFDMemoryTest.MemoryRegister:"\
+"KFDQMTest.SdmaConcurrentCopies"
+
 FILTER[raven]=\
 "$BLACKLIST_ALL_ASICS:"\
-"KFDQMTest.QueueLatency:"\
-"KFDQMTest.SdmaEventInterrupt"
+"KFDQMTest.SdmaConcurrentCopies:"\
+"KFDQMTest.MultipleCpQueuesStressDispatch"
 
 FILTER[renoir]=\
 "$BLACKLIST_ALL_ASICS:"\
-"KFDQMTest.QueueLatency:"\
-"KFDQMTest.SdmaEventInterrupt"
+"KFDEvictTest.*:"\
+"KFDQMTest.SdmaEventInterrupt:"\
+"KFDMemoryTest.LargestSysBufferTest:"\
+"KFDQMTest.BasicCuMaskingEven:"\
+"KFDMemoryTest.MMBench:"\
+"KFDMemoryTest.SignalHandling"
 
 # KFDExceptionTest.* (KFD-435)
 # KFDEvictTest.BurstyTest (KFD-425)
@@ -204,3 +214,15 @@ FILTER[sienna_cichlid]=\
 "KFDQMTest.BasicCuMaskingEven:"\
 "KFDDBGTest.*:"\
 "KFDPerfCountersTest.*:"\
+
+FILTER[navy_flounder]=\
+"$BLACKLIST_ALL_ASICS:"\
+"KFDQMTest.BasicCuMaskingEven:"\
+"KFDDBGTest.*:"\
+"KFDPerfCountersTest.*:"\
+
+FILTER[dimgrey_cavefish]=\
+"$BLACKLIST_ALL_ASICS:"\
+"KFDQMTest.BasicCuMaskingEven:"\
+"KFDDBGTest.*:"\
+"KFDPerfCountersTest.*:"\
diff --git a/tests/kfdtest/scripts/run_kfdtest.sh b/tests/kfdtest/scripts/run_kfdtest.sh
index 12ef8af..6311e1b 100755
--- a/tests/kfdtest/scripts/run_kfdtest.sh
+++ b/tests/kfdtest/scripts/run_kfdtest.sh
@@ -144,6 +144,13 @@ getHsaNodes() {
 getNodeName() {
     local nodeId=$1; shift;
     local gpuName=$(cat $TOPOLOGY_SYSFS_DIR/$nodeId/name)
+    # A "raven" node reporting SIMDs but no CPU cores is mapped to the raven_dgpuFallback filter
+    if [ "$gpuName" == "raven" ]; then
+      local props="$TOPOLOGY_SYSFS_DIR/$nodeId/properties"
+      local CpuCoresCount=$(awk '/cpu_cores_count/ {print $2}' "$props")
+      local SimdCount=$(awk '/simd_count/ {print $2}' "$props")
+      if [ "${CpuCoresCount:-1}" -eq 0 ] && [ "${SimdCount:-0}" -gt 0 ]; then gpuName="raven_dgpuFallback"; fi
+    fi
     echo "$gpuName"
 }
 
@@ -244,16 +251,17 @@ while [ "$1" != "" ]; do
 done
 
 # If the SMI is missing, just report and continue
+SMI="$(find /opt/rocm* -type l -name rocm-smi 2>/dev/null | tail -1)"
 if [ "$FORCE_HIGH" == "true" ]; then
-    if [ -e "/opt/rocm/bin/rocm-smi" ]; then
-        OLDPERF=$(/opt/rocm/bin/rocm-smi -p | awk '/Performance Level/ {print $NF; exit}')
-	$(/opt/rocm/bin/rocm-smi --setperflevel high &> /dev/null)
+    if [ -e "$SMI" ]; then
+        OLDPERF=$($SMI -p | awk '/Performance Level:/ {print $NF; exit}')
+	$($SMI --setperflevel high &> /dev/null)
 	if [ $? != 0 ]; then
             echo "SMI failed to set perf level"
 	    OLDPERF=""
         fi
     else
-        echo "Unable to set clocks to high"
+        echo "Unable to set clocks to high, cannot find rocm-smi"
     fi
 fi
 
@@ -263,5 +271,5 @@ runKfdTest
 
 # OLDPERF is only set if FORCE_HIGH and SMI both exist
 if [ -n "$OLDPERF" ]; then
-    /opt/rocm/bin/rocm-smi --setperflevel $OLDPERF &> /dev/null
+    "$SMI" --setperflevel "$OLDPERF" &> /dev/null
 fi
diff --git a/tests/kfdtest/src/BaseQueue.cpp b/tests/kfdtest/src/BaseQueue.cpp
index 835ab4c..e66d3dd 100644
--- a/tests/kfdtest/src/BaseQueue.cpp
+++ b/tests/kfdtest/src/BaseQueue.cpp
@@ -186,6 +186,8 @@ BaseQueue* QueueArray::GetQueue(unsigned int Node) {
     case HSA_QUEUE_COMPUTE_AQL:
         pQueue = new AqlQueue();
         break;
+    default:
+        return NULL;
     }
 
     if (pQueue) {
diff --git a/tests/kfdtest/src/KFDMemoryTest.cpp b/tests/kfdtest/src/KFDMemoryTest.cpp
index aa816b8..d27d796 100644
--- a/tests/kfdtest/src/KFDMemoryTest.cpp
+++ b/tests/kfdtest/src/KFDMemoryTest.cpp
@@ -131,6 +131,26 @@ type(CS)\n\
     end\n\
 ";
 
+const char* gfx10_PollMemory =
+"\
+shader ReadMemory\n\
+wave_size(32)\n\
+type(CS)\n\
+/* Assume src address in s0, s1 and dst address in s2, s3*/\n\
+    s_movk_i32 s18, 0x5678\n\
+    v_mov_b32 v0, s2\n\
+    v_mov_b32 v1, s3\n\
+    v_mov_b32 v2, 0x5678\n\
+    LOOP:\n\
+    s_load_dword s16, s[0:1], 0x0 glc\n\
+    s_cmp_eq_i32 s16, s18\n\
+    s_cbranch_scc0   LOOP\n\
+    flat_store_dword v[0,1], v2 slc\n\
+    s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
+    s_endpgm\n\
+    end\n\
+";
+
 /* Input: A buffer of at least 3 dwords.
  * DW0: used as a signal. 0xcafe means it is signaled
  * DW1: Input buffer for device to read.
@@ -159,6 +179,32 @@ POLLSIGNAL:\n\
     end\n\
 ";
 
+const char* gfx10_CopyOnSignal =
+"\
+shader CopyOnSignal\n\
+wave_size(32)\n\
+type(CS)\n\
+/* Assume input buffer in s0, s1 */\n\
+    s_add_u32 s2, s0, 0x8\n\
+    s_addc_u32 s3, s1, 0x0\n\
+    s_mov_b32 s18, 0xcafe\n\
+    v_mov_b32 v0, s0\n\
+    v_mov_b32 v1, s1\n\
+    v_mov_b32 v4, s2\n\
+    v_mov_b32 v5, s3\n\
+POLLSIGNAL:\n\
+    s_load_dword s16, s[0:1], 0x0 glc\n\
+    s_cmp_eq_i32 s16, s18\n\
+    s_cbranch_scc0   POLLSIGNAL\n\
+    s_load_dword s17, s[0:1], 0x4 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    v_mov_b32 v2, s17\n\
+    flat_store_dword v[4,5], v2 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_endpgm\n\
+    end\n\
+";
+
 /* Input0: A buffer of at least 2 dwords.
  * DW0: used as a signal. Write 0xcafe to signal
  * DW1: Write to this buffer for other device to read.
@@ -180,6 +226,30 @@ type(CS)\n\
     end\n\
 ";
 
+const char* gfx10_WriteAndSignal =
+"\
+shader WriteAndSignal\n\
+wave_size(32)\n\
+type(CS)\n\
+/* Assume input buffer in s0, s1 */\n\
+    s_add_u32 s4, s0, 0x4\n\
+    s_addc_u32 s5, s1, 0x0\n\
+    v_mov_b32 v0, s0\n\
+    v_mov_b32 v1, s1\n\
+    v_mov_b32 v2, s2\n\
+    v_mov_b32 v3, s3\n\
+    v_mov_b32 v4, s4\n\
+    v_mov_b32 v5, s5\n\
+    v_mov_b32 v18, 0xbeef\n\
+    flat_store_dword v[4:5], v18 glc\n\
+    v_mov_b32 v18, 0x1\n\
+    flat_store_dword v[2:3], v18 glc\n\
+    v_mov_b32 v18, 0xcafe\n\
+    flat_store_dword v[0:1], v18 glc\n\
+    s_endpgm\n\
+    end\n\
+";
+
 //These gfx9_PullMemory, gfx9_CopyOnSignal, gfx9_WriteAndSignal shaders can be used by both gfx9 and gfx10
 
 void KFDMemoryTest::SetUp() {
@@ -313,7 +383,13 @@ TEST_F(KFDMemoryTest, MapUnmapToNodes) {
     HsaMemoryBuffer srcBuffer(PAGE_SIZE, defaultGPUNode);
     HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode);
 
-    m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer);
+    const char *pReadMemory;
+    if (m_FamilyId < FAMILY_NV)
+        pReadMemory = gfx9_PollMemory;
+    else
+        pReadMemory = gfx10_PollMemory;
+
+    m_pIsaGen->CompileShader(pReadMemory, "ReadMemory", isaBuffer);
 
     PM4Queue pm4Queue;
     ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
@@ -1023,6 +1099,7 @@ TEST_F(KFDMemoryTest, MMBench) {
         unsigned nBufs = TEST_NBUFS(testIndex);
         unsigned memType = TEST_MEMTYPE(testIndex);
         bool interleaveSDMA = TEST_SDMA(testIndex);
+        unsigned bufLimit;
         HSAuint64 allocTime, map1Time, unmap1Time, mapAllTime, unmapAllTime, freeTime;
         HSAuint32 allocNode;
 
@@ -1043,6 +1120,13 @@ TEST_F(KFDMemoryTest, MMBench) {
             memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
             memFlags.ui32.HostAccess = 0;
             memFlags.ui32.NonPaged = 1;
+            /* Upper limit of buffer number to fit 90% vram size */
+            bufLimit = ((vramSizeMB << 20) * 9 / 10) / bufSize ;
+            if (bufLimit == 0)
+                continue; // skip when bufSize > vram
+
+            /* When vram is too small to fit all the buffers, fill 90% vram size*/
+            nBufs = (nBufs < bufLimit) ? nBufs : bufLimit;
         }
 
         /* Allocation */
@@ -1872,7 +1956,12 @@ TEST_F(KFDMemoryTest, HostHdpFlush) {
     PM4Queue queue;
     ASSERT_SUCCESS(queue.Create(defaultGPUNode));
     HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
-    m_pIsaGen->CompileShader(gfx9_CopyOnSignal, "CopyOnSignal", isaBuffer);
+    const char *pCopyOnSignal;
+    if (m_FamilyId < FAMILY_NV)
+        pCopyOnSignal = gfx9_CopyOnSignal;
+    else
+        pCopyOnSignal = gfx10_CopyOnSignal;
+    m_pIsaGen->CompileShader(pCopyOnSignal, "CopyOnSignal", isaBuffer);
     Dispatch dispatch0(isaBuffer);
     dispatch0.SetArgs(buffer, NULL);
     dispatch0.Submit(queue);
@@ -1919,7 +2008,7 @@ TEST_F(KFDMemoryTest, DeviceHdpFlush) {
     const HsaNodeProperties *pNodeProperties;
     HSAuint32 *mmioBase = NULL;
     unsigned int *nullPtr = NULL;
-    std::vector<HSAuint32> nodes;
+    std::vector<int> nodes;
     int numPeers;
 
     const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
@@ -1933,8 +2022,7 @@ TEST_F(KFDMemoryTest, DeviceHdpFlush) {
         nodes.push_back(g_TestNodeId);
         nodes.push_back(g_TestDstNodeId);
 
-        if (!m_NodeInfo.IsGPUNodeLargeBar(g_TestNodeId) &&
-            !m_NodeInfo.AreGPUNodesXGMI(g_TestNodeId, g_TestDstNodeId)) {
+        if (!m_NodeInfo.IsPeerAccessibleByNode(g_TestDstNodeId, g_TestNodeId)) {
             LOG() << "Skipping test: first GPU specified is not peer-accessible." << std::endl;
             return;
         }
@@ -1945,7 +2033,7 @@ TEST_F(KFDMemoryTest, DeviceHdpFlush) {
         }
     } else {
         HSAint32 defaultGPU = m_NodeInfo.HsaDefaultGPUNode();
-        m_NodeInfo.FindAccessiblePeers(&nodes, defaultGPU, false);
+        m_NodeInfo.FindAccessiblePeers(&nodes, defaultGPU);
         if (nodes.size() < 2) {
             LOG() << "Skipping test: Test requires at least one large bar GPU." << std::endl;
             LOG() << "               or two GPUs are XGMI connected." << std::endl;
@@ -1989,7 +2077,12 @@ TEST_F(KFDMemoryTest, DeviceHdpFlush) {
     PM4Queue queue;
     ASSERT_SUCCESS(queue.Create(nodes[0]));
     HsaMemoryBuffer isaBuffer(PAGE_SIZE, nodes[0], true/*zero*/, false/*local*/, true/*exec*/);
-    m_pIsaGen->CompileShader(gfx9_CopyOnSignal, "CopyOnSignal", isaBuffer);
+    const char *pCopyOnSignal;
+    if (m_FamilyId < FAMILY_NV)
+        pCopyOnSignal = gfx9_CopyOnSignal;
+    else
+        pCopyOnSignal = gfx10_CopyOnSignal;
+    m_pIsaGen->CompileShader(pCopyOnSignal, "CopyOnSignal", isaBuffer);
     Dispatch dispatch(isaBuffer);
     dispatch.SetArgs(buffer, NULL);
     dispatch.Submit(queue);
@@ -1997,7 +2090,12 @@ TEST_F(KFDMemoryTest, DeviceHdpFlush) {
     PM4Queue queue0;
     ASSERT_SUCCESS(queue0.Create(nodes[1]));
     HsaMemoryBuffer isaBuffer0(PAGE_SIZE, nodes[1], true/*zero*/, false/*local*/, true/*exec*/);
-    m_pIsaGen->CompileShader(gfx9_WriteAndSignal, "WriteAndSignal", isaBuffer0);
+    const char *pWriteAndSignal;
+    if (m_FamilyId < FAMILY_NV)
+        pWriteAndSignal = gfx9_WriteAndSignal;
+    else
+        pWriteAndSignal = gfx10_WriteAndSignal;
+    m_pIsaGen->CompileShader(pWriteAndSignal, "WriteAndSignal", isaBuffer0);
     Dispatch dispatch0(isaBuffer0);
     dispatch0.SetArgs(buffer, mmioBase);
     dispatch0.Submit(queue0);
diff --git a/tests/kfdtest/src/KFDPerformanceTest.cpp b/tests/kfdtest/src/KFDPerformanceTest.cpp
index e80e4e7..32f342b 100644
--- a/tests/kfdtest/src/KFDPerformanceTest.cpp
+++ b/tests/kfdtest/src/KFDPerformanceTest.cpp
@@ -145,7 +145,7 @@ TEST_F(KFDPerformanceTest, P2PBandWidthTest) {
     }
 
     const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
-    std::vector<HSAuint32> nodes;
+    std::vector<int> nodes;
     const bool isSpecified = g_TestDstNodeId != -1 && g_TestNodeId != -1;
     int numPeers = 0;
 
@@ -153,14 +153,13 @@ TEST_F(KFDPerformanceTest, P2PBandWidthTest) {
         if (g_TestNodeId != g_TestDstNodeId) {
             nodes.push_back(g_TestNodeId);
             nodes.push_back(g_TestDstNodeId);
-            if ((m_NodeInfo.IsGPUNodeLargeBar(g_TestNodeId) &&
-                 m_NodeInfo.IsGPUNodeLargeBar(g_TestDstNodeId)) ||
-                m_NodeInfo.AreGPUNodesXGMI(g_TestNodeId, g_TestDstNodeId))
+            if ((m_NodeInfo.IsPeerAccessibleByNode(g_TestNodeId, g_TestDstNodeId) &&
+                 m_NodeInfo.IsPeerAccessibleByNode(g_TestDstNodeId, g_TestNodeId)))
                 numPeers = 2;
         }
     } else {
-        HSAint32 defaultGPU = m_NodeInfo.HsaDefaultGPUNode();
-        numPeers = m_NodeInfo.FindAccessiblePeers(&nodes, defaultGPU, true);
+        nodes = m_NodeInfo.GetNodesWithGPU();
+        numPeers = nodes.size();
     }
 
     if (numPeers < 2) {
@@ -168,7 +167,7 @@ TEST_F(KFDPerformanceTest, P2PBandWidthTest) {
         return;
     }
 
-    std::vector<HSAuint32> sysNodes(nodes); // include sysMem node 0...
+    std::vector<int> sysNodes(nodes); // include sysMem node 0...
     sysNodes.insert(sysNodes.begin(),0);
 
     const int total_tests = 7;
@@ -230,6 +229,9 @@ TEST_F(KFDPerformanceTest, P2PBandWidthTest) {
                 if (n1 == n2)
                     continue;
 
+                if (!m_NodeInfo.IsPeerAccessibleByNode(n2, n1))
+                    continue;
+
                 snprintf(str, sizeof(str), "[%d -> %d] ", n1, n2);
                 msg << str << std::endl;
                 testNodeToNodes(n1, &n2, 1, test_suits[s][0], test_suits[s][1], size, &speed, &speed2, &msg);
@@ -251,6 +253,10 @@ TEST_F(KFDPerformanceTest, P2PBandWidthTest) {
             for (unsigned j = i + 1; j < nodes.size(); j++) {
                 HSAuint32 n2 = nodes[j];
 
+                if (!m_NodeInfo.IsPeerAccessibleByNode(n2, n1) ||
+                    !m_NodeInfo.IsPeerAccessibleByNode(n1, n2))
+                    continue;
+
                 snprintf(str, sizeof(str), "[%d <-> %d] ", n1, n2);
                 msg << str << std::endl;
                 testNodeToNodes(n1, &n2, 1, test_suits[s][0], test_suits[s][1], size, &speed, &speed2, &msg);
@@ -265,8 +271,8 @@ TEST_F(KFDPerformanceTest, P2PBandWidthTest) {
         LOG() << test_suits_string[s] << std::endl;
         msg << test_suits_string[s] << std::endl;
         /* Just use GPU nodes to do copy.*/
-        std::vector<HSAuint32> &src = test_suits[s][0] != NONE ? nodes : sysNodes;
-        std::vector<HSAuint32> &dst = test_suits[s][1] != NONE ? nodes : sysNodes;
+        std::vector<int> &src = test_suits[s][0] != NONE ? nodes : sysNodes;
+        std::vector<int> &dst = test_suits[s][1] != NONE ? nodes : sysNodes;
 
         for (unsigned i = 0; i < src.size(); i++) {
             HSAuint32 n1 = src[i];
@@ -275,9 +281,18 @@ TEST_F(KFDPerformanceTest, P2PBandWidthTest) {
             int n = 0;
             char str[64];
 
-            for (unsigned j = 0; j < dst.size(); j++)
-                if (dst[j] != n1)
+            for (unsigned j = 0; j < dst.size(); j++) {
+                if (dst[j] != n1) {
+                    if (test_suits[s][0] != NONE &&
+                        !m_NodeInfo.IsPeerAccessibleByNode(dst[j], n1))
+                            continue;
+                    if (test_suits[s][1] != NONE &&
+                        !m_NodeInfo.IsPeerAccessibleByNode(n1, dst[j]))
+                            continue;
                     n2[n++] = dst[j];
+                }
+            }
+
             /* At least 2 dst GPUs.*/
             if (n < 2)
                 continue;
@@ -293,7 +308,6 @@ TEST_F(KFDPerformanceTest, P2PBandWidthTest) {
                                         (float)speed2 / 1024 << " GB/s" << std::endl;
         }
     }
-
 exit:
     /* New line.*/
     LOG() << std::endl << msg.str() << std::endl;
@@ -309,17 +323,17 @@ TEST_F(KFDPerformanceTest, P2POverheadTest) {
     }
 
     const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
-    std::vector<HSAuint32> nodes;
+    std::vector<int> nodes;
 
-    HSAint32 defaultGPU = m_NodeInfo.HsaDefaultGPUNode();
-    int numPeers = m_NodeInfo.FindAccessiblePeers(&nodes, defaultGPU, true);
+    nodes = m_NodeInfo.GetNodesWithGPU();
+    int numPeers = nodes.size();
 
     if (numPeers < 2) {
         LOG() << "Skipping test: Need at least two large bar GPU or XGMI connected." << std::endl;
         return;
     }
 
-    std::vector<HSAuint32> sysNodes(nodes); // include sysMem node 0...
+    std::vector<int> sysNodes(nodes); // include sysMem node 0...
     sysNodes.insert(sysNodes.begin(),0);
 
     /* size should be small.*/
@@ -352,6 +366,9 @@ TEST_F(KFDPerformanceTest, P2POverheadTest) {
                 HSAuint32 n2 = sysNodes[j];
                 std::stringstream msg;
 
+                if (n1 != n2 && !m_NodeInfo.IsPeerAccessibleByNode(n2, n1))
+                    continue;
+
                 msg << test_suits_string[s] << "[" << n1 << " -> " << n2 << "]";
                 for (auto &size : sizeArray) {
                     testNodeToNodes(n1, &n2, 1, test_suits[s], NONE, size, 0, 0, 0, 1, &time);
diff --git a/tests/kfdtest/src/KFDQMTest.cpp b/tests/kfdtest/src/KFDQMTest.cpp
index eab1b54..ae561fc 100644
--- a/tests/kfdtest/src/KFDQMTest.cpp
+++ b/tests/kfdtest/src/KFDQMTest.cpp
@@ -1625,7 +1625,7 @@ TEST_F(KFDQMTest, P2PTest) {
         LOG() << "Skipping test: At least two GPUs are required." << std::endl;
         return;
     }
-    std::vector<HSAuint32> nodes;
+    std::vector<int> nodes;
 
     /* This test simulates RT team's P2P part in IPCtest:
      *
@@ -1647,8 +1647,7 @@ TEST_F(KFDQMTest, P2PTest) {
         nodes.push_back(g_TestNodeId);
         nodes.push_back(g_TestDstNodeId);
 
-        if (!m_NodeInfo.IsGPUNodeLargeBar(g_TestDstNodeId) &&
-            !m_NodeInfo.AreGPUNodesXGMI(g_TestNodeId, g_TestDstNodeId)) {
+        if (!m_NodeInfo.IsPeerAccessibleByNode(g_TestNodeId, g_TestDstNodeId)) {
             LOG() << "Skipping test: Dst GPU specified is not peer-accessible." << std::endl;
             return;
         }
@@ -1657,8 +1656,7 @@ TEST_F(KFDQMTest, P2PTest) {
             return;
         }
     } else {
-        HSAint32 defaultGPU = m_NodeInfo.HsaDefaultGPUNode();
-        m_NodeInfo.FindAccessiblePeers(&nodes, defaultGPU, true);
+        nodes = m_NodeInfo.GetNodesWithGPU();
         if (nodes.size() < 2) {
             LOG() << "Skipping test: Test requires at least one large bar GPU." << std::endl;
             LOG() << "               or two GPUs are XGMI connected." << std::endl;
@@ -1685,7 +1683,7 @@ TEST_F(KFDQMTest, P2PTest) {
     EXPECT_SUCCESS(hsaKmtAllocMemory(0, size, memFlags,
                                      reinterpret_cast<void **>(&sysBuf)));
     EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(sysBuf, size, NULL,
-                                             mapFlags, nodes.size(), &nodes[0]));
+                                             mapFlags, nodes.size(), (HSAuint32 *)&nodes[0]));
 #define MAGIC_NUM 0xdeadbeaf
 
     /* First GPU fills mem with MAGIC_NUM */
@@ -1707,6 +1705,11 @@ TEST_F(KFDQMTest, P2PTest) {
         } else {
             n = 2;
             next = nodes[i];
+
+            /* Skip this hop unless the cur node can access the next node */
+            if (!m_NodeInfo.IsPeerAccessibleByNode(next, cur))
+                continue;
+
             ASSERT_SUCCESS(hsaKmtAllocMemory(next, size, memFlags, reinterpret_cast<void**>(&dst)));
             ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(dst, size, NULL));
         }
diff --git a/tests/kfdtest/src/KFDRASTest.cpp b/tests/kfdtest/src/KFDRASTest.cpp
index 428db0e..69a4719 100644
--- a/tests/kfdtest/src/KFDRASTest.cpp
+++ b/tests/kfdtest/src/KFDRASTest.cpp
@@ -58,8 +58,8 @@ void KFDRASTest::SetUp() {
                 AMDGPU_INFO_RAS_ENABLED_FEATURES,
                 sizeof(uint32_t), &rasFeatures);
     if (!(rasFeatures &
-            (AMDGPU_INFO_RAS_ENABLED_SDMA ||
-             AMDGPU_INFO_RAS_ENABLED_UMC ||
+            (AMDGPU_INFO_RAS_ENABLED_SDMA |
+             AMDGPU_INFO_RAS_ENABLED_UMC |
              AMDGPU_INFO_RAS_ENABLED_GFX))) {
         LOG() << "Skipping test: GPU doesn't support RAS features!" << std::endl;
         throw;
diff --git a/tests/kfdtest/src/KFDTestMain.cpp b/tests/kfdtest/src/KFDTestMain.cpp
index a8837f9..2b33ac1 100644
--- a/tests/kfdtest/src/KFDTestMain.cpp
+++ b/tests/kfdtest/src/KFDTestMain.cpp
@@ -27,7 +27,7 @@
 #include "GoogleTestExtension.hpp"
 #include "OSWrapper.hpp"
 
-#define KFD_TEST_DEFAULT_TIMEOUT 2000
+#define KFD_TEST_DEFAULT_TIMEOUT 10000
 
 std::ostream& operator << (std::ostream& out, TESTPROFILE profile) {
     switch (profile) {
diff --git a/tests/kfdtest/src/KFDTestUtil.cpp b/tests/kfdtest/src/KFDTestUtil.cpp
index b55cd80..1e1b85a 100644
--- a/tests/kfdtest/src/KFDTestUtil.cpp
+++ b/tests/kfdtest/src/KFDTestUtil.cpp
@@ -616,6 +616,23 @@ const bool HsaNodeInfo::IsGPUNodeLargeBar(int node) const {
     return false;
 }
 
+const bool HsaNodeInfo::IsPeerAccessibleByNode(int peer, int node) const {
+    /* True iff one of the IO links reported for `node` has NodeTo == `peer`. */
+    const HsaNodeProperties *pNodeProperties = GetNodeProperties(node);
+    if (!pNodeProperties || pNodeProperties->NumIOLinks == 0)
+        return false;
+    /* std::vector rather than a variable-length array, which is not standard C++. */
+    std::vector<HsaIoLinkProperties> links(pNodeProperties->NumIOLinks);
+    const HSAKMT_STATUS status = hsaKmtGetNodeIoLinkProperties(node,
+                                     pNodeProperties->NumIOLinks, links.data());
+    EXPECT_SUCCESS(status);
+    /* If the query failed the entries were never filled in, so skip the scan. */
+    for (size_t i = 0; status == HSAKMT_STATUS_SUCCESS && i < links.size(); i++)
+        if (links[i].NodeTo == static_cast<HSAuint32>(peer))
+            return true;
+    return false;
+}
+
 const int HsaNodeInfo::FindLargeBarGPUNode() const {
     const std::vector<int> gpuNodes = GetNodesWithGPU();
 
@@ -637,27 +654,16 @@ const bool HsaNodeInfo::AreGPUNodesXGMI(int node0, int node1) const {
     return false;
 }
 
-int HsaNodeInfo::FindAccessiblePeers(std::vector<HSAuint32> *peers, HSAuint32 dstNode,
-        bool bidirectional) const {
-    peers->push_back(dstNode);
-    if (IsGPUNodeLargeBar(dstNode)) {
-        for (unsigned i = 0; i < m_NodesWithGPU.size(); i++) {
-            if (m_NodesWithGPU.at(i) == dstNode)
-                continue;
+int HsaNodeInfo::FindAccessiblePeers(std::vector<int> *peers,
+		                             HSAuint32 node) const {
+    peers->push_back(node);
 
-            if (!bidirectional || IsGPUNodeLargeBar(m_NodesWithGPU.at(i)) ||
-                AreGPUNodesXGMI(dstNode, m_NodesWithGPU.at(i)))
-                peers->push_back(m_NodesWithGPU.at(i));
-        }
-    } else {
-        for (unsigned i = 0; i < m_NodesWithGPU.size(); i++) {
-            if (m_NodesWithGPU.at(i) == dstNode)
-                continue;
+    for (unsigned i = 0; i < m_NodesWithGPU.size(); i++) {
+        if (m_NodesWithGPU.at(i) == node)
+            continue;
 
-            if (AreGPUNodesXGMI(dstNode, m_NodesWithGPU.at(i)))
-                peers->push_back(m_NodesWithGPU.at(i));
-        }
+        if (IsPeerAccessibleByNode(m_NodesWithGPU.at(i), node))
+            peers->push_back(m_NodesWithGPU.at(i));
     }
-
     return peers->size();
 }
diff --git a/tests/kfdtest/src/KFDTestUtil.hpp b/tests/kfdtest/src/KFDTestUtil.hpp
index 28847f3..e55ca95 100644
--- a/tests/kfdtest/src/KFDTestUtil.hpp
+++ b/tests/kfdtest/src/KFDTestUtil.hpp
@@ -190,12 +190,13 @@ class HsaNodeInfo {
 
     void PrintNodeInfo() const;
     const bool IsGPUNodeLargeBar(int node) const;
+    const bool IsPeerAccessibleByNode(int peer, int node) const;
     // @brief Find the first available Large-BAR GPU node
     // @return: Node ID if successful or -1
     const int FindLargeBarGPUNode() const;
     const bool AreGPUNodesXGMI(int node0, int node1) const;
-    int FindAccessiblePeers(std::vector<HSAuint32> *peers, HSAuint32 dstNode,
-            bool bidirectional) const;
+    int FindAccessiblePeers(std::vector<int> *peers,
+                                        HSAuint32 node) const;
 };
 
 #endif  // __KFD__TEST__UTIL__H__
diff --git a/tests/kfdtest/src/SDMAPacket.cpp b/tests/kfdtest/src/SDMAPacket.cpp
index ee12e35..f5a8a57 100644
--- a/tests/kfdtest/src/SDMAPacket.cpp
+++ b/tests/kfdtest/src/SDMAPacket.cpp
@@ -189,7 +189,7 @@ void SDMAFencePacket::InitPacketNV(void * destAddr,unsigned int data) {
      * System = 1 because the memory is system memory
      * mtype = uncached, for the purpose of CPU coherent, L2 policy doesn't matter in this case
      */
-    packetData.HEADER_UNION.DW_0_DATA = (0 << 23) | (1 << 22) | (1 << 20) | (3 << 15) | SDMA_OP_FENCE;
+    packetData.HEADER_UNION.DW_0_DATA = (0 << 23) | (1 << 22) | (1 << 20) | (3 << 16) | SDMA_OP_FENCE;
 
     SplitU64(reinterpret_cast<unsigned long long>(destAddr),
              packetData.ADDR_LO_UNION.DW_1_DATA, /*dst_addr_31_0*/