Codebase list roct-thunk-interface / bc09fe7
Update upstream source from tag 'upstream/5.4.0' Update to upstream version '5.4.0' with Debian dir c57fc150e425eabf98034483f55a0572eda6bb42 Étienne Mollier 1 year, 5 months ago
73 changed file(s) with 2619 addition(s) and 3078 deletion(s). Raw diff Collapse all Expand all
142142 target_include_directories( ${HSAKMT_TARGET}
143143 PUBLIC
144144 $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
145 $<INSTALL_INTERFACE:include>
145 $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
146146 PRIVATE
147147 ${CMAKE_CURRENT_SOURCE_DIR}/src )
148148
159159 find_package(PkgConfig)
160160 # Check for libraries required for building
161161 find_library(LIBC NAMES libc.so.6 REQUIRED)
162 find_library(NUMA NAMES libnuma.so REQUIRED)
162 find_library(NUMA NAMES numa REQUIRED)
163163 message(STATUS "LIBC:" ${LIBC})
164164 message(STATUS "NUMA:" ${NUMA})
165165
182182 include_directories(${DRM_INCLUDE_DIRS})
183183
184184 target_link_libraries ( ${HSAKMT_TARGET}
185 PRIVATE ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread rt ${LIBC} ${NUMA}
185 PRIVATE ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread rt ${LIBC} numa
186186 )
187187
188188 target_compile_options(${HSAKMT_TARGET} PRIVATE ${DRM_CFLAGS} ${HSAKMT_C_FLAGS})
190190 find_library(LIBGCC NAMES libgcc_s.so.1 REQUIRED)
191191 message(STATUS "LIBGCC:" ${LIBGCC})
192192 target_link_libraries( ${HSAKMT_TARGET} PRIVATE ${LIBGCC} )
193 else()
194 find_library(UDEV NAMES libudev.so libudev.a REQUIRED)
195 message(STATUS "UDEV:" ${UDEV})
196 find_package(ZLIB REQUIRED)
197 target_link_libraries( ${HSAKMT_TARGET} PRIVATE ${ZLIB} ${UDEV} )
198193 endif()
199194
200195 ## Define default paths and packages.
213208 #install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT devel )
214209
215210 # Install public headers
216 install ( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
211 install ( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${HSAKMT_TARGET}
217212 COMPONENT devel PATTERN "linux" EXCLUDE )
213
214 # Option to build header path migration helpers.
215 option(INCLUDE_PATH_COMPATIBILITY "Generate backward compatible headers and include paths. Use of these headers will warn when included." ON)
216 if(INCLUDE_PATH_COMPATIBILITY)
217 include(hsakmt-backward-compat.cmake)
218 endif()
218219
219220 # Record our usage data for clients find_package calls.
220221 install ( EXPORT ${HSAKMT_TARGET}Targets
288289 set ( ENABLE_LDCONFIG ON CACHE BOOL "Set library links and caches using ldconfig.")
289290
290291 # Install License file
291 install ( FILES ${CPACK_RESOURCE_FILE_LICENSE} DESTINATION ${CMAKE_INSTALL_DOCDIR}/${CPACK_PACKAGE_NAME} COMPONENT devel)
292 install ( FILES ${CPACK_RESOURCE_FILE_LICENSE} DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT devel)
292293
293294 # Make proper version for appending
294295 # Default Value is 99999, setting it first
44 # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build
55 do_ldconfig() {
66 if [ "@ENABLE_LDCONFIG@" == "ON" ]; then
7 echo @CPACK_PACKAGING_INSTALL_PREFIX@/lib > /etc/ld.so.conf.d/x86_64-libhsakmt.conf
7 echo @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@ > /@CMAKE_INSTALL_SYSCONFDIR@/ld.so.conf.d/x86_64-libhsakmt.conf
88 ldconfig
99 fi
1010 }
44 # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build
55 rm_ldconfig() {
66 if [ "@ENABLE_LDCONFIG@" == "ON" ]; then
7 rm -f /etc/ld.so.conf.d/x86_64-libhsakmt.conf && ldconfig
7 rm -f /@CMAKE_INSTALL_SYSCONFDIR@/ld.so.conf.d/x86_64-libhsakmt.conf && ldconfig
88 fi
99 }
1010
00 # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build
11 if [ "@ENABLE_LDCONFIG@" == "ON" ]; then
2 echo -e "@CPACK_PACKAGING_INSTALL_PREFIX@/lib\n@CPACK_PACKAGING_INSTALL_PREFIX@/lib64" > /etc/ld.so.conf.d/x86_64-libhsakmt.conf
2 echo -e "@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@" > /@CMAKE_INSTALL_SYSCONFDIR@/ld.so.conf.d/x86_64-libhsakmt.conf
33 ldconfig
44 fi
00 # second term originates from ENABLE_LDCONFIG = ON/OFF at package build
11 if [ $1 -le 1 ] && [ "@ENABLE_LDCONFIG@" == "ON" ]; then
22 # perform the below actions for rpm remove($1=0) or upgrade($1=1) operations
3 rm -f /etc/ld.so.conf.d/x86_64-libhsakmt.conf
3 rm -f /@CMAKE_INSTALL_SYSCONFDIR@/ld.so.conf.d/x86_64-libhsakmt.conf
44 ldconfig
55 fi
0 # Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
1 # Permission is hereby granted, free of charge, to any person obtaining a copy
2 # of this software and associated documentation files (the "Software"), to deal
3 # in the Software without restriction, including without limitation the rights
4 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
5 # copies of the Software, and to permit persons to whom the Software is
6 # furnished to do so, subject to the following conditions:
7 #
8 # The above copyright notice and this permission notice shall be included in
9 # all copies or substantial portions of the Software.
10 #
11 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
17 # THE SOFTWARE.
18
19 set(HSAKMT_WRAPPER_DIR ${CMAKE_CURRENT_BINARY_DIR}/wrapper_dir)
20 set(HSAKMT_WRAPPER_INC_DIR ${HSAKMT_WRAPPER_DIR}/include)
21 #Function to generate header template file
22 function(create_header_template)
23 file(WRITE ${HSAKMT_WRAPPER_DIR}/header.hpp.in "/*
24 Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
25
26 Permission is hereby granted, free of charge, to any person obtaining a copy
27 of this software and associated documentation files (the \"Software\"), to deal
28 in the Software without restriction, including without limitation the rights
29 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
30 copies of the Software, and to permit persons to whom the Software is
31 furnished to do so, subject to the following conditions:
32
33 The above copyright notice and this permission notice shall be included in
34 all copies or substantial portions of the Software.
35
36 THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
37 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
38 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
39 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
40 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
41 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
42 THE SOFTWARE.
43 */\n\n#ifndef @include_guard@\n#define @include_guard@ \n\n#pragma message(\"@file_name@ has moved to @CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_INCLUDEDIR@/hsakmt and package include paths have changed.\\nInclude as \\\"hsakmt/@file_name@\\\" when using cmake packages.\")\n@include_statements@\n\n#endif")
44 endfunction()
45
46 #use header template file and generate wrapper header files
47 function(generate_wrapper_header)
48 file(MAKE_DIRECTORY ${HSAKMT_WRAPPER_INC_DIR})
49 #find all header files from include folder
50 file(GLOB include_files ${CMAKE_CURRENT_SOURCE_DIR}/include/*.h)
51 #generate wrapper header files
52 foreach(header_file ${include_files})
53 # set include guard
54 get_filename_component(INC_GUARD_NAME ${header_file} NAME_WE)
55 string(TOUPPER ${INC_GUARD_NAME} INC_GUARD_NAME)
56 set(include_guard "${include_guard}HSAKMT_WRAPPER_INCLUDE_${INC_GUARD_NAME}_H")
57 # set include statements
58 get_filename_component(file_name ${header_file} NAME)
59 set(include_statements "${include_statements}#include \"hsakmt/${file_name}\"\n")
60 configure_file(${HSAKMT_WRAPPER_DIR}/header.hpp.in ${HSAKMT_WRAPPER_INC_DIR}/${file_name})
61 unset(include_guard)
62 unset(include_statements)
63 endforeach()
64 endfunction()
65
66 #Create a template for header file
67 create_header_template()
68 #Use template header file and generate wrapper header files
69 generate_wrapper_header()
70 install(DIRECTORY ${HSAKMT_WRAPPER_INC_DIR} DESTINATION . COMPONENT devel PATTERN "linux" EXCLUDE)
374374 );
375375
376376 /**
377 Inquires memory available for allocation as a memory buffer
378 */
379
380 HSAKMT_STATUS
381 HSAKMTAPI
382 hsaKmtAvailableMemory(
383 HSAuint32 Node,
384 HSAuint64 *AvailableBytes
385 );
386
387 /**
377388 Registers with KFD a memory buffer that may be accessed by the GPU
378389 */
379390
865876 HSAint32 * enable // OUT: returns XNACK value.
866877 );
867878
879 /**
880 Open anonymous file handle to enable events and read SMI events.
881
882 To enable events, write 64bit events mask to fd, event enums as bit index.
883 for example, event mask (HSA_SMI_EVENT_MASK_FROM_INDEX(HSA_SMI_EVENT_INDEX_MAX) - 1) to enable all events
884
885 Read event from fd is not blocking, use poll with timeout value to check if event is available.
886 Event is dropped if kernel event fifo is full.
887 */
888 HSAKMT_STATUS
889 HSAKMTAPI
890 hsaKmtOpenSMI(
891 HSAuint32 NodeId, // IN: GPU node_id to receive the SMI event from
892 int *fd // OUT: anonymous file handle
893 );
894
868895 #ifdef __cplusplus
869896 } //extern "C"
870897 #endif
327327
328328 HSAuint32 VGPRSizePerCU; // VGPR size in bytes per CU
329329 HSAuint32 SGPRSizePerCU; // SGPR size in bytes per CU
330 HSAuint8 Reserved[12];
330
331 HSAuint32 KFDGpuID; // GPU Hash ID generated by KFD
332
333 HSAuint32 FamilyID; // GPU family id
334 HSAuint8 Reserved[4];
331335 } HsaNodeProperties;
332336
333337
13281332 HSA_SVM_FLAG_GPU_RO = 0x00000008, // GPUs only read, allows replication
13291333 HSA_SVM_FLAG_GPU_EXEC = 0x00000010, // Allow execution on GPU
13301334 HSA_SVM_FLAG_GPU_READ_MOSTLY = 0x00000020, // GPUs mostly read, may allow similar optimizations as RO, but writes fault
1335 HSA_SVM_FLAG_GPU_ALWAYS_MAPPED = 0x00000040, // Keep GPU memory mapping always valid as if XNACK is disabled
13311336 } HSA_SVM_FLAGS;
13321337
13331338 typedef enum _HSA_SVM_ATTR_TYPE {
13511356 HSAuint32 value; // attribute value
13521357 } HSA_SVM_ATTRIBUTE;
13531358
1359 typedef enum _HSA_SMI_EVENT {
1360 HSA_SMI_EVENT_NONE = 0, /* not used */
1361 HSA_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */
1362 HSA_SMI_EVENT_THERMAL_THROTTLE = 2,
1363 HSA_SMI_EVENT_GPU_PRE_RESET = 3,
1364 HSA_SMI_EVENT_GPU_POST_RESET = 4,
1365 HSA_SMI_EVENT_MIGRATE_START = 5,
1366 HSA_SMI_EVENT_MIGRATE_END = 6,
1367 HSA_SMI_EVENT_PAGE_FAULT_START = 7,
1368 HSA_SMI_EVENT_PAGE_FAULT_END = 8,
1369 HSA_SMI_EVENT_QUEUE_EVICTION = 9,
1370 HSA_SMI_EVENT_QUEUE_RESTORE = 10,
1371 HSA_SMI_EVENT_UNMAP_FROM_GPU = 11,
1372 HSA_SMI_EVENT_INDEX_MAX = 12,
1373
1374 /*
1375 * max event number, as a flag bit to get events from all processes,
1376 * this requires super user permission, otherwise will not be able to
1377 * receive event from any process. Without this flag to receive events
1378 * from same process.
1379 */
1380 HSA_SMI_EVENT_ALL_PROCESS = 64
1381 } HSA_EVENT_TYPE;
1382
1383 typedef enum _HSA_MIGRATE_TRIGGERS {
1384 HSA_MIGRATE_TRIGGER_PREFETCH,
1385 HSA_MIGRATE_TRIGGER_PAGEFAULT_GPU,
1386 HSA_MIGRATE_TRIGGER_PAGEFAULT_CPU,
1387 HSA_MIGRATE_TRIGGER_TTM_EVICTION
1388 } HSA_MIGRATE_TRIGGERS;
1389
1390 typedef enum _HSA_QUEUE_EVICTION_TRIGGERS {
1391 HSA_QUEUE_EVICTION_TRIGGER_SVM,
1392 HSA_QUEUE_EVICTION_TRIGGER_USERPTR,
1393 HSA_QUEUE_EVICTION_TRIGGER_TTM,
1394 HSA_QUEUE_EVICTION_TRIGGER_SUSPEND,
1395 HSA_QUEUE_EVICTION_CRIU_CHECKPOINT,
1396 HSA_QUEUE_EVICTION_CRIU_RESTORE
1397 } HSA_QUEUE_EVICTION_TRIGGERS;
1398
1399 typedef enum _HSA_SVM_UNMAP_TRIGGERS {
1400 HSA_SVM_UNMAP_TRIGGER_MMU_NOTIFY,
1401 HSA_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE,
1402 HSA_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU
1403 } HSA_SVM_UNMAP_TRIGGERS;
1404
1405 #define HSA_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
1406 #define HSA_SMI_EVENT_MSG_SIZE 96
1407
13541408 #pragma pack(pop, hsakmttypes_h)
13551409
13561410
3333 * - 1.6 - Query clear flags in SVM get_attr API
3434 * - 1.7 - Checkpoint Restore (CRIU) API
3535 * - 1.8 - CRIU - Support for SDMA transfers with GTT BOs
36 * - 1.9 - Add available_memory ioctl
37 * - 1.10 - Add SMI profiler event log
38 * - 1.11 - Add unified memory for ctx save/restore area
3639 */
3740 #define KFD_IOCTL_MAJOR_VERSION 1
38 #define KFD_IOCTL_MINOR_VERSION 8
41 #define KFD_IOCTL_MINOR_VERSION 11
3942
4043 /*
4144 * Debug revision change log
768771 __u64 handle; /* to KFD */
769772 };
770773
774 /* Inquire available memory with kfd_ioctl_get_available_memory
775 *
776 * @available: memory available for alloc
777 */
778 struct kfd_ioctl_get_available_memory_args {
779 __u64 available; /* from KFD */
780 __u32 gpu_id; /* to KFD */
781 __u32 pad;
782 };
783
771784 /* Map memory to one or more GPUs
772785 *
773786 * @handle: memory handle returned by alloc
10681081 #define KFD_IOCTL_SVM_FLAG_GPU_EXEC 0x00000010
10691082 /* GPUs mostly read, may allow similar optimizations as RO, but writes fault */
10701083 #define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY 0x00000020
1084 /* Keep GPU memory mapping always valid as if XNACK is disabled */
1085 #define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED 0x00000040
10711086
10721087 /**
10731088 * kfd_ioctl_svm_op - SVM ioctl operations
13261341 #define AMDKFD_IOC_CRIU_OP \
13271342 AMDKFD_IOWR(0x22, struct kfd_ioctl_criu_args)
13281343
1344 #define AMDKFD_IOC_AVAILABLE_MEMORY \
1345 AMDKFD_IOWR(0x23, struct kfd_ioctl_get_available_memory_args)
1346
13291347 #define AMDKFD_COMMAND_START 0x01
1330 #define AMDKFD_COMMAND_END 0x23
1348 #define AMDKFD_COMMAND_END 0x24
13311349
13321350 /* non-upstream ioctls */
13331351 #define AMDKFD_IOC_IPC_IMPORT_HANDLE \
296296 }
297297
298298 #define HSA_RUNTIME_ENABLE_MIN_MAJOR 10
299 #define HSA_RUNTIME_ENABLE_MAX_MAJOR 13
299300 #define HSA_RUNTIME_ENABLE_MIN_MINOR 0
301
302 static HSAKMT_STATUS checkRuntimeDebugSupport(void) {
303 HSAuint32 kMajor, kMinor;
304 HsaNodeProperties node = {0};
305 HsaSystemProperties props = {0};
306
307 memset(&node, 0x00, sizeof(node));
308 memset(&props, 0x00, sizeof(props));
309 if (hsaKmtAcquireSystemProperties(&props))
310 return HSAKMT_STATUS_ERROR;
311
312 //the firmware of gpu node doesn't support the debugger, disable it.
313 for (uint32_t i = 0; i < props.NumNodes; i++) {
314 if (hsaKmtGetNodeProperties(i, &node))
315 return HSAKMT_STATUS_ERROR;
316
317 //ignore cpu node
318 if (node.NumCPUCores)
319 continue;
320 if (!node.Capability.ui32.DebugSupportedFirmware)
321 return HSAKMT_STATUS_NOT_SUPPORTED;
322 }
323
324 if (hsaKmtGetKernelDebugTrapVersionInfo(&kMajor, &kMinor))
325 return HSAKMT_STATUS_NOT_SUPPORTED;
326
327 if (kMajor < HSA_RUNTIME_ENABLE_MIN_MAJOR || kMajor > HSA_RUNTIME_ENABLE_MAX_MAJOR ||
328 (kMajor == HSA_RUNTIME_ENABLE_MIN_MAJOR &&
329 (int)kMinor < HSA_RUNTIME_ENABLE_MIN_MINOR))
330 return HSAKMT_STATUS_NOT_SUPPORTED;
331
332 return HSAKMT_STATUS_SUCCESS;
333 }
334
300335 HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug,
301336 bool setupTtmp)
302337 {
303338 struct kfd_ioctl_dbg_trap_args args = {0};
304 HSAuint32 kMajor, kMinor;
305 HSAKMT_STATUS result;
306
307 result = hsaKmtGetKernelDebugTrapVersionInfo(&kMajor, &kMinor);
339 HSAKMT_STATUS result = checkRuntimeDebugSupport();
308340
309341 if (result)
310 return HSAKMT_STATUS_NOT_SUPPORTED;
311
312 if (kMajor != HSA_RUNTIME_ENABLE_MIN_MAJOR ||
313 (int)kMinor < HSA_RUNTIME_ENABLE_MIN_MINOR)
314 return HSAKMT_STATUS_NOT_SUPPORTED;
342 return result;
315343
316344 memset(&args, 0x00, sizeof(args));
317345 args.op = KFD_IOC_DBG_TRAP_RUNTIME_ENABLE;
335363 HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void)
336364 {
337365 struct kfd_ioctl_dbg_trap_args args = {0};
338 HSAuint32 kMajor, kMinor;
339 HSAKMT_STATUS result;
340
341 result = hsaKmtGetKernelDebugTrapVersionInfo(&kMajor, &kMinor);
366 HSAKMT_STATUS result = checkRuntimeDebugSupport();
342367
343368 if (result)
344 return HSAKMT_STATUS_NOT_SUPPORTED;
345
346 if (kMajor != HSA_RUNTIME_ENABLE_MIN_MAJOR ||
347 (int)kMinor < HSA_RUNTIME_ENABLE_MIN_MINOR)
348 return HSAKMT_STATUS_NOT_SUPPORTED;
369 return result;
349370
350371 memset(&args, 0x00, sizeof(args));
351372 args.op = KFD_IOC_DBG_TRAP_RUNTIME_ENABLE;
338338
339339 return result;
340340 }
341
342 HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd)
343 {
344 struct kfd_ioctl_smi_events_args args;
345 HSAKMT_STATUS result;
346 uint32_t gpuid;
347
348 CHECK_KFD_OPEN();
349
350 pr_debug("[%s] node %d\n", __func__, NodeId);
351
352 result = validate_nodeid(NodeId, &gpuid);
353 if (result != HSAKMT_STATUS_SUCCESS) {
354 pr_err("[%s] invalid node ID: %d\n", __func__, NodeId);
355 return result;
356 }
357
358 args.gpuid = gpuid;
359 result = kmtIoctl(kfd_fd, AMDKFD_IOC_SMI_EVENTS, &args);
360 if (result) {
361 pr_debug("open SMI event fd failed %s\n", strerror(errno));
362 return HSAKMT_STATUS_ERROR;
363 }
364
365 *fd = args.anon_fd;
366 return HSAKMT_STATUS_SUCCESS;
367 }
3535 #include <sys/mman.h>
3636 #include <sys/time.h>
3737 #include <errno.h>
38 #include <assert.h>
3839
3940 #include <numa.h>
4041 #include <numaif.h>
183184 */
184185 manageable_aperture_t gpuvm_aperture; /* used for GPUVM on APU, outside the canonical address range */
185186 int drm_render_fd;
187 uint32_t usable_peer_id_num;
188 uint32_t *usable_peer_id_array;
186189 } gpu_mem_t;
187190
188191 enum svm_aperture_type {
702705 return start;
703706 }
704707
708 void *mmap_allocate_aligned(int prot, int flags, uint64_t size, uint64_t align,
709 uint64_t guard_size, void *aper_base, void *aper_limit)
710 {
711 void *addr, *aligned_addr, *aligned_end, *mapping_end;
712 uint64_t aligned_padded_size;
713
714 aligned_padded_size = size + guard_size * 2 + (align - PAGE_SIZE);
715
716 /* Map memory PROT_NONE to alloc address space only */
717 addr = mmap(0, aligned_padded_size, PROT_NONE, flags, -1, 0);
718 if (addr == MAP_FAILED) {
719 pr_err("mmap failed: %s\n", strerror(errno));
720 return NULL;
721 }
722
723 /* Adjust for alignment and guard pages */
724 aligned_addr = (void *)ALIGN_UP((uint64_t)addr + guard_size, align);
725 if (aligned_addr < aper_base ||
726 VOID_PTR_ADD(aligned_addr, size - 1) > aper_limit) {
727 pr_err("mmap returned %p, out of range %p-%p\n", aligned_addr,
728 aper_base, aper_limit);
729 munmap(addr, aligned_padded_size);
730 return NULL;
731 }
732
733 /* Unmap padding and guard pages */
734 if (aligned_addr > addr)
735 munmap(addr, VOID_PTRS_SUB(aligned_addr, addr));
736
737 aligned_end = VOID_PTR_ADD(aligned_addr, size);
738 mapping_end = VOID_PTR_ADD(addr, aligned_padded_size);
739 if (mapping_end > aligned_end)
740 munmap(aligned_end, VOID_PTRS_SUB(mapping_end, aligned_end));
741
742 if (prot == PROT_NONE)
743 return aligned_addr;
744
745 /* MAP_FIXED to the aligned address with required prot */
746 addr = mmap(aligned_addr, size, prot, flags | MAP_FIXED, -1, 0);
747 if (addr == MAP_FAILED) {
748 pr_err("mmap failed: %s\n", strerror(errno));
749 return NULL;
750 }
751
752 return addr;
753 }
754
705755 static void *mmap_aperture_allocate_aligned(manageable_aperture_t *aper,
706756 void *address,
707757 uint64_t size, uint64_t align)
708758 {
709 uint64_t aligned_padded_size, guard_size;
710759 uint64_t alignment_size = PAGE_SIZE << svm.alignment_order;
711 void *addr, *aligned_addr, *aligned_end, *mapping_end;
760 uint64_t guard_size;
712761
713762 if (address)
714763 return NULL;
732781 * pages on both sides
733782 */
734783 guard_size = (uint64_t)aper->guard_pages * PAGE_SIZE;
735 aligned_padded_size = size + align +
736 2*guard_size - PAGE_SIZE;
737
738 /* Map memory */
739 addr = mmap(0, aligned_padded_size, PROT_NONE,
740 MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
741 if (addr == MAP_FAILED) {
742 pr_err("mmap failed: %s\n", strerror(errno));
743 return NULL;
744 }
745
746 /* Adjust for alignment and guard pages, range-check the result */
747 aligned_addr = (void *)ALIGN_UP((uint64_t)addr + guard_size, align);
748 if (aligned_addr < aper->base ||
749 VOID_PTR_ADD(aligned_addr, size - 1) > aper->limit) {
750 pr_err("mmap returned %p, out of range %p-%p\n", aligned_addr,
751 aper->base, aper->limit);
752 munmap(addr, aligned_padded_size);
753 return NULL;
754 }
755
756 /* Unmap padding and guard pages */
757 if (aligned_addr > addr)
758 munmap(addr, VOID_PTRS_SUB(aligned_addr, addr));
759
760 aligned_end = VOID_PTR_ADD(aligned_addr, size);
761 mapping_end = VOID_PTR_ADD(addr, aligned_padded_size);
762 if (mapping_end > aligned_end)
763 munmap(aligned_end, VOID_PTRS_SUB(mapping_end, aligned_end));
764
765 return aligned_addr;
784
785 return mmap_allocate_aligned(PROT_NONE, MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE,
786 size, align, guard_size, aper->base, aper->limit);
766787 }
767788
768789 static void mmap_aperture_release(manageable_aperture_t *aper,
826847
827848 for (i = 0 ; i < gpu_mem_count ; i++)
828849 if (gpu_mem[i].gpu_id == gpu_id)
850 return i;
851
852 return -1;
853 }
854
855 static int32_t gpu_mem_find_by_node_id(uint32_t node_id)
856 {
857 uint32_t i;
858
859 for (i = 0 ; i < gpu_mem_count ; i++)
860 if (gpu_mem[i].node_id == node_id)
829861 return i;
830862
831863 return -1;
12491281 aligned_size, SCRATCH_ALIGN);
12501282 pthread_mutex_unlock(&svm.dgpu_aperture->fmm_mutex);
12511283 } else {
1252 uint64_t aligned_padded_size = aligned_size +
1253 SCRATCH_ALIGN - PAGE_SIZE;
1254 void *padded_end, *aligned_start, *aligned_end;
1255
12561284 if (address)
12571285 return NULL;
12581286
1259 mem = mmap(0, aligned_padded_size,
1260 PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS,
1261 -1, 0);
1262 if (!mem)
1263 return NULL;
1264 /* align start and unmap padding */
1265 padded_end = VOID_PTR_ADD(mem, aligned_padded_size);
1266 aligned_start = (void *)ALIGN_UP((uint64_t)mem, SCRATCH_ALIGN);
1267 aligned_end = VOID_PTR_ADD(aligned_start, aligned_size);
1268 if (aligned_start > mem)
1269 munmap(mem, VOID_PTRS_SUB(aligned_start, mem));
1270 if (aligned_end < padded_end)
1271 munmap(aligned_end,
1272 VOID_PTRS_SUB(padded_end, aligned_end));
1273 mem = aligned_start;
1287 mem = mmap_allocate_aligned(PROT_READ | PROT_WRITE,
1288 MAP_PRIVATE | MAP_ANONYMOUS,
1289 aligned_size, SCRATCH_ALIGN, 0,
1290 0, (void *)LONG_MAX);
12741291 }
12751292
12761293 /* Remember scratch backing aperture for later */
21692186 {
21702187 uint32_t i;
21712188 int32_t gpu_mem_id = 0;
2172 uint32_t gpu_id;
2173 HsaNodeProperties props;
21742189 struct kfd_process_device_apertures *process_apertures;
21752190 uint32_t num_of_sysfs_nodes;
21762191 HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
22332248 is_dgpu = false;
22342249
22352250 for (i = 0; i < NumNodes; i++) {
2236 memset(&props, 0, sizeof(props));
2237 ret = topology_sysfs_get_node_props(i, &props, &gpu_id, NULL, NULL);
2251 HsaNodeProperties props;
2252
2253 ret = topology_get_node_props(i, &props);
22382254 if (ret != HSAKMT_STATUS_SUCCESS)
2239 goto sysfs_parse_failed;
2255 goto gpu_mem_init_failed;
22402256
22412257 topology_setup_is_dgpu_param(&props);
22422258
22432259 /* Skip non-GPU nodes */
2244 if (gpu_id != 0) {
2260 if (props.KFDGpuID) {
22452261 int fd = open_drm_render_device(props.DrmRenderMinor);
22462262 if (fd <= 0) {
22472263 ret = HSAKMT_STATUS_ERROR;
2248 goto sysfs_parse_failed;
2264 goto gpu_mem_init_failed;
22492265 }
2266
2267 gpu_mem[gpu_mem_count].usable_peer_id_array =
2268 calloc(NumNodes, sizeof(uint32_t));
2269 if (!gpu_mem[gpu_mem_count].usable_peer_id_array) {
2270 ret = HSAKMT_STATUS_NO_MEMORY;
2271 goto gpu_mem_init_failed;
2272 }
2273 gpu_mem[gpu_mem_count].usable_peer_id_array[0] = props.KFDGpuID;
2274 gpu_mem[gpu_mem_count].usable_peer_id_num = 1;
22502275
22512276 gpu_mem[gpu_mem_count].EngineId.ui32.Major = props.EngineId.ui32.Major;
22522277 gpu_mem[gpu_mem_count].EngineId.ui32.Minor = props.EngineId.ui32.Minor;
22532278 gpu_mem[gpu_mem_count].EngineId.ui32.Stepping = props.EngineId.ui32.Stepping;
22542279
22552280 gpu_mem[gpu_mem_count].drm_render_fd = fd;
2256 gpu_mem[gpu_mem_count].gpu_id = gpu_id;
2281 gpu_mem[gpu_mem_count].gpu_id = props.KFDGpuID;
22572282 gpu_mem[gpu_mem_count].local_mem_size = props.LocalMemSize;
22582283 gpu_mem[gpu_mem_count].device_id = props.DeviceId;
22592284 gpu_mem[gpu_mem_count].node_id = i;
23112336 }
23122337
23132338 for (i = 0 ; i < num_of_sysfs_nodes ; i++) {
2339 HsaNodeProperties nodeProps;
2340 HsaIoLinkProperties linkProps[NumNodes];
2341 uint32_t nodeId;
2342 uint32_t j;
2343
23142344 /* Map Kernel process device data node i <--> gpu_mem_id which
23152345 * indexes into gpu_mem[] based on gpu_id
23162346 */
23202350
23212351 if (all_gpu_id_array_size == gpu_mem_count) {
23222352 ret = HSAKMT_STATUS_ERROR;
2323 goto invalid_gpu_id;
2353 goto aperture_init_failed;
23242354 }
23252355 all_gpu_id_array[all_gpu_id_array_size++] = process_apertures[i].gpu_id;
2356
2357 /* Add this GPU to the usable_peer_id_arrays of all GPUs that
2358 * this GPU has an IO link to. This GPU can map memory
2359 * allocated on those GPUs.
2360 */
2361 nodeId = gpu_mem[gpu_mem_id].node_id;
2362 ret = topology_get_node_props(nodeId, &nodeProps);
2363 if (ret != HSAKMT_STATUS_SUCCESS)
2364 goto aperture_init_failed;
2365 assert(nodeProps.NumIOLinks <= NumNodes);
2366 ret = topology_get_iolink_props(nodeId, nodeProps.NumIOLinks,
2367 linkProps);
2368 if (ret != HSAKMT_STATUS_SUCCESS)
2369 goto aperture_init_failed;
2370 for (j = 0; j < nodeProps.NumIOLinks; j++) {
2371 int32_t to_gpu_mem_id =
2372 gpu_mem_find_by_node_id(linkProps[j].NodeTo);
2373 uint32_t peer;
2374
2375 if (to_gpu_mem_id < 0)
2376 continue;
2377
2378 assert(gpu_mem[to_gpu_mem_id].usable_peer_id_num < NumNodes);
2379 peer = gpu_mem[to_gpu_mem_id].usable_peer_id_num++;
2380 gpu_mem[to_gpu_mem_id].usable_peer_id_array[peer] =
2381 gpu_mem[gpu_mem_id].gpu_id;
2382 }
23262383
23272384 gpu_mem[gpu_mem_id].lds_aperture.base =
23282385 PORT_UINT64_TO_VPTR(process_apertures[i].lds_base);
23742431 ret = acquire_vm(gpu_mem[gpu_mem_id].gpu_id,
23752432 gpu_mem[gpu_mem_id].drm_render_fd);
23762433 if (ret != HSAKMT_STATUS_SUCCESS)
2377 goto acquire_vm_failed;
2434 goto aperture_init_failed;
23782435 }
23792436 all_gpu_id_array_size *= sizeof(uint32_t);
23802437
24382495 free(process_apertures);
24392496 return ret;
24402497
2441 invalid_gpu_id:
2498 aperture_init_failed:
24422499 init_svm_failed:
2443 acquire_vm_failed:
24442500 set_memory_policy_failed:
24452501 free(all_gpu_id_array);
24462502 all_gpu_id_array = NULL;
24472503 get_aperture_ioctl_failed:
24482504 free(process_apertures);
24492505 sysfs_parse_failed:
2506 gpu_mem_init_failed:
24502507 fmm_destroy_process_apertures();
24512508 return ret;
24522509 }
24552512 {
24562513 release_mmio();
24572514 if (gpu_mem) {
2515 while (gpu_mem_count-- > 0)
2516 free(gpu_mem[gpu_mem_count].usable_peer_id_array);
24582517 free(gpu_mem);
24592518 gpu_mem = NULL;
24602519 }
26352694 sizeof(uint32_t);
26362695 } else {
26372696 /* not specified, not registered: map all GPUs */
2638 args.device_ids_array_ptr = (uint64_t)all_gpu_id_array;
2639 args.n_devices = all_gpu_id_array_size / sizeof(uint32_t);
2697 int32_t gpu_mem_id = gpu_mem_find_by_node_id(obj->node_id);
2698
2699 if (!obj->userptr && get_device_id_by_node_id(obj->node_id) &&
2700 gpu_mem_id >= 0) {
2701 args.device_ids_array_ptr = (uint64_t)
2702 gpu_mem[gpu_mem_id].usable_peer_id_array;
2703 args.n_devices =
2704 gpu_mem[gpu_mem_id].usable_peer_id_num;
2705 } else {
2706 args.device_ids_array_ptr = (uint64_t)all_gpu_id_array;
2707 args.n_devices = all_gpu_id_array_size / sizeof(uint32_t);
2708 }
26402709 }
26412710 args.n_success = 0;
26422711
33443413 importArgs.gpu_id = SharedMemoryStruct->ExportGpuId;
33453414
33463415 aperture = fmm_get_aperture(SharedMemoryStruct->ApeInfo);
3416 if (!aperture)
3417 return HSAKMT_STATUS_INVALID_PARAMETER;
33473418
33483419 pthread_mutex_lock(&aperture->fmm_mutex);
33493420 reservedMem = aperture_allocate_area(aperture, NULL,
37363807 fmm_clear_aperture(&gpu_mem[i].scratch_physical);
37373808 }
37383809
3739 gpu_mem_count = 0;
3740 free(gpu_mem);
3741 gpu_mem = NULL;
3742 }
3810 fmm_destroy_process_apertures();
3811 }
8989 uint32_t *nodes_to_map, uint64_t num_of_nodes, uint64_t *gpuvm_address);
9090
9191 int open_drm_render_device(int minor);
92 void *mmap_allocate_aligned(int prot, int flags, uint64_t size, uint64_t align,
93 uint64_t guard_size, void *aper_base, void *aper_limit);
94
9295 #endif /* FMM_H_ */
171171 HSAKMT_STATUS validate_nodeid_array(uint32_t **gpu_id_array,
172172 uint32_t NumberOfNodes, uint32_t *NodeArray);
173173
174 HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id, HsaNodeProperties *props,
175 uint32_t *gpu_id,
176 bool *p2p_links, uint32_t *num_p2pLinks);
177174 HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties *props);
175 HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId,
176 HsaNodeProperties *NodeProperties);
177 HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId,
178 HSAuint32 NumIoLinks,
179 HsaIoLinkProperties *IoLinkProperties);
178180 void topology_setup_is_dgpu_param(HsaNodeProperties *props);
179181 bool topology_is_svm_needed(HSA_ENGINE_ID EngineId);
180182
2323 hsaKmtSetMemoryPolicy;
2424 hsaKmtAllocMemory;
2525 hsaKmtFreeMemory;
26 hsaKmtAvailableMemory;
2627 hsaKmtRegisterMemory;
2728 hsaKmtRegisterMemoryToNodes;
2829 hsaKmtRegisterMemoryWithFlags;
6768 hsaKmtSVMGetAttr;
6869 hsaKmtSetXNACKMode;
6970 hsaKmtGetXNACKMode;
71 hsaKmtOpenSMI;
7072
7173 local: *;
7274 };
198198 return fmm_release(MemoryAddress);
199199 }
200200
201 HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node,
202 HSAuint64 *AvailableBytes)
203 {
204 struct kfd_ioctl_get_available_memory_args args = {};
205 HSAKMT_STATUS result;
206
207 CHECK_KFD_OPEN();
208 CHECK_KFD_MINOR_VERSION(9);
209
210 pr_debug("[%s] node %d\n", __func__, Node);
211
212 result = validate_nodeid(Node, &args.gpu_id);
213 if (result != HSAKMT_STATUS_SUCCESS) {
214 pr_err("[%s] invalid node ID: %d\n", __func__, Node);
215 return result;
216 }
217
218 if (kmtIoctl(kfd_fd, AMDKFD_IOC_AVAILABLE_MEMORY, &args))
219 return HSAKMT_STATUS_ERROR;
220
221 *AvailableBytes = args.available;
222 return HSAKMT_STATUS_SUCCESS;
223 }
224
201225 HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
202226 HSAuint64 MemorySizeInBytes)
203227 {
361385 return ret;
362386 }
363387
364 static uint64_t convertHsaToKfdRange(HsaMemoryRange *HsaRange)
365 {
366 if (sizeof(struct kfd_memory_range) !=
367 sizeof(HsaMemoryRange)) {
368 pr_err("Struct size mismatch in thunk. Cannot cast Hsa Range to KFD IOCTL range\n");
369 return 0;
370 }
371 return (uint64_t) HsaRange;
372 }
373
374388 HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMRead(HSAuint32 Pid,
375389 HsaMemoryRange *LocalMemoryArray,
376390 HSAuint64 LocalMemoryArrayCount,
378392 HSAuint64 RemoteMemoryArrayCount,
379393 HSAuint64 *SizeCopied)
380394 {
381 int ret = HSAKMT_STATUS_SUCCESS;
382 struct kfd_ioctl_cross_memory_copy_args args = {0};
383
384 pr_debug("[%s]\n", __func__);
385
386 if (!LocalMemoryArray || !RemoteMemoryArray ||
387 LocalMemoryArrayCount == 0 || RemoteMemoryArrayCount == 0)
388 return HSAKMT_STATUS_ERROR;
389
390 args.flags = 0;
391 KFD_SET_CROSS_MEMORY_READ(args.flags);
392 args.pid = Pid;
393 args.src_mem_range_array = convertHsaToKfdRange(RemoteMemoryArray);
394 args.src_mem_array_size = RemoteMemoryArrayCount;
395 args.dst_mem_range_array = convertHsaToKfdRange(LocalMemoryArray);
396 args.dst_mem_array_size = LocalMemoryArrayCount;
397 args.bytes_copied = 0;
398
399 if (kmtIoctl(kfd_fd, AMDKFD_IOC_CROSS_MEMORY_COPY, &args))
400 ret = HSAKMT_STATUS_ERROR;
401
402 if (SizeCopied)
403 *SizeCopied = args.bytes_copied;
404
405 return ret;
395 pr_err("[%s] Deprecated\n", __func__);
396
397 return HSAKMT_STATUS_NOT_IMPLEMENTED;
406398 }
407399
408400 HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMWrite(HSAuint32 Pid,
412404 HSAuint64 RemoteMemoryArrayCount,
413405 HSAuint64 *SizeCopied)
414406 {
415 int ret = HSAKMT_STATUS_SUCCESS;
416 struct kfd_ioctl_cross_memory_copy_args args = {0};
417
418 pr_debug("[%s]\n", __func__);
419
420 if (SizeCopied)
421 *SizeCopied = 0;
422
423 if (!LocalMemoryArray || !RemoteMemoryArray ||
424 LocalMemoryArrayCount == 0 || RemoteMemoryArrayCount == 0)
425 return HSAKMT_STATUS_ERROR;
426
427 args.flags = 0;
428 KFD_SET_CROSS_MEMORY_WRITE(args.flags);
429 args.pid = Pid;
430 args.src_mem_range_array = convertHsaToKfdRange(LocalMemoryArray);
431 args.src_mem_array_size = LocalMemoryArrayCount;
432 args.dst_mem_range_array = convertHsaToKfdRange(RemoteMemoryArray);
433 args.dst_mem_array_size = RemoteMemoryArrayCount;
434 args.bytes_copied = 0;
435
436 if (kmtIoctl(kfd_fd, AMDKFD_IOC_CROSS_MEMORY_COPY, &args))
437 ret = HSAKMT_STATUS_ERROR;
438
439 if (SizeCopied)
440 *SizeCopied = args.bytes_copied;
441
442 return ret;
407 pr_err("[%s] Deprecated\n", __func__);
408
409 return HSAKMT_STATUS_NOT_IMPLEMENTED;
443410 }
444411
445412
178178 if (result != HSAKMT_STATUS_SUCCESS)
179179 goto topology_sysfs_failed;
180180
181 result = fmm_init_process_apertures(sys_props.NumNodes);
182 if (result != HSAKMT_STATUS_SUCCESS)
183 goto init_process_aperture_failed;
184
185 result = init_process_doorbells(sys_props.NumNodes);
186 if (result != HSAKMT_STATUS_SUCCESS)
187 goto init_doorbell_failed;
188
189181 kfd_open_count = 1;
190182
191183 if (init_device_debugging_memory(sys_props.NumNodes) != HSAKMT_STATUS_SUCCESS)
211203
212204 pthread_mutex_unlock(&hsakmt_mutex);
213205 return result;
214
215 init_doorbell_failed:
216 fmm_destroy_process_apertures();
217 init_process_aperture_failed:
218206 topology_sysfs_failed:
219207 kfd_version_failed:
220208 close(fd);
234222 if (--kfd_open_count == 0) {
235223 destroy_counter_props();
236224 destroy_device_debugging_memory();
237 destroy_process_doorbells();
238 fmm_destroy_process_apertures();
239225 if (kfd_fd) {
240226 close(kfd_fd);
241227 kfd_fd = 0;
6767 uint32_t eop_buffer_size;
6868 uint32_t gfxv;
6969 bool use_ats;
70 bool unified_ctx_save_restore;
7071 /* This queue structure is allocated from GPU with page aligned size
7172 * but only small bytes are used. We use the extra space in the end for
7273 * cu_mask bits array.
277278 wg_data_size = cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(q->gfxv);
278279 q->ctl_stack_size = PAGE_ALIGN_UP(sizeof(HsaUserContextSaveAreaHeader)
279280 + ctl_stack_size);
280 if (q->gfxv >= GFX_VERSION_NAVI10 &&
281 q->gfxv <= GFX_VERSION_YELLOW_CARP) {
281 if ((q->gfxv & 0x3f0000) == 0xA0000) {
282282 /* HW design limits control stack size to 0x7000.
283283 * This is insufficient for theoretical PM4 cases
284284 * but sufficient for AQL, limited by SPI events.
363363 static void *allocate_exec_aligned_memory(uint32_t size,
364364 bool use_ats,
365365 uint32_t NodeId,
366 bool nonPaged,
366367 bool DeviceLocal,
367368 bool Uncached)
368369 {
369370 if (!use_ats)
370371 return allocate_exec_aligned_memory_gpu(size, PAGE_SIZE, NodeId,
371 DeviceLocal, DeviceLocal,
372 nonPaged, DeviceLocal,
372373 Uncached);
373374 return allocate_exec_aligned_memory_cpu(size);
374375 }
382383 munmap(addr, size);
383384 }
384385
386 static HSAKMT_STATUS register_svm_range(void *mem, uint32_t size,
387 uint32_t gpuNode, uint32_t prefetchNode,
388 uint32_t preferredNode, bool alwaysMapped)
389 {
390 HSA_SVM_ATTRIBUTE *attrs;
391 HSAuint64 s_attr;
392 HSAuint32 nattr;
393 HSAuint32 flags;
394
395 flags = HSA_SVM_FLAG_HOST_ACCESS | HSA_SVM_FLAG_GPU_EXEC;
396
397 if (alwaysMapped) {
398 CHECK_KFD_MINOR_VERSION(11);
399 flags |= HSA_SVM_FLAG_GPU_ALWAYS_MAPPED;
400 }
401
402 nattr = 5;
403 s_attr = sizeof(*attrs) * nattr;
404 attrs = (HSA_SVM_ATTRIBUTE *)alloca(s_attr);
405
406 attrs[0].type = HSA_SVM_ATTR_PREFETCH_LOC;
407 attrs[0].value = prefetchNode;
408 attrs[1].type = HSA_SVM_ATTR_PREFERRED_LOC;
409 attrs[1].value = preferredNode;
410 attrs[2].type = HSA_SVM_ATTR_CLR_FLAGS;
411 attrs[2].value = ~flags;
412 attrs[3].type = HSA_SVM_ATTR_SET_FLAGS;
413 attrs[3].value = flags;
414 attrs[4].type = HSA_SVM_ATTR_ACCESS;
415 attrs[4].value = gpuNode;
416
417 return hsaKmtSVMSetAttr(mem, size, nattr, attrs);
418 }
419
385420 static void free_queue(struct queue *q)
386421 {
387422 if (q->eop_buffer)
388423 free_exec_aligned_memory(q->eop_buffer,
389424 q->eop_buffer_size,
390425 PAGE_SIZE, q->use_ats);
391 if (q->ctx_save_restore)
426 if (q->unified_ctx_save_restore)
427 munmap(q->ctx_save_restore,
428 PAGE_ALIGN_UP(q->ctx_save_restore_size + q->debug_memory_size));
429 else if (q->ctx_save_restore)
392430 free_exec_aligned_memory(q->ctx_save_restore,
393 q->ctx_save_restore_size,
431 q->ctx_save_restore_size + q->debug_memory_size,
394432 PAGE_SIZE, q->use_ats);
395433
396434 free_exec_aligned_memory((void *)q, sizeof(*q), PAGE_SIZE, q->use_ats);
435 }
436
437 static inline void fill_cwsr_header(struct queue *q, void *addr,
438 HsaEvent *Event, volatile HSAint64 *ErrPayload)
439 {
440 HsaUserContextSaveAreaHeader *header =
441 (HsaUserContextSaveAreaHeader *)addr;
442
443 header->ErrorEventId = 0;
444 if (Event)
445 header->ErrorEventId = Event->EventId;
446 header->ErrorReason = ErrPayload;
447 header->DebugOffset = q->ctx_save_restore_size;
448 header->DebugSize = q->debug_memory_size;
397449 }
398450
399451 static int handle_concrete_asic(struct queue *q,
411463 if (q->eop_buffer_size > 0) {
412464 q->eop_buffer = allocate_exec_aligned_memory(q->eop_buffer_size,
413465 q->use_ats,
414 NodeId, true, /* Unused for VRAM */false);
466 NodeId, true, true, /* Unused for VRAM */false);
415467 if (!q->eop_buffer)
416468 return HSAKMT_STATUS_NO_MEMORY;
417469
423475
424476 if (ret) {
425477 uint32_t total_mem_alloc_size = 0;
426 HsaUserContextSaveAreaHeader *header;
478 HsaNodeProperties node;
479 bool svm_api;
427480
428481 args->ctx_save_restore_size = q->ctx_save_restore_size;
429482 args->ctl_stack_size = q->ctl_stack_size;
433486 */
434487 total_mem_alloc_size = q->ctx_save_restore_size +
435488 q->debug_memory_size;
436 q->ctx_save_restore =
437 allocate_exec_aligned_memory(total_mem_alloc_size,
438 q->use_ats, NodeId, false, false);
439
440 if (!q->ctx_save_restore)
441 return HSAKMT_STATUS_NO_MEMORY;
489
490 if (hsaKmtGetNodeProperties(NodeId, &node))
491 svm_api = false;
492 else
493 svm_api = node.Capability.ui32.SVMAPISupported;
494
495 /* Allocate unified memory for context save restore
496 * area on dGPU.
497 */
498 if (!q->use_ats && svm_api) {
499 uint32_t size = PAGE_ALIGN_UP(total_mem_alloc_size);
500 void *addr;
501 HSAKMT_STATUS r = HSAKMT_STATUS_ERROR;
502
503 addr = mmap_allocate_aligned(PROT_READ | PROT_WRITE,
504 MAP_ANONYMOUS | MAP_PRIVATE,
505 size, GPU_HUGE_PAGE_SIZE, 0,
506 0, (void *)LONG_MAX);
507 if (!addr) {
508 pr_err("mmap failed to alloc ctx area size 0x%x: %s\n",
509 size, strerror(errno));
510 } else {
511 /*
512 * To avoid fork child process COW MMU notifier
513 * callback evict parent process queues.
514 */
515 if (madvise(addr, size, MADV_DONTFORK))
516 pr_err("madvise failed -%d\n", errno);
517
518 fill_cwsr_header(q, addr, Event, ErrPayload);
519
520 r = register_svm_range(addr, size,
521 NodeId, NodeId, 0, true);
522
523 if (r == HSAKMT_STATUS_SUCCESS) {
524 q->ctx_save_restore = addr;
525 q->unified_ctx_save_restore = true;
526 } else {
527 munmap(addr, size);
528 }
529 }
530 }
531
532 if (!q->unified_ctx_save_restore) {
533 q->ctx_save_restore = allocate_exec_aligned_memory(
534 total_mem_alloc_size,
535 q->use_ats, NodeId,
536 false, false, false);
537
538 if (!q->ctx_save_restore)
539 return HSAKMT_STATUS_NO_MEMORY;
540
541 fill_cwsr_header(q, q->ctx_save_restore, Event, ErrPayload);
542 }
442543
443544 args->ctx_save_restore_address = (uintptr_t)q->ctx_save_restore;
444
445 header = (HsaUserContextSaveAreaHeader *)q->ctx_save_restore;
446 header->ErrorEventId = 0;
447 if (Event)
448 header->ErrorEventId = Event->EventId;
449 header->ErrorReason = ErrPayload;
450 header->DebugOffset = q->ctx_save_restore_size;
451 header->DebugSize = q->debug_memory_size;
452545 }
453546
454547 return HSAKMT_STATUS_SUCCESS;
476569 int err;
477570 HsaNodeProperties props;
478571 uint32_t cu_num, i;
479 bool use_ats;
480572
481573 CHECK_KFD_OPEN();
482574
488580 if (result != HSAKMT_STATUS_SUCCESS)
489581 return result;
490582
491 use_ats = prefer_ats(NodeId);
492
493583 struct queue *q = allocate_exec_aligned_memory(sizeof(*q),
494 use_ats,
495 NodeId, false, true);
584 false, NodeId, true, false, true);
496585 if (!q)
497586 return HSAKMT_STATUS_NO_MEMORY;
498587
499588 memset(q, 0, sizeof(*q));
500589
501590 q->gfxv = get_gfxv_by_node_id(NodeId);
502 q->use_ats = use_ats;
591 q->use_ats = false;
503592 q->eop_buffer_size = EOP_BUFFER_SIZE(q->gfxv);
504593
505594 /* By default, CUs are all turned on. Initialize cu_mask to '1
585674 err = map_doorbell(NodeId, gpu_id, doorbell_mmap_offset);
586675 if (err != HSAKMT_STATUS_SUCCESS) {
587676 hsaKmtDestroyQueue(q->queue_id);
588 free_queue(q);
589677 return HSAKMT_STATUS_ERROR;
590678 }
591679
5555 #define KFD_SYSFS_PATH_NODES "/sys/devices/virtual/kfd/kfd/topology/nodes"
5656
5757 typedef struct {
58 uint32_t gpu_id;
5958 HsaNodeProperties node;
6059 HsaMemoryProperties *mem; /* node->NumBanks elements */
6160 HsaCacheProperties *cache;
8786 };
8887
8988 static HSAKMT_STATUS topology_take_snapshot(void);
90 static HSAKMT_STATUS topology_drop_snapshot(void);
89 static void topology_drop_snapshot(void);
9190
9291 static const struct hsa_gfxip_table gfxip_lookup_table[] = {
9392 /* Kaveri Family */
800799 return ret;
801800 }
802801
803 static const struct hsa_gfxip_table *find_hsa_gfxip_device(uint16_t device_id)
804 {
802 static const struct hsa_gfxip_table *find_hsa_gfxip_device(uint16_t device_id, uint8_t gfxv_major)
803 {
804 if (gfxv_major > 10)
805 return NULL;
806
805807 uint32_t i, table_size;
806808
807809 table_size = sizeof(gfxip_lookup_table)/sizeof(struct hsa_gfxip_table);
10041006 return ret;
10051007 }
10061008
1007 static int topology_get_marketing_name(int minor, uint16_t *marketing_name)
1009 static int topology_get_node_props_from_drm(HsaNodeProperties *props)
10081010 {
10091011 int drm_fd;
10101012 uint32_t major_version;
10111013 uint32_t minor_version;
10121014 amdgpu_device_handle device_handle;
1015 struct amdgpu_gpu_info gpu_info;
10131016 const char *name;
1014 int i;
1015
1016 if (marketing_name == NULL)
1017 int i, ret = 0;
1018
1019 if (props == NULL)
10171020 return -1;
1018 drm_fd = drmOpenRender(minor);
1021
1022 drm_fd = drmOpenRender(props->DrmRenderMinor);
10191023 if (drm_fd < 0)
10201024 return -1;
1025
10211026 if (amdgpu_device_initialize(drm_fd,
10221027 &major_version, &minor_version, &device_handle) < 0) {
1023 drmClose(drm_fd);
1024 return -1;
1025 }
1028 ret = -1;
1029 goto err_device_initialize;
1030 }
1031
10261032 name = amdgpu_get_marketing_name(device_handle);
10271033 if (name != NULL) {
10281034 for (i = 0; name[i] != 0 && i < HSA_PUBLIC_NAME_SIZE - 1; i++)
1029 marketing_name[i] = name[i];
1030 marketing_name[i] = '\0';
1031 }
1035 props->MarketingName[i] = name[i];
1036 props->MarketingName[i] = '\0';
1037 }
1038
1039 if (amdgpu_query_gpu_info(device_handle, &gpu_info)) {
1040 ret = -1;
1041 goto err_query_gpu_info;
1042 }
1043
1044 props->FamilyID = gpu_info.family_id;
1045
1046 err_query_gpu_info:
10321047 amdgpu_device_deinitialize(device_handle);
1048 err_device_initialize:
10331049 drmClose(drm_fd);
1034 return 0;
1035 }
1036
1037 HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id,
1038 HsaNodeProperties *props,
1039 uint32_t *gpu_id,
1040 bool *p2p_links,
1041 uint32_t *num_p2pLinks)
1050 return ret;
1051 }
1052
1053 static HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id,
1054 HsaNodeProperties *props,
1055 bool *p2p_links,
1056 uint32_t *num_p2pLinks)
10421057 {
10431058 FILE *fd;
10441059 char *read_buf, *p, *envvar, dummy;
10551070 HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
10561071
10571072 assert(props);
1058 assert(gpu_id);
10591073 ret = topology_sysfs_map_node_id(node_id, &sys_node_id);
10601074 if (ret != HSAKMT_STATUS_SUCCESS)
10611075 return ret;
10621076
10631077 /* Retrieve the GPU ID */
1064 ret = topology_sysfs_get_gpu_id(sys_node_id, gpu_id);
1078 ret = topology_sysfs_get_gpu_id(sys_node_id, &props->KFDGpuID);
1079 if (ret != HSAKMT_STATUS_SUCCESS)
1080 return ret;
10651081
10661082 read_buf = malloc(PAGE_SIZE);
10671083 if (!read_buf)
11701186 gfxv = (uint32_t)prop_val;
11711187 }
11721188
1189 /* Bail out early, if a CPU node */
1190 if (props->NumCPUCores)
1191 goto err;
1192
11731193 gfxv_major = HSA_GET_GFX_VERSION_MAJOR(gfxv);
11741194 gfxv_minor = HSA_GET_GFX_VERSION_MINOR(gfxv);
11751195 gfxv_stepping = HSA_GET_GFX_VERSION_STEP(gfxv);
11761196
1177 hsa_gfxip = find_hsa_gfxip_device(props->DeviceId);
1197 hsa_gfxip = find_hsa_gfxip_device(props->DeviceId, gfxv_major);
11781198 if (hsa_gfxip || gfxv) {
11791199 envvar = getenv("HSA_OVERRIDE_GFX_VERSION");
11801200 if (envvar) {
12101230 snprintf((char *)props->AMDName, sizeof(props->AMDName)-1, "GFX%06x",
12111231 HSA_GET_GFX_VERSION_FULL(props->EngineId.ui32));
12121232
1213 if (!props->NumCPUCores) {
1214 /* Is dGPU Node, not APU
1215 * Retrieve the marketing name of the node.
1216 */
1217 if (topology_get_marketing_name(props->DrmRenderMinor,
1218 props->MarketingName) != 0)
1219 pr_info("failed to get marketing name for device ID 0x%x\n",
1220 props->DeviceId);
1221 }
1233 /* Is dGPU Node, not APU
1234 * Retrieve the marketing name of the node.
1235 */
1236 if (topology_get_node_props_from_drm(props))
1237 pr_info("failed to get marketing name for device ID 0x%x\n", props->DeviceId);
12221238
12231239 /* Get VGPR/SGPR size in byte per CU */
12241240 props->SGPRSizePerCU = SGPR_SIZE_PER_CU;
17221738 HsaIoLinkProperties *props = node_props[gpu_node].link;
17231739 uint32_t i;
17241740
1725 if (!node_props[gpu_node].gpu_id || !props ||
1741 if (!node_props[gpu_node].node.KFDGpuID || !props ||
17261742 node_props[gpu_node].node.NumIOLinks == 0)
17271743 return -1;
17281744
17751791 return HSAKMT_STATUS_INVALID_PARAMETER;
17761792
17771793 /* CPU->CPU is not an indirect link */
1778 if (!node_props[node1].gpu_id && !node_props[node2].gpu_id)
1794 if (!node_props[node1].node.KFDGpuID && !node_props[node2].node.KFDGpuID)
17791795 return HSAKMT_STATUS_INVALID_NODE_UNIT;
17801796
17811797 if (node_props[node1].node.HiveID &&
17831799 node_props[node1].node.HiveID == node_props[node2].node.HiveID)
17841800 return HSAKMT_STATUS_INVALID_PARAMETER;
17851801
1786 if (node_props[node1].gpu_id)
1802 if (node_props[node1].node.KFDGpuID)
17871803 dir_cpu1 = gpu_get_direct_link_cpu(node1, node_props);
1788 if (node_props[node2].gpu_id)
1804 if (node_props[node2].node.KFDGpuID)
17891805 dir_cpu2 = gpu_get_direct_link_cpu(node2, node_props);
17901806
17911807 if (dir_cpu1 < 0 && dir_cpu2 < 0)
17921808 return HSAKMT_STATUS_ERROR;
17931809
17941810 /* if the node2(dst) is GPU , it need to be large bar for host access*/
1795 if (node_props[node2].gpu_id) {
1811 if (node_props[node2].node.KFDGpuID) {
17961812 for (i = 0; i < node_props[node2].node.NumMemoryBanks; ++i)
17971813 if (node_props[node2].mem[i].HeapType ==
17981814 HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC)
19211937 for (i = 0; i < sys_props.NumNodes; i++) {
19221938 ret = topology_sysfs_get_node_props(i,
19231939 &temp_props[i].node,
1924 &temp_props[i].gpu_id,
19251940 &p2p_links, &num_p2pLinks);
19261941 if (ret != HSAKMT_STATUS_SUCCESS) {
19271942 free_properties(temp_props, i);
19621977 goto err;
19631978 }
19641979 }
1965 } else if (!temp_props[i].gpu_id) { /* a CPU node */
1980 } else if (!temp_props[i].node.KFDGpuID) { /* a CPU node */
19661981 ret = topology_get_cpu_cache_props(
19671982 i, cpuinfo, &temp_props[i]);
19681983 if (ret != HSAKMT_STATUS_SUCCESS) {
20672082 }
20682083
20692084 /* Drop the Snashot of the HSA topology information. Assume lock is held. */
2070 HSAKMT_STATUS topology_drop_snapshot(void)
2071 {
2072 HSAKMT_STATUS err;
2073
2074 if (!!g_system != !!g_props) {
2085 void topology_drop_snapshot(void)
2086 {
2087 if (!!g_system != !!g_props)
20752088 pr_warn("Probably inconsistency?\n");
2076 err = HSAKMT_STATUS_SUCCESS;
2077 goto out;
2078 }
20792089
20802090 if (g_props) {
20812091 /* Remove state */
20912101 map_user_to_sysfs_node_id = NULL;
20922102 map_user_to_sysfs_node_id_size = 0;
20932103 }
2094
2095 err = HSAKMT_STATUS_SUCCESS;
2096
2097 out:
2098 return err;
20992104 }
21002105
21012106 HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id)
21032108 if (!g_props || !g_system || g_system->NumNodes <= nodeid)
21042109 return HSAKMT_STATUS_INVALID_NODE_UNIT;
21052110 if (gpu_id)
2106 *gpu_id = g_props[nodeid].gpu_id;
2111 *gpu_id = g_props[nodeid].node.KFDGpuID;
21072112
21082113 return HSAKMT_STATUS_SUCCESS;
21092114 }
21132118 uint64_t node_idx;
21142119
21152120 for (node_idx = 0; node_idx < g_system->NumNodes; node_idx++) {
2116 if (g_props[node_idx].gpu_id == gpu_id) {
2121 if (g_props[node_idx].node.KFDGpuID == gpu_id) {
21172122 *node_id = node_idx;
21182123 return HSAKMT_STATUS_SUCCESS;
21192124 }
21252130
21262131 HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemProperties(HsaSystemProperties *SystemProperties)
21272132 {
2128 HSAKMT_STATUS err;
2133 HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS;
21292134
21302135 CHECK_KFD_OPEN();
21312136
21332138 return HSAKMT_STATUS_INVALID_PARAMETER;
21342139
21352140 pthread_mutex_lock(&hsakmt_mutex);
2141
2142 /* We already have a valid snapshot. Avoid double initialization that
2143 * would leak memory.
2144 */
2145 if (g_system) {
2146 *SystemProperties = *g_system;
2147 goto out;
2148 }
21362149
21372150 err = topology_take_snapshot();
21382151 if (err != HSAKMT_STATUS_SUCCESS)
21402153
21412154 assert(g_system);
21422155
2156 err = fmm_init_process_apertures(g_system->NumNodes);
2157 if (err != HSAKMT_STATUS_SUCCESS)
2158 goto init_process_apertures_failed;
2159
2160 err = init_process_doorbells(g_system->NumNodes);
2161 if (err != HSAKMT_STATUS_SUCCESS)
2162 goto init_doorbells_failed;
2163
21432164 *SystemProperties = *g_system;
2144 err = HSAKMT_STATUS_SUCCESS;
2165
2166 goto out;
2167
2168 init_doorbells_failed:
2169 fmm_destroy_process_apertures();
2170 init_process_apertures_failed:
2171 topology_drop_snapshot();
21452172
21462173 out:
21472174 pthread_mutex_unlock(&hsakmt_mutex);
21502177
21512178 HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemProperties(void)
21522179 {
2153 HSAKMT_STATUS err;
2154
21552180 pthread_mutex_lock(&hsakmt_mutex);
21562181
2157 err = topology_drop_snapshot();
2182 destroy_process_doorbells();
2183 fmm_destroy_process_apertures();
2184 topology_drop_snapshot();
21582185
21592186 pthread_mutex_unlock(&hsakmt_mutex);
21602187
2161 return err;
2188 return HSAKMT_STATUS_SUCCESS;
2189 }
2190
2191 HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId,
2192 HsaNodeProperties *NodeProperties)
2193 {
2194 if (!g_system || !g_props || NodeId >= g_system->NumNodes)
2195 return HSAKMT_STATUS_ERROR;
2196
2197 *NodeProperties = g_props[NodeId].node;
2198 return HSAKMT_STATUS_SUCCESS;
21622199 }
21632200
21642201 HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeProperties(HSAuint32 NodeId,
21772214 if (err != HSAKMT_STATUS_SUCCESS)
21782215 goto out;
21792216
2180 *NodeProperties = g_props[NodeId].node;
2217 err = topology_get_node_props(NodeId, NodeProperties);
2218 if (err != HSAKMT_STATUS_SUCCESS)
2219 goto out;
21812220 /* For CPU only node don't add any additional GPU memory banks. */
21822221 if (gpu_id) {
21832222 uint64_t base, limit;
21892228 &limit) == HSAKMT_STATUS_SUCCESS)
21902229 NodeProperties->NumMemoryBanks += 1;
21912230 }
2192 err = HSAKMT_STATUS_SUCCESS;
21932231
21942232 out:
21952233 pthread_mutex_unlock(&hsakmt_mutex);
23172355 return err;
23182356 }
23192357
2358 HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId,
2359 HSAuint32 NumIoLinks,
2360 HsaIoLinkProperties *IoLinkProperties)
2361 {
2362 if (!g_system || !g_props || NodeId >= g_system->NumNodes)
2363 return HSAKMT_STATUS_ERROR;
2364
2365 memcpy(IoLinkProperties, g_props[NodeId].link,
2366 NumIoLinks * sizeof(*IoLinkProperties));
2367
2368 return HSAKMT_STATUS_SUCCESS;
2369 }
2370
23202371 HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeIoLinkProperties(HSAuint32 NodeId,
23212372 HSAuint32 NumIoLinks,
23222373 HsaIoLinkProperties *IoLinkProperties)
23232374 {
23242375 HSAKMT_STATUS err;
2325 uint32_t i;
23262376
23272377 if (!IoLinkProperties)
23282378 return HSAKMT_STATUS_INVALID_PARAMETER;
23422392 goto out;
23432393 }
23442394
2345 for (i = 0; i < MIN(g_props[NodeId].node.NumIOLinks, NumIoLinks); i++) {
2346 assert(g_props[NodeId].link);
2347 IoLinkProperties[i] = g_props[NodeId].link[i];
2348 }
2349
2350 err = HSAKMT_STATUS_SUCCESS;
2395 assert(g_props[NodeId].link);
2396 err = topology_get_iolink_props(NodeId, NumIoLinks, IoLinkProperties);
23512397
23522398 out:
23532399 pthread_mutex_unlock(&hsakmt_mutex);
23822428 return 0;
23832429
23842430 for (i = 0; i < g_system->NumNodes; i++) {
2385 if (g_props[i].gpu_id == gpu_id)
2431 if (g_props[i].node.KFDGpuID == gpu_id)
23862432 return g_props[i].node.DeviceId;
23872433 }
23882434
3232 set ( CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc." )
3333 set ( CPACK_PACKAGE_DESCRIPTION "This package includes kfdtest, the list of excluded tests for each ASIC, and a convenience script to run the test suite" )
3434 set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "Test suite for ROCK/KFD" )
35
36 # Make proper version for appending
37 # Default Value is 99999, setting it first
38 set(ROCM_VERSION_FOR_PACKAGE "99999")
39 if(DEFINED ENV{ROCM_LIBPATCH_VERSION})
40 set(ROCM_VERSION_FOR_PACKAGE $ENV{ROCM_LIBPATCH_VERSION})
41 endif()
42
3543 set ( CPACK_PACKAGE_VERSION_MAJOR "1" )
3644 set ( CPACK_PACKAGE_VERSION_MINOR "0" )
3745 set ( CPACK_PACKAGE_VERSION_PATCH "0" )
3846 set ( CPACK_PACKAGE_HOMEPAGE_URL "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface" )
47 set ( CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
48 set ( CPACK_RPM_FILE_NAME "RPM-DEFAULT")
49
50 set(PACKAGE_VERSION_STR "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}.${ROCM_VERSION_FOR_PACKAGE}")
51 set(CPACK_PACKAGE_VERSION "${PACKAGE_VERSION_STR}")
52
3953
4054 ## Define default variable and variables for the optional build target hsakmt-dev
4155 set ( SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} CACHE STRING "Location of hsakmt source code." )
94108
95109 message ( "Find libhsakmt at ${HSAKMT_LIBRARY_DIRS}" )
96110
97 set ( SP3_DIR ${PROJECT_SOURCE_DIR}/sp3 )
111 if ( POLICY CMP0074 )
112 cmake_policy( SET CMP0074 NEW )
113 endif()
114
115 find_path( LIGHTNING_CMAKE_DIR NAMES LLVMConfig.cmake
116 PATHS $ENV{OUT_DIR}/llvm/lib/cmake/llvm NO_CACHE NO_DEFAULT_PATH)
117
118 if ( DEFINED LIGHTNING_CMAKE_DIR AND EXISTS ${LIGHTNING_CMAKE_DIR} )
119 set ( LLVM_DIR ${LIGHTNING_CMAKE_DIR} )
120 else()
121 message( STATUS "Couldn't find Lightning build in compute directory. "
122 "Searching LLVM_DIR then defaulting to system LLVM install if still not found..." )
123 endif()
124
125 find_package( LLVM REQUIRED CONFIG )
126
127 if( ${LLVM_PACKAGE_VERSION} VERSION_LESS "7.0" )
128 message( FATAL_ERROR "Requires LLVM 7.0 or greater "
129 "(found ${LLVM_PACKAGE_VERSION})" )
130 elseif( ${LLVM_PACKAGE_VERSION} VERSION_LESS "14.0" )
131 message( WARNING "Not using latest LLVM version. "
132 "Some ASIC targets may not work!" )
133 endif()
134
135 message( STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}" )
136 message( STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}" )
137
138 include_directories(${LLVM_INCLUDE_DIRS})
139 separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS})
140 add_definitions(${LLVM_DEFINITIONS_LIST})
141
142 llvm_map_components_to_libnames(llvm_libs AMDGPUAsmParser Core Support)
98143
99144 include_directories(${PROJECT_SOURCE_DIR}/gtest-1.6.0)
100145 include_directories(${PROJECT_SOURCE_DIR}/include)
101146 include_directories(${PROJECT_SOURCE_DIR}/../../include)
102 include_directories(${SP3_DIR})
103147
104148 include_directories(${DRM_INCLUDE_DIRS})
105149
111155 src/Dispatch.cpp
112156 src/GoogleTestExtension.cpp
113157 src/IndirectBuffer.cpp
114 src/IsaGenerator.cpp
115 src/IsaGenerator_Aldebaran.cpp
116 src/IsaGenerator_Gfx10.cpp
117 src/IsaGenerator_Gfx72.cpp
118 src/IsaGenerator_Gfx8.cpp
119 src/IsaGenerator_Gfx9.cpp
158 src/Assemble.cpp
159 src/ShaderStore.cpp
120160 src/LinuxOSWrapper.cpp
121161 src/PM4Packet.cpp
122162 src/PM4Queue.cpp
139179 src/KFDExceptionTest.cpp
140180 src/KFDGraphicsInterop.cpp
141181 src/KFDPerfCounters.cpp
142 src/KFDDBGTest.cpp
143182 src/KFDGWSTest.cpp
144183 src/KFDIPCTest.cpp
184 src/KFDASMTest.cpp
145185
146186 src/KFDEvictTest.cpp
147187 src/KFDHWSTest.cpp
162202
163203 if ( "${CMAKE_C_COMPILER_VERSION}" STRGREATER "4.8.0")
164204 ## Add --enable-new-dtags to generate DT_RUNPATH
165 set ( CMAKE_CXX_FLAGS "-std=gnu++11 -Wl,--enable-new-dtags" )
205 set ( CMAKE_CXX_FLAGS "-std=gnu++17 -Wl,--enable-new-dtags" )
166206 endif()
167207 if ( "${CMAKE_BUILD_TYPE}" STREQUAL Release )
168208 set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2" )
180220 # The modules found by pkg_check_modules() in the default pkg config
181221 # path do not need to use link_directories() here.
182222 link_directories(${HSAKMT_LIBRARY_DIRS})
183 link_directories(${SP3_DIR})
184223
185224 add_executable(kfdtest ${SRC_FILES})
186225
187 target_link_libraries(kfdtest ${HSAKMT_LIBRARIES} ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread m stdc++ rt amdsp3 numa)
226 target_link_libraries(kfdtest ${HSAKMT_LIBRARIES} ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} ${llvm_libs} pthread m stdc++ rt numa)
188227
189228 configure_file ( scripts/kfdtest.exclude kfdtest.exclude COPYONLY )
190229 configure_file ( scripts/run_kfdtest.sh run_kfdtest.sh COPYONLY )
6868 "KFDQMTest.mGPUShareBO:"\
6969 "KFDQMTest.SdmaEventInterrupt:"\
7070 "KFDMemoryTest.CacheInvalidateOnRemoteWrite:"\
71 "KFDDBGTest.BasicDebuggerSuspendResume:"\
7271 "KFDEvictTest.BurstyTest:"\
7372 "KFDHWSTest.*:"\
7473 "KFDSVMRangeTest.ReadOnlyRangeTest"
105104 "KFDQMTest.Atomics:"\
106105 "KFDQMTest.GPUDoorbellWrite"
107106
107 # KFDCWSRTest.BasicTest*: SWDEV-353206
108 BLACKLIST_GFX10=\
109 "KFDMemoryTest.DeviceHdpFlush:"\
110 "KFDQMTest.BasicCuMaskingEven:"\
111 "KFDSVMEvictTest.*:"\
112 "KFDCWSRTest.BasicTest*"
113
114 BLACKLIST_GFX10_NV2X=\
115 "$BLACKLIST_GFX10:"\
116 "KFDPerfCountersTest.*"
117
118 # GFX11 still undergoing debug. Ticket links:
119 # KFDMemoryTest.FlatScratchAccess - SWDEV-329877
120 # KFDEvictTest.QueueTest - SWDEV-325064
121 # KFDQMTest.MultipleCpQueuesStressDispatch - SWDEV-340965
122 # KFDExceptionTest.* - SWDEV-340972
123 TEMPORARY_BLACKLIST_GFX11=\
124 "KFDQMTest.CreateAqlCpQueue:"\
125 "KFDQMTest.MultipleCpQueuesStressDispatch:"\
126 "KFDCWSRTest.InterruptRestore:"\
127 "KFDExceptionTest.*:"\
128 "KFDEvictTest.QueueTest:"\
129 "KFDSVMRangeTest.*Migrate*:"\
130 "KFDSVMRangeTest.*Migration*:"\
131 "KFDMemoryTest.FlatScratchAccess"
132
108133 # KFDQMTest.CpuWriteCoherence fails. 0 dwordsAvailable (KFD-338)
109134 # KFDMemoryTest.MemoryRegister fails on SDMA queue creation (KFD-337)
110135 FILTER[kaveri]=\
223248 "KFDMemoryTest.PtraceAccess:"\
224249 "KFDMemoryTest.DeviceHdpFlush"
225250
226 # SP3 Compiler needs to be updated for GFX10. Temporarily disable all tests
227 # that require shader compiler
228 # Adding KFDSVMEvictTest as SVM/HMM was never validated on GFX10
229 TEMP_GFX10_BLACKLIST=\
230 "KFDMemoryTest.FlatScratchAccess:"\
231 "KFDMemoryTest.PtraceAccessInvisibleVram:"\
232 "KFDQMTest.QueuePriorityOnDifferentPipe:"\
233 "KFDQMTest.QueuePriorityOnSamePipe:"\
234 "KFDCWSRTest.BasicTest:"\
235 "KFDQMTest.BasicCuMaskingEven:"\
236 "KFDEvictTest.QueueTest:"\
237 "KFDMemoryTest.MapUnmapToNodes:"\
238 "KFDMemoryTest.HostHdpFlush:"\
239 "KFDMemoryTest.DeviceHdpFlush:"\
240 "KFDSVMEvictTest.*"
241
242251 FILTER[navi10]=\
243252 "$BLACKLIST_ALL_ASICS:"\
244 "$TEMP_GFX10_BLACKLIST:"\
253 "$BLACKLIST_GFX10:"\
245254 "KFDMemoryTest.MMBench"
246255
247256 # Need to verify the following failed tests on another machine:
250259 # P2PBandwidth failing (wait times out) on node-to-multiple-nodes by [push, NONE]
251260 FILTER[navi12]=\
252261 "$BLACKLIST_ALL_ASICS:"\
262 "$BLACKLIST_GFX10:"\
253263 "KFDExceptionTest.*:"\
254264 "KFDPerfCountersTest.*:"\
255 "KFDPerformanceTest.P2PBandWidthTest:"\
256 "$TEMP_GFX10_BLACKLIST"
265 "KFDPerformanceTest.P2PBandWidthTest"
257266
258267 FILTER[navi14]=\
259268 "$BLACKLIST_ALL_ASICS:"\
260 "$TEMP_GFX10_BLACKLIST"
269 "$BLACKLIST_GFX10"
261270
262271 FILTER[sienna_cichlid]=\
263272 "$BLACKLIST_ALL_ASICS:"\
264 "$TEMP_GFX10_BLACKLIST:"\
265 "KFDQMTest.BasicCuMaskingEven:"\
266 "KFDDBGTest.*:"\
267 "KFDPerfCountersTest.*:"\
273 "$BLACKLIST_GFX10_NV2X"
268274
269275 FILTER[navy_flounder]=\
270276 "$BLACKLIST_ALL_ASICS:"\
271 "$TEMP_GFX10_BLACKLIST:"\
272 "KFDQMTest.BasicCuMaskingEven:"\
273 "KFDDBGTest.*:"\
274 "KFDPerfCountersTest.*:"\
277 "$BLACKLIST_GFX10_NV2X"
275278
276279 FILTER[dimgrey_cavefish]=\
277280 "$BLACKLIST_ALL_ASICS:"\
278 "$TEMP_GFX10_BLACKLIST:"\
279 "KFDQMTest.BasicCuMaskingEven:"\
280 "KFDDBGTest.*:"\
281 "KFDPerfCountersTest.*:"\
281 "$BLACKLIST_GFX10_NV2X"
282282
283283 FILTER[beige_goby]=\
284284 "$BLACKLIST_ALL_ASICS:"\
285 "$TEMP_GFX10_BLACKLIST:"\
286 "KFDQMTest.BasicCuMaskingEven:"\
287 "KFDDBGTest.*:"\
288 "KFDPerfCountersTest.*:"\
285 "$BLACKLIST_GFX10_NV2X"
289286
290287 FILTER[yellow_carp]=\
291288 "$BLACKLIST_ALL_ASICS:"\
292 "$TEMP_GFX10_BLACKLIST:"\
293 "KFDQMTest.BasicCuMaskingEven:"\
294 "KFDIPCTest.CMABasicTest"
289 "$BLACKLIST_GFX10_NV2X"
290
291 FILTER[gfx1100]=\
292 "$BLACKLIST_ALL_ASICS:"\
293 "$BLACKLIST_GFX10_NV2X:"\
294 "$TEMPORARY_BLACKLIST_GFX11"
295
296 FILTER[gfx1101]=\
297 "$BLACKLIST_ALL_ASICS:"\
298 "$BLACKLIST_GFX10_NV2X:"\
299 "$TEMPORARY_BLACKLIST_GFX11"
300
301 FILTER[gfx1102]=\
302 "$BLACKLIST_ALL_ASICS:"\
303 "$BLACKLIST_GFX10_NV2X:"\
304 "$TEMPORARY_BLACKLIST_GFX11"
305
306 FILTER[gfx1103]=\
307 "$BLACKLIST_ALL_ASICS:"\
308 "$BLACKLIST_GFX10_NV2X:"\
309 "$TEMPORARY_BLACKLIST_GFX11"
310
311 FILTER[gfx1036]=\
312 "$BLACKLIST_ALL_ASICS:"\
313 "$BLACKLIST_GFX10_NV2X"
7979 NODE=""
8080 FORCE_HIGH=""
8181 RUN_IN_DOCKER=""
82 ADDITIONAL_EXCLUDE=""
8283
8384 printUsage() {
8485 echo
9495 echo " -l , --list List available nodes"
9596 echo " --high Force clocks to high for test execution"
9697 echo " -d , --docker Run in docker container"
98 echo " -e , --exclude Additional tests to exclude, in addition to kfdtest.exclude (colon-separated, single quoted string as an argument)"
9799 echo " -h , --help Prints this help"
98100 echo
99101 echo "Gtest arguments will be forwarded to the app"
121123 gtestFilter="--gtest_filter=${FILTER[$platform]}"
122124 ;;
123125 esac
126 if [ -n "$ADDITIONAL_EXCLUDE" ]; then
127 gtestFilter="$gtestFilter:$ADDITIONAL_EXCLUDE"
128 fi
124129 }
125130
126131 TOPOLOGY_SYSFS_DIR=/sys/devices/virtual/kfd/kfd/topology/nodes
138143 }
139144
140145
141 # Prints GPU Name for the given Node ID
146 # Prints GPU Name for the given Node ID. If transitioned to IP discovery,
147 # use target gfx version
142148 # param - Node ID
143149 getNodeName() {
144150 local nodeId=$1; shift;
147153 local CpuCoresCount=$(cat $TOPOLOGY_SYSFS_DIR/$nodeId/properties | grep cpu_cores_count | awk '{print $2}')
148154 local SimdCount=$(cat $TOPOLOGY_SYSFS_DIR/$nodeId/properties | grep simd_count | awk '{print $2}')
149155 if [ "$CpuCoresCount" -eq 0 ] && [ "$SimdCount" -gt 0 ]; then
150 gpuName="raven_dgpuFallback"
156 gpuName="raven_dgpuFallback"
157 fi
158 elif [ "$gpuName" == "ip discovery" ]; then
159 if [ -n "$HSA_OVERRIDE_GFX_VERSION" ]; then
160 gpuName="gfx$(echo "$HSA_OVERRIDE_GFX_VERSION" | awk 'BEGIN {FS="."; RS=""} {printf "%d%x%x", $1, $2, $3 }')"
161 else
162 local GfxVersionDec=$(cat $TOPOLOGY_SYSFS_DIR/$nodeId/properties | grep gfx_target_version | awk '{print $2}')
163 gpuName="gfx$(printf "$GfxVersionDec" | fold -w2 | awk 'BEGIN {FS="\n"; RS=""} {printf "%d%x%x", $1, $2, $3}')"
151164 fi
152165 fi
153166 echo "$gpuName"
165178 exit 0
166179 fi
167180 PKG_ROOT="$(getPackageRoot)"
181 fi
182
183 if [ -n "$GTEST_ARGS" ] && [ -n "$ADDITIONAL_EXCLUDE" ]; then
184 echo "Cannot use -e and --gtest_filter flags together"
185 exit 0
168186 fi
169187
170188 if [ "$NODE" == "" ]; then
241259 FORCE_HIGH="true" ;;
242260 -d | --docker )
243261 RUN_IN_DOCKER="true" ;;
262 -e | --exclude )
263 ADDITIONAL_EXCLUDE="$2" ; shift ;;
244264 -h | --help )
245265 printUsage; exit 0 ;;
246266 *)
0 ////////////////////////////////////////////////////////////////////////////////
1 //
2 // The University of Illinois/NCSA
3 // Open Source License (NCSA)
4 //
5 // Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
6 //
7 // Developed by:
8 //
9 // AMD Research and AMD HSA Software Development
10 //
11 // Advanced Micro Devices, Inc.
12 //
13 // www.amd.com
14 //
15 // Permission is hereby granted, free of charge, to any person obtaining a copy
16 // of this software and associated documentation files (the "Software"), to
17 // deal with the Software without restriction, including without limitation
18 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
19 // and/or sell copies of the Software, and to permit persons to whom the
20 // Software is furnished to do so, subject to the following conditions:
21 //
22 // - Redistributions of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimers.
24 // - Redistributions in binary form must reproduce the above copyright
25 // notice, this list of conditions and the following disclaimers in
26 // the documentation and/or other materials provided with the distribution.
27 // - Neither the names of Advanced Micro Devices, Inc,
28 // nor the names of its contributors may be used to endorse or promote
29 // products derived from this Software without specific prior written
30 // permission.
31 //
32 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
33 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
35 // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
36 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
37 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
38 // DEALINGS WITH THE SOFTWARE.
39 //
40 ////////////////////////////////////////////////////////////////////////////////
41
42 /**
43 * Self-contained assembler that uses the LLVM MC API to assemble AMDGCN
44 * instructions
45 */
46
47 #include <llvm/Config/llvm-config.h>
48 #include <llvm/MC/MCAsmBackend.h>
49 #include <llvm/MC/MCAsmInfo.h>
50 #include <llvm/MC/MCCodeEmitter.h>
51 #include <llvm/MC/MCContext.h>
52 #include <llvm/MC/MCInstPrinter.h>
53 #include <llvm/MC/MCInstrInfo.h>
54 #include <llvm/MC/MCObjectFileInfo.h>
55 #include <llvm/MC/MCObjectWriter.h>
56 #include <llvm/MC/MCParser/AsmLexer.h>
57 #include <llvm/MC/MCParser/MCTargetAsmParser.h>
58 #include <llvm/MC/MCRegisterInfo.h>
59 #include <llvm/MC/MCStreamer.h>
60 #include <llvm/MC/MCSubtargetInfo.h>
61 #include <llvm/Support/CommandLine.h>
62 #include <llvm/Support/InitLLVM.h>
63 #include <llvm/Support/MemoryBuffer.h>
64 #include <llvm/Support/SourceMgr.h>
65 #include <llvm/Support/TargetSelect.h>
66 #if LLVM_VERSION_MAJOR > 13
67 #include <llvm/MC/TargetRegistry.h>
68 #else
69 #include <llvm/Support/TargetRegistry.h>
70 #endif
71
72 #include <linux/elf.h>
73 #include "OSWrapper.hpp"
74 #include "Assemble.hpp"
75
76 using namespace llvm;
77
78 Assembler::Assembler(const uint32_t Gfxv) {
79 SetTargetAsic(Gfxv);
80 TextData = nullptr;
81 TextSize = 0;
82 LLVMInit();
83 }
84
85 Assembler::~Assembler() {
86 FlushText();
87 llvm_shutdown();
88 }
89
90 const char* Assembler::GetInstrStream() {
91 return TextData;
92 }
93
94 const size_t Assembler::GetInstrStreamSize() {
95 return TextSize;
96 }
97
98 int Assembler::CopyInstrStream(char* OutBuf, const size_t BufSize) {
99 if (TextSize > BufSize)
100 return -2;
101
102 std::copy(TextData, TextData + TextSize, OutBuf);
103 return 0;
104 }
105
106 const char* Assembler::GetTargetAsic() {
107 return MCPU;
108 }
109
110 /**
111 * Set MCPU via GFX Version from Thunk
112 * LLVM Target IDs use decimal for Maj/Min, hex for Step
113 */
114 void Assembler::SetTargetAsic(const uint32_t Gfxv) {
115 const uint8_t Major = (Gfxv >> 16) & 0xff;
116 const uint8_t Minor = (Gfxv >> 8) & 0xff;
117 const uint8_t Step = Gfxv & 0xff;
118
119 snprintf(MCPU, ASM_MCPU_LEN, "gfx%d%d%x", Major, Minor, Step);
120 }
121
122 /**
123 * Initialize LLVM targets and assembly printers/parsers
124 */
125 void Assembler::LLVMInit() {
126 LLVMInitializeAMDGPUTargetInfo();
127 LLVMInitializeAMDGPUTargetMC();
128 LLVMInitializeAMDGPUAsmParser();
129 }
130
131 /**
132 * Flush/reset TextData and TextSize to initial state
133 */
134 void Assembler::FlushText() {
135 if (TextData)
136 delete[] TextData;
137 TextData = nullptr;
138 TextSize = 0;
139 }
140
141 /**
142 * Print hex of ELF object to stdout (debug)
143 */
144 void Assembler::PrintELFHex(const std::string Data) {
145 outs() << "ASM Info: assembled ELF hex data (length " << Data.length() << "):\n";
146 outs() << "0x00:\t";
147 for (size_t i = 0; i < Data.length(); ++i) {
148 char c = Data[i];
149 outs() << format_hex(static_cast<uint8_t>(c), 4);
150 if ((i+1) % 16 == 0)
151 outs() << "\n" << format_hex(i+1, 4) << ":\t";
152 else
153 outs() << " ";
154 }
155 outs() << "\n";
156 }
157
158 /**
159 * Print hex of raw instruction stream to stdout (debug)
160 */
161 void Assembler::PrintTextHex() {
162 outs() << "ASM Info: assembled .text hex data (length " << TextSize << "):\n";
163 outs() << "0x00:\t";
164 for (size_t i = 0; i < TextSize; i++) {
165 outs() << format_hex(static_cast<uint8_t>(TextData[i]), 4);
166 if ((i+1) % 16 == 0)
167 outs() << "\n" << format_hex(i+1, 4) << ":\t";
168 else
169 outs() << " ";
170 }
171 outs() << "\n";
172 }
173
174 /**
175 * Extract raw instruction stream from .text section in ELF object
176 *
177 * @param RawData Raw C string of ELF object
178 * @return 0 on success
179 */
180 int Assembler::ExtractELFText(const char* RawData) {
181 const Elf64_Ehdr* ElfHeader;
182 const Elf64_Shdr* SectHeader;
183 const Elf64_Shdr* SectStrTable;
184 const char* SectStrAddr;
185 unsigned NumSects, SectIdx;
186
187 if (!(ElfHeader = reinterpret_cast<const Elf64_Ehdr*>(RawData))) {
188 outs() << "ASM Error: elf data is invalid or corrupted\n";
189 return -1;
190 }
191 if (ElfHeader->e_ident[EI_CLASS] != ELFCLASS64) {
192 outs() << "ASM Error: elf object must be of 64-bit type\n";
193 return -1;
194 }
195
196 SectHeader = reinterpret_cast<const Elf64_Shdr*>(RawData + ElfHeader->e_shoff);
197 SectStrTable = &SectHeader[ElfHeader->e_shstrndx];
198 SectStrAddr = static_cast<const char*>(RawData + SectStrTable->sh_offset);
199
200 // Loop through sections, break on .text
201 NumSects = ElfHeader->e_shnum;
202 for (SectIdx = 0; SectIdx < NumSects; SectIdx++) {
203 std::string SectName = std::string(SectStrAddr + SectHeader[SectIdx].sh_name);
204 if (SectName == std::string(".text")) {
205 TextSize = SectHeader[SectIdx].sh_size;
206 TextData = new char[TextSize];
207 memcpy(TextData, RawData + SectHeader[SectIdx].sh_offset, TextSize);
208 break;
209 }
210 }
211
212 if (SectIdx >= NumSects) {
213 outs() << "ASM Error: couldn't locate .text section\n";
214 return -1;
215 }
216
217 return 0;
218 }
219
220 /**
221 * Assemble shader, fill member vars, and copy to output buffer
222 *
223 * @param AssemblySource Shader source represented as a raw C string
224 * @param OutBuf Raw instruction stream output buffer
225 * @param BufSize Size of OutBuf (defaults to PAGE_SIZE)
226 * @return Value of RunAssemble() (0 on success)
227 */
228 int Assembler::RunAssembleBuf(const char* const AssemblySource, char* OutBuf,
229 const size_t BufSize) {
230 int ret = RunAssemble(AssemblySource);
231 return ret ? ret : CopyInstrStream(OutBuf, BufSize);
232 }
233
234 /**
235 * Assemble shader and fill member vars
236 *
237 * @param AssemblySource Shader source represented as a raw C string
238 * @return 0 on success
239 */
240 int Assembler::RunAssemble(const char* const AssemblySource) {
241 // Ensure target ASIC has been set
242 if (!MCPU) {
243 outs() << "ASM Error: target asic is uninitialized\n";
244 return -1;
245 }
246
247 // Delete TextData for any previous runs
248 FlushText();
249
250 #if 0
251 outs() << "ASM Info: running assembly for target: " << MCPU << "\n";
252 outs() << "ASM Info: source:\n";
253 outs() << AssemblySource << "\n";
254 #endif
255
256 // Initialize MCOptions and target triple
257 const MCTargetOptions MCOptions;
258 Triple TheTriple;
259
260 const Target* TheTarget =
261 TargetRegistry::lookupTarget(ArchName, TheTriple, Error);
262 if (!TheTarget) {
263 outs() << Error;
264 return -1;
265 }
266
267 TheTriple.setArchName(ArchName);
268 TheTriple.setVendorName(VendorName);
269 TheTriple.setOSName(OSName);
270
271 TripleName = TheTriple.getTriple();
272 TheTriple.setTriple(Triple::normalize(TripleName));
273
274 // Create MemoryBuffer for assembly source
275 StringRef AssemblyRef(AssemblySource);
276 std::unique_ptr<MemoryBuffer> BufferPtr =
277 MemoryBuffer::getMemBuffer(AssemblyRef, "", false);
278 if (!BufferPtr->getBufferSize()) {
279 outs() << "ASM Error: assembly source is empty\n";
280 return -1;
281 }
282
283 // Instantiate SrcMgr and transfer BufferPtr ownership
284 SourceMgr SrcMgr;
285 SrcMgr.AddNewSourceBuffer(std::move(BufferPtr), SMLoc());
286
287 // Initialize MC interfaces and base class objects
288 std::unique_ptr<const MCRegisterInfo> MRI(
289 TheTarget->createMCRegInfo(TripleName));
290 if (!MRI) {
291 outs() << "ASM Error: no register info for target " << MCPU << "\n";
292 return -1;
293 }
294 #if LLVM_VERSION_MAJOR > 9
295 std::unique_ptr<const MCAsmInfo> MAI(
296 TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions));
297 #else
298 std::unique_ptr<const MCAsmInfo> MAI(
299 TheTarget->createMCAsmInfo(*MRI, TripleName));
300 #endif
301 if (!MAI) {
302 outs() << "ASM Error: no assembly info for target " << MCPU << "\n";
303 return -1;
304 }
305 std::unique_ptr<MCInstrInfo> MCII(
306 TheTarget->createMCInstrInfo());
307 if (!MCII) {
308 outs() << "ASM Error: no instruction info for target " << MCPU << "\n";
309 return -1;
310 }
311 std::unique_ptr<MCSubtargetInfo> STI(
312 TheTarget->createMCSubtargetInfo(TripleName, MCPU, std::string()));
313 if (!STI || !STI->isCPUStringValid(MCPU)) {
314 outs() << "ASM Error: no subtarget info for target " << MCPU << "\n";
315 return -1;
316 }
317
318 // Set up the MCContext for creating symbols and MCExpr's
319 #if LLVM_VERSION_MAJOR > 12
320 MCContext Ctx(TheTriple, MAI.get(), MRI.get(), STI.get(), &SrcMgr, &MCOptions);
321 #else
322 MCObjectFileInfo MOFI;
323 MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr, &MCOptions);
324 MOFI.InitMCObjectFileInfo(TheTriple, true, Ctx);
325 #endif
326
327 // Finalize setup for output object code stream
328 std::string Data;
329 std::unique_ptr<raw_string_ostream> DataStream(std::make_unique<raw_string_ostream>(Data));
330 std::unique_ptr<buffer_ostream> BOS(std::make_unique<buffer_ostream>(*DataStream));
331 raw_pwrite_stream* OS = BOS.get();
332
333 #if LLVM_VERSION_MAJOR > 14
334 MCCodeEmitter* CE = TheTarget->createMCCodeEmitter(*MCII, Ctx);
335 #else
336 MCCodeEmitter* CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx);
337 #endif
338 MCAsmBackend* MAB = TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions);
339
340 std::unique_ptr<MCStreamer> Streamer(TheTarget->createMCObjectStreamer(
341 TheTriple, Ctx,
342 std::unique_ptr<MCAsmBackend>(MAB), MAB->createObjectWriter(*OS),
343 std::unique_ptr<MCCodeEmitter>(CE), *STI, MCOptions.MCRelaxAll,
344 MCOptions.MCIncrementalLinkerCompatible, /*DWARFMustBeAtTheEnd*/ false));
345
346 std::unique_ptr<MCAsmParser> Parser(
347 createMCAsmParser(SrcMgr, Ctx, *Streamer, *MAI));
348
349 // Set parser to target parser and run
350 std::unique_ptr<MCTargetAsmParser> TAP(
351 TheTarget->createMCAsmParser(*STI, *Parser, *MCII, MCOptions));
352 if (!TAP) {
353 outs() << "ASM Error: no assembly parsing support for target " << MCPU << "\n";
354 return -1;
355 }
356 Parser->setTargetParser(*TAP);
357
358 if (Parser->Run(true)) {
359 outs() << "ASM Error: assembly parser failed\n";
360 return -1;
361 }
362
363 BOS.reset();
364 DataStream->flush();
365
366 int ret = ExtractELFText(Data.data());
367 if (ret < 0 || !TextData) {
368 outs() << "ASM Error: .text extraction failed\n";
369 return ret;
370 }
371
372 #if 0
373 PrintELFHex(Data);
374 PrintTextHex();
375 #endif
376
377 return 0;
378 }
0 ////////////////////////////////////////////////////////////////////////////////
1 //
2 // The University of Illinois/NCSA
3 // Open Source License (NCSA)
4 //
5 // Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
6 //
7 // Developed by:
8 //
9 // AMD Research and AMD HSA Software Development
10 //
11 // Advanced Micro Devices, Inc.
12 //
13 // www.amd.com
14 //
15 // Permission is hereby granted, free of charge, to any person obtaining a copy
16 // of this software and associated documentation files (the "Software"), to
17 // deal with the Software without restriction, including without limitation
18 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
19 // and/or sell copies of the Software, and to permit persons to whom the
20 // Software is furnished to do so, subject to the following conditions:
21 //
22 // - Redistributions of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimers.
24 // - Redistributions in binary form must reproduce the above copyright
25 // notice, this list of conditions and the following disclaimers in
26 // the documentation and/or other materials provided with the distribution.
27 // - Neither the names of Advanced Micro Devices, Inc,
28 // nor the names of its contributors may be used to endorse or promote
29 // products derived from this Software without specific prior written
30 // permission.
31 //
32 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
33 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
35 // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
36 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
37 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
38 // DEALINGS WITH THE SOFTWARE.
39 //
40 ////////////////////////////////////////////////////////////////////////////////
41
42 #ifndef _ASSEMBLE_H_
43 #define _ASSEMBLE_H_
44
45 #include "OSWrapper.hpp"
46
47 #define ASM_MCPU_LEN 16
48
49 class Assembler {
50 private:
51 const char* ArchName = "amdgcn";
52 const char* VendorName = "amd";
53 const char* OSName = "amdhsa";
54 char MCPU[ASM_MCPU_LEN];
55
56 std::string TripleName;
57 std::string Error;
58
59 char* TextData;
60 size_t TextSize;
61
62 void SetTargetAsic(const uint32_t Gfxv);
63
64 void LLVMInit();
65 void FlushText();
66 void PrintELFHex(const std::string Data);
67 int ExtractELFText(const char* RawData);
68
69 public:
70 Assembler(const uint32_t Gfxv);
71 ~Assembler();
72
73 void PrintTextHex();
74 const char* GetTargetAsic();
75
76 const char* GetInstrStream();
77 const size_t GetInstrStreamSize();
78 int CopyInstrStream(char* OutBuf, const size_t BufSize = PAGE_SIZE);
79
80 int RunAssemble(const char* const AssemblySource);
81 int RunAssembleBuf(const char* const AssemblySource, char* OutBuf,
82 const size_t BufSize = PAGE_SIZE);
83 };
84
85 #endif // _ASSEMBLE_H_
137137 pgmRsrc2 |= (1 << COMPUTE_PGM_RSRC2__EXCP_EN_MSB__SHIFT)
138138 & COMPUTE_PGM_RSRC2__EXCP_EN_MSB_MASK;
139139
140 const bool priv = (m_FamilyId == FAMILY_GFX11);
140141 const unsigned int COMPUTE_PGM_RSRC[] = {
141 // PGM_RSRC1 = { VGPRS: 16 SGPRS: 16 PRIORITY: m_SpiPriority FLOAT_MODE: c0 PRIV: 0
142 // DX10_CLAMP: 0 DEBUG_MODE: 0 IEEE_MODE: 0 BULKY: 0 CDBG_USER: 0 }
143 0x000c0084 | ((m_SpiPriority & 3) << 10),
142 // PGM_RSRC1 = { VGPRS: 16 SGPRS: 16 PRIORITY: m_SpiPriority FLOAT_MODE: c0
143 // PRIV: 0 (1 for GFX11) DX10_CLAMP: 0 DEBUG_MODE: 0 IEEE_MODE: 0 BULKY: 0 CDBG_USER: 0 }
144 0x000c0084 | ((m_SpiPriority & 3) << 10) | (priv << 20),
144145 pgmRsrc2
145146 };
146147
+0
-126
tests/kfdtest/src/IsaGenerator.cpp less more
0 /*
1 * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
23 #include "IsaGenerator.hpp"
24
25 #include <algorithm>
26 #include <string>
27
28 #include "IsaGenerator_Gfx72.hpp"
29 #include "IsaGenerator_Gfx8.hpp"
30 #include "IsaGenerator_Gfx9.hpp"
31 #include "IsaGenerator_Gfx10.hpp"
32 #include "IsaGenerator_Aldebaran.hpp"
33
34 #include "GoogleTestExtension.hpp"
35
36 #include "sp3.h"
37
38 const std::string IsaGenerator::ADDRESS_WATCH_SP3(
39 "var REG_TRAPSTS_EXCP_MASK = 0x000001ff\n"
40 "var WAVE_COUNT_OFFSET = 12\n"
41 "var TMA_CYCLE_OFFSET = 16\n"
42 "\n"
43 "/*\n"
44 " * ttmp[0:1] -- The ISA address that triggered this trap handler\n"
45 " * ttmp[10:11] -- The TMA user provided, used to store the debug info in this shader\n"
46 " * v[10:14] ttmp[7:8] -- temp use inside this shader\n"
47 " * s5 -- store the counts that this trap been triggered\n"
48 " * Each time when the trap is triggered , this shader will write\n"
49 " * ttmp[0] : ttmp[1] : Trap_Status : [reserved]\n"
50 " * to TMA + (trap count * TMA_CYCLE_OFFSET)\n"
51 " * The TMA + WAVE_COUNT_OFFSET(the first [reserved] address)\n"
52 " * used to store the total triggered trap count.\n"
53 " */\n"
54 "shader main\n"
55 "\n"
56 " asic(VI)\n"
57 "\n"
58 " type(CS)\n"
59 " v_mov_b32 v10, ttmp10\n"
60 " v_mov_b32 v11, ttmp11\n"
61 " s_mov_b32 ttmp7, s5\n"
62 " s_mulk_i32 ttmp7, TMA_CYCLE_OFFSET\n"
63 " s_addk_i32 s5, 1\n"
64 " v_mov_b32 v12, ttmp0\n"
65 " v_add_u32 v10, vcc, ttmp7, v10\n"
66 " flat_store_dword v[10,11], v12 slc glc\n"
67 " v_mov_b32 v12, ttmp1\n"
68 " v_add_u32 v10, vcc, 4, v10\n"
69 " flat_store_dword v[10,11], v12 slc glc\n"
70 " s_getreg_b32 ttmp8, hwreg(HW_REG_TRAPSTS)\n"
71 " s_and_b32 ttmp8, ttmp8, REG_TRAPSTS_EXCP_MASK\n"
72 " v_mov_b32 v12, ttmp8\n"
73 " v_add_u32 v10, vcc, 4, v10\n"
74 " flat_store_dword v[10,11], v12 glc\n"
75 " v_mov_b32 v10, ttmp10\n"
76 " v_add_u32 v10, vcc, WAVE_COUNT_OFFSET, v10\n"
77 " v_mov_b32 v13, 1\n"
78 " flat_atomic_add v14, v[10:11], v13 slc glc\n"
79 " s_and_b32 ttmp1, ttmp1, 0xffff\n"
80 " s_rfe_b64 [ttmp0,ttmp1]\n"
81 "end\n"
82 );
83
84 IsaGenerator* IsaGenerator::Create(unsigned int familyId) {
85 switch (familyId) {
86 case FAMILY_CI:
87 case FAMILY_KV:
88 return new IsaGenerator_Gfx72;
89 case FAMILY_VI:
90 case FAMILY_CZ:
91 return new IsaGenerator_Gfx8;
92 case FAMILY_AI:
93 case FAMILY_RV:
94 case FAMILY_AR:
95 return new IsaGenerator_Gfx9;
96 case FAMILY_AL:
97 return new IsaGenerator_Aldbrn;
98 case FAMILY_NV:
99 return new IsaGenerator_Gfx10;
100
101 default:
102 LOG() << "Error: Invalid ISA" << std::endl;
103 return NULL;
104 }
105 }
106
107 void IsaGenerator::GetAwTrapHandler(HsaMemoryBuffer& rBuf) {
108 CompileShader(ADDRESS_WATCH_SP3.c_str(), "main", rBuf);
109 }
110
111 void IsaGenerator::CompileShader(const char* shaderCode, const char* shaderName, HsaMemoryBuffer& rBuf) {
112 sp3_context* pSp3 = sp3_new();
113 sp3_setasic(pSp3, GetAsicName().c_str());
114 sp3_parse_string(pSp3, shaderCode);
115 sp3_shader* pShader = sp3_compile(pSp3, shaderName);
116
117 std::copy(pShader->data, pShader->data + pShader->size, rBuf.As<unsigned int*>());
118 sp3_free_shader(pShader);
119
120 /** Inside this close function, there is an unknown reason of free memory not used by compiler.
121 * Comment out this as a workaround. System will do the garbage collection after this
122 * application is closed.
123 */
124 // sp3_close(pSp3);
125 }
+0
-52
tests/kfdtest/src/IsaGenerator.hpp less more
0 /*
1 * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
23 #ifndef _ISAGENERATOR_H_
24 #define _ISAGENERATOR_H_
25
26 #include "KFDTestUtil.hpp"
27
28 /* isa generation class - interface */
29 class IsaGenerator {
30 public:
31 static IsaGenerator* Create(unsigned int familyId);
32
33 virtual ~IsaGenerator() {}
34
35 virtual void GetNoopIsa(HsaMemoryBuffer& rBuf) = 0;
36 virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf) = 0;
37 virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) = 0;
38 virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf) = 0;
39 virtual void GetCwsrTrapHandler(HsaMemoryBuffer& rBuf) {}
40 virtual void GetAwTrapHandler(HsaMemoryBuffer& rBuf);
41
42 void CompileShader(const char* shaderCode, const char* shaderName, HsaMemoryBuffer& rBuf);
43
44 protected:
45 virtual const std::string& GetAsicName() = 0;
46
47 private:
48 static const std::string ADDRESS_WATCH_SP3;
49 };
50
51 #endif // _ISAGENERATOR_H_
+0
-113
tests/kfdtest/src/IsaGenerator_Aldebaran.cpp less more
0 /*
1 * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
23 #include "IsaGenerator_Aldebaran.hpp"
24
25 #include <algorithm>
26 #include <string>
27
28 const std::string IsaGenerator_Aldbrn::ASIC_NAME = "ALDEBARAN";
29
30 /* The binaries are generated from following ISA */
31 #if 0
32 /* flat_atomic_inc will not support by some PCIE, use flat_atomic_add instead */
33 shader atomic_add
34 asic(ALDEBARAN)
35 type(CS)
36 v_mov_b32 v0, s0
37 v_mov_b32 v1, s1
38 v_mov_b32 v2, 1
39 flat_atomic_add v3, v[0:1], v2 slc glc scc
40 s_waitcnt 0
41 s_endpgm
42 end
43
44 shader copy_dword
45 asic(ALDEBARAN)
46 type(CS)
47 /* copy the parameters from scalar registers to vector registers */
48 v_mov_b32 v0, s0
49 v_mov_b32 v1, s1
50 v_mov_b32 v2, s2
51 v_mov_b32 v3, s3
52 /* copy a dword between the passed addresses */
53 flat_load_dword v4, v[0:1] slc glc
54 s_waitcnt 0
55 flat_store_dword v[2:3], v4 slc glc
56 s_endpgm
57 end
58
59 shader main
60 asic(ALDEBARAN)
61 type(CS)
62 loop:
63 s_branch loop
64 s_endpgm
65 end
66
67
68 #endif
69
70 const uint32_t IsaGenerator_Aldbrn::NOOP_ISA[] = {
71 0xbf810000
72 };
73
74 const uint32_t IsaGenerator_Aldbrn::COPY_DWORD_ISA[] = {
75 0x7e000200, 0x7e020201,
76 0x7e040202, 0x7e060203,
77 0xdc530000, 0x047f0000,
78 0xbf8c0000, 0xdc730000,
79 0x007f0402, 0xbf810000
80 };
81
82 const uint32_t IsaGenerator_Aldbrn::INFINITE_LOOP_ISA[] = {
83 0xbf82ffff, 0xbf810000
84 };
85
86 const uint32_t IsaGenerator_Aldbrn::ATOMIC_ADD_ISA[] = {
87 0x7e000200, 0x7e020201,
88 0x7e040281, 0xdf0b0000,
89 0x037f0200, 0xbf8c0000,
90 0xbf810000, 0x00000000
91 };
92
93 void IsaGenerator_Aldbrn::GetNoopIsa(HsaMemoryBuffer& rBuf) {
94 std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>());
95 }
96
97 void IsaGenerator_Aldbrn::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
98 std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>());
99 }
100
101 void IsaGenerator_Aldbrn::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
102 std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>());
103 }
104
105 void IsaGenerator_Aldbrn::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
106 std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As<uint32_t*>());
107 }
108
109 const std::string& IsaGenerator_Aldbrn::GetAsicName() {
110 return ASIC_NAME;
111 }
112
+0
-49
tests/kfdtest/src/IsaGenerator_Aldebaran.hpp less more
0 /*
1 * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
23 #ifndef _ISAGENERATOR_ALDEBARAN_H_
24 #define _ISAGENERATOR_ALDEBARAN_H_
25
26 #include <string>
27 #include "IsaGenerator.hpp"
28
29 class IsaGenerator_Aldbrn : public IsaGenerator {
30 public:
31 virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
32 virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
33 virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
34 virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);
35
36 protected:
37 virtual const std::string& GetAsicName();
38
39 private:
40 static const std::string ASIC_NAME;
41
42 static const uint32_t NOOP_ISA[];
43 static const uint32_t COPY_DWORD_ISA[];
44 static const uint32_t INFINITE_LOOP_ISA[];
45 static const uint32_t ATOMIC_ADD_ISA[];
46 };
47
48 #endif // _ISAGENERATOR_ALDEBARAN_H_
+0
-142
tests/kfdtest/src/IsaGenerator_Gfx10.cpp less more
0 /*
1 * Copyright (C) 2019 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
23 #include "IsaGenerator_Gfx10.hpp"
24
25 #include <algorithm>
26 #include <string>
27
28 /* The binaries are generated from following ISA */
29 const std::string IsaGenerator_Gfx10::ASIC_NAME = "GFX10";
30 #if 0
31 static const char * atomic_add = \
32 "\
33 shader atomic_add \n\
34 asic(GFX10) \n\
35 wave_size(32) \n\
36 type(CS) \n\
37 v_mov_b32 v0, s0 \n\
38 v_mov_b32 v1, s1 \n\
39 v_mov_b32 v2, 1 \n\
40 flat_atomic_add v3, v[0:1], v2 slc glc \n\
41 s_waitcnt 0 \n\
42 s_endpgm \n\
43 end \n\
44 ";
45
46 static const char * copy_dword = \
47 "\
48 shader copy_dword \n\
49 asic(GFX10) \n\
50 wave_size(32) \n\
51 type(CS) \n\
52 v_mov_b32 v0, s0 \n\
53 v_mov_b32 v1, s1 \n\
54 v_mov_b32 v2, s2 \n\
55 v_mov_b32 v3, s3 \n\
56 flat_load_dword v4, v[0:1] slc glc \n\
57 s_waitcnt 0 \n\
58 flat_store_dword v[2:3], v4 slc glc \n\
59 s_endpgm \n\
60 end \n\
61 ";
62
63 static const char * loop= \
64 "\
65 shader loop \n\
66 asic(GFX10) \n\
67 type(CS) \n\
68 wave_size(32) \n\
69 loop: \n\
70 s_branch loop \n\
71 s_endpgm \n\
72 end \n\
73 ";
74
75 static const char * noop= \
76 "\
77 shader noop \n\
78 asic(GFX10) \n\
79 type(CS) \n\
80 wave_size(32) \n\
81 s_endpgm \n\
82 end \n\
83 ";
84 #endif
85
/* Hand-assembled GFX10 binaries for the reference listings disabled above.
 * 0xbf810000 is s_endpgm and 0xbf8c0000 is s_waitcnt 0 (matching the
 * commented listings). The repeated trailing 0xbf9f0000 words are likely
 * end-of-program padding required on GFX10 — confirm against the RDNA ISA
 * guide. NOTE(review): the leading 0xb0804004 word is common to all four
 * binaries; presumably a wave/mode setup encoding — confirm.
 */
const uint32_t IsaGenerator_Gfx10::NOOP_ISA[] = {
    0xb0804004, 0xbf810000,
    0xbf9f0000, 0xbf9f0000,
    0xbf9f0000, 0xbf9f0000,
    0xbf9f0000
};

// Binary for the copy_dword shader above: loads a dword from the address in
// s[0:1] and stores it to the address in s[2:3].
const uint32_t IsaGenerator_Gfx10::COPY_DWORD_ISA[] = {
    0xb0804004, 0x7e000200,
    0x7e020201, 0x7e040202,
    0x7e060203, 0xdc330000,
    0x47d0000, 0xbf8c0000,
    0xdc730000, 0x7d0402,
    0xbf810000, 0xbf9f0000,
    0xbf9f0000, 0xbf9f0000,
    0xbf9f0000, 0xbf9f0000
};

// Binary for the loop shader above: 0xbf82ffff is s_branch back to itself.
const uint32_t IsaGenerator_Gfx10::INFINITE_LOOP_ISA[] = {
    0xbf82ffff, 0xb0804004,
    0xbf810000, 0xbf9f0000,
    0xbf9f0000, 0xbf9f0000,
    0xbf9f0000, 0xbf9f0000
};

// Binary for the atomic_add shader above: flat_atomic_add of 1 to the
// address in s[0:1].
const uint32_t IsaGenerator_Gfx10::ATOMIC_ADD_ISA[] = {
    0xb0804004, 0x7e000200,
    0x7e020201, 0x7e040281,
    0xdccb0000, 0x37d0200,
    0xbf8c0000, 0xbf810000,
    0xbf9f0000, 0xbf9f0000,
    0xbf9f0000, 0xbf9f0000,
    0xbf9f0000
};
120
121
122 void IsaGenerator_Gfx10::GetNoopIsa(HsaMemoryBuffer& rBuf) {
123 std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>());
124 }
125
126 void IsaGenerator_Gfx10::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
127 std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>());
128 }
129
130 void IsaGenerator_Gfx10::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
131 std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>());
132 }
133
134 void IsaGenerator_Gfx10::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
135 std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As<uint32_t*>());
136 }
137
/* Returns the ASIC family string ("GFX10") identifying this generator. */
const std::string& IsaGenerator_Gfx10::GetAsicName() {
    return ASIC_NAME;
}
141
+0
-49
tests/kfdtest/src/IsaGenerator_Gfx10.hpp less more
0 /*
1 * Copyright (C) 2019 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
#ifndef _ISAGENERATOR_GFX10_H_
#define _ISAGENERATOR_GFX10_H_

#include <string>
#include "IsaGenerator.hpp"

/* IsaGenerator specialization that supplies hand-assembled GFX10 shader
 * binaries (no-op, copy-dword, infinite-loop, atomic-add) to kfdtest. */
class IsaGenerator_Gfx10 : public IsaGenerator {
 public:
    // Each Get*Isa copies the corresponding precompiled binary into rBuf.
    virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
    virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
    virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
    virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);

 protected:
    // Returns ASIC_NAME ("GFX10").
    virtual const std::string& GetAsicName();

 private:
    static const std::string ASIC_NAME;

    // Precompiled shader binaries; defined in IsaGenerator_Gfx10.cpp.
    static const uint32_t NOOP_ISA[];
    static const uint32_t COPY_DWORD_ISA[];
    static const uint32_t INFINITE_LOOP_ISA[];
    static const uint32_t ATOMIC_ADD_ISA[];
};

#endif  // _ISAGENERATOR_GFX10_H_
+0
-123
tests/kfdtest/src/IsaGenerator_Gfx72.cpp less more
0 /*
1 * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
23 #include "IsaGenerator_Gfx72.hpp"
24
25 #include <algorithm>
26 #include <string>
27
const std::string IsaGenerator_Gfx72::ASIC_NAME = "CI";  // value returned by GetAsicName()

// Minimal no-op program: a single s_endpgm.
const uint32_t IsaGenerator_Gfx72::NOOP_ISA[] = {
    0xbf810000  // S_ENDPGM
};

/* The below arrays are filled with hex values in order not to reference
 * proprietary header files, but we still leave the code here for future
 * reference.
 */
38 #if 0
39 const uint32_t IsaGenerator_Gfx72::COPY_DWORD_ISA[] = {
40 (63u << SQ_VOP1__ENCODING__SHIFT) | (0 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v0, s0 (VOP1)
41 (63u << SQ_VOP1__ENCODING__SHIFT) | (1 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v1, s1 (VOP1)
42 (63u << SQ_VOP1__ENCODING__SHIFT) | (2 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (2 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v2, s2 (VOP1)
43 (63u << SQ_VOP1__ENCODING__SHIFT) | (3 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (3 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v3, s3 (VOP1)
44
45 (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_LOAD_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT)/*(3 << 16)*/, // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0)
46 (4u << SQ_FLAT_1__VDST__SHIFT) | (0 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V0:V1, VDST = V4 (FLAT_1)
47
48 (383u << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_WAITCNT << SQ_SOPP__OP__SHIFT) | (0 << SQ_SOPP__SIMM16__SHIFT), // s_waitcnt 0 (SOPP)
49
50 (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_STORE_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT), // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0)
51 (4u << SQ_FLAT_1__DATA__SHIFT) | (2 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V2:V3, DATA = V4 (FLAT_1)
52
53 0xBF810000u // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
54 };
55
56 const uint32_t IsaGenerator_Gfx72::INFINITE_LOOP_ISA[] = {
57 (0x17F << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_BRANCH << SQ_SOPP__OP__SHIFT) | ( (const uint32_t)-1 << SQ_SOPP__SIMM16__SHIFT), // s_branch -1 (PC <- PC + SIMM*4)+4
58 0xBF810000u // S_ENDPGM
59 };
60
61 const uint32_t IsaGenerator_Gfx72::ATOMIC_INC_ISA[] = {
62 (63u << SQ_VOP1__ENCODING__SHIFT) | (0 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v0, s0 (VOP1)
63 (63u << SQ_VOP1__ENCODING__SHIFT) | (1 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v1, s1 (VOP1)
64 (63u << SQ_VOP1__ENCODING__SHIFT) | (2 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0xC1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 0xFFFFFFFF, s2 (VOP1)
65
66 (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_ATOMIC_INC << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (0 << SQ_FLAT_0__GLC__SHIFT), // SQ_FLAT_0, flat_atomic_inc, slc = 1, glc = 0 (FLAT_0)
67 (3u << SQ_FLAT_1__VDST__SHIFT) | (2u << SQ_FLAT_1__DATA__SHIFT) | (0 << SQ_FLAT_1__ADDR__SHIFT), // ADDR/dst = V0:V1, VDST/ret = V3, DATA/src=V2 (FLAT_1)
68 0xBF810000u // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
69 };
70 #endif
71
// Hand-assembled CI (GFX7) binaries; the disabled listing above shows the
// symbolic encodings they were derived from.

// Loads a dword from the address in s[0:1] and stores it to s[2:3].
const uint32_t IsaGenerator_Gfx72::COPY_DWORD_ISA[] = {
    0x7e000200,  // v_mov_b32 v0, s0 (VOP1)
    0x7e020201,  // v_mov_b32 v1, s1 (VOP1)
    0x7e040202,  // v_mov_b32 v2, s2 (VOP1)
    0x7e060203,  // v_mov_b32 v3, s3 (VOP1)

    0xdc330000,  // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0)
    0x04000000,  // ADDR = V0:V1, VDST = V4 (FLAT_1)

    0xbf8c0000,  // s_waitcnt 0 (SOPP)

    0xdc730000,  // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0)
    0x00000402,  // ADDR = V2:V3, DATA = V4 (FLAT_1)

    0xbf810000   // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
};

// Branches back to itself forever; used to keep waves resident.
const uint32_t IsaGenerator_Gfx72::INFINITE_LOOP_ISA[] = {
    0xbf82ffff,  // s_branch -1 (PC <- PC + SIMM*4)+4
    0xbf810000   // S_ENDPGM
};

// Atomically increments the dword at the address in s[0:1].
const uint32_t IsaGenerator_Gfx72::ATOMIC_INC_ISA[] = {
    0x7e000200,  // v_mov_b32 v0, s0 (VOP1)
    0x7e020201,  // v_mov_b32 v1, s1 (VOP1)
    0x7e0402c1,  // v_mov_b32 0xFFFFFFFF, s2 (VOP1)

    0xdcf20000,  // SQ_FLAT_0, flat_atomic_inc, slc = 1, glc = 0 (FLAT_0)
    0x03000200,  // ADDR/dst = V0:V1, VDST/ret = V3, DATA/src=V2 (FLAT_1)
    0xbf810000   // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
};
103
104 void IsaGenerator_Gfx72::GetNoopIsa(HsaMemoryBuffer& rBuf) {
105 std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>());
106 }
107
108 void IsaGenerator_Gfx72::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
109 std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>());
110 }
111
112 void IsaGenerator_Gfx72::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
113 std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>());
114 }
115
116 void IsaGenerator_Gfx72::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
117 std::copy(ATOMIC_INC_ISA, ATOMIC_INC_ISA+ARRAY_SIZE(ATOMIC_INC_ISA), rBuf.As<uint32_t*>());
118 }
119
/* Returns the ASIC family string ("CI") identifying this generator. */
const std::string& IsaGenerator_Gfx72::GetAsicName() {
    return ASIC_NAME;
}
+0
-49
tests/kfdtest/src/IsaGenerator_Gfx72.hpp less more
0 /*
1 * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
#ifndef _ISAGENERATOR_GFX72_H_
#define _ISAGENERATOR_GFX72_H_

#include <string>
#include "IsaGenerator.hpp"

/* IsaGenerator specialization that supplies hand-assembled CI (GFX7)
 * shader binaries (no-op, copy-dword, infinite-loop, atomic-inc). */
class IsaGenerator_Gfx72 : public IsaGenerator {
 public:
    // Each Get*Isa copies the corresponding precompiled binary into rBuf.
    virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
    virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
    virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
    virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);

 protected:
    // Returns ASIC_NAME ("CI").
    virtual const std::string& GetAsicName();

 private:
    static const std::string ASIC_NAME;

    // Precompiled shader binaries; defined in IsaGenerator_Gfx72.cpp.
    static const uint32_t NOOP_ISA[];
    static const uint32_t COPY_DWORD_ISA[];
    static const uint32_t INFINITE_LOOP_ISA[];
    static const uint32_t ATOMIC_INC_ISA[];
};

#endif  // _ISAGENERATOR_GFX72_H_
+0
-128
tests/kfdtest/src/IsaGenerator_Gfx8.cpp less more
0 /*
1 * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
23 #include "IsaGenerator_Gfx8.hpp"
24
25 #include <algorithm>
26 #include <string>
27
const std::string IsaGenerator_Gfx8::ASIC_NAME = "VI";  // value returned by GetAsicName()

// Minimal no-op program: a single s_endpgm.
const uint32_t IsaGenerator_Gfx8::NOOP_ISA[] = {
    0xbf810000  // S_ENDPGM
};

/** The below arrays are filled with hex values in order not to reference
 * proprietary header files, but we still leave the code here for future
 * reference.
 */
38 #if 0
39 const uint32_t IsaGenerator_Gfx8::COPY_DWORD_ISA[] = {
40 (63u << SQ_VOP1__ENCODING__SHIFT) | (0 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v0, s0 (VOP1)
41 (63u << SQ_VOP1__ENCODING__SHIFT) | (1 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v1, s1 (VOP1)
42 (63u << SQ_VOP1__ENCODING__SHIFT) | (2 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (2 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v2, s2 (VOP1)
43 (63u << SQ_VOP1__ENCODING__SHIFT) | (3 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (3 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v3, s3 (VOP1)
44
45 (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_LOAD_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT)/*(3 << 16)*/, // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0)
46 (4u << SQ_FLAT_1__VDST__SHIFT) | (0 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V0:V1, VDST = V4 (FLAT_1)
47
48 (383u << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_WAITCNT << SQ_SOPP__OP__SHIFT) | (0 << SQ_SOPP__SIMM16__SHIFT), // s_waitcnt 0 (SOPP)
49
50 (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_STORE_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT), // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0)
51 (4u << SQ_FLAT_1__DATA__SHIFT) | (2 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V2:V3, DATA = V4 (FLAT_1)
52
53 0xBF810000u // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
54 };
55
56 const uint32_t IsaGenerator_Gfx8::INFINITE_LOOP_ISA[] = {
57 (0x17F << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_BRANCH << SQ_SOPP__OP__SHIFT) | ( (const uint32_t)-1 << SQ_SOPP__SIMM16__SHIFT), // s_branch -1 (PC <- PC + SIMM*4)+4
58 0xBF810000u // S_ENDPGM
59 };
60 #endif
61
// Hand-assembled VI (GFX8) binaries; the disabled listing above shows the
// symbolic encodings they were derived from.

// Loads a dword from the address in s[0:1] and stores it to s[2:3].
const uint32_t IsaGenerator_Gfx8::COPY_DWORD_ISA[] = {
    0x7e000200,  // v_mov_b32 v0, s0 (VOP1)
    0x7e020201,  // v_mov_b32 v1, s1 (VOP1)
    0x7e040202,  // v_mov_b32 v2, s2 (VOP1)
    0x7e060203,  // v_mov_b32 v3, s3 (VOP1)

    0xdc530000,  // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0)
    0x04000000,  // ADDR = V0:V1, VDST = V4 (FLAT_1)

    0xbf8c0000,  // s_waitcnt 0 (SOPP)

    0xdc730000,  // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0)
    0x00000402,  // ADDR = V2:V3, DATA = V4 (FLAT_1)

    0xbf810000   // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
};

// Branches back to itself forever; used to keep waves resident.
const uint32_t IsaGenerator_Gfx8::INFINITE_LOOP_ISA[] = {
    0xbf82ffff,  // s_branch -1 (PC <- PC + SIMM*4)+4
    0xbf810000   // S_ENDPGM
};
83
/**
 * The atomic_add_isa binary is generated from the following ISA.
 * The original atomic_inc is not supported by some PCIe configurations,
 * so atomic_add is used instead.
 */
/*
shader atomic_add
asic(VI)
type(CS)
v_mov_b32 v0, s0
v_mov_b32 v1, s1
v_mov_b32 v2, 1
flat_atomic_add v3, v[0:1], v2 slc glc
s_waitcnt 0
s_endpgm
end
*/

// NOTE(review): the trailing 0x00000000 word appears to be padding after
// s_endpgm (0xbf810000) — confirm against the listing above.
const uint32_t IsaGenerator_Gfx8::ATOMIC_ADD_ISA[] = {
    0x7e000200, 0x7e020201,
    0x7e040281, 0xdd0b0000,
    0x03000200, 0xbf8c0000,
    0xbf810000, 0x00000000
};
108
109 void IsaGenerator_Gfx8::GetNoopIsa(HsaMemoryBuffer& rBuf) {
110 std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>());
111 }
112
113 void IsaGenerator_Gfx8::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
114 std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>());
115 }
116
117 void IsaGenerator_Gfx8::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
118 std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>());
119 }
120
121 void IsaGenerator_Gfx8::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
122 std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As<uint32_t*>());
123 }
124
/* Returns the ASIC family string ("VI") identifying this generator. */
const std::string& IsaGenerator_Gfx8::GetAsicName() {
    return ASIC_NAME;
}
+0
-49
tests/kfdtest/src/IsaGenerator_Gfx8.hpp less more
0 /*
1 * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
#ifndef _ISAGENERATOR_GFX8_H_
#define _ISAGENERATOR_GFX8_H_

#include <string>
#include "IsaGenerator.hpp"

/* IsaGenerator specialization that supplies hand-assembled VI (GFX8)
 * shader binaries (no-op, copy-dword, infinite-loop, atomic-add). */
class IsaGenerator_Gfx8 : public IsaGenerator {
 public:
    // Each Get*Isa copies the corresponding precompiled binary into rBuf.
    virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
    virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
    virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
    virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);

 protected:
    // Returns ASIC_NAME ("VI").
    virtual const std::string& GetAsicName();

 private:
    static const std::string ASIC_NAME;

    // Precompiled shader binaries; defined in IsaGenerator_Gfx8.cpp.
    static const uint32_t NOOP_ISA[];
    static const uint32_t COPY_DWORD_ISA[];
    static const uint32_t INFINITE_LOOP_ISA[];
    static const uint32_t ATOMIC_ADD_ISA[];
};

#endif  // _ISAGENERATOR_GFX8_H_
+0
-113
tests/kfdtest/src/IsaGenerator_Gfx9.cpp less more
0 /*
1 * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
23 #include "IsaGenerator_Gfx9.hpp"
24
25 #include <algorithm>
26 #include <string>
27
const std::string IsaGenerator_Gfx9::ASIC_NAME = "GFX9";  // value returned by GetAsicName()
29
30 /* The binaries are generated from following ISA */
31 #if 0
32 /* flat_atomic_inc will not support by some PCIE, use flat_atomic_add instead */
33 shader atomic_add
34 asic(GFX9)
35 type(CS)
36 v_mov_b32 v0, s0
37 v_mov_b32 v1, s1
38 v_mov_b32 v2, 1
39 flat_atomic_add v3, v[0:1], v2 slc glc
40 s_waitcnt 0
41 s_endpgm
42 end
43
44 shader copy_dword
45 asic(GFX9)
46 type(CS)
47 /* copy the parameters from scalar registers to vector registers */
48 v_mov_b32 v0, s0
49 v_mov_b32 v1, s1
50 v_mov_b32 v2, s2
51 v_mov_b32 v3, s3
52 /* copy a dword between the passed addresses */
53 flat_load_dword v4, v[0:1] slc glc
54 s_waitcnt 0
55 flat_store_dword v[2:3], v4 slc glc
56 s_endpgm
57 end
58
59 shader main
60 asic(GFX9)
61 type(CS)
62 loop:
63 s_branch loop
64 s_endpgm
65 end
66
67
68 #endif
69
/* Hand-assembled GFX9 binaries for the reference listings disabled above.
 * Per the commented listings in the sibling GFX8 generator, 0xbf810000 is
 * s_endpgm, 0xbf8c0000 is s_waitcnt 0, and 0xbf82ffff is s_branch -1. */

// Minimal no-op program: a single s_endpgm.
const uint32_t IsaGenerator_Gfx9::NOOP_ISA[] = {
    0xbf810000
};

// Binary for copy_dword above: loads a dword from the address in s[0:1]
// and stores it to the address in s[2:3].
const uint32_t IsaGenerator_Gfx9::COPY_DWORD_ISA[] = {
    0x7e000200, 0x7e020201,
    0x7e040202, 0x7e060203,
    0xdc530000, 0x047f0000,
    0xbf8c0000, 0xdc730000,
    0x007f0402, 0xbf810000
};

// Branches back to itself forever; used to keep waves resident.
const uint32_t IsaGenerator_Gfx9::INFINITE_LOOP_ISA[] = {
    0xbf82ffff, 0xbf810000
};

// Binary for atomic_add above: flat_atomic_add of 1 to the address in
// s[0:1]. NOTE(review): trailing 0x00000000 looks like padding — confirm.
const uint32_t IsaGenerator_Gfx9::ATOMIC_ADD_ISA[] = {
    0x7e000200, 0x7e020201,
    0x7e040281, 0xdd0b0000,
    0x037f0200, 0xbf8c0000,
    0xbf810000, 0x00000000
};
92
93 void IsaGenerator_Gfx9::GetNoopIsa(HsaMemoryBuffer& rBuf) {
94 std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>());
95 }
96
97 void IsaGenerator_Gfx9::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
98 std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>());
99 }
100
101 void IsaGenerator_Gfx9::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
102 std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>());
103 }
104
105 void IsaGenerator_Gfx9::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
106 std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As<uint32_t*>());
107 }
108
/* Returns the ASIC family string ("GFX9") identifying this generator. */
const std::string& IsaGenerator_Gfx9::GetAsicName() {
    return ASIC_NAME;
}
112
+0
-49
tests/kfdtest/src/IsaGenerator_Gfx9.hpp less more
0 /*
1 * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
#ifndef _ISAGENERATOR_GFX9_H_
#define _ISAGENERATOR_GFX9_H_

#include <string>
#include "IsaGenerator.hpp"

/* IsaGenerator specialization that supplies hand-assembled GFX9 shader
 * binaries (no-op, copy-dword, infinite-loop, atomic-add). */
class IsaGenerator_Gfx9 : public IsaGenerator {
 public:
    // Each Get*Isa copies the corresponding precompiled binary into rBuf.
    virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
    virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
    virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
    virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);

 protected:
    // Returns ASIC_NAME ("GFX9").
    virtual const std::string& GetAsicName();

 private:
    static const std::string ASIC_NAME;

    // Precompiled shader binaries; defined in IsaGenerator_Gfx9.cpp.
    static const uint32_t NOOP_ISA[];
    static const uint32_t COPY_DWORD_ISA[];
    static const uint32_t INFINITE_LOOP_ISA[];
    static const uint32_t ATOMIC_ADD_ISA[];
};

#endif  // _ISAGENERATOR_GFX9_H_
0 /*
1 * Copyright (C) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
23 #include "GoogleTestExtension.hpp"
24 #include "KFDASMTest.hpp"
25 #include "ShaderStore.hpp"
26 #include "Assemble.hpp"
27
// No per-test fixture state: setup and teardown are intentionally empty.
void KFDASMTest::SetUp() {}
void KFDASMTest::TearDown() {}
30
// GFX target versions to exercise with the assembler.
// NOTE(review): values appear to encode (major << 16 | minor << 8 | stepping),
// e.g. 0x080001 ~ gfx801, 0x0a0304 ~ gfx1034 — confirm against the
// Assembler constructor's target encoding.
static const std::vector<uint32_t> TargetList = {
    0x080001,
    0x080002,
    0x080003,
    0x080005,
    0x080100,
    0x090000,
    0x090002,
    0x090004,
    0x090006,
    0x090008,
    0x090009,
    0x09000a,
    0x09000c,
    0x0a0100,
    0x0a0101,
    0x0a0102,
    0x0a0103,
    0x0a0300,
    0x0a0301,
    0x0a0302,
    0x0a0303,
    0x0a0304,
    0x0a0305,
    0x0a0306,
};
57
/* Smoke test: for every target in TargetList, constructs an Assembler and
 * verifies that every shader in ShaderStore's ShaderList assembles
 * successfully. No GPU is needed — only the embedded assembler. */
TEST_F(KFDASMTest, AssembleShaders) {
    TEST_START(TESTPROFILE_RUNALL)

    for (auto &t : TargetList) {
        Assembler asmblr(t);

        LOG() << "Running ASM test for target " << asmblr.GetTargetAsic() << std::endl;

        // Every shader must assemble cleanly for every target.
        for (auto &s : ShaderList) {
            EXPECT_SUCCESS(asmblr.RunAssemble(s));
        }
    }

    TEST_END
}
0 /*
1 * Copyright (C) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
23 #ifndef __KFD_ASM_TEST__H__
24 #define __KFD_ASM_TEST__H__
25
26 #include <gtest/gtest.h>
27
28 class KFDASMTest : public testing::Test {
29 public:
30 KFDASMTest() {}
31 ~KFDASMTest() {}
32
33 protected:
34 virtual void SetUp();
35 virtual void TearDown();
36 };
37
38 #endif // __KFD_ASM_TEST__H__
6767
6868 g_baseTest = this;
6969
70 m_pAsm = new Assembler(GetGfxVersion(nodeProperties));
71
7072 ROUTINE_END
7173 }
7274
8486 EXPECT_SUCCESS(hsaKmtReleaseSystemProperties());
8587 EXPECT_SUCCESS(hsaKmtCloseKFD());
8688 g_baseTest = NULL;
89
90 if (m_pAsm)
91 delete m_pAsm;
92 m_pAsm = nullptr;
8793
8894 ROUTINE_END
8995 }
3333 #include "hsakmt.h"
3434 #include "OSWrapper.hpp"
3535 #include "KFDTestUtil.hpp"
36 #include "Assemble.hpp"
37 #include "ShaderStore.hpp"
3638
3739 // @class KFDBaseComponentTest
3840 class KFDBaseComponentTest : public testing::Test {
7375 HsaMemFlags m_MemoryFlags;
7476 HsaNodeInfo m_NodeInfo;
7577 HSAint32 m_xnack;
78 Assembler* m_pAsm;
7679
7780 // @brief Executed before every test that uses KFDBaseComponentTest class and sets all common settings for the tests.
7881 virtual void SetUp();
2323 #include "KFDCWSRTest.hpp"
2424 #include "Dispatch.hpp"
2525
26
27 /* Initial state:
28 * s[0:1] - 64 bits iteration number; only the lower 32 bits are useful.
29 * s[2:3] - result buffer base address
30 * s4 - workgroup id
31 * v0 - workitem id, always 0 because
32 * NUM_THREADS_X(number of threads) in workgroup set to 1
33 * Registers:
34 * v0 - calculated workitem = v0 + s4 * NUM_THREADS_X, which is s4
35 * v2 - = s0, 32 bits iteration number
36 * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
37 * v6 - counter
38 */
39
40 static const char* iterate_isa_gfx8 = \
41 "\
42 shader iterate_isa\n\
43 wave_size(32)\n\
44 type(CS)\n\
45 // copy the parameters from scalar registers to vector registers\n\
46 v_mov_b32 v2, s0 // v[2:3] = s[0:1] \n\
47 v_mov_b32 v3, s1 // v[2:3] = s[0:1] \n\
48 v_mov_b32 v0, s4 // use workgroup id as index \n\
49 v_lshlrev_b32 v0, 2, v0 // v0 *= 4 \n\
50 v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 \n\
51 v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 \n\
52 v_add_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 \n\
53 v_mov_b32 v6, 0 \n\
54 LOOP: \n\
55 v_add_u32 v6, vcc, 1, v6 \n\
56 // compare the result value (v6) to iteration value (v2), and \n\
57 // jump if equal (i.e. if VCC is not zero after the comparison) \n\
58 v_cmp_lt_u32 vcc, v6, v2 \n\
59 s_cbranch_vccnz LOOP \n\
60 flat_store_dword v[4:5], v6 \n\
61 s_waitcnt vmcnt(0)&lgkmcnt(0) \n\
62 s_endpgm \n\
63 end \n\
64 ";
65
66 //This shader can be used by gfx9 and gfx10
67 static const char* iterate_isa_gfx9 = \
68 "\
69 shader iterate_isa\n\
70 wave_size(32)\n\
71 type(CS)\n\
72 // copy the parameters from scalar registers to vector registers\n\
73 v_mov_b32 v2, s0 // v[2:3] = s[0:1] \n\
74 v_mov_b32 v3, s1 // v[2:3] = s[0:1] \n\
75 v_mov_b32 v0, s4 // use workgroup id as index \n\
76 v_lshlrev_b32 v0, 2, v0 // v0 *= 4 \n\
77 v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 \n\
78 v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 \n\
79 v_add_co_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 \n\
80 v_mov_b32 v6, 0 \n\
81 LOOP: \n\
82 v_add_co_u32 v6, vcc, 1, v6 \n\
83 // compare the result value (v6) to iteration value (v2), and \n\
84 // jump if equal (i.e. if VCC is not zero after the comparison) \n\
85 v_cmp_lt_u32 vcc, v6, v2 \n\
86 s_cbranch_vccnz LOOP \n\
87 flat_store_dword v[4:5], v6 \n\
88 s_waitcnt vmcnt(0)&lgkmcnt(0) \n\
89 s_endpgm \n\
90 end \n\
91 ";
92
93 static const char* infinite_isa = \
94 "\
95 shader infinite_isa \n\
96 wave_size(32) \n\
97 type(CS) \n\
98 LOOP: \n\
99 s_branch LOOP \n\
100 end \n\
101 ";
102
10326 void KFDCWSRTest::SetUp() {
10427 ROUTINE_START
10528
10629 KFDBaseComponentTest::SetUp();
10730
108 m_pIsaGen = IsaGenerator::Create(m_FamilyId);
109
110 wave_number = 1;
111
11231 ROUTINE_END
11332 }
11433
11534 void KFDCWSRTest::TearDown() {
11635 ROUTINE_START
117 if (m_pIsaGen)
118 delete m_pIsaGen;
119 m_pIsaGen = NULL;
12036
12137 KFDBaseComponentTest::TearDown();
12238
12339 ROUTINE_END
124 }
125
126 bool isOnEmulator() {
127 uint32_t isEmuMode = 0;
128
129 fscanf_dec("/sys/module/amdgpu/parameters/emu_mode", &isEmuMode);
130
131 return isEmuMode;
13240 }
13341
13442 static inline uint32_t checkCWSREnabled() {
14250 /**
14351 * KFDCWSRTest.BasicTest
14452 *
145 * This test dispatches the loop_inc_isa shader and lets it run, ensuring its destination pointer gets incremented.
146 * It then triggers CWSR and ensures the shader stops running.
147 * It then resumes the shader, ensures that it's running again and terminates it.
148 */
149 TEST_F(KFDCWSRTest, BasicTest) {
53 * This test dispatches the IterateIsa shader, which continuously increments a vgpr for
54 * (num_witems / WAVE_SIZE) waves. While this shader is running, dequeue/requeue requests
55 * are sent in a loop to trigger CWSRs.
56 *
57 * This is a paremeterized test. See the INSTANTIATE_TEST_CASE_P below for an explanation
58 * on the parameters.
59 *
60 * This test defines a CWSR threshold. The shader will continuously loop until inputBuf is
61 * filled with the known stop value, which occurs once cwsr_thresh CWSRs have been
62 * successfully triggered.
63 *
64 * 4 parameterized tests are defined:
65 *
66 * KFDCWSRTest.BasicTest/0
67 * KFDCWSRTest.BasicTest/1
68 * KFDCWSRTest.BasicTest/2
69 * KFDCWSRTest.BasicTest/3
70 *
71 * 0: 1 work-item, CWSR threshold of 10
72 * 1: 256 work-items (multi-wave), CWSR threshold of 50
73 * 2: 512 work-items (multi-wave), CWSR threshold of 100
74 * 3: 1024 work-items (multi-wave), CWSR threshold of 1000
75 */
76 TEST_P(KFDCWSRTest, BasicTest) {
15077 TEST_START(TESTPROFILE_RUNALL);
15178
79 int num_witems = std::get<0>(GetParam());
80 int cwsr_thresh = std::get<1>(GetParam());
15281 int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
15382
15483 if ((m_FamilyId >= FAMILY_VI) && (checkCWSREnabled())) {
155 const char *pIterateIsa;
156 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
157 HsaMemoryBuffer resultBuf1(PAGE_SIZE, defaultGPUNode, true, false, false);
158 uint64_t count1 = 400000000;
159
160 if (m_FamilyId < FAMILY_AI)
161 pIterateIsa = iterate_isa_gfx8;
162 else
163 pIterateIsa = iterate_isa_gfx9;
164
165 if (isOnEmulator()) {
166 // Divide the iterator times by 10000 so that the test can
167 // finish in a reasonable time.
168 count1 /= 10000;
169 LOG() << "On Emulators" << std::endl;
170 }
171
172 unsigned int* result1 = resultBuf1.As<unsigned int*>();
173
174 m_pIsaGen->CompileShader(pIterateIsa, "iterate_isa", isaBuffer);
175
176 PM4Queue queue1;
177
178 ASSERT_SUCCESS(queue1.Create(defaultGPUNode));
179
180 Dispatch *dispatch1;
181
182 dispatch1 = new Dispatch(isaBuffer);
183
184 dispatch1->SetArgs(reinterpret_cast<void *>(count1), result1);
185 dispatch1->SetDim(wave_number, 1, 1);
186
187 // Submit the shader, queue1
188 dispatch1->Submit(queue1);
189
190 //Give time for waves to launch before disabling queue.
191 Delay(1);
192 EXPECT_SUCCESS(queue1.Update(0/*percentage*/, BaseQueue::DEFAULT_PRIORITY, false));
84 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true, false, true);
85 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(IterateIsa, isaBuffer.As<char*>()));
86
87 unsigned stopval = 0x1234'5678;
88 unsigned outval = 0x8765'4321;
89
90 // 4B per work-item ==> 1 page per 1024 work-items (take ceiling)
91 unsigned bufSize = PAGE_SIZE * ((num_witems / 1024) + (num_witems % 1024 != 0));
92
93 HsaMemoryBuffer inputBuf(bufSize, defaultGPUNode, true, false, false);
94 HsaMemoryBuffer outputBuf(bufSize, defaultGPUNode, true, false, false);
95 unsigned int* input = inputBuf.As<unsigned int*>();
96 unsigned int* output = outputBuf.As<unsigned int*>();
97 inputBuf.Fill(0);
98 outputBuf.Fill(outval);
99
100 PM4Queue queue;
101 ASSERT_SUCCESS(queue.Create(defaultGPUNode));
102
103 Dispatch dispatch(isaBuffer);
104 dispatch.SetArgs(input, output);
105 dispatch.SetDim(num_witems, 1, 1);
106 dispatch.Submit(queue);
107
193108 Delay(5);
194 EXPECT_SUCCESS(queue1.Update(100/*percentage*/, BaseQueue::DEFAULT_PRIORITY, false));
195
196 dispatch1->Sync();
197 // Ensure all the waves complete as expected
198 int i;
199 for (i = 0 ; i < wave_number; ++i) {
200 if (result1[i] != count1) {
201 LOG() << "Dispatch 1, work item [" << std::dec << i << "] "
202 << result1[i] << " != " << count1 << std::endl;
203 break;
109
110 LOG() << "Starting iteration for " << std::dec << num_witems
111 << " work items(s) (targeting " << std::dec << cwsr_thresh
112 << " CWSRs)" << std::endl;
113
114 for (int num_cwsrs = 0; num_cwsrs < cwsr_thresh; num_cwsrs++) {
115
116 // Send dequeue request
117 EXPECT_SUCCESS(queue.Update(0, BaseQueue::DEFAULT_PRIORITY, false));
118
119 Delay(5);
120
121 // Send requeue request
122 EXPECT_SUCCESS(queue.Update(100, BaseQueue::DEFAULT_PRIORITY, false));
123
124 Delay(50);
125
126 // Check for reg mangling
127 for (int i = 0; i < num_witems; i++) {
128 EXPECT_EQ(outval, output[i]);
204129 }
205130 }
206 EXPECT_EQ(i, wave_number);
207
208 EXPECT_SUCCESS(queue1.Destroy());
209
210 delete dispatch1;
131
132 LOG() << "Successful completion for " << std::dec << num_witems
133 << " work item(s) (CWSRs triggered: " << std::dec << cwsr_thresh
134 << ")" << std::endl;
135 LOG() << "Signalling shader stop..." << std::endl;
136
137 inputBuf.Fill(stopval);
138
139 // Wait for shader to finish or timeout if shader has vm page fault
140 EXPECT_EQ(0, dispatch.SyncWithStatus(180000));
141
142 EXPECT_SUCCESS(queue.Destroy());
211143 } else {
212144 LOG() << "Skipping test: No CWSR present for family ID 0x" << m_FamilyId << "." << std::endl;
213145 }
214146
215147 TEST_END
216148 }
149
150 /**
151 * Instantiates various KFDCWSRTest.BasicTest parameterizations
152 * Tuple Format: (num_witems, cwsr_thresh)
153 *
154 * num_witems: Defines the number of work-items.
155 * cwsr_thresh: Defines the number of CWSRs to trigger.
156 */
157 INSTANTIATE_TEST_CASE_P(
158 , KFDCWSRTest,
159 ::testing::Values(
160 std::make_tuple(1, 10), /* Single Wave Test, 10 CWSR Triggers */
161 std::make_tuple(256, 50), /* Multi Wave Test, 50 CWSR Triggers */
162 std::make_tuple(512, 100), /* Multi Wave Test, 100 CWSR Triggers */
163 std::make_tuple(1024, 1000) /* Multi Wave Test, 1000 CWSR Triggers */
164 )
165 );
217166
218167 /**
219168 * KFDCWSRTest.InterruptRestore
235184 if ((m_FamilyId >= FAMILY_VI) && (checkCWSREnabled())) {
236185 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
237186
238 m_pIsaGen->CompileShader(infinite_isa, "infinite_isa", isaBuffer);
187 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(InfiniteLoopIsa, isaBuffer.As<char*>()));
239188
240189 PM4Queue queue1, queue2, queue3;
241190
2626 #include <gtest/gtest.h>
2727
2828 #include "PM4Queue.hpp"
29 #include "IsaGenerator.hpp"
3029 #include "KFDBaseComponentTest.hpp"
3130
32 class KFDCWSRTest : public KFDBaseComponentTest {
31 class KFDCWSRTest : public KFDBaseComponentTest,
32 public ::testing::WithParamInterface<std::tuple<int, int>> {
3333 public:
34 KFDCWSRTest() :m_pIsaGen(NULL) {}
34 KFDCWSRTest() {}
3535 ~KFDCWSRTest() {}
3636
3737 protected:
3838 virtual void SetUp();
3939 virtual void TearDown();
40
41 protected: // Members
42 unsigned wave_number;
43 IsaGenerator* m_pIsaGen;
4440 };
4541
4642 #endif // __KFD_CWSR_TEST__H__
175175
176176 KFDBaseComponentTest::SetUp();
177177
178 m_pIsaGen = IsaGenerator::Create(m_FamilyId);
179
180178 ROUTINE_END
181179 }
182180
183181 void KFDDBGTest::TearDown() {
184182 ROUTINE_START
185 if (m_pIsaGen)
186 delete m_pIsaGen;
187 m_pIsaGen = NULL;
188183
189184 /* Reset the user trap handler */
190185 hsaKmtSetTrapHandler(m_NodeInfo.HsaDefaultGPUNode(), 0, 0, 0, 0);
2525
2626 #include <gtest/gtest.h>
2727
28 #include "IsaGenerator.hpp"
2928 #include "KFDBaseComponentTest.hpp"
3029
3130 class KFDDBGTest : public KFDBaseComponentTest {
3231 public:
33 KFDDBGTest() :m_pIsaGen(NULL) {}
32 KFDDBGTest() {}
3433 ~KFDDBGTest() {}
3534
3635 protected:
3736 virtual void SetUp();
3837 virtual void TearDown();
39
40 protected: // Members
41 IsaGenerator* m_pIsaGen;
4238 };
4339
4440 #endif // __KFD_DBG_TEST__H__
4040
4141 KFDBaseComponentTest::SetUp();
4242
43 m_pIsaGen = IsaGenerator::Create(m_FamilyId);
44
4543 ROUTINE_END
4644 }
4745
4846 void KFDEvictTest::TearDown() {
4947 ROUTINE_START
50
51 if (m_pIsaGen)
52 delete m_pIsaGen;
53 m_pIsaGen = NULL;
5448
5549 KFDBaseComponentTest::TearDown();
5650
285279 EXPECT_EQ(0, amdgpu_cs_ctx_free(contextHandle));
286280 }
287281
288 /* Shader to read local buffers using multiple wavefronts in parallel
289 * until address buffer is filled with specific value 0x5678 by host program,
290 * then each wavefront fills value 0x5678 at corresponding result buffer and quit
291 *
292 * Initial state:
293 * s[0:1] - address buffer base address
294 * s[2:3] - result buffer base address
295 * s4 - workgroup id
296 * v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
297 * Registers:
298 * v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
299 * v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
300 * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
301 * v[6:7] - local buf address used for read test
302 *
303 * This shader can be used by gfx9 and gfx10
304 *
305 */
306
307 static const char* gfx9_ReadMemory =
308 "\
309 shader ReadMemory\n\
310 wave_size(32)\n\
311 type(CS)\n\
312 \n\
313 // compute address of corresponding output buffer\n\
314 v_mov_b32 v0, s4 // use workgroup id as index\n\
315 v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
316 v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
317 v_mov_b32 v5, s3\n\
318 v_add_co_u32 v5, vcc, v5, vcc_lo\n\
319 \n\
320 // compute input buffer offset used to store corresponding local buffer address\n\
321 v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
322 v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
323 v_mov_b32 v3, s1\n\
324 v_add_co_u32 v3, vcc, v3, vcc_lo\n\
325 \n\
326 // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
327 flat_load_dwordx2 v[6:7], v[2:3] slc\n\
328 s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
329 \n\
330 v_mov_b32 v8, 0x5678\n\
331 s_movk_i32 s8, 0x5678\n\
332 L_REPEAT:\n\
333 s_load_dword s16, s[0:1], 0x0 glc\n\
334 s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
335 s_cmp_eq_i32 s16, s8\n\
336 s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
337 // loop read 64M local buffer starting at v[6:7]\n\
338 // every 4k page only read once\n\
339 v_mov_b32 v9, 0\n\
340 v_mov_b32 v10, 0x1000 // 4k page\n\
341 v_mov_b32 v11, 0x4000000 // 64M size\n\
342 v_mov_b32 v12, v6\n\
343 v_mov_b32 v13, v7\n\
344 L_LOOP_READ:\n\
345 flat_load_dwordx2 v[14:15], v[12:13] slc\n\
346 v_add_co_u32 v9, vcc, v9, v10 \n\
347 v_add_co_u32 v12, vcc, v12, v10\n\
348 v_add_co_u32 v13, vcc, v13, vcc_lo\n\
349 v_cmp_lt_u32 vcc, v9, v11\n\
350 s_cbranch_vccnz L_LOOP_READ\n\
351 s_branch L_REPEAT\n\
352 L_QUIT:\n\
353 flat_store_dword v[4:5], v8\n\
354 s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
355 s_endpgm\n\
356 end\n\
357 ";
358
359 static const char* gfx8_ReadMemory =
360 "\
361 shader ReadMemory\n\
362 asic(VI)\n\
363 type(CS)\n\
364 \n\
365 // compute address of corresponding output buffer\n\
366 v_mov_b32 v0, s4 // use workgroup id as index\n\
367 v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
368 v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
369 v_mov_b32 v5, s3\n\
370 v_addc_u32 v5, vcc, v5, 0, vcc\n\
371 \n\
372 // compute input buffer offset used to store corresponding local buffer address\n\
373 v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
374 v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
375 v_mov_b32 v3, s1\n\
376 v_addc_u32 v3, vcc, v3, 0, vcc\n\
377 \n\
378 // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
379 flat_load_dwordx2 v[6:7], v[2:3] slc\n\
380 s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
381 \n\
382 v_mov_b32 v8, 0x5678\n\
383 s_movk_i32 s8, 0x5678\n\
384 L_REPEAT:\n\
385 s_load_dword s16, s[0:1], 0x0 glc\n\
386 s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
387 s_cmp_eq_i32 s16, s8\n\
388 s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
389 // loop read 64M local buffer starting at v[6:7]\n\
390 // every 4k page only read once\n\
391 v_mov_b32 v9, 0\n\
392 v_mov_b32 v10, 0x1000 // 4k page\n\
393 v_mov_b32 v11, 0x4000000 // 64M size\n\
394 v_mov_b32 v12, v6\n\
395 v_mov_b32 v13, v7\n\
396 L_LOOP_READ:\n\
397 flat_load_dwordx2 v[14:15], v[12:13] slc\n\
398 v_add_u32 v9, vcc, v9, v10 \n\
399 v_add_u32 v12, vcc, v12, v10\n\
400 v_addc_u32 v13, vcc, v13, 0, vcc\n\
401 v_cmp_lt_u32 vcc, v9, v11\n\
402 s_cbranch_vccnz L_LOOP_READ\n\
403 s_branch L_REPEAT\n\
404 L_QUIT:\n\
405 flat_store_dword v[4:5], v8\n\
406 s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
407 s_endpgm\n\
408 end\n\
409 ";
410
411 std::string KFDEvictTest::CreateShader() {
412 if (m_FamilyId < FAMILY_AI)
413 return gfx8_ReadMemory;
414 else
415 return gfx9_ReadMemory;
416 }
417
418282 /* Evict and restore procedure basic test
419283 *
420284 * Use N_PROCESSES processes to allocate vram buf size larger than total vram size
566430 HsaMemoryBuffer addrBuffer(PAGE_SIZE, defaultGPUNode);
567431 HsaMemoryBuffer resultBuffer(PAGE_SIZE, defaultGPUNode);
568432
569 m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer);
433 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ReadMemoryIsa, isaBuffer.As<char*>()));
570434
571435 PM4Queue pm4Queue;
572436 ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
2626 #include <string>
2727 #include <vector>
2828 #include "KFDMultiProcessTest.hpp"
29 #include "IsaGenerator.hpp"
3029 #include "PM4Queue.hpp"
3130
3231 // @class KFDEvictTest
3332 // Test eviction and restore procedure using two processes
3433 class KFDEvictTest : public KFDMultiProcessTest {
3534 public:
36 KFDEvictTest(void): m_pIsaGen(NULL) {}
37
35 KFDEvictTest(void) {}
3836 ~KFDEvictTest(void) {}
3937
4038 protected:
4139 virtual void SetUp();
4240 virtual void TearDown();
4341
44 std::string CreateShader();
4542 void AllocBuffers(HSAuint32 defaultGPUNode, HSAuint32 count, HSAuint64 vramBufSize,
4643 std::vector<void *> &pBuffers);
4744 void FreeBuffers(std::vector<void *> &pBuffers, HSAuint64 vramBufSize);
5148 PM4Queue *computeQueue);
5249
5350 protected: // Members
54 IsaGenerator* m_pIsaGen;
5551 HsaMemFlags m_Flags;
5652 void* m_pBuf;
5753 };
3232
3333 KFDBaseComponentTest::SetUp();
3434
35 m_pIsaGen = IsaGenerator::Create(m_FamilyId);
35 LOG() << "This Exception test might cause expected page fault "
36 "error logs at kernel level." << std::endl;
3637
3738 ROUTINE_END
3839 }
3940
4041 void KFDExceptionTest::TearDown() {
4142 ROUTINE_START
42
43 if (m_pIsaGen)
44 delete m_pIsaGen;
45 m_pIsaGen = NULL;
4643
4744 KFDBaseComponentTest::TearDown();
4845
7471 eventDesc.SyncVar.SyncVar.UserData = NULL;
7572 eventDesc.SyncVar.SyncVarSize = 0;
7673
77 m_pIsaGen->GetCopyDwordIsa(isaBuffer);
74 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
75
7876 m_ChildStatus = queue.Create(defaultGPUNode);
7977 if (m_ChildStatus != HSAKMT_STATUS_SUCCESS) {
8078 WARN() << "Queue create failed" << std::endl;
185183
186184 m_ChildPid = fork();
187185 if (m_ChildPid == 0) {
188 m_ChildStatus = hsaKmtOpenKFD();
189 if (m_ChildStatus != HSAKMT_STATUS_SUCCESS) {
190 WARN() << "KFD open failed in child process" << std::endl;
191 return;
192 }
186 KFDBaseComponentTest::TearDown();
187 KFDBaseComponentTest::SetUp();
193188
194189 HsaMemoryBuffer srcBuffer(PAGE_SIZE, defaultGPUNode, false);
195190
229224
230225 m_ChildPid = fork();
231226 if (m_ChildPid == 0) {
232 m_ChildStatus = hsaKmtOpenKFD();
233 if (m_ChildStatus != HSAKMT_STATUS_SUCCESS) {
234 WARN() << "KFD open failed in child process" << std::endl;
235 return;
236 }
227 KFDBaseComponentTest::TearDown();
228 KFDBaseComponentTest::SetUp();
237229
238230 HsaMemoryBuffer readOnlyBuffer(PAGE_SIZE, defaultGPUNode, false /*zero*/,
239231 false /*isLocal*/, true /*isExec*/,
279271
280272 m_ChildPid = fork();
281273 if (m_ChildPid == 0) {
282 m_ChildStatus = hsaKmtOpenKFD();
283 if (m_ChildStatus != HSAKMT_STATUS_SUCCESS) {
284 WARN() << "KFD open failed in child process" << std::endl;
285 return;
286 }
274 KFDBaseComponentTest::TearDown();
275 KFDBaseComponentTest::SetUp();
287276
288277 TestMemoryException(defaultGPUNode, 0x12345678, 0x76543210, 1024, 1024, 1);
289278 } else {
322311 if (m_ChildPid == 0) {
323312 unsigned int* pDb = NULL;
324313 unsigned int *nullPtr = NULL;
325 m_ChildStatus = hsaKmtOpenKFD();
326 if (m_ChildStatus != HSAKMT_STATUS_SUCCESS) {
327 WARN() << "KFD open failed in child process" << std::endl;
328 return;
329 }
314
315 KFDBaseComponentTest::TearDown();
316 KFDBaseComponentTest::SetUp();
317
330318 m_MemoryFlags.ui32.NonPaged = 1;
319 m_MemoryFlags.ui32.HostAccess = 0;
331320 ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode, PAGE_SIZE, m_MemoryFlags,
332321 reinterpret_cast<void**>(&pDb)));
333322 // verify that pDb is not null before it's being used
2525
2626 #include <gtest/gtest.h>
2727
28 #include "IsaGenerator.hpp"
2928 #include "KFDBaseComponentTest.hpp"
3029
3130 class KFDExceptionTest : public KFDBaseComponentTest {
3231 public:
33 KFDExceptionTest() :m_pIsaGen(NULL), m_ChildPid(-1) {
32 KFDExceptionTest() : m_ChildPid(-1) {
3433 /* Because there could be early return before m_ChildPid is set
3534 * by fork(), we should initialize m_ChildPid to a non-zero value
3635 * to avoid possible exit of the main process.
4241 * child process finishes, gtest assumes the test has finished and
4342 * starts the next test while the parent is still active.
4443 */
45 if (m_ChildPid == 0)
44 if (m_ChildPid == 0) {
45 if (!m_ChildStatus && HasFatalFailure())
46 m_ChildStatus = HSAKMT_STATUS_ERROR;
4647 exit(m_ChildStatus);
48 }
4749 }
4850
4951 protected:
5860 protected: // Members
5961 pid_t m_ChildPid;
6062 HSAKMT_STATUS m_ChildStatus;
61
62 IsaGenerator* m_pIsaGen;
6363 };
6464
6565 #endif // __KFD_EXCEPTION_TEST__H__
2525 #include "PM4Packet.hpp"
2626 #include "Dispatch.hpp"
2727
28 /* Shader to initialize gws counter to 1*/
29 const char* gfx9_10_GwsInit =
30 "\
31 shader GwsInit\n\
32 type(CS)\n\
33 wave_size(32)\n\
34 s_mov_b32 m0, 0\n\
35 s_nop 0\n\
36 s_load_dword s16, s[0:1], 0x0 glc\n\
37 s_waitcnt 0\n\
38 v_mov_b32 v0, s16\n\
39 s_waitcnt 0\n\
40 ds_gws_init v0 gds:1 offset0:0\n\
41 s_waitcnt 0\n\
42 s_endpgm\n\
43 end\n\
44 ";
45
46 /* Atomically increase a value in memory
47 * This is expected to be executed from
48 * multiple work groups simultaneously.
49 * GWS semaphore is used to guarantee
50 * the operation is atomic.
51 */
52 const char* gfx9_AtomicIncrease =
53 "\
54 shader AtomicIncrease\n\
55 type(CS)\n\
56 /* Assume src address in s0, s1 */\n\
57 s_mov_b32 m0, 0\n\
58 s_nop 0\n\
59 ds_gws_sema_p gds:1 offset0:0\n\
60 s_waitcnt 0\n\
61 s_load_dword s16, s[0:1], 0x0 glc\n\
62 s_waitcnt 0\n\
63 s_add_u32 s16, s16, 1\n\
64 s_store_dword s16, s[0:1], 0x0 glc\n\
65 s_waitcnt lgkmcnt(0)\n\
66 ds_gws_sema_v gds:1 offset0:0\n\
67 s_waitcnt 0\n\
68 s_endpgm\n\
69 end\n\
70 ";
71
72 const char* gfx10_AtomicIncrease =
73 "\
74 shader AtomicIncrease\n\
75 asic(GFX10)\n\
76 type(CS)\n\
77 wave_size(32)\n\
78 /* Assume src address in s0, s1 */\n\
79 s_mov_b32 m0, 0\n\
80 s_mov_b32 exec_lo, 0x1\n\
81 v_mov_b32 v0, s0\n\
82 v_mov_b32 v1, s1\n\
83 ds_gws_sema_p gds:1 offset0:0\n\
84 s_waitcnt 0\n\
85 flat_load_dword v2, v[0:1] glc:1 dlc:1\n\
86 s_waitcnt 0\n\
87 v_add_nc_u32 v2, v2, 1\n\
88 flat_store_dword v[0:1], v2\n\
89 s_waitcnt_vscnt null, 0\n\
90 ds_gws_sema_v gds:1 offset0:0\n\
91 s_waitcnt 0\n\
92 s_endpgm\n\
93 end\n\
94 ";
95
9628 void KFDGWSTest::SetUp() {
9729 ROUTINE_START
9830
9931 KFDBaseComponentTest::SetUp();
100
101 m_pIsaGen = IsaGenerator::Create(m_FamilyId);
10232
10333 ROUTINE_END
10434 }
10535
10636 void KFDGWSTest::TearDown() {
10737 ROUTINE_START
108
109 if (m_pIsaGen)
110 delete m_pIsaGen;
111 m_pIsaGen = NULL;
11238
11339 KFDBaseComponentTest::TearDown();
11440
15985 pNodeProperties->NumGws,&firstGWS));
16086 EXPECT_EQ(0, firstGWS);
16187
162 m_pIsaGen = IsaGenerator::Create(m_FamilyId);
163 m_pIsaGen->CompileShader(gfx9_10_GwsInit, "GwsInit", isaBuffer);
88 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(GwsInitIsa, isaBuffer.As<char*>()));
89
16490 Dispatch dispatch0(isaBuffer);
16591 buffer.Fill(numResources, 0, 4);
16692 dispatch0.SetArgs(buffer.As<void*>(), NULL);
16793 dispatch0.Submit(queue);
16894 dispatch0.Sync();
16995
170 const char *pAtomicIncrease;
171 if (m_FamilyId <= FAMILY_AL)
172 pAtomicIncrease = gfx9_AtomicIncrease;
173 else
174 pAtomicIncrease = gfx10_AtomicIncrease;
175
176 m_pIsaGen->CompileShader(pAtomicIncrease, "AtomicIncrease", isaBuffer);
96 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(GwsAtomicIncreaseIsa, isaBuffer.As<char*>()));
17797
17898 Dispatch dispatch(isaBuffer);
17999 dispatch.SetArgs(buffer.As<void*>(), NULL);
2525
2626 #include <gtest/gtest.h>
2727
28 #include "IsaGenerator.hpp"
2928 #include "KFDBaseComponentTest.hpp"
3029
3130 class KFDGWSTest : public KFDBaseComponentTest {
3231 public:
33 KFDGWSTest() :m_pIsaGen(NULL) {}
32 KFDGWSTest() {}
3433 ~KFDGWSTest() {}
3534
3635 protected:
3736 virtual void SetUp();
3837 virtual void TearDown();
39
40 protected: // Members
41 IsaGenerator* m_pIsaGen;
4238 };
4339
4440 #endif // __KFD_GWS_TEST__H__
100100
101101 // Copy contents to a system memory buffer for comparison
102102 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
103 m_pIsaGen->GetCopyDwordIsa(isaBuffer);
103
104 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
104105
105106 HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/);
106107
2727
2828 KFDBaseComponentTest::SetUp();
2929
30 m_pIsaGen = IsaGenerator::Create(m_FamilyId);
31
3230 ROUTINE_END
3331 }
3432
3533 void KFDHWSTest::TearDown() {
3634 ROUTINE_START
37
38 if (m_pIsaGen)
39 delete m_pIsaGen;
40 m_pIsaGen = NULL;
4135
4236 KFDBaseComponentTest::TearDown();
4337
6963
7064 // Run work on all queues
7165 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
72 m_pIsaGen->GetNoopIsa(isaBuffer);
66
67 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(NoopIsa, isaBuffer.As<char*>()));
68
7369 for (l = 0; l < nLoops; l++) {
7470 for (q = 0; q < nQueues; q++) {
7571 if (dispatch[q])
2626 #include <gtest/gtest.h>
2727
2828 #include "PM4Queue.hpp"
29 #include "IsaGenerator.hpp"
3029 #include "KFDMultiProcessTest.hpp"
3130 #include "Dispatch.hpp"
3231
3332 class KFDHWSTest : public KFDMultiProcessTest {
3433 public:
35 KFDHWSTest():m_pIsaGen(NULL) {}
36
34 KFDHWSTest() {}
3735 ~KFDHWSTest() {}
3836
3937 protected:
4139 virtual void TearDown();
4240
4341 void RunTest(unsigned nProcesses, unsigned nQueues, unsigned nLoops);
44
45 protected: // Members
46 IsaGenerator* m_pIsaGen;
4742 };
4843
4944 #endif // __KFD_QCM_TEST__H__
6969 /* Open KFD device for child process. This needs to called before
7070 * any memory definitions
7171 */
72 if (HSAKMT_STATUS_SUCCESS != hsaKmtOpenKFD())
73 exit(1);
72 TearDown();
73 SetUp();
7474
7575 SDMAQueue sdmaQueue;
7676 HsaSharedMemoryHandle sharedHandleLM;
215215
216216 TEST_END
217217 }
218
219 /* Cross Memory Attach Test. Memory Descriptor Array.
220 * The following 2 2D-arrays describe the source and destination memory arrays used
221 * by CMA test. The entry is only valid if Size != 0. Each of these buffers will be
222 * filled intially with "FillPattern". After the test the srcRange is still expected
223 * to have the same pattern. The dstRange is expected to have srcRange pattern.
224 *
225 * For e.g. for TEST_COUNT = 1,
226 * srcRange has 2 buffers of size 0x1800. Buf1 filled with 0xA5A5A5A5 and Buf2
227 * filled with 0xAAAAAAAA
228 * dstRange has 3 buffers of size 0x1000. All of them filled 0xFFFFFFFF.
229 * After Copy: dstBuf1[0-0x1000] is expected to be 0xA5A5A5A5
230 * dstBuf2[0-0x800] is expected to be 0xA5A5A5A5
231 * dstBuf3[0x800-0x1000] is expected to be 0xAAAAAAAA
232 * and dstBuf4[0x0-0x1000] is expected to be 0xAAAAAAAA
233 *
234 * For this CMA test, after copying only the first and the last of dstBuf is checked
235 */
236
237 static testMemoryDescriptor srcRange[CMA_TEST_COUNT][CMA_MEMORY_TEST_ARRAY_SIZE] = {
238 { /* Memory Type Size FillPattern FirstItem Last item */
239 { CMA_MEM_TYPE_USERPTR, 0x801800, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 },
240 { CMA_MEM_TYPE_USERPTR, 0x1800, 0xAAAAAAAA, 0xAAAAAAAA, 0xAAAAAAAA },
241 { CMA_MEM_TYPE_USERPTR, 0x0, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 },
242 { CMA_MEM_TYPE_USERPTR, 0x0, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 },
243 },
244 {
245 { CMA_MEM_TYPE_SYSTEM, 0x208000, 0xDEADBEEF, 0xA5A5A5A5, 0xA5A5A5A5 },
246 { CMA_MEM_TYPE_SYSTEM, 0x4000, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 },
247 { CMA_MEM_TYPE_SYSTEM, 0x6000, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 },
248 { CMA_MEM_TYPE_SYSTEM, 0x2000, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 },
249 },
250 {
251 { CMA_MEM_TYPE_LOCAL_MEM, 0x800000, 0xDEADBEEF, 0xA5A5A5A5, 0xA5A5A5A5 },
252 { CMA_MEM_TYPE_LOCAL_MEM, 0x1000, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 },
253 { CMA_MEM_TYPE_LOCAL_MEM, 0x1000, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 },
254 { CMA_MEM_TYPE_LOCAL_MEM, 0x1000, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 },
255 }
256 };
257
258 static testMemoryDescriptor dstRange[CMA_TEST_COUNT][CMA_MEMORY_TEST_ARRAY_SIZE] = {
259 {
260 /* Memory Type Size FillPattern FirstItem Last item */
261 { CMA_MEM_TYPE_USERPTR, 0x801000, 0xFFFFFFFF, 0xA5A5A5A5, 0xA5A5A5A5 },
262 { CMA_MEM_TYPE_USERPTR, 0x1000, 0xFFFFFFFF, 0xA5A5A5A5, 0xAAAAAAAA },
263 { CMA_MEM_TYPE_USERPTR, 0x1000, 0xFFFFFFFF, 0xAAAAAAAA, 0xAAAAAAAA },
264 { CMA_MEM_TYPE_USERPTR, 0x0, 0xFFFFFFFF, 0xA5A5A5A5, 0xA5A5A5A5 },
265 },
266 {
267 { CMA_MEM_TYPE_SYSTEM, 0x202000, 0xFFFFFFFF, 0xDEADBEEF, 0xDEADBEEF },
268 { CMA_MEM_TYPE_SYSTEM, 0x4000, 0xFFFFFFFF, 0xDEADBEEF, 0xDEADBEEF },
269 { CMA_MEM_TYPE_SYSTEM, 0x8000, 0xFFFFFFFF, 0xDEADBEEF, 0xA5A5A5A5 },
270 { CMA_MEM_TYPE_SYSTEM, 0x6000, 0xFFFFFFFF, 0xA5A5A5A5, 0xA5A5A5A5 },
271 },
272 {
273 { CMA_MEM_TYPE_LOCAL_MEM, 0x800000, 0xFFFFFFFF, 0xDEADBEEF, 0xDEADBEEF },
274 { CMA_MEM_TYPE_LOCAL_MEM, 0x1000, 0xFFFFFFFF, 0xA5A5A5A5, 0xA5A5A5A5 },
275 { CMA_MEM_TYPE_LOCAL_MEM, 0x1000, 0xFFFFFFFF, 0xA5A5A5A5, 0xA5A5A5A5 },
276 { CMA_MEM_TYPE_LOCAL_MEM, 0x1000, 0xFFFFFFFF, 0xA5A5A5A5, 0xA5A5A5A5 },
277 }
278 };
279
280 KFDCMAArray::KFDCMAArray() : m_ValidCount(0), m_QueueArray(HSA_QUEUE_SDMA) {
281 memset(m_MemArray, 0, sizeof(m_MemArray));
282 memset(m_HsaMemoryRange, 0, sizeof(m_HsaMemoryRange));
283 }
284
285 CMA_TEST_STATUS KFDCMAArray::Destroy() {
286 for (int i = 0; i < m_ValidCount; i++) {
287 if (m_MemArray[i]) {
288 void *userPtr;
289
290 userPtr = m_MemArray[i]->GetUserPtr();
291 delete m_MemArray[i];
292
293 if (userPtr)
294 free(userPtr);
295 }
296 }
297
298 memset(m_MemArray, 0, sizeof(m_MemArray));
299 memset(m_HsaMemoryRange, 0, sizeof(m_HsaMemoryRange));
300 m_ValidCount = 0;
301
302 return CMA_TEST_SUCCESS;
303 }
304
305 /* Initialize KFDCMAArray based on array of testMemoryDescriptor. Usually testMemoryDescriptor[] is
306 * statically defined array by the user. Only items with non-zero size are considered valid
307 */
308 CMA_TEST_STATUS KFDCMAArray::Init(testMemoryDescriptor(*memDescriptor)[CMA_MEMORY_TEST_ARRAY_SIZE], int node) {
309 CMA_TEST_STATUS err = CMA_TEST_SUCCESS;
310 memset(m_MemArray, 0, sizeof(m_MemArray));
311 memset(m_HsaMemoryRange, 0, sizeof(m_HsaMemoryRange));
312
313 m_ValidCount = 0;
314 for (int i = 0; i < CMA_MEMORY_TEST_ARRAY_SIZE; i++) {
315 if ((*memDescriptor)[i].m_MemSize == 0)
316 continue;
317
318 switch ((*memDescriptor)[i].m_MemType) {
319 case CMA_MEM_TYPE_SYSTEM:
320 m_MemArray[i] = new HsaMemoryBuffer((*memDescriptor)[i].m_MemSize, node);
321 break;
322
323 case CMA_MEM_TYPE_USERPTR:
324 {
325 void *userPtr = malloc((*memDescriptor)[i].m_MemSize);
326 m_MemArray[i] = new HsaMemoryBuffer(userPtr, (*memDescriptor)[i].m_MemSize);
327 break;
328 }
329
330 case CMA_MEM_TYPE_LOCAL_MEM:
331 m_MemArray[i] = new HsaMemoryBuffer((*memDescriptor)[i].m_MemSize, node, false, true);
332 break;
333 }
334
335 if (m_MemArray[i]) {
336 m_HsaMemoryRange[i].MemoryAddress = m_MemArray[i]->As<void*>();
337 m_HsaMemoryRange[i].SizeInBytes = m_MemArray[i]->Size();
338 m_ValidCount++;
339 } else {
340 err = CMA_TEST_NOMEM;
341 break;
342 }
343 }
344
345 return err;
346 }
347
348 /* Fill each buffer of KFDCMAArray with the pattern described by testMemoryDescriptor[] */
349 void KFDCMAArray::FillPattern(testMemoryDescriptor(*memDescriptor)[CMA_MEMORY_TEST_ARRAY_SIZE]) {
350 SDMAQueue sdmaQueue;
351 bool queueCreated = false;
352 unsigned int queueNode;
353
354 for (int i = 0; i < m_ValidCount; i++) {
355 if (m_MemArray[i]->isLocal())
356 m_MemArray[i]->Fill((*memDescriptor)[i].m_FillPattern, *m_QueueArray.GetQueue(m_MemArray[i]->Node()));
357 else
358 m_MemArray[i]->Fill((*memDescriptor)[i].m_FillPattern);
359 }
360 }
361
362 /* Check the first and last item of each buffer in KFDCMAArray with the pattern described by
363 * testMemoryDescriptor[]. Return 0 on success.
364 */
365 CMA_TEST_STATUS KFDCMAArray::checkPattern(testMemoryDescriptor(*memDescriptor)[CMA_MEMORY_TEST_ARRAY_SIZE]) {
366 HSAuint64 lastItem;
367 CMA_TEST_STATUS ret = CMA_TEST_SUCCESS;
368 unsigned int queueNode = 0;
369 bool queueCreated = false;
370 HsaMemoryBuffer tmpBuffer(PAGE_SIZE, 0, true /* zero */);
371 volatile HSAuint32 *tmp = tmpBuffer.As<volatile HSAuint32 *>();
372
373 for (int i = 0; i < m_ValidCount; i++) {
374 lastItem = m_MemArray[i]->Size();
375 lastItem -= sizeof(HSAuint32);
376
377 if (m_MemArray[i]->isLocal()) {
378 BaseQueue *sdmaQueue = m_QueueArray.GetQueue(m_MemArray[i]->Node());
379
380 if (!m_MemArray[i]->IsPattern(0, (*memDescriptor)[i].m_CheckFirstWordPattern, *sdmaQueue, tmp) ||
381 !m_MemArray[i]->IsPattern(lastItem, (*memDescriptor)[i].m_CheckLastWordPattern, *sdmaQueue, tmp)) {
382 ret = CMA_CHECK_PATTERN_ERROR;
383 break;
384 }
385
386 } else {
387 if (!m_MemArray[i]->IsPattern(0, (*memDescriptor)[i].m_CheckFirstWordPattern) ||
388 !m_MemArray[i]->IsPattern(lastItem, (*memDescriptor)[i].m_CheckLastWordPattern)) {
389 ret = CMA_CHECK_PATTERN_ERROR;
390 break;
391 }
392 }
393 }
394
395 return ret;
396 }
397
398
399 /* Non-blocking read and write to avoid Test from hanging (block indefinitely)
400 * if either server or client process exits due to assert failure
401 */
402 static int write_non_block(int fd, const void *buf, int size) {
403 int total_bytes = 0, cur_bytes = 0;
404 int retries = 5;
405 struct timespec tm = { 0, 10000000ULL };
406 const char *ptr = (const char *)buf;
407
408 do {
409 cur_bytes = write(fd, ptr, (size - total_bytes));
410
411 if (cur_bytes < 0 && errno != EAGAIN)
412 return cur_bytes;
413
414 if (cur_bytes > 0) {
415 total_bytes += cur_bytes;
416 ptr += cur_bytes;
417 }
418
419 if (total_bytes < size)
420 nanosleep(&tm, NULL);
421 } while (total_bytes < size && retries--);
422
423 /* Check for overflow */
424 if (total_bytes > size)
425 return -1;
426
427 return total_bytes;
428 }
429
/* Best-effort read of @size bytes from @fd (opened O_NONBLOCK by the
 * caller). Retries a bounded number of times, sleeping 100ms between
 * attempts, so a dead peer cannot hang the test forever. Returns the
 * number of bytes read (may be short if retries are exhausted) or
 * read()'s negative return value on a non-transient error.
 */
static int read_non_block(int fd, void *buf, int size) {
    int total_bytes = 0, cur_bytes = 0;
    int retries = 5;
    struct timespec tm = { 0, 100000000L };  /* 100ms between retries */
    char *ptr = reinterpret_cast<char *>(buf);

    do {
        cur_bytes = read(fd, ptr, (size - total_bytes));

        /* EAGAIN/EWOULDBLOCK: no data available yet on the non-blocking
         * pipe. EINTR: interrupted by a signal before any byte arrived.
         * All are transient -- retry. Anything else is a hard failure.
         */
        if (cur_bytes < 0 &&
            errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR)
            return cur_bytes;

        if (cur_bytes > 0) {
            total_bytes += cur_bytes;
            ptr += cur_bytes;
        }

        if (total_bytes < size)
            nanosleep(&tm, NULL);
    } while (total_bytes < size && retries--);

    /* Defensive: each read() is bounded by (size - total_bytes). */
    if (total_bytes > size)
        return -1;

    return total_bytes;
}
456
457
458 /* Send HsaMemoryRange to another process that is connected via writePipe */
459 CMA_TEST_STATUS KFDCMAArray::sendCMAArray(int writePipe) {
460 if (write_non_block(writePipe, reinterpret_cast<void*>(&m_HsaMemoryRange), sizeof(m_HsaMemoryRange)) !=
461 sizeof(m_HsaMemoryRange))
462 return CMA_IPC_PIPE_ERROR;
463 return CMA_TEST_SUCCESS;
464 }
465
466 /* Send HsaMemoryRange from another process and initialize KFDCMAArray */
467 CMA_TEST_STATUS KFDCMAArray::recvCMAArray(int readPipe) {
468 int i;
469
470 if (read_non_block(readPipe, reinterpret_cast<void*>(&m_HsaMemoryRange), sizeof(m_HsaMemoryRange)) !=
471 sizeof(m_HsaMemoryRange))
472 return CMA_IPC_PIPE_ERROR;
473
474 for (i = 0; i < CMA_MEMORY_TEST_ARRAY_SIZE; i++) {
475 if (m_HsaMemoryRange[i].SizeInBytes)
476 m_ValidCount++;
477 }
478 return CMA_TEST_SUCCESS;
479 }
480
481
/* Child ("remote") side of the cross-memory-attach test. Runs the per-test
 * protocol over the two pipe fds against the parent process.
 */
CMA_TEST_STATUS KFDIPCTest::CrossMemoryAttachChildProcess(int defaultGPUNode, int writePipe,
                        int readPipe, CMA_TEST_TYPE testType) {
    KFDCMAArray cmaLocalArray;
    char msg[16];  /* 4-byte command from the parent, NUL-padded for strcmp */
    int testNo;
    CMA_TEST_STATUS status;

    /* Initialize and fill Local Buffer Array with a pattern.
     * READ_TEST: Send the Array to parent process. Wait for the parent
     * to finish reading and checking. Then move to next test case or
     * quit if last one.
     * WRITE_TEST: Send Local Buffer Array to parent process and wait
     * for parent to write to it. Check for new pattern. Then move to next
     * case or quit if last one.
     */
    for (testNo = 0; testNo < CMA_TEST_COUNT; testNo++) {
        if (testType == CMA_READ_TEST) {
            cmaLocalArray.Init(&srcRange[testNo], defaultGPUNode);
            cmaLocalArray.FillPattern(&srcRange[testNo]);
        } else {
            cmaLocalArray.Init(&dstRange[testNo], defaultGPUNode);
            cmaLocalArray.FillPattern(&dstRange[testNo]);
        }

        /* NOTE(review): sendCMAArray returns a CMA_TEST_STATUS enum; if its
         * error values are non-negative this "< 0" check can never fire and
         * send failures are only caught by the read below -- confirm the
         * enum's values.
         */
        if (cmaLocalArray.sendCMAArray(writePipe) < 0) {
            status = CMA_IPC_PIPE_ERROR;
            break;
        }

        /* Wait (bounded retries inside read_non_block) until the parent
         * signals the end of this iteration with a 4-byte command.
         */
        memset(msg, 0, sizeof(msg));
        if (read_non_block(readPipe, msg, 4) < 0) {
            status = CMA_IPC_PIPE_ERROR;
            break;
        }

        /* "CHCK": parent wrote into our buffers, verify the new pattern.
         * "NEXT": parent finished reading, nothing to verify here.
         * "EXIT": parent aborted the run.
         * Anything else (including a short read) counts as a parent failure.
         */
        if (!strcmp(msg, "CHCK"))
            status = cmaLocalArray.checkPattern(&dstRange[testNo]);
        else if (!strcmp(msg, "NEXT"))
            status = CMA_TEST_SUCCESS;
        else if (!strcmp(msg, "EXIT"))
            status = CMA_TEST_ABORT;
        else
            status = CMA_PARENT_FAIL;

        cmaLocalArray.Destroy();
        if (status != CMA_TEST_SUCCESS)
            break;
    }

    return status;
}
534
535
/* Parent ("local") side of the cross-memory-attach test. @cid is the child's
 * pid, used as the remote-process handle for hsaKmtProcessVMRead/Write.
 */
CMA_TEST_STATUS KFDIPCTest::CrossMemoryAttachParentProcess(int defaultGPUNode, pid_t cid,
                           int writePipe, int readPipe,
                           CMA_TEST_TYPE testType) {
    KFDCMAArray cmaLocalArray, cmaRemoteArray;
    HSAuint64 copied = 0;
    int testNo;
    CMA_TEST_STATUS status;

    /* Receive buffer array from child and then initialize and fill in Local Buffer Array.
     * READ_TEST: Copy remote buffer array into Local Buffer Array and then check
     * for the new pattern.
     * WRITE_TEST: Write Local Buffer Array into remote buffer array. Notify child
     * to check for the new pattern.
     *
     * NOTE(review): on the early "break" error paths below, Destroy() is not
     * called on the arrays for that iteration -- presumably their destructors
     * clean up; confirm against KFDCMAArray.
     */
    for (testNo = 0; testNo < CMA_TEST_COUNT; testNo++) {
        status = cmaRemoteArray.recvCMAArray(readPipe);
        if (status != CMA_TEST_SUCCESS)
            break;

        if (testType == CMA_READ_TEST) {
            status = cmaLocalArray.Init(&dstRange[testNo], defaultGPUNode);
            if (status != CMA_TEST_SUCCESS)
                break;
            cmaLocalArray.FillPattern(&dstRange[testNo]);

            /* Pull the child's buffers into our local array (GPU analogue
             * of process_vm_readv).
             */
            if (hsaKmtProcessVMRead(cid, cmaLocalArray.getMemoryRange(),
                        cmaLocalArray.getValidRangeCount(),
                        cmaRemoteArray.getMemoryRange(),
                        cmaRemoteArray.getValidRangeCount(),
                        &copied) != HSAKMT_STATUS_SUCCESS) {
                status = CMA_TEST_HSA_READ_FAIL;
                break;
            }

            status = cmaLocalArray.checkPattern(&dstRange[testNo]);
            if (status != CMA_TEST_SUCCESS)
                break;

            cmaLocalArray.Destroy();
            cmaRemoteArray.Destroy();

            /* Nothing for the child to verify; tell it to advance. */
            if (write_non_block(writePipe, "NEXT", 4) < 0) {
                status = CMA_IPC_PIPE_ERROR;
                break;
            }
        } else {
            status = cmaLocalArray.Init(&srcRange[testNo], defaultGPUNode);
            if (status != CMA_TEST_SUCCESS)
                break;
            cmaLocalArray.FillPattern(&srcRange[testNo]);

            /* Push our local array into the child's buffers (GPU analogue
             * of process_vm_writev).
             */
            if (hsaKmtProcessVMWrite(cid, cmaLocalArray.getMemoryRange(),
                        cmaLocalArray.getValidRangeCount(),
                        cmaRemoteArray.getMemoryRange(),
                        cmaRemoteArray.getValidRangeCount(),
                        &copied) != HSAKMT_STATUS_SUCCESS) {
                status = CMA_TEST_HSA_WRITE_FAIL;
                break;
            }

            cmaLocalArray.Destroy();
            cmaRemoteArray.Destroy();
            /* Ask the child to verify the pattern we just wrote. */
            if (write_non_block(writePipe, "CHCK", 4) < 0) {
                status = CMA_IPC_PIPE_ERROR;
                break;
            }
        }
    } /* for loop */

    return status;
}
607
/* Test Cross Memory Attach
 * hsaKmtProcessVMRead and hsaKmtProcessVMWrite are GPU address equivalent to
 * process_vm_readv and process_vm_writev. These calls transfer data between
 * the address space of the calling process ("the local process") and the process
 * identified by pid ("the remote process").
 *
 * In the tests parent process will be the local process and child will be
 * the remote.
 */
TEST_F(KFDIPCTest, CrossMemoryAttachTest) {
    TEST_START(TESTPROFILE_RUNALL)

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    int pipeCtoP[2], pipePtoC[2];  /* child-to-parent and parent-to-child pipes */
    int status;

    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    if (!GetVramSize(defaultGPUNode)) {
        LOG() << "Skipping test: No VRAM found." << std::endl;
        return;
    }

    /* Create Pipes for communicating shared handles */
    ASSERT_EQ(pipe2(pipeCtoP, O_NONBLOCK), 0);
    ASSERT_EQ(pipe2(pipePtoC, O_NONBLOCK), 0);

    /* Create a child process and share the above Local Memory with it.
     * NOTE(review): if fork() returns 0 but hsaKmtOpenKFD() fails, the
     * child falls through to the else branch and runs the PARENT code path
     * (including waitpid on pid 0) -- the two conditions should probably be
     * checked separately; confirm intended behavior.
     */
    m_ChildPid = fork();
    if (m_ChildPid == 0 && hsaKmtOpenKFD() == HSAKMT_STATUS_SUCCESS) {
        /* Child Process */
        status = CrossMemoryAttachChildProcess(defaultGPUNode, pipeCtoP[1],
                       pipePtoC[0], CMA_READ_TEST);
        EXPECT_EQ(status, CMA_TEST_SUCCESS) << "Child: Read Test Fail";
        status = CrossMemoryAttachChildProcess(defaultGPUNode, pipeCtoP[1],
                       pipePtoC[0], CMA_WRITE_TEST);
        EXPECT_EQ(status, CMA_TEST_SUCCESS) << "Child: Write Test Fail";
    } else {
        int childStatus;

        status = CrossMemoryAttachParentProcess(defaultGPUNode, m_ChildPid,
                    pipePtoC[1], pipeCtoP[0], CMA_READ_TEST); /* Parent process */
        EXPECT_EQ(status, CMA_TEST_SUCCESS) << "Parent: Read Test Fail";
        status = CrossMemoryAttachParentProcess(defaultGPUNode, m_ChildPid,
                    pipePtoC[1], pipeCtoP[0], CMA_WRITE_TEST);
        EXPECT_EQ(status, CMA_TEST_SUCCESS) << "Parent: Write Test Fail";

        /* Reap the child and require a clean (zero) exit. */
        waitpid(m_ChildPid, &childStatus, 0);
        EXPECT_EQ(WIFEXITED(childStatus), true);
        EXPECT_EQ(WEXITSTATUS(childStatus), 0);
    }

    /* Code path executed by both parent and child with respective fds */
    close(pipeCtoP[1]);
    close(pipeCtoP[0]);
    close(pipePtoC[1]);
    close(pipePtoC[0]);
    TEST_END
}
667
/* Test Cross Memory Attach
 *
 * hsaKmtProcessVMRead and hsaKmtProcessVMWrite are GPU address equivalent to
 * process_vm_readv and process_vm_writev. These calls are used to transfer data
 * between the address space of the calling process ("the local process") and the process
 * identified by pid ("the remote process"). However, these functions should also work
 * with a single process and single BO, which is what this test exercises:
 * all reads use getpid() as the target, i.e. the process copies within itself.
 */
TEST_F(KFDIPCTest, CMABasicTest) {
    TEST_START(TESTPROFILE_RUNALL)

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    HSAuint64 size = PAGE_SIZE;
    SDMAQueue sdmaQueue;
    HsaMemoryRange srcRange, dstRange;
    HSAuint64 copied;
    const int PATTERN1 = 0xA5A5A5A5, PATTERN2 = 0xFFFFFFFF;
    HSAKMT_STATUS status;

    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    if (!GetVramSize(defaultGPUNode)) {
        LOG() << "Skipping test: No VRAM found." << std::endl;
        return;
    }

    ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode));
    /* Scratch page used by IsPattern to read back values for checking */
    HsaMemoryBuffer tmpBuffer(PAGE_SIZE, 0, true /* zero */);
    volatile HSAuint32 *tmp = tmpBuffer.As<volatile HSAuint32 *>();

    /* Initialize test buffer. Fill first half and second half with
     * different pattern
     */
    HsaMemoryBuffer testLocalBuffer(size, defaultGPUNode, false, true);
    testLocalBuffer.Fill(PATTERN1, sdmaQueue, 0, size/2);
    testLocalBuffer.Fill(PATTERN2, sdmaQueue, size/2, size/2);

    /* Test1. Copy (or overwrite) buffer onto itself; contents must be
     * unchanged afterwards.
     */
    srcRange.MemoryAddress = testLocalBuffer.As<void*>();
    srcRange.SizeInBytes = size;
    dstRange.MemoryAddress = testLocalBuffer.As<void*>();
    dstRange.SizeInBytes = size;
    ASSERT_SUCCESS(hsaKmtProcessVMRead(getpid(), &dstRange, 1, &srcRange, 1, &copied));
    EXPECT_EQ(copied, size);

    EXPECT_TRUE(testLocalBuffer.IsPattern(0, PATTERN1, sdmaQueue, tmp));
    EXPECT_TRUE(testLocalBuffer.IsPattern(size - 4, PATTERN2, sdmaQueue, tmp));


    /* Test2. Test unaligned byte copy. Write 3 bytes to an unaligned destination address */
    const int unaligned_offset = 1;
    const int unaligned_size = 3;
    /* Mask covering the 3 copied bytes at their shifted position within
     * the dword at the destination (little-endian layout).
     */
    const int unaligned_mask = (((1 << (unaligned_size * 8)) - 1) << (unaligned_offset * 8));
    HSAuint32 expected_pattern;

    srcRange.MemoryAddress = testLocalBuffer.As<void*>();

    /* Deliberately set to value > unaligned_size. Only unaligned_size
     * should be copied since dstRange.SizeInBytes == unaligned_size
     */
    srcRange.SizeInBytes = size;

    dstRange.MemoryAddress = reinterpret_cast<void *>(testLocalBuffer.As<char*>() + (size / 2) + unaligned_offset);
    dstRange.SizeInBytes = unaligned_size;
    ASSERT_SUCCESS(hsaKmtProcessVMRead(getpid(), &dstRange, 1, &srcRange, 1, &copied));
    EXPECT_EQ(copied, unaligned_size);

    /* & binds tighter than |, so this is (PATTERN2 & ~mask) | (PATTERN1 & mask):
     * the untouched byte keeps PATTERN2, the 3 copied bytes carry PATTERN1.
     */
    expected_pattern = (PATTERN2 & ~unaligned_mask | (PATTERN1 & unaligned_mask));
    EXPECT_TRUE(testLocalBuffer.IsPattern(size/2, expected_pattern, sdmaQueue, tmp));


    /* Test3. Test overflow and expect failure */
    srcRange.MemoryAddress = testLocalBuffer.As<void*>();
    srcRange.SizeInBytes = size;
    dstRange.MemoryAddress = reinterpret_cast<void *>(testLocalBuffer.As<char*>() + 4);
    dstRange.SizeInBytes = size; /* This should overflow since offset is VA + 4 */
    status = hsaKmtProcessVMRead(getpid(), &dstRange, 1, &srcRange, 1, &copied);
    EXPECT_NE(status, HSAKMT_STATUS_SUCCESS);
    /* At most (size - 4) bytes fit before the end of the buffer */
    EXPECT_LE(copied, (size - 4));

    EXPECT_SUCCESS(sdmaQueue.Destroy());

    TEST_END
}
2222
2323 #include "KFDBaseComponentTest.hpp"
2424 #include "BaseQueue.hpp"
25 #include "IsaGenerator.hpp"
2625
2726 #ifndef __KFD_MEMORY_TEST__H__
2827 #define __KFD_MEMORY_TEST__H__
3232
3333 KFDBaseComponentTest::SetUp();
3434
35 m_pIsaGen = IsaGenerator::Create(m_FamilyId);
36
3735 ROUTINE_END
3836 }
3937
4038 void KFDLocalMemoryTest::TearDown() {
4139 ROUTINE_START
42
43 if (m_pIsaGen)
44 delete m_pIsaGen;
45 m_pIsaGen = NULL;
4640
4741 KFDBaseComponentTest::TearDown();
4842
106100
107101 srcSysBuffer.Fill(0x01010101);
108102
109 m_pIsaGen->GetCopyDwordIsa(isaBuffer);
103 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
110104
111105 ASSERT_SUCCESS(hsaKmtMapMemoryToGPUNodes(srcLocalBuffer.As<void*>(), srcLocalBuffer.Size(), &AlternateVAGPU,
112106 mapFlags, 1, reinterpret_cast<HSAuint32 *>(&defaultGPUNode)));
163157
164158 SysBufferA.Fill(0x01010101);
165159
166 m_pIsaGen->GetCopyDwordIsa(isaBuffer);
160 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
167161
168162 ASSERT_SUCCESS(queue.Create(defaultGPUNode));
169163 queue.SetSkipWaitConsump(0);
302296 PM4Queue queue;
303297 ASSERT_SUCCESS(queue.Create(defaultGPUNode));
304298 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode);
305 m_pIsaGen->GetCopyDwordIsa(isaBuffer);
299
300 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
306301
307302 /* Allocate and test memory using the strategy explained at the top */
308303 HSAKMT_STATUS status;
2525
2626 #include <gtest/gtest.h>
2727
28 #include "IsaGenerator.hpp"
2928 #include "KFDBaseComponentTest.hpp"
3029
3130 class KFDLocalMemoryTest : public KFDBaseComponentTest {
3231 public:
33 KFDLocalMemoryTest() :m_pIsaGen(NULL) {}
32 KFDLocalMemoryTest() {}
3433 ~KFDLocalMemoryTest() {}
3534
3635 protected:
3736 virtual void SetUp();
3837 virtual void TearDown();
39
40 protected: // Members
41 IsaGenerator* m_pIsaGen;
4238 };
4339
4440 #endif // __KFD_LOCALMEMORY_TEST__H__
3838 #include "SDMAPacket.hpp"
3939 #include "linux/kfd_ioctl.h"
4040
41 const char* gfx8_ScratchCopyDword =
42 "\
43 shader ScratchCopyDword\n\
44 asic(VI)\n\
45 type(CS)\n\
46 /*copy the parameters from scalar registers to vector registers*/\n\
47 v_mov_b32 v0, s0\n\
48 v_mov_b32 v1, s1\n\
49 v_mov_b32 v2, s2\n\
50 v_mov_b32 v3, s3\n\
51 /*set up the scratch parameters. This assumes a single 16-reg block.*/\n\
52 s_mov_b32 flat_scratch_lo, 8/*2 dwords of scratch per thread*/\n\
53 s_mov_b32 flat_scratch_hi, 0/*offset in units of 256bytes*/\n\
54 /*copy a dword between the passed addresses*/\n\
55 flat_load_dword v4, v[0:1] slc\n\
56 s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
57 flat_store_dword v[2:3], v4 slc\n\
58 \n\
59 s_endpgm\n\
60 \n\
61 end\n\
62 ";
63
64 const char* gfx9_ScratchCopyDword =
65 "\
66 shader ScratchCopyDword\n\
67 asic(GFX9)\n\
68 type(CS)\n\
69 /*copy the parameters from scalar registers to vector registers*/\n\
70 v_mov_b32 v0, s0\n\
71 v_mov_b32 v1, s1\n\
72 v_mov_b32 v2, s2\n\
73 v_mov_b32 v3, s3\n\
74 /*set up the scratch parameters. This assumes a single 16-reg block.*/\n\
75 s_mov_b32 flat_scratch_lo, s4\n\
76 s_mov_b32 flat_scratch_hi, s5\n\
77 /*copy a dword between the passed addresses*/\n\
78 flat_load_dword v4, v[0:1] slc\n\
79 s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
80 flat_store_dword v[2:3], v4 slc\n\
81 \n\
82 s_endpgm\n\
83 \n\
84 end\n\
85 ";
86 const char* gfx10_ScratchCopyDword =
87 "\
88 shader ScratchCopyDword\n\
89 asic(GFX10)\n\
90 type(CS)\n\
91 wave_size(32)\n\
92 /*copy the parameters from scalar registers to vector registers*/\n\
93 v_mov_b32 v0, s0\n\
94 v_mov_b32 v1, s1\n\
95 v_mov_b32 v2, s2\n\
96 v_mov_b32 v3, s3\n\
97 /*set up the scratch parameters. This assumes a single 16-reg block.*/\n\
98 s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s4\n\
99 s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s5\n\
100 /*copy a dword between the passed addresses*/\n\
101 flat_load_dword v4, v[0:1] slc\n\
102 s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
103 flat_store_dword v[2:3], v4 slc\n\
104 \n\
105 s_endpgm\n\
106 \n\
107 end\n\
108 ";
109
110 const char* aldbrn_ScratchCopyDword =
111 "\
112 shader ScratchCopyDword\n\
113 asic(ALDEBARAN)\n\
114 type(CS)\n\
115 /*copy the parameters from scalar registers to vector registers*/\n\
116 v_mov_b32 v0, s0\n\
117 v_mov_b32 v1, s1\n\
118 v_mov_b32 v2, s2\n\
119 v_mov_b32 v3, s3\n\
120 /*set up the scratch parameters. This assumes a single 16-reg block.*/\n\
121 s_mov_b32 flat_scratch_lo, s4\n\
122 s_mov_b32 flat_scratch_hi, s5\n\
123 /*copy a dword between the passed addresses*/\n\
124 flat_load_dword v4, v[0:1] slc\n\
125 s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
126 flat_store_dword v[2:3], v4 slc\n\
127 \n\
128 s_endpgm\n\
129 \n\
130 end\n\
131 ";
132
133
134
135 /* Continuously poll src buffer and check buffer value
136 * After src buffer is filled with specific value (0x5678,
137 * by host program), fill dst buffer with specific
138 * value(0x5678) and quit
139 */
140 const char* gfx9_PollMemory =
141 "\
142 shader ReadMemory\n\
143 wave_size(32)\n\
144 type(CS)\n\
145 /* Assume src address in s0, s1 and dst address in s2, s3*/\n\
146 s_movk_i32 s18, 0x5678\n\
147 LOOP:\n\
148 s_load_dword s16, s[0:1], 0x0 glc\n\
149 s_cmp_eq_i32 s16, s18\n\
150 s_cbranch_scc0 LOOP\n\
151 s_store_dword s18, s[2:3], 0x0 glc\n\
152 s_endpgm\n\
153 end\n\
154 ";
155
156 /* Similar to gfx9_PollMemory except that the buffer
157 * polled can be Non-coherant memory. SCC system-level
158 * cache coherence is not supported in scalar (smem) path.
159 * Use vmem operations with scc
160 */
161 const char* gfx9_PollNCMemory =
162 "\
163 shader ReadMemory\n\
164 asic(ALDEBARAN)\n\
165 wave_size(32)\n\
166 type(CS)\n\
167 /* Assume src address in s0, s1 and dst address in s2, s3*/\n\
168 v_mov_b32 v6, 0x5678\n\
169 v_mov_b32 v0, s0\n\
170 v_mov_b32 v1, s1\n\
171 LOOP:\n\
172 flat_load_dword v4, v[0:1] scc\n\
173 v_cmp_eq_u32 vcc, v4, v6\n\
174 s_cbranch_vccz LOOP\n\
175 v_mov_b32 v0, s2\n\
176 v_mov_b32 v1, s3\n\
177 flat_store_dword v[0:1], v6 scc\n\
178 s_endpgm\n\
179 end\n\
180 ";
181
182 const char* gfx10_PollMemory =
183 "\
184 shader ReadMemory\n\
185 wave_size(32)\n\
186 type(CS)\n\
187 /* Assume src address in s0, s1 and dst address in s2, s3*/\n\
188 s_movk_i32 s18, 0x5678\n\
189 v_mov_b32 v0, s2\n\
190 v_mov_b32 v1, s3\n\
191 v_mov_b32 v2, 0x5678\n\
192 LOOP:\n\
193 s_load_dword s16, s[0:1], 0x0 glc\n\
194 s_cmp_eq_i32 s16, s18\n\
195 s_cbranch_scc0 LOOP\n\
196 flat_store_dword v[0,1], v2 slc\n\
197 s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
198 s_endpgm\n\
199 end\n\
200 ";
201
202 /* Input: A buffer of at least 3 dwords.
203 * DW0: used as a signal. 0xcafe means it is signaled
204 * DW1: Input buffer for device to read.
205 * DW2: Output buffer for device to write.
206 * Once receive signal, device will copy DW1 to DW2
207 * This shader continously poll the signal buffer,
208 * Once signal buffer is signaled, it copies input buffer
209 * to output buffer
210 */
211 const char* gfx9_CopyOnSignal =
212 "\
213 shader CopyOnSignal\n\
214 wave_size(32)\n\
215 type(CS)\n\
216 /* Assume input buffer in s0, s1 */\n\
217 s_mov_b32 s18, 0xcafe\n\
218 POLLSIGNAL:\n\
219 s_load_dword s16, s[0:1], 0x0 glc\n\
220 s_cmp_eq_i32 s16, s18\n\
221 s_cbranch_scc0 POLLSIGNAL\n\
222 s_load_dword s17, s[0:1], 0x4 glc\n\
223 s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
224 s_store_dword s17, s[0:1], 0x8 glc\n\
225 s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
226 s_endpgm\n\
227 end\n\
228 ";
229
230 const char* gfx10_CopyOnSignal =
231 "\
232 shader CopyOnSignal\n\
233 wave_size(32)\n\
234 type(CS)\n\
235 /* Assume input buffer in s0, s1 */\n\
236 s_add_u32 s2, s0, 0x8\n\
237 s_addc_u32 s3, s1, 0x0\n\
238 s_mov_b32 s18, 0xcafe\n\
239 v_mov_b32 v0, s0\n\
240 v_mov_b32 v1, s1\n\
241 v_mov_b32 v4, s2\n\
242 v_mov_b32 v5, s3\n\
243 POLLSIGNAL:\n\
244 s_load_dword s16, s[0:1], 0x0 glc\n\
245 s_cmp_eq_i32 s16, s18\n\
246 s_cbranch_scc0 POLLSIGNAL\n\
247 s_load_dword s17, s[0:1], 0x4 glc\n\
248 s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
249 v_mov_b32 v2, s17\n\
250 flat_store_dword v[4,5], v2 glc\n\
251 s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
252 s_endpgm\n\
253 end\n\
254 ";
255
256 /* Input0: A buffer of at least 2 dwords.
257 * DW0: used as a signal. Write 0xcafe to signal
258 * DW1: Write to this buffer for other device to read.
259 * Input1: mmio base address
260 */
261 const char* gfx9_WriteAndSignal =
262 "\
263 shader WriteAndSignal\n\
264 wave_size(32)\n\
265 type(CS)\n\
266 /* Assume input buffer in s0, s1 */\n\
267 s_mov_b32 s18, 0xbeef\n\
268 s_store_dword s18, s[0:1], 0x4 glc\n\
269 s_mov_b32 s18, 0x1\n\
270 s_store_dword s18, s[2:3], 0 glc\n\
271 s_mov_b32 s18, 0xcafe\n\
272 s_store_dword s18, s[0:1], 0x0 glc\n\
273 s_endpgm\n\
274 end\n\
275 ";
276
277 /* Continuously poll the flag at src buffer
278 * After the flag of s[0:1] is 1 filled,
279 * copy the value from s[0:1]+4 to dst buffer
280 */
281 const char* gfx9_PollAndCopy =
282 "\
283 shader CopyMemory\n\
284 wave_size(32)\n\
285 type(CS)\n\
286 /* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\
287 s_movk_i32 s18, 0x1\n\
288 LOOP:\n\
289 s_load_dword s16, s[0:1], 0x0 glc\n\
290 s_cmp_eq_i32 s16, s18\n\
291 s_cbranch_scc0 LOOP\n\
292 s_load_dword s17, s[0:1], 0x4 glc\n\
293 s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
294 s_store_dword s17, s[2:3], 0x0 glc:1\n\
295 s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
296 s_endpgm\n\
297 end\n\
298 ";
299
300 const char* gfx9aldbrn_PollAndCopy =
301 "\
302 shader CopyMemory\n\
303 wave_size(32)\n\
304 type(CS)\n\
305 /* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\
306 v_mov_b32 v0, s0\n\
307 v_mov_b32 v1, s1\n\
308 v_mov_b32 v18, 0x1\n\
309 LOOP:\n\
310 flat_load_dword v16, v[0:1] glc\n\
311 s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
312 v_cmp_eq_i32 vcc, v16, v18\n\
313 s_cbranch_vccz LOOP\n\
314 buffer_invl2\n\
315 s_load_dword s17, s[0:1], 0x4 glc\n\
316 s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
317 s_store_dword s17, s[2:3], 0x0 glc\n\
318 s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
319 buffer_wbl2\n\
320 s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
321 s_endpgm\n\
322 end\n\
323 ";
324
325 /* Input0: A buffer of at least 2 dwords.
326 * DW0: used as a signal. Write 0x1 to signal
327 * DW1: Write the value from 2nd input buffer
328 * for other device to read.
329 * Input1: A buffer of at least 2 dwords.
330 * DW0: used as the value to be written.
331 */
332 const char* gfx9aldbrn_WriteFlagAndValue =
333 "\
334 shader WriteMemory\n\
335 wave_size(32)\n\
336 type(CS)\n\
337 /* Assume two inputs buffer in s[0:1] and s[2:3]*/\n\
338 v_mov_b32 v0, s0\n\
339 v_mov_b32 v1, s1\n\
340 s_load_dword s18, s[2:3], 0x0 glc\n\
341 s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
342 s_store_dword s18, s[0:1], 0x4 glc\n\
343 s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
344 buffer_wbl2\n\
345 s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
346 v_mov_b32 v16, 0x1\n\
347 flat_store_dword v[0:1], v16 glc\n\
348 s_endpgm\n\
349 end\n\
350 ";
351
352 const char* gfx10_WriteAndSignal =
353 "\
354 shader WriteAndSignal\n\
355 wave_size(32)\n\
356 type(CS)\n\
357 /* Assume input buffer in s0, s1 */\n\
358 s_add_u32 s4, s0, 0x4\n\
359 s_addc_u32 s5, s1, 0x0\n\
360 v_mov_b32 v0, s0\n\
361 v_mov_b32 v1, s1\n\
362 v_mov_b32 v2, s2\n\
363 v_mov_b32 v3, s3\n\
364 v_mov_b32 v4, s4\n\
365 v_mov_b32 v5, s5\n\
366 v_mov_b32 v18, 0xbeef\n\
367 flat_store_dword v[4:5], v18 glc\n\
368 v_mov_b32 v18, 0x1\n\
369 flat_store_dword v[2:3], v18 glc\n\
370 v_mov_b32 v18, 0xcafe\n\
371 flat_store_dword v[0:1], v18 glc\n\
372 s_endpgm\n\
373 end\n\
374 ";
375
376 //These gfx9_PullMemory, gfx9_CopyOnSignal, gfx9_WriteAndSignal shaders can be used by both gfx9 and gfx10
377
37841 void KFDMemoryTest::SetUp() {
37942 ROUTINE_START
38043
38144 KFDBaseComponentTest::SetUp();
38245
383 m_pIsaGen = IsaGenerator::Create(m_FamilyId);
384
38546 ROUTINE_END
38647 }
38748
38849 void KFDMemoryTest::TearDown() {
38950 ROUTINE_START
390
391 if (m_pIsaGen)
392 delete m_pIsaGen;
393 m_pIsaGen = NULL;
39451
39552 KFDBaseComponentTest::TearDown();
39653
507164 HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode);
508165
509166 const char *pReadMemory;
510 if (m_FamilyId < FAMILY_NV)
511 pReadMemory = gfx9_PollMemory;
512 else
513 pReadMemory = gfx10_PollMemory;
514
515167 if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode))
516168 /* On A+A system memory is mapped as NC */
517 m_pIsaGen->CompileShader(gfx9_PollNCMemory, "ReadMemory", isaBuffer);
169 pReadMemory = PollNCMemoryIsa;
518170 else
519 m_pIsaGen->CompileShader(pReadMemory, "ReadMemory", isaBuffer);
171 pReadMemory = PollMemoryIsa;
172
173 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pReadMemory, isaBuffer.As<char*>()));
520174
521175 PM4Queue pm4Queue;
522176 ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
591245 m_MemoryFlags.ui32.NoNUMABind = 1;
592246 EXPECT_SUCCESS(hsaKmtAllocMemory(0 /* system */, PAGE_SIZE, m_MemoryFlags, reinterpret_cast<void**>(&pDb)));
593247
248 TEST_END
249 }
250
251 // Basic test for hsaKmtAllocMemory
252 TEST_F(KFDMemoryTest, MemoryAllocAll) {
253 TEST_START(TESTPROFILE_RUNALL)
254
255 int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
256 HsaMemFlags memFlags = {0};
257 memFlags.ui32.NonPaged = 1; // sys mem vs vram
258 HSAuint64 available;
259 void *object = NULL;
260 int shrink = 21, success = HSAKMT_STATUS_NO_MEMORY;
261
262 EXPECT_SUCCESS(hsaKmtAvailableMemory(defaultGPUNode, &available));
263 LOG() << "Available: " << available << " bytes" << std::endl;
264 for (int i = 0; i < available >> shrink; i++) {
265 HSAuint64 size = available - ((HSAuint64)i << shrink);
266 if (hsaKmtAllocMemory(defaultGPUNode, size, memFlags, &object) == HSAKMT_STATUS_SUCCESS) {
267 LOG() << "Allocated: " << size << " bytes" << std::endl;
268 success = hsaKmtFreeMemory(object, available);
269 break;
270 }
271 }
272 EXPECT_SUCCESS(success);
594273 TEST_END
595274 }
596275
673352 ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode));
674353
675354 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
676 m_pIsaGen->GetCopyDwordIsa(isaBuffer);
355
356 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
677357
678358 /* First submit just so the queues are not empty, and to get the
679359 * TLB populated (in case we need to flush TLBs somewhere after
854534 // Initialize the srcBuffer to some fixed value
855535 srcMemBuffer.Fill(0x01010101);
856536
857 const char *pScratchCopyDword;
858 if (m_FamilyId < FAMILY_AI)
859 pScratchCopyDword = gfx8_ScratchCopyDword;
860 else if (m_FamilyId < FAMILY_AL)
861 pScratchCopyDword = gfx9_ScratchCopyDword;
862 else if (m_FamilyId == FAMILY_AL)
863 pScratchCopyDword = aldbrn_ScratchCopyDword;
864 else
865 pScratchCopyDword = gfx10_ScratchCopyDword;
866 m_pIsaGen->CompileShader(pScratchCopyDword, "ScratchCopyDword", isaBuffer);
537 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ScratchCopyDwordIsa, isaBuffer.As<char*>()));
867538
868539 const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode);
869540
1143814 TEST_END
1144815 }
1145816
817 #define VRAM_ALLOCATION_ALIGN (1 << 21) //Align VRAM allocations to 2MB
1146818 TEST_F(KFDMemoryTest, MMBench) {
1147819 TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
1148820 TEST_START(TESTPROFILE_RUNALL);
1253925 memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
1254926 memFlags.ui32.HostAccess = 0;
1255927 memFlags.ui32.NonPaged = 1;
1256 /* Upper limit of buffer number to fit 90% vram size */
1257 bufLimit = ((vramSizeMB << 20) * 8 / 10) / bufSize ;
928
929 /* Buffer sizes are 2MB aligned to match new allocation policy.
930 * Upper limit of buffer number to fit 80% vram size.
931 */
932 bufLimit = ((vramSizeMB << 20) * 8 / 10) / ALIGN_UP(bufSize, VRAM_ALLOCATION_ALIGN);
1258933
1259934 if (bufLimit == 0)
1260935 continue; // skip when bufSize > vram
17271402 // dstBuffer is cpu accessible gtt memory
17281403 HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode);
17291404
1730 const char *pScratchCopyDword;
1731 if (m_FamilyId < FAMILY_AI)
1732 pScratchCopyDword = gfx8_ScratchCopyDword;
1733 else if (m_FamilyId < FAMILY_AL)
1734 pScratchCopyDword = gfx9_ScratchCopyDword;
1735 else if (m_FamilyId == FAMILY_AL)
1736 pScratchCopyDword = aldbrn_ScratchCopyDword;
1737 else
1738 pScratchCopyDword = gfx10_ScratchCopyDword;
1739
1740 m_pIsaGen->CompileShader(pScratchCopyDword, "ScratchCopyDword", isaBuffer);
1405 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ScratchCopyDwordIsa, isaBuffer.As<char*>()));
1406
17411407 Dispatch dispatch0(isaBuffer);
17421408 dispatch0.SetArgs(mem0, dstBuffer.As<void*>());
17431409 dispatch0.Submit(queue);
19201586 TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
19211587 TEST_START(TESTPROFILE_RUNALL);
19221588
1923 const unsigned nBufs = 1000; /* measure us, report ns */
1589 unsigned nBufs = 1000; /* measure us, report ns */
19241590 unsigned testIndex, sizeIndex, memType;
19251591 const unsigned nMemTypes = 2;
19261592 const char *memTypeStrings[nMemTypes] = {"SysMem", "VRAM"};
19651631 unsigned memType = _TEST_MEMTYPE(testIndex);
19661632 HSAuint64 mcpRTime, mcpWTime, accessRTime, accessWTime;
19671633 HSAuint32 allocNode;
1634 unsigned bufLimit;
19681635
19691636 if ((testIndex & (nSizes-1)) == 0)
19701637 LOG() << "----------------------------------------------------------------------" << std::endl;
19811648 memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
19821649 memFlags.ui32.HostAccess = 1;
19831650 memFlags.ui32.NonPaged = 1;
1651
1652 /* Buffer sizes are 2MB aligned to match new allocation policy.
1653 * Upper limit of buffer number to fit 80% vram size.
1654 */
1655 bufLimit = ((vramSizeMB << 20) * 8 / 10) / ALIGN_UP(bufSize, VRAM_ALLOCATION_ALIGN);
1656 if (bufLimit == 0)
1657 continue; // skip when bufSize > vram
1658
1659 /* When vram is too small to fit all the buffers, fill 80% vram size*/
1660 nBufs = std::min(nBufs , bufLimit);
19841661 }
19851662
19861663 for (i = 0; i < nBufs; i++)
21081785 PM4Queue queue;
21091786 ASSERT_SUCCESS(queue.Create(defaultGPUNode));
21101787 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
2111 const char *pCopyOnSignal;
2112 if (m_FamilyId < FAMILY_NV)
2113 pCopyOnSignal = gfx9_CopyOnSignal;
2114 else
2115 pCopyOnSignal = gfx10_CopyOnSignal;
2116 m_pIsaGen->CompileShader(pCopyOnSignal, "CopyOnSignal", isaBuffer);
1788
1789 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyOnSignalIsa, isaBuffer.As<char*>()));
1790
21171791 Dispatch dispatch0(isaBuffer);
21181792 dispatch0.SetArgs(buffer, NULL);
21191793 dispatch0.Submit(queue);
22331907 PM4Queue queue;
22341908 ASSERT_SUCCESS(queue.Create(nodes[0]));
22351909 HsaMemoryBuffer isaBuffer(PAGE_SIZE, nodes[0], true/*zero*/, false/*local*/, true/*exec*/);
2236 const char *pCopyOnSignal;
2237 if (m_FamilyId < FAMILY_NV)
2238 pCopyOnSignal = gfx9_CopyOnSignal;
2239 else
2240 pCopyOnSignal = gfx10_CopyOnSignal;
2241 m_pIsaGen->CompileShader(pCopyOnSignal, "CopyOnSignal", isaBuffer);
1910
1911 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyOnSignalIsa, isaBuffer.As<char*>()));
1912
22421913 Dispatch dispatch(isaBuffer);
22431914 dispatch.SetArgs(buffer, NULL);
22441915 dispatch.Submit(queue);
22461917 PM4Queue queue0;
22471918 ASSERT_SUCCESS(queue0.Create(nodes[1]));
22481919 HsaMemoryBuffer isaBuffer0(PAGE_SIZE, nodes[1], true/*zero*/, false/*local*/, true/*exec*/);
2249 const char *pWriteAndSignal;
2250 if (m_FamilyId < FAMILY_NV)
2251 pWriteAndSignal = gfx9_WriteAndSignal;
2252 else
2253 pWriteAndSignal = gfx10_WriteAndSignal;
2254 m_pIsaGen->CompileShader(pWriteAndSignal, "WriteAndSignal", isaBuffer0);
1920
1921 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(WriteAndSignalIsa, isaBuffer0.As<char*>()));
1922
22551923 Dispatch dispatch0(isaBuffer0);
22561924 dispatch0.SetArgs(buffer, mmioBase);
22571925 dispatch0.Submit(queue0);
23031971 PM4Queue queue;
23041972 ASSERT_SUCCESS(queue.Create(defaultGPUNode));
23051973 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
2306 m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer);
1974
1975 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa, isaBuffer.As<char*>()));
1976
23071977 Dispatch dispatch(isaBuffer);
23081978 dispatch.SetArgs(buffer.As<int*>(), buffer.As<int*>()+dwLocation);
23091979 dispatch.Submit(queue);
23562026 PM4Queue queue;
23572027 ASSERT_SUCCESS(queue.Create(defaultGPUNode));
23582028 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
2359 m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer);
2029
2030 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa, isaBuffer.As<char*>()));
2031
23602032 Dispatch dispatch(isaBuffer);
23612033 dispatch.SetArgs(buffer, buffer+100);
23622034 dispatch.Submit(queue);
24182090 PM4Queue queue;
24192091 ASSERT_SUCCESS(queue.Create(defaultGPUNode));
24202092 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
2421 m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer);
2093
2094 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa, isaBuffer.As<char*>()));
2095
24222096 Dispatch dispatch(isaBuffer);
24232097 dispatch.SetArgs(buffer.As<int*>(), buffer.As<int*>()+dwLocation);
24242098 dispatch.Submit(queue);
24332107 ASSERT_SUCCESS(queue1.Create(nondefaultNode));
24342108 buffer.Fill(0x5678, sdmaQueue, dwLocation1*sizeof(int), 4);
24352109 HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/);
2436 m_pIsaGen->GetCopyDwordIsa(isaBuffer1);
2110
2111 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
2112
24372113 Dispatch dispatch1(isaBuffer1);
24382114 dispatch1.SetArgs(buffer.As<int*>()+dwLocation1, buffer.As<int*>());
24392115 dispatch1.Submit(queue1);
24992175 PM4Queue queue;
25002176 ASSERT_SUCCESS(queue.Create(defaultGPUNode));
25012177 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
2502 m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
2178
2179 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa, isaBuffer.As<char*>()));
2180
25032181 Dispatch dispatch(isaBuffer);
25042182 dispatch.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwLocation);
25052183 dispatch.Submit(queue);
25142192 PM4Queue queue1;
25152193 ASSERT_SUCCESS(queue1.Create(nondefaultNode));
25162194 HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/);
2517 m_pIsaGen->CompileShader(gfx9aldbrn_WriteFlagAndValue, "WriteMemory", isaBuffer1);
2195
2196 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(WriteFlagAndValueIsa, isaBuffer1.As<char*>()));
2197
25182198 Dispatch dispatch1(isaBuffer1);
25192199 dispatch1.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwSource);
25202200 dispatch1.Submit(queue1);
25682248 PM4Queue queue;
25692249 ASSERT_SUCCESS(queue.Create(defaultGPUNode));
25702250 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
2571 m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
2251
2252 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa, isaBuffer.As<char*>()));
2253
25722254 Dispatch dispatch(isaBuffer);
25732255 dispatch.SetArgs(buffer, buffer+dwLocation);
25742256 dispatch.Submit(queue);
26072289 return;
26082290 }
26092291
2292 int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
2293 const int dwLocation = 0x80;
2294
2295 if (!m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) {
2296 LOG() << "Skipping test: XGMI link to CPU is required." << std::endl;
2297 return;
2298 }
2299
26102300 unsigned int *fineBuffer = NULL;
26112301 unsigned int tmp;
2612
2613 int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
2614 const int dwLocation = 0x80;
26152302
26162303 ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode /* system */, PAGE_SIZE, m_MemoryFlags,
26172304 reinterpret_cast<void**>(&fineBuffer)));
26262313 ASSERT_SUCCESS(queue.Create(defaultGPUNode));
26272314 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
26282315
2629 if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode))
2630 m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
2631 else
2632 m_pIsaGen->CompileShader(gfx9_PollAndCopy, "CopyMemory", isaBuffer);
2316 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa, isaBuffer.As<char*>()));
26332317
26342318 Dispatch dispatch(isaBuffer);
26352319 dispatch.SetArgs(fineBuffer, fineBuffer+dwLocation);
2121 */
2222
2323 #include "KFDBaseComponentTest.hpp"
24 #include "IsaGenerator.hpp"
2524
2625 #ifndef __KFD_MEMORY_TEST__H__
2726 #define __KFD_MEMORY_TEST__H__
3231 */
3332 class KFDMemoryTest : public KFDBaseComponentTest {
3433 public:
35 KFDMemoryTest(void) :m_pIsaGen(NULL) {}
34 KFDMemoryTest(void) {}
3635 ~KFDMemoryTest(void) {}
3736 protected:
3837 virtual void SetUp();
3938 virtual void TearDown();
4039
4140 protected:
42 IsaGenerator* m_pIsaGen;
43
4441 void BinarySearchLargestBuffer(int allocNode, const HsaMemFlags &memFlags,
4542 HSAuint64 highMB, int nodeToMap,
4643 HSAuint64 *lastSizeMB);
3838
3939 KFDBaseComponentTest::SetUp();
4040
41 m_pIsaGen = IsaGenerator::Create(m_FamilyId);
42
4341 ROUTINE_END
4442 }
4543
4644 void KFDQMTest::TearDown() {
4745 ROUTINE_START
48
49 if (m_pIsaGen)
50 delete m_pIsaGen;
51 m_pIsaGen = NULL;
5246
5347 KFDBaseComponentTest::TearDown();
5448
676670 TEST_END
677671 }
678672
679 /* A simple isa loop program with dense mathematic operations
680 * s1 controls the number iterations of the loop
681 * This shader can be used by GFX8, GFX9 and GFX10
682 */
683 static const char *loop_isa = \
684 "\
685 shader loop_isa\n\
686 wave_size(32)\n\
687 type(CS)\n\
688 s_movk_i32 s0, 0x0008\n\
689 s_movk_i32 s1, 0x00ff\n\
690 v_mov_b32 v0, 0\n\
691 v_mov_b32 v1, 0\n\
692 v_mov_b32 v2, 0\n\
693 v_mov_b32 v3, 0\n\
694 v_mov_b32 v4, 0\n\
695 v_mov_b32 v5, 0\n\
696 v_mov_b32 v6, 0\n\
697 v_mov_b32 v7, 0\n\
698 v_mov_b32 v8, 0\n\
699 v_mov_b32 v9, 0\n\
700 v_mov_b32 v10, 0\n\
701 v_mov_b32 v11, 0\n\
702 v_mov_b32 v12, 0\n\
703 v_mov_b32 v13, 0\n\
704 v_mov_b32 v14, 0\n\
705 v_mov_b32 v15, 0\n\
706 v_mov_b32 v16, 0\n\
707 LOOP:\n\
708 s_mov_b32 s8, s4\n\
709 s_mov_b32 s9, s1\n\
710 s_mov_b32 s10, s6\n\
711 s_mov_b32 s11, s7\n\
712 s_cmp_le_i32 s1, s0\n\
713 s_cbranch_scc1 END_OF_PGM\n\
714 s_buffer_load_dwordx8 s[8:15], s[8:11], 0x10\n\
715 v_add_f32 v0, 2.0, v0\n\
716 v_cvt_f32_i32 v17, s1\n\
717 s_waitcnt lgkmcnt(0)\n\
718 v_add_f32 v18, s8, v17\n\
719 v_add_f32 v19, s9, v17\n\
720 v_add_f32 v20, s10, v17\n\
721 v_add_f32 v21, s11, v17\n\
722 v_add_f32 v22, s12, v17\n\
723 v_add_f32 v23, s13, v17\n\
724 v_add_f32 v24, s14, v17\n\
725 v_add_f32 v17, s15, v17\n\
726 v_log_f32 v25, v18\n\
727 v_mul_f32 v25, v22, v25\n\
728 v_exp_f32 v25, v25\n\
729 v_log_f32 v26, v19\n\
730 v_mul_f32 v26, v23, v26\n\
731 v_exp_f32 v26, v26\n\
732 v_log_f32 v27, v20\n\
733 v_mul_f32 v27, v24, v27\n\
734 v_exp_f32 v27, v27\n\
735 v_log_f32 v28, v21\n\
736 v_mul_f32 v28, v17, v28\n\
737 v_exp_f32 v28, v28\n\
738 v_add_f32 v5, v5, v25\n\
739 v_add_f32 v6, v6, v26\n\
740 v_add_f32 v7, v7, v27\n\
741 v_add_f32 v8, v8, v28\n\
742 v_mul_f32 v18, 0x3fb8aa3b, v18\n\
743 v_exp_f32 v18, v18\n\
744 v_mul_f32 v19, 0x3fb8aa3b, v19\n\
745 v_exp_f32 v19, v19\n\
746 v_mul_f32 v20, 0x3fb8aa3b, v20\n\
747 v_exp_f32 v20, v20\n\
748 v_mul_f32 v21, 0x3fb8aa3b, v21\n\
749 v_exp_f32 v21, v21\n\
750 v_add_f32 v9, v9, v18\n\
751 v_add_f32 v10, v10, v19\n\
752 v_add_f32 v11, v11, v20\n\
753 v_add_f32 v12, v12, v21\n\
754 v_sqrt_f32 v18, v22\n\
755 v_sqrt_f32 v19, v23\n\
756 v_sqrt_f32 v20, v24\n\
757 v_sqrt_f32 v21, v17\n\
758 v_add_f32 v13, v13, v18\n\
759 v_add_f32 v14, v14, v19\n\
760 v_add_f32 v15, v15, v20\n\
761 v_add_f32 v16, v16, v21\n\
762 v_rsq_f32 v18, v22\n\
763 v_rsq_f32 v19, v23\n\
764 v_rsq_f32 v20, v24\n\
765 v_rsq_f32 v17, v17\n\
766 v_add_f32 v1, v1, v18\n\
767 v_add_f32 v2, v2, v19\n\
768 v_add_f32 v3, v3, v20\n\
769 v_add_f32 v4, v4, v17\n\
770 s_add_u32 s0, s0, 1\n\
771 s_branch LOOP\n\
772 END_OF_PGM:\n\
773 s_endpgm\n\
774 end\n\
775 ";
776
777673 HSAint64 KFDQMTest::TimeConsumedwithCUMask(int node, uint32_t* mask, uint32_t mask_count) {
778674 HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/);
779675 HsaMemoryBuffer dstBuffer(PAGE_SIZE, node, true, false, false);
780676 HsaMemoryBuffer ctlBuffer(PAGE_SIZE, node, true, false, false);
781677
782 m_pIsaGen = IsaGenerator::Create(m_FamilyId);
783 m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer);
678 EXPECT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As<char*>()));
784679
785680 Dispatch dispatch(isaBuffer);
786681 dispatch.SetDim(1024, 16, 16);
837732 TEST_START(TESTPROFILE_RUNALL);
838733 int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
839734 ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
840 m_pIsaGen = IsaGenerator::Create(m_FamilyId);
841735
842736 if (m_FamilyId >= FAMILY_VI) {
843737 const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode);
981875 HSAint32 *syncBuffer = syncBuf.As<HSAint32*>();
982876 HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/);
983877
984 m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer);
878 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As<char*>()));
985879
986880 Dispatch dispatch[2] = {
987881 Dispatch(isaBuffer, true),
1046940 HSAint32 *syncBuffer = syncBuf.As<HSAint32*>();
1047941 HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/);
1048942
1049 m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer);
943 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As<char*>()));
1050944
1051945 Dispatch dispatch[2] = {
1052946 Dispatch(isaBuffer, true),
11391033
11401034 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
11411035
1142 m_pIsaGen->GetNoopIsa(isaBuffer);
1036 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(NoopIsa, isaBuffer.As<char*>()));
11431037
11441038 SyncDispatch(isaBuffer, NULL, NULL);
11451039
11581052
11591053 srcBuffer.Fill(0x01010101);
11601054
1161 m_pIsaGen->GetCopyDwordIsa(isaBuffer);
1055 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
11621056
11631057 SyncDispatch(isaBuffer, srcBuffer.As<void*>(), destBuffer.As<void*>());
11641058
11931087
11941088 destBuffer.Fill(0xFF);
11951089
1196 m_pIsaGen->GetCopyDwordIsa(isaBuffer);
1090 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
11971091
11981092 for (i = 0; i < MAX_CP_QUEUES; ++i)
11991093 ASSERT_SUCCESS(queues[i].Create(defaultGPUNode)) << " QueueId=" << i;
15321426
15331427 PM4Queue queue;
15341428
1535 m_pIsaGen->GetAtomicIncIsa(isaBuf);
1429 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(AtomicIncIsa, isaBuf.As<char*>()));
15361430
15371431 Dispatch dispatch(isaBuf);
15381432 dispatch.SetArgs(destBuf.As<void*>(), NULL);
15971491
15981492 srcNodeMem.Fill(0x05050505);
15991493
1600 m_pIsaGen->GetCopyDwordIsa(isaBufferSrc);
1494 ASSERT_SUCCESS(m_pAsm->RunAssemble(CopyDwordIsa));
1495
1496 m_pAsm->CopyInstrStream(isaBufferSrc.As<char*>());
16011497 SyncDispatch(isaBufferSrc, srcNodeMem.As<void*>(), shared_addr.As<void *>(), src_node);
16021498
1603 m_pIsaGen->GetCopyDwordIsa(isaBufferDst);
1499 m_pAsm->CopyInstrStream(isaBufferDst.As<char*>());
16041500 SyncDispatch(isaBufferDst, shared_addr.As<void *>(), dstNodeMem.As<void*>(), dst_node);
16051501
16061502 EXPECT_EQ(dstNodeMem.As<unsigned int*>()[0], 0x05050505);
2626 #include <gtest/gtest.h>
2727
2828 #include "PM4Queue.hpp"
29 #include "IsaGenerator.hpp"
3029 #include "KFDBaseComponentTest.hpp"
3130 #include "Dispatch.hpp"
3231
3332 class KFDQMTest : public KFDBaseComponentTest {
3433 public:
35 KFDQMTest():m_pIsaGen(NULL) {}
34 KFDQMTest() {}
3635
3736 ~KFDQMTest() {}
3837
4847 const double CuVariance = 0.15;
4948 const double CuNegVariance = 1.0 - CuVariance;
5049 const double CuPosVariance = 1.0 + CuVariance;
51 IsaGenerator* m_pIsaGen;
5250 };
5351
5452 #endif // __KFD_QCM_TEST__H__
8787
8888 for (HSAuint32 i = 0; i < count; i++) {
8989 m_pBuf = mmap(0, vramBufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
90 EXPECT_NOTNULL(m_pBuf);
90 ASSERT_NE(MAP_FAILED, m_pBuf);
9191
9292 m_Flags = (HSA_SVM_FLAGS)0;
9393 retry:
231231 WaitChildProcesses();
232232
233233 TEST_END
234 }
235
236 /* Shader to read local buffers using multiple wavefronts in parallel
237 * until address buffer is filled with specific value 0x5678 by host program,
238 * then each wavefront fills value 0x5678 at corresponding result buffer and quit
239 *
240 * initial state:
241 * s[0:1] - address buffer base address
242 * s[2:3] - result buffer base address
243 * s4 - workgroup id
244 * v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
245 * registers:
246 * v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
247 * v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
248 * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
249 * v[6:7] - local buf address used for read test
250 */
251 static const char* gfx9_ReadMemory =
252 "\
253 shader ReadMemory\n\
254 type(CS)\n\
255 \n\
256 // compute address of corresponding output buffer\n\
257 v_mov_b32 v0, s4 // use workgroup id as index\n\
258 v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
259 v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
260 v_mov_b32 v5, s3\n\
261 v_add_u32 v5, vcc_lo, v5\n\
262 \n\
263 // compute input buffer offset used to store corresponding local buffer address\n\
264 v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
265 v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
266 v_mov_b32 v3, s1\n\
267 v_add_u32 v3, vcc_lo, v3\n\
268 \n\
269 // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
270 flat_load_dwordx2 v[6:7], v[2:3] slc\n\
271 s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
272 \n\
273 v_mov_b32 v8, 0x5678\n\
274 s_movk_i32 s8, 0x5678\n\
275 L_REPEAT:\n\
276 s_load_dword s16, s[0:1], 0x0 glc\n\
277 s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
278 s_cmp_eq_i32 s16, s8\n\
279 s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
280 // loop read 64M local buffer starting at v[6:7]\n\
281 // every 4k page only read once\n\
282 v_mov_b32 v9, 0\n\
283 v_mov_b32 v10, 0x1000 // 4k page\n\
284 v_mov_b32 v11, 0x4000000 // 64M size\n\
285 v_mov_b32 v12, v6\n\
286 v_mov_b32 v13, v7\n\
287 L_LOOP_READ:\n\
288 flat_load_dwordx2 v[14:15], v[12:13] slc\n\
289 v_add_u32 v9, v9, v10 \n\
290 v_add_co_u32 v12, vcc, v12, v10\n\
291 v_add_u32 v13, vcc_lo, v13\n\
292 v_cmp_lt_u32 vcc, v9, v11\n\
293 s_cbranch_vccnz L_LOOP_READ\n\
294 s_branch L_REPEAT\n\
295 L_QUIT:\n\
296 flat_store_dword v[4:5], v8\n\
297 s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
298 s_endpgm\n\
299 end\n\
300 ";
301
302 static const char* gfx8_ReadMemory =
303 "\
304 shader ReadMemory\n\
305 asic(VI)\n\
306 type(CS)\n\
307 \n\
308 // compute address of corresponding output buffer\n\
309 v_mov_b32 v0, s4 // use workgroup id as index\n\
310 v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
311 v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
312 v_mov_b32 v5, s3\n\
313 v_addc_u32 v5, vcc, v5, 0, vcc\n\
314 \n\
315 // compute input buffer offset used to store corresponding local buffer address\n\
316 v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
317 v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
318 v_mov_b32 v3, s1\n\
319 v_addc_u32 v3, vcc, v3, 0, vcc\n\
320 \n\
321 // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
322 flat_load_dwordx2 v[6:7], v[2:3] slc\n\
323 s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
324 \n\
325 v_mov_b32 v8, 0x5678\n\
326 s_movk_i32 s8, 0x5678\n\
327 L_REPEAT:\n\
328 s_load_dword s16, s[0:1], 0x0 glc\n\
329 s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
330 s_cmp_eq_i32 s16, s8\n\
331 s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
332 // loop read 64M local buffer starting at v[6:7]\n\
333 // every 4k page only read once\n\
334 v_mov_b32 v9, 0\n\
335 v_mov_b32 v10, 0x1000 // 4k page\n\
336 v_mov_b32 v11, 0x4000000 // 64M size\n\
337 v_mov_b32 v12, v6\n\
338 v_mov_b32 v13, v7\n\
339 L_LOOP_READ:\n\
340 flat_load_dwordx2 v[14:15], v[12:13] slc\n\
341 v_add_u32 v9, vcc, v9, v10 \n\
342 v_add_u32 v12, vcc, v12, v10\n\
343 v_addc_u32 v13, vcc, v13, 0, vcc\n\
344 v_cmp_lt_u32 vcc, v9, v11\n\
345 s_cbranch_vccnz L_LOOP_READ\n\
346 s_branch L_REPEAT\n\
347 L_QUIT:\n\
348 flat_store_dword v[4:5], v8\n\
349 s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
350 s_endpgm\n\
351 end\n\
352 ";
353
354 std::string KFDSVMEvictTest::CreateShader() {
355 if (m_FamilyId >= FAMILY_AI)
356 return gfx9_ReadMemory;
357 else
358 return gfx8_ReadMemory;
359234 }
360235
361236 /* Evict and restore queue test
433308 for (i = 0; i < wavefront_num; i++)
434309 *(localBufAddr + i) = pBuffers[i];
435310
436 m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer);
311 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ReadMemoryIsa, isaBuffer.As<char*>()));
437312
438313 PM4Queue pm4Queue;
439314 ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
2727 #include <vector>
2828 #include "KFDLocalMemoryTest.hpp"
2929 #include "KFDBaseComponentTest.hpp"
30 #include "IsaGenerator.hpp"
3130
3231 // @class KFDEvictTest
3332 // Test eviction and restore procedure using two processes
2020 *
2121 */
2222 #include "KFDSVMRangeTest.hpp"
23 #include <poll.h>
2324 #include <sys/mman.h>
2425 #include <vector>
2526 #include "PM4Queue.hpp"
3334
3435 KFDBaseComponentTest::SetUp();
3536
36 m_pIsaGen = IsaGenerator::Create(m_FamilyId);
37
3837 SVMSetXNACKMode();
3938
4039 ROUTINE_END
4241
4342 void KFDSVMRangeTest::TearDown() {
4443 ROUTINE_START
45
46 if (m_pIsaGen)
47 delete m_pIsaGen;
48 m_pIsaGen = NULL;
4944
5045 SVMRestoreXNACKMode();
5146
7974
8075 srcSysBuffer.Fill(0x01010101);
8176
82 m_pIsaGen->GetCopyDwordIsa(isaBuffer);
77 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
8378
8479 ASSERT_SUCCESS(queue.Create(defaultGPUNode));
8580 queue.SetSkipWaitConsump(0);
363358 ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode));
364359
365360 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
366 m_pIsaGen->GetCopyDwordIsa(isaBuffer);
361
362 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
367363
368364 Dispatch dispatch0(isaBuffer);
369365 dispatch0.SetArgs(srcBuffer.As<void*>(), dstBuffer.As<void*>());
457453
458454 munmap(pBuf2, Buf2Size);
459455
460 m_pIsaGen->GetCopyDwordIsa(isaBuffer);
456 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
457
461458 ASSERT_SUCCESS(queue.Create(defaultGPUNode));
462459
463460 Dispatch dispatch(isaBuffer);
506503
507504 srcSysBuffer.Fill(0x01010101);
508505
509 m_pIsaGen->GetCopyDwordIsa(isaBuffer);
506 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
510507
511508 ASSERT_SUCCESS(queue.Create(defaultGPUNode));
512509 queue.SetSkipWaitConsump(0);
942939 #ifdef USE_PM4_QUEUE_TRIGGER_VM_FAULT
943940 HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode);
944941 PM4Queue queue;
945 m_pIsaGen->GetCopyDwordIsa(isaBuffer);
942
943 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
944
946945 ASSERT_SUCCESS(queue.Create(defaultGPUNode));
947946
948947 for (HSAuint64 i = 0; i < BufferSize / 8; i += 512) {
998997 return;
999998 }
1000999
1001 const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
1000 const std::vector<int> gpuNodesAll = m_NodeInfo.GetNodesWithGPU();
1001 std::vector<int> gpuNodes;
1002
1003 for (int i : gpuNodesAll) {
1004 const HsaNodeProperties *pNodeProperties;
1005
1006 pNodeProperties = m_NodeInfo.GetNodeProperties(gpuNodesAll.at(i));
1007 if (pNodeProperties->Capability.ui32.SVMAPISupported)
1008 gpuNodes.push_back(gpuNodesAll.at(i));
1009 }
10021010 if (gpuNodes.size() < 2) {
1003 LOG() << "Skipping test: at least two GPUs needed." << std::endl;
1011 LOG() << "Skipping test: at least two SVM supported GPUs needed." << std::endl;
10041012 return;
10051013 }
10061014
10731081 return;
10741082 }
10751083
1076 const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
1084 const std::vector<int> gpuNodesAll = m_NodeInfo.GetNodesWithGPU();
1085 std::vector<int> gpuNodes;
1086
1087 for (int i : gpuNodesAll) {
1088 const HsaNodeProperties *pNodeProperties;
1089
1090 pNodeProperties = m_NodeInfo.GetNodeProperties(gpuNodesAll.at(i));
1091 if (pNodeProperties->Capability.ui32.SVMAPISupported)
1092 gpuNodes.push_back(gpuNodesAll.at(i));
1093 }
10771094 if (gpuNodes.size() < 2) {
1078 LOG() << "Skipping test: at least two GPUs needed." << std::endl;
1095 LOG() << "Skipping test: at least two SVM supported GPUs needed." << std::endl;
10791096 return;
10801097 }
10811098
12361253 ASSERT_EQ(size, write(fd, buf, size));
12371254
12381255 void *MmapedFile = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
1239 ASSERT_NE(nullptr, MmapedFile);
1256 ASSERT_NE(MAP_FAILED, MmapedFile);
12401257
12411258 HsaSVMRange filebackedRange(MmapedFile, size, defaultGPUNode, defaultGPUNode);
12421259
13841401
13851402 TEST_END
13861403 }
1404
1405 /*
1406 * Test SMI HMM SVM profiling event
1407 * Use separate thread to read event the same way as ROCr and ROCProfiler
1408 */
1409 struct ReadEventThreadParams {
1410 int nodeid;
1411 HSAuint64 *pBuf;
1412 int BufSize;
1413 pthread_barrier_t *barrier;
1414 };
1415
1416 unsigned int ReadSMIEventThread(void* p) {
1417 struct ReadEventThreadParams *pArgs = (struct ReadEventThreadParams *)p;
1418 char msg[HSA_SMI_EVENT_MSG_SIZE];
1419 struct pollfd fds = {0};
1420 HSAuint64 events;
1421 int fd;
1422
1423 EXPECT_SUCCESS(hsaKmtOpenSMI(pArgs->nodeid, &fd));
1424 events = HSA_SMI_EVENT_MASK_FROM_INDEX(HSA_SMI_EVENT_INDEX_MAX) - 1;
1425 EXPECT_EQ(write(fd, &events, sizeof(events)), sizeof(events));
1426
1427 pthread_barrier_wait(pArgs->barrier);
1428
1429 fds.fd = fd;
1430 fds.events = POLLIN;
1431 EXPECT_GE(poll(&fds, 1, 1000), 0);
1432
1433 memset(msg, 0, sizeof(msg));
1434 EXPECT_GE(read(fd, msg, HSA_SMI_EVENT_MSG_SIZE), 0);
1435
1436 int event_id, pid, size, trigger, unused;
1437 HSAuint64 timestamp;
1438 HSAuint64 addr;
1439 EXPECT_EQ(sscanf(msg, "%x %ld -%d @%lx(%d) %d->%x %x:%d %d\n", &event_id, &timestamp, &pid,
1440 &addr, &size, &unused, &unused, &unused, &unused, &trigger), 10);
1441 EXPECT_EQ(event_id, HSA_SMI_EVENT_MIGRATE_START);
1442 EXPECT_EQ((HSAuint64 *)(addr << PAGE_SHIFT), pArgs->pBuf);
1443 EXPECT_EQ(size << PAGE_SHIFT, pArgs->BufSize);
1444 EXPECT_EQ(pid, getpid());
1445 EXPECT_EQ(trigger, HSA_MIGRATE_TRIGGER_PREFETCH);
1446 close(fd);
1447 return 0;
1448 }
1449
1450 TEST_F(KFDSVMRangeTest, HMMProfilingEvent) {
1451 TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
1452 TEST_START(TESTPROFILE_RUNALL);
1453
1454 if (!SVMAPISupported())
1455 return;
1456
1457 if (m_VersionInfo.KernelInterfaceMinorVersion < 10)
1458 return;
1459
1460 int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
1461 ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
1462
1463 if (!GetVramSize(defaultGPUNode)) {
1464 LOG() << "Skipping test: No VRAM found." << std::endl;
1465 return;
1466 }
1467
1468 pthread_barrier_t barrier;
1469 ASSERT_SUCCESS(pthread_barrier_init(&barrier, NULL, 2));
1470
1471 int BufSize = 16 << 10;
1472 HsaSVMRange SysBuffer(BufSize, defaultGPUNode);
1473 HSAuint64 *pBuf = SysBuffer.As<HSAuint64 *>();
1474
1475 struct ReadEventThreadParams pArgs = {defaultGPUNode, pBuf, BufSize, &barrier};
1476 uint64_t threadId;
1477 ASSERT_EQ(true, StartThread(&ReadSMIEventThread, &pArgs, threadId));
1478
1479 pthread_barrier_wait(&barrier);
1480
1481 EXPECT_SUCCESS(SVMRangePrefetchToNode(pBuf, BufSize, defaultGPUNode));
1482
1483 WaitForThread(threadId);
1484
1485 TEST_END
1486 }
1487
1488 /*
1489 * Test SVM support VRAM overcommitment
1490 *
1491 * Prefetch total VRAM size plus overCommitSize SVM range to VRAM. after VRAM is full,
1492 * KFD should support VRAM overcommitment by evicting SVM ranges to system memory to alloc
1493 * VRAM for new ranges.
1494 */
1495 TEST_F(KFDSVMRangeTest, VramOvercommitTest) {
1496 TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
1497 TEST_START(TESTPROFILE_RUNALL);
1498
1499 if (!SVMAPISupported())
1500 return;
1501
1502 int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
1503 ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
1504
1505 if (m_FamilyId < FAMILY_AI) {
1506 LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "." << std::endl;
1507 return;
1508 }
1509
1510 HSAuint64 vramSize = GetVramSize(defaultGPUNode);
1511 if (!vramSize) {
1512 LOG() << "Skipping test: No VRAM found." << std::endl;
1513 return;
1514 }
1515
1516 unsigned long overCommitSize = 1UL << 30;
1517
1518 /* With XNACK off, KFD checks that all SVM memory will fit into system memory */
1519 if (vramSize + overCommitSize > GetSysMemSize() / 2) {
1520 LOG() << "Skipping test: no enough system memory." << std::endl;
1521 return;
1522 }
1523
1524 unsigned long BufSize = 512UL << 20;
1525 unsigned long numBufs = (vramSize + overCommitSize) / BufSize;
1526 HSAKMT_STATUS ret;
1527
1528 void *pBuf[numBufs];
1529 unsigned long i;
1530
1531 for (i = 0; i < numBufs; i++) {
1532 pBuf[i] = mmap(0, BufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
1533 ASSERT_NE(MAP_FAILED, pBuf[i]);
1534
1535 ret = RegisterSVMRange(defaultGPUNode, pBuf[i], BufSize, defaultGPUNode, 0);
1536 if (ret != HSAKMT_STATUS_SUCCESS)
1537 break;
1538 }
1539
1540 EXPECT_EQ(numBufs, i);
1541
1542 while (i--)
1543 munmap(pBuf[i], BufSize);
1544
1545 TEST_END
1546 }
1547
1548 /*
1549 * Test SVM support VRAM overcommitment
1550 *
1551 * Prefetch giant overcommit SVM range to VRAM, KFD should support VRAM overcommitment
1552 * by spliting giant range into smaller ranges, evicting SVM ranges to system memory to
1553 * alloc VRAM for overcommitment ranges.
1554 */
1555 TEST_F(KFDSVMRangeTest, VramOvercommitGiantRangeTest) {
1556 TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
1557 TEST_START(TESTPROFILE_RUNALL);
1558
1559 if (!SVMAPISupported())
1560 return;
1561
1562 int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
1563 ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
1564
1565 if (m_FamilyId < FAMILY_AI) {
1566 LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "." << std::endl;
1567 return;
1568 }
1569
1570 HSAuint64 vramSize = GetVramSize(defaultGPUNode);
1571 if (!vramSize) {
1572 LOG() << "Skipping test: No VRAM found." << std::endl;
1573 return;
1574 }
1575
1576 unsigned long overCommitSize = 1UL << 30;
1577
1578 /* With XNACK off, KFD checks that all SVM memory will fit into system memory */
1579 if (vramSize + overCommitSize > GetSysMemSize() / 2) {
1580 LOG() << "Skipping test: no enough system memory." << std::endl;
1581 return;
1582 }
1583
1584 unsigned long BufSize = vramSize + overCommitSize;
1585 HSAKMT_STATUS ret;
1586 void *pBuf;
1587
1588 pBuf = mmap(0, BufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
1589 ASSERT_NE(MAP_FAILED, pBuf);
1590
1591 ret = RegisterSVMRange(defaultGPUNode, pBuf, BufSize, defaultGPUNode, 0);
1592 EXPECT_EQ (HSAKMT_STATUS_SUCCESS, ret);
1593
1594 munmap(pBuf, BufSize);
1595 TEST_END
1596 }
1597
1598 /*
1599 * Test partial range prefault
1600 *
1601 * mmap alloc 4 pages range, memset middle 2 pages, prefetch entire range to VRAM,
1602 * use sdma to memset the rest 2 pages, each page has different value 0x1, 0x2, 0x3, 0x4
1603 * then check if all page have the specific value after migrating 4 pages to system memory.
1604 */
1605 TEST_F(KFDSVMRangeTest, PrefaultPartialRangeTest) {
1606 TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
1607 TEST_START(TESTPROFILE_RUNALL);
1608
1609 if (!SVMAPISupported())
1610 return;
1611
1612 int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
1613 ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
1614
1615 if (m_FamilyId < FAMILY_AI) {
1616 LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "." << std::endl;
1617 return;
1618 }
1619
1620 unsigned long BufSize = 4 * PAGE_SIZE;
1621 HSAKMT_STATUS ret;
1622 char *pBuf;
1623
1624 pBuf = (char *)mmap(0, BufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
1625 ASSERT_NE(MAP_FAILED, pBuf);
1626
1627 memset(pBuf + PAGE_SIZE, 0x2, PAGE_SIZE);
1628 memset(pBuf + 2 * PAGE_SIZE, 0x3, PAGE_SIZE);
1629
1630 EXPECT_SUCCESS(RegisterSVMRange(defaultGPUNode, pBuf, BufSize, 0, 0));
1631 EXPECT_SUCCESS(SVMRangePrefetchToNode(pBuf, BufSize, defaultGPUNode));
1632
1633 SDMAQueue sdmaQueue;
1634 EXPECT_SUCCESS(sdmaQueue.Create(defaultGPUNode));
1635
1636 sdmaQueue.PlaceAndSubmitPacket(SDMAFillDataPacket(sdmaQueue.GetFamilyId(),
1637 pBuf, 0x01010101, PAGE_SIZE));
1638 sdmaQueue.PlaceAndSubmitPacket(SDMAFillDataPacket(sdmaQueue.GetFamilyId(),
1639 pBuf + 3 * PAGE_SIZE, 0x04040404, PAGE_SIZE));
1640 sdmaQueue.Wait4PacketConsumption();
1641
1642 EXPECT_SUCCESS(sdmaQueue.Destroy());
1643
1644 for (int i = 0; i < 4; i++)
1645 EXPECT_EQ(pBuf[i * PAGE_SIZE], i + 1);
1646
1647 munmap(pBuf, BufSize);
1648 TEST_END
1649 }
2525
2626 #include <gtest/gtest.h>
2727
28 #include "IsaGenerator.hpp"
2928 #include "KFDBaseComponentTest.hpp"
3029
3130 class KFDSVMRangeTest : public KFDBaseComponentTest {
3231 public:
33 KFDSVMRangeTest() :m_pIsaGen(NULL) {}
32 KFDSVMRangeTest() {}
3433 ~KFDSVMRangeTest() {}
3534 void SplitRangeTest(int defaultGPUNode, int prefetch_location);
3635
3736 protected:
3837 virtual void SetUp();
3938 virtual void TearDown();
40
41 protected: // Members
42 IsaGenerator* m_pIsaGen;
4339 };
4440
4541 #endif // __KFD_LOCALMEMORY_TEST__H__
5151
5252 enum KfdFamilyId {
5353 FAMILY_UNKNOWN = 0,
54 FAMILY_CI, // Sea Islands: Hawaii (P), Maui (P), Bonaire (M)
55 FAMILY_KV, // Fusion Kaveri: Spectre, Spooky; Fusion Kabini: Kalindi
56 FAMILY_VI, // Volcanic Islands: Iceland (V), Tonga (M)
57 FAMILY_CZ, // Carrizo, Nolan, Amur
58 FAMILY_AI, // Arctic Islands
59 FAMILY_RV, // Raven
60 FAMILY_AR, // Arcturus
61 FAMILY_AL, // Aldebaran
62 FAMILY_NV, // Navi10
54 FAMILY_CI, // Sea Islands: Hawaii (P), Maui (P), Bonaire (M)
55 FAMILY_KV, // Fusion Kaveri: Spectre, Spooky; Fusion Kabini: Kalindi
56 FAMILY_VI, // Volcanic Islands: Iceland (V), Tonga (M)
57 FAMILY_CZ, // Carrizo, Nolan, Amur
58 FAMILY_AI, // Arctic Islands
59 FAMILY_RV, // Raven
60 FAMILY_AR, // Arcturus
61 FAMILY_AL, // Aldebaran
62 FAMILY_NV, // Navi10
63 FAMILY_GFX11, // GFX11
6364 };
6465
6566 #endif // __KFD_TEST_FLAGS__H__
193193 case 10:
194194 familyId = FAMILY_NV;
195195 break;
196 case 11:
197 familyId = FAMILY_GFX11;
198 break;
196199 }
197200
198201 if (props->NumCPUCores && props->NumFComputeCores)
228231 }
229232
230233 return false;
234 }
235
236 const uint32_t GetGfxVersion(const HsaNodeProperties *props) {
237 return ((props->EngineId.ui32.Major << 16) |
238 (props->EngineId.ui32.Minor << 8) |
239 (props->EngineId.ui32.Stepping));
231240 }
232241
233242 HSAuint64 GetSystemTickCountInMicroSec() {
854863 m_SelfAllocated(false) {
855864 if (!m_pUser) {
856865 m_pUser = mmap(0, m_Size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
857 EXPECT_NOTNULL(m_pUser);
866 EXPECT_NE(MAP_FAILED, m_pUser);
858867 m_SelfAllocated = true;
859868 }
860869
5151 bool isTonga(const HsaNodeProperties *props);
5252 bool hasPciAtomicsSupport(int node);
5353 unsigned int FamilyIdFromNode(const HsaNodeProperties *props);
54 const uint32_t GetGfxVersion(const HsaNodeProperties *props);
5455
5556 void GetHwQueueInfo(const HsaNodeProperties *props,
5657 unsigned int *p_num_cp_queues,
3333
3434 KFDBaseComponentTest::SetUp();
3535
36 m_pIsaGen = IsaGenerator::Create(m_FamilyId);
37
3836 ROUTINE_END
3937 }
4038
4139 void RDMATest::TearDown() {
4240 ROUTINE_START
43 if (m_pIsaGen)
44 delete m_pIsaGen;
45 m_pIsaGen = NULL;
4641
4742 KFDBaseComponentTest::TearDown();
4843
7671 srcSysBuffer.Fill(0xfe);
7772
7873 /* Put 'copy dword' command to ISA buffer */
79 m_pIsaGen->GetCopyDwordIsa(isaBuffer);
74 ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
75
8076
8177 ASSERT_SUCCESS(queue.Create(defaultGPUNode));
8278 Dispatch dispatch(isaBuffer);
2525
2626 #include <gtest/gtest.h>
2727
28 #include "IsaGenerator.hpp"
2928 #include "KFDBaseComponentTest.hpp"
3029
3130 class RDMATest : public KFDBaseComponentTest {
3231 public:
33 RDMATest():m_pIsaGen(NULL) {}
32 RDMATest() {}
3433 ~RDMATest() {}
3534
3635 protected:
3736 virtual void SetUp();
3837 virtual void TearDown();
39
40 protected: // Members
41 IsaGenerator* m_pIsaGen;
4238 };
4339
4440 #endif // __RDMA_TEST__H__
0 /*
1 * Copyright (C) 2021 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
23 #include "ShaderStore.hpp"
24
/**
 * KFDASMTest List
 *
 * Shaders run through the assembler by the KFDASMTest suite to verify
 * that each source string assembles cleanly.
 * NOTE(review): PollNCMemoryIsa is not in this list — its definition
 * below describes it as deprecated, so the omission looks intentional;
 * confirm before adding it.
 */

const std::vector<const char*> ShaderList = {
    NoopIsa,
    CopyDwordIsa,
    InfiniteLoopIsa,
    AtomicIncIsa,
    ScratchCopyDwordIsa,
    PollMemoryIsa,
    CopyOnSignalIsa,
    PollAndCopyIsa,
    WriteFlagAndValueIsa,
    WriteAndSignalIsa,
    LoopIsa,
    IterateIsa,
    ReadMemoryIsa,
    GwsInitIsa,
    GwsAtomicIncreaseIsa,
};
46
47 /**
48 * Macros
49 */
50
/* Create macro for portable v_add_co_u32, v_add_co_ci_u32,
 * and v_cmp_lt_u32
 *
 * The assembler publishes the target's generation as the symbol
 * .amdgcn.gfx_generation_number; each macro selects the mnemonic and
 * carry register (vcc_lo on GFX10+, vcc on GFX9 and older) that is
 * valid for that generation, so shaders using these macros assemble
 * unchanged across GFX8/GFX9/GFX10.
 */
#define SHADER_MACROS \
    "   .text\n"\
    "   .macro V_ADD_CO_U32 vdst, src0, vsrc1\n"\
    "       .if (.amdgcn.gfx_generation_number >= 10)\n"\
    "           v_add_co_u32 \\vdst, vcc_lo, \\src0, \\vsrc1\n"\
    "       .elseif (.amdgcn.gfx_generation_number >= 9)\n"\
    "           v_add_co_u32 \\vdst, vcc, \\src0, \\vsrc1\n"\
    "       .else\n"\
    "           v_add_u32 \\vdst, vcc, \\src0, \\vsrc1\n"\
    "       .endif\n"\
    "   .endm\n"\
    "   .macro V_ADD_CO_CI_U32 vdst, src0, vsrc1\n"\
    "       .if (.amdgcn.gfx_generation_number >= 10)\n"\
    "           v_add_co_ci_u32 \\vdst, vcc_lo, \\src0, \\vsrc1, vcc_lo\n"\
    "       .elseif (.amdgcn.gfx_generation_number >= 9)\n"\
    "           v_addc_co_u32 \\vdst, vcc, \\src0, \\vsrc1, vcc\n"\
    "       .else\n"\
    "           v_addc_u32 \\vdst, vcc, \\src0, \\vsrc1, vcc\n"\
    "       .endif\n"\
    "   .endm\n"\
    "   .macro V_CMP_LT_U32 src0, vsrc1\n"\
    "       .if (.amdgcn.gfx_generation_number >= 10)\n"\
    "           v_cmp_lt_u32 vcc_lo, \\src0, \\vsrc1\n"\
    "       .else\n"\
    "           v_cmp_lt_u32 vcc, \\src0, \\vsrc1\n"\
    "       .endif\n"\
    "   .endm\n"
81
82 /**
83 * Common
84 */
85
/* Minimal shader: terminates immediately without touching any state. */
const char *NoopIsa = R"(
    .text
    s_endpgm
)";
90
/* Copy one dword.
 * Source address arrives in s[0:1], destination address in s[2:3];
 * both are moved to VGPR pairs for the flat load/store.
 */
const char *CopyDwordIsa = R"(
    .text
    v_mov_b32 v0, s0
    v_mov_b32 v1, s1
    v_mov_b32 v2, s2
    v_mov_b32 v3, s3
    flat_load_dword v4, v[0:1] glc slc
    s_waitcnt 0
    flat_store_dword v[2:3], v4 glc slc
    s_endpgm
)";
102
/* Spin forever: the branch targets itself, so s_endpgm is never
 * reached. Used where the host must observe a永 running wave —
 * NOTE(review): presumably preemption/termination tests; confirm
 * against callers.
 */
const char *InfiniteLoopIsa = R"(
    .text
    LOOP:
    s_branch LOOP
    s_endpgm
)";
109
/* Atomically increment the dword at the address passed in s[0:1].
 * GFX8+ uses flat_atomic_add with the literal 1; older parts use
 * flat_atomic_inc with compare value -1 (wrap-increment form).
 */
const char *AtomicIncIsa = R"(
    .text
    v_mov_b32 v0, s0
    v_mov_b32 v1, s1
    .if (.amdgcn.gfx_generation_number >= 8)
        v_mov_b32 v2, 1
        flat_atomic_add v3, v[0:1], v2 glc slc
    .else
        v_mov_b32 v2, -1
        flat_atomic_inc v3, v[0:1], v2 glc slc
    .endif
    s_waitcnt 0
    s_endpgm
)";
124
125 /**
126 * KFDMemoryTest
127 */
128
/* Copy one dword (src in s[0:1], dst in s[2:3]) after configuring the
 * flat-scratch registers: from s4/s5 on GFX9/GFX10+, from constants on
 * older generations.
 */
const char *ScratchCopyDwordIsa = R"(
    .text
    // Copy the parameters from scalar registers to vector registers
    .if (.amdgcn.gfx_generation_number >= 9)
        v_mov_b32 v0, s0
        v_mov_b32 v1, s1
        v_mov_b32 v2, s2
        v_mov_b32 v3, s3
    .else
        v_mov_b32_e32 v0, s0
        v_mov_b32_e32 v1, s1
        v_mov_b32_e32 v2, s2
        v_mov_b32_e32 v3, s3
    .endif
    // Setup the scratch parameters. This assumes a single 16-reg block
    .if (.amdgcn.gfx_generation_number >= 10)
        s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
        s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
    .elseif (.amdgcn.gfx_generation_number == 9)
        s_mov_b32 flat_scratch_lo, s4
        s_mov_b32 flat_scratch_hi, s5
    .else
        s_mov_b32 flat_scratch_lo, 8
        s_mov_b32 flat_scratch_hi, 0
    .endif
    // Copy a dword between the passed addresses
    flat_load_dword v4, v[0:1] slc
    s_waitcnt vmcnt(0) & lgkmcnt(0)
    flat_store_dword v[2:3], v4 slc
    s_endpgm
)";
160
/* Continuously poll src buffer and check buffer value.
 * After src buffer is filled with the specific value 0x5678
 * (by the host program), fill dst buffer with the same
 * value (0x5678) and quit.
 * src address is in s[0:1], dst address in s[2:3]; GFX10+ writes the
 * result through a flat store instead of a scalar store.
 */
const char *PollMemoryIsa = R"(
    .text
    // Assume src address in s0, s1, and dst address in s2, s3
    s_movk_i32 s18, 0x5678
    .if (.amdgcn.gfx_generation_number >= 10)
        v_mov_b32 v0, s2
        v_mov_b32 v1, s3
        v_mov_b32 v2, 0x5678
    .endif
    LOOP:
    s_load_dword s16, s[0:1], 0x0 glc
    s_cmp_eq_i32 s16, s18
    s_cbranch_scc0 LOOP
    .if (.amdgcn.gfx_generation_number >= 10)
        flat_store_dword v[0:1], v2 slc
    .else
        s_store_dword s18, s[2:3], 0x0 glc
    .endif
    s_endpgm
)";
186
/* Similar to PollMemoryIsa except that the buffer
 * polled can be non-coherent memory. SCC system-level
 * cache coherence is not supported in the scalar (smem) path,
 * so vmem operations with the scc modifier are used instead.
 *
 * Note: Only works on Aldebaran, and even then the scc modifier
 * has been defeatured. This shader is more or less
 * deprecated (it is deliberately absent from ShaderList).
 */
const char *PollNCMemoryIsa = R"(
    .text
    // Assume src address in s0, s1, and dst address in s2, s3
    v_mov_b32 v6, 0x5678
    v_mov_b32 v0, s0
    v_mov_b32 v1, s1
    LOOP:
    flat_load_dword v4, v[0:1] scc
    v_cmp_eq_u32 vcc, v4, v6
    s_cbranch_vccz LOOP
    v_mov_b32 v0, s2
    v_mov_b32 v1, s3
    flat_store_dword v[0:1], v6 scc
    s_endpgm
)";
211
/* Input: A buffer of at least 3 dwords.
 * DW0: used as a signal. 0xcafe means it is signaled.
 * DW1: Input buffer for device to read.
 * DW2: Output buffer for device to write.
 * Once it receives the signal, the device copies DW1 to DW2.
 * This shader continuously polls the signal dword; once it reads
 * 0xcafe it copies the input dword to the output dword.
 * GFX10+ writes through a flat store; older parts use a scalar store.
 */
const char *CopyOnSignalIsa = R"(
    .text
    // Assume input buffer in s0, s1
    .if (.amdgcn.gfx_generation_number >= 10)
        s_add_u32 s2, s0, 0x8
        s_addc_u32 s3, s1, 0x0
        s_mov_b32 s18, 0xcafe
        v_mov_b32 v0, s0
        v_mov_b32 v1, s1
        v_mov_b32 v4, s2
        v_mov_b32 v5, s3
    .else
        s_mov_b32 s18, 0xcafe
    .endif
    POLLSIGNAL:
    s_load_dword s16, s[0:1], 0x0 glc
    s_cmp_eq_i32 s16, s18
    s_cbranch_scc0 POLLSIGNAL
    s_load_dword s17, s[0:1], 0x4 glc
    s_waitcnt vmcnt(0) & lgkmcnt(0)
    .if (.amdgcn.gfx_generation_number >= 10)
        v_mov_b32 v2, s17
        flat_store_dword v[4:5], v2 glc
    .else
        s_store_dword s17, s[0:1], 0x8 glc
    .endif
    s_waitcnt vmcnt(0) & lgkmcnt(0)
    s_endpgm
)";
250
/* Continuously poll the flag at the src buffer.
 * After the flag at s[0:1] is filled with 1,
 * copy the value from s[0:1]+4 to the dst buffer s[2:3].
 *
 * Note: Only works on GFX9 (only used in Aldebaran tests).
 * The Aldebaran (gfx90a, stepping 10) path uses vmem loads plus
 * buffer_invl2/buffer_wbl2 to invalidate/write back the L2 cache
 * around the copy; plain GFX9 uses the scalar path.
 */
const char *PollAndCopyIsa = R"(
    .text
    // Assume src buffer in s[0:1] and dst buffer in s[2:3]
    .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_stepping == 10)
        // Path for Aldebaran
        v_mov_b32 v0, s0
        v_mov_b32 v1, s1
        v_mov_b32 v18, 0x1
        LOOP_ALDBRN:
        flat_load_dword v16, v[0:1] glc
        s_waitcnt vmcnt(0) & lgkmcnt(0)
        v_cmp_eq_i32 vcc, v16, v18
        s_cbranch_vccz LOOP_ALDBRN
        buffer_invl2
        s_load_dword s17, s[0:1], 0x4 glc
        s_waitcnt vmcnt(0) & lgkmcnt(0)
        s_store_dword s17, s[2:3], 0x0 glc
        s_waitcnt vmcnt(0) & lgkmcnt(0)
        buffer_wbl2
    .elseif (.amdgcn.gfx_generation_number == 9)
        s_movk_i32 s18, 0x1
        LOOP:
        s_load_dword s16, s[0:1], 0x0 glc
        s_cmp_eq_i32 s16, s18
        s_cbranch_scc0 LOOP
        s_load_dword s17, s[0:1], 0x4 glc
        s_waitcnt vmcnt(0) & lgkmcnt(0)
        s_store_dword s17, s[2:3], 0x0 glc
    .endif
    s_waitcnt vmcnt(0) & lgkmcnt(0)
    s_endpgm
)";
290
/* Input0: A buffer of at least 2 dwords.
 * DW0: used as a signal. Write 0x1 to signal.
 * DW1: Write the value from the 2nd input buffer
 *      for the other device to read.
 * Input1: A buffer of at least 2 dwords.
 * DW0: used as the value to be written.
 *
 * Note: Only works on Aldebaran (the body is guarded by
 * generation 9 / stepping 10; elsewhere it is a no-op).
 * buffer_wbl2 writes the value back to L2 before the flag store so the
 * reader observes the value no later than the flag.
 */
const char *WriteFlagAndValueIsa = R"(
    .text
    // Assume two inputs buffer in s[0:1] and s[2:3]
    .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_stepping == 10)
        v_mov_b32 v0, s0
        v_mov_b32 v1, s1
        s_load_dword s18, s[2:3], 0x0 glc
        s_waitcnt vmcnt(0) & lgkmcnt(0)
        s_store_dword s18, s[0:1], 0x4 glc
        s_waitcnt vmcnt(0) & lgkmcnt(0)
        buffer_wbl2
        s_waitcnt vmcnt(0) & lgkmcnt(0)
        v_mov_b32 v16, 0x1
        flat_store_dword v[0:1], v16 glc
    .endif
    s_endpgm
)";
317
/* Input0: A buffer of at least 2 dwords.
 * DW0: used as a signal. Write 0xcafe to signal.
 * DW1: Write to this buffer (0xbeef) for the other device to read.
 * Input1: mmio base address — 0x1 is stored there between the value
 *         write and the signal write.
 */
const char *WriteAndSignalIsa = R"(
    .text
    // Assume input buffer in s0, s1
    .if (.amdgcn.gfx_generation_number >= 10)
        s_add_u32 s4, s0, 0x4
        s_addc_u32 s5, s1, 0x0
        v_mov_b32 v0, s0
        v_mov_b32 v1, s1
        v_mov_b32 v2, s2
        v_mov_b32 v3, s3
        v_mov_b32 v4, s4
        v_mov_b32 v5, s5
        v_mov_b32 v18, 0xbeef
        flat_store_dword v[4:5], v18 glc
        v_mov_b32 v18, 0x1
        flat_store_dword v[2:3], v18 glc
        v_mov_b32 v18, 0xcafe
        flat_store_dword v[0:1], v18 glc
    .else
        s_mov_b32 s18, 0xbeef
        s_store_dword s18, s[0:1], 0x4 glc
        s_mov_b32 s18, 0x1
        s_store_dword s18, s[2:3], 0 glc
        s_mov_b32 s18, 0xcafe
        s_store_dword s18, s[0:1], 0x0 glc
    .endif
    s_endpgm
)";
351
352 /**
353 * KFDQMTest
354 */
355
/* A simple isa loop program with dense mathematic operations.
 * s0 is the loop counter (starts at 8); s1 controls the number of
 * iterations of the loop (0xff). Each iteration accumulates
 * transcendental results (log/exp/sqrt/rsq) into v1..v16.
 * This shader can be used by GFX8, GFX9 and GFX10.
 */
const char *LoopIsa = R"(
    .text
    s_movk_i32 s0, 0x0008
    s_movk_i32 s1, 0x00ff
    v_mov_b32 v0, 0
    v_mov_b32 v1, 0
    v_mov_b32 v2, 0
    v_mov_b32 v3, 0
    v_mov_b32 v4, 0
    v_mov_b32 v5, 0
    v_mov_b32 v6, 0
    v_mov_b32 v7, 0
    v_mov_b32 v8, 0
    v_mov_b32 v9, 0
    v_mov_b32 v10, 0
    v_mov_b32 v11, 0
    v_mov_b32 v12, 0
    v_mov_b32 v13, 0
    v_mov_b32 v14, 0
    v_mov_b32 v15, 0
    v_mov_b32 v16, 0
    LOOP:
    s_mov_b32 s8, s4
    s_mov_b32 s9, s1
    s_mov_b32 s10, s6
    s_mov_b32 s11, s7
    s_cmp_le_i32 s1, s0
    s_cbranch_scc1 END_OF_PGM
    v_add_f32 v0, 2.0, v0
    v_cvt_f32_i32 v17, s1
    s_waitcnt lgkmcnt(0)
    v_add_f32 v18, s8, v17
    v_add_f32 v19, s9, v17
    v_add_f32 v20, s10, v17
    v_add_f32 v21, s11, v17
    v_add_f32 v22, s12, v17
    v_add_f32 v23, s13, v17
    v_add_f32 v24, s14, v17
    v_add_f32 v17, s15, v17
    v_log_f32 v25, v18
    v_mul_f32 v25, v22, v25
    v_exp_f32 v25, v25
    v_log_f32 v26, v19
    v_mul_f32 v26, v23, v26
    v_exp_f32 v26, v26
    v_log_f32 v27, v20
    v_mul_f32 v27, v24, v27
    v_exp_f32 v27, v27
    v_log_f32 v28, v21
    v_mul_f32 v28, v17, v28
    v_exp_f32 v28, v28
    v_add_f32 v5, v5, v25
    v_add_f32 v6, v6, v26
    v_add_f32 v7, v7, v27
    v_add_f32 v8, v8, v28
    v_mul_f32 v18, 0x3fb8aa3b, v18
    v_exp_f32 v18, v18
    v_mul_f32 v19, 0x3fb8aa3b, v19
    v_exp_f32 v19, v19
    v_mul_f32 v20, 0x3fb8aa3b, v20
    v_exp_f32 v20, v20
    v_mul_f32 v21, 0x3fb8aa3b, v21
    v_exp_f32 v21, v21
    v_add_f32 v9, v9, v18
    v_add_f32 v10, v10, v19
    v_add_f32 v11, v11, v20
    v_add_f32 v12, v12, v21
    v_sqrt_f32 v18, v22
    v_sqrt_f32 v19, v23
    v_sqrt_f32 v20, v24
    v_sqrt_f32 v21, v17
    v_add_f32 v13, v13, v18
    v_add_f32 v14, v14, v19
    v_add_f32 v15, v15, v20
    v_add_f32 v16, v16, v21
    v_rsq_f32 v18, v22
    v_rsq_f32 v19, v23
    v_rsq_f32 v20, v24
    v_rsq_f32 v17, v17
    v_add_f32 v1, v1, v18
    v_add_f32 v2, v2, v19
    v_add_f32 v3, v3, v20
    v_add_f32 v4, v4, v17
    s_add_u32 s0, s0, 1
    s_branch LOOP
    END_OF_PGM:
    s_endpgm
)";
448
449
450 /**
451 * KFDCWSRTest
452 */
453
/* Initial state:
 *   s[0:1] - input buffer base address
 *   s[2:3] - output buffer base address
 *   s4 - workgroup id
 *   v0 - workitem id
 * Registers:
 *   v0 - calculated workitem = v0 + s4 * NUM_THREADS_X, which is s4
 *   v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
 *   v6 - register storing known-value output for mangle testing
 *   v7 - counter
 * The wave loops, rewriting the known value to its output slot and
 * incrementing v7, until the host writes 0x12345678 into the input
 * buffer. Uses the portable V_ADD_* macros from SHADER_MACROS.
 */
const char *IterateIsa = SHADER_MACROS R"(
    // Compute address of output buffer
    v_mov_b32 v0, s4            // use workgroup id as index
    v_lshlrev_b32 v0, 2, v0     // v0 *= 4
    V_ADD_CO_U32 v4, s2, v0     // v[4:5] = s[2:3] + v0 * 4
    v_mov_b32 v5, s3            // v[4:5] = s[2:3] + v0 * 4
    V_ADD_CO_CI_U32 v5, v5, 0   // v[4:5] = s[2:3] + v0 * 4

    // Store known-value output in register
    flat_load_dword v6, v[4:5] glc
    s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish

    // Initialize counter
    v_mov_b32 v7, 0

    LOOP:
    flat_store_dword v[4:5], v6 // store known-val in output
    V_ADD_CO_U32 v7, 1, v7      // increment counter

    s_load_dword s6, s[0:1], 0 glc
    s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish
    s_cmp_eq_i32 s6, 0x12345678 // compare input buf to stopval
    s_cbranch_scc1 L_QUIT       // branch if notified to quit by host

    s_branch LOOP

    L_QUIT:
    s_waitcnt vmcnt(0) & lgkmcnt(0)
    s_endpgm
)";
495
496 /**
497 * KFDEvictTest
498 */
499
/* Shader to read local buffers using multiple wavefronts in parallel
 * until the address buffer is filled with the specific value 0x5678 by
 * the host program; then each wavefront fills value 0x5678 at its
 * corresponding result buffer slot and quits.
 *
 * Initial state:
 *   s[0:1] - address buffer base address
 *   s[2:3] - result buffer base address
 *   s4 - workgroup id
 *   v0 - workitem id, always 0 because NUM_THREADS_X (number of
 *        threads) in the workgroup is set to 1
 * Registers:
 *   v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
 *   v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
 *   v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
 *   v[6:7] - local buf address used for read test
 * Each pass touches a 64M buffer one dword per 4K page.
 */
const char *ReadMemoryIsa = SHADER_MACROS R"(
    // Compute address of corresponding output buffer
    v_mov_b32 v0, s4            // use workgroup id as index
    v_lshlrev_b32 v0, 2, v0     // v0 *= 4
    V_ADD_CO_U32 v4, s2, v0     // v[4:5] = s[2:3] + v0 * 4
    v_mov_b32 v5, s3            // v[4:5] = s[2:3] + v0 * 4
    V_ADD_CO_CI_U32 v5, v5, 0   // v[4:5] = s[2:3] + v0 * 4

    // Compute input buffer offset used to store corresponding local buffer address
    v_lshlrev_b32 v0, 1, v0     // v0 *= 8
    V_ADD_CO_U32 v2, s0, v0     // v[2:3] = s[0:1] + v0 * 8
    v_mov_b32 v3, s1            // v[2:3] = s[0:1] + v0 * 8
    V_ADD_CO_CI_U32 v3, v3, 0   // v[2:3] = s[0:1] + v0 * 8

    // Load 64bit local buffer address stored at v[2:3] to v[6:7]
    flat_load_dwordx2 v[6:7], v[2:3] slc
    s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish
    v_mov_b32 v8, 0x5678
    s_movk_i32 s8, 0x5678
    L_REPEAT:
    s_load_dword s16, s[0:1], 0x0 glc
    s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish
    s_cmp_eq_i32 s16, s8
    s_cbranch_scc1 L_QUIT       // if notified to quit by host

    // Loop read 64M local buffer starting at v[6:7]
    // every 4k page only read once
    v_mov_b32 v9, 0
    v_mov_b32 v10, 0x1000       // 4k page
    v_mov_b32 v11, 0x4000000    // 64M size
    v_mov_b32 v12, v6
    v_mov_b32 v13, v7
    L_LOOP_READ:
    flat_load_dwordx2 v[14:15], v[12:13] slc
    V_ADD_CO_U32 v9, v9, v10
    V_ADD_CO_U32 v12, v12, v10
    V_ADD_CO_CI_U32 v13, v13, 0
    V_CMP_LT_U32 v9, v11
    s_cbranch_vccnz L_LOOP_READ
    s_branch L_REPEAT
    L_QUIT:
    flat_store_dword v[4:5], v8
    s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish
    s_endpgm
)";
560
561 /**
562 * KFDGWSTest
563 */
564
/* Shader to initialize the GWS counter.
 * The initial counter value is read from the dword at s[0:1] and
 * programmed into GWS resource 0 via ds_gws_init (m0 = 0 selects the
 * base offset).
 */
const char *GwsInitIsa = R"(
    .text
    s_mov_b32 m0, 0
    s_nop 0
    s_load_dword s16, s[0:1], 0x0 glc
    s_waitcnt 0
    v_mov_b32 v0, s16
    s_waitcnt 0
    ds_gws_init v0 offset:0 gds
    s_waitcnt 0
    s_endpgm
)";
578
/* Atomically increase a value in memory.
 * This is expected to be executed from multiple work groups
 * simultaneously. The GWS semaphore (ds_gws_sema_p to acquire,
 * ds_gws_sema_v to release) guarantees the read-modify-write of the
 * dword at s[0:1] is atomic. GFX10+ uses the vector path (with
 * exec_lo forced to a single lane); older parts use scalar loads/stores.
 */
const char *GwsAtomicIncreaseIsa = R"(
    .text
    // Assume src address in s0, s1
    .if (.amdgcn.gfx_generation_number >= 10)
        s_mov_b32 m0, 0
        s_mov_b32 exec_lo, 0x1
        v_mov_b32 v0, s0
        v_mov_b32 v1, s1
        ds_gws_sema_p offset:0 gds
        s_waitcnt 0
        flat_load_dword v2, v[0:1] glc dlc
        s_waitcnt 0
        v_add_nc_u32 v2, v2, 1
        flat_store_dword v[0:1], v2
        s_waitcnt_vscnt null, 0
        ds_gws_sema_v offset:0 gds
    .else
        s_mov_b32 m0, 0
        s_nop 0
        ds_gws_sema_p offset:0 gds
        s_waitcnt 0
        s_load_dword s16, s[0:1], 0x0 glc
        s_waitcnt 0
        s_add_u32 s16, s16, 1
        s_store_dword s16, s[0:1], 0x0 glc
        s_waitcnt lgkmcnt(0)
        ds_gws_sema_v offset:0 gds
    .endif
    s_waitcnt 0
    s_endpgm
)";
0 /*
1 * Copyright (C) 2021 Advanced Micro Devices, Inc. All Rights Reserved.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 * OTHER DEALINGS IN THE SOFTWARE.
20 *
21 */
22
/* Fix: the previous include guard `_SHADERSTORE_H_` begins with an
 * underscore followed by an uppercase letter, which is an identifier
 * reserved to the implementation in C++ ([lex.name]); renamed to a
 * conforming guard. Declarations are unchanged.
 */
#ifndef SHADERSTORE_H_
#define SHADERSTORE_H_

#include <vector>

/* KFDASMTest List: every shader source below that KFDASMTest assembles */
extern const std::vector<const char*> ShaderList;

/* Common */
extern const char *NoopIsa;
extern const char *CopyDwordIsa;
extern const char *InfiniteLoopIsa;
extern const char *AtomicIncIsa;

/* KFDMemoryTest */
extern const char *ScratchCopyDwordIsa;
extern const char *PollMemoryIsa;
extern const char *PollNCMemoryIsa;
extern const char *CopyOnSignalIsa;
extern const char *PollAndCopyIsa;
extern const char *WriteFlagAndValueIsa;
extern const char *WriteAndSignalIsa;

/* KFDQMTest */
extern const char *LoopIsa;

/* KFDCWSRTest */
extern const char *IterateIsa;

/* KFDEvictTest */
extern const char *ReadMemoryIsa;

/* KFDGWSTest */
extern const char *GwsInitIsa;
extern const char *GwsAtomicIncreaseIsa;

#endif  // SHADERSTORE_H_