Update upstream source from tag 'upstream/5.4.0'
Update to upstream version '5.4.0'
with Debian dir c57fc150e425eabf98034483f55a0572eda6bb42
Étienne Mollier
1 year, 5 months ago
142 | 142 | target_include_directories( ${HSAKMT_TARGET} |
143 | 143 | PUBLIC |
144 | 144 | $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> |
145 | $<INSTALL_INTERFACE:include> | |
145 | $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}> | |
146 | 146 | PRIVATE |
147 | 147 | ${CMAKE_CURRENT_SOURCE_DIR}/src ) |
148 | 148 | |
159 | 159 | find_package(PkgConfig) |
160 | 160 | # Check for libraries required for building |
161 | 161 | find_library(LIBC NAMES libc.so.6 REQUIRED) |
162 | find_library(NUMA NAMES libnuma.so REQUIRED) | |
162 | find_library(NUMA NAMES numa REQUIRED) | |
163 | 163 | message(STATUS "LIBC:" ${LIBC}) |
164 | 164 | message(STATUS "NUMA:" ${NUMA}) |
165 | 165 | |
182 | 182 | include_directories(${DRM_INCLUDE_DIRS}) |
183 | 183 | |
184 | 184 | target_link_libraries ( ${HSAKMT_TARGET} |
185 | PRIVATE ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread rt ${LIBC} ${NUMA} | |
185 | PRIVATE ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread rt ${LIBC} numa | |
186 | 186 | ) |
187 | 187 | |
188 | 188 | target_compile_options(${HSAKMT_TARGET} PRIVATE ${DRM_CFLAGS} ${HSAKMT_C_FLAGS}) |
190 | 190 | find_library(LIBGCC NAMES libgcc_s.so.1 REQUIRED) |
191 | 191 | message(STATUS "LIBGCC:" ${LIBGCC}) |
192 | 192 | target_link_libraries( ${HSAKMT_TARGET} PRIVATE ${LIBGCC} ) |
193 | else() | |
194 | find_library(UDEV NAMES libudev.so libudev.a REQUIRED) | |
195 | message(STATUS "UDEV:" ${UDEV}) | |
196 | find_package(ZLIB REQUIRED) | |
197 | target_link_libraries( ${HSAKMT_TARGET} PRIVATE ${ZLIB} ${UDEV} ) | |
198 | 193 | endif() |
199 | 194 | |
200 | 195 | ## Define default paths and packages. |
213 | 208 | #install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT devel ) |
214 | 209 | |
215 | 210 | # Install public headers |
216 | install ( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} | |
211 | install ( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${HSAKMT_TARGET} | |
217 | 212 | COMPONENT devel PATTERN "linux" EXCLUDE ) |
213 | ||
214 | # Option to build header path migration helpers. | |
215 | option(INCLUDE_PATH_COMPATIBILITY "Generate backward compatible headers and include paths. Use of these headers will warn when included." ON) | |
216 | if(INCLUDE_PATH_COMPATIBILITY) | |
217 | include(hsakmt-backward-compat.cmake) | |
218 | endif() | |
218 | 219 | |
219 | 220 | # Record our usage data for clients find_package calls. |
220 | 221 | install ( EXPORT ${HSAKMT_TARGET}Targets |
288 | 289 | set ( ENABLE_LDCONFIG ON CACHE BOOL "Set library links and caches using ldconfig.") |
289 | 290 | |
290 | 291 | # Install License file |
291 | install ( FILES ${CPACK_RESOURCE_FILE_LICENSE} DESTINATION ${CMAKE_INSTALL_DOCDIR}/${CPACK_PACKAGE_NAME} COMPONENT devel) | |
292 | install ( FILES ${CPACK_RESOURCE_FILE_LICENSE} DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT devel) | |
292 | 293 | |
293 | 294 | # Make proper version for appending |
294 | 295 | # Default Value is 99999, setting it first |
4 | 4 | # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build |
5 | 5 | do_ldconfig() { |
6 | 6 | if [ "@ENABLE_LDCONFIG@" == "ON" ]; then |
7 | echo @CPACK_PACKAGING_INSTALL_PREFIX@/lib > /etc/ld.so.conf.d/x86_64-libhsakmt.conf | |
7 | echo @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@ > /@CMAKE_INSTALL_SYSCONFDIR@/ld.so.conf.d/x86_64-libhsakmt.conf | |
8 | 8 | ldconfig |
9 | 9 | fi |
10 | 10 | } |
4 | 4 | # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build |
5 | 5 | rm_ldconfig() { |
6 | 6 | if [ "@ENABLE_LDCONFIG@" == "ON" ]; then |
7 | rm -f /etc/ld.so.conf.d/x86_64-libhsakmt.conf && ldconfig | |
7 | rm -f /@CMAKE_INSTALL_SYSCONFDIR@/ld.so.conf.d/x86_64-libhsakmt.conf && ldconfig | |
8 | 8 | fi |
9 | 9 | } |
10 | 10 |
0 | 0 | # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build |
1 | 1 | if [ "@ENABLE_LDCONFIG@" == "ON" ]; then |
2 | echo -e "@CPACK_PACKAGING_INSTALL_PREFIX@/lib\n@CPACK_PACKAGING_INSTALL_PREFIX@/lib64" > /etc/ld.so.conf.d/x86_64-libhsakmt.conf | |
2 | echo -e "@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@" > /@CMAKE_INSTALL_SYSCONFDIR@/ld.so.conf.d/x86_64-libhsakmt.conf | |
3 | 3 | ldconfig |
4 | 4 | fi |
0 | 0 | # second term originates from ENABLE_LDCONFIG = ON/OFF at package build |
1 | 1 | if [ $1 -le 1 ] && [ "@ENABLE_LDCONFIG@" == "ON" ]; then |
2 | 2 | # perform the below actions for rpm remove($1=0) or upgrade($1=1) operations |
3 | rm -f /etc/ld.so.conf.d/x86_64-libhsakmt.conf | |
3 | rm -f /@CMAKE_INSTALL_SYSCONFDIR@/ld.so.conf.d/x86_64-libhsakmt.conf | |
4 | 4 | ldconfig |
5 | 5 | fi |
0 | # Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. | |
1 | # Permission is hereby granted, free of charge, to any person obtaining a copy | |
2 | # of this software and associated documentation files (the "Software"), to deal | |
3 | # in the Software without restriction, including without limitation the rights | |
4 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
5 | # copies of the Software, and to permit persons to whom the Software is | |
6 | # furnished to do so, subject to the following conditions: | |
7 | # | |
8 | # The above copyright notice and this permission notice shall be included in | |
9 | # all copies or substantial portions of the Software. | |
10 | # | |
11 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
12 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
13 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
14 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
15 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
16 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
17 | # THE SOFTWARE. | |
18 | ||
19 | set(HSAKMT_WRAPPER_DIR ${CMAKE_CURRENT_BINARY_DIR}/wrapper_dir) | |
20 | set(HSAKMT_WRAPPER_INC_DIR ${HSAKMT_WRAPPER_DIR}/include) | |
21 | #Function to generate header template file | |
22 | function(create_header_template) | |
23 | file(WRITE ${HSAKMT_WRAPPER_DIR}/header.hpp.in "/* | |
24 | Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. | |
25 | ||
26 | Permission is hereby granted, free of charge, to any person obtaining a copy | |
27 | of this software and associated documentation files (the \"Software\"), to deal | |
28 | in the Software without restriction, including without limitation the rights | |
29 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
30 | copies of the Software, and to permit persons to whom the Software is | |
31 | furnished to do so, subject to the following conditions: | |
32 | ||
33 | The above copyright notice and this permission notice shall be included in | |
34 | all copies or substantial portions of the Software. | |
35 | ||
36 | THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
37 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
38 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
39 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
40 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
41 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
42 | THE SOFTWARE. | |
43 | */\n\n#ifndef @include_guard@\n#define @include_guard@ \n\n#pragma message(\"@file_name@ has moved to @CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_INCLUDEDIR@/hsakmt and package include paths have changed.\\nInclude as \\\"hsakmt/@file_name@\\\" when using cmake packages.\")\n@include_statements@\n\n#endif") | |
44 | endfunction() | |
45 | ||
46 | #use header template file and generate wrapper header files | |
47 | function(generate_wrapper_header) | |
48 | file(MAKE_DIRECTORY ${HSAKMT_WRAPPER_INC_DIR}) | |
49 | #find all header files from include folder | |
50 | file(GLOB include_files ${CMAKE_CURRENT_SOURCE_DIR}/include/*.h) | |
51 | #generate wrapper header files | |
52 | foreach(header_file ${include_files}) | |
53 | # set include guard | |
54 | get_filename_component(INC_GUARD_NAME ${header_file} NAME_WE) | |
55 | string(TOUPPER ${INC_GUARD_NAME} INC_GUARD_NAME) | |
56 | set(include_guard "${include_guard}HSAKMT_WRAPPER_INCLUDE_${INC_GUARD_NAME}_H") | |
57 | # set include statements | |
58 | get_filename_component(file_name ${header_file} NAME) | |
59 | set(include_statements "${include_statements}#include \"hsakmt/${file_name}\"\n") | |
60 | configure_file(${HSAKMT_WRAPPER_DIR}/header.hpp.in ${HSAKMT_WRAPPER_INC_DIR}/${file_name}) | |
61 | unset(include_guard) | |
62 | unset(include_statements) | |
63 | endforeach() | |
64 | endfunction() | |
65 | ||
66 | #Create a template for header file | |
67 | create_header_template() | |
68 | #Use template header file and generate wrapper header files | |
69 | generate_wrapper_header() | |
70 | install(DIRECTORY ${HSAKMT_WRAPPER_INC_DIR} DESTINATION . COMPONENT devel PATTERN "linux" EXCLUDE) |
374 | 374 | ); |
375 | 375 | |
376 | 376 | /** |
377 | Inquires memory available for allocation as a memory buffer | |
378 | */ | |
379 | ||
380 | HSAKMT_STATUS | |
381 | HSAKMTAPI | |
382 | hsaKmtAvailableMemory( | |
383 | HSAuint32 Node, | |
384 | HSAuint64 *AvailableBytes | |
385 | ); | |
386 | ||
387 | /** | |
377 | 388 | Registers with KFD a memory buffer that may be accessed by the GPU |
378 | 389 | */ |
379 | 390 | |
865 | 876 | HSAint32 * enable // OUT: returns XNACK value. |
866 | 877 | ); |
867 | 878 | |
879 | /** | |
880 | Open anonymous file handle to enable events and read SMI events. | |
881 | ||
882 | To enable events, write a 64-bit event mask to the fd, using the event enum values as bit indices. | |
883 | For example, the event mask (HSA_SMI_EVENT_MASK_FROM_INDEX(HSA_SMI_EVENT_INDEX_MAX) - 1) enables all events. | |
884 | ||
885 | Reading events from the fd is non-blocking; use poll with a timeout value to check whether an event is available. | |
886 | Events are dropped if the kernel event FIFO is full. | |
887 | */ | |
888 | HSAKMT_STATUS | |
889 | HSAKMTAPI | |
890 | hsaKmtOpenSMI( | |
891 | HSAuint32 NodeId, // IN: GPU node_id to receive the SMI event from | |
892 | int *fd // OUT: anonymous file handle | |
893 | ); | |
894 | ||
868 | 895 | #ifdef __cplusplus |
869 | 896 | } //extern "C" |
870 | 897 | #endif |
327 | 327 | |
328 | 328 | HSAuint32 VGPRSizePerCU; // VGPR size in bytes per CU |
329 | 329 | HSAuint32 SGPRSizePerCU; // SGPR size in bytes per CU |
330 | HSAuint8 Reserved[12]; | |
330 | ||
331 | HSAuint32 KFDGpuID; // GPU Hash ID generated by KFD | |
332 | ||
333 | HSAuint32 FamilyID; // GPU family id | |
334 | HSAuint8 Reserved[4]; | |
331 | 335 | } HsaNodeProperties; |
332 | 336 | |
333 | 337 | |
1328 | 1332 | HSA_SVM_FLAG_GPU_RO = 0x00000008, // GPUs only read, allows replication |
1329 | 1333 | HSA_SVM_FLAG_GPU_EXEC = 0x00000010, // Allow execution on GPU |
1330 | 1334 | HSA_SVM_FLAG_GPU_READ_MOSTLY = 0x00000020, // GPUs mostly read, may allow similar optimizations as RO, but writes fault |
1335 | HSA_SVM_FLAG_GPU_ALWAYS_MAPPED = 0x00000040, // Keep GPU memory mapping always valid as if XNACK is disable | |
1331 | 1336 | } HSA_SVM_FLAGS; |
1332 | 1337 | |
1333 | 1338 | typedef enum _HSA_SVM_ATTR_TYPE { |
1351 | 1356 | HSAuint32 value; // attribute value |
1352 | 1357 | } HSA_SVM_ATTRIBUTE; |
1353 | 1358 | |
1359 | typedef enum _HSA_SMI_EVENT { | |
1360 | HSA_SMI_EVENT_NONE = 0, /* not used */ | |
1361 | HSA_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */ | |
1362 | HSA_SMI_EVENT_THERMAL_THROTTLE = 2, | |
1363 | HSA_SMI_EVENT_GPU_PRE_RESET = 3, | |
1364 | HSA_SMI_EVENT_GPU_POST_RESET = 4, | |
1365 | HSA_SMI_EVENT_MIGRATE_START = 5, | |
1366 | HSA_SMI_EVENT_MIGRATE_END = 6, | |
1367 | HSA_SMI_EVENT_PAGE_FAULT_START = 7, | |
1368 | HSA_SMI_EVENT_PAGE_FAULT_END = 8, | |
1369 | HSA_SMI_EVENT_QUEUE_EVICTION = 9, | |
1370 | HSA_SMI_EVENT_QUEUE_RESTORE = 10, | |
1371 | HSA_SMI_EVENT_UNMAP_FROM_GPU = 11, | |
1372 | HSA_SMI_EVENT_INDEX_MAX = 12, | |
1373 | ||
1374 | /* | |
1375 | * Max event number, used as a flag bit to get events from all processes. | |
1376 | * This requires superuser permission; otherwise no events from any | |
1377 | * process will be received. Without this flag, events are received | |
1378 | * from the same process. | |
1379 | */ | |
1380 | HSA_SMI_EVENT_ALL_PROCESS = 64 | |
1381 | } HSA_EVENT_TYPE; | |
1382 | ||
1383 | typedef enum _HSA_MIGRATE_TRIGGERS { | |
1384 | HSA_MIGRATE_TRIGGER_PREFETCH, | |
1385 | HSA_MIGRATE_TRIGGER_PAGEFAULT_GPU, | |
1386 | HSA_MIGRATE_TRIGGER_PAGEFAULT_CPU, | |
1387 | HSA_MIGRATE_TRIGGER_TTM_EVICTION | |
1388 | } HSA_MIGRATE_TRIGGERS; | |
1389 | ||
1390 | typedef enum _HSA_QUEUE_EVICTION_TRIGGERS { | |
1391 | HSA_QUEUE_EVICTION_TRIGGER_SVM, | |
1392 | HSA_QUEUE_EVICTION_TRIGGER_USERPTR, | |
1393 | HSA_QUEUE_EVICTION_TRIGGER_TTM, | |
1394 | HSA_QUEUE_EVICTION_TRIGGER_SUSPEND, | |
1395 | HSA_QUEUE_EVICTION_CRIU_CHECKPOINT, | |
1396 | HSA_QUEUE_EVICTION_CRIU_RESTORE | |
1397 | } HSA_QUEUE_EVICTION_TRIGGERS; | |
1398 | ||
1399 | typedef enum _HSA_SVM_UNMAP_TRIGGERS { | |
1400 | HSA_SVM_UNMAP_TRIGGER_MMU_NOTIFY, | |
1401 | HSA_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE, | |
1402 | HSA_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU | |
1403 | } HSA_SVM_UNMAP_TRIGGERS; | |
1404 | ||
1405 | #define HSA_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) | |
1406 | #define HSA_SMI_EVENT_MSG_SIZE 96 | |
1407 | ||
1354 | 1408 | #pragma pack(pop, hsakmttypes_h) |
1355 | 1409 | |
1356 | 1410 |
33 | 33 | * - 1.6 - Query clear flags in SVM get_attr API |
34 | 34 | * - 1.7 - Checkpoint Restore (CRIU) API |
35 | 35 | * - 1.8 - CRIU - Support for SDMA transfers with GTT BOs |
36 | * - 1.9 - Add available_memory ioctl | |
37 | * - 1.10 - Add SMI profiler event log | |
38 | * - 1.11 - Add unified memory for ctx save/restore area | |
36 | 39 | */ |
37 | 40 | #define KFD_IOCTL_MAJOR_VERSION 1 |
38 | #define KFD_IOCTL_MINOR_VERSION 8 | |
41 | #define KFD_IOCTL_MINOR_VERSION 11 | |
39 | 42 | |
40 | 43 | /* |
41 | 44 | * Debug revision change log |
768 | 771 | __u64 handle; /* to KFD */ |
769 | 772 | }; |
770 | 773 | |
774 | /* Inquire available memory with kfd_ioctl_get_available_memory | |
775 | * | |
776 | * @available: memory available for alloc | |
777 | */ | |
778 | struct kfd_ioctl_get_available_memory_args { | |
779 | __u64 available; /* from KFD */ | |
780 | __u32 gpu_id; /* to KFD */ | |
781 | __u32 pad; | |
782 | }; | |
783 | ||
771 | 784 | /* Map memory to one or more GPUs |
772 | 785 | * |
773 | 786 | * @handle: memory handle returned by alloc |
1068 | 1081 | #define KFD_IOCTL_SVM_FLAG_GPU_EXEC 0x00000010 |
1069 | 1082 | /* GPUs mostly read, may allow similar optimizations as RO, but writes fault */ |
1070 | 1083 | #define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY 0x00000020 |
1084 | /* Keep GPU memory mapping always valid as if XNACK is disabled */ | |
1085 | #define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED 0x00000040 | |
1071 | 1086 | |
1072 | 1087 | /** |
1073 | 1088 | * kfd_ioctl_svm_op - SVM ioctl operations |
1326 | 1341 | #define AMDKFD_IOC_CRIU_OP \ |
1327 | 1342 | AMDKFD_IOWR(0x22, struct kfd_ioctl_criu_args) |
1328 | 1343 | |
1344 | #define AMDKFD_IOC_AVAILABLE_MEMORY \ | |
1345 | AMDKFD_IOWR(0x23, struct kfd_ioctl_get_available_memory_args) | |
1346 | ||
1329 | 1347 | #define AMDKFD_COMMAND_START 0x01 |
1330 | #define AMDKFD_COMMAND_END 0x23 | |
1348 | #define AMDKFD_COMMAND_END 0x24 | |
1331 | 1349 | |
1332 | 1350 | /* non-upstream ioctls */ |
1333 | 1351 | #define AMDKFD_IOC_IPC_IMPORT_HANDLE \ |
296 | 296 | } |
297 | 297 | |
298 | 298 | #define HSA_RUNTIME_ENABLE_MIN_MAJOR 10 |
299 | #define HSA_RUNTIME_ENABLE_MAX_MAJOR 13 | |
299 | 300 | #define HSA_RUNTIME_ENABLE_MIN_MINOR 0 |
301 | ||
302 | static HSAKMT_STATUS checkRuntimeDebugSupport(void) { | |
303 | HSAuint32 kMajor, kMinor; | |
304 | HsaNodeProperties node = {0}; | |
305 | HsaSystemProperties props = {0}; | |
306 | ||
307 | memset(&node, 0x00, sizeof(node)); | |
308 | memset(&props, 0x00, sizeof(props)); | |
309 | if (hsaKmtAcquireSystemProperties(&props)) | |
310 | return HSAKMT_STATUS_ERROR; | |
311 | ||
312 | //the firmware of gpu node doesn't support the debugger, disable it. | |
313 | for (uint32_t i = 0; i < props.NumNodes; i++) { | |
314 | if (hsaKmtGetNodeProperties(i, &node)) | |
315 | return HSAKMT_STATUS_ERROR; | |
316 | ||
317 | //ignore cpu node | |
318 | if (node.NumCPUCores) | |
319 | continue; | |
320 | if (!node.Capability.ui32.DebugSupportedFirmware) | |
321 | return HSAKMT_STATUS_NOT_SUPPORTED; | |
322 | } | |
323 | ||
324 | if (hsaKmtGetKernelDebugTrapVersionInfo(&kMajor, &kMinor)) | |
325 | return HSAKMT_STATUS_NOT_SUPPORTED; | |
326 | ||
327 | if (kMajor < HSA_RUNTIME_ENABLE_MIN_MAJOR || kMajor > HSA_RUNTIME_ENABLE_MAX_MAJOR || | |
328 | (kMajor == HSA_RUNTIME_ENABLE_MIN_MAJOR && | |
329 | (int)kMinor < HSA_RUNTIME_ENABLE_MIN_MINOR)) | |
330 | return HSAKMT_STATUS_NOT_SUPPORTED; | |
331 | ||
332 | return HSAKMT_STATUS_SUCCESS; | |
333 | } | |
334 | ||
300 | 335 | HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug, |
301 | 336 | bool setupTtmp) |
302 | 337 | { |
303 | 338 | struct kfd_ioctl_dbg_trap_args args = {0}; |
304 | HSAuint32 kMajor, kMinor; | |
305 | HSAKMT_STATUS result; | |
306 | ||
307 | result = hsaKmtGetKernelDebugTrapVersionInfo(&kMajor, &kMinor); | |
339 | HSAKMT_STATUS result = checkRuntimeDebugSupport(); | |
308 | 340 | |
309 | 341 | if (result) |
310 | return HSAKMT_STATUS_NOT_SUPPORTED; | |
311 | ||
312 | if (kMajor != HSA_RUNTIME_ENABLE_MIN_MAJOR || | |
313 | (int)kMinor < HSA_RUNTIME_ENABLE_MIN_MINOR) | |
314 | return HSAKMT_STATUS_NOT_SUPPORTED; | |
342 | return result; | |
315 | 343 | |
316 | 344 | memset(&args, 0x00, sizeof(args)); |
317 | 345 | args.op = KFD_IOC_DBG_TRAP_RUNTIME_ENABLE; |
335 | 363 | HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void) |
336 | 364 | { |
337 | 365 | struct kfd_ioctl_dbg_trap_args args = {0}; |
338 | HSAuint32 kMajor, kMinor; | |
339 | HSAKMT_STATUS result; | |
340 | ||
341 | result = hsaKmtGetKernelDebugTrapVersionInfo(&kMajor, &kMinor); | |
366 | HSAKMT_STATUS result = checkRuntimeDebugSupport(); | |
342 | 367 | |
343 | 368 | if (result) |
344 | return HSAKMT_STATUS_NOT_SUPPORTED; | |
345 | ||
346 | if (kMajor != HSA_RUNTIME_ENABLE_MIN_MAJOR || | |
347 | (int)kMinor < HSA_RUNTIME_ENABLE_MIN_MINOR) | |
348 | return HSAKMT_STATUS_NOT_SUPPORTED; | |
369 | return result; | |
349 | 370 | |
350 | 371 | memset(&args, 0x00, sizeof(args)); |
351 | 372 | args.op = KFD_IOC_DBG_TRAP_RUNTIME_ENABLE; |
338 | 338 | |
339 | 339 | return result; |
340 | 340 | } |
341 | ||
342 | HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd) | |
343 | { | |
344 | struct kfd_ioctl_smi_events_args args; | |
345 | HSAKMT_STATUS result; | |
346 | uint32_t gpuid; | |
347 | ||
348 | CHECK_KFD_OPEN(); | |
349 | ||
350 | pr_debug("[%s] node %d\n", __func__, NodeId); | |
351 | ||
352 | result = validate_nodeid(NodeId, &gpuid); | |
353 | if (result != HSAKMT_STATUS_SUCCESS) { | |
354 | pr_err("[%s] invalid node ID: %d\n", __func__, NodeId); | |
355 | return result; | |
356 | } | |
357 | ||
358 | args.gpuid = gpuid; | |
359 | result = kmtIoctl(kfd_fd, AMDKFD_IOC_SMI_EVENTS, &args); | |
360 | if (result) { | |
361 | pr_debug("open SMI event fd failed %s\n", strerror(errno)); | |
362 | return HSAKMT_STATUS_ERROR; | |
363 | } | |
364 | ||
365 | *fd = args.anon_fd; | |
366 | return HSAKMT_STATUS_SUCCESS; | |
367 | } |
35 | 35 | #include <sys/mman.h> |
36 | 36 | #include <sys/time.h> |
37 | 37 | #include <errno.h> |
38 | #include <assert.h> | |
38 | 39 | |
39 | 40 | #include <numa.h> |
40 | 41 | #include <numaif.h> |
183 | 184 | */ |
184 | 185 | manageable_aperture_t gpuvm_aperture; /* used for GPUVM on APU, outsidethe canonical address range */ |
185 | 186 | int drm_render_fd; |
187 | uint32_t usable_peer_id_num; | |
188 | uint32_t *usable_peer_id_array; | |
186 | 189 | } gpu_mem_t; |
187 | 190 | |
188 | 191 | enum svm_aperture_type { |
702 | 705 | return start; |
703 | 706 | } |
704 | 707 | |
708 | void *mmap_allocate_aligned(int prot, int flags, uint64_t size, uint64_t align, | |
709 | uint64_t guard_size, void *aper_base, void *aper_limit) | |
710 | { | |
711 | void *addr, *aligned_addr, *aligned_end, *mapping_end; | |
712 | uint64_t aligned_padded_size; | |
713 | ||
714 | aligned_padded_size = size + guard_size * 2 + (align - PAGE_SIZE); | |
715 | ||
716 | /* Map memory PROT_NONE to alloc address space only */ | |
717 | addr = mmap(0, aligned_padded_size, PROT_NONE, flags, -1, 0); | |
718 | if (addr == MAP_FAILED) { | |
719 | pr_err("mmap failed: %s\n", strerror(errno)); | |
720 | return NULL; | |
721 | } | |
722 | ||
723 | /* Adjust for alignment and guard pages */ | |
724 | aligned_addr = (void *)ALIGN_UP((uint64_t)addr + guard_size, align); | |
725 | if (aligned_addr < aper_base || | |
726 | VOID_PTR_ADD(aligned_addr, size - 1) > aper_limit) { | |
727 | pr_err("mmap returned %p, out of range %p-%p\n", aligned_addr, | |
728 | aper_base, aper_limit); | |
729 | munmap(addr, aligned_padded_size); | |
730 | return NULL; | |
731 | } | |
732 | ||
733 | /* Unmap padding and guard pages */ | |
734 | if (aligned_addr > addr) | |
735 | munmap(addr, VOID_PTRS_SUB(aligned_addr, addr)); | |
736 | ||
737 | aligned_end = VOID_PTR_ADD(aligned_addr, size); | |
738 | mapping_end = VOID_PTR_ADD(addr, aligned_padded_size); | |
739 | if (mapping_end > aligned_end) | |
740 | munmap(aligned_end, VOID_PTRS_SUB(mapping_end, aligned_end)); | |
741 | ||
742 | if (prot == PROT_NONE) | |
743 | return aligned_addr; | |
744 | ||
745 | /* MAP_FIXED to the aligned address with required prot */ | |
746 | addr = mmap(aligned_addr, size, prot, flags | MAP_FIXED, -1, 0); | |
747 | if (addr == MAP_FAILED) { | |
748 | pr_err("mmap failed: %s\n", strerror(errno)); | |
749 | return NULL; | |
750 | } | |
751 | ||
752 | return addr; | |
753 | } | |
754 | ||
705 | 755 | static void *mmap_aperture_allocate_aligned(manageable_aperture_t *aper, |
706 | 756 | void *address, |
707 | 757 | uint64_t size, uint64_t align) |
708 | 758 | { |
709 | uint64_t aligned_padded_size, guard_size; | |
710 | 759 | uint64_t alignment_size = PAGE_SIZE << svm.alignment_order; |
711 | void *addr, *aligned_addr, *aligned_end, *mapping_end; | |
760 | uint64_t guard_size; | |
712 | 761 | |
713 | 762 | if (address) |
714 | 763 | return NULL; |
732 | 781 | * pages on both sides |
733 | 782 | */ |
734 | 783 | guard_size = (uint64_t)aper->guard_pages * PAGE_SIZE; |
735 | aligned_padded_size = size + align + | |
736 | 2*guard_size - PAGE_SIZE; | |
737 | ||
738 | /* Map memory */ | |
739 | addr = mmap(0, aligned_padded_size, PROT_NONE, | |
740 | MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0); | |
741 | if (addr == MAP_FAILED) { | |
742 | pr_err("mmap failed: %s\n", strerror(errno)); | |
743 | return NULL; | |
744 | } | |
745 | ||
746 | /* Adjust for alignment and guard pages, range-check the reslt */ | |
747 | aligned_addr = (void *)ALIGN_UP((uint64_t)addr + guard_size, align); | |
748 | if (aligned_addr < aper->base || | |
749 | VOID_PTR_ADD(aligned_addr, size - 1) > aper->limit) { | |
750 | pr_err("mmap returned %p, out of range %p-%p\n", aligned_addr, | |
751 | aper->base, aper->limit); | |
752 | munmap(addr, aligned_padded_size); | |
753 | return NULL; | |
754 | } | |
755 | ||
756 | /* Unmap padding and guard pages */ | |
757 | if (aligned_addr > addr) | |
758 | munmap(addr, VOID_PTRS_SUB(aligned_addr, addr)); | |
759 | ||
760 | aligned_end = VOID_PTR_ADD(aligned_addr, size); | |
761 | mapping_end = VOID_PTR_ADD(addr, aligned_padded_size); | |
762 | if (mapping_end > aligned_end) | |
763 | munmap(aligned_end, VOID_PTRS_SUB(mapping_end, aligned_end)); | |
764 | ||
765 | return aligned_addr; | |
784 | ||
785 | return mmap_allocate_aligned(PROT_NONE, MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, | |
786 | size, align, guard_size, aper->base, aper->limit); | |
766 | 787 | } |
767 | 788 | |
768 | 789 | static void mmap_aperture_release(manageable_aperture_t *aper, |
826 | 847 | |
827 | 848 | for (i = 0 ; i < gpu_mem_count ; i++) |
828 | 849 | if (gpu_mem[i].gpu_id == gpu_id) |
850 | return i; | |
851 | ||
852 | return -1; | |
853 | } | |
854 | ||
855 | static int32_t gpu_mem_find_by_node_id(uint32_t node_id) | |
856 | { | |
857 | uint32_t i; | |
858 | ||
859 | for (i = 0 ; i < gpu_mem_count ; i++) | |
860 | if (gpu_mem[i].node_id == node_id) | |
829 | 861 | return i; |
830 | 862 | |
831 | 863 | return -1; |
1249 | 1281 | aligned_size, SCRATCH_ALIGN); |
1250 | 1282 | pthread_mutex_unlock(&svm.dgpu_aperture->fmm_mutex); |
1251 | 1283 | } else { |
1252 | uint64_t aligned_padded_size = aligned_size + | |
1253 | SCRATCH_ALIGN - PAGE_SIZE; | |
1254 | void *padded_end, *aligned_start, *aligned_end; | |
1255 | ||
1256 | 1284 | if (address) |
1257 | 1285 | return NULL; |
1258 | 1286 | |
1259 | mem = mmap(0, aligned_padded_size, | |
1260 | PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, | |
1261 | -1, 0); | |
1262 | if (!mem) | |
1263 | return NULL; | |
1264 | /* align start and unmap padding */ | |
1265 | padded_end = VOID_PTR_ADD(mem, aligned_padded_size); | |
1266 | aligned_start = (void *)ALIGN_UP((uint64_t)mem, SCRATCH_ALIGN); | |
1267 | aligned_end = VOID_PTR_ADD(aligned_start, aligned_size); | |
1268 | if (aligned_start > mem) | |
1269 | munmap(mem, VOID_PTRS_SUB(aligned_start, mem)); | |
1270 | if (aligned_end < padded_end) | |
1271 | munmap(aligned_end, | |
1272 | VOID_PTRS_SUB(padded_end, aligned_end)); | |
1273 | mem = aligned_start; | |
1287 | mem = mmap_allocate_aligned(PROT_READ | PROT_WRITE, | |
1288 | MAP_PRIVATE | MAP_ANONYMOUS, | |
1289 | aligned_size, SCRATCH_ALIGN, 0, | |
1290 | 0, (void *)LONG_MAX); | |
1274 | 1291 | } |
1275 | 1292 | |
1276 | 1293 | /* Remember scratch backing aperture for later */ |
2169 | 2186 | { |
2170 | 2187 | uint32_t i; |
2171 | 2188 | int32_t gpu_mem_id = 0; |
2172 | uint32_t gpu_id; | |
2173 | HsaNodeProperties props; | |
2174 | 2189 | struct kfd_process_device_apertures *process_apertures; |
2175 | 2190 | uint32_t num_of_sysfs_nodes; |
2176 | 2191 | HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; |
2233 | 2248 | is_dgpu = false; |
2234 | 2249 | |
2235 | 2250 | for (i = 0; i < NumNodes; i++) { |
2236 | memset(&props, 0, sizeof(props)); | |
2237 | ret = topology_sysfs_get_node_props(i, &props, &gpu_id, NULL, NULL); | |
2251 | HsaNodeProperties props; | |
2252 | ||
2253 | ret = topology_get_node_props(i, &props); | |
2238 | 2254 | if (ret != HSAKMT_STATUS_SUCCESS) |
2239 | goto sysfs_parse_failed; | |
2255 | goto gpu_mem_init_failed; | |
2240 | 2256 | |
2241 | 2257 | topology_setup_is_dgpu_param(&props); |
2242 | 2258 | |
2243 | 2259 | /* Skip non-GPU nodes */ |
2244 | if (gpu_id != 0) { | |
2260 | if (props.KFDGpuID) { | |
2245 | 2261 | int fd = open_drm_render_device(props.DrmRenderMinor); |
2246 | 2262 | if (fd <= 0) { |
2247 | 2263 | ret = HSAKMT_STATUS_ERROR; |
2248 | goto sysfs_parse_failed; | |
2264 | goto gpu_mem_init_failed; | |
2249 | 2265 | } |
2266 | ||
2267 | gpu_mem[gpu_mem_count].usable_peer_id_array = | |
2268 | calloc(NumNodes, sizeof(uint32_t)); | |
2269 | if (!gpu_mem[gpu_mem_count].usable_peer_id_array) { | |
2270 | ret = HSAKMT_STATUS_NO_MEMORY; | |
2271 | goto gpu_mem_init_failed; | |
2272 | } | |
2273 | gpu_mem[gpu_mem_count].usable_peer_id_array[0] = props.KFDGpuID; | |
2274 | gpu_mem[gpu_mem_count].usable_peer_id_num = 1; | |
2250 | 2275 | |
2251 | 2276 | gpu_mem[gpu_mem_count].EngineId.ui32.Major = props.EngineId.ui32.Major; |
2252 | 2277 | gpu_mem[gpu_mem_count].EngineId.ui32.Minor = props.EngineId.ui32.Minor; |
2253 | 2278 | gpu_mem[gpu_mem_count].EngineId.ui32.Stepping = props.EngineId.ui32.Stepping; |
2254 | 2279 | |
2255 | 2280 | gpu_mem[gpu_mem_count].drm_render_fd = fd; |
2256 | gpu_mem[gpu_mem_count].gpu_id = gpu_id; | |
2281 | gpu_mem[gpu_mem_count].gpu_id = props.KFDGpuID; | |
2257 | 2282 | gpu_mem[gpu_mem_count].local_mem_size = props.LocalMemSize; |
2258 | 2283 | gpu_mem[gpu_mem_count].device_id = props.DeviceId; |
2259 | 2284 | gpu_mem[gpu_mem_count].node_id = i; |
2311 | 2336 | } |
2312 | 2337 | |
2313 | 2338 | for (i = 0 ; i < num_of_sysfs_nodes ; i++) { |
2339 | HsaNodeProperties nodeProps; | |
2340 | HsaIoLinkProperties linkProps[NumNodes]; | |
2341 | uint32_t nodeId; | |
2342 | uint32_t j; | |
2343 | ||
2314 | 2344 | /* Map Kernel process device data node i <--> gpu_mem_id which |
2315 | 2345 | * indexes into gpu_mem[] based on gpu_id |
2316 | 2346 | */ |
2320 | 2350 | |
2321 | 2351 | if (all_gpu_id_array_size == gpu_mem_count) { |
2322 | 2352 | ret = HSAKMT_STATUS_ERROR; |
2323 | goto invalid_gpu_id; | |
2353 | goto aperture_init_failed; | |
2324 | 2354 | } |
2325 | 2355 | all_gpu_id_array[all_gpu_id_array_size++] = process_apertures[i].gpu_id; |
2356 | ||
2357 | /* Add this GPU to the usable_peer_id_arrays of all GPUs that | |
2358 | * this GPU has an IO link to. This GPU can map memory | |
2359 | * allocated on those GPUs. | |
2360 | */ | |
2361 | nodeId = gpu_mem[gpu_mem_id].node_id; | |
2362 | ret = topology_get_node_props(nodeId, &nodeProps); | |
2363 | if (ret != HSAKMT_STATUS_SUCCESS) | |
2364 | goto aperture_init_failed; | |
2365 | assert(nodeProps.NumIOLinks <= NumNodes); | |
2366 | ret = topology_get_iolink_props(nodeId, nodeProps.NumIOLinks, | |
2367 | linkProps); | |
2368 | if (ret != HSAKMT_STATUS_SUCCESS) | |
2369 | goto aperture_init_failed; | |
2370 | for (j = 0; j < nodeProps.NumIOLinks; j++) { | |
2371 | int32_t to_gpu_mem_id = | |
2372 | gpu_mem_find_by_node_id(linkProps[j].NodeTo); | |
2373 | uint32_t peer; | |
2374 | ||
2375 | if (to_gpu_mem_id < 0) | |
2376 | continue; | |
2377 | ||
2378 | assert(gpu_mem[to_gpu_mem_id].usable_peer_id_num < NumNodes); | |
2379 | peer = gpu_mem[to_gpu_mem_id].usable_peer_id_num++; | |
2380 | gpu_mem[to_gpu_mem_id].usable_peer_id_array[peer] = | |
2381 | gpu_mem[gpu_mem_id].gpu_id; | |
2382 | } | |
2326 | 2383 | |
2327 | 2384 | gpu_mem[gpu_mem_id].lds_aperture.base = |
2328 | 2385 | PORT_UINT64_TO_VPTR(process_apertures[i].lds_base); |
2374 | 2431 | ret = acquire_vm(gpu_mem[gpu_mem_id].gpu_id, |
2375 | 2432 | gpu_mem[gpu_mem_id].drm_render_fd); |
2376 | 2433 | if (ret != HSAKMT_STATUS_SUCCESS) |
2377 | goto acquire_vm_failed; | |
2434 | goto aperture_init_failed; | |
2378 | 2435 | } |
2379 | 2436 | all_gpu_id_array_size *= sizeof(uint32_t); |
2380 | 2437 | |
2438 | 2495 | free(process_apertures); |
2439 | 2496 | return ret; |
2440 | 2497 | |
2441 | invalid_gpu_id: | |
2498 | aperture_init_failed: | |
2442 | 2499 | init_svm_failed: |
2443 | acquire_vm_failed: | |
2444 | 2500 | set_memory_policy_failed: |
2445 | 2501 | free(all_gpu_id_array); |
2446 | 2502 | all_gpu_id_array = NULL; |
2447 | 2503 | get_aperture_ioctl_failed: |
2448 | 2504 | free(process_apertures); |
2449 | 2505 | sysfs_parse_failed: |
2506 | gpu_mem_init_failed: | |
2450 | 2507 | fmm_destroy_process_apertures(); |
2451 | 2508 | return ret; |
2452 | 2509 | } |
2455 | 2512 | { |
2456 | 2513 | release_mmio(); |
2457 | 2514 | if (gpu_mem) { |
2515 | while (gpu_mem_count-- > 0) | |
2516 | free(gpu_mem[gpu_mem_count].usable_peer_id_array); | |
2458 | 2517 | free(gpu_mem); |
2459 | 2518 | gpu_mem = NULL; |
2460 | 2519 | } |
2635 | 2694 | sizeof(uint32_t); |
2636 | 2695 | } else { |
2637 | 2696 | /* not specified, not registered: map all GPUs */ |
2638 | args.device_ids_array_ptr = (uint64_t)all_gpu_id_array; | |
2639 | args.n_devices = all_gpu_id_array_size / sizeof(uint32_t); | |
2697 | int32_t gpu_mem_id = gpu_mem_find_by_node_id(obj->node_id); | |
2698 | ||
2699 | if (!obj->userptr && get_device_id_by_node_id(obj->node_id) && | |
2700 | gpu_mem_id >= 0) { | |
2701 | args.device_ids_array_ptr = (uint64_t) | |
2702 | gpu_mem[gpu_mem_id].usable_peer_id_array; | |
2703 | args.n_devices = | |
2704 | gpu_mem[gpu_mem_id].usable_peer_id_num; | |
2705 | } else { | |
2706 | args.device_ids_array_ptr = (uint64_t)all_gpu_id_array; | |
2707 | args.n_devices = all_gpu_id_array_size / sizeof(uint32_t); | |
2708 | } | |
2640 | 2709 | } |
2641 | 2710 | args.n_success = 0; |
2642 | 2711 | |
3344 | 3413 | importArgs.gpu_id = SharedMemoryStruct->ExportGpuId; |
3345 | 3414 | |
3346 | 3415 | aperture = fmm_get_aperture(SharedMemoryStruct->ApeInfo); |
3416 | if (!aperture) | |
3417 | return HSAKMT_STATUS_INVALID_PARAMETER; | |
3347 | 3418 | |
3348 | 3419 | pthread_mutex_lock(&aperture->fmm_mutex); |
3349 | 3420 | reservedMem = aperture_allocate_area(aperture, NULL, |
3736 | 3807 | fmm_clear_aperture(&gpu_mem[i].scratch_physical); |
3737 | 3808 | } |
3738 | 3809 | |
3739 | gpu_mem_count = 0; | |
3740 | free(gpu_mem); | |
3741 | gpu_mem = NULL; | |
3742 | } | |
3810 | fmm_destroy_process_apertures(); | |
3811 | } |
89 | 89 | uint32_t *nodes_to_map, uint64_t num_of_nodes, uint64_t *gpuvm_address); |
90 | 90 | |
91 | 91 | int open_drm_render_device(int minor); |
92 | void *mmap_allocate_aligned(int prot, int flags, uint64_t size, uint64_t align, | |
93 | uint64_t guard_size, void *aper_base, void *aper_limit); | |
94 | ||
92 | 95 | #endif /* FMM_H_ */ |
171 | 171 | HSAKMT_STATUS validate_nodeid_array(uint32_t **gpu_id_array, |
172 | 172 | uint32_t NumberOfNodes, uint32_t *NodeArray); |
173 | 173 | |
174 | HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id, HsaNodeProperties *props, | |
175 | uint32_t *gpu_id, | |
176 | bool *p2p_links, uint32_t *num_p2pLinks); | |
177 | 174 | HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties *props); |
175 | HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId, | |
176 | HsaNodeProperties *NodeProperties); | |
177 | HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId, | |
178 | HSAuint32 NumIoLinks, | |
179 | HsaIoLinkProperties *IoLinkProperties); | |
178 | 180 | void topology_setup_is_dgpu_param(HsaNodeProperties *props); |
179 | 181 | bool topology_is_svm_needed(HSA_ENGINE_ID EngineId); |
180 | 182 |
23 | 23 | hsaKmtSetMemoryPolicy; |
24 | 24 | hsaKmtAllocMemory; |
25 | 25 | hsaKmtFreeMemory; |
26 | hsaKmtAvailableMemory; | |
26 | 27 | hsaKmtRegisterMemory; |
27 | 28 | hsaKmtRegisterMemoryToNodes; |
28 | 29 | hsaKmtRegisterMemoryWithFlags; |
67 | 68 | hsaKmtSVMGetAttr; |
68 | 69 | hsaKmtSetXNACKMode; |
69 | 70 | hsaKmtGetXNACKMode; |
71 | hsaKmtOpenSMI; | |
70 | 72 | |
71 | 73 | local: *; |
72 | 74 | }; |
198 | 198 | return fmm_release(MemoryAddress); |
199 | 199 | } |
200 | 200 | |
201 | HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node, | |
202 | HSAuint64 *AvailableBytes) | |
203 | { | |
204 | struct kfd_ioctl_get_available_memory_args args = {}; | |
205 | HSAKMT_STATUS result; | |
206 | ||
207 | CHECK_KFD_OPEN(); | |
208 | CHECK_KFD_MINOR_VERSION(9); | |
209 | ||
210 | pr_debug("[%s] node %d\n", __func__, Node); | |
211 | ||
212 | result = validate_nodeid(Node, &args.gpu_id); | |
213 | if (result != HSAKMT_STATUS_SUCCESS) { | |
214 | pr_err("[%s] invalid node ID: %d\n", __func__, Node); | |
215 | return result; | |
216 | } | |
217 | ||
218 | if (kmtIoctl(kfd_fd, AMDKFD_IOC_AVAILABLE_MEMORY, &args)) | |
219 | return HSAKMT_STATUS_ERROR; | |
220 | ||
221 | *AvailableBytes = args.available; | |
222 | return HSAKMT_STATUS_SUCCESS; | |
223 | } | |
224 | ||
201 | 225 | HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress, |
202 | 226 | HSAuint64 MemorySizeInBytes) |
203 | 227 | { |
361 | 385 | return ret; |
362 | 386 | } |
363 | 387 | |
364 | static uint64_t convertHsaToKfdRange(HsaMemoryRange *HsaRange) | |
365 | { | |
366 | if (sizeof(struct kfd_memory_range) != | |
367 | sizeof(HsaMemoryRange)) { | |
368 | pr_err("Struct size mismatch in thunk. Cannot cast Hsa Range to KFD IOCTL range\n"); | |
369 | return 0; | |
370 | } | |
371 | return (uint64_t) HsaRange; | |
372 | } | |
373 | ||
374 | 388 | HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMRead(HSAuint32 Pid, |
375 | 389 | HsaMemoryRange *LocalMemoryArray, |
376 | 390 | HSAuint64 LocalMemoryArrayCount, |
378 | 392 | HSAuint64 RemoteMemoryArrayCount, |
379 | 393 | HSAuint64 *SizeCopied) |
380 | 394 | { |
381 | int ret = HSAKMT_STATUS_SUCCESS; | |
382 | struct kfd_ioctl_cross_memory_copy_args args = {0}; | |
383 | ||
384 | pr_debug("[%s]\n", __func__); | |
385 | ||
386 | if (!LocalMemoryArray || !RemoteMemoryArray || | |
387 | LocalMemoryArrayCount == 0 || RemoteMemoryArrayCount == 0) | |
388 | return HSAKMT_STATUS_ERROR; | |
389 | ||
390 | args.flags = 0; | |
391 | KFD_SET_CROSS_MEMORY_READ(args.flags); | |
392 | args.pid = Pid; | |
393 | args.src_mem_range_array = convertHsaToKfdRange(RemoteMemoryArray); | |
394 | args.src_mem_array_size = RemoteMemoryArrayCount; | |
395 | args.dst_mem_range_array = convertHsaToKfdRange(LocalMemoryArray); | |
396 | args.dst_mem_array_size = LocalMemoryArrayCount; | |
397 | args.bytes_copied = 0; | |
398 | ||
399 | if (kmtIoctl(kfd_fd, AMDKFD_IOC_CROSS_MEMORY_COPY, &args)) | |
400 | ret = HSAKMT_STATUS_ERROR; | |
401 | ||
402 | if (SizeCopied) | |
403 | *SizeCopied = args.bytes_copied; | |
404 | ||
405 | return ret; | |
395 | pr_err("[%s] Deprecated\n", __func__); | |
396 | ||
397 | return HSAKMT_STATUS_NOT_IMPLEMENTED; | |
406 | 398 | } |
407 | 399 | |
408 | 400 | HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMWrite(HSAuint32 Pid, |
412 | 404 | HSAuint64 RemoteMemoryArrayCount, |
413 | 405 | HSAuint64 *SizeCopied) |
414 | 406 | { |
415 | int ret = HSAKMT_STATUS_SUCCESS; | |
416 | struct kfd_ioctl_cross_memory_copy_args args = {0}; | |
417 | ||
418 | pr_debug("[%s]\n", __func__); | |
419 | ||
420 | if (SizeCopied) | |
421 | *SizeCopied = 0; | |
422 | ||
423 | if (!LocalMemoryArray || !RemoteMemoryArray || | |
424 | LocalMemoryArrayCount == 0 || RemoteMemoryArrayCount == 0) | |
425 | return HSAKMT_STATUS_ERROR; | |
426 | ||
427 | args.flags = 0; | |
428 | KFD_SET_CROSS_MEMORY_WRITE(args.flags); | |
429 | args.pid = Pid; | |
430 | args.src_mem_range_array = convertHsaToKfdRange(LocalMemoryArray); | |
431 | args.src_mem_array_size = LocalMemoryArrayCount; | |
432 | args.dst_mem_range_array = convertHsaToKfdRange(RemoteMemoryArray); | |
433 | args.dst_mem_array_size = RemoteMemoryArrayCount; | |
434 | args.bytes_copied = 0; | |
435 | ||
436 | if (kmtIoctl(kfd_fd, AMDKFD_IOC_CROSS_MEMORY_COPY, &args)) | |
437 | ret = HSAKMT_STATUS_ERROR; | |
438 | ||
439 | if (SizeCopied) | |
440 | *SizeCopied = args.bytes_copied; | |
441 | ||
442 | return ret; | |
407 | pr_err("[%s] Deprecated\n", __func__); | |
408 | ||
409 | return HSAKMT_STATUS_NOT_IMPLEMENTED; | |
443 | 410 | } |
444 | 411 | |
445 | 412 |
178 | 178 | if (result != HSAKMT_STATUS_SUCCESS) |
179 | 179 | goto topology_sysfs_failed; |
180 | 180 | |
181 | result = fmm_init_process_apertures(sys_props.NumNodes); | |
182 | if (result != HSAKMT_STATUS_SUCCESS) | |
183 | goto init_process_aperture_failed; | |
184 | ||
185 | result = init_process_doorbells(sys_props.NumNodes); | |
186 | if (result != HSAKMT_STATUS_SUCCESS) | |
187 | goto init_doorbell_failed; | |
188 | ||
189 | 181 | kfd_open_count = 1; |
190 | 182 | |
191 | 183 | if (init_device_debugging_memory(sys_props.NumNodes) != HSAKMT_STATUS_SUCCESS) |
211 | 203 | |
212 | 204 | pthread_mutex_unlock(&hsakmt_mutex); |
213 | 205 | return result; |
214 | ||
215 | init_doorbell_failed: | |
216 | fmm_destroy_process_apertures(); | |
217 | init_process_aperture_failed: | |
218 | 206 | topology_sysfs_failed: |
219 | 207 | kfd_version_failed: |
220 | 208 | close(fd); |
234 | 222 | if (--kfd_open_count == 0) { |
235 | 223 | destroy_counter_props(); |
236 | 224 | destroy_device_debugging_memory(); |
237 | destroy_process_doorbells(); | |
238 | fmm_destroy_process_apertures(); | |
239 | 225 | if (kfd_fd) { |
240 | 226 | close(kfd_fd); |
241 | 227 | kfd_fd = 0; |
67 | 67 | uint32_t eop_buffer_size; |
68 | 68 | uint32_t gfxv; |
69 | 69 | bool use_ats; |
70 | bool unified_ctx_save_restore; | |
70 | 71 | /* This queue structure is allocated from GPU with page aligned size |
71 | 72 | * but only small bytes are used. We use the extra space in the end for |
72 | 73 | * cu_mask bits array. |
277 | 278 | wg_data_size = cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(q->gfxv); |
278 | 279 | q->ctl_stack_size = PAGE_ALIGN_UP(sizeof(HsaUserContextSaveAreaHeader) |
279 | 280 | + ctl_stack_size); |
280 | if (q->gfxv >= GFX_VERSION_NAVI10 && | |
281 | q->gfxv <= GFX_VERSION_YELLOW_CARP) { | |
281 | if ((q->gfxv & 0x3f0000) == 0xA0000) { | |
282 | 282 | /* HW design limits control stack size to 0x7000. |
283 | 283 | * This is insufficient for theoretical PM4 cases |
284 | 284 | * but sufficient for AQL, limited by SPI events. |
363 | 363 | static void *allocate_exec_aligned_memory(uint32_t size, |
364 | 364 | bool use_ats, |
365 | 365 | uint32_t NodeId, |
366 | bool nonPaged, | |
366 | 367 | bool DeviceLocal, |
367 | 368 | bool Uncached) |
368 | 369 | { |
369 | 370 | if (!use_ats) |
370 | 371 | return allocate_exec_aligned_memory_gpu(size, PAGE_SIZE, NodeId, |
371 | DeviceLocal, DeviceLocal, | |
372 | nonPaged, DeviceLocal, | |
372 | 373 | Uncached); |
373 | 374 | return allocate_exec_aligned_memory_cpu(size); |
374 | 375 | } |
382 | 383 | munmap(addr, size); |
383 | 384 | } |
384 | 385 | |
386 | static HSAKMT_STATUS register_svm_range(void *mem, uint32_t size, | |
387 | uint32_t gpuNode, uint32_t prefetchNode, | |
388 | uint32_t preferredNode, bool alwaysMapped) | |
389 | { | |
390 | HSA_SVM_ATTRIBUTE *attrs; | |
391 | HSAuint64 s_attr; | |
392 | HSAuint32 nattr; | |
393 | HSAuint32 flags; | |
394 | ||
395 | flags = HSA_SVM_FLAG_HOST_ACCESS | HSA_SVM_FLAG_GPU_EXEC; | |
396 | ||
397 | if (alwaysMapped) { | |
398 | CHECK_KFD_MINOR_VERSION(11); | |
399 | flags |= HSA_SVM_FLAG_GPU_ALWAYS_MAPPED; | |
400 | } | |
401 | ||
402 | nattr = 5; | |
403 | s_attr = sizeof(*attrs) * nattr; | |
404 | attrs = (HSA_SVM_ATTRIBUTE *)alloca(s_attr); | |
405 | ||
406 | attrs[0].type = HSA_SVM_ATTR_PREFETCH_LOC; | |
407 | attrs[0].value = prefetchNode; | |
408 | attrs[1].type = HSA_SVM_ATTR_PREFERRED_LOC; | |
409 | attrs[1].value = preferredNode; | |
410 | attrs[2].type = HSA_SVM_ATTR_CLR_FLAGS; | |
411 | attrs[2].value = ~flags; | |
412 | attrs[3].type = HSA_SVM_ATTR_SET_FLAGS; | |
413 | attrs[3].value = flags; | |
414 | attrs[4].type = HSA_SVM_ATTR_ACCESS; | |
415 | attrs[4].value = gpuNode; | |
416 | ||
417 | return hsaKmtSVMSetAttr(mem, size, nattr, attrs); | |
418 | } | |
419 | ||
385 | 420 | static void free_queue(struct queue *q) |
386 | 421 | { |
387 | 422 | if (q->eop_buffer) |
388 | 423 | free_exec_aligned_memory(q->eop_buffer, |
389 | 424 | q->eop_buffer_size, |
390 | 425 | PAGE_SIZE, q->use_ats); |
391 | if (q->ctx_save_restore) | |
426 | if (q->unified_ctx_save_restore) | |
427 | munmap(q->ctx_save_restore, | |
428 | PAGE_ALIGN_UP(q->ctx_save_restore_size + q->debug_memory_size)); | |
429 | else if (q->ctx_save_restore) | |
392 | 430 | free_exec_aligned_memory(q->ctx_save_restore, |
393 | q->ctx_save_restore_size, | |
431 | q->ctx_save_restore_size + q->debug_memory_size, | |
394 | 432 | PAGE_SIZE, q->use_ats); |
395 | 433 | |
396 | 434 | free_exec_aligned_memory((void *)q, sizeof(*q), PAGE_SIZE, q->use_ats); |
435 | } | |
436 | ||
437 | static inline void fill_cwsr_header(struct queue *q, void *addr, | |
438 | HsaEvent *Event, volatile HSAint64 *ErrPayload) | |
439 | { | |
440 | HsaUserContextSaveAreaHeader *header = | |
441 | (HsaUserContextSaveAreaHeader *)addr; | |
442 | ||
443 | header->ErrorEventId = 0; | |
444 | if (Event) | |
445 | header->ErrorEventId = Event->EventId; | |
446 | header->ErrorReason = ErrPayload; | |
447 | header->DebugOffset = q->ctx_save_restore_size; | |
448 | header->DebugSize = q->debug_memory_size; | |
397 | 449 | } |
398 | 450 | |
399 | 451 | static int handle_concrete_asic(struct queue *q, |
411 | 463 | if (q->eop_buffer_size > 0) { |
412 | 464 | q->eop_buffer = allocate_exec_aligned_memory(q->eop_buffer_size, |
413 | 465 | q->use_ats, |
414 | NodeId, true, /* Unused for VRAM */false); | |
466 | NodeId, true, true, /* Unused for VRAM */false); | |
415 | 467 | if (!q->eop_buffer) |
416 | 468 | return HSAKMT_STATUS_NO_MEMORY; |
417 | 469 | |
423 | 475 | |
424 | 476 | if (ret) { |
425 | 477 | uint32_t total_mem_alloc_size = 0; |
426 | HsaUserContextSaveAreaHeader *header; | |
478 | HsaNodeProperties node; | |
479 | bool svm_api; | |
427 | 480 | |
428 | 481 | args->ctx_save_restore_size = q->ctx_save_restore_size; |
429 | 482 | args->ctl_stack_size = q->ctl_stack_size; |
433 | 486 | */ |
434 | 487 | total_mem_alloc_size = q->ctx_save_restore_size + |
435 | 488 | q->debug_memory_size; |
436 | q->ctx_save_restore = | |
437 | allocate_exec_aligned_memory(total_mem_alloc_size, | |
438 | q->use_ats, NodeId, false, false); | |
439 | ||
440 | if (!q->ctx_save_restore) | |
441 | return HSAKMT_STATUS_NO_MEMORY; | |
489 | ||
490 | if (hsaKmtGetNodeProperties(NodeId, &node)) | |
491 | svm_api = false; | |
492 | else | |
493 | svm_api = node.Capability.ui32.SVMAPISupported; | |
494 | ||
495 | /* Allocate unified memory for context save restore | |
496 | * area on dGPU. | |
497 | */ | |
498 | if (!q->use_ats && svm_api) { | |
499 | uint32_t size = PAGE_ALIGN_UP(total_mem_alloc_size); | |
500 | void *addr; | |
501 | HSAKMT_STATUS r = HSAKMT_STATUS_ERROR; | |
502 | ||
503 | addr = mmap_allocate_aligned(PROT_READ | PROT_WRITE, | |
504 | MAP_ANONYMOUS | MAP_PRIVATE, | |
505 | size, GPU_HUGE_PAGE_SIZE, 0, | |
506 | 0, (void *)LONG_MAX); | |
507 | if (!addr) { | |
508 | pr_err("mmap failed to alloc ctx area size 0x%x: %s\n", | |
509 | size, strerror(errno)); | |
510 | } else { | |
511 | /* | |
512 | * Exclude this range from fork so that a child process's COW | |
513 | * MMU notifier callback does not evict the parent process's queues. | |
514 | */ | |
515 | if (madvise(addr, size, MADV_DONTFORK)) | |
516 | pr_err("madvise failed -%d\n", errno); | |
517 | ||
518 | fill_cwsr_header(q, addr, Event, ErrPayload); | |
519 | ||
520 | r = register_svm_range(addr, size, | |
521 | NodeId, NodeId, 0, true); | |
522 | ||
523 | if (r == HSAKMT_STATUS_SUCCESS) { | |
524 | q->ctx_save_restore = addr; | |
525 | q->unified_ctx_save_restore = true; | |
526 | } else { | |
527 | munmap(addr, size); | |
528 | } | |
529 | } | |
530 | } | |
531 | ||
532 | if (!q->unified_ctx_save_restore) { | |
533 | q->ctx_save_restore = allocate_exec_aligned_memory( | |
534 | total_mem_alloc_size, | |
535 | q->use_ats, NodeId, | |
536 | false, false, false); | |
537 | ||
538 | if (!q->ctx_save_restore) | |
539 | return HSAKMT_STATUS_NO_MEMORY; | |
540 | ||
541 | fill_cwsr_header(q, q->ctx_save_restore, Event, ErrPayload); | |
542 | } | |
442 | 543 | |
443 | 544 | args->ctx_save_restore_address = (uintptr_t)q->ctx_save_restore; |
444 | ||
445 | header = (HsaUserContextSaveAreaHeader *)q->ctx_save_restore; | |
446 | header->ErrorEventId = 0; | |
447 | if (Event) | |
448 | header->ErrorEventId = Event->EventId; | |
449 | header->ErrorReason = ErrPayload; | |
450 | header->DebugOffset = q->ctx_save_restore_size; | |
451 | header->DebugSize = q->debug_memory_size; | |
452 | 545 | } |
453 | 546 | |
454 | 547 | return HSAKMT_STATUS_SUCCESS; |
476 | 569 | int err; |
477 | 570 | HsaNodeProperties props; |
478 | 571 | uint32_t cu_num, i; |
479 | bool use_ats; | |
480 | 572 | |
481 | 573 | CHECK_KFD_OPEN(); |
482 | 574 | |
488 | 580 | if (result != HSAKMT_STATUS_SUCCESS) |
489 | 581 | return result; |
490 | 582 | |
491 | use_ats = prefer_ats(NodeId); | |
492 | ||
493 | 583 | struct queue *q = allocate_exec_aligned_memory(sizeof(*q), |
494 | use_ats, | |
495 | NodeId, false, true); | |
584 | false, NodeId, true, false, true); | |
496 | 585 | if (!q) |
497 | 586 | return HSAKMT_STATUS_NO_MEMORY; |
498 | 587 | |
499 | 588 | memset(q, 0, sizeof(*q)); |
500 | 589 | |
501 | 590 | q->gfxv = get_gfxv_by_node_id(NodeId); |
502 | q->use_ats = use_ats; | |
591 | q->use_ats = false; | |
503 | 592 | q->eop_buffer_size = EOP_BUFFER_SIZE(q->gfxv); |
504 | 593 | |
505 | 594 | /* By default, CUs are all turned on. Initialize cu_mask to '1 |
585 | 674 | err = map_doorbell(NodeId, gpu_id, doorbell_mmap_offset); |
586 | 675 | if (err != HSAKMT_STATUS_SUCCESS) { |
587 | 676 | hsaKmtDestroyQueue(q->queue_id); |
588 | free_queue(q); | |
589 | 677 | return HSAKMT_STATUS_ERROR; |
590 | 678 | } |
591 | 679 |
55 | 55 | #define KFD_SYSFS_PATH_NODES "/sys/devices/virtual/kfd/kfd/topology/nodes" |
56 | 56 | |
57 | 57 | typedef struct { |
58 | uint32_t gpu_id; | |
59 | 58 | HsaNodeProperties node; |
60 | 59 | HsaMemoryProperties *mem; /* node->NumBanks elements */ |
61 | 60 | HsaCacheProperties *cache; |
87 | 86 | }; |
88 | 87 | |
89 | 88 | static HSAKMT_STATUS topology_take_snapshot(void); |
90 | static HSAKMT_STATUS topology_drop_snapshot(void); | |
89 | static void topology_drop_snapshot(void); | |
91 | 90 | |
92 | 91 | static const struct hsa_gfxip_table gfxip_lookup_table[] = { |
93 | 92 | /* Kaveri Family */ |
800 | 799 | return ret; |
801 | 800 | } |
802 | 801 | |
803 | static const struct hsa_gfxip_table *find_hsa_gfxip_device(uint16_t device_id) | |
804 | { | |
802 | static const struct hsa_gfxip_table *find_hsa_gfxip_device(uint16_t device_id, uint8_t gfxv_major) | |
803 | { | |
804 | if (gfxv_major > 10) | |
805 | return NULL; | |
806 | ||
805 | 807 | uint32_t i, table_size; |
806 | 808 | |
807 | 809 | table_size = sizeof(gfxip_lookup_table)/sizeof(struct hsa_gfxip_table); |
1004 | 1006 | return ret; |
1005 | 1007 | } |
1006 | 1008 | |
1007 | static int topology_get_marketing_name(int minor, uint16_t *marketing_name) | |
1009 | static int topology_get_node_props_from_drm(HsaNodeProperties *props) | |
1008 | 1010 | { |
1009 | 1011 | int drm_fd; |
1010 | 1012 | uint32_t major_version; |
1011 | 1013 | uint32_t minor_version; |
1012 | 1014 | amdgpu_device_handle device_handle; |
1015 | struct amdgpu_gpu_info gpu_info; | |
1013 | 1016 | const char *name; |
1014 | int i; | |
1015 | ||
1016 | if (marketing_name == NULL) | |
1017 | int i, ret = 0; | |
1018 | ||
1019 | if (props == NULL) | |
1017 | 1020 | return -1; |
1018 | drm_fd = drmOpenRender(minor); | |
1021 | ||
1022 | drm_fd = drmOpenRender(props->DrmRenderMinor); | |
1019 | 1023 | if (drm_fd < 0) |
1020 | 1024 | return -1; |
1025 | ||
1021 | 1026 | if (amdgpu_device_initialize(drm_fd, |
1022 | 1027 | &major_version, &minor_version, &device_handle) < 0) { |
1023 | drmClose(drm_fd); | |
1024 | return -1; | |
1025 | } | |
1028 | ret = -1; | |
1029 | goto err_device_initialize; | |
1030 | } | |
1031 | ||
1026 | 1032 | name = amdgpu_get_marketing_name(device_handle); |
1027 | 1033 | if (name != NULL) { |
1028 | 1034 | for (i = 0; name[i] != 0 && i < HSA_PUBLIC_NAME_SIZE - 1; i++) |
1029 | marketing_name[i] = name[i]; | |
1030 | marketing_name[i] = '\0'; | |
1031 | } | |
1035 | props->MarketingName[i] = name[i]; | |
1036 | props->MarketingName[i] = '\0'; | |
1037 | } | |
1038 | ||
1039 | if (amdgpu_query_gpu_info(device_handle, &gpu_info)) { | |
1040 | ret = -1; | |
1041 | goto err_query_gpu_info; | |
1042 | } | |
1043 | ||
1044 | props->FamilyID = gpu_info.family_id; | |
1045 | ||
1046 | err_query_gpu_info: | |
1032 | 1047 | amdgpu_device_deinitialize(device_handle); |
1048 | err_device_initialize: | |
1033 | 1049 | drmClose(drm_fd); |
1034 | return 0; | |
1035 | } | |
1036 | ||
1037 | HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id, | |
1038 | HsaNodeProperties *props, | |
1039 | uint32_t *gpu_id, | |
1040 | bool *p2p_links, | |
1041 | uint32_t *num_p2pLinks) | |
1050 | return ret; | |
1051 | } | |
1052 | ||
1053 | static HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id, | |
1054 | HsaNodeProperties *props, | |
1055 | bool *p2p_links, | |
1056 | uint32_t *num_p2pLinks) | |
1042 | 1057 | { |
1043 | 1058 | FILE *fd; |
1044 | 1059 | char *read_buf, *p, *envvar, dummy; |
1055 | 1070 | HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; |
1056 | 1071 | |
1057 | 1072 | assert(props); |
1058 | assert(gpu_id); | |
1059 | 1073 | ret = topology_sysfs_map_node_id(node_id, &sys_node_id); |
1060 | 1074 | if (ret != HSAKMT_STATUS_SUCCESS) |
1061 | 1075 | return ret; |
1062 | 1076 | |
1063 | 1077 | /* Retrieve the GPU ID */ |
1064 | ret = topology_sysfs_get_gpu_id(sys_node_id, gpu_id); | |
1078 | ret = topology_sysfs_get_gpu_id(sys_node_id, &props->KFDGpuID); | |
1079 | if (ret != HSAKMT_STATUS_SUCCESS) | |
1080 | return ret; | |
1065 | 1081 | |
1066 | 1082 | read_buf = malloc(PAGE_SIZE); |
1067 | 1083 | if (!read_buf) |
1170 | 1186 | gfxv = (uint32_t)prop_val; |
1171 | 1187 | } |
1172 | 1188 | |
1189 | /* Bail out early, if a CPU node */ | |
1190 | if (props->NumCPUCores) | |
1191 | goto err; | |
1192 | ||
1173 | 1193 | gfxv_major = HSA_GET_GFX_VERSION_MAJOR(gfxv); |
1174 | 1194 | gfxv_minor = HSA_GET_GFX_VERSION_MINOR(gfxv); |
1175 | 1195 | gfxv_stepping = HSA_GET_GFX_VERSION_STEP(gfxv); |
1176 | 1196 | |
1177 | hsa_gfxip = find_hsa_gfxip_device(props->DeviceId); | |
1197 | hsa_gfxip = find_hsa_gfxip_device(props->DeviceId, gfxv_major); | |
1178 | 1198 | if (hsa_gfxip || gfxv) { |
1179 | 1199 | envvar = getenv("HSA_OVERRIDE_GFX_VERSION"); |
1180 | 1200 | if (envvar) { |
1210 | 1230 | snprintf((char *)props->AMDName, sizeof(props->AMDName)-1, "GFX%06x", |
1211 | 1231 | HSA_GET_GFX_VERSION_FULL(props->EngineId.ui32)); |
1212 | 1232 | |
1213 | if (!props->NumCPUCores) { | |
1214 | /* Is dGPU Node, not APU | |
1215 | * Retrieve the marketing name of the node. | |
1216 | */ | |
1217 | if (topology_get_marketing_name(props->DrmRenderMinor, | |
1218 | props->MarketingName) != 0) | |
1219 | pr_info("failed to get marketing name for device ID 0x%x\n", | |
1220 | props->DeviceId); | |
1221 | } | |
1233 | /* Is dGPU Node, not APU | |
1234 | * Retrieve the marketing name of the node. | |
1235 | */ | |
1236 | if (topology_get_node_props_from_drm(props)) | |
1237 | pr_info("failed to get marketing name for device ID 0x%x\n", props->DeviceId); | |
1222 | 1238 | |
1223 | 1239 | /* Get VGPR/SGPR size in byte per CU */ |
1224 | 1240 | props->SGPRSizePerCU = SGPR_SIZE_PER_CU; |
1722 | 1738 | HsaIoLinkProperties *props = node_props[gpu_node].link; |
1723 | 1739 | uint32_t i; |
1724 | 1740 | |
1725 | if (!node_props[gpu_node].gpu_id || !props || | |
1741 | if (!node_props[gpu_node].node.KFDGpuID || !props || | |
1726 | 1742 | node_props[gpu_node].node.NumIOLinks == 0) |
1727 | 1743 | return -1; |
1728 | 1744 | |
1775 | 1791 | return HSAKMT_STATUS_INVALID_PARAMETER; |
1776 | 1792 | |
1777 | 1793 | /* CPU->CPU is not an indirect link */ |
1778 | if (!node_props[node1].gpu_id && !node_props[node2].gpu_id) | |
1794 | if (!node_props[node1].node.KFDGpuID && !node_props[node2].node.KFDGpuID) | |
1779 | 1795 | return HSAKMT_STATUS_INVALID_NODE_UNIT; |
1780 | 1796 | |
1781 | 1797 | if (node_props[node1].node.HiveID && |
1783 | 1799 | node_props[node1].node.HiveID == node_props[node2].node.HiveID) |
1784 | 1800 | return HSAKMT_STATUS_INVALID_PARAMETER; |
1785 | 1801 | |
1786 | if (node_props[node1].gpu_id) | |
1802 | if (node_props[node1].node.KFDGpuID) | |
1787 | 1803 | dir_cpu1 = gpu_get_direct_link_cpu(node1, node_props); |
1788 | if (node_props[node2].gpu_id) | |
1804 | if (node_props[node2].node.KFDGpuID) | |
1789 | 1805 | dir_cpu2 = gpu_get_direct_link_cpu(node2, node_props); |
1790 | 1806 | |
1791 | 1807 | if (dir_cpu1 < 0 && dir_cpu2 < 0) |
1792 | 1808 | return HSAKMT_STATUS_ERROR; |
1793 | 1809 | |
1794 | 1810 | /* If node2 (dst) is a GPU, it needs to be large-BAR for host access */
1795 | if (node_props[node2].gpu_id) { | |
1811 | if (node_props[node2].node.KFDGpuID) { | |
1796 | 1812 | for (i = 0; i < node_props[node2].node.NumMemoryBanks; ++i) |
1797 | 1813 | if (node_props[node2].mem[i].HeapType == |
1798 | 1814 | HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC) |
1921 | 1937 | for (i = 0; i < sys_props.NumNodes; i++) { |
1922 | 1938 | ret = topology_sysfs_get_node_props(i, |
1923 | 1939 | &temp_props[i].node, |
1924 | &temp_props[i].gpu_id, | |
1925 | 1940 | &p2p_links, &num_p2pLinks); |
1926 | 1941 | if (ret != HSAKMT_STATUS_SUCCESS) { |
1927 | 1942 | free_properties(temp_props, i); |
1962 | 1977 | goto err; |
1963 | 1978 | } |
1964 | 1979 | } |
1965 | } else if (!temp_props[i].gpu_id) { /* a CPU node */ | |
1980 | } else if (!temp_props[i].node.KFDGpuID) { /* a CPU node */ | |
1966 | 1981 | ret = topology_get_cpu_cache_props( |
1967 | 1982 | i, cpuinfo, &temp_props[i]); |
1968 | 1983 | if (ret != HSAKMT_STATUS_SUCCESS) { |
2067 | 2082 | } |
2068 | 2083 | |
2069 | 2084 | /* Drop the Snapshot of the HSA topology information. Assume lock is held. */
2070 | HSAKMT_STATUS topology_drop_snapshot(void) | |
2071 | { | |
2072 | HSAKMT_STATUS err; | |
2073 | ||
2074 | if (!!g_system != !!g_props) { | |
2085 | void topology_drop_snapshot(void) | |
2086 | { | |
2087 | if (!!g_system != !!g_props) | |
2075 | 2088 | pr_warn("Probably inconsistency?\n"); |
2076 | err = HSAKMT_STATUS_SUCCESS; | |
2077 | goto out; | |
2078 | } | |
2079 | 2089 | |
2080 | 2090 | if (g_props) { |
2081 | 2091 | /* Remove state */ |
2091 | 2101 | map_user_to_sysfs_node_id = NULL; |
2092 | 2102 | map_user_to_sysfs_node_id_size = 0; |
2093 | 2103 | } |
2094 | ||
2095 | err = HSAKMT_STATUS_SUCCESS; | |
2096 | ||
2097 | out: | |
2098 | return err; | |
2099 | 2104 | } |
2100 | 2105 | |
2101 | 2106 | HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id) |
2103 | 2108 | if (!g_props || !g_system || g_system->NumNodes <= nodeid) |
2104 | 2109 | return HSAKMT_STATUS_INVALID_NODE_UNIT; |
2105 | 2110 | if (gpu_id) |
2106 | *gpu_id = g_props[nodeid].gpu_id; | |
2111 | *gpu_id = g_props[nodeid].node.KFDGpuID; | |
2107 | 2112 | |
2108 | 2113 | return HSAKMT_STATUS_SUCCESS; |
2109 | 2114 | } |
2113 | 2118 | uint64_t node_idx; |
2114 | 2119 | |
2115 | 2120 | for (node_idx = 0; node_idx < g_system->NumNodes; node_idx++) { |
2116 | if (g_props[node_idx].gpu_id == gpu_id) { | |
2121 | if (g_props[node_idx].node.KFDGpuID == gpu_id) { | |
2117 | 2122 | *node_id = node_idx; |
2118 | 2123 | return HSAKMT_STATUS_SUCCESS; |
2119 | 2124 | } |
2125 | 2130 | |
2126 | 2131 | HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemProperties(HsaSystemProperties *SystemProperties) |
2127 | 2132 | { |
2128 | HSAKMT_STATUS err; | |
2133 | HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS; | |
2129 | 2134 | |
2130 | 2135 | CHECK_KFD_OPEN(); |
2131 | 2136 | |
2133 | 2138 | return HSAKMT_STATUS_INVALID_PARAMETER; |
2134 | 2139 | |
2135 | 2140 | pthread_mutex_lock(&hsakmt_mutex); |
2141 | ||
2142 | /* We already have a valid snapshot. Avoid double initialization that | |
2143 | * would leak memory. | |
2144 | */ | |
2145 | if (g_system) { | |
2146 | *SystemProperties = *g_system; | |
2147 | goto out; | |
2148 | } | |
2136 | 2149 | |
2137 | 2150 | err = topology_take_snapshot(); |
2138 | 2151 | if (err != HSAKMT_STATUS_SUCCESS) |
2140 | 2153 | |
2141 | 2154 | assert(g_system); |
2142 | 2155 | |
2156 | err = fmm_init_process_apertures(g_system->NumNodes); | |
2157 | if (err != HSAKMT_STATUS_SUCCESS) | |
2158 | goto init_process_apertures_failed; | |
2159 | ||
2160 | err = init_process_doorbells(g_system->NumNodes); | |
2161 | if (err != HSAKMT_STATUS_SUCCESS) | |
2162 | goto init_doorbells_failed; | |
2163 | ||
2143 | 2164 | *SystemProperties = *g_system; |
2144 | err = HSAKMT_STATUS_SUCCESS; | |
2165 | ||
2166 | goto out; | |
2167 | ||
2168 | init_doorbells_failed: | |
2169 | fmm_destroy_process_apertures(); | |
2170 | init_process_apertures_failed: | |
2171 | topology_drop_snapshot(); | |
2145 | 2172 | |
2146 | 2173 | out: |
2147 | 2174 | pthread_mutex_unlock(&hsakmt_mutex); |
2150 | 2177 | |
2151 | 2178 | HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemProperties(void) |
2152 | 2179 | { |
2153 | HSAKMT_STATUS err; | |
2154 | ||
2155 | 2180 | pthread_mutex_lock(&hsakmt_mutex); |
2156 | 2181 | |
2157 | err = topology_drop_snapshot(); | |
2182 | destroy_process_doorbells(); | |
2183 | fmm_destroy_process_apertures(); | |
2184 | topology_drop_snapshot(); | |
2158 | 2185 | |
2159 | 2186 | pthread_mutex_unlock(&hsakmt_mutex); |
2160 | 2187 | |
2161 | return err; | |
2188 | return HSAKMT_STATUS_SUCCESS; | |
2189 | } | |
2190 | ||
2191 | HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId, | |
2192 | HsaNodeProperties *NodeProperties) | |
2193 | { | |
2194 | if (!g_system || !g_props || NodeId >= g_system->NumNodes) | |
2195 | return HSAKMT_STATUS_ERROR; | |
2196 | ||
2197 | *NodeProperties = g_props[NodeId].node; | |
2198 | return HSAKMT_STATUS_SUCCESS; | |
2162 | 2199 | } |
2163 | 2200 | |
2164 | 2201 | HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeProperties(HSAuint32 NodeId, |
2177 | 2214 | if (err != HSAKMT_STATUS_SUCCESS) |
2178 | 2215 | goto out; |
2179 | 2216 | |
2180 | *NodeProperties = g_props[NodeId].node; | |
2217 | err = topology_get_node_props(NodeId, NodeProperties); | |
2218 | if (err != HSAKMT_STATUS_SUCCESS) | |
2219 | goto out; | |
2181 | 2220 | /* For CPU only node don't add any additional GPU memory banks. */ |
2182 | 2221 | if (gpu_id) { |
2183 | 2222 | uint64_t base, limit; |
2189 | 2228 | &limit) == HSAKMT_STATUS_SUCCESS) |
2190 | 2229 | NodeProperties->NumMemoryBanks += 1; |
2191 | 2230 | } |
2192 | err = HSAKMT_STATUS_SUCCESS; | |
2193 | 2231 | |
2194 | 2232 | out: |
2195 | 2233 | pthread_mutex_unlock(&hsakmt_mutex); |
2317 | 2355 | return err; |
2318 | 2356 | } |
2319 | 2357 | |
2358 | HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId, | |
2359 | HSAuint32 NumIoLinks, | |
2360 | HsaIoLinkProperties *IoLinkProperties) | |
2361 | { | |
2362 | if (!g_system || !g_props || NodeId >= g_system->NumNodes) | |
2363 | return HSAKMT_STATUS_ERROR; | |
2364 | ||
2365 | memcpy(IoLinkProperties, g_props[NodeId].link, | |
2366 | NumIoLinks * sizeof(*IoLinkProperties)); | |
2367 | ||
2368 | return HSAKMT_STATUS_SUCCESS; | |
2369 | } | |
2370 | ||
2320 | 2371 | HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeIoLinkProperties(HSAuint32 NodeId, |
2321 | 2372 | HSAuint32 NumIoLinks, |
2322 | 2373 | HsaIoLinkProperties *IoLinkProperties) |
2323 | 2374 | { |
2324 | 2375 | HSAKMT_STATUS err; |
2325 | uint32_t i; | |
2326 | 2376 | |
2327 | 2377 | if (!IoLinkProperties) |
2328 | 2378 | return HSAKMT_STATUS_INVALID_PARAMETER; |
2342 | 2392 | goto out; |
2343 | 2393 | } |
2344 | 2394 | |
2345 | for (i = 0; i < MIN(g_props[NodeId].node.NumIOLinks, NumIoLinks); i++) { | |
2346 | assert(g_props[NodeId].link); | |
2347 | IoLinkProperties[i] = g_props[NodeId].link[i]; | |
2348 | } | |
2349 | ||
2350 | err = HSAKMT_STATUS_SUCCESS; | |
2395 | assert(g_props[NodeId].link); | |
2396 | err = topology_get_iolink_props(NodeId, NumIoLinks, IoLinkProperties); | |
2351 | 2397 | |
2352 | 2398 | out: |
2353 | 2399 | pthread_mutex_unlock(&hsakmt_mutex); |
2382 | 2428 | return 0; |
2383 | 2429 | |
2384 | 2430 | for (i = 0; i < g_system->NumNodes; i++) { |
2385 | if (g_props[i].gpu_id == gpu_id) | |
2431 | if (g_props[i].node.KFDGpuID == gpu_id) | |
2386 | 2432 | return g_props[i].node.DeviceId; |
2387 | 2433 | } |
2388 | 2434 |
32 | 32 | set ( CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc." ) |
33 | 33 | set ( CPACK_PACKAGE_DESCRIPTION "This package includes kfdtest, the list of excluded tests for each ASIC, and a convenience script to run the test suite" ) |
34 | 34 | set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "Test suite for ROCK/KFD" ) |
35 | ||
36 | # Make proper version for appending | |
37 | # Default Value is 99999, setting it first | |
38 | set(ROCM_VERSION_FOR_PACKAGE "99999") | |
39 | if(DEFINED ENV{ROCM_LIBPATCH_VERSION}) | |
40 | set(ROCM_VERSION_FOR_PACKAGE $ENV{ROCM_LIBPATCH_VERSION}) | |
41 | endif() | |
42 | ||
35 | 43 | set ( CPACK_PACKAGE_VERSION_MAJOR "1" ) |
36 | 44 | set ( CPACK_PACKAGE_VERSION_MINOR "0" ) |
37 | 45 | set ( CPACK_PACKAGE_VERSION_PATCH "0" ) |
38 | 46 | set ( CPACK_PACKAGE_HOMEPAGE_URL "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface" ) |
47 | set ( CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") | |
48 | set ( CPACK_RPM_FILE_NAME "RPM-DEFAULT") | |
49 | ||
50 | set(PACKAGE_VERSION_STR "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}.${ROCM_VERSION_FOR_PACKAGE}") | |
51 | set(CPACK_PACKAGE_VERSION "${PACKAGE_VERSION_STR}") | |
52 | ||
39 | 53 | |
40 | 54 | ## Define default variable and variables for the optional build target hsakmt-dev |
41 | 55 | set ( SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} CACHE STRING "Location of hsakmt source code." ) |
94 | 108 | |
95 | 109 | message ( "Find libhsakmt at ${HSAKMT_LIBRARY_DIRS}" ) |
96 | 110 | |
97 | set ( SP3_DIR ${PROJECT_SOURCE_DIR}/sp3 ) | |
111 | if ( POLICY CMP0074 ) | |
112 | cmake_policy( SET CMP0074 NEW ) | |
113 | endif() | |
114 | ||
115 | find_path( LIGHTNING_CMAKE_DIR NAMES LLVMConfig.cmake | |
116 | PATHS $ENV{OUT_DIR}/llvm/lib/cmake/llvm NO_CACHE NO_DEFAULT_PATH) | |
117 | ||
118 | if ( DEFINED LIGHTNING_CMAKE_DIR AND EXISTS ${LIGHTNING_CMAKE_DIR} ) | |
119 | set ( LLVM_DIR ${LIGHTNING_CMAKE_DIR} ) | |
120 | else() | |
121 | message( STATUS "Couldn't find Lightning build in compute directory. " | |
122 | "Searching LLVM_DIR then defaulting to system LLVM install if still not found..." ) | |
123 | endif() | |
124 | ||
125 | find_package( LLVM REQUIRED CONFIG ) | |
126 | ||
127 | if( ${LLVM_PACKAGE_VERSION} VERSION_LESS "7.0" ) | |
128 | message( FATAL_ERROR "Requires LLVM 7.0 or greater " | |
129 | "(found ${LLVM_PACKAGE_VERSION})" ) | |
130 | elseif( ${LLVM_PACKAGE_VERSION} VERSION_LESS "14.0" ) | |
131 | message( WARNING "Not using latest LLVM version. " | |
132 | "Some ASIC targets may not work!" ) | |
133 | endif() | |
134 | ||
135 | message( STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}" ) | |
136 | message( STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}" ) | |
137 | ||
138 | include_directories(${LLVM_INCLUDE_DIRS}) | |
139 | separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS}) | |
140 | add_definitions(${LLVM_DEFINITIONS_LIST}) | |
141 | ||
142 | llvm_map_components_to_libnames(llvm_libs AMDGPUAsmParser Core Support) | |
98 | 143 | |
99 | 144 | include_directories(${PROJECT_SOURCE_DIR}/gtest-1.6.0) |
100 | 145 | include_directories(${PROJECT_SOURCE_DIR}/include) |
101 | 146 | include_directories(${PROJECT_SOURCE_DIR}/../../include) |
102 | include_directories(${SP3_DIR}) | |
103 | 147 | |
104 | 148 | include_directories(${DRM_INCLUDE_DIRS}) |
105 | 149 | |
111 | 155 | src/Dispatch.cpp |
112 | 156 | src/GoogleTestExtension.cpp |
113 | 157 | src/IndirectBuffer.cpp |
114 | src/IsaGenerator.cpp | |
115 | src/IsaGenerator_Aldebaran.cpp | |
116 | src/IsaGenerator_Gfx10.cpp | |
117 | src/IsaGenerator_Gfx72.cpp | |
118 | src/IsaGenerator_Gfx8.cpp | |
119 | src/IsaGenerator_Gfx9.cpp | |
158 | src/Assemble.cpp | |
159 | src/ShaderStore.cpp | |
120 | 160 | src/LinuxOSWrapper.cpp |
121 | 161 | src/PM4Packet.cpp |
122 | 162 | src/PM4Queue.cpp |
139 | 179 | src/KFDExceptionTest.cpp |
140 | 180 | src/KFDGraphicsInterop.cpp |
141 | 181 | src/KFDPerfCounters.cpp |
142 | src/KFDDBGTest.cpp | |
143 | 182 | src/KFDGWSTest.cpp |
144 | 183 | src/KFDIPCTest.cpp |
184 | src/KFDASMTest.cpp | |
145 | 185 | |
146 | 186 | src/KFDEvictTest.cpp |
147 | 187 | src/KFDHWSTest.cpp |
162 | 202 | |
163 | 203 | if ( "${CMAKE_C_COMPILER_VERSION}" VERSION_GREATER "4.8.0") |
164 | 204 | ## Compare as versions, not strings (lexicographically "10.x" sorts before "4.8.0"). Add --enable-new-dtags to generate DT_RUNPATH |
165 | set ( CMAKE_CXX_FLAGS "-std=gnu++11 -Wl,--enable-new-dtags" ) | |
205 | set ( CMAKE_CXX_FLAGS "-std=gnu++17 -Wl,--enable-new-dtags" ) | |
166 | 206 | endif() |
167 | 207 | if ( "${CMAKE_BUILD_TYPE}" STREQUAL Release ) |
168 | 208 | set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2" ) |
180 | 220 | # The modules found by pkg_check_modules() in the default pkg config |
181 | 221 | # path do not need to use link_directories() here. |
182 | 222 | link_directories(${HSAKMT_LIBRARY_DIRS}) |
183 | link_directories(${SP3_DIR}) | |
184 | 223 | |
185 | 224 | add_executable(kfdtest ${SRC_FILES}) |
186 | 225 | |
187 | target_link_libraries(kfdtest ${HSAKMT_LIBRARIES} ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread m stdc++ rt amdsp3 numa) | |
226 | target_link_libraries(kfdtest ${HSAKMT_LIBRARIES} ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} ${llvm_libs} pthread m stdc++ rt numa) | |
188 | 227 | |
189 | 228 | configure_file ( scripts/kfdtest.exclude kfdtest.exclude COPYONLY ) |
190 | 229 | configure_file ( scripts/run_kfdtest.sh run_kfdtest.sh COPYONLY ) |
68 | 68 | "KFDQMTest.mGPUShareBO:"\ |
69 | 69 | "KFDQMTest.SdmaEventInterrupt:"\ |
70 | 70 | "KFDMemoryTest.CacheInvalidateOnRemoteWrite:"\ |
71 | "KFDDBGTest.BasicDebuggerSuspendResume:"\ | |
72 | 71 | "KFDEvictTest.BurstyTest:"\ |
73 | 72 | "KFDHWSTest.*:"\ |
74 | 73 | "KFDSVMRangeTest.ReadOnlyRangeTest" |
105 | 104 | "KFDQMTest.Atomics:"\ |
106 | 105 | "KFDQMTest.GPUDoorbellWrite" |
107 | 106 | |
107 | # KFDCWSRTest.BasicTest*: SWDEV-353206 | |
108 | BLACKLIST_GFX10=\ | |
109 | "KFDMemoryTest.DeviceHdpFlush:"\ | |
110 | "KFDQMTest.BasicCuMaskingEven:"\ | |
111 | "KFDSVMEvictTest.*:"\ | |
112 | "KFDCWSRTest.BasicTest*" | |
113 | ||
114 | BLACKLIST_GFX10_NV2X=\ | |
115 | "$BLACKLIST_GFX10:"\ | |
116 | "KFDPerfCountersTest.*" | |
117 | ||
118 | # GFX11 still undergoing debug. Ticket links: | |
119 | # KFDMemoryTest.FlatScratchAccess - SWDEV-329877 | |
120 | # KFDEvictTest.QueueTest - SWDEV-325064 | |
121 | # KFDQMTest.MultipleCpQueuesStressDispatch - SWDEV-340965 | |
122 | # KFDExceptionTest.* - SWDEV-340972 | |
123 | TEMPORARY_BLACKLIST_GFX11=\ | |
124 | "KFDQMTest.CreateAqlCpQueue:"\ | |
125 | "KFDQMTest.MultipleCpQueuesStressDispatch:"\ | |
126 | "KFDCWSRTest.InterruptRestore:"\ | |
127 | "KFDExceptionTest.*:"\ | |
128 | "KFDEvictTest.QueueTest:"\ | |
129 | "KFDSVMRangeTest.*Migrate*:"\ | |
130 | "KFDSVMRangeTest.*Migration*:"\ | |
131 | "KFDMemoryTest.FlatScratchAccess" | |
132 | ||
108 | 133 | # KFDQMTest.CpuWriteCoherence fails. 0 dwordsAvailable (KFD-338) |
109 | 134 | # KFDMemoryTest.MemoryRegister fails on SDMA queue creation (KFD-337) |
110 | 135 | FILTER[kaveri]=\ |
223 | 248 | "KFDMemoryTest.PtraceAccess:"\ |
224 | 249 | "KFDMemoryTest.DeviceHdpFlush" |
225 | 250 | |
226 | # SP3 Compiler needs to be updated for GFX10. Temporarily disable all tests | |
227 | # that require shader compiler | |
228 | # Adding KFDSVMEvictTest as SVM/HMM was never validated on GFX10 | |
229 | TEMP_GFX10_BLACKLIST=\ | |
230 | "KFDMemoryTest.FlatScratchAccess:"\ | |
231 | "KFDMemoryTest.PtraceAccessInvisibleVram:"\ | |
232 | "KFDQMTest.QueuePriorityOnDifferentPipe:"\ | |
233 | "KFDQMTest.QueuePriorityOnSamePipe:"\ | |
234 | "KFDCWSRTest.BasicTest:"\ | |
235 | "KFDQMTest.BasicCuMaskingEven:"\ | |
236 | "KFDEvictTest.QueueTest:"\ | |
237 | "KFDMemoryTest.MapUnmapToNodes:"\ | |
238 | "KFDMemoryTest.HostHdpFlush:"\ | |
239 | "KFDMemoryTest.DeviceHdpFlush:"\ | |
240 | "KFDSVMEvictTest.*" | |
241 | ||
242 | 251 | FILTER[navi10]=\ |
243 | 252 | "$BLACKLIST_ALL_ASICS:"\ |
244 | "$TEMP_GFX10_BLACKLIST:"\ | |
253 | "$BLACKLIST_GFX10:"\ | |
245 | 254 | "KFDMemoryTest.MMBench" |
246 | 255 | |
247 | 256 | # Need to verify the following failed tests on another machine: |
250 | 259 | # P2PBandwidth failing (wait times out) on node-to-multiple-nodes by [push, NONE] |
251 | 260 | FILTER[navi12]=\ |
252 | 261 | "$BLACKLIST_ALL_ASICS:"\ |
262 | "$BLACKLIST_GFX10:"\ | |
253 | 263 | "KFDExceptionTest.*:"\ |
254 | 264 | "KFDPerfCountersTest.*:"\ |
255 | "KFDPerformanceTest.P2PBandWidthTest:"\ | |
256 | "$TEMP_GFX10_BLACKLIST" | |
265 | "KFDPerformanceTest.P2PBandWidthTest" | |
257 | 266 | |
258 | 267 | FILTER[navi14]=\ |
259 | 268 | "$BLACKLIST_ALL_ASICS:"\ |
260 | "$TEMP_GFX10_BLACKLIST" | |
269 | "$BLACKLIST_GFX10" | |
261 | 270 | |
262 | 271 | FILTER[sienna_cichlid]=\ |
263 | 272 | "$BLACKLIST_ALL_ASICS:"\ |
264 | "$TEMP_GFX10_BLACKLIST:"\ | |
265 | "KFDQMTest.BasicCuMaskingEven:"\ | |
266 | "KFDDBGTest.*:"\ | |
267 | "KFDPerfCountersTest.*:"\ | |
273 | "$BLACKLIST_GFX10_NV2X" | |
268 | 274 | |
269 | 275 | FILTER[navy_flounder]=\ |
270 | 276 | "$BLACKLIST_ALL_ASICS:"\ |
271 | "$TEMP_GFX10_BLACKLIST:"\ | |
272 | "KFDQMTest.BasicCuMaskingEven:"\ | |
273 | "KFDDBGTest.*:"\ | |
274 | "KFDPerfCountersTest.*:"\ | |
277 | "$BLACKLIST_GFX10_NV2X" | |
275 | 278 | |
276 | 279 | FILTER[dimgrey_cavefish]=\ |
277 | 280 | "$BLACKLIST_ALL_ASICS:"\ |
278 | "$TEMP_GFX10_BLACKLIST:"\ | |
279 | "KFDQMTest.BasicCuMaskingEven:"\ | |
280 | "KFDDBGTest.*:"\ | |
281 | "KFDPerfCountersTest.*:"\ | |
281 | "$BLACKLIST_GFX10_NV2X" | |
282 | 282 | |
283 | 283 | FILTER[beige_goby]=\ |
284 | 284 | "$BLACKLIST_ALL_ASICS:"\ |
285 | "$TEMP_GFX10_BLACKLIST:"\ | |
286 | "KFDQMTest.BasicCuMaskingEven:"\ | |
287 | "KFDDBGTest.*:"\ | |
288 | "KFDPerfCountersTest.*:"\ | |
285 | "$BLACKLIST_GFX10_NV2X" | |
289 | 286 | |
290 | 287 | FILTER[yellow_carp]=\ |
291 | 288 | "$BLACKLIST_ALL_ASICS:"\ |
292 | "$TEMP_GFX10_BLACKLIST:"\ | |
293 | "KFDQMTest.BasicCuMaskingEven:"\ | |
294 | "KFDIPCTest.CMABasicTest" | |
289 | "$BLACKLIST_GFX10_NV2X" | |
290 | ||
291 | FILTER[gfx1100]=\ | |
292 | "$BLACKLIST_ALL_ASICS:"\ | |
293 | "$BLACKLIST_GFX10_NV2X:"\ | |
294 | "$TEMPORARY_BLACKLIST_GFX11" | |
295 | ||
296 | FILTER[gfx1101]=\ | |
297 | "$BLACKLIST_ALL_ASICS:"\ | |
298 | "$BLACKLIST_GFX10_NV2X:"\ | |
299 | "$TEMPORARY_BLACKLIST_GFX11" | |
300 | ||
301 | FILTER[gfx1102]=\ | |
302 | "$BLACKLIST_ALL_ASICS:"\ | |
303 | "$BLACKLIST_GFX10_NV2X:"\ | |
304 | "$TEMPORARY_BLACKLIST_GFX11" | |
305 | ||
306 | FILTER[gfx1103]=\ | |
307 | "$BLACKLIST_ALL_ASICS:"\ | |
308 | "$BLACKLIST_GFX10_NV2X:"\ | |
309 | "$TEMPORARY_BLACKLIST_GFX11" | |
310 | ||
311 | FILTER[gfx1036]=\ | |
312 | "$BLACKLIST_ALL_ASICS:"\ | |
313 | "$BLACKLIST_GFX10_NV2X" |
79 | 79 | NODE="" |
80 | 80 | FORCE_HIGH="" |
81 | 81 | RUN_IN_DOCKER="" |
82 | ADDITIONAL_EXCLUDE="" | |
82 | 83 | |
83 | 84 | printUsage() { |
84 | 85 | echo |
94 | 95 | echo " -l , --list List available nodes" |
95 | 96 | echo " --high Force clocks to high for test execution" |
96 | 97 | echo " -d , --docker Run in docker container" |
98 | echo " -e , --exclude Additional tests to exclude, in addition to kfdtest.exclude (colon-separated, single quoted string as an argument)" | |
97 | 99 | echo " -h , --help Prints this help" |
98 | 100 | echo |
99 | 101 | echo "Gtest arguments will be forwarded to the app" |
121 | 123 | gtestFilter="--gtest_filter=${FILTER[$platform]}" |
122 | 124 | ;; |
123 | 125 | esac |
126 | if [ -n "$ADDITIONAL_EXCLUDE" ]; then | |
127 | gtestFilter="$gtestFilter:$ADDITIONAL_EXCLUDE" | |
128 | fi | |
124 | 129 | } |
125 | 130 | |
126 | 131 | TOPOLOGY_SYSFS_DIR=/sys/devices/virtual/kfd/kfd/topology/nodes |
138 | 143 | } |
139 | 144 | |
140 | 145 | |
141 | # Prints GPU Name for the given Node ID | |
146 | # Prints GPU Name for the given Node ID. If transitioned to IP discovery, | |
147 | # use target gfx version | |
142 | 148 | # param - Node ID |
143 | 149 | getNodeName() { |
144 | 150 | local nodeId=$1; shift; |
147 | 153 | local CpuCoresCount=$(cat $TOPOLOGY_SYSFS_DIR/$nodeId/properties | grep cpu_cores_count | awk '{print $2}') |
148 | 154 | local SimdCount=$(cat $TOPOLOGY_SYSFS_DIR/$nodeId/properties | grep simd_count | awk '{print $2}') |
149 | 155 | if [ "$CpuCoresCount" -eq 0 ] && [ "$SimdCount" -gt 0 ]; then |
150 | gpuName="raven_dgpuFallback" | |
156 | gpuName="raven_dgpuFallback" | |
157 | fi | |
158 | elif [ "$gpuName" == "ip discovery" ]; then | |
159 | if [ -n "$HSA_OVERRIDE_GFX_VERSION" ]; then | |
160 | gpuName="gfx$(echo "$HSA_OVERRIDE_GFX_VERSION" | awk 'BEGIN {FS="."; RS=""} {printf "%d%x%x", $1, $2, $3 }')" | |
161 | else | |
162 | local GfxVersionDec=$(cat $TOPOLOGY_SYSFS_DIR/$nodeId/properties | grep gfx_target_version | awk '{print $2}') | |
163 | gpuName="gfx$(printf "$GfxVersionDec" | fold -w2 | awk 'BEGIN {FS="\n"; RS=""} {printf "%d%x%x", $1, $2, $3}')" | |
151 | 164 | fi |
152 | 165 | fi |
153 | 166 | echo "$gpuName" |
165 | 178 | exit 0 |
166 | 179 | fi |
167 | 180 | PKG_ROOT="$(getPackageRoot)" |
181 | fi | |
182 | ||
183 | if [ -n "$GTEST_ARGS" ] && [ -n "$ADDITIONAL_EXCLUDE" ]; then | |
184 | echo "Cannot use -e and --gtest_filter flags together" | |
185 | exit 1 # conflicting options are a usage error, so report failure to the caller | |
168 | 186 | fi |
169 | 187 | |
170 | 188 | if [ "$NODE" == "" ]; then |
241 | 259 | FORCE_HIGH="true" ;; |
242 | 260 | -d | --docker ) |
243 | 261 | RUN_IN_DOCKER="true" ;; |
262 | -e | --exclude ) | |
263 | ADDITIONAL_EXCLUDE="$2" ; shift ;; | |
244 | 264 | -h | --help ) |
245 | 265 | printUsage; exit 0 ;; |
246 | 266 | *) |
0 | //////////////////////////////////////////////////////////////////////////////// | |
1 | // | |
2 | // The University of Illinois/NCSA | |
3 | // Open Source License (NCSA) | |
4 | // | |
5 | // Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved. | |
6 | // | |
7 | // Developed by: | |
8 | // | |
9 | // AMD Research and AMD HSA Software Development | |
10 | // | |
11 | // Advanced Micro Devices, Inc. | |
12 | // | |
13 | // www.amd.com | |
14 | // | |
15 | // Permission is hereby granted, free of charge, to any person obtaining a copy | |
16 | // of this software and associated documentation files (the "Software"), to | |
17 | // deal with the Software without restriction, including without limitation | |
18 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
19 | // and/or sell copies of the Software, and to permit persons to whom the | |
20 | // Software is furnished to do so, subject to the following conditions: | |
21 | // | |
22 | // - Redistributions of source code must retain the above copyright notice, | |
23 | // this list of conditions and the following disclaimers. | |
24 | // - Redistributions in binary form must reproduce the above copyright | |
25 | // notice, this list of conditions and the following disclaimers in | |
26 | // the documentation and/or other materials provided with the distribution. | |
27 | // - Neither the names of Advanced Micro Devices, Inc, | |
28 | // nor the names of its contributors may be used to endorse or promote | |
29 | // products derived from this Software without specific prior written | |
30 | // permission. | |
31 | // | |
32 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
33 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
34 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
35 | // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
36 | // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
37 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
38 | // DEALINGS WITH THE SOFTWARE. | |
39 | // | |
40 | //////////////////////////////////////////////////////////////////////////////// | |
41 | ||
42 | /** | |
43 | * Self-contained assembler that uses the LLVM MC API to assemble AMDGCN | |
44 | * instructions | |
45 | */ | |
46 | ||
47 | #include <llvm/Config/llvm-config.h> | |
48 | #include <llvm/MC/MCAsmBackend.h> | |
49 | #include <llvm/MC/MCAsmInfo.h> | |
50 | #include <llvm/MC/MCCodeEmitter.h> | |
51 | #include <llvm/MC/MCContext.h> | |
52 | #include <llvm/MC/MCInstPrinter.h> | |
53 | #include <llvm/MC/MCInstrInfo.h> | |
54 | #include <llvm/MC/MCObjectFileInfo.h> | |
55 | #include <llvm/MC/MCObjectWriter.h> | |
56 | #include <llvm/MC/MCParser/AsmLexer.h> | |
57 | #include <llvm/MC/MCParser/MCTargetAsmParser.h> | |
58 | #include <llvm/MC/MCRegisterInfo.h> | |
59 | #include <llvm/MC/MCStreamer.h> | |
60 | #include <llvm/MC/MCSubtargetInfo.h> | |
61 | #include <llvm/Support/CommandLine.h> | |
62 | #include <llvm/Support/InitLLVM.h> | |
63 | #include <llvm/Support/MemoryBuffer.h> | |
64 | #include <llvm/Support/SourceMgr.h> | |
65 | #include <llvm/Support/TargetSelect.h> | |
66 | #if LLVM_VERSION_MAJOR > 13 | |
67 | #include <llvm/MC/TargetRegistry.h> | |
68 | #else | |
69 | #include <llvm/Support/TargetRegistry.h> | |
70 | #endif | |
71 | ||
72 | #include <linux/elf.h> | |
73 | #include "OSWrapper.hpp" | |
74 | #include "Assemble.hpp" | |
75 | ||
76 | using namespace llvm; | |
77 | ||
78 | Assembler::Assembler(const uint32_t Gfxv) { | |
79 | SetTargetAsic(Gfxv); | |
80 | TextData = nullptr; | |
81 | TextSize = 0; | |
82 | LLVMInit(); | |
83 | } | |
84 | ||
85 | Assembler::~Assembler() { | |
86 | FlushText(); | |
87 | llvm_shutdown(); | |
88 | } | |
89 | ||
90 | const char* Assembler::GetInstrStream() { | |
91 | return TextData; | |
92 | } | |
93 | ||
94 | const size_t Assembler::GetInstrStreamSize() { | |
95 | return TextSize; | |
96 | } | |
97 | ||
98 | int Assembler::CopyInstrStream(char* OutBuf, const size_t BufSize) { | |
99 | if (TextSize > BufSize) | |
100 | return -2; | |
101 | ||
102 | std::copy(TextData, TextData + TextSize, OutBuf); | |
103 | return 0; | |
104 | } | |
105 | ||
106 | const char* Assembler::GetTargetAsic() { | |
107 | return MCPU; | |
108 | } | |
109 | ||
110 | /** | |
111 | * Set MCPU via GFX Version from Thunk | |
112 | * LLVM Target IDs use decimal for Maj/Min, hex for Step | |
113 | */ | |
114 | void Assembler::SetTargetAsic(const uint32_t Gfxv) { | |
115 | const uint8_t Major = (Gfxv >> 16) & 0xff; | |
116 | const uint8_t Minor = (Gfxv >> 8) & 0xff; | |
117 | const uint8_t Step = Gfxv & 0xff; | |
118 | ||
119 | snprintf(MCPU, ASM_MCPU_LEN, "gfx%d%d%x", Major, Minor, Step); | |
120 | } | |
121 | ||
122 | /** | |
123 | * Initialize LLVM targets and assembly printers/parsers | |
124 | */ | |
125 | void Assembler::LLVMInit() { | |
126 | LLVMInitializeAMDGPUTargetInfo(); | |
127 | LLVMInitializeAMDGPUTargetMC(); | |
128 | LLVMInitializeAMDGPUAsmParser(); | |
129 | } | |
130 | ||
131 | /** | |
132 | * Flush/reset TextData and TextSize to initial state | |
133 | */ | |
134 | void Assembler::FlushText() { | |
135 | if (TextData) | |
136 | delete[] TextData; | |
137 | TextData = nullptr; | |
138 | TextSize = 0; | |
139 | } | |
140 | ||
141 | /** | |
142 | * Print hex of ELF object to stdout (debug) | |
143 | */ | |
144 | void Assembler::PrintELFHex(const std::string Data) { | |
145 | outs() << "ASM Info: assembled ELF hex data (length " << Data.length() << "):\n"; | |
146 | outs() << "0x00:\t"; | |
147 | for (size_t i = 0; i < Data.length(); ++i) { | |
148 | char c = Data[i]; | |
149 | outs() << format_hex(static_cast<uint8_t>(c), 4); | |
150 | if ((i+1) % 16 == 0) | |
151 | outs() << "\n" << format_hex(i+1, 4) << ":\t"; | |
152 | else | |
153 | outs() << " "; | |
154 | } | |
155 | outs() << "\n"; | |
156 | } | |
157 | ||
158 | /** | |
159 | * Print hex of raw instruction stream to stdout (debug) | |
160 | */ | |
161 | void Assembler::PrintTextHex() { | |
162 | outs() << "ASM Info: assembled .text hex data (length " << TextSize << "):\n"; | |
163 | outs() << "0x00:\t"; | |
164 | for (size_t i = 0; i < TextSize; i++) { | |
165 | outs() << format_hex(static_cast<uint8_t>(TextData[i]), 4); | |
166 | if ((i+1) % 16 == 0) | |
167 | outs() << "\n" << format_hex(i+1, 4) << ":\t"; | |
168 | else | |
169 | outs() << " "; | |
170 | } | |
171 | outs() << "\n"; | |
172 | } | |
173 | ||
174 | /** | |
175 | * Extract raw instruction stream from .text section in ELF object | |
176 | * | |
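177 | * @param RawData Pointer to the raw bytes of the ELF object (binary data, not a NUL-terminated string) | |
178 | * @return 0 on success, -1 if RawData is null, is not a 64-bit ELF object, or has no .text section | |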
179 | */ | |
180 | int Assembler::ExtractELFText(const char* RawData) { | |
181 | const Elf64_Ehdr* ElfHeader; | |
182 | const Elf64_Shdr* SectHeader; | |
183 | const Elf64_Shdr* SectStrTable; | |
184 | const char* SectStrAddr; | |
185 | unsigned NumSects, SectIdx; | |
186 | ||
187 | if (!(ElfHeader = reinterpret_cast<const Elf64_Ehdr*>(RawData))) { | |
188 | outs() << "ASM Error: elf data is invalid or corrupted\n"; | |
189 | return -1; | |
190 | } | |
191 | if (ElfHeader->e_ident[EI_CLASS] != ELFCLASS64) { | |
192 | outs() << "ASM Error: elf object must be of 64-bit type\n"; | |
193 | return -1; | |
194 | } | |
195 | ||
196 | SectHeader = reinterpret_cast<const Elf64_Shdr*>(RawData + ElfHeader->e_shoff); | |
197 | SectStrTable = &SectHeader[ElfHeader->e_shstrndx]; | |
198 | SectStrAddr = static_cast<const char*>(RawData + SectStrTable->sh_offset); | |
199 | ||
200 | // Loop through sections, break on .text | |
201 | NumSects = ElfHeader->e_shnum; | |
202 | for (SectIdx = 0; SectIdx < NumSects; SectIdx++) { | |
203 | std::string SectName = std::string(SectStrAddr + SectHeader[SectIdx].sh_name); | |
204 | if (SectName == std::string(".text")) { | |
205 | TextSize = SectHeader[SectIdx].sh_size; | |
206 | TextData = new char[TextSize]; | |
207 | memcpy(TextData, RawData + SectHeader[SectIdx].sh_offset, TextSize); | |
208 | break; | |
209 | } | |
210 | } | |
211 | ||
212 | if (SectIdx >= NumSects) { | |
213 | outs() << "ASM Error: couldn't locate .text section\n"; | |
214 | return -1; | |
215 | } | |
216 | ||
217 | return 0; | |
218 | } | |
219 | ||
220 | /** | |
221 | * Assemble shader, fill member vars, and copy to output buffer | |
222 | * | |
223 | * @param AssemblySource Shader source represented as a raw C string | |
224 | * @param OutBuf Raw instruction stream output buffer | |
225 | * @param BufSize Size of OutBuf (defaults to PAGE_SIZE) | |
226 | * @return Non-zero RunAssemble() error code if assembly fails, otherwise the result of CopyInstrStream() (0 on success, -2 if OutBuf is too small) | |
227 | */ | |
228 | int Assembler::RunAssembleBuf(const char* const AssemblySource, char* OutBuf, | |
229 | const size_t BufSize) { | |
230 | int ret = RunAssemble(AssemblySource); | |
231 | return ret ? ret : CopyInstrStream(OutBuf, BufSize); | |
232 | } | |
233 | ||
234 | /** | |
235 | * Assemble shader and fill member vars (TextData/TextSize) with the extracted .text section | |
236 | * | |
237 | * @param AssemblySource Shader source represented as a raw C string | |
238 | * @return 0 on success, negative value on any failure | |
239 | */ | |
240 | int Assembler::RunAssemble(const char* const AssemblySource) { | |
241 | // Ensure target ASIC has been set (MCPU is an array, so test its contents, not its address) | |
242 | if (MCPU[0] == '\0') { | |
243 | outs() << "ASM Error: target asic is uninitialized\n"; | |
244 | return -1; | |
245 | } | |
246 | ||
247 | // Delete TextData for any previous runs | |
248 | FlushText(); | |
249 | ||
250 | #if 0 | |
251 | outs() << "ASM Info: running assembly for target: " << MCPU << "\n"; | |
252 | outs() << "ASM Info: source:\n"; | |
253 | outs() << AssemblySource << "\n"; | |
254 | #endif | |
255 | ||
256 | // Initialize MCOptions and target triple | |
257 | const MCTargetOptions MCOptions; | |
258 | Triple TheTriple; | |
259 | ||
260 | const Target* TheTarget = | |
261 | TargetRegistry::lookupTarget(ArchName, TheTriple, Error); | |
262 | if (!TheTarget) { | |
263 | outs() << Error; | |
264 | return -1; | |
265 | } | |
266 | ||
267 | TheTriple.setArchName(ArchName); | |
268 | TheTriple.setVendorName(VendorName); | |
269 | TheTriple.setOSName(OSName); | |
270 | ||
271 | TripleName = TheTriple.getTriple(); | |
272 | TheTriple.setTriple(Triple::normalize(TripleName)); | |
273 | ||
274 | // Create MemoryBuffer for assembly source | |
275 | StringRef AssemblyRef(AssemblySource); | |
276 | std::unique_ptr<MemoryBuffer> BufferPtr = | |
277 | MemoryBuffer::getMemBuffer(AssemblyRef, "", false); | |
278 | if (!BufferPtr->getBufferSize()) { | |
279 | outs() << "ASM Error: assembly source is empty\n"; | |
280 | return -1; | |
281 | } | |
282 | ||
283 | // Instantiate SrcMgr and transfer BufferPtr ownership | |
284 | SourceMgr SrcMgr; | |
285 | SrcMgr.AddNewSourceBuffer(std::move(BufferPtr), SMLoc()); | |
286 | ||
287 | // Initialize MC interfaces and base class objects | |
288 | std::unique_ptr<const MCRegisterInfo> MRI( | |
289 | TheTarget->createMCRegInfo(TripleName)); | |
290 | if (!MRI) { | |
291 | outs() << "ASM Error: no register info for target " << MCPU << "\n"; | |
292 | return -1; | |
293 | } | |
294 | #if LLVM_VERSION_MAJOR > 9 | |
295 | std::unique_ptr<const MCAsmInfo> MAI( | |
296 | TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions)); | |
297 | #else | |
298 | std::unique_ptr<const MCAsmInfo> MAI( | |
299 | TheTarget->createMCAsmInfo(*MRI, TripleName)); | |
300 | #endif | |
301 | if (!MAI) { | |
302 | outs() << "ASM Error: no assembly info for target " << MCPU << "\n"; | |
303 | return -1; | |
304 | } | |
305 | std::unique_ptr<MCInstrInfo> MCII( | |
306 | TheTarget->createMCInstrInfo()); | |
307 | if (!MCII) { | |
308 | outs() << "ASM Error: no instruction info for target " << MCPU << "\n"; | |
309 | return -1; | |
310 | } | |
311 | std::unique_ptr<MCSubtargetInfo> STI( | |
312 | TheTarget->createMCSubtargetInfo(TripleName, MCPU, std::string())); | |
313 | if (!STI || !STI->isCPUStringValid(MCPU)) { | |
314 | outs() << "ASM Error: no subtarget info for target " << MCPU << "\n"; | |
315 | return -1; | |
316 | } | |
317 | ||
318 | // Set up the MCContext for creating symbols and MCExpr's | |
319 | #if LLVM_VERSION_MAJOR > 12 | |
320 | MCContext Ctx(TheTriple, MAI.get(), MRI.get(), STI.get(), &SrcMgr, &MCOptions); | |
321 | #else | |
322 | MCObjectFileInfo MOFI; | |
323 | MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr, &MCOptions); | |
324 | MOFI.InitMCObjectFileInfo(TheTriple, true, Ctx); | |
325 | #endif | |
326 | ||
327 | // Finalize setup for output object code stream | |
328 | std::string Data; | |
329 | std::unique_ptr<raw_string_ostream> DataStream(std::make_unique<raw_string_ostream>(Data)); | |
330 | std::unique_ptr<buffer_ostream> BOS(std::make_unique<buffer_ostream>(*DataStream)); | |
331 | raw_pwrite_stream* OS = BOS.get(); | |
332 | ||
333 | #if LLVM_VERSION_MAJOR > 14 | |
334 | MCCodeEmitter* CE = TheTarget->createMCCodeEmitter(*MCII, Ctx); | |
335 | #else | |
336 | MCCodeEmitter* CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx); | |
337 | #endif | |
338 | MCAsmBackend* MAB = TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions); | |
339 | ||
340 | std::unique_ptr<MCStreamer> Streamer(TheTarget->createMCObjectStreamer( | |
341 | TheTriple, Ctx, | |
342 | std::unique_ptr<MCAsmBackend>(MAB), MAB->createObjectWriter(*OS), | |
343 | std::unique_ptr<MCCodeEmitter>(CE), *STI, MCOptions.MCRelaxAll, | |
344 | MCOptions.MCIncrementalLinkerCompatible, /*DWARFMustBeAtTheEnd*/ false)); | |
345 | ||
346 | std::unique_ptr<MCAsmParser> Parser( | |
347 | createMCAsmParser(SrcMgr, Ctx, *Streamer, *MAI)); | |
348 | ||
349 | // Set parser to target parser and run | |
350 | std::unique_ptr<MCTargetAsmParser> TAP( | |
351 | TheTarget->createMCAsmParser(*STI, *Parser, *MCII, MCOptions)); | |
352 | if (!TAP) { | |
353 | outs() << "ASM Error: no assembly parsing support for target " << MCPU << "\n"; | |
354 | return -1; | |
355 | } | |
356 | Parser->setTargetParser(*TAP); | |
357 | ||
358 | if (Parser->Run(true)) { | |
359 | outs() << "ASM Error: assembly parser failed\n"; | |
360 | return -1; | |
361 | } | |
362 | ||
363 | BOS.reset(); | |
364 | DataStream->flush(); | |
365 | ||
366 | int ret = ExtractELFText(Data.data()); | |
367 | if (ret < 0 || !TextData) { | |
368 | outs() << "ASM Error: .text extraction failed\n"; | |
369 | return ret < 0 ? ret : -1; // never report success without a .text payload | |
370 | } | |
371 | ||
372 | #if 0 | |
373 | PrintELFHex(Data); | |
374 | PrintTextHex(); | |
375 | #endif | |
376 | ||
377 | return 0; | |
378 | } |
0 | //////////////////////////////////////////////////////////////////////////////// | |
1 | // | |
2 | // The University of Illinois/NCSA | |
3 | // Open Source License (NCSA) | |
4 | // | |
5 | // Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved. | |
6 | // | |
7 | // Developed by: | |
8 | // | |
9 | // AMD Research and AMD HSA Software Development | |
10 | // | |
11 | // Advanced Micro Devices, Inc. | |
12 | // | |
13 | // www.amd.com | |
14 | // | |
15 | // Permission is hereby granted, free of charge, to any person obtaining a copy | |
16 | // of this software and associated documentation files (the "Software"), to | |
17 | // deal with the Software without restriction, including without limitation | |
18 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
19 | // and/or sell copies of the Software, and to permit persons to whom the | |
20 | // Software is furnished to do so, subject to the following conditions: | |
21 | // | |
22 | // - Redistributions of source code must retain the above copyright notice, | |
23 | // this list of conditions and the following disclaimers. | |
24 | // - Redistributions in binary form must reproduce the above copyright | |
25 | // notice, this list of conditions and the following disclaimers in | |
26 | // the documentation and/or other materials provided with the distribution. | |
27 | // - Neither the names of Advanced Micro Devices, Inc, | |
28 | // nor the names of its contributors may be used to endorse or promote | |
29 | // products derived from this Software without specific prior written | |
30 | // permission. | |
31 | // | |
32 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
33 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
34 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
35 | // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
36 | // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
37 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
38 | // DEALINGS WITH THE SOFTWARE. | |
39 | // | |
40 | //////////////////////////////////////////////////////////////////////////////// | |
41 | ||
42 | #ifndef _ASSEMBLE_H_ | |
43 | #define _ASSEMBLE_H_ | |
44 | ||
45 | #include "OSWrapper.hpp" | |
46 | ||
47 | #define ASM_MCPU_LEN 16 | |
48 | ||
49 | class Assembler { | |
50 | private: | |
51 | const char* ArchName = "amdgcn"; | |
52 | const char* VendorName = "amd"; | |
53 | const char* OSName = "amdhsa"; | |
54 | char MCPU[ASM_MCPU_LEN]; | |
55 | ||
56 | std::string TripleName; | |
57 | std::string Error; | |
58 | ||
59 | char* TextData; | |
60 | size_t TextSize; | |
61 | ||
62 | void SetTargetAsic(const uint32_t Gfxv); | |
63 | ||
64 | void LLVMInit(); | |
65 | void FlushText(); | |
66 | void PrintELFHex(const std::string Data); | |
67 | int ExtractELFText(const char* RawData); | |
68 | ||
69 | public: | |
70 | Assembler(const uint32_t Gfxv); | |
71 | ~Assembler(); | |
72 | ||
73 | void PrintTextHex(); | |
74 | const char* GetTargetAsic(); | |
75 | ||
76 | const char* GetInstrStream(); | |
77 | const size_t GetInstrStreamSize(); | |
78 | int CopyInstrStream(char* OutBuf, const size_t BufSize = PAGE_SIZE); | |
79 | ||
80 | int RunAssemble(const char* const AssemblySource); | |
81 | int RunAssembleBuf(const char* const AssemblySource, char* OutBuf, | |
82 | const size_t BufSize = PAGE_SIZE); | |
83 | }; | |
84 | ||
85 | #endif // _ASSEMBLE_H_ |
137 | 137 | pgmRsrc2 |= (1 << COMPUTE_PGM_RSRC2__EXCP_EN_MSB__SHIFT) |
138 | 138 | & COMPUTE_PGM_RSRC2__EXCP_EN_MSB_MASK; |
139 | 139 | |
140 | const bool priv = (m_FamilyId == FAMILY_GFX11); | |
140 | 141 | const unsigned int COMPUTE_PGM_RSRC[] = { |
141 | // PGM_RSRC1 = { VGPRS: 16 SGPRS: 16 PRIORITY: m_SpiPriority FLOAT_MODE: c0 PRIV: 0 | |
142 | // DX10_CLAMP: 0 DEBUG_MODE: 0 IEEE_MODE: 0 BULKY: 0 CDBG_USER: 0 } | |
143 | 0x000c0084 | ((m_SpiPriority & 3) << 10), | |
142 | // PGM_RSRC1 = { VGPRS: 16 SGPRS: 16 PRIORITY: m_SpiPriority FLOAT_MODE: c0 | |
143 | // PRIV: 0 (1 for GFX11) DX10_CLAMP: 0 DEBUG_MODE: 0 IEEE_MODE: 0 BULKY: 0 CDBG_USER: 0 } | |
144 | 0x000c0084 | ((m_SpiPriority & 3) << 10) | (priv << 20), | |
144 | 145 | pgmRsrc2 |
145 | 146 | }; |
146 | 147 |
0 | /* | |
1 | * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #include "IsaGenerator.hpp" | |
24 | ||
25 | #include <algorithm> | |
26 | #include <string> | |
27 | ||
28 | #include "IsaGenerator_Gfx72.hpp" | |
29 | #include "IsaGenerator_Gfx8.hpp" | |
30 | #include "IsaGenerator_Gfx9.hpp" | |
31 | #include "IsaGenerator_Gfx10.hpp" | |
32 | #include "IsaGenerator_Aldebaran.hpp" | |
33 | ||
34 | #include "GoogleTestExtension.hpp" | |
35 | ||
36 | #include "sp3.h" | |
37 | ||
38 | const std::string IsaGenerator::ADDRESS_WATCH_SP3( | |
39 | "var REG_TRAPSTS_EXCP_MASK = 0x000001ff\n" | |
40 | "var WAVE_COUNT_OFFSET = 12\n" | |
41 | "var TMA_CYCLE_OFFSET = 16\n" | |
42 | "\n" | |
43 | "/*\n" | |
44 | " * ttmp[0:1] -- The ISA address that triggered this trap handler\n" | |
45 | " * ttmp[10:11] -- The TMA user provided, used to store the debug info in this shader\n" | |
46 | " * v[10:14] ttmp[7:8] -- temp use inside this shader\n" | |
47 | " * s5 -- store the counts that this trap been triggered\n" | |
48 | " * Each time when the trap is triggered , this shader will write\n" | |
49 | " * ttmp[0] : ttmp[1] : Trap_Status : [reserved]\n" | |
50 | " * to TMA + (trap count * TMA_CYCLE_OFFSET)\n" | |
51 | " * The TMA + WAVE_COUNT_OFFSET(the first [reserved] address)\n" | |
52 | " * used to store the total triggered trap count.\n" | |
53 | " */\n" | |
54 | "shader main\n" | |
55 | "\n" | |
56 | " asic(VI)\n" | |
57 | "\n" | |
58 | " type(CS)\n" | |
59 | " v_mov_b32 v10, ttmp10\n" | |
60 | " v_mov_b32 v11, ttmp11\n" | |
61 | " s_mov_b32 ttmp7, s5\n" | |
62 | " s_mulk_i32 ttmp7, TMA_CYCLE_OFFSET\n" | |
63 | " s_addk_i32 s5, 1\n" | |
64 | " v_mov_b32 v12, ttmp0\n" | |
65 | " v_add_u32 v10, vcc, ttmp7, v10\n" | |
66 | " flat_store_dword v[10,11], v12 slc glc\n" | |
67 | " v_mov_b32 v12, ttmp1\n" | |
68 | " v_add_u32 v10, vcc, 4, v10\n" | |
69 | " flat_store_dword v[10,11], v12 slc glc\n" | |
70 | " s_getreg_b32 ttmp8, hwreg(HW_REG_TRAPSTS)\n" | |
71 | " s_and_b32 ttmp8, ttmp8, REG_TRAPSTS_EXCP_MASK\n" | |
72 | " v_mov_b32 v12, ttmp8\n" | |
73 | " v_add_u32 v10, vcc, 4, v10\n" | |
74 | " flat_store_dword v[10,11], v12 glc\n" | |
75 | " v_mov_b32 v10, ttmp10\n" | |
76 | " v_add_u32 v10, vcc, WAVE_COUNT_OFFSET, v10\n" | |
77 | " v_mov_b32 v13, 1\n" | |
78 | " flat_atomic_add v14, v[10:11], v13 slc glc\n" | |
79 | " s_and_b32 ttmp1, ttmp1, 0xffff\n" | |
80 | " s_rfe_b64 [ttmp0,ttmp1]\n" | |
81 | "end\n" | |
82 | ); | |
83 | ||
84 | IsaGenerator* IsaGenerator::Create(unsigned int familyId) { | |
85 | switch (familyId) { | |
86 | case FAMILY_CI: | |
87 | case FAMILY_KV: | |
88 | return new IsaGenerator_Gfx72; | |
89 | case FAMILY_VI: | |
90 | case FAMILY_CZ: | |
91 | return new IsaGenerator_Gfx8; | |
92 | case FAMILY_AI: | |
93 | case FAMILY_RV: | |
94 | case FAMILY_AR: | |
95 | return new IsaGenerator_Gfx9; | |
96 | case FAMILY_AL: | |
97 | return new IsaGenerator_Aldbrn; | |
98 | case FAMILY_NV: | |
99 | return new IsaGenerator_Gfx10; | |
100 | ||
101 | default: | |
102 | LOG() << "Error: Invalid ISA" << std::endl; | |
103 | return NULL; | |
104 | } | |
105 | } | |
106 | ||
107 | void IsaGenerator::GetAwTrapHandler(HsaMemoryBuffer& rBuf) { | |
108 | CompileShader(ADDRESS_WATCH_SP3.c_str(), "main", rBuf); | |
109 | } | |
110 | ||
111 | void IsaGenerator::CompileShader(const char* shaderCode, const char* shaderName, HsaMemoryBuffer& rBuf) { | |
112 | sp3_context* pSp3 = sp3_new(); | |
113 | sp3_setasic(pSp3, GetAsicName().c_str()); | |
114 | sp3_parse_string(pSp3, shaderCode); | |
115 | sp3_shader* pShader = sp3_compile(pSp3, shaderName); | |
116 | ||
117 | std::copy(pShader->data, pShader->data + pShader->size, rBuf.As<unsigned int*>()); | |
118 | sp3_free_shader(pShader); | |
119 | ||
120 | /** sp3_close() is intentionally NOT called here: per the original authors it frees memory | |
121 | * that the compiler is not using, for a reason that is not understood. The call below is | |
122 | * left commented out as a workaround, so pSp3 is released only when the process exits. | |
123 | */ | |
124 | // sp3_close(pSp3); | |
125 | } |
0 | /* | |
1 | * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #ifndef _ISAGENERATOR_H_ | |
24 | #define _ISAGENERATOR_H_ | |
25 | ||
26 | #include "KFDTestUtil.hpp" | |
27 | ||
28 | /* isa generation class - interface */ | |
29 | class IsaGenerator { | |
30 | public: | |
31 | static IsaGenerator* Create(unsigned int familyId); | |
32 | ||
33 | virtual ~IsaGenerator() {} | |
34 | ||
35 | virtual void GetNoopIsa(HsaMemoryBuffer& rBuf) = 0; | |
36 | virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf) = 0; | |
37 | virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) = 0; | |
38 | virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf) = 0; | |
39 | virtual void GetCwsrTrapHandler(HsaMemoryBuffer& rBuf) {} | |
40 | virtual void GetAwTrapHandler(HsaMemoryBuffer& rBuf); | |
41 | ||
42 | void CompileShader(const char* shaderCode, const char* shaderName, HsaMemoryBuffer& rBuf); | |
43 | ||
44 | protected: | |
45 | virtual const std::string& GetAsicName() = 0; | |
46 | ||
47 | private: | |
48 | static const std::string ADDRESS_WATCH_SP3; | |
49 | }; | |
50 | ||
51 | #endif // _ISAGENERATOR_H_ |
0 | /* | |
1 | * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #include "IsaGenerator_Aldebaran.hpp" | |
24 | ||
25 | #include <algorithm> | |
26 | #include <string> | |
27 | ||
28 | const std::string IsaGenerator_Aldbrn::ASIC_NAME = "ALDEBARAN"; | |
29 | ||
30 | /* The binaries below are generated from the following ISA */ | |
31 | #if 0 | |
32 | /* flat_atomic_inc is not supported by some PCIe configurations; use flat_atomic_add instead */ | |
33 | shader atomic_add | |
34 | asic(ALDEBARAN) | |
35 | type(CS) | |
36 | v_mov_b32 v0, s0 | |
37 | v_mov_b32 v1, s1 | |
38 | v_mov_b32 v2, 1 | |
39 | flat_atomic_add v3, v[0:1], v2 slc glc scc | |
40 | s_waitcnt 0 | |
41 | s_endpgm | |
42 | end | |
43 | ||
44 | shader copy_dword | |
45 | asic(ALDEBARAN) | |
46 | type(CS) | |
47 | /* copy the parameters from scalar registers to vector registers */ | |
48 | v_mov_b32 v0, s0 | |
49 | v_mov_b32 v1, s1 | |
50 | v_mov_b32 v2, s2 | |
51 | v_mov_b32 v3, s3 | |
52 | /* copy a dword between the passed addresses */ | |
53 | flat_load_dword v4, v[0:1] slc glc | |
54 | s_waitcnt 0 | |
55 | flat_store_dword v[2:3], v4 slc glc | |
56 | s_endpgm | |
57 | end | |
58 | ||
59 | shader main | |
60 | asic(ALDEBARAN) | |
61 | type(CS) | |
62 | loop: | |
63 | s_branch loop | |
64 | s_endpgm | |
65 | end | |
66 | ||
67 | ||
68 | #endif | |
69 | ||
70 | const uint32_t IsaGenerator_Aldbrn::NOOP_ISA[] = { | |
71 | 0xbf810000 | |
72 | }; | |
73 | ||
74 | const uint32_t IsaGenerator_Aldbrn::COPY_DWORD_ISA[] = { | |
75 | 0x7e000200, 0x7e020201, | |
76 | 0x7e040202, 0x7e060203, | |
77 | 0xdc530000, 0x047f0000, | |
78 | 0xbf8c0000, 0xdc730000, | |
79 | 0x007f0402, 0xbf810000 | |
80 | }; | |
81 | ||
82 | const uint32_t IsaGenerator_Aldbrn::INFINITE_LOOP_ISA[] = { | |
83 | 0xbf82ffff, 0xbf810000 | |
84 | }; | |
85 | ||
86 | const uint32_t IsaGenerator_Aldbrn::ATOMIC_ADD_ISA[] = { | |
87 | 0x7e000200, 0x7e020201, | |
88 | 0x7e040281, 0xdf0b0000, | |
89 | 0x037f0200, 0xbf8c0000, | |
90 | 0xbf810000, 0x00000000 | |
91 | }; | |
92 | ||
93 | void IsaGenerator_Aldbrn::GetNoopIsa(HsaMemoryBuffer& rBuf) { | |
94 | std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>()); | |
95 | } | |
96 | ||
97 | void IsaGenerator_Aldbrn::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) { | |
98 | std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>()); | |
99 | } | |
100 | ||
101 | void IsaGenerator_Aldbrn::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) { | |
102 | std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>()); | |
103 | } | |
104 | ||
105 | void IsaGenerator_Aldbrn::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) { | |
106 | std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As<uint32_t*>()); | |
107 | } | |
108 | ||
109 | const std::string& IsaGenerator_Aldbrn::GetAsicName() { | |
110 | return ASIC_NAME; | |
111 | } | |
112 |
0 | /* | |
1 | * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #ifndef _ISAGENERATOR_ALDEBARAN_H_ | |
24 | #define _ISAGENERATOR_ALDEBARAN_H_ | |
25 | ||
26 | #include <string> | |
27 | #include "IsaGenerator.hpp" | |
28 | ||
29 | class IsaGenerator_Aldbrn : public IsaGenerator { | |
30 | public: | |
31 | virtual void GetNoopIsa(HsaMemoryBuffer& rBuf); | |
32 | virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf); | |
33 | virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf); | |
34 | virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf); | |
35 | ||
36 | protected: | |
37 | virtual const std::string& GetAsicName(); | |
38 | ||
39 | private: | |
40 | static const std::string ASIC_NAME; | |
41 | ||
42 | static const uint32_t NOOP_ISA[]; | |
43 | static const uint32_t COPY_DWORD_ISA[]; | |
44 | static const uint32_t INFINITE_LOOP_ISA[]; | |
45 | static const uint32_t ATOMIC_ADD_ISA[]; | |
46 | }; | |
47 | ||
48 | #endif // _ISAGENERATOR_ALDEBARAN_H_ |
0 | /* | |
1 | * Copyright (C) 2019 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #include "IsaGenerator_Gfx10.hpp" | |
24 | ||
25 | #include <algorithm> | |
26 | #include <string> | |
27 | ||
28 | /* The ISA binaries below are generated from the shader sources kept in the #if 0 block */ | |
29 | const std::string IsaGenerator_Gfx10::ASIC_NAME = "GFX10"; | |
30 | #if 0 | |
31 | static const char * atomic_add = \ | |
32 | "\ | |
33 | shader atomic_add \n\ | |
34 | asic(GFX10) \n\ | |
35 | wave_size(32) \n\ | |
36 | type(CS) \n\ | |
37 | v_mov_b32 v0, s0 \n\ | |
38 | v_mov_b32 v1, s1 \n\ | |
39 | v_mov_b32 v2, 1 \n\ | |
40 | flat_atomic_add v3, v[0:1], v2 slc glc \n\ | |
41 | s_waitcnt 0 \n\ | |
42 | s_endpgm \n\ | |
43 | end \n\ | |
44 | "; | |
45 | ||
46 | static const char * copy_dword = \ | |
47 | "\ | |
48 | shader copy_dword \n\ | |
49 | asic(GFX10) \n\ | |
50 | wave_size(32) \n\ | |
51 | type(CS) \n\ | |
52 | v_mov_b32 v0, s0 \n\ | |
53 | v_mov_b32 v1, s1 \n\ | |
54 | v_mov_b32 v2, s2 \n\ | |
55 | v_mov_b32 v3, s3 \n\ | |
56 | flat_load_dword v4, v[0:1] slc glc \n\ | |
57 | s_waitcnt 0 \n\ | |
58 | flat_store_dword v[2:3], v4 slc glc \n\ | |
59 | s_endpgm \n\ | |
60 | end \n\ | |
61 | "; | |
62 | ||
63 | static const char * loop= \ | |
64 | "\ | |
65 | shader loop \n\ | |
66 | asic(GFX10) \n\ | |
67 | type(CS) \n\ | |
68 | wave_size(32) \n\ | |
69 | loop: \n\ | |
70 | s_branch loop \n\ | |
71 | s_endpgm \n\ | |
72 | end \n\ | |
73 | "; | |
74 | ||
75 | static const char * noop= \ | |
76 | "\ | |
77 | shader noop \n\ | |
78 | asic(GFX10) \n\ | |
79 | type(CS) \n\ | |
80 | wave_size(32) \n\ | |
81 | s_endpgm \n\ | |
82 | end \n\ | |
83 | "; | |
84 | #endif | |
85 | ||
86 | const uint32_t IsaGenerator_Gfx10::NOOP_ISA[] = { | |
87 | 0xb0804004, 0xbf810000, | |
88 | 0xbf9f0000, 0xbf9f0000, | |
89 | 0xbf9f0000, 0xbf9f0000, | |
90 | 0xbf9f0000 | |
91 | }; | |
92 | ||
93 | const uint32_t IsaGenerator_Gfx10::COPY_DWORD_ISA[] = { | |
94 | 0xb0804004, 0x7e000200, | |
95 | 0x7e020201, 0x7e040202, | |
96 | 0x7e060203, 0xdc330000, | |
97 | 0x47d0000, 0xbf8c0000, | |
98 | 0xdc730000, 0x7d0402, | |
99 | 0xbf810000, 0xbf9f0000, | |
100 | 0xbf9f0000, 0xbf9f0000, | |
101 | 0xbf9f0000, 0xbf9f0000 | |
102 | }; | |
103 | ||
104 | const uint32_t IsaGenerator_Gfx10::INFINITE_LOOP_ISA[] = { | |
105 | 0xbf82ffff, 0xb0804004, | |
106 | 0xbf810000, 0xbf9f0000, | |
107 | 0xbf9f0000, 0xbf9f0000, | |
108 | 0xbf9f0000, 0xbf9f0000 | |
109 | }; | |
110 | ||
111 | const uint32_t IsaGenerator_Gfx10::ATOMIC_ADD_ISA[] = { | |
112 | 0xb0804004, 0x7e000200, | |
113 | 0x7e020201, 0x7e040281, | |
114 | 0xdccb0000, 0x37d0200, | |
115 | 0xbf8c0000, 0xbf810000, | |
116 | 0xbf9f0000, 0xbf9f0000, | |
117 | 0xbf9f0000, 0xbf9f0000, | |
118 | 0xbf9f0000 | |
119 | }; | |
120 | ||
121 | ||
122 | void IsaGenerator_Gfx10::GetNoopIsa(HsaMemoryBuffer& rBuf) { | |
123 | std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>()); | |
124 | } | |
125 | ||
126 | void IsaGenerator_Gfx10::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) { | |
127 | std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>()); | |
128 | } | |
129 | ||
130 | void IsaGenerator_Gfx10::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) { | |
131 | std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>()); | |
132 | } | |
133 | ||
134 | void IsaGenerator_Gfx10::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) { | |
135 | std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As<uint32_t*>()); | |
136 | } | |
137 | ||
138 | const std::string& IsaGenerator_Gfx10::GetAsicName() { | |
139 | return ASIC_NAME; | |
140 | } | |
141 |
0 | /* | |
1 | * Copyright (C) 2019 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #ifndef _ISAGENERATOR_GFX10_H_ | |
24 | #define _ISAGENERATOR_GFX10_H_ | |
25 | ||
26 | #include <string> | |
27 | #include "IsaGenerator.hpp" | |
28 | ||
29 | class IsaGenerator_Gfx10 : public IsaGenerator { | |
30 | public: | |
31 | virtual void GetNoopIsa(HsaMemoryBuffer& rBuf); | |
32 | virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf); | |
33 | virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf); | |
34 | virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf); | |
35 | ||
36 | protected: | |
37 | virtual const std::string& GetAsicName(); | |
38 | ||
39 | private: | |
40 | static const std::string ASIC_NAME; | |
41 | ||
42 | static const uint32_t NOOP_ISA[]; | |
43 | static const uint32_t COPY_DWORD_ISA[]; | |
44 | static const uint32_t INFINITE_LOOP_ISA[]; | |
45 | static const uint32_t ATOMIC_ADD_ISA[]; | |
46 | }; | |
47 | ||
48 | #endif  // _ISAGENERATOR_GFX10_H_ |
0 | /* | |
1 | * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #include "IsaGenerator_Gfx72.hpp" | |
24 | ||
25 | #include <algorithm> | |
26 | #include <string> | |
27 | ||
28 | const std::string IsaGenerator_Gfx72::ASIC_NAME = "CI"; | |
29 | ||
30 | const uint32_t IsaGenerator_Gfx72::NOOP_ISA[] = { | |
31 | 0xbf810000 // S_ENDPGM | |
32 | }; | |
33 | ||
34 | /* The below arrays are filled with hex values in order not to reference | |
35 | * proprietary header files, but we still leave the code here for future | |
36 | * reference. | |
37 | */ | |
38 | #if 0 | |
39 | const uint32_t IsaGenerator_Gfx72::COPY_DWORD_ISA[] = { | |
40 | (63u << SQ_VOP1__ENCODING__SHIFT) | (0 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v0, s0 (VOP1) | |
41 | (63u << SQ_VOP1__ENCODING__SHIFT) | (1 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v1, s1 (VOP1) | |
42 | (63u << SQ_VOP1__ENCODING__SHIFT) | (2 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (2 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v2, s2 (VOP1) | |
43 | (63u << SQ_VOP1__ENCODING__SHIFT) | (3 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (3 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v3, s3 (VOP1) | |
44 | ||
45 | (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_LOAD_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT)/*(3 << 16)*/, // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0) | |
46 | (4u << SQ_FLAT_1__VDST__SHIFT) | (0 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V0:V1, VDST = V4 (FLAT_1) | |
47 | ||
48 | (383u << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_WAITCNT << SQ_SOPP__OP__SHIFT) | (0 << SQ_SOPP__SIMM16__SHIFT), // s_waitcnt 0 (SOPP) | |
49 | ||
50 | (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_STORE_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT), // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0) | |
51 | (4u << SQ_FLAT_1__DATA__SHIFT) | (2 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V2:V3, DATA = V4 (FLAT_1) | |
52 | ||
53 | 0xBF810000u // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0 | |
54 | }; | |
55 | ||
56 | const uint32_t IsaGenerator_Gfx72::INFINITE_LOOP_ISA[] = { | |
57 | (0x17F << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_BRANCH << SQ_SOPP__OP__SHIFT) | ( (const uint32_t)-1 << SQ_SOPP__SIMM16__SHIFT), // s_branch -1 (PC <- PC + SIMM*4)+4 | |
58 | 0xBF810000u // S_ENDPGM | |
59 | }; | |
60 | ||
61 | const uint32_t IsaGenerator_Gfx72::ATOMIC_INC_ISA[] = { | |
62 | (63u << SQ_VOP1__ENCODING__SHIFT) | (0 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v0, s0 (VOP1) | |
63 | (63u << SQ_VOP1__ENCODING__SHIFT) | (1 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v1, s1 (VOP1) | |
64 | (63u << SQ_VOP1__ENCODING__SHIFT) | (2 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0xC1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v2, 0xFFFFFFFF (VOP1; SRC0 = 0xC1, presumably the inline constant -1) | |
65 | ||
66 | (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_ATOMIC_INC << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (0 << SQ_FLAT_0__GLC__SHIFT), // SQ_FLAT_0, flat_atomic_inc, slc = 1, glc = 0 (FLAT_0) | |
67 | (3u << SQ_FLAT_1__VDST__SHIFT) | (2u << SQ_FLAT_1__DATA__SHIFT) | (0 << SQ_FLAT_1__ADDR__SHIFT), // ADDR/dst = V0:V1, VDST/ret = V3, DATA/src=V2 (FLAT_1) | |
68 | 0xBF810000u // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0 | |
69 | }; | |
70 | #endif | |
71 | ||
72 | const uint32_t IsaGenerator_Gfx72::COPY_DWORD_ISA[] = { | |
73 | 0x7e000200, // v_mov_b32 v0, s0 (VOP1) | |
74 | 0x7e020201, // v_mov_b32 v1, s1 (VOP1) | |
75 | 0x7e040202, // v_mov_b32 v2, s2 (VOP1) | |
76 | 0x7e060203, // v_mov_b32 v3, s3 (VOP1) | |
77 | ||
78 | 0xdc330000, // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0) | |
79 | 0x04000000, // ADDR = V0:V1, VDST = V4 (FLAT_1) | |
80 | ||
81 | 0xbf8c0000, // s_waitcnt 0 (SOPP) | |
82 | ||
83 | 0xdc730000, // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0) | |
84 | 0x00000402, // ADDR = V2:V3, DATA = V4 (FLAT_1) | |
85 | ||
86 | 0xbf810000 // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0 | |
87 | }; | |
88 | ||
89 | const uint32_t IsaGenerator_Gfx72::INFINITE_LOOP_ISA[] = { | |
90 | 0xbf82ffff, // s_branch -1 (PC <- PC + SIMM*4)+4 | |
91 | 0xbf810000 // S_ENDPGM | |
92 | }; | |
93 | ||
94 | const uint32_t IsaGenerator_Gfx72::ATOMIC_INC_ISA[] = { | |
95 | 0x7e000200, // v_mov_b32 v0, s0 (VOP1) | |
96 | 0x7e020201, // v_mov_b32 v1, s1 (VOP1) | |
97 | 0x7e0402c1, // v_mov_b32 v2, 0xFFFFFFFF (VOP1; SRC0 = 0xC1, presumably the inline constant -1) | |
98 | ||
99 | 0xdcf20000, // SQ_FLAT_0, flat_atomic_inc, slc = 1, glc = 0 (FLAT_0) | |
100 | 0x03000200, // ADDR/dst = V0:V1, VDST/ret = V3, DATA/src=V2 (FLAT_1) | |
101 | 0xbf810000 // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0 | |
102 | }; | |
103 | ||
104 | void IsaGenerator_Gfx72::GetNoopIsa(HsaMemoryBuffer& rBuf) { | |
105 | std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>()); | |
106 | } | |
107 | ||
108 | void IsaGenerator_Gfx72::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) { | |
109 | std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>()); | |
110 | } | |
111 | ||
112 | void IsaGenerator_Gfx72::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) { | |
113 | std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>()); | |
114 | } | |
115 | ||
116 | void IsaGenerator_Gfx72::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) { | |
117 | std::copy(ATOMIC_INC_ISA, ATOMIC_INC_ISA+ARRAY_SIZE(ATOMIC_INC_ISA), rBuf.As<uint32_t*>()); | |
118 | } | |
119 | ||
120 | const std::string& IsaGenerator_Gfx72::GetAsicName() { | |
121 | return ASIC_NAME; | |
122 | } |
0 | /* | |
1 | * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #ifndef _ISAGENERATOR_GFX72_H_ | |
24 | #define _ISAGENERATOR_GFX72_H_ | |
25 | ||
26 | #include <string> | |
27 | #include "IsaGenerator.hpp" | |
28 | ||
29 | class IsaGenerator_Gfx72 : public IsaGenerator { | |
30 | public: | |
31 | virtual void GetNoopIsa(HsaMemoryBuffer& rBuf); | |
32 | virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf); | |
33 | virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf); | |
34 | virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf); | |
35 | ||
36 | protected: | |
37 | virtual const std::string& GetAsicName(); | |
38 | ||
39 | private: | |
40 | static const std::string ASIC_NAME; | |
41 | ||
42 | static const uint32_t NOOP_ISA[]; | |
43 | static const uint32_t COPY_DWORD_ISA[]; | |
44 | static const uint32_t INFINITE_LOOP_ISA[]; | |
45 | static const uint32_t ATOMIC_INC_ISA[]; | |
46 | }; | |
47 | ||
48 | #endif // _ISAGENERATOR_GFX72_H_ |
0 | /* | |
1 | * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #include "IsaGenerator_Gfx8.hpp" | |
24 | ||
25 | #include <algorithm> | |
26 | #include <string> | |
27 | ||
28 | const std::string IsaGenerator_Gfx8::ASIC_NAME = "VI"; | |
29 | ||
30 | const uint32_t IsaGenerator_Gfx8::NOOP_ISA[] = { | |
31 | 0xbf810000 // S_ENDPGM | |
32 | }; | |
33 | ||
34 | /** The below arrays are filled with hex values in order not to reference | |
35 | * proprietary header files, but we still leave the code here for future | |
36 | * reference. | |
37 | */ | |
38 | #if 0 | |
39 | const uint32_t IsaGenerator_Gfx8::COPY_DWORD_ISA[] = { | |
40 | (63u << SQ_VOP1__ENCODING__SHIFT) | (0 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v0, s0 (VOP1) | |
41 | (63u << SQ_VOP1__ENCODING__SHIFT) | (1 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v1, s1 (VOP1) | |
42 | (63u << SQ_VOP1__ENCODING__SHIFT) | (2 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (2 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v2, s2 (VOP1) | |
43 | (63u << SQ_VOP1__ENCODING__SHIFT) | (3 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (3 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v3, s3 (VOP1) | |
44 | ||
45 | (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_LOAD_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT)/*(3 << 16)*/, // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0) | |
46 | (4u << SQ_FLAT_1__VDST__SHIFT) | (0 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V0:V1, VDST = V4 (FLAT_1) | |
47 | ||
48 | (383u << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_WAITCNT << SQ_SOPP__OP__SHIFT) | (0 << SQ_SOPP__SIMM16__SHIFT), // s_waitcnt 0 (SOPP) | |
49 | ||
50 | (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_STORE_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT), // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0) | |
51 | (4u << SQ_FLAT_1__DATA__SHIFT) | (2 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V2:V3, DATA = V4 (FLAT_1) | |
52 | ||
53 | 0xBF810000u // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0 | |
54 | }; | |
55 | ||
56 | const uint32_t IsaGenerator_Gfx8::INFINITE_LOOP_ISA[] = { | |
57 | (0x17F << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_BRANCH << SQ_SOPP__OP__SHIFT) | ( (const uint32_t)-1 << SQ_SOPP__SIMM16__SHIFT), // s_branch -1 (PC <- PC + SIMM*4)+4 | |
58 | 0xBF810000u // S_ENDPGM | |
59 | }; | |
60 | #endif | |
61 | ||
62 | const uint32_t IsaGenerator_Gfx8::COPY_DWORD_ISA[] = { | |
63 | 0x7e000200, // v_mov_b32 v0, s0 (VOP1) | |
64 | 0x7e020201, // v_mov_b32 v1, s1 (VOP1) | |
65 | 0x7e040202, // v_mov_b32 v2, s2 (VOP1) | |
66 | 0x7e060203, // v_mov_b32 v3, s3 (VOP1) | |
67 | ||
68 | 0xdc530000, // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0) | |
69 | 0x04000000, // ADDR = V0:V1, VDST = V4 (FLAT_1) | |
70 | ||
71 | 0xbf8c0000, // s_waitcnt 0 (SOPP) | |
72 | ||
73 | 0xdc730000, // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0) | |
74 | 0x00000402, // ADDR = V2:V3, DATA = V4 (FLAT_1) | |
75 | ||
76 | 0xbf810000 // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0 | |
77 | }; | |
78 | ||
79 | const uint32_t IsaGenerator_Gfx8::INFINITE_LOOP_ISA[] = { | |
80 | 0xbf82ffff, // s_branch -1 (PC <- PC + SIMM*4)+4 | |
81 | 0xbf810000 // S_ENDPGM | |
82 | }; | |
83 | ||
84 | /** | |
85 | * The atomic_add_isa binary is generated from the following ISA. | |
86 | * The original atomic_inc is not supported by some PCIe implementations, so atomic_add is used instead. | |
87 | * | |
88 | */ | |
89 | /* | |
90 | shader atomic_add | |
91 | asic(VI) | |
92 | type(CS) | |
93 | v_mov_b32 v0, s0 | |
94 | v_mov_b32 v1, s1 | |
95 | v_mov_b32 v2, 1 | |
96 | flat_atomic_add v3, v[0:1], v2 slc glc | |
97 | s_waitcnt 0 | |
98 | s_endpgm | |
99 | end | |
100 | */ | |
101 | ||
102 | const uint32_t IsaGenerator_Gfx8::ATOMIC_ADD_ISA[] = { | |
103 | 0x7e000200, 0x7e020201, | |
104 | 0x7e040281, 0xdd0b0000, | |
105 | 0x03000200, 0xbf8c0000, | |
106 | 0xbf810000, 0x00000000 | |
107 | }; | |
108 | ||
109 | void IsaGenerator_Gfx8::GetNoopIsa(HsaMemoryBuffer& rBuf) { | |
110 | std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>()); | |
111 | } | |
112 | ||
113 | void IsaGenerator_Gfx8::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) { | |
114 | std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>()); | |
115 | } | |
116 | ||
117 | void IsaGenerator_Gfx8::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) { | |
118 | std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>()); | |
119 | } | |
120 | ||
121 | void IsaGenerator_Gfx8::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) { | |
122 | std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As<uint32_t*>()); | |
123 | } | |
124 | ||
125 | const std::string& IsaGenerator_Gfx8::GetAsicName() { | |
126 | return ASIC_NAME; | |
127 | } |
0 | /* | |
1 | * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #ifndef _ISAGENERATOR_GFX8_H_ | |
24 | #define _ISAGENERATOR_GFX8_H_ | |
25 | ||
26 | #include <string> | |
27 | #include "IsaGenerator.hpp" | |
28 | ||
29 | class IsaGenerator_Gfx8 : public IsaGenerator { | |
30 | public: | |
31 | virtual void GetNoopIsa(HsaMemoryBuffer& rBuf); | |
32 | virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf); | |
33 | virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf); | |
34 | virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf); | |
35 | ||
36 | protected: | |
37 | virtual const std::string& GetAsicName(); | |
38 | ||
39 | private: | |
40 | static const std::string ASIC_NAME; | |
41 | ||
42 | static const uint32_t NOOP_ISA[]; | |
43 | static const uint32_t COPY_DWORD_ISA[]; | |
44 | static const uint32_t INFINITE_LOOP_ISA[]; | |
45 | static const uint32_t ATOMIC_ADD_ISA[]; | |
46 | }; | |
47 | ||
48 | #endif // _ISAGENERATOR_GFX8_H_ |
0 | /* | |
1 | * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #include "IsaGenerator_Gfx9.hpp" | |
24 | ||
25 | #include <algorithm> | |
26 | #include <string> | |
27 | ||
28 | const std::string IsaGenerator_Gfx9::ASIC_NAME = "GFX9"; | |
29 | ||
30 | /* The binaries are generated from the following ISA */ | |
31 | #if 0 | |
32 | /* flat_atomic_inc is not supported by some PCIe implementations, so flat_atomic_add is used instead */ | |
33 | shader atomic_add | |
34 | asic(GFX9) | |
35 | type(CS) | |
36 | v_mov_b32 v0, s0 | |
37 | v_mov_b32 v1, s1 | |
38 | v_mov_b32 v2, 1 | |
39 | flat_atomic_add v3, v[0:1], v2 slc glc | |
40 | s_waitcnt 0 | |
41 | s_endpgm | |
42 | end | |
43 | ||
44 | shader copy_dword | |
45 | asic(GFX9) | |
46 | type(CS) | |
47 | /* copy the parameters from scalar registers to vector registers */ | |
48 | v_mov_b32 v0, s0 | |
49 | v_mov_b32 v1, s1 | |
50 | v_mov_b32 v2, s2 | |
51 | v_mov_b32 v3, s3 | |
52 | /* copy a dword between the passed addresses */ | |
53 | flat_load_dword v4, v[0:1] slc glc | |
54 | s_waitcnt 0 | |
55 | flat_store_dword v[2:3], v4 slc glc | |
56 | s_endpgm | |
57 | end | |
58 | ||
59 | shader main | |
60 | asic(GFX9) | |
61 | type(CS) | |
62 | loop: | |
63 | s_branch loop | |
64 | s_endpgm | |
65 | end | |
66 | ||
67 | ||
68 | #endif | |
69 | ||
70 | const uint32_t IsaGenerator_Gfx9::NOOP_ISA[] = { | |
71 | 0xbf810000 | |
72 | }; | |
73 | ||
74 | const uint32_t IsaGenerator_Gfx9::COPY_DWORD_ISA[] = { | |
75 | 0x7e000200, 0x7e020201, | |
76 | 0x7e040202, 0x7e060203, | |
77 | 0xdc530000, 0x047f0000, | |
78 | 0xbf8c0000, 0xdc730000, | |
79 | 0x007f0402, 0xbf810000 | |
80 | }; | |
81 | ||
82 | const uint32_t IsaGenerator_Gfx9::INFINITE_LOOP_ISA[] = { | |
83 | 0xbf82ffff, 0xbf810000 | |
84 | }; | |
85 | ||
86 | const uint32_t IsaGenerator_Gfx9::ATOMIC_ADD_ISA[] = { | |
87 | 0x7e000200, 0x7e020201, | |
88 | 0x7e040281, 0xdd0b0000, | |
89 | 0x037f0200, 0xbf8c0000, | |
90 | 0xbf810000, 0x00000000 | |
91 | }; | |
92 | ||
93 | void IsaGenerator_Gfx9::GetNoopIsa(HsaMemoryBuffer& rBuf) { | |
94 | std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>()); | |
95 | } | |
96 | ||
97 | void IsaGenerator_Gfx9::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) { | |
98 | std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>()); | |
99 | } | |
100 | ||
101 | void IsaGenerator_Gfx9::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) { | |
102 | std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>()); | |
103 | } | |
104 | ||
105 | void IsaGenerator_Gfx9::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) { | |
106 | std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As<uint32_t*>()); | |
107 | } | |
108 | ||
109 | const std::string& IsaGenerator_Gfx9::GetAsicName() { | |
110 | return ASIC_NAME; | |
111 | } | |
112 |
0 | /* | |
1 | * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #ifndef _ISAGENERATOR_GFX9_H_ | |
24 | #define _ISAGENERATOR_GFX9_H_ | |
25 | ||
26 | #include <string> | |
27 | #include "IsaGenerator.hpp" | |
28 | ||
29 | class IsaGenerator_Gfx9 : public IsaGenerator { | |
30 | public: | |
31 | virtual void GetNoopIsa(HsaMemoryBuffer& rBuf); | |
32 | virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf); | |
33 | virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf); | |
34 | virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf); | |
35 | ||
36 | protected: | |
37 | virtual const std::string& GetAsicName(); | |
38 | ||
39 | private: | |
40 | static const std::string ASIC_NAME; | |
41 | ||
42 | static const uint32_t NOOP_ISA[]; | |
43 | static const uint32_t COPY_DWORD_ISA[]; | |
44 | static const uint32_t INFINITE_LOOP_ISA[]; | |
45 | static const uint32_t ATOMIC_ADD_ISA[]; | |
46 | }; | |
47 | ||
48 | #endif // _ISAGENERATOR_GFX9_H_ |
0 | /* | |
1 | * Copyright (C) 2022 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #include "GoogleTestExtension.hpp" | |
24 | #include "KFDASMTest.hpp" | |
25 | #include "ShaderStore.hpp" | |
26 | #include "Assemble.hpp" | |
27 | ||
28 | void KFDASMTest::SetUp() {} | |
29 | void KFDASMTest::TearDown() {} | |
30 | ||
31 | static const std::vector<uint32_t> TargetList = { | |
32 | 0x080001, | |
33 | 0x080002, | |
34 | 0x080003, | |
35 | 0x080005, | |
36 | 0x080100, | |
37 | 0x090000, | |
38 | 0x090002, | |
39 | 0x090004, | |
40 | 0x090006, | |
41 | 0x090008, | |
42 | 0x090009, | |
43 | 0x09000a, | |
44 | 0x09000c, | |
45 | 0x0a0100, | |
46 | 0x0a0101, | |
47 | 0x0a0102, | |
48 | 0x0a0103, | |
49 | 0x0a0300, | |
50 | 0x0a0301, | |
51 | 0x0a0302, | |
52 | 0x0a0303, | |
53 | 0x0a0304, | |
54 | 0x0a0305, | |
55 | 0x0a0306, | |
56 | }; | |
57 | ||
58 | TEST_F(KFDASMTest, AssembleShaders) { | |
59 | TEST_START(TESTPROFILE_RUNALL) | |
60 | ||
61 | for (auto &t : TargetList) { | |
62 | Assembler asmblr(t); | |
63 | ||
64 | LOG() << "Running ASM test for target " << asmblr.GetTargetAsic() << std::endl; | |
65 | ||
66 | for (auto &s : ShaderList) { | |
67 | EXPECT_SUCCESS(asmblr.RunAssemble(s)); | |
68 | } | |
69 | } | |
70 | ||
71 | TEST_END | |
72 | } |
0 | /* | |
1 | * Copyright (C) 2022 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #ifndef __KFD_ASM_TEST__H__ | |
24 | #define __KFD_ASM_TEST__H__ | |
25 | ||
26 | #include <gtest/gtest.h> | |
27 | ||
28 | class KFDASMTest : public testing::Test { | |
29 | public: | |
30 | KFDASMTest() {} | |
31 | ~KFDASMTest() {} | |
32 | ||
33 | protected: | |
34 | virtual void SetUp(); | |
35 | virtual void TearDown(); | |
36 | }; | |
37 | ||
38 | #endif // __KFD_ASM_TEST__H__ |
67 | 67 | |
68 | 68 | g_baseTest = this; |
69 | 69 | |
70 | m_pAsm = new Assembler(GetGfxVersion(nodeProperties)); | |
71 | ||
70 | 72 | ROUTINE_END |
71 | 73 | } |
72 | 74 | |
84 | 86 | EXPECT_SUCCESS(hsaKmtReleaseSystemProperties()); |
85 | 87 | EXPECT_SUCCESS(hsaKmtCloseKFD()); |
86 | 88 | g_baseTest = NULL; |
89 | ||
90 | if (m_pAsm) | |
91 | delete m_pAsm; | |
92 | m_pAsm = nullptr; | |
87 | 93 | |
88 | 94 | ROUTINE_END |
89 | 95 | } |
33 | 33 | #include "hsakmt.h" |
34 | 34 | #include "OSWrapper.hpp" |
35 | 35 | #include "KFDTestUtil.hpp" |
36 | #include "Assemble.hpp" | |
37 | #include "ShaderStore.hpp" | |
36 | 38 | |
37 | 39 | // @class KFDBaseComponentTest |
38 | 40 | class KFDBaseComponentTest : public testing::Test { |
73 | 75 | HsaMemFlags m_MemoryFlags; |
74 | 76 | HsaNodeInfo m_NodeInfo; |
75 | 77 | HSAint32 m_xnack; |
78 | Assembler* m_pAsm; | |
76 | 79 | |
77 | 80 | // @brief Executed before every test that uses KFDBaseComponentTest class and sets all common settings for the tests. |
78 | 81 | virtual void SetUp(); |
23 | 23 | #include "KFDCWSRTest.hpp" |
24 | 24 | #include "Dispatch.hpp" |
25 | 25 | |
26 | ||
27 | /* Initial state: | |
28 | * s[0:1] - 64 bits iteration number; only the lower 32 bits are useful. | |
29 | * s[2:3] - result buffer base address | |
30 | * s4 - workgroup id | |
31 | * v0 - workitem id, always 0 because | |
32 | * NUM_THREADS_X(number of threads) in workgroup set to 1 | |
33 | * Registers: | |
34 | * v0 - calculated workitem = v0 + s4 * NUM_THREADS_X, which is s4 | |
35 | * v2 - = s0, 32 bits iteration number | |
36 | * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 | |
37 | * v6 - counter | |
38 | */ | |
39 | ||
40 | static const char* iterate_isa_gfx8 = \ | |
41 | "\ | |
42 | shader iterate_isa\n\ | |
43 | wave_size(32)\n\ | |
44 | type(CS)\n\ | |
45 | // copy the parameters from scalar registers to vector registers\n\ | |
46 | v_mov_b32 v2, s0 // v[2:3] = s[0:1] \n\ | |
47 | v_mov_b32 v3, s1 // v[2:3] = s[0:1] \n\ | |
48 | v_mov_b32 v0, s4 // use workgroup id as index \n\ | |
49 | v_lshlrev_b32 v0, 2, v0 // v0 *= 4 \n\ | |
50 | v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 \n\ | |
51 | v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 \n\ | |
52 | v_add_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 \n\ | |
53 | v_mov_b32 v6, 0 \n\ | |
54 | LOOP: \n\ | |
55 | v_add_u32 v6, vcc, 1, v6 \n\ | |
56 | // compare the result value (v6) to iteration value (v2), and \n\ | |
57 | // jump if equal (i.e. if VCC is not zero after the comparison) \n\ | |
58 | v_cmp_lt_u32 vcc, v6, v2 \n\ | |
59 | s_cbranch_vccnz LOOP \n\ | |
60 | flat_store_dword v[4:5], v6 \n\ | |
61 | s_waitcnt vmcnt(0)&lgkmcnt(0) \n\ | |
62 | s_endpgm \n\ | |
63 | end \n\ | |
64 | "; | |
65 | ||
66 | //This shader can be used by gfx9 and gfx10 | |
67 | static const char* iterate_isa_gfx9 = \ | |
68 | "\ | |
69 | shader iterate_isa\n\ | |
70 | wave_size(32)\n\ | |
71 | type(CS)\n\ | |
72 | // copy the parameters from scalar registers to vector registers\n\ | |
73 | v_mov_b32 v2, s0 // v[2:3] = s[0:1] \n\ | |
74 | v_mov_b32 v3, s1 // v[2:3] = s[0:1] \n\ | |
75 | v_mov_b32 v0, s4 // use workgroup id as index \n\ | |
76 | v_lshlrev_b32 v0, 2, v0 // v0 *= 4 \n\ | |
77 | v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 \n\ | |
78 | v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 \n\ | |
79 | v_add_co_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 \n\ | |
80 | v_mov_b32 v6, 0 \n\ | |
81 | LOOP: \n\ | |
82 | v_add_co_u32 v6, vcc, 1, v6 \n\ | |
83 | // compare the result value (v6) to iteration value (v2), and \n\ | |
84 | // jump if equal (i.e. if VCC is not zero after the comparison) \n\ | |
85 | v_cmp_lt_u32 vcc, v6, v2 \n\ | |
86 | s_cbranch_vccnz LOOP \n\ | |
87 | flat_store_dword v[4:5], v6 \n\ | |
88 | s_waitcnt vmcnt(0)&lgkmcnt(0) \n\ | |
89 | s_endpgm \n\ | |
90 | end \n\ | |
91 | "; | |
92 | ||
93 | static const char* infinite_isa = \ | |
94 | "\ | |
95 | shader infinite_isa \n\ | |
96 | wave_size(32) \n\ | |
97 | type(CS) \n\ | |
98 | LOOP: \n\ | |
99 | s_branch LOOP \n\ | |
100 | end \n\ | |
101 | "; | |
102 | ||
103 | 26 | void KFDCWSRTest::SetUp() { |
104 | 27 | ROUTINE_START |
105 | 28 | |
106 | 29 | KFDBaseComponentTest::SetUp(); |
107 | 30 | |
108 | m_pIsaGen = IsaGenerator::Create(m_FamilyId); | |
109 | ||
110 | wave_number = 1; | |
111 | ||
112 | 31 | ROUTINE_END |
113 | 32 | } |
114 | 33 | |
115 | 34 | void KFDCWSRTest::TearDown() { |
116 | 35 | ROUTINE_START |
117 | if (m_pIsaGen) | |
118 | delete m_pIsaGen; | |
119 | m_pIsaGen = NULL; | |
120 | 36 | |
121 | 37 | KFDBaseComponentTest::TearDown(); |
122 | 38 | |
123 | 39 | ROUTINE_END |
124 | } | |
125 | ||
126 | bool isOnEmulator() { | |
127 | uint32_t isEmuMode = 0; | |
128 | ||
129 | fscanf_dec("/sys/module/amdgpu/parameters/emu_mode", &isEmuMode); | |
130 | ||
131 | return isEmuMode; | |
132 | 40 | } |
133 | 41 | |
134 | 42 | static inline uint32_t checkCWSREnabled() { |
142 | 50 | /** |
143 | 51 | * KFDCWSRTest.BasicTest |
144 | 52 | * |
145 | * This test dispatches the loop_inc_isa shader and lets it run, ensuring its destination pointer gets incremented. | |
146 | * It then triggers CWSR and ensures the shader stops running. | |
147 | * It then resumes the shader, ensures that it's running again and terminates it. | |
148 | */ | |
149 | TEST_F(KFDCWSRTest, BasicTest) { | |
53 | * This test dispatches the IterateIsa shader, which continuously increments a vgpr for | |
54 | * (num_witems / WAVE_SIZE) waves. While this shader is running, dequeue/requeue requests | |
55 | * are sent in a loop to trigger CWSRs. | |
56 | * | |
57 | * This is a parameterized test. See the INSTANTIATE_TEST_CASE_P below for an explanation | |
58 | * on the parameters. | |
59 | * | |
60 | * This test defines a CWSR threshold. The shader will continuously loop until inputBuf is | |
61 | * filled with the known stop value, which occurs once cwsr_thresh CWSRs have been | |
62 | * successfully triggered. | |
63 | * | |
64 | * 4 parameterized tests are defined: | |
65 | * | |
66 | * KFDCWSRTest.BasicTest/0 | |
67 | * KFDCWSRTest.BasicTest/1 | |
68 | * KFDCWSRTest.BasicTest/2 | |
69 | * KFDCWSRTest.BasicTest/3 | |
70 | * | |
71 | * 0: 1 work-item, CWSR threshold of 10 | |
72 | * 1: 256 work-items (multi-wave), CWSR threshold of 50 | |
73 | * 2: 512 work-items (multi-wave), CWSR threshold of 100 | |
74 | * 3: 1024 work-items (multi-wave), CWSR threshold of 1000 | |
75 | */ | |
76 | TEST_P(KFDCWSRTest, BasicTest) { | |
150 | 77 | TEST_START(TESTPROFILE_RUNALL); |
151 | 78 | |
79 | int num_witems = std::get<0>(GetParam()); | |
80 | int cwsr_thresh = std::get<1>(GetParam()); | |
152 | 81 | int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); |
153 | 82 | |
154 | 83 | if ((m_FamilyId >= FAMILY_VI) && (checkCWSREnabled())) { |
155 | const char *pIterateIsa; | |
156 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); | |
157 | HsaMemoryBuffer resultBuf1(PAGE_SIZE, defaultGPUNode, true, false, false); | |
158 | uint64_t count1 = 400000000; | |
159 | ||
160 | if (m_FamilyId < FAMILY_AI) | |
161 | pIterateIsa = iterate_isa_gfx8; | |
162 | else | |
163 | pIterateIsa = iterate_isa_gfx9; | |
164 | ||
165 | if (isOnEmulator()) { | |
166 | // Divide the iterator times by 10000 so that the test can | |
167 | // finish in a reasonable time. | |
168 | count1 /= 10000; | |
169 | LOG() << "On Emulators" << std::endl; | |
170 | } | |
171 | ||
172 | unsigned int* result1 = resultBuf1.As<unsigned int*>(); | |
173 | ||
174 | m_pIsaGen->CompileShader(pIterateIsa, "iterate_isa", isaBuffer); | |
175 | ||
176 | PM4Queue queue1; | |
177 | ||
178 | ASSERT_SUCCESS(queue1.Create(defaultGPUNode)); | |
179 | ||
180 | Dispatch *dispatch1; | |
181 | ||
182 | dispatch1 = new Dispatch(isaBuffer); | |
183 | ||
184 | dispatch1->SetArgs(reinterpret_cast<void *>(count1), result1); | |
185 | dispatch1->SetDim(wave_number, 1, 1); | |
186 | ||
187 | // Submit the shader, queue1 | |
188 | dispatch1->Submit(queue1); | |
189 | ||
190 | //Give time for waves to launch before disabling queue. | |
191 | Delay(1); | |
192 | EXPECT_SUCCESS(queue1.Update(0/*percentage*/, BaseQueue::DEFAULT_PRIORITY, false)); | |
84 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true, false, true); | |
85 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(IterateIsa, isaBuffer.As<char*>())); | |
86 | ||
87 | unsigned stopval = 0x1234'5678; | |
88 | unsigned outval = 0x8765'4321; | |
89 | ||
90 | // 4B per work-item ==> 1 page per 1024 work-items (take ceiling) | |
91 | unsigned bufSize = PAGE_SIZE * ((num_witems / 1024) + (num_witems % 1024 != 0)); | |
92 | ||
93 | HsaMemoryBuffer inputBuf(bufSize, defaultGPUNode, true, false, false); | |
94 | HsaMemoryBuffer outputBuf(bufSize, defaultGPUNode, true, false, false); | |
95 | unsigned int* input = inputBuf.As<unsigned int*>(); | |
96 | unsigned int* output = outputBuf.As<unsigned int*>(); | |
97 | inputBuf.Fill(0); | |
98 | outputBuf.Fill(outval); | |
99 | ||
100 | PM4Queue queue; | |
101 | ASSERT_SUCCESS(queue.Create(defaultGPUNode)); | |
102 | ||
103 | Dispatch dispatch(isaBuffer); | |
104 | dispatch.SetArgs(input, output); | |
105 | dispatch.SetDim(num_witems, 1, 1); | |
106 | dispatch.Submit(queue); | |
107 | ||
193 | 108 | Delay(5); |
194 | EXPECT_SUCCESS(queue1.Update(100/*percentage*/, BaseQueue::DEFAULT_PRIORITY, false)); | |
195 | ||
196 | dispatch1->Sync(); | |
197 | // Ensure all the waves complete as expected | |
198 | int i; | |
199 | for (i = 0 ; i < wave_number; ++i) { | |
200 | if (result1[i] != count1) { | |
201 | LOG() << "Dispatch 1, work item [" << std::dec << i << "] " | |
202 | << result1[i] << " != " << count1 << std::endl; | |
203 | break; | |
109 | ||
110 | LOG() << "Starting iteration for " << std::dec << num_witems | |
111 | << " work items(s) (targeting " << std::dec << cwsr_thresh | |
112 | << " CWSRs)" << std::endl; | |
113 | ||
114 | for (int num_cwsrs = 0; num_cwsrs < cwsr_thresh; num_cwsrs++) { | |
115 | ||
116 | // Send dequeue request | |
117 | EXPECT_SUCCESS(queue.Update(0, BaseQueue::DEFAULT_PRIORITY, false)); | |
118 | ||
119 | Delay(5); | |
120 | ||
121 | // Send requeue request | |
122 | EXPECT_SUCCESS(queue.Update(100, BaseQueue::DEFAULT_PRIORITY, false)); | |
123 | ||
124 | Delay(50); | |
125 | ||
126 | // Check for reg mangling | |
127 | for (int i = 0; i < num_witems; i++) { | |
128 | EXPECT_EQ(outval, output[i]); | |
204 | 129 | } |
205 | 130 | } |
206 | EXPECT_EQ(i, wave_number); | |
207 | ||
208 | EXPECT_SUCCESS(queue1.Destroy()); | |
209 | ||
210 | delete dispatch1; | |
131 | ||
132 | LOG() << "Successful completion for " << std::dec << num_witems | |
133 | << " work item(s) (CWSRs triggered: " << std::dec << cwsr_thresh | |
134 | << ")" << std::endl; | |
135 | LOG() << "Signalling shader stop..." << std::endl; | |
136 | ||
137 | inputBuf.Fill(stopval); | |
138 | ||
139 | // Wait for shader to finish or timeout if shader has vm page fault | |
140 | EXPECT_EQ(0, dispatch.SyncWithStatus(180000)); | |
141 | ||
142 | EXPECT_SUCCESS(queue.Destroy()); | |
211 | 143 | } else { |
212 | 144 | LOG() << "Skipping test: No CWSR present for family ID 0x" << m_FamilyId << "." << std::endl; |
213 | 145 | } |
214 | 146 | |
215 | 147 | TEST_END |
216 | 148 | } |
149 | ||
150 | /** | |
151 | * Instantiates various KFDCWSRTest.BasicTest parameterizations | |
152 | * Tuple Format: (num_witems, cwsr_thresh) | |
153 | * | |
154 | * num_witems: Defines the number of work-items. | |
155 | * cwsr_thresh: Defines the number of CWSRs to trigger. | |
156 | */ | |
157 | INSTANTIATE_TEST_CASE_P( | |
158 | , KFDCWSRTest, | |
159 | ::testing::Values( | |
160 | std::make_tuple(1, 10), /* Single Wave Test, 10 CWSR Triggers */ | |
161 | std::make_tuple(256, 50), /* Multi Wave Test, 50 CWSR Triggers */ | |
162 | std::make_tuple(512, 100), /* Multi Wave Test, 100 CWSR Triggers */ | |
163 | std::make_tuple(1024, 1000) /* Multi Wave Test, 1000 CWSR Triggers */ | |
164 | ) | |
165 | ); | |
217 | 166 | |
218 | 167 | /** |
219 | 168 | * KFDCWSRTest.InterruptRestore |
235 | 184 | if ((m_FamilyId >= FAMILY_VI) && (checkCWSREnabled())) { |
236 | 185 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); |
237 | 186 | |
238 | m_pIsaGen->CompileShader(infinite_isa, "infinite_isa", isaBuffer); | |
187 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(InfiniteLoopIsa, isaBuffer.As<char*>())); | |
239 | 188 | |
240 | 189 | PM4Queue queue1, queue2, queue3; |
241 | 190 |
26 | 26 | #include <gtest/gtest.h> |
27 | 27 | |
28 | 28 | #include "PM4Queue.hpp" |
29 | #include "IsaGenerator.hpp" | |
30 | 29 | #include "KFDBaseComponentTest.hpp" |
31 | 30 | |
32 | class KFDCWSRTest : public KFDBaseComponentTest { | |
31 | class KFDCWSRTest : public KFDBaseComponentTest, | |
32 | public ::testing::WithParamInterface<std::tuple<int, int>> { | |
33 | 33 | public: |
34 | KFDCWSRTest() :m_pIsaGen(NULL) {} | |
34 | KFDCWSRTest() {} | |
35 | 35 | ~KFDCWSRTest() {} |
36 | 36 | |
37 | 37 | protected: |
38 | 38 | virtual void SetUp(); |
39 | 39 | virtual void TearDown(); |
40 | ||
41 | protected: // Members | |
42 | unsigned wave_number; | |
43 | IsaGenerator* m_pIsaGen; | |
44 | 40 | }; |
45 | 41 | |
46 | 42 | #endif // __KFD_CWSR_TEST__H__ |
175 | 175 | |
176 | 176 | KFDBaseComponentTest::SetUp(); |
177 | 177 | |
178 | m_pIsaGen = IsaGenerator::Create(m_FamilyId); | |
179 | ||
180 | 178 | ROUTINE_END |
181 | 179 | } |
182 | 180 | |
183 | 181 | void KFDDBGTest::TearDown() { |
184 | 182 | ROUTINE_START |
185 | if (m_pIsaGen) | |
186 | delete m_pIsaGen; | |
187 | m_pIsaGen = NULL; | |
188 | 183 | |
189 | 184 | /* Reset the user trap handler */ |
190 | 185 | hsaKmtSetTrapHandler(m_NodeInfo.HsaDefaultGPUNode(), 0, 0, 0, 0); |
25 | 25 | |
26 | 26 | #include <gtest/gtest.h> |
27 | 27 | |
28 | #include "IsaGenerator.hpp" | |
29 | 28 | #include "KFDBaseComponentTest.hpp" |
30 | 29 | |
31 | 30 | class KFDDBGTest : public KFDBaseComponentTest { |
32 | 31 | public: |
33 | KFDDBGTest() :m_pIsaGen(NULL) {} | |
32 | KFDDBGTest() {} | |
34 | 33 | ~KFDDBGTest() {} |
35 | 34 | |
36 | 35 | protected: |
37 | 36 | virtual void SetUp(); |
38 | 37 | virtual void TearDown(); |
39 | ||
40 | protected: // Members | |
41 | IsaGenerator* m_pIsaGen; | |
42 | 38 | }; |
43 | 39 | |
44 | 40 | #endif // __KFD_DBG_TEST__H__ |
40 | 40 | |
41 | 41 | KFDBaseComponentTest::SetUp(); |
42 | 42 | |
43 | m_pIsaGen = IsaGenerator::Create(m_FamilyId); | |
44 | ||
45 | 43 | ROUTINE_END |
46 | 44 | } |
47 | 45 | |
48 | 46 | void KFDEvictTest::TearDown() { |
49 | 47 | ROUTINE_START |
50 | ||
51 | if (m_pIsaGen) | |
52 | delete m_pIsaGen; | |
53 | m_pIsaGen = NULL; | |
54 | 48 | |
55 | 49 | KFDBaseComponentTest::TearDown(); |
56 | 50 | |
285 | 279 | EXPECT_EQ(0, amdgpu_cs_ctx_free(contextHandle)); |
286 | 280 | } |
287 | 281 | |
288 | /* Shader to read local buffers using multiple wavefronts in parallel | |
289 | * until address buffer is filled with specific value 0x5678 by host program, | |
290 | * then each wavefront fills value 0x5678 at corresponding result buffer and quit | |
291 | * | |
292 | * Initial state: | |
293 | * s[0:1] - address buffer base address | |
294 | * s[2:3] - result buffer base address | |
295 | * s4 - workgroup id | |
296 | * v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1 | |
297 | * Registers: | |
298 | * v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X | |
299 | * v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8 | |
300 | * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 | |
301 | * v[6:7] - local buf address used for read test | |
302 | * | |
303 | * This shader can be used by gfx9 and gfx10 | |
304 | * | |
305 | */ | |
306 | ||
307 | static const char* gfx9_ReadMemory = | |
308 | "\ | |
309 | shader ReadMemory\n\ | |
310 | wave_size(32)\n\ | |
311 | type(CS)\n\ | |
312 | \n\ | |
313 | // compute address of corresponding output buffer\n\ | |
314 | v_mov_b32 v0, s4 // use workgroup id as index\n\ | |
315 | v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\ | |
316 | v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\ | |
317 | v_mov_b32 v5, s3\n\ | |
318 | v_add_co_u32 v5, vcc, v5, vcc_lo\n\ | |
319 | \n\ | |
320 | // compute input buffer offset used to store corresponding local buffer address\n\ | |
321 | v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\ | |
322 | v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\ | |
323 | v_mov_b32 v3, s1\n\ | |
324 | v_add_co_u32 v3, vcc, v3, vcc_lo\n\ | |
325 | \n\ | |
326 | // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\ | |
327 | flat_load_dwordx2 v[6:7], v[2:3] slc\n\ | |
328 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ | |
329 | \n\ | |
330 | v_mov_b32 v8, 0x5678\n\ | |
331 | s_movk_i32 s8, 0x5678\n\ | |
332 | L_REPEAT:\n\ | |
333 | s_load_dword s16, s[0:1], 0x0 glc\n\ | |
334 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ | |
335 | s_cmp_eq_i32 s16, s8\n\ | |
336 | s_cbranch_scc1 L_QUIT // if notified to quit by host\n\ | |
337 | // loop read 64M local buffer starting at v[6:7]\n\ | |
338 | // every 4k page only read once\n\ | |
339 | v_mov_b32 v9, 0\n\ | |
340 | v_mov_b32 v10, 0x1000 // 4k page\n\ | |
341 | v_mov_b32 v11, 0x4000000 // 64M size\n\ | |
342 | v_mov_b32 v12, v6\n\ | |
343 | v_mov_b32 v13, v7\n\ | |
344 | L_LOOP_READ:\n\ | |
345 | flat_load_dwordx2 v[14:15], v[12:13] slc\n\ | |
346 | v_add_co_u32 v9, vcc, v9, v10 \n\ | |
347 | v_add_co_u32 v12, vcc, v12, v10\n\ | |
348 | v_add_co_u32 v13, vcc, v13, vcc_lo\n\ | |
349 | v_cmp_lt_u32 vcc, v9, v11\n\ | |
350 | s_cbranch_vccnz L_LOOP_READ\n\ | |
351 | s_branch L_REPEAT\n\ | |
352 | L_QUIT:\n\ | |
353 | flat_store_dword v[4:5], v8\n\ | |
354 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\ | |
355 | s_endpgm\n\ | |
356 | end\n\ | |
357 | "; | |
358 | ||
359 | static const char* gfx8_ReadMemory = | |
360 | "\ | |
361 | shader ReadMemory\n\ | |
362 | asic(VI)\n\ | |
363 | type(CS)\n\ | |
364 | \n\ | |
365 | // compute address of corresponding output buffer\n\ | |
366 | v_mov_b32 v0, s4 // use workgroup id as index\n\ | |
367 | v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\ | |
368 | v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\ | |
369 | v_mov_b32 v5, s3\n\ | |
370 | v_addc_u32 v5, vcc, v5, 0, vcc\n\ | |
371 | \n\ | |
372 | // compute input buffer offset used to store corresponding local buffer address\n\ | |
373 | v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\ | |
374 | v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\ | |
375 | v_mov_b32 v3, s1\n\ | |
376 | v_addc_u32 v3, vcc, v3, 0, vcc\n\ | |
377 | \n\ | |
378 | // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\ | |
379 | flat_load_dwordx2 v[6:7], v[2:3] slc\n\ | |
380 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ | |
381 | \n\ | |
382 | v_mov_b32 v8, 0x5678\n\ | |
383 | s_movk_i32 s8, 0x5678\n\ | |
384 | L_REPEAT:\n\ | |
385 | s_load_dword s16, s[0:1], 0x0 glc\n\ | |
386 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ | |
387 | s_cmp_eq_i32 s16, s8\n\ | |
388 | s_cbranch_scc1 L_QUIT // if notified to quit by host\n\ | |
389 | // loop read 64M local buffer starting at v[6:7]\n\ | |
390 | // every 4k page only read once\n\ | |
391 | v_mov_b32 v9, 0\n\ | |
392 | v_mov_b32 v10, 0x1000 // 4k page\n\ | |
393 | v_mov_b32 v11, 0x4000000 // 64M size\n\ | |
394 | v_mov_b32 v12, v6\n\ | |
395 | v_mov_b32 v13, v7\n\ | |
396 | L_LOOP_READ:\n\ | |
397 | flat_load_dwordx2 v[14:15], v[12:13] slc\n\ | |
398 | v_add_u32 v9, vcc, v9, v10 \n\ | |
399 | v_add_u32 v12, vcc, v12, v10\n\ | |
400 | v_addc_u32 v13, vcc, v13, 0, vcc\n\ | |
401 | v_cmp_lt_u32 vcc, v9, v11\n\ | |
402 | s_cbranch_vccnz L_LOOP_READ\n\ | |
403 | s_branch L_REPEAT\n\ | |
404 | L_QUIT:\n\ | |
405 | flat_store_dword v[4:5], v8\n\ | |
406 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\ | |
407 | s_endpgm\n\ | |
408 | end\n\ | |
409 | "; | |
410 | ||
411 | std::string KFDEvictTest::CreateShader() { | |
412 | if (m_FamilyId < FAMILY_AI) | |
413 | return gfx8_ReadMemory; | |
414 | else | |
415 | return gfx9_ReadMemory; | |
416 | } | |
417 | ||
418 | 282 | /* Evict and restore procedure basic test |
419 | 283 | * |
420 | 284 | * Use N_PROCESSES processes to allocate vram buf size larger than total vram size |
566 | 430 | HsaMemoryBuffer addrBuffer(PAGE_SIZE, defaultGPUNode); |
567 | 431 | HsaMemoryBuffer resultBuffer(PAGE_SIZE, defaultGPUNode); |
568 | 432 | |
569 | m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer); | |
433 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ReadMemoryIsa, isaBuffer.As<char*>())); | |
570 | 434 | |
571 | 435 | PM4Queue pm4Queue; |
572 | 436 | ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode)); |
26 | 26 | #include <string> |
27 | 27 | #include <vector> |
28 | 28 | #include "KFDMultiProcessTest.hpp" |
29 | #include "IsaGenerator.hpp" | |
30 | 29 | #include "PM4Queue.hpp" |
31 | 30 | |
32 | 31 | // @class KFDEvictTest |
33 | 32 | // Test eviction and restore procedure using two processes |
34 | 33 | class KFDEvictTest : public KFDMultiProcessTest { |
35 | 34 | public: |
36 | KFDEvictTest(void): m_pIsaGen(NULL) {} | |
37 | ||
35 | KFDEvictTest(void) {} | |
38 | 36 | ~KFDEvictTest(void) {} |
39 | 37 | |
40 | 38 | protected: |
41 | 39 | virtual void SetUp(); |
42 | 40 | virtual void TearDown(); |
43 | 41 | |
44 | std::string CreateShader(); | |
45 | 42 | void AllocBuffers(HSAuint32 defaultGPUNode, HSAuint32 count, HSAuint64 vramBufSize, |
46 | 43 | std::vector<void *> &pBuffers); |
47 | 44 | void FreeBuffers(std::vector<void *> &pBuffers, HSAuint64 vramBufSize); |
51 | 48 | PM4Queue *computeQueue); |
52 | 49 | |
53 | 50 | protected: // Members |
54 | IsaGenerator* m_pIsaGen; | |
55 | 51 | HsaMemFlags m_Flags; |
56 | 52 | void* m_pBuf; |
57 | 53 | }; |
32 | 32 | |
33 | 33 | KFDBaseComponentTest::SetUp(); |
34 | 34 | |
35 | m_pIsaGen = IsaGenerator::Create(m_FamilyId); | |
35 | LOG() << "This Exception test might cause expected page fault " | |
36 | "error logs at kernel level." << std::endl; | |
36 | 37 | |
37 | 38 | ROUTINE_END |
38 | 39 | } |
39 | 40 | |
40 | 41 | void KFDExceptionTest::TearDown() { |
41 | 42 | ROUTINE_START |
42 | ||
43 | if (m_pIsaGen) | |
44 | delete m_pIsaGen; | |
45 | m_pIsaGen = NULL; | |
46 | 43 | |
47 | 44 | KFDBaseComponentTest::TearDown(); |
48 | 45 | |
74 | 71 | eventDesc.SyncVar.SyncVar.UserData = NULL; |
75 | 72 | eventDesc.SyncVar.SyncVarSize = 0; |
76 | 73 | |
77 | m_pIsaGen->GetCopyDwordIsa(isaBuffer); | |
74 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>())); | |
75 | ||
78 | 76 | m_ChildStatus = queue.Create(defaultGPUNode); |
79 | 77 | if (m_ChildStatus != HSAKMT_STATUS_SUCCESS) { |
80 | 78 | WARN() << "Queue create failed" << std::endl; |
185 | 183 | |
186 | 184 | m_ChildPid = fork(); |
187 | 185 | if (m_ChildPid == 0) { |
188 | m_ChildStatus = hsaKmtOpenKFD(); | |
189 | if (m_ChildStatus != HSAKMT_STATUS_SUCCESS) { | |
190 | WARN() << "KFD open failed in child process" << std::endl; | |
191 | return; | |
192 | } | |
186 | KFDBaseComponentTest::TearDown(); | |
187 | KFDBaseComponentTest::SetUp(); | |
193 | 188 | |
194 | 189 | HsaMemoryBuffer srcBuffer(PAGE_SIZE, defaultGPUNode, false); |
195 | 190 | |
229 | 224 | |
230 | 225 | m_ChildPid = fork(); |
231 | 226 | if (m_ChildPid == 0) { |
232 | m_ChildStatus = hsaKmtOpenKFD(); | |
233 | if (m_ChildStatus != HSAKMT_STATUS_SUCCESS) { | |
234 | WARN() << "KFD open failed in child process" << std::endl; | |
235 | return; | |
236 | } | |
227 | KFDBaseComponentTest::TearDown(); | |
228 | KFDBaseComponentTest::SetUp(); | |
237 | 229 | |
238 | 230 | HsaMemoryBuffer readOnlyBuffer(PAGE_SIZE, defaultGPUNode, false /*zero*/, |
239 | 231 | false /*isLocal*/, true /*isExec*/, |
279 | 271 | |
280 | 272 | m_ChildPid = fork(); |
281 | 273 | if (m_ChildPid == 0) { |
282 | m_ChildStatus = hsaKmtOpenKFD(); | |
283 | if (m_ChildStatus != HSAKMT_STATUS_SUCCESS) { | |
284 | WARN() << "KFD open failed in child process" << std::endl; | |
285 | return; | |
286 | } | |
274 | KFDBaseComponentTest::TearDown(); | |
275 | KFDBaseComponentTest::SetUp(); | |
287 | 276 | |
288 | 277 | TestMemoryException(defaultGPUNode, 0x12345678, 0x76543210, 1024, 1024, 1); |
289 | 278 | } else { |
322 | 311 | if (m_ChildPid == 0) { |
323 | 312 | unsigned int* pDb = NULL; |
324 | 313 | unsigned int *nullPtr = NULL; |
325 | m_ChildStatus = hsaKmtOpenKFD(); | |
326 | if (m_ChildStatus != HSAKMT_STATUS_SUCCESS) { | |
327 | WARN() << "KFD open failed in child process" << std::endl; | |
328 | return; | |
329 | } | |
314 | ||
315 | KFDBaseComponentTest::TearDown(); | |
316 | KFDBaseComponentTest::SetUp(); | |
317 | ||
330 | 318 | m_MemoryFlags.ui32.NonPaged = 1; |
319 | m_MemoryFlags.ui32.HostAccess = 0; | |
331 | 320 | ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode, PAGE_SIZE, m_MemoryFlags, |
332 | 321 | reinterpret_cast<void**>(&pDb))); |
333 | 322 | // verify that pDb is not null before it's being used |
25 | 25 | |
26 | 26 | #include <gtest/gtest.h> |
27 | 27 | |
28 | #include "IsaGenerator.hpp" | |
29 | 28 | #include "KFDBaseComponentTest.hpp" |
30 | 29 | |
31 | 30 | class KFDExceptionTest : public KFDBaseComponentTest { |
32 | 31 | public: |
33 | KFDExceptionTest() :m_pIsaGen(NULL), m_ChildPid(-1) { | |
32 | KFDExceptionTest() : m_ChildPid(-1) { | |
34 | 33 | /* Because there could be early return before m_ChildPid is set |
35 | 34 | * by fork(), we should initialize m_ChildPid to a non-zero value |
36 | 35 | * to avoid possible exit of the main process. |
42 | 41 | * child process finishes, gtest assumes the test has finished and |
43 | 42 | * starts the next test while the parent is still active. |
44 | 43 | */ |
45 | if (m_ChildPid == 0) | |
44 | if (m_ChildPid == 0) { | |
45 | if (!m_ChildStatus && HasFatalFailure()) | |
46 | m_ChildStatus = HSAKMT_STATUS_ERROR; | |
46 | 47 | exit(m_ChildStatus); |
48 | } | |
47 | 49 | } |
48 | 50 | |
49 | 51 | protected: |
58 | 60 | protected: // Members |
59 | 61 | pid_t m_ChildPid; |
60 | 62 | HSAKMT_STATUS m_ChildStatus; |
61 | ||
62 | IsaGenerator* m_pIsaGen; | |
63 | 63 | }; |
64 | 64 | |
65 | 65 | #endif // __KFD_EXCEPTION_TEST__H__ |
25 | 25 | #include "PM4Packet.hpp" |
26 | 26 | #include "Dispatch.hpp" |
27 | 27 | |
28 | /* Shader to initialize gws counter to 1*/ | |
29 | const char* gfx9_10_GwsInit = | |
30 | "\ | |
31 | shader GwsInit\n\ | |
32 | type(CS)\n\ | |
33 | wave_size(32)\n\ | |
34 | s_mov_b32 m0, 0\n\ | |
35 | s_nop 0\n\ | |
36 | s_load_dword s16, s[0:1], 0x0 glc\n\ | |
37 | s_waitcnt 0\n\ | |
38 | v_mov_b32 v0, s16\n\ | |
39 | s_waitcnt 0\n\ | |
40 | ds_gws_init v0 gds:1 offset0:0\n\ | |
41 | s_waitcnt 0\n\ | |
42 | s_endpgm\n\ | |
43 | end\n\ | |
44 | "; | |
45 | ||
46 | /* Atomically increase a value in memory | |
47 | * This is expected to be executed from | |
48 | * multiple work groups simultaneously. | |
49 | * GWS semaphore is used to guarantee | |
50 | * the operation is atomic. | |
51 | */ | |
52 | const char* gfx9_AtomicIncrease = | |
53 | "\ | |
54 | shader AtomicIncrease\n\ | |
55 | type(CS)\n\ | |
56 | /* Assume src address in s0, s1 */\n\ | |
57 | s_mov_b32 m0, 0\n\ | |
58 | s_nop 0\n\ | |
59 | ds_gws_sema_p gds:1 offset0:0\n\ | |
60 | s_waitcnt 0\n\ | |
61 | s_load_dword s16, s[0:1], 0x0 glc\n\ | |
62 | s_waitcnt 0\n\ | |
63 | s_add_u32 s16, s16, 1\n\ | |
64 | s_store_dword s16, s[0:1], 0x0 glc\n\ | |
65 | s_waitcnt lgkmcnt(0)\n\ | |
66 | ds_gws_sema_v gds:1 offset0:0\n\ | |
67 | s_waitcnt 0\n\ | |
68 | s_endpgm\n\ | |
69 | end\n\ | |
70 | "; | |
71 | ||
72 | const char* gfx10_AtomicIncrease = | |
73 | "\ | |
74 | shader AtomicIncrease\n\ | |
75 | asic(GFX10)\n\ | |
76 | type(CS)\n\ | |
77 | wave_size(32)\n\ | |
78 | /* Assume src address in s0, s1 */\n\ | |
79 | s_mov_b32 m0, 0\n\ | |
80 | s_mov_b32 exec_lo, 0x1\n\ | |
81 | v_mov_b32 v0, s0\n\ | |
82 | v_mov_b32 v1, s1\n\ | |
83 | ds_gws_sema_p gds:1 offset0:0\n\ | |
84 | s_waitcnt 0\n\ | |
85 | flat_load_dword v2, v[0:1] glc:1 dlc:1\n\ | |
86 | s_waitcnt 0\n\ | |
87 | v_add_nc_u32 v2, v2, 1\n\ | |
88 | flat_store_dword v[0:1], v2\n\ | |
89 | s_waitcnt_vscnt null, 0\n\ | |
90 | ds_gws_sema_v gds:1 offset0:0\n\ | |
91 | s_waitcnt 0\n\ | |
92 | s_endpgm\n\ | |
93 | end\n\ | |
94 | "; | |
95 | ||
96 | 28 | void KFDGWSTest::SetUp() { |
97 | 29 | ROUTINE_START |
98 | 30 | |
99 | 31 | KFDBaseComponentTest::SetUp(); |
100 | ||
101 | m_pIsaGen = IsaGenerator::Create(m_FamilyId); | |
102 | 32 | |
103 | 33 | ROUTINE_END |
104 | 34 | } |
105 | 35 | |
106 | 36 | void KFDGWSTest::TearDown() { |
107 | 37 | ROUTINE_START |
108 | ||
109 | if (m_pIsaGen) | |
110 | delete m_pIsaGen; | |
111 | m_pIsaGen = NULL; | |
112 | 38 | |
113 | 39 | KFDBaseComponentTest::TearDown(); |
114 | 40 | |
159 | 85 | pNodeProperties->NumGws,&firstGWS)); |
160 | 86 | EXPECT_EQ(0, firstGWS); |
161 | 87 | |
162 | m_pIsaGen = IsaGenerator::Create(m_FamilyId); | |
163 | m_pIsaGen->CompileShader(gfx9_10_GwsInit, "GwsInit", isaBuffer); | |
88 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(GwsInitIsa, isaBuffer.As<char*>())); | |
89 | ||
164 | 90 | Dispatch dispatch0(isaBuffer); |
165 | 91 | buffer.Fill(numResources, 0, 4); |
166 | 92 | dispatch0.SetArgs(buffer.As<void*>(), NULL); |
167 | 93 | dispatch0.Submit(queue); |
168 | 94 | dispatch0.Sync(); |
169 | 95 | |
170 | const char *pAtomicIncrease; | |
171 | if (m_FamilyId <= FAMILY_AL) | |
172 | pAtomicIncrease = gfx9_AtomicIncrease; | |
173 | else | |
174 | pAtomicIncrease = gfx10_AtomicIncrease; | |
175 | ||
176 | m_pIsaGen->CompileShader(pAtomicIncrease, "AtomicIncrease", isaBuffer); | |
96 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(GwsAtomicIncreaseIsa, isaBuffer.As<char*>())); | |
177 | 97 | |
178 | 98 | Dispatch dispatch(isaBuffer); |
179 | 99 | dispatch.SetArgs(buffer.As<void*>(), NULL); |
25 | 25 | |
26 | 26 | #include <gtest/gtest.h> |
27 | 27 | |
28 | #include "IsaGenerator.hpp" | |
29 | 28 | #include "KFDBaseComponentTest.hpp" |
30 | 29 | |
31 | 30 | class KFDGWSTest : public KFDBaseComponentTest { |
32 | 31 | public: |
33 | KFDGWSTest() :m_pIsaGen(NULL) {} | |
32 | KFDGWSTest() {} | |
34 | 33 | ~KFDGWSTest() {} |
35 | 34 | |
36 | 35 | protected: |
37 | 36 | virtual void SetUp(); |
38 | 37 | virtual void TearDown(); |
39 | ||
40 | protected: // Members | |
41 | IsaGenerator* m_pIsaGen; | |
42 | 38 | }; |
43 | 39 | |
44 | 40 | #endif // __KFD_GWS_TEST__H__ |
100 | 100 | |
101 | 101 | // Copy contents to a system memory buffer for comparison |
102 | 102 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); |
103 | m_pIsaGen->GetCopyDwordIsa(isaBuffer); | |
103 | ||
104 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>())); | |
104 | 105 | |
105 | 106 | HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/); |
106 | 107 |
27 | 27 | |
28 | 28 | KFDBaseComponentTest::SetUp(); |
29 | 29 | |
30 | m_pIsaGen = IsaGenerator::Create(m_FamilyId); | |
31 | ||
32 | 30 | ROUTINE_END |
33 | 31 | } |
34 | 32 | |
35 | 33 | void KFDHWSTest::TearDown() { |
36 | 34 | ROUTINE_START |
37 | ||
38 | if (m_pIsaGen) | |
39 | delete m_pIsaGen; | |
40 | m_pIsaGen = NULL; | |
41 | 35 | |
42 | 36 | KFDBaseComponentTest::TearDown(); |
43 | 37 | |
69 | 63 | |
70 | 64 | // Run work on all queues |
71 | 65 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); |
72 | m_pIsaGen->GetNoopIsa(isaBuffer); | |
66 | ||
67 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(NoopIsa, isaBuffer.As<char*>())); | |
68 | ||
73 | 69 | for (l = 0; l < nLoops; l++) { |
74 | 70 | for (q = 0; q < nQueues; q++) { |
75 | 71 | if (dispatch[q]) |
26 | 26 | #include <gtest/gtest.h> |
27 | 27 | |
28 | 28 | #include "PM4Queue.hpp" |
29 | #include "IsaGenerator.hpp" | |
30 | 29 | #include "KFDMultiProcessTest.hpp" |
31 | 30 | #include "Dispatch.hpp" |
32 | 31 | |
33 | 32 | class KFDHWSTest : public KFDMultiProcessTest { |
34 | 33 | public: |
35 | KFDHWSTest():m_pIsaGen(NULL) {} | |
36 | ||
34 | KFDHWSTest() {} | |
37 | 35 | ~KFDHWSTest() {} |
38 | 36 | |
39 | 37 | protected: |
41 | 39 | virtual void TearDown(); |
42 | 40 | |
43 | 41 | void RunTest(unsigned nProcesses, unsigned nQueues, unsigned nLoops); |
44 | ||
45 | protected: // Members | |
46 | IsaGenerator* m_pIsaGen; | |
47 | 42 | }; |
48 | 43 | |
49 | 44 | #endif // __KFD_QCM_TEST__H__ |
69 | 69 | /* Open KFD device for child process. This needs to be called before |
70 | 70 | * any memory definitions |
71 | 71 | */ |
72 | if (HSAKMT_STATUS_SUCCESS != hsaKmtOpenKFD()) | |
73 | exit(1); | |
72 | TearDown(); | |
73 | SetUp(); | |
74 | 74 | |
75 | 75 | SDMAQueue sdmaQueue; |
76 | 76 | HsaSharedMemoryHandle sharedHandleLM; |
215 | 215 | |
216 | 216 | TEST_END |
217 | 217 | } |
218 | ||
219 | /* Cross Memory Attach Test. Memory Descriptor Array. | |
220 | * The following 2 2D-arrays describe the source and destination memory arrays used | |
221 | * by CMA test. The entry is only valid if Size != 0. Each of these buffers will be | |
222 | * filled initially with "FillPattern". After the test the srcRange is still expected | |
223 | * to have the same pattern. The dstRange is expected to have srcRange pattern. | |
224 | * | |
225 | * For e.g. for TEST_COUNT = 1, | |
226 | * srcRange has 2 buffers of size 0x1800. Buf1 filled with 0xA5A5A5A5 and Buf2 | |
227 | * filled with 0xAAAAAAAA | |
228 | * dstRange has 3 buffers of size 0x1000. All of them filled 0xFFFFFFFF. | |
229 | * After Copy: dstBuf1[0-0x1000] is expected to be 0xA5A5A5A5 | |
230 | * dstBuf2[0-0x800] is expected to be 0xA5A5A5A5 | |
231 | * dstBuf2[0x800-0x1000] is expected to be 0xAAAAAAAA | |
232 | * and dstBuf3[0x0-0x1000] is expected to be 0xAAAAAAAA | |
233 | * | |
234 | * For this CMA test, after copying only the first and the last of dstBuf is checked | |
235 | */ | |
236 | ||
237 | static testMemoryDescriptor srcRange[CMA_TEST_COUNT][CMA_MEMORY_TEST_ARRAY_SIZE] = { | |
238 | { /* Memory Type Size FillPattern FirstItem Last item */ | |
239 | { CMA_MEM_TYPE_USERPTR, 0x801800, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
240 | { CMA_MEM_TYPE_USERPTR, 0x1800, 0xAAAAAAAA, 0xAAAAAAAA, 0xAAAAAAAA }, | |
241 | { CMA_MEM_TYPE_USERPTR, 0x0, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
242 | { CMA_MEM_TYPE_USERPTR, 0x0, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
243 | }, | |
244 | { | |
245 | { CMA_MEM_TYPE_SYSTEM, 0x208000, 0xDEADBEEF, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
246 | { CMA_MEM_TYPE_SYSTEM, 0x4000, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
247 | { CMA_MEM_TYPE_SYSTEM, 0x6000, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
248 | { CMA_MEM_TYPE_SYSTEM, 0x2000, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
249 | }, | |
250 | { | |
251 | { CMA_MEM_TYPE_LOCAL_MEM, 0x800000, 0xDEADBEEF, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
252 | { CMA_MEM_TYPE_LOCAL_MEM, 0x1000, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
253 | { CMA_MEM_TYPE_LOCAL_MEM, 0x1000, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
254 | { CMA_MEM_TYPE_LOCAL_MEM, 0x1000, 0xA5A5A5A5, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
255 | } | |
256 | }; | |
257 | ||
258 | static testMemoryDescriptor dstRange[CMA_TEST_COUNT][CMA_MEMORY_TEST_ARRAY_SIZE] = { | |
259 | { | |
260 | /* Memory Type Size FillPattern FirstItem Last item */ | |
261 | { CMA_MEM_TYPE_USERPTR, 0x801000, 0xFFFFFFFF, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
262 | { CMA_MEM_TYPE_USERPTR, 0x1000, 0xFFFFFFFF, 0xA5A5A5A5, 0xAAAAAAAA }, | |
263 | { CMA_MEM_TYPE_USERPTR, 0x1000, 0xFFFFFFFF, 0xAAAAAAAA, 0xAAAAAAAA }, | |
264 | { CMA_MEM_TYPE_USERPTR, 0x0, 0xFFFFFFFF, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
265 | }, | |
266 | { | |
267 | { CMA_MEM_TYPE_SYSTEM, 0x202000, 0xFFFFFFFF, 0xDEADBEEF, 0xDEADBEEF }, | |
268 | { CMA_MEM_TYPE_SYSTEM, 0x4000, 0xFFFFFFFF, 0xDEADBEEF, 0xDEADBEEF }, | |
269 | { CMA_MEM_TYPE_SYSTEM, 0x8000, 0xFFFFFFFF, 0xDEADBEEF, 0xA5A5A5A5 }, | |
270 | { CMA_MEM_TYPE_SYSTEM, 0x6000, 0xFFFFFFFF, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
271 | }, | |
272 | { | |
273 | { CMA_MEM_TYPE_LOCAL_MEM, 0x800000, 0xFFFFFFFF, 0xDEADBEEF, 0xDEADBEEF }, | |
274 | { CMA_MEM_TYPE_LOCAL_MEM, 0x1000, 0xFFFFFFFF, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
275 | { CMA_MEM_TYPE_LOCAL_MEM, 0x1000, 0xFFFFFFFF, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
276 | { CMA_MEM_TYPE_LOCAL_MEM, 0x1000, 0xFFFFFFFF, 0xA5A5A5A5, 0xA5A5A5A5 }, | |
277 | } | |
278 | }; | |
279 | ||
280 | KFDCMAArray::KFDCMAArray() : m_ValidCount(0), m_QueueArray(HSA_QUEUE_SDMA) { | |
281 | memset(m_MemArray, 0, sizeof(m_MemArray)); | |
282 | memset(m_HsaMemoryRange, 0, sizeof(m_HsaMemoryRange)); | |
283 | } | |
284 | ||
285 | CMA_TEST_STATUS KFDCMAArray::Destroy() { | |
286 | for (int i = 0; i < m_ValidCount; i++) { | |
287 | if (m_MemArray[i]) { | |
288 | void *userPtr; | |
289 | ||
290 | userPtr = m_MemArray[i]->GetUserPtr(); | |
291 | delete m_MemArray[i]; | |
292 | ||
293 | if (userPtr) | |
294 | free(userPtr); | |
295 | } | |
296 | } | |
297 | ||
298 | memset(m_MemArray, 0, sizeof(m_MemArray)); | |
299 | memset(m_HsaMemoryRange, 0, sizeof(m_HsaMemoryRange)); | |
300 | m_ValidCount = 0; | |
301 | ||
302 | return CMA_TEST_SUCCESS; | |
303 | } | |
304 | ||
305 | /* Initialize KFDCMAArray based on array of testMemoryDescriptor. Usually testMemoryDescriptor[] is | |
306 | * statically defined array by the user. Only items with non-zero size are considered valid | |
307 | */ | |
308 | CMA_TEST_STATUS KFDCMAArray::Init(testMemoryDescriptor(*memDescriptor)[CMA_MEMORY_TEST_ARRAY_SIZE], int node) { | |
309 | CMA_TEST_STATUS err = CMA_TEST_SUCCESS; | |
310 | memset(m_MemArray, 0, sizeof(m_MemArray)); | |
311 | memset(m_HsaMemoryRange, 0, sizeof(m_HsaMemoryRange)); | |
312 | ||
313 | m_ValidCount = 0; | |
314 | for (int i = 0; i < CMA_MEMORY_TEST_ARRAY_SIZE; i++) { | |
315 | if ((*memDescriptor)[i].m_MemSize == 0) | |
316 | continue; | |
317 | ||
318 | switch ((*memDescriptor)[i].m_MemType) { | |
319 | case CMA_MEM_TYPE_SYSTEM: | |
320 | m_MemArray[i] = new HsaMemoryBuffer((*memDescriptor)[i].m_MemSize, node); | |
321 | break; | |
322 | ||
323 | case CMA_MEM_TYPE_USERPTR: | |
324 | { | |
325 | void *userPtr = malloc((*memDescriptor)[i].m_MemSize); | |
326 | m_MemArray[i] = new HsaMemoryBuffer(userPtr, (*memDescriptor)[i].m_MemSize); | |
327 | break; | |
328 | } | |
329 | ||
330 | case CMA_MEM_TYPE_LOCAL_MEM: | |
331 | m_MemArray[i] = new HsaMemoryBuffer((*memDescriptor)[i].m_MemSize, node, false, true); | |
332 | break; | |
333 | } | |
334 | ||
335 | if (m_MemArray[i]) { | |
336 | m_HsaMemoryRange[i].MemoryAddress = m_MemArray[i]->As<void*>(); | |
337 | m_HsaMemoryRange[i].SizeInBytes = m_MemArray[i]->Size(); | |
338 | m_ValidCount++; | |
339 | } else { | |
340 | err = CMA_TEST_NOMEM; | |
341 | break; | |
342 | } | |
343 | } | |
344 | ||
345 | return err; | |
346 | } | |
347 | ||
348 | /* Fill each buffer of KFDCMAArray with the pattern described by testMemoryDescriptor[] */ | |
349 | void KFDCMAArray::FillPattern(testMemoryDescriptor(*memDescriptor)[CMA_MEMORY_TEST_ARRAY_SIZE]) { | |
350 | SDMAQueue sdmaQueue; | |
351 | bool queueCreated = false; | |
352 | unsigned int queueNode; | |
353 | ||
354 | for (int i = 0; i < m_ValidCount; i++) { | |
355 | if (m_MemArray[i]->isLocal()) | |
356 | m_MemArray[i]->Fill((*memDescriptor)[i].m_FillPattern, *m_QueueArray.GetQueue(m_MemArray[i]->Node())); | |
357 | else | |
358 | m_MemArray[i]->Fill((*memDescriptor)[i].m_FillPattern); | |
359 | } | |
360 | } | |
361 | ||
362 | /* Check the first and last item of each buffer in KFDCMAArray with the pattern described by | |
363 | * testMemoryDescriptor[]. Return 0 on success. | |
364 | */ | |
365 | CMA_TEST_STATUS KFDCMAArray::checkPattern(testMemoryDescriptor(*memDescriptor)[CMA_MEMORY_TEST_ARRAY_SIZE]) { | |
366 | HSAuint64 lastItem; | |
367 | CMA_TEST_STATUS ret = CMA_TEST_SUCCESS; | |
368 | unsigned int queueNode = 0; | |
369 | bool queueCreated = false; | |
370 | HsaMemoryBuffer tmpBuffer(PAGE_SIZE, 0, true /* zero */); | |
371 | volatile HSAuint32 *tmp = tmpBuffer.As<volatile HSAuint32 *>(); | |
372 | ||
373 | for (int i = 0; i < m_ValidCount; i++) { | |
374 | lastItem = m_MemArray[i]->Size(); | |
375 | lastItem -= sizeof(HSAuint32); | |
376 | ||
377 | if (m_MemArray[i]->isLocal()) { | |
378 | BaseQueue *sdmaQueue = m_QueueArray.GetQueue(m_MemArray[i]->Node()); | |
379 | ||
380 | if (!m_MemArray[i]->IsPattern(0, (*memDescriptor)[i].m_CheckFirstWordPattern, *sdmaQueue, tmp) || | |
381 | !m_MemArray[i]->IsPattern(lastItem, (*memDescriptor)[i].m_CheckLastWordPattern, *sdmaQueue, tmp)) { | |
382 | ret = CMA_CHECK_PATTERN_ERROR; | |
383 | break; | |
384 | } | |
385 | ||
386 | } else { | |
387 | if (!m_MemArray[i]->IsPattern(0, (*memDescriptor)[i].m_CheckFirstWordPattern) || | |
388 | !m_MemArray[i]->IsPattern(lastItem, (*memDescriptor)[i].m_CheckLastWordPattern)) { | |
389 | ret = CMA_CHECK_PATTERN_ERROR; | |
390 | break; | |
391 | } | |
392 | } | |
393 | } | |
394 | ||
395 | return ret; | |
396 | } | |
397 | ||
398 | ||
399 | /* Non-blocking read and write to avoid Test from hanging (block indefinitely) | |
400 | * if either server or client process exits due to assert failure | |
401 | */ | |
402 | static int write_non_block(int fd, const void *buf, int size) { | |
403 | int total_bytes = 0, cur_bytes = 0; | |
404 | int retries = 5; | |
405 | struct timespec tm = { 0, 10000000ULL }; | |
406 | const char *ptr = (const char *)buf; | |
407 | ||
408 | do { | |
409 | cur_bytes = write(fd, ptr, (size - total_bytes)); | |
410 | ||
411 | if (cur_bytes < 0 && errno != EAGAIN) | |
412 | return cur_bytes; | |
413 | ||
414 | if (cur_bytes > 0) { | |
415 | total_bytes += cur_bytes; | |
416 | ptr += cur_bytes; | |
417 | } | |
418 | ||
419 | if (total_bytes < size) | |
420 | nanosleep(&tm, NULL); | |
421 | } while (total_bytes < size && retries--); | |
422 | ||
423 | /* Check for overflow */ | |
424 | if (total_bytes > size) | |
425 | return -1; | |
426 | ||
427 | return total_bytes; | |
428 | } | |
429 | ||
430 | static int read_non_block(int fd, void *buf, int size) { | |
431 | int total_bytes = 0, cur_bytes = 0; | |
432 | int retries = 5; | |
433 | struct timespec tm = { 0, 100000000ULL }; | |
434 | char *ptr = reinterpret_cast<char *>(buf); | |
435 | ||
436 | do { | |
437 | cur_bytes = read(fd, ptr, (size - total_bytes)); | |
438 | ||
439 | if (cur_bytes < 0 && errno != EAGAIN) | |
440 | return cur_bytes; | |
441 | ||
442 | if (cur_bytes > 0) { | |
443 | total_bytes += cur_bytes; | |
444 | ptr += cur_bytes; | |
445 | } | |
446 | ||
447 | if (total_bytes < size) | |
448 | nanosleep(&tm, NULL); | |
449 | } while (total_bytes < size && retries--); | |
450 | ||
451 | if (total_bytes > size) | |
452 | return -1; | |
453 | ||
454 | return total_bytes; | |
455 | } | |
456 | ||
457 | ||
458 | /* Send HsaMemoryRange to another process that is connected via writePipe */ | |
459 | CMA_TEST_STATUS KFDCMAArray::sendCMAArray(int writePipe) { | |
460 | if (write_non_block(writePipe, reinterpret_cast<void*>(&m_HsaMemoryRange), sizeof(m_HsaMemoryRange)) != | |
461 | sizeof(m_HsaMemoryRange)) | |
462 | return CMA_IPC_PIPE_ERROR; | |
463 | return CMA_TEST_SUCCESS; | |
464 | } | |
465 | ||
466 | /* Receive HsaMemoryRange from another process and initialize KFDCMAArray */ | |
467 | CMA_TEST_STATUS KFDCMAArray::recvCMAArray(int readPipe) { | |
468 | int i; | |
469 | ||
470 | if (read_non_block(readPipe, reinterpret_cast<void*>(&m_HsaMemoryRange), sizeof(m_HsaMemoryRange)) != | |
471 | sizeof(m_HsaMemoryRange)) | |
472 | return CMA_IPC_PIPE_ERROR; | |
473 | ||
474 | for (i = 0; i < CMA_MEMORY_TEST_ARRAY_SIZE; i++) { | |
475 | if (m_HsaMemoryRange[i].SizeInBytes) | |
476 | m_ValidCount++; | |
477 | } | |
478 | return CMA_TEST_SUCCESS; | |
479 | } | |
480 | ||
481 | ||
482 | CMA_TEST_STATUS KFDIPCTest::CrossMemoryAttachChildProcess(int defaultGPUNode, int writePipe, | |
483 | int readPipe, CMA_TEST_TYPE testType) { | |
484 | KFDCMAArray cmaLocalArray; | |
485 | char msg[16]; | |
486 | int testNo; | |
487 | CMA_TEST_STATUS status; | |
488 | ||
489 | /* Initialize and fill Local Buffer Array with a pattern. | |
490 | * READ_TEST: Send the Array to parent process. Wait for the parent | |
491 | * to finish reading and checking. Then move to next test case or | |
492 | * quit if last one. | |
493 | * WRITE_TEST: Send Local Buffer Array to parent process and wait | |
494 | * for parent to write to it. Check for new pattern. Then move to next | |
495 | * case or quit if last one. | |
496 | */ | |
497 | for (testNo = 0; testNo < CMA_TEST_COUNT; testNo++) { | |
498 | if (testType == CMA_READ_TEST) { | |
499 | cmaLocalArray.Init(&srcRange[testNo], defaultGPUNode); | |
500 | cmaLocalArray.FillPattern(&srcRange[testNo]); | |
501 | } else { | |
502 | cmaLocalArray.Init(&dstRange[testNo], defaultGPUNode); | |
503 | cmaLocalArray.FillPattern(&dstRange[testNo]); | |
504 | } | |
505 | ||
506 | if (cmaLocalArray.sendCMAArray(writePipe) < 0) { | |
507 | status = CMA_IPC_PIPE_ERROR; | |
508 | break; | |
509 | } | |
510 | ||
511 | /* Wait until the test is over */ | |
512 | memset(msg, 0, sizeof(msg)); | |
513 | if (read_non_block(readPipe, msg, 4) < 0) { | |
514 | status = CMA_IPC_PIPE_ERROR; | |
515 | break; | |
516 | } | |
517 | ||
518 | if (!strcmp(msg, "CHCK")) | |
519 | status = cmaLocalArray.checkPattern(&dstRange[testNo]); | |
520 | else if (!strcmp(msg, "NEXT")) | |
521 | status = CMA_TEST_SUCCESS; | |
522 | else if (!strcmp(msg, "EXIT")) | |
523 | status = CMA_TEST_ABORT; | |
524 | else | |
525 | status = CMA_PARENT_FAIL; | |
526 | ||
527 | cmaLocalArray.Destroy(); | |
528 | if (status != CMA_TEST_SUCCESS) | |
529 | break; | |
530 | } | |
531 | ||
532 | return status; | |
533 | } | |
534 | ||
535 | ||
536 | CMA_TEST_STATUS KFDIPCTest::CrossMemoryAttachParentProcess(int defaultGPUNode, pid_t cid, | |
537 | int writePipe, int readPipe, | |
538 | CMA_TEST_TYPE testType) { | |
539 | KFDCMAArray cmaLocalArray, cmaRemoteArray; | |
540 | HSAuint64 copied = 0; | |
541 | int testNo; | |
542 | CMA_TEST_STATUS status; | |
543 | ||
544 | /* Receive buffer array from child and then initialize and fill in Local Buffer Array. | |
545 | * READ_TEST: Copy remote buffer array into Local Buffer Array and then check | |
546 | * for the new pattern. | |
547 | * WRITE_TEST: Write Local Buffer Array into remote buffer array. Notify child to | |
548 | * check for the new pattern. | |
549 | */ | |
550 | for (testNo = 0; testNo < CMA_TEST_COUNT; testNo++) { | |
551 | status = cmaRemoteArray.recvCMAArray(readPipe); | |
552 | if (status != CMA_TEST_SUCCESS) | |
553 | break; | |
554 | ||
555 | if (testType == CMA_READ_TEST) { | |
556 | status = cmaLocalArray.Init(&dstRange[testNo], defaultGPUNode); | |
557 | if (status != CMA_TEST_SUCCESS) | |
558 | break; | |
559 | cmaLocalArray.FillPattern(&dstRange[testNo]); | |
560 | ||
561 | if (hsaKmtProcessVMRead(cid, cmaLocalArray.getMemoryRange(), | |
562 | cmaLocalArray.getValidRangeCount(), | |
563 | cmaRemoteArray.getMemoryRange(), | |
564 | cmaRemoteArray.getValidRangeCount(), | |
565 | &copied) != HSAKMT_STATUS_SUCCESS) { | |
566 | status = CMA_TEST_HSA_READ_FAIL; | |
567 | break; | |
568 | } | |
569 | ||
570 | status = cmaLocalArray.checkPattern(&dstRange[testNo]); | |
571 | if (status != CMA_TEST_SUCCESS) | |
572 | break; | |
573 | ||
574 | cmaLocalArray.Destroy(); | |
575 | cmaRemoteArray.Destroy(); | |
576 | ||
577 | if (write_non_block(writePipe, "NEXT", 4) < 0) { | |
578 | status = CMA_IPC_PIPE_ERROR; | |
579 | break; | |
580 | } | |
581 | } else { | |
582 | status = cmaLocalArray.Init(&srcRange[testNo], defaultGPUNode); | |
583 | if (status != CMA_TEST_SUCCESS) | |
584 | break; | |
585 | cmaLocalArray.FillPattern(&srcRange[testNo]); | |
586 | ||
587 | if (hsaKmtProcessVMWrite(cid, cmaLocalArray.getMemoryRange(), | |
588 | cmaLocalArray.getValidRangeCount(), | |
589 | cmaRemoteArray.getMemoryRange(), | |
590 | cmaRemoteArray.getValidRangeCount(), | |
591 | &copied) != HSAKMT_STATUS_SUCCESS) { | |
592 | status = CMA_TEST_HSA_WRITE_FAIL; | |
593 | break; | |
594 | } | |
595 | ||
596 | cmaLocalArray.Destroy(); | |
597 | cmaRemoteArray.Destroy(); | |
598 | if (write_non_block(writePipe, "CHCK", 4) < 0) { | |
599 | status = CMA_IPC_PIPE_ERROR; | |
600 | break; | |
601 | } | |
602 | } | |
603 | } /* for loop */ | |
604 | ||
605 | return status; | |
606 | } | |
607 | ||
608 | /* Test Cross Memory Attach | |
609 | * hsaKmtProcessVMRead and hsaKmtProcessVMWrite are GPU address equivalent to | |
610 | * process_vm_readv and process_vm_writev. These calls transfer data between | |
611 | * the address space of the calling process ("the local process") and the process | |
612 | * identified by pid ("the remote process"). | |
613 | * | |
614 | * In the tests parent process will be the local process and child will be | |
615 | * the remote. | |
616 | */ | |
617 | TEST_F(KFDIPCTest, CrossMemoryAttachTest) { | |
618 | TEST_START(TESTPROFILE_RUNALL) | |
619 | ||
620 | int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); | |
621 | int pipeCtoP[2], pipePtoC[2]; | |
622 | int status; | |
623 | ||
624 | ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; | |
625 | ||
626 | if (!GetVramSize(defaultGPUNode)) { | |
627 | LOG() << "Skipping test: No VRAM found." << std::endl; | |
628 | return; | |
629 | } | |
630 | ||
631 | /* Create two non-blocking pipes: pipeCtoP carries child-to-parent messages, pipePtoC parent-to-child */ | |
632 | ASSERT_EQ(pipe2(pipeCtoP, O_NONBLOCK), 0); | |
633 | ASSERT_EQ(pipe2(pipePtoC, O_NONBLOCK), 0); | |
634 | ||
635 | /* Create a child process; it runs the child side of the READ and WRITE tests while this process runs the parent side. NOTE(review): a failed fork() (-1), or a child whose hsaKmtOpenKFD() fails, falls into the else (parent) branch below -- confirm this is intended. */ | |
636 | m_ChildPid = fork(); | |
637 | if (m_ChildPid == 0 && hsaKmtOpenKFD() == HSAKMT_STATUS_SUCCESS) { | |
638 | /* Child Process */ | |
639 | status = CrossMemoryAttachChildProcess(defaultGPUNode, pipeCtoP[1], | |
640 | pipePtoC[0], CMA_READ_TEST); | |
641 | EXPECT_EQ(status, CMA_TEST_SUCCESS) << "Child: Read Test Fail"; | |
642 | status = CrossMemoryAttachChildProcess(defaultGPUNode, pipeCtoP[1], | |
643 | pipePtoC[0], CMA_WRITE_TEST); | |
644 | EXPECT_EQ(status, CMA_TEST_SUCCESS) << "Child: Write Test Fail"; | |
645 | } else { | |
646 | int childStatus; | |
647 | ||
648 | status = CrossMemoryAttachParentProcess(defaultGPUNode, m_ChildPid, | |
649 | pipePtoC[1], pipeCtoP[0], CMA_READ_TEST); /* Parent process */ | |
650 | EXPECT_EQ(status, CMA_TEST_SUCCESS) << "Parent: Read Test Fail"; | |
651 | status = CrossMemoryAttachParentProcess(defaultGPUNode, m_ChildPid, | |
652 | pipePtoC[1], pipeCtoP[0], CMA_WRITE_TEST); | |
653 | EXPECT_EQ(status, CMA_TEST_SUCCESS) << "Parent: Write Test Fail"; | |
654 | ||
655 | waitpid(m_ChildPid, &childStatus, 0); | |
656 | EXPECT_EQ(WIFEXITED(childStatus), true); | |
657 | EXPECT_EQ(WEXITSTATUS(childStatus), 0); | |
658 | } | |
659 | ||
660 | /* Code path executed by both parent and child with respective fds */ | |
661 | close(pipeCtoP[1]); | |
662 | close(pipeCtoP[0]); | |
663 | close(pipePtoC[1]); | |
664 | close(pipePtoC[0]); | |
665 | TEST_END | |
666 | } | |
667 | ||
668 | /* Test Cross Memory Attach | |
669 | * | |
670 | * hsaKmtProcessVMRead and hsaKmtProcessVMWrite are GPU address equivalent to | |
671 | * process_vm_readv and process_vm_writev. These calls are used to transfer data | |
672 | * between the address space of the calling process ("the local process") and the process | |
673 | * identified by pid ("the remote process"). However, these functions should also work | |
674 | * with a single process and single BO. | |
675 | */ | |
676 | TEST_F(KFDIPCTest, CMABasicTest) { | |
677 | TEST_START(TESTPROFILE_RUNALL) | |
678 | ||
679 | int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); | |
680 | HSAuint64 size = PAGE_SIZE; | |
681 | SDMAQueue sdmaQueue; | |
682 | HsaMemoryRange srcRange, dstRange; | |
683 | HSAuint64 copied; | |
684 | const int PATTERN1 = 0xA5A5A5A5, PATTERN2 = 0xFFFFFFFF; | |
685 | HSAKMT_STATUS status; | |
686 | ||
687 | ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; | |
688 | ||
689 | if (!GetVramSize(defaultGPUNode)) { | |
690 | LOG() << "Skipping test: No VRAM found." << std::endl; | |
691 | return; | |
692 | } | |
693 | ||
694 | ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode)); | |
695 | HsaMemoryBuffer tmpBuffer(PAGE_SIZE, 0, true /* zero */); | |
696 | volatile HSAuint32 *tmp = tmpBuffer.As<volatile HSAuint32 *>(); | |
697 | ||
698 | /* Initialize test buffer. Fill first half and second half with | |
699 | * different patterns (PATTERN1 in the first half, PATTERN2 in the second) | |
700 | */ | |
701 | HsaMemoryBuffer testLocalBuffer(size, defaultGPUNode, false, true); | |
702 | testLocalBuffer.Fill(PATTERN1, sdmaQueue, 0, size/2); | |
703 | testLocalBuffer.Fill(PATTERN2, sdmaQueue, size/2, size/2); | |
704 | ||
705 | /* Test1. Copy (or overwrite) buffer onto itself; both halves must keep their patterns */ | |
706 | srcRange.MemoryAddress = testLocalBuffer.As<void*>(); | |
707 | srcRange.SizeInBytes = size; | |
708 | dstRange.MemoryAddress = testLocalBuffer.As<void*>(); | |
709 | dstRange.SizeInBytes = size; | |
710 | ASSERT_SUCCESS(hsaKmtProcessVMRead(getpid(), &dstRange, 1, &srcRange, 1, &copied)); | |
711 | EXPECT_EQ(copied, size); | |
712 | ||
713 | EXPECT_TRUE(testLocalBuffer.IsPattern(0, PATTERN1, sdmaQueue, tmp)); | |
714 | EXPECT_TRUE(testLocalBuffer.IsPattern(size - 4, PATTERN2, sdmaQueue, tmp)); | |
715 | ||
716 | ||
717 | /* Test2. Test unaligned byte copy. Write 3 bytes to an unaligned destination address */ | |
718 | const int unaligned_offset = 1; | |
719 | const int unaligned_size = 3; | |
720 | const int unaligned_mask = (((1 << (unaligned_size * 8)) - 1) << (unaligned_offset * 8)); /* selects bytes 1..3 of a dword (0xFFFFFF00). NOTE(review): computed in signed int, where 0xFFFFFF << 8 exceeds INT_MAX with 32-bit int -- consider unsigned */ | |
721 | HSAuint32 expected_pattern; | |
722 | ||
723 | srcRange.MemoryAddress = testLocalBuffer.As<void*>(); | |
724 | ||
725 | /* Deliberately set to value > unaligned_size. Only unaligned_size | |
726 | * should be copied since dstRange.SizeInBytes == unaligned_size | |
727 | */ | |
728 | srcRange.SizeInBytes = size; | |
729 | ||
730 | dstRange.MemoryAddress = reinterpret_cast<void *>(testLocalBuffer.As<char*>() + (size / 2) + unaligned_offset); | |
731 | dstRange.SizeInBytes = unaligned_size; | |
732 | ASSERT_SUCCESS(hsaKmtProcessVMRead(getpid(), &dstRange, 1, &srcRange, 1, &copied)); | |
733 | EXPECT_EQ(copied, unaligned_size); | |
734 | ||
735 | expected_pattern = (PATTERN2 & ~unaligned_mask | (PATTERN1 & unaligned_mask)); | |
736 | EXPECT_TRUE(testLocalBuffer.IsPattern(size/2, expected_pattern, sdmaQueue, tmp)); | |
737 | ||
738 | ||
739 | /* Test3. Test overflow and expect failure */ | |
740 | srcRange.MemoryAddress = testLocalBuffer.As<void*>(); | |
741 | srcRange.SizeInBytes = size; | |
742 | dstRange.MemoryAddress = reinterpret_cast<void *>(testLocalBuffer.As<char*>() + 4); | |
743 | dstRange.SizeInBytes = size; /* This should overflow since offset is VA + 4 */ | |
744 | status = hsaKmtProcessVMRead(getpid(), &dstRange, 1, &srcRange, 1, &copied); | |
745 | EXPECT_NE(status, HSAKMT_STATUS_SUCCESS); | |
746 | EXPECT_LE(copied, (size - 4)); | |
747 | ||
748 | EXPECT_SUCCESS(sdmaQueue.Destroy()); | |
749 | ||
750 | TEST_END | |
751 | } |
22 | 22 | |
23 | 23 | #include "KFDBaseComponentTest.hpp" |
24 | 24 | #include "BaseQueue.hpp" |
25 | #include "IsaGenerator.hpp" | |
26 | 25 | |
27 | 26 | #ifndef __KFD_MEMORY_TEST__H__ |
28 | 27 | #define __KFD_MEMORY_TEST__H__ |
32 | 32 | |
33 | 33 | KFDBaseComponentTest::SetUp(); |
34 | 34 | |
35 | m_pIsaGen = IsaGenerator::Create(m_FamilyId); | |
36 | ||
37 | 35 | ROUTINE_END |
38 | 36 | } |
39 | 37 | |
40 | 38 | void KFDLocalMemoryTest::TearDown() { |
41 | 39 | ROUTINE_START |
42 | ||
43 | if (m_pIsaGen) | |
44 | delete m_pIsaGen; | |
45 | m_pIsaGen = NULL; | |
46 | 40 | |
47 | 41 | KFDBaseComponentTest::TearDown(); |
48 | 42 | |
106 | 100 | |
107 | 101 | srcSysBuffer.Fill(0x01010101); |
108 | 102 | |
109 | m_pIsaGen->GetCopyDwordIsa(isaBuffer); | |
103 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>())); | |
110 | 104 | |
111 | 105 | ASSERT_SUCCESS(hsaKmtMapMemoryToGPUNodes(srcLocalBuffer.As<void*>(), srcLocalBuffer.Size(), &AlternateVAGPU, |
112 | 106 | mapFlags, 1, reinterpret_cast<HSAuint32 *>(&defaultGPUNode))); |
163 | 157 | |
164 | 158 | SysBufferA.Fill(0x01010101); |
165 | 159 | |
166 | m_pIsaGen->GetCopyDwordIsa(isaBuffer); | |
160 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>())); | |
167 | 161 | |
168 | 162 | ASSERT_SUCCESS(queue.Create(defaultGPUNode)); |
169 | 163 | queue.SetSkipWaitConsump(0); |
302 | 296 | PM4Queue queue; |
303 | 297 | ASSERT_SUCCESS(queue.Create(defaultGPUNode)); |
304 | 298 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode); |
305 | m_pIsaGen->GetCopyDwordIsa(isaBuffer); | |
299 | ||
300 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>())); | |
306 | 301 | |
307 | 302 | /* Allocate and test memory using the strategy explained at the top */ |
308 | 303 | HSAKMT_STATUS status; |
25 | 25 | |
26 | 26 | #include <gtest/gtest.h> |
27 | 27 | |
28 | #include "IsaGenerator.hpp" | |
29 | 28 | #include "KFDBaseComponentTest.hpp" |
30 | 29 | |
31 | 30 | class KFDLocalMemoryTest : public KFDBaseComponentTest { |
32 | 31 | public: |
33 | KFDLocalMemoryTest() :m_pIsaGen(NULL) {} | |
32 | KFDLocalMemoryTest() {} | |
34 | 33 | ~KFDLocalMemoryTest() {} |
35 | 34 | |
36 | 35 | protected: |
37 | 36 | virtual void SetUp(); |
38 | 37 | virtual void TearDown(); |
39 | ||
40 | protected: // Members | |
41 | IsaGenerator* m_pIsaGen; | |
42 | 38 | }; |
43 | 39 | |
44 | 40 | #endif // __KFD_LOCALMEMORY_TEST__H__ |
38 | 38 | #include "SDMAPacket.hpp" |
39 | 39 | #include "linux/kfd_ioctl.h" |
40 | 40 | |
41 | const char* gfx8_ScratchCopyDword = | |
42 | "\ | |
43 | shader ScratchCopyDword\n\ | |
44 | asic(VI)\n\ | |
45 | type(CS)\n\ | |
46 | /*copy the parameters from scalar registers to vector registers*/\n\ | |
47 | v_mov_b32 v0, s0\n\ | |
48 | v_mov_b32 v1, s1\n\ | |
49 | v_mov_b32 v2, s2\n\ | |
50 | v_mov_b32 v3, s3\n\ | |
51 | /*set up the scratch parameters. This assumes a single 16-reg block.*/\n\ | |
52 | s_mov_b32 flat_scratch_lo, 8/*2 dwords of scratch per thread*/\n\ | |
53 | s_mov_b32 flat_scratch_hi, 0/*offset in units of 256bytes*/\n\ | |
54 | /*copy a dword between the passed addresses*/\n\ | |
55 | flat_load_dword v4, v[0:1] slc\n\ | |
56 | s_waitcnt vmcnt(0)&lgkmcnt(0)\n\ | |
57 | flat_store_dword v[2:3], v4 slc\n\ | |
58 | \n\ | |
59 | s_endpgm\n\ | |
60 | \n\ | |
61 | end\n\ | |
62 | "; | |
63 | ||
64 | const char* gfx9_ScratchCopyDword = | |
65 | "\ | |
66 | shader ScratchCopyDword\n\ | |
67 | asic(GFX9)\n\ | |
68 | type(CS)\n\ | |
69 | /*copy the parameters from scalar registers to vector registers*/\n\ | |
70 | v_mov_b32 v0, s0\n\ | |
71 | v_mov_b32 v1, s1\n\ | |
72 | v_mov_b32 v2, s2\n\ | |
73 | v_mov_b32 v3, s3\n\ | |
74 | /*set up the scratch parameters. This assumes a single 16-reg block.*/\n\ | |
75 | s_mov_b32 flat_scratch_lo, s4\n\ | |
76 | s_mov_b32 flat_scratch_hi, s5\n\ | |
77 | /*copy a dword between the passed addresses*/\n\ | |
78 | flat_load_dword v4, v[0:1] slc\n\ | |
79 | s_waitcnt vmcnt(0)&lgkmcnt(0)\n\ | |
80 | flat_store_dword v[2:3], v4 slc\n\ | |
81 | \n\ | |
82 | s_endpgm\n\ | |
83 | \n\ | |
84 | end\n\ | |
85 | "; | |
86 | const char* gfx10_ScratchCopyDword = | |
87 | "\ | |
88 | shader ScratchCopyDword\n\ | |
89 | asic(GFX10)\n\ | |
90 | type(CS)\n\ | |
91 | wave_size(32)\n\ | |
92 | /*copy the parameters from scalar registers to vector registers*/\n\ | |
93 | v_mov_b32 v0, s0\n\ | |
94 | v_mov_b32 v1, s1\n\ | |
95 | v_mov_b32 v2, s2\n\ | |
96 | v_mov_b32 v3, s3\n\ | |
97 | /*set up the scratch parameters. This assumes a single 16-reg block.*/\n\ | |
98 | s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s4\n\ | |
99 | s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s5\n\ | |
100 | /*copy a dword between the passed addresses*/\n\ | |
101 | flat_load_dword v4, v[0:1] slc\n\ | |
102 | s_waitcnt vmcnt(0)&lgkmcnt(0)\n\ | |
103 | flat_store_dword v[2:3], v4 slc\n\ | |
104 | \n\ | |
105 | s_endpgm\n\ | |
106 | \n\ | |
107 | end\n\ | |
108 | "; | |
109 | ||
110 | const char* aldbrn_ScratchCopyDword = | |
111 | "\ | |
112 | shader ScratchCopyDword\n\ | |
113 | asic(ALDEBARAN)\n\ | |
114 | type(CS)\n\ | |
115 | /*copy the parameters from scalar registers to vector registers*/\n\ | |
116 | v_mov_b32 v0, s0\n\ | |
117 | v_mov_b32 v1, s1\n\ | |
118 | v_mov_b32 v2, s2\n\ | |
119 | v_mov_b32 v3, s3\n\ | |
120 | /*set up the scratch parameters. This assumes a single 16-reg block.*/\n\ | |
121 | s_mov_b32 flat_scratch_lo, s4\n\ | |
122 | s_mov_b32 flat_scratch_hi, s5\n\ | |
123 | /*copy a dword between the passed addresses*/\n\ | |
124 | flat_load_dword v4, v[0:1] slc\n\ | |
125 | s_waitcnt vmcnt(0)&lgkmcnt(0)\n\ | |
126 | flat_store_dword v[2:3], v4 slc\n\ | |
127 | \n\ | |
128 | s_endpgm\n\ | |
129 | \n\ | |
130 | end\n\ | |
131 | "; | |
132 | ||
133 | ||
134 | ||
135 | /* Continuously poll src buffer and check buffer value | |
136 | * After src buffer is filled with specific value (0x5678, | |
137 | * by host program), fill dst buffer with specific | |
138 | * value(0x5678) and quit | |
139 | */ | |
140 | const char* gfx9_PollMemory = | |
141 | "\ | |
142 | shader ReadMemory\n\ | |
143 | wave_size(32)\n\ | |
144 | type(CS)\n\ | |
145 | /* Assume src address in s0, s1 and dst address in s2, s3*/\n\ | |
146 | s_movk_i32 s18, 0x5678\n\ | |
147 | LOOP:\n\ | |
148 | s_load_dword s16, s[0:1], 0x0 glc\n\ | |
149 | s_cmp_eq_i32 s16, s18\n\ | |
150 | s_cbranch_scc0 LOOP\n\ | |
151 | s_store_dword s18, s[2:3], 0x0 glc\n\ | |
152 | s_endpgm\n\ | |
153 | end\n\ | |
154 | "; | |
155 | ||
156 | /* Similar to gfx9_PollMemory except that the buffer | |
157 | * polled can be Non-coherent memory. SCC system-level | |
158 | * cache coherence is not supported in scalar (smem) path. | |
159 | * Use vmem operations with scc | |
160 | */ | |
161 | const char* gfx9_PollNCMemory = | |
162 | "\ | |
163 | shader ReadMemory\n\ | |
164 | asic(ALDEBARAN)\n\ | |
165 | wave_size(32)\n\ | |
166 | type(CS)\n\ | |
167 | /* Assume src address in s0, s1 and dst address in s2, s3*/\n\ | |
168 | v_mov_b32 v6, 0x5678\n\ | |
169 | v_mov_b32 v0, s0\n\ | |
170 | v_mov_b32 v1, s1\n\ | |
171 | LOOP:\n\ | |
172 | flat_load_dword v4, v[0:1] scc\n\ | |
173 | v_cmp_eq_u32 vcc, v4, v6\n\ | |
174 | s_cbranch_vccz LOOP\n\ | |
175 | v_mov_b32 v0, s2\n\ | |
176 | v_mov_b32 v1, s3\n\ | |
177 | flat_store_dword v[0:1], v6 scc\n\ | |
178 | s_endpgm\n\ | |
179 | end\n\ | |
180 | "; | |
181 | ||
182 | const char* gfx10_PollMemory = | |
183 | "\ | |
184 | shader ReadMemory\n\ | |
185 | wave_size(32)\n\ | |
186 | type(CS)\n\ | |
187 | /* Assume src address in s0, s1 and dst address in s2, s3*/\n\ | |
188 | s_movk_i32 s18, 0x5678\n\ | |
189 | v_mov_b32 v0, s2\n\ | |
190 | v_mov_b32 v1, s3\n\ | |
191 | v_mov_b32 v2, 0x5678\n\ | |
192 | LOOP:\n\ | |
193 | s_load_dword s16, s[0:1], 0x0 glc\n\ | |
194 | s_cmp_eq_i32 s16, s18\n\ | |
195 | s_cbranch_scc0 LOOP\n\ | |
196 | flat_store_dword v[0,1], v2 slc\n\ | |
197 | s_waitcnt vmcnt(0)&lgkmcnt(0)\n\ | |
198 | s_endpgm\n\ | |
199 | end\n\ | |
200 | "; | |
201 | ||
202 | /* Input: A buffer of at least 3 dwords. | |
203 | * DW0: used as a signal. 0xcafe means it is signaled | |
204 | * DW1: Input buffer for device to read. | |
205 | * DW2: Output buffer for device to write. | |
206 | * Once the signal is received, the device copies DW1 to DW2. | |
207 | * This shader continuously polls the signal buffer. | |
208 | * Once the signal buffer is signaled, it copies the input buffer | |
209 | * to the output buffer | |
210 | */ | |
211 | const char* gfx9_CopyOnSignal = | |
212 | "\ | |
213 | shader CopyOnSignal\n\ | |
214 | wave_size(32)\n\ | |
215 | type(CS)\n\ | |
216 | /* Assume input buffer in s0, s1 */\n\ | |
217 | s_mov_b32 s18, 0xcafe\n\ | |
218 | POLLSIGNAL:\n\ | |
219 | s_load_dword s16, s[0:1], 0x0 glc\n\ | |
220 | s_cmp_eq_i32 s16, s18\n\ | |
221 | s_cbranch_scc0 POLLSIGNAL\n\ | |
222 | s_load_dword s17, s[0:1], 0x4 glc\n\ | |
223 | s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ | |
224 | s_store_dword s17, s[0:1], 0x8 glc\n\ | |
225 | s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ | |
226 | s_endpgm\n\ | |
227 | end\n\ | |
228 | "; | |
229 | ||
230 | const char* gfx10_CopyOnSignal = | |
231 | "\ | |
232 | shader CopyOnSignal\n\ | |
233 | wave_size(32)\n\ | |
234 | type(CS)\n\ | |
235 | /* Assume input buffer in s0, s1 */\n\ | |
236 | s_add_u32 s2, s0, 0x8\n\ | |
237 | s_addc_u32 s3, s1, 0x0\n\ | |
238 | s_mov_b32 s18, 0xcafe\n\ | |
239 | v_mov_b32 v0, s0\n\ | |
240 | v_mov_b32 v1, s1\n\ | |
241 | v_mov_b32 v4, s2\n\ | |
242 | v_mov_b32 v5, s3\n\ | |
243 | POLLSIGNAL:\n\ | |
244 | s_load_dword s16, s[0:1], 0x0 glc\n\ | |
245 | s_cmp_eq_i32 s16, s18\n\ | |
246 | s_cbranch_scc0 POLLSIGNAL\n\ | |
247 | s_load_dword s17, s[0:1], 0x4 glc\n\ | |
248 | s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ | |
249 | v_mov_b32 v2, s17\n\ | |
250 | flat_store_dword v[4,5], v2 glc\n\ | |
251 | s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ | |
252 | s_endpgm\n\ | |
253 | end\n\ | |
254 | "; | |
255 | ||
256 | /* Input0: A buffer of at least 2 dwords. | |
257 | * DW0: used as a signal. Write 0xcafe to signal | |
258 | * DW1: Write to this buffer for other device to read. | |
259 | * Input1: mmio base address | |
260 | */ | |
261 | const char* gfx9_WriteAndSignal = | |
262 | "\ | |
263 | shader WriteAndSignal\n\ | |
264 | wave_size(32)\n\ | |
265 | type(CS)\n\ | |
266 | /* Assume input buffer in s0, s1 */\n\ | |
267 | s_mov_b32 s18, 0xbeef\n\ | |
268 | s_store_dword s18, s[0:1], 0x4 glc\n\ | |
269 | s_mov_b32 s18, 0x1\n\ | |
270 | s_store_dword s18, s[2:3], 0 glc\n\ | |
271 | s_mov_b32 s18, 0xcafe\n\ | |
272 | s_store_dword s18, s[0:1], 0x0 glc\n\ | |
273 | s_endpgm\n\ | |
274 | end\n\ | |
275 | "; | |
276 | ||
277 | /* Continuously poll the flag at src buffer | |
278 | * After the flag at s[0:1] is set to 1, | |
279 | * copy the value from s[0:1]+4 to the dst buffer | |
280 | */ | |
281 | const char* gfx9_PollAndCopy = | |
282 | "\ | |
283 | shader CopyMemory\n\ | |
284 | wave_size(32)\n\ | |
285 | type(CS)\n\ | |
286 | /* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\ | |
287 | s_movk_i32 s18, 0x1\n\ | |
288 | LOOP:\n\ | |
289 | s_load_dword s16, s[0:1], 0x0 glc\n\ | |
290 | s_cmp_eq_i32 s16, s18\n\ | |
291 | s_cbranch_scc0 LOOP\n\ | |
292 | s_load_dword s17, s[0:1], 0x4 glc\n\ | |
293 | s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ | |
294 | s_store_dword s17, s[2:3], 0x0 glc:1\n\ | |
295 | s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ | |
296 | s_endpgm\n\ | |
297 | end\n\ | |
298 | "; | |
299 | ||
300 | const char* gfx9aldbrn_PollAndCopy = | |
301 | "\ | |
302 | shader CopyMemory\n\ | |
303 | wave_size(32)\n\ | |
304 | type(CS)\n\ | |
305 | /* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\ | |
306 | v_mov_b32 v0, s0\n\ | |
307 | v_mov_b32 v1, s1\n\ | |
308 | v_mov_b32 v18, 0x1\n\ | |
309 | LOOP:\n\ | |
310 | flat_load_dword v16, v[0:1] glc\n\ | |
311 | s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ | |
312 | v_cmp_eq_i32 vcc, v16, v18\n\ | |
313 | s_cbranch_vccz LOOP\n\ | |
314 | buffer_invl2\n\ | |
315 | s_load_dword s17, s[0:1], 0x4 glc\n\ | |
316 | s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ | |
317 | s_store_dword s17, s[2:3], 0x0 glc\n\ | |
318 | s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ | |
319 | buffer_wbl2\n\ | |
320 | s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ | |
321 | s_endpgm\n\ | |
322 | end\n\ | |
323 | "; | |
324 | ||
325 | /* Input0: A buffer of at least 2 dwords. | |
326 | * DW0: used as a signal. Write 0x1 to signal | |
327 | * DW1: Write the value from 2nd input buffer | |
328 | * for other device to read. | |
329 | * Input1: A buffer of at least 2 dwords. | |
330 | * DW0: used as the value to be written. | |
331 | */ | |
332 | const char* gfx9aldbrn_WriteFlagAndValue = | |
333 | "\ | |
334 | shader WriteMemory\n\ | |
335 | wave_size(32)\n\ | |
336 | type(CS)\n\ | |
337 | /* Assume two inputs buffer in s[0:1] and s[2:3]*/\n\ | |
338 | v_mov_b32 v0, s0\n\ | |
339 | v_mov_b32 v1, s1\n\ | |
340 | s_load_dword s18, s[2:3], 0x0 glc\n\ | |
341 | s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ | |
342 | s_store_dword s18, s[0:1], 0x4 glc\n\ | |
343 | s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ | |
344 | buffer_wbl2\n\ | |
345 | s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ | |
346 | v_mov_b32 v16, 0x1\n\ | |
347 | flat_store_dword v[0:1], v16 glc\n\ | |
348 | s_endpgm\n\ | |
349 | end\n\ | |
350 | "; | |
351 | ||
352 | const char* gfx10_WriteAndSignal = | |
353 | "\ | |
354 | shader WriteAndSignal\n\ | |
355 | wave_size(32)\n\ | |
356 | type(CS)\n\ | |
357 | /* Assume input buffer in s0, s1 */\n\ | |
358 | s_add_u32 s4, s0, 0x4\n\ | |
359 | s_addc_u32 s5, s1, 0x0\n\ | |
360 | v_mov_b32 v0, s0\n\ | |
361 | v_mov_b32 v1, s1\n\ | |
362 | v_mov_b32 v2, s2\n\ | |
363 | v_mov_b32 v3, s3\n\ | |
364 | v_mov_b32 v4, s4\n\ | |
365 | v_mov_b32 v5, s5\n\ | |
366 | v_mov_b32 v18, 0xbeef\n\ | |
367 | flat_store_dword v[4:5], v18 glc\n\ | |
368 | v_mov_b32 v18, 0x1\n\ | |
369 | flat_store_dword v[2:3], v18 glc\n\ | |
370 | v_mov_b32 v18, 0xcafe\n\ | |
371 | flat_store_dword v[0:1], v18 glc\n\ | |
372 | s_endpgm\n\ | |
373 | end\n\ | |
374 | "; | |
375 | ||
376 | //These gfx9_PollMemory, gfx9_CopyOnSignal, gfx9_WriteAndSignal shaders can be used by both gfx9 and gfx10 | |
377 | ||
378 | 41 | void KFDMemoryTest::SetUp() { |
379 | 42 | ROUTINE_START |
380 | 43 | |
381 | 44 | KFDBaseComponentTest::SetUp(); |
382 | 45 | |
383 | m_pIsaGen = IsaGenerator::Create(m_FamilyId); | |
384 | ||
385 | 46 | ROUTINE_END |
386 | 47 | } |
387 | 48 | |
388 | 49 | void KFDMemoryTest::TearDown() { |
389 | 50 | ROUTINE_START |
390 | ||
391 | if (m_pIsaGen) | |
392 | delete m_pIsaGen; | |
393 | m_pIsaGen = NULL; | |
394 | 51 | |
395 | 52 | KFDBaseComponentTest::TearDown(); |
396 | 53 | |
507 | 164 | HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode); |
508 | 165 | |
509 | 166 | const char *pReadMemory; |
510 | if (m_FamilyId < FAMILY_NV) | |
511 | pReadMemory = gfx9_PollMemory; | |
512 | else | |
513 | pReadMemory = gfx10_PollMemory; | |
514 | ||
515 | 167 | if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) |
516 | 168 | /* On A+A system memory is mapped as NC */ |
517 | m_pIsaGen->CompileShader(gfx9_PollNCMemory, "ReadMemory", isaBuffer); | |
169 | pReadMemory = PollNCMemoryIsa; | |
518 | 170 | else |
519 | m_pIsaGen->CompileShader(pReadMemory, "ReadMemory", isaBuffer); | |
171 | pReadMemory = PollMemoryIsa; | |
172 | ||
173 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pReadMemory, isaBuffer.As<char*>())); | |
520 | 174 | |
521 | 175 | PM4Queue pm4Queue; |
522 | 176 | ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode)); |
591 | 245 | m_MemoryFlags.ui32.NoNUMABind = 1; |
592 | 246 | EXPECT_SUCCESS(hsaKmtAllocMemory(0 /* system */, PAGE_SIZE, m_MemoryFlags, reinterpret_cast<void**>(&pDb))); |
593 | 247 | |
248 | TEST_END | |
249 | } | |
250 | ||
251 | // Basic test for hsaKmtAllocMemory: query hsaKmtAvailableMemory, then try to allocate that much, retrying with the size reduced in steps of (1 << shrink) bytes until one allocation succeeds; the test passes if that allocation can be freed | |
252 | TEST_F(KFDMemoryTest, MemoryAllocAll) { | |
253 | TEST_START(TESTPROFILE_RUNALL) | |
254 | ||
255 | int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); | |
256 | HsaMemFlags memFlags = {0}; | |
257 | memFlags.ui32.NonPaged = 1; // sys mem vs vram | |
258 | HSAuint64 available; | |
259 | void *object = NULL; | |
260 | int shrink = 21, success = HSAKMT_STATUS_NO_MEMORY; | |
261 | ||
262 | EXPECT_SUCCESS(hsaKmtAvailableMemory(defaultGPUNode, &available)); | |
263 | LOG() << "Available: " << available << " bytes" << std::endl; | |
264 | for (int i = 0; i < available >> shrink; i++) { | |
265 | HSAuint64 size = available - ((HSAuint64)i << shrink); | |
266 | if (hsaKmtAllocMemory(defaultGPUNode, size, memFlags, &object) == HSAKMT_STATUS_SUCCESS) { | |
267 | LOG() << "Allocated: " << size << " bytes" << std::endl; | |
268 | success = hsaKmtFreeMemory(object, available); /* NOTE(review): frees with 'available' although 'size' bytes were allocated -- confirm this is intended */ | |
269 | break; | |
270 | } | |
271 | } | |
272 | EXPECT_SUCCESS(success); | |
594 | 273 | TEST_END |
595 | 274 | }
596 | 275 | |
673 | 352 | ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode)); |
674 | 353 | |
675 | 354 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); |
676 | m_pIsaGen->GetCopyDwordIsa(isaBuffer); | |
355 | ||
356 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>())); | |
677 | 357 | |
678 | 358 | /* First submit just so the queues are not empty, and to get the |
679 | 359 | * TLB populated (in case we need to flush TLBs somewhere after |
854 | 534 | // Initialize the srcBuffer to some fixed value |
855 | 535 | srcMemBuffer.Fill(0x01010101); |
856 | 536 | |
857 | const char *pScratchCopyDword; | |
858 | if (m_FamilyId < FAMILY_AI) | |
859 | pScratchCopyDword = gfx8_ScratchCopyDword; | |
860 | else if (m_FamilyId < FAMILY_AL) | |
861 | pScratchCopyDword = gfx9_ScratchCopyDword; | |
862 | else if (m_FamilyId == FAMILY_AL) | |
863 | pScratchCopyDword = aldbrn_ScratchCopyDword; | |
864 | else | |
865 | pScratchCopyDword = gfx10_ScratchCopyDword; | |
866 | m_pIsaGen->CompileShader(pScratchCopyDword, "ScratchCopyDword", isaBuffer); | |
537 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ScratchCopyDwordIsa, isaBuffer.As<char*>())); | |
867 | 538 | |
868 | 539 | const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode); |
869 | 540 | |
1143 | 814 | TEST_END |
1144 | 815 | } |
1145 | 816 | |
817 | #define VRAM_ALLOCATION_ALIGN (1 << 21) //Align VRAM allocations to 2MB | |
1146 | 818 | TEST_F(KFDMemoryTest, MMBench) { |
1147 | 819 | TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); |
1148 | 820 | TEST_START(TESTPROFILE_RUNALL); |
1253 | 925 | memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB; |
1254 | 926 | memFlags.ui32.HostAccess = 0; |
1255 | 927 | memFlags.ui32.NonPaged = 1; |
1256 | /* Upper limit of buffer number to fit 90% vram size */ | |
1257 | bufLimit = ((vramSizeMB << 20) * 8 / 10) / bufSize ; | |
928 | ||
929 | /* Buffer sizes are 2MB aligned to match new allocation policy. | |
930 | * Upper limit of buffer number to fit 80% vram size. | |
931 | */ | |
932 | bufLimit = ((vramSizeMB << 20) * 8 / 10) / ALIGN_UP(bufSize, VRAM_ALLOCATION_ALIGN); | |
1258 | 933 | |
1259 | 934 | if (bufLimit == 0) |
1260 | 935 | continue; // skip when bufSize > vram |
1727 | 1402 | // dstBuffer is cpu accessible gtt memory |
1728 | 1403 | HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode); |
1729 | 1404 | |
1730 | const char *pScratchCopyDword; | |
1731 | if (m_FamilyId < FAMILY_AI) | |
1732 | pScratchCopyDword = gfx8_ScratchCopyDword; | |
1733 | else if (m_FamilyId < FAMILY_AL) | |
1734 | pScratchCopyDword = gfx9_ScratchCopyDword; | |
1735 | else if (m_FamilyId == FAMILY_AL) | |
1736 | pScratchCopyDword = aldbrn_ScratchCopyDword; | |
1737 | else | |
1738 | pScratchCopyDword = gfx10_ScratchCopyDword; | |
1739 | ||
1740 | m_pIsaGen->CompileShader(pScratchCopyDword, "ScratchCopyDword", isaBuffer); | |
1405 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ScratchCopyDwordIsa, isaBuffer.As<char*>())); | |
1406 | ||
1741 | 1407 | Dispatch dispatch0(isaBuffer); |
1742 | 1408 | dispatch0.SetArgs(mem0, dstBuffer.As<void*>()); |
1743 | 1409 | dispatch0.Submit(queue); |
1920 | 1586 | TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); |
1921 | 1587 | TEST_START(TESTPROFILE_RUNALL); |
1922 | 1588 | |
1923 | const unsigned nBufs = 1000; /* measure us, report ns */ | |
1589 | unsigned nBufs = 1000; /* measure us, report ns */ | |
1924 | 1590 | unsigned testIndex, sizeIndex, memType; |
1925 | 1591 | const unsigned nMemTypes = 2; |
1926 | 1592 | const char *memTypeStrings[nMemTypes] = {"SysMem", "VRAM"}; |
1965 | 1631 | unsigned memType = _TEST_MEMTYPE(testIndex); |
1966 | 1632 | HSAuint64 mcpRTime, mcpWTime, accessRTime, accessWTime; |
1967 | 1633 | HSAuint32 allocNode; |
1634 | unsigned bufLimit; | |
1968 | 1635 | |
1969 | 1636 | if ((testIndex & (nSizes-1)) == 0) |
1970 | 1637 | LOG() << "----------------------------------------------------------------------" << std::endl; |
1981 | 1648 | memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB; |
1982 | 1649 | memFlags.ui32.HostAccess = 1; |
1983 | 1650 | memFlags.ui32.NonPaged = 1; |
1651 | ||
1652 | /* Buffer sizes are 2MB aligned to match new allocation policy. | |
1653 | * Upper limit of buffer number to fit 80% vram size. | |
1654 | */ | |
1655 | bufLimit = ((vramSizeMB << 20) * 8 / 10) / ALIGN_UP(bufSize, VRAM_ALLOCATION_ALIGN); | |
1656 | if (bufLimit == 0) | |
1657 | continue; // skip when bufSize > vram | |
1658 | ||
1659 | /* When vram is too small to fit all the buffers, fill 80% vram size*/ | |
1660 | nBufs = std::min(nBufs , bufLimit); | |
1984 | 1661 | } |
1985 | 1662 | |
1986 | 1663 | for (i = 0; i < nBufs; i++) |
2108 | 1785 | PM4Queue queue; |
2109 | 1786 | ASSERT_SUCCESS(queue.Create(defaultGPUNode)); |
2110 | 1787 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); |
2111 | const char *pCopyOnSignal; | |
2112 | if (m_FamilyId < FAMILY_NV) | |
2113 | pCopyOnSignal = gfx9_CopyOnSignal; | |
2114 | else | |
2115 | pCopyOnSignal = gfx10_CopyOnSignal; | |
2116 | m_pIsaGen->CompileShader(pCopyOnSignal, "CopyOnSignal", isaBuffer); | |
1788 | ||
1789 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyOnSignalIsa, isaBuffer.As<char*>())); | |
1790 | ||
2117 | 1791 | Dispatch dispatch0(isaBuffer); |
2118 | 1792 | dispatch0.SetArgs(buffer, NULL); |
2119 | 1793 | dispatch0.Submit(queue); |
2233 | 1907 | PM4Queue queue; |
2234 | 1908 | ASSERT_SUCCESS(queue.Create(nodes[0])); |
2235 | 1909 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, nodes[0], true/*zero*/, false/*local*/, true/*exec*/); |
2236 | const char *pCopyOnSignal; | |
2237 | if (m_FamilyId < FAMILY_NV) | |
2238 | pCopyOnSignal = gfx9_CopyOnSignal; | |
2239 | else | |
2240 | pCopyOnSignal = gfx10_CopyOnSignal; | |
2241 | m_pIsaGen->CompileShader(pCopyOnSignal, "CopyOnSignal", isaBuffer); | |
1910 | ||
1911 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyOnSignalIsa, isaBuffer.As<char*>())); | |
1912 | ||
2242 | 1913 | Dispatch dispatch(isaBuffer); |
2243 | 1914 | dispatch.SetArgs(buffer, NULL); |
2244 | 1915 | dispatch.Submit(queue); |
2246 | 1917 | PM4Queue queue0; |
2247 | 1918 | ASSERT_SUCCESS(queue0.Create(nodes[1])); |
2248 | 1919 | HsaMemoryBuffer isaBuffer0(PAGE_SIZE, nodes[1], true/*zero*/, false/*local*/, true/*exec*/); |
2249 | const char *pWriteAndSignal; | |
2250 | if (m_FamilyId < FAMILY_NV) | |
2251 | pWriteAndSignal = gfx9_WriteAndSignal; | |
2252 | else | |
2253 | pWriteAndSignal = gfx10_WriteAndSignal; | |
2254 | m_pIsaGen->CompileShader(pWriteAndSignal, "WriteAndSignal", isaBuffer0); | |
1920 | ||
1921 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(WriteAndSignalIsa, isaBuffer0.As<char*>())); | |
1922 | ||
2255 | 1923 | Dispatch dispatch0(isaBuffer0); |
2256 | 1924 | dispatch0.SetArgs(buffer, mmioBase); |
2257 | 1925 | dispatch0.Submit(queue0); |
2303 | 1971 | PM4Queue queue; |
2304 | 1972 | ASSERT_SUCCESS(queue.Create(defaultGPUNode)); |
2305 | 1973 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); |
2306 | m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer); | |
1974 | ||
1975 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa, isaBuffer.As<char*>())); | |
1976 | ||
2307 | 1977 | Dispatch dispatch(isaBuffer); |
2308 | 1978 | dispatch.SetArgs(buffer.As<int*>(), buffer.As<int*>()+dwLocation); |
2309 | 1979 | dispatch.Submit(queue); |
2356 | 2026 | PM4Queue queue; |
2357 | 2027 | ASSERT_SUCCESS(queue.Create(defaultGPUNode)); |
2358 | 2028 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); |
2359 | m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer); | |
2029 | ||
2030 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa, isaBuffer.As<char*>())); | |
2031 | ||
2360 | 2032 | Dispatch dispatch(isaBuffer); |
2361 | 2033 | dispatch.SetArgs(buffer, buffer+100); |
2362 | 2034 | dispatch.Submit(queue); |
2418 | 2090 | PM4Queue queue; |
2419 | 2091 | ASSERT_SUCCESS(queue.Create(defaultGPUNode)); |
2420 | 2092 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); |
2421 | m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer); | |
2093 | ||
2094 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa, isaBuffer.As<char*>())); | |
2095 | ||
2422 | 2096 | Dispatch dispatch(isaBuffer); |
2423 | 2097 | dispatch.SetArgs(buffer.As<int*>(), buffer.As<int*>()+dwLocation); |
2424 | 2098 | dispatch.Submit(queue); |
2433 | 2107 | ASSERT_SUCCESS(queue1.Create(nondefaultNode)); |
2434 | 2108 | buffer.Fill(0x5678, sdmaQueue, dwLocation1*sizeof(int), 4); |
2435 | 2109 | HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/); |
2436 | m_pIsaGen->GetCopyDwordIsa(isaBuffer1); | |
2110 | ||
2111 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>())); | |
2112 | ||
2437 | 2113 | Dispatch dispatch1(isaBuffer1); |
2438 | 2114 | dispatch1.SetArgs(buffer.As<int*>()+dwLocation1, buffer.As<int*>()); |
2439 | 2115 | dispatch1.Submit(queue1); |
2499 | 2175 | PM4Queue queue; |
2500 | 2176 | ASSERT_SUCCESS(queue.Create(defaultGPUNode)); |
2501 | 2177 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); |
2502 | m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer); | |
2178 | ||
2179 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa, isaBuffer.As<char*>())); | |
2180 | ||
2503 | 2181 | Dispatch dispatch(isaBuffer); |
2504 | 2182 | dispatch.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwLocation); |
2505 | 2183 | dispatch.Submit(queue); |
2514 | 2192 | PM4Queue queue1; |
2515 | 2193 | ASSERT_SUCCESS(queue1.Create(nondefaultNode)); |
2516 | 2194 | HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/); |
2517 | m_pIsaGen->CompileShader(gfx9aldbrn_WriteFlagAndValue, "WriteMemory", isaBuffer1); | |
2195 | ||
2196 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(WriteFlagAndValueIsa, isaBuffer1.As<char*>())); | |
2197 | ||
2518 | 2198 | Dispatch dispatch1(isaBuffer1); |
2519 | 2199 | dispatch1.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwSource); |
2520 | 2200 | dispatch1.Submit(queue1); |
2568 | 2248 | PM4Queue queue; |
2569 | 2249 | ASSERT_SUCCESS(queue.Create(defaultGPUNode)); |
2570 | 2250 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); |
2571 | m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer); | |
2251 | ||
2252 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa, isaBuffer.As<char*>())); | |
2253 | ||
2572 | 2254 | Dispatch dispatch(isaBuffer); |
2573 | 2255 | dispatch.SetArgs(buffer, buffer+dwLocation); |
2574 | 2256 | dispatch.Submit(queue); |
2607 | 2289 | return; |
2608 | 2290 | } |
2609 | 2291 | |
2292 | int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); | |
2293 | const int dwLocation = 0x80; | |
2294 | ||
2295 | if (!m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) { | |
2296 | LOG() << "Skipping test: XGMI link to CPU is required." << std::endl; | |
2297 | return; | |
2298 | } | |
2299 | ||
2610 | 2300 | unsigned int *fineBuffer = NULL; |
2611 | 2301 | unsigned int tmp; |
2612 | ||
2613 | int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); | |
2614 | const int dwLocation = 0x80; | |
2615 | 2302 | |
2616 | 2303 | ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode /* system */, PAGE_SIZE, m_MemoryFlags, |
2617 | 2304 | reinterpret_cast<void**>(&fineBuffer))); |
2626 | 2313 | ASSERT_SUCCESS(queue.Create(defaultGPUNode)); |
2627 | 2314 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); |
2628 | 2315 | |
2629 | if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) | |
2630 | m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer); | |
2631 | else | |
2632 | m_pIsaGen->CompileShader(gfx9_PollAndCopy, "CopyMemory", isaBuffer); | |
2316 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa, isaBuffer.As<char*>())); | |
2633 | 2317 | |
2634 | 2318 | Dispatch dispatch(isaBuffer); |
2635 | 2319 | dispatch.SetArgs(fineBuffer, fineBuffer+dwLocation); |
21 | 21 | */ |
22 | 22 | |
23 | 23 | #include "KFDBaseComponentTest.hpp" |
24 | #include "IsaGenerator.hpp" | |
25 | 24 | |
26 | 25 | #ifndef __KFD_MEMORY_TEST__H__ |
27 | 26 | #define __KFD_MEMORY_TEST__H__ |
32 | 31 | */ |
33 | 32 | class KFDMemoryTest : public KFDBaseComponentTest { |
34 | 33 | public: |
35 | KFDMemoryTest(void) :m_pIsaGen(NULL) {} | |
34 | KFDMemoryTest(void) {} | |
36 | 35 | ~KFDMemoryTest(void) {} |
37 | 36 | protected: |
38 | 37 | virtual void SetUp(); |
39 | 38 | virtual void TearDown(); |
40 | 39 | |
41 | 40 | protected: |
42 | IsaGenerator* m_pIsaGen; | |
43 | ||
44 | 41 | void BinarySearchLargestBuffer(int allocNode, const HsaMemFlags &memFlags, |
45 | 42 | HSAuint64 highMB, int nodeToMap, |
46 | 43 | HSAuint64 *lastSizeMB); |
38 | 38 | |
39 | 39 | KFDBaseComponentTest::SetUp(); |
40 | 40 | |
41 | m_pIsaGen = IsaGenerator::Create(m_FamilyId); | |
42 | ||
43 | 41 | ROUTINE_END |
44 | 42 | } |
45 | 43 | |
46 | 44 | void KFDQMTest::TearDown() { |
47 | 45 | ROUTINE_START |
48 | ||
49 | if (m_pIsaGen) | |
50 | delete m_pIsaGen; | |
51 | m_pIsaGen = NULL; | |
52 | 46 | |
53 | 47 | KFDBaseComponentTest::TearDown(); |
54 | 48 | |
676 | 670 | TEST_END |
677 | 671 | } |
678 | 672 | |
679 | /* A simple isa loop program with dense mathematic operations | |
680 | * s1 controls the number iterations of the loop | |
681 | * This shader can be used by GFX8, GFX9 and GFX10 | |
682 | */ | |
683 | static const char *loop_isa = \ | |
684 | "\ | |
685 | shader loop_isa\n\ | |
686 | wave_size(32)\n\ | |
687 | type(CS)\n\ | |
688 | s_movk_i32 s0, 0x0008\n\ | |
689 | s_movk_i32 s1, 0x00ff\n\ | |
690 | v_mov_b32 v0, 0\n\ | |
691 | v_mov_b32 v1, 0\n\ | |
692 | v_mov_b32 v2, 0\n\ | |
693 | v_mov_b32 v3, 0\n\ | |
694 | v_mov_b32 v4, 0\n\ | |
695 | v_mov_b32 v5, 0\n\ | |
696 | v_mov_b32 v6, 0\n\ | |
697 | v_mov_b32 v7, 0\n\ | |
698 | v_mov_b32 v8, 0\n\ | |
699 | v_mov_b32 v9, 0\n\ | |
700 | v_mov_b32 v10, 0\n\ | |
701 | v_mov_b32 v11, 0\n\ | |
702 | v_mov_b32 v12, 0\n\ | |
703 | v_mov_b32 v13, 0\n\ | |
704 | v_mov_b32 v14, 0\n\ | |
705 | v_mov_b32 v15, 0\n\ | |
706 | v_mov_b32 v16, 0\n\ | |
707 | LOOP:\n\ | |
708 | s_mov_b32 s8, s4\n\ | |
709 | s_mov_b32 s9, s1\n\ | |
710 | s_mov_b32 s10, s6\n\ | |
711 | s_mov_b32 s11, s7\n\ | |
712 | s_cmp_le_i32 s1, s0\n\ | |
713 | s_cbranch_scc1 END_OF_PGM\n\ | |
714 | s_buffer_load_dwordx8 s[8:15], s[8:11], 0x10\n\ | |
715 | v_add_f32 v0, 2.0, v0\n\ | |
716 | v_cvt_f32_i32 v17, s1\n\ | |
717 | s_waitcnt lgkmcnt(0)\n\ | |
718 | v_add_f32 v18, s8, v17\n\ | |
719 | v_add_f32 v19, s9, v17\n\ | |
720 | v_add_f32 v20, s10, v17\n\ | |
721 | v_add_f32 v21, s11, v17\n\ | |
722 | v_add_f32 v22, s12, v17\n\ | |
723 | v_add_f32 v23, s13, v17\n\ | |
724 | v_add_f32 v24, s14, v17\n\ | |
725 | v_add_f32 v17, s15, v17\n\ | |
726 | v_log_f32 v25, v18\n\ | |
727 | v_mul_f32 v25, v22, v25\n\ | |
728 | v_exp_f32 v25, v25\n\ | |
729 | v_log_f32 v26, v19\n\ | |
730 | v_mul_f32 v26, v23, v26\n\ | |
731 | v_exp_f32 v26, v26\n\ | |
732 | v_log_f32 v27, v20\n\ | |
733 | v_mul_f32 v27, v24, v27\n\ | |
734 | v_exp_f32 v27, v27\n\ | |
735 | v_log_f32 v28, v21\n\ | |
736 | v_mul_f32 v28, v17, v28\n\ | |
737 | v_exp_f32 v28, v28\n\ | |
738 | v_add_f32 v5, v5, v25\n\ | |
739 | v_add_f32 v6, v6, v26\n\ | |
740 | v_add_f32 v7, v7, v27\n\ | |
741 | v_add_f32 v8, v8, v28\n\ | |
742 | v_mul_f32 v18, 0x3fb8aa3b, v18\n\ | |
743 | v_exp_f32 v18, v18\n\ | |
744 | v_mul_f32 v19, 0x3fb8aa3b, v19\n\ | |
745 | v_exp_f32 v19, v19\n\ | |
746 | v_mul_f32 v20, 0x3fb8aa3b, v20\n\ | |
747 | v_exp_f32 v20, v20\n\ | |
748 | v_mul_f32 v21, 0x3fb8aa3b, v21\n\ | |
749 | v_exp_f32 v21, v21\n\ | |
750 | v_add_f32 v9, v9, v18\n\ | |
751 | v_add_f32 v10, v10, v19\n\ | |
752 | v_add_f32 v11, v11, v20\n\ | |
753 | v_add_f32 v12, v12, v21\n\ | |
754 | v_sqrt_f32 v18, v22\n\ | |
755 | v_sqrt_f32 v19, v23\n\ | |
756 | v_sqrt_f32 v20, v24\n\ | |
757 | v_sqrt_f32 v21, v17\n\ | |
758 | v_add_f32 v13, v13, v18\n\ | |
759 | v_add_f32 v14, v14, v19\n\ | |
760 | v_add_f32 v15, v15, v20\n\ | |
761 | v_add_f32 v16, v16, v21\n\ | |
762 | v_rsq_f32 v18, v22\n\ | |
763 | v_rsq_f32 v19, v23\n\ | |
764 | v_rsq_f32 v20, v24\n\ | |
765 | v_rsq_f32 v17, v17\n\ | |
766 | v_add_f32 v1, v1, v18\n\ | |
767 | v_add_f32 v2, v2, v19\n\ | |
768 | v_add_f32 v3, v3, v20\n\ | |
769 | v_add_f32 v4, v4, v17\n\ | |
770 | s_add_u32 s0, s0, 1\n\ | |
771 | s_branch LOOP\n\ | |
772 | END_OF_PGM:\n\ | |
773 | s_endpgm\n\ | |
774 | end\n\ | |
775 | "; | |
776 | ||
777 | 673 | HSAint64 KFDQMTest::TimeConsumedwithCUMask(int node, uint32_t* mask, uint32_t mask_count) { |
778 | 674 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/); |
779 | 675 | HsaMemoryBuffer dstBuffer(PAGE_SIZE, node, true, false, false); |
780 | 676 | HsaMemoryBuffer ctlBuffer(PAGE_SIZE, node, true, false, false); |
781 | 677 | |
782 | m_pIsaGen = IsaGenerator::Create(m_FamilyId); | |
783 | m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer); | |
678 | EXPECT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As<char*>())); | |
784 | 679 | |
785 | 680 | Dispatch dispatch(isaBuffer); |
786 | 681 | dispatch.SetDim(1024, 16, 16); |
837 | 732 | TEST_START(TESTPROFILE_RUNALL); |
838 | 733 | int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); |
839 | 734 | ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; |
840 | m_pIsaGen = IsaGenerator::Create(m_FamilyId); | |
841 | 735 | |
842 | 736 | if (m_FamilyId >= FAMILY_VI) { |
843 | 737 | const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode); |
981 | 875 | HSAint32 *syncBuffer = syncBuf.As<HSAint32*>(); |
982 | 876 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/); |
983 | 877 | |
984 | m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer); | |
878 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As<char*>())); | |
985 | 879 | |
986 | 880 | Dispatch dispatch[2] = { |
987 | 881 | Dispatch(isaBuffer, true), |
1046 | 940 | HSAint32 *syncBuffer = syncBuf.As<HSAint32*>(); |
1047 | 941 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/); |
1048 | 942 | |
1049 | m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer); | |
943 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As<char*>())); | |
1050 | 944 | |
1051 | 945 | Dispatch dispatch[2] = { |
1052 | 946 | Dispatch(isaBuffer, true), |
1139 | 1033 | |
1140 | 1034 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); |
1141 | 1035 | |
1142 | m_pIsaGen->GetNoopIsa(isaBuffer); | |
1036 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(NoopIsa, isaBuffer.As<char*>())); | |
1143 | 1037 | |
1144 | 1038 | SyncDispatch(isaBuffer, NULL, NULL); |
1145 | 1039 | |
1158 | 1052 | |
1159 | 1053 | srcBuffer.Fill(0x01010101); |
1160 | 1054 | |
1161 | m_pIsaGen->GetCopyDwordIsa(isaBuffer); | |
1055 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>())); | |
1162 | 1056 | |
1163 | 1057 | SyncDispatch(isaBuffer, srcBuffer.As<void*>(), destBuffer.As<void*>()); |
1164 | 1058 | |
1193 | 1087 | |
1194 | 1088 | destBuffer.Fill(0xFF); |
1195 | 1089 | |
1196 | m_pIsaGen->GetCopyDwordIsa(isaBuffer); | |
1090 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>())); | |
1197 | 1091 | |
1198 | 1092 | for (i = 0; i < MAX_CP_QUEUES; ++i) |
1199 | 1093 | ASSERT_SUCCESS(queues[i].Create(defaultGPUNode)) << " QueueId=" << i; |
1532 | 1426 | |
1533 | 1427 | PM4Queue queue; |
1534 | 1428 | |
1535 | m_pIsaGen->GetAtomicIncIsa(isaBuf); | |
1429 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(AtomicIncIsa, isaBuf.As<char*>())); | |
1536 | 1430 | |
1537 | 1431 | Dispatch dispatch(isaBuf); |
1538 | 1432 | dispatch.SetArgs(destBuf.As<void*>(), NULL); |
1597 | 1491 | |
1598 | 1492 | srcNodeMem.Fill(0x05050505); |
1599 | 1493 | |
1600 | m_pIsaGen->GetCopyDwordIsa(isaBufferSrc); | |
1494 | ASSERT_SUCCESS(m_pAsm->RunAssemble(CopyDwordIsa)); | |
1495 | ||
1496 | m_pAsm->CopyInstrStream(isaBufferSrc.As<char*>()); | |
1601 | 1497 | SyncDispatch(isaBufferSrc, srcNodeMem.As<void*>(), shared_addr.As<void *>(), src_node); |
1602 | 1498 | |
1603 | m_pIsaGen->GetCopyDwordIsa(isaBufferDst); | |
1499 | m_pAsm->CopyInstrStream(isaBufferDst.As<char*>()); | |
1604 | 1500 | SyncDispatch(isaBufferDst, shared_addr.As<void *>(), dstNodeMem.As<void*>(), dst_node); |
1605 | 1501 | |
1606 | 1502 | EXPECT_EQ(dstNodeMem.As<unsigned int*>()[0], 0x05050505); |
26 | 26 | #include <gtest/gtest.h> |
27 | 27 | |
28 | 28 | #include "PM4Queue.hpp" |
29 | #include "IsaGenerator.hpp" | |
30 | 29 | #include "KFDBaseComponentTest.hpp" |
31 | 30 | #include "Dispatch.hpp" |
32 | 31 | |
33 | 32 | class KFDQMTest : public KFDBaseComponentTest { |
34 | 33 | public: |
35 | KFDQMTest():m_pIsaGen(NULL) {} | |
34 | KFDQMTest() {} | |
36 | 35 | |
37 | 36 | ~KFDQMTest() {} |
38 | 37 | |
48 | 47 | const double CuVariance = 0.15; |
49 | 48 | const double CuNegVariance = 1.0 - CuVariance; |
50 | 49 | const double CuPosVariance = 1.0 + CuVariance; |
51 | IsaGenerator* m_pIsaGen; | |
52 | 50 | }; |
53 | 51 | |
54 | 52 | #endif // __KFD_QCM_TEST__H__ |
87 | 87 | |
88 | 88 | for (HSAuint32 i = 0; i < count; i++) { |
89 | 89 | m_pBuf = mmap(0, vramBufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); |
90 | EXPECT_NOTNULL(m_pBuf); | |
90 | ASSERT_NE(MAP_FAILED, m_pBuf); | |
91 | 91 | |
92 | 92 | m_Flags = (HSA_SVM_FLAGS)0; |
93 | 93 | retry: |
231 | 231 | WaitChildProcesses(); |
232 | 232 | |
233 | 233 | TEST_END |
234 | } | |
235 | ||
236 | /* Shader to read local buffers using multiple wavefronts in parallel | |
237 | * until address buffer is filled with specific value 0x5678 by host program, | |
238 | * then each wavefront fills value 0x5678 at corresponding result buffer and quit | |
239 | * | |
240 | * initial state: | |
241 | * s[0:1] - address buffer base address | |
242 | * s[2:3] - result buffer base address | |
243 | * s4 - workgroup id | |
244 | * v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1 | |
245 | * registers: | |
246 | * v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X | |
247 | * v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8 | |
248 | * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 | |
249 | * v[6:7] - local buf address used for read test | |
250 | */ | |
251 | static const char* gfx9_ReadMemory = | |
252 | "\ | |
253 | shader ReadMemory\n\ | |
254 | type(CS)\n\ | |
255 | \n\ | |
256 | // compute address of corresponding output buffer\n\ | |
257 | v_mov_b32 v0, s4 // use workgroup id as index\n\ | |
258 | v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\ | |
259 | v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\ | |
260 | v_mov_b32 v5, s3\n\ | |
261 | v_add_u32 v5, vcc_lo, v5\n\ | |
262 | \n\ | |
263 | // compute input buffer offset used to store corresponding local buffer address\n\ | |
264 | v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\ | |
265 | v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\ | |
266 | v_mov_b32 v3, s1\n\ | |
267 | v_add_u32 v3, vcc_lo, v3\n\ | |
268 | \n\ | |
269 | // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\ | |
270 | flat_load_dwordx2 v[6:7], v[2:3] slc\n\ | |
271 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ | |
272 | \n\ | |
273 | v_mov_b32 v8, 0x5678\n\ | |
274 | s_movk_i32 s8, 0x5678\n\ | |
275 | L_REPEAT:\n\ | |
276 | s_load_dword s16, s[0:1], 0x0 glc\n\ | |
277 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ | |
278 | s_cmp_eq_i32 s16, s8\n\ | |
279 | s_cbranch_scc1 L_QUIT // if notified to quit by host\n\ | |
280 | // loop read 64M local buffer starting at v[6:7]\n\ | |
281 | // every 4k page only read once\n\ | |
282 | v_mov_b32 v9, 0\n\ | |
283 | v_mov_b32 v10, 0x1000 // 4k page\n\ | |
284 | v_mov_b32 v11, 0x4000000 // 64M size\n\ | |
285 | v_mov_b32 v12, v6\n\ | |
286 | v_mov_b32 v13, v7\n\ | |
287 | L_LOOP_READ:\n\ | |
288 | flat_load_dwordx2 v[14:15], v[12:13] slc\n\ | |
289 | v_add_u32 v9, v9, v10 \n\ | |
290 | v_add_co_u32 v12, vcc, v12, v10\n\ | |
291 | v_add_u32 v13, vcc_lo, v13\n\ | |
292 | v_cmp_lt_u32 vcc, v9, v11\n\ | |
293 | s_cbranch_vccnz L_LOOP_READ\n\ | |
294 | s_branch L_REPEAT\n\ | |
295 | L_QUIT:\n\ | |
296 | flat_store_dword v[4:5], v8\n\ | |
297 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\ | |
298 | s_endpgm\n\ | |
299 | end\n\ | |
300 | "; | |
301 | ||
302 | static const char* gfx8_ReadMemory = | |
303 | "\ | |
304 | shader ReadMemory\n\ | |
305 | asic(VI)\n\ | |
306 | type(CS)\n\ | |
307 | \n\ | |
308 | // compute address of corresponding output buffer\n\ | |
309 | v_mov_b32 v0, s4 // use workgroup id as index\n\ | |
310 | v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\ | |
311 | v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\ | |
312 | v_mov_b32 v5, s3\n\ | |
313 | v_addc_u32 v5, vcc, v5, 0, vcc\n\ | |
314 | \n\ | |
315 | // compute input buffer offset used to store corresponding local buffer address\n\ | |
316 | v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\ | |
317 | v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\ | |
318 | v_mov_b32 v3, s1\n\ | |
319 | v_addc_u32 v3, vcc, v3, 0, vcc\n\ | |
320 | \n\ | |
321 | // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\ | |
322 | flat_load_dwordx2 v[6:7], v[2:3] slc\n\ | |
323 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ | |
324 | \n\ | |
325 | v_mov_b32 v8, 0x5678\n\ | |
326 | s_movk_i32 s8, 0x5678\n\ | |
327 | L_REPEAT:\n\ | |
328 | s_load_dword s16, s[0:1], 0x0 glc\n\ | |
329 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ | |
330 | s_cmp_eq_i32 s16, s8\n\ | |
331 | s_cbranch_scc1 L_QUIT // if notified to quit by host\n\ | |
332 | // loop read 64M local buffer starting at v[6:7]\n\ | |
333 | // every 4k page only read once\n\ | |
334 | v_mov_b32 v9, 0\n\ | |
335 | v_mov_b32 v10, 0x1000 // 4k page\n\ | |
336 | v_mov_b32 v11, 0x4000000 // 64M size\n\ | |
337 | v_mov_b32 v12, v6\n\ | |
338 | v_mov_b32 v13, v7\n\ | |
339 | L_LOOP_READ:\n\ | |
340 | flat_load_dwordx2 v[14:15], v[12:13] slc\n\ | |
341 | v_add_u32 v9, vcc, v9, v10 \n\ | |
342 | v_add_u32 v12, vcc, v12, v10\n\ | |
343 | v_addc_u32 v13, vcc, v13, 0, vcc\n\ | |
344 | v_cmp_lt_u32 vcc, v9, v11\n\ | |
345 | s_cbranch_vccnz L_LOOP_READ\n\ | |
346 | s_branch L_REPEAT\n\ | |
347 | L_QUIT:\n\ | |
348 | flat_store_dword v[4:5], v8\n\ | |
349 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\ | |
350 | s_endpgm\n\ | |
351 | end\n\ | |
352 | "; | |
353 | ||
354 | std::string KFDSVMEvictTest::CreateShader() { | |
355 | if (m_FamilyId >= FAMILY_AI) | |
356 | return gfx9_ReadMemory; | |
357 | else | |
358 | return gfx8_ReadMemory; | |
359 | 234 | } |
360 | 235 | |
361 | 236 | /* Evict and restore queue test |
433 | 308 | for (i = 0; i < wavefront_num; i++) |
434 | 309 | *(localBufAddr + i) = pBuffers[i]; |
435 | 310 | |
436 | m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer); | |
311 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ReadMemoryIsa, isaBuffer.As<char*>())); | |
437 | 312 | |
438 | 313 | PM4Queue pm4Queue; |
439 | 314 | ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode)); |
27 | 27 | #include <vector> |
28 | 28 | #include "KFDLocalMemoryTest.hpp" |
29 | 29 | #include "KFDBaseComponentTest.hpp" |
30 | #include "IsaGenerator.hpp" | |
31 | 30 | |
32 | 31 | // @class KFDEvictTest |
33 | 32 | // Test eviction and restore procedure using two processes |
20 | 20 | * |
21 | 21 | */ |
22 | 22 | #include "KFDSVMRangeTest.hpp" |
23 | #include <poll.h> | |
23 | 24 | #include <sys/mman.h> |
24 | 25 | #include <vector> |
25 | 26 | #include "PM4Queue.hpp" |
33 | 34 | |
34 | 35 | KFDBaseComponentTest::SetUp(); |
35 | 36 | |
36 | m_pIsaGen = IsaGenerator::Create(m_FamilyId); | |
37 | ||
38 | 37 | SVMSetXNACKMode(); |
39 | 38 | |
40 | 39 | ROUTINE_END |
42 | 41 | |
43 | 42 | void KFDSVMRangeTest::TearDown() { |
44 | 43 | ROUTINE_START |
45 | ||
46 | if (m_pIsaGen) | |
47 | delete m_pIsaGen; | |
48 | m_pIsaGen = NULL; | |
49 | 44 | |
50 | 45 | SVMRestoreXNACKMode(); |
51 | 46 | |
79 | 74 | |
80 | 75 | srcSysBuffer.Fill(0x01010101); |
81 | 76 | |
82 | m_pIsaGen->GetCopyDwordIsa(isaBuffer); | |
77 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>())); | |
83 | 78 | |
84 | 79 | ASSERT_SUCCESS(queue.Create(defaultGPUNode)); |
85 | 80 | queue.SetSkipWaitConsump(0); |
363 | 358 | ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode)); |
364 | 359 | |
365 | 360 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); |
366 | m_pIsaGen->GetCopyDwordIsa(isaBuffer); | |
361 | ||
362 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>())); | |
367 | 363 | |
368 | 364 | Dispatch dispatch0(isaBuffer); |
369 | 365 | dispatch0.SetArgs(srcBuffer.As<void*>(), dstBuffer.As<void*>()); |
457 | 453 | |
458 | 454 | munmap(pBuf2, Buf2Size); |
459 | 455 | |
460 | m_pIsaGen->GetCopyDwordIsa(isaBuffer); | |
456 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>())); | |
457 | ||
461 | 458 | ASSERT_SUCCESS(queue.Create(defaultGPUNode)); |
462 | 459 | |
463 | 460 | Dispatch dispatch(isaBuffer); |
506 | 503 | |
507 | 504 | srcSysBuffer.Fill(0x01010101); |
508 | 505 | |
509 | m_pIsaGen->GetCopyDwordIsa(isaBuffer); | |
506 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>())); | |
510 | 507 | |
511 | 508 | ASSERT_SUCCESS(queue.Create(defaultGPUNode)); |
512 | 509 | queue.SetSkipWaitConsump(0); |
942 | 939 | #ifdef USE_PM4_QUEUE_TRIGGER_VM_FAULT |
943 | 940 | HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode); |
944 | 941 | PM4Queue queue; |
945 | m_pIsaGen->GetCopyDwordIsa(isaBuffer); | |
942 | ||
943 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>())); | |
944 | ||
946 | 945 | ASSERT_SUCCESS(queue.Create(defaultGPUNode)); |
947 | 946 | |
948 | 947 | for (HSAuint64 i = 0; i < BufferSize / 8; i += 512) { |
998 | 997 | return; |
999 | 998 | } |
1000 | 999 | |
1001 | const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU(); | |
1000 | const std::vector<int> gpuNodesAll = m_NodeInfo.GetNodesWithGPU(); | |
1001 | std::vector<int> gpuNodes; | |
1002 | ||
1003 | for (int i : gpuNodesAll) { | |
1004 | const HsaNodeProperties *pNodeProperties; | |
1005 | ||
1006 | pNodeProperties = m_NodeInfo.GetNodeProperties(gpuNodesAll.at(i)); | |
1007 | if (pNodeProperties->Capability.ui32.SVMAPISupported) | |
1008 | gpuNodes.push_back(gpuNodesAll.at(i)); | |
1009 | } | |
1002 | 1010 | if (gpuNodes.size() < 2) { |
1003 | LOG() << "Skipping test: at least two GPUs needed." << std::endl; | |
1011 | LOG() << "Skipping test: at least two SVM supported GPUs needed." << std::endl; | |
1004 | 1012 | return; |
1005 | 1013 | } |
1006 | 1014 | |
1073 | 1081 | return; |
1074 | 1082 | } |
1075 | 1083 | |
1076 | const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU(); | |
1084 | const std::vector<int> gpuNodesAll = m_NodeInfo.GetNodesWithGPU(); | |
1085 | std::vector<int> gpuNodes; | |
1086 | ||
1087 | for (int i : gpuNodesAll) { | |
1088 | const HsaNodeProperties *pNodeProperties; | |
1089 | ||
1090 | pNodeProperties = m_NodeInfo.GetNodeProperties(gpuNodesAll.at(i)); | |
1091 | if (pNodeProperties->Capability.ui32.SVMAPISupported) | |
1092 | gpuNodes.push_back(gpuNodesAll.at(i)); | |
1093 | } | |
1077 | 1094 | if (gpuNodes.size() < 2) { |
1078 | LOG() << "Skipping test: at least two GPUs needed." << std::endl; | |
1095 | LOG() << "Skipping test: at least two SVM supported GPUs needed." << std::endl; | |
1079 | 1096 | return; |
1080 | 1097 | } |
1081 | 1098 | |
1236 | 1253 | ASSERT_EQ(size, write(fd, buf, size)); |
1237 | 1254 | |
1238 | 1255 | void *MmapedFile = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); |
1239 | ASSERT_NE(nullptr, MmapedFile); | |
1256 | ASSERT_NE(MAP_FAILED, MmapedFile); | |
1240 | 1257 | |
1241 | 1258 | HsaSVMRange filebackedRange(MmapedFile, size, defaultGPUNode, defaultGPUNode); |
1242 | 1259 | |
1384 | 1401 | |
1385 | 1402 | TEST_END |
1386 | 1403 | } |
1404 | ||
1405 | /* | |
1406 | * Test SMI HMM SVM profiling event | |
1407 | * Use separate thread to read event the same way as ROCr and ROCProfiler | |
1408 | */ | |
1409 | struct ReadEventThreadParams { | |
1410 | int nodeid; | |
1411 | HSAuint64 *pBuf; | |
1412 | int BufSize; | |
1413 | pthread_barrier_t *barrier; | |
1414 | }; | |
1415 | ||
1416 | unsigned int ReadSMIEventThread(void* p) { | |
1417 | struct ReadEventThreadParams *pArgs = (struct ReadEventThreadParams *)p; | |
1418 | char msg[HSA_SMI_EVENT_MSG_SIZE]; | |
1419 | struct pollfd fds = {0}; | |
1420 | HSAuint64 events; | |
1421 | int fd; | |
1422 | ||
1423 | EXPECT_SUCCESS(hsaKmtOpenSMI(pArgs->nodeid, &fd)); | |
1424 | events = HSA_SMI_EVENT_MASK_FROM_INDEX(HSA_SMI_EVENT_INDEX_MAX) - 1; | |
1425 | EXPECT_EQ(write(fd, &events, sizeof(events)), sizeof(events)); | |
1426 | ||
1427 | pthread_barrier_wait(pArgs->barrier); | |
1428 | ||
1429 | fds.fd = fd; | |
1430 | fds.events = POLLIN; | |
1431 | EXPECT_GE(poll(&fds, 1, 1000), 0); | |
1432 | ||
1433 | memset(msg, 0, sizeof(msg)); | |
1434 | EXPECT_GE(read(fd, msg, HSA_SMI_EVENT_MSG_SIZE), 0); | |
1435 | ||
1436 | int event_id, pid, size, trigger, unused; | |
1437 | HSAuint64 timestamp; | |
1438 | HSAuint64 addr; | |
1439 | EXPECT_EQ(sscanf(msg, "%x %ld -%d @%lx(%d) %d->%x %x:%d %d\n", &event_id, ×tamp, &pid, | |
1440 | &addr, &size, &unused, &unused, &unused, &unused, &trigger), 10); | |
1441 | EXPECT_EQ(event_id, HSA_SMI_EVENT_MIGRATE_START); | |
1442 | EXPECT_EQ((HSAuint64 *)(addr << PAGE_SHIFT), pArgs->pBuf); | |
1443 | EXPECT_EQ(size << PAGE_SHIFT, pArgs->BufSize); | |
1444 | EXPECT_EQ(pid, getpid()); | |
1445 | EXPECT_EQ(trigger, HSA_MIGRATE_TRIGGER_PREFETCH); | |
1446 | close(fd); | |
1447 | return 0; | |
1448 | } | |
1449 | ||
1450 | TEST_F(KFDSVMRangeTest, HMMProfilingEvent) { | |
1451 | TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); | |
1452 | TEST_START(TESTPROFILE_RUNALL); | |
1453 | ||
1454 | if (!SVMAPISupported()) | |
1455 | return; | |
1456 | ||
1457 | if (m_VersionInfo.KernelInterfaceMinorVersion < 10) | |
1458 | return; | |
1459 | ||
1460 | int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); | |
1461 | ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; | |
1462 | ||
1463 | if (!GetVramSize(defaultGPUNode)) { | |
1464 | LOG() << "Skipping test: No VRAM found." << std::endl; | |
1465 | return; | |
1466 | } | |
1467 | ||
1468 | pthread_barrier_t barrier; | |
1469 | ASSERT_SUCCESS(pthread_barrier_init(&barrier, NULL, 2)); | |
1470 | ||
1471 | int BufSize = 16 << 10; | |
1472 | HsaSVMRange SysBuffer(BufSize, defaultGPUNode); | |
1473 | HSAuint64 *pBuf = SysBuffer.As<HSAuint64 *>(); | |
1474 | ||
1475 | struct ReadEventThreadParams pArgs = {defaultGPUNode, pBuf, BufSize, &barrier}; | |
1476 | uint64_t threadId; | |
1477 | ASSERT_EQ(true, StartThread(&ReadSMIEventThread, &pArgs, threadId)); | |
1478 | ||
1479 | pthread_barrier_wait(&barrier); | |
1480 | ||
1481 | EXPECT_SUCCESS(SVMRangePrefetchToNode(pBuf, BufSize, defaultGPUNode)); | |
1482 | ||
1483 | WaitForThread(threadId); | |
1484 | ||
1485 | TEST_END | |
1486 | } | |
1487 | ||
1488 | /* | |
1489 | * Test SVM support VRAM overcommitment | |
1490 | * | |
1491 | * Prefetch total VRAM size plus overCommitSize SVM range to VRAM. after VRAM is full, | |
1492 | * KFD should support VRAM overcommitment by evicting SVM ranges to system memory to alloc | |
1493 | * VRAM for new ranges. | |
1494 | */ | |
1495 | TEST_F(KFDSVMRangeTest, VramOvercommitTest) { | |
1496 | TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); | |
1497 | TEST_START(TESTPROFILE_RUNALL); | |
1498 | ||
1499 | if (!SVMAPISupported()) | |
1500 | return; | |
1501 | ||
1502 | int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); | |
1503 | ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; | |
1504 | ||
1505 | if (m_FamilyId < FAMILY_AI) { | |
1506 | LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "." << std::endl; | |
1507 | return; | |
1508 | } | |
1509 | ||
1510 | HSAuint64 vramSize = GetVramSize(defaultGPUNode); | |
1511 | if (!vramSize) { | |
1512 | LOG() << "Skipping test: No VRAM found." << std::endl; | |
1513 | return; | |
1514 | } | |
1515 | ||
1516 | unsigned long overCommitSize = 1UL << 30; | |
1517 | ||
1518 | /* With XNACK off, KFD checks that all SVM memory will fit into system memory */ | |
1519 | if (vramSize + overCommitSize > GetSysMemSize() / 2) { | |
1520 | LOG() << "Skipping test: no enough system memory." << std::endl; | |
1521 | return; | |
1522 | } | |
1523 | ||
1524 | unsigned long BufSize = 512UL << 20; | |
1525 | unsigned long numBufs = (vramSize + overCommitSize) / BufSize; | |
1526 | HSAKMT_STATUS ret; | |
1527 | ||
1528 | void *pBuf[numBufs]; | |
1529 | unsigned long i; | |
1530 | ||
1531 | for (i = 0; i < numBufs; i++) { | |
1532 | pBuf[i] = mmap(0, BufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); | |
1533 | ASSERT_NE(MAP_FAILED, pBuf[i]); | |
1534 | ||
1535 | ret = RegisterSVMRange(defaultGPUNode, pBuf[i], BufSize, defaultGPUNode, 0); | |
1536 | if (ret != HSAKMT_STATUS_SUCCESS) | |
1537 | break; | |
1538 | } | |
1539 | ||
1540 | EXPECT_EQ(numBufs, i); | |
1541 | ||
1542 | while (i--) | |
1543 | munmap(pBuf[i], BufSize); | |
1544 | ||
1545 | TEST_END | |
1546 | } | |
1547 | ||
1548 | /* | |
1549 | * Test SVM support VRAM overcommitment | |
1550 | * | |
1551 | * Prefetch giant overcommit SVM range to VRAM, KFD should support VRAM overcommitment | |
1552 | * by spliting giant range into smaller ranges, evicting SVM ranges to system memory to | |
1553 | * alloc VRAM for overcommitment ranges. | |
1554 | */ | |
1555 | TEST_F(KFDSVMRangeTest, VramOvercommitGiantRangeTest) { | |
1556 | TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); | |
1557 | TEST_START(TESTPROFILE_RUNALL); | |
1558 | ||
1559 | if (!SVMAPISupported()) | |
1560 | return; | |
1561 | ||
1562 | int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); | |
1563 | ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; | |
1564 | ||
1565 | if (m_FamilyId < FAMILY_AI) { | |
1566 | LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "." << std::endl; | |
1567 | return; | |
1568 | } | |
1569 | ||
1570 | HSAuint64 vramSize = GetVramSize(defaultGPUNode); | |
1571 | if (!vramSize) { | |
1572 | LOG() << "Skipping test: No VRAM found." << std::endl; | |
1573 | return; | |
1574 | } | |
1575 | ||
1576 | unsigned long overCommitSize = 1UL << 30; | |
1577 | ||
1578 | /* With XNACK off, KFD checks that all SVM memory will fit into system memory */ | |
1579 | if (vramSize + overCommitSize > GetSysMemSize() / 2) { | |
1580 | LOG() << "Skipping test: no enough system memory." << std::endl; | |
1581 | return; | |
1582 | } | |
1583 | ||
1584 | unsigned long BufSize = vramSize + overCommitSize; | |
1585 | HSAKMT_STATUS ret; | |
1586 | void *pBuf; | |
1587 | ||
1588 | pBuf = mmap(0, BufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); | |
1589 | ASSERT_NE(MAP_FAILED, pBuf); | |
1590 | ||
1591 | ret = RegisterSVMRange(defaultGPUNode, pBuf, BufSize, defaultGPUNode, 0); | |
1592 | EXPECT_EQ (HSAKMT_STATUS_SUCCESS, ret); | |
1593 | ||
1594 | munmap(pBuf, BufSize); | |
1595 | TEST_END | |
1596 | } | |
1597 | ||
1598 | /* | |
1599 | * Test partial range prefault | |
1600 | * | |
1601 | * mmap alloc 4 pages range, memset middle 2 pages, prefetch entire range to VRAM, | |
1602 | * use sdma to memset the remaining 2 pages, each page has a different value 0x1, 0x2, 0x3, 0x4 | |
1603 | * then check if all pages have the specific value after migrating 4 pages to system memory. | |
1604 | */ | |
1605 | TEST_F(KFDSVMRangeTest, PrefaultPartialRangeTest) { | |
1606 | TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); | |
1607 | TEST_START(TESTPROFILE_RUNALL); | |
1608 | ||
1609 | if (!SVMAPISupported()) | |
1610 | return; | |
1611 | ||
1612 | int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); | |
1613 | ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; | |
1614 | ||
1615 | if (m_FamilyId < FAMILY_AI) { | |
1616 | LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "." << std::endl; | |
1617 | return; | |
1618 | } | |
1619 | ||
1620 | unsigned long BufSize = 4 * PAGE_SIZE; | |
1621 | HSAKMT_STATUS ret; | |
1622 | char *pBuf; | |
1623 | ||
1624 | pBuf = (char *)mmap(0, BufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); | |
1625 | ASSERT_NE(MAP_FAILED, pBuf); | |
1626 | ||
1627 | memset(pBuf + PAGE_SIZE, 0x2, PAGE_SIZE); | |
1628 | memset(pBuf + 2 * PAGE_SIZE, 0x3, PAGE_SIZE); | |
1629 | ||
1630 | EXPECT_SUCCESS(RegisterSVMRange(defaultGPUNode, pBuf, BufSize, 0, 0)); | |
1631 | EXPECT_SUCCESS(SVMRangePrefetchToNode(pBuf, BufSize, defaultGPUNode)); | |
1632 | ||
1633 | SDMAQueue sdmaQueue; | |
1634 | EXPECT_SUCCESS(sdmaQueue.Create(defaultGPUNode)); | |
1635 | ||
1636 | sdmaQueue.PlaceAndSubmitPacket(SDMAFillDataPacket(sdmaQueue.GetFamilyId(), | |
1637 | pBuf, 0x01010101, PAGE_SIZE)); | |
1638 | sdmaQueue.PlaceAndSubmitPacket(SDMAFillDataPacket(sdmaQueue.GetFamilyId(), | |
1639 | pBuf + 3 * PAGE_SIZE, 0x04040404, PAGE_SIZE)); | |
1640 | sdmaQueue.Wait4PacketConsumption(); | |
1641 | ||
1642 | EXPECT_SUCCESS(sdmaQueue.Destroy()); | |
1643 | ||
1644 | for (int i = 0; i < 4; i++) | |
1645 | EXPECT_EQ(pBuf[i * PAGE_SIZE], i + 1); | |
1646 | ||
1647 | munmap(pBuf, BufSize); | |
1648 | TEST_END | |
1649 | } |
25 | 25 | |
26 | 26 | #include <gtest/gtest.h> |
27 | 27 | |
28 | #include "IsaGenerator.hpp" | |
29 | 28 | #include "KFDBaseComponentTest.hpp" |
30 | 29 | |
31 | 30 | class KFDSVMRangeTest : public KFDBaseComponentTest { |
32 | 31 | public: |
33 | KFDSVMRangeTest() :m_pIsaGen(NULL) {} | |
32 | KFDSVMRangeTest() {} | |
34 | 33 | ~KFDSVMRangeTest() {} |
35 | 34 | void SplitRangeTest(int defaultGPUNode, int prefetch_location); |
36 | 35 | |
37 | 36 | protected: |
38 | 37 | virtual void SetUp(); |
39 | 38 | virtual void TearDown(); |
40 | ||
41 | protected: // Members | |
42 | IsaGenerator* m_pIsaGen; | |
43 | 39 | }; |
44 | 40 | |
45 | 41 | #endif // __KFD_LOCALMEMORY_TEST__H__ |
51 | 51 | |
52 | 52 | enum KfdFamilyId { |
53 | 53 | FAMILY_UNKNOWN = 0, |
54 | FAMILY_CI, // Sea Islands: Hawaii (P), Maui (P), Bonaire (M) | |
55 | FAMILY_KV, // Fusion Kaveri: Spectre, Spooky; Fusion Kabini: Kalindi | |
56 | FAMILY_VI, // Volcanic Islands: Iceland (V), Tonga (M) | |
57 | FAMILY_CZ, // Carrizo, Nolan, Amur | |
58 | FAMILY_AI, // Arctic Islands | |
59 | FAMILY_RV, // Raven | |
60 | FAMILY_AR, // Arcturus | |
61 | FAMILY_AL, // Aldebaran | |
62 | FAMILY_NV, // Navi10 | |
54 | FAMILY_CI, // Sea Islands: Hawaii (P), Maui (P), Bonaire (M) | |
55 | FAMILY_KV, // Fusion Kaveri: Spectre, Spooky; Fusion Kabini: Kalindi | |
56 | FAMILY_VI, // Volcanic Islands: Iceland (V), Tonga (M) | |
57 | FAMILY_CZ, // Carrizo, Nolan, Amur | |
58 | FAMILY_AI, // Arctic Islands | |
59 | FAMILY_RV, // Raven | |
60 | FAMILY_AR, // Arcturus | |
61 | FAMILY_AL, // Aldebaran | |
62 | FAMILY_NV, // Navi10 | |
63 | FAMILY_GFX11, // GFX11 | |
63 | 64 | }; |
64 | 65 | |
65 | 66 | #endif // __KFD_TEST_FLAGS__H__ |
193 | 193 | case 10: |
194 | 194 | familyId = FAMILY_NV; |
195 | 195 | break; |
196 | case 11: | |
197 | familyId = FAMILY_GFX11; | |
198 | break; | |
196 | 199 | } |
197 | 200 | |
198 | 201 | if (props->NumCPUCores && props->NumFComputeCores) |
228 | 231 | } |
229 | 232 | |
230 | 233 | return false; |
234 | } | |
235 | ||
236 | const uint32_t GetGfxVersion(const HsaNodeProperties *props) { | |
237 | return ((props->EngineId.ui32.Major << 16) | | |
238 | (props->EngineId.ui32.Minor << 8) | | |
239 | (props->EngineId.ui32.Stepping)); | |
231 | 240 | } |
232 | 241 | |
233 | 242 | HSAuint64 GetSystemTickCountInMicroSec() { |
854 | 863 | m_SelfAllocated(false) { |
855 | 864 | if (!m_pUser) { |
856 | 865 | m_pUser = mmap(0, m_Size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); |
857 | EXPECT_NOTNULL(m_pUser); | |
866 | EXPECT_NE(MAP_FAILED, m_pUser); | |
858 | 867 | m_SelfAllocated = true; |
859 | 868 | } |
860 | 869 |
51 | 51 | bool isTonga(const HsaNodeProperties *props); |
52 | 52 | bool hasPciAtomicsSupport(int node); |
53 | 53 | unsigned int FamilyIdFromNode(const HsaNodeProperties *props); |
54 | const uint32_t GetGfxVersion(const HsaNodeProperties *props); | |
54 | 55 | |
55 | 56 | void GetHwQueueInfo(const HsaNodeProperties *props, |
56 | 57 | unsigned int *p_num_cp_queues, |
33 | 33 | |
34 | 34 | KFDBaseComponentTest::SetUp(); |
35 | 35 | |
36 | m_pIsaGen = IsaGenerator::Create(m_FamilyId); | |
37 | ||
38 | 36 | ROUTINE_END |
39 | 37 | } |
40 | 38 | |
41 | 39 | void RDMATest::TearDown() { |
42 | 40 | ROUTINE_START |
43 | if (m_pIsaGen) | |
44 | delete m_pIsaGen; | |
45 | m_pIsaGen = NULL; | |
46 | 41 | |
47 | 42 | KFDBaseComponentTest::TearDown(); |
48 | 43 | |
76 | 71 | srcSysBuffer.Fill(0xfe); |
77 | 72 | |
78 | 73 | /* Put 'copy dword' command to ISA buffer */ |
79 | m_pIsaGen->GetCopyDwordIsa(isaBuffer); | |
74 | ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>())); | |
75 | ||
80 | 76 | |
81 | 77 | ASSERT_SUCCESS(queue.Create(defaultGPUNode)); |
82 | 78 | Dispatch dispatch(isaBuffer); |
25 | 25 | |
26 | 26 | #include <gtest/gtest.h> |
27 | 27 | |
28 | #include "IsaGenerator.hpp" | |
29 | 28 | #include "KFDBaseComponentTest.hpp" |
30 | 29 | |
31 | 30 | class RDMATest : public KFDBaseComponentTest { |
32 | 31 | public: |
33 | RDMATest():m_pIsaGen(NULL) {} | |
32 | RDMATest() {} | |
34 | 33 | ~RDMATest() {} |
35 | 34 | |
36 | 35 | protected: |
37 | 36 | virtual void SetUp(); |
38 | 37 | virtual void TearDown(); |
39 | ||
40 | protected: // Members | |
41 | IsaGenerator* m_pIsaGen; | |
42 | 38 | }; |
43 | 39 | |
44 | 40 | #endif // __RDMA_TEST__H__ |
0 | /* | |
1 | * Copyright (C) 2021 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #include "ShaderStore.hpp" | |
24 | ||
25 | /** | |
26 | * KFDASMTest List | |
27 | */ | |
28 | ||
29 | const std::vector<const char*> ShaderList = { | |
30 | NoopIsa, | |
31 | CopyDwordIsa, | |
32 | InfiniteLoopIsa, | |
33 | AtomicIncIsa, | |
34 | ScratchCopyDwordIsa, | |
35 | PollMemoryIsa, | |
36 | CopyOnSignalIsa, | |
37 | PollAndCopyIsa, | |
38 | WriteFlagAndValueIsa, | |
39 | WriteAndSignalIsa, | |
40 | LoopIsa, | |
41 | IterateIsa, | |
42 | ReadMemoryIsa, | |
43 | GwsInitIsa, | |
44 | GwsAtomicIncreaseIsa, | |
45 | }; | |
46 | ||
47 | /** | |
48 | * Macros | |
49 | */ | |
50 | ||
51 | /* Create macro for portable v_add_co_u32, v_add_co_ci_u32, | |
52 | * and v_cmp_lt_u32 | |
53 | */ | |
54 | #define SHADER_MACROS \ | |
55 | " .text\n"\ | |
56 | " .macro V_ADD_CO_U32 vdst, src0, vsrc1\n"\ | |
57 | " .if (.amdgcn.gfx_generation_number >= 10)\n"\ | |
58 | " v_add_co_u32 \\vdst, vcc_lo, \\src0, \\vsrc1\n"\ | |
59 | " .elseif (.amdgcn.gfx_generation_number >= 9)\n"\ | |
60 | " v_add_co_u32 \\vdst, vcc, \\src0, \\vsrc1\n"\ | |
61 | " .else\n"\ | |
62 | " v_add_u32 \\vdst, vcc, \\src0, \\vsrc1\n"\ | |
63 | " .endif\n"\ | |
64 | " .endm\n"\ | |
65 | " .macro V_ADD_CO_CI_U32 vdst, src0, vsrc1\n"\ | |
66 | " .if (.amdgcn.gfx_generation_number >= 10)\n"\ | |
67 | " v_add_co_ci_u32 \\vdst, vcc_lo, \\src0, \\vsrc1, vcc_lo\n"\ | |
68 | " .elseif (.amdgcn.gfx_generation_number >= 9)\n"\ | |
69 | " v_addc_co_u32 \\vdst, vcc, \\src0, \\vsrc1, vcc\n"\ | |
70 | " .else\n"\ | |
71 | " v_addc_u32 \\vdst, vcc, \\src0, \\vsrc1, vcc\n"\ | |
72 | " .endif\n"\ | |
73 | " .endm\n"\ | |
74 | " .macro V_CMP_LT_U32 src0, vsrc1\n"\ | |
75 | " .if (.amdgcn.gfx_generation_number >= 10)\n"\ | |
76 | " v_cmp_lt_u32 vcc_lo, \\src0, \\vsrc1\n"\ | |
77 | " .else\n"\ | |
78 | " v_cmp_lt_u32 vcc, \\src0, \\vsrc1\n"\ | |
79 | " .endif\n"\ | |
80 | " .endm\n" | |
81 | ||
82 | /** | |
83 | * Common | |
84 | */ | |
85 | ||
86 | const char *NoopIsa = R"( | |
87 | .text | |
88 | s_endpgm | |
89 | )"; | |
90 | ||
91 | const char *CopyDwordIsa = R"( | |
92 | .text | |
93 | v_mov_b32 v0, s0 | |
94 | v_mov_b32 v1, s1 | |
95 | v_mov_b32 v2, s2 | |
96 | v_mov_b32 v3, s3 | |
97 | flat_load_dword v4, v[0:1] glc slc | |
98 | s_waitcnt 0 | |
99 | flat_store_dword v[2:3], v4 glc slc | |
100 | s_endpgm | |
101 | )"; | |
102 | ||
103 | const char *InfiniteLoopIsa = R"( | |
104 | .text | |
105 | LOOP: | |
106 | s_branch LOOP | |
107 | s_endpgm | |
108 | )"; | |
109 | ||
110 | const char *AtomicIncIsa = R"( | |
111 | .text | |
112 | v_mov_b32 v0, s0 | |
113 | v_mov_b32 v1, s1 | |
114 | .if (.amdgcn.gfx_generation_number >= 8) | |
115 | v_mov_b32 v2, 1 | |
116 | flat_atomic_add v3, v[0:1], v2 glc slc | |
117 | .else | |
118 | v_mov_b32 v2, -1 | |
119 | flat_atomic_inc v3, v[0:1], v2 glc slc | |
120 | .endif | |
121 | s_waitcnt 0 | |
122 | s_endpgm | |
123 | )"; | |
124 | ||
125 | /** | |
126 | * KFDMemoryTest | |
127 | */ | |
128 | ||
129 | const char *ScratchCopyDwordIsa = R"( | |
130 | .text | |
131 | // Copy the parameters from scalar registers to vector registers | |
132 | .if (.amdgcn.gfx_generation_number >= 9) | |
133 | v_mov_b32 v0, s0 | |
134 | v_mov_b32 v1, s1 | |
135 | v_mov_b32 v2, s2 | |
136 | v_mov_b32 v3, s3 | |
137 | .else | |
138 | v_mov_b32_e32 v0, s0 | |
139 | v_mov_b32_e32 v1, s1 | |
140 | v_mov_b32_e32 v2, s2 | |
141 | v_mov_b32_e32 v3, s3 | |
142 | .endif | |
143 | // Setup the scratch parameters. This assumes a single 16-reg block | |
144 | .if (.amdgcn.gfx_generation_number >= 10) | |
145 | s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 | |
146 | s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 | |
147 | .elseif (.amdgcn.gfx_generation_number == 9) | |
148 | s_mov_b32 flat_scratch_lo, s4 | |
149 | s_mov_b32 flat_scratch_hi, s5 | |
150 | .else | |
151 | s_mov_b32 flat_scratch_lo, 8 | |
152 | s_mov_b32 flat_scratch_hi, 0 | |
153 | .endif | |
154 | // Copy a dword between the passed addresses | |
155 | flat_load_dword v4, v[0:1] slc | |
156 | s_waitcnt vmcnt(0) & lgkmcnt(0) | |
157 | flat_store_dword v[2:3], v4 slc | |
158 | s_endpgm | |
159 | )"; | |
160 | ||
161 | /* Continuously poll src buffer and check buffer value | |
162 | * After src buffer is filled with specific value (0x5678, | |
163 | * by host program), fill dst buffer with specific | |
164 | * value(0x5678) and quit | |
165 | */ | |
166 | const char *PollMemoryIsa = R"( | |
167 | .text | |
168 | // Assume src address in s0, s1, and dst address in s2, s3 | |
169 | s_movk_i32 s18, 0x5678 | |
170 | .if (.amdgcn.gfx_generation_number >= 10) | |
171 | v_mov_b32 v0, s2 | |
172 | v_mov_b32 v1, s3 | |
173 | v_mov_b32 v2, 0x5678 | |
174 | .endif | |
175 | LOOP: | |
176 | s_load_dword s16, s[0:1], 0x0 glc | |
177 | s_cmp_eq_i32 s16, s18 | |
178 | s_cbranch_scc0 LOOP | |
179 | .if (.amdgcn.gfx_generation_number >= 10) | |
180 | flat_store_dword v[0:1], v2 slc | |
181 | .else | |
182 | s_store_dword s18, s[2:3], 0x0 glc | |
183 | .endif | |
184 | s_endpgm | |
185 | )"; | |
186 | ||
187 | /* Similar to PollMemoryIsa except that the buffer | |
188 | * polled can be non-coherent memory. SCC system-level | |
189 | * cache coherence is not supported in scalar (smem) path. | |
190 | * Use vmem operations with scc | |
191 | * | |
192 | * Note: Only works on Aldebaran, and even then the scc modifier | |
193 | * has been defeatured. This shader is more or less | |
194 | * deprecated. | |
195 | */ | |
196 | const char *PollNCMemoryIsa = R"( | |
197 | .text | |
198 | // Assume src address in s0, s1, and dst address in s2, s3 | |
199 | v_mov_b32 v6, 0x5678 | |
200 | v_mov_b32 v0, s0 | |
201 | v_mov_b32 v1, s1 | |
202 | LOOP: | |
203 | flat_load_dword v4, v[0:1] scc | |
204 | v_cmp_eq_u32 vcc, v4, v6 | |
205 | s_cbranch_vccz LOOP | |
206 | v_mov_b32 v0, s2 | |
207 | v_mov_b32 v1, s3 | |
208 | flat_store_dword v[0:1], v6 scc | |
209 | s_endpgm | |
210 | )"; | |
211 | ||
212 | /* Input: A buffer of at least 3 dwords. | |
213 | * DW0: used as a signal. 0xcafe means it is signaled | |
214 | * DW1: Input buffer for device to read. | |
215 | * DW2: Output buffer for device to write. | |
216 | * Once it receives the signal, the device will copy DW1 to DW2 | |
217 | * This shader continuously polls the signal buffer. | |
218 | * Once the signal buffer is signaled, it copies the input buffer | |
219 | * to the output buffer | |
220 | */ | |
221 | const char *CopyOnSignalIsa = R"( | |
222 | .text | |
223 | // Assume input buffer in s0, s1 | |
224 | .if (.amdgcn.gfx_generation_number >= 10) | |
225 | s_add_u32 s2, s0, 0x8 | |
226 | s_addc_u32 s3, s1, 0x0 | |
227 | s_mov_b32 s18, 0xcafe | |
228 | v_mov_b32 v0, s0 | |
229 | v_mov_b32 v1, s1 | |
230 | v_mov_b32 v4, s2 | |
231 | v_mov_b32 v5, s3 | |
232 | .else | |
233 | s_mov_b32 s18, 0xcafe | |
234 | .endif | |
235 | POLLSIGNAL: | |
236 | s_load_dword s16, s[0:1], 0x0 glc | |
237 | s_cmp_eq_i32 s16, s18 | |
238 | s_cbranch_scc0 POLLSIGNAL | |
239 | s_load_dword s17, s[0:1], 0x4 glc | |
240 | s_waitcnt vmcnt(0) & lgkmcnt(0) | |
241 | .if (.amdgcn.gfx_generation_number >= 10) | |
242 | v_mov_b32 v2, s17 | |
243 | flat_store_dword v[4:5], v2 glc | |
244 | .else | |
245 | s_store_dword s17, s[0:1], 0x8 glc | |
246 | .endif | |
247 | s_waitcnt vmcnt(0) & lgkmcnt(0) | |
248 | s_endpgm | |
249 | )"; | |
250 | ||
251 | /* Continuously poll the flag at src buffer | |
252 | * After the flag at s[0:1] is set to 1, | |
253 | * copy the value from s[0:1]+4 to dst buffer | |
254 | * | |
255 | * Note: Only works on GFX9 (only used in | |
256 | * aldebaran tests) | |
257 | */ | |
258 | const char *PollAndCopyIsa = R"( | |
259 | .text | |
260 | // Assume src buffer in s[0:1] and dst buffer in s[2:3] | |
261 | .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_stepping == 10) | |
262 | // Path for Aldebaran | |
263 | v_mov_b32 v0, s0 | |
264 | v_mov_b32 v1, s1 | |
265 | v_mov_b32 v18, 0x1 | |
266 | LOOP_ALDBRN: | |
267 | flat_load_dword v16, v[0:1] glc | |
268 | s_waitcnt vmcnt(0) & lgkmcnt(0) | |
269 | v_cmp_eq_i32 vcc, v16, v18 | |
270 | s_cbranch_vccz LOOP_ALDBRN | |
271 | buffer_invl2 | |
272 | s_load_dword s17, s[0:1], 0x4 glc | |
273 | s_waitcnt vmcnt(0) & lgkmcnt(0) | |
274 | s_store_dword s17, s[2:3], 0x0 glc | |
275 | s_waitcnt vmcnt(0) & lgkmcnt(0) | |
276 | buffer_wbl2 | |
277 | .elseif (.amdgcn.gfx_generation_number == 9) | |
278 | s_movk_i32 s18, 0x1 | |
279 | LOOP: | |
280 | s_load_dword s16, s[0:1], 0x0 glc | |
281 | s_cmp_eq_i32 s16, s18 | |
282 | s_cbranch_scc0 LOOP | |
283 | s_load_dword s17, s[0:1], 0x4 glc | |
284 | s_waitcnt vmcnt(0) & lgkmcnt(0) | |
285 | s_store_dword s17, s[2:3], 0x0 glc | |
286 | .endif | |
287 | s_waitcnt vmcnt(0) & lgkmcnt(0) | |
288 | s_endpgm | |
289 | )"; | |
290 | ||
291 | /* Input0: A buffer of at least 2 dwords. | |
292 | * DW0: used as a signal. Write 0x1 to signal | |
293 | * DW1: Write the value from 2nd input buffer | |
294 | * for other device to read. | |
295 | * Input1: A buffer of at least 2 dwords. | |
296 | * DW0: used as the value to be written. | |
297 | * | |
298 | * Note: Only works on Aldebaran | |
299 | */ | |
300 | const char *WriteFlagAndValueIsa = R"( | |
301 | .text | |
302 | // Assume two inputs buffer in s[0:1] and s[2:3] | |
303 | .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_stepping == 10) | |
304 | v_mov_b32 v0, s0 | |
305 | v_mov_b32 v1, s1 | |
306 | s_load_dword s18, s[2:3], 0x0 glc | |
307 | s_waitcnt vmcnt(0) & lgkmcnt(0) | |
308 | s_store_dword s18, s[0:1], 0x4 glc | |
309 | s_waitcnt vmcnt(0) & lgkmcnt(0) | |
310 | buffer_wbl2 | |
311 | s_waitcnt vmcnt(0) & lgkmcnt(0) | |
312 | v_mov_b32 v16, 0x1 | |
313 | flat_store_dword v[0:1], v16 glc | |
314 | .endif | |
315 | s_endpgm | |
316 | )"; | |
317 | ||
318 | /* Input0: A buffer of at least 2 dwords. | |
319 | * DW0: used as a signal. Write 0xcafe to signal | |
320 | * DW1: Write to this buffer for other device to read. | |
321 | * Input1: mmio base address | |
322 | */ | |
323 | const char *WriteAndSignalIsa = R"( | |
324 | .text | |
325 | // Assume input buffer in s0, s1 | |
326 | .if (.amdgcn.gfx_generation_number >= 10) | |
327 | s_add_u32 s4, s0, 0x4 | |
328 | s_addc_u32 s5, s1, 0x0 | |
329 | v_mov_b32 v0, s0 | |
330 | v_mov_b32 v1, s1 | |
331 | v_mov_b32 v2, s2 | |
332 | v_mov_b32 v3, s3 | |
333 | v_mov_b32 v4, s4 | |
334 | v_mov_b32 v5, s5 | |
335 | v_mov_b32 v18, 0xbeef | |
336 | flat_store_dword v[4:5], v18 glc | |
337 | v_mov_b32 v18, 0x1 | |
338 | flat_store_dword v[2:3], v18 glc | |
339 | v_mov_b32 v18, 0xcafe | |
340 | flat_store_dword v[0:1], v18 glc | |
341 | .else | |
342 | s_mov_b32 s18, 0xbeef | |
343 | s_store_dword s18, s[0:1], 0x4 glc | |
344 | s_mov_b32 s18, 0x1 | |
345 | s_store_dword s18, s[2:3], 0 glc | |
346 | s_mov_b32 s18, 0xcafe | |
347 | s_store_dword s18, s[0:1], 0x0 glc | |
348 | .endif | |
349 | s_endpgm | |
350 | )"; | |
351 | ||
352 | /** | |
353 | * KFDQMTest | |
354 | */ | |
355 | ||
356 | /* A simple ISA loop program with dense mathematical operations | |
357 | * s1 controls the number of iterations of the loop | |
358 | * This shader can be used by GFX8, GFX9 and GFX10 | |
359 | */ | |
360 | const char *LoopIsa = R"( | |
361 | .text | |
362 | s_movk_i32 s0, 0x0008 | |
363 | s_movk_i32 s1, 0x00ff | |
364 | v_mov_b32 v0, 0 | |
365 | v_mov_b32 v1, 0 | |
366 | v_mov_b32 v2, 0 | |
367 | v_mov_b32 v3, 0 | |
368 | v_mov_b32 v4, 0 | |
369 | v_mov_b32 v5, 0 | |
370 | v_mov_b32 v6, 0 | |
371 | v_mov_b32 v7, 0 | |
372 | v_mov_b32 v8, 0 | |
373 | v_mov_b32 v9, 0 | |
374 | v_mov_b32 v10, 0 | |
375 | v_mov_b32 v11, 0 | |
376 | v_mov_b32 v12, 0 | |
377 | v_mov_b32 v13, 0 | |
378 | v_mov_b32 v14, 0 | |
379 | v_mov_b32 v15, 0 | |
380 | v_mov_b32 v16, 0 | |
381 | LOOP: | |
382 | s_mov_b32 s8, s4 | |
383 | s_mov_b32 s9, s1 | |
384 | s_mov_b32 s10, s6 | |
385 | s_mov_b32 s11, s7 | |
386 | s_cmp_le_i32 s1, s0 | |
387 | s_cbranch_scc1 END_OF_PGM | |
388 | v_add_f32 v0, 2.0, v0 | |
389 | v_cvt_f32_i32 v17, s1 | |
390 | s_waitcnt lgkmcnt(0) | |
391 | v_add_f32 v18, s8, v17 | |
392 | v_add_f32 v19, s9, v17 | |
393 | v_add_f32 v20, s10, v17 | |
394 | v_add_f32 v21, s11, v17 | |
395 | v_add_f32 v22, s12, v17 | |
396 | v_add_f32 v23, s13, v17 | |
397 | v_add_f32 v24, s14, v17 | |
398 | v_add_f32 v17, s15, v17 | |
399 | v_log_f32 v25, v18 | |
400 | v_mul_f32 v25, v22, v25 | |
401 | v_exp_f32 v25, v25 | |
402 | v_log_f32 v26, v19 | |
403 | v_mul_f32 v26, v23, v26 | |
404 | v_exp_f32 v26, v26 | |
405 | v_log_f32 v27, v20 | |
406 | v_mul_f32 v27, v24, v27 | |
407 | v_exp_f32 v27, v27 | |
408 | v_log_f32 v28, v21 | |
409 | v_mul_f32 v28, v17, v28 | |
410 | v_exp_f32 v28, v28 | |
411 | v_add_f32 v5, v5, v25 | |
412 | v_add_f32 v6, v6, v26 | |
413 | v_add_f32 v7, v7, v27 | |
414 | v_add_f32 v8, v8, v28 | |
415 | v_mul_f32 v18, 0x3fb8aa3b, v18 | |
416 | v_exp_f32 v18, v18 | |
417 | v_mul_f32 v19, 0x3fb8aa3b, v19 | |
418 | v_exp_f32 v19, v19 | |
419 | v_mul_f32 v20, 0x3fb8aa3b, v20 | |
420 | v_exp_f32 v20, v20 | |
421 | v_mul_f32 v21, 0x3fb8aa3b, v21 | |
422 | v_exp_f32 v21, v21 | |
423 | v_add_f32 v9, v9, v18 | |
424 | v_add_f32 v10, v10, v19 | |
425 | v_add_f32 v11, v11, v20 | |
426 | v_add_f32 v12, v12, v21 | |
427 | v_sqrt_f32 v18, v22 | |
428 | v_sqrt_f32 v19, v23 | |
429 | v_sqrt_f32 v20, v24 | |
430 | v_sqrt_f32 v21, v17 | |
431 | v_add_f32 v13, v13, v18 | |
432 | v_add_f32 v14, v14, v19 | |
433 | v_add_f32 v15, v15, v20 | |
434 | v_add_f32 v16, v16, v21 | |
435 | v_rsq_f32 v18, v22 | |
436 | v_rsq_f32 v19, v23 | |
437 | v_rsq_f32 v20, v24 | |
438 | v_rsq_f32 v17, v17 | |
439 | v_add_f32 v1, v1, v18 | |
440 | v_add_f32 v2, v2, v19 | |
441 | v_add_f32 v3, v3, v20 | |
442 | v_add_f32 v4, v4, v17 | |
443 | s_add_u32 s0, s0, 1 | |
444 | s_branch LOOP | |
445 | END_OF_PGM: | |
446 | s_endpgm | |
447 | )"; | |
448 | ||
449 | ||
450 | /** | |
451 | * KFDCWSRTest | |
452 | */ | |
453 | ||
454 | /* Initial state: | |
455 | * s[0:1] - input buffer base address | |
456 | * s[2:3] - output buffer base address | |
457 | * s4 - workgroup id | |
458 | * v0 - workitem id | |
459 | * Registers: | |
460 | * v0 - calculated workitem = v0 + s4 * NUM_THREADS_X, which is s4 | |
461 | * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 | |
462 | * v6 - register storing known-value output for mangle testing | |
463 | * v7 - counter | |
464 | */ | |
465 | const char *IterateIsa = SHADER_MACROS R"( | |
466 | // Compute address of output buffer | |
467 | v_mov_b32 v0, s4 // use workgroup id as index | |
468 | v_lshlrev_b32 v0, 2, v0 // v0 *= 4 | |
469 | V_ADD_CO_U32 v4, s2, v0 // v[4:5] = s[2:3] + v0 * 4 | |
470 | v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 | |
471 | V_ADD_CO_CI_U32 v5, v5, 0 // v[4:5] = s[2:3] + v0 * 4 | |
472 | ||
473 | // Store known-value output in register | |
474 | flat_load_dword v6, v[4:5] glc | |
475 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish | |
476 | ||
477 | // Initialize counter | |
478 | v_mov_b32 v7, 0 | |
479 | ||
480 | LOOP: | |
481 | flat_store_dword v[4:5], v6 // store known-val in output | |
482 | V_ADD_CO_U32 v7, 1, v7 // increment counter | |
483 | ||
484 | s_load_dword s6, s[0:1], 0 glc | |
485 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish | |
486 | s_cmp_eq_i32 s6, 0x12345678 // compare input buf to stopval | |
487 | s_cbranch_scc1 L_QUIT // branch if notified to quit by host | |
488 | ||
489 | s_branch LOOP | |
490 | ||
491 | L_QUIT: | |
492 | s_waitcnt vmcnt(0) & lgkmcnt(0) | |
493 | s_endpgm | |
494 | )"; | |
495 | ||
496 | /** | |
497 | * KFDEvictTest | |
498 | */ | |
499 | ||
500 | /* Shader to read local buffers using multiple wavefronts in parallel | |
501 | * until address buffer is filled with specific value 0x5678 by host program, | |
502 | * then each wavefront fills value 0x5678 at corresponding result buffer and quit | |
503 | * | |
504 | * Initial state: | |
505 | * s[0:1] - address buffer base address | |
506 | * s[2:3] - result buffer base address | |
507 | * s4 - workgroup id | |
508 | * v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1 | |
509 | * Registers: | |
510 | * v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X | |
511 | * v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8 | |
512 | * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 | |
513 | * v[6:7] - local buf address used for read test | |
514 | */ | |
515 | const char *ReadMemoryIsa = SHADER_MACROS R"( | |
516 | // Compute address of corresponding output buffer | |
517 | v_mov_b32 v0, s4 // use workgroup id as index | |
518 | v_lshlrev_b32 v0, 2, v0 // v0 *= 4 | |
519 | V_ADD_CO_U32 v4, s2, v0 // v[4:5] = s[2:3] + v0 * 4 | |
520 | v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 | |
521 | V_ADD_CO_CI_U32 v5, v5, 0 // v[4:5] = s[2:3] + v0 * 4 | |
522 | ||
523 | // Compute input buffer offset used to store corresponding local buffer address | |
524 | v_lshlrev_b32 v0, 1, v0 // v0 *= 8 | |
525 | V_ADD_CO_U32 v2, s0, v0 // v[2:3] = s[0:1] + v0 * 8 | |
526 | v_mov_b32 v3, s1 // v[2:3] = s[0:1] + v0 * 8 | |
527 | V_ADD_CO_CI_U32 v3, v3, 0 // v[2:3] = s[0:1] + v0 * 8 | |
528 | ||
529 | // Load 64bit local buffer address stored at v[2:3] to v[6:7] | |
530 | flat_load_dwordx2 v[6:7], v[2:3] slc | |
531 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish | |
532 | v_mov_b32 v8, 0x5678 | |
533 | s_movk_i32 s8, 0x5678 | |
534 | L_REPEAT: | |
535 | s_load_dword s16, s[0:1], 0x0 glc | |
536 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish | |
537 | s_cmp_eq_i32 s16, s8 | |
538 | s_cbranch_scc1 L_QUIT // if notified to quit by host | |
539 | ||
540 | // Loop read 64M local buffer starting at v[6:7] | |
541 | // every 4k page only read once | |
542 | v_mov_b32 v9, 0 | |
543 | v_mov_b32 v10, 0x1000 // 4k page | |
544 | v_mov_b32 v11, 0x4000000 // 64M size | |
545 | v_mov_b32 v12, v6 | |
546 | v_mov_b32 v13, v7 | |
547 | L_LOOP_READ: | |
548 | flat_load_dwordx2 v[14:15], v[12:13] slc | |
549 | V_ADD_CO_U32 v9, v9, v10 | |
550 | V_ADD_CO_U32 v12, v12, v10 | |
551 | V_ADD_CO_CI_U32 v13, v13, 0 | |
552 | V_CMP_LT_U32 v9, v11 | |
553 | s_cbranch_vccnz L_LOOP_READ | |
554 | s_branch L_REPEAT | |
555 | L_QUIT: | |
556 | flat_store_dword v[4:5], v8 | |
557 | s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish | |
558 | s_endpgm | |
559 | )"; | |
560 | ||
561 | /** | |
562 | * KFDGWSTest | |
563 | */ | |
564 | ||
565 | /* Shader to initialize gws counter to 1 */ | |
566 | const char *GwsInitIsa = R"( | |
567 | .text | |
568 | s_mov_b32 m0, 0 | |
569 | s_nop 0 | |
570 | s_load_dword s16, s[0:1], 0x0 glc | |
571 | s_waitcnt 0 | |
572 | v_mov_b32 v0, s16 | |
573 | s_waitcnt 0 | |
574 | ds_gws_init v0 offset:0 gds | |
575 | s_waitcnt 0 | |
576 | s_endpgm | |
577 | )"; | |
578 | ||
579 | /* Atomically increase a value in memory | |
580 | * This is expected to be executed from | |
581 | * multiple work groups simultaneously. | |
582 | * GWS semaphore is used to guarantee | |
583 | * the operation is atomic. | |
584 | */ | |
585 | const char *GwsAtomicIncreaseIsa = R"( | |
586 | .text | |
587 | // Assume src address in s0, s1 | |
588 | .if (.amdgcn.gfx_generation_number >= 10) | |
589 | s_mov_b32 m0, 0 | |
590 | s_mov_b32 exec_lo, 0x1 | |
591 | v_mov_b32 v0, s0 | |
592 | v_mov_b32 v1, s1 | |
593 | ds_gws_sema_p offset:0 gds | |
594 | s_waitcnt 0 | |
595 | flat_load_dword v2, v[0:1] glc dlc | |
596 | s_waitcnt 0 | |
597 | v_add_nc_u32 v2, v2, 1 | |
598 | flat_store_dword v[0:1], v2 | |
599 | s_waitcnt_vscnt null, 0 | |
600 | ds_gws_sema_v offset:0 gds | |
601 | .else | |
602 | s_mov_b32 m0, 0 | |
603 | s_nop 0 | |
604 | ds_gws_sema_p offset:0 gds | |
605 | s_waitcnt 0 | |
606 | s_load_dword s16, s[0:1], 0x0 glc | |
607 | s_waitcnt 0 | |
608 | s_add_u32 s16, s16, 1 | |
609 | s_store_dword s16, s[0:1], 0x0 glc | |
610 | s_waitcnt lgkmcnt(0) | |
611 | ds_gws_sema_v offset:0 gds | |
612 | .endif | |
613 | s_waitcnt 0 | |
614 | s_endpgm | |
615 | )"; |
0 | /* | |
1 | * Copyright (C) 2021 Advanced Micro Devices, Inc. All Rights Reserved. | |
2 | * | |
3 | * Permission is hereby granted, free of charge, to any person obtaining a | |
4 | * copy of this software and associated documentation files (the "Software"), | |
5 | * to deal in the Software without restriction, including without limitation | |
6 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
7 | * and/or sell copies of the Software, and to permit persons to whom the | |
8 | * Software is furnished to do so, subject to the following conditions: | |
9 | * | |
10 | * The above copyright notice and this permission notice shall be included in | |
11 | * all copies or substantial portions of the Software. | |
12 | * | |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
16 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
17 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
18 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
19 | * OTHER DEALINGS IN THE SOFTWARE. | |
20 | * | |
21 | */ | |
22 | ||
23 | #ifndef _SHADERSTORE_H_ | |
24 | #define _SHADERSTORE_H_ | |
25 | ||
26 | #include <vector> | |
27 | ||
28 | /* KFDASMTest List */ | |
29 | extern const std::vector<const char*> ShaderList; | |
30 | ||
31 | /* Common */ | |
32 | extern const char *NoopIsa; | |
33 | extern const char *CopyDwordIsa; | |
34 | extern const char *InfiniteLoopIsa; | |
35 | extern const char *AtomicIncIsa; | |
36 | ||
37 | /* KFDMemoryTest */ | |
38 | extern const char *ScratchCopyDwordIsa; | |
39 | extern const char *PollMemoryIsa; | |
40 | extern const char *PollNCMemoryIsa; | |
41 | extern const char *CopyOnSignalIsa; | |
42 | extern const char *PollAndCopyIsa; | |
43 | extern const char *WriteFlagAndValueIsa; | |
44 | extern const char *WriteAndSignalIsa; | |
45 | ||
46 | /* KFDQMTest */ | |
47 | extern const char *LoopIsa; | |
48 | ||
49 | /* KFDCWSRTest */ | |
50 | extern const char *IterateIsa; | |
51 | ||
52 | /* KFDEvictTest */ | |
53 | extern const char *ReadMemoryIsa; | |
54 | ||
55 | /* KFDGWSTest */ | |
56 | extern const char *GwsInitIsa; | |
57 | extern const char *GwsAtomicIncreaseIsa; | |
58 | ||
59 | #endif // _SHADERSTORE_H_ |