New Upstream Release - vkfft
Ready changes
Summary
Merged new upstream version: 1.2.31+ds1 (was: 1.2.26+ds1).
Resulting package
Built on 2023-06-11T12:55 (took 9m47s)
The resulting binary packages can be installed (if you have the apt repository enabled) by running:
apt install -t fresh-releases libvkfft-dev
Lintian Result
Diff
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 25e74e8..5b1a2b8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ project(Vulkan_FFT)
set(CMAKE_CONFIGURATION_TYPES "Release" CACHE STRING "" FORCE)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)
include(FetchContent)
-set(VKFFT_BACKEND 0 CACHE STRING "0 - Vulkan, 1 - CUDA, 2 - HIP, 3 - OpenCL, 4 - Level Zero")
+set(VKFFT_BACKEND 0 CACHE STRING "0 - Vulkan, 1 - CUDA, 2 - HIP, 3 - OpenCL, 4 - Level Zero, 5 - Metal")
if(${VKFFT_BACKEND} EQUAL 1)
option(build_VkFFT_cuFFT_benchmark "Build VkFFT cuFFT benchmark" ON)
@@ -18,6 +18,8 @@ else()
endif()
option(build_VkFFT_FFTW_precision "Build VkFFT FFTW precision comparison" OFF)
+option(VkFFT_use_FP128_Bluestein_RaderFFT "Use FP128 for Bluestein and Rader FFT kernel calculations. Currently requires FP128 FFT library, like FFTWl" OFF)
+
if (MSVC)
set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${PROJECT_NAME})
add_definitions(-D_CRT_SECURE_NO_WARNINGS)
@@ -75,7 +77,7 @@ else()
benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp
benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp)
endif()
-target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_11)
+target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_11)
add_definitions(-DVKFFT_BACKEND=${VKFFT_BACKEND})
if(${VKFFT_BACKEND} EQUAL 0)
find_package(Vulkan REQUIRED)
@@ -122,9 +124,18 @@ elseif(${VKFFT_BACKEND} EQUAL 4)
NO_DEFAULT_PATH
)
target_include_directories(${PROJECT_NAME} PUBLIC ${LevelZero_INCLUDES})
+elseif(${VKFFT_BACKEND} EQUAL 5)
+ add_compile_options(-WMTL_IGNORE_WARNINGS)
+ find_library(FOUNDATION_LIB Foundation REQUIRED)
+ find_library(QUARTZ_CORE_LIB QuartzCore REQUIRED)
+ find_library(METAL_LIB Metal REQUIRED)
+ target_include_directories(${PROJECT_NAME} PUBLIC "metal-cpp/")
endif()
target_compile_definitions(${PROJECT_NAME} PUBLIC -DVK_API_VERSION=11)#10 - Vulkan 1.0, 11 - Vulkan 1.1, 12 - Vulkan 1.2
+if(VkFFT_use_FP128_Bluestein_RaderFFT)
+ target_compile_definitions(${PROJECT_NAME} PUBLIC -DVkFFT_use_FP128_Bluestein_RaderFFT)
+endif()
if(${VKFFT_BACKEND} EQUAL 0)
FetchContent_Declare(
glslang-master
@@ -159,15 +170,17 @@ elseif(${VKFFT_BACKEND} EQUAL 3)
target_link_libraries(${PROJECT_NAME} PUBLIC OpenCL::OpenCL VkFFT half)
elseif(${VKFFT_BACKEND} EQUAL 4)
target_link_libraries(${PROJECT_NAME} PUBLIC ze_loader VkFFT half)
+elseif(${VKFFT_BACKEND} EQUAL 5)
+ target_link_libraries(${PROJECT_NAME} PUBLIC ${FOUNDATION_LIB} ${QUARTZ_CORE_LIB} ${METAL_LIB} VkFFT half)
endif()
-if(build_VkFFT_FFTW_precision)
+if(build_VkFFT_FFTW_precision OR VkFFT_use_FP128_Bluestein_RaderFFT)
add_definitions(-DUSE_FFTW)
set(FFTW3_LIB_DIR "/usr/lib/x86_64-linux-gnu/")
set(FFTW3_INCLUDE_DIR "/usr/include/")
find_library(
FFTW_LIB
- NAMES "libfftw3-3" "fftw3"
+ NAMES "libfftw3-3" "fftw3"
PATHS ${FFTW3_LIB_DIR}
PATH_SUFFIXES "lib" "lib64"
NO_DEFAULT_PATH
@@ -179,9 +192,19 @@ if(build_VkFFT_FFTW_precision)
PATH_SUFFIXES "include"
NO_DEFAULT_PATH
)
-
- target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTW_LIB})
- target_include_directories(${PROJECT_NAME} PUBLIC ${FFTW_INCLUDES})
+ target_include_directories(${PROJECT_NAME} PUBLIC ${FFTW_INCLUDES})
+if(VkFFT_use_FP128_Bluestein_RaderFFT)
+ find_library(
+ FFTWL_LIB
+ NAMES "libfftw3l" "fftw3l"
+ PATHS ${FFTW3_LIB_DIR}
+ PATH_SUFFIXES "lib" "lib64"
+ NO_DEFAULT_PATH
+ )
+ target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTW_LIB} ${FFTWL_LIB})
+else()
+ target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTW_LIB})
+endif()
endif()
if(build_VkFFT_cuFFT_benchmark)
diff --git a/README.md b/README.md
index e0b10e8..0baeef6 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,8 @@
[![Build Status](https://travis-ci.com/DTolm/VkFFT.svg?token=nMgUQeqx7PXMeCFaXqsb&branch=master)](https://travis-ci.com/github/DTolm/VkFFT)
-# VkFFT - Vulkan/CUDA/HIP/OpenCL/Level Zero Fast Fourier Transform library
-VkFFT is an efficient GPU-accelerated multidimensional Fast Fourier Transform library for Vulkan/CUDA/HIP/OpenCL/Level Zero projects. VkFFT aims to provide the community with an open-source alternative to Nvidia's cuFFT library while achieving better performance. VkFFT is written in C language and supports Vulkan, CUDA, HIP, OpenCL and Level Zero as backends.
+# VkFFT - Vulkan/CUDA/HIP/OpenCL/Level Zero/Metal Fast Fourier Transform library
+VkFFT is an efficient GPU-accelerated multidimensional Fast Fourier Transform library for Vulkan/CUDA/HIP/OpenCL/Level Zero/Metal projects. VkFFT aims to provide the community with an open-source alternative to Nvidia's cuFFT library while achieving better performance. VkFFT is written in C language and supports Vulkan, CUDA, HIP, OpenCL, Level Zero and Metal as backends.
+
+## Check out my poster at SC22: https://sc22.supercomputing.org/presentation/?id=rpost143&sess=sess273
## Check out my panel at Nvidia's GTC 2021 in Higher Education and Research category: https://gtc21.event.nvidia.com/
@@ -15,6 +17,7 @@ VkFFT is an efficient GPU-accelerated multidimensional Fast Fourier Transform li
- Forward and inverse directions of FFT
- Support for big FFT dimension sizes. Current limits: C2C or even C2R/R2C - (2^32, 2^32, 2^32). Odd C2R/R2C - (2^12, 2^32, 2^32). R2R - (2^12, 2^12, 2^12). Depends on the amount of shared memory on the device. (will be increased later).
- Radix-2/3/4/5/7/8/11/13 FFT. Sequences using radix 3, 5, 7, 11 and 13 have comparable performance to that of powers of 2.
+ - Rader's FFT algorithm for primes from 17 up to max shared memory length (~10000). Inlined and done without additional memory transfers.
- Bluestein's FFT algorithm for all other sequences. Full coverage of C2C range, single upload (2^12, 2^12, 2^12) for R2C/C2R/R2R. Optimized to have as few memory transfers as possible by using zero padding and merged convolution support of VkFFT
- Single, double and half precision support. Double precision uses CPU-generated LUT tables. Half precision still does all computations in single and only uses half precision to store data.
- All transformations are performed in-place with no performance loss. Out-of-place transforms are supported by selecting different input/output buffers.
@@ -25,9 +28,9 @@ VkFFT is an efficient GPU-accelerated multidimensional Fast Fourier Transform li
- WHDCN layout - data is stored in the following order (sorted by increase in strides): the width, the height, the depth, the coordinate (the number of feature maps), the batch number
- Multiple feature/batch convolutions - one input, multiple kernels
- Multiple input/output/temporary buffer split. Allows using data split between different memory allocations and mitigates 4GB single allocation limit.
- - Works on Nvidia, AMD and Intel GPUs. And Raspberry Pi 4 GPU.
+ - Works on Nvidia, AMD, Intel and Apple GPUs. And Raspberry Pi 4 GPU.
- Works on Windows, Linux and macOS
- - VkFFT supports Vulkan, CUDA, HIP, OpenCL and Level Zero as backend to cover wide range of APIs
+ - VkFFT supports Vulkan, CUDA, HIP, OpenCL, Level Zero and Metal as backend to cover wide range of APIs
- Header-only library with Vulkan interface, which allows appending VkFFT directly to user's command buffer. Kernels are compiled at run-time
## Future release plan
- ##### Planned
@@ -52,6 +55,11 @@ To build OpenCL version of the benchmark, replace VKFFT_BACKEND in CMakeLists (l
Level Zero:
Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. Clang and llvm-spirv must be valid system calls. Only single/double precision for now.\
To build Level Zero version of the benchmark, replace VKFFT_BACKEND in CMakeLists (line 5) with the value 4 and optionally enable FFTW.
+
+Metal:
+Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. VkFFT uses metal-cpp as a C++ bindings to Apple's libraries - Foundation.hpp, QuartzCore.hpp and Metal.hpp. Only single precision.\
+To build Metal version of the benchmark, replace VKFFT_BACKEND in CMakeLists (line 5) with the value 5 and optionally enable FFTW.
+
## Command-line interface
VkFFT has a command-line interface with the following set of commands:\
-h: print help\
@@ -70,32 +78,19 @@ VkFFT.h is a library that can append FFT, iFFT or convolution calculation to the
VkFFT achieves striding by grouping nearby FFTs instead of transpositions. \
Explicit VkFFT documentation can be found in the documentation folder.
## Benchmark results in comparison to cuFFT
-To measure how Vulkan FFT implementation works in comparison to cuFFT, we will perform many 1D, 2D and 3D tests, ranging from the small systems to the big ones. The test will consist of performing C2C FFT and inverse C2C FFT consecutively multiple times to calculate the average time required. The results are obtained on Nvidia RTX 3080, AMD Radeon VII and AMD Radeon 6800XT graphics cards with no other GPU load. Launching -test key from Vulkan_FFT.cpp performs VkFFT/cuFFT benchmark. The overall benchmark score is calculated as an averaged performance score over presented set of systems (the bigger - the better): sum(system_size/average_iteration_time) /num_benchmark_samples
-
-The stable flat lines present for small sequence lengths indicate that time scales linearly with the system size, so the bigger the bandwidth the better the result will be. The stepwise drops occur once the amount of transfers increases from to 2x and to 3x when compute unit can't hold full sequence and splits it into the combination of smaller ones. Radeon VII is faster than RTX 3080 below 2^18 (=2MB - page file size on AMD due to it having HBM2 memory with higher bandwidth, however, this GPU apparently has TLB miss problems on large buffer sizes. On RTX 3080, VkFFT is faster than cuFFT in single precision batched 1D FFTs on the range from 2^3 to 2^27:
-![alt text](https://github.com/DTolm/VkFFT/blob/master/benchmark_plot/vkfft_benchmark_single.png?raw=true)
-In double precision Radeon VII is able to get an advantage due to its high double precision core count. Radeon RX 6800XT can store LUT in the L3 cache and has a higher double precision core count as well:
-![alt text](https://github.com/DTolm/VkFFT/blob/master/benchmark_plot/vkfft_benchmark_double.png?raw=true)
-In half precision mode, VkFFT only uses it for data storage, all computations are performed in single. It still proves to be enough to get a stable 2x performance gain on RTX 3080:
-![alt text](https://github.com/DTolm/VkFFT/blob/master/benchmark_plot/vkfft_benchmark_half.png?raw=true)
-Multidimensional systems are optimized as well. Benchmark shows Radeon RX 6800XT can store systems up to 128MB in the L3 cache for big performance gains. Native support for zero padding allows to transfer less data and get up to 3x performance boost in multidimensional FFTs:
-![alt text](https://github.com/DTolm/VkFFT/blob/master/benchmark_plot/vkfft_benchmark_2d.png?raw=true)
-![alt text](https://github.com/DTolm/VkFFT/blob/master/benchmark_plot/vkfft_benchmark_3d.png?raw=true)
-The test configuration below takes multiple 1D FFTs of a supported sequence length from the range of 2 to 4096, batch them together so the full system takes from 500MB to 1GB of data and perform multiple consecutive FFTs/iFFTs (-vkfft 1000 key). After that time per a single FFT is obtained by averaging the result. Total system size will be divided by the time taken by a single transform upload+download, resulting in the achieved bandwidth. The GPUs used in this comparison are Nvidia A100 and AMD MI100. The performance was compared against Nvidia cuFFT (CUDA 11.2 version) and AMD rocFFT (ROCm 4.1 version) libraries in single precision:
-![alt text](https://github.com/DTolm/VkFFT/blob/master/benchmark_plot/fp32_cuda_a100.png?raw=true)
-![alt text](https://github.com/DTolm/VkFFT/blob/master/benchmark_plot/fp32_hip_mi100.png?raw=true)
+The test configuration below takes multiple 1D FFTs of all lengths from the range of 2 to 4096, batch them together so the full system takes from 500MB to 1GB of data and perform multiple consecutive FFTs/iFFTs (-vkfft 1001 key). After that time per a single FFT is obtained by averaging the result. Total system size will be divided by the time taken by a single transform upload+download, resulting in the estimation of an achieved global bandwidth. The GPUs used in this comparison are Nvidia A100 and AMD MI250. The performance was compared against Nvidia cuFFT (CUDA 11.7 version) and AMD rocFFT (ROCm 5.2 version) libraries in double precision:
+![alt text](https://github.com/DTolm/VkFFT/blob/master/benchmark_plot/fp64_cuda_a100.png?raw=true)
+![alt text](https://github.com/DTolm/VkFFT/blob/master/benchmark_plot/fp64_hip_mi250.png?raw=true)
## Precision comparison of cuFFT/VkFFT/FFTW
-To measure how VkFFT (single/double/half precision) results compare to cuFFT/rocFFT (single/double/half precision) and FFTW (double precision), a set of ~60 systems covering full FFT range was filled with random complex data on the scale of [-1,1] and one C2C transform was performed on each system. Samples 11(single), 12(double), 13(half) calculate for each value of the transformed system:
+![alt text](https://github.com/DTolm/VkFFT/blob/master/precision_results/FP64_precision.png?raw=true)
+![alt text](https://github.com/DTolm/VkFFT/blob/master/precision_results/FP32_precision.png?raw=true)
+
+Above, VkFFT precision is verified by comparing its results with FP128 version of FFTW. We test all FFT lengths from the [2, 100000] range. We perform tests in single and double precision on random input data from [-1;1] range.
+
+For both precisions, all tested libraries exhibit logarithmic error scaling. The main source of error is imprecise twiddle factor computation – sines and cosines used by FFT algorithms. For FP64 they are calculated on the CPU either in FP128 or in FP64 and stored in the lookup tables. With FP128 precomputation (left) VkFFT is more precise than cuFFT and rocFFT.
-- Max difference between cuFFT/rocFFT/VkFFT result and FFTW result
-- Average difference between cuFFT/rocFFT/VkFFT result and FFTW result
-- Max ratio of the difference between cuFFT/rocFFT/VkFFT result and FFTW result to the FFTW result
-- Average ratio of the difference between cuFFT/rocFFT/VkFFT result and FFTW result to the FFTW result
+For FP32, twiddle factors can be calculated on-the-fly in FP32 or precomputed in FP64/FP32. With FP32 twiddle factors (right) VkFFT is slightly less precise in Bluestein’s and Rader’s algorithms. If needed, this can be solved with FP64 precomputation.
-FFTW is required to launch these samples (specify in CMakeLists include and library directories). If cuFFT is disabled, only FFTW/VkFFT results are calculated.\
-The precision_cuFFT_VkFFT_FFTW.txt file contains the single precision results for Nvidia's 1660Ti GPU and AMD Ryzen 2700 CPU. On average, the results fluctuate both for cuFFT and VkFFT with no clear winner in single precision. Max ratio stays in the range of 2% for both cuFFT and VkFFT, while the average ratio stays below 1e-6.\
-The precision_cuFFT_VkFFT_FFTW_double.txt file contains the double precision results for Nvidia's 1660Ti GPU and AMD Ryzen 2700 CPU. On average, VkFFT is more precise than cuFFT in double precision (see: max_difference and max_eps columns), however, it is also ~20% slower (vkfft_benchmark_double.png). Note that double precision is still in testing and these results may change in the future. Max ratio stays in the range of 5e-10% for both cuFFT and VkFFT, while the average ratio stays below 1e-15. Overall, double precision is ~7 times slower than single on Nvidia's 1660Ti GPU.\
-The precision_cuFFT_VkFFT_FFTW_half.txt file contains the half precision results for Nvidia's 1660Ti GPU and AMD Ryzen 2700 CPU. On average, VkFFT is at least two times more precise than cuFFT in half precision (see: max_difference and max_eps columns), while being faster on average (vkfft_benchmark_half.png). Note that half precision is still in testing and is only used to store data in VkFFT. cuFFT script can probably also be improved. The average ratio stays in the range of 0.2% for both cuFFT and VkFFT. Overall, half precision of VkFFT is ~50%-100% times faster than single on Nvidia's 1660Ti GPU.
## Contact information
The initial version of VkFFT is developed by Tolmachev Dmitrii\
-E-mail 1: <dtolm96@gmail.com>
\ No newline at end of file
+E-mail 1: <dtolm96@gmail.com>
diff --git a/Vulkan_FFT.cpp b/Vulkan_FFT.cpp
index 825102e..2e1ad44 100644
--- a/Vulkan_FFT.cpp
+++ b/Vulkan_FFT.cpp
@@ -35,6 +35,19 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#ifndef NS_PRIVATE_IMPLEMENTATION
+#define NS_PRIVATE_IMPLEMENTATION
+#endif
+#ifndef CA_PRIVATE_IMPLEMENTATION
+#define CA_PRIVATE_IMPLEMENTATION
+#endif
+#ifndef MTL_PRIVATE_IMPLEMENTATION
+#define MTL_PRIVATE_IMPLEMENTATION
+#endif
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -270,192 +283,198 @@ VkFFTResult launchVkFFT(VkGPU* vkGPU, uint64_t sample_id, bool file_output, FILE
free(deviceList);
}
free(drivers);
+#elif(VKFFT_BACKEND==5)
+ NS::Array* devices = MTL::CopyAllDevices();
+ MTL::Device* device = (MTL::Device*)devices->object(vkGPU->device_id);
+ vkGPU->device = device;
+ MTL::CommandQueue* queue = device->newCommandQueue();
+ vkGPU->queue = queue;
#endif
uint64_t isCompilerInitialized = 1;
- switch (sample_id) {
- case 0:
- {
- resFFT = sample_0_benchmark_VkFFT_single(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 1:
- {
- resFFT = sample_1_benchmark_VkFFT_double(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
+ switch (sample_id) {
+ case 0:
+ {
+ resFFT = sample_0_benchmark_VkFFT_single(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 1:
+ {
+ resFFT = sample_1_benchmark_VkFFT_double(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
#if ((VKFFT_BACKEND==0)&&(VK_API_VERSION>10))
- case 2:
- {
- resFFT = sample_2_benchmark_VkFFT_half(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
+ case 2:
+ {
+ resFFT = sample_2_benchmark_VkFFT_half(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
#endif
- case 3:
- {
- resFFT = sample_3_benchmark_VkFFT_single_3d(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 4:
- {
- resFFT = sample_4_benchmark_VkFFT_single_3d_zeropadding(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 5:
- {
- resFFT = sample_5_benchmark_VkFFT_single_disableReorderFourStep(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 6:
- {
- resFFT = sample_6_benchmark_VkFFT_single_r2c(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 7:
- {
- resFFT = sample_7_benchmark_VkFFT_single_Bluestein(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 8:
- {
- resFFT = sample_8_benchmark_VkFFT_double_Bluestein(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
+ case 3:
+ {
+ resFFT = sample_3_benchmark_VkFFT_single_3d(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 4:
+ {
+ resFFT = sample_4_benchmark_VkFFT_single_3d_zeropadding(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 5:
+ {
+ resFFT = sample_5_benchmark_VkFFT_single_disableReorderFourStep(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 6:
+ {
+ resFFT = sample_6_benchmark_VkFFT_single_r2c(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 7:
+ {
+ resFFT = sample_7_benchmark_VkFFT_single_Bluestein(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 8:
+ {
+ resFFT = sample_8_benchmark_VkFFT_double_Bluestein(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
#if(VKFFT_BACKEND==0)
- case 10:
- {
- resFFT = sample_10_benchmark_VkFFT_single_multipleBuffers(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
+ case 10:
+ {
+ resFFT = sample_10_benchmark_VkFFT_single_multipleBuffers(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
#endif
#ifdef USE_FFTW
- case 11:
- {
- resFFT = sample_11_precision_VkFFT_single(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 12:
- {
- resFFT = sample_12_precision_VkFFT_double(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
+ case 11:
+ {
+ resFFT = sample_11_precision_VkFFT_single(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 12:
+ {
+ resFFT = sample_12_precision_VkFFT_double(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
#if ((VKFFT_BACKEND==0)&&(VK_API_VERSION>10))
- case 13:
- {
- resFFT = sample_13_precision_VkFFT_half(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
+ case 13:
+ {
+ resFFT = sample_13_precision_VkFFT_half(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
#endif
- case 14:
- {
- resFFT = sample_14_precision_VkFFT_single_nonPow2(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 15:
- {
- resFFT = sample_15_precision_VkFFT_single_r2c(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 16:
- {
- resFFT = sample_16_precision_VkFFT_single_dct(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 17:
- {
- resFFT = sample_17_precision_VkFFT_double_dct(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 18:
- {
- resFFT = sample_18_precision_VkFFT_double_nonPow2(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
+ case 14:
+ {
+ resFFT = sample_14_precision_VkFFT_single_nonPow2(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 15:
+ {
+ resFFT = sample_15_precision_VkFFT_single_r2c(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 16:
+ {
+ resFFT = sample_16_precision_VkFFT_single_dct(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 17:
+ {
+ resFFT = sample_17_precision_VkFFT_double_dct(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 18:
+ {
+ resFFT = sample_18_precision_VkFFT_double_nonPow2(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
#endif
- case 50:
- {
- resFFT = sample_50_convolution_VkFFT_single_1d_matrix(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 51:
- {
- resFFT = sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 52:
- {
- resFFT = sample_52_convolution_VkFFT_single_2d_batched_r2c(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 110:
- {
- resFFT = sample_100_benchmark_VkFFT_single_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 1);
- break;
- }
- case 120:
- {
- resFFT = sample_100_benchmark_VkFFT_single_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 2);
- break;
- }
- case 130:
- {
- resFFT = sample_100_benchmark_VkFFT_single_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 3);
- break;
- }
- case 140:
- {
- resFFT = sample_100_benchmark_VkFFT_single_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 4);
- break;
- }
- case 111:
- {
- resFFT = sample_101_benchmark_VkFFT_double_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 1);
- break;
- }
- case 121:
- {
- resFFT = sample_101_benchmark_VkFFT_double_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 2);
- break;
- }
- case 131:
- {
- resFFT = sample_101_benchmark_VkFFT_double_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 3);
- break;
- }
- case 141:
- {
- resFFT = sample_101_benchmark_VkFFT_double_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 4);
- break;
- }
- case 200: case 201:
- {
- resFFT = user_benchmark_VkFFT(vkGPU, file_output, output, isCompilerInitialized, userParams);
- break;
- }
+ case 50:
+ {
+ resFFT = sample_50_convolution_VkFFT_single_1d_matrix(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 51:
+ {
+ resFFT = sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 52:
+ {
+ resFFT = sample_52_convolution_VkFFT_single_2d_batched_r2c(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 110:
+ {
+ resFFT = sample_100_benchmark_VkFFT_single_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 1);
+ break;
+ }
+ case 120:
+ {
+ resFFT = sample_100_benchmark_VkFFT_single_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 2);
+ break;
+ }
+ case 130:
+ {
+ resFFT = sample_100_benchmark_VkFFT_single_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 3);
+ break;
+ }
+ case 140:
+ {
+ resFFT = sample_100_benchmark_VkFFT_single_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 4);
+ break;
+ }
+ case 111:
+ {
+ resFFT = sample_101_benchmark_VkFFT_double_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 1);
+ break;
+ }
+ case 121:
+ {
+ resFFT = sample_101_benchmark_VkFFT_double_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 2);
+ break;
+ }
+ case 131:
+ {
+ resFFT = sample_101_benchmark_VkFFT_double_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 3);
+ break;
+ }
+ case 141:
+ {
+ resFFT = sample_101_benchmark_VkFFT_double_nd_dct(vkGPU, file_output, output, isCompilerInitialized, 4);
+ break;
+ }
+ case 200: case 201:
+ {
+ resFFT = user_benchmark_VkFFT(vkGPU, file_output, output, isCompilerInitialized, userParams);
+ break;
+ }
#if ((VKFFT_BACKEND==0)&&(VK_API_VERSION>10))
- case 202:
- {
- resFFT = user_benchmark_VkFFT(vkGPU, file_output, output, isCompilerInitialized, userParams);
- break;
- }
+ case 202:
+ {
+ resFFT = user_benchmark_VkFFT(vkGPU, file_output, output, isCompilerInitialized, userParams);
+ break;
+ }
#endif
- case 1000:
- {
- resFFT = sample_1000_VkFFT_single_2_4096(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 1001:
- {
- resFFT = sample_1001_benchmark_VkFFT_double_2_4096(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- case 1003:
- {
- resFFT = sample_1003_benchmark_VkFFT_single_3d_2_512(vkGPU, file_output, output, isCompilerInitialized);
- break;
- }
- }
+ case 1000:
+ {
+ resFFT = sample_1000_VkFFT_single_2_4096(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 1001:
+ {
+ resFFT = sample_1001_benchmark_VkFFT_double_2_4096(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ case 1003:
+ {
+ resFFT = sample_1003_benchmark_VkFFT_single_3d_2_512(vkGPU, file_output, output, isCompilerInitialized);
+ break;
+ }
+ }
#if(VKFFT_BACKEND==0)
vkDestroyFence(vkGPU->device, vkGPU->fence, NULL);
vkDestroyCommandPool(vkGPU->device, vkGPU->commandPool, NULL);
@@ -475,6 +494,10 @@ VkFFTResult launchVkFFT(VkGPU* vkGPU, uint64_t sample_id, bool file_output, FILE
res = zeCommandQueueDestroy(vkGPU->commandQueue);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE;
res = zeContextDestroy(vkGPU->context);
+#elif(VKFFT_BACKEND==5)
+ vkGPU->queue->release();
+ vkGPU->device->release();
+ devices->release();
#endif
return resFFT;
@@ -510,7 +533,7 @@ int main(int argc, char* argv[])
version_decomposed[0] = version / 10000;
version_decomposed[1] = (version - version_decomposed[0] * 10000) / 100;
version_decomposed[2] = (version - version_decomposed[0] * 10000 - version_decomposed[1] * 100);
- printf("VkFFT v%d.%d.%d (03-08-2022). Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]);
+ printf("VkFFT v%d.%d.%d (25-10-2022). Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]);
#if (VKFFT_BACKEND==0)
printf("Vulkan backend\n");
#elif (VKFFT_BACKEND==1)
@@ -521,6 +544,8 @@ int main(int argc, char* argv[])
printf("OpenCL backend\n");
#elif (VKFFT_BACKEND==4)
printf("Level Zero backend\n");
+#elif (VKFFT_BACKEND==5)
+ printf("Metal backend\n");
#endif
printf(" -h: print help\n");
printf(" -devices: print the list of available device ids, used as -d argument\n");
diff --git a/benchmark_plot/fp64_cuda_a100.png b/benchmark_plot/fp64_cuda_a100.png
new file mode 100644
index 0000000..9c949a1
Binary files /dev/null and b/benchmark_plot/fp64_cuda_a100.png differ
diff --git a/benchmark_plot/fp64_hip_mi250.png b/benchmark_plot/fp64_hip_mi250.png
new file mode 100644
index 0000000..333cd0d
Binary files /dev/null and b/benchmark_plot/fp64_hip_mi250.png differ
diff --git a/benchmark_plot/VkFFT_benchmark_results.txt b/benchmark_plot/old_results/old_VkFFT_benchmark_results.txt
similarity index 100%
rename from benchmark_plot/VkFFT_benchmark_results.txt
rename to benchmark_plot/old_results/old_VkFFT_benchmark_results.txt
diff --git a/benchmark_plot/benchmark.png b/benchmark_plot/old_results/old_benchmark.png
similarity index 100%
rename from benchmark_plot/benchmark.png
rename to benchmark_plot/old_results/old_benchmark.png
diff --git a/benchmark_plot/cuFFT_benchmark_results.txt b/benchmark_plot/old_results/old_cuFFT_benchmark_results.txt
similarity index 100%
rename from benchmark_plot/cuFFT_benchmark_results.txt
rename to benchmark_plot/old_results/old_cuFFT_benchmark_results.txt
diff --git a/benchmark_plot/fp32_cuda_a100.png b/benchmark_plot/old_results/old_fp32_cuda_a100.png
similarity index 100%
rename from benchmark_plot/fp32_cuda_a100.png
rename to benchmark_plot/old_results/old_fp32_cuda_a100.png
diff --git a/benchmark_plot/fp32_hip_mi100.png b/benchmark_plot/old_results/old_fp32_hip_mi100.png
similarity index 100%
rename from benchmark_plot/fp32_hip_mi100.png
rename to benchmark_plot/old_results/old_fp32_hip_mi100.png
diff --git a/benchmark_plot/vkfft_benchmark_2d.png b/benchmark_plot/old_results/old_vkfft_benchmark_2d.png
similarity index 100%
rename from benchmark_plot/vkfft_benchmark_2d.png
rename to benchmark_plot/old_results/old_vkfft_benchmark_2d.png
diff --git a/benchmark_plot/vkfft_benchmark_3d.png b/benchmark_plot/old_results/old_vkfft_benchmark_3d.png
similarity index 100%
rename from benchmark_plot/vkfft_benchmark_3d.png
rename to benchmark_plot/old_results/old_vkfft_benchmark_3d.png
diff --git a/benchmark_plot/vkfft_benchmark_double.png b/benchmark_plot/old_results/old_vkfft_benchmark_double.png
similarity index 100%
rename from benchmark_plot/vkfft_benchmark_double.png
rename to benchmark_plot/old_results/old_vkfft_benchmark_double.png
diff --git a/benchmark_plot/vkfft_benchmark_half.png b/benchmark_plot/old_results/old_vkfft_benchmark_half.png
similarity index 100%
rename from benchmark_plot/vkfft_benchmark_half.png
rename to benchmark_plot/old_results/old_vkfft_benchmark_half.png
diff --git a/benchmark_plot/vkfft_benchmark_single.png b/benchmark_plot/old_results/old_vkfft_benchmark_single.png
similarity index 100%
rename from benchmark_plot/vkfft_benchmark_single.png
rename to benchmark_plot/old_results/old_vkfft_benchmark_single.png
diff --git a/benchmark_scripts/cuFFT_scripts/src/user_benchmark_cuFFT.cu b/benchmark_scripts/cuFFT_scripts/src/user_benchmark_cuFFT.cu
index b22a631..48bded0 100644
--- a/benchmark_scripts/cuFFT_scripts/src/user_benchmark_cuFFT.cu
+++ b/benchmark_scripts/cuFFT_scripts/src/user_benchmark_cuFFT.cu
@@ -24,7 +24,7 @@ void user_benchmark_cuFFT(bool file_output, FILE* output, cuFFTUserSystemParamet
cudaSetDevice(device_id);
const int num_runs = 3;
double benchmark_result[2] = { 0,0 };//averaged result = sum(system_size/iteration_time)/num_benchmark_samples
- uint64_t storageComplexSize;
+ uint64_t storageComplexSize=8;
switch (userParams->P) {
case 0:
storageComplexSize = (2 * sizeof(float));
@@ -35,7 +35,22 @@ void user_benchmark_cuFFT(bool file_output, FILE* output, cuFFTUserSystemParamet
case 2:
storageComplexSize = (2 * 2);
break;
+ default:
+ storageComplexSize = (2 * sizeof(float));
+ break;
}
+ uint64_t bufferSize = 0;
+ if (userParams->R2C) {
+ bufferSize = (uint64_t)(storageComplexSize / 2) * (userParams->X + 2) * userParams->Y * userParams->Z * userParams->B;
+ }
+ else {
+ bufferSize = (uint64_t)storageComplexSize * userParams->X * userParams->Y * userParams->Z * userParams->B;
+ }
+
+ float* buffer_input = (float*)malloc(bufferSize);
+ for (uint64_t i = 0; i < bufferSize/sizeof(float); i++) {
+ buffer_input[i] = (float)(2 * ((float)rand()) / RAND_MAX - 1.0);
+ }
for (int n = 0; n < 2; n++) {
double run_time[num_runs][2];
for (int r = 0; r < num_runs; r++) {
@@ -64,17 +79,14 @@ void user_benchmark_cuFFT(bool file_output, FILE* output, cuFFTUserSystemParamet
dims[2] = userParams->X;
break;
}
- uint64_t bufferSize;
- if (userParams->R2C)
- bufferSize = (uint64_t)(storageComplexSize / 2) * (userParams->X + 2) * userParams->Y * userParams->Z * userParams->B;
- else
- bufferSize = (uint64_t)storageComplexSize * userParams->X * userParams->Y * userParams->Z * userParams->B;
cudaMalloc((void**)&dataC, bufferSize);
if (cudaGetLastError() != cudaSuccess) {
fprintf(stderr, "Cuda error: Failed to allocate\n");
return;
}
+ cudaMemcpy(dataC, buffer_input, bufferSize, cudaMemcpyHostToDevice);
+
//forward + inverse
int iembed[2][3];
int istride[2] = { 1, 1 };
@@ -195,4 +207,5 @@ void user_benchmark_cuFFT(bool file_output, FILE* output, cuFFTUserSystemParamet
cudaDeviceSynchronize();
}
}
+ free(buffer_input);
}
diff --git a/benchmark_scripts/rocFFT_scripts/src/user_benchmark_rocFFT.cpp b/benchmark_scripts/rocFFT_scripts/src/user_benchmark_rocFFT.cpp
index 60b06b8..17d479b 100644
--- a/benchmark_scripts/rocFFT_scripts/src/user_benchmark_rocFFT.cpp
+++ b/benchmark_scripts/rocFFT_scripts/src/user_benchmark_rocFFT.cpp
@@ -22,7 +22,7 @@ void user_benchmark_rocFFT(bool file_output, FILE* output, rocFFTUserSystemParam
hipSetDevice(device_id);
const int num_runs = 7;
double benchmark_result[2] = { 0,0 };//averaged result = sum(system_size/iteration_time)/num_benchmark_samples
- uint64_t storageComplexSize;
+ uint64_t storageComplexSize=8;
switch (userParams->P) {
case 0:
storageComplexSize = (2 * sizeof(float));
@@ -33,7 +33,22 @@ void user_benchmark_rocFFT(bool file_output, FILE* output, rocFFTUserSystemParam
case 2:
storageComplexSize = (2 * 2);
break;
+ default:
+ storageComplexSize = (2 * sizeof(float));
+ break;
}
+ uint64_t bufferSize = 0;
+ if (userParams->R2C) {
+ bufferSize = (uint64_t)(storageComplexSize / 2) * (userParams->X + 2) * userParams->Y * userParams->Z * userParams->B;
+ }
+ else {
+ bufferSize = (uint64_t)storageComplexSize * userParams->X * userParams->Y * userParams->Z * userParams->B;
+ }
+
+ float* buffer_input = (float*)malloc(bufferSize);
+ for (uint64_t i = 0; i < bufferSize/sizeof(float); i++) {
+ buffer_input[i] = (float)(2 * ((float)rand()) / RAND_MAX - 1.0);
+ }
for (int n = 0; n < 2; n++) {
double run_time[num_runs][2];
for (int r = 0; r < num_runs; r++) {
@@ -62,19 +77,15 @@ void user_benchmark_rocFFT(bool file_output, FILE* output, rocFFTUserSystemParam
dims[2] = userParams->X;
break;
}
- uint64_t bufferSize;
- if (userParams->R2C)
- bufferSize = (uint64_t)(storageComplexSize / 2) * (userParams->X + 2) * userParams->Y * userParams->Z * userParams->B;
- else
- bufferSize = (uint64_t)storageComplexSize * userParams->X * userParams->Y * userParams->Z * userParams->B;
-
+
hipMalloc((void**)&dataC, bufferSize);
if (hipGetLastError() != hipSuccess) {
fprintf(stderr, "ROCM error: Failed to allocate\n");
return;
}
-
+ hipMemcpy(dataC, buffer_input, bufferSize, hipMemcpyHostToDevice);
+
//forward + inverse
int iembed[2][3];
int istride[2] = { 1, 1 };
@@ -196,4 +207,5 @@ void user_benchmark_rocFFT(bool file_output, FILE* output, rocFFTUserSystemParam
//hipDeviceSynchronize();
}
}
+ free(buffer_input);
}
diff --git a/benchmark_scripts/vkFFT_scripts/include/utils_VkFFT.h b/benchmark_scripts/vkFFT_scripts/include/utils_VkFFT.h
index 518331d..1387118 100644
--- a/benchmark_scripts/vkFFT_scripts/include/utils_VkFFT.h
+++ b/benchmark_scripts/vkFFT_scripts/include/utils_VkFFT.h
@@ -33,6 +33,9 @@ typedef struct {
ze_context_handle_t context;
ze_command_queue_handle_t commandQueue;
uint32_t commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ MTL::Device* device;
+ MTL::CommandQueue* queue;
#endif
uint64_t device_id;//an id of a device, reported by Vulkan device list
} VkGPU;//an example structure containing Vulkan primitives
@@ -65,10 +68,10 @@ VkResult createFence(VkGPU* vkGPU);
VkResult createCommandPool(VkGPU* vkGPU);
VkFFTResult findMemoryType(VkGPU* vkGPU, uint64_t memoryTypeBits, uint64_t memorySize, VkMemoryPropertyFlags properties, uint32_t* memoryTypeIndex);
VkFFTResult allocateBuffer(VkGPU* vkGPU, VkBuffer* buffer, VkDeviceMemory* deviceMemory, VkBufferUsageFlags usageFlags, VkMemoryPropertyFlags propertyFlags, uint64_t size);
-VkFFTResult transferDataFromCPU(VkGPU* vkGPU, void* arr, VkBuffer* buffer, uint64_t bufferSize);
-VkFFTResult transferDataToCPU(VkGPU* vkGPU, void* arr, VkBuffer* buffer, uint64_t bufferSize);
#endif
+VkFFTResult transferDataToCPU(VkGPU* vkGPU, void* cpu_arr, void* output_buffer, uint64_t bufferSize);
+VkFFTResult transferDataFromCPU(VkGPU* vkGPU, void* cpu_arr, void* input_buffer, uint64_t bufferSize);
VkFFTResult devices_list();
VkFFTResult performVulkanFFT(VkGPU* vkGPU, VkFFTApplication* app, VkFFTLaunchParams* launchParams, int inverse, uint64_t num_iter);
VkFFTResult performVulkanFFTiFFT(VkGPU* vkGPU, VkFFTApplication* app, VkFFTLaunchParams* launchParams, uint64_t num_iter, double* time_result);
-#endif
\ No newline at end of file
+#endif
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_0_benchmark_VkFFT_single.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_0_benchmark_VkFFT_single.cpp
index 1405ecd..b85dde5 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_0_benchmark_VkFFT_single.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_0_benchmark_VkFFT_single.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_0_benchmark_VkFFT_single(VkGPU* vkGPU, uint64_t file_output,
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "0 - VkFFT FFT + iFFT C2C benchmark 1D batched in single precision\n");
@@ -79,11 +84,18 @@ VkFFTResult sample_0_benchmark_VkFFT_single(VkGPU* vkGPU, uint64_t file_output,
configuration.FFTdim = 1; //FFT dimension, 1D, 2D or 3D (default 1).
configuration.size[0] = 4 * (uint64_t)pow(2, n); //Multidimensional FFT dimensions sizes (default 1). For best performance (and stability), order dimensions in descendant size order as: x>y>z.
if (n == 0) configuration.size[0] = 4096;
- configuration.numberBatches = (uint64_t)((64 * 32 * (uint64_t)pow(2, 16)) / configuration.size[0]);
+ configuration.numberBatches = (uint64_t)((64 * 32 * (uint64_t)pow(2, 16)) / configuration.size[0]);
if (configuration.numberBatches < 1) configuration.numberBatches = 1;
-
+#if(VKFFT_BACKEND!=5)
+ if (r==0) configuration.saveApplicationToString = 1;
+ if (r!=0) configuration.loadApplicationFromString = 1;
+#endif
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -96,6 +108,8 @@ VkFFTResult sample_0_benchmark_VkFFT_single(VkGPU* vkGPU, uint64_t file_output,
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//Allocate buffer for the input data.
uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * configuration.size[0] * configuration.numberBatches;
@@ -127,6 +141,10 @@ VkFFTResult sample_0_benchmark_VkFFT_single(VkGPU* vkGPU, uint64_t file_output,
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
configuration.bufferSize = &bufferSize;
@@ -144,42 +162,42 @@ VkFFTResult sample_0_benchmark_VkFFT_single(VkGPU* vkGPU, uint64_t file_output,
}
*/
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
- resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList,buffer,buffer_input,bufferSize,0,0,0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString) {
+ FILE* kernelCache;
+ uint64_t str_len;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "rb");
+ if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
+ fseek(kernelCache, 0, SEEK_END);
+ str_len = ftell(kernelCache);
+ fseek(kernelCache, 0, SEEK_SET);
+ configuration.loadApplicationString = malloc(str_len);
+ fread(configuration.loadApplicationString, str_len, 1, kernelCache);
+ fclose(kernelCache);
+ }
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
+
+ if (configuration.saveApplicationToString) {
+ FILE* kernelCache;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "wb");
+ fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
+ fclose(kernelCache);
+ }
+
//Submit FFT+iFFT.
uint64_t num_iter = (((uint64_t)3 * 4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)(((uint64_t)3 * 4096 * 1024.0 * 1024.0) / bufferSize);
#if(VKFFT_BACKEND==0)
@@ -238,6 +256,8 @@ VkFFTResult sample_0_benchmark_VkFFT_single(VkGPU* vkGPU, uint64_t file_output,
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp
index 564f238..00942e5 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_1000_VkFFT_single_2_4096(VkGPU* vkGPU, uint64_t file_output,
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "1000 - VkFFT FFT + iFFT C2C benchmark 1D batched in single precision: all supported systems from 2 to 4096\n");
@@ -92,9 +97,17 @@ VkFFTResult sample_1000_VkFFT_single_2_4096(VkGPU* vkGPU, uint64_t file_output,
if (temp != 1) break;*/
configuration.numberBatches = (uint64_t)pow(2, (uint64_t)log2((uint64_t)64 * 32 * (uint64_t)pow(2, 16) / configuration.size[0]));
if (configuration.numberBatches < 1) configuration.numberBatches = 1;
+#if(VKFFT_BACKEND!=5)
+ if (r==0) configuration.saveApplicationToString = 1;
+ if (r!=0) configuration.loadApplicationFromString = 1;
+#endif
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -107,6 +120,8 @@ VkFFTResult sample_1000_VkFFT_single_2_4096(VkGPU* vkGPU, uint64_t file_output,
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//Allocate buffer for the input data.
uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * configuration.size[0] * configuration.numberBatches;
@@ -138,6 +153,10 @@ VkFFTResult sample_1000_VkFFT_single_2_4096(VkGPU* vkGPU, uint64_t file_output,
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
configuration.bufferSize = &bufferSize;
@@ -155,42 +174,42 @@ VkFFTResult sample_1000_VkFFT_single_2_4096(VkGPU* vkGPU, uint64_t file_output,
}
*/
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
- resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, buffer_input, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString) {
+ FILE* kernelCache;
+ uint64_t str_len;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "rb");
+ if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
+ fseek(kernelCache, 0, SEEK_END);
+ str_len = ftell(kernelCache);
+ fseek(kernelCache, 0, SEEK_SET);
+ configuration.loadApplicationString = malloc(str_len);
+ fread(configuration.loadApplicationString, str_len, 1, kernelCache);
+ fclose(kernelCache);
+ }
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
+
+ if (configuration.saveApplicationToString) {
+ FILE* kernelCache;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "wb");
+ fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
+ fclose(kernelCache);
+ }
+
//Submit FFT+iFFT.
uint64_t num_iter = (((uint64_t)3 * 4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)3 * 4096 * 1024.0 * 1024.0) / bufferSize;
#if(VKFFT_BACKEND==0)
@@ -249,6 +268,8 @@ VkFFTResult sample_1000_VkFFT_single_2_4096(VkGPU* vkGPU, uint64_t file_output,
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp
index c8af6d6..4d58c00 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_1001_benchmark_VkFFT_double_2_4096(VkGPU* vkGPU, uint64_t fil
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "1001 - VkFFT FFT + iFFT C2C benchmark 1D batched in double precision: all supported systems from 2 to 4096\n");
@@ -95,9 +100,17 @@ VkFFTResult sample_1001_benchmark_VkFFT_double_2_4096(VkGPU* vkGPU, uint64_t fil
configuration.size[2] = 1;
configuration.doublePrecision = true;
-
+#if(VKFFT_BACKEND!=5)
+ if (r==0) configuration.saveApplicationToString = 1;
+ if (r!=0) configuration.loadApplicationFromString = 1;
+#endif
+
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -110,6 +123,8 @@ VkFFTResult sample_1001_benchmark_VkFFT_double_2_4096(VkGPU* vkGPU, uint64_t fil
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//Allocate buffer for the input data.
@@ -142,6 +157,10 @@ VkFFTResult sample_1001_benchmark_VkFFT_double_2_4096(VkGPU* vkGPU, uint64_t fil
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
configuration.bufferSize = &bufferSize;
@@ -157,43 +176,43 @@ VkFFTResult sample_1001_benchmark_VkFFT_double_2_4096(VkGPU* vkGPU, uint64_t fil
}
}
*/
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
- resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, buffer_input, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
//free(buffer_input);
+ if (configuration.loadApplicationFromString) {
+ FILE* kernelCache;
+ uint64_t str_len;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "rb");
+ if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
+ fseek(kernelCache, 0, SEEK_END);
+ str_len = ftell(kernelCache);
+ fseek(kernelCache, 0, SEEK_SET);
+ configuration.loadApplicationString = malloc(str_len);
+ fread(configuration.loadApplicationString, str_len, 1, kernelCache);
+ fclose(kernelCache);
+ }
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
+
+ if (configuration.saveApplicationToString) {
+ FILE* kernelCache;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "wb");
+ fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
+ fclose(kernelCache);
+ }
+
//Submit FFT+iFFT.
uint64_t num_iter = (((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize;
#if(VKFFT_BACKEND==0)
@@ -252,6 +271,8 @@ VkFFTResult sample_1001_benchmark_VkFFT_double_2_4096(VkGPU* vkGPU, uint64_t fil
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp
index 6ceeb8b..997eccd 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_1003_benchmark_VkFFT_single_3d_2_512(VkGPU* vkGPU, uint64_t f
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "1003 - VkFFT FFT + iFFT C2C multidimensional benchmark in single precision: all supported cubes from 2 to 512\n");
@@ -92,8 +97,17 @@ VkFFTResult sample_1003_benchmark_VkFFT_single_3d_2_512(VkGPU* vkGPU, uint64_t f
if (temp != 1) break;*/
configuration.size[1] = configuration.size[0];
configuration.size[2] = configuration.size[0];
+#if(VKFFT_BACKEND!=5)
+ if (r==0) configuration.saveApplicationToString = 1;
+ if (r!=0) configuration.loadApplicationFromString = 1;
+#endif
+
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -106,6 +120,8 @@ VkFFTResult sample_1003_benchmark_VkFFT_single_3d_2_512(VkGPU* vkGPU, uint64_t f
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//Allocate buffer for the input data.
uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * configuration.size[0] * configuration.size[1] * configuration.size[2];;
@@ -137,6 +153,10 @@ VkFFTResult sample_1003_benchmark_VkFFT_single_3d_2_512(VkGPU* vkGPU, uint64_t f
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
configuration.bufferSize = &bufferSize;
@@ -154,42 +174,41 @@ VkFFTResult sample_1003_benchmark_VkFFT_single_3d_2_512(VkGPU* vkGPU, uint64_t f
}
*/
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
- resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, buffer_input, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
-
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString) {
+ FILE* kernelCache;
+ uint64_t str_len;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "rb");
+ if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
+ fseek(kernelCache, 0, SEEK_END);
+ str_len = ftell(kernelCache);
+ fseek(kernelCache, 0, SEEK_SET);
+ configuration.loadApplicationString = malloc(str_len);
+ fread(configuration.loadApplicationString, str_len, 1, kernelCache);
+ fclose(kernelCache);
+ }
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
+
+ if (configuration.saveApplicationToString) {
+ FILE* kernelCache;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "wb");
+ fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
+ fclose(kernelCache);
+ }
+
//Submit FFT+iFFT.
uint64_t num_iter = (((uint64_t)3 * 4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)3 * 4096 * 1024.0 * 1024.0) / bufferSize;
#if(VKFFT_BACKEND==0)
@@ -248,6 +267,8 @@ VkFFTResult sample_1003_benchmark_VkFFT_single_3d_2_512(VkGPU* vkGPU, uint64_t f
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp
index 9be2427..f753278 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_100_benchmark_VkFFT_single_nd_dct(VkGPU* vkGPU, uint64_t file
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "1%" PRIu64 "0 - VkFFT FFT + iFFT R2R DCT-%" PRIu64 " multidimensional benchmark in single precision\n", dct_type, dct_type);
@@ -94,8 +99,16 @@ VkFFTResult sample_100_benchmark_VkFFT_single_nd_dct(VkGPU* vkGPU, uint64_t file
configuration.performDCT = dct_type;
//configuration.disableMergeSequencesR2C = 1;
//configuration.doublePrecision = 1;
+#if(VKFFT_BACKEND!=5)
+ if (r==0) configuration.saveApplicationToString = 1;
+ if (r!=0) configuration.loadApplicationFromString = 1;
+#endif
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -108,6 +121,8 @@ VkFFTResult sample_100_benchmark_VkFFT_single_nd_dct(VkGPU* vkGPU, uint64_t file
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//Allocate buffer for the input data.
uint64_t bufferSize = (uint64_t)sizeof(float) * configuration.size[0] * configuration.size[1] * configuration.size[2];;
@@ -139,6 +154,10 @@ VkFFTResult sample_100_benchmark_VkFFT_single_nd_dct(VkGPU* vkGPU, uint64_t file
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
configuration.bufferSize = &bufferSize;
@@ -154,39 +173,26 @@ VkFFTResult sample_100_benchmark_VkFFT_single_nd_dct(VkGPU* vkGPU, uint64_t file
}
}
*/
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
- resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, buffer_input, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
//free(buffer_input);
+ if (configuration.loadApplicationFromString) {
+ FILE* kernelCache;
+ uint64_t str_len;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "rb");
+ if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
+ fseek(kernelCache, 0, SEEK_END);
+ str_len = ftell(kernelCache);
+ fseek(kernelCache, 0, SEEK_SET);
+ configuration.loadApplicationString = malloc(str_len);
+ fread(configuration.loadApplicationString, str_len, 1, kernelCache);
+ fclose(kernelCache);
+ }
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app, configuration);
if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT) {
@@ -207,12 +213,29 @@ VkFFTResult sample_100_benchmark_VkFFT_single_nd_dct(VkGPU* vkGPU, uint64_t file
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
continue;
}
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
+
+ if (configuration.saveApplicationToString) {
+ FILE* kernelCache;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "wb");
+ fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
+ fclose(kernelCache);
+ }
+
//Submit FFT+iFFT.
uint64_t num_iter = (((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize;
#if(VKFFT_BACKEND==0)
@@ -270,6 +293,8 @@ VkFFTResult sample_100_benchmark_VkFFT_single_nd_dct(VkGPU* vkGPU, uint64_t file
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp
index 60990f7..2867664 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_101_benchmark_VkFFT_double_nd_dct(VkGPU* vkGPU, uint64_t file
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "1%" PRIu64 "1 - VkFFT FFT + iFFT R2R DCT-%" PRIu64 " multidimensional benchmark in double precision\n", dct_type, dct_type);
@@ -95,8 +100,16 @@ VkFFTResult sample_101_benchmark_VkFFT_double_nd_dct(VkGPU* vkGPU, uint64_t file
configuration.doublePrecision = 1;
//configuration.disableMergeSequencesR2C = 1;
//configuration.doublePrecision = 1;
+#if(VKFFT_BACKEND!=5)
+ if (r==0) configuration.saveApplicationToString = 1;
+ if (r!=0) configuration.loadApplicationFromString = 1;
+#endif
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -109,6 +122,8 @@ VkFFTResult sample_101_benchmark_VkFFT_double_nd_dct(VkGPU* vkGPU, uint64_t file
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//Allocate buffer for the input data.
uint64_t bufferSize = (uint64_t)sizeof(double) * configuration.size[0] * configuration.size[1] * configuration.size[2];;
@@ -140,6 +155,10 @@ VkFFTResult sample_101_benchmark_VkFFT_double_nd_dct(VkGPU* vkGPU, uint64_t file
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
configuration.bufferSize = &bufferSize;
@@ -155,39 +174,26 @@ VkFFTResult sample_101_benchmark_VkFFT_double_nd_dct(VkGPU* vkGPU, uint64_t file
}
}
*/
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
- resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, buffer_input, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
//free(buffer_input);
+ if (configuration.loadApplicationFromString) {
+ FILE* kernelCache;
+ uint64_t str_len;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "rb");
+ if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
+ fseek(kernelCache, 0, SEEK_END);
+ str_len = ftell(kernelCache);
+ fseek(kernelCache, 0, SEEK_SET);
+ configuration.loadApplicationString = malloc(str_len);
+ fread(configuration.loadApplicationString, str_len, 1, kernelCache);
+ fclose(kernelCache);
+ }
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app, configuration);
if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT) {
@@ -208,12 +214,29 @@ VkFFTResult sample_101_benchmark_VkFFT_double_nd_dct(VkGPU* vkGPU, uint64_t file
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
continue;
}
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
+
+ if (configuration.saveApplicationToString) {
+ FILE* kernelCache;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "wb");
+ fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
+ fclose(kernelCache);
+ }
+
//Submit FFT+iFFT.
uint64_t num_iter = (((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize;
#if(VKFFT_BACKEND==0)
@@ -271,6 +294,8 @@ VkFFTResult sample_101_benchmark_VkFFT_double_nd_dct(VkGPU* vkGPU, uint64_t file
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp
index 0e29cf7..610ec24 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_10_benchmark_VkFFT_single_multipleBuffers(VkGPU* vkGPU, uint6
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
#if(VKFFT_BACKEND==0)
if (file_output)
@@ -85,8 +90,16 @@ VkFFTResult sample_10_benchmark_VkFFT_single_multipleBuffers(VkGPU* vkGPU, uint6
//configuration.numberBatches = (configuration.numberBatches > 32768) ? 32768 : configuration.numberBatches;
uint64_t numBuf = 4;
+#if(VKFFT_BACKEND!=5)
+ if (r==0) configuration.saveApplicationToString = 1;
+ if (r!=0) configuration.loadApplicationFromString = 1;
+#endif
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
configuration.commandPool = &vkGPU->commandPool;
@@ -141,7 +154,7 @@ VkFFTResult sample_10_benchmark_VkFFT_single_multipleBuffers(VkGPU* vkGPU, uint6
}
}
*/
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
uint64_t shift = 0;
for (uint64_t i = 0; i < numBuf; i++) {
resFFT = transferDataFromCPU(vkGPU, (buffer_input + shift / sizeof(float)), &buffer[i], bufferSize[i]);
@@ -151,10 +164,38 @@ VkFFTResult sample_10_benchmark_VkFFT_single_multipleBuffers(VkGPU* vkGPU, uint6
//free(buffer_input);
+ if (configuration.loadApplicationFromString) {
+ FILE* kernelCache;
+ uint64_t str_len;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "rb");
+ if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
+ fseek(kernelCache, 0, SEEK_END);
+ str_len = ftell(kernelCache);
+ fseek(kernelCache, 0, SEEK_SET);
+ configuration.loadApplicationString = malloc(str_len);
+ fread(configuration.loadApplicationString, str_len, 1, kernelCache);
+ fclose(kernelCache);
+ }
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
+
+ if (configuration.saveApplicationToString) {
+ FILE* kernelCache;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "wb");
+ fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
+ fclose(kernelCache);
+ }
+
//Submit FFT+iFFT.
uint64_t num_iter = (((uint64_t)4096 * 1024.0 * 1024.0) / (numBuf * bufferSize[0]) > 1000) ? 1000 : (uint64_t)((uint64_t)4096 * 1024.0 * 1024.0) / (numBuf * bufferSize[0]);
if (vkGPU->physicalDeviceProperties.vendorID == 0x8086) num_iter /= 4;
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp
index 87c5125..59e8ec9 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -62,6 +66,7 @@ VkFFTResult sample_11_precision_VkFFT_single(VkGPU* vkGPU, uint64_t file_output,
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "11 - VkFFT/FFTW C2C precision test in single precision\n");
@@ -149,7 +154,11 @@ VkFFTResult sample_11_precision_VkFFT_single(VkGPU* vkGPU, uint64_t file_output,
configuration.size[2] = benchmark_dimensions[n][2];
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -162,6 +171,8 @@ VkFFTResult sample_11_precision_VkFFT_single(VkGPU* vkGPU, uint64_t file_output,
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
uint64_t numBuf = 1;
@@ -186,6 +197,8 @@ VkFFTResult sample_11_precision_VkFFT_single(VkGPU* vkGPU, uint64_t file_output,
cl_mem buffer = 0;
#elif(VKFFT_BACKEND==4)
void* buffer = 0;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
#endif
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
@@ -207,6 +220,8 @@ VkFFTResult sample_11_precision_VkFFT_single(VkGPU* vkGPU, uint64_t file_output,
device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize[i], sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
+#elif(VKFFT_BACKEND==5)
+ buffer = vkGPU->device->newBuffer(bufferSize[i], MTL::ResourceStorageModePrivate);
#endif
}
@@ -222,38 +237,15 @@ VkFFTResult sample_11_precision_VkFFT_single(VkGPU* vkGPU, uint64_t file_output,
*/ //Can specify buffers at launch
configuration.bufferSize = bufferSize;
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
uint64_t shift = 0;
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(fftwf_complex)), &buffer[i], bufferSize[i]);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, inputC, bufferSize[i], cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, inputC, bufferSize[i], hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize[i], inputC, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, inputC, bufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+#else
+ resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(fftwf_complex)), &buffer, bufferSize[i]);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
#endif
shift += bufferSize[i];
}
@@ -275,6 +267,8 @@ VkFFTResult sample_11_precision_VkFFT_single(VkGPU* vkGPU, uint64_t file_output,
launchParams.buffer = &buffer;
#elif(VKFFT_BACKEND==4)
launchParams.buffer = (void**)&buffer;
+#elif(VKFFT_BACKEND==5)
+ launchParams.buffer = &buffer;
#endif
resFFT = performVulkanFFT(vkGPU, &app, &launchParams, -1, num_iter);
if (resFFT != VKFFT_SUCCESS) return resFFT;
@@ -286,32 +280,9 @@ VkFFTResult sample_11_precision_VkFFT_single(VkGPU* vkGPU, uint64_t file_output,
#if(VKFFT_BACKEND==0)
resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(fftwf_complex)), &buffer[i], bufferSize[i]);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(output_VkFFT, buffer, bufferSize[i], cudaMemcpyDeviceToHost);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(output_VkFFT, buffer, bufferSize[i], hipMemcpyDeviceToHost);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueReadBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize[i], output_VkFFT, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, output_VkFFT, buffer, bufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+#else
+ resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(fftwf_complex)), &buffer, bufferSize[i]);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
#endif
shift += bufferSize[i];
}
@@ -384,6 +355,8 @@ VkFFTResult sample_11_precision_VkFFT_single(VkGPU* vkGPU, uint64_t file_output,
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
}
#if(VKFFT_BACKEND==0)
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp
index ea5d00d..a8c53fc 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -62,6 +66,7 @@ VkFFTResult sample_12_precision_VkFFT_double(VkGPU* vkGPU, uint64_t file_output,
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "12 - VkFFT/FFTW C2C precision test in double precision\n");
@@ -148,7 +153,11 @@ VkFFTResult sample_12_precision_VkFFT_double(VkGPU* vkGPU, uint64_t file_output,
configuration.size[2] = benchmark_dimensions[n][2];
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue;
configuration.fence = &vkGPU->fence;
@@ -161,6 +170,8 @@ VkFFTResult sample_12_precision_VkFFT_double(VkGPU* vkGPU, uint64_t file_output,
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
configuration.doublePrecision = true;
@@ -186,6 +197,8 @@ VkFFTResult sample_12_precision_VkFFT_double(VkGPU* vkGPU, uint64_t file_output,
cl_mem buffer = 0;
#elif(VKFFT_BACKEND==4)
void* buffer = 0;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
#endif
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
@@ -207,6 +220,8 @@ VkFFTResult sample_12_precision_VkFFT_double(VkGPU* vkGPU, uint64_t file_output,
device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize[i], sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
+#elif(VKFFT_BACKEND==5)
+ buffer = vkGPU->device->newBuffer(bufferSize[i], MTL::ResourceStorageModePrivate);
#endif
}
@@ -222,38 +237,15 @@ VkFFTResult sample_12_precision_VkFFT_double(VkGPU* vkGPU, uint64_t file_output,
*/ // Can specify buffers at launch
configuration.bufferSize = bufferSize;
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
uint64_t shift = 0;
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(fftw_complex)), &buffer[i], bufferSize[i]);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, inputC, bufferSize[i], cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, inputC, bufferSize[i], hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize[i], inputC, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, inputC, bufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+#else
+ resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(fftw_complex)), &buffer, bufferSize[i]);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
#endif
shift += bufferSize[i];
}
@@ -274,6 +266,8 @@ VkFFTResult sample_12_precision_VkFFT_double(VkGPU* vkGPU, uint64_t file_output,
launchParams.buffer = &buffer;
#elif(VKFFT_BACKEND==4)
launchParams.buffer = (void**)&buffer;
+#elif(VKFFT_BACKEND==5)
+ launchParams.buffer = &buffer;
#endif
resFFT = performVulkanFFT(vkGPU, &app, &launchParams, -1, num_iter);
if (resFFT != VKFFT_SUCCESS) return resFFT;
@@ -285,32 +279,9 @@ VkFFTResult sample_12_precision_VkFFT_double(VkGPU* vkGPU, uint64_t file_output,
#if(VKFFT_BACKEND==0)
resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(fftw_complex)), &buffer[i], bufferSize[i]);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(output_VkFFT, buffer, bufferSize[i], cudaMemcpyDeviceToHost);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(output_VkFFT, buffer, bufferSize[i], hipMemcpyDeviceToHost);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueReadBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize[i], output_VkFFT, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, output_VkFFT, buffer, bufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+#else
+ resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(fftw_complex)), &buffer, bufferSize[i]);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
#endif
shift += bufferSize[i];
}
@@ -384,6 +355,8 @@ VkFFTResult sample_12_precision_VkFFT_double(VkGPU* vkGPU, uint64_t file_output,
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
}
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_13_precision_VkFFT_half.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_13_precision_VkFFT_half.cpp
index 62bf623..8879fc8 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_13_precision_VkFFT_half.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_13_precision_VkFFT_half.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "half.hpp"
@@ -64,6 +68,7 @@ VkFFTResult sample_13_precision_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, F
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "13 - VkFFT/FFTW C2C precision test in half precision\n");
@@ -148,7 +153,11 @@ VkFFTResult sample_13_precision_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, F
configuration.halfPrecision = true;
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -161,6 +170,8 @@ VkFFTResult sample_13_precision_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, F
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
uint64_t numBuf = 1;
@@ -185,6 +196,8 @@ VkFFTResult sample_13_precision_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, F
cl_mem buffer = 0;
#elif(VKFFT_BACKEND==4)
void* buffer = 0;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
#endif
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
@@ -206,6 +219,8 @@ VkFFTResult sample_13_precision_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, F
device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize[i], sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
+#elif(VKFFT_BACKEND==5)
+ buffer = vkGPU->device->newBuffer(bufferSize[i], MTL::ResourceStorageModePrivate);
#endif
}
@@ -221,38 +236,15 @@ VkFFTResult sample_13_precision_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, F
*/ // Can specify buffers at launch
configuration.bufferSize = bufferSize;
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
uint64_t shift = 0;
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
resFFT = transferDataFromCPU(vkGPU, (inputC + shift / 2 / sizeof(half)), &buffer[i], bufferSize[i]);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, inputC, bufferSize[i], cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, inputC, bufferSize[i], hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize[i], inputC, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, inputC, bufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+#else
+ resFFT = transferDataFromCPU(vkGPU, (inputC + shift / 2 / sizeof(half)), &buffer, bufferSize[i]);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
#endif
shift += bufferSize[i];
}
@@ -273,6 +265,8 @@ VkFFTResult sample_13_precision_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, F
launchParams.buffer = &buffer;
#elif(VKFFT_BACKEND==4)
launchParams.buffer = (void**)&buffer;
+#else
+ launchParams.buffer = &buffer;
#endif
resFFT = performVulkanFFT(vkGPU, &app, &launchParams, -1, num_iter);
if (resFFT != VKFFT_SUCCESS) return resFFT;
@@ -284,32 +278,9 @@ VkFFTResult sample_13_precision_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, F
#if(VKFFT_BACKEND==0)
resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / 2 / sizeof(half)), &buffer[i], bufferSize[i]);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(output_VkFFT, buffer, bufferSize[i], cudaMemcpyDeviceToHost);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(output_VkFFT, buffer, bufferSize[i], hipMemcpyDeviceToHost);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueReadBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize[i], output_VkFFT, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, output_VkFFT, buffer, bufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+#else
+ resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / 2 / sizeof(half)), &buffer, bufferSize[i]);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
#endif
shift += bufferSize[i];
}
@@ -377,6 +348,8 @@ VkFFTResult sample_13_precision_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, F
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
}
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_14_precision_VkFFT_single_nonPow2.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_14_precision_VkFFT_single_nonPow2.cpp
index 1ead647..91d035a 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_14_precision_VkFFT_single_nonPow2.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_14_precision_VkFFT_single_nonPow2.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -62,6 +66,7 @@ VkFFTResult sample_14_precision_VkFFT_single_nonPow2(VkGPU* vkGPU, uint64_t file
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "14 - VkFFT/FFTW C2C radix 3/5/7/11/13/Bluestein precision test in single precision\n");
@@ -160,7 +165,11 @@ VkFFTResult sample_14_precision_VkFFT_single_nonPow2(VkGPU* vkGPU, uint64_t file
//configuration.keepShaderCode = 1;
//configuration.disableReorderFourStep = 1;
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -173,6 +182,8 @@ VkFFTResult sample_14_precision_VkFFT_single_nonPow2(VkGPU* vkGPU, uint64_t file
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
uint64_t numBuf = 1;
@@ -197,6 +208,8 @@ VkFFTResult sample_14_precision_VkFFT_single_nonPow2(VkGPU* vkGPU, uint64_t file
cl_mem buffer = 0;
#elif(VKFFT_BACKEND==4)
void* buffer = 0;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
#endif
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
@@ -218,6 +231,8 @@ VkFFTResult sample_14_precision_VkFFT_single_nonPow2(VkGPU* vkGPU, uint64_t file
device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize[i], sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
+#elif(VKFFT_BACKEND==5)
+ buffer = vkGPU->device->newBuffer(bufferSize[i], MTL::ResourceStorageModePrivate);
#endif
}
@@ -233,39 +248,15 @@ VkFFTResult sample_14_precision_VkFFT_single_nonPow2(VkGPU* vkGPU, uint64_t file
*/ // Can specify buffers at launch
configuration.bufferSize = bufferSize;
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
uint64_t shift = 0;
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(fftwf_complex)), &buffer[i], bufferSize[i]);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, inputC, bufferSize[i], cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, inputC, bufferSize[i], hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize[i], inputC, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, inputC, bufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+#else
+ resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(fftwf_complex)), &buffer, bufferSize[i]);
#endif
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
shift += bufferSize[i];
}
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
@@ -285,6 +276,8 @@ VkFFTResult sample_14_precision_VkFFT_single_nonPow2(VkGPU* vkGPU, uint64_t file
launchParams.buffer = &buffer;
#elif(VKFFT_BACKEND==4)
launchParams.buffer = (void**)&buffer;
+#elif(VKFFT_BACKEND==5)
+ launchParams.buffer = &buffer;
#endif
resFFT = performVulkanFFT(vkGPU, &app, &launchParams, -1, num_iter);
if (resFFT != VKFFT_SUCCESS) return resFFT;
@@ -295,34 +288,10 @@ VkFFTResult sample_14_precision_VkFFT_single_nonPow2(VkGPU* vkGPU, uint64_t file
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(fftwf_complex)), &buffer[i], sizeof(fftwf_complex) * dims[0] * dims[1] * dims[2]);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(output_VkFFT, buffer, bufferSize[i], cudaMemcpyDeviceToHost);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(output_VkFFT, buffer, bufferSize[i], hipMemcpyDeviceToHost);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueReadBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize[i], output_VkFFT, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, output_VkFFT, buffer, bufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+#else
+ resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(fftwf_complex)), &buffer, sizeof(fftwf_complex) * dims[0] * dims[1] * dims[2]);
#endif
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
shift += bufferSize[i];
}
double avg_difference[2] = { 0,0 };
@@ -396,6 +365,8 @@ VkFFTResult sample_14_precision_VkFFT_single_nonPow2(VkGPU* vkGPU, uint64_t file
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
}
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_15_precision_VkFFT_single_r2c.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_15_precision_VkFFT_single_r2c.cpp
index d10af1b..40bf5c2 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_15_precision_VkFFT_single_r2c.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_15_precision_VkFFT_single_r2c.cpp
@@ -40,6 +40,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -63,6 +67,7 @@ VkFFTResult sample_15_precision_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_out
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "15 - VkFFT / FFTW R2C+C2R precision test in single precision\n");
@@ -178,7 +183,11 @@ VkFFTResult sample_15_precision_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_out
//configuration.coalescedMemory = 64;
//configuration.useLUT = 1;
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -191,6 +200,8 @@ VkFFTResult sample_15_precision_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_out
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
uint64_t numBuf = 1;
@@ -228,6 +239,9 @@ VkFFTResult sample_15_precision_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_out
#elif(VKFFT_BACKEND==4)
void* ibuffer = 0;
void* buffer = 0;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* ibuffer = 0;
+ MTL::Buffer* buffer = 0;
#endif
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
@@ -261,6 +275,9 @@ VkFFTResult sample_15_precision_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_out
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize[i], sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
+#elif(VKFFT_BACKEND==5)
+ ibuffer = vkGPU->device->newBuffer(inputBufferSize[i], MTL::ResourceStorageModePrivate);
+ buffer = vkGPU->device->newBuffer(bufferSize[i], MTL::ResourceStorageModePrivate);
#endif
}
configuration.inputBufferNum = numBuf;
@@ -273,38 +290,15 @@ VkFFTResult sample_15_precision_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_out
configuration.inputBufferSize = inputBufferSize;
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
uint64_t shift = 0;
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(fftwf_complex)), &ibuffer[i], inputBufferSize[i]);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(ibuffer, inputC, inputBufferSize[i], cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(ibuffer, inputC, inputBufferSize[i], hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, ibuffer, CL_TRUE, 0, inputBufferSize[i], inputC, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, ibuffer, inputC, inputBufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+#else
+ resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(fftwf_complex)), &ibuffer, inputBufferSize[i]);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
#endif
shift += inputBufferSize[i];
}
@@ -331,6 +325,9 @@ VkFFTResult sample_15_precision_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_out
#elif(VKFFT_BACKEND==4)
launchParams.inputBuffer = (void**)&ibuffer;
launchParams.buffer = (void**)&buffer;
+#elif(VKFFT_BACKEND==5)
+ launchParams.inputBuffer = &ibuffer;
+ launchParams.buffer = &buffer;
#endif
resFFT = performVulkanFFT(vkGPU, &app, &launchParams, -1, num_iter);
if (resFFT != VKFFT_SUCCESS) return resFFT;
@@ -350,6 +347,9 @@ VkFFTResult sample_15_precision_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_out
#elif(VKFFT_BACKEND==4)
launchParams2.inputBuffer = (void**)&ibuffer;
launchParams2.buffer = (void**)&buffer;
+#elif(VKFFT_BACKEND==5)
+ launchParams2.inputBuffer = &ibuffer;
+ launchParams2.buffer = &buffer;
#endif
resFFT = performVulkanFFT(vkGPU, &app, &launchParams2, 1, num_iter);
if (resFFT != VKFFT_SUCCESS) return resFFT;
@@ -363,34 +363,10 @@ VkFFTResult sample_15_precision_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_out
//resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(fftwf_complex)), &buffer[i], bufferSize[i]);
resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(fftwf_complex)), &ibuffer[i], inputBufferSize[i]);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- //res = cudaMemcpy(output_VkFFT, buffer, bufferSize[i], cudaMemcpyDeviceToHost);
- res = cudaMemcpy(output_VkFFT, ibuffer, inputBufferSize[i], cudaMemcpyDeviceToHost);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- //res = hipMemcpy(output_VkFFT, buffer, bufferSize[i], hipMemcpyDeviceToHost);
- res = hipMemcpy(output_VkFFT, ibuffer, inputBufferSize[i], hipMemcpyDeviceToHost);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueReadBuffer(vkGPU->commandQueue, ibuffer, CL_TRUE, 0, inputBufferSize[i], output_VkFFT, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, output_VkFFT, ibuffer, inputBufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+#else
+ //resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(fftwf_complex)), &buffer, bufferSize[i]);
+ resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(fftwf_complex)), &ibuffer, inputBufferSize[i]);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
#endif
shift += inputBufferSize[i];
}
@@ -470,6 +446,9 @@ VkFFTResult sample_15_precision_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_out
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, ibuffer);
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ ibuffer->release();
+ buffer->release();
#endif
}
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp
index 3872c9e..8eaf573 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -62,6 +66,7 @@ VkFFTResult sample_16_precision_VkFFT_single_dct(VkGPU* vkGPU, uint64_t file_out
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "16 - VkFFT/FFTW R2R DCT-I, II, III and IV precision test in single precision\n");
@@ -158,7 +163,11 @@ VkFFTResult sample_16_precision_VkFFT_single_dct(VkGPU* vkGPU, uint64_t file_out
//configuration.useLUT = 1;
//configuration.disableMergeSequencesR2C = 1;
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -171,6 +180,8 @@ VkFFTResult sample_16_precision_VkFFT_single_dct(VkGPU* vkGPU, uint64_t file_out
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
uint64_t numBuf = 1;
@@ -195,6 +206,8 @@ VkFFTResult sample_16_precision_VkFFT_single_dct(VkGPU* vkGPU, uint64_t file_out
cl_mem buffer = 0;
#elif(VKFFT_BACKEND==4)
void* buffer = 0;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
#endif
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
@@ -216,6 +229,8 @@ VkFFTResult sample_16_precision_VkFFT_single_dct(VkGPU* vkGPU, uint64_t file_out
device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize[i], sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
+#elif(VKFFT_BACKEND==5)
+ buffer = vkGPU->device->newBuffer(bufferSize[i], MTL::ResourceStorageModePrivate);
#endif
}
@@ -231,38 +246,15 @@ VkFFTResult sample_16_precision_VkFFT_single_dct(VkGPU* vkGPU, uint64_t file_out
*/ // Can specify buffers at launch
configuration.bufferSize = bufferSize;
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
uint64_t shift = 0;
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(float)), &buffer[i], bufferSize[i]);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, inputC, bufferSize[i], cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, inputC, bufferSize[i], hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize[i], inputC, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, inputC, bufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+#else
+ resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(float)), &buffer, bufferSize[i]);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
#endif
shift += bufferSize[i];
}
@@ -285,6 +277,8 @@ VkFFTResult sample_16_precision_VkFFT_single_dct(VkGPU* vkGPU, uint64_t file_out
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
}
@@ -315,6 +309,8 @@ VkFFTResult sample_16_precision_VkFFT_single_dct(VkGPU* vkGPU, uint64_t file_out
launchParams.buffer = &buffer;
#elif(VKFFT_BACKEND==4)
launchParams.buffer = (void**)&buffer;
+#elif(VKFFT_BACKEND==5)
+ launchParams.buffer = &buffer;
#endif
resFFT = performVulkanFFT(vkGPU, &app, &launchParams, -1, num_iter);
if (resFFT != VKFFT_SUCCESS) return resFFT;
@@ -326,32 +322,9 @@ VkFFTResult sample_16_precision_VkFFT_single_dct(VkGPU* vkGPU, uint64_t file_out
#if(VKFFT_BACKEND==0)
resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(float)), &buffer[i], bufferSize[i]);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(output_VkFFT, buffer, bufferSize[i], cudaMemcpyDeviceToHost);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(output_VkFFT, buffer, bufferSize[i], hipMemcpyDeviceToHost);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueReadBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize[i], output_VkFFT, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, output_VkFFT, buffer, bufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+#else
+ resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(float)), &buffer, bufferSize[i]);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
#endif
shift += bufferSize[i];
}
@@ -402,6 +375,8 @@ VkFFTResult sample_16_precision_VkFFT_single_dct(VkGPU* vkGPU, uint64_t file_out
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
}
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp
index f85102a..dd683b9 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -62,6 +66,7 @@ VkFFTResult sample_17_precision_VkFFT_double_dct(VkGPU* vkGPU, uint64_t file_out
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "17 - VkFFT/FFTW R2R DCT-I, II, III and IV precision test in double precision\n");
@@ -156,7 +161,11 @@ VkFFTResult sample_17_precision_VkFFT_double_dct(VkGPU* vkGPU, uint64_t file_out
//configuration.useLUT = 1;
//configuration.disableMergeSequencesR2C = 1;
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+ #if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -169,6 +178,8 @@ VkFFTResult sample_17_precision_VkFFT_double_dct(VkGPU* vkGPU, uint64_t file_out
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
uint64_t numBuf = 1;
@@ -192,7 +203,9 @@ VkFFTResult sample_17_precision_VkFFT_double_dct(VkGPU* vkGPU, uint64_t file_out
#elif(VKFFT_BACKEND==3)
cl_mem buffer = 0;
#elif(VKFFT_BACKEND==4)
- void* buffer = 0;
+ void* buffer = 0;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
#endif
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
@@ -214,6 +227,8 @@ VkFFTResult sample_17_precision_VkFFT_double_dct(VkGPU* vkGPU, uint64_t file_out
device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize[i], sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
+#elif(VKFFT_BACKEND==5)
+ buffer = vkGPU->device->newBuffer(bufferSize[i], MTL::ResourceStorageModePrivate);
#endif
}
@@ -229,21 +244,15 @@ VkFFTResult sample_17_precision_VkFFT_double_dct(VkGPU* vkGPU, uint64_t file_out
*/ // Can specify buffers at launch
configuration.bufferSize = bufferSize;
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
uint64_t shift = 0;
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(double)), &buffer[i], bufferSize[i]);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, inputC, bufferSize[i], cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, inputC, bufferSize[i], hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize[i], inputC, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
+#else
+ resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(double)), &buffer, bufferSize[i]);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
#endif
shift += bufferSize[i];
}
@@ -265,22 +274,9 @@ VkFFTResult sample_17_precision_VkFFT_double_dct(VkGPU* vkGPU, uint64_t file_out
#elif(VKFFT_BACKEND==3)
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, inputC, bufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+ zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
}
@@ -312,6 +308,8 @@ VkFFTResult sample_17_precision_VkFFT_double_dct(VkGPU* vkGPU, uint64_t file_out
launchParams.buffer = &buffer;
#elif(VKFFT_BACKEND==4)
launchParams.buffer = (void**)&buffer;
+#elif(VKFFT_BACKEND==5)
+ launchParams.buffer = &buffer;
#endif
resFFT = performVulkanFFT(vkGPU, &app, &launchParams, -1, num_iter);
if (resFFT != VKFFT_SUCCESS) return resFFT;
@@ -323,32 +321,9 @@ VkFFTResult sample_17_precision_VkFFT_double_dct(VkGPU* vkGPU, uint64_t file_out
#if(VKFFT_BACKEND==0)
resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(double)), &buffer[i], bufferSize[i]);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(output_VkFFT, buffer, bufferSize[i], cudaMemcpyDeviceToHost);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(output_VkFFT, buffer, bufferSize[i], hipMemcpyDeviceToHost);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueReadBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize[i], output_VkFFT, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, ©CommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, output_VkFFT, buffer, bufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+#else
+ resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(double)), &buffer, bufferSize[i]);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
#endif
shift += bufferSize[i];
}
@@ -399,6 +374,8 @@ VkFFTResult sample_17_precision_VkFFT_double_dct(VkGPU* vkGPU, uint64_t file_out
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
}
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_18_precision_VkFFT_double_nonPow2.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_18_precision_VkFFT_double_nonPow2.cpp
index 27c4813..ab11954 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_18_precision_VkFFT_double_nonPow2.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_18_precision_VkFFT_double_nonPow2.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -62,6 +66,7 @@ VkFFTResult sample_18_precision_VkFFT_double_nonPow2(VkGPU* vkGPU, uint64_t file
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "18 - VkFFT/FFTW C2C radix 3/5/7/11/13/Bluestein precision test in double precision\n");
@@ -160,7 +165,11 @@ VkFFTResult sample_18_precision_VkFFT_double_nonPow2(VkGPU* vkGPU, uint64_t file
//configuration.disableReorderFourStep = 1;
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -173,6 +182,8 @@ VkFFTResult sample_18_precision_VkFFT_double_nonPow2(VkGPU* vkGPU, uint64_t file
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
uint64_t numBuf = 1;
@@ -197,6 +208,8 @@ VkFFTResult sample_18_precision_VkFFT_double_nonPow2(VkGPU* vkGPU, uint64_t file
cl_mem buffer = 0;
#elif(VKFFT_BACKEND==4)
void* buffer = 0;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
#endif
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
@@ -218,6 +231,8 @@ VkFFTResult sample_18_precision_VkFFT_double_nonPow2(VkGPU* vkGPU, uint64_t file
device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize[i], sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
+#elif(VKFFT_BACKEND==5)
+ buffer = vkGPU->device->newBuffer(bufferSize[i], MTL::ResourceStorageModePrivate);
#endif
}
@@ -233,38 +248,15 @@ VkFFTResult sample_18_precision_VkFFT_double_nonPow2(VkGPU* vkGPU, uint64_t file
*/ // Can specify buffers at launch
configuration.bufferSize = bufferSize;
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
uint64_t shift = 0;
for (uint64_t i = 0; i < numBuf; i++) {
#if(VKFFT_BACKEND==0)
resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(fftw_complex)), &buffer[i], bufferSize[i]);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, inputC, bufferSize[i], cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, inputC, bufferSize[i], hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize[i], inputC, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, inputC, bufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+#else
+ resFFT = transferDataFromCPU(vkGPU, (inputC + shift / sizeof(fftw_complex)), &buffer, bufferSize[i]);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
#endif
shift += bufferSize[i];
}
@@ -285,6 +277,8 @@ VkFFTResult sample_18_precision_VkFFT_double_nonPow2(VkGPU* vkGPU, uint64_t file
launchParams.buffer = &buffer;
#elif(VKFFT_BACKEND==4)
launchParams.buffer = (void**)&buffer;
+#elif(VKFFT_BACKEND==5)
+ launchParams.buffer = &buffer;
#endif
resFFT = performVulkanFFT(vkGPU, &app, &launchParams, -1, num_iter);
if (resFFT != VKFFT_SUCCESS) return resFFT;
@@ -296,32 +290,9 @@ VkFFTResult sample_18_precision_VkFFT_double_nonPow2(VkGPU* vkGPU, uint64_t file
#if(VKFFT_BACKEND==0)
resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(fftw_complex)), &buffer[i], sizeof(fftw_complex) * dims[0] * dims[1] * dims[2]);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(output_VkFFT, buffer, bufferSize[i], cudaMemcpyDeviceToHost);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(output_VkFFT, buffer, bufferSize[i], hipMemcpyDeviceToHost);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueReadBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize[i], output_VkFFT, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, output_VkFFT, buffer, bufferSize[i], 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+#else
+ resFFT = transferDataToCPU(vkGPU, (output_VkFFT + shift / sizeof(fftw_complex)), &buffer, sizeof(fftw_complex) * dims[0] * dims[1] * dims[2]);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
#endif
shift += bufferSize[i];
}
@@ -396,6 +367,8 @@ VkFFTResult sample_18_precision_VkFFT_double_nonPow2(VkGPU* vkGPU, uint64_t file
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
}
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_1_benchmark_VkFFT_double.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_1_benchmark_VkFFT_double.cpp
index 4757d70..d23999f 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_1_benchmark_VkFFT_double.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_1_benchmark_VkFFT_double.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_1_benchmark_VkFFT_double(VkGPU* vkGPU, uint64_t file_output,
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "1 - VkFFT FFT + iFFT C2C benchmark 1D batched in double precision LUT\n");
@@ -86,8 +91,16 @@ VkFFTResult sample_1_benchmark_VkFFT_double(VkGPU* vkGPU, uint64_t file_output,
configuration.doublePrecision = true;
+#if(VKFFT_BACKEND!=5)
+ if (r==0) configuration.saveApplicationToString = 1;
+ if (r!=0) configuration.loadApplicationFromString = 1;
+#endif
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -100,6 +113,8 @@ VkFFTResult sample_1_benchmark_VkFFT_double(VkGPU* vkGPU, uint64_t file_output,
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//Allocate buffer for the input data.
@@ -132,6 +147,10 @@ VkFFTResult sample_1_benchmark_VkFFT_double(VkGPU* vkGPU, uint64_t file_output,
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
configuration.bufferSize = &bufferSize;
@@ -147,43 +166,43 @@ VkFFTResult sample_1_benchmark_VkFFT_double(VkGPU* vkGPU, uint64_t file_output,
}
}
*/
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
- resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, buffer_input, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
//free(buffer_input);
+ if (configuration.loadApplicationFromString) {
+ FILE* kernelCache;
+ uint64_t str_len;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "rb");
+ if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
+ fseek(kernelCache, 0, SEEK_END);
+ str_len = ftell(kernelCache);
+ fseek(kernelCache, 0, SEEK_SET);
+ configuration.loadApplicationString = malloc(str_len);
+ fread(configuration.loadApplicationString, str_len, 1, kernelCache);
+ fclose(kernelCache);
+ }
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
+
+ if (configuration.saveApplicationToString) {
+ FILE* kernelCache;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "wb");
+ fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
+ fclose(kernelCache);
+ }
+
//Submit FFT+iFFT.
uint64_t num_iter = (((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize;
#if(VKFFT_BACKEND==0)
@@ -241,6 +260,8 @@ VkFFTResult sample_1_benchmark_VkFFT_double(VkGPU* vkGPU, uint64_t file_output,
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_2_benchmark_VkFFT_half.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_2_benchmark_VkFFT_half.cpp
index 0a1d2b2..a3bd5c0 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_2_benchmark_VkFFT_half.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_2_benchmark_VkFFT_half.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "half.hpp"
@@ -59,6 +63,7 @@ VkFFTResult sample_2_benchmark_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, FI
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "2 - VkFFT FFT + iFFT C2C benchmark 1D batched in half precision\n");
@@ -115,9 +120,17 @@ VkFFTResult sample_2_benchmark_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, FI
else
configuration.coalescedMemory = 128;
}
+#endif
+#if(VKFFT_BACKEND!=5)
+ if (r==0) configuration.saveApplicationToString = 1;
+ if (r!=0) configuration.loadApplicationFromString = 1;
#endif
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -130,6 +143,8 @@ VkFFTResult sample_2_benchmark_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, FI
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//Allocate buffer for the input data.
@@ -162,6 +177,10 @@ VkFFTResult sample_2_benchmark_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, FI
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(half), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
configuration.bufferSize = &bufferSize;
@@ -177,43 +196,43 @@ VkFFTResult sample_2_benchmark_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, FI
}
}
*/
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
- resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, buffer_input, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
//free(buffer_input);
+ if (configuration.loadApplicationFromString) {
+ FILE* kernelCache;
+ uint64_t str_len;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "rb");
+ if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
+ fseek(kernelCache, 0, SEEK_END);
+ str_len = ftell(kernelCache);
+ fseek(kernelCache, 0, SEEK_SET);
+ configuration.loadApplicationString = malloc(str_len);
+ fread(configuration.loadApplicationString, str_len, 1, kernelCache);
+ fclose(kernelCache);
+ }
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
+
+ if (configuration.saveApplicationToString) {
+ FILE* kernelCache;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "wb");
+ fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
+ fclose(kernelCache);
+ }
+
//Submit FFT+iFFT.
uint64_t num_iter = (((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize;
#if(VKFFT_BACKEND==0)
@@ -266,6 +285,8 @@ VkFFTResult sample_2_benchmark_VkFFT_half(VkGPU* vkGPU, uint64_t file_output, FI
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_3_benchmark_VkFFT_single_3d.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_3_benchmark_VkFFT_single_3d.cpp
index 51bb7b3..4579df5 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_3_benchmark_VkFFT_single_3d.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_3_benchmark_VkFFT_single_3d.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_3_benchmark_VkFFT_single_3d(VkGPU* vkGPU, uint64_t file_outpu
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "3 - VkFFT FFT + iFFT C2C multidimensional benchmark in single precision\n");
@@ -92,8 +97,16 @@ VkFFTResult sample_3_benchmark_VkFFT_single_3d(VkGPU* vkGPU, uint64_t file_outpu
configuration.size[1] = benchmark_dimensions[n][1];
configuration.size[2] = benchmark_dimensions[n][2];
+#if(VKFFT_BACKEND!=5)
+ if (r==0) configuration.saveApplicationToString = 1;
+ if (r!=0) configuration.loadApplicationFromString = 1;
+#endif
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -106,6 +119,8 @@ VkFFTResult sample_3_benchmark_VkFFT_single_3d(VkGPU* vkGPU, uint64_t file_outpu
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//Allocate buffer for the input data.
uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * configuration.size[0] * configuration.size[1] * configuration.size[2];;
@@ -137,6 +152,10 @@ VkFFTResult sample_3_benchmark_VkFFT_single_3d(VkGPU* vkGPU, uint64_t file_outpu
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
configuration.bufferSize = &bufferSize;
@@ -152,43 +171,43 @@ VkFFTResult sample_3_benchmark_VkFFT_single_3d(VkGPU* vkGPU, uint64_t file_outpu
}
}
*/
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
- resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, ©CommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, buffer_input, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
//free(buffer_input);
+ if (configuration.loadApplicationFromString) {
+ FILE* kernelCache;
+ uint64_t str_len;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "rb");
+ if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
+ fseek(kernelCache, 0, SEEK_END);
+ str_len = ftell(kernelCache);
+ fseek(kernelCache, 0, SEEK_SET);
+ configuration.loadApplicationString = malloc(str_len);
+ fread(configuration.loadApplicationString, str_len, 1, kernelCache);
+ fclose(kernelCache);
+ }
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
+
+ if (configuration.saveApplicationToString) {
+ FILE* kernelCache;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "wb");
+ fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
+ fclose(kernelCache);
+ }
+
//Submit FFT+iFFT.
uint64_t num_iter = (((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize;
#if(VKFFT_BACKEND==0)
@@ -246,6 +265,8 @@ VkFFTResult sample_3_benchmark_VkFFT_single_3d(VkGPU* vkGPU, uint64_t file_outpu
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_4_benchmark_VkFFT_single_3d_zeropadding.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_4_benchmark_VkFFT_single_3d_zeropadding.cpp
index 0b1183a..f8f9277 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_4_benchmark_VkFFT_single_3d_zeropadding.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_4_benchmark_VkFFT_single_3d_zeropadding.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_4_benchmark_VkFFT_single_3d_zeropadding(VkGPU* vkGPU, uint64_
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "4 - VkFFT FFT + iFFT C2C multidimensional benchmark in single precision, native zeropadding\n");
@@ -103,8 +108,16 @@ VkFFTResult sample_4_benchmark_VkFFT_single_3d_zeropadding(VkGPU* vkGPU, uint64_
configuration.fft_zeropad_left[2] = (uint64_t)ceil(configuration.size[2] / 2.0);
configuration.fft_zeropad_right[2] = configuration.size[2];
+#if(VKFFT_BACKEND!=5)
+ if (r==0) configuration.saveApplicationToString = 1;
+ if (r!=0) configuration.loadApplicationFromString = 1;
+#endif
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -117,6 +130,8 @@ VkFFTResult sample_4_benchmark_VkFFT_single_3d_zeropadding(VkGPU* vkGPU, uint64_
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//Allocate buffer for the input data.
uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * configuration.size[0] * configuration.size[1] * configuration.size[2];;
@@ -148,6 +163,10 @@ VkFFTResult sample_4_benchmark_VkFFT_single_3d_zeropadding(VkGPU* vkGPU, uint64_
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
configuration.bufferSize = &bufferSize;
@@ -163,43 +182,43 @@ VkFFTResult sample_4_benchmark_VkFFT_single_3d_zeropadding(VkGPU* vkGPU, uint64_
}
}
*/
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
- resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, ©CommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, buffer_input, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
//free(buffer_input);
+ if (configuration.loadApplicationFromString) {
+ FILE* kernelCache;
+ uint64_t str_len;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "rb");
+ if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
+ fseek(kernelCache, 0, SEEK_END);
+ str_len = ftell(kernelCache);
+ fseek(kernelCache, 0, SEEK_SET);
+ configuration.loadApplicationString = malloc(str_len);
+ fread(configuration.loadApplicationString, str_len, 1, kernelCache);
+ fclose(kernelCache);
+ }
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
+
+ if (configuration.saveApplicationToString) {
+ FILE* kernelCache;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "wb");
+ fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
+ fclose(kernelCache);
+ }
+
//Submit FFT+iFFT.
uint64_t num_iter = (((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize;
#if(VKFFT_BACKEND==0)
@@ -256,6 +275,8 @@ VkFFTResult sample_4_benchmark_VkFFT_single_3d_zeropadding(VkGPU* vkGPU, uint64_
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp
index 8aea3c3..b53b743 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_50_convolution_VkFFT_single_1d_matrix(VkGPU* vkGPU, uint64_t
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "50 - VkFFT convolution example with identitiy kernel\n");
@@ -80,7 +85,11 @@ VkFFTResult sample_50_convolution_VkFFT_single_1d_matrix(VkGPU* vkGPU, uint64_t
configuration.normalize = 1;//normalize iFFT
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -93,6 +102,8 @@ VkFFTResult sample_50_convolution_VkFFT_single_1d_matrix(VkGPU* vkGPU, uint64_t
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//In this example, we perform a convolution for a real vectorfield (3vector) with a symmetric kernel (6 values). We use configuration to initialize convolution kernel first from real data, then we create convolution_configuration for convolution. The buffer object from configuration is passed to convolution_configuration as kernel object.
//1. Kernel forward FFT.
@@ -126,6 +137,10 @@ VkFFTResult sample_50_convolution_VkFFT_single_1d_matrix(VkGPU* vkGPU, uint64_t
res = zeMemAllocDevice(vkGPU->context, &device_desc, kernelSize, sizeof(float), vkGPU->device, &kernel);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &kernel;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* kernel = 0;
+ kernel = vkGPU->device->newBuffer(kernelSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &kernel;
#endif
configuration.bufferSize = &kernelSize;
@@ -157,38 +172,9 @@ VkFFTResult sample_50_convolution_VkFFT_single_1d_matrix(VkGPU* vkGPU, uint64_t
}
}
}
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
resFFT = transferDataFromCPU(vkGPU, kernel_input, &kernel, kernelSize);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(kernel, kernel_input, kernelSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(kernel, kernel_input, kernelSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, kernel, CL_TRUE, 0, kernelSize, kernel_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, ©CommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, kernel, kernel_input, kernelSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
-
//Initialize application responsible for the kernel. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app_kernel, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
@@ -219,6 +205,8 @@ VkFFTResult sample_50_convolution_VkFFT_single_1d_matrix(VkGPU* vkGPU, uint64_t
convolution_configuration.kernel = &kernel;
#elif(VKFFT_BACKEND==4)
convolution_configuration.kernel = (void**)&kernel;
+#elif(VKFFT_BACKEND==5)
+ convolution_configuration.kernel = &kernel;
#endif
//Allocate separate buffer for the input data.
@@ -249,6 +237,10 @@ VkFFTResult sample_50_convolution_VkFFT_single_1d_matrix(VkGPU* vkGPU, uint64_t
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
convolution_configuration.bufferSize = &bufferSize;
@@ -271,29 +263,9 @@ VkFFTResult sample_50_convolution_VkFFT_single_1d_matrix(VkGPU* vkGPU, uint64_t
}
}
//Transfer data to GPU using staging buffer.
-#if(VKFFT_BACKEND==0)
resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- res = zeCommandListReset(copyCommandList);
- if (res != ZE_RESULT_SUCCESS)return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, ©CommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, buffer_input, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
-
+
//Initialize application responsible for the convolution.
resFFT = initializeVkFFT(&app_convolution, convolution_configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
@@ -306,28 +278,8 @@ VkFFTResult sample_50_convolution_VkFFT_single_1d_matrix(VkGPU* vkGPU, uint64_t
float* buffer_output = (float*)malloc(bufferSize);
if (!buffer_output) return VKFFT_ERROR_MALLOC_FAILED;
//Transfer data from GPU using staging buffer.
-#if(VKFFT_BACKEND==0)
resFFT = transferDataToCPU(vkGPU, buffer_output, &buffer, bufferSize);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer_output, buffer, bufferSize, cudaMemcpyDeviceToHost);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer_output, buffer, bufferSize, hipMemcpyDeviceToHost);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueReadBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_output, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- res = zeCommandListReset(copyCommandList);
- if (res != ZE_RESULT_SUCCESS)return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, ©CommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer_output, buffer, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
//Print data, if needed.
for (uint64_t v = 0; v < convolution_configuration.coordinateFeatures; v++) {
if (file_output)
@@ -364,6 +316,9 @@ VkFFTResult sample_50_convolution_VkFFT_single_1d_matrix(VkGPU* vkGPU, uint64_t
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
zeMemFree(vkGPU->context, kernel);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
+ kernel->release();
#endif
deleteVkFFT(&app_kernel);
deleteVkFFT(&app_convolution);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp
index 8ce3ce1..7eae7dd 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c(VkGPU*
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "51 - VkFFT zeropadding convolution example with identitiy kernel\n");
@@ -89,7 +94,11 @@ VkFFTResult sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c(VkGPU*
//coordinateFeatures number is an important constant for convolution. If we perform 1x1 convolution, it is equal to number of features, but matrixConvolution should be equal to 1. For matrix convolution, it must be equal to matrixConvolution parameter. If we perform 2x2 convolution, it is equal to 3 for symmetric kernel (stored as xx, xy, yy) and 4 for nonsymmetric (stored as xx, xy, yx, yy). Similarly, 6 (stored as xx, xy, xz, yy, yz, zz) and 9 (stored as xx, xy, xz, yx, yy, yz, zx, zy, zz) for 3x3 convolutions.
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -102,6 +111,8 @@ VkFFTResult sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c(VkGPU*
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//In this example, we perform a convolution for a real vectorfield (3vector) with a symmetric kernel (6 values). We use configuration to initialize convolution kernel first from real data, then we create convolution_configuration for convolution. The buffer object from configuration is passed to convolution_configuration as kernel object.
//1. Kernel forward FFT.
@@ -134,6 +145,10 @@ VkFFTResult sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c(VkGPU*
res = zeMemAllocDevice(vkGPU->context, &device_desc, kernelSize, sizeof(float), vkGPU->device, &kernel);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &kernel;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* kernel = 0;
+ kernel = vkGPU->device->newBuffer(kernelSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &kernel;
#endif
configuration.bufferSize = &kernelSize;
@@ -166,37 +181,9 @@ VkFFTResult sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c(VkGPU*
}
}
}
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
resFFT = transferDataFromCPU(vkGPU, kernel_input, &kernel, kernelSize);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(kernel, kernel_input, kernelSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(kernel, kernel_input, kernelSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, kernel, CL_TRUE, 0, kernelSize, kernel_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, ©CommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, kernel, kernel_input, kernelSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
//Initialize application responsible for the kernel. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app_kernel, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
@@ -231,6 +218,8 @@ VkFFTResult sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c(VkGPU*
convolution_configuration.kernel = &kernel;
#elif(VKFFT_BACKEND==4)
convolution_configuration.kernel = (void**)&kernel;
+#elif(VKFFT_BACKEND==5)
+ convolution_configuration.kernel = &kernel;
#endif
//Allocate separate buffer for the input data.
@@ -262,6 +251,10 @@ VkFFTResult sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c(VkGPU*
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
convolution_configuration.bufferSize = &bufferSize;
@@ -283,29 +276,9 @@ VkFFTResult sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c(VkGPU*
}
}
//Transfer data to GPU using staging buffer.
-#if(VKFFT_BACKEND==0)
resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- res = zeCommandListReset(copyCommandList);
- if (res != ZE_RESULT_SUCCESS)return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, ©CommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, buffer_input, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
- //Initialize application responsible for the convolution.
+ if (resFFT != VKFFT_SUCCESS) return resFFT; //Initialize application responsible for the convolution.
+
resFFT = initializeVkFFT(&app_convolution, convolution_configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
//Sample forward FFT command buffer allocation + execution performed on kernel. FFT can also be appended to user defined command buffers.
@@ -317,28 +290,8 @@ VkFFTResult sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c(VkGPU*
float* buffer_output = (float*)malloc(bufferSize);
if (!buffer_output) return VKFFT_ERROR_MALLOC_FAILED;
//Transfer data from GPU using staging buffer.
-#if(VKFFT_BACKEND==0)
resFFT = transferDataToCPU(vkGPU, buffer_output, &buffer, bufferSize);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer_output, buffer, bufferSize, cudaMemcpyDeviceToHost);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer_output, buffer, bufferSize, hipMemcpyDeviceToHost);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueReadBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_output, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- res = zeCommandListReset(copyCommandList);
- if (res != ZE_RESULT_SUCCESS)return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, ©CommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer_output, buffer, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
//Print data, if needed.
for (uint64_t v = 0; v < convolution_configuration.coordinateFeatures; v++) {
@@ -376,6 +329,9 @@ VkFFTResult sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c(VkGPU*
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
zeMemFree(vkGPU->context, kernel);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
+ kernel->release();
#endif
deleteVkFFT(&app_kernel);
deleteVkFFT(&app_convolution);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp
index fa168ef..a6a2165 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_52_convolution_VkFFT_single_2d_batched_r2c(VkGPU* vkGPU, uint
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "52 - VkFFT batched convolution example with identitiy kernel\n");
@@ -81,7 +86,11 @@ VkFFTResult sample_52_convolution_VkFFT_single_2d_batched_r2c(VkGPU* vkGPU, uint
configuration.numberBatches = 2;
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -94,6 +103,8 @@ VkFFTResult sample_52_convolution_VkFFT_single_2d_batched_r2c(VkGPU* vkGPU, uint
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//In this example, we perform a convolution for a real vectorfield (3vector) with a symmetric kernel (6 values). We use configuration to initialize convolution kernel first from real data, then we create convolution_configuration for convolution. The buffer object from configuration is passed to convolution_configuration as kernel object.
//1. Kernel forward FFT.
@@ -127,6 +138,10 @@ VkFFTResult sample_52_convolution_VkFFT_single_2d_batched_r2c(VkGPU* vkGPU, uint
res = zeMemAllocDevice(vkGPU->context, &device_desc, kernelSize, sizeof(float), vkGPU->device, &kernel);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &kernel;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* kernel = 0;
+ kernel = vkGPU->device->newBuffer(kernelSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &kernel;
#endif
configuration.bufferSize = &kernelSize;
@@ -154,37 +169,9 @@ VkFFTResult sample_52_convolution_VkFFT_single_2d_batched_r2c(VkGPU* vkGPU, uint
}
}
}
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
resFFT = transferDataFromCPU(vkGPU, kernel_input, &kernel, kernelSize);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(kernel, kernel_input, kernelSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(kernel, kernel_input, kernelSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, kernel, CL_TRUE, 0, kernelSize, kernel_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, kernel, kernel_input, kernelSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
//Initialize application responsible for the kernel. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app_kernel, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
@@ -213,6 +200,8 @@ VkFFTResult sample_52_convolution_VkFFT_single_2d_batched_r2c(VkGPU* vkGPU, uint
convolution_configuration.kernel = &kernel;
#elif(VKFFT_BACKEND==4)
convolution_configuration.kernel = (void**)&kernel;
+#elif(VKFFT_BACKEND==5)
+ convolution_configuration.kernel = &kernel;
#endif
convolution_configuration.kernelSize = &kernelSize;
@@ -269,7 +258,14 @@ VkFFTResult sample_52_convolution_VkFFT_single_2d_batched_r2c(VkGPU* vkGPU, uint
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
convolution_configuration.inputBuffer = &inputBuffer;
- configuration.buffer = &buffer;
+ convolution_configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* inputBuffer = 0;
+ MTL::Buffer* buffer = 0;
+ inputBuffer = vkGPU->device->newBuffer(inputBufferSize, MTL::ResourceStorageModePrivate);
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ convolution_configuration.inputBuffer = &inputBuffer;
+ convolution_configuration.buffer = &buffer;
#endif
convolution_configuration.inputBufferSize = &inputBufferSize;
@@ -292,28 +288,8 @@ VkFFTResult sample_52_convolution_VkFFT_single_2d_batched_r2c(VkGPU* vkGPU, uint
}
}
//Transfer data to GPU using staging buffer.
-#if(VKFFT_BACKEND==0)
resFFT = transferDataFromCPU(vkGPU, buffer_input, &inputBuffer, inputBufferSize);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(inputBuffer, buffer_input, inputBufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(inputBuffer, buffer_input, inputBufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, inputBuffer, CL_TRUE, 0, inputBufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- res = zeCommandListReset(copyCommandList);
- if (res != ZE_RESULT_SUCCESS)return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, inputBuffer, buffer_input, inputBufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
//Initialize application responsible for the convolution.
resFFT = initializeVkFFT(&app_convolution, convolution_configuration);
@@ -327,29 +303,9 @@ VkFFTResult sample_52_convolution_VkFFT_single_2d_batched_r2c(VkGPU* vkGPU, uint
float* buffer_output = (float*)malloc(bufferSize);
if (!buffer_output) return VKFFT_ERROR_MALLOC_FAILED;
//Transfer data from GPU using staging buffer.
-#if(VKFFT_BACKEND==0)
resFFT = transferDataToCPU(vkGPU, buffer_output, &buffer, bufferSize);
if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer_output, buffer, bufferSize, cudaMemcpyDeviceToHost);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer_output, buffer, bufferSize, hipMemcpyDeviceToHost);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueReadBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_output, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- res = zeCommandListReset(copyCommandList);
- if (res != ZE_RESULT_SUCCESS)return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer_output, buffer, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
-
+
//Print data, if needed.
for (uint64_t f = 0; f < convolution_configuration.numberKernels; f++) {
if (file_output)
@@ -398,6 +354,10 @@ VkFFTResult sample_52_convolution_VkFFT_single_2d_batched_r2c(VkGPU* vkGPU, uint
zeMemFree(vkGPU->context, inputBuffer);
zeMemFree(vkGPU->context, buffer);
zeMemFree(vkGPU->context, kernel);
+#elif(VKFFT_BACKEND==5)
+ inputBuffer->release();
+ buffer->release();
+ kernel->release();
#endif
deleteVkFFT(&app_kernel);
deleteVkFFT(&app_convolution);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_5_benchmark_VkFFT_single_disableReorderFourStep.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_5_benchmark_VkFFT_single_disableReorderFourStep.cpp
index a649168..da81965 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_5_benchmark_VkFFT_single_disableReorderFourStep.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_5_benchmark_VkFFT_single_disableReorderFourStep.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_5_benchmark_VkFFT_single_disableReorderFourStep(VkGPU* vkGPU,
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "5 - VkFFT FFT + iFFT C2C benchmark 1D batched in single precision, no reshuffling\n");
@@ -85,8 +90,16 @@ VkFFTResult sample_5_benchmark_VkFFT_single_disableReorderFourStep(VkGPU* vkGPU,
configuration.disableReorderFourStep = true;
+#if(VKFFT_BACKEND!=5)
+ if (r==0) configuration.saveApplicationToString = 1;
+ if (r!=0) configuration.loadApplicationFromString = 1;
+#endif
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -99,6 +112,8 @@ VkFFTResult sample_5_benchmark_VkFFT_single_disableReorderFourStep(VkGPU* vkGPU,
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//Allocate buffer for the input data.
uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * configuration.size[0] * configuration.numberBatches;
@@ -130,6 +145,10 @@ VkFFTResult sample_5_benchmark_VkFFT_single_disableReorderFourStep(VkGPU* vkGPU,
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
configuration.bufferSize = &bufferSize;
@@ -147,43 +166,43 @@ VkFFTResult sample_5_benchmark_VkFFT_single_disableReorderFourStep(VkGPU* vkGPU,
}
}
*/
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
- resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, buffer_input, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
//free(buffer_input);
+ if (configuration.loadApplicationFromString) {
+ FILE* kernelCache;
+ uint64_t str_len;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "rb");
+ if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
+ fseek(kernelCache, 0, SEEK_END);
+ str_len = ftell(kernelCache);
+ fseek(kernelCache, 0, SEEK_SET);
+ configuration.loadApplicationString = malloc(str_len);
+ fread(configuration.loadApplicationString, str_len, 1, kernelCache);
+ fclose(kernelCache);
+ }
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
+
+ if (configuration.saveApplicationToString) {
+ FILE* kernelCache;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "wb");
+ fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
+ fclose(kernelCache);
+ }
+
//Submit FFT+iFFT.
uint64_t num_iter = (((uint64_t)3 * 4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)3 * 4096 * 1024.0 * 1024.0) / bufferSize;
#if(VKFFT_BACKEND==0)
@@ -241,6 +260,8 @@ VkFFTResult sample_5_benchmark_VkFFT_single_disableReorderFourStep(VkGPU* vkGPU,
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp
index 12b810e..4afbeaf 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_6_benchmark_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_outp
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "6 - VkFFT FFT + iFFT R2C/C2R benchmark\n");
@@ -87,8 +92,16 @@ VkFFTResult sample_6_benchmark_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_outp
configuration.performR2C = true; //Perform R2C/C2R transform. Can be combined with all other options. Reduces memory requirements by a factor of 2. Requires special input data alignment: for x*y*z system pad x*y plane to (x+2)*y with last 2*y elements reserved, total array dimensions are (x*y+2y)*z. Memory layout after R2C and before C2R can be found on github.
//configuration.disableMergeSequencesR2C = 1;
+#if(VKFFT_BACKEND!=5)
+ if (r==0) configuration.saveApplicationToString = 1;
+ if (r!=0) configuration.loadApplicationFromString = 1;
+#endif
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -101,6 +114,8 @@ VkFFTResult sample_6_benchmark_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_outp
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
@@ -134,6 +149,10 @@ VkFFTResult sample_6_benchmark_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_outp
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
configuration.bufferSize = &bufferSize;
@@ -148,41 +167,41 @@ VkFFTResult sample_6_benchmark_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_outp
}
}
*/
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
- resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, buffer_input, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString) {
+ FILE* kernelCache;
+ uint64_t str_len;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "rb");
+ if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
+ fseek(kernelCache, 0, SEEK_END);
+ str_len = ftell(kernelCache);
+ fseek(kernelCache, 0, SEEK_SET);
+ configuration.loadApplicationString = malloc(str_len);
+ fread(configuration.loadApplicationString, str_len, 1, kernelCache);
+ fclose(kernelCache);
+ }
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
+
+ if (configuration.saveApplicationToString) {
+ FILE* kernelCache;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "wb");
+ fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
+ fclose(kernelCache);
+ }
+
//Submit FFT+iFFT.
uint64_t num_iter = (((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize;
#if(VKFFT_BACKEND==0)
@@ -256,6 +275,8 @@ VkFFTResult sample_6_benchmark_VkFFT_single_r2c(VkGPU* vkGPU, uint64_t file_outp
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp
index fe2c3df..bb89465 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_7_benchmark_VkFFT_single_Bluestein(VkGPU* vkGPU, uint64_t fil
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "7 - VkFFT FFT + iFFT C2C Bluestein benchmark in single precision\n");
@@ -87,8 +92,16 @@ VkFFTResult sample_7_benchmark_VkFFT_single_Bluestein(VkGPU* vkGPU, uint64_t fil
configuration.size[0] = benchmark_dimensions[n][0]; //Multidimensional FFT dimensions sizes (default 1). For best performance (and stability), order dimensions in descendant size order as: x>y>z.
configuration.size[1] = benchmark_dimensions[n][1];
configuration.size[2] = benchmark_dimensions[n][2];
+#if(VKFFT_BACKEND!=5)
+ if (r==0) configuration.saveApplicationToString = 1;
+ if (r!=0) configuration.loadApplicationFromString = 1;
+#endif
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -101,6 +114,8 @@ VkFFTResult sample_7_benchmark_VkFFT_single_Bluestein(VkGPU* vkGPU, uint64_t fil
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//Allocate buffer for the input data.
uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * configuration.size[0] * configuration.size[1] * configuration.size[2];
@@ -132,6 +147,10 @@ VkFFTResult sample_7_benchmark_VkFFT_single_Bluestein(VkGPU* vkGPU, uint64_t fil
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
configuration.bufferSize = &bufferSize;
@@ -147,43 +166,43 @@ VkFFTResult sample_7_benchmark_VkFFT_single_Bluestein(VkGPU* vkGPU, uint64_t fil
}
}
*/
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
- resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, buffer_input, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
//free(buffer_input);
+ if (configuration.loadApplicationFromString) {
+ FILE* kernelCache;
+ uint64_t str_len;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "rb");
+ if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
+ fseek(kernelCache, 0, SEEK_END);
+ str_len = ftell(kernelCache);
+ fseek(kernelCache, 0, SEEK_SET);
+ configuration.loadApplicationString = malloc(str_len);
+ fread(configuration.loadApplicationString, str_len, 1, kernelCache);
+ fclose(kernelCache);
+ }
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
+
+ if (configuration.saveApplicationToString) {
+ FILE* kernelCache;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "wb");
+ fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
+ fclose(kernelCache);
+ }
+
//Submit FFT+iFFT.
uint64_t num_iter = (((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize;
#if(VKFFT_BACKEND==0)
@@ -241,6 +260,8 @@ VkFFTResult sample_7_benchmark_VkFFT_single_Bluestein(VkGPU* vkGPU, uint64_t fil
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp
index 0bebc6f..bd038a1 100644
--- a/benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -56,6 +60,7 @@ VkFFTResult sample_8_benchmark_VkFFT_double_Bluestein(VkGPU* vkGPU, uint64_t fil
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
if (file_output)
fprintf(output, "8 - VkFFT FFT + iFFT C2C Bluestein benchmark in double precision\n");
@@ -88,8 +93,16 @@ VkFFTResult sample_8_benchmark_VkFFT_double_Bluestein(VkGPU* vkGPU, uint64_t fil
configuration.size[1] = benchmark_dimensions[n][1];
configuration.size[2] = benchmark_dimensions[n][2];
configuration.doublePrecision = 1;
+#if(VKFFT_BACKEND!=5)
+ if (r==0) configuration.saveApplicationToString = 1;
+ if (r!=0) configuration.loadApplicationFromString = 1;
+#endif
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -102,6 +115,8 @@ VkFFTResult sample_8_benchmark_VkFFT_double_Bluestein(VkGPU* vkGPU, uint64_t fil
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//Allocate buffer for the input data.
uint64_t bufferSize = (uint64_t)sizeof(double) * 2 * configuration.size[0] * configuration.size[1] * configuration.size[2];;
@@ -133,6 +148,10 @@ VkFFTResult sample_8_benchmark_VkFFT_double_Bluestein(VkGPU* vkGPU, uint64_t fil
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
configuration.bufferSize = &bufferSize;
@@ -148,43 +167,43 @@ VkFFTResult sample_8_benchmark_VkFFT_double_Bluestein(VkGPU* vkGPU, uint64_t fil
}
}
*/
- //Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
-#if(VKFFT_BACKEND==0)
- resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
-#elif(VKFFT_BACKEND==1)
- res = cudaMemcpy(buffer, buffer_input, bufferSize, cudaMemcpyHostToDevice);
- if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==2)
- res = hipMemcpy(buffer, buffer_input, bufferSize, hipMemcpyHostToDevice);
- if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==3)
- res = clEnqueueWriteBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_input, 0, NULL, NULL);
- if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
-#elif(VKFFT_BACKEND==4)
- ze_command_queue_desc_t commandQueueCopyDesc = {
- ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
- 0,
- vkGPU->commandQueueID,
- 0, // index
- 0, // flags
- ZE_COMMAND_QUEUE_MODE_DEFAULT,
- ZE_COMMAND_QUEUE_PRIORITY_NORMAL
- };
- ze_command_list_handle_t copyCommandList;
- res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
- res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, buffer_input, bufferSize, 0, 0, 0);
- if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
- res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
- if (res != 0) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
-#endif
+ //Sample buffer transfer tool. Uses staging buffer (if needed) of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
//free(buffer_input);
+ if (configuration.loadApplicationFromString) {
+ FILE* kernelCache;
+ uint64_t str_len;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "rb");
+ if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
+ fseek(kernelCache, 0, SEEK_END);
+ str_len = ftell(kernelCache);
+ fseek(kernelCache, 0, SEEK_SET);
+ configuration.loadApplicationString = malloc(str_len);
+ fread(configuration.loadApplicationString, str_len, 1, kernelCache);
+ fclose(kernelCache);
+ }
//Initialize applications. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ if (configuration.loadApplicationFromString)
+ free(configuration.loadApplicationString);
+
+ if (configuration.saveApplicationToString) {
+ FILE* kernelCache;
+ char fname[500];
+ int VkFFT_version = VkFFTGetVersion();
+ sprintf(fname, "VkFFT_binary");
+ kernelCache = fopen(fname, "wb");
+ fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
+ fclose(kernelCache);
+ }
+
//Submit FFT+iFFT.
uint64_t num_iter = (((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize > 1000) ? 1000 : (uint64_t)((uint64_t)4096 * 1024.0 * 1024.0) / bufferSize;
#if(VKFFT_BACKEND==0)
@@ -242,6 +261,8 @@ VkFFTResult sample_8_benchmark_VkFFT_double_Bluestein(VkGPU* vkGPU, uint64_t fil
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
diff --git a/benchmark_scripts/vkFFT_scripts/src/user_benchmark_VkFFT.cpp b/benchmark_scripts/vkFFT_scripts/src/user_benchmark_VkFFT.cpp
index 66a66c1..f6a1808 100644
--- a/benchmark_scripts/vkFFT_scripts/src/user_benchmark_VkFFT.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/user_benchmark_VkFFT.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -57,6 +61,7 @@ VkFFTResult user_benchmark_VkFFT(VkGPU* vkGPU, uint64_t file_output, FILE* outpu
cl_int res = CL_SUCCESS;
#elif(VKFFT_BACKEND==4)
ze_result_t res = ZE_RESULT_SUCCESS;
+#elif(VKFFT_BACKEND==5)
#endif
const int num_runs = 3;
double benchmark_result = 0;//averaged result = sum(system_size/iteration_time)/num_benchmark_samples
@@ -76,6 +81,24 @@ VkFFTResult user_benchmark_VkFFT(VkGPU* vkGPU, uint64_t file_output, FILE* outpu
storageComplexSize = (2 * sizeof(float));
break;
}
+ uint64_t bufferSize = 0;
+ if (userParams->R2C) {
+ bufferSize = (uint64_t)(storageComplexSize / 2) * (userParams->X + 2) * userParams->Y * userParams->Z * userParams->B;
+ }
+ else {
+ if (userParams->DCT) {
+ bufferSize = (uint64_t)(storageComplexSize / 2) * userParams->X * userParams->Y * userParams->Z * userParams->B;
+ }
+ else {
+ bufferSize = (uint64_t)storageComplexSize * userParams->X * userParams->Y * userParams->Z * userParams->B;
+ }
+ }
+
+ float* buffer_input = (float*)malloc(bufferSize);
+ if (!buffer_input) return VKFFT_ERROR_MALLOC_FAILED;
+ for (uint64_t i = 0; i < bufferSize/sizeof(float); i++) {
+ buffer_input[i] = (float)(2 * ((float)rand()) / RAND_MAX - 1.0);
+ }
for (uint64_t n = 0; n < 2; n++) {
double run_time[num_runs];
for (uint64_t r = 0; r < num_runs; r++) {
@@ -97,10 +120,16 @@ VkFFTResult user_benchmark_VkFFT(VkGPU* vkGPU, uint64_t file_output, FILE* outpu
configuration.performDCT = userParams->DCT;
if (userParams->P == 1) configuration.doublePrecision = 1;
if (userParams->P == 2) configuration.halfPrecision = 1;
+#if(VKFFT_BACKEND!=5)
if (userParams->saveApplicationToString && (n==0) && (r==0)) configuration.saveApplicationToString = 1;
if (userParams->loadApplicationFromString || (userParams->saveApplicationToString && ((n != 0) || (r != 0)))) configuration.loadApplicationFromString = 1;
+#endif
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
- configuration.device = &vkGPU->device;
+#if(VKFFT_BACKEND==5)
+ configuration.device = vkGPU->device;
+#else
+ configuration.device = &vkGPU->device;
+#endif
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
@@ -113,20 +142,10 @@ VkFFTResult user_benchmark_VkFFT(VkGPU* vkGPU, uint64_t file_output, FILE* outpu
configuration.context = &vkGPU->context;
configuration.commandQueue = &vkGPU->commandQueue;
configuration.commandQueueID = vkGPU->commandQueueID;
+#elif(VKFFT_BACKEND==5)
+ configuration.queue = vkGPU->queue;
#endif
//Allocate buffer for the input data.
- uint64_t bufferSize = 0;
- if (userParams->R2C) {
- bufferSize = (uint64_t)(storageComplexSize / 2) * (configuration.size[0] + 2) * configuration.size[1] * configuration.size[2] * configuration.numberBatches;
- }
- else {
- if (userParams->DCT) {
- bufferSize = (uint64_t)(storageComplexSize / 2) * configuration.size[0] * configuration.size[1] * configuration.size[2] * configuration.numberBatches;
- }
- else {
- bufferSize = (uint64_t)storageComplexSize * configuration.size[0] * configuration.size[1] * configuration.size[2] * configuration.numberBatches;
- }
- }
#if(VKFFT_BACKEND==0)
VkBuffer buffer = {};
VkDeviceMemory bufferDeviceMemory = {};
@@ -155,20 +174,24 @@ VkFFTResult user_benchmark_VkFFT(VkGPU* vkGPU, uint64_t file_output, FILE* outpu
res = zeMemAllocDevice(vkGPU->context, &device_desc, bufferSize, sizeof(float), vkGPU->device, &buffer);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &buffer;
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* buffer = 0;
+ buffer = vkGPU->device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate);
+ configuration.buffer = &buffer;
#endif
configuration.bufferSize = &bufferSize;
+
+ resFFT = transferDataFromCPU(vkGPU, buffer_input, &buffer, bufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
+
if (configuration.loadApplicationFromString) {
FILE* kernelCache;
uint64_t str_len;
char fname[500];
int VkFFT_version = VkFFTGetVersion();
sprintf(fname, "VkFFT_binary_X%" PRIu64 "_Y%" PRIu64 "_Z%" PRIu64 "_P%" PRIu64 "_B%" PRIu64 "_N%" PRIu64 "_R2C%" PRIu64 "_DCT%" PRIu64 "_ver%d", userParams->X, userParams->Y, userParams->Z, userParams->P, userParams->B, userParams->N, userParams->R2C, userParams->DCT, VkFFT_version);
-#if((VKFFT_BACKEND==0) || (VKFFT_BACKEND==2) || (VKFFT_BACKEND==4))
- kernelCache = fopen(fname, "rb"); //Vulkan and HIP backends load data as a uint32_t sequence
-#else
- kernelCache = fopen(fname, "r");
-#endif
+ kernelCache = fopen(fname, "rb");
if (!kernelCache) return VKFFT_ERROR_EMPTY_FILE;
fseek(kernelCache, 0, SEEK_END);
str_len = ftell(kernelCache);
@@ -189,11 +212,7 @@ VkFFTResult user_benchmark_VkFFT(VkGPU* vkGPU, uint64_t file_output, FILE* outpu
char fname[500];
int VkFFT_version = VkFFTGetVersion();
sprintf(fname, "VkFFT_binary_X%" PRIu64 "_Y%" PRIu64 "_Z%" PRIu64 "_P%" PRIu64 "_B%" PRIu64 "_N%" PRIu64 "_R2C%" PRIu64 "_DCT%" PRIu64 "_ver%d", userParams->X, userParams->Y, userParams->Z, userParams->P, userParams->B, userParams->N, userParams->R2C, userParams->DCT, VkFFT_version);
-#if((VKFFT_BACKEND==0) || (VKFFT_BACKEND==2) || (VKFFT_BACKEND==4))
- kernelCache = fopen(fname, "wb"); //Vulkan and HIP backends save data as a uint32_t sequence
-#else
- kernelCache = fopen(fname, "w");
-#endif
+ kernelCache = fopen(fname, "wb");
fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
fclose(kernelCache);
}
@@ -243,11 +262,14 @@ VkFFTResult user_benchmark_VkFFT(VkGPU* vkGPU, uint64_t file_output, FILE* outpu
clReleaseMemObject(buffer);
#elif(VKFFT_BACKEND==4)
zeMemFree(vkGPU->context, buffer);
+#elif(VKFFT_BACKEND==5)
+ buffer->release();
#endif
deleteVkFFT(&app);
}
}
+ free(buffer_input);
return resFFT;
}
diff --git a/benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp b/benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp
index 930e1d6..ee6763c 100644
--- a/benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp
+++ b/benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp
@@ -39,6 +39,10 @@
#endif
#elif(VKFFT_BACKEND==4)
#include <ze_api.h>
+#elif(VKFFT_BACKEND==5)
+#include "Foundation/Foundation.hpp"
+#include "QuartzCore/QuartzCore.hpp"
+#include "Metal/Metal.hpp"
#endif
#include "vkFFT.h"
#include "utils_VkFFT.h"
@@ -221,7 +225,7 @@ VkResult getComputeQueueFamilyIndex(VkGPU* vkGPU) {
return VK_ERROR_INITIALIZATION_FAILED;
}
vkGPU->queueFamilyIndex = i;
- return VK_SUCCESS;
+ return VK_SUCCESS;
}
else
return VK_INCOMPLETE;
@@ -352,20 +356,18 @@ VkFFTResult allocateBuffer(VkGPU* vkGPU, VkBuffer* buffer, VkDeviceMemory* devic
if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_BIND_BUFFER_MEMORY;
return resFFT;
}
-VkFFTResult transferDataFromCPU(VkGPU* vkGPU, void* arr, VkBuffer* buffer, uint64_t bufferSize) {
- //a function that transfers data from the CPU to the GPU using staging buffer, because the GPU memory is not host-coherent
+#endif
+VkFFTResult transferDataToCPU(VkGPU* vkGPU, void* cpu_arr, void* output_buffer, uint64_t transferSize) {
+ //a function that transfers data from the GPU to the CPU using staging buffer, because the GPU memory is not host-coherent
VkFFTResult resFFT = VKFFT_SUCCESS;
+#if(VKFFT_BACKEND==0)
VkResult res = VK_SUCCESS;
- uint64_t stagingBufferSize = bufferSize;
+ VkBuffer* buffer = (VkBuffer*)output_buffer;
+ uint64_t stagingBufferSize = transferSize;
VkBuffer stagingBuffer = { 0 };
VkDeviceMemory stagingBufferMemory = { 0 };
- resFFT = allocateBuffer(vkGPU, &stagingBuffer, &stagingBufferMemory, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
- void* data;
- res = vkMapMemory(vkGPU->device, stagingBufferMemory, 0, stagingBufferSize, 0, &data);
+ resFFT = allocateBuffer(vkGPU, &stagingBuffer, &stagingBufferMemory, VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize);
if (resFFT != VKFFT_SUCCESS) return resFFT;
- memcpy(data, arr, stagingBufferSize);
- vkUnmapMemory(vkGPU->device, stagingBufferMemory);
VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
commandBufferAllocateInfo.commandPool = vkGPU->commandPool;
commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
@@ -381,7 +383,7 @@ VkFFTResult transferDataFromCPU(VkGPU* vkGPU, void* arr, VkBuffer* buffer, uint6
copyRegion.srcOffset = 0;
copyRegion.dstOffset = 0;
copyRegion.size = stagingBufferSize;
- vkCmdCopyBuffer(commandBuffer, stagingBuffer, buffer[0], 1, &copyRegion);
+ vkCmdCopyBuffer(commandBuffer, buffer[0], stagingBuffer, 1, &copyRegion);
res = vkEndCommandBuffer(commandBuffer);
if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER;
VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
@@ -394,19 +396,96 @@ VkFFTResult transferDataFromCPU(VkGPU* vkGPU, void* arr, VkBuffer* buffer, uint6
res = vkResetFences(vkGPU->device, 1, &vkGPU->fence);
if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_RESET_FENCES;
vkFreeCommandBuffers(vkGPU->device, vkGPU->commandPool, 1, &commandBuffer);
+ void* data;
+ res = vkMapMemory(vkGPU->device, stagingBufferMemory, 0, stagingBufferSize, 0, &data);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
+ memcpy(cpu_arr, data, stagingBufferSize);
+ vkUnmapMemory(vkGPU->device, stagingBufferMemory);
vkDestroyBuffer(vkGPU->device, stagingBuffer, NULL);
vkFreeMemory(vkGPU->device, stagingBufferMemory, NULL);
+#elif(VKFFT_BACKEND==1)
+ cudaError_t res = cudaSuccess;
+ void* buffer = ((void**)output_buffer)[0];
+ res = cudaMemcpy(cpu_arr, buffer, transferSize, cudaMemcpyDeviceToHost);
+ if (res != cudaSuccess) {
+ return VKFFT_ERROR_FAILED_TO_COPY;
+ }
+#elif(VKFFT_BACKEND==2)
+ hipError_t res = hipSuccess;
+ void* buffer = ((void**)output_buffer)[0];
+ res = hipMemcpy(cpu_arr, buffer, transferSize, hipMemcpyDeviceToHost);
+ if (res != hipSuccess) {
+ return VKFFT_ERROR_FAILED_TO_COPY;
+ }
+#elif(VKFFT_BACKEND==3)
+ cl_int res = CL_SUCCESS;
+ cl_mem* buffer = (cl_mem*)output_buffer;
+ cl_command_queue commandQueue = clCreateCommandQueue(vkGPU->context, vkGPU->device, 0, &res);
+ if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE;
+ res = clEnqueueReadBuffer(commandQueue, buffer[0], CL_TRUE, 0, transferSize, cpu_arr, 0, NULL, NULL);
+ if (res != CL_SUCCESS) {
+ return VKFFT_ERROR_FAILED_TO_COPY;
+ }
+ res = clReleaseCommandQueue(commandQueue);
+ if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE;
+#elif(VKFFT_BACKEND==4)
+ ze_result_t res = ZE_RESULT_SUCCESS;
+ void* buffer = ((void**)output_buffer)[0];
+ ze_command_queue_desc_t commandQueueCopyDesc = {
+ ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+ 0,
+ vkGPU->commandQueueID,
+ 0, // index
+ 0, // flags
+ ZE_COMMAND_QUEUE_MODE_DEFAULT,
+ ZE_COMMAND_QUEUE_PRIORITY_NORMAL
+ };
+ ze_command_list_handle_t copyCommandList;
+ res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
+ if (res != ZE_RESULT_SUCCESS) {
+ return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
+ }
+ res = zeCommandListAppendMemoryCopy(copyCommandList, cpu_arr, buffer, transferSize, 0, 0, 0);
+ if (res != ZE_RESULT_SUCCESS) {
+ return VKFFT_ERROR_FAILED_TO_COPY;
+ }
+ res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
+ if (res != ZE_RESULT_SUCCESS) {
+ return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+ }
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* stagingBuffer = vkGPU->device->newBuffer(transferSize, MTL::ResourceStorageModeShared);
+ MTL::CommandBuffer* copyCommandBuffer = vkGPU->queue->commandBuffer();
+ if (copyCommandBuffer == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
+ MTL::BlitCommandEncoder* blitCommandEncoder = copyCommandBuffer->blitCommandEncoder();
+ if (blitCommandEncoder == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
+ MTL::Buffer* buffer = ((MTL::Buffer**)output_buffer)[0];
+ blitCommandEncoder->copyFromBuffer((MTL::Buffer*)buffer, 0, (MTL::Buffer*)stagingBuffer, 0, transferSize);
+ blitCommandEncoder->endEncoding();
+ copyCommandBuffer->commit();
+ copyCommandBuffer->waitUntilCompleted();
+ blitCommandEncoder->release();
+ copyCommandBuffer->release();
+ memcpy(cpu_arr, stagingBuffer->contents(), transferSize);
+ stagingBuffer->release();
+#endif
return resFFT;
}
-VkFFTResult transferDataToCPU(VkGPU* vkGPU, void* arr, VkBuffer* buffer, uint64_t bufferSize) {
- //a function that transfers data from the GPU to the CPU using staging buffer, because the GPU memory is not host-coherent
+VkFFTResult transferDataFromCPU(VkGPU* vkGPU, void* cpu_arr, void* input_buffer, uint64_t transferSize) {
VkFFTResult resFFT = VKFFT_SUCCESS;
+#if(VKFFT_BACKEND==0)
VkResult res = VK_SUCCESS;
- uint64_t stagingBufferSize = bufferSize;
+ VkBuffer* buffer = (VkBuffer*)input_buffer;
+ uint64_t stagingBufferSize = transferSize;
VkBuffer stagingBuffer = { 0 };
VkDeviceMemory stagingBufferMemory = { 0 };
- resFFT = allocateBuffer(vkGPU, &stagingBuffer, &stagingBufferMemory, VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize);
+ resFFT = allocateBuffer(vkGPU, &stagingBuffer, &stagingBufferMemory, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
+ void* data;
+ res = vkMapMemory(vkGPU->device, stagingBufferMemory, 0, stagingBufferSize, 0, &data);
if (resFFT != VKFFT_SUCCESS) return resFFT;
+ memcpy(data, cpu_arr, stagingBufferSize);
+ vkUnmapMemory(vkGPU->device, stagingBufferMemory);
VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
commandBufferAllocateInfo.commandPool = vkGPU->commandPool;
commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
@@ -422,7 +501,7 @@ VkFFTResult transferDataToCPU(VkGPU* vkGPU, void* arr, VkBuffer* buffer, uint64_
copyRegion.srcOffset = 0;
copyRegion.dstOffset = 0;
copyRegion.size = stagingBufferSize;
- vkCmdCopyBuffer(commandBuffer, buffer[0], stagingBuffer, 1, &copyRegion);
+ vkCmdCopyBuffer(commandBuffer, stagingBuffer, buffer[0], 1, &copyRegion);
res = vkEndCommandBuffer(commandBuffer);
if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER;
VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
@@ -435,16 +514,76 @@ VkFFTResult transferDataToCPU(VkGPU* vkGPU, void* arr, VkBuffer* buffer, uint64_
res = vkResetFences(vkGPU->device, 1, &vkGPU->fence);
if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_RESET_FENCES;
vkFreeCommandBuffers(vkGPU->device, vkGPU->commandPool, 1, &commandBuffer);
- void* data;
- res = vkMapMemory(vkGPU->device, stagingBufferMemory, 0, stagingBufferSize, 0, &data);
- if (resFFT != VKFFT_SUCCESS) return resFFT;
- memcpy(arr, data, stagingBufferSize);
- vkUnmapMemory(vkGPU->device, stagingBufferMemory);
vkDestroyBuffer(vkGPU->device, stagingBuffer, NULL);
vkFreeMemory(vkGPU->device, stagingBufferMemory, NULL);
return resFFT;
-}
+#elif(VKFFT_BACKEND==1)
+ cudaError_t res = cudaSuccess;
+ void* buffer = ((void**)input_buffer)[0];
+ res = cudaMemcpy(buffer, cpu_arr, transferSize, cudaMemcpyHostToDevice);
+ if (res != cudaSuccess) {
+ return VKFFT_ERROR_FAILED_TO_COPY;
+ }
+#elif(VKFFT_BACKEND==2)
+ hipError_t res = hipSuccess;
+ void* buffer = ((void**)input_buffer)[0];
+ res = hipMemcpy(buffer, cpu_arr, transferSize, hipMemcpyHostToDevice);
+ if (res != hipSuccess) {
+ return VKFFT_ERROR_FAILED_TO_COPY;
+ }
+#elif(VKFFT_BACKEND==3)
+ cl_int res = CL_SUCCESS;
+ cl_mem* buffer = (cl_mem*)input_buffer;
+ cl_command_queue commandQueue = clCreateCommandQueue(vkGPU->context, vkGPU->device, 0, &res);
+ if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE;
+ res = clEnqueueWriteBuffer(commandQueue, buffer[0], CL_TRUE, 0, transferSize, cpu_arr, 0, NULL, NULL);
+ if (res != CL_SUCCESS) {
+ return VKFFT_ERROR_FAILED_TO_COPY;
+ }
+ res = clReleaseCommandQueue(commandQueue);
+ if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE;
+#elif(VKFFT_BACKEND==4)
+ ze_result_t res = ZE_RESULT_SUCCESS;
+ void* buffer = ((void**)input_buffer)[0];
+ ze_command_queue_desc_t commandQueueCopyDesc = {
+ ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+ 0,
+ vkGPU->commandQueueID,
+ 0, // index
+ 0, // flags
+ ZE_COMMAND_QUEUE_MODE_DEFAULT,
+ ZE_COMMAND_QUEUE_PRIORITY_NORMAL
+ };
+ ze_command_list_handle_t copyCommandList;
+ res = zeCommandListCreateImmediate(vkGPU->context, vkGPU->device, &commandQueueCopyDesc, &copyCommandList);
+ if (res != ZE_RESULT_SUCCESS) {
+ return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
+ }
+ res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, cpu_arr, transferSize, 0, 0, 0);
+ if (res != ZE_RESULT_SUCCESS) {
+ return VKFFT_ERROR_FAILED_TO_COPY;
+ }
+ res = zeCommandQueueSynchronize(vkGPU->commandQueue, UINT32_MAX);
+ if (res != ZE_RESULT_SUCCESS) {
+ return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE;
+ }
+#elif(VKFFT_BACKEND==5)
+ MTL::Buffer* stagingBuffer = vkGPU->device->newBuffer(cpu_arr, transferSize, MTL::ResourceStorageModeShared);
+ MTL::CommandBuffer* copyCommandBuffer = vkGPU->queue->commandBuffer();
+ if (copyCommandBuffer == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
+ MTL::BlitCommandEncoder* blitCommandEncoder = copyCommandBuffer->blitCommandEncoder();
+ if (blitCommandEncoder == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
+ MTL::Buffer* buffer = ((MTL::Buffer**)input_buffer)[0];
+ blitCommandEncoder->copyFromBuffer((MTL::Buffer*)stagingBuffer, 0, (MTL::Buffer*)buffer, 0, transferSize);
+ blitCommandEncoder->endEncoding();
+ copyCommandBuffer->commit();
+ copyCommandBuffer->waitUntilCompleted();
+ blitCommandEncoder->release();
+ copyCommandBuffer->release();
+ stagingBuffer->release();
#endif
+ return resFFT;
+}
VkFFTResult devices_list() {
//this function creates an instance and prints the list of available devices
#if(VKFFT_BACKEND==0)
@@ -476,7 +615,7 @@ VkFFTResult devices_list() {
}
free(devices);
}
- else
+ else
return VKFFT_ERROR_FAILED_TO_ENUMERATE_DEVICES;
vkDestroyInstance(local_instance, NULL);
#elif(VKFFT_BACKEND==1)
@@ -572,6 +711,12 @@ VkFFTResult devices_list() {
free(deviceList);
}
free(drivers);
+#elif(VKFFT_BACKEND==5)
+ NS::Array* devices = MTL::CopyAllDevices();
+ for (uint64_t i = 0; i < devices->count(); i++) {
+ MTL::Device* loc_device = (MTL::Device*)devices->object(i);
+ printf("Device id: %" PRIu64 " name: %s\n", i, loc_device->name()->cString(NS::UTF8StringEncoding));
+ }
#endif
return VKFFT_SUCCESS;
}
@@ -653,7 +798,7 @@ VkFFTResult performVulkanFFT(VkGPU* vkGPU, VkFFTApplication* app, VkFFTLaunchPar
ze_command_list_handle_t commandList = {};
res = zeCommandListCreate(vkGPU->context, vkGPU->device, &commandListDescription, &commandList);
if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
-
+
launchParams->commandList = &commandList;
//Record commands num_iter times. Allows to perform multiple convolutions/transforms in one submit.
for (uint64_t i = 0; i < num_iter; i++) {
@@ -662,7 +807,7 @@ VkFFTResult performVulkanFFT(VkGPU* vkGPU, VkFFTApplication* app, VkFFTLaunchPar
}
res = zeCommandListClose(commandList);
if (res != 0) return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER;
-
+
std::chrono::steady_clock::time_point timeSubmit = std::chrono::steady_clock::now();
res = zeCommandQueueExecuteCommandLists(vkGPU->commandQueue, 1, &commandList, 0);
if (res != 0) return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE;
@@ -674,6 +819,27 @@ VkFFTResult performVulkanFFT(VkGPU* vkGPU, VkFFTApplication* app, VkFFTLaunchPar
//printf("Pure submit execution time per num_iter: %.3f ms\n", totTime / num_iter);
res = zeCommandListDestroy(commandList);
if (res != 0) return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST;
+#elif(VKFFT_BACKEND==5)
+ MTL::CommandBuffer* commandBuffer = vkGPU->queue->commandBuffer();
+ if (commandBuffer == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
+ launchParams->commandBuffer = commandBuffer;
+ MTL::ComputeCommandEncoder* commandEncoder = commandBuffer->computeCommandEncoder();
+ if (commandEncoder == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
+ launchParams->commandEncoder = commandEncoder;
+ for (uint64_t i = 0; i < num_iter; i++) {
+ resFFT = VkFFTAppend(app, inverse, launchParams);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
+ }
+ commandEncoder->endEncoding();
+
+ std::chrono::steady_clock::time_point timeSubmit = std::chrono::steady_clock::now();
+ commandBuffer->commit();
+ commandBuffer->waitUntilCompleted();
+ std::chrono::steady_clock::time_point timeEnd = std::chrono::steady_clock::now();
+ double totTime = std::chrono::duration_cast<std::chrono::microseconds>(timeEnd - timeSubmit).count() * 0.001;
+
+ commandEncoder->release();
+ commandBuffer->release();
#endif
return resFFT;
}
@@ -787,6 +953,29 @@ VkFFTResult performVulkanFFTiFFT(VkGPU* vkGPU, VkFFTApplication* app, VkFFTLaunc
time_result[0] = totTime / num_iter;
res = zeCommandListDestroy(commandList);
if (res != 0) return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST;
+#elif(VKFFT_BACKEND==5)
+ MTL::CommandBuffer* commandBuffer = vkGPU->queue->commandBuffer();
+ if (commandBuffer == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
+ launchParams->commandBuffer = commandBuffer;
+ MTL::ComputeCommandEncoder* commandEncoder = commandBuffer->computeCommandEncoder();
+ if (commandEncoder == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST;
+ launchParams->commandEncoder = commandEncoder;
+ for (uint64_t i = 0; i < num_iter; i++) {
+ resFFT = VkFFTAppend(app, -1, launchParams);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
+ resFFT = VkFFTAppend(app, 1, launchParams);
+ if (resFFT != VKFFT_SUCCESS) return resFFT;
+ }
+ commandEncoder->endEncoding();
+
+ std::chrono::steady_clock::time_point timeSubmit = std::chrono::steady_clock::now();
+ commandBuffer->commit();
+ commandBuffer->waitUntilCompleted();
+ std::chrono::steady_clock::time_point timeEnd = std::chrono::steady_clock::now();
+ double totTime = std::chrono::duration_cast<std::chrono::microseconds>(timeEnd - timeSubmit).count() * 0.001;
+ time_result[0] = totTime / num_iter;
+ commandEncoder->release();
+ commandBuffer->release();
#endif
return resFFT;
-}
\ No newline at end of file
+}
diff --git a/debian/changelog b/debian/changelog
index 73b22d3..afc248a 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+vkfft (1.2.31+ds1-1) UNRELEASED; urgency=low
+
+ * New upstream release.
+
+ -- Debian Janitor <janitor@jelmer.uk> Sun, 11 Jun 2023 12:51:19 -0000
+
vkfft (1.2.26+ds1-1) unstable; urgency=medium
* New upstream version 1.2.26+ds1
diff --git a/debian/patches/0001-Use-Debian-version-of-glslang.patch b/debian/patches/0001-Use-Debian-version-of-glslang.patch
index f340b71..69e4b6e 100644
--- a/debian/patches/0001-Use-Debian-version-of-glslang.patch
+++ b/debian/patches/0001-Use-Debian-version-of-glslang.patch
@@ -35,10 +35,10 @@ Subject: Use Debian version of glslang
vkFFT/vkFFT.h | 2 +-
30 files changed, 30 insertions(+), 30 deletions(-)
-diff --git a/Vulkan_FFT.cpp b/Vulkan_FFT.cpp
-index 5e7b9bf..f7de958 100644
---- a/Vulkan_FFT.cpp
-+++ b/Vulkan_FFT.cpp
+Index: vkfft.git/Vulkan_FFT.cpp
+===================================================================
+--- vkfft.git.orig/Vulkan_FFT.cpp
++++ vkfft.git/Vulkan_FFT.cpp
@@ -9,7 +9,7 @@
#include <inttypes.h>
#if(VKFFT_BACKEND==0)
@@ -48,10 +48,10 @@ index 5e7b9bf..f7de958 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_0_benchmark_VkFFT_single.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_0_benchmark_VkFFT_single.cpp
-index aae77e2..895ac24 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_0_benchmark_VkFFT_single.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_0_benchmark_VkFFT_single.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_0_benchmark_VkFFT_single.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_0_benchmark_VkFFT_single.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_0_benchmark_VkFFT_single.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -61,10 +61,10 @@ index aae77e2..895ac24 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp
-index 84ecc9e..0251a1e 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -74,10 +74,10 @@ index 84ecc9e..0251a1e 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp
-index 4cf990a..b3a83a4 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -87,10 +87,10 @@ index 4cf990a..b3a83a4 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp
-index 202ee75..bdb9945 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -100,10 +100,10 @@ index 202ee75..bdb9945 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp
-index 4b286e6..d2773f9 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -113,10 +113,10 @@ index 4b286e6..d2773f9 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp
-index 5c0cb5f..2d34037 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -126,10 +126,10 @@ index 5c0cb5f..2d34037 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp
-index dddcdde..2e28af6 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -139,10 +139,10 @@ index dddcdde..2e28af6 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp
-index 48a6325..53d2bbb 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -152,10 +152,10 @@ index 48a6325..53d2bbb 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp
-index d1f25c9..6fbeccc 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -165,10 +165,10 @@ index d1f25c9..6fbeccc 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_13_precision_VkFFT_half.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_13_precision_VkFFT_half.cpp
-index 59b8354..5e05109 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_13_precision_VkFFT_half.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_13_precision_VkFFT_half.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_13_precision_VkFFT_half.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_13_precision_VkFFT_half.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_13_precision_VkFFT_half.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -178,10 +178,10 @@ index 59b8354..5e05109 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_14_precision_VkFFT_single_nonPow2.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_14_precision_VkFFT_single_nonPow2.cpp
-index 77fc99b..4aad477 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_14_precision_VkFFT_single_nonPow2.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_14_precision_VkFFT_single_nonPow2.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_14_precision_VkFFT_single_nonPow2.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_14_precision_VkFFT_single_nonPow2.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_14_precision_VkFFT_single_nonPow2.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -191,10 +191,10 @@ index 77fc99b..4aad477 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_15_precision_VkFFT_single_r2c.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_15_precision_VkFFT_single_r2c.cpp
-index 257e9a5..0194c57 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_15_precision_VkFFT_single_r2c.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_15_precision_VkFFT_single_r2c.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_15_precision_VkFFT_single_r2c.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_15_precision_VkFFT_single_r2c.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_15_precision_VkFFT_single_r2c.cpp
@@ -14,7 +14,7 @@
#if(VKFFT_BACKEND==0)
@@ -204,10 +204,10 @@ index 257e9a5..0194c57 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp
-index 3d02b66..f23357b 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -217,10 +217,10 @@ index 3d02b66..f23357b 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp
-index 60129db..600638f 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -230,10 +230,10 @@ index 60129db..600638f 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_18_precision_VkFFT_double_nonPow2.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_18_precision_VkFFT_double_nonPow2.cpp
-index edacae7..ba77445 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_18_precision_VkFFT_double_nonPow2.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_18_precision_VkFFT_double_nonPow2.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_18_precision_VkFFT_double_nonPow2.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_18_precision_VkFFT_double_nonPow2.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_18_precision_VkFFT_double_nonPow2.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -243,10 +243,10 @@ index edacae7..ba77445 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_1_benchmark_VkFFT_double.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_1_benchmark_VkFFT_double.cpp
-index 2e17950..29de8ff 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_1_benchmark_VkFFT_double.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_1_benchmark_VkFFT_double.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_1_benchmark_VkFFT_double.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_1_benchmark_VkFFT_double.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_1_benchmark_VkFFT_double.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -256,10 +256,10 @@ index 2e17950..29de8ff 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_2_benchmark_VkFFT_half.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_2_benchmark_VkFFT_half.cpp
-index 9aa9776..c81e9fc 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_2_benchmark_VkFFT_half.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_2_benchmark_VkFFT_half.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_2_benchmark_VkFFT_half.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_2_benchmark_VkFFT_half.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_2_benchmark_VkFFT_half.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -269,10 +269,10 @@ index 9aa9776..c81e9fc 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_3_benchmark_VkFFT_single_3d.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_3_benchmark_VkFFT_single_3d.cpp
-index f337314..8a03916 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_3_benchmark_VkFFT_single_3d.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_3_benchmark_VkFFT_single_3d.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_3_benchmark_VkFFT_single_3d.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_3_benchmark_VkFFT_single_3d.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_3_benchmark_VkFFT_single_3d.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -282,10 +282,10 @@ index f337314..8a03916 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_4_benchmark_VkFFT_single_3d_zeropadding.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_4_benchmark_VkFFT_single_3d_zeropadding.cpp
-index 466bf9b..69c0c25 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_4_benchmark_VkFFT_single_3d_zeropadding.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_4_benchmark_VkFFT_single_3d_zeropadding.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_4_benchmark_VkFFT_single_3d_zeropadding.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_4_benchmark_VkFFT_single_3d_zeropadding.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_4_benchmark_VkFFT_single_3d_zeropadding.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -295,10 +295,10 @@ index 466bf9b..69c0c25 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp
-index c536376..b9cb2ce 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -308,10 +308,10 @@ index c536376..b9cb2ce 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp
-index 4b471e6..dd7b28c 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -321,10 +321,10 @@ index 4b471e6..dd7b28c 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp
-index 7b6fccf..935d32a 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -334,10 +334,10 @@ index 7b6fccf..935d32a 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_5_benchmark_VkFFT_single_disableReorderFourStep.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_5_benchmark_VkFFT_single_disableReorderFourStep.cpp
-index fbfa396..414b180 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_5_benchmark_VkFFT_single_disableReorderFourStep.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_5_benchmark_VkFFT_single_disableReorderFourStep.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_5_benchmark_VkFFT_single_disableReorderFourStep.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_5_benchmark_VkFFT_single_disableReorderFourStep.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_5_benchmark_VkFFT_single_disableReorderFourStep.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -347,10 +347,10 @@ index fbfa396..414b180 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp
-index 51818c0..fa1c3be 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -360,10 +360,10 @@ index 51818c0..fa1c3be 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp
-index 52b8a7e..5ce4950 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -373,10 +373,10 @@ index 52b8a7e..5ce4950 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp b/benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp
-index 81c1e0d..020b61f 100644
---- a/benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -386,10 +386,10 @@ index 81c1e0d..020b61f 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/user_benchmark_VkFFT.cpp b/benchmark_scripts/vkFFT_scripts/src/user_benchmark_VkFFT.cpp
-index 5e63644..3fa0a79 100644
---- a/benchmark_scripts/vkFFT_scripts/src/user_benchmark_VkFFT.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/user_benchmark_VkFFT.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/user_benchmark_VkFFT.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/user_benchmark_VkFFT.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/user_benchmark_VkFFT.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -399,10 +399,10 @@ index 5e63644..3fa0a79 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp b/benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp
-index 00a0426..31ae013 100644
---- a/benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp
-+++ b/benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp
+Index: vkfft.git/benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp
+===================================================================
+--- vkfft.git.orig/benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp
++++ vkfft.git/benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp
@@ -13,7 +13,7 @@
#if(VKFFT_BACKEND==0)
@@ -412,10 +412,10 @@ index 00a0426..31ae013 100644
#elif(VKFFT_BACKEND==1)
#include <cuda.h>
#include <cuda_runtime.h>
-diff --git a/vkFFT/vkFFT.h b/vkFFT/vkFFT.h
-index 26e8580..eae06be 100644
---- a/vkFFT/vkFFT.h
-+++ b/vkFFT/vkFFT.h
+Index: vkfft.git/vkFFT/vkFFT.h
+===================================================================
+--- vkfft.git.orig/vkFFT/vkFFT.h
++++ vkfft.git/vkFFT/vkFFT.h
@@ -35,7 +35,7 @@
#include <inttypes.h>
#if(VKFFT_BACKEND==0)
diff --git a/documentation/VkFFT_API_guide.lyx b/documentation/VkFFT_API_guide.lyx
index 9df3ce8..77cf9d9 100644
--- a/documentation/VkFFT_API_guide.lyx
+++ b/documentation/VkFFT_API_guide.lyx
@@ -136,8 +136,8 @@ vspace{1cm}
\backslash
scshape
\backslash
-LARGE VkFFT - Vulkan/CUDA/HIP/OpenCL/Level Zero Fast Fourier Transform library
-
+LARGE VkFFT - Vulkan/CUDA/HIP/OpenCL/Level Zero/Metal Fast Fourier Transform
+ library
\backslash
par}
\end_layout
@@ -192,7 +192,7 @@ vspace{1cm}
{
\backslash
-large August 2022, version 1.2.26
+large October 2022, version 1.2.30
\backslash
par}
\end_layout
@@ -237,8 +237,8 @@ Introduction
\end_layout
\begin_layout Standard
-This document describes VkFFT - Vulkan/CUDA/HIP/OpenCL/Level Zero Fast Fourier
- Transform library.
+This document describes VkFFT - Vulkan/CUDA/HIP/OpenCL/Level Zero/Metal
+ Fast Fourier Transform library.
It describes the features and current limitations of VkFFT, explains the
API and compares it to other FFT libraries (like FFTW and cuFFT) on the
set of examples.
@@ -334,7 +334,8 @@ Copy vkFFT.h file into one of the directories included in the user's project.
\begin_layout Enumerate
\noindent
Define VKFFT_BACKEND as a number corresponding to the API used in the user's
- project: 0 - Vulkan, 1 - CUDA, 2 - HIP, 3 - OpenCL, 4 - Level Zero.
+ project: 0 - Vulkan, 1 - CUDA, 2 - HIP, 3 - OpenCL, 4 - Level Zero, 5 -
+ Metal.
Definition is done like:
\begin_inset ERT
status open
@@ -395,7 +396,7 @@ begin{minted}[tabsize=4,obeytabs,breaklines]{cmake}
\begin_layout Plain Layout
set(VKFFT_BACKEND 1 CACHE STRING "0 - Vulkan, 1 - CUDA, 2 - HIP, 3 - OpenCL,
- 4 - Level Zero")
+ 4 - Level Zero, 5 - Metal")
\end_layout
\begin_layout Plain Layout
@@ -963,6 +964,100 @@ end{mdframed}
\end_inset
+\end_layout
+
+\begin_layout Enumerate
+Metal API: Metal.
+ Sample CMakeLists can look like this:
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+begin{mdframed}[backgroundcolor=bg]
+\end_layout
+
+\begin_layout Plain Layout
+
+
+\backslash
+begin{minted}[tabsize=4,obeytabs,breaklines]{cmake}
+\end_layout
+
+\begin_layout Plain Layout
+
+add_compile_options(-WMTL_IGNORE_WARNINGS)
+\end_layout
+
+\begin_layout Plain Layout
+
+find_library(FOUNDATION_LIB Foundation REQUIRED)
+\end_layout
+
+\begin_layout Plain Layout
+
+find_library(QUARTZ_CORE_LIB QuartzCore REQUIRED)
+\end_layout
+
+\begin_layout Plain Layout
+
+find_library(METAL_LIB Metal REQUIRED)
+\end_layout
+
+\begin_layout Plain Layout
+
+target_include_directories(${PROJECT_NAME} PUBLIC "metal-cpp/")
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vk
+FFT/)
+\end_layout
+
+\begin_layout Plain Layout
+
+add_library(VkFFT INTERFACE)
+\end_layout
+
+\begin_layout Plain Layout
+
+target_compile_definitions(VkFFT INTERFACE -DVKFFT_BACKEND=5)
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+target_link_libraries(${PROJECT_NAME} PUBLIC ${FOUNDATION_LIB} ${QUARTZ_CORE_LIB
+} ${METAL_LIB} VkFFT)
+\end_layout
+
+\begin_layout Plain Layout
+
+
+\backslash
+end{minted}
+\end_layout
+
+\begin_layout Plain Layout
+
+
+\backslash
+end{mdframed}
+\end_layout
+
+\end_inset
+
+
\end_layout
\end_deeper
@@ -1228,7 +1323,8 @@ VkFFT buffers
VkFFT allows for explicit control over the data flow, which makes both in-place
and out-of-place transforms possible.
Buffers are passed to VkFFT as VkBuffer pointer in Vulkan, as double void
- pointers in CUDA/HIP/Level Zero and as cl_mem pointer in OpenCL.
+ pointers in CUDA/HIP/Level Zero, as cl_mem pointer in OpenCL and as MTL::Buffer
+ pointer in Metal.
This is done to maintain a uniform data pattern because some of the buffers
can be allocated automatically.
@@ -2644,6 +2740,14 @@ ndLaunchKernel calls to user-defined command list ze_command_list_handle_t.
\end_layout
+\begin_layout Itemize
+Metal API: similar to Vulkan, VkFFT appends a sequence of dispatchThreads
+ calls to user-defined command encoder MTL::ComputeCommandEncoder.
+ MTL::ComputeCommandEncoder and its MTL::CommandBuffer must be provided
+ as a pointer in VkFFTLaunchParams.
+
+\end_layout
+
\begin_layout Standard
If VkFFT fails during the VkFFTAppend call, it will not free the application
and the resources allocated there - use a separate call for that.
@@ -2937,6 +3041,23 @@ uint32_t commandQueueID; // ID of the commandQueue with compute and copy
\begin_layout Plain Layout
+#elif(VKFFT_BACKEND==5)
+\end_layout
+
+\begin_layout Plain Layout
+
+MTL::Device* device; // Pointer to Metal device, obtained from MTL::CopyAllDevic
+es
+\end_layout
+
+\begin_layout Plain Layout
+
+MTL::CommandQueue* queue; // Pointer to Metal queue, obtained from device->newCo
+mmandQueue()
+\end_layout
+
+\begin_layout Plain Layout
+
#endif
\end_layout
@@ -3172,6 +3293,80 @@ cl_mem* kernel; // Pointer to device buffer used to read kernel data from
\begin_layout Plain Layout
+#elif(VKFFT_BACKEND==4)
+\end_layout
+
+\begin_layout Plain Layout
+
+void** buffer; // Pointer to device buffer used for computations
+\end_layout
+
+\begin_layout Plain Layout
+
+void** tempBuffer; // Needed if reorderFourStep is enabled to transpose
+ the array.
+ Same size as buffer.
+ Default 0.
+ Setting to non zero value enables manual user allocation
+\end_layout
+
+\begin_layout Plain Layout
+
+void** inputBuffer; // Pointer to device buffer used to read data from if
+ isInputFormatted is enabled
+\end_layout
+
+\begin_layout Plain Layout
+
+void** outputBuffer; // Pointer to device buffer used to write data to
+ if isOutputFormatted is enabled
+\end_layout
+
+\begin_layout Plain Layout
+
+void** kernel; // Pointer to device buffer used to read kernel data from
+ if performConvolution is enabled
+\end_layout
+
+\begin_layout Plain Layout
+
+#elif(VKFFT_BACKEND==5)
+\end_layout
+
+\begin_layout Plain Layout
+
+MTL::Buffer** buffer; // Pointer to device buffer used for computations
+\end_layout
+
+\begin_layout Plain Layout
+
+MTL::Buffer** tempBuffer; // Needed if reorderFourStep is enabled to transpose
+ the array.
+ Same size as buffer.
+ Default 0.
+ Setting to non zero value enables manual user allocation
+\end_layout
+
+\begin_layout Plain Layout
+
+MTL::Buffer** inputBuffer; // Pointer to device buffer used to read data
+ from if isInputFormatted is enabled
+\end_layout
+
+\begin_layout Plain Layout
+
+MTL::Buffer** outputBuffer; // Pointer to device buffer used to write data
+ to if isOutputFormatted is enabled
+\end_layout
+
+\begin_layout Plain Layout
+
+MTL::Buffer** kernel; // Pointer to device buffer used to read kernel data
+ from if performConvolution is enabled
+\end_layout
+
+\begin_layout Plain Layout
+
#endif
\end_layout
@@ -3429,6 +3624,7 @@ uint64_t saveApplicationToString; // Will save all compiled binaries to
VkFFTApplication.saveApplicationString (will be allocated by VkFFT, deallocated
with deleteVkFFT call).
VkFFTApplication.applicationStringSize will contain size of binary in bytes.
+ Currently disabled in Metal backend.
(0 - off, 1 - on)
\end_layout
@@ -3438,15 +3634,29 @@ uint64_t loadApplicationFromString; // Will load all binaries from loadApplicati
onString instead of recompiling them (must be allocated by user, must contain
what saveApplicationToString call generated previously in VkFFTApplication.saveA
pplicationString).
+ Currently disabled in Metal backend.
(0 - off, 1 - on).
Mutually exclusive with saveApplicationToString
\end_layout
\begin_layout Plain Layout
-void* loadApplicationString; // Memory array (uint32_t* for Vulkan/HIP,
- char* for CUDA/OpenCL) through which user can load VkFFT binaries, must
- be provided by user if loadApplicationFromString = 1.
+void* loadApplicationString; // Memory binary array through which user can
+ load VkFFT binaries, must be provided by user if loadApplicationFromString
+ = 1.
+ Use rb/wb flags to load/save.
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+uint64_t disableSetLocale; // disables all VkFFT attempts to set locale
+ to C - user must ensure that VkFFT has C locale during the plan initialization.
+ This option is needed for multithreading.
+ Default 0.
\end_layout
@@ -3509,6 +3719,52 @@ uint64_t* paddedSizes; // described in useCustomBluesteinPaddingPattern
\begin_layout Plain Layout
+uint64_t fixMinRaderPrimeMult; // start direct multiplication Rader's algorithm
+ for radix primes from this number.
+ This means that VkFFT will inline custom Rader kernels if sequence is divisible
+ by these primes.
+ Default is 17, as VkFFT has kernels for 2-13.
+ If you make it less than 13, VkFFT will switch from these kernels to Rader.
+\end_layout
+
+\begin_layout Plain Layout
+
+uint64_t fixMaxRaderPrimeMult; // switch from Mult Rader's algorithm for
+ radix primes from this number.
+ Current limitation for Rader is maxThreadNum/2+1, realistically you would
+ want to switch somewhere on 30-100 range.
+ Default is vendor-specific (currently ~40)
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+uint64_t fixMinRaderPrimeFFT; // start FFT convolution version of Rader
+ for radix primes from this number.
+ Better than direct multiplication version for almost all primes (except
+ small ones, like 17-23 on some GPUs).
+ Must be greater than or equal to fixMinRaderPrimeMult.
+ Default 29 on AMD and 17 on other GPUs.
+
+\end_layout
+
+\begin_layout Plain Layout
+
+uint64_t fixMaxRaderPrimeFFT; // switch to Bluestein's algorithm for radix
+ primes from this number.
+ Switch may happen earlier if prime can't fit in shared memory.
+ Default is 16384, which is bigger than most current GPU's shared memory.
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
// Optional zero padding control parameters: (default 0 if not stated otherwise)
\end_layout
@@ -3603,8 +3859,8 @@ uint64_t registerBoost; // Specify if register file size is bigger than
4 to emulate 128KB of shared memory).
Defaults: Nvidia - 4 in Vulkan/OpenCL, 1 in CUDA backend; AMD - 2 if shared
memory >= 64KB, else 4 in Vulkan/OpenCL backend, 1 in HIP backend; Intel
- - 1 if shared memory >= 64KB, else 2 in Vulkan/OpenCL/Level Zero backends;
- Default 1
+ - 1 if shared memory >= 64KB, else 2 in Vulkan/OpenCL/Level Zero backends,
+ 1 in Metal; Default 1
\end_layout
\begin_layout Plain Layout
@@ -3658,6 +3914,17 @@ uint64_t localPageSize; // In KB, the size to split page into if sequence
\begin_layout Plain Layout
+uint64_t computeCapabilityMajor; // CUDA/HIP compute capability of the device (major version)
+\end_layout
+
+\begin_layout Plain Layout
+
+uint64_t computeCapabilityMinor; // CUDA/HIP compute capability of the device (minor version)
+
+\end_layout
+
+\begin_layout Plain Layout
+
uint64_t maxComputeWorkGroupCount[3]; // maxComputeWorkGroupCount from VkPhysica
lDeviceLimits
\end_layout
@@ -3739,6 +4006,11 @@ einPaddingPattern
\begin_layout Plain Layout
+uint64_t useRaderUintLUT; // allocate additional LUT to store g_pow
+\end_layout
+
+\begin_layout Plain Layout
+
uint64_t vendorID; // vendorID 0x10DE - NVIDIA, 0x8086 - Intel, 0x1002 -
AMD, etc
\end_layout
@@ -3825,6 +4097,21 @@ ze_command_list_handle_t* commandList; // Filled at app creation
\begin_layout Plain Layout
+#elif(VKFFT_BACKEND==5)
+\end_layout
+
+\begin_layout Plain Layout
+
+MTL::CommandBuffer* commandBuffer; // Filled at app execution
+\end_layout
+
+\begin_layout Plain Layout
+
+MTL::ComputeCommandEncoder* commandEncoder; // Filled at app execution
+\end_layout
+
+\begin_layout Plain Layout
+
#endif
\end_layout
@@ -3980,6 +4267,19 @@ uint32_t commandQueueID - ID of the commandQueue with compute and copy capabilit
ies
\end_layout
+\begin_layout Standard
+Metal API will need the following information:
+\end_layout
+
+\begin_layout Itemize
+MTL::Device* device - Pointer to Metal device, obtained from MTL::CopyAllDevices
+\end_layout
+
+\begin_layout Itemize
+MTL::CommandQueue* queue - Pointer to Metal queue, obtained from device->newComm
+andQueue()
+\end_layout
+
\begin_layout Subsubsection
Memory management parameters
\end_layout
@@ -4013,11 +4313,12 @@ kernel buffer, used for calculation of convolutions and cross-correlations
\begin_layout Standard
These buffers must be passed by a pointer: in Vulkan API they are provided
as VkBuffer*, in CUDA, HIP and Level Zero they are provided as void**,
- in OpenCL, they are provided as cl_mem*.
- Even though the underlying structure (VkBuffer, void*, cl_mem) is not a
- memory but just a number that the driver can use to access corresponding
- allocated memory on the GPU, passing them by a pointer allows for the user
- to query multiple GPU allocated buffers for VkFFT to use.
+ in OpenCL they are provided as cl_mem*, in Metal they are provided as MTL::Buff
+er**.
+ Even though the underlying structure (VkBuffer, void*, cl_mem, MTL::Buffer*)
+ is not a memory but just a number that the driver can use to access correspondi
+ng allocated memory on the GPU, passing them by a pointer allows for the
+ user to query multiple GPU allocated buffers for VkFFT to use.
Currently, it is only supported in Vulkan API - each of five buffer types
can be made out of multiple separate memory allocations.
For example, it is possible to combine multiple small unused at the point
@@ -4158,8 +4459,9 @@ Precision parameters (and some things that can affect it):
\begin_layout Standard
uint64_t doublePrecision - perform calculations in double precision.
Default 0, set to 1 to enable.
- In Vulkan/OpenCL/Level Zero your device must support double-precision functiona
+ In Vulkan/OpenCL/Level Zero your device must support double precision functiona
lity.
+ Metal API does not support double precision.
Optional parameter.
\end_layout
@@ -4171,8 +4473,9 @@ uint64_t doublePrecisionFloatMemory - perform calculations in double precision,
This option increases precision, but not that much to be recommended for
actual use.
Default 0, set to 1 to enable.
- In Vulkan/OpenCL/Level Zero your device must support double-precision functiona
+ In Vulkan/OpenCL/Level Zero your device must support double precision functiona
lity.
+ Metal API does not support double precision.
Experimental feature.
Optional parameter.
\end_layout
@@ -4345,6 +4648,7 @@ uint64_t saveApplicationToString - will save all compiled binaries to VkFFTAppli
cation.saveApplicationString (will be allocated by VkFFT, deallocated with
deleteVkFFT call).
VkFFTApplication.applicationStringSize will contain size of binary in bytes.
+ Currently disabled in Metal backend.
Default 0, set to 1 to enable.
Optional parameter.
\end_layout
@@ -4354,18 +4658,64 @@ uint64_t loadApplicationFromString - will load all binaries from loadApplication
String instead of recompiling them (loadApplicationString must be allocated
by user, must contain what saveApplicationToString call generated previously
in VkFFTApplication.saveApplicationString).
+ Currently disabled in Metal backend.
Default 0, set to 1 to enable.
Optional parameter.
Mutually exclusive with saveApplicationToString
\end_layout
\begin_layout Standard
-void* loadApplicationString - memory array (uint32_t* for Vulkan, HIP and
- Level Zero, char* for CUDA/OpenCL) through which user can load VkFFT binaries,
- must be provided by user if loadApplicationFromString = 1.
+void* loadApplicationString - memory binary array through which user can
+ load VkFFT binaries, must be provided by user if loadApplicationFromString
+ = 1.
+ Use binary fopen modes (rb to load, wb to save).
+\end_layout
+
+\begin_layout Standard
+uint64_t disableSetLocale - disables all VkFFT attempts to set locale to
+ C - user must ensure that VkFFT has C locale during the plan initialization.
+ This option is needed for multithreading.
+ Default 0.
+\end_layout
+
+\begin_layout Subsubsection
+Rader control parameters
+\end_layout
+
+\begin_layout Standard
+uint64_t fixMinRaderPrimeMult - start direct multiplication Rader's algorithm
+ for radix primes from this number.
+ This means that VkFFT will inline custom Rader kernels if sequence is divisible
+ by these primes.
+ Default is 17, as VkFFT has kernels for 2-13.
+ If you make it less than 13, VkFFT will switch from these kernels to Rader.
+\end_layout
+
+\begin_layout Standard
+uint64_t fixMaxRaderPrimeMult - switch from Mult Rader's algorithm for radix
+ primes from this number.
+ Current limitation for Rader is maxThreadNum/2+1, realistically you would
+ want to switch somewhere on 30-100 range.
+ Default is vendor-specific (currently ~40)
+\end_layout
+
+\begin_layout Standard
+uint64_t fixMinRaderPrimeFFT - start FFT convolution version of Rader for
+ radix primes from this number.
+ Better than direct multiplication version for almost all primes (except
+ small ones, like 17-23 on some GPUs).
+ Must be greater than or equal to fixMinRaderPrimeMult.
+ Default 29 on AMD and 17 on other GPUs.
\end_layout
+\begin_layout Standard
+uint64_t fixMaxRaderPrimeFFT - switch to Bluestein's algorithm for radix
+ primes from this number.
+ Switch may happen earlier if prime can't fit in shared memory.
+ Default is 16384, which is bigger than most current GPU's shared memory.
+\end_layout
+
\begin_layout Subsubsection
Bluestein control parameters
\end_layout
@@ -4548,7 +4898,7 @@ Register overutilization
\begin_layout Standard
Only works in C2C mode, without convolution support.
Enabled in Vulkan, OpenCL and Level Zero APIs only (it works in other APIs,
- but worse).
+ but worse, does not work in Metal).
Experimental feature.
\end_layout
@@ -4584,6 +4934,15 @@ uint64_t registerBoost4Step - specify if register file overutilization should
Extra advanced parameters (filled automatically)
\end_layout
+\begin_layout Standard
+uint64_t computeCapabilityMajor - CUDA/HIP compute capability of the device (major version)
+\end_layout
+
+\begin_layout Standard
+uint64_t computeCapabilityMinor - CUDA/HIP compute capability of the device (minor version)
+
+\end_layout
+
\begin_layout Standard
uint64_t maxComputeWorkGroupCount[3] - how many workgroups can be launched
at one dispatch.
@@ -4658,13 +5017,17 @@ int64_t maxTempLength - specify how big can the buffer used for intermediate
\end_layout
\begin_layout Standard
-uint64_t autoCustomBluesteinPaddingPattern; // default value for useCustomBluest
-einPaddingPattern
+uint64_t autoCustomBluesteinPaddingPattern - default value for useCustomBluestei
+nPaddingPattern
\end_layout
\begin_layout Standard
-uint64_t vendorID; // vendorID 0x10DE - NVIDIA, 0x8086 - Intel, 0x1002 -
- AMD, etc.
+uint64_t useRaderUintLUT - allocate additional LUT to store g_pow
+\end_layout
+
+\begin_layout Standard
+uint64_t vendorID - vendorID 0x10DE - NVIDIA, 0x8086 - Intel, 0x1002 - AMD,
+ etc.
\end_layout
\begin_layout Standard
@@ -5022,6 +5385,21 @@ uint32_t commandQueueID;
\begin_layout Plain Layout
+#elif(VKFFT_BACKEND==5) //Metal API
+\end_layout
+
+\begin_layout Plain Layout
+
+MTL::Device* device;
+\end_layout
+
+\begin_layout Plain Layout
+
+MTL::CommandQueue* queue;
+\end_layout
+
+\begin_layout Plain Layout
+
#endif
\end_layout
@@ -6214,6 +6592,21 @@ launchParams->commandList = &commandList;
\begin_layout Plain Layout
+#elif(VKFFT_BACKEND==5) //Metal API
+\end_layout
+
+\begin_layout Plain Layout
+
+launchParams->commandBuffer = commandBuffer;
+\end_layout
+
+\begin_layout Plain Layout
+
+launchParams->commandEncoder = commandEncoder;
+\end_layout
+
+\begin_layout Plain Layout
+
#endif
\end_layout
@@ -7303,6 +7696,8 @@ This example shows how to save/load binaries generated by VkFFT.
This can reduce time taken by initializeVkFFT call by removing RTC components
from it.
Be sure that rest of the configuration stays the same to reuse the binary.
+ Use binary fopen modes (rb to load, wb to save).
+ This does not currently work in Metal backend.
\end_layout
\begin_layout Standard
@@ -7374,28 +7769,7 @@ if (configuration.loadApplicationFromString) {
\begin_layout Plain Layout
-#if((VKFFT_BACKEND==0) || (VKFFT_BACKEND==2) || (VKFFT_BACKEND==4))
-\end_layout
-
-\begin_layout Plain Layout
-
- kernelCache = fopen("VkFFT_binary", "rb"); //Vulkan and HIP backends load
- data as a uint32_t sequence
-\end_layout
-
-\begin_layout Plain Layout
-
-#else
-\end_layout
-
-\begin_layout Plain Layout
-
- kernelCache = fopen("VkFFT_binary", "r");
-\end_layout
-
-\begin_layout Plain Layout
-
-#endif
+ kernelCache = fopen("VkFFT_binary", "rb");
\end_layout
\begin_layout Plain Layout
@@ -7477,28 +7851,7 @@ if (configuration.saveApplicationToString) {
\begin_layout Plain Layout
-#if((VKFFT_BACKEND==0) || (VKFFT_BACKEND==2) || (VKFFT_BACKEND==4))
-\end_layout
-
-\begin_layout Plain Layout
-
- kernelCache = fopen("VkFFT_binary", "wb"); //Vulkan and HIP backends save
- data as a uint32_t sequence
-\end_layout
-
-\begin_layout Plain Layout
-
-#else
-\end_layout
-
-\begin_layout Plain Layout
-
- kernelCache = fopen("VkFFT_binary", "w");
-\end_layout
-
-\begin_layout Plain Layout
-
-#endif
+ kernelCache = fopen("VkFFT_binary", "wb");
\end_layout
\begin_layout Plain Layout
diff --git a/documentation/VkFFT_API_guide.pdf b/documentation/VkFFT_API_guide.pdf
index ea5e079..f9b7ba5 100644
--- a/documentation/VkFFT_API_guide.pdf
+++ b/documentation/VkFFT_API_guide.pdf
@@ -270,434 +270,436 @@ endobj
<< /S /GoTo /D (subsubsection.3.3.6) >>
endobj
156 0 obj
-(\376\377\000B\000l\000u\000e\000s\000t\000e\000i\000n\000\040\000c\000o\000n\000t\000r\000o\000l\000\040\000p\000a\000r\000a\000m\000e\000t\000e\000r\000s)
+(\376\377\000R\000a\000d\000e\000r\000\040\000c\000o\000n\000t\000r\000o\000l\000\040\000p\000a\000r\000a\000m\000e\000t\000e\000r\000s)
endobj
157 0 obj
<< /S /GoTo /D (subsubsection.3.3.7) >>
endobj
160 0 obj
-(\376\377\000Z\000e\000r\000o\000\040\000p\000a\000d\000d\000i\000n\000g\000\040\000p\000a\000r\000a\000m\000e\000t\000e\000r\000s)
+(\376\377\000B\000l\000u\000e\000s\000t\000e\000i\000n\000\040\000c\000o\000n\000t\000r\000o\000l\000\040\000p\000a\000r\000a\000m\000e\000t\000e\000r\000s)
endobj
161 0 obj
<< /S /GoTo /D (subsubsection.3.3.8) >>
endobj
164 0 obj
-(\376\377\000C\000o\000n\000v\000o\000l\000u\000t\000i\000o\000n\000\040\000p\000a\000r\000a\000m\000e\000t\000e\000r\000s)
+(\376\377\000Z\000e\000r\000o\000\040\000p\000a\000d\000d\000i\000n\000g\000\040\000p\000a\000r\000a\000m\000e\000t\000e\000r\000s)
endobj
165 0 obj
<< /S /GoTo /D (subsubsection.3.3.9) >>
endobj
168 0 obj
-(\376\377\000R\000e\000g\000i\000s\000t\000e\000r\000\040\000o\000v\000e\000r\000u\000t\000i\000l\000i\000z\000a\000t\000i\000o\000n)
+(\376\377\000C\000o\000n\000v\000o\000l\000u\000t\000i\000o\000n\000\040\000p\000a\000r\000a\000m\000e\000t\000e\000r\000s)
endobj
169 0 obj
<< /S /GoTo /D (subsubsection.3.3.10) >>
endobj
172 0 obj
-(\376\377\000E\000x\000t\000r\000a\000\040\000a\000d\000v\000a\000n\000c\000e\000d\000\040\000p\000a\000r\000a\000m\000e\000t\000e\000r\000s\000\040\000\050\000f\000i\000l\000l\000e\000d\000\040\000a\000u\000t\000o\000m\000a\000t\000i\000c\000a\000l\000l\000y\000\051)
+(\376\377\000R\000e\000g\000i\000s\000t\000e\000r\000\040\000o\000v\000e\000r\000u\000t\000i\000l\000i\000z\000a\000t\000i\000o\000n)
endobj
173 0 obj
-<< /S /GoTo /D (section.4) >>
+<< /S /GoTo /D (subsubsection.3.3.11) >>
endobj
176 0 obj
-(\376\377\000V\000k\000F\000F\000T\000\040\000B\000e\000n\000c\000h\000m\000a\000r\000k\000/\000P\000r\000e\000c\000i\000s\000i\000o\000n\000\040\000S\000u\000i\000t\000e\000\040\000a\000n\000d\000\040\000u\000t\000i\000l\000s\000\137\000V\000k\000F\000F\000T\000\040\000h\000e\000l\000p\000e\000r\000\040\000r\000o\000u\000t\000i\000n\000e\000s)
+(\376\377\000E\000x\000t\000r\000a\000\040\000a\000d\000v\000a\000n\000c\000e\000d\000\040\000p\000a\000r\000a\000m\000e\000t\000e\000r\000s\000\040\000\050\000f\000i\000l\000l\000e\000d\000\040\000a\000u\000t\000o\000m\000a\000t\000i\000c\000a\000l\000l\000y\000\051)
endobj
177 0 obj
-<< /S /GoTo /D (subsection.4.1) >>
+<< /S /GoTo /D (section.4) >>
endobj
180 0 obj
-(\376\377\000u\000t\000i\000l\000s\000\137\000V\000k\000F\000F\000T\000\040\000h\000e\000l\000p\000e\000r\000\040\000r\000o\000u\000t\000i\000n\000e\000s)
+(\376\377\000V\000k\000F\000F\000T\000\040\000B\000e\000n\000c\000h\000m\000a\000r\000k\000/\000P\000r\000e\000c\000i\000s\000i\000o\000n\000\040\000S\000u\000i\000t\000e\000\040\000a\000n\000d\000\040\000u\000t\000i\000l\000s\000\137\000V\000k\000F\000F\000T\000\040\000h\000e\000l\000p\000e\000r\000\040\000r\000o\000u\000t\000i\000n\000e\000s)
endobj
181 0 obj
-<< /S /GoTo /D (section.5) >>
+<< /S /GoTo /D (subsection.4.1) >>
endobj
184 0 obj
-(\376\377\000V\000k\000F\000F\000T\000\040\000C\000o\000d\000e\000\040\000E\000x\000a\000m\000p\000l\000e\000s)
+(\376\377\000u\000t\000i\000l\000s\000\137\000V\000k\000F\000F\000T\000\040\000h\000e\000l\000p\000e\000r\000\040\000r\000o\000u\000t\000i\000n\000e\000s)
endobj
185 0 obj
-<< /S /GoTo /D (subsection.5.1) >>
+<< /S /GoTo /D (section.5) >>
endobj
188 0 obj
-(\376\377\000D\000r\000i\000v\000e\000r\000\040\000i\000n\000i\000t\000i\000a\000l\000i\000z\000a\000t\000i\000o\000n\000s)
+(\376\377\000V\000k\000F\000F\000T\000\040\000C\000o\000d\000e\000\040\000E\000x\000a\000m\000p\000l\000e\000s)
endobj
189 0 obj
-<< /S /GoTo /D (subsection.5.2) >>
+<< /S /GoTo /D (subsection.5.1) >>
endobj
192 0 obj
-(\376\377\000S\000i\000m\000p\000l\000e\000\040\000F\000F\000T\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n\000\040\000e\000x\000a\000m\000p\000l\000e\000:\000\040\0001\000D\000\040\000\050\000o\000n\000e\000\040\000d\000i\000m\000e\000n\000s\000i\000o\000n\000a\000l\000\051\000\040\000C\0002\000C\000\040\000\050\000c\000o\000m\000p\000l\000e\000x\000\040\000t\000o\000\040\000c\000o\000m\000p\000l\000e\000x\000\051\000\040\000F\000P\0003\0002\000\040\000\050\000s\000i\000n\000g\000l\000e\000\040\000p\000r\000e\000c\000i\000s\000i\000o\000n\000\051\000\040\000F\000F\000T)
+(\376\377\000D\000r\000i\000v\000e\000r\000\040\000i\000n\000i\000t\000i\000a\000l\000i\000z\000a\000t\000i\000o\000n\000s)
endobj
193 0 obj
-<< /S /GoTo /D (subsection.5.3) >>
+<< /S /GoTo /D (subsection.5.2) >>
endobj
196 0 obj
-(\376\377\000A\000d\000v\000a\000n\000c\000e\000d\000\040\000F\000F\000T\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n\000\040\000e\000x\000a\000m\000p\000l\000e\000:\000\040\000N\000D\000,\000\040\000C\0002\000C\000/\000R\0002\000C\000/\000R\0002\000R\000,\000\040\000d\000i\000f\000f\000e\000r\000e\000n\000t\000\040\000p\000r\000e\000c\000i\000s\000i\000o\000n\000s\000,\000\040\000b\000a\000t\000c\000h\000e\000d\000\040\000F\000F\000T)
+(\376\377\000S\000i\000m\000p\000l\000e\000\040\000F\000F\000T\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n\000\040\000e\000x\000a\000m\000p\000l\000e\000:\000\040\0001\000D\000\040\000\050\000o\000n\000e\000\040\000d\000i\000m\000e\000n\000s\000i\000o\000n\000a\000l\000\051\000\040\000C\0002\000C\000\040\000\050\000c\000o\000m\000p\000l\000e\000x\000\040\000t\000o\000\040\000c\000o\000m\000p\000l\000e\000x\000\051\000\040\000F\000P\0003\0002\000\040\000\050\000s\000i\000n\000g\000l\000e\000\040\000p\000r\000e\000c\000i\000s\000i\000o\000n\000\051\000\040\000F\000F\000T)
endobj
197 0 obj
-<< /S /GoTo /D (subsection.5.4) >>
+<< /S /GoTo /D (subsection.5.3) >>
endobj
200 0 obj
-(\376\377\000A\000d\000v\000a\000n\000c\000e\000d\000\040\000F\000F\000T\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n\000\040\000e\000x\000a\000m\000p\000l\000e\000:\000\040\000o\000u\000t\000-\000o\000f\000-\000p\000l\000a\000c\000e\000\040\000R\0002\000C\000\040\000F\000F\000T\000\040\000w\000i\000t\000h\000\040\000c\000u\000s\000t\000o\000m\000\040\000s\000t\000r\000i\000d\000e\000s)
+(\376\377\000A\000d\000v\000a\000n\000c\000e\000d\000\040\000F\000F\000T\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n\000\040\000e\000x\000a\000m\000p\000l\000e\000:\000\040\000N\000D\000,\000\040\000C\0002\000C\000/\000R\0002\000C\000/\000R\0002\000R\000,\000\040\000d\000i\000f\000f\000e\000r\000e\000n\000t\000\040\000p\000r\000e\000c\000i\000s\000i\000o\000n\000s\000,\000\040\000b\000a\000t\000c\000h\000e\000d\000\040\000F\000F\000T)
endobj
201 0 obj
-<< /S /GoTo /D (subsection.5.5) >>
+<< /S /GoTo /D (subsection.5.4) >>
endobj
204 0 obj
-(\376\377\000A\000d\000v\000a\000n\000c\000e\000d\000\040\000F\000F\000T\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n\000\040\000e\000x\000a\000m\000p\000l\000e\000:\000\040\0003\000D\000\040\000z\000e\000r\000o\000-\000p\000a\000d\000d\000e\000d\000\040\000F\000F\000T)
+(\376\377\000A\000d\000v\000a\000n\000c\000e\000d\000\040\000F\000F\000T\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n\000\040\000e\000x\000a\000m\000p\000l\000e\000:\000\040\000o\000u\000t\000-\000o\000f\000-\000p\000l\000a\000c\000e\000\040\000R\0002\000C\000\040\000F\000F\000T\000\040\000w\000i\000t\000h\000\040\000c\000u\000s\000t\000o\000m\000\040\000s\000t\000r\000i\000d\000e\000s)
endobj
205 0 obj
-<< /S /GoTo /D (subsection.5.6) >>
+<< /S /GoTo /D (subsection.5.5) >>
endobj
208 0 obj
-(\376\377\000C\000o\000n\000v\000o\000l\000u\000t\000i\000o\000n\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n\000\040\000e\000x\000a\000m\000p\000l\000e\000:\000\040\0003\000x\0003\000\040\000m\000a\000t\000r\000i\000x\000-\000v\000e\000c\000t\000o\000r\000\040\000c\000o\000n\000v\000o\000l\000u\000t\000i\000o\000n\000\040\000i\000n\000\040\0001\000D)
+(\376\377\000A\000d\000v\000a\000n\000c\000e\000d\000\040\000F\000F\000T\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n\000\040\000e\000x\000a\000m\000p\000l\000e\000:\000\040\0003\000D\000\040\000z\000e\000r\000o\000-\000p\000a\000d\000d\000e\000d\000\040\000F\000F\000T)
endobj
209 0 obj
-<< /S /GoTo /D (subsection.5.7) >>
+<< /S /GoTo /D (subsection.5.6) >>
endobj
212 0 obj
-(\376\377\000C\000o\000n\000v\000o\000l\000u\000t\000i\000o\000n\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n\000\040\000e\000x\000a\000m\000p\000l\000e\000:\000\040\000R\0002\000C\000\040\000c\000r\000o\000s\000s\000-\000c\000o\000r\000r\000e\000l\000a\000t\000i\000o\000n\000\040\000b\000e\000t\000w\000e\000e\000n\000\040\000t\000w\000o\000\040\000s\000e\000t\000s\000\040\000o\000f\000\040\000N\000\040\000i\000m\000a\000g\000e\000s)
+(\376\377\000C\000o\000n\000v\000o\000l\000u\000t\000i\000o\000n\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n\000\040\000e\000x\000a\000m\000p\000l\000e\000:\000\040\0003\000x\0003\000\040\000m\000a\000t\000r\000i\000x\000-\000v\000e\000c\000t\000o\000r\000\040\000c\000o\000n\000v\000o\000l\000u\000t\000i\000o\000n\000\040\000i\000n\000\040\0001\000D)
endobj
213 0 obj
-<< /S /GoTo /D (subsection.5.8) >>
+<< /S /GoTo /D (subsection.5.7) >>
endobj
216 0 obj
-(\376\377\000S\000i\000m\000p\000l\000e\000\040\000F\000F\000T\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n\000\040\000b\000i\000n\000a\000r\000y\000\040\000r\000e\000u\000s\000e\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n)
+(\376\377\000C\000o\000n\000v\000o\000l\000u\000t\000i\000o\000n\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n\000\040\000e\000x\000a\000m\000p\000l\000e\000:\000\040\000R\0002\000C\000\040\000c\000r\000o\000s\000s\000-\000c\000o\000r\000r\000e\000l\000a\000t\000i\000o\000n\000\040\000b\000e\000t\000w\000e\000e\000n\000\040\000t\000w\000o\000\040\000s\000e\000t\000s\000\040\000o\000f\000\040\000N\000\040\000i\000m\000a\000g\000e\000s)
endobj
217 0 obj
-<< /S /GoTo /D [ 218 0 R /Fit ] >>
+<< /S /GoTo /D (subsection.5.8) >>
endobj
220 0 obj
-<< /Filter /FlateDecode /Length 395 >>
+(\376\377\000S\000i\000m\000p\000l\000e\000\040\000F\000F\000T\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n\000\040\000b\000i\000n\000a\000r\000y\000\040\000r\000e\000u\000s\000e\000\040\000a\000p\000p\000l\000i\000c\000a\000t\000i\000o\000n)
+
+endobj
+221 0 obj
+<< /S /GoTo /D [ 222 0 R /Fit ] >>
+endobj
+224 0 obj
+<< /Filter /FlateDecode /Length 420 >>
stream
-x�]��N�0��{�� �v;�N�(\���!V�x�;n����_��A�J�^/p���<��A�J����E a"�\�$�@����~����G��P�Y�a�8�f}[2)&i���ry��m�٬��~ӓY��m��rq�Y��
-v���^�1��7�gD`%�{�;\,%S�$U3A�EVD��x�2Z�cV�'�t`�C�,i��g�65lу�o��'P���{�D��)ԿÊ�0NU5W�/p]9FR|;w��v���b�}滞|>a��?���rX��"�l���ꡀ�[�Æv�(@]��+�uU� �m%q�},z PB΅��+&̂���LL���䚺C����_̫KX?�2"6��ƞ���;�̪�iQ;���
-������yF�\
+x�]S�n1��+�qcp��R��V�yoU����-�����"���@� �O0첝^V.� �P��}��"0�@LY��F �o���
+���'����0ԯ�c�G,�1)��!_�]z|5ko��t3ko�W���Ow� @N���>6?�\�c�1��7�����d��F�2Y���D��8�݆�RE���>q��8��a�P�����d����.�#�@����H4�� �b�49-�ң��i�UZ��5C �Z�qӘ�6�!g����]}W���fj��F�3#J�+�;���c���̉6h�2���O��
X$����{��l��N�iZ���_Se�h�$Nj�2 P@������|��1Ŕ���ӣ������zXlk"�K���یX����htwTvO��^#������oDz
endstream
endobj
-218 0 obj
-<< /Type /Page /Contents 220 0 R /Resources 219 0 R /MediaBox [ 0 0 612 792 ] /Parent 227 0 R >>
+222 0 obj
+<< /Type /Page /Contents 224 0 R /Resources 223 0 R /MediaBox [ 0 0 612 792 ] /Parent 231 0 R >>
endobj
-221 0 obj
-<< /D [ 218 0 R /XYZ 69.866 758.996 null ] >>
+225 0 obj
+<< /D [ 222 0 R /XYZ 69.866 758.996 null ] >>
endobj
-222 0 obj
-<< /D [ 218 0 R /XYZ 70.866 721.134 null ] >>
+226 0 obj
+<< /D [ 222 0 R /XYZ 70.866 721.134 null ] >>
endobj
-219 0 obj
-<< /Font << /F48 223 0 R /F50 224 0 R /F51 225 0 R /F52 226 0 R >> /ProcSet [ /PDF /Text ] >>
+223 0 obj
+<< /Font << /F48 227 0 R /F50 228 0 R /F51 229 0 R /F52 230 0 R >> /ProcSet [ /PDF /Text ] >>
endobj
-272 0 obj
-<< /Filter /FlateDecode /Length 2119 >>
-stream
-x��\�n�6��+�Ò�o``���]gWtۮ��ے��R�i��4-Ų$^�L���緗_���&�����^��Ɲ9hJ�R�` c������R�(�#g�rh?+8���{$�����?�A ��4���7˵�O�q�c�-'��N\�����OcI.���N�W'ɤ"��|�9I�~���
-'w�p-�z\���3�p����L��9�%������[������m�A F�.SpC��Ȍ
-�W9��K��܄���@��fMv`@�
-4�tS�������?X����NI@+�؈'��D�D����T����h�{�\~��]�y�w���&\�]�u���UX���6<S�u ��`!��O��ZL��md�MǏ������Y�n|[>,�tIz\�MI�����K�G ���J���%Vq��@mR��
�T`L9e��"9�\�l
-3*z �k8��lc?� ��� ~7!�$Z�YA�Q#,�#�@^ �QR��3H�4QK�3�6�튉�9ԕ%ν�~�P/(K'�F���P���(��b��-ނݮ�
��ɗRf%����9d8E+{.X錪?��G~.W�Pq��}��6�_B��,�|��@8fG���@�M�Do��)9#x�'��9Tĭ2^�e�&���[P���!�,�j4��X��e��aw��^����ᔏ��,����e����q~���%R_߸�>��o��<hT4�v̓�M�u���N�Ў�B�|C�D0�=�+��H^2��.��MV�c�R·W
-=I>��2���tQ%��f>p'n�5�E5��qa]��XO�>t�@����IސdW�E������d����bw����_�jMo�4v���<4��?��NW߯b��/��4�끎�\S�+�����z�jH��fڶy���."t5�!���Z��_¹Y���!�%��<]���������W1���Y� ]4�;�u1�3��ldd|6FD۾�Oi)���J�M��K���\�"�4>}�pl4�;�՛��
�J~.'�\R#�Ŕ
-=��
--i���MI��(���X,����t@�t����؏�Zp�։ �Hj�d�b�=]㆓�'d�G��p�|����?;Λ�\ܻ��_Ιv��놥�d4�yP��ʦ�t���SCу�)���n;�! ��ƽ-���0ۮ�
#Ft��� �$y<@x�b��g�6�xа{�Dz��<;�&�vi��L����?�M���V�n!�Vg��9[;�x�8Q�~"JmR ��~� H V�]�O���h��[
-��{���*��n��ză��Kz�6�[I����n+J�t�j#�f�W���y� �`�������{כSA�1x?���dZ�1n��D���A}��<��)��$����@",Q���C�o�It-�I��.��B�@F�����Su+Q=��V�
-ɸ;�79�Wf;�A����UG��"ɒ�h�����F#u)�c֟�i~���Iٰ�<�|��!�8��8�����]��2"}�w�h1��Exښ��K��aCbaÆ�zn��z}��C��'�;�囲��n�
V.
��˘�M�~�� IgC�O �̖ �������w��#��� �(�F��������x�|�kxR�uZ���c?
-��w�ۀG��YE���>Mxw���!җ�Չ�
-e,e�R�y���;��B�
-1��גQj�9�cܦ♻εa�nV_��H���=¿�q� >�fD��|��'O GjcA���m,+@M�-0� PB\�z��'�
-Toǣg��={f�@��M`ܙ|�Fp������\�]�I��WW^OO=7��T���0�+,���g�7P�^��Fj���O������ʄbD�n���&�=0�U.�gli�:d�eԇbM�6g�kH�58��є�4Tl��=6�%"[��� ����̇��/��?Cp�X´�+���1�E���]ɡ>�wԔz'���Z��3tu����_.[�F�=�`v��;��`�c���q϶4�� N"o\Ю�P{?�Y� طR��7�����N9�����M���˿��
+276 0 obj
+<< /Filter /FlateDecode /Length 2128 >>
+stream
+x��\Ɏ�6��W��!�;00`��r�[�kr��_�})���v�d�uh�-Kb��v��9���/4�o/�]%=0M�I!o�0��;sД��������ß_)���Q*.G��ge)��Vp���2H�+��0�2@4Sy �/�k��@�(��>~ '��N\���Pk�cI.���N�Wgɤ"��r�9K���Y�A���o��y=�~M��I8�y�S&Ü��y���K�-^]X\]c����#F�)�!M\dF�)���gnn¤�v~��g�$;0 `]��KJ���,�S�[&�$�Zl�|S�h"@`~x�'~pu4�5n.���.���^�����%Fq���wU�t��ϔb�E�:�X�O��遠���6��f���c�a����V�_�@ȗ���ޔUo�鬾�{`�1��\Zb����U*���|�
+�)���AZC$���MaF%/�
��u� w���&�$@�)^���B��� �Hy�$pGUI!�� ��Di,�΄ۨ�+�ue�s/����+���� �'�3�7�0�X�}��`��s�dY��v�d N�ʞ+V����fD�����)T�ܣ��F��cR�e§��cq4l�TO�^��3���CE�*�E�PV-"�f9����-.(���}1��K�#4�߲����U/�z�A P�S>#@c��&@�gP��ߙ�/��\��H{}���d��1�(ʠI�m[����#�����+�>ѳ l�-�1b��ᦨ�9|9��Ş�%Z�d`D��
� S4�a��5�͍�Dž�� l$��A����y��doL��"�U�y�H�]��d^1:z���������IgG?k)�CcJ/����|��*f9�81Lc�?�h�w��\�S�Ow��
�1����~��<|��0Ԯ��K�k1>�u[�w�3�N�dR5(��Ǐ������W)!����&d3�h6w���(f
+�Y�����H�=�Qi-�T��K�]��K����h����W
+W�Fs��ź6���̕����P�s�8��"�,�V豖XhM��u`mj��F��"����*�:�A�b}>��D�����B�YM�lY*�/�l�p��a����Vv;���i`�yW�K{������~�nZr�F#��|hlzI�.�65�<���)��˶C"