From 6635ba9cad53f599e4643aa90e7dfa9e800d144c Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Tue, 23 Dec 2025 18:10:26 +0330 Subject: [PATCH 1/5] Add 73_ImageUploadBenchmark example --- 73_ImageUploadBenchmark/CMakeLists.txt | 6 + 73_ImageUploadBenchmark/config.json.template | 28 ++ 73_ImageUploadBenchmark/main.cpp | 392 +++++++++++++++++++ 73_ImageUploadBenchmark/pipeline.groovy | 50 +++ CMakeLists.txt | 1 + 5 files changed, 477 insertions(+) create mode 100644 73_ImageUploadBenchmark/CMakeLists.txt create mode 100644 73_ImageUploadBenchmark/config.json.template create mode 100644 73_ImageUploadBenchmark/main.cpp create mode 100644 73_ImageUploadBenchmark/pipeline.groovy diff --git a/73_ImageUploadBenchmark/CMakeLists.txt b/73_ImageUploadBenchmark/CMakeLists.txt new file mode 100644 index 000000000..2f9218f93 --- /dev/null +++ b/73_ImageUploadBenchmark/CMakeLists.txt @@ -0,0 +1,6 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file diff --git a/73_ImageUploadBenchmark/config.json.template b/73_ImageUploadBenchmark/config.json.template new file mode 100644 index 000000000..12215d0bb --- /dev/null +++ b/73_ImageUploadBenchmark/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug and RWDI because foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} diff --git 
a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp new file mode 100644 index 000000000..a22647750 --- /dev/null +++ b/73_ImageUploadBenchmark/main.cpp @@ -0,0 +1,392 @@ +#include "nbl/examples/examples.hpp" +#include "nbl/this_example/builtin/build/spirv/keys.hpp" + +using namespace nbl; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; + +#include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/bit.hlsl" + +class CountingSortApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication +{ + using device_base_t = application_templates::MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; + + public: + // Yay thanks to multiple inheritance we cannot forward ctors anymore + CountingSortApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + // we stuff all our work here because its a "single shot" app + bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Remember to call the base class initialization! 
+ if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + auto limits = m_physicalDevice->getLimits(); + constexpr std::array AllowedMaxComputeSharedMemorySizes = { + 16384, 32768, 65536 + }; + + auto upperBoundSharedMemSize = std::upper_bound(AllowedMaxComputeSharedMemorySizes.begin(), AllowedMaxComputeSharedMemorySizes.end(), limits.maxComputeSharedMemorySize); + // devices which support less than 16KB of max compute shared memory size are not supported + if (upperBoundSharedMemSize == AllowedMaxComputeSharedMemorySizes.begin()) + { + m_logger->log("maxComputeSharedMemorySize is too low (%u)", ILogger::E_LOG_LEVEL::ELL_ERROR, limits.maxComputeSharedMemorySize); + exit(0); + } + + limits.maxComputeSharedMemorySize = *(upperBoundSharedMemSize - 1); + + const uint32_t WorkgroupSize = limits.maxComputeWorkGroupInvocations; + const uint32_t MaxBucketCount = (limits.maxComputeSharedMemorySize / sizeof(uint32_t)) / 2; + constexpr uint32_t element_count = 100000; + const uint32_t bucket_count = std::min((uint32_t)3000, MaxBucketCount); + const uint32_t elements_per_thread = ceil((float)ceil((float)element_count / limits.computeUnits) / WorkgroupSize); + + auto loadPrecompiledShader = [&]() -> smart_refctd_ptr + { + // this time we load a shader directly from a file + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; // virtual root + auto key = nbl::this_example::builtin::build::get_spirv_key(limits, m_physicalDevice->getFeatures()); + auto assetBundle = m_assetMgr->getAsset(key.data(), lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + logFail("Could not load shader!"); + return nullptr; + } + + auto shader = IAsset::castDown(assets[0]); + // The down-cast should not fail! + assert(shader); + + // There's two ways of doing stuff like this: + // 1. 
this - modifying the asset after load + // 2. creating a short shader source file that includes the asset you would have wanted to load + // + //auto overrideSource = CHLSLCompiler::createOverridenCopy( + // source.get(), "#define WorkgroupSize %d\n#define BucketCount %d\n", + // WorkgroupSize, bucket_count + //); + + // this time we skip the use of the asset converter since the IShader->IGPUShader path is quick and simple + return shader; + }; + auto prefixSumShader = loadPrecompiledShader.operator()<"prefix_sum_shader">(); // "app_resources/prefix_sum_shader.comp.hlsl" + auto scatterShader = loadPrecompiledShader.operator()<"scatter_shader">(); // "app_resources/scatter_shader.comp.hlsl" + + // People love Reflection but I prefer Shader Sources instead! + const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(CountingPushData) }; + + // This time we'll have no Descriptor Sets or Layouts because our workload has a widely varying size + // and using traditional SSBO bindings would force us to update the Descriptor Set every frame. + // I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic + // only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding. + // Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size. 
+ smart_refctd_ptr layout; + smart_refctd_ptr prefixSumPipeline; + smart_refctd_ptr scatterPipeline; + { + layout = m_device->createPipelineLayout({ &pcRange,1 }); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = layout.get(); + params.shader.shader = prefixSumShader.get(); + params.shader.entryPoint = "main"; + params.shader.entries = nullptr; + params.shader.requiredSubgroupSize = static_cast(5); + params.cached.requireFullSubgroups = true; + if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &prefixSumPipeline)) + return logFail("Failed to create compute pipeline!\n"); + params.shader.shader = scatterShader.get(); + if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &scatterPipeline)) + return logFail("Failed to create compute pipeline!\n"); + } + + // Allocate memory + nbl::video::IDeviceMemoryAllocator::SAllocation allocation[5] = {}; + smart_refctd_ptr buffers[5]; + //smart_refctd_ptr ds; + { + auto build_buffer = [this]( + smart_refctd_ptr m_device, + nbl::video::IDeviceMemoryAllocator::SAllocation *allocation, + smart_refctd_ptr& buffer, + size_t buffer_size, + const char *label + ) -> void { + IGPUBuffer::SCreationParams params; + params.size = buffer_size; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + buffer = m_device->createBuffer(std::move(params)); + if (!buffer) + logFail("Failed to create GPU buffer of size %d!\n", buffer_size); + + buffer->setObjectDebugName(label); + + auto reqs = buffer->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); + + *allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + if (!allocation->isValid()) + logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + + assert(allocation->memory.get() == buffer->getBoundMemory().memory); + }; + + build_buffer(m_device, allocation, buffers[0], sizeof(uint32_t) * element_count, "Input 
Key Buffer"); + build_buffer(m_device, allocation + 1, buffers[1], sizeof(uint32_t) * element_count, "Input Value Buffer"); + build_buffer(m_device, allocation + 2, buffers[2], sizeof(uint32_t) * bucket_count, "Scratch Buffer"); + build_buffer(m_device, allocation + 3, buffers[3], sizeof(uint32_t) * element_count, "Output Key Buffer"); + build_buffer(m_device, allocation + 4, buffers[4], sizeof(uint32_t) * element_count, "Output Value Buffer"); + } + uint64_t buffer_device_address[] = { + buffers[0]->getDeviceAddress(), + buffers[1]->getDeviceAddress(), + buffers[2]->getDeviceAddress(), + buffers[3]->getDeviceAddress(), + buffers[4]->getDeviceAddress() + }; + + void* mapped_memory[] = { + allocation[0].memory->map({0ull,allocation[0].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), + allocation[1].memory->map({0ull,allocation[1].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), + allocation[2].memory->map({0ull,allocation[2].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), + allocation[3].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), + allocation[4].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), + }; + if (!mapped_memory[0] || !mapped_memory[1] || !mapped_memory[2] || !mapped_memory[3] || !mapped_memory[4]) + return logFail("Failed to map the Device Memory!\n"); + + // Generate random data + constexpr uint32_t minimum = 0; + const uint32_t range = bucket_count; + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937 g(seed); + + auto bufferData = new uint32_t[2][element_count]; + for (uint32_t i = 0; i < element_count; i++) { + bufferData[0][i] = minimum + g() % range; + } + + memcpy(mapped_memory[0], bufferData[0], sizeof(uint32_t) * element_count); + + for (uint32_t i = 0; i < element_count; i++) { + bufferData[1][i] = g() % std::numeric_limits::max(); + } + + 
memcpy(mapped_memory[1], bufferData[1], sizeof(uint32_t) * element_count); + + std::string outBuffer; + for (auto i = 0; i < element_count; i++) { + outBuffer.append("{"); + outBuffer.append(std::to_string(bufferData[0][i])); + outBuffer.append(", "); + outBuffer.append(std::to_string(bufferData[1][i])); + outBuffer.append("} "); + } + outBuffer.append("\n"); + outBuffer.append("Count: "); + outBuffer.append(std::to_string(element_count)); + outBuffer.append("\n"); + m_logger->log("Your input array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); + + auto pc = CountingPushData { + .inputKeyAddress = buffer_device_address[0], + .inputValueAddress = buffer_device_address[1], + .histogramAddress = buffer_device_address[2], + .outputKeyAddress = buffer_device_address[3], + .outputValueAddress = buffer_device_address[4], + .dataElementCount = element_count, + .elementsPerWT = elements_per_thread, + .minimum = minimum, + .maximum = minimum + bucket_count - 1, + }; + + smart_refctd_ptr cmdBuf; + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdBuf)) + return logFail("Failed to create Command Buffers!\n"); + } + + // Create the Semaphore for prefix sum + constexpr uint64_t started_value = 0; + uint64_t timeline = started_value; + smart_refctd_ptr progress = m_device->createSemaphore(started_value); + + cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdBuf->beginDebugMarker("Prefix Sum Dispatch", core::vectorSIMDf(0, 1, 0, 1)); + cmdBuf->bindComputePipeline(prefixSumPipeline.get()); + cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1); + cmdBuf->endDebugMarker(); + cmdBuf->end(); + + { + auto queue = getComputeQueue(); + + 
IQueue::SSubmitInfo submit_infos[1]; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = { + { + .cmdbuf = cmdBuf.get() + } + }; + submit_infos[0].commandBuffers = cmdBufs; + IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { + { + .semaphore = progress.get(), + .value = ++timeline, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + } + }; + submit_infos[0].signalSemaphores = signals; + + m_api->startCapture(); + queue->submit(submit_infos); + m_api->endCapture(); + } + + const ISemaphore::SWaitInfo wait_infos[] = { { + .semaphore = progress.get(), + .value = timeline + } }; + m_device->blockForSemaphores(wait_infos); + + // Create the Semaphore for Scatter + uint64_t timeline2 = started_value; + smart_refctd_ptr progress2 = m_device->createSemaphore(started_value); + + cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdBuf->beginDebugMarker("Scatter Dispatch", core::vectorSIMDf(0, 1, 0, 1)); + cmdBuf->bindComputePipeline(scatterPipeline.get()); + cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1); + cmdBuf->endDebugMarker(); + cmdBuf->end(); + + { + auto queue = getComputeQueue(); + + IQueue::SSubmitInfo submit_infos[1]; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = { + { + .cmdbuf = cmdBuf.get() + } + }; + submit_infos[0].commandBuffers = cmdBufs; + IQueue::SSubmitInfo::SSemaphoreInfo waits[] = { + { + .semaphore = progress.get(), + .value = timeline, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + } + }; + submit_infos[0].waitSemaphores = waits; + IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { + { + .semaphore = progress2.get(), + .value = ++timeline2, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + } + }; + submit_infos[0].signalSemaphores = signals; + + m_api->startCapture(); + queue->submit(submit_infos); + m_api->endCapture(); + } + + const 
ISemaphore::SWaitInfo wait_infos2[] = {{ + .semaphore = progress2.get(), + .value = timeline2 + } }; + m_device->blockForSemaphores(wait_infos2); + + const ILogicalDevice::MappedMemoryRange memory_range[] = { + ILogicalDevice::MappedMemoryRange(allocation[0].memory.get(), 0ull, allocation[0].memory->getAllocationSize()), + ILogicalDevice::MappedMemoryRange(allocation[1].memory.get(), 0ull, allocation[1].memory->getAllocationSize()), + ILogicalDevice::MappedMemoryRange(allocation[2].memory.get(), 0ull, allocation[2].memory->getAllocationSize()), + ILogicalDevice::MappedMemoryRange(allocation[3].memory.get(), 0ull, allocation[3].memory->getAllocationSize()), + ILogicalDevice::MappedMemoryRange(allocation[4].memory.get(), 0ull, allocation[4].memory->getAllocationSize()) + }; + + if (!allocation[0].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memory_range[0]); + if (!allocation[1].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memory_range[1]); + if (!allocation[2].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memory_range[2]); + if (!allocation[3].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memory_range[3]); + if (!allocation[4].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memory_range[4]); + + const uint32_t* buffData[] = { + reinterpret_cast(allocation[2].memory->getMappedPointer()), + reinterpret_cast(allocation[3].memory->getMappedPointer()), + reinterpret_cast(allocation[4].memory->getMappedPointer()) + }; + + assert(allocation[2].offset == 0); // simpler than writing out all the pointer arithmetic + 
assert(allocation[3].offset == 0); // simpler than writing out all the pointer arithmetic + assert(allocation[4].offset == 0); // simpler than writing out all the pointer arithmetic + + outBuffer.clear(); + for (auto i = 0; i < bucket_count; i++) { + outBuffer.append(std::to_string(buffData[0][i])); + outBuffer.append(" "); + } + outBuffer.append("\n"); + + m_logger->log("Scratch buffer is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); + + outBuffer.clear(); + for (auto i = 0; i < element_count; i++) { + outBuffer.append("{"); + outBuffer.append(std::to_string(buffData[1][i])); + outBuffer.append(", "); + outBuffer.append(std::to_string(buffData[2][i])); + outBuffer.append("} "); + } + outBuffer.append("\n"); + outBuffer.append("Count: "); + outBuffer.append(std::to_string(element_count)); + outBuffer.append("\n"); + m_logger->log("Your output array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); + + allocation[0].memory->unmap(); + allocation[1].memory->unmap(); + allocation[2].memory->unmap(); + allocation[3].memory->unmap(); + allocation[4].memory->unmap(); + + m_device->waitIdle(); + + delete[] bufferData; + + return true; + } + + // Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script) + bool keepRunning() override { return false; } + + // Finally the first actual work-loop + void workLoopBody() override {} + + bool onAppTerminated() override { return true; } +}; + + +NBL_MAIN_FUNC(CountingSortApp) \ No newline at end of file diff --git a/73_ImageUploadBenchmark/pipeline.groovy b/73_ImageUploadBenchmark/pipeline.groovy new file mode 100644 index 000000000..1249f10b5 --- /dev/null +++ b/73_ImageUploadBenchmark/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CImageUploadBenchmark extends IBuilder +{ + public CImageUploadBenchmark(Agent 
_agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CImageUploadBenchmark(_agent, _info) +} + +return this diff --git a/CMakeLists.txt b/CMakeLists.txt index cbe482aa4..2d4ed7408 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,6 +89,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids) add_subdirectory(71_RayTracingPipeline) add_subdirectory(72_CooperativeBinarySearch) + add_subdirectory(73_ImageUploadBenchmark) # add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory NBL_GET_ALL_TARGETS(TARGETS) From 951e2fdd218abc307f9890d69f7a9be38d28f95a Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Wed, 24 Dec 2025 18:38:09 +0330 Subject: [PATCH 2/5] Simple benchmark HOST_VISIBLE vs HOST_VISIBLE & DEVICE_LOCAL --- 73_ImageUploadBenchmark/main.cpp | 694 ++++++++++++++++--------------- 1 file changed, 357 insertions(+), 337 deletions(-) diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp index a22647750..68815681d 100644 --- a/73_ImageUploadBenchmark/main.cpp +++ b/73_ImageUploadBenchmark/main.cpp @@ -1,392 +1,412 @@ #include "nbl/examples/examples.hpp" -#include 
"nbl/this_example/builtin/build/spirv/keys.hpp" +#include using namespace nbl; using namespace nbl::core; -using namespace nbl::hlsl; using namespace nbl::system; using namespace nbl::asset; -using namespace nbl::ui; using namespace nbl::video; using namespace nbl::examples; -#include "app_resources/common.hlsl" -#include "nbl/builtin/hlsl/bit.hlsl" - -class CountingSortApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication +class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication { - using device_base_t = application_templates::MonoDeviceApplication; - using asset_base_t = BuiltinResourcesApplication; + using device_base_t = application_templates::MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; + +public: + ImageUploadBenchmarkApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + constexpr uint32_t TILE_SIZE = 128; + constexpr uint32_t TILE_BYTES_PER_PIXEL = 4; + constexpr uint32_t TILE_SIZE_BYTES = TILE_SIZE * TILE_SIZE * TILE_BYTES_PER_PIXEL; + constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024; + constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / TILE_SIZE_BYTES; + constexpr uint32_t FRAMES_IN_FLIGHT = 4; + constexpr uint32_t TOTAL_FRAMES = 1000; + + m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_INFO); + m_logger->log("Tile size: %ux%u (%u KB)", ILogger::ELL_INFO, TILE_SIZE, TILE_SIZE, TILE_SIZE_BYTES / 1024); + m_logger->log("Staging buffer: %u MB", ILogger::ELL_INFO, STAGING_BUFFER_SIZE / (1024 * 
1024)); + m_logger->log("Tiles per frame: %u", ILogger::ELL_INFO, TILES_PER_FRAME); + m_logger->log("Frames in flight: %u", ILogger::ELL_INFO, FRAMES_IN_FLIGHT); + + uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits(); + uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(); + uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits; + uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits; + + if (!hostVisibleOnlyBits) + { + m_logger->log("HOST_VISIBLE memory types not found!", ILogger::ELL_ERROR); + return false; + } - public: - // Yay thanks to multiple inheritance we cannot forward ctors anymore - CountingSortApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : - system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + if (!deviceLocalBits) + { + m_logger->log("DEVICE_LOCAL memory types not found!", ILogger::ELL_ERROR); + return false; + } - // we stuff all our work here because its a "single shot" app - bool onAppInitialized(smart_refctd_ptr&& system) override + IQueue* queue = getQueue(IQueue::FAMILY_FLAGS::GRAPHICS_BIT); + smart_refctd_ptr destinationImage; { - // Remember to call the base class initialization! 
- if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; - if (!asset_base_t::onAppInitialized(std::move(system))) - return false; + IGPUImage::SCreationParams imgParams{}; + imgParams.type = IImage::E_TYPE::ET_2D; + imgParams.extent.width = TILE_SIZE * 32; + imgParams.extent.height = TILE_SIZE * 32; + imgParams.extent.depth = 1u; + imgParams.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM; + imgParams.mipLevels = 1u; + imgParams.flags = IImage::ECF_NONE; + imgParams.arrayLayers = 1u; + imgParams.samples = IImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT; + imgParams.tiling = video::IGPUImage::TILING::OPTIMAL; + imgParams.usage = asset::IImage::EUF_TRANSFER_DST_BIT; + imgParams.preinitialized = false; + + destinationImage = m_device->createImage(std::move(imgParams)); + if (!destinationImage) + return logFail("Failed to create destination image!\n"); + + destinationImage->setObjectDebugName("Destination Image"); + + auto reqs = destinationImage->getMemoryReqs(); + reqs.memoryTypeBits &= deviceLocalBits; + + auto allocation = m_device->allocate(reqs, destinationImage.get(), IDeviceMemoryAllocation::EMAF_NONE); + if (!allocation.isValid()) + return logFail("Failed to allocate DEVICE_LOCAL memory for destination image!\n"); + } - auto limits = m_physicalDevice->getLimits(); - constexpr std::array AllowedMaxComputeSharedMemorySizes = { - 16384, 32768, 65536 - }; + m_logger->log("\nTesting Strategy 1: System RAM", ILogger::ELL_INFO); + + double throughputSystemRAM = 0.0; + { + smart_refctd_ptr stagingBuffer; + IDeviceMemoryAllocator::SAllocation stagingAlloc; + void* mappedPtr = nullptr; - auto upperBoundSharedMemSize = std::upper_bound(AllowedMaxComputeSharedMemorySizes.begin(), AllowedMaxComputeSharedMemorySizes.end(), limits.maxComputeSharedMemorySize); - // devices which support less than 16KB of max compute shared memory size are not supported - if (upperBoundSharedMemSize == AllowedMaxComputeSharedMemorySizes.begin()) + if 
(!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleOnlyBits, + "Staging Buffer - System RAM", stagingBuffer, stagingAlloc, mappedPtr)) { - m_logger->log("maxComputeSharedMemorySize is too low (%u)", ILogger::E_LOG_LEVEL::ELL_ERROR, limits.maxComputeSharedMemorySize); - exit(0); + return false; } - limits.maxComputeSharedMemorySize = *(upperBoundSharedMemSize - 1); + throughputSystemRAM = runBenchmark( + "System RAM", + stagingBuffer.get(), + mappedPtr, + destinationImage.get(), + TILE_SIZE, + TILE_SIZE_BYTES, + TILES_PER_FRAME, + FRAMES_IN_FLIGHT, + TOTAL_FRAMES, + queue + ); + + stagingAlloc.memory->unmap(); + } - const uint32_t WorkgroupSize = limits.maxComputeWorkGroupInvocations; - const uint32_t MaxBucketCount = (limits.maxComputeSharedMemorySize / sizeof(uint32_t)) / 2; - constexpr uint32_t element_count = 100000; - const uint32_t bucket_count = std::min((uint32_t)3000, MaxBucketCount); - const uint32_t elements_per_thread = ceil((float)ceil((float)element_count / limits.computeUnits) / WorkgroupSize); + m_logger->log("System RAM throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputSystemRAM); - auto loadPrecompiledShader = [&]() -> smart_refctd_ptr - { - // this time we load a shader directly from a file - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; // virtual root - auto key = nbl::this_example::builtin::build::get_spirv_key(limits, m_physicalDevice->getFeatures()); - auto assetBundle = m_assetMgr->getAsset(key.data(), lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - { - logFail("Could not load shader!"); - return nullptr; - } + m_device->waitIdle(); - auto shader = IAsset::castDown(assets[0]); - // The down-cast should not fail! - assert(shader); - - // There's two ways of doing stuff like this: - // 1. this - modifying the asset after load - // 2. 
creating a short shader source file that includes the asset you would have wanted to load - // - //auto overrideSource = CHLSLCompiler::createOverridenCopy( - // source.get(), "#define WorkgroupSize %d\n#define BucketCount %d\n", - // WorkgroupSize, bucket_count - //); - - // this time we skip the use of the asset converter since the IShader->IGPUShader path is quick and simple - return shader; - }; - auto prefixSumShader = loadPrecompiledShader.operator()<"prefix_sum_shader">(); // "app_resources/prefix_sum_shader.comp.hlsl" - auto scatterShader = loadPrecompiledShader.operator()<"scatter_shader">(); // "app_resources/scatter_shader.comp.hlsl" - - // People love Reflection but I prefer Shader Sources instead! - const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(CountingPushData) }; - - // This time we'll have no Descriptor Sets or Layouts because our workload has a widely varying size - // and using traditional SSBO bindings would force us to update the Descriptor Set every frame. - // I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic - // only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding. - // Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size. 
- smart_refctd_ptr layout; - smart_refctd_ptr prefixSumPipeline; - smart_refctd_ptr scatterPipeline; - { - layout = m_device->createPipelineLayout({ &pcRange,1 }); - IGPUComputePipeline::SCreationParams params = {}; - params.layout = layout.get(); - params.shader.shader = prefixSumShader.get(); - params.shader.entryPoint = "main"; - params.shader.entries = nullptr; - params.shader.requiredSubgroupSize = static_cast(5); - params.cached.requireFullSubgroups = true; - if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &prefixSumPipeline)) - return logFail("Failed to create compute pipeline!\n"); - params.shader.shader = scatterShader.get(); - if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &scatterPipeline)) - return logFail("Failed to create compute pipeline!\n"); - } + if (hostVisibleDeviceLocalBits) + { + m_logger->log("\nTesting Strategy 2: VRAM (ReBAR)", ILogger::ELL_INFO); - // Allocate memory - nbl::video::IDeviceMemoryAllocator::SAllocation allocation[5] = {}; - smart_refctd_ptr buffers[5]; - //smart_refctd_ptr ds; + double throughputVRAM = 0.0; { - auto build_buffer = [this]( - smart_refctd_ptr m_device, - nbl::video::IDeviceMemoryAllocator::SAllocation *allocation, - smart_refctd_ptr& buffer, - size_t buffer_size, - const char *label - ) -> void { - IGPUBuffer::SCreationParams params; - params.size = buffer_size; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - buffer = m_device->createBuffer(std::move(params)); - if (!buffer) - logFail("Failed to create GPU buffer of size %d!\n", buffer_size); - - buffer->setObjectDebugName(label); - - auto reqs = buffer->getMemoryReqs(); - reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); - - *allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - if (!allocation->isValid()) - logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); - - 
assert(allocation->memory.get() == buffer->getBoundMemory().memory); - }; + smart_refctd_ptr stagingBuffer; + IDeviceMemoryAllocator::SAllocation stagingAlloc; + void* mappedPtr = nullptr; - build_buffer(m_device, allocation, buffers[0], sizeof(uint32_t) * element_count, "Input Key Buffer"); - build_buffer(m_device, allocation + 1, buffers[1], sizeof(uint32_t) * element_count, "Input Value Buffer"); - build_buffer(m_device, allocation + 2, buffers[2], sizeof(uint32_t) * bucket_count, "Scratch Buffer"); - build_buffer(m_device, allocation + 3, buffers[3], sizeof(uint32_t) * element_count, "Output Key Buffer"); - build_buffer(m_device, allocation + 4, buffers[4], sizeof(uint32_t) * element_count, "Output Value Buffer"); - } - uint64_t buffer_device_address[] = { - buffers[0]->getDeviceAddress(), - buffers[1]->getDeviceAddress(), - buffers[2]->getDeviceAddress(), - buffers[3]->getDeviceAddress(), - buffers[4]->getDeviceAddress() - }; + if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleDeviceLocalBits, + "Staging Buffer - VRAM (ReBAR)", stagingBuffer, stagingAlloc, mappedPtr)) + { + return false; + } - void* mapped_memory[] = { - allocation[0].memory->map({0ull,allocation[0].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), - allocation[1].memory->map({0ull,allocation[1].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), - allocation[2].memory->map({0ull,allocation[2].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), - allocation[3].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), - allocation[4].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), - }; - if (!mapped_memory[0] || !mapped_memory[1] || !mapped_memory[2] || !mapped_memory[3] || !mapped_memory[4]) - return logFail("Failed to map the Device Memory!\n"); - - // Generate random data - constexpr uint32_t minimum = 0; - const uint32_t range = bucket_count; 
- unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); - std::mt19937 g(seed); - - auto bufferData = new uint32_t[2][element_count]; - for (uint32_t i = 0; i < element_count; i++) { - bufferData[0][i] = minimum + g() % range; + throughputVRAM = runBenchmark( + "VRAM (ReBAR)", + stagingBuffer.get(), + mappedPtr, + destinationImage.get(), + TILE_SIZE, + TILE_SIZE_BYTES, + TILES_PER_FRAME, + FRAMES_IN_FLIGHT, + TOTAL_FRAMES, + queue + ); + + stagingAlloc.memory->unmap(); } - memcpy(mapped_memory[0], bufferData[0], sizeof(uint32_t) * element_count); - - for (uint32_t i = 0; i < element_count; i++) { - bufferData[1][i] = g() % std::numeric_limits::max(); - } + m_logger->log("VRAM throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputVRAM); - memcpy(mapped_memory[1], bufferData[1], sizeof(uint32_t) * element_count); + double speedup = throughputVRAM / throughputSystemRAM; + m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup); + } - std::string outBuffer; - for (auto i = 0; i < element_count; i++) { - outBuffer.append("{"); - outBuffer.append(std::to_string(bufferData[0][i])); - outBuffer.append(", "); - outBuffer.append(std::to_string(bufferData[1][i])); - outBuffer.append("} "); - } - outBuffer.append("\n"); - outBuffer.append("Count: "); - outBuffer.append(std::to_string(element_count)); - outBuffer.append("\n"); - m_logger->log("Your input array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); - - auto pc = CountingPushData { - .inputKeyAddress = buffer_device_address[0], - .inputValueAddress = buffer_device_address[1], - .histogramAddress = buffer_device_address[2], - .outputKeyAddress = buffer_device_address[3], - .outputValueAddress = buffer_device_address[4], - .dataElementCount = element_count, - .elementsPerWT = elements_per_thread, - .minimum = minimum, - .maximum = minimum + bucket_count - 1, - }; + return true; + } + + bool keepRunning() override { return false; } + void workLoopBody() 
override {} + bool onAppTerminated() override { return true; } + +protected: + core::vector getQueueRequirements() const override + { + using flags_t = IQueue::FAMILY_FLAGS; + return { { + .requiredFlags = flags_t::GRAPHICS_BIT, + .disallowedFlags = flags_t::NONE, + .queueCount = 1, + .maxImageTransferGranularity = {1, 1, 1} + } }; + } + +private: + void transitionImageLayout( + IGPUCommandBuffer* cmdBuf, + IGPUImage* image, + IImage::LAYOUT oldLayout, + IImage::LAYOUT newLayout) + { + IGPUCommandBuffer::SImageMemoryBarrier barrier = {}; + barrier.oldLayout = oldLayout; + barrier.newLayout = newLayout; + barrier.image = image; + barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; + barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; + barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + cmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} }); + } + + void generateTileCopyRegions( + IImage::SBufferCopy* outRegions, + uint32_t tilesPerFrame, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t imageWidth) + { + uint32_t tilesPerRow = imageWidth / tileSize; + for (size_t i = 0; i < tilesPerFrame; i++) + { + uint32_t tileX = (i % tilesPerRow) * tileSize; + uint32_t tileY = (i / tilesPerRow) * tileSize; + + outRegions[i].bufferOffset = i * tileSizeBytes; + outRegions[i].bufferRowLength = tileSize; + outRegions[i].bufferImageHeight = tileSize; + outRegions[i].imageOffset = { tileX, tileY, 0 }; + outRegions[i].imageExtent = { tileSize, tileSize, 1 }; + outRegions[i].imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + 
outRegions[i].imageSubresource.mipLevel = 0; + outRegions[i].imageSubresource.baseArrayLayer = 0; + outRegions[i].imageSubresource.layerCount = 1; + } + } - smart_refctd_ptr cmdBuf; - { - smart_refctd_ptr cmdpool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdBuf)) - return logFail("Failed to create Command Buffers!\n"); - } + void generateRandomTileData(void* mappedPtr, uint32_t sizeBytes) + { + uint32_t* data = (uint32_t*)mappedPtr; + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937 g(seed); + const uint32_t valueCount = sizeBytes / sizeof(uint32_t); - // Create the Semaphore for prefix sum - constexpr uint64_t started_value = 0; - uint64_t timeline = started_value; - smart_refctd_ptr progress = m_device->createSemaphore(started_value); + auto bufferData = new uint32_t[valueCount]; - cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - cmdBuf->beginDebugMarker("Prefix Sum Dispatch", core::vectorSIMDf(0, 1, 0, 1)); - cmdBuf->bindComputePipeline(prefixSumPipeline.get()); - cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); - cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1); - cmdBuf->endDebugMarker(); - cmdBuf->end(); + for (uint32_t i = 0; i < valueCount; i++) + { + bufferData[i] = g(); + } + memcpy(mappedPtr, bufferData, sizeBytes); + delete[] bufferData; + } + + double runBenchmark( + const char* strategyName, + IGPUBuffer* stagingBuffer, + void* mappedPtr, + IGPUImage* destinationImage, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t tilesPerFrame, + uint32_t framesInFlight, + uint32_t totalFrames, + IQueue* queue) + { + smart_refctd_ptr timelineSemaphore = m_device->createSemaphore(0); + + auto commandPools = new smart_refctd_ptr[framesInFlight]; + for 
(uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i] = m_device->createCommandPool( + queue->getFamilyIndex(), + IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT + ); + } - { - auto queue = getComputeQueue(); + auto commandBuffers = new smart_refctd_ptr[framesInFlight]; + for (uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i]->createCommandBuffers( + IGPUCommandPool::BUFFER_LEVEL::PRIMARY, + 1, + &commandBuffers[i] + ); + } - IQueue::SSubmitInfo submit_infos[1]; - IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = { - { - .cmdbuf = cmdBuf.get() - } - }; - submit_infos[0].commandBuffers = cmdBufs; - IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { - { - .semaphore = progress.get(), - .value = ++timeline, - .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT - } - }; - submit_infos[0].signalSemaphores = signals; + uint64_t timelineValue = 0; - m_api->startCapture(); - queue->submit(submit_infos); - m_api->endCapture(); - } + commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + transitionImageLayout( + commandBuffers[0].get(), + destinationImage, + IImage::LAYOUT::UNDEFINED, + IImage::LAYOUT::TRANSFER_DST_OPTIMAL + ); + commandBuffers[0]->end(); - const ISemaphore::SWaitInfo wait_infos[] = { { - .semaphore = progress.get(), - .value = timeline - } }; - m_device->blockForSemaphores(wait_infos); + IQueue::SSubmitInfo submitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = commandBuffers[0].get() }; + submitInfo.commandBuffers = { &cmdBufInfo, 1 }; - // Create the Semaphore for Scatter - uint64_t timeline2 = started_value; - smart_refctd_ptr progress2 = m_device->createSemaphore(started_value); + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + submitInfo.signalSemaphores = { &signalInfo, 1 }; - 
cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - cmdBuf->beginDebugMarker("Scatter Dispatch", core::vectorSIMDf(0, 1, 0, 1)); - cmdBuf->bindComputePipeline(scatterPipeline.get()); - cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); - cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1); - cmdBuf->endDebugMarker(); - cmdBuf->end(); + queue->submit({ &submitInfo, 1 }); - { - auto queue = getComputeQueue(); + ISemaphore::SWaitInfo waitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({ &waitInfo, 1 }); - IQueue::SSubmitInfo submit_infos[1]; - IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = { - { - .cmdbuf = cmdBuf.get() - } - }; - submit_infos[0].commandBuffers = cmdBufs; - IQueue::SSubmitInfo::SSemaphoreInfo waits[] = { - { - .semaphore = progress.get(), - .value = timeline, - .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT - } - }; - submit_infos[0].waitSemaphores = waits; - IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { - { - .semaphore = progress2.get(), - .value = ++timeline2, - .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT - } - }; - submit_infos[0].signalSemaphores = signals; + auto regions = new IImage::SBufferCopy[tilesPerFrame]; - m_api->startCapture(); - queue->submit(submit_infos); - m_api->endCapture(); - } + generateRandomTileData(mappedPtr, tilesPerFrame * tileSizeBytes); - const ISemaphore::SWaitInfo wait_infos2[] = {{ - .semaphore = progress2.get(), - .value = timeline2 - } }; - m_device->blockForSemaphores(wait_infos2); - - const ILogicalDevice::MappedMemoryRange memory_range[] = { - ILogicalDevice::MappedMemoryRange(allocation[0].memory.get(), 0ull, allocation[0].memory->getAllocationSize()), - ILogicalDevice::MappedMemoryRange(allocation[1].memory.get(), 0ull, allocation[1].memory->getAllocationSize()), - 
ILogicalDevice::MappedMemoryRange(allocation[2].memory.get(), 0ull, allocation[2].memory->getAllocationSize()), - ILogicalDevice::MappedMemoryRange(allocation[3].memory.get(), 0ull, allocation[3].memory->getAllocationSize()), - ILogicalDevice::MappedMemoryRange(allocation[4].memory.get(), 0ull, allocation[4].memory->getAllocationSize()) - }; + uint32_t imageWidth = destinationImage->getCreationParameters().extent.width; + generateTileCopyRegions(regions, tilesPerFrame, tileSize, tileSizeBytes, imageWidth); - if (!allocation[0].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memory_range[0]); - if (!allocation[1].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memory_range[1]); - if (!allocation[2].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memory_range[2]); - if (!allocation[3].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memory_range[3]); - if (!allocation[4].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memory_range[4]); - - const uint32_t* buffData[] = { - reinterpret_cast(allocation[2].memory->getMappedPointer()), - reinterpret_cast(allocation[3].memory->getMappedPointer()), - reinterpret_cast(allocation[4].memory->getMappedPointer()) - }; + auto startTime = std::chrono::high_resolution_clock::now(); - assert(allocation[2].offset == 0); // simpler than writing out all the pointer arithmetic - assert(allocation[3].offset == 0); // simpler than writing out all the pointer arithmetic - assert(allocation[4].offset == 0); // simpler than writing out all the pointer arithmetic + for (uint32_t frame = 0; frame < 
totalFrames; frame++) + { + uint32_t cmdBufIndex = frame % framesInFlight; - outBuffer.clear(); - for (auto i = 0; i < bucket_count; i++) { - outBuffer.append(std::to_string(buffData[0][i])); - outBuffer.append(" "); - } - outBuffer.append("\n"); + commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_logger->log("Scratch buffer is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); + commandBuffers[cmdBufIndex]->copyBufferToImage( + stagingBuffer, + destinationImage, + IImage::LAYOUT::TRANSFER_DST_OPTIMAL, + tilesPerFrame, + regions + ); - outBuffer.clear(); - for (auto i = 0; i < element_count; i++) { - outBuffer.append("{"); - outBuffer.append(std::to_string(buffData[1][i])); - outBuffer.append(", "); - outBuffer.append(std::to_string(buffData[2][i])); - outBuffer.append("} "); - } - outBuffer.append("\n"); - outBuffer.append("Count: "); - outBuffer.append(std::to_string(element_count)); - outBuffer.append("\n"); - m_logger->log("Your output array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); + commandBuffers[cmdBufIndex]->end(); - allocation[0].memory->unmap(); - allocation[1].memory->unmap(); - allocation[2].memory->unmap(); - allocation[3].memory->unmap(); - allocation[4].memory->unmap(); + // Create submit info for THIS frame + IQueue::SSubmitInfo frameSubmitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = {.cmdbuf = commandBuffers[cmdBufIndex].get()}; + frameSubmitInfo.commandBuffers = {&frameCmdBufInfo, 1}; - m_device->waitIdle(); + IQueue::SSubmitInfo::SSemaphoreInfo frameSignalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + frameSubmitInfo.signalSemaphores = {&frameSignalInfo, 1}; - delete[] bufferData; + // Submit to GPU + queue->submit({&frameSubmitInfo, 1}); - return true; + // Wait for old frames + if (frame >= framesInFlight) + { + ISemaphore::SWaitInfo frameWaitInfo = { + .semaphore = timelineSemaphore.get(), 
+ .value = timelineValue - framesInFlight + }; + m_device->blockForSemaphores({&frameWaitInfo, 1}); + } } - // Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script) - bool keepRunning() override { return false; } - - // Finally the first actual work-loop - void workLoopBody() override {} - - bool onAppTerminated() override { return true; } + // Wait for all remaining frames to complete + ISemaphore::SWaitInfo finalWait = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({&finalWait, 1}); + + auto endTime = std::chrono::high_resolution_clock::now(); + + delete[] regions; + delete[] commandPools; + delete[] commandBuffers; + + // Calculate throughput + double elapsedSeconds = std::chrono::duration(endTime - startTime).count(); + uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes; + double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds; + + return throughputGBps; + } + + bool createStagingBuffer( + uint32_t bufferSize, + uint32_t memoryTypeBits, + const char* debugName, + smart_refctd_ptr& outBuffer, + IDeviceMemoryAllocator::SAllocation& outAllocation, + void*& outMappedPtr) + { + IGPUBuffer::SCreationParams params; + params.size = bufferSize; + params.usage = IGPUBuffer::EUF_TRANSFER_SRC_BIT; + outBuffer = m_device->createBuffer(std::move(params)); + if (!outBuffer) + return logFail("Failed to create GPU buffer of size %d!\n", bufferSize); + + outBuffer->setObjectDebugName(debugName); + + auto reqs = outBuffer->getMemoryReqs(); + reqs.memoryTypeBits &= memoryTypeBits; + + outAllocation = m_device->allocate(reqs, outBuffer.get(), IDeviceMemoryAllocation::EMAF_NONE); + if (!outAllocation.isValid()) + return logFail("Failed to allocate Device Memory!\n"); + + outMappedPtr = outAllocation.memory->map({0ull, outAllocation.memory->getAllocationSize()}, 
IDeviceMemoryAllocation::EMCAF_READ); + if (!outMappedPtr) + return logFail("Failed to map Device Memory!\n"); + + return true; + } }; - -NBL_MAIN_FUNC(CountingSortApp) \ No newline at end of file +NBL_MAIN_FUNC(ImageUploadBenchmarkApp) From 141295bee833de2fb97bc1ef1e7e8bc8980a643c Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Wed, 24 Dec 2025 21:09:51 +0330 Subject: [PATCH 3/5] Measurment was wierd, added some detail and also fix a bug related to FIF --- 73_ImageUploadBenchmark/main.cpp | 123 +++++++++++++++++++------------ 1 file changed, 77 insertions(+), 46 deletions(-) diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp index 68815681d..eceb0f9ea 100644 --- a/73_ImageUploadBenchmark/main.cpp +++ b/73_ImageUploadBenchmark/main.cpp @@ -28,8 +28,8 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp constexpr uint32_t TILE_BYTES_PER_PIXEL = 4; constexpr uint32_t TILE_SIZE_BYTES = TILE_SIZE * TILE_SIZE * TILE_BYTES_PER_PIXEL; constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024; - constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / TILE_SIZE_BYTES; constexpr uint32_t FRAMES_IN_FLIGHT = 4; + constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT); constexpr uint32_t TOTAL_FRAMES = 1000; m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_INFO); @@ -40,12 +40,20 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits(); uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(); - uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits; + uint32_t hostCachedBits = m_physicalDevice->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_CACHED_BIT); + + uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits & ~hostCachedBits; + uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & 
deviceLocalBits; + m_logger->log("Memory type bits - HostVisible: 0x%X, DeviceLocal: 0x%X, HostCached: 0x%X", + ILogger::ELL_INFO, hostVisibleBits, deviceLocalBits, hostCachedBits); + m_logger->log("System RAM (non-cached): 0x%X, VRAM: 0x%X", + ILogger::ELL_INFO, hostVisibleOnlyBits, hostVisibleDeviceLocalBits); + if (!hostVisibleOnlyBits) { - m_logger->log("HOST_VISIBLE memory types not found!", ILogger::ELL_ERROR); + m_logger->log("HOST_VISIBLE non-cached memory types not found!", ILogger::ELL_ERROR); return false; } @@ -122,7 +130,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp if (hostVisibleDeviceLocalBits) { - m_logger->log("\nTesting Strategy 2: VRAM (ReBAR)", ILogger::ELL_INFO); + m_logger->log("\nTesting Strategy 2: VRAM", ILogger::ELL_INFO); double throughputVRAM = 0.0; { @@ -131,13 +139,13 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp void* mappedPtr = nullptr; if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleDeviceLocalBits, - "Staging Buffer - VRAM (ReBAR)", stagingBuffer, stagingAlloc, mappedPtr)) + "Staging Buffer - VRAM", stagingBuffer, stagingAlloc, mappedPtr)) { return false; } throughputVRAM = runBenchmark( - "VRAM (ReBAR)", + "VRAM", stagingBuffer.get(), mappedPtr, destinationImage.get(), @@ -205,7 +213,8 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint32_t tilesPerFrame, uint32_t tileSize, uint32_t tileSizeBytes, - uint32_t imageWidth) + uint32_t imageWidth, + uint32_t bufferBaseOffset) { uint32_t tilesPerRow = imageWidth / tileSize; for (size_t i = 0; i < tilesPerFrame; i++) @@ -213,7 +222,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint32_t tileX = (i % tilesPerRow) * tileSize; uint32_t tileY = (i / tilesPerRow) * tileSize; - outRegions[i].bufferOffset = i * tileSizeBytes; + outRegions[i].bufferOffset = bufferBaseOffset + (i * tileSizeBytes); outRegions[i].bufferRowLength = 
tileSize; outRegions[i].bufferImageHeight = tileSize; outRegions[i].imageOffset = { tileX, tileY, 0 }; @@ -225,23 +234,6 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp } } - void generateRandomTileData(void* mappedPtr, uint32_t sizeBytes) - { - uint32_t* data = (uint32_t*)mappedPtr; - unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); - std::mt19937 g(seed); - const uint32_t valueCount = sizeBytes / sizeof(uint32_t); - - auto bufferData = new uint32_t[valueCount]; - - for (uint32_t i = 0; i < valueCount; i++) - { - bufferData[i] = g(); - } - memcpy(mappedPtr, bufferData, sizeBytes); - delete[] bufferData; - } - double runBenchmark( const char* strategyName, IGPUBuffer* stagingBuffer, @@ -305,12 +297,31 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp }; m_device->blockForSemaphores({ &waitInfo, 1 }); - auto regions = new IImage::SBufferCopy[tilesPerFrame]; + uint32_t imageWidth = destinationImage->getCreationParameters().extent.width; + uint32_t partitionSize = tilesPerFrame * tileSizeBytes; - generateRandomTileData(mappedPtr, tilesPerFrame * tileSizeBytes); + // CPU source buffer with random data (generated once, reused each frame) + auto cpuSourceData = new uint8_t[partitionSize]; + { + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937 g(seed); + uint32_t* data = reinterpret_cast(cpuSourceData); + for (uint32_t i = 0; i < partitionSize / sizeof(uint32_t); i++) + data[i] = g(); + } - uint32_t imageWidth = destinationImage->getCreationParameters().extent.width; - generateTileCopyRegions(regions, tilesPerFrame, tileSize, tileSizeBytes, imageWidth); + auto regionsPerFrame = new IImage::SBufferCopy*[framesInFlight]; + for (uint32_t i = 0; i < framesInFlight; i++) + { + regionsPerFrame[i] = new IImage::SBufferCopy[tilesPerFrame]; + uint32_t bufferOffset = i * partitionSize; + generateTileCopyRegions(regionsPerFrame[i], 
tilesPerFrame, tileSize, tileSizeBytes, imageWidth, bufferOffset); + } + + double totalWaitTime = 0.0; + double totalMemcpyTime = 0.0; + double totalRecordTime = 0.0; + double totalSubmitTime = 0.0; auto startTime = std::chrono::high_resolution_clock::now(); @@ -318,19 +329,35 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp { uint32_t cmdBufIndex = frame % framesInFlight; - commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + auto t1 = std::chrono::high_resolution_clock::now(); + if (frame >= framesInFlight) + { + ISemaphore::SWaitInfo frameWaitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue - framesInFlight + 1 + }; + m_device->blockForSemaphores({&frameWaitInfo, 1}); + } + auto t2 = std::chrono::high_resolution_clock::now(); + + commandPools[cmdBufIndex]->reset(); + + uint32_t bufferOffset = cmdBufIndex * partitionSize; + void* targetPtr = static_cast(mappedPtr) + bufferOffset; + memcpy(targetPtr, cpuSourceData, partitionSize); + auto t3 = std::chrono::high_resolution_clock::now(); + commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); commandBuffers[cmdBufIndex]->copyBufferToImage( stagingBuffer, destinationImage, IImage::LAYOUT::TRANSFER_DST_OPTIMAL, tilesPerFrame, - regions + regionsPerFrame[cmdBufIndex] ); - commandBuffers[cmdBufIndex]->end(); + auto t4 = std::chrono::high_resolution_clock::now(); - // Create submit info for THIS frame IQueue::SSubmitInfo frameSubmitInfo = {}; IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = {.cmdbuf = commandBuffers[cmdBufIndex].get()}; frameSubmitInfo.commandBuffers = {&frameCmdBufInfo, 1}; @@ -342,18 +369,13 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp }; frameSubmitInfo.signalSemaphores = {&frameSignalInfo, 1}; - // Submit to GPU queue->submit({&frameSubmitInfo, 1}); + auto t5 = std::chrono::high_resolution_clock::now(); - // Wait for old frames - if 
(frame >= framesInFlight) - { - ISemaphore::SWaitInfo frameWaitInfo = { - .semaphore = timelineSemaphore.get(), - .value = timelineValue - framesInFlight - }; - m_device->blockForSemaphores({&frameWaitInfo, 1}); - } + totalWaitTime += std::chrono::duration(t2 - t1).count(); + totalMemcpyTime += std::chrono::duration(t3 - t2).count(); + totalRecordTime += std::chrono::duration(t4 - t3).count(); + totalSubmitTime += std::chrono::duration(t5 - t4).count(); } // Wait for all remaining frames to complete @@ -365,15 +387,24 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp auto endTime = std::chrono::high_resolution_clock::now(); - delete[] regions; + delete[] cpuSourceData; + for (uint32_t i = 0; i < framesInFlight; i++) + delete[] regionsPerFrame[i]; + delete[] regionsPerFrame; delete[] commandPools; delete[] commandBuffers; - // Calculate throughput double elapsedSeconds = std::chrono::duration(endTime - startTime).count(); uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes; double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds; + m_logger->log(" Timing breakdown for %s:", ILogger::ELL_INFO, strategyName); + m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds); + m_logger->log(" Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); + m_logger->log(" Record time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds); + m_logger->log(" Submit time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds); + m_logger->log(" Memcpy speed: %.2f GB/s", ILogger::ELL_INFO, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime); + return throughputGBps; } @@ -401,7 +432,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp if (!outAllocation.isValid()) 
return logFail("Failed to allocate Device Memory!\n"); - outMappedPtr = outAllocation.memory->map({0ull, outAllocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ); + outMappedPtr = outAllocation.memory->map({0ull, outAllocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_WRITE); if (!outMappedPtr) return logFail("Failed to map Device Memory!\n"); From 874814af7c8dd08c264afbdebef1e0719561dffe Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Wed, 31 Dec 2025 16:29:19 +0330 Subject: [PATCH 4/5] Resolved PR comments + adding timestamp query --- 73_ImageUploadBenchmark/main.cpp | 159 +++++++++++++++++++++---------- 1 file changed, 110 insertions(+), 49 deletions(-) diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp index eceb0f9ea..f8124c9ab 100644 --- a/73_ImageUploadBenchmark/main.cpp +++ b/73_ImageUploadBenchmark/main.cpp @@ -1,5 +1,6 @@ #include "nbl/examples/examples.hpp" #include +#include using namespace nbl; using namespace nbl::core; @@ -68,8 +69,9 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp { IGPUImage::SCreationParams imgParams{}; imgParams.type = IImage::E_TYPE::ET_2D; - imgParams.extent.width = TILE_SIZE * 32; - imgParams.extent.height = TILE_SIZE * 32; + uint32_t tilePerRow = (uint32_t)std::sqrt(TILES_PER_FRAME); + imgParams.extent.width = TILE_SIZE * tilePerRow; + imgParams.extent.height = TILE_SIZE * tilePerRow; imgParams.extent.depth = 1u; imgParams.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM; imgParams.mipLevels = 1u; @@ -111,6 +113,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp throughputSystemRAM = runBenchmark( "System RAM", stagingBuffer.get(), + stagingAlloc, mappedPtr, destinationImage.get(), TILE_SIZE, @@ -147,6 +150,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp throughputVRAM = runBenchmark( "VRAM", stagingBuffer.get(), + stagingAlloc, mappedPtr, 
destinationImage.get(), TILE_SIZE, @@ -166,6 +170,9 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup); } + m_logger->log("\nWaiting 5 seconds before exit...", ILogger::ELL_INFO); + std::this_thread::sleep_for(std::chrono::seconds(5)); + return true; } @@ -186,28 +193,6 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp } private: - void transitionImageLayout( - IGPUCommandBuffer* cmdBuf, - IGPUImage* image, - IImage::LAYOUT oldLayout, - IImage::LAYOUT newLayout) - { - IGPUCommandBuffer::SImageMemoryBarrier barrier = {}; - barrier.oldLayout = oldLayout; - barrier.newLayout = newLayout; - barrier.image = image; - barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; - barrier.subresourceRange.baseMipLevel = 0; - barrier.subresourceRange.levelCount = 1; - barrier.subresourceRange.baseArrayLayer = 0; - barrier.subresourceRange.layerCount = 1; - barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; - barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; - barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; - barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; - cmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} }); - } - void generateTileCopyRegions( IImage::SBufferCopy* outRegions, uint32_t tilesPerFrame, @@ -237,6 +222,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp double runBenchmark( const char* strategyName, IGPUBuffer* stagingBuffer, + IDeviceMemoryAllocator::SAllocation& stagingAlloc, void* mappedPtr, IGPUImage* destinationImage, uint32_t tileSize, @@ -248,7 +234,16 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp { smart_refctd_ptr timelineSemaphore = m_device->createSemaphore(0); - auto commandPools = new 
smart_refctd_ptr[framesInFlight]; + smart_refctd_ptr queryPool; + { + IQueryPool::SCreationParams queryPoolParams = {}; + queryPoolParams.queryType = IQueryPool::TYPE::TIMESTAMP; + queryPoolParams.queryCount = framesInFlight * 2; + queryPoolParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + queryPool = m_device->createQueryPool(queryPoolParams); + } + + std::vector> commandPools(framesInFlight); for (uint32_t i = 0; i < framesInFlight; i++) { commandPools[i] = m_device->createCommandPool( @@ -256,8 +251,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT ); } - - auto commandBuffers = new smart_refctd_ptr[framesInFlight]; + std::vector> commandBuffers(framesInFlight); for (uint32_t i = 0; i < framesInFlight; i++) { commandPools[i]->createCommandBuffers( @@ -270,12 +264,22 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint64_t timelineValue = 0; commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - transitionImageLayout( - commandBuffers[0].get(), - destinationImage, - IImage::LAYOUT::UNDEFINED, - IImage::LAYOUT::TRANSFER_DST_OPTIMAL - ); + { + IGPUCommandBuffer::SImageMemoryBarrier initBarrier = {}; + initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED; + initBarrier.newLayout = IImage::LAYOUT::GENERAL; + initBarrier.image = destinationImage; + initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + initBarrier.subresourceRange.baseMipLevel = 0; + initBarrier.subresourceRange.levelCount = 1; + initBarrier.subresourceRange.baseArrayLayer = 0; + initBarrier.subresourceRange.layerCount = 1; + initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; + initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE; + initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; + 
commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&initBarrier, 1}}); + } commandBuffers[0]->end(); IQueue::SSubmitInfo submitInfo = {}; @@ -300,22 +304,20 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint32_t imageWidth = destinationImage->getCreationParameters().extent.width; uint32_t partitionSize = tilesPerFrame * tileSizeBytes; - // CPU source buffer with random data (generated once, reused each frame) - auto cpuSourceData = new uint8_t[partitionSize]; + std::vector cpuSourceData(partitionSize); { unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); std::mt19937 g(seed); - uint32_t* data = reinterpret_cast(cpuSourceData); + uint32_t* data = reinterpret_cast(cpuSourceData.data()); for (uint32_t i = 0; i < partitionSize / sizeof(uint32_t); i++) data[i] = g(); } - - auto regionsPerFrame = new IImage::SBufferCopy*[framesInFlight]; + std::vector> regionsPerFrame(framesInFlight); for (uint32_t i = 0; i < framesInFlight; i++) { - regionsPerFrame[i] = new IImage::SBufferCopy[tilesPerFrame]; + regionsPerFrame[i].resize(tilesPerFrame); uint32_t bufferOffset = i * partitionSize; - generateTileCopyRegions(regionsPerFrame[i], tilesPerFrame, tileSize, tileSizeBytes, imageWidth, bufferOffset); + generateTileCopyRegions(regionsPerFrame[i].data(), tilesPerFrame, tileSize, tileSizeBytes, imageWidth, bufferOffset); } double totalWaitTime = 0.0; @@ -344,17 +346,63 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint32_t bufferOffset = cmdBufIndex * partitionSize; void* targetPtr = static_cast(mappedPtr) + bufferOffset; - memcpy(targetPtr, cpuSourceData, partitionSize); + memcpy(targetPtr, cpuSourceData.data(), partitionSize); + + if (!stagingAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(stagingAlloc.memory.get(), bufferOffset, partitionSize); + 
m_device->flushMappedMemoryRanges(1, &range); + } + auto t3 = std::chrono::high_resolution_clock::now(); commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + uint32_t queryStartIndex = cmdBufIndex * 2; + commandBuffers[cmdBufIndex]->resetQueryPool(queryPool.get(), queryStartIndex, 2); + + IGPUCommandBuffer::SImageMemoryBarrier barrier = {}; + barrier.oldLayout = IImage::LAYOUT::GENERAL; + barrier.newLayout = IImage::LAYOUT::GENERAL; + barrier.image = destinationImage; + barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&barrier, 1}}); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 0); + commandBuffers[cmdBufIndex]->copyBufferToImage( stagingBuffer, destinationImage, - IImage::LAYOUT::TRANSFER_DST_OPTIMAL, + IImage::LAYOUT::GENERAL, tilesPerFrame, - regionsPerFrame[cmdBufIndex] + regionsPerFrame[cmdBufIndex].data() ); + + IGPUCommandBuffer::SImageMemoryBarrier afterBarrier = {}; + afterBarrier.oldLayout = IImage::LAYOUT::GENERAL; + afterBarrier.newLayout = IImage::LAYOUT::GENERAL; + afterBarrier.image = destinationImage; + afterBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + afterBarrier.subresourceRange.baseMipLevel = 0; + afterBarrier.subresourceRange.levelCount = 1; + afterBarrier.subresourceRange.baseArrayLayer = 0; + 
afterBarrier.subresourceRange.layerCount = 1; + afterBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + afterBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + afterBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + afterBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&afterBarrier, 1}}); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 1); + commandBuffers[cmdBufIndex]->end(); auto t4 = std::chrono::high_resolution_clock::now(); @@ -387,17 +435,30 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp auto endTime = std::chrono::high_resolution_clock::now(); - delete[] cpuSourceData; - for (uint32_t i = 0; i < framesInFlight; i++) - delete[] regionsPerFrame[i]; - delete[] regionsPerFrame; - delete[] commandPools; - delete[] commandBuffers; + std::vector timestamps(framesInFlight * 2); + const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); + m_device->getQueryPoolResults(queryPool.get(), 0, framesInFlight * 2, timestamps.data(), sizeof(uint64_t), flags); + uint64_t totalGpuTicks = 0; + for (uint32_t i = 0; i < framesInFlight; i++) { + uint64_t startTick = timestamps[i * 2 + 0]; + uint64_t endTick = timestamps[i * 2 + 1]; + totalGpuTicks += (endTick - startTick); + } + float timestampPeriod = m_physicalDevice->getLimits().timestampPeriodInNanoSeconds; + double sampledGpuTimeSeconds = (totalGpuTicks * timestampPeriod) / 1e9; + + double avgGpuTimePerFrame = sampledGpuTimeSeconds / framesInFlight; + double totalGpuTimeSeconds = avgGpuTimePerFrame * totalFrames; + double elapsedSeconds = std::chrono::duration(endTime - startTime).count(); uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * 
tileSizeBytes; + double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds; + m_logger->log(" GPU time: %.3f s", ILogger::ELL_INFO, totalGpuTimeSeconds); + m_logger->log(" GPU throughput: %.2f GB/s", ILogger::ELL_INFO, throughputGBps); + m_logger->log(" Timing breakdown for %s:", ILogger::ELL_INFO, strategyName); m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds); m_logger->log(" Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); From ddb7bfc6ae5889aea89db756b461a0beeb763d0f Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Thu, 1 Jan 2026 17:01:15 +0330 Subject: [PATCH 5/5] Adding more logs to release build --- 73_ImageUploadBenchmark/main.cpp | 36 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp index f8124c9ab..ff38b1555 100644 --- a/73_ImageUploadBenchmark/main.cpp +++ b/73_ImageUploadBenchmark/main.cpp @@ -33,11 +33,11 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT); constexpr uint32_t TOTAL_FRAMES = 1000; - m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_INFO); - m_logger->log("Tile size: %ux%u (%u KB)", ILogger::ELL_INFO, TILE_SIZE, TILE_SIZE, TILE_SIZE_BYTES / 1024); - m_logger->log("Staging buffer: %u MB", ILogger::ELL_INFO, STAGING_BUFFER_SIZE / (1024 * 1024)); - m_logger->log("Tiles per frame: %u", ILogger::ELL_INFO, TILES_PER_FRAME); - m_logger->log("Frames in flight: %u", ILogger::ELL_INFO, FRAMES_IN_FLIGHT); + m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_PERFORMANCE); + m_logger->log("Tile size: %ux%u (%u KB)", ILogger::ELL_PERFORMANCE, TILE_SIZE, TILE_SIZE, TILE_SIZE_BYTES / 1024); + m_logger->log("Staging buffer: %u MB", 
ILogger::ELL_PERFORMANCE, STAGING_BUFFER_SIZE / (1024 * 1024)); + m_logger->log("Tiles per frame: %u", ILogger::ELL_PERFORMANCE, TILES_PER_FRAME); + m_logger->log("Frames in flight: %u", ILogger::ELL_PERFORMANCE, FRAMES_IN_FLIGHT); uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits(); uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(); @@ -48,9 +48,9 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits; m_logger->log("Memory type bits - HostVisible: 0x%X, DeviceLocal: 0x%X, HostCached: 0x%X", - ILogger::ELL_INFO, hostVisibleBits, deviceLocalBits, hostCachedBits); + ILogger::ELL_PERFORMANCE, hostVisibleBits, deviceLocalBits, hostCachedBits); m_logger->log("System RAM (non-cached): 0x%X, VRAM: 0x%X", - ILogger::ELL_INFO, hostVisibleOnlyBits, hostVisibleDeviceLocalBits); + ILogger::ELL_PERFORMANCE, hostVisibleOnlyBits, hostVisibleDeviceLocalBits); if (!hostVisibleOnlyBits) { @@ -96,7 +96,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp return logFail("Failed to allocate DEVICE_LOCAL memory for destination image!\n"); } - m_logger->log("\nTesting Strategy 1: System RAM", ILogger::ELL_INFO); + m_logger->log("\nStrategy 1: System RAM", ILogger::ELL_PERFORMANCE); double throughputSystemRAM = 0.0; { @@ -133,7 +133,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp if (hostVisibleDeviceLocalBits) { - m_logger->log("\nTesting Strategy 2: VRAM", ILogger::ELL_INFO); + m_logger->log("\nStrategy 2: VRAM", ILogger::ELL_PERFORMANCE); double throughputVRAM = 0.0; { @@ -170,7 +170,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup); } - m_logger->log("\nWaiting 5 seconds before exit...", ILogger::ELL_INFO); + m_logger->log("\nWaiting 5 
seconds before exit...", ILogger::ELL_PERFORMANCE); std::this_thread::sleep_for(std::chrono::seconds(5)); return true; @@ -456,15 +456,15 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds; - m_logger->log(" GPU time: %.3f s", ILogger::ELL_INFO, totalGpuTimeSeconds); - m_logger->log(" GPU throughput: %.2f GB/s", ILogger::ELL_INFO, throughputGBps); + m_logger->log(" GPU time: %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds); + m_logger->log(" GPU throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputGBps); - m_logger->log(" Timing breakdown for %s:", ILogger::ELL_INFO, strategyName); - m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds); - m_logger->log(" Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); - m_logger->log(" Record time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds); - m_logger->log(" Submit time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds); - m_logger->log(" Memcpy speed: %.2f GB/s", ILogger::ELL_INFO, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime); + m_logger->log(" Timing breakdown for %s:", ILogger::ELL_PERFORMANCE, strategyName); + m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds); + m_logger->log(" Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); + m_logger->log(" Record time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds); + m_logger->log(" Submit time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds); + m_logger->log(" Memcpy 
speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime); return throughputGBps; }