From 6635ba9cad53f599e4643aa90e7dfa9e800d144c Mon Sep 17 00:00:00 2001
From: CrabeExtra <abbasgaroosi7@gmail.com>
Date: Tue, 23 Dec 2025 18:10:26 +0330
Subject: [PATCH 1/5] Add 73_ImageUploadBenchmark example

---
 73_ImageUploadBenchmark/CMakeLists.txt       |   6 +
 73_ImageUploadBenchmark/config.json.template |  28 ++
 73_ImageUploadBenchmark/main.cpp             | 392 +++++++++++++++++++
 73_ImageUploadBenchmark/pipeline.groovy      |  50 +++
 CMakeLists.txt                               |   1 +
 5 files changed, 477 insertions(+)
 create mode 100644 73_ImageUploadBenchmark/CMakeLists.txt
 create mode 100644 73_ImageUploadBenchmark/config.json.template
 create mode 100644 73_ImageUploadBenchmark/main.cpp
 create mode 100644 73_ImageUploadBenchmark/pipeline.groovy
diff --git a/73_ImageUploadBenchmark/CMakeLists.txt b/73_ImageUploadBenchmark/CMakeLists.txt
new file mode 100644
index 000000000..2f9218f93
--- /dev/null
+++ b/73_ImageUploadBenchmark/CMakeLists.txt
@@ -0,0 +1,6 @@
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
\ No newline at end of file
diff --git a/73_ImageUploadBenchmark/config.json.template b/73_ImageUploadBenchmark/config.json.template
new file mode 100644
index 000000000..12215d0bb
--- /dev/null
+++ b/73_ImageUploadBenchmark/config.json.template
@@ -0,0 +1,28 @@
+{
+  "enableParallelBuild": true,
+  "threadsPerBuildProcess" : 2,
+  "isExecuted": false,
+  "scriptPath": "",
+  "cmake": {
+    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+    "buildModes": [],
+    "requiredOptions": []
+  }, 
+  "profiles": [
+    {
+      "backend": "vulkan", // should be none
+      "platform": "windows",
+      "buildModes": [],
+      "runConfiguration": "Release", // we also need to run in Debug and RWDI because foundational example
+      "gpuArchitectures": []
+    }
+  ],
+  "dependencies": [],
+  "data": [
+    {
+      "dependencies": [],
+      "command": [""],
+      "outputs": []
+    }
+  ]
+}
diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp
new file mode 100644
index 000000000..a22647750
--- /dev/null
+++ b/73_ImageUploadBenchmark/main.cpp
@@ -0,0 +1,392 @@
+#include "nbl/examples/examples.hpp"
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
+
+using namespace nbl;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
+
+#include "app_resources/common.hlsl"
+#include "nbl/builtin/hlsl/bit.hlsl"
+
+class CountingSortApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
+{
+		using device_base_t = application_templates::MonoDeviceApplication;
+		using asset_base_t = BuiltinResourcesApplication;
+
+	public:
+		// Yay thanks to multiple inheritance we cannot forward ctors anymore
+		CountingSortApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+			system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+		// we stuff all our work here because its a "single shot" app
+		bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		{
+			// Remember to call the base class initialization!
+			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+				return false;
+			if (!asset_base_t::onAppInitialized(std::move(system)))
+				return false;
+
+			auto limits = m_physicalDevice->getLimits();
+			constexpr std::array<uint32_t, 3u> AllowedMaxComputeSharedMemorySizes = {
+				16384, 32768, 65536
+			};
+
+			auto upperBoundSharedMemSize = std::upper_bound(AllowedMaxComputeSharedMemorySizes.begin(), AllowedMaxComputeSharedMemorySizes.end(), limits.maxComputeSharedMemorySize);
+			// devices which support less than 16KB of max compute shared memory size are not supported
+			if (upperBoundSharedMemSize == AllowedMaxComputeSharedMemorySizes.begin())
+			{
+				m_logger->log("maxComputeSharedMemorySize is too low (%u)", ILogger::E_LOG_LEVEL::ELL_ERROR, limits.maxComputeSharedMemorySize);
+				exit(0);
+			}
+
+			limits.maxComputeSharedMemorySize = *(upperBoundSharedMemSize - 1);
+
+			const uint32_t WorkgroupSize = limits.maxComputeWorkGroupInvocations;
+			const uint32_t MaxBucketCount = (limits.maxComputeSharedMemorySize / sizeof(uint32_t)) / 2;
+			constexpr uint32_t element_count = 100000;
+			const uint32_t bucket_count = std::min((uint32_t)3000, MaxBucketCount);
+			const uint32_t elements_per_thread = ceil((float)ceil((float)element_count / limits.computeUnits) / WorkgroupSize);
+
+			auto loadPrecompiledShader = [&]<core::StringLiteral ShaderKey>() -> smart_refctd_ptr<IShader>
+			{
+				// this time we load a shader directly from a file
+				IAssetLoader::SAssetLoadParams lp = {};
+				lp.logger = m_logger.get();
+				lp.workingDirectory = "app_resources"; // virtual root
+				auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(limits, m_physicalDevice->getFeatures());
+				auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
+				const auto assets = assetBundle.getContents();
+				if (assets.empty())
+				{
+					logFail("Could not load shader!");
+					return nullptr;
+				}
+
+				auto shader = IAsset::castDown<IShader>(assets[0]);
+				// The down-cast should not fail!
+				assert(shader);
+			
+				// There's two ways of doing stuff like this:
+				// 1. this - modifying the asset after load
+				// 2. creating a short shader source file that includes the asset you would have wanted to load
+				// 
+				//auto overrideSource = CHLSLCompiler::createOverridenCopy(
+				//	source.get(), "#define WorkgroupSize %d\n#define BucketCount %d\n",
+				//	WorkgroupSize, bucket_count
+				//);
+
+				// this time we skip the use of the asset converter since the IShader->IGPUShader path is quick and simple
+				return shader;
+			};
+			auto prefixSumShader = loadPrecompiledShader.operator()<"prefix_sum_shader">(); // "app_resources/prefix_sum_shader.comp.hlsl"
+			auto scatterShader = loadPrecompiledShader.operator()<"scatter_shader">(); // "app_resources/scatter_shader.comp.hlsl"
+
+			// People love Reflection but I prefer Shader Sources instead!
+			const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(CountingPushData) };
+
+			// This time we'll have no Descriptor Sets or Layouts because our workload has a widely varying size
+			// and using traditional SSBO bindings would force us to update the Descriptor Set every frame.
+			// I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic
+			// only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding.
+			// Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size.
+			smart_refctd_ptr<IGPUPipelineLayout> layout;
+			smart_refctd_ptr<IGPUComputePipeline> prefixSumPipeline;
+			smart_refctd_ptr<IGPUComputePipeline> scatterPipeline;
+			{
+				layout = m_device->createPipelineLayout({ &pcRange,1 });
+				IGPUComputePipeline::SCreationParams params = {};
+				params.layout = layout.get();
+				params.shader.shader = prefixSumShader.get();
+				params.shader.entryPoint = "main";
+				params.shader.entries = nullptr;
+				params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(5);
+				params.cached.requireFullSubgroups = true;
+				if (!m_device->createComputePipelines(nullptr, { &params,1 }, &prefixSumPipeline))
+					return logFail("Failed to create compute pipeline!\n");
+				params.shader.shader = scatterShader.get();
+				if (!m_device->createComputePipelines(nullptr, { &params,1 }, &scatterPipeline))
+					return logFail("Failed to create compute pipeline!\n");
+			}
+
+			// Allocate memory
+			nbl::video::IDeviceMemoryAllocator::SAllocation allocation[5] = {};
+			smart_refctd_ptr<IGPUBuffer> buffers[5];
+			//smart_refctd_ptr<nbl::video::IGPUDescriptorSet> ds;
+			{
+				auto build_buffer = [this](
+					smart_refctd_ptr<ILogicalDevice> m_device,
+					nbl::video::IDeviceMemoryAllocator::SAllocation *allocation,
+					smart_refctd_ptr<IGPUBuffer>& buffer,
+					size_t buffer_size,
+					const char *label
+				) -> void {
+					IGPUBuffer::SCreationParams params;
+					params.size = buffer_size;
+					params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+					buffer = m_device->createBuffer(std::move(params));
+					if (!buffer)
+						logFail("Failed to create GPU buffer of size %d!\n", buffer_size);
+
+					buffer->setObjectDebugName(label);
+
+					auto reqs = buffer->getMemoryReqs();
+					reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
+
+					*allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+					if (!allocation->isValid())
+						logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
+
+					assert(allocation->memory.get() == buffer->getBoundMemory().memory);
+				};
+
+				build_buffer(m_device,	allocation,		buffers[0], sizeof(uint32_t) * element_count,	"Input Key Buffer");
+				build_buffer(m_device,	allocation + 1,	buffers[1], sizeof(uint32_t) * element_count,	"Input Value Buffer");
+				build_buffer(m_device,	allocation + 2, buffers[2], sizeof(uint32_t) * bucket_count,	"Scratch Buffer");
+				build_buffer(m_device,	allocation + 3,	buffers[3], sizeof(uint32_t) * element_count,	"Output Key Buffer");
+				build_buffer(m_device,	allocation + 4, buffers[4], sizeof(uint32_t) * element_count,	"Output Value Buffer");
+			}
+			uint64_t buffer_device_address[] = {
+				buffers[0]->getDeviceAddress(),
+				buffers[1]->getDeviceAddress(),
+				buffers[2]->getDeviceAddress(),
+				buffers[3]->getDeviceAddress(),
+				buffers[4]->getDeviceAddress()
+			};
+
+			void* mapped_memory[] = {
+				allocation[0].memory->map({0ull,allocation[0].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
+				allocation[1].memory->map({0ull,allocation[1].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
+				allocation[2].memory->map({0ull,allocation[2].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
+				allocation[3].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
+				allocation[4].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
+			};
+			if (!mapped_memory[0] || !mapped_memory[1] || !mapped_memory[2] || !mapped_memory[3] || !mapped_memory[4])
+				return logFail("Failed to map the Device Memory!\n");
+
+			// Generate random data
+			constexpr uint32_t minimum = 0;
+			const uint32_t range = bucket_count;
+			unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+			std::mt19937 g(seed);
+
+			auto bufferData = new uint32_t[2][element_count];
+			for (uint32_t i = 0; i < element_count; i++) {
+				bufferData[0][i] = minimum + g() % range;
+			}
+
+			memcpy(mapped_memory[0], bufferData[0], sizeof(uint32_t) * element_count);
+
+			for (uint32_t i = 0; i < element_count; i++) {
+				bufferData[1][i] = g() % std::numeric_limits<uint32_t>::max();
+			}
+
+			memcpy(mapped_memory[1], bufferData[1], sizeof(uint32_t) * element_count);
+
+			std::string outBuffer;
+			for (auto i = 0; i < element_count; i++) {
+				outBuffer.append("{");
+				outBuffer.append(std::to_string(bufferData[0][i]));
+				outBuffer.append(", ");
+				outBuffer.append(std::to_string(bufferData[1][i]));
+				outBuffer.append("} ");
+			}
+			outBuffer.append("\n");
+			outBuffer.append("Count: ");
+			outBuffer.append(std::to_string(element_count));
+			outBuffer.append("\n");
+			m_logger->log("Your input array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
+
+			auto pc = CountingPushData {
+				.inputKeyAddress = buffer_device_address[0],
+				.inputValueAddress = buffer_device_address[1],
+				.histogramAddress = buffer_device_address[2],
+				.outputKeyAddress = buffer_device_address[3],
+				.outputValueAddress = buffer_device_address[4],
+				.dataElementCount = element_count,
+				.elementsPerWT = elements_per_thread,
+				.minimum = minimum,
+				.maximum = minimum + bucket_count - 1,
+			};
+
+			smart_refctd_ptr<nbl::video::IGPUCommandBuffer> cmdBuf;
+			{
+				smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+				if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdBuf))
+					return logFail("Failed to create Command Buffers!\n");
+			}
+
+			// Create the Semaphore for prefix sum
+			constexpr uint64_t started_value = 0;
+			uint64_t timeline = started_value;
+			smart_refctd_ptr<ISemaphore> progress = m_device->createSemaphore(started_value);
+
+			cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			cmdBuf->beginDebugMarker("Prefix Sum Dispatch", core::vectorSIMDf(0, 1, 0, 1));
+			cmdBuf->bindComputePipeline(prefixSumPipeline.get());
+			cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
+			cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1);
+			cmdBuf->endDebugMarker();
+			cmdBuf->end();
+
+			{
+				auto queue = getComputeQueue();
+
+				IQueue::SSubmitInfo submit_infos[1];
+				IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
+					{
+						.cmdbuf = cmdBuf.get()
+					}
+				};
+				submit_infos[0].commandBuffers = cmdBufs;
+				IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
+					{
+						.semaphore = progress.get(),
+						.value = ++timeline,
+						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+					}
+				};
+				submit_infos[0].signalSemaphores = signals;
+
+				m_api->startCapture();
+				queue->submit(submit_infos);
+				m_api->endCapture();
+			}
+
+			const ISemaphore::SWaitInfo wait_infos[] = { {
+					.semaphore = progress.get(),
+					.value = timeline
+				} };
+			m_device->blockForSemaphores(wait_infos);
+
+			// Create the Semaphore for Scatter
+			uint64_t timeline2 = started_value;
+			smart_refctd_ptr<ISemaphore> progress2 = m_device->createSemaphore(started_value);
+
+			cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			cmdBuf->beginDebugMarker("Scatter Dispatch", core::vectorSIMDf(0, 1, 0, 1));
+			cmdBuf->bindComputePipeline(scatterPipeline.get());
+			cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
+			cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1);
+			cmdBuf->endDebugMarker();
+			cmdBuf->end();
+
+			{
+				auto queue = getComputeQueue();
+
+				IQueue::SSubmitInfo submit_infos[1];
+				IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
+					{
+						.cmdbuf = cmdBuf.get()
+					}
+				};
+				submit_infos[0].commandBuffers = cmdBufs;
+				IQueue::SSubmitInfo::SSemaphoreInfo waits[] = {
+					{
+						.semaphore = progress.get(),
+						.value = timeline,
+						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+					}
+				};
+				submit_infos[0].waitSemaphores = waits;
+				IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
+					{
+						.semaphore = progress2.get(),
+						.value = ++timeline2,
+						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+					}
+				};
+				submit_infos[0].signalSemaphores = signals;
+
+				m_api->startCapture();
+				queue->submit(submit_infos);
+				m_api->endCapture();
+			}
+
+			const ISemaphore::SWaitInfo wait_infos2[] = {{
+					.semaphore = progress2.get(),
+					.value = timeline2
+				} };
+			m_device->blockForSemaphores(wait_infos2);
+
+			const ILogicalDevice::MappedMemoryRange memory_range[] = {
+				ILogicalDevice::MappedMemoryRange(allocation[0].memory.get(), 0ull, allocation[0].memory->getAllocationSize()),
+				ILogicalDevice::MappedMemoryRange(allocation[1].memory.get(), 0ull, allocation[1].memory->getAllocationSize()),
+				ILogicalDevice::MappedMemoryRange(allocation[2].memory.get(), 0ull, allocation[2].memory->getAllocationSize()),
+				ILogicalDevice::MappedMemoryRange(allocation[3].memory.get(), 0ull, allocation[3].memory->getAllocationSize()),
+				ILogicalDevice::MappedMemoryRange(allocation[4].memory.get(), 0ull, allocation[4].memory->getAllocationSize())
+			};
+
+			if (!allocation[0].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+				m_device->invalidateMappedMemoryRanges(1, &memory_range[0]);
+			if (!allocation[1].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+				m_device->invalidateMappedMemoryRanges(1, &memory_range[1]);
+			if (!allocation[2].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+				m_device->invalidateMappedMemoryRanges(1, &memory_range[2]);
+			if (!allocation[3].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+				m_device->invalidateMappedMemoryRanges(1, &memory_range[3]);
+			if (!allocation[4].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+				m_device->invalidateMappedMemoryRanges(1, &memory_range[4]);
+
+			const uint32_t* buffData[] = {
+				reinterpret_cast<const uint32_t*>(allocation[2].memory->getMappedPointer()),
+				reinterpret_cast<const uint32_t*>(allocation[3].memory->getMappedPointer()),
+				reinterpret_cast<const uint32_t*>(allocation[4].memory->getMappedPointer())
+			};
+
+			assert(allocation[2].offset == 0); // simpler than writing out all the pointer arithmetic
+			assert(allocation[3].offset == 0); // simpler than writing out all the pointer arithmetic
+			assert(allocation[4].offset == 0); // simpler than writing out all the pointer arithmetic
+
+			outBuffer.clear();
+			for (auto i = 0; i < bucket_count; i++) {
+				outBuffer.append(std::to_string(buffData[0][i]));
+				outBuffer.append(" ");
+			}
+			outBuffer.append("\n");
+
+			m_logger->log("Scratch buffer is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
+
+			outBuffer.clear();
+			for (auto i = 0; i < element_count; i++) {
+				outBuffer.append("{");
+				outBuffer.append(std::to_string(buffData[1][i]));
+				outBuffer.append(", ");
+				outBuffer.append(std::to_string(buffData[2][i]));
+				outBuffer.append("} ");
+			}
+			outBuffer.append("\n");
+			outBuffer.append("Count: ");
+			outBuffer.append(std::to_string(element_count));
+			outBuffer.append("\n");
+			m_logger->log("Your output array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
+
+			allocation[0].memory->unmap();
+			allocation[1].memory->unmap();
+			allocation[2].memory->unmap();
+			allocation[3].memory->unmap();
+			allocation[4].memory->unmap();
+
+			m_device->waitIdle();
+
+			delete[] bufferData;
+
+			return true;
+		}
+
+		// Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script)
+		bool keepRunning() override { return false; }
+
+		// Finally the first actual work-loop
+		void workLoopBody() override {}
+
+		bool onAppTerminated() override { return true; }
+};
+
+
+NBL_MAIN_FUNC(CountingSortApp)
\ No newline at end of file
diff --git a/73_ImageUploadBenchmark/pipeline.groovy b/73_ImageUploadBenchmark/pipeline.groovy
new file mode 100644
index 000000000..1249f10b5
--- /dev/null
+++ b/73_ImageUploadBenchmark/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CImageUploadBenchmark extends IBuilder
+{
+	public CImageUploadBenchmark(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+	
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+  	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+		
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+		
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+		
+		return true
+	}
+	
+	@Override
+  	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CImageUploadBenchmark(_agent, _info) // must name the IBuilder subclass declared in this script
+}
+
+return this
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cbe482aa4..2d4ed7408 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,6 +89,7 @@ if(NBL_BUILD_EXAMPLES)
   	add_subdirectory(70_FLIPFluids)
 	add_subdirectory(71_RayTracingPipeline)
 	add_subdirectory(72_CooperativeBinarySearch)
+	add_subdirectory(73_ImageUploadBenchmark)
 
 	# add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory
 	NBL_GET_ALL_TARGETS(TARGETS)

From 951e2fdd218abc307f9890d69f7a9be38d28f95a Mon Sep 17 00:00:00 2001
From: CrabeExtra <abbasgaroosi7@gmail.com>
Date: Wed, 24 Dec 2025 18:38:09 +0330
Subject: [PATCH 2/5] Simple benchmark HOST_VISIBLE vs HOST_VISIBLE &
 DEVICE_LOCAL

---
 73_ImageUploadBenchmark/main.cpp | 694 ++++++++++++++++---------------
 1 file changed, 357 insertions(+), 337 deletions(-)

diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp
index a22647750..68815681d 100644
--- a/73_ImageUploadBenchmark/main.cpp
+++ b/73_ImageUploadBenchmark/main.cpp
@@ -1,392 +1,412 @@
 #include "nbl/examples/examples.hpp"
-#include "nbl/this_example/builtin/build/spirv/keys.hpp"
+#include <chrono>
 
 using namespace nbl;
 using namespace nbl::core;
-using namespace nbl::hlsl;
 using namespace nbl::system;
 using namespace nbl::asset;
-using namespace nbl::ui;
 using namespace nbl::video;
 using namespace nbl::examples;
 
-#include "app_resources/common.hlsl"
-#include "nbl/builtin/hlsl/bit.hlsl"
-
-class CountingSortApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
+class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
 {
-		using device_base_t = application_templates::MonoDeviceApplication;
-		using asset_base_t = BuiltinResourcesApplication;
+	using device_base_t = application_templates::MonoDeviceApplication;
+	using asset_base_t = BuiltinResourcesApplication;
+
+public:
+	ImageUploadBenchmarkApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+	{
+		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+		if (!asset_base_t::onAppInitialized(std::move(system)))
+			return false;
+
+		constexpr uint32_t TILE_SIZE = 128;
+		constexpr uint32_t TILE_BYTES_PER_PIXEL = 4;
+		constexpr uint32_t TILE_SIZE_BYTES = TILE_SIZE * TILE_SIZE * TILE_BYTES_PER_PIXEL;
+		constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024;
+		constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / TILE_SIZE_BYTES;
+		constexpr uint32_t FRAMES_IN_FLIGHT = 4;
+		constexpr uint32_t TOTAL_FRAMES = 1000;
+
+		m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_INFO);
+		m_logger->log("Tile size: %ux%u (%u KB)", ILogger::ELL_INFO, TILE_SIZE, TILE_SIZE, TILE_SIZE_BYTES / 1024);
+		m_logger->log("Staging buffer: %u MB", ILogger::ELL_INFO, STAGING_BUFFER_SIZE / (1024 * 1024));
+		m_logger->log("Tiles per frame: %u", ILogger::ELL_INFO, TILES_PER_FRAME);
+		m_logger->log("Frames in flight: %u", ILogger::ELL_INFO, FRAMES_IN_FLIGHT);
+
+		uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits();
+		uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits();
+		uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits;
+		uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits;
+
+		if (!hostVisibleOnlyBits)
+		{
+			m_logger->log("HOST_VISIBLE memory types not found!", ILogger::ELL_ERROR);
+			return false;
+		}
 
-	public:
-		// Yay thanks to multiple inheritance we cannot forward ctors anymore
-		CountingSortApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
-			system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+		if (!deviceLocalBits)
+		{
+			m_logger->log("DEVICE_LOCAL memory types not found!", ILogger::ELL_ERROR);
+			return false;
+		}
 
-		// we stuff all our work here because its a "single shot" app
-		bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		IQueue* queue = getQueue(IQueue::FAMILY_FLAGS::GRAPHICS_BIT);
+		smart_refctd_ptr<IGPUImage> destinationImage;
 		{
-			// Remember to call the base class initialization!
-			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
-				return false;
-			if (!asset_base_t::onAppInitialized(std::move(system)))
-				return false;
+			IGPUImage::SCreationParams imgParams{};
+			imgParams.type = IImage::E_TYPE::ET_2D;
+			imgParams.extent.width = TILE_SIZE * 32;
+			imgParams.extent.height = TILE_SIZE * 32;
+			imgParams.extent.depth = 1u;
+			imgParams.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM;
+			imgParams.mipLevels = 1u;
+			imgParams.flags = IImage::ECF_NONE;
+			imgParams.arrayLayers = 1u;
+			imgParams.samples = IImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT;
+			imgParams.tiling = video::IGPUImage::TILING::OPTIMAL;
+			imgParams.usage = asset::IImage::EUF_TRANSFER_DST_BIT;
+			imgParams.preinitialized = false;
+
+			destinationImage = m_device->createImage(std::move(imgParams));
+			if (!destinationImage)
+				return logFail("Failed to create destination image!\n");
+
+			destinationImage->setObjectDebugName("Destination Image");
+
+			auto reqs = destinationImage->getMemoryReqs();
+			reqs.memoryTypeBits &= deviceLocalBits;
+
+			auto allocation = m_device->allocate(reqs, destinationImage.get(), IDeviceMemoryAllocation::EMAF_NONE);
+			if (!allocation.isValid())
+				return logFail("Failed to allocate DEVICE_LOCAL memory for destination image!\n");
+		}
 
-			auto limits = m_physicalDevice->getLimits();
-			constexpr std::array<uint32_t, 3u> AllowedMaxComputeSharedMemorySizes = {
-				16384, 32768, 65536
-			};
+		m_logger->log("\nTesting Strategy 1: System RAM", ILogger::ELL_INFO);
+
+		double throughputSystemRAM = 0.0;
+		{
+			smart_refctd_ptr<IGPUBuffer> stagingBuffer;
+			IDeviceMemoryAllocator::SAllocation stagingAlloc;
+			void* mappedPtr = nullptr;
 
-			auto upperBoundSharedMemSize = std::upper_bound(AllowedMaxComputeSharedMemorySizes.begin(), AllowedMaxComputeSharedMemorySizes.end(), limits.maxComputeSharedMemorySize);
-			// devices which support less than 16KB of max compute shared memory size are not supported
-			if (upperBoundSharedMemSize == AllowedMaxComputeSharedMemorySizes.begin())
+			if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleOnlyBits,
+				"Staging Buffer - System RAM", stagingBuffer, stagingAlloc, mappedPtr))
 			{
-				m_logger->log("maxComputeSharedMemorySize is too low (%u)", ILogger::E_LOG_LEVEL::ELL_ERROR, limits.maxComputeSharedMemorySize);
-				exit(0);
+				return false;
 			}
 
-			limits.maxComputeSharedMemorySize = *(upperBoundSharedMemSize - 1);
+			throughputSystemRAM = runBenchmark(
+				"System RAM",
+				stagingBuffer.get(),
+				mappedPtr,
+				destinationImage.get(),
+				TILE_SIZE,
+				TILE_SIZE_BYTES,
+				TILES_PER_FRAME,
+				FRAMES_IN_FLIGHT,
+				TOTAL_FRAMES,
+				queue
+			);
+
+			stagingAlloc.memory->unmap();
+		}
 
-			const uint32_t WorkgroupSize = limits.maxComputeWorkGroupInvocations;
-			const uint32_t MaxBucketCount = (limits.maxComputeSharedMemorySize / sizeof(uint32_t)) / 2;
-			constexpr uint32_t element_count = 100000;
-			const uint32_t bucket_count = std::min((uint32_t)3000, MaxBucketCount);
-			const uint32_t elements_per_thread = ceil((float)ceil((float)element_count / limits.computeUnits) / WorkgroupSize);
+		m_logger->log("System RAM throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputSystemRAM);
 
-			auto loadPrecompiledShader = [&]<core::StringLiteral ShaderKey>() -> smart_refctd_ptr<IShader>
-			{
-				// this time we load a shader directly from a file
-				IAssetLoader::SAssetLoadParams lp = {};
-				lp.logger = m_logger.get();
-				lp.workingDirectory = "app_resources"; // virtual root
-				auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(limits, m_physicalDevice->getFeatures());
-				auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
-				const auto assets = assetBundle.getContents();
-				if (assets.empty())
-				{
-					logFail("Could not load shader!");
-					return nullptr;
-				}
+		m_device->waitIdle();
 
-				auto shader = IAsset::castDown<IShader>(assets[0]);
-				// The down-cast should not fail!
-				assert(shader);
-			
-				// There's two ways of doing stuff like this:
-				// 1. this - modifying the asset after load
-				// 2. creating a short shader source file that includes the asset you would have wanted to load
-				// 
-				//auto overrideSource = CHLSLCompiler::createOverridenCopy(
-				//	source.get(), "#define WorkgroupSize %d\n#define BucketCount %d\n",
-				//	WorkgroupSize, bucket_count
-				//);
-
-				// this time we skip the use of the asset converter since the IShader->IGPUShader path is quick and simple
-				return shader;
-			};
-			auto prefixSumShader = loadPrecompiledShader.operator()<"prefix_sum_shader">(); // "app_resources/prefix_sum_shader.comp.hlsl"
-			auto scatterShader = loadPrecompiledShader.operator()<"scatter_shader">(); // "app_resources/scatter_shader.comp.hlsl"
-
-			// People love Reflection but I prefer Shader Sources instead!
-			const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(CountingPushData) };
-
-			// This time we'll have no Descriptor Sets or Layouts because our workload has a widely varying size
-			// and using traditional SSBO bindings would force us to update the Descriptor Set every frame.
-			// I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic
-			// only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding.
-			// Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size.
-			smart_refctd_ptr<IGPUPipelineLayout> layout;
-			smart_refctd_ptr<IGPUComputePipeline> prefixSumPipeline;
-			smart_refctd_ptr<IGPUComputePipeline> scatterPipeline;
-			{
-				layout = m_device->createPipelineLayout({ &pcRange,1 });
-				IGPUComputePipeline::SCreationParams params = {};
-				params.layout = layout.get();
-				params.shader.shader = prefixSumShader.get();
-				params.shader.entryPoint = "main";
-				params.shader.entries = nullptr;
-				params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(5);
-				params.cached.requireFullSubgroups = true;
-				if (!m_device->createComputePipelines(nullptr, { &params,1 }, &prefixSumPipeline))
-					return logFail("Failed to create compute pipeline!\n");
-				params.shader.shader = scatterShader.get();
-				if (!m_device->createComputePipelines(nullptr, { &params,1 }, &scatterPipeline))
-					return logFail("Failed to create compute pipeline!\n");
-			}
+		if (hostVisibleDeviceLocalBits)
+		{
+			m_logger->log("\nTesting Strategy 2: VRAM (ReBAR)", ILogger::ELL_INFO);
 
-			// Allocate memory
-			nbl::video::IDeviceMemoryAllocator::SAllocation allocation[5] = {};
-			smart_refctd_ptr<IGPUBuffer> buffers[5];
-			//smart_refctd_ptr<nbl::video::IGPUDescriptorSet> ds;
+			double throughputVRAM = 0.0;
 			{
-				auto build_buffer = [this](
-					smart_refctd_ptr<ILogicalDevice> m_device,
-					nbl::video::IDeviceMemoryAllocator::SAllocation *allocation,
-					smart_refctd_ptr<IGPUBuffer>& buffer,
-					size_t buffer_size,
-					const char *label
-				) -> void {
-					IGPUBuffer::SCreationParams params;
-					params.size = buffer_size;
-					params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-					buffer = m_device->createBuffer(std::move(params));
-					if (!buffer)
-						logFail("Failed to create GPU buffer of size %d!\n", buffer_size);
-
-					buffer->setObjectDebugName(label);
-
-					auto reqs = buffer->getMemoryReqs();
-					reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
-
-					*allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-					if (!allocation->isValid())
-						logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
-
-					assert(allocation->memory.get() == buffer->getBoundMemory().memory);
-				};
+				smart_refctd_ptr<IGPUBuffer> stagingBuffer;
+				IDeviceMemoryAllocator::SAllocation stagingAlloc;
+				void* mappedPtr = nullptr;
 
-				build_buffer(m_device,	allocation,		buffers[0], sizeof(uint32_t) * element_count,	"Input Key Buffer");
-				build_buffer(m_device,	allocation + 1,	buffers[1], sizeof(uint32_t) * element_count,	"Input Value Buffer");
-				build_buffer(m_device,	allocation + 2, buffers[2], sizeof(uint32_t) * bucket_count,	"Scratch Buffer");
-				build_buffer(m_device,	allocation + 3,	buffers[3], sizeof(uint32_t) * element_count,	"Output Key Buffer");
-				build_buffer(m_device,	allocation + 4, buffers[4], sizeof(uint32_t) * element_count,	"Output Value Buffer");
-			}
-			uint64_t buffer_device_address[] = {
-				buffers[0]->getDeviceAddress(),
-				buffers[1]->getDeviceAddress(),
-				buffers[2]->getDeviceAddress(),
-				buffers[3]->getDeviceAddress(),
-				buffers[4]->getDeviceAddress()
-			};
+				if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleDeviceLocalBits,
+					"Staging Buffer - VRAM (ReBAR)", stagingBuffer, stagingAlloc, mappedPtr))
+				{
+					return false;
+				}
 
-			void* mapped_memory[] = {
-				allocation[0].memory->map({0ull,allocation[0].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
-				allocation[1].memory->map({0ull,allocation[1].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
-				allocation[2].memory->map({0ull,allocation[2].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
-				allocation[3].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
-				allocation[4].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
-			};
-			if (!mapped_memory[0] || !mapped_memory[1] || !mapped_memory[2] || !mapped_memory[3] || !mapped_memory[4])
-				return logFail("Failed to map the Device Memory!\n");
-
-			// Generate random data
-			constexpr uint32_t minimum = 0;
-			const uint32_t range = bucket_count;
-			unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
-			std::mt19937 g(seed);
-
-			auto bufferData = new uint32_t[2][element_count];
-			for (uint32_t i = 0; i < element_count; i++) {
-				bufferData[0][i] = minimum + g() % range;
+				throughputVRAM = runBenchmark(
+					"VRAM (ReBAR)",
+					stagingBuffer.get(),
+					mappedPtr,
+					destinationImage.get(),
+					TILE_SIZE,
+					TILE_SIZE_BYTES,
+					TILES_PER_FRAME,
+					FRAMES_IN_FLIGHT,
+					TOTAL_FRAMES,
+					queue
+				);
+
+				stagingAlloc.memory->unmap();
 			}
 
-			memcpy(mapped_memory[0], bufferData[0], sizeof(uint32_t) * element_count);
-
-			for (uint32_t i = 0; i < element_count; i++) {
-				bufferData[1][i] = g() % std::numeric_limits<uint32_t>::max();
-			}
+			m_logger->log("VRAM throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputVRAM);
 
-			memcpy(mapped_memory[1], bufferData[1], sizeof(uint32_t) * element_count);
+			double speedup = throughputVRAM / throughputSystemRAM;
+			m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup);
+		}
 
-			std::string outBuffer;
-			for (auto i = 0; i < element_count; i++) {
-				outBuffer.append("{");
-				outBuffer.append(std::to_string(bufferData[0][i]));
-				outBuffer.append(", ");
-				outBuffer.append(std::to_string(bufferData[1][i]));
-				outBuffer.append("} ");
-			}
-			outBuffer.append("\n");
-			outBuffer.append("Count: ");
-			outBuffer.append(std::to_string(element_count));
-			outBuffer.append("\n");
-			m_logger->log("Your input array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
-
-			auto pc = CountingPushData {
-				.inputKeyAddress = buffer_device_address[0],
-				.inputValueAddress = buffer_device_address[1],
-				.histogramAddress = buffer_device_address[2],
-				.outputKeyAddress = buffer_device_address[3],
-				.outputValueAddress = buffer_device_address[4],
-				.dataElementCount = element_count,
-				.elementsPerWT = elements_per_thread,
-				.minimum = minimum,
-				.maximum = minimum + bucket_count - 1,
-			};
+		return true;
+	}
+
+	bool keepRunning() override { return false; }
+	void workLoopBody() override {}
+	bool onAppTerminated() override { return true; }
+
+protected:
+	core::vector<queue_req_t> getQueueRequirements() const override
+	{
+		using flags_t = IQueue::FAMILY_FLAGS;
+		return { {
+			.requiredFlags = flags_t::GRAPHICS_BIT,
+			.disallowedFlags = flags_t::NONE,
+			.queueCount = 1,
+			.maxImageTransferGranularity = {1, 1, 1}
+		} };
+	}
+
+private:
+	void transitionImageLayout(
+		IGPUCommandBuffer* cmdBuf,
+		IGPUImage* image,
+		IImage::LAYOUT oldLayout,
+		IImage::LAYOUT newLayout)
+	{
+		IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> barrier = {};
+		barrier.oldLayout = oldLayout;
+		barrier.newLayout = newLayout;
+		barrier.image = image;
+		barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+		barrier.subresourceRange.baseMipLevel = 0;
+		barrier.subresourceRange.levelCount = 1;
+		barrier.subresourceRange.baseArrayLayer = 0;
+		barrier.subresourceRange.layerCount = 1;
+		barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE;
+		barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+		barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS;
+		barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
+		cmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} });
+	}
+
+	void generateTileCopyRegions(
+		IImage::SBufferCopy* outRegions,
+		uint32_t tilesPerFrame,
+		uint32_t tileSize,
+		uint32_t tileSizeBytes,
+		uint32_t imageWidth)
+	{
+		uint32_t tilesPerRow = imageWidth / tileSize;
+		for (size_t i = 0; i < tilesPerFrame; i++)
+		{
+			uint32_t tileX = (i % tilesPerRow) * tileSize;
+			uint32_t tileY = (i / tilesPerRow) * tileSize;
+
+			outRegions[i].bufferOffset = i * tileSizeBytes;
+			outRegions[i].bufferRowLength = tileSize;
+			outRegions[i].bufferImageHeight = tileSize;
+			outRegions[i].imageOffset = { tileX, tileY, 0 };
+			outRegions[i].imageExtent = { tileSize, tileSize, 1 };
+			outRegions[i].imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			outRegions[i].imageSubresource.mipLevel = 0;
+			outRegions[i].imageSubresource.baseArrayLayer = 0;
+			outRegions[i].imageSubresource.layerCount = 1;
+		}
+	}
 
-			smart_refctd_ptr<nbl::video::IGPUCommandBuffer> cmdBuf;
-			{
-				smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-				if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdBuf))
-					return logFail("Failed to create Command Buffers!\n");
-			}
+	void generateRandomTileData(void* mappedPtr, uint32_t sizeBytes)
+	{
+		uint32_t* data = (uint32_t*)mappedPtr;
+		unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+		std::mt19937 g(seed);
+		const uint32_t valueCount = sizeBytes / sizeof(uint32_t);
 
-			// Create the Semaphore for prefix sum
-			constexpr uint64_t started_value = 0;
-			uint64_t timeline = started_value;
-			smart_refctd_ptr<ISemaphore> progress = m_device->createSemaphore(started_value);
+		auto bufferData = new uint32_t[valueCount];
 
-			cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			cmdBuf->beginDebugMarker("Prefix Sum Dispatch", core::vectorSIMDf(0, 1, 0, 1));
-			cmdBuf->bindComputePipeline(prefixSumPipeline.get());
-			cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
-			cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1);
-			cmdBuf->endDebugMarker();
-			cmdBuf->end();
+		for (uint32_t i = 0; i < valueCount; i++)
+		{
+			bufferData[i] = g();
+		}
+		memcpy(mappedPtr, bufferData, sizeBytes);
+		delete[] bufferData;
+	}
+
+	double runBenchmark(
+		const char* strategyName,
+		IGPUBuffer* stagingBuffer,
+		void* mappedPtr,
+		IGPUImage* destinationImage,
+		uint32_t tileSize,
+		uint32_t tileSizeBytes,
+		uint32_t tilesPerFrame,
+		uint32_t framesInFlight,
+		uint32_t totalFrames,
+		IQueue* queue)
+	{
+		smart_refctd_ptr<ISemaphore> timelineSemaphore = m_device->createSemaphore(0);
+
+		auto commandPools = new smart_refctd_ptr<IGPUCommandPool>[framesInFlight];
+		for (uint32_t i = 0; i < framesInFlight; i++)
+		{
+			commandPools[i] = m_device->createCommandPool(
+				queue->getFamilyIndex(),
+				IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT
+			);
+		}
 
-			{
-				auto queue = getComputeQueue();
+		auto commandBuffers = new smart_refctd_ptr<IGPUCommandBuffer>[framesInFlight];
+		for (uint32_t i = 0; i < framesInFlight; i++)
+		{
+			commandPools[i]->createCommandBuffers(
+				IGPUCommandPool::BUFFER_LEVEL::PRIMARY,
+				1,
+				&commandBuffers[i]
+			);
+		}
 
-				IQueue::SSubmitInfo submit_infos[1];
-				IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
-					{
-						.cmdbuf = cmdBuf.get()
-					}
-				};
-				submit_infos[0].commandBuffers = cmdBufs;
-				IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
-					{
-						.semaphore = progress.get(),
-						.value = ++timeline,
-						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
-					}
-				};
-				submit_infos[0].signalSemaphores = signals;
+		uint64_t timelineValue = 0;
 
-				m_api->startCapture();
-				queue->submit(submit_infos);
-				m_api->endCapture();
-			}
+		commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+		transitionImageLayout(
+			commandBuffers[0].get(),
+			destinationImage,
+			IImage::LAYOUT::UNDEFINED,
+			IImage::LAYOUT::TRANSFER_DST_OPTIMAL
+		);
+		commandBuffers[0]->end();
 
-			const ISemaphore::SWaitInfo wait_infos[] = { {
-					.semaphore = progress.get(),
-					.value = timeline
-				} };
-			m_device->blockForSemaphores(wait_infos);
+		IQueue::SSubmitInfo submitInfo = {};
+		IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = commandBuffers[0].get() };
+		submitInfo.commandBuffers = { &cmdBufInfo, 1 };
 
-			// Create the Semaphore for Scatter
-			uint64_t timeline2 = started_value;
-			smart_refctd_ptr<ISemaphore> progress2 = m_device->createSemaphore(started_value);
+		IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
+			.semaphore = timelineSemaphore.get(),
+			.value = ++timelineValue,
+			.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+		};
+		submitInfo.signalSemaphores = { &signalInfo, 1 };
 
-			cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			cmdBuf->beginDebugMarker("Scatter Dispatch", core::vectorSIMDf(0, 1, 0, 1));
-			cmdBuf->bindComputePipeline(scatterPipeline.get());
-			cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
-			cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1);
-			cmdBuf->endDebugMarker();
-			cmdBuf->end();
+		queue->submit({ &submitInfo, 1 });
 
-			{
-				auto queue = getComputeQueue();
+		ISemaphore::SWaitInfo waitInfo = {
+			.semaphore = timelineSemaphore.get(),
+			.value = timelineValue
+		};
+		m_device->blockForSemaphores({ &waitInfo, 1 });
 
-				IQueue::SSubmitInfo submit_infos[1];
-				IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
-					{
-						.cmdbuf = cmdBuf.get()
-					}
-				};
-				submit_infos[0].commandBuffers = cmdBufs;
-				IQueue::SSubmitInfo::SSemaphoreInfo waits[] = {
-					{
-						.semaphore = progress.get(),
-						.value = timeline,
-						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
-					}
-				};
-				submit_infos[0].waitSemaphores = waits;
-				IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
-					{
-						.semaphore = progress2.get(),
-						.value = ++timeline2,
-						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
-					}
-				};
-				submit_infos[0].signalSemaphores = signals;
+		auto regions = new IImage::SBufferCopy[tilesPerFrame];
 
-				m_api->startCapture();
-				queue->submit(submit_infos);
-				m_api->endCapture();
-			}
+		generateRandomTileData(mappedPtr, tilesPerFrame * tileSizeBytes);
 
-			const ISemaphore::SWaitInfo wait_infos2[] = {{
-					.semaphore = progress2.get(),
-					.value = timeline2
-				} };
-			m_device->blockForSemaphores(wait_infos2);
-
-			const ILogicalDevice::MappedMemoryRange memory_range[] = {
-				ILogicalDevice::MappedMemoryRange(allocation[0].memory.get(), 0ull, allocation[0].memory->getAllocationSize()),
-				ILogicalDevice::MappedMemoryRange(allocation[1].memory.get(), 0ull, allocation[1].memory->getAllocationSize()),
-				ILogicalDevice::MappedMemoryRange(allocation[2].memory.get(), 0ull, allocation[2].memory->getAllocationSize()),
-				ILogicalDevice::MappedMemoryRange(allocation[3].memory.get(), 0ull, allocation[3].memory->getAllocationSize()),
-				ILogicalDevice::MappedMemoryRange(allocation[4].memory.get(), 0ull, allocation[4].memory->getAllocationSize())
-			};
+		uint32_t imageWidth = destinationImage->getCreationParameters().extent.width;
+		generateTileCopyRegions(regions, tilesPerFrame, tileSize, tileSizeBytes, imageWidth);
 
-			if (!allocation[0].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-				m_device->invalidateMappedMemoryRanges(1, &memory_range[0]);
-			if (!allocation[1].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-				m_device->invalidateMappedMemoryRanges(1, &memory_range[1]);
-			if (!allocation[2].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-				m_device->invalidateMappedMemoryRanges(1, &memory_range[2]);
-			if (!allocation[3].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-				m_device->invalidateMappedMemoryRanges(1, &memory_range[3]);
-			if (!allocation[4].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-				m_device->invalidateMappedMemoryRanges(1, &memory_range[4]);
-
-			const uint32_t* buffData[] = {
-				reinterpret_cast<const uint32_t*>(allocation[2].memory->getMappedPointer()),
-				reinterpret_cast<const uint32_t*>(allocation[3].memory->getMappedPointer()),
-				reinterpret_cast<const uint32_t*>(allocation[4].memory->getMappedPointer())
-			};
+		auto startTime = std::chrono::high_resolution_clock::now();
 
-			assert(allocation[2].offset == 0); // simpler than writing out all the pointer arithmetic
-			assert(allocation[3].offset == 0); // simpler than writing out all the pointer arithmetic
-			assert(allocation[4].offset == 0); // simpler than writing out all the pointer arithmetic
+		for (uint32_t frame = 0; frame < totalFrames; frame++)
+		{
+			uint32_t cmdBufIndex = frame % framesInFlight;
 
-			outBuffer.clear();
-			for (auto i = 0; i < bucket_count; i++) {
-				outBuffer.append(std::to_string(buffData[0][i]));
-				outBuffer.append(" ");
-			}
-			outBuffer.append("\n");
+			commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 
-			m_logger->log("Scratch buffer is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
+			commandBuffers[cmdBufIndex]->copyBufferToImage(
+				stagingBuffer,
+				destinationImage,
+				IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+				tilesPerFrame,
+				regions
+			);
 
-			outBuffer.clear();
-			for (auto i = 0; i < element_count; i++) {
-				outBuffer.append("{");
-				outBuffer.append(std::to_string(buffData[1][i]));
-				outBuffer.append(", ");
-				outBuffer.append(std::to_string(buffData[2][i]));
-				outBuffer.append("} ");
-			}
-			outBuffer.append("\n");
-			outBuffer.append("Count: ");
-			outBuffer.append(std::to_string(element_count));
-			outBuffer.append("\n");
-			m_logger->log("Your output array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
+			commandBuffers[cmdBufIndex]->end();
 
-			allocation[0].memory->unmap();
-			allocation[1].memory->unmap();
-			allocation[2].memory->unmap();
-			allocation[3].memory->unmap();
-			allocation[4].memory->unmap();
+			// Create submit info for THIS frame
+			IQueue::SSubmitInfo frameSubmitInfo = {};
+			IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = {.cmdbuf = commandBuffers[cmdBufIndex].get()};
+			frameSubmitInfo.commandBuffers = {&frameCmdBufInfo, 1};
 
-			m_device->waitIdle();
+			IQueue::SSubmitInfo::SSemaphoreInfo frameSignalInfo = {
+				.semaphore = timelineSemaphore.get(),
+				.value = ++timelineValue,
+				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+			};
+			frameSubmitInfo.signalSemaphores = {&frameSignalInfo, 1};
 
-			delete[] bufferData;
+			// Submit to GPU
+			queue->submit({&frameSubmitInfo, 1});
 
-			return true;
+			// Wait for old frames 
+			if (frame >= framesInFlight)
+			{
+				ISemaphore::SWaitInfo frameWaitInfo = {
+					.semaphore = timelineSemaphore.get(),
+					.value = timelineValue - framesInFlight
+				};
+				m_device->blockForSemaphores({&frameWaitInfo, 1});
+			}
 		}
 
-		// Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script)
-		bool keepRunning() override { return false; }
-
-		// Finally the first actual work-loop
-		void workLoopBody() override {}
-
-		bool onAppTerminated() override { return true; }
+		// Wait for all remaining frames to complete
+		ISemaphore::SWaitInfo finalWait = {
+			.semaphore = timelineSemaphore.get(),
+			.value = timelineValue
+		};
+		m_device->blockForSemaphores({&finalWait, 1});
+
+		auto endTime = std::chrono::high_resolution_clock::now();
+
+		delete[] regions;
+		delete[] commandPools;
+		delete[] commandBuffers;
+
+		// Calculate throughput
+		double elapsedSeconds = std::chrono::duration<double>(endTime - startTime).count();
+		uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes;
+		double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds;
+
+		return throughputGBps;
+	}
+
+	bool createStagingBuffer(
+		uint32_t bufferSize,
+		uint32_t memoryTypeBits,
+		const char* debugName,
+		smart_refctd_ptr<IGPUBuffer>& outBuffer,
+		IDeviceMemoryAllocator::SAllocation& outAllocation,
+		void*& outMappedPtr)
+	{
+		IGPUBuffer::SCreationParams params;
+		params.size = bufferSize;
+		params.usage = IGPUBuffer::EUF_TRANSFER_SRC_BIT;
+		outBuffer = m_device->createBuffer(std::move(params));
+		if (!outBuffer)
+			return logFail("Failed to create GPU buffer of size %d!\n", bufferSize);
+
+		outBuffer->setObjectDebugName(debugName);
+
+		auto reqs = outBuffer->getMemoryReqs();
+		reqs.memoryTypeBits &= memoryTypeBits;
+
+		outAllocation = m_device->allocate(reqs, outBuffer.get(), IDeviceMemoryAllocation::EMAF_NONE);
+		if (!outAllocation.isValid())
+			return logFail("Failed to allocate Device Memory!\n");
+
+		outMappedPtr = outAllocation.memory->map({0ull, outAllocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ);
+		if (!outMappedPtr)
+			return logFail("Failed to map Device Memory!\n");
+
+		return true;
+	}
 };
 
-
-NBL_MAIN_FUNC(CountingSortApp)
\ No newline at end of file
+NBL_MAIN_FUNC(ImageUploadBenchmarkApp)

From 141295bee833de2fb97bc1ef1e7e8bc8980a643c Mon Sep 17 00:00:00 2001
From: CrabeExtra <abbasgaroosi7@gmail.com>
Date: Wed, 24 Dec 2025 21:09:51 +0330
Subject: [PATCH 3/5] Measurement was weird, added some detail and also fixed
 a bug related to FIF

---
 73_ImageUploadBenchmark/main.cpp | 123 +++++++++++++++++++------------
 1 file changed, 77 insertions(+), 46 deletions(-)

diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp
index 68815681d..eceb0f9ea 100644
--- a/73_ImageUploadBenchmark/main.cpp
+++ b/73_ImageUploadBenchmark/main.cpp
@@ -28,8 +28,8 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		constexpr uint32_t TILE_BYTES_PER_PIXEL = 4;
 		constexpr uint32_t TILE_SIZE_BYTES = TILE_SIZE * TILE_SIZE * TILE_BYTES_PER_PIXEL;
 		constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024;
-		constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / TILE_SIZE_BYTES;
 		constexpr uint32_t FRAMES_IN_FLIGHT = 4;
+		constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT);
 		constexpr uint32_t TOTAL_FRAMES = 1000;
 
 		m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_INFO);
@@ -40,12 +40,20 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 
 		uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits();
 		uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits();
-		uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits;
+		uint32_t hostCachedBits = m_physicalDevice->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_CACHED_BIT);
+
+		uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits & ~hostCachedBits;
+
 		uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits;
 
+		m_logger->log("Memory type bits - HostVisible: 0x%X, DeviceLocal: 0x%X, HostCached: 0x%X",
+			ILogger::ELL_INFO, hostVisibleBits, deviceLocalBits, hostCachedBits);
+		m_logger->log("System RAM (non-cached): 0x%X, VRAM: 0x%X",
+			ILogger::ELL_INFO, hostVisibleOnlyBits, hostVisibleDeviceLocalBits);
+
 		if (!hostVisibleOnlyBits)
 		{
-			m_logger->log("HOST_VISIBLE memory types not found!", ILogger::ELL_ERROR);
+			m_logger->log("HOST_VISIBLE non-cached memory types not found!", ILogger::ELL_ERROR);
 			return false;
 		}
 
@@ -122,7 +130,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 
 		if (hostVisibleDeviceLocalBits)
 		{
-			m_logger->log("\nTesting Strategy 2: VRAM (ReBAR)", ILogger::ELL_INFO);
+			m_logger->log("\nTesting Strategy 2: VRAM", ILogger::ELL_INFO);
 
 			double throughputVRAM = 0.0;
 			{
@@ -131,13 +139,13 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 				void* mappedPtr = nullptr;
 
 				if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleDeviceLocalBits,
-					"Staging Buffer - VRAM (ReBAR)", stagingBuffer, stagingAlloc, mappedPtr))
+					"Staging Buffer - VRAM", stagingBuffer, stagingAlloc, mappedPtr))
 				{
 					return false;
 				}
 
 				throughputVRAM = runBenchmark(
-					"VRAM (ReBAR)",
+					"VRAM",
 					stagingBuffer.get(),
 					mappedPtr,
 					destinationImage.get(),
@@ -205,7 +213,8 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		uint32_t tilesPerFrame,
 		uint32_t tileSize,
 		uint32_t tileSizeBytes,
-		uint32_t imageWidth)
+		uint32_t imageWidth,
+		uint32_t bufferBaseOffset)
 	{
 		uint32_t tilesPerRow = imageWidth / tileSize;
 		for (size_t i = 0; i < tilesPerFrame; i++)
@@ -213,7 +222,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			uint32_t tileX = (i % tilesPerRow) * tileSize;
 			uint32_t tileY = (i / tilesPerRow) * tileSize;
 
-			outRegions[i].bufferOffset = i * tileSizeBytes;
+			outRegions[i].bufferOffset = bufferBaseOffset + (i * tileSizeBytes);
 			outRegions[i].bufferRowLength = tileSize;
 			outRegions[i].bufferImageHeight = tileSize;
 			outRegions[i].imageOffset = { tileX, tileY, 0 };
@@ -225,23 +234,6 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		}
 	}
 
-	void generateRandomTileData(void* mappedPtr, uint32_t sizeBytes)
-	{
-		uint32_t* data = (uint32_t*)mappedPtr;
-		unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
-		std::mt19937 g(seed);
-		const uint32_t valueCount = sizeBytes / sizeof(uint32_t);
-
-		auto bufferData = new uint32_t[valueCount];
-
-		for (uint32_t i = 0; i < valueCount; i++)
-		{
-			bufferData[i] = g();
-		}
-		memcpy(mappedPtr, bufferData, sizeBytes);
-		delete[] bufferData;
-	}
-
 	double runBenchmark(
 		const char* strategyName,
 		IGPUBuffer* stagingBuffer,
@@ -305,12 +297,31 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		};
 		m_device->blockForSemaphores({ &waitInfo, 1 });
 
-		auto regions = new IImage::SBufferCopy[tilesPerFrame];
+		uint32_t imageWidth = destinationImage->getCreationParameters().extent.width;
+		uint32_t partitionSize = tilesPerFrame * tileSizeBytes;
 
-		generateRandomTileData(mappedPtr, tilesPerFrame * tileSizeBytes);
+		// CPU source buffer with random data (generated once, reused each frame)
+		auto cpuSourceData = new uint8_t[partitionSize];
+		{
+			unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+			std::mt19937 g(seed);
+			uint32_t* data = reinterpret_cast<uint32_t*>(cpuSourceData);
+			for (uint32_t i = 0; i < partitionSize / sizeof(uint32_t); i++)
+				data[i] = g();
+		}
 
-		uint32_t imageWidth = destinationImage->getCreationParameters().extent.width;
-		generateTileCopyRegions(regions, tilesPerFrame, tileSize, tileSizeBytes, imageWidth);
+		auto regionsPerFrame = new IImage::SBufferCopy*[framesInFlight];
+		for (uint32_t i = 0; i < framesInFlight; i++)
+		{
+			regionsPerFrame[i] = new IImage::SBufferCopy[tilesPerFrame];
+			uint32_t bufferOffset = i * partitionSize;
+			generateTileCopyRegions(regionsPerFrame[i], tilesPerFrame, tileSize, tileSizeBytes, imageWidth, bufferOffset);
+		}
+
+		double totalWaitTime = 0.0;
+		double totalMemcpyTime = 0.0;
+		double totalRecordTime = 0.0;
+		double totalSubmitTime = 0.0;
 
 		auto startTime = std::chrono::high_resolution_clock::now();
 
@@ -318,19 +329,35 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		{
 			uint32_t cmdBufIndex = frame % framesInFlight;
 
-			commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			auto t1 = std::chrono::high_resolution_clock::now();
+			if (frame >= framesInFlight)
+			{
+				ISemaphore::SWaitInfo frameWaitInfo = {
+					.semaphore = timelineSemaphore.get(),
+					.value = timelineValue - framesInFlight + 1
+				};
+				m_device->blockForSemaphores({&frameWaitInfo, 1});
+			}
+			auto t2 = std::chrono::high_resolution_clock::now();
+
+			commandPools[cmdBufIndex]->reset();
+
+			uint32_t bufferOffset = cmdBufIndex * partitionSize;
+			void* targetPtr = static_cast<uint8_t*>(mappedPtr) + bufferOffset;
+			memcpy(targetPtr, cpuSourceData, partitionSize);
+			auto t3 = std::chrono::high_resolution_clock::now();
 
+			commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 			commandBuffers[cmdBufIndex]->copyBufferToImage(
 				stagingBuffer,
 				destinationImage,
 				IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
 				tilesPerFrame,
-				regions
+				regionsPerFrame[cmdBufIndex]
 			);
-
 			commandBuffers[cmdBufIndex]->end();
+			auto t4 = std::chrono::high_resolution_clock::now();
 
-			// Create submit info for THIS frame
 			IQueue::SSubmitInfo frameSubmitInfo = {};
 			IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = {.cmdbuf = commandBuffers[cmdBufIndex].get()};
 			frameSubmitInfo.commandBuffers = {&frameCmdBufInfo, 1};
@@ -342,18 +369,13 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			};
 			frameSubmitInfo.signalSemaphores = {&frameSignalInfo, 1};
 
-			// Submit to GPU
 			queue->submit({&frameSubmitInfo, 1});
+			auto t5 = std::chrono::high_resolution_clock::now();
 
-			// Wait for old frames 
-			if (frame >= framesInFlight)
-			{
-				ISemaphore::SWaitInfo frameWaitInfo = {
-					.semaphore = timelineSemaphore.get(),
-					.value = timelineValue - framesInFlight
-				};
-				m_device->blockForSemaphores({&frameWaitInfo, 1});
-			}
+			totalWaitTime += std::chrono::duration<double>(t2 - t1).count();
+			totalMemcpyTime += std::chrono::duration<double>(t3 - t2).count();
+			totalRecordTime += std::chrono::duration<double>(t4 - t3).count();
+			totalSubmitTime += std::chrono::duration<double>(t5 - t4).count();
 		}
 
 		// Wait for all remaining frames to complete
@@ -365,15 +387,24 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 
 		auto endTime = std::chrono::high_resolution_clock::now();
 
-		delete[] regions;
+		delete[] cpuSourceData;
+		for (uint32_t i = 0; i < framesInFlight; i++)
+			delete[] regionsPerFrame[i];
+		delete[] regionsPerFrame;
 		delete[] commandPools;
 		delete[] commandBuffers;
 
-		// Calculate throughput
 		double elapsedSeconds = std::chrono::duration<double>(endTime - startTime).count();
 		uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes;
 		double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds;
 
+		m_logger->log("  Timing breakdown for %s:", ILogger::ELL_INFO, strategyName);
+		m_logger->log("    Wait time:   %.3f s (%.1f%%)", ILogger::ELL_INFO, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds);
+		m_logger->log("    Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds);
+		m_logger->log("    Record time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds);
+		m_logger->log("    Submit time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds);
+		m_logger->log("    Memcpy speed: %.2f GB/s", ILogger::ELL_INFO, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime);
+
 		return throughputGBps;
 	}
 
@@ -401,7 +432,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		if (!outAllocation.isValid())
 			return logFail("Failed to allocate Device Memory!\n");
 
-		outMappedPtr = outAllocation.memory->map({0ull, outAllocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ);
+		outMappedPtr = outAllocation.memory->map({0ull, outAllocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_WRITE);
 		if (!outMappedPtr)
 			return logFail("Failed to map Device Memory!\n");
 

From 874814af7c8dd08c264afbdebef1e0719561dffe Mon Sep 17 00:00:00 2001
From: CrabeExtra <abbasgaroosi7@gmail.com>
Date: Wed, 31 Dec 2025 16:29:19 +0330
Subject: [PATCH 4/5] Resolved PR comments + adding timestamp query

---
 73_ImageUploadBenchmark/main.cpp | 159 +++++++++++++++++++++----------
 1 file changed, 110 insertions(+), 49 deletions(-)

diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp
index eceb0f9ea..f8124c9ab 100644
--- a/73_ImageUploadBenchmark/main.cpp
+++ b/73_ImageUploadBenchmark/main.cpp
@@ -1,5 +1,6 @@
 #include "nbl/examples/examples.hpp"
 #include <chrono>
+#include <thread>
 
 using namespace nbl;
 using namespace nbl::core;
@@ -68,8 +69,9 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		{
 			IGPUImage::SCreationParams imgParams{};
 			imgParams.type = IImage::E_TYPE::ET_2D;
-			imgParams.extent.width = TILE_SIZE * 32;
-			imgParams.extent.height = TILE_SIZE * 32;
+			const uint32_t tilePerRow = static_cast<uint32_t>(std::ceil(std::sqrt(static_cast<double>(TILES_PER_FRAME)))); // round up, not truncate: keeps tilePerRow * tilePerRow >= TILES_PER_FRAME when the tile count is not a perfect square
+			imgParams.extent.width = TILE_SIZE * tilePerRow; // square image, tilePerRow tiles wide
+			imgParams.extent.height = TILE_SIZE * tilePerRow; // and tilePerRow tiles tall
 			imgParams.extent.depth = 1u;
 			imgParams.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM;
 			imgParams.mipLevels = 1u;
@@ -111,6 +113,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			throughputSystemRAM = runBenchmark(
 				"System RAM",
 				stagingBuffer.get(),
+				stagingAlloc,
 				mappedPtr,
 				destinationImage.get(),
 				TILE_SIZE,
@@ -147,6 +150,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 				throughputVRAM = runBenchmark(
 					"VRAM",
 					stagingBuffer.get(),
+					stagingAlloc,
 					mappedPtr,
 					destinationImage.get(),
 					TILE_SIZE,
@@ -166,6 +170,9 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup);
 		}
 
+		m_logger->log("\nWaiting 5 seconds before exit...", ILogger::ELL_INFO);
+		std::this_thread::sleep_for(std::chrono::seconds(5));
+
 		return true;
 	}
 
@@ -186,28 +193,6 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 	}
 
 private:
-	void transitionImageLayout(
-		IGPUCommandBuffer* cmdBuf,
-		IGPUImage* image,
-		IImage::LAYOUT oldLayout,
-		IImage::LAYOUT newLayout)
-	{
-		IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> barrier = {};
-		barrier.oldLayout = oldLayout;
-		barrier.newLayout = newLayout;
-		barrier.image = image;
-		barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
-		barrier.subresourceRange.baseMipLevel = 0;
-		barrier.subresourceRange.levelCount = 1;
-		barrier.subresourceRange.baseArrayLayer = 0;
-		barrier.subresourceRange.layerCount = 1;
-		barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE;
-		barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
-		barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS;
-		barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
-		cmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} });
-	}
-
 	void generateTileCopyRegions(
 		IImage::SBufferCopy* outRegions,
 		uint32_t tilesPerFrame,
@@ -237,6 +222,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 	double runBenchmark(
 		const char* strategyName,
 		IGPUBuffer* stagingBuffer,
+		IDeviceMemoryAllocator::SAllocation& stagingAlloc,
 		void* mappedPtr,
 		IGPUImage* destinationImage,
 		uint32_t tileSize,
@@ -248,7 +234,16 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 	{
 		smart_refctd_ptr<ISemaphore> timelineSemaphore = m_device->createSemaphore(0);
 
-		auto commandPools = new smart_refctd_ptr<IGPUCommandPool>[framesInFlight];
+		smart_refctd_ptr<IQueryPool> queryPool; // timestamp queries written around each frame's buffer-to-image copy
+		{
+			IQueryPool::SCreationParams queryPoolParams = {};
+			queryPoolParams.queryType = IQueryPool::TYPE::TIMESTAMP;
+			queryPoolParams.queryCount = framesInFlight * 2;  // two queries (start, end) per in-flight command buffer
+			queryPoolParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE;
+			queryPool = m_device->createQueryPool(queryPoolParams); // NOTE(review): no null check on the result is visible in this patch
+		}
+		
+		std::vector<smart_refctd_ptr<IGPUCommandPool>> commandPools(framesInFlight); // one command pool per in-flight frame
 		for (uint32_t i = 0; i < framesInFlight; i++)
 		{
 			commandPools[i] = m_device->createCommandPool(
@@ -256,8 +251,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 				IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT
 			);
 		}
-
-		auto commandBuffers = new smart_refctd_ptr<IGPUCommandBuffer>[framesInFlight];
+		std::vector<smart_refctd_ptr<IGPUCommandBuffer>> commandBuffers(framesInFlight);
 		for (uint32_t i = 0; i < framesInFlight; i++)
 		{
 			commandPools[i]->createCommandBuffers(
@@ -270,12 +264,22 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		uint64_t timelineValue = 0;
 
 		commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-		transitionImageLayout(
-			commandBuffers[0].get(),
-			destinationImage,
-			IImage::LAYOUT::UNDEFINED,
-			IImage::LAYOUT::TRANSFER_DST_OPTIMAL
-		);
+		{ // one-time barrier recorded into commandBuffers[0]: moves the destination image out of UNDEFINED before any copy is recorded
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> initBarrier = {};
+			initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED;
+			initBarrier.newLayout = IImage::LAYOUT::GENERAL; // same layout the per-frame copyBufferToImage call passes
+			initBarrier.image = destinationImage;
+			initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			initBarrier.subresourceRange.baseMipLevel = 0;
+			initBarrier.subresourceRange.levelCount = 1;
+			initBarrier.subresourceRange.baseArrayLayer = 0;
+			initBarrier.subresourceRange.layerCount = 1;
+			initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; // no source access or stage is named for this first transition
+			initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+			initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE;
+			initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; // destination scope: transfer writes at the copy stage
+			commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&initBarrier, 1}});
+		}
 		commandBuffers[0]->end();
 
 		IQueue::SSubmitInfo submitInfo = {};
@@ -300,22 +304,20 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		uint32_t imageWidth = destinationImage->getCreationParameters().extent.width;
 		uint32_t partitionSize = tilesPerFrame * tileSizeBytes;
 
-		// CPU source buffer with random data (generated once, reused each frame)
-		auto cpuSourceData = new uint8_t[partitionSize];
+		std::vector<uint8_t> cpuSourceData(partitionSize);
 		{
 			unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
 			std::mt19937 g(seed);
-			uint32_t* data = reinterpret_cast<uint32_t*>(cpuSourceData);
+			uint32_t* data = reinterpret_cast<uint32_t*>(cpuSourceData.data());
 			for (uint32_t i = 0; i < partitionSize / sizeof(uint32_t); i++)
 				data[i] = g();
 		}
-
-		auto regionsPerFrame = new IImage::SBufferCopy*[framesInFlight];
+		std::vector<std::vector<IImage::SBufferCopy>> regionsPerFrame(framesInFlight);
 		for (uint32_t i = 0; i < framesInFlight; i++)
 		{
-			regionsPerFrame[i] = new IImage::SBufferCopy[tilesPerFrame];
+			regionsPerFrame[i].resize(tilesPerFrame);
 			uint32_t bufferOffset = i * partitionSize;
-			generateTileCopyRegions(regionsPerFrame[i], tilesPerFrame, tileSize, tileSizeBytes, imageWidth, bufferOffset);
+			generateTileCopyRegions(regionsPerFrame[i].data(), tilesPerFrame, tileSize, tileSizeBytes, imageWidth, bufferOffset);
 		}
 
 		double totalWaitTime = 0.0;
@@ -344,17 +346,63 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 
 			uint32_t bufferOffset = cmdBufIndex * partitionSize;
 			void* targetPtr = static_cast<uint8_t*>(mappedPtr) + bufferOffset;
-			memcpy(targetPtr, cpuSourceData, partitionSize);
+			memcpy(targetPtr, cpuSourceData.data(), partitionSize);
+
+			if (!stagingAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) // staging memory without the HOST_COHERENT flag gets an explicit flush of what memcpy just wrote
+			{
+				ILogicalDevice::MappedMemoryRange range(stagingAlloc.memory.get(), bufferOffset, partitionSize); // only this frame's partition of the staging buffer
+				m_device->flushMappedMemoryRanges(1, &range); // NOTE(review): runs before t3 is sampled, so its cost appears to land in the memcpy interval - confirm that is intended
+			}
+
 			auto t3 = std::chrono::high_resolution_clock::now();
 
 			commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+			uint32_t queryStartIndex = cmdBufIndex * 2; // this command buffer's pair of queries: +0 = start, +1 = end
+			commandBuffers[cmdBufIndex]->resetQueryPool(queryPool.get(), queryStartIndex, 2); // reset both queries before they are written again
+
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> barrier = {}; // layout-preserving barrier (GENERAL -> GENERAL) between earlier transfer writes to the image and this frame's copy
+			barrier.oldLayout = IImage::LAYOUT::GENERAL;
+			barrier.newLayout = IImage::LAYOUT::GENERAL;
+			barrier.image = destinationImage;
+			barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			barrier.subresourceRange.baseMipLevel = 0;
+			barrier.subresourceRange.levelCount = 1;
+			barrier.subresourceRange.baseArrayLayer = 0;
+			barrier.subresourceRange.layerCount = 1;
+			barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; // write-after-write: transfer writes on both sides
+			barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+			barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
+			barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
+			commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&barrier, 1}});
+
+			commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 0); // start timestamp, recorded immediately before the copy
+
 			commandBuffers[cmdBufIndex]->copyBufferToImage(
 				stagingBuffer,
 				destinationImage,
-				IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+				IImage::LAYOUT::GENERAL,
 				tilesPerFrame,
-				regionsPerFrame[cmdBufIndex]
+				regionsPerFrame[cmdBufIndex].data()
 			);
+
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> afterBarrier = {}; // layout-preserving barrier (GENERAL -> GENERAL) from this frame's transfer writes to shader reads
+			afterBarrier.oldLayout = IImage::LAYOUT::GENERAL;
+			afterBarrier.newLayout = IImage::LAYOUT::GENERAL;
+			afterBarrier.image = destinationImage;
+			afterBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			afterBarrier.subresourceRange.baseMipLevel = 0;
+			afterBarrier.subresourceRange.levelCount = 1;
+			afterBarrier.subresourceRange.baseArrayLayer = 0;
+			afterBarrier.subresourceRange.layerCount = 1;
+			afterBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+			afterBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS;
+			afterBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
+			afterBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT; // NOTE(review): fragment-shader destination stage - confirm the queue family used for submission supports it
+			commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&afterBarrier, 1}});
+
+			commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 1); // end timestamp; recorded after the barrier above, so the measured interval spans copy plus barrier
+
 			commandBuffers[cmdBufIndex]->end();
 			auto t4 = std::chrono::high_resolution_clock::now();
 
@@ -387,17 +435,30 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 
 		auto endTime = std::chrono::high_resolution_clock::now();
 
-		delete[] cpuSourceData;
-		for (uint32_t i = 0; i < framesInFlight; i++)
-			delete[] regionsPerFrame[i];
-		delete[] regionsPerFrame;
-		delete[] commandPools;
-		delete[] commandBuffers;
+		std::vector<uint64_t> timestamps(framesInFlight * 2); // host copy of every slot's (start, end) pair
+		const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); // 64-bit results requested together with WAIT_BIT
+		m_device->getQueryPoolResults(queryPool.get(), 0, framesInFlight * 2, timestamps.data(), sizeof(uint64_t), flags); // NOTE(review): the return value is ignored
+		uint64_t totalGpuTicks = 0;
+		for (uint32_t i = 0; i < framesInFlight; i++) { // sum the end - start tick delta of each slot
+			uint64_t startTick = timestamps[i * 2 + 0];
+			uint64_t endTick = timestamps[i * 2 + 1];
+			totalGpuTicks += (endTick - startTick);
+		}
+		float timestampPeriod = m_physicalDevice->getLimits().timestampPeriodInNanoSeconds;
+		double sampledGpuTimeSeconds = (totalGpuTicks * timestampPeriod) / 1e9; // ticks -> nanoseconds -> seconds
+
+		double avgGpuTimePerFrame = sampledGpuTimeSeconds / framesInFlight; // mean over the framesInFlight sampled slots
+		double totalGpuTimeSeconds = avgGpuTimePerFrame * totalFrames; // extrapolated to the whole run, not measured for every frame
+
 
 		double elapsedSeconds = std::chrono::duration<double>(endTime - startTime).count();
 		uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes;
+
 		double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds;
 
+		m_logger->log("    GPU time: %.3f s", ILogger::ELL_INFO, totalGpuTimeSeconds);
+		m_logger->log("    GPU throughput: %.2f GB/s", ILogger::ELL_INFO, throughputGBps);
+
 		m_logger->log("  Timing breakdown for %s:", ILogger::ELL_INFO, strategyName);
 		m_logger->log("    Wait time:   %.3f s (%.1f%%)", ILogger::ELL_INFO, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds);
 		m_logger->log("    Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds);

From ddb7bfc6ae5889aea89db756b461a0beeb763d0f Mon Sep 17 00:00:00 2001
From: CrabeExtra <abbasgaroosi7@gmail.com>
Date: Thu, 1 Jan 2026 17:01:15 +0330
Subject: [PATCH 5/5] Adding more logs to release build

---
 73_ImageUploadBenchmark/main.cpp | 36 ++++++++++++++++----------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp
index f8124c9ab..ff38b1555 100644
--- a/73_ImageUploadBenchmark/main.cpp
+++ b/73_ImageUploadBenchmark/main.cpp
@@ -33,11 +33,11 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT);
 		constexpr uint32_t TOTAL_FRAMES = 1000;
 
-		m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_INFO);
-		m_logger->log("Tile size: %ux%u (%u KB)", ILogger::ELL_INFO, TILE_SIZE, TILE_SIZE, TILE_SIZE_BYTES / 1024);
-		m_logger->log("Staging buffer: %u MB", ILogger::ELL_INFO, STAGING_BUFFER_SIZE / (1024 * 1024));
-		m_logger->log("Tiles per frame: %u", ILogger::ELL_INFO, TILES_PER_FRAME);
-		m_logger->log("Frames in flight: %u", ILogger::ELL_INFO, FRAMES_IN_FLIGHT);
+		m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_PERFORMANCE);
+		m_logger->log("Tile size: %ux%u (%u KB)", ILogger::ELL_PERFORMANCE, TILE_SIZE, TILE_SIZE, TILE_SIZE_BYTES / 1024);
+		m_logger->log("Staging buffer: %u MB", ILogger::ELL_PERFORMANCE, STAGING_BUFFER_SIZE / (1024 * 1024));
+		m_logger->log("Tiles per frame: %u", ILogger::ELL_PERFORMANCE, TILES_PER_FRAME);
+		m_logger->log("Frames in flight: %u", ILogger::ELL_PERFORMANCE, FRAMES_IN_FLIGHT);
 
 		uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits();
 		uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits();
@@ -48,9 +48,9 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits;
 
 		m_logger->log("Memory type bits - HostVisible: 0x%X, DeviceLocal: 0x%X, HostCached: 0x%X",
-			ILogger::ELL_INFO, hostVisibleBits, deviceLocalBits, hostCachedBits);
+			ILogger::ELL_PERFORMANCE, hostVisibleBits, deviceLocalBits, hostCachedBits);
 		m_logger->log("System RAM (non-cached): 0x%X, VRAM: 0x%X",
-			ILogger::ELL_INFO, hostVisibleOnlyBits, hostVisibleDeviceLocalBits);
+			ILogger::ELL_PERFORMANCE, hostVisibleOnlyBits, hostVisibleDeviceLocalBits);
 
 		if (!hostVisibleOnlyBits)
 		{
@@ -96,7 +96,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 				return logFail("Failed to allocate DEVICE_LOCAL memory for destination image!\n");
 		}
 
-		m_logger->log("\nTesting Strategy 1: System RAM", ILogger::ELL_INFO);
+		m_logger->log("\nStrategy 1: System RAM", ILogger::ELL_PERFORMANCE);
 
 		double throughputSystemRAM = 0.0;
 		{
@@ -133,7 +133,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 
 		if (hostVisibleDeviceLocalBits)
 		{
-			m_logger->log("\nTesting Strategy 2: VRAM", ILogger::ELL_INFO);
+			m_logger->log("\nStrategy 2: VRAM", ILogger::ELL_PERFORMANCE);
 
 			double throughputVRAM = 0.0;
 			{
@@ -170,7 +170,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup);
 		}
 
-		m_logger->log("\nWaiting 5 seconds before exit...", ILogger::ELL_INFO);
+		m_logger->log("\nWaiting 5 seconds before exit...", ILogger::ELL_PERFORMANCE);
 		std::this_thread::sleep_for(std::chrono::seconds(5));
 
 		return true;
@@ -456,15 +456,15 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 
 		double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds;
 
-		m_logger->log("    GPU time: %.3f s", ILogger::ELL_INFO, totalGpuTimeSeconds);
-		m_logger->log("    GPU throughput: %.2f GB/s", ILogger::ELL_INFO, throughputGBps);
+		m_logger->log("    GPU time: %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds); // extrapolated from the sampled timestamp queries
+		m_logger->log("    GPU throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, (totalGpuTimeSeconds > 0.0) ? (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalGpuTimeSeconds : 0.0); // bytes over GPU-timestamped time; the returned throughputGBps stays wall-clock based
 
-		m_logger->log("  Timing breakdown for %s:", ILogger::ELL_INFO, strategyName);
-		m_logger->log("    Wait time:   %.3f s (%.1f%%)", ILogger::ELL_INFO, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds);
-		m_logger->log("    Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds);
-		m_logger->log("    Record time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds);
-		m_logger->log("    Submit time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds);
-		m_logger->log("    Memcpy speed: %.2f GB/s", ILogger::ELL_INFO, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime);
+		m_logger->log("  Timing breakdown for %s:", ILogger::ELL_PERFORMANCE, strategyName);
+		m_logger->log("    Wait time:   %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds);
+		m_logger->log("    Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds);
+		m_logger->log("    Record time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds);
+		m_logger->log("    Submit time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds);
+		m_logger->log("    Memcpy speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime);
 
 		return throughputGBps;
 	}