diff --git a/73_ImageUploadBenchmark/CMakeLists.txt b/73_ImageUploadBenchmark/CMakeLists.txt new file mode 100644 index 000000000..2f9218f93 --- /dev/null +++ b/73_ImageUploadBenchmark/CMakeLists.txt @@ -0,0 +1,6 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file diff --git a/73_ImageUploadBenchmark/config.json.template b/73_ImageUploadBenchmark/config.json.template new file mode 100644 index 000000000..12215d0bb --- /dev/null +++ b/73_ImageUploadBenchmark/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp new file mode 100644 index 000000000..ff38b1555 --- /dev/null +++ b/73_ImageUploadBenchmark/main.cpp @@ -0,0 +1,504 @@ +#include "nbl/examples/examples.hpp" +#include +#include + +using namespace nbl; +using namespace nbl::core; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::video; +using namespace nbl::examples; + +class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication +{ + using device_base_t = application_templates::MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; + +public: + ImageUploadBenchmarkApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + constexpr uint32_t TILE_SIZE = 128; + constexpr uint32_t TILE_BYTES_PER_PIXEL = 4; + constexpr uint32_t TILE_SIZE_BYTES = TILE_SIZE * TILE_SIZE * TILE_BYTES_PER_PIXEL; + constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024; + constexpr uint32_t FRAMES_IN_FLIGHT = 4; + constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT); + constexpr uint32_t TOTAL_FRAMES = 1000; + + m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_PERFORMANCE); + m_logger->log("Tile size: %ux%u (%u KB)", ILogger::ELL_PERFORMANCE, TILE_SIZE, TILE_SIZE, TILE_SIZE_BYTES / 1024); + m_logger->log("Staging buffer: %u MB", ILogger::ELL_PERFORMANCE, STAGING_BUFFER_SIZE / (1024 * 1024)); + m_logger->log("Tiles per frame: %u", ILogger::ELL_PERFORMANCE, TILES_PER_FRAME); + m_logger->log("Frames in flight: %u", ILogger::ELL_PERFORMANCE, FRAMES_IN_FLIGHT); + + uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits(); + uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(); + uint32_t hostCachedBits = m_physicalDevice->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_CACHED_BIT); + + uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits & ~hostCachedBits; + + uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits; + + m_logger->log("Memory type bits - HostVisible: 0x%X, DeviceLocal: 0x%X, HostCached: 0x%X", + ILogger::ELL_PERFORMANCE, hostVisibleBits, deviceLocalBits, hostCachedBits); + m_logger->log("System RAM (non-cached): 0x%X, VRAM: 0x%X", + ILogger::ELL_PERFORMANCE, hostVisibleOnlyBits, hostVisibleDeviceLocalBits); + + if (!hostVisibleOnlyBits) + { + m_logger->log("HOST_VISIBLE non-cached memory types not found!", ILogger::ELL_ERROR); + return false; + } + + if (!deviceLocalBits) + { + m_logger->log("DEVICE_LOCAL memory types not found!", ILogger::ELL_ERROR); + return false; + } + + IQueue* queue = getQueue(IQueue::FAMILY_FLAGS::GRAPHICS_BIT); + smart_refctd_ptr destinationImage; + { + IGPUImage::SCreationParams imgParams{}; + imgParams.type = IImage::E_TYPE::ET_2D; + uint32_t tilePerRow = (uint32_t)std::sqrt(TILES_PER_FRAME); + imgParams.extent.width = TILE_SIZE * tilePerRow; + imgParams.extent.height = TILE_SIZE * tilePerRow; + imgParams.extent.depth = 1u; + imgParams.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM; + imgParams.mipLevels = 1u; + imgParams.flags = IImage::ECF_NONE; + imgParams.arrayLayers = 1u; + imgParams.samples = IImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT; + imgParams.tiling = video::IGPUImage::TILING::OPTIMAL; + imgParams.usage = asset::IImage::EUF_TRANSFER_DST_BIT; + imgParams.preinitialized = false; + + destinationImage = m_device->createImage(std::move(imgParams)); + if (!destinationImage) + return logFail("Failed to create destination image!\n"); + + destinationImage->setObjectDebugName("Destination Image"); + + auto reqs = destinationImage->getMemoryReqs(); + reqs.memoryTypeBits &= deviceLocalBits; + + auto allocation = m_device->allocate(reqs, destinationImage.get(), IDeviceMemoryAllocation::EMAF_NONE); + if (!allocation.isValid()) + return logFail("Failed to allocate DEVICE_LOCAL memory for destination image!\n"); + } + + m_logger->log("\nStrategy 1: System RAM", ILogger::ELL_PERFORMANCE); + + double throughputSystemRAM = 0.0; + { + smart_refctd_ptr stagingBuffer; + IDeviceMemoryAllocator::SAllocation stagingAlloc; + void* mappedPtr = nullptr; + + if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleOnlyBits, + "Staging Buffer - System RAM", stagingBuffer, stagingAlloc, mappedPtr)) + { + return false; + } + + throughputSystemRAM = runBenchmark( + "System RAM", + stagingBuffer.get(), + stagingAlloc, + mappedPtr, + destinationImage.get(), + TILE_SIZE, + TILE_SIZE_BYTES, + TILES_PER_FRAME, + FRAMES_IN_FLIGHT, + TOTAL_FRAMES, + queue + ); + + stagingAlloc.memory->unmap(); + } + + m_logger->log("System RAM throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputSystemRAM); + + m_device->waitIdle(); + + if (hostVisibleDeviceLocalBits) + { + m_logger->log("\nStrategy 2: VRAM", ILogger::ELL_PERFORMANCE); + + double throughputVRAM = 0.0; + { + smart_refctd_ptr stagingBuffer; + IDeviceMemoryAllocator::SAllocation stagingAlloc; + void* mappedPtr = nullptr; + + if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleDeviceLocalBits, + "Staging Buffer - VRAM", stagingBuffer, stagingAlloc, mappedPtr)) + { + return false; + } + + throughputVRAM = runBenchmark( + "VRAM", + stagingBuffer.get(), + stagingAlloc, + mappedPtr, + destinationImage.get(), + TILE_SIZE, + TILE_SIZE_BYTES, + TILES_PER_FRAME, + FRAMES_IN_FLIGHT, + TOTAL_FRAMES, + queue + ); + + stagingAlloc.memory->unmap(); + } + + m_logger->log("VRAM throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputVRAM); + + double speedup = throughputVRAM / throughputSystemRAM; + m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup); + } + + m_logger->log("\nWaiting 5 seconds before exit...", ILogger::ELL_PERFORMANCE); + std::this_thread::sleep_for(std::chrono::seconds(5)); + + return true; + } + + bool keepRunning() override { return false; } + void workLoopBody() override {} + bool onAppTerminated() override { return true; } + +protected: + core::vector getQueueRequirements() const override + { + using flags_t = IQueue::FAMILY_FLAGS; + return { { + .requiredFlags = flags_t::GRAPHICS_BIT, + .disallowedFlags = flags_t::NONE, + .queueCount = 1, + .maxImageTransferGranularity = {1, 1, 1} + } }; + } + +private: + void generateTileCopyRegions( + IImage::SBufferCopy* outRegions, + uint32_t tilesPerFrame, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t imageWidth, + uint32_t bufferBaseOffset) + { + uint32_t tilesPerRow = imageWidth / tileSize; + for (size_t i = 0; i < tilesPerFrame; i++) + { + uint32_t tileX = (i % tilesPerRow) * tileSize; + uint32_t tileY = (i / tilesPerRow) * tileSize; + + outRegions[i].bufferOffset = bufferBaseOffset + (i * tileSizeBytes); + outRegions[i].bufferRowLength = tileSize; + outRegions[i].bufferImageHeight = tileSize; + outRegions[i].imageOffset = { tileX, tileY, 0 }; + outRegions[i].imageExtent = { tileSize, tileSize, 1 }; + outRegions[i].imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + outRegions[i].imageSubresource.mipLevel = 0; + outRegions[i].imageSubresource.baseArrayLayer = 0; + outRegions[i].imageSubresource.layerCount = 1; + } + } + + double runBenchmark( + const char* strategyName, + IGPUBuffer* stagingBuffer, + IDeviceMemoryAllocator::SAllocation& stagingAlloc, + void* mappedPtr, + IGPUImage* destinationImage, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t tilesPerFrame, + uint32_t framesInFlight, + uint32_t totalFrames, + IQueue* queue) + { + smart_refctd_ptr timelineSemaphore = m_device->createSemaphore(0); + + smart_refctd_ptr queryPool; + { + IQueryPool::SCreationParams queryPoolParams = {}; + queryPoolParams.queryType = IQueryPool::TYPE::TIMESTAMP; + queryPoolParams.queryCount = framesInFlight * 2; + queryPoolParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + queryPool = m_device->createQueryPool(queryPoolParams); + } + + std::vector> commandPools(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i] = m_device->createCommandPool( + queue->getFamilyIndex(), + IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT + ); + } + std::vector> commandBuffers(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i]->createCommandBuffers( + IGPUCommandPool::BUFFER_LEVEL::PRIMARY, + 1, + &commandBuffers[i] + ); + } + + uint64_t timelineValue = 0; + + commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + { + IGPUCommandBuffer::SImageMemoryBarrier initBarrier = {}; + initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED; + initBarrier.newLayout = IImage::LAYOUT::GENERAL; + initBarrier.image = destinationImage; + initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + initBarrier.subresourceRange.baseMipLevel = 0; + initBarrier.subresourceRange.levelCount = 1; + initBarrier.subresourceRange.baseArrayLayer = 0; + initBarrier.subresourceRange.layerCount = 1; + initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; + initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE; + initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; + commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&initBarrier, 1}}); + } + commandBuffers[0]->end(); + + IQueue::SSubmitInfo submitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = commandBuffers[0].get() }; + submitInfo.commandBuffers = { &cmdBufInfo, 1 }; + + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + submitInfo.signalSemaphores = { &signalInfo, 1 }; + + queue->submit({ &submitInfo, 1 }); + + ISemaphore::SWaitInfo waitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({ &waitInfo, 1 }); + + uint32_t imageWidth = destinationImage->getCreationParameters().extent.width; + uint32_t partitionSize = tilesPerFrame * tileSizeBytes; + + std::vector cpuSourceData(partitionSize); + { + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937 g(seed); + uint32_t* data = reinterpret_cast(cpuSourceData.data()); + for (uint32_t i = 0; i < partitionSize / sizeof(uint32_t); i++) + data[i] = g(); + } + std::vector> regionsPerFrame(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + regionsPerFrame[i].resize(tilesPerFrame); + uint32_t bufferOffset = i * partitionSize; + generateTileCopyRegions(regionsPerFrame[i].data(), tilesPerFrame, tileSize, tileSizeBytes, imageWidth, bufferOffset); + } + + double totalWaitTime = 0.0; + double totalMemcpyTime = 0.0; + double totalRecordTime = 0.0; + double totalSubmitTime = 0.0; + + auto startTime = std::chrono::high_resolution_clock::now(); + + for (uint32_t frame = 0; frame < totalFrames; frame++) + { + uint32_t cmdBufIndex = frame % framesInFlight; + + auto t1 = std::chrono::high_resolution_clock::now(); + if (frame >= framesInFlight) + { + ISemaphore::SWaitInfo frameWaitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue - framesInFlight + 1 + }; + m_device->blockForSemaphores({&frameWaitInfo, 1}); + } + auto t2 = std::chrono::high_resolution_clock::now(); + + commandPools[cmdBufIndex]->reset(); + + uint32_t bufferOffset = cmdBufIndex * partitionSize; + void* targetPtr = static_cast(mappedPtr) + bufferOffset; + memcpy(targetPtr, cpuSourceData.data(), partitionSize); + + if (!stagingAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(stagingAlloc.memory.get(), bufferOffset, partitionSize); + m_device->flushMappedMemoryRanges(1, &range); + } + + auto t3 = std::chrono::high_resolution_clock::now(); + + commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + uint32_t queryStartIndex = cmdBufIndex * 2; + commandBuffers[cmdBufIndex]->resetQueryPool(queryPool.get(), queryStartIndex, 2); + + IGPUCommandBuffer::SImageMemoryBarrier barrier = {}; + barrier.oldLayout = IImage::LAYOUT::GENERAL; + barrier.newLayout = IImage::LAYOUT::GENERAL; + barrier.image = destinationImage; + barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&barrier, 1}}); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 0); + + commandBuffers[cmdBufIndex]->copyBufferToImage( + stagingBuffer, + destinationImage, + IImage::LAYOUT::GENERAL, + tilesPerFrame, + regionsPerFrame[cmdBufIndex].data() + ); + + IGPUCommandBuffer::SImageMemoryBarrier afterBarrier = {}; + afterBarrier.oldLayout = IImage::LAYOUT::GENERAL; + afterBarrier.newLayout = IImage::LAYOUT::GENERAL; + afterBarrier.image = destinationImage; + afterBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + afterBarrier.subresourceRange.baseMipLevel = 0; + afterBarrier.subresourceRange.levelCount = 1; + afterBarrier.subresourceRange.baseArrayLayer = 0; + afterBarrier.subresourceRange.layerCount = 1; + afterBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + afterBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + afterBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + afterBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&afterBarrier, 1}}); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 1); + + commandBuffers[cmdBufIndex]->end(); + auto t4 = std::chrono::high_resolution_clock::now(); + + IQueue::SSubmitInfo frameSubmitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = {.cmdbuf = commandBuffers[cmdBufIndex].get()}; + frameSubmitInfo.commandBuffers = {&frameCmdBufInfo, 1}; + + IQueue::SSubmitInfo::SSemaphoreInfo frameSignalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + frameSubmitInfo.signalSemaphores = {&frameSignalInfo, 1}; + + queue->submit({&frameSubmitInfo, 1}); + auto t5 = std::chrono::high_resolution_clock::now(); + + totalWaitTime += std::chrono::duration(t2 - t1).count(); + totalMemcpyTime += std::chrono::duration(t3 - t2).count(); + totalRecordTime += std::chrono::duration(t4 - t3).count(); + totalSubmitTime += std::chrono::duration(t5 - t4).count(); + } + + // Wait for all remaining frames to complete + ISemaphore::SWaitInfo finalWait = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({&finalWait, 1}); + + auto endTime = std::chrono::high_resolution_clock::now(); + + std::vector timestamps(framesInFlight * 2); + const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); + m_device->getQueryPoolResults(queryPool.get(), 0, framesInFlight * 2, timestamps.data(), sizeof(uint64_t), flags); + uint64_t totalGpuTicks = 0; + for (uint32_t i = 0; i < framesInFlight; i++) { + uint64_t startTick = timestamps[i * 2 + 0]; + uint64_t endTick = timestamps[i * 2 + 1]; + totalGpuTicks += (endTick - startTick); + } + float timestampPeriod = m_physicalDevice->getLimits().timestampPeriodInNanoSeconds; + double sampledGpuTimeSeconds = (totalGpuTicks * timestampPeriod) / 1e9; + + double avgGpuTimePerFrame = sampledGpuTimeSeconds / framesInFlight; + double totalGpuTimeSeconds = avgGpuTimePerFrame * totalFrames; + + + double elapsedSeconds = std::chrono::duration(endTime - startTime).count(); + uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes; + + double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds; + + m_logger->log(" GPU time: %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds); + m_logger->log(" GPU throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputGBps); + + m_logger->log(" Timing breakdown for %s:", ILogger::ELL_PERFORMANCE, strategyName); + m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds); + m_logger->log(" Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); + m_logger->log(" Record time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds); + m_logger->log(" Submit time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds); + m_logger->log(" Memcpy speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime); + + return throughputGBps; + } + + bool createStagingBuffer( + uint32_t bufferSize, + uint32_t memoryTypeBits, + const char* debugName, + smart_refctd_ptr& outBuffer, + IDeviceMemoryAllocator::SAllocation& outAllocation, + void*& outMappedPtr) + { + IGPUBuffer::SCreationParams params; + params.size = bufferSize; + params.usage = IGPUBuffer::EUF_TRANSFER_SRC_BIT; + outBuffer = m_device->createBuffer(std::move(params)); + if (!outBuffer) + return logFail("Failed to create GPU buffer of size %d!\n", bufferSize); + + outBuffer->setObjectDebugName(debugName); + + auto reqs = outBuffer->getMemoryReqs(); + reqs.memoryTypeBits &= memoryTypeBits; + + outAllocation = m_device->allocate(reqs, outBuffer.get(), IDeviceMemoryAllocation::EMAF_NONE); + if (!outAllocation.isValid()) + return logFail("Failed to allocate Device Memory!\n"); + + outMappedPtr = outAllocation.memory->map({0ull, outAllocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_WRITE); + if (!outMappedPtr) + return logFail("Failed to map Device Memory!\n"); + + return true; + } +}; + +NBL_MAIN_FUNC(ImageUploadBenchmarkApp) diff --git a/73_ImageUploadBenchmark/pipeline.groovy b/73_ImageUploadBenchmark/pipeline.groovy new file mode 100644 index 000000000..1249f10b5 --- /dev/null +++ b/73_ImageUploadBenchmark/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CImageUploadBenchmark extends IBuilder +{ + public CImageUploadBenchmark(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info) +} + +return this diff --git a/CMakeLists.txt b/CMakeLists.txt index cbe482aa4..2d4ed7408 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,6 +89,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids) add_subdirectory(71_RayTracingPipeline) add_subdirectory(72_CooperativeBinarySearch) + add_subdirectory(73_ImageUploadBenchmark) # add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory NBL_GET_ALL_TARGETS(TARGETS)