diff --git a/GPU/Common/GPUCommonMath.h b/GPU/Common/GPUCommonMath.h index bc842d00c6568..0e5db743d0c57 100644 --- a/GPU/Common/GPUCommonMath.h +++ b/GPU/Common/GPUCommonMath.h @@ -399,7 +399,7 @@ GPUdi() T GPUCommonMath::MaxWithRef(T x, T y, T z, T w, S refX, S refY, S refZ, GPUdi() float GPUCommonMath::InvSqrt(float _x) { -#ifdef GPUCA_NO_FAST_MATH +#if defined(GPUCA_NO_FAST_MATH) || defined(__OPENCL__) return 1.f / Sqrt(_x); #elif defined(__CUDACC__) || defined(__HIPCC__) return __frsqrt_rn(_x); diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index ac254221c250c..7eaf3e4a5e40d 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -114,7 +114,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels(), mInternals->Streams[stream])); } +void GPUReconstructionCUDA::RecordMarker(deviceEvent* ev, int32_t stream) { GPUFailedMsg(cudaEventRecord(ev->get(), mInternals->Streams[stream])); } std::unique_ptr GPUReconstructionCUDA::GetThreadContext() { diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index b9db625a83f1d..070177fb344f1 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -84,7 +84,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels* trackerTraits, std::unique_ptr* vertexerTraits, std::unique_ptr* timeFrame) override; diff --git a/GPU/GPUTracking/Base/opencl-common/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl-common/GPUReconstructionOCL.cxx index de32f03340c03..cad56e77c79d5 100644 --- a/GPU/GPUTracking/Base/opencl-common/GPUReconstructionOCL.cxx +++ b/GPU/GPUTracking/Base/opencl-common/GPUReconstructionOCL.cxx @@ -359,6 +359,11 @@ int32_t GPUReconstructionOCL::InitDevice_Runtime() mInternals = master->mInternals; } + for (uint32_t i = 0; i < mEvents.size(); i++) { + cl_event* events = (cl_event*)mEvents[i].data(); + new (events) cl_event[mEvents[i].size()]; + } + return (0); } @@ -432,7 +437,7 @@ size_t GPUReconstructionOCL::WriteToConstantMemory(size_t offset, const void* sr void GPUReconstructionOCL::ReleaseEvent(deviceEvent ev) { GPUFailedMsg(clReleaseEvent(ev.get())); } -void GPUReconstructionOCL::RecordMarker(deviceEvent ev, int32_t stream) { GPUFailedMsg(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], 0, nullptr, ev.getEventList())); } +void GPUReconstructionOCL::RecordMarker(deviceEvent* ev, int32_t stream) { GPUFailedMsg(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], 0, nullptr, ev->getEventList())); } int32_t GPUReconstructionOCL::DoStuckProtection(int32_t stream, deviceEvent event) { diff --git a/GPU/GPUTracking/Base/opencl-common/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl-common/GPUReconstructionOCL.h index 02ba0469dee2a..6abe1045b550a 100644 --- a/GPU/GPUTracking/Base/opencl-common/GPUReconstructionOCL.h +++ b/GPU/GPUTracking/Base/opencl-common/GPUReconstructionOCL.h @@ -52,7 +52,7 @@ class GPUReconstructionOCL : public GPUReconstructionDeviceBase size_t WriteToConstantMemory(size_t offset, const void* src, size_t size, int32_t stream = -1, deviceEvent* ev = nullptr) override; size_t GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) override; void ReleaseEvent(deviceEvent ev) override; - void RecordMarker(deviceEvent ev, int32_t stream) override; + void RecordMarker(deviceEvent* ev, int32_t stream) override; virtual int32_t GetOCLPrograms() = 0; virtual bool CheckPlatform(uint32_t i) = 0; diff --git a/GPU/GPUTracking/Base/opencl2/CMakeLists.txt b/GPU/GPUTracking/Base/opencl2/CMakeLists.txt index 73062ad82f728..0e6b9b8d0123d 100644 --- a/GPU/GPUTracking/Base/opencl2/CMakeLists.txt +++ b/GPU/GPUTracking/Base/opencl2/CMakeLists.txt @@ -23,9 +23,11 @@ endif() set(CL_SRC ${GPUDIR}/Base/opencl-common/GPUReconstructionOCL.cl) set(CL_BIN ${CMAKE_CURRENT_BINARY_DIR}/GPUReconstructionOCL2Code) -set(OCL_FLAGS -ferror-limit=1000 -Dcl_clang_storage_class_specifiers -Wno-invalid-constexpr -Wno-unused-command-line-argument -cl-std=CLC++2021) +set(OCL_FLAGS -Dcl_clang_storage_class_specifiers -cl-std=CLC++2021) if(NOT DEFINED GPUCA_NO_FAST_MATH OR NOT ${GPUCA_NO_FAST_MATH}) - set(OCL_FLAGS ${OCL_FLAGS} -Xclang -fdenormal-fp-math-f32=ieee -cl-mad-enable -cl-no-signed-zeros) + set(OCL_FLAGS ${OCL_FLAGS} -cl-denorms-are-zero -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math) +else() +set(OCL_FLAGS ${OCL_FLAGS} -cl-fp32-correctly-rounded-divide-sqrt) endif() set(OCL_DEFINECL "-D$,$-D>" "-I$,EXCLUDE,^/usr/include/?>,$-I>" @@ -47,6 +49,7 @@ if(OPENCL2_ENABLED_SPIRV) # BUILD OpenCL2 intermediate code for SPIR-V target -O0 --target=spirv64 -fno-integrated-objemitter + -ferror-limit=1000 -Wno-invalid-constexpr -Wno-unused-command-line-argument ${OCL_FLAGS} ${OCL_DEFINECL} -o ${CL_BIN}.spirv -c ${CL_SRC} @@ -64,6 +67,7 @@ if(OPENCL2_ENABLED) # BUILD OpenCL2 source code for runtime compilation target add_custom_command( OUTPUT ${CL_BIN}.src COMMAND ${LLVM_CLANG} + -Wno-unused-command-line-argument ${OCL_FLAGS} ${OCL_DEFINECL} -cl-no-stdinc diff --git a/GPU/GPUTracking/Base/opencl2/GPUReconstructionOCL2.cxx b/GPU/GPUTracking/Base/opencl2/GPUReconstructionOCL2.cxx index a118e6d589712..435e69e91f5fe 100644 --- a/GPU/GPUTracking/Base/opencl2/GPUReconstructionOCL2.cxx +++ b/GPU/GPUTracking/Base/opencl2/GPUReconstructionOCL2.cxx @@ -64,14 +64,14 @@ int32_t GPUReconstructionOCL2Backend::GetOCLPrograms() const char* ocl_flags = GPUCA_M_STR(OCL_FLAGS); #ifdef OPENCL2_ENABLED_SPIRV // clang-format off - if (ver >= 2.2f) { - GPUInfo("Reading OpenCL program from SPIR-V IL (Platform version %f)", ver); + if (ver >= 2.2f && !GetProcessingSettings().oclCompileFromSources) { + GPUInfo("Reading OpenCL program from SPIR-V IL (Platform version %4.2f)", ver); mInternals->program = clCreateProgramWithIL(mInternals->context, _binary_GPUReconstructionOCL2Code_spirv_start, _binary_GPUReconstructionOCL2Code_spirv_len, &ocl_error); ocl_flags = ""; } else #endif // clang-format on { - GPUInfo("Compiling OpenCL program from sources (Platform version %f, %s)", ver); + GPUInfo("Compiling OpenCL program from sources (Platform version %4.2f)", ver); size_t program_sizes[1] = {_binary_GPUReconstructionOCL2Code_src_len}; char* programs_sources[1] = {_binary_GPUReconstructionOCL2Code_src_start}; mInternals->program = clCreateProgramWithSource(mInternals->context, (cl_uint)1, (const char**)&programs_sources, program_sizes, &ocl_error); diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index c4e0dadb87659..d5494d04930f5 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -218,7 +218,7 @@ AddHelp("help", 'h') EndConfig() BeginSubConfig(GPUSettingsProcessing, proc, configStandalone, "PROC", 0, "Processing settings", proc) -AddOption(platformNum, int32_t, -1, "", 0, "Platform to use, in case the backend provides multiple platforms (-1 = auto-select)") +AddOption(platformNum, int32_t, -1, "", 0, "Platform to use, in case the backend provides multiple platforms (OpenCL only, -1 = auto-select)") AddOption(deviceNum, int32_t, -1, "gpuDevice", 0, "Set GPU device to use (-1: automatic, -2: for round-robin usage in timeslice-pipeline)") AddOption(gpuDeviceOnly, bool, false, "", 0, "Use only GPU as device (i.e. no CPU for OpenCL)") AddOption(globalInitMutex, bool, false, "", 0, "Use global mutex to synchronize initialization of multiple GPU instances") @@ -291,6 +291,7 @@ AddOption(tpcApplyDebugClusterFilter, bool, false, "", 0, "Apply custom cluster AddOption(RTCcacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored") AddOption(RTCprependCommand, std::string, "", "", 0, "Prepend RTC compilation commands by this string") AddOption(RTCoverrideArchitecture, std::string, "", "", 0, "Override arhcitecture part of RTC compilation command line") +AddOption(oclCompileFromSources, bool, false, "", 0, "Compile OpenCL binary from included source code instead of using included spirv code") AddOption(printSettings, bool, false, "", 0, "Print all settings when initializing") AddVariable(eventDisplay, GPUCA_NAMESPACE::gpu::GPUDisplayFrontendInterface*, nullptr) AddSubConfig(GPUSettingsProcessingRTC, rtc) diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index 7a36355bf843d..9c67a05eec443 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -101,7 +101,7 @@ class GPUChain } } inline bool IsEventDone(deviceEvent* evList, int32_t nEvents = 1) { return mRec->IsEventDone(evList, nEvents); } - inline void RecordMarker(deviceEvent ev, int32_t stream) { mRec->RecordMarker(ev, stream); } + inline void RecordMarker(deviceEvent* ev, int32_t stream) { mRec->RecordMarker(ev, stream); } virtual inline std::unique_ptr GetThreadContext() { return mRec->GetThreadContext(); } inline void SynchronizeGPU() { mRec->SynchronizeGPU(); } inline void ReleaseEvent(deviceEvent ev, bool doGPU = true) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index af7cc03369afc..ae240181eba65 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -865,8 +865,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } if (fragment.index == 0) { - runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, transferRunning[lane] == 1 ? &mEvents->stream[lane] : nullptr}}, clustererShadow.mPclusterInRow, GPUCA_ROW_COUNT * sizeof(*clustererShadow.mPclusterInRow)); - transferRunning[lane] = 2; + deviceEvent* waitEvent = nullptr; + if (transferRunning[lane] == 1) { + waitEvent = &mEvents->stream[lane]; + transferRunning[lane] = 2; + } + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, waitEvent}}, clustererShadow.mPclusterInRow, GPUCA_ROW_COUNT * sizeof(*clustererShadow.mPclusterInRow)); } if (clusterer.mPmemory->counters.nClusters == 0) { @@ -930,7 +934,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (transferRunning[lane]) { ReleaseEvent(mEvents->stream[lane], doGPU); } - RecordMarker(mEvents->stream[lane], mRec->NStreams() - 1); + RecordMarker(&mEvents->stream[lane], mRec->NStreams() - 1); transferRunning[lane] = 1; } diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index c0679c090c20c..98109447de034 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -37,7 +37,7 @@ int32_t GPUChainTracking::RunTPCCompression() GPUTPCCompression& CompressorShadow = doGPU ? processorsShadow()->tpcCompressor : Compressor; const auto& threadContext = GetThreadContext(); if (mPipelineFinalizationCtx && GetProcessingSettings().doublePipelineClusterizer) { - RecordMarker(mEvents->single, 0); + RecordMarker(&mEvents->single, 0); } if (GetProcessingSettings().tpcCompressionGatherMode == 3) { @@ -124,7 +124,7 @@ int32_t GPUChainTracking::RunTPCCompression() return 1; } if (GetProcessingSettings().tpcCompressionGatherMode == 3) { - RecordMarker(mEvents->stream[outputStream], outputStream); + RecordMarker(&mEvents->stream[outputStream], outputStream); char* deviceFlatPts = (char*)Compressor.mOutput->qTotU; if (GetProcessingSettings().doublePipeline) { const size_t blockSize = CAMath::nextMultipleOf<1024>(copySize / 30); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx index 67a9904a4222f..aba8617ee244d 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx @@ -33,7 +33,7 @@ void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSlice uint32_t n = withinSlice == -1 ? NSLICES / 2 : NSLICES; if (GetProcessingSettings().alternateBorderSort && (!mRec->IsGPU() || doGPUall)) { TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResMemory(), 0, &mEvents->init); - RecordMarker(mEvents->single, 0); + RecordMarker(&mEvents->single, 0); for (uint32_t i = 0; i < n; i++) { int32_t stream = i % mRec->NStreams(); runKernel({GetGridAuto(stream, deviceType), krnlRunRangeNone, {nullptr, stream && i < (uint32_t)mRec->NStreams() ? &mEvents->single : nullptr}}, i, withinSlice, mergeMode); @@ -55,7 +55,7 @@ void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSlice if (i == n - 1) { // Synchronize all execution on stream 0 with the last kernel ne = std::min(n, mRec->NStreams()); for (int32_t j = 1; j < ne; j++) { - RecordMarker(mEvents->slice[j], j); + RecordMarker(&mEvents->slice[j], j); } e = &mEvents->slice[1]; ne--; @@ -251,7 +251,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPUall, Merger, &GPUTPCGMMerger::DumpFinal, *mDebugFile); if (doGPUall) { - RecordMarker(mEvents->single, 0); + RecordMarker(&mEvents->single, 0); auto* waitEvent = &mEvents->single; if (GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().createO2Output <= 1 || mFractionalQAEnabled) { if (!(GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().createO2Output <= 1)) { @@ -317,7 +317,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) TransferMemoryResourcesToHost(RecoStep::TPCMerging, &Merger, -1, true); runKernel(GetGridAuto(0, GPUReconstruction::krnlDeviceType::CPU)); } else if (doGPUall) { - RecordMarker(mEvents->single, 0); + RecordMarker(&mEvents->single, 0); TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResOutputO2(), outputStream, nullptr, &mEvents->single); TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResOutputO2Clus(), outputStream); ReleaseEvent(mEvents->single); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx index da4629928789c..8db15fb1aef7e 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx @@ -164,9 +164,11 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() TransferMemoryResourceLinkToGPU(RecoStep::TPCSliceTracking, mInputsHost->mResourceOccupancyMap, streamOccMap, &mEvents->init); } } - uint32_t& occupancyTotal = *mInputsHost->mTPCClusterOccupancyMap; - occupancyTotal = CAMath::Float2UIntRn(mRec->MemoryScalers()->nTPCHits / (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF ? mIOPtrs.settingsTF->nHBFPerTF : 128)); - mRec->UpdateParamOccupancyMap(param().rec.tpc.occupancyMapTimeBins ? mInputsHost->mTPCClusterOccupancyMap + 2 : nullptr, param().rec.tpc.occupancyMapTimeBins ? mInputsShadow->mTPCClusterOccupancyMap + 2 : nullptr, occupancyTotal, streamOccMap); + if (param().rec.tpc.occupancyMapTimeBins || param().rec.tpc.sysClusErrorC12Norm) { + uint32_t& occupancyTotal = *mInputsHost->mTPCClusterOccupancyMap; + occupancyTotal = CAMath::Float2UIntRn(mRec->MemoryScalers()->nTPCHits / (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF ? mIOPtrs.settingsTF->nHBFPerTF : 128)); + mRec->UpdateParamOccupancyMap(param().rec.tpc.occupancyMapTimeBins ? mInputsHost->mTPCClusterOccupancyMap + 2 : nullptr, param().rec.tpc.occupancyMapTimeBins ? mInputsShadow->mTPCClusterOccupancyMap + 2 : nullptr, occupancyTotal, streamOccMap); + } int32_t streamMap[NSLICES]; @@ -305,7 +307,7 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() SynchronizeGPU(); } else { for (int32_t i = 0; i < mRec->NStreams(); i++) { - RecordMarker(mEvents->stream[i], i); + RecordMarker(&mEvents->stream[i], i); } runKernel({GetGridAuto(0), krnlRunRangeNone, {&mEvents->single, mEvents->stream, mRec->NStreams()}}); for (int32_t i = 0; i < mRec->NStreams(); i++) {