Fixes for indexing and offsets

ChSonnabend · ChSonnabend · commit 1ac8c2cd240a · 2025-07-13T10:54:18.000+02:00
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -706,6 +706,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow);
       }
       AllocateRegisteredMemory(clustererNN.mMemoryId);
+      nnApplications[lane].createBoundary(clustererNNShadow);
+      nnApplications[lane].createIndexLookup(clustererNNShadow);
     });
     if (doGPU) {
       WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -61,8 +61,9 @@ class GPUTPCNNClusterizer : public GPUProcessor
 
   // Boundary lookup table
   int32_t mBoundaryMapSizeRow = 0;
-  int32_t mBoundaryMapSizePerRow = 0;
+  int32_t mBoundaryMapSizePadsPerRow = 0;
   int32_t mBoundaryMapSize = 0;
+  int32_t mBoundaryPadding = 11; // Padding on each side of the boundary map to account for pad_offset
   int8_t* mIsBoundary = nullptr;
 
   // Index lookup table
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
@@ -91,8 +91,11 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
   clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime;
   clustererNN.mNnClusterizerChargeArraySize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1));
   clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0);
-  clustererNN.mBoundaryMapSize = (3*clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW)*(GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2*clustererNN.mNnClusterizerSizeInputPad);
-  clustererNN.mIndexLookupSize = 3*clustererNN.mNnClusterizerElementSize; // local row, pad, time coordinate from flat index
+  clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW;
+  clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work
+  clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2*clustererNN.mBoundaryPadding;
+  clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow*clustererNN.mBoundaryMapSizePadsPerRow;
+  clustererNN.mIndexLookupSize = 3*clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index
   clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData;
   clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode;
   clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue;
@@ -119,27 +122,22 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
       clustererNN.mNnClusterizerModelReg2NumOutputNodes = mModelReg2.getNumOutputNodes()[0][1];
     }
   }
-  createBoundary(clustererNN);
-  createIndexLookup(clustererNN);
 }
 
 void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) {
   // Call after init of the clustererNN elements
-  clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW;
-  clustererNN.mBoundaryMapSizePerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2 * clustererNN.mNnClusterizerSizeInputPad;
   for(int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) {
-    for (int p = 0; p < clustererNN.mBoundaryMapSizePerRow; p++) {
-      int32_t i = r * clustererNN.mBoundaryMapSizePerRow + p;
+    int8_t skipCheckInRow = 0;
+    for (int p = 0; p < clustererNN.mBoundaryMapSizePadsPerRow; p++) {
+      int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + p;
       clustererNN.mIsBoundary[i] = 1;
-      if (p >= clustererNN.mNnClusterizerSizeInputPad || r >= clustererNN.mNnClusterizerSizeInputRow) {
+      if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= clustererNN.mNnClusterizerSizeInputRow)) {
         if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) {
-          clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mNnClusterizerSizeInputPad) >= static_cast<int>(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow)));
+          clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow)));
         } else if (r >= (GPUTPCGeometry::EndIROC() + 2*clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2*clustererNN.mNnClusterizerSizeInputRow)) {
-          clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mNnClusterizerSizeInputPad) >= static_cast<int>(GPUTPCGeometry::NPads(r - 2*clustererNN.mNnClusterizerSizeInputRow)));
-        }
-        if (clustererNN.mIsBoundary[i] == 1) {
-          break; // No need to check further pads in this row
+          clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - 2*clustererNN.mNnClusterizerSizeInputRow)));
         }
+        skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row
       }
     }
   }
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -121,9 +121,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
 template <>
 GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNGPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
-  uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
+
+  uint32_t glo_idx = get_global_id(0);
   uint32_t base_idx = CAMath::Floor(glo_idx / clustererNN.mNnClusterizerElementSize);
   uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerElementSize);
 
@@ -153,17 +154,22 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
       clustererNN.mInputData_32[top_idx - 2] = row / 152.f;
       clustererNN.mInputData_32[top_idx - 1] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
     }
-  } else if ((int32_t)transient_index < (clustererNN.mNnClusterizerElementSize - 3)) {
+  } else if ((int32_t)transient_index < clustererNN.mNnClusterizerChargeArraySize) {
     int32_t time = static_cast<int>(peak.time());
     int32_t idxLookup = 3*transient_index;
     int32_t r = clustererNN.mIndexLookup[idxLookup] + row, p = clustererNN.mIndexLookup[idxLookup + 1] + pad, t = clustererNN.mIndexLookup[idxLookup + 2] + time;
     int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
-    int32_t isBoundaryIndex = (r + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePerRow + p + clustererNN.mNnClusterizerSizeInputPad;
+    int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, r);
+    p += pad_offset;
+    int32_t isBoundaryIndex = (r + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePadsPerRow + p + clustererNN.mBoundaryPadding;
 
     if (!clustererNN.mIsBoundary[isBoundaryIndex] && (t >= 0) && (t < TPC_MAX_FRAGMENT_LEN_GPU)) {
-      int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, r);
       float central_charge = static_cast<float>(chargeMap[peak].unpack());
-      CfChargePos tmp_pos(r, p + pad_offset, t);
+      CfChargePos tmp_pos(r, p, t);
+      // if ((glo_idx % (clustererNN.mNnClusterizerElementSize*1000)) == (int)((clustererNN.mNnClusterizerChargeArraySize-1)/2.f)){
+      //   printf("glo_idx: %d, r: %d, p: %d, t: %d, tmp_pos: (%d, %d, %d), charge: %f, central_charge: %f\n",
+      //          glo_idx, clustererNN.mIndexLookup[idxLookup], clustererNN.mIndexLookup[idxLookup + 1], clustererNN.mIndexLookup[idxLookup + 2], tmp_pos.row(), tmp_pos.pad(), tmp_pos.time(), chargeMap[tmp_pos].unpack(), central_charge);
+      // }
       if (dtype == 0) {
         clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
       } else if (dtype == 1) {
@@ -489,24 +495,28 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
 // The following arithmetic is done because the network is trained with a split between IROC and OROC boundary
 GPUd() int32_t GPUTPCNNClusterizerKernels::padOffset(int32_t row_ref, int32_t row_current)
 {
-  return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2);
+  if (row_current < 0 || row_current >= o2::tpc::constants::MAXGLOBALPADROW) { // NPads is only queried for rows 0 .. MAXGLOBALPADROW - 1, matching the range check in isBoundary
+    return 0; // Row outside the pad-row range: skip the NPads lookup and apply no pad shift
+  } else {
+    return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2); // Half the pad-count difference between the current row and the reference row
+  }
 }
 
-GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t global_shift)
+GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset)
 {
-  return (row > 62 ? global_shift : 0);
+  return (row > 62 ? offset : 0); // Rows above 62 are shifted by `offset`; rows 0..62 get no shift
 }
 
-GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t global_shift)
+GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t offset)
 {
   if (pad < 0 || row < 0) { // Faster short-circuit
     return true;
   } else if (row < 63) {
     return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row)));
-  } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network
+  } else if (row < (63 + offset)) { // to account for the gap between IROC and OROC. Charge will be set to the boundary fill value in order to signal boundaries to the neural network
     return true;
-  } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) {
-    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row - global_shift)));
+  } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + offset)) {
+    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row - offset)));
   } else {
     return true;
   }

Original file line number	Diff line number	Diff line change
`@@ -706,6 +706,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)`
`706`	`706`	`nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow);`
`707`	`707`	`}`
`708`	`708`	`AllocateRegisteredMemory(clustererNN.mMemoryId);`
	`709`	`+ nnApplications[lane].createBoundary(clustererNNShadow);`
	`710`	`+ nnApplications[lane].createIndexLookup(clustererNNShadow);`
`709`	`711`	`});`
`710`	`712`	`if (doGPU) {`
`711`	`713`	`WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);`