From 18691a50edf49837d76abe8b4197eb0f3d4363e5 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 24 May 2025 22:07:03 +0200 Subject: [PATCH 001/215] Improve performance (mainly on AMD GPUs) and change implementations slightly such that the backends are more similar. --- .../cg_explicit/kernel_matrix_assembly.cuh | 118 +++++++------- .../kernel_matrix_assembly.hip.hpp | 118 +++++++------- .../cg_explicit/kernel_matrix_assembly.hpp | 2 +- .../cg_explicit/kernel_matrix_assembly.hpp | 126 +++++++-------- .../cg_explicit/kernel_matrix_assembly.cl | 91 +++++------ .../basic/kernel_matrix_assembly.hpp | 55 +++---- .../hierarchical/kernel_matrix_assembly.hpp | 144 +++++++++--------- .../scoped/kernel_matrix_assembly.hpp | 135 ++++++++-------- .../work_group/kernel_matrix_assembly.hpp | 124 +++++++-------- .../cg_explicit/kernel_matrix_assembly.hpp | 2 +- include/plssvm/constants.hpp | 7 +- src/plssvm/backends/Kokkos/csvm.cpp | 10 +- src/plssvm/backends/OpenCL/detail/utility.cpp | 4 +- .../detail/tracking/performance_tracker.cpp | 4 +- 14 files changed, 482 insertions(+), 458 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh index 8a766b7db..2a3eef5c4 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh @@ -14,20 +14,22 @@ #pragma once #include "plssvm/backends/CUDA/kernel/kernel_functions.cuh" // plssvm::cuda::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include // std::size_t + namespace plssvm::cuda::detail { /** * @brief Create the explicit kernel matrix using the @p kernel_function. 
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -37,80 +39,84 @@ namespace plssvm::cuda::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type *q, const real_type QA_cost, const real_type cost, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const 
std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching data point features + __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + { + // calculate the indices used in the current thread paying attention to coalesced memory accesses 
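+            // note: both i_linear and j_linear use threadIdx_x, so consecutive threads in the x-dimension read consecutive global memory addresses (coalesced loads)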
+ const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data points and wrt the current device + const auto device_global_i = i + static_cast(internal_i); + const auto global_i = device_row_offset + device_global_i; + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = device_row_offset + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + if (device_global_i < (num_rows - device_row_offset) && device_global_j < device_num_rows && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = 
detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix_d[device_global_j * (num_rows - row_offset + PADDING_SIZE_ull) - device_global_j * (device_global_j + 1ull) / 2ull + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } diff --git a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp index 75a3cd9a5..f0e01f813 100644 --- a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp @@ -14,23 +14,25 @@ #pragma once #include "plssvm/backends/HIP/kernel/kernel_functions.hip.hpp" // plssvm::hip::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include // std::size_t + namespace plssvm::hip::detail { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -40,80 +42,84 @@ namespace plssvm::hip::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type *q, const real_type QA_cost, const real_type cost, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... 
kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching data point features + __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a wavefront must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + { + // calculate the indices used in the current thread paying attention to coalesced memory accesses + const auto i_linear = blockIdx_x * blockDim_x * 
INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data points and wrt the current device + const auto device_global_i = i + static_cast(internal_i); + const auto global_i = device_row_offset + device_global_i; + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = device_row_offset + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + if (device_global_i < (num_rows - device_row_offset) && device_global_j < device_num_rows && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_i] - q[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix_d[device_global_j * (num_rows - row_offset + PADDING_SIZE_ull) - device_global_j * (device_global_j + 1ull) / 2ull + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } diff --git a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp index e575c6af2..af1d3c9e2 100644 --- a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -15,7 +15,7 @@ #pragma once #include "plssvm/backends/HPX/kernel/kernel_functions.hpp" // plssvm::hpx::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/matrix.hpp" // plssvm::aos_matrix diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp index 8e42e8b41..2a83b311f 100644 --- a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp" // plssvm::kokkos::detail::standard_layout_tuple #include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp" // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents @@ -41,11 +41,11 @@ class device_kernel_assembly { public: /** * @brief Initialize the Kokkos kernel function object. 
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -55,12 +55,12 @@ class device_kernel_assembly { * @param[in] grid_size_x the size of the execution grid in x-dimension * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(device_view_type kernel_matrix_d, device_view_type data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, device_view_type q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(device_view_type kernel_matrix, device_view_type data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, device_view_type q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -78,80 +78,84 @@ class device_kernel_assembly { KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - - // create the shared memory arrays used for caching data point features - constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in block x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in block y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in block x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in block y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching data point features + constexpr std::size_t shmem_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); - Kokkos::mdspan> data_cache_i{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> data_cache_j{
data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + Kokkos::mdspan> data_i_cache{ data_cache_ptr, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + Kokkos::mdspan> data_j_cache{ data_cache_ptr + shmem_size, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further + // only calculate the upper triangular matrix -> can't use team.team_rank() since all threads in a team must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; - data_cache_i(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; - data_cache_j(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; - data_cache_j(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; - } - team.team_barrier(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i), - data_cache_j(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j)); + { + // calculate the indices used in the current thread paying attention to coalesced memory accesses + const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in 
the shared memory + data_i_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + } + team.team_barrier(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i), + data_j_cache(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j)); + } } } + team.team_barrier(); // wait until all threads performed their part of the calculations } - team.team_barrier(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) + // calculate the indices to access the global data points and wrt the current device const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto global_i = device_row_offset_ + device_global_i; const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); + const auto global_j = device_row_offset_ + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } @@ -160,11 +164,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - device_view_type kernel_matrix_d_; - device_view_type data_d_; + device_view_type kernel_matrix_; + device_view_type data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const
std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; device_view_type q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl b/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl index 481945ca6..99bc02933 100644 --- a/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl +++ b/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl @@ -14,11 +14,11 @@ /** * @brief Create the explicit kernel matrix using the kernel function determined at runtime. * @details The `PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST`, `PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER`, `PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION`, and `PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION` placeholder will be replaced by the correct values upon kernel construction. - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -27,78 +27,83 @@ * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST a placeholder that is used to string replace the correct kernel parameter (attention: no comma!; Args... 
only added for Doxygen) */ -__kernel void device_kernel_assembly(__global real_type *kernel_matrix_d, const __global real_type *data_d, const ulong num_rows, const ulong device_num_rows, const ulong row_offset, const ulong num_features, const __global real_type *q, const real_type QA_cost, const real_type cost, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { +__kernel void device_kernel_assembly(__global real_type *kernel_matrix, const __global real_type *data, const ulong num_rows, const ulong device_num_rows, const ulong device_row_offset, const ulong num_features, const __global real_type *q, const real_type QA_cost, const real_type cost, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; - const ulong i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; - const ulong j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large // create the local memory arrays used for caching data point features - __local real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_x >= blockIdx_y) { - // create a thread private array used for internal caching + // create a private memory array used for internal caching real_type 
temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_i = row_offset + i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - const ulong global_j = row_offset + j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; + { + // calculate the indices used in the current work-item paying attention to coalesced memory accesses + const ulong i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; + const ulong j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i]; - data_cache_i[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i]; - data_cache_j[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j]; - data_cache_j[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j]; - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (ulong dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_ul) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const ulong global_i_linear = device_row_offset + i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; + const ulong global_j_linear = device_row_offset + j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - // perform the feature reduction calculation - for (uint block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_cache_i[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_cache_j[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + // store the values in the local memory + data_i_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i_linear]; + data_j_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j_linear]; + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + + // perform the feature reduction calculation + for (uint block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + 
temp[internal_i][internal_j] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_i_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_j_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current work-item + const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; + const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data points and wrt the current device const ulong device_global_i = i + (ulong) internal_i; - const ulong global_i = row_offset + i + (ulong) internal_i; + const ulong global_i = device_row_offset + device_global_i; const ulong device_global_j = j + (ulong) internal_j; - const ulong global_j = row_offset + j + (ulong) internal_j; + const ulong global_j = device_row_offset + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + if (device_global_i < (num_rows - device_row_offset) && device_global_j < device_num_rows && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION(temp_ij PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER) + QA_cost - q[global_i] - q[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix_d[device_global_j * (num_rows - row_offset + PADDING_SIZE_ul) - device_global_j * (device_global_j + (ulong) 1) / (ulong) 2 + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j * (num_rows - device_row_offset + PADDING_SIZE_ul) - device_global_j * (device_global_j + (ulong) 1) / (ulong) 2 + device_global_i] = temp_ij; } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp index 65587ddaa..22b24bae0 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp @@ -14,7 +14,7 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::item @@ -35,11 +35,11 @@ class device_kernel_assembly { public: /** * @brief Initialize the SYCL kernel function object. 
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -48,12 +48,12 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -69,25 +69,27 @@ class device_kernel_assembly { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; + // only calculate the upper triangular matrix if (i >= j) { - // create a work-item private array used for internal caching + // create a private memory array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; for (std::size_t dim = 0; dim < num_features_; ++dim) { // perform the feature reduction calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - temp[internal_i][internal_j] += detail::feature_reduce(data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], - data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + const auto global_i = device_row_offset_ + i + static_cast(internal_i); + const auto global_j = device_row_offset_ + j + static_cast(internal_j); + temp[internal_i][internal_j] += detail::feature_reduce(data_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], + data_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); } } } @@ -95,22 +97,23 @@ class device_kernel_assembly { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) + // calculate the indices to access the global data points and wrt the current device const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto global_i = device_row_offset_ + device_global_i; const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); + const auto global_j = device_row_offset_ + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && 
global_i >= global_j) { + if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } @@ -119,11 +122,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp index b09fef0f8..d3e37ca54 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp @@ -14,11 +14,12 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item +#include // std::array #include // std::size_t #include // std::tuple, std::make_tuple @@ -35,11 +36,11 @@ class device_kernel_assembly { public: /** * @brief Initialize the SYCL kernel function object. 
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -48,12 +49,12 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -68,67 +69,47 @@ class device_kernel_assembly { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; - - ::sycl::private_memory temp{ group }; - - // initialize private and local variables - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - - // initialize private temp matrix to zero - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; - } - } - }); + // create two local memory arrays used for caching data point features + real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - // implicit group barrier + // create a private memory array used for internal caching + ::sycl::private_memory, INTERNAL_BLOCK_SIZE>, 2> temp{ group }; - // exploit symmetry + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = 
static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const std::size_t threadIdx_x = idx.get_local_id(0); // current work-item in work-group x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current work-item in work-group y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of work-items in work-group x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of work-items in work-group y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current work-item paying attention to coalesced memory accesses + const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; } }); @@ -136,14 +117,15 @@ class device_kernel_assembly { // perform the feature reduction calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent 
implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -154,26 +136,40 @@ class device_kernel_assembly { // apply the remaining part of the kernel function and store the value in the output kernel matrix group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const std::size_t threadIdx_x = idx.get_local_id(0); // current work-item in work-group x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current work-item in work-group y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of work-items in work-group x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of work-items in work-group y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data points and wrt the current device + const auto device_global_i = i + static_cast(internal_i); + const auto global_i = device_row_offset_ + device_global_i; + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = device_row_offset_ + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { real_type temp_ij = 
temp(idx)[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } @@ -183,11 +179,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp index 4ed3764ce..33c725a46 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp @@ -14,11 +14,12 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item +#include // std::array #include // std::size_t #include // std::tuple, std::make_tuple @@ -35,11 +36,11 @@ class device_kernel_assembly { public: /** * @brief Initialize the SYCL kernel function object. 
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -48,12 +49,12 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -71,94 +72,100 @@ class device_kernel_assembly { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_i, auto &data_cache_j, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); - - // exploit symmetry + ::sycl::require_local_mem(), // data_i_cache + ::sycl::require_local_mem(), // data_j_cache + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), // temp + [&](auto &data_i_cache, auto &data_j_cache, auto &temp) { + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const std::size_t threadIdx_x = 
idx.get_local_id(group, 0); // current work-item in work-group x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current work-item in work-group y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of work-items in work-group x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of work-items in work-group y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item paying attention to coalesced memory accesses + const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; } }); - // perform calculations + // perform the feature reduction calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += 
detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); } } } }); } + // apply the remaining part of the kernel function and store the value in the output kernel matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current work-item in work-group x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current work-item in work-group y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of work-items in work-group x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of work-items in work-group y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data points and wrt the current device + const auto device_global_i = i + static_cast(internal_i); + const auto global_i = device_row_offset_ + device_global_i; + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = device_row_offset_ + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { real_type temp_ij = temp(idx)[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update 
the upper triangular kernel matrix + kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } @@ -169,11 +176,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp index 96030fbe7..6e7fd2033 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp @@ -14,7 +14,7 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -36,11 +36,11 @@ class device_kernel_assembly { /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -49,14 +49,14 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(::sycl::handler &cgh, real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : - data_cache_i_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_j_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(::sycl::handler &cgh, real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + data_i_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_j_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -76,74 +76,78 @@ class device_kernel_assembly { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current work-item in work-group x-dimension + const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current work-item in work-group y-dimension + const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of work-items in work-group x-dimension + const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of work-items in work-group y-dimension + const std::size_t blockIdx_x = 
nd_idx.get_group(0) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_y >= blockIdx_x) { - // create a work-item private array used for internal caching + // create a private memory array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + { + // calculate the indices used in the current work-item paying attention to coalesced memory accesses + const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + 
local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current work-item + const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) + // calculate the indices to access the global data points and wrt the current device const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto global_i = device_row_offset_ + device_global_i; const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); + const auto global_j = device_row_offset_ + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } @@ -152,16 +156,16 @@ class device_kernel_assembly { private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_i_; + ::sycl::local_accessor data_i_cache_; /// Local memory used for internal memory access optimizations. 
- ::sycl::local_accessor data_cache_j_; + ::sycl::local_accessor data_j_cache_; /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp index 93772aab3..51e11a282 100644 --- a/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -14,7 +14,7 @@ #pragma once #include "plssvm/backends/stdpar/kernel/kernel_functions.hpp" // plssvm::stdpar::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/matrix.hpp" // plssvm::aos_matrix diff --git a/include/plssvm/constants.hpp b/include/plssvm/constants.hpp index e99dbeddd..81d992991 100644 --- a/include/plssvm/constants.hpp +++ b/include/plssvm/constants.hpp @@ -38,11 +38,8 @@ constexpr unsigned INTERNAL_BLOCK_SIZE = PLSSVM_INTERNAL_BLOCK_SIZE; constexpr unsigned INTERNAL_BLOCK_SIZE = 4; #endif -/// Global compile time constant used for internal feature caching. -constexpr unsigned FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE; - -/// Padding used for the device w_d matrix to prevent out-of-bounce accesses without ifs. -constexpr unsigned PADDING_SIZE = FEATURE_BLOCK_SIZE > (THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) ? FEATURE_BLOCK_SIZE : (THREAD_BLOCK_SIZE *INTERNAL_BLOCK_SIZE); +/// Padding used for the device arrays and matrices to prevent out-of-bounce accesses without ifs. +constexpr unsigned PADDING_SIZE = THREAD_BLOCK_SIZE *INTERNAL_BLOCK_SIZE; // perform sanity checks static_assert(detail::tuple_contains_v, "Illegal real type provided! 
See the 'real_type_list' in the type_list.hpp header for a list of the allowed types."); diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp index 2bf512433..e18c88328 100644 --- a/src/plssvm/backends/Kokkos/csvm.cpp +++ b/src/plssvm/backends/Kokkos/csvm.cpp @@ -20,7 +20,7 @@ #include "plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp" // plssvm::kokkos::detail::device_kernel_assembly #include "plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp" // plssvm::kokkos::detail::device_kernel_assembly_symm #include "plssvm/backends/Kokkos/kernel/predict_kernel.hpp" // plssvm::kokkos::detail::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} -#include "plssvm/constants.hpp" // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::FEATURE_BLOCK_SIZE +#include "plssvm/constants.hpp" // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/log_untracked.hpp" // plssvm::detail::log_untracked @@ -414,7 +414,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons device_ptr_type kernel_matrix_d{ num_entries_padded, devices_[device_id] }; // only explicitly store the upper triangular matrix const real_type cost_factor = real_type{ 1.0 } / params.cost; - const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + const std::size_t scratch_memory_size = static_cast(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); // save the team size const int team_size = detail::dim_type_to_native(exec.block); @@ -492,7 +492,7 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : // get the offset of the data points this device is responsible for const unsigned long long row_offset = data_distribution_->place_row_offset(device_id); // the necessary amount of scratch memory for the kernels - const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + const std::size_t scratch_memory_size = static_cast(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); // save the team size const int team_size = detail::dim_type_to_native(exec.block); @@ -592,7 +592,7 @@ void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t de const unsigned long long row_offset = data_distribution_->place_row_offset(device_id); const real_type cost_factor = real_type{ 1.0 } / params.cost; - const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + const std::size_t scratch_memory_size = static_cast(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); // save the team size const int team_size = detail::dim_type_to_native(exec.block); @@ -702,7 +702,7 @@ auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detai device_ptr_type out_d{ shape{ num_predict_points, num_classes }, shape{ PADDING_SIZE, PADDING_SIZE }, devices_[device_id] }; - const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + const std::size_t scratch_memory_size 
= static_cast(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); // save the team size const int team_size = detail::dim_type_to_native(exec.block); diff --git a/src/plssvm/backends/OpenCL/detail/utility.cpp b/src/plssvm/backends/OpenCL/detail/utility.cpp index 6b3f686ae..e3202bb6b 100644 --- a/src/plssvm/backends/OpenCL/detail/utility.cpp +++ b/src/plssvm/backends/OpenCL/detail/utility.cpp @@ -13,7 +13,7 @@ #include "plssvm/backends/OpenCL/detail/error_code.hpp" // plssvm::opencl::detail::error_code #include "plssvm/backends/OpenCL/detail/jit_info.hpp" // plssvm::opencl::detail::jit_info #include "plssvm/backends/OpenCL/detail/kernel.hpp" // plssvm::opencl::detail::compute_kernel_name, plssvm::opencl::detail::kernel -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/arithmetic_type_name.hpp" // plssvm::detail::arithmetic_type_name #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked @@ -359,12 +359,10 @@ std::pair, jit_info> create_command_queues(const mpi: // replace constants in kernel_src_string // replace the size_t variants -> BEFORE replacing the "normal" values ::plssvm::detail::replace_all(kernel_src_string, "THREAD_BLOCK_SIZE_ul", fmt::format("(ulong) {}", THREAD_BLOCK_SIZE)); - ::plssvm::detail::replace_all(kernel_src_string, "FEATURE_BLOCK_SIZE_ul", fmt::format("(ulong) {}", FEATURE_BLOCK_SIZE)); ::plssvm::detail::replace_all(kernel_src_string, "INTERNAL_BLOCK_SIZE_ul", fmt::format("(ulong) {}", INTERNAL_BLOCK_SIZE)); ::plssvm::detail::replace_all(kernel_src_string, "PADDING_SIZE_ul", fmt::format("(ulong) {}", PADDING_SIZE)); // replace the normal variants ::plssvm::detail::replace_all(kernel_src_string, "THREAD_BLOCK_SIZE", fmt::format("{}", THREAD_BLOCK_SIZE)); - ::plssvm::detail::replace_all(kernel_src_string, "FEATURE_BLOCK_SIZE", fmt::format("{}", FEATURE_BLOCK_SIZE)); ::plssvm::detail::replace_all(kernel_src_string, "INTERNAL_BLOCK_SIZE", fmt::format("{}", INTERNAL_BLOCK_SIZE)); ::plssvm::detail::replace_all(kernel_src_string, "PADDING_SIZE", fmt::format("{}", PADDING_SIZE)); diff --git a/src/plssvm/detail/tracking/performance_tracker.cpp b/src/plssvm/detail/tracking/performance_tracker.cpp index 58b4e975a..8598367dc 100644 --- a/src/plssvm/detail/tracking/performance_tracker.cpp +++ b/src/plssvm/detail/tracking/performance_tracker.cpp @@ -8,7 +8,7 @@ #include "plssvm/detail/tracking/performance_tracker.hpp" -#include "plssvm/constants.hpp" // plssvm::real_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::FEATURE_BLOCK_SIZE, plssvm::PADDING_SIZE +#include "plssvm/constants.hpp" // plssvm::real_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::PADDING_SIZE #include "plssvm/detail/arithmetic_type_name.hpp" // plssvm::detail::arithmetic_type_name #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT, PLSSVM_ASSERT_ENABLED #include "plssvm/detail/cmd/parser_predict.hpp" // plssvm::detail::cmd::parser_predict @@ -280,7 +280,6 @@ void performance_tracker::save(std::ostream &out) { " asserts: {}\n" " enforce_max_mem_alloc_size: {}\n" " THREAD_BLOCK_SIZE: {}\n" - " FEATURE_BLOCK_SIZE: {}\n" " INTERNAL_BLOCK_SIZE: {}\n" " PADDING_SIZE: {}\n", plssvm::detail::current_date_time(), @@ -295,7 +294,6 @@ void 
performance_tracker::save(std::ostream &out) { assert_enabled, enforce_max_mem_alloc_size, THREAD_BLOCK_SIZE, - FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE); From 6cddbb6e98f46ebd21d07d7e347402d03ad093c0 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 26 May 2025 15:42:48 +0200 Subject: [PATCH 002/215] Additional performance improvement tests. --- .../work_group/kernel_matrix_assembly.hpp | 26 +++++++++++--- src/plssvm/backends/SYCL/DPCPP/csvm.cpp | 34 ++++++++++++++----- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp index 6e7fd2033..560d556ea 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp @@ -16,6 +16,7 @@ #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -27,10 +28,11 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's work-group data parallel kernels. + * @details target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: /** @@ -111,12 +113,26 @@ class device_kernel_assembly { } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::gpu_amd) { + // perform the feature reduction calculation, the block_dim is the slowest moving index + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } + } else { + // perform the feature reduction calculation, the block_dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + sum += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + 
temp[internal_i][internal_j] += sum; } } } diff --git a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp index 7c56bcd91..12910a7ae 100644 --- a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp +++ b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp @@ -223,10 +223,12 @@ ::plssvm::detail::dim_type csvm::get_max_grid_size(const std::size_t device_id) // fit // //***************************************************// -auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { +// TODO: better! +template +auto dispatch_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost, sycl::kernel_invocation_type invocation_type_, Device& devices_, Distribution& data_distribution_) { const std::size_t num_rows_reduced = data_d.shape().x - 1; const std::size_t num_features = data_d.shape().y; - const queue_type &device = devices_[device_id]; + const auto &device = devices_[device_id]; // calculate the number of data points this device is responsible for const std::size_t device_specific_num_rows = data_distribution_->place_specific_num_rows(device_id); @@ -260,7 +262,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), - sycl::detail::work_group::device_kernel_assembly{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); + sycl::detail::work_group::device_kernel_assembly{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); }); break; case sycl::kernel_invocation_type::hierarchical: @@ -293,7 +295,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons break; case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::work_group::device_kernel_assembly; + using functor_type = sycl::detail::work_group::device_kernel_assembly; cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); }); @@ -329,7 +331,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons break; case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::work_group::device_kernel_assembly; + using functor_type = sycl::detail::work_group::device_kernel_assembly; cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), functor_type{ cgh, 
kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); }); @@ -365,7 +367,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons break; case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::work_group::device_kernel_assembly; + using functor_type = sycl::detail::work_group::device_kernel_assembly; cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); }); @@ -401,7 +403,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons break; case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::work_group::device_kernel_assembly; + using functor_type = sycl::detail::work_group::device_kernel_assembly; cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); }); @@ -437,7 +439,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons break; case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::work_group::device_kernel_assembly; + using functor_type = sycl::detail::work_group::device_kernel_assembly; cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); }); @@ -467,6 +469,22 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons return kernel_matrix_d; } +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { + switch (target_) { + case target_platform::automatic: + // error + throw backend_exception{ "Can't determine the target platform!" 
}; + case target_platform::gpu_nvidia: + return dispatch_assemble_kernel_matrix_explicit(device_id, exec, params, data_d, q_red_d, QA_cost, invocation_type_, devices_, data_distribution_); + case target_platform::gpu_amd: + return dispatch_assemble_kernel_matrix_explicit(device_id, exec, params, data_d, q_red_d, QA_cost, invocation_type_, devices_, data_distribution_); + case target_platform::gpu_intel: + return dispatch_assemble_kernel_matrix_explicit(device_id, exec, params, data_d, q_red_d, QA_cost, invocation_type_, devices_, data_distribution_); + case target_platform::cpu: + return dispatch_assemble_kernel_matrix_explicit(device_id, exec, params, data_d, q_red_d, QA_cost, invocation_type_, devices_, data_distribution_); + } +} + void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const ::plssvm::detail::execution_range &mirror_exec, const real_type alpha, const device_ptr_type &A_d, const device_ptr_type &B_d, const real_type beta, device_ptr_type &C_d) const { const std::size_t num_rhs = B_d.shape().x; const std::size_t num_rows = B_d.shape().y; From a185caf542bc6fd1e65230783e431f158e0633c4 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 26 May 2025 16:25:07 +0200 Subject: [PATCH 003/215] Preliminary changes. --- .../SYCL/kernel/cg_explicit/basic/blas.hpp | 2 +- .../kernel/cg_explicit/hierarchical/blas.hpp | 30 ++++------ .../SYCL/kernel/cg_explicit/scoped/blas.hpp | 30 ++++------ .../kernel/cg_explicit/work_group/blas.hpp | 32 ++++------ .../basic/kernel_matrix_assembly_blas.hpp | 2 +- .../kernel_matrix_assembly_blas.hpp | 60 +++++++++---------- .../scoped/kernel_matrix_assembly_blas.hpp | 56 +++++++---------- .../kernel_matrix_assembly_blas.hpp | 55 +++++++---------- .../kernel/predict/basic/predict_kernel.hpp | 2 +- .../predict/hierarchical/predict_kernel.hpp | 39 +++++------- .../kernel/predict/scoped/predict_kernel.hpp | 37 +++++------- .../predict/work_group/predict_kernel.hpp | 40 +++++-------- 12 files changed, 151 insertions(+), 234 deletions(-) diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp index 2e528149c..b55b374fe 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp @@ -13,7 +13,7 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "sycl/sycl.hpp" // sycl::item diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp index de6358ec8..5e5803652 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp @@ -13,7 +13,7 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -60,8 +60,8 @@ class device_kernel_symm { */ 
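For illustration only, and not part of the patch itself: a minimal standalone sketch of the pattern PATCH 002 introduces, where the runtime target_platform is lifted into a compile-time template parameter via a switch and the kernel body then picks the feature-reduction loop order with if constexpr, using the block_dim-outermost ordering on AMD GPUs and the register-accumulating, block_dim-innermost ordering everywhere else. TB, IB, reduce_tile, and dispatch are illustrative stand-ins, a plain product replaces detail::feature_reduce, and std::runtime_error stands in for backend_exception.

#include <stdexcept>

enum class target_platform { automatic, gpu_nvidia, gpu_amd, gpu_intel, cpu };

constexpr unsigned TB = 8;  // illustrative stand-in for THREAD_BLOCK_SIZE
constexpr unsigned IB = 4;  // illustrative stand-in for INTERNAL_BLOCK_SIZE

template <target_platform target>
void reduce_tile(double (&temp)[IB][IB], const double (&cache_i)[TB][IB], const double (&cache_j)[TB][IB]) {
    if constexpr (target == target_platform::gpu_amd) {
        // AMD GPUs: block_dim is the slowest moving index
        for (unsigned block_dim = 0; block_dim < TB; ++block_dim) {
            for (unsigned internal_i = 0; internal_i < IB; ++internal_i) {
                for (unsigned internal_j = 0; internal_j < IB; ++internal_j) {
                    temp[internal_i][internal_j] += cache_i[block_dim][internal_i] * cache_j[block_dim][internal_j];
                }
            }
        }
    } else {
        // all other targets: block_dim is the fastest moving index, accumulated in a register
        for (unsigned internal_i = 0; internal_i < IB; ++internal_i) {
            for (unsigned internal_j = 0; internal_j < IB; ++internal_j) {
                double sum{ 0.0 };
                for (unsigned block_dim = 0; block_dim < TB; ++block_dim) {
                    sum += cache_i[block_dim][internal_i] * cache_j[block_dim][internal_j];
                }
                temp[internal_i][internal_j] += sum;
            }
        }
    }
}

// runtime target -> compile-time template parameter, mirroring the new dispatch helper
void dispatch(const target_platform target, double (&temp)[IB][IB], const double (&cache_i)[TB][IB], const double (&cache_j)[TB][IB]) {
    switch (target) {
        case target_platform::automatic:
            throw std::runtime_error{ "Can't determine the target platform!" };
        case target_platform::gpu_nvidia:
            return reduce_tile<target_platform::gpu_nvidia>(temp, cache_i, cache_j);
        case target_platform::gpu_amd:
            return reduce_tile<target_platform::gpu_amd>(temp, cache_i, cache_j);
        case target_platform::gpu_intel:
            return reduce_tile<target_platform::gpu_intel>(temp, cache_i, cache_j);
        case target_platform::cpu:
            return reduce_tile<target_platform::cpu>(temp, cache_i, cache_j);
    }
}

Lifting the target into a template parameter keeps the branch out of the hot loop at runtime; the cost is one additional kernel instantiation per target platform.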
void operator()(::sycl::group<2> group) const { // allocate shared memory - real_type A_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type A_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // calculate the indices used in the current work-item ::sycl::private_memory i{ group }; @@ -98,7 +98,7 @@ class device_kernel_symm { }); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); @@ -119,15 +119,8 @@ class device_kernel_symm { } else { A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } }); @@ -138,7 +131,7 @@ class device_kernel_symm { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; @@ -227,8 +220,8 @@ class device_kernel_symm_mirror { */ void operator()(::sycl::group<2> group) const { // allocate shared memory - real_type A_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type A_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // calculate the indices used in the current work-item ::sycl::private_memory i{ group }; @@ -264,7 +257,7 @@ class 
device_kernel_symm_mirror { }); // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into shared memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); @@ -279,12 +272,9 @@ class device_kernel_symm_mirror { const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } }); @@ -295,7 +285,7 @@ class device_kernel_symm_mirror { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp index 9e8500d73..2e6983255 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp @@ -13,7 +13,7 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -62,8 +62,8 @@ class device_kernel_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + 
::sycl::require_local_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), @@ -88,7 +88,7 @@ class device_kernel_symm { j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; }); - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into shared memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); @@ -109,15 +109,8 @@ class device_kernel_symm { } else { A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { - A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } }); @@ -126,7 +119,7 @@ class device_kernel_symm { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; @@ -215,8 +208,8 @@ class device_kernel_symm_mirror { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), @@ -241,7 +234,7 @@ class device_kernel_symm_mirror { j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; }); - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into shared memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); @@ -256,12 +249,9 @@ class device_kernel_symm_mirror { const auto global_i = i_linear(idx) + 
static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } }); @@ -270,7 +260,7 @@ class device_kernel_symm_mirror { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp index ae07f7ec6..965b043a3 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp @@ -13,7 +13,7 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -43,8 +43,8 @@ class device_kernel_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ device_kernel_symm(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - A_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - B_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + A_cache_{ ::sycl::range<2>{ 
static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + B_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, device_specific_num_rows_{ device_specific_num_rows }, @@ -75,7 +75,6 @@ class device_kernel_symm { const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item @@ -88,7 +87,7 @@ class device_kernel_symm { real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; @@ -100,20 +99,13 @@ class device_kernel_symm { } else { A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; @@ -183,8 +175,8 @@ class device_kernel_symm_mirror { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ 
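For illustration only, a minimal sketch in plain host C++ (not the SYCL kernel itself) of the blocked update these blas.hpp hunks converge on after shrinking the local caches from FEATURE_BLOCK_SIZE (= 2 * THREAD_BLOCK_SIZE) rows to THREAD_BLOCK_SIZE rows: the dim loop now advances by THREAD_BLOCK_SIZE, each work-item loads exactly one A and one B value per internal block between barriers, and the former "second half" loads and accumulations disappear. TB, IB, and blocked_update are illustrative stand-ins.

constexpr unsigned TB = 8;  // illustrative stand-in for THREAD_BLOCK_SIZE
constexpr unsigned IB = 4;  // illustrative stand-in for INTERNAL_BLOCK_SIZE

// one dim-block of the C += alpha * A * B accumulation performed between two barriers;
// the caches hold TB rows, so block_dim runs over [0, TB) instead of [0, 2 * TB)
void blocked_update(double (&temp)[IB][IB],
                    const double (&A_cache)[TB][IB * TB],
                    const double (&B_cache)[TB][IB * TB],
                    const unsigned local_id_0, const unsigned local_id_1) {
    for (unsigned block_dim = 0; block_dim < TB; ++block_dim) {
        for (unsigned internal_i = 0; internal_i < IB; ++internal_i) {
            for (unsigned internal_j = 0; internal_j < IB; ++internal_j) {
                temp[internal_i][internal_j] += A_cache[block_dim][local_id_0 * IB + internal_j]
                                              * B_cache[block_dim][local_id_1 * IB + internal_i];
            }
        }
    }
}

The surrounding driver loop keeps its structure: load one TB-deep slice of A and B into the caches, barrier, perform an update like the one above, barrier, and advance dim by TB until num_rows_ - row_offset_ (or device_specific_num_rows_ in the mirror kernel) is exhausted.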
device_kernel_symm_mirror(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - A_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - B_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + A_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + B_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, @@ -216,7 +208,6 @@ class device_kernel_symm_mirror { const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item @@ -229,23 +220,20 @@ class device_kernel_symm_mirror { real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } nd_idx.barrier(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; 
++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp index 7b517a7b1..9c82ad31d 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::item diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp index 1a24024b6..342e8308b 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -80,6 +80,10 @@ class device_kernel_assembly_symm { ::sycl::private_memory j{ group }; ::sycl::private_memory j_linear{ group }; + // create the shared memory arrays used for caching data point features + real_type data_cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + ::sycl::private_memory temp{ group }; // initialize private and local variables @@ -112,12 +116,12 @@ class device_kernel_assembly_symm { // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { { - // allocate shared memory - real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_cache_i = reinterpret_cast(data_cache_one); + auto data_cache_j = 
reinterpret_cast(data_cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); @@ -132,11 +136,9 @@ class device_kernel_assembly_symm { const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; } }); @@ -147,7 +149,7 @@ class device_kernel_assembly_symm { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], @@ -189,12 +191,12 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the UPPER triangular matrix { - // allocate shared memory - real_type B_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; - real_type C_out_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; + // reinterpret the arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(data_cache_one); + auto C_out_cache = reinterpret_cast(data_cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); @@ -208,26 +210,24 @@ class device_kernel_assembly_symm { for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory 
B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = real_type{ 0.0 }; - C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; } }); // implicit group barrier // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += - temp(idx)[internal_i][internal_j] * B_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + local_id_1) % THREAD_BLOCK_SIZE] += + temp(idx)[internal_i][internal_j] * B_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + local_id_1) % THREAD_BLOCK_SIZE]; } } }); @@ -242,13 +242,11 @@ class device_kernel_assembly_symm { const std::size_t threadIdx_y = idx.get_local_id(1); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j = row_offset_ + j(idx) + static_cast(internal); detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1]; - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1 + THREAD_BLOCK_SIZE]; } }); @@ -274,12 +272,12 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the LOWER triangular matrix { - // allocate shared memory - real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type C_out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(data_cache_one); + auto C_out_cache = reinterpret_cast(data_cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); @@ -293,26 +291,24 @@ class device_kernel_assembly_symm { for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j = row_offset_ + j_linear(idx) + 
static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - C_out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); // implicit group barrier // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_i][internal_j] * B_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_i][internal_j] * B_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; } } }); @@ -327,13 +323,11 @@ class device_kernel_assembly_symm { const std::size_t threadIdx_x = idx.get_local_id(0); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = row_offset_ + i(idx) + static_cast(internal); detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp index 4391f2f19..c84db480f 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // 
sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -77,8 +77,8 @@ class device_kernel_assembly_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), @@ -106,7 +106,7 @@ class device_kernel_assembly_symm { // exploit symmetry if (group[1] >= group[0]) { // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); @@ -121,11 +121,9 @@ class device_kernel_assembly_symm { const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_i[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; data_cache_j[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; } }); @@ -134,7 +132,7 @@ class device_kernel_assembly_symm { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], @@ -172,11 +170,11 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the UPPER triangular matrix { // rename cached arrays - auto &B_cache = data_cache_i; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + auto &B_cache = data_cache_i; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // 
iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); @@ -190,24 +188,22 @@ class device_kernel_assembly_symm { for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the local memory + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; } }); // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += - temp(idx)[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE] += + temp(idx)[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE]; } } }); @@ -220,13 +216,11 @@ class device_kernel_assembly_symm { const std::size_t threadIdx_y = idx.get_local_id(group, 1); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j = row_offset_ + j(idx) + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + 
dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1 + THREAD_BLOCK_SIZE]; + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; } }); } @@ -249,11 +243,11 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the LOWER triangular matrix { // allocate shared memory - auto &B_cache = data_cache_i; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &B_cache = data_cache_i; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); @@ -267,26 +261,24 @@ class device_kernel_assembly_symm { for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); // implicit group barrier // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] 
+= + temp(idx)[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; } } }); @@ -301,13 +293,11 @@ class device_kernel_assembly_symm { const std::size_t threadIdx_x = idx.get_local_id(group, 0); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = row_offset_ + i(idx) + static_cast(internal); detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp index 34b55fff4..2e6ea3f4f 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -54,8 +54,8 @@ class device_kernel_assembly_symm { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ device_kernel_assembly_symm(::sycl::handler &cgh, const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : - data_cache_i_{ ::sycl::range<1>{ static_cast(FEATURE_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - data_cache_j_{ ::sycl::range<1>{ static_cast(FEATURE_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + data_cache_i_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + data_cache_j_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] alpha_{ alpha }, q_{ q }, data_d_{ data_d }, @@ -90,7 +90,6 @@ class device_kernel_assembly_symm { const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item @@ -106,22 +105,20 @@ class device_kernel_assembly_symm { { // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_i_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i_[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; data_cache_j_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j_[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { 
temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i_[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], @@ -158,29 +155,27 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the UPPER triangular matrix { // rename cached arrays - auto &B_cache = data_cache_i_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + auto &B_cache = data_cache_i_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const std::size_t global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the local memory + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE] += + temp[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations @@ -189,8 +184,7 @@ class device_kernel_assembly_symm { // add intermediate cached results to C for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j 
= row_offset_ + j + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1 + THREAD_BLOCK_SIZE]; + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; } nd_idx.barrier(); // wait until all work-items updated C with their values } @@ -211,29 +205,27 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the LOWER triangular matrix { // rename cached arrays - auto &B_cache = data_cache_i_; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &B_cache = data_cache_i_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += - temp[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp[internal_i][internal_j] * 
B_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations @@ -243,7 +235,6 @@ class device_kernel_assembly_symm { for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = row_offset_ + i + static_cast(internal); detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; } nd_idx.barrier(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp index c16965cb1..631bf80a1 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::item diff --git a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp index 4098c4914..dedfe609e 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -202,8 +202,8 @@ class device_kernel_predict_linear { */ void operator()(::sycl::group<2> group) const { // allocate shared memory - real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_pp[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_w[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // calculate the indices used in the current work-item ::sycl::private_memory pp_idx{ group }; @@ -241,7 +241,7 @@ class device_kernel_predict_linear { // implicit group barrier // iterate over all 
support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); @@ -256,11 +256,9 @@ class device_kernel_predict_linear { const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - data_cache_w[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; } }); @@ -271,7 +269,7 @@ class device_kernel_predict_linear { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; @@ -356,8 +354,8 @@ class device_kernel_predict { */ void operator()(::sycl::group<2> group) const { // allocate shared memory - real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_pp[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_sv[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // calculate the indices used in the current work-item ::sycl::private_memory pp_idx{ group }; @@ -393,7 +391,7 @@ class device_kernel_predict { { // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); @@ -408,11 +406,9 @@ class device_kernel_predict { const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * 
THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; } }); @@ -423,7 +419,7 @@ class device_kernel_predict { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], @@ -454,7 +450,7 @@ class device_kernel_predict { // auto &out_cache = data_cache_sv; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); @@ -470,15 +466,12 @@ class device_kernel_predict { const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; } else { data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } }); @@ -486,15 +479,15 @@ class device_kernel_predict { // implicit group barrier // calculate intermediate results and store them in local memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); for 
(unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - data_cache_sv[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * data_cache_pp[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + data_cache_sv[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pd][internal_sv] * data_cache_pp[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); @@ -509,14 +502,12 @@ class device_kernel_predict { const std::size_t threadIdx_x = idx.get_local_id(0); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_pp_idx = pp_idx(idx) + static_cast(internal); detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); diff --git a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp index 1a42161f5..e6d56ec56 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -191,8 +191,8 @@ class device_kernel_predict_linear { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), @@ -218,7 +218,7 @@ class device_kernel_predict_linear { }); // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); @@ -233,11 +233,9 @@ class device_kernel_predict_linear { const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; 
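// global index of the class whose w value is loaded by this work-item in the current internal iteration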
const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - data_cache_w[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; } }); @@ -246,7 +244,7 @@ class device_kernel_predict_linear { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; @@ -332,8 +330,8 @@ class device_kernel_predict { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), @@ -356,7 +354,7 @@ class device_kernel_predict { }); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); @@ -371,11 +369,9 @@ class device_kernel_predict { const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + 
local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; } }); @@ -384,7 +380,7 @@ class device_kernel_predict { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], @@ -410,7 +406,7 @@ class device_kernel_predict { auto &out_cache = data_cache_sv; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); @@ -426,29 +422,26 @@ class device_kernel_predict { const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - alpha_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } }); // calculate intermediate results and store them in local memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); diff --git a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp 
b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp index d451ac7d5..6612a10d8 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -159,8 +159,8 @@ class device_kernel_predict_linear { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ device_kernel_predict_linear(::sycl::handler &cgh, real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_w_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_cache_pp_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_cache_w_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, prediction_d_{ prediction_d }, w_d_{ w_d }, rho_d_{ rho_d }, @@ -189,7 +189,6 @@ class device_kernel_predict_linear { const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item @@ -202,22 +201,20 @@ class device_kernel_predict_linear { real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + 
global_pp_idx]; - data_cache_pp_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_w_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - data_cache_w_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { temp[internal_pd][internal_class] += data_cache_w_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; @@ -283,8 +280,8 @@ class device_kernel_predict { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ device_kernel_predict(::sycl::handler &cgh, real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_sv_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_cache_pp_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_cache_sv_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, prediction_d_{ prediction_d }, alpha_d_{ alpha_d }, rho_d_{ rho_d }, @@ -316,7 +313,6 @@ class device_kernel_predict { const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item @@ -329,22 +325,20 @@ class device_kernel_predict { { // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // 
FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_sv_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_sv_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], @@ -369,31 +363,28 @@ class device_kernel_predict { auto &out_cache = data_cache_sv_; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const std::size_t global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - alpha_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } nd_idx.barrier(); // wait until all work-items loaded their part of the data // calculate intermediate results and store them in local memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * 
INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += + temp[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations @@ -404,7 +395,6 @@ class device_kernel_predict { const auto global_pp_idx = pp_idx + static_cast(internal); detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; } nd_idx.barrier(); // wait until all work-items updated their part of the prediction } From c74aca83b21f21ed12fa6257ca347d35f41997f2 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 28 May 2025 13:48:21 +0200 Subject: [PATCH 004/215] Update CUDA implementation and update comments. --- .../backends/CUDA/kernel/cg_explicit/blas.cuh | 295 +++++++------- .../cg_explicit/kernel_matrix_assembly.cuh | 44 +-- .../kernel_matrix_assembly_blas.cuh | 179 +++++---- .../backends/CUDA/kernel/kernel_functions.cuh | 34 +- .../backends/CUDA/kernel/predict_kernel.cuh | 374 +++++++++--------- 5 files changed, 471 insertions(+), 455 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh index 2f7b37a0f..1a6be4ae8 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh @@ -13,7 +13,7 @@ #define PLSSVM_BACKENDS_CUDA_KERNEL_CG_EXPLICIT_BLAS_CUH_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} namespace plssvm::cuda::detail { @@ -22,8 +22,8 @@ namespace plssvm::cuda::detail { * @details In a multi-GPU setting, this function is only responsible for the rows this device is responsible for! 
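* @note @p A stores only the upper triangular part of the kernel matrix in packed row-major form (see the "upper triangular matrix only" comments below). A minimal host-side sketch of the index calculation, assuming a hypothetical helper name and a padded column count:
* @code
* #include <cstddef>
*
* // linear index of element (row, col) with row <= col in a packed, row-major upper triangular matrix
* // num_cols is the number of (padded) columns per row
* constexpr std::size_t packed_upper_index(const std::size_t row, const std::size_t col, const std::size_t num_cols) {
*     return row * num_cols + col - row * (row + std::size_t{ 1 }) / std::size_t{ 2 };
* }
* @endcode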
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; the rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -32,78 +32,77 @@ namespace plssvm::cuda::detail { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> device_specific_num_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto 
INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows - row_offset); dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // determine on which side of the diagonal we are located - if (dim + threadIdx_y < global_j) { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y - global_j * (global_j + 1ull) / 2ull]; - } - // determine on which side of the diagonal we are located - if (dim + threadIdx.y + THREAD_BLOCK_SIZE < global_j) { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - global_j * (global_j + 1ull) / 2ull]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < (num_rows - device_row_offset); dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory 
accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + // determine on which side of the diagonal we are located + if (dim + threadIdx_y < global_j_idx_linear) { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim + threadIdx_y) * (dim + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && device_global_j < device_specific_num_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i] = alpha * 
temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -115,8 +114,8 @@ __global__ void device_kernel_symm(const unsi * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; the rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -125,68 +124,72 @@ __global__ void device_kernel_symm(const unsi * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm_mirror(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long num_mirror_rows, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs - const auto i_linear = blockIdx_x 
* blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < device_specific_num_rows; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y - 1ull) * (dim + threadIdx_y) / 2ull + device_specific_num_rows - (dim + threadIdx_y) + global_j]; - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - 1ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) / 2ull + device_specific_num_rows - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) + global_j]; - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - 
B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < device_num_rows; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim + threadIdx_y - std::size_t{ 1 }) * (dim + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + device_specific_num_rows + j + 
static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && partial_global_j < num_mirror_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -200,27 +203,29 @@ __global__ void device_kernel_symm_mirror(const unsigned long long num_rows, con * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_add(const unsigned long long num_cols, real_type *lhs, const real_type *rhs, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * 
INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] += rhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j]; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] += rhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -233,27 +238,29 @@ __global__ void device_kernel_inplace_matrix_add(const unsigned long long num_co * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_scale(const unsigned long long num_cols, real_type *lhs, const real_type scale, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // 
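// Every kernel in this patch receives grid_x_offset / grid_y_offset and adds them onto blockIdx because a
// single launch may not cover the whole logical 2D grid (e.g. when it would exceed the per-launch grid-dimension
// limit), so the host splits the grid and launches the kernel several times. A minimal host-side sketch of how
// such a chunk could be described; the struct, the function, and max_blocks_per_launch are illustrative
// assumptions, not the library's actual launch code:
#include <algorithm>  // std::min
#include <cstddef>    // std::size_t

struct grid_chunk {
    std::size_t grid_x_offset;  // passed to the kernel and added onto blockIdx.x
    std::size_t grid_y_offset;  // passed to the kernel and added onto blockIdx.y
    std::size_t num_blocks_x;   // grid size in x-dimension for this launch
    std::size_t num_blocks_y;   // grid size in y-dimension for this launch
};

inline grid_chunk make_grid_chunk(const std::size_t total_blocks_x, const std::size_t total_blocks_y,
                                  const std::size_t start_x, const std::size_t start_y,
                                  const std::size_t max_blocks_per_launch) {
    // each launch covers at most max_blocks_per_launch blocks per dimension, starting at (start_x, start_y)
    return grid_chunk{ start_x, start_y,
                       std::min(total_blocks_x - start_x, max_blocks_per_launch),
                       std::min(total_blocks_y - start_y, max_blocks_per_launch) };
}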
calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] *= scale; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] *= scale; // SoA } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh index 2a3eef5c4..e4a3fa22d 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh @@ -52,7 +52,7 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large - // create two shared memory arrays used for caching data point features + // create two shared memory arrays used for caching __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; @@ -62,21 +62,21 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // calculate the indices used in the current thread paying attention to coalesced memory accesses - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // iterate over all features using blocking to be able to cache them for faster memory accesses for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data points, pays attention to coalesced memory accesses - const auto global_i_linear = device_row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j_linear = device_row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to 
coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; - data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data @@ -94,29 +94,29 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type } // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - device_row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
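// The epilogue finishing here is the dimensional reduction: each kernel matrix entry becomes
// k(x_i, x_j) + QA_cost - q[i] - q[j], and the cost factor is added once on the main diagonal. A scalar
// host-side sketch of that step, assuming kernel_value already holds the applied kernel function
// (the helper name and the use of double are illustrative):
#include <cstddef>  // std::size_t
#include <vector>   // std::vector

inline double reduced_kernel_matrix_entry(const double kernel_value, const std::vector<double> &q,
                                          const double QA_cost, const double cost,
                                          const std::size_t i, const std::size_t j) {
    double entry = kernel_value + QA_cost - q[i] - q[j];  // dimensional reduction
    if (i == j) {
        entry += cost;  // the cost is only applied to the main diagonal
    }
    return entry;
}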
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } // update the upper triangular kernel matrix - kernel_matrix[device_global_j * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index 62f24d6bf..8e8dd03c2 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -15,7 +15,7 @@ #include "plssvm/backends/CUDA/kernel/detail/atomics.cuh" // atomicAdd for double precision floating point numbers on older CUDA hardware #include "plssvm/backends/CUDA/kernel/kernel_functions.cuh" // plssvm::cuda::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type namespace plssvm::cuda::detail { @@ -26,10 +26,10 @@ namespace plssvm::cuda::detail { * @tparam Args the types of the parameters necessary for the specific kernel function * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -41,56 +41,64 @@ namespace plssvm::cuda::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const unsigned long long num_classes, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... 
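// The explicit kernel matrix written above is stored as a padded, row-wise packed upper triangle: row j
// (of length n - j plus padding, with n = num_rows - device_row_offset) starts directly after the previous
// rows, which yields the linear index j * (n + PADDING_SIZE) - j * (j + 1) / 2 + i for an entry with
// column i >= row j. A small host-side sketch of that index computation (helper name and parameters are
// illustrative):
#include <cstddef>  // std::size_t

constexpr std::size_t packed_upper_triangular_index(const std::size_t row_j, const std::size_t col_i,
                                                    const std::size_t n, const std::size_t padding) {
    // assumes col_i >= row_j, i.e. an entry in the upper triangular part
    return row_j * (n + padding) - row_j * (row_j + std::size_t{ 1 }) / std::size_t{ 2 } + col_i;
}
// e.g. for n = 4 and padding = 0: (0,0) -> 0, (0,3) -> 3, (1,1) -> 4, (2,3) -> 8, (3,3) -> 9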
kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * 
THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_i_cache = reinterpret_cast(cache_one); + auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j 
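// The two flat shared memory buffers cache_one / cache_two declared above are reinterpreted with a different
// 2D shape in each phase of this kernel (data caching here, the BLAS epilogue below), so the same shared
// memory is reused instead of allocating separate arrays per phase. A minimal sketch of the idea, assuming a
// pointer-to-array cast target (the exact cast type is not reproduced from the patch):
__device__ void reuse_flat_shared_buffer_sketch() {
    constexpr unsigned TB = 8;  // stands in for THREAD_BLOCK_SIZE
    constexpr unsigned IB = 4;  // stands in for INTERNAL_BLOCK_SIZE
    __shared__ float cache[TB * IB * TB];  // one flat allocation

    // phase 1: view the buffer as [TB][IB * TB]
    auto phase_one_view = reinterpret_cast<float (*)[IB * TB]>(cache);
    phase_one_view[threadIdx.y][threadIdx.x] = 0.0f;
    __syncthreads();  // only switch views after all threads are done with the previous one

    // phase 2: view the very same buffer as [IB * TB][TB]
    auto phase_two_view = reinterpret_cast<float (*)[TB]>(cache);
    phase_two_view[threadIdx.x][threadIdx.y] = 0.0f;
}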
= 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -101,16 +109,18 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if ((device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j)) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out of bounds accesses (only using the upper triangular matrix) + if ((device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx)) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost; } } else { @@ -120,42 +130,44 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; - __shared__ real_type C_out_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the shared memory + B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE]; + C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE] += + temp[internal_i][internal_j] * 
B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j + static_cast(internal); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x + THREAD_BLOCK_SIZE]); + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal); + + atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA } __syncthreads(); // wai until all threads updated C with their values } @@ -164,51 +176,54 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - if (global_i == global_j) { + // update the diagonal + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type C_out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the 
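// Only the upper triangular block of the implicit kernel matrix is held in temp, so C += alpha * K * B is
// applied in two passes: once for the upper triangle (above) and once for the mirrored lower triangle
// (below), with the diagonal zeroed in between so it does not contribute twice. A scalar host-side
// reference of the same idea for a symmetric matrix stored as its upper triangle (dense, unpadded,
// illustrative only):
#include <cstddef>  // std::size_t
#include <vector>   // std::vector

inline void symmetric_matvec_from_upper_triangle(const std::vector<std::vector<double>> &K_upper,  // K_upper[i][j] valid for j >= i
                                                 const std::vector<double> &b, std::vector<double> &c,
                                                 const double alpha, const std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) {
        for (std::size_t j = i; j < n; ++j) {
            c[i] += alpha * K_upper[i][j] * b[j];      // upper triangular pass (diagonal included once)
            if (i != j) {
                c[j] += alpha * K_upper[i][j] * b[i];  // mirrored lower triangular pass, diagonal skipped
            }
        }
    }
}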
shared memory - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + // store the values in the shared memory + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y]; // SoA C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i + static_cast(internal); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal); + + atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); // SoA } __syncthreads(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh index 8003a51a3..72a4499ae 100644 --- a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh +++ b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh @@ -57,36 +57,12 @@ template <> * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] __device__ __forceinline__ real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - 
const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 204d6bd97..5469b01d9 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -15,166 +15,178 @@ #include "plssvm/backends/CUDA/kernel/detail/atomics.cuh" // atomicAdd for double precision floating point numbers on older CUDA hardware #include "plssvm/backends/CUDA/kernel/kernel_functions.cuh" // plssvm::cuda::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type namespace plssvm::cuda::detail { /** * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. - * @param[out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] sv the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long device_specific_num_sv, const unsigned long long sv_offset, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in 
grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (unsigned long long sv = 0; sv < device_specific_num_sv; sv += THREAD_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_features + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes - data_cache_feature[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[global_feature_idx * 
(device_specific_num_sv + PADDING_SIZE_ull) + sv + threadIdx_y]; // SoA - data_cache_alpha[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[global_class_idx * (num_sv + PADDING_SIZE_ull) + sv + sv_offset + threadIdx_y]; // AoS - } - __syncthreads(); // wait until all threads loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv = 0; sv < device_num_sv; sv += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv + threadIdx_y]; // SoA + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv + sv_offset + threadIdx_y]; // AoS + } + __syncthreads(); // wait until all threads loaded their part of the data - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + // perform the dot product calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_feature_idx = feature_idx + static_cast(internal_feature); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - 
w_d[global_feature_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_feature][internal_class]; + w[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type 
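// For the linear kernel the prediction is split into two kernels: device_kernel_w_linear first forms
// w[feature][class] as the sum over the support vectors of alpha[class][sv] * sv[feature][sv], and
// device_kernel_predict_linear then computes prediction[point][class] = dot(point, w[:, class]) - rho[class].
// A scalar host-side reference of both steps for a single point (dense, unpadded, illustrative only):
#include <cstddef>  // std::size_t
#include <vector>   // std::vector

inline std::vector<double> predict_linear_reference(const std::vector<std::vector<double>> &support_vectors,  // [num_sv][num_features]
                                                    const std::vector<std::vector<double>> &alpha,            // [num_classes][num_sv]
                                                    const std::vector<double> &rho,                           // [num_classes]
                                                    const std::vector<double> &point) {                       // [num_features]
    const std::size_t num_classes = alpha.size();
    const std::size_t num_sv = support_vectors.size();
    const std::size_t num_features = point.size();
    std::vector<double> prediction(num_classes, 0.0);
    for (std::size_t c = 0; c < num_classes; ++c) {
        for (std::size_t f = 0; f < num_features; ++f) {
            double w_fc = 0.0;  // the w-vector entry for (feature f, class c)
            for (std::size_t s = 0; s < num_sv; ++s) {
                w_fc += alpha[c][s] * support_vectors[s][f];
            }
            prediction[c] += point[f] * w_fc;
        }
        prediction[c] -= rho[c];  // subtract the previously learned bias
    }
    return prediction;
}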
*predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_w[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - data_cache_w[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - } - __syncthreads(); // wait until all threads loaded their part of the data + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the 
global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]; + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_pd][internal_class] - rho_d[global_class_idx]; + prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho[global_class_idx]; // AoS } } } /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. 
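Conceptually, this kernel produces one value per (predict point, class) pair. The host-side sketch below (hypothetical names, double instead of real_type, no padding or blocking; kernel_function stands in for the feature_reduce accumulation followed by apply_kernel_function) spells out that value and can serve as a scalar reference when checking the blocked device code:

#include <cstddef>
#include <vector>

// Scalar reference: prediction(p, c) = -rho[c] + sum_sv alpha[c][sv] * k(sv, p)
template <typename KernelFn>
std::vector<double> predict_reference(const std::vector<std::vector<double>> &predict_points,
                                      const std::vector<std::vector<double>> &support_vectors,
                                      const std::vector<std::vector<double>> &alpha,  // [class][sv]
                                      const std::vector<double> &rho,                 // [class]
                                      KernelFn kernel_function) {
    const std::size_t num_classes = rho.size();
    std::vector<double> prediction(predict_points.size() * num_classes);
    for (std::size_t p = 0; p < predict_points.size(); ++p) {
        for (std::size_t c = 0; c < num_classes; ++c) {
            double sum = -rho[c];  // the bias is applied exactly once per (point, class) pair
            for (std::size_t sv = 0; sv < support_vectors.size(); ++sv) {
                sum += alpha[c][sv] * kernel_function(support_vectors[sv], predict_points[p]);
            }
            prediction[p * num_classes + c] = sum;  // AoS result layout, as in the device kernel
        }
    }
    return prediction;
}
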
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] sv the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -184,53 +196,55 @@ __global__ void device_kernel_predict_linear(real_type *prediction_d, const real * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto pp_cache = reinterpret_cast(cache_one); + auto sv_cache = reinterpret_cast(cache_two); + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_sv[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - data_cache_sv[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; + // calculate the indices to 
access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]); + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); } } } @@ -239,57 +253,61 @@ __global__ void device_kernel_predict(real_type *prediction_d, const real_type * } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter...); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); } } { - // same shared memory size but with different dimensions - __shared__ real_type alpha_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const unsigned long long global_sv_idx = sv_idx_linear + internal * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - alpha_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - - // the bias (rho) must only be applied once for all support vectors - if (blockIdx_y == 0ull) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y]; - 
out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; - } else { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto alpha_cache = reinterpret_cast(cache_one); + auto out_cache = reinterpret_cast(cache_two); + + { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const std::size_t global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + + // store the values in the shared memory + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS + // the bias (rho) must only be applied once for all support vectors + if (blockIdx_y == std::size_t{ 0 }) { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[dim + threadIdx_y]; + } else { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; + } } - } - __syncthreads(); // wait until all threads loaded their part of the data - - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; + __syncthreads(); // wait until all threads loaded their part of the data + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; + } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations - } - // add intermediate cached results to prediction_d - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - 
const auto global_pp_idx = pp_idx + static_cast(internal); + // atomically add the intermediate cached results to the prediction + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + } + __syncthreads(); // wait until all threads updated their part of the prediction } - __syncthreads(); // wait until all threads updated their part of the prediction } } } From dbc00aed81991c4ff140be152209954e458d4994 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 28 May 2025 22:23:13 +0200 Subject: [PATCH 005/215] Do not use std::vector directly for the kernel matrix since it sequentially initializes all values to zero. Instead, use a std::unique_ptr together with a C++17 conformant make_unique_for_overwrite implementation followed by an OpenMP parallel zero initialization of all values drastically reducing the overhead. --- .../OpenMP/kernel/cg_explicit/blas.hpp | 6 +- .../cg_explicit/kernel_matrix_assembly.hpp | 3 +- .../detail/make_unique_for_overwrite.hpp | 101 ++++++++++++++++++ src/plssvm/backends/OpenMP/csvm.cpp | 35 ++++-- 4 files changed, 129 insertions(+), 16 deletions(-) create mode 100644 include/plssvm/detail/make_unique_for_overwrite.hpp diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index e1041024a..ff7fc6f36 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -37,9 +37,8 @@ namespace plssvm::openmp::detail { * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); @@ -119,9 +118,8 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t 
num * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index 9403b12a1..9571513b9 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -40,9 +40,8 @@ namespace plssvm::openmp::detail { * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function */ template -void device_kernel_assembly(std::vector &kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { +void device_kernel_assembly(real_type*kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(!kernel_matrix.empty(), "A matrix may not be empty!"); PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp new file mode 100644 index 000000000..51b56e126 --- /dev/null +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -0,0 +1,101 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. 
+ * See the LICENSE.md file in the project root for full license information. + * + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. + * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + */ + +#ifndef PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ +#define PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ + +#include // std::size_t +#include // std::unique_ptr +#include // std::false_type, std::true_type, std::enable_if_t, std::is_array_v + +namespace plssvm::detail { + +/** + * @brief Helper struct to check whether @p T is an unbounded array. + * @tparam T the array type + */ +template +struct is_unbounded_array : std::false_type { }; + +/** + * @brief Specialization of @ref is_unbounded_array for unbounded arrays. + * @tparam T the array type + */ +template +struct is_unbounded_array : std::true_type { }; + +/** + * @brief Shortcut for @ref is_unbounded_array::value. + * @tparam T the array type + */ +template +constexpr bool is_unbounded_array_v = is_unbounded_array::value; + +/** + * @brief Helper struct to check whether @p T is a bounded array. + * @tparam T the array type + */ +template +struct is_bounded_array : std::false_type { }; + +/** + * @brief Specialization of @ref is_bounded_array for unbounded arrays. + * @tparam T the array type + * @tparam N the size of the array + */ +template +struct is_bounded_array : std::true_type { }; + +/** + * @brief Shortcut for @ref is_unbounded_array::value. + * @tparam T the array type + */ +template +constexpr bool is_bounded_array_v = is_bounded_array::value; + +/** + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. + * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + * @tparam T the type of the object to create + * @return a unique pointer to the newly created object (`[[nodiscard]]`) + */ +template , bool> = true> +[[nodiscard]] std::unique_ptr make_unique_for_overwrite() { + return std::unique_ptr(new T); +} + +/** + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. + * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + * @tparam T the type of the objects to create + * @param[in] n the size of the array to create + * @return a unique pointer to the newly created object (`[[nodiscard]]`) + */ +template , bool> = true> +std::unique_ptr make_unique_for_overwrite(const std::size_t n) { + return std::unique_ptr(new std::remove_extent_t[n]); +} + +/** + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. 
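The point of the helper is visible in a few lines. A standalone sketch (assuming double as real_type; the parallel loop mirrors the zero-initialization added to the OpenMP csvm below):

#include <cstddef>
#include <memory>

int main() {
    const std::size_t n = 1'000'000;

    // std::vector<double>(n) and std::make_unique<double[]>(n) both value-initialize,
    // i.e. they zero all n elements sequentially on a single thread
    const auto zeroed = std::make_unique<double[]>(n);  // new double[n]() -> all 0.0

    // make_unique_for_overwrite boils down to new double[n] (default-initialization),
    // which leaves the elements indeterminate -- no sequential zeroing pass
    const std::unique_ptr<double[]> raw{ new double[n] };

    // the zeroing can then be performed in parallel (and, on NUMA systems, with
    // first-touch placement matching the threads that later access the data)
#if defined(_OPENMP)
    #pragma omp parallel for
#endif
    for (std::size_t i = 0; i < n; ++i) {
        raw[i] = 0.0;
    }

    return static_cast<int>(zeroed[0] + raw[0]);  // keep both arrays observable
}
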
+ * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + * @tparam T the type of the object to create + * @tparam Args the types of the constructor arguments + * @param[in] args the arguments to pass to the constructor + * @return a unique pointer to the newly created object (`[[nodiscard]]`) + */ +template , bool> = true> +auto make_unique_for_overwrite(Args &&...args) = delete; + +} // namespace plssvm::detail + +#endif // PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ diff --git a/src/plssvm/backends/OpenMP/csvm.cpp b/src/plssvm/backends/OpenMP/csvm.cpp index 7a7c17ef2..656d966f3 100644 --- a/src/plssvm/backends/OpenMP/csvm.cpp +++ b/src/plssvm/backends/OpenMP/csvm.cpp @@ -19,6 +19,7 @@ #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::make_unique_for_overwrite #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/move_only_any.hpp" // plssvm::detail::{move_only_any, move_only_any_cast} #include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY @@ -125,26 +126,40 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // get the offset of the data points this device is responsible for const std::size_t row_offset = dist.place_row_offset(0); - std::vector kernel_matrix(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); // only explicitly store the upper triangular matrix + // get the number of kernel matrix entries + const std::size_t num_entries = dist.calculate_explicit_kernel_matrix_num_entries_padded(0); + + // only explicitly store the upper triangular matrix + auto kernel_matrix = ::plssvm::detail::make_unique_for_overwrite(num_entries); + // initialize kernel matrix to all zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset +#if defined(_OPENMP) + #pragma omp parallel for + for (std::size_t i = 0; i < num_entries; ++i) { + kernel_matrix[i] = real_type{ 0.0 }; + } +#else + std::memset(kernel_matrix.get(), 0, num_entries * sizeof(real_type)); +#endif + const auto start = std::chrono::steady_clock::now(); switch (params.kernel_type) { case kernel_function_type::linear: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); break; case kernel_function_type::polynomial: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - 
detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; } const auto end = std::chrono::steady_clock::now(); @@ -202,16 +217,16 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s break; case solver_type::cg_explicit: { - const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); + const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); PLSSVM_ASSERT(!explicit_A.empty(), "The A matrix must not be empty!"); const auto start = std::chrono::steady_clock::now(); - detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); const std::size_t num_mirror_rows = num_rows - row_offset - device_specific_num_rows; if (num_mirror_rows > std::size_t{ 0 }) { - detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); } const auto end = std::chrono::steady_clock::now(); From 10d303e3b4fe9835aab75ba3870b5bfe7c276678 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Thu, 29 May 2025 17:20:30 +0200 Subject: [PATCH 006/215] Improve the performance of the OpenMP cg_explicit kernel matrix assembly and BLAS implementation. Align names more to the ones used in the other backends. --- .../OpenMP/kernel/cg_explicit/blas.hpp | 111 ++++++++++-------- .../cg_explicit/kernel_matrix_assembly.hpp | 70 ++++++----- .../OpenMP/kernel/kernel_functions.hpp | 35 +----- 3 files changed, 104 insertions(+), 112 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index ff7fc6f36..ecd80ab1a 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -21,7 +21,6 @@ #include // std::array #include // std::ceil #include // std::size_t -#include // std::vector namespace plssvm::openmp::detail { @@ -29,24 +28,24 @@ namespace plssvm::openmp::detail { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. 
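Since @p A only stores the padded upper triangle, both SYMM kernels below redirect every access through the same index arithmetic. A small sketch of that mapping (hypothetical helper names; n corresponds to num_rows - device_row_offset and padding to PADDING_SIZE):

#include <cstddef>

// index of entry (r, c) with r <= c in the packed, row-padded upper triangle;
// each stored row r holds the columns r..n-1 plus `padding` extra entries, so
// row r + 1 starts (n + padding - r) entries after row r
constexpr std::size_t packed_upper_index(const std::size_t r, const std::size_t c,
                                         const std::size_t n, const std::size_t padding) {
    return r * (n + padding) + c - r * (r + 1) / 2;
}

// the matrix is symmetric, so accesses below the diagonal are redirected to the stored triangle
constexpr std::size_t symmetric_index(const std::size_t row, const std::size_t col,
                                      const std::size_t n, const std::size_t padding) {
    return row <= col ? packed_upper_index(row, col, n, padding)
                      : packed_upper_index(col, row, n, padding);
}

This corresponds to the two-branch lookup in device_kernel_symm and to the store performed by device_kernel_assembly.
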
* @param[in] num_rows the number of rows and columns in @p A * @param[in] num_rhs the number of rows in @p B and @p C - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -60,28 +59,33 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { // calculate the indices used in the current thread - const std::size_t rhs_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 
0; dim < (num_rows - row_offset); ++dim) { + // iterate over all values + for (std::size_t dim = 0; dim < (num_rows - device_row_offset); dim += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_row) { - A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) + global_row - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A[global_row * (num_rows - row_offset + PADDING_SIZE_uz) + dim - global_row * (global_row + std::size_t{ 1 }) / std::size_t{ 2 }]; + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim + block_dim < global_j_idx) { + A_cache = A[(dim + block_dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim + block_dim) * (dim + block_dim + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim + block_dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + sum += A_cache * B(global_i_idx, dim + block_dim + device_row_offset); } - temp[internal_i][internal_j] += A_val * B(global_rhs, dim + row_offset); + temp[internal_i][internal_j] += sum; } } } @@ -89,13 +93,14 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t device_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && device_global_row < device_specific_num_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_i][internal_j] + beta * C(global_i_idx, global_j_idx); } } } @@ -110,21 +115,21 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr 
rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); PLSSVM_ASSERT(num_rows >= num_mirror_rows, "The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_mirror_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); @@ -142,36 +147,42 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { // calculate the indices used in the current thread - const std::size_t rhs_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < device_specific_num_rows; ++dim) { + // iterate over the remaining values + for (std::size_t dim = 0; dim < device_num_rows; dim += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for 
(unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - const real_type A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows - dim + global_row]; - temp[internal_i][internal_j] += A_val * B(global_rhs, row_offset + dim); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + const real_type A_cache = A[(dim + block_dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim + block_dim - std::size_t{ 1 }) * (dim + block_dim) / std::size_t{ 2 } + device_num_rows - dim + block_dim + global_j_idx]; + sum += A_cache * B(global_i_idx, device_row_offset + dim + block_dim); + } + temp[internal_i][internal_j] += sum; } } } - // apply the (partial) BLAS operation and update C + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t partial_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + device_specific_num_rows + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && partial_global_row < num_mirror_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_i][internal_j] + beta * C(global_i_idx, global_j_idx); } } } diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index 9571513b9..b734a7c1a 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -28,29 +28,29 @@ namespace plssvm::openmp::detail { /** * @brief Assemble the kernel matrix using the @p kernel function. 
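For a single matrix entry, the assembly below boils down to the following scalar computation (hypothetical names, double instead of real_type; kernel stands in for the feature_reduce accumulation followed by apply_kernel_function):

#include <vector>

// Scalar reference for one entry of the dimensionally reduced kernel matrix.
template <typename KernelFn>
double kernel_matrix_entry(const std::vector<double> &x_i, const std::vector<double> &x_j,
                           const double q_i, const double q_j, const double QA_cost,
                           const double cost, const bool on_diagonal, KernelFn kernel) {
    double value = kernel(x_i, x_j) + QA_cost - q_i - q_j;
    if (on_diagonal) {
        value += cost;  // cost = 1 / C is only added on the main diagonal
    }
    return value;
}

Only entries with global_i_idx >= global_j_idx are computed and stored, since the kernel matrix is symmetric.
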
- * @tparam kernel the compile-time kernel function to use + * @tparam kernel_function the compile-time kernel function to use * @tparam Args the types of the potential additional arguments for the @p kernel function * @param[out] kernel_matrix the resulting kernel matrix * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] q the `q` vector * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function */ -template -void device_kernel_assembly(real_type*kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { +template +void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); // calculate constants const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -64,46 +64,52 @@ void device_kernel_assembly(real_type*kernel_matrix, const soa_matrix for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { // calculate the indices used in the current thread - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const 
std::size_t i_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + sum += detail::feature_reduce(data(global_i_idx, dim + block_dim), data(global_j_idx, dim + block_dim)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_row] - q[global_col]; + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + real_type temp_ij = temp[internal_j][internal_i]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix[device_global_col * (num_rows - row_offset + PADDING_SIZE_uz) - device_global_col * (device_global_col + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_row] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp b/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp index 59fd0f43c..359e2f8ff 100644 --- a/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp @@ -27,42 +27,17 @@ namespace plssvm::openmp::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// From 2e64193492090b25529f4fe4e8c30755375f4461 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 11:40:23 +0200 Subject: [PATCH 007/215] Improve the performance of the OpenMP cg_implicit kernel matrix assembly + BLAS implementation. Align names more to the ones used in the other backends. 
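The cg_implicit path never materializes the kernel matrix: every entry is recomputed on the fly and immediately folded into the BLAS-like update, trading extra arithmetic for greatly reduced global memory. A scalar sketch of that idea (hypothetical names, no blocking or device offsets; kernel_entry(i, j) is assumed to already include the QA_cost, q, and cost terms):

#include <cstddef>
#include <vector>

// C(class, i) += alpha * K(i, j) * B(class, j) for the full symmetric K, without storing K;
// B and C use a flat [class][row] layout
template <typename EntryFn>
void implicit_symm_reference(const double alpha, const std::size_t num_rows,
                             const std::size_t num_classes, EntryFn kernel_entry,
                             const std::vector<double> &B, std::vector<double> &C) {
    for (std::size_t i = 0; i < num_rows; ++i) {
        for (std::size_t j = 0; j <= i; ++j) {
            const double k_ij = alpha * kernel_entry(i, j);  // computed on the fly, never stored
            for (std::size_t c = 0; c < num_classes; ++c) {
                C[c * num_rows + i] += k_ij * B[c * num_rows + j];
                if (i != j) {
                    // exploit symmetry: the mirrored entry contributes to row j as well
                    C[c * num_rows + j] += k_ij * B[c * num_rows + i];
                }
            }
        }
    }
}

The implementation below applies the same idea blockwise, restricting the work to the upper triangle (i_idx >= j_idx).
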
--- .../kernel_matrix_assembly_blas.hpp | 104 +++++++++++------- 1 file changed, 64 insertions(+), 40 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 771689209..60c10de07 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -26,25 +26,25 @@ namespace plssvm::openmp::detail { /** - * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. + * @tparam kernel_function the compile-time kernel function to use + * @tparam Args the types of the potential additional arguments for the @p kernel_function function * @param[in] alpha the scalar alpha value * @param[in] q the `q` vector * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] QA_cost the bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] B the matrix @p B * @param[in,out] C the matrix @p C - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel_function function */ -template -inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... kernel_function_parameter) { +template +inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args...
kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); PLSSVM_ASSERT(B.shape() == C.shape(), "The matrices B and C must have the same shape!"); PLSSVM_ASSERT(B.num_cols() == q.size(), "The number of columns in B ({}) must be the same as the values in q ({})!", B.num_cols(), q.size()); @@ -53,8 +53,8 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); const std::size_t num_classes = B.num_rows(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -67,54 +67,78 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { // calculate the indices used in the current thread - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto 
global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + sum += detail::feature_reduce(data(global_i_idx, dim + block_dim), data(global_j_idx, dim + block_dim)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { - temp_ij += cost; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { + if (global_i_idx == global_j_idx) { + temp[internal_j][internal_i] += cost; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_j][internal_i] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { #pragma omp atomic - C(class_idx, global_row) += alpha * temp_ij * B(class_idx, global_row); + C(dim + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_i_idx); } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { #pragma omp atomic - C(class_idx, global_row) += alpha * temp_ij * B(class_idx, global_col); -// symmetry + C(dim + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_j_idx); + // symmetry #pragma omp atomic - C(class_idx, global_col) += alpha * temp_ij * B(class_idx, global_row); + C(dim + class_idx, global_j_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_i_idx); } } } From 8aa1c93bbaab8c98310cab60355b539c01f0da66 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 11:40:41 +0200 Subject: [PATCH 008/215] Improve the performance of the OpenMP predict implementation. Align names more to the ones used in the other backends. 
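The rewrite applies the same two-level blocking scheme that the assembly kernels already use: the outer (collapsed) loops walk blocks of THREAD_BLOCK_SIZE positions, every block position owns an INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE accumulator tile that stays in registers, and the reduction dimension is consumed in chunks of THREAD_BLOCK_SIZE. The following sketch shows that pattern on a plain row-major matrix product with made-up stand-in constants; unlike the real kernels it assumes every dimension is a multiple of the block sizes (the actual code relies on padding plus explicit bounds checks) and it is not the PLSSVM implementation.

    #include <array>
    #include <cstddef>
    #include <vector>

    constexpr unsigned TBS = 8;  // stand-in for THREAD_BLOCK_SIZE
    constexpr unsigned IBS = 4;  // stand-in for INTERNAL_BLOCK_SIZE

    // Blocked C = A * B with A: n x k, B: k x m, C: n x m; n and m must be multiples of
    // TBS * IBS and k a multiple of TBS (the real kernels guarantee this via padding).
    void blocked_matmul_sketch(const std::vector<std::vector<double>> &A,
                               const std::vector<std::vector<double>> &B,
                               std::vector<std::vector<double>> &C) {
        const std::size_t n = C.size();
        const std::size_t m = C.front().size();
        const std::size_t k = B.size();
    #pragma omp parallel for collapse(2)
        for (std::size_t i_block = 0; i_block < n / IBS; i_block += TBS) {
            for (std::size_t j_block = 0; j_block < m / IBS; j_block += TBS) {
                // the two TBS loops mimic the threads of a GPU thread block; on the CPU they are serial
                for (std::size_t i_thread = 0; i_thread < TBS; ++i_thread) {
                    for (std::size_t j_thread = 0; j_thread < TBS; ++j_thread) {
                        const std::size_t i_idx = (i_block + i_thread) * IBS;
                        const std::size_t j_idx = (j_block + j_thread) * IBS;
                        std::array<std::array<double, IBS>, IBS> temp{};  // register tile
                        for (std::size_t dim = 0; dim < k; dim += TBS) {
                            for (unsigned ii = 0; ii < IBS; ++ii) {
                                for (unsigned jj = 0; jj < IBS; ++jj) {
                                    double sum = 0.0;  // blocked inner reduction
                                    for (unsigned d = 0; d < TBS; ++d) {
                                        sum += A[i_idx + ii][dim + d] * B[dim + d][j_idx + jj];
                                    }
                                    temp[ii][jj] += sum;
                                }
                            }
                        }
                        for (unsigned ii = 0; ii < IBS; ++ii) {
                            for (unsigned jj = 0; jj < IBS; ++jj) {
                                C[i_idx + ii][j_idx + jj] = temp[ii][jj];  // write the tile back
                            }
                        }
                    }
                }
            }
        }
    }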
--- .../backends/OpenMP/kernel/predict_kernel.hpp | 199 +++++++++++++----- 1 file changed, 147 insertions(+), 52 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 407096055..1540397bc 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -31,28 +31,71 @@ namespace plssvm::openmp::detail { * @param[out] w the vector to speedup the linear prediction * @param[in] alpha the previously learned weights * @param[in] support_vectors the support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for * @param[in] sv_offset the first row in @p support_vectors the current device is responsible for */ -inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_specific_num_sv, const std::size_t sv_offset) { +inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t sv_offset) { PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() })); - PLSSVM_ASSERT(support_vectors.num_rows() >= device_specific_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_specific_num_sv, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_num_sv, support_vectors.num_rows()); PLSSVM_ASSERT(support_vectors.num_rows() >= sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", sv_offset, support_vectors.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_features = support_vectors.num_cols(); -#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(num_classes, num_features, device_specific_num_sv, sv_offset) - for (std::size_t a = 0; a < num_classes; ++a) { - for (std::size_t dim = 0; dim < num_features; ++dim) { - real_type temp{ 0.0 }; -#pragma omp simd reduction(+ : temp) - for (std::size_t idx = 0; idx < device_specific_num_sv; ++idx) { - temp = std::fma(alpha(a, sv_offset + idx), support_vectors(sv_offset + idx, dim), temp); + // calculate constants + const auto blocked_num_features = static_cast(std::ceil(static_cast(num_features) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); + + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + +#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(blocked_num_classes, blocked_num_features, num_classes, num_features, device_num_sv, sv_offset) + for (std::size_t dim = 0; dim < blocked_num_features; dim 
+= THREAD_BLOCK_SIZE_uz) { + for (std::size_t a = 0; a < blocked_num_classes; a += THREAD_BLOCK_SIZE_uz) { + // perform operations on the current block + for (std::size_t dim_block = 0; dim_block < THREAD_BLOCK_SIZE_uz; ++dim_block) { + for (std::size_t a_block = 0; a_block < THREAD_BLOCK_SIZE_uz; ++a_block) { + // calculate the indices used in the current thread + const std::size_t feature_idx = (dim + dim_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (a + a_block) * INTERNAL_BLOCK_SIZE_uz; + + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + for (std::size_t sv = 0; sv < device_num_sv; sv += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (unsigned block_sv = 0; block_sv < THREAD_BLOCK_SIZE; ++block_sv) { + sum += alpha(global_class_idx, sv_offset + sv + block_sv) * support_vectors(sv_offset + sv + block_sv, global_feature_idx); + } + temp[internal_class][internal_feature] += sum; + } + } + } + + // store the result back to the w vector + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); + + if (global_class_idx < num_classes && global_feature_idx < num_features) { + w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; + } + } + } + } } - w(a, dim) = temp; } } } @@ -63,29 +106,73 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset) { +inline void device_kernel_predict_linear(aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset) { PLSSVM_ASSERT(w.num_rows() == rho.size(), "Size mismatch: {} vs {}!", w.num_rows(), rho.size()); PLSSVM_ASSERT(w.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", w.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), w.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), w.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific 
predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants const std::size_t num_classes = prediction.num_cols(); const std::size_t num_features = predict_points.num_cols(); -#pragma omp parallel for collapse(2) default(none) shared(prediction, w, rho, predict_points) firstprivate(num_classes, num_features, device_specific_num_predict_points, row_offset) - for (std::size_t point_index = 0; point_index < device_specific_num_predict_points; ++point_index) { - for (std::size_t a = 0; a < num_classes; ++a) { - real_type temp{ 0.0 }; -#pragma omp simd reduction(+ : temp) - for (std::size_t dim = 0; dim < num_features; ++dim) { - temp = std::fma(w(a, dim), predict_points(row_offset + point_index, dim), temp); + // calculate constants + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); + + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + +#pragma omp parallel for collapse(2) default(none) shared(prediction, w, rho, predict_points) firstprivate(blocked_device_num_predict_points, blocked_num_classes, device_num_predict_points, num_classes, num_features, device_row_offset) + for (std::size_t point = 0; point < blocked_device_num_predict_points; point += THREAD_BLOCK_SIZE_uz) { + for (std::size_t a = 0; a < blocked_num_classes; a += THREAD_BLOCK_SIZE_uz) { + // perform operations on the current block + for (std::size_t point_block = 0; point_block < THREAD_BLOCK_SIZE_uz; ++point_block) { + for (std::size_t a_block = 0; a_block < THREAD_BLOCK_SIZE_uz; ++a_block) { + // calculate the indices used in the current thread + const std::size_t pp_idx = (point + point_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (a + a_block) * INTERNAL_BLOCK_SIZE_uz; + + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + sum += w(global_class_idx, dim + block_dim) * predict_points(global_pp_idx, dim + block_dim); + } + temp[internal_class][internal_pp] += sum; + } + } + } + + // store the result back to the w vector + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto device_global_pp_idx = pp_idx + 
static_cast(internal_pp); + const auto global_pp_idx = device_row_offset + device_global_pp_idx; + const auto global_class_idx = class_idx + static_cast(internal_class); + + if (global_class_idx < num_classes && global_pp_idx < device_num_predict_points) { + prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; + } + } + } + } } - prediction(row_offset + point_index, a) = temp - rho[a]; } } } @@ -99,24 +186,24 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons * @param[in] rho the previously learned bias * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict - * @param[in] device_specific_num_predict_points the number of predict points the current device is responsible for - * @param[in] row_offset the first row in @p predict_points the current device is responsible for + * @param[in] device_num_predict_points the number of predict points the current device is responsible for + * @param[in] device_row_offset the first row in @p predict_points the current device is responsible for * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ -template -inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset, Args... kernel_function_parameter) { +template +inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset, Args... 
kernel_function_parameter) { PLSSVM_ASSERT(alpha.num_rows() == rho.size(), "Size mismatch: {} vs {}!", alpha.num_rows(), rho.size()); PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(support_vectors.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", support_vectors.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = predict_points.num_cols(); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows @@ -124,34 +211,39 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) - for (std::size_t point_index = 0; point_index < device_specific_num_predict_points; ++point_index) { - for (std::size_t a = 0; a < num_classes; ++a) { - prediction(row_offset + point_index, a) -= rho[a]; + for (std::size_t pp_idx = 0; pp_idx < device_num_predict_points; ++pp_idx) { + for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { + prediction(device_row_offset + pp_idx, class_idx) -= rho[class_idx]; } } #pragma omp parallel for collapse(2) - for (std::size_t pp = 0; pp < blocked_device_specific_num_predict_points; pp += THREAD_BLOCK_SIZE_uz) { - for (std::size_t sv = 0; sv < blocked_num_support_vectors; sv += THREAD_BLOCK_SIZE_uz) { + for (std::size_t x_block = 0; x_block < blocked_device_specific_num_predict_points; x_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t y_block = 0; y_block < blocked_num_support_vectors; y_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t pp_block = 0; pp_block < THREAD_BLOCK_SIZE_uz; ++pp_block) { - for (std::size_t sv_block = 0; sv_block < THREAD_BLOCK_SIZE_uz; ++sv_block) { + for (std::size_t x_thread = 0; 
x_thread < THREAD_BLOCK_SIZE_uz; ++x_thread) { + for (std::size_t y_thread = 0; y_thread < THREAD_BLOCK_SIZE_uz; ++y_thread) { // calculate the indices used in the current thread - const std::size_t pp_idx = (pp + pp_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = (sv + sv_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (x_block + x_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t sv_idx = (y_block + y_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors(global_sv_idx, dim), predict_points(global_pp_idx, dim)); + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + sum += detail::feature_reduce(support_vectors(global_sv_idx, dim + block_dim), predict_points(global_pp_idx, dim + block_dim)); + } + temp[internal_pp][internal_sv] += sum; } } } @@ -159,22 +251,25 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // update temp using the respective kernel function for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); } } // add results to prediction - for (std::size_t a = 0; a < num_classes; ++a) { + for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_pp_idx = device_row_offset + device_global_pp_idx; + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - // be sure to not perform out of bounds accesses - if (device_global_pp_idx < device_specific_num_predict_points && global_sv_idx < num_support_vectors) { + // be sure to not perform out-of-bounds accesses + if (device_global_pp_idx < device_num_predict_points && global_sv_idx < num_support_vectors) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp 
atomic - prediction(global_pp_idx, a) += alpha(a, global_sv_idx) * temp[internal_pp][internal_sv]; + prediction(global_pp_idx, dim + class_idx) += alpha(dim + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; + } } } } From 51b75b60eeb8431d71b0da2db0de7766ebb536c0 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 12:21:28 +0200 Subject: [PATCH 009/215] Improve variable names and remove some implicit conversions. --- .../OpenMP/kernel/cg_explicit/blas.hpp | 44 +++++++------- .../cg_explicit/kernel_matrix_assembly.hpp | 22 +++---- .../kernel_matrix_assembly_blas.hpp | 26 ++++----- .../backends/OpenMP/kernel/predict_kernel.hpp | 58 +++++++++---------- 4 files changed, 75 insertions(+), 75 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index ecd80ab1a..298962c19 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -53,20 +53,20 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); #pragma omp parallel for collapse(2) - for (std::size_t rhs = 0; rhs < blocked_num_rhs; rhs += THREAD_BLOCK_SIZE_uz) { - for (std::size_t row = 0; row < blocked_device_specific_num_rows; row += THREAD_BLOCK_SIZE_uz) { + for (std::size_t rhs_block = 0; rhs_block < blocked_num_rhs; rhs_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_device_specific_num_rows; row_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { + for (std::size_t rhs_thread = 0; rhs_thread < THREAD_BLOCK_SIZE_uz; ++rhs_thread) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { // calculate the indices used in the current thread - const std::size_t i_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs_block + rhs_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all values - for (std::size_t dim = 0; dim < (num_rows - device_row_offset); dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -75,15 +75,15 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num const auto global_j_idx = j_idx + static_cast(internal_j); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { real_type A_cache = 0.0; // determine on which side of the diagonal we are located - if (dim + block_dim < global_j_idx) { - A_cache = A[(dim + block_dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim + block_dim) * (dim + block_dim + std::size_t{ 1 }) / std::size_t{ 2 }]; + if (dim_block + dim < 
global_j_idx) { + A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; } else { - A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim + block_dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; + A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; } - sum += A_cache * B(global_i_idx, dim + block_dim + device_row_offset); + sum += A_cache * B(global_i_idx, dim_block + dim + device_row_offset); } temp[internal_i][internal_j] += sum; } @@ -141,20 +141,20 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); #pragma omp parallel for collapse(2) - for (std::size_t rhs = 0; rhs < blocked_num_rhs; rhs += THREAD_BLOCK_SIZE_uz) { - for (std::size_t row = 0; row < blocked_num_mirror_rows; row += THREAD_BLOCK_SIZE_uz) { + for (std::size_t rhs_block = 0; rhs_block < blocked_num_rhs; rhs_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_num_mirror_rows; row_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { + for (std::size_t rhs_thread = 0; rhs_thread < THREAD_BLOCK_SIZE_uz; ++rhs_thread) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { // calculate the indices used in the current thread - const std::size_t i_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs_block + rhs_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over the remaining values - for (std::size_t dim = 0; dim < device_num_rows; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -163,9 +163,9 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz const auto global_j_idx = j_idx + static_cast(internal_j); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - const real_type A_cache = A[(dim + block_dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim + block_dim - std::size_t{ 1 }) * (dim + block_dim) / std::size_t{ 2 } + device_num_rows - dim + block_dim + global_j_idx]; - sum += A_cache * B(global_i_idx, device_row_offset + dim + block_dim); + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + const real_type A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - dim_block + dim + global_j_idx]; + sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } temp[internal_i][internal_j] += sum; } diff --git 
a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index b734a7c1a..f384645b1 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -58,14 +58,14 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix(PADDING_SIZE); #pragma omp parallel for collapse(2) schedule(dynamic) - for (std::size_t row = 0; row < blocked_row_range; row += THREAD_BLOCK_SIZE_uz) { - for (std::size_t col = 0; col < blocked_device_specific_num_rows; col += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_row_range; row_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t col_block = 0; col_block < blocked_device_specific_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { - for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { + for (std::size_t col_thread = 0; col_thread < THREAD_BLOCK_SIZE_uz; ++col_thread) { // calculate the indices used in the current thread - const std::size_t i_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col_block + col_thread) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix if (i_idx >= j_idx) { @@ -73,7 +73,7 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -82,10 +82,10 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix(internal_j); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - sum += detail::feature_reduce(data(global_i_idx, dim + block_dim), data(global_j_idx, dim + block_dim)); + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); } - temp[internal_j][internal_i] += sum; + temp[internal_i][internal_j] += sum; } } } @@ -101,7 +101,7 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix= global_j_idx) { - real_type temp_ij = temp[internal_j][internal_i]; + real_type temp_ij = temp[internal_i][internal_j]; // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal diff --git a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 60c10de07..3ca4e4dc6 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -61,14 +61,14 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) schedule(dynamic) - for (std::size_t row = 0; row < blocked_row_range; row += THREAD_BLOCK_SIZE_uz) { - for (std::size_t col = 0; col < blocked_device_specific_num_rows; col += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_row_range; row_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t col_block = 0; col_block < blocked_device_specific_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { - for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { + for (std::size_t col_thread = 0; col_thread < THREAD_BLOCK_SIZE_uz; ++col_thread) { // calculate the indices used in the current thread - const std::size_t i_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col_block + col_thread) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix if (i_idx >= j_idx) { @@ -76,7 +76,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { // calculate the indices to access the global data @@ -84,8 +84,8 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - sum += detail::feature_reduce(data(global_i_idx, dim + block_dim), data(global_j_idx, dim + block_dim)); + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); } temp[internal_j][internal_i] += sum; } @@ -119,7 +119,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector //*************************************************************************// // calculate C += alpha * temp * B // //*************************************************************************// - for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += 
THREAD_BLOCK_SIZE_uz) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); @@ -129,16 +129,16 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector // only apply once to the diagonal for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { #pragma omp atomic - C(dim + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_i_idx); + C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } else { // apply it for the upper and lower triangular matrix for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { #pragma omp atomic - C(dim + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_j_idx); + C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_j_idx); // symmetry #pragma omp atomic - C(dim + class_idx, global_j_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_i_idx); + C(class_block + class_idx, global_j_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } } diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 1540397bc..49d98d4da 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -53,19 +53,19 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(blocked_num_classes, blocked_num_features, num_classes, num_features, device_num_sv, sv_offset) - for (std::size_t dim = 0; dim < blocked_num_features; dim += THREAD_BLOCK_SIZE_uz) { - for (std::size_t a = 0; a < blocked_num_classes; a += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < blocked_num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < blocked_num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t dim_block = 0; dim_block < THREAD_BLOCK_SIZE_uz; ++dim_block) { - for (std::size_t a_block = 0; a_block < THREAD_BLOCK_SIZE_uz; ++a_block) { + for (std::size_t feature_thread = 0; feature_thread < THREAD_BLOCK_SIZE_uz; ++feature_thread) { + for (std::size_t class_thread = 0; class_thread < THREAD_BLOCK_SIZE_uz; ++class_thread) { // calculate the indices used in the current thread - const std::size_t feature_idx = (dim + dim_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (a + a_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t feature_idx = (feature_block + feature_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (class_block + class_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - for (std::size_t sv = 0; sv < device_num_sv; sv += THREAD_BLOCK_SIZE_uz) { + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_feature = 0; internal_feature < 
INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { @@ -74,8 +74,8 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix(internal_class); real_type sum{ 0.0 }; - for (unsigned block_sv = 0; block_sv < THREAD_BLOCK_SIZE; ++block_sv) { - sum += alpha(global_class_idx, sv_offset + sv + block_sv) * support_vectors(sv_offset + sv + block_sv, global_feature_idx); + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + sum += alpha(global_class_idx, sv_offset + sv_block + sv) * support_vectors(sv_offset + sv_block + sv, global_feature_idx); } temp[internal_class][internal_feature] += sum; } @@ -129,19 +129,19 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) default(none) shared(prediction, w, rho, predict_points) firstprivate(blocked_device_num_predict_points, blocked_num_classes, device_num_predict_points, num_classes, num_features, device_row_offset) - for (std::size_t point = 0; point < blocked_device_num_predict_points; point += THREAD_BLOCK_SIZE_uz) { - for (std::size_t a = 0; a < blocked_num_classes; a += THREAD_BLOCK_SIZE_uz) { + for (std::size_t pp_block = 0; pp_block < blocked_device_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < blocked_num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t point_block = 0; point_block < THREAD_BLOCK_SIZE_uz; ++point_block) { - for (std::size_t a_block = 0; a_block < THREAD_BLOCK_SIZE_uz; ++a_block) { + for (std::size_t pp_thread = 0; pp_thread < THREAD_BLOCK_SIZE_uz; ++pp_thread) { + for (std::size_t class_thread = 0; class_thread < THREAD_BLOCK_SIZE_uz; ++class_thread) { // calculate the indices used in the current thread - const std::size_t pp_idx = (point + point_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (a + a_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (pp_block + pp_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (class_block + class_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { @@ -150,8 +150,8 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons const auto global_class_idx = class_idx + static_cast(internal_class); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - sum += w(global_class_idx, dim + block_dim) * predict_points(global_pp_idx, dim + block_dim); + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w(global_class_idx, feature_block + feature) * predict_points(global_pp_idx, feature_block + feature); } temp[internal_class][internal_pp] += sum; } @@ -218,20 +218,20 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m } #pragma omp parallel for collapse(2) - for (std::size_t x_block = 0; x_block < 
blocked_device_specific_num_predict_points; x_block += THREAD_BLOCK_SIZE_uz) { - for (std::size_t y_block = 0; y_block < blocked_num_support_vectors; y_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t pp_block = 0; pp_block < blocked_device_specific_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t sv_block = 0; sv_block < blocked_num_support_vectors; sv_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t x_thread = 0; x_thread < THREAD_BLOCK_SIZE_uz; ++x_thread) { - for (std::size_t y_thread = 0; y_thread < THREAD_BLOCK_SIZE_uz; ++y_thread) { + for (std::size_t pp_thread = 0; pp_thread < THREAD_BLOCK_SIZE_uz; ++pp_thread) { + for (std::size_t sv_thread = 0; sv_thread < THREAD_BLOCK_SIZE_uz; ++sv_thread) { // calculate the indices used in the current thread - const std::size_t pp_idx = (x_block + x_thread) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = (y_block + y_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (pp_block + pp_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t sv_idx = (sv_block + sv_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { @@ -240,8 +240,8 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m const auto global_sv_idx = sv_idx + static_cast(internal_sv); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - sum += detail::feature_reduce(support_vectors(global_sv_idx, dim + block_dim), predict_points(global_pp_idx, dim + block_dim)); + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(support_vectors(global_sv_idx, feature_block + feature), predict_points(global_pp_idx, feature_block + feature)); } temp[internal_pp][internal_sv] += sum; } @@ -256,7 +256,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m } // add results to prediction - for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { // calculate the indices to access the global data and the data with respect to the current device @@ -268,7 +268,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m if (device_global_pp_idx < device_num_predict_points && global_sv_idx < num_support_vectors) { for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic - prediction(global_pp_idx, dim + class_idx) += alpha(dim + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; + prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; } } } From 8a570a8adc102e9d2267d5e328b6f9a9a16bb3a8 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 
14:39:58 +0200 Subject: [PATCH 010/215] Fix tests after slight API changes. --- tests/backends/generic_base_csvm_tests.hpp | 20 ++++++-- tests/backends/generic_csvm_tests.hpp | 55 ++++++++++------------ 2 files changed, 41 insertions(+), 34 deletions(-) diff --git a/tests/backends/generic_base_csvm_tests.hpp b/tests/backends/generic_base_csvm_tests.hpp index fd93963c1..f6c95038a 100644 --- a/tests/backends/generic_base_csvm_tests.hpp +++ b/tests/backends/generic_base_csvm_tests.hpp @@ -41,6 +41,7 @@ #include // std::sqrt, std::abs #include // std::size_t +#include // std::memcpy #include // std::numeric_limits::epsilon #include // std::unique_ptr, std::make_unique #include // std::ignore, std::tuple, std::make_tuple @@ -86,7 +87,10 @@ template == plssvm::backend_type::openmp || plssvm::csvm_to_backend_type_v == plssvm::backend_type::stdpar || plssvm::csvm_to_backend_type_v == plssvm::backend_type::hpx) { // only a single device for OpenMP, stdpar, and HPX on the CPU - result[0] = plssvm::detail::move_only_any{ calculate_partial_kernel_matrix(0, matr.num_rows()) }; + const std::vector partial_kernel_matrix = calculate_partial_kernel_matrix(0, matr.num_rows()); + auto ptr = std::make_unique(partial_kernel_matrix.size()); + std::memcpy(ptr.get(), partial_kernel_matrix.data(), partial_kernel_matrix.size() * sizeof(real_type)); + result[0] = plssvm::detail::move_only_any{ std::move(ptr) }; } else { for (std::size_t device_id = 0; device_id < csvm.num_available_devices(); ++device_id) { auto &device = csvm.devices_[device_id]; @@ -850,7 +854,8 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix_minimal) { const mock_csvm_type svm = util::construct_from_tuple(params, csvm_test_type::additional_arguments); const std::size_t num_devices = svm.num_available_devices(); // be sure to use the correct data distribution - svm.data_distribution_ = std::make_unique(plssvm::mpi::communicator{}, data.num_rows() - 1, num_devices); + const plssvm::detail::triangular_data_distribution dist{ plssvm::mpi::communicator{}, data.num_rows() - 1, num_devices }; + svm.data_distribution_ = std::make_unique(dist); // automatic solver type not permitted if constexpr (solver == plssvm::solver_type::automatic) { @@ -880,7 +885,9 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix_minimal) { // get result based on used backend std::vector kernel_matrix{}; if constexpr (plssvm::csvm_to_backend_type_v == plssvm::backend_type::openmp || plssvm::csvm_to_backend_type_v == plssvm::backend_type::stdpar || plssvm::csvm_to_backend_type_v == plssvm::backend_type::hpx) { - kernel_matrix = plssvm::detail::move_only_any_cast>(kernel_matrix_d[device_id]); // std::vector + const auto &kernel_matrix_d_ptr = plssvm::detail::move_only_any_cast &>(kernel_matrix_d[device_id]); // std::unique_ptr + kernel_matrix.resize(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); + std::memcpy(kernel_matrix.data(), kernel_matrix_d_ptr.get(), kernel_matrix.size() * sizeof(plssvm::real_type)); } else { const auto &kernel_matrix_d_ptr = plssvm::detail::move_only_any_cast(kernel_matrix_d[device_id]); // device_ptr -> convert it to a std::vector kernel_matrix.resize(kernel_matrix_d_ptr.size_padded()); @@ -960,7 +967,8 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix) { const mock_csvm_type svm = util::construct_from_tuple(params, csvm_test_type::additional_arguments); const std::size_t num_devices = svm.num_available_devices(); // be sure to use the correct data distribution - 
svm.data_distribution_ = std::make_unique(plssvm::mpi::communicator{}, data.num_rows() - 1, num_devices); + const plssvm::detail::triangular_data_distribution dist{ plssvm::mpi::communicator{}, data.num_rows() - 1, num_devices }; + svm.data_distribution_ = std::make_unique(dist); // automatic solver type not permitted if constexpr (solver == plssvm::solver_type::automatic) { @@ -990,7 +998,9 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix) { // get result based on used backend std::vector kernel_matrix{}; if constexpr (plssvm::csvm_to_backend_type_v == plssvm::backend_type::openmp || plssvm::csvm_to_backend_type_v == plssvm::backend_type::stdpar || plssvm::csvm_to_backend_type_v == plssvm::backend_type::hpx) { - kernel_matrix = plssvm::detail::move_only_any_cast>(kernel_matrix_d[device_id]); // std::vector + const auto &kernel_matrix_d_ptr = plssvm::detail::move_only_any_cast &>(kernel_matrix_d[device_id]); // std::unique_ptr + kernel_matrix.resize(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); + std::memcpy(kernel_matrix.data(), kernel_matrix_d_ptr.get(), kernel_matrix.size() * sizeof(plssvm::real_type)); } else { const auto &kernel_matrix_d_ptr = plssvm::detail::move_only_any_cast(kernel_matrix_d[device_id]); // device_ptr -> convert it to a std::vector kernel_matrix.resize(kernel_matrix_d_ptr.size_padded()); diff --git a/tests/backends/generic_csvm_tests.hpp b/tests/backends/generic_csvm_tests.hpp index 84b9b7ad9..549cd3a68 100644 --- a/tests/backends/generic_csvm_tests.hpp +++ b/tests/backends/generic_csvm_tests.hpp @@ -81,14 +81,15 @@ TYPED_TEST_P(GenericBackendCSVM, blas_level_3_kernel_explicit) { const std::size_t specific_num_rows = dist.place_specific_num_rows(device); const std::size_t row_offset = dist.place_row_offset(device); - device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix, B, beta, C_temp); + device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B, beta, C_temp); const std::size_t num_mirror_rows = num_rows - row_offset - specific_num_rows; if (num_mirror_rows > 0) { - device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix, B, beta, C_temp); + device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B, beta, C_temp); } C_res += C_temp; } + C_res.restore_padding(); // calculate correct results const plssvm::aos_matrix kernel_matrix_gemm_padded = ground_truth::assemble_full_kernel_matrix(params, data.data(), q_red, QA_cost); @@ -112,6 +113,7 @@ TYPED_TEST_P(GenericBackendCSVM, calculate_w) { const plssvm::detail::rectangular_data_distribution dist{ plssvm::mpi::communicator{}, data.num_data_points(), 1 }; device_kernel_w_linear(w, weights, data.data(), dist.place_specific_num_rows(0), dist.place_row_offset(0)); + w.restore_padding(); // calculate correct results const plssvm::soa_matrix correct_w = ground_truth::calculate_w(weights, data.data()); @@ -160,22 +162,22 @@ TYPED_TEST_P(GenericBackendCSVMKernelFunction, assemble_kernel_matrix_explicit) switch (kernel) { case plssvm::kernel_function_type::linear: - device_kernel_assembly(kernel_matrix, data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost); break; case plssvm::kernel_function_type::polynomial: - device_kernel_assembly(kernel_matrix, 
data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); break; case plssvm::kernel_function_type::rbf: - device_kernel_assembly(kernel_matrix, data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case plssvm::kernel_function_type::sigmoid: - device_kernel_assembly(kernel_matrix, data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); break; case plssvm::kernel_function_type::laplacian: - device_kernel_assembly(kernel_matrix, data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case plssvm::kernel_function_type::chi_squared: - device_kernel_assembly(kernel_matrix, data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; } const std::vector correct_kernel_matrix = ground_truth::assemble_device_specific_kernel_matrix(params, data_matr, q_red, QA_cost, dist, 0); @@ -297,6 +299,7 @@ TYPED_TEST_P(GenericBackendCSVMKernelFunction, predict_values) { device_kernel_predict(out, weights, rho, data_matr, predict_points, device_specific_num_predict_points, row_offset, std::get(params.gamma)); break; } + out.restore_padding(); // check out for correctness const plssvm::aos_matrix correct_out = ground_truth::predict_values(params, correct_w, weights, rho, data_matr, predict_points); @@ -337,45 +340,39 @@ TYPED_TEST_P(GenericBackendCSVMDeathTest, blas_level_3_kernel_explicit) { const std::size_t row_offset = dist.place_row_offset(0); { - // the A matrix must have the correct size - EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, std::vector{}, B, beta, C), "A matrix may not be empty!"); - // the B matrix must have the correct shape const auto B_wrong = util::generate_random_matrix>(plssvm::shape{ std::min(0ULL, num_rows - 1), std::min(0ULL, num_rhs - 2) }, plssvm::shape{ plssvm::PADDING_SIZE, plssvm::PADDING_SIZE }); - EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix, B_wrong, beta, C), ::testing::HasSubstr(fmt::format("B matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); + EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B_wrong, beta, C), ::testing::HasSubstr(fmt::format("B matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); // the C matrix must have the correct shape auto C_wrong = util::generate_random_matrix>(plssvm::shape{ std::min(0ULL, num_rows - 
1), std::min(0ULL, num_rhs - 2) }, plssvm::shape{ plssvm::PADDING_SIZE, plssvm::PADDING_SIZE }); - EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix, B, beta, C_wrong), ::testing::HasSubstr(fmt::format("C matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); + EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B, beta, C_wrong), ::testing::HasSubstr(fmt::format("C matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); // the place specific number of rows may not be too large - EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, num_rows + 1, row_offset, alpha, kernel_matrix, B, beta, C), ::testing::HasSubstr(fmt::format("The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); + EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, num_rows + 1, row_offset, alpha, kernel_matrix.data(), B, beta, C), ::testing::HasSubstr(fmt::format("The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); // the row offset may not be too large - EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, num_rows + 1, alpha, kernel_matrix, B, beta, C), ::testing::HasSubstr(fmt::format("The row offset ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); + EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, num_rows + 1, alpha, kernel_matrix.data(), B, beta, C), ::testing::HasSubstr(fmt::format("The row offset ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); } { const std::size_t num_mirror_rows = num_rows - row_offset - specific_num_rows; - // the A matrix must have the correct size - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, std::vector{}, B, beta, C), "A matrix may not be empty!"); - // the B matrix must have the correct shape const auto B_wrong = util::generate_random_matrix>(plssvm::shape{ std::min(0ULL, num_rows - 1), std::min(0ULL, num_rhs - 2) }, plssvm::shape{ plssvm::PADDING_SIZE, plssvm::PADDING_SIZE }); - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix, B_wrong, beta, C), ::testing::HasSubstr(fmt::format("B matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); + EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B_wrong, beta, C), ::testing::HasSubstr(fmt::format("B matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); // the C matrix must have the correct shape auto C_wrong = util::generate_random_matrix>(plssvm::shape{ std::min(0ULL, num_rows - 1), std::min(0ULL, num_rhs - 2) }, plssvm::shape{ plssvm::PADDING_SIZE, plssvm::PADDING_SIZE }); - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix, B, beta, C_wrong), ::testing::HasSubstr(fmt::format("C matrix sizes mismatch!: [{}, {}] != [{}, {}]", 
std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); + EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B, beta, C_wrong), ::testing::HasSubstr(fmt::format("C matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); // the place specific number of rows may not be too large - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, num_rows + 1, row_offset, alpha, kernel_matrix, B, beta, C), ::testing::HasSubstr(fmt::format("The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); + EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, num_rows + 1, row_offset, alpha, kernel_matrix.data(), B, beta, C), ::testing::HasSubstr(fmt::format("The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); // the mirror number of rows may not be too large - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_rows + 1, specific_num_rows, row_offset, alpha, kernel_matrix, B, beta, C), ::testing::HasSubstr(fmt::format("The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); + EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_rows + 1, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B, beta, C), ::testing::HasSubstr(fmt::format("The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); // the row offset may not be too large - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, num_rows + 1, alpha, kernel_matrix, B, beta, C), ::testing::HasSubstr(fmt::format("The row offset ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); + EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, num_rows + 1, alpha, kernel_matrix.data(), B, beta, C), ::testing::HasSubstr(fmt::format("The row offset ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); } } @@ -445,22 +442,22 @@ TYPED_TEST_P(GenericBackendCSVMKernelFunctionDeathTest, assemble_kernel_matrix_e const auto run_assembly = [=](const plssvm::parameter ¶ms_p, std::vector &kernel_matrix_p, const plssvm::soa_matrix &data_p, const std::size_t device_specific_num_rows_p, const std::size_t row_offset_p, const std::vector &q_red_p, const plssvm::real_type QA_cost_p) { switch (kernel) { case plssvm::kernel_function_type::linear: - device_kernel_assembly(kernel_matrix_p, data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost); break; case plssvm::kernel_function_type::polynomial: - device_kernel_assembly(kernel_matrix_p, data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, params_p.degree, std::get(params_p.gamma), params_p.coef0); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, params_p.degree, std::get(params_p.gamma), params_p.coef0); break; case plssvm::kernel_function_type::rbf: - device_kernel_assembly(kernel_matrix_p, 
data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); break; case plssvm::kernel_function_type::sigmoid: - device_kernel_assembly(kernel_matrix_p, data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma), params_p.coef0); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma), params_p.coef0); break; case plssvm::kernel_function_type::laplacian: - device_kernel_assembly(kernel_matrix_p, data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); break; case plssvm::kernel_function_type::chi_squared: - device_kernel_assembly(kernel_matrix_p, data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); break; } }; From 3025c7606fd3cb67c63f543a28077fe189eda991 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 15:02:20 +0200 Subject: [PATCH 011/215] Remove unnecessary conditions. Improve variable naming. --- .../OpenMP/kernel/cg_explicit/blas.hpp | 8 ++++---- .../backends/OpenMP/kernel/predict_kernel.hpp | 20 ++++++------------- src/plssvm/backends/OpenMP/csvm.cpp | 6 ++++++ 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index 298962c19..81f560421 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -45,7 +45,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -54,7 +54,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num #pragma omp parallel for collapse(2) for (std::size_t rhs_block = 0; rhs_block < blocked_num_rhs; rhs_block += THREAD_BLOCK_SIZE_uz) { - for (std::size_t row_block = 0; row_block < blocked_device_specific_num_rows; row_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_device_num_rows; row_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block for (std::size_t rhs_thread = 0; rhs_thread < THREAD_BLOCK_SIZE_uz; ++rhs_thread) { for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { @@ -83,7 +83,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num } else { A_cache = 
A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; } - sum += A_cache * B(global_i_idx, dim_block + dim + device_row_offset); + sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } temp[internal_i][internal_j] += sum; } @@ -164,7 +164,7 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz real_type sum{ 0.0 }; for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { - const real_type A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - dim_block + dim + global_j_idx]; + const real_type A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx]; sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } temp[internal_i][internal_j] += sum; diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 49d98d4da..89c0a380c 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -89,9 +89,7 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix(internal_feature); const auto global_class_idx = class_idx + static_cast(internal_class); - if (global_class_idx < num_classes && global_feature_idx < num_features) { - w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; - } + w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; } } } @@ -162,13 +160,10 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { // calculate the indices to access the global data - const auto device_global_pp_idx = pp_idx + static_cast(internal_pp); - const auto global_pp_idx = device_row_offset + device_global_pp_idx; + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); const auto global_class_idx = class_idx + static_cast(internal_class); - if (global_class_idx < num_classes && global_pp_idx < device_num_predict_points) { - prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; - } + prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; } } } @@ -260,16 +255,13 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { // calculate the indices to access the global data and the data with respect to the current device - const auto device_global_pp_idx = pp_idx + static_cast(internal_pp); - const auto global_pp_idx = device_row_offset + device_global_pp_idx; + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); const auto global_sv_idx = sv_idx + static_cast(internal_sv); // be sure to not perform out-of-bounds accesses - if (device_global_pp_idx < device_num_predict_points && global_sv_idx < num_support_vectors) { - for 
(std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic - prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; - } + prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; } } } diff --git a/src/plssvm/backends/OpenMP/csvm.cpp b/src/plssvm/backends/OpenMP/csvm.cpp index 656d966f3..d34b25066 100644 --- a/src/plssvm/backends/OpenMP/csvm.cpp +++ b/src/plssvm/backends/OpenMP/csvm.cpp @@ -275,6 +275,8 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s break; } } + // restore padding entries by setting them to zero + C.restore_padding(); } //***************************************************// @@ -330,6 +332,8 @@ aos_matrix csvm::predict_values(const parameter ¶ms, [[maybe_unused]] const auto duration = std::chrono::duration_cast(end - start); PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "predict_values", "w_kernel", duration })); } + // restore padding entries by setting them to zero + w.restore_padding(); // reduce w on all MPI ranks comm_.allreduce_inplace(w); @@ -369,6 +373,8 @@ aos_matrix csvm::predict_values(const parameter ¶ms, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "predict_values", "predict_kernel", duration })); } + // restore padding entries by setting them to zero + out.restore_padding(); return out; } From 46a955806df29922477ac79cd5fea07cb0329f3d Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 15:19:14 +0200 Subject: [PATCH 012/215] Update variable names. 
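The renames follow one convention across the CUDA kernels: the variable driving an outer blocking loop is named after what it blocks over (feature_block, class_block, sv_block, dim_block), while the short name (feature, sv, dim) is kept for the inner loop that walks a single cached block, so a global index always reads as block + offset (e.g. feature_block + feature).

For orientation while reading the blas.cuh hunk below: the A accesses (commented "// SoA, upper triangular matrix only") address the packed upper triangle of a matrix whose padded row length is n_pad = num_rows - device_row_offset + PADDING_SIZE, stored row-major with the lower triangle dropped, so element (i, j) with i <= j sits at i * n_pad + j - i * (i + 1) / 2. The following is a minimal standalone sketch that only checks this closed-form index against a reference layout; packed_upper_index, n_pad, and the concrete sizes are illustrative placeholders, not PLSSVM symbols.

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Index of element (i, j), i <= j, in a packed row-major upper-triangular
    // matrix whose (unpacked) row length is n_pad entries. n_pad stands in for
    // num_rows - device_row_offset + PADDING_SIZE used by device_kernel_symm.
    std::size_t packed_upper_index(const std::size_t i, const std::size_t j, const std::size_t n_pad) {
        return i * n_pad + j - i * (i + 1) / 2;
    }

    int main() {
        constexpr std::size_t n = 4;      // logical number of rows (hypothetical)
        constexpr std::size_t n_pad = 6;  // padded row length (hypothetical padding of 2)

        // build the packed storage row by row: row i keeps only columns i .. n_pad - 1
        std::vector<std::size_t> packed;
        for (std::size_t i = 0; i < n; ++i) {
            for (std::size_t j = i; j < n_pad; ++j) {
                packed.push_back(i * 100 + j);  // encode (i, j) so lookups can be verified
            }
        }

        // the closed-form index must return exactly the element stored for (i, j)
        for (std::size_t i = 0; i < n; ++i) {
            for (std::size_t j = i; j < n_pad; ++j) {
                assert(packed[packed_upper_index(i, j, n_pad)] == i * 100 + j);
            }
        }
        return 0;
    }
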
--- .../backends/CUDA/kernel/cg_explicit/blas.cuh | 24 +++++------ .../cg_explicit/kernel_matrix_assembly.cuh | 12 +++--- .../kernel_matrix_assembly_blas.cuh | 26 ++++++------ .../backends/CUDA/kernel/predict_kernel.cuh | 40 +++++++++---------- 4 files changed, 51 insertions(+), 51 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh index 1a6be4ae8..d2adc5618 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh @@ -58,7 +58,7 @@ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // iterate over all values using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows - device_row_offset); dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -67,20 +67,20 @@ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t // store the values in the shared memory // determine on which side of the diagonal we are located - if (dim + threadIdx_y < global_j_idx_linear) { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim + threadIdx_y) * (dim + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + if (dim_block + threadIdx_y < global_j_idx_linear) { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_y) * (dim_block + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } else { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim_block + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - 
temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; } } } @@ -150,7 +150,7 @@ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std: const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows // iterate over the remaining values using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_num_rows; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -158,16 +158,16 @@ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std: const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim + threadIdx_y - std::size_t{ 1 }) * (dim + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + threadIdx_y - std::size_t{ 1 }) * (dim_block + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim_block + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim_block + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh index e4a3fa22d..70c9b4101 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh @@ -67,7 +67,7 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type const auto j_idx_linear = blockIdx_y * blockDim_y * 
INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -75,17 +75,17 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA - data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index 8e8dd03c2..960f61b9f 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -80,7 +80,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -88,17 +88,17 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty const auto global_j_idx_linear = device_row_offset + 
j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA - data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -139,15 +139,15 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y]; // SoA - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA + B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA } __syncthreads(); // wait until all threads loaded their part of the data @@ -167,7 +167,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // calculate the indices to access the global data const auto global_j_idx = device_row_offset + j_idx + static_cast(internal); - atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA + atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_x], 
C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA } __syncthreads(); // wai until all threads updated C with their values } @@ -195,14 +195,14 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y]; // SoA + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } __syncthreads(); // wait until all threads loaded their part of the data @@ -223,7 +223,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // calculate the indices to access the global data const auto global_i_idx = device_row_offset + i_idx + static_cast(internal); - atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); // SoA + atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); // SoA } __syncthreads(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 5469b01d9..9c462127e 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -58,7 +58,7 @@ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, con const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_num_sv; sv += THREAD_BLOCK_SIZE_uz) { + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -66,16 +66,16 @@ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, con const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv + threadIdx_y]; // SoA - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = 
alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv + sv_offset + threadIdx_y]; // AoS + feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + sv_offset + threadIdx_y]; // AoS } __syncthreads(); // wait until all threads loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += alpha_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + temp[internal_feature][internal_class] += alpha_cache[sv][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; } } } @@ -137,7 +137,7 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -145,16 +145,16 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA - w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(feature_block + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pp][internal_class] += w_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; + 
temp[internal_pp][internal_class] += w_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; } } } @@ -226,7 +226,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -234,17 +234,17 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; // store the values in the shared memory - pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA - sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], - pp_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); } } } @@ -271,17 +271,17 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses const std::size_t global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; // store the values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(dim + 
threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_y == std::size_t{ 0 }) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[dim + threadIdx_y]; + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; } else { out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } @@ -304,7 +304,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // calculate the indices to access the global data const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); } __syncthreads(); // wait until all threads updated their part of the prediction } From 0c682067bd8a86734bc94c7c5f2c9567a7133aef Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 14:44:36 +0200 Subject: [PATCH 013/215] Update documentation and add missing headers. --- .../backends/CUDA/kernel/cg_explicit/blas.cuh | 2 + .../kernel_matrix_assembly_blas.cuh | 3 + .../backends/CUDA/kernel/kernel_functions.cuh | 1 - .../backends/CUDA/kernel/predict_kernel.cuh | 84 +++++++++---------- .../backends/OpenMP/kernel/predict_kernel.hpp | 10 +-- 5 files changed, 52 insertions(+), 48 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh index d2adc5618..bacc84852 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh @@ -15,6 +15,8 @@ #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include // std::size_t + namespace plssvm::cuda::detail { /** diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index 960f61b9f..bf1ee66e5 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -18,6 +18,8 @@ #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include // std::size_t + namespace plssvm::cuda::detail { /** @@ -186,6 +188,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty } } } + //*************************************************************************// // calculate C += alpha * temp * B for the LOWER triangular matrix // //*************************************************************************// diff --git a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh index 72a4499ae..7748c45c8 100644 --- a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh +++ b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh @@ 
-51,7 +51,6 @@ template <> /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 9c462127e..285cdc3a6 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -18,21 +18,23 @@ #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include // std::size_t + namespace plssvm::cuda::detail { /** * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. * @param[out] w the vector to speedup the linear prediction * @param[in] alpha the previously learned weights - * @param[in] sv the support vectors + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] device_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha) the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { +__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); @@ -67,7 +69,7 @@ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, con // store the values in the shared memory feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + sv_offset + threadIdx_y]; // AoS + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + device_sv_offset + threadIdx_y]; // AoS } __syncthreads(); // wait until all threads loaded their part of the data @@ -264,50 +266,48 @@ __global__ void device_kernel_predict(real_type *prediction, const 
real_type *al auto alpha_cache = reinterpret_cast(cache_one); auto out_cache = reinterpret_cast(cache_two); - { - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points - // calculate the indices used in the current thread, pays attention to coalesced memory accesses - const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors - - // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data, pays attention to coalesced memory accesses - const std::size_t global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; - - // store the values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS - // the bias (rho) must only be applied once for all support vectors - if (blockIdx_y == std::size_t{ 0 }) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; - } else { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - } + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + + // store the values in the shared memory + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS + // the bias (rho) must only be applied once for all support vectors + if (blockIdx_y == std::size_t{ 0 }) { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; + } else { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } - __syncthreads(); // wait until all threads loaded their part of the data - - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + 
internal_sv]; - } + } + __syncthreads(); // wait until all threads loaded their part of the data + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; } - __syncthreads(); // wait until all threads performed their part of the calculations } + __syncthreads(); // wait until all threads performed their part of the calculations + } - // atomically add the intermediate cached results to the prediction - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data - const auto global_pp_idx = pp_idx + static_cast(internal); + // atomically add the intermediate cached results to the prediction + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - } - __syncthreads(); // wait until all threads updated their part of the prediction + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); } + __syncthreads(); // wait until all threads updated their part of the prediction } } } diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 89c0a380c..17696bd90 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -32,13 +32,13 @@ namespace plssvm::openmp::detail { * @param[in] alpha the previously learned weights * @param[in] support_vectors the support vectors * @param[in] device_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first row in @p support_vectors the current device is responsible for + * @param[in] device_sv_offset the first row in @p support_vectors the current device is responsible for */ -inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t sv_offset) { +inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t device_sv_offset) { PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() })); PLSSVM_ASSERT(support_vectors.num_rows() >= device_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_num_sv, support_vectors.num_rows()); - 
PLSSVM_ASSERT(support_vectors.num_rows() >= sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", sv_offset, support_vectors.num_rows());
+    PLSSVM_ASSERT(support_vectors.num_rows() >= device_sv_offset, "The sv offset ({}) cannot be greater than the total number of sv ({})!", device_sv_offset, support_vectors.num_rows());
 
     // calculate constants
     const std::size_t num_classes = alpha.num_rows();
@@ -52,7 +52,7 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix(INTERNAL_BLOCK_SIZE);
     const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE);
 
-#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(blocked_num_classes, blocked_num_features, num_classes, num_features, device_num_sv, sv_offset)
+#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(blocked_num_classes, blocked_num_features, num_classes, num_features, device_num_sv, device_sv_offset)
     for (std::size_t feature_block = 0; feature_block < blocked_num_features; feature_block += THREAD_BLOCK_SIZE_uz) {
         for (std::size_t class_block = 0; class_block < blocked_num_classes; class_block += THREAD_BLOCK_SIZE_uz) {
             // perform operations on the current block
@@ -75,7 +75,7 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix Date: Sat, 31 May 2025 14:45:38 +0200
Subject: [PATCH 014/215] Update the HIP backend kernels.

---
 .../HIP/kernel/cg_explicit/blas.hip.hpp       | 297 +++++++--------
 .../kernel_matrix_assembly.hip.hpp            |  52 +--
 .../kernel_matrix_assembly_blas.hip.hpp       | 180 ++++-----
 .../HIP/kernel/kernel_functions.hip.hpp       |  35 +-
 .../HIP/kernel/predict_kernel.hip.hpp         | 342 +++++++++---------
 5 files changed, 463 insertions(+), 443 deletions(-)

diff --git a/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp
index 124688d3a..b2e9c8ce3 100644
--- a/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp
+++ b/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp
@@ -13,11 +13,13 @@
 #define PLSSVM_BACKENDS_HIP_CG_EXPLICIT_BLAS_HIP_HPP_
 #pragma once
 
-#include "plssvm/constants.hpp"  // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/constants.hpp"  // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE}
 
 #include "hip/hip_runtime.h"
 #include "hip/hip_runtime_api.h"
 
+#include   // std::size_t
+
 namespace plssvm::hip::detail {
 
 /**
@@ -25,8 +27,8 @@ namespace plssvm::hip::detail {
  * @details In a multi-GPU setting, this function is only responsible for the rows this device is responsible for!
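The matrix @p A handled by this SYMM kernel is stored as a padded, packed upper triangle, which is why the A_cache loads further down in this kernel branch on which side of the main diagonal the index pair lies. A minimal host-side sketch of that index mapping, assuming n stands for the kernel's (num_rows - device_row_offset), pad mirrors PADDING_SIZE, and packed_upper_index is a purely illustrative helper that is not part of PLSSVM:

#include <cstddef>  // std::size_t
#include <utility>  // std::swap

// illustrative only: map a (row, col) pair of a symmetric matrix onto the
// padded, packed upper-triangular storage accessed by device_kernel_symm
std::size_t packed_upper_index(std::size_t row, std::size_t col, const std::size_t n, const std::size_t pad) {
    // the matrix is symmetric, so an access below the diagonal is mirrored above it
    if (row > col) {
        std::swap(row, col);
    }
    // row r starts at r * (n + pad), minus the r * (r + 1) / 2 entries that are never stored left of the diagonal
    return row * (n + pad) + col - row * (row + 1) / 2;
}

Storing only the upper triangle roughly halves the memory needed for the explicit kernel matrix; the same mapping appears again when the cg_explicit assembly kernel writes its result into kernel_matrix.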
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -35,78 +37,77 @@ namespace plssvm::hip::detail { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> device_specific_num_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto 
INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows - row_offset); dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // determine on which side of the diagonal we are located - if (dim + threadIdx_y < global_j) { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y - global_j * (global_j + 1ull) / 2ull]; - } - // determine on which side of the diagonal we are located - if (dim + threadIdx.y + THREAD_BLOCK_SIZE < global_j) { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - global_j * (global_j + 1ull) / 2ull]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to 
coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + // determine on which side of the diagonal we are located + if (dim_block + threadIdx_y < global_j_idx_linear) { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_y) * (dim_block + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim_block + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && device_global_j < device_specific_num_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + 
global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -118,8 +119,8 @@ __global__ void device_kernel_symm(const unsigned long long num_rows, const unsi * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -128,68 +129,72 @@ __global__ void device_kernel_symm(const unsigned long long num_rows, const unsi * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm_mirror(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long num_mirror_rows, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs - const auto 
i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < device_specific_num_rows; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y - 1ull) * (dim + threadIdx_y) / 2ull + device_specific_num_rows - (dim + threadIdx_y) + global_j]; - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - 1ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) / 2ull + device_specific_num_rows - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) + global_j]; - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y) * (num_rhs + 
PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + threadIdx_y - std::size_t{ 1 }) * (dim_block + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim_block + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim_block + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto 
global_j = row_offset + device_specific_num_rows + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && partial_global_j < num_mirror_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -203,27 +208,29 @@ __global__ void device_kernel_symm_mirror(const unsigned long long num_rows, con * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_add(const unsigned long long num_cols, real_type *lhs, const real_type *rhs, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i 
= (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] += rhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j]; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] += rhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -236,27 +243,29 @@ __global__ void device_kernel_inplace_matrix_add(const unsigned long long num_co * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_scale(const unsigned long long num_cols, real_type *lhs, const real_type scale, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + 
offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] *= scale; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] *= scale; // SoA } } } diff --git a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp index f0e01f813..308867d76 100644 --- a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp @@ -55,7 +55,7 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large - // create two shared memory arrays used for caching data point features + // create two shared memory arrays used for caching __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; @@ -65,30 +65,30 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // calculate the indices used in the current thread paying attention to coalesced memory accesses - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data points, pays attention to coalesced memory accesses - const auto global_i_linear = device_row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto 
global_j_linear = device_row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; - data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -97,29 +97,29 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type } // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - device_row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + 
static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } // update the upper triangular kernel matrix - kernel_matrix[device_global_j * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp index 77820e35a..97ef0798b 100644 --- a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp @@ -14,12 +14,14 @@ #pragma once #include "plssvm/backends/HIP/kernel/kernel_functions.hip.hpp" // plssvm::hip::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include // std::size_t + namespace plssvm::hip::detail { /** @@ -28,10 +30,10 @@ namespace plssvm::hip::detail { * @tparam Args the types of the parameters necessary for the specific kernel function * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -43,56 +45,64 @@ namespace plssvm::hip::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const 
real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const unsigned long long num_classes, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a wavefront must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_i_cache = reinterpret_cast(cache_one); + auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto 
global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -103,16 +113,18 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if ((device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j)) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i] - q[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out of bounds accesses (only using the upper triangular matrix) + if ((device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx)) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost; } } else { @@ -122,42 +134,44 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; - __shared__ real_type C_out_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the shared memory + B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA } __syncthreads(); // wait until 
all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE]; + C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE] += + temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j + static_cast(internal); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x + THREAD_BLOCK_SIZE]); + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal); + + atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA } __syncthreads(); // wai until all threads updated C with their values } @@ -166,51 +180,55 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - if (global_i == global_j) { + // update the diagonal + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type C_out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = 
reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + // store the values in the shared memory + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i + static_cast(internal); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal); + + atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + 
threadIdx.x]); // SoA } __syncthreads(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp b/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp index a98bb0715..1b2be0ae6 100644 --- a/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp @@ -51,42 +51,17 @@ template <> /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] __device__ __forceinline__ real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp index 6e349927e..9aaba6c5e 100644 --- a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp @@ -14,169 +14,183 @@ #pragma once #include "plssvm/backends/HIP/kernel/kernel_functions.hip.hpp" // plssvm::hip::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include // std::size_t + namespace plssvm::hip::detail { /** * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. 
- * @param[out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long device_specific_num_sv, const unsigned long long sv_offset, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = 
static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (unsigned long long sv = 0; sv < device_specific_num_sv; sv += THREAD_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_features + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes - data_cache_feature[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[global_feature_idx * (device_specific_num_sv + PADDING_SIZE_ull) + sv + threadIdx_y]; // SoA - data_cache_alpha[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[global_class_idx * (num_sv + PADDING_SIZE_ull) + sv + sv_offset + threadIdx_y]; // AoS - } - __syncthreads(); // wait until all threads loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * 
data_cache_feature[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + // store the values in the shared memory + feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + device_sv_offset + threadIdx_y]; // AoS + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache[sv][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_feature_idx = feature_idx + static_cast(internal_feature); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d[global_feature_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_feature][internal_class]; + w[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. 
+ * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // 
number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_w[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - data_cache_w[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - } - __syncthreads(); // wait until all threads loaded their part of the data + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * 
data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]; + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(feature_block + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_pd][internal_class] - rho_d[global_class_idx]; + prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho[global_class_idx]; // AoS } } } /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. 
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] sv the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -186,53 +200,55 @@ __global__ void device_kernel_predict_linear(real_type *prediction_d, const real * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto pp_cache = reinterpret_cast(cache_one); + auto sv_cache = reinterpret_cast(cache_two); + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_sv[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - data_cache_sv[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; 
+ // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]); + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); } } } @@ -241,55 +257,57 @@ __global__ void device_kernel_predict(real_type *prediction_d, const real_type * } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter...); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); } } { - // same shared memory size but with different dimensions - __shared__ real_type alpha_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto alpha_cache = reinterpret_cast(cache_one); + auto out_cache = reinterpret_cast(cache_two); - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared 
memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const unsigned long long global_sv_idx = sv_idx_linear + internal * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - alpha_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + // store the values in the shared memory + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors - if (blockIdx_y == 0ull) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y]; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + if (blockIdx_y == std::size_t{ 0 }) { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; } else { out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * 
THREAD_BLOCK_SIZE + threadIdx.x]); + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); } __syncthreads(); // wait until all threads updated their part of the prediction } From 45832e70abd46ed5b4042abebef83dcba8c8d32a Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 14:48:38 +0200 Subject: [PATCH 015/215] Fix Doxygen documentation. --- include/plssvm/detail/make_unique_for_overwrite.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp index 51b56e126..06f4cbaa5 100644 --- a/include/plssvm/detail/make_unique_for_overwrite.hpp +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -27,14 +27,14 @@ template struct is_unbounded_array : std::false_type { }; /** - * @brief Specialization of @ref is_unbounded_array for unbounded arrays. + * @brief Specialization of @ref plssvm::detail::is_unbounded_array for unbounded arrays. * @tparam T the array type */ template struct is_unbounded_array : std::true_type { }; /** - * @brief Shortcut for @ref is_unbounded_array::value. + * @brief Shortcut for @ref plssvm::detail::is_unbounded_array. * @tparam T the array type */ template @@ -48,7 +48,7 @@ template struct is_bounded_array : std::false_type { }; /** - * @brief Specialization of @ref is_bounded_array for unbounded arrays. + * @brief Specialization of @ref plssvm::detail::is_bounded_array for bounded arrays. * @tparam T the array type * @tparam N the size of the array */ @@ -56,7 +56,7 @@ template struct is_bounded_array : std::true_type { }; /** - * @brief Shortcut for @ref is_unbounded_array::value. + * @brief Shortcut for @ref plssvm::detail::is_bounded_array. * @tparam T the array type */ template From 10ff3c26011db0042860d5ca8df6cd663ac988bd Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 17:33:47 +0200 Subject: [PATCH 016/215] Add additional assert.
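The two functions touched below take the explicitly assembled kernel matrix as a raw real_type pointer (A respectively kernel_matrix). Unlike the soa_matrix arguments B and C, which are already validated through shape assertions, the raw pointer carries no shape information of its own, which is presumably why it now gets an explicit null-pointer check. A minimal sketch of the guard pattern with a simplified, hypothetical signature (device_kernel_symm_sketch is not part of the code base):

    #include "plssvm/constants.hpp"      // plssvm::real_type
    #include "plssvm/detail/assert.hpp"  // PLSSVM_ASSERT

    namespace plssvm::openmp::detail {

    inline void device_kernel_symm_sketch(const real_type *A) {
        // a raw pointer cannot be shape-checked, so only its validity is asserted
        PLSSVM_ASSERT(A != nullptr, "The A matrix pointer must be valid!");
        // ... access the upper triangular kernel matrix through A[...] ...
    }

    }  // namespace plssvm::openmp::detail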
--- include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp | 2 ++ .../OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp | 1 + 2 files changed, 3 insertions(+) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index 81f560421..3fbbaaa4b 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -38,6 +38,7 @@ namespace plssvm::openmp::detail { */ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); @@ -125,6 +126,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num */ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index f384645b1..381c8adf7 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -41,6 +41,7 @@ namespace plssvm::openmp::detail { */ template void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... 
kernel_function_parameter) { + PLSSVM_ASSERT(kernel_matrix != nullptr, "The kernel matrix result pointer must be valid!"); PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); From dad55f2688eb18ee01684cf9b605dd529273da54 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 17:34:57 +0200 Subject: [PATCH 017/215] Fix variable names. --- .../OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp | 4 ++-- .../OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp | 4 ++-- include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index 381c8adf7..b442288df 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -51,7 +51,7 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -60,7 +60,7 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -62,7 +62,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector #pragma omp parallel for collapse(2) schedule(dynamic) for (std::size_t row_block = 0; row_block < blocked_row_range; row_block += THREAD_BLOCK_SIZE_uz) { - for (std::size_t col_block = 0; col_block < blocked_device_specific_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t col_block = 0; col_block < blocked_device_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { for (std::size_t col_thread = 0; col_thread < THREAD_BLOCK_SIZE_uz; ++col_thread) { diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 17696bd90..a9fa64d07 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -198,7 +198,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m const 
std::size_t num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = predict_points.num_cols(); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows @@ -213,7 +213,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m } #pragma omp parallel for collapse(2) - for (std::size_t pp_block = 0; pp_block < blocked_device_specific_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t pp_block = 0; pp_block < blocked_device_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { for (std::size_t sv_block = 0; sv_block < blocked_num_support_vectors; sv_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block for (std::size_t pp_thread = 0; pp_thread < THREAD_BLOCK_SIZE_uz; ++pp_thread) { From e6b76f2d90c21fb560f4af0b2187c7e4ae195594 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 17:35:18 +0200 Subject: [PATCH 018/215] Use typename instead of class. --- include/plssvm/detail/make_unique_for_overwrite.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp index 06f4cbaa5..8e7603cc1 100644 --- a/include/plssvm/detail/make_unique_for_overwrite.hpp +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -68,7 +68,7 @@ constexpr bool is_bounded_array_v = is_bounded_array::value; * @tparam T the type of the object to create * @return a unique pointer to the newly created object (`[[nodiscard]]`) */ -template , bool> = true> +template , bool> = true> [[nodiscard]] std::unique_ptr make_unique_for_overwrite() { return std::unique_ptr(new T); } @@ -80,7 +80,7 @@ template , bool> = true> * @param[in] n the size of the array to create * @return a unique pointer to the newly created object (`[[nodiscard]]`) */ -template , bool> = true> +template , bool> = true> std::unique_ptr make_unique_for_overwrite(const std::size_t n) { return std::unique_ptr(new std::remove_extent_t[n]); } @@ -93,7 +93,7 @@ std::unique_ptr make_unique_for_overwrite(const std::size_t n) { * @param[in] args the arguments to pass to the constructor * @return a unique pointer to the newly created object (`[[nodiscard]]`) */ -template , bool> = true> +template , bool> = true> auto make_unique_for_overwrite(Args &&...args) = delete; } // namespace plssvm::detail From 5913b5028aebc2f9492f61ad830828eec17a3ee4 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 17:36:06 +0200 Subject: [PATCH 019/215] Move parallel zero memset to header function (used in multiple places). 
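The helper introduced below lifts the ad-hoc OpenMP zero-initialization loop out of csvm.cpp so it can be reused in other places. A minimal usage sketch mirroring the call site further down; the real_type[] element type is inferred from the sizeof(real_type) fallback in the removed code, and the wrapper function zeroed_kernel_matrix is purely illustrative:

    #include "plssvm/constants.hpp"                         // plssvm::real_type
    #include "plssvm/detail/make_unique_for_overwrite.hpp"  // plssvm::detail::{make_unique_for_overwrite, parallel_zero_memset}

    #include <cstddef>  // std::size_t
    #include <memory>   // std::unique_ptr

    std::unique_ptr<plssvm::real_type[]> zeroed_kernel_matrix(const std::size_t num_entries) {
        // allocate the buffer without value-initializing it ...
        auto kernel_matrix = plssvm::detail::make_unique_for_overwrite<plssvm::real_type[]>(num_entries);
        // ... then zero it, in parallel with OpenMP if available, via std::memset otherwise
        plssvm::detail::parallel_zero_memset(kernel_matrix.get(), num_entries);
        return kernel_matrix;
    }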
--- .../detail/make_unique_for_overwrite.hpp | 18 ++++++++++++++++++ src/plssvm/backends/OpenMP/csvm.cpp | 13 +++---------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp index 8e7603cc1..fcb205622 100644 --- a/include/plssvm/detail/make_unique_for_overwrite.hpp +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -13,7 +13,10 @@ #ifndef PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ #define PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT + #include // std::size_t +#include // std::memset #include // std::unique_ptr #include // std::false_type, std::true_type, std::enable_if_t, std::is_array_v @@ -96,6 +99,21 @@ std::unique_ptr make_unique_for_overwrite(const std::size_t n) { template , bool> = true> auto make_unique_for_overwrite(Args &&...args) = delete; +template +void parallel_zero_memset(T *dest, const std::size_t count) { + PLSSVM_ASSERT(dest != nullptr, "The destination pointer may not be a nullptr!"); + +// initialize the data pointed to by dest to all zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset +#if defined(_OPENMP) + #pragma omp parallel for + for (std::size_t i = 0; i < count; ++i) { + dest[i] = T{ 0 }; + } +#else + std::memset(dest, 0, count * sizeof(T)); +#endif +} + } // namespace plssvm::detail #endif // PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ diff --git a/src/plssvm/backends/OpenMP/csvm.cpp b/src/plssvm/backends/OpenMP/csvm.cpp index d34b25066..868ab32e6 100644 --- a/src/plssvm/backends/OpenMP/csvm.cpp +++ b/src/plssvm/backends/OpenMP/csvm.cpp @@ -19,7 +19,7 @@ #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked -#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::make_unique_for_overwrite +#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::{make_unique_for_overwrite, parallel_zero_memset} #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/move_only_any.hpp" // plssvm::detail::{move_only_any, move_only_any_cast} #include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY @@ -131,15 +131,8 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // only explicitly store the upper triangular matrix auto kernel_matrix = ::plssvm::detail::make_unique_for_overwrite(num_entries); - // initialize kernel matrix to all zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset -#if defined(_OPENMP) - #pragma omp parallel for - for (std::size_t i = 0; i < num_entries; ++i) { - kernel_matrix[i] = real_type{ 0.0 }; - } -#else - std::memset(kernel_matrix.get(), 0, num_entries * sizeof(real_type)); -#endif + // initialize kernel matrix to all zeros in parallel + ::plssvm::detail::parallel_zero_memset(kernel_matrix.get(), num_entries); const auto start = std::chrono::steady_clock::now(); switch (params.kernel_type) { From a67751bd9461722cbed2acf7dbc421722d4a5652 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 18:22:26 +0200 Subject: [PATCH 020/215] Add documentation and rearrange constant declarations. 
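The blocked_* constants rearranged below all follow the same blocking pattern: a loop range is split into blocks of INTERNAL_BLOCK_SIZE entries via a ceiling division. A small self-contained sketch of that arithmetic; INTERNAL_BLOCK_SIZE == 4 and the cast to double are illustrative assumptions only (the real code uses the value from plssvm/constants.hpp and casts to real_type):

    #include <cmath>    // std::ceil
    #include <cstddef>  // std::size_t

    constexpr unsigned INTERNAL_BLOCK_SIZE = 4;  // assumed value, for illustration only

    // number of INTERNAL_BLOCK_SIZE-sized blocks needed to cover count entries,
    // e.g., count == 10 -> ceil(10 / 4) == 3 blocks
    std::size_t blocked_count(const std::size_t count) {
        return static_cast<std::size_t>(std::ceil(static_cast<double>(count) / INTERNAL_BLOCK_SIZE));
    }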
--- .../kernel/cg_implicit/kernel_matrix_assembly_blas.hpp | 1 + include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp | 9 +++------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 067608773..391b9fd90 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -122,6 +122,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index a9fa64d07..1eed9735e 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -43,8 +43,6 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix(std::ceil(static_cast(num_features) / INTERNAL_BLOCK_SIZE)); const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); @@ -65,6 +63,7 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix, INTERNAL_BLOCK_SIZE> temp{}; + // iterate over all support vectors for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { @@ -117,8 +116,6 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons // calculate constants const std::size_t num_classes = prediction.num_cols(); const std::size_t num_features = predict_points.num_cols(); - - // calculate constants const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); @@ -139,6 +136,7 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; + // iterate over all features for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { @@ -197,9 +195,9 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); + const std::size_t num_features = predict_points.num_cols(); const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / 
INTERNAL_BLOCK_SIZE)); - const std::size_t num_features = predict_points.num_cols(); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -258,7 +256,6 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); const auto global_sv_idx = sv_idx + static_cast(internal_sv); - // be sure to not perform out-of-bounds accesses for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; From 54741fff26ea2fa0ebbd3e508895451682ad88c1 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 20:12:51 +0200 Subject: [PATCH 021/215] Inverse all temp indices for better consistency. --- .../plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp | 8 ++++---- .../OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp | 4 ++-- include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index 3fbbaaa4b..01db6a60e 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -86,7 +86,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num } sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } - temp[internal_i][internal_j] += sum; + temp[internal_j][internal_i] += sum; } } } @@ -101,7 +101,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num // be sure to not perform out-of-bounds accesses if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { - C(global_i_idx, global_j_idx) = alpha * temp[internal_i][internal_j] + beta * C(global_i_idx, global_j_idx); + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } @@ -169,7 +169,7 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz const real_type A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx]; sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } - temp[internal_i][internal_j] += sum; + temp[internal_j][internal_i] += sum; } } } @@ -184,7 +184,7 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz // be sure to not perform out-of-bounds accesses if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { - C(global_i_idx, global_j_idx) = alpha * temp[internal_i][internal_j] + beta * C(global_i_idx, global_j_idx); + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index b442288df..aa465dead 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -86,7 +86,7 @@ void 
device_kernel_assembly(real_type *kernel_matrix, const soa_matrix(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); } - temp[internal_i][internal_j] += sum; + temp[internal_j][internal_i] += sum; } } } @@ -102,7 +102,7 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix= global_j_idx) { - real_type temp_ij = temp[internal_i][internal_j]; + real_type temp_ij = temp[internal_j][internal_i]; // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 1eed9735e..7bea4b3c4 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -236,7 +236,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { sum += detail::feature_reduce(support_vectors(global_sv_idx, feature_block + feature), predict_points(global_pp_idx, feature_block + feature)); } - temp[internal_pp][internal_sv] += sum; + temp[internal_sv][internal_pp] += sum; } } } @@ -244,7 +244,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // update temp using the respective kernel function for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + temp[internal_sv][internal_pp] = detail::apply_kernel_function(temp[internal_sv][internal_pp], kernel_function_parameter...); } } @@ -258,7 +258,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic - prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; + prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_sv][internal_pp]; } } } From 46891d9b43158ce084aad132f9d90947a28ab7bb Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 20:31:54 +0200 Subject: [PATCH 022/215] Add missing doxygen documentation. --- include/plssvm/detail/make_unique_for_overwrite.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp index fcb205622..ca58eec3a 100644 --- a/include/plssvm/detail/make_unique_for_overwrite.hpp +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -99,6 +99,12 @@ std::unique_ptr make_unique_for_overwrite(const std::size_t n) { template , bool> = true> auto make_unique_for_overwrite(Args &&...args) = delete; +/** + * @brief Fill the array @p dest with zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset. 
+ * @tparam T the type of the values + * @param[in,out] dest the array to fill with zeros + * @param[in] count the number of values to fill + */ template void parallel_zero_memset(T *dest, const std::size_t count) { PLSSVM_ASSERT(dest != nullptr, "The destination pointer may not be a nullptr!"); From fa5cea380199ef9c8204c2ffd4890ec389493c87 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 20:56:51 +0200 Subject: [PATCH 023/215] Update the HPX backend kernels. --- .../backends/HPX/kernel/cg_explicit/blas.hpp | 130 +++++++------- .../cg_explicit/kernel_matrix_assembly.hpp | 87 ++++----- .../kernel_matrix_assembly_blas.hpp | 111 +++++++----- .../backends/HPX/kernel/kernel_functions.hpp | 35 +--- .../backends/HPX/kernel/predict_kernel.hpp | 165 +++++++++--------- src/plssvm/backends/HPX/csvm.cpp | 35 ++-- 6 files changed, 299 insertions(+), 264 deletions(-) diff --git a/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp index 20cbad247..99aeec376 100644 --- a/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp @@ -34,60 +34,63 @@ namespace plssvm::hpx::detail { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a symmetric matrix (memory optimized), @p B and @p C are matrices, and @p alpha and @p beta are scalars. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); +inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, 
num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // define range over which should be iterated - std::vector range(blocked_num_rhs * blocked_device_specific_num_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector range(blocked_num_rhs * blocked_device_num_rows); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t rhs = idx / blocked_device_specific_num_rows; - const std::size_t row = idx % blocked_device_specific_num_rows; - - const std::size_t rhs_idx = rhs * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = row * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < (num_rows - row_offset); ++dim) { + // iterate over all values + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_row) { - A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) + global_row - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A[global_row * (num_rows - row_offset + PADDING_SIZE_uz) + dim - global_row * (global_row + std::size_t{ 1 }) / std::size_t{ 2 }]; + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + sum += 
A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } - temp[internal_i][internal_j] += A_val * B(global_rhs, dim + row_offset); + temp[internal_j][internal_i] += sum; } } } @@ -95,13 +98,14 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t device_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && device_global_row < device_specific_num_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } @@ -113,22 +117,22 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != 
[{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); PLSSVM_ASSERT(num_rows >= num_mirror_rows, "The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_mirror_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); @@ -136,47 +140,51 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // define range over which should be iterated - std::vector range(blocked_num_rhs * blocked_num_mirror_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector range(blocked_num_rhs * blocked_num_mirror_rows); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t rhs = idx / blocked_num_mirror_rows; - const std::size_t row = idx % blocked_num_mirror_rows; - - const std::size_t rhs_idx = rhs * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = row * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_num_mirror_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_num_mirror_rows) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < device_specific_num_rows; ++dim) { + // iterate over the remaining values + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - const real_type A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows - dim + global_row]; - temp[internal_i][internal_j] += A_val * B(global_rhs, row_offset + dim); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + const real_type A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 
1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx]; + sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); + } + temp[internal_j][internal_i] += sum; } } } - // apply the (partial) BLAS operation and update C + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t partial_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + device_specific_num_rows + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && partial_global_row < num_mirror_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } diff --git a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp index af1d3c9e2..f4bf41d0d 100644 --- a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -32,82 +32,89 @@ namespace plssvm::hpx::detail { /** - * @brief Assemble the kernel matrix using the @p kernel function. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function + * @brief Assemble the kernel matrix using the @p kernel_function function. + * @tparam kernel_function the compile-time kernel function to use + * @tparam Args the types of the potential additional arguments for the @p kernel_function function * @param[out] kernel_matrix the resulting kernel matrix * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] q the `q` vector * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel_function function */ -template -void device_kernel_assembly(std::vector &kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... 
kernel_function_parameter) { +template +void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { + PLSSVM_ASSERT(kernel_matrix != nullptr, "The kernel matrix result pointer must be valid!"); PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(!kernel_matrix.empty(), "A matrix may not be empty!"); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); // calculate constants const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // count the number of entries in the final index list - std::vector indices(blocked_row_range * blocked_device_specific_num_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector indices(blocked_row_range * blocked_device_num_rows); std::iota(indices.begin(), indices.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, indices.cbegin(), indices.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t row_idx = (idx / blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (idx % blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { - // only calculate the upper triangular matrix -> done be only iterating over valid row <-> col pairs + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += 
THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + real_type temp_ij = temp[internal_j][internal_i]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - kernel_matrix[device_global_col * (num_rows - row_offset + PADDING_SIZE_uz) - device_global_col * (device_global_col + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_row] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 06df89dac..78a0f93d1 100644 --- a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -34,25 +34,25 @@ namespace plssvm::hpx::detail { /** - * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. + * @tparam kernel_function the compile-time kernel function to use + * @tparam Args the types of the potential additional arguments for the @p kernel_function function * @param[in] alpha the scalar alpha value * @param[in] q the `q` vector * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] B the matrix @p B * @param[in,out] C the matrix @p C - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel_function function */ -template -inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... kernel_function_parameter) { +template +inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... 
kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); PLSSVM_ASSERT(B.shape() == C.shape(), "The matrices B and C must have the same shape!"); PLSSVM_ASSERT(B.num_cols() == q.size(), "The number of columns in B ({}) must be the same as the values in q ({})!", B.num_cols(), q.size()); @@ -61,64 +61,89 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); const std::size_t num_classes = B.num_rows(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // count the number of entries in the final index list - std::vector indices(blocked_row_range * blocked_device_specific_num_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector indices(blocked_row_range * blocked_device_num_rows); std::iota(indices.begin(), indices.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, indices.cbegin(), indices.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t row_idx = (idx / blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (idx % blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { - // only calculate the upper triangular matrix -> done be only iterating over valid row <-> col pairs + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx 
+ static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { - temp_ij += cost; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { - atomic_ref{ C(class_idx, global_row) } += alpha * temp_ij * B(class_idx, global_row); + if (global_i_idx == global_j_idx) { + temp[internal_j][internal_i] += cost; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_j][internal_i] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + atomic_ref{ C(class_block + class_idx, global_i_idx) } += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { - atomic_ref{ C(class_idx, global_row) } += alpha * temp_ij * B(class_idx, global_col); + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + atomic_ref{ C(class_block + class_idx, global_i_idx) } += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_j_idx); // symmetry - atomic_ref{ C(class_idx, global_col) } += alpha * temp_ij * B(class_idx, global_row); + atomic_ref{ C(class_block + class_idx, global_j_idx) } += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } } diff --git a/include/plssvm/backends/HPX/kernel/kernel_functions.hpp b/include/plssvm/backends/HPX/kernel/kernel_functions.hpp index 6c0cd8a43..35e79d01d 100644 --- a/include/plssvm/backends/HPX/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/HPX/kernel/kernel_functions.hpp @@ -28,42 +28,17 @@ namespace plssvm::hpx::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. 
* @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp index 7ea68e172..050425b8a 100644 --- a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp @@ -16,7 +16,7 @@ #include "plssvm/backends/HPX/detail/utility.hpp" // plssvm::hpx::detail::atomic_ref #include "plssvm/backends/HPX/kernel/kernel_functions.hpp" // plssvm::hpx::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/matrix.hpp" // plssvm::aos_matrix, plssvm::soa_matrix @@ -38,59 +38,63 @@ namespace plssvm::hpx::detail { * @param[out] w the vector to speedup the linear prediction * @param[in] alpha the previously learned weights * @param[in] support_vectors the support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first row in @p support_vectors the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first row in @p support_vectors the current device is responsible for */ -inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_specific_num_sv, const std::size_t sv_offset) { +inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t device_sv_offset) { PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() })); - PLSSVM_ASSERT(support_vectors.num_rows() >= device_specific_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_specific_num_sv, support_vectors.num_rows()); - PLSSVM_ASSERT(support_vectors.num_rows() >= sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", sv_offset, support_vectors.num_rows()); + 
PLSSVM_ASSERT(support_vectors.num_rows() >= device_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_num_sv, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", device_sv_offset, support_vectors.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); - const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = support_vectors.num_cols(); const auto blocked_num_features = static_cast(std::ceil(static_cast(num_features) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // define range over which should be iterated + // define the range over which should be iterated std::vector range(blocked_num_classes * blocked_num_features); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t feature = idx / blocked_num_classes; - const std::size_t c = idx % blocked_num_classes; - - const std::size_t feature_idx = feature * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = c * INTERNAL_BLOCK_SIZE_uz; + const std::size_t feature_idx = (idx / blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (idx % blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t sv = 0; sv < device_specific_num_sv; ++sv) { - // perform the feature reduction calculation + // iterate over all support vectors + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_feature_idx = feature_idx + static_cast(internal_feature); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - temp[internal_feature][internal_class] += alpha(global_class_idx, sv_offset + sv) * support_vectors(sv_offset + sv, global_feature_idx); + real_type sum{ 0.0 }; + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + sum += alpha(global_class_idx, device_sv_offset + sv_block + sv) * support_vectors(device_sv_offset + sv_block + sv, global_feature_idx); + } + temp[internal_class][internal_feature] += sum; } } } - // update global array with local one + // store the result back to the w vector for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_feature_idx = 
feature_idx + static_cast(internal_feature); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w(global_class_idx, global_feature_idx) = temp[internal_feature][internal_class]; + w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; } } }); @@ -102,63 +106,64 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset) { +inline void device_kernel_predict_linear(aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset) { PLSSVM_ASSERT(w.num_rows() == rho.size(), "Size mismatch: {} vs {}!", w.num_rows(), rho.size()); PLSSVM_ASSERT(w.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", w.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), w.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), w.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_classes = prediction.num_cols(); - const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = predict_points.num_cols(); + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // define range over which should be iterated - std::vector range(blocked_device_specific_num_predict_points * blocked_num_classes); + // define the range over which should be iterated + std::vector range(blocked_device_num_predict_points * blocked_num_classes); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), 
range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t pp = idx / blocked_num_classes; - const std::size_t c = idx % blocked_num_classes; - - const std::size_t pp_idx = pp * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = c * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (idx / blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (idx % blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - // perform the feature reduction calculation + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - temp[internal_pp][internal_class] += w(global_class_idx, dim) * predict_points(global_pp_idx, dim); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w(global_class_idx, feature_block + feature) * predict_points(global_pp_idx, feature_block + feature); + } + temp[internal_class][internal_pp] += sum; } } } - // perform the dot product calculation + // store the result back to the w vector for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - if (device_global_pp_idx < device_specific_num_predict_points && global_class_idx < num_classes) { - prediction(global_pp_idx, global_class_idx) = temp[internal_pp][internal_class] - rho[global_class_idx]; - } + prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; } } }); @@ -166,61 +171,63 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons /** * @brief Predict the @p predict_points_d using the @p kernel_function. 
- * @tparam kernel the type of the used kernel function + * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function * @param[out] prediction the predicted values * @param[in] alpha the previously learned weights * @param[in] rho the previously learned bias * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict - * @param[in] device_specific_num_predict_points the number of predict points the current device is responsible for - * @param[in] row_offset the first row in @p predict_points the current device is responsible for + * @param[in] device_num_predict_points the number of predict points the current device is responsible for + * @param[in] device_row_offset the first row in @p predict_points the current device is responsible for * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ -template -inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset, Args... kernel_function_parameter) { +template +inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset, Args... kernel_function_parameter) { PLSSVM_ASSERT(alpha.num_rows() == rho.size(), "Size mismatch: {} vs {}!", alpha.num_rows(), rho.size()); PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(support_vectors.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", support_vectors.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); - const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = 
predict_points.num_cols(); + const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // define range over which should be iterated - std::vector range(blocked_device_specific_num_predict_points * blocked_num_support_vectors); + // define the range over which should be iterated + std::vector range(blocked_device_num_predict_points * blocked_num_support_vectors); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t pp = idx / blocked_num_support_vectors; - const std::size_t sv = idx % blocked_num_support_vectors; - - const std::size_t pp_idx = pp * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = sv * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (idx / blocked_num_support_vectors) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t sv_idx = (idx % blocked_num_support_vectors) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors(global_sv_idx, dim), - predict_points(global_pp_idx, dim)); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(support_vectors(global_sv_idx, feature_block + feature), predict_points(global_pp_idx, feature_block + feature)); + } + temp[internal_sv][internal_pp] += sum; } } } @@ -228,25 +235,23 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // update temp using the respective kernel function for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + temp[internal_sv][internal_pp] = detail::apply_kernel_function(temp[internal_sv][internal_pp], kernel_function_parameter...); } } // add results to prediction - for (std::size_t a = 0; a < num_classes; ++a) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned 
internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - // be sure to not perform out of bounds accesses - if (device_global_pp_idx < device_specific_num_predict_points && global_sv_idx < num_support_vectors) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { if (global_sv_idx == 0) { - atomic_ref{ prediction(global_pp_idx, a) } += -rho[a]; + atomic_ref{ prediction(global_pp_idx, class_block + class_idx) } += -rho[class_block + class_idx]; } - atomic_ref{ prediction(global_pp_idx, a) } += - temp[internal_pp][internal_sv] * alpha(a, global_sv_idx); + atomic_ref{ prediction(global_pp_idx, class_block + class_idx) } += alpha(class_block + class_idx, global_sv_idx) * temp[internal_sv][internal_pp]; } } } diff --git a/src/plssvm/backends/HPX/csvm.cpp b/src/plssvm/backends/HPX/csvm.cpp index 71f651688..4c24192dd 100644 --- a/src/plssvm/backends/HPX/csvm.cpp +++ b/src/plssvm/backends/HPX/csvm.cpp @@ -18,6 +18,7 @@ #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::{make_unique_for_overwrite, parallel_zero_memset} #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/move_only_any.hpp" // plssvm::detail::{move_only_any, move_only_any_cast} #include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY @@ -120,26 +121,33 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // get the offset of the data points this device is responsible for const std::size_t row_offset = dist.place_row_offset(0); - std::vector kernel_matrix(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); // only explicitly store the upper triangular matrix + // get the number of kernel matrix entries + const std::size_t num_entries = dist.calculate_explicit_kernel_matrix_num_entries_padded(0); + + // only explicitly store the upper triangular matrix + auto kernel_matrix = ::plssvm::detail::make_unique_for_overwrite(num_entries); + // initialize kernel matrix to all zeros in parallel + ::plssvm::detail::parallel_zero_memset(kernel_matrix.get(), num_entries); + const auto start = std::chrono::steady_clock::now(); switch (params.kernel_type) { case kernel_function_type::linear: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); break; case kernel_function_type::polynomial: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, 
device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; } const auto end = std::chrono::steady_clock::now(); @@ -200,16 +208,16 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s break; case solver_type::cg_explicit: { - const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); + const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); PLSSVM_ASSERT(!explicit_A.empty(), "The A matrix must not be empty!"); const auto start = std::chrono::steady_clock::now(); - detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); const std::size_t num_mirror_rows = num_rows - row_offset - device_specific_num_rows; if (num_mirror_rows > std::size_t{ 0 }) { - detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); } const auto end = std::chrono::steady_clock::now(); @@ -261,6 +269,8 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s }); // wait until operation is completed wait.get(); + // restore padding entries by setting them to zero + C.restore_padding(); } //***************************************************// @@ -317,6 +327,8 @@ aos_matrix csvm::predict_values(const parameter ¶ms, [[maybe_unused]] const auto duration = std::chrono::duration_cast(end - start); PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "predict_values", "w_kernel", duration })); } + // restore padding entries by setting them to zero + w.restore_padding(); // reduce w on all MPI ranks comm_.allreduce_inplace(w); @@ -358,6 +370,9 @@ aos_matrix csvm::predict_values(const parameter ¶ms, }); // wait until operation is completed wait.get(); + + // restore padding entries by setting them to zero + 
out.restore_padding(); return out; } From ff892127bfd52ea44fd498a64a4df558936ebc2a Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 14 Jun 2025 16:13:41 +0200 Subject: [PATCH 024/215] Some small changes: where possible change remaining const to constexpr, remove superfluous braces, add missing static_casts, and use correct THREAD_BLOCK_SIZE_uz. --- .../backends/CUDA/kernel/cg_explicit/blas.cuh | 20 ++++++++-------- .../kernel_matrix_assembly_blas.cuh | 8 +++---- .../backends/CUDA/kernel/predict_kernel.cuh | 24 +++++++++---------- .../HIP/kernel/cg_explicit/blas.hip.hpp | 20 ++++++++-------- .../kernel_matrix_assembly_blas.hip.hpp | 8 +++---- .../HIP/kernel/predict_kernel.hip.hpp | 24 +++++++++---------- 6 files changed, 52 insertions(+), 52 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh index bacc84852..ab6c7b11b 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh @@ -36,9 +36,9 @@ namespace plssvm::cuda::detail { */ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -128,9 +128,9 @@ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t */ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -207,8 +207,8 @@ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std: */ __global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all 
values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -242,8 +242,8 @@ __global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, rea */ __global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index bf1ee66e5..9861f2fb7 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -45,9 +45,9 @@ namespace plssvm::cuda::detail { template __global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -118,7 +118,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty const auto global_j_idx = device_row_offset + device_global_j_idx; // be sure to not perform out of bounds accesses (only using the upper triangular matrix) - if ((device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx)) { + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 285cdc3a6..9d20863c8 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -36,9 +36,9 @@ namespace plssvm::cuda::detail { */ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -115,9 +115,9 @@ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, con */ __global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr 
auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -200,9 +200,9 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t template __global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -232,8 +232,8 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses - const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA @@ -276,7 +276,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses - const auto global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS diff --git a/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp index b2e9c8ce3..9f5821634 100644 --- a/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp @@ -39,9 +39,9 @@ namespace plssvm::hip::detail { */ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, 
const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -131,9 +131,9 @@ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t */ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -210,8 +210,8 @@ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std: */ __global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -245,8 +245,8 @@ __global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, rea */ __global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension 
const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension diff --git a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp index 97ef0798b..2bc4a230f 100644 --- a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp @@ -47,9 +47,9 @@ namespace plssvm::hip::detail { template __global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -120,7 +120,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty const auto global_j_idx = device_row_offset + device_global_j_idx; // be sure to not perform out of bounds accesses (only using the upper triangular matrix) - if ((device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx)) { + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal diff --git a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp index 9aaba6c5e..6ba12a360 100644 --- a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp @@ -38,9 +38,9 @@ namespace plssvm::hip::detail { */ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -117,9 +117,9 @@ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, con */ __global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -202,9 +202,9 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t template __global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -234,8 +234,8 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses - const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA @@ -278,7 +278,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses - const auto global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS From b4d553ab3fd22ee15814e228b1f34bee5313496c Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 14 Jun 2025 16:14:13 +0200 Subject: [PATCH 025/215] Update comments. 
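The constexpr/static_cast changes from PATCH 024 above all serve the same purpose stated in the kernels' comments: widening the block constants and thread indices to 64-bit std::size_t before any index arithmetic so that intermediate products cannot overflow 32-bit integers. A minimal host-only C++ sketch of that idea follows; the block sizes and index values are illustrative stand-ins, not the library's actual THREAD_BLOCK_SIZE/INTERNAL_BLOCK_SIZE configuration.

    #include <cstddef>
    #include <cstdio>

    // illustrative block constants, standing in for THREAD_BLOCK_SIZE / INTERNAL_BLOCK_SIZE
    constexpr unsigned THREAD_BLOCK_SIZE = 16;
    constexpr unsigned INTERNAL_BLOCK_SIZE = 4;

    int main() {
        // widen the compile-time constants once, as the patched kernels do
        constexpr auto THREAD_BLOCK_SIZE_uz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
        constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);

        const unsigned block_idx = 70000000u;  // a large (hypothetical) global block index
        const unsigned thread_idx = 3u;

        // pure 32-bit arithmetic: the product wraps around once it exceeds 2^32 - 1
        const unsigned wrapped = block_idx * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE + thread_idx;

        // 64-bit arithmetic: casting the operands to std::size_t first keeps the full value
        const std::size_t correct = static_cast<std::size_t>(block_idx) * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz
                                    + static_cast<std::size_t>(thread_idx);

        std::printf("32-bit result: %u\n64-bit result: %zu\n", wrapped, correct);
        return 0;
    }

Making the widened constants constexpr instead of const additionally guarantees they are compile-time constants inside the device kernels rather than per-thread runtime values.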
--- include/plssvm/backends/HPX/kernel/predict_kernel.hpp | 2 +- include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp index 050425b8a..e98d09a58 100644 --- a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp @@ -156,7 +156,7 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons } } - // store the result back to the w vector + // update the global array with the local one for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { // calculate the indices to access the global data diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 7bea4b3c4..d8cd4a0be 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -154,7 +154,7 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons } } - // store the result back to the w vector + // update the global array with the local one for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { // calculate the indices to access the global data From 4020339b1eed19e92427d3470b53dd7f72c82709 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 14 Jun 2025 16:55:53 +0200 Subject: [PATCH 026/215] Rename sv to support_vectors for better readability and consistency. --- include/plssvm/backends/CUDA/kernel/predict_kernel.cuh | 6 +++--- include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 9d20863c8..d7ebf45a3 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -187,7 +187,7 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t * @param[in] prediction the predicted values * @param[in] alpha the previously learned weights * @param[in] rho the previously learned biases - * @param[in] sv the support vectors + * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors @@ -198,7 +198,7 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); @@ -237,7 +237,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // store the values in the shared memory pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA - sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data diff --git a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp index 6ba12a360..9ee22edc4 100644 --- a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp @@ -189,7 +189,7 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t * @param[in] prediction the predicted values * @param[in] alpha the previously learned weights * @param[in] rho the previously learned biases - * @param[in] sv the support vectors + * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors @@ -200,7 +200,7 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); @@ -239,7 +239,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // store the values in the shared memory pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA - sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data From 39513f8a36be4c9bc821226bf32c0cd3e3323c08 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 20 Jun 2025 12:01:10 +0200 Subject: [PATCH 027/215] Update some comments. --- .../kernel/cg_implicit/kernel_matrix_assembly_blas.cuh | 6 +++--- .../cg_implicit/kernel_matrix_assembly_blas.hip.hpp | 6 +++--- .../kernel/cg_implicit/kernel_matrix_assembly_blas.hpp | 5 ++++- .../kernel/cg_implicit/kernel_matrix_assembly_blas.hpp | 9 ++++++--- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index 9861f2fb7..186400757 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -58,10 +58,10 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // calculate the indices used in the current thread const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset - const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // calculate the indices used in the current thread, pays attention to coalesced memory accesses - const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // create two shared memory arrays used for caching @@ -117,7 +117,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty const auto device_global_j_idx = j_idx + static_cast(internal_j); const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses (only using the upper triangular matrix) + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], 
kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; diff --git a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp index 2bc4a230f..b2bee8d46 100644 --- a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp @@ -60,10 +60,10 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // calculate the indices used in the current thread const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset - const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // calculate the indices used in the current thread, pays attention to coalesced memory accesses - const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // create two shared memory arrays used for caching @@ -119,7 +119,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty const auto device_global_j_idx = j_idx + static_cast(internal_j); const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses (only using the upper triangular matrix) + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; diff --git a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 78a0f93d1..d6abc8cab 100644 --- a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -82,6 +82,9 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// // iterate over all features for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { @@ -108,7 +111,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const auto device_global_j_idx = j_idx + static_cast(internal_j); const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; diff --git a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 391b9fd90..952225c06 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -75,6 +75,9 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// // iterate over all features for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { @@ -101,7 +104,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const auto device_global_j_idx = j_idx + static_cast(internal_j); const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; @@ -128,13 +131,13 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector if (global_i_idx == global_j_idx) { // only apply once to the diagonal - for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } else { // apply it for the upper and lower triangular matrix - for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_j_idx); // symmetry From 3ef281db60488a7cbe891f9c704afb87a041074d Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 20 Jun 2025 15:25:21 +0200 Subject: [PATCH 028/215] Also use trimmed names in performance tracking output. --- src/plssvm/backends/SYCL/DPCPP/csvm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp index 12910a7ae..861344f5b 100644 --- a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp +++ b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp @@ -147,7 +147,7 @@ void csvm::init(const target_platform target) { " [{}, {}]\n", device, trimmed_device_name); - device_names.emplace_back(device_name); + device_names.emplace_back(trimmed_device_name); } } From a20d76d2059ef822a97f3165c5db9193374850c7 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 20 Jun 2025 15:26:09 +0200 Subject: [PATCH 029/215] Always use a loop for the custom powi function. --- .../backends/SYCL/kernel/kernel_functions.hpp | 35 +++---------------- 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp b/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp index 97c5c6248..6cfa159bc 100644 --- a/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp @@ -30,42 +30,17 @@ namespace plssvm::sycl::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. 
* @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// From 1c55fb151d2e7b2f8e29a4a9ec9e8cf3c3015098 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 20 Jun 2025 15:27:18 +0200 Subject: [PATCH 030/215] The get_default_queue now honors the default target platform. --- src/plssvm/backends/SYCL/DPCPP/detail/utility.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/plssvm/backends/SYCL/DPCPP/detail/utility.cpp b/src/plssvm/backends/SYCL/DPCPP/detail/utility.cpp index 28742b23f..6f14f9271 100644 --- a/src/plssvm/backends/SYCL/DPCPP/detail/utility.cpp +++ b/src/plssvm/backends/SYCL/DPCPP/detail/utility.cpp @@ -10,6 +10,7 @@ #include "plssvm/backends/SYCL/DPCPP/detail/queue.hpp" // plssvm::adaptivecpp::detail::queue #include "plssvm/backends/SYCL/DPCPP/detail/queue_impl.hpp" // plssvm::dpcpp::detail::queue (PImpl implementation) +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/string_utility.hpp" // plssvm::detail::{as_lower_case, contains} #include "plssvm/detail/utility.hpp" // plssvm::detail::contains #include "plssvm/exceptions/exceptions.hpp" // plssvm::platform_devices_empty @@ -101,9 +102,11 @@ void device_synchronize(const queue &q) { } queue get_default_queue() { - queue q; - q.impl = std::make_shared(); - return q; + const auto &[devices, target] = detail::get_device_list(determine_default_target_platform()); + // at least one platform must be present + PLSSVM_ASSERT(!devices.empty(), "At least one device must be available!"); + // per default, use the first device for the tests + return devices.front(); } std::string get_dpcpp_version() { From b6b98fc6e5f59bc22fe4a928cdc33473a02051bf Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 21 Jun 2025 17:38:25 +0200 Subject: [PATCH 031/215] Improve the AdaptiveCpp device pointer creation performance on CPUs with the OpenMP backend. 
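The change below special-cases zero-initializing memsets on the AdaptiveCpp OpenMP CPU backend: instead of going through sycl::queue::memset, the buffer is cleared with the OpenMP-parallel zero-memset helper from plssvm::detail. As a rough illustration (a minimal sketch only, not the actual plssvm::detail::parallel_zero_memset implementation), such a helper boils down to:

    #include <cstddef>

    // minimal sketch of an OpenMP-parallel zero memset; the real helper lives in
    // plssvm/detail/make_unique_for_overwrite.hpp and may be implemented differently
    template <typename T>
    void parallel_zero_memset_sketch(T *ptr, const std::size_t count) {
        #pragma omp parallel for
        for (long long i = 0; i < static_cast<long long>(count); ++i) {
            ptr[i] = T{};
        }
    }

Note that the fast path in the diff is only taken when the pattern is 0, the SYCL device is a CPU, and the backend is ::sycl::backend::omp; every other case still uses the regular queue.memset().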
--- .../backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp b/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp index 0338d10c9..44a9b9108 100644 --- a/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp +++ b/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp @@ -12,6 +12,7 @@ #include "plssvm/backends/SYCL/AdaptiveCpp/detail/queue_impl.hpp" // plssvm::adaptivecpp::detail::queue (PImpl implementation) #include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::adaptivecpp::backend_exception #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::parallel_zero_memset #include "plssvm/matrix.hpp" // plssvm::aos_matrix #include "plssvm/shape.hpp" // plssvm::shape @@ -56,7 +57,14 @@ void device_ptr::memset(const int pattern, const size_type pos, const size_ty throw backend_exception{ fmt::format("Illegal access in memset!: {} >= {}", pos, this->size_padded()) }; } const size_type rnum_bytes = std::min(num_bytes, (this->size_padded() - pos) * sizeof(value_type)); - queue_.impl->sycl_queue.memset(static_cast(data_ + pos), pattern, rnum_bytes).wait(); + + ::sycl::queue &queue = queue_.impl->sycl_queue; + // using our OpenMP enhanced 0 memset functions has dramatically better performance on the OpenMP CPU backend + if (pattern == 0 && queue.get_device().is_cpu() && queue.get_device().get_backend() == ::sycl::backend::omp) { + ::plssvm::detail::parallel_zero_memset(data_ + pos, rnum_bytes / sizeof(value_type)); + } else { + queue.memset(static_cast(data_ + pos), pattern, rnum_bytes).wait(); + } } template From 88e5d80ad781c9ef9051e0de06b77896e5afb65f Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sun, 22 Jun 2025 14:00:22 +0200 Subject: [PATCH 032/215] Based on the provided CPU target architectures, set the correct preferred vector width. Reason: GCC and clang refuse to use AVX-512 for Intel CPUs in their auto-vectorizers even on new Intel CPUs that fully support it. 
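The effect is easy to reproduce on a small loop: with GCC or Clang and -march=skylake-avx512 (illustrative target, any AVX-512 capable -march works), the auto-vectorizer typically sticks to 256-bit ymm registers unless the preferred vector width is raised explicitly.

    // scale_add.cpp -- toy loop used only to inspect the generated vector width
    void scale_add(float *x, const float *y, const float a, const int n) {
        for (int i = 0; i < n; ++i) {
            x[i] += a * y[i];
        }
    }

    // g++ -O3 -march=skylake-avx512 -S scale_add.cpp
    //   -> usually 256-bit (ymm) vector code
    // g++ -O3 -march=skylake-avx512 -mprefer-vector-width=512 -S scale_add.cpp
    //   -> 512-bit (zmm) vector code

The CMake logic below therefore forwards the matching -mprefer-vector-width value (512, 256, or 128) whenever exactly one CPU target architecture was requested via PLSSVM_TARGET_PLATFORMS.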
--- CMakeLists.txt | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b5c16f86..10de8e060 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -638,6 +638,37 @@ if (PLSSVM_ENABLE_LTO) endif () endif () +######################################################################################################################## +# enable the requested vectorization widths for the auto-vectorizers # +######################################################################################################################## +# GCC and clang both do not automatically auto-vectorize for AVX-512 (only AVX2) +# -> enable it if "cpu:avx512" was passed as PLSSVM_TARGET_PLATFORMS +if (PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 1) + if (${PLSSVM_CPU_TARGET_ARCHS} STREQUAL "avx512") + message(STATUS "Enabling AVX512 support for the auto-vectorizers (-mprefer-vector-width=512).") + target_compile_options( + ${PLSSVM_BASE_LIBRARY_NAME} + PUBLIC $<$:$<$:-mprefer-vector-width=512>> + ) + elseif (${PLSSVM_CPU_TARGET_ARCHS} STREQUAL "avx2" OR ${PLSSVM_CPU_TARGET_ARCHS} STREQUAL "avx") + message(STATUS "Enabling AVX/AVX2 support for the auto-vectorizers (-mprefer-vector-width=256).") + target_compile_options( + ${PLSSVM_BASE_LIBRARY_NAME} + PUBLIC $<$:$<$:-mprefer-vector-width=256>> + ) + elseif (${PLSSVM_CPU_TARGET_ARCHS} MATCHES "^sse") + message(STATUS "Enabling SSE for the auto-vectorizers (-mprefer-vector-width=128).") + target_compile_options( + ${PLSSVM_BASE_LIBRARY_NAME} + PUBLIC $<$:$<$:-mprefer-vector-width=128>> + ) + else () + message(FATAL_ERROR "Unrecognized CPU target architecture \"${PLSSVM_CPU_TARGET_ARCHS}\". Allowed values are: avx512, avx2, avx, sse.") + endif () +else () + # automatically use the "optimal" auto-vectorizer width +endif () + ######################################################################################################################## # check for optional and necessary dependencies # ######################################################################################################################## From 56a0f7d7a903ceb54aa16f73051cea5800307a8c Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sun, 22 Jun 2025 14:40:27 +0200 Subject: [PATCH 033/215] Update the SYCL backend kernels. Now: some parts of the kernels are specialized for the CPU for better performance. 
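The CPU specialization that shows up throughout the kernels below is essentially a compile-time choice of loop order for the blocked dot product: the kernels are now templated on the target platform, and on the CPU the reduction dimension becomes the innermost loop with a private scalar accumulator, while on GPUs it stays the outermost loop. Stripped of the index arithmetic and caching, the pattern looks roughly like this (simplified sketch with stand-in types and illustrative block sizes, not one of the actual kernels):

    using real_type = double;                    // stand-in for plssvm::real_type
    enum class target_platform { cpu, gpu };     // stand-in for plssvm::target_platform
    constexpr unsigned THREAD_BLOCK_SIZE = 16;   // illustrative value only
    constexpr unsigned INTERNAL_BLOCK_SIZE = 4;  // illustrative value only

    template <target_platform target>
    void blocked_dot(real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE],
                     const real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE],
                     const real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]) {
        if constexpr (target == target_platform::cpu) {
            // CPU: the reduction dimension (dim) is the fastest moving index; each temp entry
            // is accumulated through a private scalar, which the auto-vectorizer handles well
            for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
                for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
                    real_type sum{ 0.0 };
                    for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) {
                        sum += A_cache[dim][internal_j] * B_cache[dim][internal_i];
                    }
                    temp[internal_i][internal_j] += sum;
                }
            }
        } else {
            // GPU: the reduction dimension (dim) is the slowest moving index, as before
            for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) {
                for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
                    for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
                        temp[internal_i][internal_j] += A_cache[dim][internal_j] * B_cache[dim][internal_i];
                    }
                }
            }
        }
    }

In the real kernels the template parameter is the plssvm::target_platform the kernel is instantiated for; the diffs below show this dispatch in the BLAS and kernel matrix assembly kernels.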
--- .../SYCL/kernel/cg_explicit/basic/blas.hpp | 230 ++-- .../basic/kernel_matrix_assembly.hpp | 77 +- .../kernel/cg_explicit/hierarchical/blas.hpp | 359 +++--- .../hierarchical/kernel_matrix_assembly.hpp | 110 +- .../SYCL/kernel/cg_explicit/scoped/blas.hpp | 353 +++--- .../scoped/kernel_matrix_assembly.hpp | 106 +- .../kernel/cg_explicit/work_group/blas.hpp | 311 +++-- .../work_group/kernel_matrix_assembly.hpp | 94 +- .../basic/kernel_matrix_assembly_blas.hpp | 136 ++- .../kernel_matrix_assembly_blas.hpp | 237 ++-- .../scoped/kernel_matrix_assembly_blas.hpp | 283 +++-- .../kernel_matrix_assembly_blas.hpp | 183 +-- .../kernel/predict/basic/predict_kernel.hpp | 315 +++-- .../predict/hierarchical/predict_kernel.hpp | 483 +++++--- .../kernel/predict/scoped/predict_kernel.hpp | 513 ++++---- .../predict/work_group/predict_kernel.hpp | 419 ++++--- src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp | 1071 ++++------------- src/plssvm/backends/SYCL/DPCPP/csvm.cpp | 894 +++----------- 18 files changed, 2974 insertions(+), 3200 deletions(-) diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp index b55b374fe..4d19c4746 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -24,15 +26,20 @@ namespace plssvm::sycl::detail::basic { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. 
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -41,11 +48,11 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -59,33 +66,63 @@ class device_kernel_symm { * @param[in] idx indices representing the current point in the execution space */ void operator()(::sycl::item<2> idx) const { - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows_ - row_offset_); ++dim) { - // perform the dot product calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + 
static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_j) { - A_val = A_[dim * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + // iterate over all values using blocking + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + real_type A_val = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_val = A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_val = A_[global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + + sum += A_val * B_[((dim_block + dim) + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type A_val = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_val = A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_val = A_[global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + + temp[internal_i][internal_j] += A_val * B_[((dim_block + dim) + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } } - - temp[internal_i][internal_j] += A_val * B_[(dim + row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } } } @@ -93,13 +130,14 @@ class device_kernel_symm { // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + 
static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -109,8 +147,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -125,16 +163,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. 
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -143,12 +186,12 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -162,25 +205,49 @@ class device_kernel_symm_mirror { * @param[in] idx indices representing the current point in the execution space */ void operator()(::sycl::item<2> idx) const { - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_i = 0; internal_i < 
INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); - - temp[internal_i][internal_j] += A_[(dim) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows_ - dim + global_j] * B_[(dim + row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows_ - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B_[(dim_block + dim + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows_ - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B_[(dim_block + dim + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + } } } } @@ -188,13 +255,14 @@ class device_kernel_symm_mirror { // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = 
device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -205,8 +273,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -223,6 +291,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in both matrices @@ -244,19 +315,21 @@ class device_kernel_inplace_matrix_add { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -277,6 +350,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. 
* @param[in] num_cols the number of columns in the matrix @@ -298,19 +374,21 @@ class device_kernel_inplace_matrix_scale { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp index 22b24bae0..f808c56fc 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp @@ -14,8 +14,10 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -27,12 +29,16 @@ namespace plssvm::sycl::detail::basic { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. 
* @param[out] kernel_matrix the calculated kernel matrix @@ -60,7 +66,7 @@ class device_kernel_assembly { cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** @@ -74,22 +80,45 @@ class device_kernel_assembly { constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // only calculate the upper triangular matrix - if (i >= j) { + if (i_idx >= j_idx) { // create a private memory array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = device_row_offset_ + i + static_cast(internal_i); - const auto global_j = device_row_offset_ + j + static_cast(internal_j); - temp[internal_i][internal_j] += detail::feature_reduce(data_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], - data_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_[(feature_block + 
feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + } } } } @@ -97,23 +126,23 @@ class device_kernel_assembly { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset_ + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset_ + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } // update the upper triangular kernel matrix - kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp index 5e5803652..627eaadbe 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -24,15 +26,20 @@ namespace 
plssvm::sycl::detail::hierarchical { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -41,11 +48,11 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -59,36 +66,15 @@ class device_kernel_symm { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type A_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; + // create two local memory arrays used for caching + real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private temp matrix to zero group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - // cast 
all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -97,30 +83,44 @@ class device_kernel_symm { } }); - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(THREAD_BLOCK_SIZE)) { + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + 
const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory // determine on which side of the diagonal we are located - if (dim + threadIdx_x < global_j) { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; + if (dim_block + threadIdx_x < global_j_idx_linear) { + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_x) * (dim_block + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } else { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_x - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim_block + device_row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); @@ -128,13 +128,28 @@ class device_kernel_symm { // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; 
internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } @@ -145,17 +160,31 @@ class device_kernel_symm { // apply the (partial) BLAS operation and update C group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -166,8 +195,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -182,16 +211,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B 
+ beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -200,12 +234,12 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -219,36 +253,15 @@ class device_kernel_symm_mirror { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type A_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; + // create two local memory arrays used for caching + real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array 
used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private temp matrix to zero group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices and diagonal condition - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; @@ -256,39 +269,67 @@ class device_kernel_symm_mirror { } }); - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(THREAD_BLOCK_SIZE)) { - // load data into shared memory + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = 
static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_x - std::size_t{ 1 }) * (dim_block + threadIdx_x) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_x) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(device_row_offset_ + dim_block + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); // implicit barrier - // perform the feature reduction calculation + // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + 
internal_i]; + } } } } @@ -299,17 +340,31 @@ class device_kernel_symm_mirror { // apply the (remaining) BLAS operation and update C group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto partial_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -321,8 +376,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -339,6 +394,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
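// Editorial sketch (not part of the patch): a minimal, standalone illustration of the
// loop-interchange idea used in the dot-product step above, where the CPU branch keeps
// `dim` as the fastest moving index (so each entry is reduced into a scalar register
// first) while the non-CPU branch keeps `dim` as the slowest moving index (so one cached
// row feeds all internal results before the next row is touched). TB and IB are
// illustrative stand-ins for THREAD_BLOCK_SIZE and INTERNAL_BLOCK_SIZE; the arrays mimic
// one work-item's view of A_cache/B_cache. Both orderings add the same products in the
// same per-entry order, so they produce identical results.
#include <array>
#include <cstdio>

namespace loop_order_sketch {

constexpr unsigned TB = 8;  // stand-in for THREAD_BLOCK_SIZE
constexpr unsigned IB = 4;  // stand-in for INTERNAL_BLOCK_SIZE

using tile_t = std::array<std::array<double, IB * TB>, TB>;
using result_t = std::array<std::array<double, IB>, IB>;

// "CPU" ordering: dim innermost, each (i, j) entry is accumulated into a scalar first
result_t dim_innermost(const tile_t &A_cache, const tile_t &B_cache, unsigned id0, unsigned id1) {
    result_t temp{};
    for (unsigned i = 0; i < IB; ++i) {
        for (unsigned j = 0; j < IB; ++j) {
            double sum = 0.0;
            for (unsigned dim = 0; dim < TB; ++dim) {
                sum += A_cache[dim][id0 * IB + j] * B_cache[dim][id1 * IB + i];
            }
            temp[i][j] += sum;
        }
    }
    return temp;
}

// "GPU" ordering: dim outermost, one cached row is reused for all IB x IB partial results
result_t dim_outermost(const tile_t &A_cache, const tile_t &B_cache, unsigned id0, unsigned id1) {
    result_t temp{};
    for (unsigned dim = 0; dim < TB; ++dim) {
        for (unsigned i = 0; i < IB; ++i) {
            for (unsigned j = 0; j < IB; ++j) {
                temp[i][j] += A_cache[dim][id0 * IB + j] * B_cache[dim][id1 * IB + i];
            }
        }
    }
    return temp;
}

}  // namespace loop_order_sketch

int main() {
    using namespace loop_order_sketch;
    tile_t A{};
    tile_t B{};
    // fill the tiles with deterministic values
    for (unsigned dim = 0; dim < TB; ++dim) {
        for (unsigned k = 0; k < IB * TB; ++k) {
            A[dim][k] = 0.5 * dim + 0.25 * k;
            B[dim][k] = 1.0 + 0.125 * dim;
        }
    }
    std::printf("results match: %s\n", dim_innermost(A, B, 1, 2) == dim_outermost(A, B, 1, 2) ? "yes" : "no");
    return 0;
}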
* @param[in] num_cols the number of columns in both matrices @@ -361,25 +419,27 @@ class device_kernel_inplace_matrix_add { void operator()(::sycl::group<2> group) const { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(0); - const std::size_t threadIdx_y = idx.get_local_id(1); - const std::size_t blockDim_x = idx.get_local_range(0); - const std::size_t blockDim_y = idx.get_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - - for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { - for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; - - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } }); @@ -401,6 +461,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
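// Editorial sketch (not part of the patch): the padded, row-major indexing scheme used
// by the in-place matrix kernels above, where element (i, j) of an m x n matrix lives at
// i * (num_cols + PADDING_SIZE) + j. The padding trades a little memory for branch-free
// inner loops: the blocked device kernels can touch whole tiles without per-element
// bounds checks. The `padding` value and matrix sizes below are illustrative only.
#include <cstddef>
#include <vector>

constexpr std::size_t padding = 4;  // illustrative stand-in for PADDING_SIZE

// in-place element-wise add on padded storage, mirroring the subscript used in
// device_kernel_inplace_matrix_add above
void inplace_add_padded(std::vector<double> &lhs, const std::vector<double> &rhs,
                        std::size_t num_rows, std::size_t num_cols) {
    for (std::size_t i = 0; i < num_rows; ++i) {
        for (std::size_t j = 0; j < num_cols; ++j) {
            lhs[i * (num_cols + padding) + j] += rhs[i * (num_cols + padding) + j];
        }
    }
}

int main() {
    const std::size_t m = 3, n = 5;
    // padded buffers: (m + padding) rows of (n + padding) columns each
    std::vector<double> lhs((m + padding) * (n + padding), 1.0);
    std::vector<double> rhs((m + padding) * (n + padding), 2.0);
    inplace_add_padded(lhs, rhs, m, n);  // afterwards lhs(i, j) == 3.0 for all valid (i, j)
    return 0;
}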
* @param[in] num_cols the number of columns in the matrix @@ -423,25 +486,27 @@ class device_kernel_inplace_matrix_scale { void operator()(::sycl::group<2> group) const { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(0); - const std::size_t threadIdx_y = idx.get_local_id(1); - const std::size_t blockDim_x = idx.get_local_range(0); - const std::size_t blockDim_y = idx.get_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - - for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { - for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; - - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp index d3e37ca54..3bc6d0878 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp @@ -14,8 +14,10 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // 
plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -28,12 +30,16 @@ namespace plssvm::sycl::detail::hierarchical { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[out] kernel_matrix the calculated kernel matrix @@ -61,7 +67,7 @@ class device_kernel_assembly { cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** @@ -69,7 +75,7 @@ class device_kernel_assembly { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // create two local memory arrays used for caching data point features + // create two local memory arrays used for caching real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; @@ -78,8 +84,17 @@ class device_kernel_assembly { // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { + // initialize private temp matrix to zero + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { // cast values to 32-bit unsigned int values to prevent implicit conversions @@ -91,25 +106,25 @@ class device_kernel_assembly { constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const std::size_t threadIdx_x = idx.get_local_id(0); // current work-item in work-group x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current work-item in work-group y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of work-items in work-group x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of work-items in work-group y-dimension - const std::size_t blockIdx_x = 
group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current work-item paying attention to coalesced memory accesses - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data points, pays attention to coalesced memory accesses - const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; - data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } }); @@ -121,11 +136,26 @@ class device_kernel_assembly { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 
0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -140,36 +170,36 @@ class device_kernel_assembly { constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const std::size_t threadIdx_x = idx.get_local_id(0); // current work-item in work-group x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current work-item in work-group y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of work-items in work-group x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of work-items in work-group y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points 
and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset_ + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset_ + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp(idx)[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } // update the upper triangular kernel matrix - kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp index 2e6983255..9d3d6bef8 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -24,15 +26,20 @@ namespace plssvm::sycl::detail::scoped { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL kernel invocation type. 
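// Editorial sketch (not part of the patch): the packed storage used for the explicit
// kernel matrix above. Only the upper triangle (global_i >= global_j) is kept, row j is
// shortened by the j entries that would fall below the diagonal, and every row keeps an
// extra `pad` entries. The index formula matches the subscript written by the assembly
// kernel:  idx(j, i) = j * (n + pad) - j * (j + 1) / 2 + i,
// with n standing in for (num_rows - device_row_offset) and pad for PADDING_SIZE; the
// concrete values below are illustrative only.
#include <cassert>
#include <cstddef>
#include <cstdio>

// packed index of element (row j, column i) with i >= j
constexpr std::size_t packed_upper_index(std::size_t j, std::size_t i, std::size_t n, std::size_t pad) {
    return j * (n + pad) - j * (j + 1) / 2 + i;
}

int main() {
    const std::size_t n = 6;    // illustrative matrix dimension
    const std::size_t pad = 2;  // illustrative per-row padding

    // enumerate the packed entries row by row: row j holds columns j .. n + pad - 1,
    // i.e. (n + pad - j) entries, and the formula enumerates them densely and in order
    std::size_t expected = 0;
    for (std::size_t j = 0; j < n; ++j) {
        for (std::size_t i = j; i < n + pad; ++i) {
            assert(packed_upper_index(j, i, n, pad) == expected);
            ++expected;
        }
    }
    std::printf("packed size: %zu (full padded rectangle would need %zu)\n", expected, n * (n + pad));
    return 0;
}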
+ constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -41,11 +48,11 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -62,85 +69,111 @@ class device_kernel_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &A_cache, auto &B_cache, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz 
+ threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // the indices used in the current work-item + ::sycl::require_local_mem(), // A_cache + ::sycl::require_local_mem(), // B_cache - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(THREAD_BLOCK_SIZE)) { - // load data into shared memory + // create two local memory arrays used for caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &A_cache, auto &B_cache, auto &temp) { + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory // determine on which side of the diagonal we are located - if (dim + threadIdx_x < global_j) { - A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - 
(dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; + if (dim_block + threadIdx_x < global_j_idx_linear) { + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_x) * (dim_block + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } else { - A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_x - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim_block + device_row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); - // perform calculations + // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } }); } + // apply the (partial) BLAS operation and update C ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group 
y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -152,8 +185,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -168,16 +201,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
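// Editorial sketch (not part of the patch): one way the grid_x_offset / grid_y_offset
// parameters used throughout these kernels can be produced on the host. When the logical
// 2D work-group range is larger than a single launch may cover, it is tiled into several
// launches and each launch receives the index of its first work-group; inside the kernel
// the offset is simply added to group[0] / group[1], exactly as above. The function name,
// `max_groups_per_launch` limit, and launch callback are hypothetical placeholders; only
// the offset arithmetic reflects the kernels.
#include <algorithm>
#include <cstddef>

template <typename Launch>
void launch_tiled_grid(std::size_t num_groups_x, std::size_t num_groups_y,
                       std::size_t max_groups_per_launch, Launch &&launch) {
    for (std::size_t grid_y_offset = 0; grid_y_offset < num_groups_y; grid_y_offset += max_groups_per_launch) {
        for (std::size_t grid_x_offset = 0; grid_x_offset < num_groups_x; grid_x_offset += max_groups_per_launch) {
            const std::size_t groups_x = std::min(max_groups_per_launch, num_groups_x - grid_x_offset);
            const std::size_t groups_y = std::min(max_groups_per_launch, num_groups_y - grid_y_offset);
            // inside the kernel: blockIdx_x = group[0] + grid_x_offset, blockIdx_y = group[1] + grid_y_offset
            launch(grid_x_offset, grid_y_offset, groups_x, groups_y);
        }
    }
}

int main() {
    std::size_t launches = 0;
    launch_tiled_grid(10, 7, 4, [&](std::size_t, std::size_t, std::size_t, std::size_t) { ++launches; });
    return launches == 3 * 2 ? 0 : 1;  // ceil(10/4) * ceil(7/4) launches expected
}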
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -186,12 +224,12 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -208,80 +246,105 @@ class device_kernel_symm_mirror { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &A_cache, auto &B_cache, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = 
blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // the indices used in the current work-item + ::sycl::require_local_mem(), // A_cache + ::sycl::require_local_mem(), // B_cache - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(THREAD_BLOCK_SIZE)) { - // load data into shared memory + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &A_cache, auto &B_cache, auto &temp) { + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + 
device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_x - std::size_t{ 1 }) * (dim_block + threadIdx_x) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_x) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(device_row_offset_ + dim_block + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); - // perform calculations + // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } }); } + // apply the (remaining) BLAS operation and update C ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + 
offsets if the global range is too large + + // calculate the indices to access the global data + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto partial_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -294,8 +357,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -312,6 +375,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
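// Editorial sketch (not part of the patch): which global rows of C the two scoped SYMM
// kernels above touch in a multi-GPU split, read off from their update steps.
// device_kernel_symm writes rows [device_row_offset, device_row_offset + device_num_rows),
// while device_kernel_symm_mirror writes the num_mirror_rows rows starting directly below
// that block (global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx).
// The helper names and the concrete numbers in main() are illustrative only.
#include <cstddef>
#include <cstdio>
#include <utility>

using row_range = std::pair<std::size_t, std::size_t>;  // half-open range [first, second)

row_range symm_rows(std::size_t device_row_offset, std::size_t device_num_rows) {
    return { device_row_offset, device_row_offset + device_num_rows };
}

row_range symm_mirror_rows(std::size_t device_row_offset, std::size_t device_num_rows, std::size_t num_mirror_rows) {
    const std::size_t first = device_row_offset + device_num_rows;  // matches global_j_idx above
    return { first, first + num_mirror_rows };
}

int main() {
    const std::size_t device_row_offset = 128;  // illustrative
    const std::size_t device_num_rows = 64;     // illustrative
    const std::size_t num_mirror_rows = 32;     // illustrative

    const row_range main_rows = symm_rows(device_row_offset, device_num_rows);
    const row_range mirror = symm_mirror_rows(device_row_offset, device_num_rows, num_mirror_rows);
    std::printf("symm updates C rows [%zu, %zu), symm_mirror updates C rows [%zu, %zu)\n",
                main_rows.first, main_rows.second, mirror.first, mirror.second);
    return 0;
}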
* @param[in] num_cols the number of columns in both matrices @@ -336,28 +402,29 @@ class device_kernel_inplace_matrix_add { void operator()(T group) const { ::sycl::memory_environment(group, [&]() { - // scale ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(group, 0); - const std::size_t threadIdx_y = idx.get_local_id(group, 1); - const std::size_t blockDim_x = group.get_logical_local_range(0); - const std::size_t blockDim_y = group.get_logical_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } }); @@ -380,6 +447,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
* @param[in] num_cols the number of columns in the matrix @@ -404,28 +474,29 @@ class device_kernel_inplace_matrix_scale { void operator()(T group) const { ::sycl::memory_environment(group, [&]() { - // scale ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(group, 0); - const std::size_t threadIdx_y = idx.get_local_id(group, 1); - const std::size_t blockDim_x = group.get_logical_local_range(0); - const std::size_t blockDim_y = group.get_logical_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { - for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp index 33c725a46..b882cdead 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp @@ -14,8 +14,10 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, 
apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -28,12 +30,16 @@ namespace plssvm::sycl::detail::scoped { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. * @param[out] kernel_matrix the calculated kernel matrix @@ -61,7 +67,7 @@ class device_kernel_assembly { cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** @@ -72,14 +78,17 @@ class device_kernel_assembly { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), // data_i_cache - ::sycl::require_local_mem(), // data_j_cache + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // data_i_cache + ::sycl::require_local_mem(), // data_j_cache + + // create a private memory array used for internal caching ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), // temp [&](auto &data_i_cache, auto &data_j_cache, auto &temp) { // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { // cast values to 32-bit unsigned int values to prevent implicit conversions @@ -91,25 +100,25 @@ class device_kernel_assembly { constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current work-item in work-group x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current work-item in work-group y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of work-items in work-group x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of work-items in work-group y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - 
const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current work-item paying attention to coalesced memory accesses - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data points, pays attention to coalesced memory accesses - const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; - data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } }); @@ -119,11 +128,26 @@ class device_kernel_assembly { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned 
internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -136,36 +160,36 @@ class device_kernel_assembly { constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current work-item in work-group x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current work-item in work-group y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of work-items in work-group x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of work-items in work-group y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points 
and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset_ + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset_ + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp(idx)[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } // update the upper triangular kernel matrix - kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp index 965b043a3..5c0949c34 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -24,16 +26,21 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL kernel invocation type. 
+ constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; the rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : A_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, B_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -67,64 +74,85 @@ class device_kernel_symm { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = 
static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += THREAD_BLOCK_SIZE_uz) { - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + // determine on which side of the diagonal we are located + if (dim_block + threadIdx_x < global_j_idx_linear) { + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_x) * (dim_block + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_x - global_j_idx_linear * 
(global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x < global_j) { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim_block + device_row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < 
num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -139,8 +167,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -155,17 +183,22 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. 
* @param[in] cgh the SYCL handler used to allocate the local memory * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; the rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : A_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, B_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -200,59 +233,79 @@ class device_kernel_symm_mirror { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto 
INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += THREAD_BLOCK_SIZE_uz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // store the values in the local memory - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - } - nd_idx.barrier(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory 
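The A_ access in the load loop that follows addresses a row-wise packed upper-triangular matrix whose rows keep their padding. A hedged sketch of the equivalent index computation; the helper name and the n_padded parameter are illustrative, with n_padded standing in for num_rows_ - device_row_offset_ + PADDING_SIZE:

    // illustrative sketch only: linear index of element (row, col), row <= col, in the
    // row-wise packed upper-triangular storage used for the kernel matrix A_ below
    #include <cstddef>

    constexpr std::size_t packed_upper_index(const std::size_t row, const std::size_t col, const std::size_t n_padded) {
        // row r starts after r * n_padded - r * (r - 1) / 2 stored elements (rows 0..r-1),
        // and column col sits at offset col - row inside that row
        return row * n_padded - row * (row + 1) / 2 + col;
    }

With row = dim_block + threadIdx_x and col = device_num_rows_ + global_j_idx_linear this reproduces the A_ index expression used in the loop below.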
+ for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_x - std::size_t{ 1 }) * (dim_block + threadIdx_x) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_x) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(device_row_offset_ + dim_block + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be 
sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -268,8 +321,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -286,6 +339,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in both matrices @@ -307,25 +363,27 @@ class device_kernel_inplace_matrix_add { */ void operator()(::sycl::nd_item<2> nd_idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // # num_rows - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // # num_rhs + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for 
(unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -346,6 +404,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in the matrix @@ -367,25 +428,27 @@ class device_kernel_inplace_matrix_scale { */ void operator()(::sycl::nd_item<2> nd_idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // # num_rows - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // # num_rhs + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < 
INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp index 560d556ea..ec9fc1773 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp @@ -14,6 +14,7 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/target_platforms.hpp" // plssvm::target_platform @@ -28,13 +29,16 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's work-group data parallel kernels. - * @details target the target platform + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ template class device_kernel_assembly { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory @@ -65,7 +69,7 @@ class device_kernel_assembly { cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) 
} { } /** @@ -82,12 +86,12 @@ class device_kernel_assembly { constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current work-item in work-group x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current work-item in work-group y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of work-items in work-group x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of work-items in work-group y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_y >= blockIdx_x) { @@ -95,75 +99,75 @@ class device_kernel_assembly { real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // calculate the indices used in the current work-item paying attention to coalesced memory accesses - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data points, pays attention to coalesced memory accesses - const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * 
THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_i_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; - data_j_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + data_i_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - if constexpr (target == target_platform::gpu_amd) { - // perform the feature reduction calculation, the block_dim is the slowest moving index - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); - } - } - } - } else { - // perform the feature reduction calculation, the block_dim is the fastest moving index + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - sum += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); } temp[internal_i][internal_j] += sum; } } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations } } // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx 
= (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset_ + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset_ + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } // update the upper triangular kernel matrix - kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp index 9c82ad31d..c07186c37 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -28,20 +30,24 @@ namespace plssvm::sycl::detail::basic { /** * @brief Perform an implicit 
BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -52,13 +58,13 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -67,7 +73,7 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
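For orientation, the operation these cg_implicit kernels perform can be written out in a few lines of ordinary host code. The sketch below mirrors the arithmetic of device_kernel_assembly_symm over all (i, j) pairs: the kernel matrix entry A(i, j) = k(x_i, x_j) + QA_cost - q[i] - q[j] (plus cost on the diagonal) is computed on the fly and folded directly into C += alpha * A * B, using the symmetry A(i, j) == A(j, i) so only one triangle is ever evaluated. This is a minimal single-threaded sketch only: it ignores the blocking, padding, SoA device layout, multi-device row offsets, and atomic updates of the real kernels, uses an RBF kernel as a stand-in for the templated kernel_function, and all container types and names are illustrative rather than part of this patch.

    #include <cmath>    // std::exp
    #include <cstddef>  // std::size_t
    #include <vector>   // std::vector

    // Single-threaded reference for the implicit SYMM-like operation C += alpha * A * B,
    // where A(i, j) = k(x_i, x_j) + QA_cost - q[i] - q[j] (+ cost on the diagonal) is never stored.
    // An RBF kernel is used as a stand-in for the templated kernel_function; all names are illustrative.
    inline void implicit_symm_reference(const double alpha,
                                        const std::vector<std::vector<double>> &X,  // data points, one row per point
                                        const std::vector<double> &q,
                                        const double QA_cost,
                                        const double cost,
                                        const double gamma,                          // RBF parameter
                                        const std::vector<std::vector<double>> &B,   // num_points x num_classes
                                        std::vector<std::vector<double>> &C) {       // num_points x num_classes
        const std::size_t num_points = X.size();
        const std::size_t num_classes = C.front().size();

        for (std::size_t i = 0; i < num_points; ++i) {
            for (std::size_t j = 0; j <= i; ++j) {  // only one triangle of A is ever evaluated
                // feature reduction followed by the kernel function (here: RBF)
                double dist = 0.0;
                for (std::size_t d = 0; d < X[i].size(); ++d) {
                    const double diff = X[i][d] - X[j][d];
                    dist += diff * diff;
                }
                double a_ij = std::exp(-gamma * dist) + QA_cost - q[i] - q[j];
                if (i == j) {
                    a_ij += cost;  // the cost factor is only applied to the diagonal
                }

                // fold the entry directly into C += alpha * A * B, exploiting A(i, j) == A(j, i)
                for (std::size_t c = 0; c < num_classes; ++c) {
                    C[i][c] += alpha * a_ij * B[j][c];
                    if (i != j) {
                        C[j][c] += alpha * a_ij * B[i][c];
                    }
                }
            }
        }
    }

The off-diagonal symmetry in the last loop is also why the device kernels wrap their C updates in detail::atomic_op: work-items responsible for different (i, j) tiles may add contributions to the same row of C concurrently.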
@@ -75,28 +81,53 @@ class device_kernel_assembly_symm { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // only calculate the upper triangular matrix - if (i >= j) { + if (i_idx >= j_idx) { // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - temp[internal_i][internal_j] += detail::feature_reduce(data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], - data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < 
INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + } } } } @@ -104,28 +135,48 @@ class device_kernel_assembly_symm { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { - temp_ij += cost_; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes_; ++class_idx) { - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + if (global_i_idx == global_j_idx) { + temp[internal_i][internal_j] += cost_; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = 
device_row_offset_ + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += alpha_ * temp[internal_i][internal_j] * B_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx]; } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes_; ++class_idx) { - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += alpha_ * temp[internal_i][internal_j] * B_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx]; // symmetry - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += alpha_ * temp[internal_i][internal_j] * B_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx]; } } } @@ -137,11 +188,12 @@ class device_kernel_assembly_symm { private: /// @cond Doxygen_suppress const real_type alpha_; + const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp index 342e8308b..ea9197444 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -28,20 +30,24 @@ namespace plssvm::sycl::detail::hierarchical { /** * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. * @details Uses SYCL's hierarchical data parallel kernels. 
+ * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -52,13 +58,13 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -67,41 +73,45 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
* @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; + // the indices used in the current work-item + ::sycl::private_memory i_idx{ group }; // num_rows - device_row_offset + ::sycl::private_memory j_idx{ group }; // device_num_rows + + ::sycl::private_memory i_idx_linear{ group }; // num_rows - device_row_offset + ::sycl::private_memory j_idx_linear{ group }; // device_num_rows - // create the shared memory arrays used for caching data point features - real_type data_cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create two local memory arrays used for caching + real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; // initialize private and local variables group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + i_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + j_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * 
INTERNAL_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + i_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { @@ -115,30 +125,36 @@ class device_kernel_assembly_symm { // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // reinterpret the arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto data_cache_i = reinterpret_cast(data_cache_one); - auto data_cache_j = reinterpret_cast(data_cache_two); + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_i_cache = reinterpret_cast(cache_one); + auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + 
data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } }); @@ -146,14 +162,30 @@ class device_kernel_assembly_symm { // perform the feature reduction calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -167,16 +199,18 @@ class device_kernel_assembly_symm { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - const auto device_global_j = j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx(idx) + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx(idx) + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && 
global_i_idx >= global_j_idx) { + // apply the final kernel function + temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] += cost_; } } else { @@ -189,38 +223,44 @@ class device_kernel_assembly_symm { // implicit group barrier - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // reinterpret the arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] - auto B_cache = reinterpret_cast(data_cache_one); - auto C_out_cache = reinterpret_cast(data_cache_two); + // reinterpret the local memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = real_type{ 0.0 }; + B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = real_type{ 0.0 }; // SoA } }); // implicit group barrier - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast 
values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); @@ -235,18 +275,22 @@ class device_kernel_assembly_symm { // implicit group barrier } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_y = idx.get_local_id(1); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j(idx) + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1]; + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1]; // SoA } }); @@ -258,10 +302,11 @@ class device_kernel_assembly_symm { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal_j); - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; } } @@ -270,38 +315,44 @@ class device_kernel_assembly_symm { // implicit group barrier - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // reinterpret the arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto B_cache = reinterpret_cast(data_cache_one); - auto C_out_cache = reinterpret_cast(data_cache_two); + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { 
group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the values in the shared memory - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + // store the values in the local memory + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); // implicit group barrier - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); @@ -316,18 +367,22 @@ class device_kernel_assembly_symm { // implicit group barrier } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal); - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += 
C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; // SoA } }); @@ -341,10 +396,10 @@ class device_kernel_assembly_symm { /// @cond Doxygen_suppress const real_type alpha_; const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp index c84db480f..c833b19da 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -28,20 +30,24 @@ namespace plssvm::sycl::detail::scoped { /** * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
* @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -52,13 +58,13 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -67,7 +73,7 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
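A recurring change in these kernels is the target-dependent loop order of the feature reduction: on the cpu target the feature index is now the innermost (fastest moving) loop, so each work-item accumulates the partial result in a register, while on all non-CPU targets the feature index is the outermost (slowest moving) loop, the ordering previously used only for gpu_amd. The standalone sketch below shows the two orderings over one cached feature block; the block sizes are placeholders for the constants in plssvm/constants.hpp, and a plain multiply stands in for detail::feature_reduce.

    // Placeholder block sizes; the real values are the compile-time constants from plssvm/constants.hpp.
    constexpr unsigned THREAD_BLOCK_SIZE = 8;
    constexpr unsigned INTERNAL_BLOCK_SIZE = 4;

    // Feature reduction over one cached feature block, written with the two loop orders used above;
    // data_i_cache/data_j_cache mimic the local-memory tiles of shape
    // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE].

    // cpu target: the feature is the fastest moving index, accumulated in a register per (i, j) pair
    void feature_block_cpu(const double data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE],
                           const double data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE],
                           double temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE],
                           const unsigned local_id_0,
                           const unsigned local_id_1) {
        for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
            for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
                double sum = 0.0;
                for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) {
                    sum += data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]
                         * data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j];
                }
                temp[internal_i][internal_j] += sum;
            }
        }
    }

    // non-cpu (GPU) targets: the feature is the slowest moving index, so each cached value is
    // reused across the whole INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE register tile
    void feature_block_gpu(const double data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE],
                           const double data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE],
                           double temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE],
                           const unsigned local_id_0,
                           const unsigned local_id_1) {
        for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) {
            for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
                for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
                    temp[internal_i][internal_j] += data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]
                                                  * data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j];
                }
            }
        }
    }

The `// SoA` markers added throughout refer to the structure-of-arrays device layout visible in the cache loads: feature f of data point r is stored at data[f * (num_rows + 1 + PADDING_SIZE) + r], i.e. rows are contiguous within a feature, which is what lets neighbouring work-items fill the local-memory tiles with coalesced accesses.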
@@ -77,86 +83,124 @@ class device_kernel_assembly_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), + // the indices used in the current work-item + ::sycl::require_private_mem(), // num_rows - device_row_offset + ::sycl::require_private_mem(), // device_num_rows + + ::sycl::require_private_mem(), // num_rows - device_row_offset + ::sycl::require_private_mem(), // device_num_rows + + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // cache_one + ::sycl::require_local_mem(), // cache_two + + // create a private memory array used for internal caching ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_i, auto &data_cache_j, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { + [&](auto &i_idx, auto &j_idx, auto &i_idx_linear, auto &j_idx_linear, auto &cache_one, auto &cache_two, auto &temp) { // initialize private and local variables ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + i_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + j_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + // calculate the indices to access the global data, 
pays attention to coalesced memory accesses + i_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; }); - // exploit symmetry + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { - // load data into local memory - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + { + // rename cached arrays + auto &data_i_cache = cache_one; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &data_j_cache = cache_two; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the values in the local memory - data_cache_i[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - } - }); - - // perform the feature reduction calculation - 
::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + // store the values in the local memory + data_i_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + }); + + // perform the feature reduction calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } } } - } - }); + }); + } } // apply the remaining part of the kernel function and store the value in the output kernel matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - const auto device_global_j = j(idx) + 
static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx(idx) + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx(idx) + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] += cost_; } } else { @@ -167,36 +211,42 @@ class device_kernel_assembly_symm { } }); - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { // rename cached arrays - auto &B_cache = data_cache_i; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &B_cache = cache_one; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global 
data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; // SoA } }); - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); @@ -209,18 +259,22 @@ class device_kernel_assembly_symm { }); } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_y = idx.get_local_id(group, 1); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j(idx) + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; // SoA } }); } @@ -230,48 +284,53 @@ class device_kernel_assembly_symm { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal_j); - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { 
temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; } } } }); - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // allocate shared memory - auto &B_cache = data_cache_i; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + // rename local memory + auto &B_cache = cache_one; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the values in the shared memory - B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + // store the values in the local memory + B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); - // implicit group barrier - - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); @@ -282,26 +341,26 @@ class device_kernel_assembly_symm { } } }); - - // implicit group 
barrier } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal); - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; // SoA } }); - - // implicit group barrier } } } @@ -312,10 +371,10 @@ class device_kernel_assembly_symm { /// @cond Doxygen_suppress const real_type alpha_; const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp index 2e6ea3f4f..509e6cb25 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -28,21 +30,25 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. * @details Uses SYCL's work-group data parallel kernels. 
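As a reading aid for the work-group SYMM kernel that follows, the operation it performs can be condensed into the host-side sketch below. Everything here is illustrative rather than plssvm API: the name implicit_symm_reference, the unpadded row-major std::vector containers, and the single-device view (no device_row_offset partitioning) are assumptions made only for the sketch, and k stands in for whichever kernel function is selected; the device kernel additionally tiles the work with THREAD_BLOCK_SIZE/INTERNAL_BLOCK_SIZE blocking and local-memory caches.

#include <cstddef>
#include <vector>

// Host-side reference (sketch, not plssvm API) for C += alpha * A * B, where the symmetric
// kernel matrix A[i][j] = k(x_i, x_j) + QA_cost - q[i] - q[j] (+ cost on the diagonal) is
// never stored. Only entries with j <= i are evaluated; each one updates both the i-th and
// the j-th row of C, which corresponds to the UPPER and LOWER passes in the device kernel.
template <typename Kernel>
void implicit_symm_reference(const double alpha, const std::vector<std::vector<double>> &x,
                             const std::vector<double> &q, const double QA_cost, const double cost,
                             const std::vector<std::vector<double>> &B,
                             std::vector<std::vector<double>> &C, Kernel k) {
    const std::size_t num_rows = x.size();
    const std::size_t num_classes = B.front().size();
    for (std::size_t i = 0; i < num_rows; ++i) {
        for (std::size_t j = 0; j <= i; ++j) {
            double a = k(x[i], x[j]) + QA_cost - q[i] - q[j];  // dimensional reduction
            if (i == j) {
                a += cost;  // the cost is only applied to the diagonal
            }
            for (std::size_t c = 0; c < num_classes; ++c) {
                C[i][c] += alpha * a * B[j][c];      // upper triangular contribution
                if (i != j) {
                    C[j][c] += alpha * a * B[i][c];  // mirrored lower triangular contribution
                }
            }
        }
    }
}

In the device code the same mirroring is achieved by zeroing the diagonal entries of temp between the UPPER and LOWER passes, so the main diagonal is applied to C exactly once.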
+ * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -53,15 +59,15 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(::sycl::handler &cgh, const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - data_cache_i_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - data_cache_j_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + device_kernel_assembly_symm(::sycl::handler &cgh, const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + cache_one_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + cache_two_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -70,7 +76,7 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. @@ -82,47 +88,72 @@ class device_kernel_assembly_symm { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = 
blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_y >= blockIdx_x) { // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { + // rename cached arrays + auto &data_i_cache = cache_one_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &data_j_cache = cache_two_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_cache_i_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + data_i_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; 
internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i_[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j_[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -133,16 +164,18 @@ class device_kernel_assembly_symm { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost_; } } else { @@ -152,25 +185,28 @@ class device_kernel_assembly_symm { } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + 
//*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { // rename cached arrays - auto &B_cache = data_cache_i_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &B_cache = cache_one_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -181,10 +217,12 @@ class device_kernel_assembly_symm { nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal); + + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; // SoA } nd_idx.barrier(); // wai until all work-items updated C with their values } @@ -193,34 +231,39 @@ class device_kernel_assembly_symm { // set potential diagonal entries in temp to 0.0 such that we 
don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); - if (global_i == global_j) { + // update the diagonal + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { // rename cached arrays - auto &B_cache = data_cache_i_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &B_cache = cache_one_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the in the shared memory - B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + // store the values in the local memory + B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -231,10 +274,12 @@ class device_kernel_assembly_symm { nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i + 
static_cast(internal); - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal); + + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; // SoA } nd_idx.barrier(); // wait until all threads updated C with their values } @@ -244,17 +289,17 @@ class device_kernel_assembly_symm { private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_i_; + ::sycl::local_accessor cache_one_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_j_; + ::sycl::local_accessor cache_two_; /// @cond Doxygen_suppress const real_type alpha_; const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp index 631bf80a1..07d1a79dc 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -28,29 +30,34 @@ namespace plssvm::sycl::detail::basic { /** * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. 
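For orientation, device_kernel_w_linear and device_kernel_predict_linear in this file together implement the familiar linear-kernel prediction shortcut. A host-side sketch of the same two steps follows; the name predict_linear_reference and the plain, unpadded point-major containers are assumptions made only for this sketch, while the device kernels use padded SoA/AoS layouts, a per-device support-vector partition (device_num_sv/device_sv_offset), and THREAD_BLOCK_SIZE blocking.

#include <cstddef>
#include <vector>

// Illustrative reference for the linear-kernel prediction shortcut:
//   w[f][c]          = sum_sv alpha[c][sv] * support_vector[sv][f]
//   prediction[p][c] = sum_f  w[f][c] * predict_point[p][f] - rho[c]
void predict_linear_reference(const std::vector<std::vector<double>> &alpha,            // [num_classes][num_sv]
                              const std::vector<std::vector<double>> &support_vectors,  // [num_sv][num_features]
                              const std::vector<double> &rho,                           // [num_classes]
                              const std::vector<std::vector<double>> &predict_points,   // [num_predict_points][num_features]
                              std::vector<std::vector<double>> &prediction) {           // [num_predict_points][num_classes]
    const std::size_t num_classes = alpha.size();
    const std::size_t num_sv = support_vectors.size();
    const std::size_t num_features = support_vectors.front().size();

    // step 1: build the w vector (what device_kernel_w_linear produces)
    std::vector<std::vector<double>> w(num_features, std::vector<double>(num_classes, 0.0));
    for (std::size_t f = 0; f < num_features; ++f) {
        for (std::size_t c = 0; c < num_classes; ++c) {
            for (std::size_t sv = 0; sv < num_sv; ++sv) {
                w[f][c] += alpha[c][sv] * support_vectors[sv][f];
            }
        }
    }

    // step 2: predict using w (what device_kernel_predict_linear produces)
    for (std::size_t p = 0; p < predict_points.size(); ++p) {
        for (std::size_t c = 0; c < num_classes; ++c) {
            double sum = 0.0;
            for (std::size_t f = 0; f < num_features; ++f) {
                sum += w[f][c] * predict_points[p][f];
            }
            prediction[p][c] = sum - rho[c];
        }
    }
}

Precomputing w once per model turns every subsequent prediction into a single dot product per class instead of a pass over all support vectors.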
- * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -60,77 +67,106 @@ class device_kernel_w_linear { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t feature_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto feature_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_classes // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; ++sv) { - // perform the dot product calculation + // iterate over all support vectors using blocking + for (std::size_t sv_block = 0; sv_block < 
device_num_sv_; sv_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + + real_type sum{ 0.0 }; + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + sum += alpha_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv_block + sv + device_sv_offset_] * // AoS + support_vectors_[global_feature_idx * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + sv]; // SoA + } + temp[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + + temp[internal_feature][internal_class] += alpha_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv_block + sv + device_sv_offset_] * // AoS + support_vectors_[global_feature_idx * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + sv]; // SoA + } + } + } + } + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - temp[internal_feature][internal_class] += alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_] * sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } - - // update global array with local one - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx + static_cast(internal_class); - const auto global_feature_idx = feature_idx + static_cast(internal_feature); - - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; - } - } } private: /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t 
grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -143,46 +179,70 @@ class device_kernel_predict_linear { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_classes // create a work-item private array used for internal caching real_type 
temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the dot product calculation - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_class_idx = class_idx + static_cast(internal_class); - - temp[internal_pd][internal_class] += w_d_[dim * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * predict_points_d_[dim * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w_[(feature_block + feature) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; // SoA + } + temp[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + temp[internal_pp][internal_class] += w_[(feature_block + feature) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; // SoA + } + } } } } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_class_idx = class_idx + static_cast(internal_class); - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } } private: /// @cond Doxygen_suppress - real_type 
*prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -192,21 +252,25 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -215,19 +279,19 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
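The if constexpr (target == target_platform::cpu) branches added throughout this patch change only the traversal order of one and the same blocked reduction: on the CPU target the feature loop is the fastest moving (innermost) index and accumulates into a scalar, on all other targets it is the slowest moving (outermost) index. The stripped-down sketch below shows the two orders side by side; the names and array shapes are illustrative, and a plain multiplication stands in for detail::feature_reduce.

#include <cstddef>

constexpr unsigned kInternalBlockSize = 4;  // stand-in for INTERNAL_BLOCK_SIZE
constexpr unsigned kThreadBlockSize = 8;    // stand-in for THREAD_BLOCK_SIZE

// CPU variant: the feature loop is the innermost (fastest moving) index and sums into a
// scalar before touching the temp tile.
void blocked_reduce_cpu(const double (&cache_i)[kThreadBlockSize][kInternalBlockSize],
                        const double (&cache_j)[kThreadBlockSize][kInternalBlockSize],
                        double (&temp)[kInternalBlockSize][kInternalBlockSize]) {
    for (unsigned i = 0; i < kInternalBlockSize; ++i) {
        for (unsigned j = 0; j < kInternalBlockSize; ++j) {
            double sum = 0.0;
            for (unsigned feature = 0; feature < kThreadBlockSize; ++feature) {
                sum += cache_i[feature][i] * cache_j[feature][j];  // stands in for feature_reduce
            }
            temp[i][j] += sum;
        }
    }
}

// GPU variant: the feature loop is the outermost (slowest moving) index; every iteration
// updates the whole INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE register tile.
void blocked_reduce_gpu(const double (&cache_i)[kThreadBlockSize][kInternalBlockSize],
                        const double (&cache_j)[kThreadBlockSize][kInternalBlockSize],
                        double (&temp)[kInternalBlockSize][kInternalBlockSize]) {
    for (unsigned feature = 0; feature < kThreadBlockSize; ++feature) {
        for (unsigned i = 0; i < kInternalBlockSize; ++i) {
            for (unsigned j = 0; j < kInternalBlockSize; ++j) {
                temp[i][j] += cache_i[feature][i] * cache_j[feature][j];
            }
        }
    }
}

Both orders yield the same result up to floating-point rounding; the branch is a per-target performance tuning of the same computation.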
@@ -235,54 +299,83 @@ class device_kernel_predict { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto sv_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_support_vectors // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_sv_idx = sv_idx + static_cast(internal_sv); - - temp[internal_pd][internal_sv] += detail::feature_reduce(sv_d_[dim * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx], - predict_points_d_[dim * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(support_vectors_[(feature_block + feature) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx], // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); // SoA + } + temp[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); + + temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors_[(feature_block + feature) * (num_sv_ 
+ PADDING_SIZE_uz) + global_sv_idx], // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); // SoA + } + } } } } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter_); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter_); } } - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; ++dim) { + // iterate over all classes using blocking + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { if (sv_idx == 0) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim] } += -rho_d_[dim]; + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + // calculate the index to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += -rho_[class_block + class_idx]; + } } } - // calculate intermediate results and store them in local memory - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // atomically add the results to the prediction + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_sv_idx = sv_idx + static_cast(internal_sv); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim] } += - temp[internal_pd][internal_sv] * alpha_d_[dim * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += + temp[internal_pp][internal_sv] * alpha_[(class_block + class_idx) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + } } } } @@ -290,11 +383,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp index 
dedfe609e..1bb93cc3c 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -28,29 +30,34 @@ namespace plssvm::sycl::detail::hierarchical { /** * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -59,36 +66,15 @@ class device_kernel_w_linear { * @param[in] group indices representing the 
current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory feature_idx{ group }; - ::sycl::private_memory feature_idx_linear{ group }; - ::sycl::private_memory class_idx{ group }; - ::sycl::private_memory class_idx_linear{ group }; + // create two local memory arrays used for caching + real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private temp matrix to zero group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - feature_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - feature_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; @@ -99,23 +85,36 @@ class device_kernel_w_linear { // implicit group barrier // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = 
static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_features + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_feature_idx = feature_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - data_cache_feature[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA - data_cache_alpha[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS + feature_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_x]; // SoA + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_x]; // AoS } }); @@ -123,13 +122,28 @@ class device_kernel_w_linear { // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * 
INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp(idx)[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_feature][internal_class] += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } } @@ -138,16 +152,30 @@ class device_kernel_w_linear { // implicit group barrier } - // update global array with local one + // update the global w-vector with the locally cached values group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_feature_idx = feature_idx(idx) + static_cast(internal_feature); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; // SoA } } }); @@ -155,41 +183,46 @@ class device_kernel_w_linear { private: /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// 
@endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -201,35 +234,15 @@ class device_kernel_predict_linear { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_pp[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_w[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory pp_idx{ group }; - ::sycl::private_memory pp_idx_linear{ group }; - ::sycl::private_memory class_idx{ group }; - ::sycl::private_memory class_idx_linear{ group }; + // create two local memory arrays used for caching + real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private variable group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block 
x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -240,25 +253,38 @@ class device_kernel_predict_linear { // implicit group barrier - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes - // load data 
into shared memory + // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_[(feature_block + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA } }); @@ -266,13 +292,28 @@ class device_kernel_predict_linear { // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp(idx)[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_pp][internal_class] += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } } @@ -281,16 +322,30 @@ class device_kernel_predict_linear { // implicit group barrier } - // update global array with local one + // update the global array with the local one group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - for (unsigned internal_pd = 0; internal_pd < 
INTERNAL_BLOCK_SIZE; ++internal_pd) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_pp_idx = pp_idx(idx) + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } }); @@ -298,10 +353,10 @@ class device_kernel_predict_linear { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -311,21 +366,25 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
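Stripped of blocking, local-memory caching, and padding, the predict_linear kernels above evaluate prediction(p, c) = sum over all features f of w(f, c) * x(p, f), minus rho(c). The following naive host-side reference uses the same SoA layouts for w and the predict points and the AoS layout for the prediction, but with illustrative names and without PADDING_SIZE; it is a sketch for clarity, not code from the patch:

    #include <cstddef>
    #include <vector>

    using real_type = double;

    // naive reference for the linear prediction kernels (no blocking, no padding)
    std::vector<real_type> predict_linear_reference(const std::vector<real_type> &w,       // SoA: [feature][class]
                                                    const std::vector<real_type> &points,  // SoA: [feature][point]
                                                    const std::vector<real_type> &rho,     // [class]
                                                    const std::size_t num_points,
                                                    const std::size_t num_features,
                                                    const std::size_t num_classes) {
        std::vector<real_type> prediction(num_points * num_classes, real_type{ 0.0 });  // AoS: [point][class]
        for (std::size_t p = 0; p < num_points; ++p) {
            for (std::size_t c = 0; c < num_classes; ++c) {
                real_type sum{ 0.0 };
                for (std::size_t f = 0; f < num_features; ++f) {
                    sum += w[f * num_classes + c] * points[f * num_points + p];
                }
                prediction[p * num_classes + c] = sum - rho[c];
            }
        }
        return prediction;
    }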
- * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -334,51 +393,34 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
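The constructors above now pass the kernel-function parameters to std::make_tuple directly instead of forwarding them; the parameters are taken by value, so the tuple member simply stores copies that live as long as the kernel function object. A small stand-alone sketch of this capture-and-reapply pattern (the class and function names here are illustrative only):

    #include <tuple>
    #include <utility>

    template <typename... Args>
    class parameter_capture {
      public:
        // take the parameters by value and copy them into the tuple member
        explicit parameter_capture(Args... args) :
            params_{ std::make_tuple(args...) } { }

        // later expand the stored parameters back into a call
        template <typename Fn>
        auto apply(Fn &&fn) const {
            return std::apply(std::forward<Fn>(fn), params_);
        }

      private:
        std::tuple<Args...> params_;
    };

    // usage (illustrative): parameter_capture<double, double>{ gamma, coef0 }.apply(
    //                           [](const double g, const double c) { return g + c; });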
* @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_pp[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_sv[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory pp_idx{ group }; - ::sycl::private_memory pp_idx_linear{ group }; - ::sycl::private_memory sv_idx_linear{ group }; + // create two local memory arrays used for caching + real_type cache_one[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type cache_two[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private variable group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - sv_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -390,25 +432,42 @@ class device_kernel_predict { // implicit group barrier { + // rename cached arrays -> not possible due to an AdaptiveCpp runtime exception + // auto &pp_cache = cache_one; + // auto &sv_cache = cache_two; + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items 
in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the values in the shared memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + // store the values in the local memory + cache_one[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; + cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[(feature_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; } }); @@ -416,14 +475,30 @@ class device_kernel_predict { // perform the feature reduction calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(cache_two[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + cache_one[feature][local_id_1 * INTERNAL_BLOCK_SIZE + 
internal_pp]); + } + temp(idx)[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pp][internal_sv] += detail::feature_reduce(cache_two[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + cache_one[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } } } } @@ -435,9 +510,9 @@ class device_kernel_predict { // update temp using the respective kernel function group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pd][internal_sv], kernel_function_parameter_); + temp(idx)[internal_pp][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pp][internal_sv], kernel_function_parameter_); } } }); @@ -445,33 +520,42 @@ class device_kernel_predict { // implicit group barrier { - // rename cached arrays -> can't rename the arrays due to AdaptiveCpp runtime exception - // auto &alpha_cache = data_cache_pp; - // auto &out_cache = data_cache_sv; + // rename cached arrays -> not possible due to an AdaptiveCpp runtime exception + // auto &alpha_cache = cache_one; + // auto &out_cache = cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - for (unsigned internal = 0; internal < 
INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + cache_one[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[(class_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; + cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_[class_block + threadIdx_x]; } else { - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } }); @@ -481,13 +565,14 @@ class device_kernel_predict { // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - data_cache_sv[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * data_cache_pp[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + cache_two[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pp][internal_sv] * cache_one[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); @@ -495,19 +580,29 @@ class device_kernel_predict { // implicit group barrier } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const 
auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx(idx) + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); @@ -518,11 +613,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp index e6d56ec56..a62418057 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -28,29 +30,34 @@ namespace plssvm::sycl::detail::scoped { /** * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
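All buffers in these kernels are addressed through the same flat, padded 2-D indexing convention: a row-major layout whose rows are over-allocated by PADDING_SIZE entries, so that the blocked accesses above can run past the logical row and column counts without extra bounds checks. A minimal sketch of that convention follows; the helper name is illustrative and does not exist in the code base:

    #include <cstddef>

    // flat index into a row-major matrix whose rows are padded by `padding` extra entries,
    // mirroring expressions such as
    //   global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx
    constexpr std::size_t padded_index(const std::size_t row, const std::size_t col,
                                       const std::size_t num_cols, const std::size_t padding) {
        return row * (num_cols + padding) + col;
    }

    // e.g. (illustrative):
    //   w[padded_index(feature, cls, num_classes, PADDING_SIZE)]         // SoA: feature-major
    //   prediction[padded_index(point, cls, num_classes, PADDING_SIZE)]  // AoS: point-major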
- * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -62,78 +69,101 @@ class device_kernel_w_linear { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_feature, auto &data_cache_alpha, auto &feature_idx, auto &feature_idx_linear, auto &class_idx, auto &class_idx_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - feature_idx(idx) = (blockIdx_y * 
blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - feature_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // feature_cache + ::sycl::require_local_mem(), // alpha_cache + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &feature_cache, auto &alpha_cache, auto &temp) { // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += THREAD_BLOCK_SIZE) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_features + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_feature_idx = feature_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - data_cache_feature[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * 
(device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA - data_cache_alpha[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS + feature_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_x]; // SoA + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_x]; // AoS } }); // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp(idx)[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_feature][internal_class] += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } } }); } - // update global array with local one + // update the global w-vector with the locally cached values ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current 
work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_feature_idx = feature_idx(idx) + static_cast(internal_feature); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; // SoA } } }); @@ -142,41 +172,46 @@ class device_kernel_w_linear { private: /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
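// A sequential reference for what the two linear-kernel prediction stages above compute, with the
// blocking, padding, and multi-device offsets stripped away. It only assumes the documented math:
// w[f][c] = sum_sv alpha[c][sv] * x_sv[f] and prediction[p][c] = dot(point_p, w[:, c]) - rho[c].
// All names are illustrative; this is not the PLSSVM host code path.
#include <cstddef>
#include <vector>

using real_type = double;  // assumption: plssvm::real_type

// w is stored feature-major, matching the SoA write-back of the kernels above
std::vector<real_type> assemble_w(const std::vector<real_type> &alpha,            // [num_classes][num_sv]
                                  const std::vector<real_type> &support_vectors,  // [num_sv][num_features]
                                  const std::size_t num_classes,
                                  const std::size_t num_sv,
                                  const std::size_t num_features) {
    std::vector<real_type> w(num_features * num_classes, real_type{ 0.0 });
    for (std::size_t f = 0; f < num_features; ++f) {
        for (std::size_t c = 0; c < num_classes; ++c) {
            for (std::size_t sv = 0; sv < num_sv; ++sv) {
                w[f * num_classes + c] += alpha[c * num_sv + sv] * support_vectors[sv * num_features + f];
            }
        }
    }
    return w;
}

std::vector<real_type> predict_linear(const std::vector<real_type> &w,               // [num_features][num_classes]
                                      const std::vector<real_type> &rho,             // [num_classes]
                                      const std::vector<real_type> &predict_points,  // [num_predict_points][num_features]
                                      const std::size_t num_classes,
                                      const std::size_t num_features,
                                      const std::size_t num_predict_points) {
    std::vector<real_type> prediction(num_predict_points * num_classes, real_type{ 0.0 });
    for (std::size_t p = 0; p < num_predict_points; ++p) {
        for (std::size_t c = 0; c < num_classes; ++c) {
            real_type sum{ 0.0 };
            for (std::size_t f = 0; f < num_features; ++f) {
                sum += predict_points[p * num_features + f] * w[f * num_classes + c];
            }
            prediction[p * num_classes + c] = sum - rho[c];
        }
    }
    return prediction;
}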
- * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -191,79 +226,102 @@ class device_kernel_predict_linear { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_pp, auto &data_cache_w, auto &pp_idx, auto &pp_idx_linear, auto &class_idx, auto &class_idx_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // create 
two local memory arrays used for caching + ::sycl::require_local_mem(), // pp_cache + ::sycl::require_local_mem(), // w_cache - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &pp_cache, auto &w_cache, auto &temp) { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // load data into shared memory + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes + + // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + 
PADDING_SIZE_uz) + global_class_idx]; + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_[(feature_block + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA } }); // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp(idx)[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_pp][internal_class] += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } } }); } - // update global array with local one + // update the global array with the local one ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // 
calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_pp_idx = pp_idx(idx) + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } }); @@ -272,10 +330,10 @@ class device_kernel_predict_linear { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -285,21 +343,25 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
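// A minimal sketch of the parameter-forwarding pattern used by the predict kernels above: the
// kernel-function specific parameters are captured once in a std::tuple member
// (kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) }) and unpacked again
// when the kernel function is applied to the reduced value. rbf_functor models only the
// radial-basis-function case with a single gamma parameter and is a hypothetical stand-in for
// plssvm's generic detail::apply_kernel_function; the tuple/std::apply mechanics are the point here.
#include <cmath>
#include <tuple>

using real_type = double;  // assumption: plssvm::real_type

class rbf_functor {
  public:
    explicit rbf_functor(const real_type gamma) :
        kernel_function_parameter_{ std::make_tuple(gamma) } { }

    // dist_sq is the already feature-reduced squared distance between two points
    real_type operator()(const real_type dist_sq) const {
        return std::apply([dist_sq](const real_type gamma) { return std::exp(-gamma * dist_sq); },
                          kernel_function_parameter_);
    }

  private:
    std::tuple<real_type> kernel_function_parameter_;
};

// usage: rbf_functor k{ real_type{ 0.5 } };  const real_type value = k(dist_sq);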
- * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -308,19 +370,19 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
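// A self-contained sketch of the compile-time loop-order switch the kernels in this patch use:
// on CPU targets the reduction dimension is the innermost loop so every acc[i][j] is summed into a
// single register, while on GPU targets the reduction dimension is the outermost loop so the whole
// INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE accumulator block stays live in registers between tile
// loads. The enum, the tile extents, and tile_dot() are illustrative stand-ins, not PLSSVM's
// definitions; both branches compute the same result.
#include <cstddef>

enum class target { cpu, gpu };  // stand-in for plssvm::target_platform

constexpr unsigned TILE = 8;      // stand-in for THREAD_BLOCK_SIZE
constexpr unsigned INTERNAL = 4;  // stand-in for INTERNAL_BLOCK_SIZE

template <target t>
void tile_dot(const double (&a)[TILE][INTERNAL], const double (&b)[TILE][INTERNAL], double (&acc)[INTERNAL][INTERNAL]) {
    if constexpr (t == target::cpu) {
        // reduction (k) is the fastest moving index: each acc[i][j] accumulates in a register
        for (unsigned i = 0; i < INTERNAL; ++i) {
            for (unsigned j = 0; j < INTERNAL; ++j) {
                double sum{ 0.0 };
                for (unsigned k = 0; k < TILE; ++k) {
                    sum += a[k][i] * b[k][j];
                }
                acc[i][j] += sum;
            }
        }
    } else {
        // reduction (k) is the slowest moving index: all INTERNAL x INTERNAL accumulators are
        // updated per k, which maps well to the register files of (AMD) GPUs
        for (unsigned k = 0; k < TILE; ++k) {
            for (unsigned i = 0; i < INTERNAL; ++i) {
                for (unsigned j = 0; j < INTERNAL; ++j) {
                    acc[i][j] += a[k][i] * b[k][j];
                }
            }
        }
    }
}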
@@ -330,102 +392,130 @@ class device_kernel_predict { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // cache_one + ::sycl::require_local_mem(), // cache_two + + // create a private memory array used for internal caching ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_pp, auto &data_cache_sv, auto &pp_idx, auto &pp_idx_linear, auto &sv_idx_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - sv_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + [&](auto &cache_one, auto &cache_two, auto &temp) { + { + // rename cached arrays + auto &pp_cache = cache_one; + auto &sv_cache = cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = 
static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors - // store the values in the shared memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - } - }); + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the feature reduction calculation - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + // store the values in the local memory + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; + sv_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[(feature_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; + } + }); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + // perform the feature reduction calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned 
internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + temp(idx)[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + } } } - } - }); + }); + } } // update temp using the respective kernel function ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pd][internal_sv], kernel_function_parameter_); + temp(idx)[internal_pp][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pp][internal_sv], kernel_function_parameter_); } } }); { // rename cached arrays - auto &alpha_cache = data_cache_pp; - auto &out_cache = data_cache_sv; + auto &alpha_cache = cache_one; + auto &out_cache = cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + 
const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors - alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[(class_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { - out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; + out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_[class_block + threadIdx_x]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } @@ -435,33 +525,42 @@ class device_kernel_predict { // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pp][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = 
static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx(idx) + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); } @@ -471,11 +570,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp index 6612a10d8..25bec3f13 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -28,32 +30,37 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL kernel invocation type. 
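// A short sketch of why the kernels above promote every index ingredient to std::size_t (the
// constexpr *_uz constants) before forming flat offsets: with large row counts and padded row
// lengths the flat index exceeds the 32-bit range, and evaluating it in 32-bit unsigned arithmetic
// silently wraps. The helper names and the example numbers below are only illustrative.
#include <cstddef>
#include <cstdint>

inline std::size_t flat_index(const std::size_t row, const std::size_t col,
                              const std::size_t num_cols, const std::size_t padding) {
    // every operand is 64 bit, so the multiplication cannot wrap for realistic problem sizes
    return row * (num_cols + padding) + col;
}

// counter-example: the same expression evaluated in 32 bit wraps around
inline std::uint32_t flat_index_32(const std::uint32_t row, const std::uint32_t col,
                                   const std::uint32_t num_cols, const std::uint32_t padding) {
    return row * (num_cols + padding) + col;  // wraps once the result exceeds 2^32 - 1
}

// e.g. flat_index(100'000, 0, 100'000, 32) == 10'003'200'000, which no longer fits into 32 bit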
+ constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(::sycl::handler &cgh, real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - data_cache_feature_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_alpha_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(::sycl::handler &cgh, real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + feature_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + alpha_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -67,104 +74,130 @@ class device_kernel_w_linear { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = 
nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_features + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes - data_cache_feature_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA - data_cache_alpha_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal 
= 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + // store the values in the local memory + feature_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_x]; // SoA + alpha_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_x]; // AoS + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache_[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache_[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache_[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache_[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current work-item + const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + 
static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_feature_; + ::sycl::local_accessor feature_cache_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_alpha_; + ::sycl::local_accessor alpha_cache_; /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(::sycl::handler &cgh, real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_w_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(::sycl::handler &cgh, real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t 
num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + pp_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + w_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -181,71 +214,91 @@ class device_kernel_predict_linear { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + 
static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes - // store the values in the local memory - data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_w_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_[(feature_block + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache_[feature][local_id_0 * 
INTERNAL_BLOCK_SIZE + internal_class] * pp_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_class_idx = class_idx + static_cast(internal_class); - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } } private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_pp_; + ::sycl::local_accessor pp_cache_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_w_; + ::sycl::local_accessor w_cache_; /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -255,22 +308,26 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. 
* @param[in] cgh the SYCL handler used to allocate the local memory - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -279,21 +336,21 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(::sycl::handler &cgh, real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_sv_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(::sycl::handler &cgh, real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + cache_one_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + cache_two_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
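
The target-dependent loop ordering introduced in the hunks above (selected via `if constexpr (target == target_platform::cpu)`) can be illustrated in isolation. The following host-side C++ sketch is not part of the patch: `TB`, `IB`, and the plain-array tiles are illustrative stand-ins for `THREAD_BLOCK_SIZE`, `INTERNAL_BLOCK_SIZE`, and the `::sycl::local_accessor` caches. Both functions compute the same blocked dot product and differ only in whether the feature loop is the fastest or the slowest moving index.

// Minimal sketch of the two loop orders, assuming TB/IB as illustrative tile sizes.
#include <array>
#include <cstddef>

namespace sketch {

inline constexpr unsigned TB = 8;  // work-items per work-group dimension (assumed)
inline constexpr unsigned IB = 4;  // values computed per work-item and dimension (assumed)

// one cached tile: TB feature rows, each holding IB * TB cached values
using tile = std::array<std::array<double, IB * TB>, TB>;

// CPU-style ordering: the feature loop is the innermost (fastest moving) index,
// so each (point, class) pair accumulates into a register before writing to temp.
void dot_cpu(const tile &w_cache, const tile &pp_cache, double (&temp)[IB][IB],
             std::size_t local_id_0, std::size_t local_id_1) {
    for (unsigned internal_pp = 0; internal_pp < IB; ++internal_pp) {
        for (unsigned internal_class = 0; internal_class < IB; ++internal_class) {
            double sum = 0.0;
            for (unsigned feature = 0; feature < TB; ++feature) {
                sum += w_cache[feature][local_id_0 * IB + internal_class] *
                       pp_cache[feature][local_id_1 * IB + internal_pp];
            }
            temp[internal_pp][internal_class] += sum;
        }
    }
}

// GPU-style ordering: the feature loop is the outermost (slowest moving) index,
// so every work-item of the work-group reuses the same cached feature row
// before the next row is touched.
void dot_gpu(const tile &w_cache, const tile &pp_cache, double (&temp)[IB][IB],
             std::size_t local_id_0, std::size_t local_id_1) {
    for (unsigned feature = 0; feature < TB; ++feature) {
        for (unsigned internal_pp = 0; internal_pp < IB; ++internal_pp) {
            for (unsigned internal_class = 0; internal_class < IB; ++internal_class) {
                temp[internal_pp][internal_class] +=
                    w_cache[feature][local_id_0 * IB + internal_class] *
                    pp_cache[feature][local_id_1 * IB + internal_pp];
            }
        }
    }
}

}  // namespace sketch

The register accumulation in the CPU path keeps the running sum out of the `temp` array inside the innermost loop, while the GPU path maximizes reuse of each cached feature row across all work-items before loading the next one; the same split is applied to the feature-reduction loop of `device_kernel_predict` in the hunks below.
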
@@ -305,44 +362,63 @@ class device_kernel_predict { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { + // rename cached arrays + auto &pp_cache = cache_one_; + auto &sv_cache = cache_two_; + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // 
calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the values in the shared memory - data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_sv_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + // store the values in the local memory + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[(feature_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + temp[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } } } } @@ -351,28 +427,34 @@ class device_kernel_predict { } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter_); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter_); } } { // rename cached arrays - auto &alpha_cache = data_cache_pp_; - auto &out_cache = data_cache_sv_; + auto &alpha_cache = cache_one_; + auto &out_cache = cache_two_; - // iterate over all features using blocking 
to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[(class_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { - out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; + out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_[class_block + threadIdx_x]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } @@ -381,20 +463,21 @@ class device_kernel_predict { // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + local_id_1] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data const auto global_pp_idx = pp_idx + static_cast(internal); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += out_cache[local_id_0][internal * 
THREAD_BLOCK_SIZE + local_id_1]; } nd_idx.barrier(); // wait until all work-items updated their part of the prediction } @@ -403,16 +486,16 @@ class device_kernel_predict { private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_pp_; + ::sycl::local_accessor cache_one_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_sv_; + ::sycl::local_accessor cache_two_; /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp b/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp index 6f0772db0..c03aa46b0 100644 --- a/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp +++ b/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp @@ -67,6 +67,144 @@ #include // std::get #include // std::vector +namespace { + +/** + * @brief Run the kernel functor on the given device. + * @tparam KernelFunctor the type of the kernel functor to run + * @tparam QueueType the type of the SYCL queue to run the kernel on + * @tparam Args the types of the parameters necessary for the specific kernel functor + * @param[in] device the SYCL queue to run the kernel on + * @param[in] partial_grid the number of work-groups in each dimension of the execution grid + * @param[in] block the number of work-items in each dimension per work-group + * @param[in] args the parameters necessary for the specific kernel functor + */ +template +void run_kernel_functor(const QueueType &device, const plssvm::detail::dim_type partial_grid, const plssvm::detail::dim_type block, Args &&...args) { + constexpr plssvm::sycl::kernel_invocation_type invocation = KernelFunctor::invocation_type; + + if constexpr (invocation == plssvm::sycl::kernel_invocation_type::basic) { + device.impl->sycl_queue.submit([&](::sycl::handler &cgh) { + cgh.parallel_for(plssvm::adaptivecpp::detail::get_execution_range(partial_grid, block), + KernelFunctor{ std::forward(args)... }); + }); + } else if constexpr (invocation == plssvm::sycl::kernel_invocation_type::work_group) { + device.impl->sycl_queue.submit([&](::sycl::handler &cgh) { + cgh.parallel_for(plssvm::adaptivecpp::detail::get_execution_range(partial_grid, block), + KernelFunctor{ cgh, std::forward(args)... }); + }); + } else if constexpr (invocation == plssvm::sycl::kernel_invocation_type::hierarchical) { +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&](::sycl::handler &cgh) { + const auto exec_range = plssvm::adaptivecpp::detail::get_execution_range(partial_grid, block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), KernelFunctor{ std::forward(args)... }); + }); +#else + throw plssvm::adaptivecpp::backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + } else if constexpr (invocation == plssvm::sycl::kernel_invocation_type::scoped) { +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&](::sycl::handler &cgh) { + const auto exec_range = plssvm::adaptivecpp::detail::get_execution_range(partial_grid, block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), KernelFunctor{ std::forward(args)... }); + }); +#else + throw plssvm::adaptivecpp::backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + } else { + static_assert(::plssvm::detail::always_false_v, "Unsupported kernel function!"); + } +} + +/** + * @brief Dispatch the kernel functor to the correct kernel function type. + * @tparam KernelFunctor the type of the kernel functor to run + * @tparam target the target platform to run the kernel on + * @tparam Args the types of the parameters necessary for the specific kernel functor; stored in a `std::tuple` + * @param[in] params the parameters used to determine the kernel function type + * @param[in] args the parameters necessary for the specific kernel functor + */ +template