Skip to content

Commit eb492bf

Browse files
authored
opencl: unpack q4_0 for adreno in get_tensor (#18278)
1 parent e3b35dd commit eb492bf

File tree

3 files changed

+124
-1
lines changed

3 files changed

+124
-1
lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 90 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,7 @@ struct ggml_backend_opencl_context {
494494
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
495495
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
496496
cl_kernel kernel_convert_block_q4_0_noshuffle;
497+
cl_kernel kernel_restore_block_q4_0_noshuffle;
497498
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
498499
cl_kernel kernel_mul_mv_q6_K_f32;
499500
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
@@ -634,6 +635,7 @@ struct ggml_backend_opencl_context {
634635
cl_kernel kernel_transpose_32;
635636
cl_kernel kernel_transpose_32_16;
636637
cl_kernel kernel_transpose_16;
638+
cl_kernel kernel_transpose_16_buf;
637639
cl_kernel kernel_transpose_16_4x1;
638640

639641
cl_mem A_s_d_max; // max scale buffer size for transpose
@@ -806,6 +808,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
806808
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
807809

808810
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
811+
CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
809812
CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
810813
CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
811814
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
@@ -2004,7 +2007,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
20042007
CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
20052008
CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
20062009
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
2007-
CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
2010+
CL_CHECK((backend_ctx->kernel_transpose_16_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_buf", &err), err));
2011+
CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
20082012
GGML_LOG_CONT(".");
20092013
}
20102014

@@ -3933,6 +3937,91 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
39333937
if (tensor->type == GGML_TYPE_Q4_0) {
39343938
ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
39353939

3940+
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
3941+
if (use_adreno_kernels(backend_ctx, tensor)) {
3942+
cl_int err;
3943+
cl_kernel kernel;
3944+
3945+
cl_int M = tensor->ne[1]; // ne01
3946+
cl_int K = tensor->ne[0]; // ne00
3947+
3948+
GGML_ASSERT(K % 32 == 0);
3949+
GGML_ASSERT(M % 4 == 0);
3950+
3951+
size_t size_q = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
3952+
size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
3953+
GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
3954+
3955+
cl_mem buf_trans_q;
3956+
cl_mem buf_trans_d;
3957+
3958+
CL_CHECK((buf_trans_q = clCreateBuffer(context, CL_MEM_READ_WRITE,
3959+
size_q, NULL, &err), err));
3960+
CL_CHECK((buf_trans_d = clCreateBuffer(context, CL_MEM_READ_WRITE,
3961+
size_d, NULL, &err), err));
3962+
3963+
kernel = backend_ctx->kernel_transpose_16_buf;
3964+
3965+
// transpose q back
3966+
cl_int stride_k_q = K/4;
3967+
size_t local_size_q[3] = {64, 1, 1};
3968+
size_t global_size_q[3] = {(size_t)M, (size_t)stride_k_q, 1};
3969+
3970+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
3971+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_q));
3972+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
3973+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_q));
3974+
3975+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
3976+
global_size_q, local_size_q, 0, NULL, NULL));
3977+
3978+
// transpose scales back
3979+
cl_int stride_k_d = K/32;
3980+
size_t local_size_d[3] = {64, 1, 1};
3981+
size_t global_size_d[3] = {(size_t)M, (size_t)stride_k_d, 1};
3982+
3983+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->d));
3984+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
3985+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
3986+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_d));
3987+
3988+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
3989+
global_size_d, local_size_d, 0, NULL, NULL));
3990+
3991+
// unpack
3992+
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
3993+
ggml_nbytes(tensor), NULL, &err);
3994+
CL_CHECK(err);
3995+
3996+
cl_uchar mask_0F = 0x0F;
3997+
cl_uchar mask_F0 = 0xF0;
3998+
3999+
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
4000+
size_t local_work_size[] = {1, 1, 1};
4001+
4002+
kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle;
4003+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q));
4004+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
4005+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
4006+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F));
4007+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0));
4008+
4009+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
4010+
global_work_size, local_work_size, 0, NULL, NULL));
4011+
4012+
// read back to host
4013+
CL_CHECK(clEnqueueReadBuffer(
4014+
queue, data_device, CL_TRUE, offset,
4015+
size, data, 0, NULL, NULL));
4016+
4017+
CL_CHECK(clReleaseMemObject(data_device));
4018+
CL_CHECK(clReleaseMemObject(buf_trans_q));
4019+
CL_CHECK(clReleaseMemObject(buf_trans_d));
4020+
4021+
return;
4022+
}
4023+
#endif
4024+
39364025
cl_int err;
39374026
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
39384027
ggml_nbytes(tensor), NULL, &err);

ggml/src/ggml-opencl/kernels/cvt.cl

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,27 @@ kernel void kernel_convert_block_q4_0_noshuffle(
117117
}
118118
}
119119

120+
kernel void kernel_restore_block_q4_0_noshuffle(
121+
global uchar * src_q,
122+
global half * src_d,
123+
global struct block_q4_0 * dst,
124+
uchar mask_0F,
125+
uchar mask_F0
126+
) {
127+
global struct block_q4_0 * b = (global struct block_q4_0 *) dst + get_global_id(0);
128+
global uchar * q = (global uchar *) src_q + QK4_0/2*get_global_id(0);
129+
global half * d = (global half *) src_d + get_global_id(0);
130+
131+
b->d = *d;
132+
for (int i = 0; i < QK4_0/4; ++i) {
133+
uchar x0 = q[i + 0 ] ;
134+
uchar x1 = q[i + QK4_0/4];
135+
136+
b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
137+
b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
138+
}
139+
}
140+
120141
//------------------------------------------------------------------------------
121142
// block_mxfp4
122143
//------------------------------------------------------------------------------

ggml/src/ggml-opencl/kernels/transpose.cl

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,19 @@ kernel void kernel_transpose_16_4x1(
4444
write_imageh(output, i * rows + j, (half4)(temp0, temp1, temp2, temp3));
4545
}
4646

47+
// Transpose treating each element as 16-bit using buffer
48+
kernel void kernel_transpose_16_buf(
49+
global const ushort * input,
50+
global ushort * output,
51+
const int ldi,
52+
const int ldo
53+
) {
54+
const int x = get_global_id(0);
55+
const int y = get_global_id(1);
56+
57+
output[x*ldo + y] = input[y*ldi + x];
58+
}
59+
4760
// 32-bit transpose, loading/storing a 4x4 tile of elements
4861
kernel void kernel_transpose_32(
4962
__read_only image1d_buffer_t input,

0 commit comments

Comments
 (0)