@@ -494,6 +494,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
     cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
     cl_kernel kernel_convert_block_q4_0_noshuffle;
+    cl_kernel kernel_restore_block_q4_0_noshuffle;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
     cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
@@ -634,6 +635,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_transpose_32;
     cl_kernel kernel_transpose_32_16;
     cl_kernel kernel_transpose_16;
+    cl_kernel kernel_transpose_16_buf;
     cl_kernel kernel_transpose_16_4x1;

     cl_mem A_s_d_max; // max scale buffer size for transpose
@@ -806,6 +808,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

         CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
@@ -2004,7 +2007,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
         CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
         CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
-        CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
+        CL_CHECK((backend_ctx->kernel_transpose_16_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_buf", &err), err));
+        CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
         GGML_LOG_CONT(".");
     }

@@ -3933,6 +3937,91 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
     if (tensor->type == GGML_TYPE_Q4_0) {
         ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;

+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (use_adreno_kernels(backend_ctx, tensor)) {
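+            // On the Adreno path the Q4_0 data lives in two transposed planes:
+            // packed nibbles in extra->q and fp16 scales in extra->d. Transpose both
+            // planes back and re-interleave them into standard block_q4_0 before
+            // copying to the host buffer.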
+            cl_int err;
+            cl_kernel kernel;
+
+            cl_int M = tensor->ne[1]; // ne01
+            cl_int K = tensor->ne[0]; // ne00
+
+            GGML_ASSERT(K % 32 == 0);
+            GGML_ASSERT(M % 4 == 0);
+
+            size_t size_q = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
+            size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
+            GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
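+            // size_q: 4 bits per weight (nelements/2 bytes); size_d: one fp16 scale per 32-element block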
+
+            cl_mem buf_trans_q;
+            cl_mem buf_trans_d;
+
+            CL_CHECK((buf_trans_q = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                size_q, NULL, &err), err));
+            CL_CHECK((buf_trans_d = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                size_d, NULL, &err), err));
+
+            kernel = backend_ctx->kernel_transpose_16_buf;
+
+            // transpose q back
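+            // viewed as 16-bit words, the packed quants form an M x K/4 grid (K/2 bytes per row)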
+            cl_int stride_k_q = K/4;
+            size_t local_size_q[3]  = {64, 1, 1};
+            size_t global_size_q[3] = {(size_t)M, (size_t)stride_k_q, 1};
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_q));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_q));
+
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+                global_size_q, local_size_q, 0, NULL, NULL));
+
+            // transpose scales back
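+            // the fp16 scales form an M x K/32 grid (one scale per 32-element block)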
+            cl_int stride_k_d = K/32;
+            size_t local_size_d[3]  = {64, 1, 1};
+            size_t global_size_d[3] = {(size_t)M, (size_t)stride_k_d, 1};
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->d));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_d));
+
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+                global_size_d, local_size_d, 0, NULL, NULL));
+
+            // unpack
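+            // kernel_restore_block_q4_0_noshuffle re-interleaves the transposed q/d
+            // planes into the standard block_q4_0 layout expected by the host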
+            cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                ggml_nbytes(tensor), NULL, &err);
+            CL_CHECK(err);
+
+            cl_uchar mask_0F = 0x0F;
+            cl_uchar mask_F0 = 0xF0;
+
+            size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+            size_t local_work_size[] = {1, 1, 1};
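+            // one work-item per 32-element block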
+
+            kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle;
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0));
+
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+                global_work_size, local_work_size, 0, NULL, NULL));
+
+            // read back to host
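+            // only the caller-requested window (offset, size) is copied out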
+            CL_CHECK(clEnqueueReadBuffer(
+                queue, data_device, CL_TRUE, offset,
+                size, data, 0, NULL, NULL));
+
+            CL_CHECK(clReleaseMemObject(data_device));
+            CL_CHECK(clReleaseMemObject(buf_trans_q));
+            CL_CHECK(clReleaseMemObject(buf_trans_d));
+
+            return;
+        }
+#endif
+
         cl_int err;
         cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
             ggml_nbytes(tensor), NULL, &err);