From 2283fcbbe79a686c50e3fe7aa041cb5772990511 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 17 Dec 2025 22:21:11 -0600 Subject: [PATCH] POWER10: Reduce sgemm loop unrolling With GCC 14, unnecessary move and lxvp instructions appear when unrolling the inner loop for larger sizes. Reducing the loop unroll factor restores performance to the level seen with GCC 11. --- kernel/power/sgemm_kernel_power10.c | 112 +--------------------------- 1 file changed, 1 insertion(+), 111 deletions(-) diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c index 80f495f708..1d86b57fcc 100644 --- a/kernel/power/sgemm_kernel_power10.c +++ b/kernel/power/sgemm_kernel_power10.c @@ -245,118 +245,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += 16; BO += 8; temp--; - BLASLONG K = temp / 64; + BLASLONG K = temp / 16; for (l = 0; l < K; l++) - { - vec_t *rowA = (vec_t *) & AO[0]; - vec_t *rowB = (vec_t *) & BO[0]; - KERNEL (0, 0); - KERNEL (2, 4); - KERNEL (4, 8); - KERNEL (6, 12); - KERNEL (8, 16); - KERNEL (10, 20); - KERNEL (12, 24); - KERNEL (14, 28); - KERNEL (16, 32); - KERNEL (18, 36); - KERNEL (20, 40); - KERNEL (22, 44); - KERNEL (24, 48); - KERNEL (26, 52); - KERNEL (28, 56); - KERNEL (30, 60); - KERNEL (32, 64); - KERNEL (34, 68); - KERNEL (36, 72); - KERNEL (38, 76); - KERNEL (40, 80); - KERNEL (42, 84); - KERNEL (44, 88); - KERNEL (46, 92); - KERNEL (48, 96); - KERNEL (50, 100); - KERNEL (52, 104); - KERNEL (54, 108); - KERNEL (56, 112); - KERNEL (58, 116); - KERNEL (60, 120); - KERNEL (62, 124); - KERNEL (64, 128); - KERNEL (66, 132); - KERNEL (68, 136); - KERNEL (70, 140); - KERNEL (72, 144); - KERNEL (74, 148); - KERNEL (76, 152); - KERNEL (78, 156); - KERNEL (80, 160); - KERNEL (82, 164); - KERNEL (84, 168); - KERNEL (86, 172); - KERNEL (88, 176); - KERNEL (90, 180); - KERNEL (92, 184); - KERNEL (94, 188); - KERNEL (96, 192); - KERNEL (98, 196); - KERNEL (100, 200); - KERNEL (102, 204); - KERNEL
(104, 208); - KERNEL (106, 212); - KERNEL (108, 216); - KERNEL (110, 220); - KERNEL (112, 224); - KERNEL (114, 228); - KERNEL (116, 232); - KERNEL (118, 236); - KERNEL (120, 240); - KERNEL (122, 244); - KERNEL (124, 248); - KERNEL (126, 252); - AO += 1024; - BO += 512; - } - if ((temp & 63) >> 5) - { - vec_t *rowA = (vec_t *) & AO[0]; - vec_t *rowB = (vec_t *) & BO[0]; - KERNEL (0, 0); - KERNEL (2, 4); - KERNEL (4, 8); - KERNEL (6, 12); - KERNEL (8, 16); - KERNEL (10, 20); - KERNEL (12, 24); - KERNEL (14, 28); - KERNEL (16, 32); - KERNEL (18, 36); - KERNEL (20, 40); - KERNEL (22, 44); - KERNEL (24, 48); - KERNEL (26, 52); - KERNEL (28, 56); - KERNEL (30, 60); - KERNEL (32, 64); - KERNEL (34, 68); - KERNEL (36, 72); - KERNEL (38, 76); - KERNEL (40, 80); - KERNEL (42, 84); - KERNEL (44, 88); - KERNEL (46, 92); - KERNEL (48, 96); - KERNEL (50, 100); - KERNEL (52, 104); - KERNEL (54, 108); - KERNEL (56, 112); - KERNEL (58, 116); - KERNEL (60, 120); - KERNEL (62, 124); - AO += 512; - BO += 256; - } - if ((temp & 31) >> 4) { vec_t *rowA = (vec_t *) & AO[0]; vec_t *rowB = (vec_t *) & BO[0];