Commit 8a5b92c

Fix masked_multihead_self_attention meta registration (#3584)
* fix masked_multihead_self_attention
* update code

1 parent 0d20258

3 files changed: 18 additions & 14 deletions

intel_extension_for_pytorch/_meta_registrations.py

Lines changed: 14 additions & 12 deletions

@@ -2,6 +2,7 @@
 from typing import List, Optional
 
 import torch
+import torch._custom_ops
 import torch.library
 from torch._prims_common import IntLike
 from .utils.channels_last_1d import to_channels_last_1d
@@ -623,7 +624,7 @@ def meta_tpp_linear_mul(
     return input.new_empty((*input.shape[:-1], out_features))
 
 
-@register_meta("masked_multihead_self_attention")
+@torch.library.register_fake("torch_ipex::masked_multihead_self_attention")
 def meta_masked_multihead_self_attention(
     query,
     key,
@@ -641,24 +642,25 @@ def meta_masked_multihead_self_attention(
     attn_output = query.new_empty(
         (query.shape[0], query.shape[2], query.shape[1], query.shape[3])
     )
-    if query.dtype == torch.bfloat16:
-        attn_output.as_strided_(
-            attn_output.shape,
-            (
-                query.shape[1] * query.shape[2] * query.shape[3],
-                query.shape[3],
-                query.shape[2] * query.shape[3],
-                1,
-            ),
-        )
+    attn_output.as_strided_(
+        attn_output.shape,
+        (
+            query.shape[1] * query.shape[2] * query.shape[3],
+            query.shape[3],
+            query.shape[2] * query.shape[3],
+            1,
+        ),
+    )
     attn_weights = None
     key_cache_out = query.new_empty(
         (key_cache.shape[0], key_cache.shape[1], key.shape[2], key.shape[3])
     )
     value_cache_out = query.new_empty(
         (value_cache.shape[0], value_cache.shape[1], value.shape[2], value.shape[3])
     )
-    beam_idx_out = query.new_empty(beam_idx.shape)
+    ctx = torch._custom_ops.get_ctx()
+    num_to_keep = ctx.new_dynamic_size()
+    beam_idx_out = query.new_empty((num_to_keep, beam_idx.shape[1]))
     return (attn_output, attn_weights, key_cache_out, value_cache_out, beam_idx_out)
 
 
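The substantive change is twofold: the meta (fake) kernel is now registered through the standard `torch.library.register_fake` API under the op's qualified name, and `beam_idx_out` gets a dynamic first dimension via `ctx.new_dynamic_size()`, since the number of beam-index rows kept is only known at run time. A minimal sketch of the same pattern on a made-up op (the name `mylib::gather_rows`, its schema, and both function bodies are hypothetical, not part of this commit):

```python
import torch
import torch._custom_ops

# Hypothetical op whose output row count depends on the mask's values.
torch.library.define("mylib::gather_rows", "(Tensor src, Tensor mask) -> Tensor")

@torch.library.impl("mylib::gather_rows", "CPU")
def gather_rows_cpu(src, mask):
    return src[mask]  # keeps only the rows where mask is True

@torch.library.register_fake("mylib::gather_rows")
def gather_rows_fake(src, mask):
    # The true row count is unknown at trace time, so ask the custom-op
    # context for a fresh dynamic size (an unbacked symbolic int), the
    # same mechanism the meta registration above uses for beam_idx_out.
    ctx = torch._custom_ops.get_ctx()
    num_rows = ctx.new_dynamic_size()
    return src.new_empty((num_rows, src.shape[1]))
```

Built-in ops with value-dependent output shapes, such as `torch.nonzero`, rely on the same unbacked-symint mechanism internally.
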
tests/cpu/test_masked_mha.py

Lines changed: 2 additions & 1 deletion

@@ -152,8 +152,9 @@ def _test_mha(self, torchcompile=False):
 
         if torchcompile:
             torch._dynamo.reset()
+            torch._dynamo.config.capture_dynamic_output_shape_ops = True
             ipex._set_compiler_backend("inductor")
-            mha = torch.compile(mha, backend="ipex")
+            mha = torch.compile(mha, backend="ipex", dynamic=True)
 
         # first token decode
         input_t = torch.randn(
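The test-side counterpart: since the fake kernel now allocates an unbacked symbolic dimension, Dynamo must be permitted to capture ops with data-dependent output shapes rather than graph-breaking on them. A self-contained sketch of what these two settings enable, using `torch.nonzero` as a stand-in for the custom op:

```python
import torch

# Without this flag, Dynamo graph-breaks on ops whose output shape
# depends on input values rather than input shapes.
torch._dynamo.config.capture_dynamic_output_shape_ops = True

@torch.compile(dynamic=True)
def count_hits(x):
    # nonzero's output length is data-dependent, like beam_idx_out above.
    return torch.nonzero(x).float().sum()

print(count_hits(torch.tensor([0.0, 1.0, 0.0, 2.0])))
```
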
tests/cpu/test_masked_mha_fp8.py

Lines changed: 2 additions & 1 deletion

@@ -169,8 +169,9 @@ def _test_mha(self, torchcompile=False):
 
         if torchcompile:
             torch._dynamo.reset()
+            torch._dynamo.config.capture_dynamic_output_shape_ops = True
             ipex._set_compiler_backend("inductor")
-            mha = torch.compile(mha, backend="ipex")
+            mha = torch.compile(mha, backend="ipex", dynamic=True)
 
         # first token decode
         input_t = torch.randn(
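The fp8 test receives the identical two-line change: both tests compile a module that calls `torch_ipex::masked_multihead_self_attention`, so both need the capture flag and `dynamic=True` for `torch.compile` to trace through the now data-dependent `beam_idx_out` shape.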