[Helion + torch.compile] Handle multi-output templates in prologue fusion dtype heuristic

yf225 · yf225 · commit 1fca0ecdfeed · 2026-03-16T16:32:32.000-07:00
TemplateBuffer subclasses whose layout is a MultiOutputLayout (e.g. Helion
kernels) don't have a single dtype. Make TemplateBuffer.dtype raise an
explicit NotImplementedError in that case, and guard the scheduler's
low-precision prologue-fusion heuristic with is_multi_outputs_template() so
the check is skipped for multi-output templates rather than crashing.

[ghstack-poisoned]
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
@@ -5257,6 +5257,14 @@ def __init__(
             allowed_prologue_inps or OrderedSet()
         )
 
+    @property
+    def dtype(self) -> torch.dtype:  # dtype of this template's single output
+        if isinstance(self.layout, MultiOutputLayout):  # multi-output: no one dtype
+            raise NotImplementedError(
+                "Multi-output templates do not have a single dtype"
+            )
+        return self.get_layout().dtype  # single-output: dtype comes from the layout
+
     def get_read_writes(self) -> dependencies.ReadWrites:
         return self.extract_read_writes(normalize=True)
 
diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
@@ -5593,8 +5593,10 @@ def check_prologue_fusion_heuristics_fusable(
         def low_prec_fp(dtype: torch.dtype) -> bool:
             return dtype.itemsize <= 2 and dtype.is_floating_point
 
+        template_buf = template_node.get_template_node_or_throw()
         if (
-            low_prec_fp(template_node.get_template_node_or_throw().dtype)
+            not template_buf.is_multi_outputs_template()
+            and low_prec_fp(template_buf.dtype)
             and not prologue_node.can_codegen_in_low_precision()
         ):
             why(