
Commit 35f02b9

ZichuWu authored and vadiklyutiy committed
[OPTIONS] Set mma as default in PassContext() (#530)
## PR Description

### Summary

This PR sets `mma` (matrix multiply-accumulate) as the default computation mode, replacing `simt`. The switch affects precision: `fp32` inputs are cast to `tf32`, which has the same number of mantissa bits as `fp16`. Test cases are updated to align with `fp16` precision where feasible.

### Details

1. **Default MMA computation mode**:
   - Set `mma` as the default mode in place of `simt`.
2. **Precision adjustments in tests**:
   - Adjusted test cases to align with `fp16` precision, since the `tf32` cast shares the same mantissa width as `fp16`.
3. **Tolerance modifications**:
   - Updated tolerances in tests affected by the precision difference, particularly `test_attention`.
   - Minimum observed tolerances:
     - `mma`: 0.2
     - `simt`: ~1e-4

### Testing Script for Tolerance

To determine the minimum tolerance for `test_attention`, the following script was used:

```python
import numpy as np
import pytest
import torch

import hidet
import hidet.testing


# Fixture that creates the shared weights
@pytest.fixture
def shared_weights(device):
    wte = torch.randn([50257, 768], device=device)
    wpe = torch.randn([1024, 768], device=device)
    w1 = torch.randn([768, 768 * 3], device=device)
    b1 = torch.randn([768 * 3], device=device)
    return wte, wpe, w1, b1


# Fixture that creates the Hidet models
@pytest.fixture
def hidet_model(shared_weights, device):
    wte, wpe, w1, b1 = shared_weights

    # Convert PyTorch tensors to Hidet tensors
    wte_hidet = hidet.from_torch(wte)
    wpe_hidet = hidet.from_torch(wpe)
    w1_hidet = hidet.from_torch(w1)
    b1_hidet = hidet.from_torch(b1)

    def get_graph(seq):
        n_head = 12
        ids = hidet.symbol([seq], dtype='int32', device=device)
        x = hidet.ops.take(wte_hidet, ids) + hidet.ops.take(
            wpe_hidet, hidet.ops.arange(ids.shape[0], device=device)
        )
        causal_mask = (1 - hidet.ops.tri(x.shape[0], dtype=x.dtype, device=x.device)) * -1e10
        x = hidet.ops.matmul(x, w1_hidet) + b1_hidet
        x = hidet.ops.reshape(x, [x.shape[0], 3, n_head, x.shape[1] // (3 * n_head)])
        x = hidet.ops.transpose(x, [1, 2, 0, 3])
        q, k, v = [t for t in hidet.ops.split(x, 3, axis=0)]
        x = hidet.ops.softmax(
            q @ hidet.ops.transpose(k, [-1, -2]) / float(np.sqrt(q.shape[-1])) + causal_mask, axis=-1
        ) @ v
        return hidet.trace_from(x)

    graph_dynamic = get_graph('seq')
    graph_dynamic_opt = hidet.graph.optimize(graph_dynamic)
    return graph_dynamic, graph_dynamic_opt, get_graph


# Fixture that creates the PyTorch model
@pytest.fixture
def torch_model(shared_weights, device):
    wte, wpe, w1, b1 = shared_weights

    def get_graph(seq, ids):
        n_head = 12
        x = torch.index_select(wte, 0, ids) + torch.index_select(
            wpe, 0, torch.arange(ids.shape[0], device=device)
        )
        causal_mask = (
            1 - torch.triu(torch.ones((x.shape[0], x.shape[0]), dtype=x.dtype, device=x.device))
        ) * -1e10
        x = torch.matmul(x, w1) + b1
        x = x.view(x.shape[0], 3, n_head, x.shape[1] // (3 * n_head))
        x = x.permute(1, 2, 0, 3)
        q, k, v = torch.chunk(x, 3, dim=0)
        x = torch.softmax((q @ k.transpose(-1, -2)) / np.sqrt(q.shape[-1]) + causal_mask, dim=-1) @ v
        return x

    return get_graph


# Test case for each sequence length
@pytest.mark.parametrize("seq", [1, 2, 3, 4, 8])
@pytest.mark.parametrize('device', ['cuda'])
def test_attention(seq, hidet_model, torch_model, device):
    graph_dynamic, graph_dynamic_opt, hidet_static_fn = hidet_model
    torch_fn = torch_model

    # Generate consistent ids for both frameworks
    ids_torch = torch.randint(0, 50257, (seq,), dtype=torch.int32, device=device)
    ids_hidet = hidet.from_torch(ids_torch)

    # Hidet results
    graph_static = hidet_static_fn(seq)
    y_static_hidet = graph_static(ids_hidet)
    y_dynamic_hidet = graph_dynamic(ids_hidet)
    y_dynamic_opt_hidet = graph_dynamic_opt(ids_hidet)

    # PyTorch results
    y_torch = torch_fn(seq, ids_torch)

    # Check for close matches; atol=rtol=0 makes the failure message report the
    # actual maximum absolute and relative differences
    for y_hidet in [y_dynamic_opt_hidet]:
        np.testing.assert_allclose(y_hidet.cpu().numpy(), y_torch.cpu().numpy(), atol=0, rtol=0)
```

This script runs `test_attention` across multiple sequence lengths, generating consistent `ids` for both Hidet and PyTorch, and reports the observed tolerance.

### Code Changes

- Set `mma` as the default instead of `simt`.
- Updated test case precision settings to `fp16` where applicable.
- Adjusted tolerances in `test_attention` and other sensitive tests based on the minimum tolerance values identified with the script above.

### Note

For `tests/unit_tests/test_dynamic_shape.py::test_attention[xxx-cuda]`, the absolute difference is occasionally observed to spike as high as 60.
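For context, the mantissa-width argument above can be checked directly in PyTorch, which exposes the same tf32 casting through a global switch. A minimal sketch (shapes are arbitrary, chosen only for illustration):

```python
import torch

# tf32 keeps 10 mantissa bits, the same count as fp16, so fp32 matmuls that
# are silently executed in tf32 lose roughly fp16-level precision.
x = torch.randn(1024, 1024, device='cuda')

torch.backends.cuda.matmul.allow_tf32 = True
y_tf32 = x @ x

torch.backends.cuda.matmul.allow_tf32 = False
y_fp32 = x @ x

# the gap is orders of magnitude larger than fp32 rounding error alone
print((y_tf32 - y_fp32).abs().max())
```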
1 parent f8c057b commit 35f02b9

22 files changed

Lines changed: 22 additions & 31 deletions


examples/gpt-2/gpt_model.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -102,7 +102,6 @@ def gpt2(model_size: str = "124M", seq_length: Optional[int] = 1000, use_fp16=Fa
     with hidet.graph.PassContext() as ctx:
         if use_fp16:
             ctx.set_precision('float16')
-            ctx.set_mma('mma')
         graph_opt = hidet.graph.optimize(graph)
 
     hidet.save_graph(graph_opt, hf_path)
```

gallery/developer-guides/add-torch-operator-mapping.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -96,7 +96,7 @@ def run_model():
     x = torch.randn(10, 10, device='cuda')
     y1 = model_opt(x)
     y2 = model(x)
-    torch.testing.assert_close(actual=y1, expected=y2)
+    torch.testing.assert_close(actual=y1, expected=y2, atol=3e-3, rtol=3e-3)
     print('success!')
 
 
```

gallery/getting-started/quick-start.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -45,7 +45,7 @@
 y2 = model(x)
 
 # check the correctness
-torch.testing.assert_close(actual=y1, expected=y2, rtol=1e-2, atol=1e-2)
+torch.testing.assert_close(actual=y1, expected=y2, rtol=2e-2, atol=2e-2)
 
 
 # benchmark the performance
```

gallery/hidet-script/3-kernel-functions.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -104,7 +104,7 @@ def matmul(a: f32[m_size, k_size], b: f32[k_size, n_size], c: f32[m_size, n_size
 module(a, b, c)
 
 # compare the result with torch.matmul
-hidet.utils.assert_close(c, a.torch() @ b.torch(), atol=1e-4, rtol=1e-4)
+hidet.utils.assert_close(c, a.torch() @ b.torch(), atol=1e-3, rtol=1e-3)
 
 # %%
 # We can check the generated source code:
```

gallery/tutorials/optimize-onnx-model.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -117,7 +117,7 @@ def bench_hidet_graph(graph: hidet.FlowGraph):
     cuda_graph = graph.cuda_graph()
     (output,) = cuda_graph.run([data])
     np.testing.assert_allclose(
-        actual=output.cpu().numpy(), desired=torch_output.cpu().numpy(), rtol=1e-2, atol=1e-2
+        actual=output.cpu().numpy(), desired=torch_output.cpu().numpy(), rtol=5e-2, atol=5e-2
     )
     print(' Hidet: {:.3f} ms'.format(benchmark_func(lambda: cuda_graph.run())))
 
```

python/hidet/graph/frontend/torch/dynamo_backends.py

Lines changed: 0 additions & 3 deletions
```diff
@@ -107,14 +107,11 @@ def get_flow_graph(interpreter: Interpreter, example_inputs):
 
 def get_compiled_graph(flow_graph: FlowGraph):
     parallel_k = dynamo_config['parallel_k']
-    tensor_core = dynamo_config['use_tensor_core']
     save_dir = dynamo_config['dump_graph_ir']
     with PassContext() as ctx:
         if save_dir:
             graph_dir = resolve_save_dir_multigraph(save_dir)
             ctx.save_graph_instrument(graph_dir)
-        if tensor_core:
-            ctx.set_mma('mma' if tensor_core else 'simt')
         ctx.set_parallel_k(disabled=(parallel_k == 'disabled'), search=(parallel_k == 'search'))
         ctx.allow_source_graph_removal(True)
         logger.info('start to optimize the flow graph')
```
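For users of the Dynamo path, nothing needs to change: the backend now inherits the `mma` default from `PassContext` instead of reading `use_tensor_core` here. A hedged usage sketch (the model is arbitrary, for illustration only):

```python
import torch
import hidet  # importing hidet registers the 'hidet' dynamo backend

# After this change, torch.compile with the hidet backend lowers matmuls to
# tensor-core ('mma') kernels by default.
model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU()).cuda()
model_opt = torch.compile(model, backend='hidet')
y = model_opt(torch.randn(8, 64, device='cuda'))
```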

python/hidet/graph/ops/matmul/batch_matmul.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -25,7 +25,7 @@
 
 
 class BatchMatmulTask(Task):
-    def __init__(self, a: TensorNode, b: TensorNode, mma: str = 'simt'):
+    def __init__(self, a: TensorNode, b: TensorNode, mma: str = 'mma'):
         batch_size, m_size, k_size = a.shape
         batch_size, k_size, n_size = b.shape
         self.batch_size = batch_size
@@ -779,7 +779,7 @@ def is_false():
 
 
 class BatchMatmulOp(Operator):
-    def __init__(self, a: Tensor, b: Tensor, mma: str = 'simt'):
+    def __init__(self, a: Tensor, b: Tensor, mma: str = 'mma'):
         # if is_false(a.shape[0] == b.shape[0]) or is_false(a.shape[2] == b.shape[1]):
         #     raise
         if not (
@@ -795,7 +795,7 @@ def __init__(self, a: Tensor, b: Tensor, mma: str = 'simt'):
         super().__init__(inputs=[a, b], attributes={'mma': mma}, task=task)
 
 
-def batch_matmul(a: Tensor, b: Tensor, mma: str = 'simt') -> Tensor:
+def batch_matmul(a: Tensor, b: Tensor, mma: str = 'mma') -> Tensor:
     """Batched matrix multiplication.
 
     Parameters
```
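The operator-level entry point keeps its explicit `mma` argument, so callers can still pin a mode per call; only the default moves. A small sketch with hypothetical shapes:

```python
import hidet

a = hidet.randn([2, 64, 64], device='cuda')
b = hidet.randn([2, 64, 64], device='cuda')

c_default = hidet.ops.batch_matmul(a, b)           # now lowers to 'mma' kernels
c_simt = hidet.ops.batch_matmul(a, b, mma='simt')  # per-call opt-out
```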

python/hidet/graph/ops/matmul/resolve.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -96,7 +96,7 @@ class MatmulResolveRule(ResolveRule):
 
     def run_batch_matmul(self, a: Tensor, b: Tensor) -> Tensor:
         parallel_k = self.get_config('parallel_k', default='default')  # 'default', 'search', 2, 4, ...
-        mma = self.get_config('mma', default='simt')  # 'simt', 'mma'
+        mma = self.get_config('mma', default='mma')  # 'simt', 'mma'
 
         if any(not isinstance(v, int) for v in a.shape + b.shape):
             nparts = 1
```

python/hidet/graph/transforms/base.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -79,7 +79,7 @@ def __init__(self):
             'reduce_precision': None,
             # mma primitive:
             # ['simt', 'mma']
-            'mma': 'simt',
+            'mma': 'mma',
             # parallel k
             # ['default', 'disabled', 'search', 2, 4, ...]
             'parallel_k': 'default',
```
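Since this dict only supplies the default, the existing `set_mma` knob remains the escape hatch. A sketch of reverting to the old behaviour for a whole optimization pass (the traced graph is a made-up example):

```python
import hidet

# build a tiny graph just to have something to optimize
x = hidet.symbol([8, 64], dtype='float32', device='cuda')
w = hidet.randn([64, 64], device='cuda')
graph = hidet.trace_from(hidet.ops.matmul(x, w), inputs=[x])

with hidet.graph.PassContext() as ctx:
    ctx.set_mma('simt')  # restore the old default for this pass only
    graph_opt = hidet.graph.optimize(graph)
```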

tests/benchmarks/bench_op.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -151,6 +151,5 @@ def bench_reduce(params: str, *args, **kwargs) -> float:
 
     with hidet.graph.PassContext() as ctx:
         ctx.set_reduce_precision(dtype)
-        ctx.set_mma('mma')
         latency = bench_func(params, dtype)
     print(latency)
```
