// Reproduction snippet: builds a small fusion (broadcast -> add scalar ->
// add tensor), schedules the output, compiles and runs it through
// FusionExecutor, and checks the result against the same arithmetic done
// with ATen ops.
// NOTE(review): the surrounding report states that this snippet fails with an
// internal assert ("fuser_tv_ != nullptr"); the snippet itself does not show
// which of the calls below triggers it.
Fusion fusion;
// NOTE(review): presumably makes `fusion` the active fusion for the
// IR-building calls below -- confirm against FusionGuard.
FusionGuard fg(&fusion);
// Extents used for the concrete input tensors created further down.
int y = 6, z = 8;
// NOTE(review): dtype is not referenced again anywhere in this snippet.
DataType dtype = DataType::Float;
// Concrete tensors are float and live on CUDA device 0.
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
// Fusion inputs. Presumably the argument is the number of dimensions
// (1-D and 2-D), matching the shapes of t0 and t1 below -- TODO confirm.
auto tv0 = makeDummyTensor(1);
auto tv1 = makeDummyTensor(2);
fusion.addInput(tv0);
fusion.addInput(tv1);
// tv4 = (broadcast(tv0) + 1.0) + tv1.
// NOTE(review): the flags {true, false} presumably add a leading broadcast
// axis in front of tv0's existing axis -- confirm against broadcast().
auto tv2 = broadcast(tv0, {true, false});
auto tv3 = add(tv2, new Float(1.0));
auto tv4 = add(tv3, tv1);
fusion.addOutput(tv4);
// Schedule the output tensor: merge at axis 0, then split axis 0 by 128 and
// split axis 0 again by 4 (exact axis semantics are defined by TensorView and
// are not shown here).
tv4->merge(0);
tv4->split(0, 128);
tv4->split(0, 4);
// Apply computeAt on the broadcast result tv2 with tv4 as the target and
// position 1.
tv2->computeAt(tv4, 1);
// Presumably prints the generated kernel for inspection -- confirm.
fusion.printKernel();
torch::jit::fuser::cuda::FusionExecutor fe;
// Concrete inputs: randn only provides the allocation; fill_ then overwrites
// every element, so t0 is all 1.0 with shape {z} and t1 is all 3.0 with
// shape {y, z}.
at::Tensor t0 = at::randn({z}, options).fill_(1.0);
at::Tensor t1 = at::randn({y, z}, options).fill_(3.0);
fe.compileFusion(&fusion);
auto outputs = fe.runFusion({t0, t1});
// Reference result computed directly with ATen ops: (t0 + 1.0) + t1.
auto t3 = t0.add(1.0);
auto t4 = t3.add(t1);
// The first fusion output must match the reference within allclose's
// default tolerances.
TORCH_CHECK(t4.allclose(outputs[0]));
Running the snippet above produces:
C++ exception with description "fuser_tv_ != nullptr INTERNAL ASSERT FAILED at "../torch/csrc/jit/codegen/cuda/kernel_ir.h":318, please report a bug to PyTorch.
Exception raised from fuserTv at ../torch/csrc/jit/codegen/cuda/kernel_ir.h:318 (most recent call first):
// Reproduction snippet: builds a small fusion (broadcast -> add scalar ->
// add tensor), schedules the output, compiles and runs it through
// FusionExecutor, and checks the result against the same arithmetic done
// with ATen ops.
// NOTE(review): the surrounding report states that this snippet fails with an
// internal assert ("fuser_tv_ != nullptr"); the snippet itself does not show
// which of the calls below triggers it.
Fusion fusion;
// NOTE(review): presumably makes `fusion` the active fusion for the
// IR-building calls below -- confirm against FusionGuard.
FusionGuard fg(&fusion);
// Extents used for the concrete input tensors created further down.
int y = 6, z = 8;
// NOTE(review): dtype is not referenced again anywhere in this snippet.
DataType dtype = DataType::Float;
// Concrete tensors are float and live on CUDA device 0.
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
// Fusion inputs. Presumably the argument is the number of dimensions
// (1-D and 2-D), matching the shapes of t0 and t1 below -- TODO confirm.
auto tv0 = makeDummyTensor(1);
auto tv1 = makeDummyTensor(2);
fusion.addInput(tv0);
fusion.addInput(tv1);
// tv4 = (broadcast(tv0) + 1.0) + tv1.
// NOTE(review): the flags {true, false} presumably add a leading broadcast
// axis in front of tv0's existing axis -- confirm against broadcast().
auto tv2 = broadcast(tv0, {true, false});
auto tv3 = add(tv2, new Float(1.0));
auto tv4 = add(tv3, tv1);
fusion.addOutput(tv4);
// Schedule the output tensor: merge at axis 0, then split axis 0 by 128 and
// split axis 0 again by 4 (exact axis semantics are defined by TensorView and
// are not shown here).
tv4->merge(0);
tv4->split(0, 128);
tv4->split(0, 4);
// Apply computeAt on the broadcast result tv2 with tv4 as the target and
// position 1.
tv2->computeAt(tv4, 1);
// Presumably prints the generated kernel for inspection -- confirm.
fusion.printKernel();
torch::jit::fuser::cuda::FusionExecutor fe;
// Concrete inputs: randn only provides the allocation; fill_ then overwrites
// every element, so t0 is all 1.0 with shape {z} and t1 is all 3.0 with
// shape {y, z}.
at::Tensor t0 = at::randn({z}, options).fill_(1.0);
at::Tensor t1 = at::randn({y, z}, options).fill_(3.0);
fe.compileFusion(&fusion);
auto outputs = fe.runFusion({t0, t1});
// Reference result computed directly with ATen ops: (t0 + 1.0) + t1.
auto t3 = t0.add(1.0);
auto t4 = t3.add(t1);
// The first fusion output must match the reference within allclose's
// default tolerances.
TORCH_CHECK(t4.allclose(outputs[0]));
produces: