Skip to content

Commit f11c4f9

Browse files
csarofeen authored and facebook-github-bot committed
New CUDA Fuser: Unrolling support, interface refactor (#36435)
Summary: Unrolling support has been added in a way that we get good performing code on GPUs. Not sure how long this link will last but an example of a generated unrolled kernel is: https://godbolt.org/z/i0uAv3 What can be seen from there is multiple calls of "ld.global.f32" without "st.global.f32" in between them (and vice versa). This means that we are launching multiple loads that can be run in parallel, as well as multiple stores that can be run in parallel. This can be a crucial optimization for memory-bound kernels. This was generally a point of concern in TVM, as an attempt at a similar kernel from TVM produces: https://godbolt.org/z/Vu97vG which surrounds load-store pairs in conditional branches, preventing the benefits of unrolling. Pull Request resolved: #36435 Reviewed By: ZolotukhinM Differential Revision: D21024011 Pulled By: soumith fbshipit-source-id: e852e282fa7a304aba962e1926f756098c011fe0
1 parent d7fabfd commit f11c4f9

44 files changed

Lines changed: 2841 additions & 1208 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

caffe2/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -587,8 +587,11 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
587587
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_iostream.cpp
588588
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/iter_visitor.cpp
589589
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel.cpp
590+
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_cache.cpp
590591
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/manager.cpp
591592
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/mutator.cpp
593+
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_loops.cpp
594+
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_utils.cpp
592595
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower2device.cpp
593596
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/parser.cpp
594597
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/partition.cpp

test/cpp/jit/test_gpu.cpp

Lines changed: 193 additions & 121 deletions
Large diffs are not rendered by default.

test/cpp/jit/tests.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -117,7 +117,8 @@ namespace jit {
117117
_(GPU_FusionCodeGen2) \
118118
_(GPU_FusionSimplePWise) \
119119
_(GPU_FusionExecKernel) \
120-
_(GPU_FusionForLoop)
120+
_(GPU_FusionForLoop) \
121+
_(GPU_FusionLoopUnroll)
121122
#else
122123
#define TH_FORALL_TESTS_CUDA(_) \
123124
_(ArgumentSpec) \

test/test_jit_cuda_fuser.py

Lines changed: 41 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -86,20 +86,57 @@ def t(x, y, z, q):
8686
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires profiling node to run cuda fuser")
8787
@skipIfRocm
8888
def test_scalar_input(self):
89-
def t(x, y, z):
90-
# type: (Tensor, Tensor, float) -> Tensor
89+
def t(x : torch.Tensor, y : torch.Tensor, z : float):
9190
o = x + y
9291
o = o + z
9392
return o
9493
t_jit = torch.jit.script(t)
95-
x = torch.randn(4, 8, dtype=torch.float, device="cuda")
96-
y = torch.randn(4, 8, dtype=torch.float, device="cuda")
94+
x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
95+
y = torch.randn(4, 8, 1, 32, dtype=torch.float, device="cuda")
96+
y = y.expand(4, 8, 32, 32)
97+
jit_o = t_jit(x, y, 2.0)
98+
jit_o = t_jit(x, y, 2.0)
99+
o = t(x, y, 2.0)
100+
self.assertEqual(o, jit_o)
101+
self.assertTrue(self._has_cuda_fusion_group(t_jit.graph_for(x, y, 2.0)))
102+
103+
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
104+
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires profiling node to run cuda fuser")
105+
@skipIfRocm
106+
def test_broadcasting(self):
107+
def t(x : torch.Tensor, y : torch.Tensor, z : float):
108+
o = x + y
109+
o = o + z
110+
return o
111+
t_jit = torch.jit.script(t)
112+
x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
113+
y = torch.randn(32, 32, dtype=torch.float, device="cuda")
97114
jit_o = t_jit(x, y, 2.0)
98115
jit_o = t_jit(x, y, 2.0)
99116
o = t(x, y, 2.0)
100117
self.assertEqual(o, jit_o)
101118
self.assertTrue(self._has_cuda_fusion_group(t_jit.graph_for(x, y, 2.0)))
102119

120+
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
121+
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires profiling node to run cuda fuser")
122+
@skipIfRocm
123+
def test_broadcasting_multiple_output_shape(self):
124+
def t(x : torch.Tensor, y : torch.Tensor, z : torch.Tensor):
125+
o = x + 12
126+
o1 = o + y
127+
o2 = o + z
128+
oo = o1.sum() + o2.sum()
129+
return oo
130+
t_jit = torch.jit.script(t)
131+
x = torch.randn(32, 32, dtype=torch.float, device="cuda")
132+
y = torch.randn(2, 32, 32, dtype=torch.float, device="cuda")
133+
z = torch.randn(4, 32, 32, dtype=torch.float, device="cuda")
134+
jit_o = t_jit(x, y, z)
135+
jit_o = t_jit(x, y, z)
136+
o = t(x, y, z)
137+
self.assertEqual(o, jit_o)
138+
# Currently cannot fuse this
139+
self.assertFalse(self._has_cuda_fusion_group(t_jit.graph_for(x, y, z)))
103140

104141
if __name__ == '__main__':
105142
run_tests()

tools/build_variables.bzl

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -246,6 +246,9 @@ libtorch_cuda_sources = [
246246
"torch/csrc/jit/codegen/cuda/ir_iostream.cpp",
247247
"torch/csrc/jit/codegen/cuda/iter_visitor.cpp",
248248
"torch/csrc/jit/codegen/cuda/kernel.cpp",
249+
"torch/csrc/jit/codegen/cuda/kernel_cache.cpp",
250+
"torch/csrc/jit/codegen/cuda/lower_loops.cpp",
251+
"torch/csrc/jit/codegen/cuda/lower_utils.cpp",
249252
"torch/csrc/jit/codegen/cuda/lower2device.cpp",
250253
"torch/csrc/jit/codegen/cuda/manager.cpp",
251254
"torch/csrc/jit/codegen/cuda/mutator.cpp",

torch/csrc/jit/codegen/cuda/data_struct_str.h

Lines changed: 0 additions & 11 deletions
This file was deleted.

torch/csrc/jit/codegen/cuda/dispatch.cpp

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
#include <torch/csrc/jit/codegen/cuda/fusion.h>
22
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
3-
#include <torch/csrc/jit/codegen/cuda/tensor.h>
43
#include <torch/csrc/jit/codegen/cuda/type.h>
54

65
#include <torch/csrc/jit/codegen/cuda/dispatch.h>

torch/csrc/jit/codegen/cuda/fusion.cpp

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
#include <torch/csrc/jit/codegen/cuda/fusion.h>
2+
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
23
#include <torch/csrc/jit/codegen/cuda/ir_printer.h>
34

45
namespace torch {
@@ -33,6 +34,22 @@ std::vector<Expr*> ExprSort::getExprs(
3334
return es.exprs;
3435
}
3536

37+
void InputsOf::handle(TensorView* tv) {
38+
if (FusionGuard::getCurFusion()->hasInput(tv))
39+
inputs.push_back(tv);
40+
}
41+
42+
std::vector<TensorView*> InputsOf::output(Fusion* fusion, Val* output_) {
43+
TORCH_CHECK(
44+
fusion->hasOutput(output_),
45+
"Asked for the inputs of ",
46+
output_,
47+
" however, it is not an output of the provided fusion.");
48+
InputsOf io;
49+
io.traverseFrom(FusionGuard::getCurFusion(), {output_});
50+
return io.inputs;
51+
}
52+
3653
Fusion::~Fusion() {
3754
{
3855
auto it = val_set_.begin();
@@ -140,6 +157,10 @@ std::vector<Expr*> Fusion::exprs(bool from_outputs_only, bool breadth_first) {
140157
return ExprSort::getExprs(this, from_outputs_only, breadth_first);
141158
}
142159

160+
std::vector<TensorView*> Fusion::inputsOf(Val* val) {
161+
return InputsOf::output(this, val);
162+
}
163+
143164
void Fusion::print() {
144165
FusionGuard fg(this);
145166
std::cout << "%kernel {\n";

torch/csrc/jit/codegen/cuda/fusion.h

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,7 @@ struct TypeHash {
4949
*/
5050

5151
struct Fusion;
52+
struct TensorView;
5253

5354
// Fusion Guard is our "context manager". It holds the actrive fusion and allows
5455
// it to be accessed anywhere through FusionGuard::getCurFusion().
@@ -79,6 +80,20 @@ struct ExprSort : public IterVisitor {
7980
bool breadth_first);
8081
};
8182

83+
// Expr sort will take a fusion and return a topologically sorted list of
84+
// expressions.
85+
struct InputsOf : public IterVisitor {
86+
using IterVisitor::handle;
87+
88+
private:
89+
std::vector<TensorView*> inputs;
90+
91+
void handle(TensorView* tv) override;
92+
93+
public:
94+
static std::vector<TensorView*> output(Fusion* fusion, Val* output_);
95+
};
96+
8297
/*
8398
* Fusion is mutable but unique. Nodes cannot be copied in any way from one
8499
* Fusion to another. If anything like that is desired, it would require
@@ -139,6 +154,8 @@ struct TORCH_CUDA_API Fusion : public IRInputOutput {
139154
bool from_outputs_only = false,
140155
bool breadth_first = false);
141156

157+
std::vector<TensorView*> inputsOf(Val* val);
158+
142159
// Print this fusion to cout.
143160
void print();
144161

@@ -174,8 +191,6 @@ struct TORCH_CUDA_API Fusion : public IRInputOutput {
174191
// Return the Expr that produces val (const version)
175192
const Expr* origin(const Val* val) const;
176193

177-
bool lowered = false;
178-
179194
private:
180195
// Sets of all Vals/Exprs registered with this fusion
181196
std::set<Val*> val_set_;

torch/csrc/jit/codegen/cuda/index_compute.cpp

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@ void IndexCompute::replayBackward(Merge* expr) {
2121
ax >= 0 && ax < indices.size(),
2222
"Hit an invalid MERGE transformation during IndexCompute, axis is not within bounds.");
2323

24-
Val* I = expr->in()->axis(ax + 1)->size();
24+
Val* I = expr->in()->axis(ax + 1)->extent();
2525
Val* ind = indices[ax];
2626
indices[ax] = div(ind, I);
2727
indices.insert(indices.begin() + ax + 1, mod(ind, I));
@@ -62,18 +62,18 @@ IndexCompute::IndexCompute(const TensorView* tv, std::vector<Val*> _indices) {
6262

6363
TensorDomain* td = tv->domain();
6464

65-
bool exclude_reduction = td->size() > indices.size();
65+
bool exclude_reduction = td->nDims() > indices.size();
6666

6767
TORCH_CHECK(
68-
exclude_reduction || td->size() == indices.size(),
68+
exclude_reduction || td->nDims() == indices.size(),
6969
"For IndexCompute the number of axis should match the number of dimensions"
7070
" in the TensorView.");
7171

7272
// If we need to ignore the reduction dimensions because a tensor is
7373
// being consumed, not produced, then insert dummy dimensions in the
7474
// indices for bookkeeping while replaying split/merge/reorder operations.
7575
if (exclude_reduction)
76-
for (decltype(td->size()) i{0}; i < td->size(); i++)
76+
for (decltype(td->nDims()) i{0}; i < td->nDims(); i++)
7777
if (td->axis(i)->isReduction())
7878
indices.insert(indices.begin() + i, new Int(-1));
7979

@@ -83,15 +83,15 @@ IndexCompute::IndexCompute(const TensorView* tv, std::vector<Val*> _indices) {
8383
TensorDomain* root = TransformIter::runBackward(td, true);
8484

8585
TORCH_INTERNAL_ASSERT(
86-
root->size() == indices.size(),
86+
root->nDims() == indices.size(),
8787
"Error during IndexCompute. The number of indices generated"
8888
" after running the transformations backwards should match"
8989
" the number of dimensions of the root TensorView.");
9090

9191
// Remove indices associated with reduction axes, we had them just for
9292
// bookkeeping.
9393
if (exclude_reduction) {
94-
for (auto i = root->size() - 1; i >= 0; i--)
94+
for (auto i = root->nDims() - 1; i >= 0; i--)
9595
if (root->axis(i)->isReduction())
9696
indices.erase(indices.begin() + i);
9797
}

0 commit comments

Comments (0)