Skip to content

Commit 7c0a4e7

Browse files
ajyu authored and facebook-github-bot committed
[static runtime] convert to->to_copy (#53524)
Summary: Pull Request resolved: #53524. Adds an aten::to -> static_runtime::to_copy mapping in the ReplaceWithCopy pass so that it plays well with AliasDb. Test Plan: Run the benchmark with the CastedBatchOneHot fusion off (https://www.internalfb.com/intern/diff/view-version/123230476/) on the adindexer and adfinder models. Reviewed By: hlu1 Differential Revision: D26887050 fbshipit-source-id: 3f2fb9e27783bcdeb91c8b4181575f059317aff1
1 parent 1e99281 commit 7c0a4e7

2 files changed

Lines changed: 43 additions & 30 deletions

File tree

torch/csrc/jit/runtime/static/ops.cpp

Lines changed: 32 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -784,36 +784,39 @@ REGISTER_OPERATOR_FUNCTOR(aten::pow, aten_pow, [](Node* n) -> SROperator {
784784
};
785785
});
786786
// out variant takes precedence over native
787-
REGISTER_OPERATOR_FUNCTOR(aten::to, aten_to, [](Node* n) -> SROperator {
788-
return [](ProcessedNode* p_node) {
789-
// support 4- or 5-arg for adindexer/adfinder models
790-
DCHECK(p_node->inputs().size() >= 4);
791-
const auto& in0_t = p_node->Input(0).toTensor();
792-
auto in2_i = p_node->Input(2).toBool(); // non_blocking
793-
// ignore input 3 (copy)
794-
if (p_node->Output(0).isNone()) {
795-
auto in1_i = p_node->Input(1).toScalarType();
796-
c10::optional<c10::MemoryFormat> in4_o = c10::nullopt;
797-
if (p_node->inputs().size() > 4 && p_node->Input(4).isInt()) {
798-
in4_o = p_node->Input(4).toOptional<c10::MemoryFormat>();
799-
}
800-
if (in4_o.value_or(c10::MemoryFormat::Preserve) ==
801-
c10::MemoryFormat::Preserve) {
802-
if (in0_t.is_non_overlapping_and_dense()) {
803-
in4_o = c10::nullopt;
804-
} else {
805-
in4_o = in0_t.suggest_memory_format();
787+
REGISTER_OPERATOR_FUNCTOR(
788+
static_runtime::to_copy,
789+
aten_to_copy,
790+
[](Node* n) -> SROperator {
791+
return [](ProcessedNode* p_node) {
792+
// support 4- or 5-arg for adindexer/adfinder models
793+
DCHECK(p_node->inputs().size() >= 4);
794+
const auto& in0_t = p_node->Input(0).toTensor();
795+
auto in2_i = p_node->Input(2).toBool(); // non_blocking
796+
// ignore input 3 (copy)
797+
if (p_node->Output(0).isNone()) {
798+
auto in1_i = p_node->Input(1).toScalarType();
799+
c10::optional<c10::MemoryFormat> in4_o = c10::nullopt;
800+
if (p_node->inputs().size() > 4 && p_node->Input(4).isInt()) {
801+
in4_o = p_node->Input(4).toOptional<c10::MemoryFormat>();
802+
}
803+
if (in4_o.value_or(c10::MemoryFormat::Preserve) ==
804+
c10::MemoryFormat::Preserve) {
805+
if (in0_t.is_non_overlapping_and_dense()) {
806+
in4_o = c10::nullopt;
807+
} else {
808+
in4_o = in0_t.suggest_memory_format();
809+
}
810+
}
811+
// See Note [Explicit nullopt MemoryFormat argument]
812+
p_node->Output(0) = at::detail::empty_cpu(
813+
{0}, in1_i, in0_t.layout(), in0_t.device(), c10::nullopt, in4_o);
806814
}
807-
}
808-
// See Note [Explicit nullopt MemoryFormat argument]
809-
p_node->Output(0) = at::detail::empty_cpu(
810-
{0}, in1_i, in0_t.layout(), in0_t.device(), c10::nullopt, in4_o);
811-
}
812-
auto& out_t = p_node->Output(0).toTensor();
813-
fastResizeToZero(out_t);
814-
at::native::to_copy_out(out_t, in0_t, in2_i);
815-
};
816-
});
815+
auto& out_t = p_node->Output(0).toTensor();
816+
fastResizeToZero(out_t);
817+
at::native::to_copy_out(out_t, in0_t, in2_i);
818+
};
819+
});
817820

818821
// Out variants for view ops are registered to a separate registry because
819822
// their outputs (views) can't participate in memory reuse.

torch/csrc/jit/runtime/static/passes.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,14 @@ TORCH_LIBRARY_FRAGMENT(static_runtime, m) {
332332
at::native::copy_(out, self);
333333
return out.permute(dims);
334334
});
335+
m.def(
336+
"static_runtime::to_copy(Tensor self, ScalarType dtype, bool non_blocking, bool copy) -> Tensor",
337+
[](at::Tensor self, at::ScalarType dtype, bool non_blocking, bool copy)
338+
-> at::Tensor {
339+
at::Tensor out = at::empty_like(self);
340+
at::native::copy_(out, self);
341+
return out.to(dtype, non_blocking, copy);
342+
});
335343
}
336344

337345
void ReplaceWithCopy(std::shared_ptr<torch::jit::Graph>& graph) {
@@ -357,7 +365,9 @@ void ReplaceWithCopy(std::shared_ptr<torch::jit::Graph>& graph) {
357365
{c10::Symbol::fromQualString("aten::permute"),
358366
c10::Symbol::fromQualString("static_runtime::permute_copy")},
359367
{c10::Symbol::fromQualString("aten::narrow"),
360-
c10::Symbol::fromQualString("aten::narrow_copy")}};
368+
c10::Symbol::fromQualString("aten::narrow_copy")},
369+
{c10::Symbol::fromQualString("aten::to"),
370+
c10::Symbol::fromQualString("static_runtime::to_copy")}};
361371
std::vector<std::pair<Node*, Node*>> replacement;
362372
for (auto* n : graph->nodes()) {
363373
if (!supported.count(n->kind())) {

0 commit comments

Comments (0)