Skip to content

Commit 23d9a8b

Browse files
author
Mike Ruberry
committed
Merge branch 'master' of https://github.com/pytorch/pytorch into warn_integer_div
2 parents 2330e90 + eef17ed commit 23d9a8b

66 files changed

Lines changed: 1131 additions & 697 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/lint.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ jobs:
105105
run: |
106106
set -eux
107107
pip install flake8
108-
rm -rf .circleci
108+
rm -rf .circleci tools/clang_format_new.py
109109
flake8 --exit-zero > ${GITHUB_WORKSPACE}/flake8-output.txt
110110
cat ${GITHUB_WORKSPACE}/flake8-output.txt
111111
- name: Add annotations

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,3 +252,6 @@ TAGS
252252

253253
# clang-format storage location used by apply_clang_format.py
254254
.clang-format-bin
255+
256+
# clangd background index
257+
.clangd/

android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@ namespace {
2626
struct JITCallGuard {
2727
// AutoGrad is disabled for mobile by default.
2828
torch::autograd::AutoGradMode no_autograd_guard{false};
29+
// VariableType dispatch is not included in the default mobile build. We need to set
30+
// this guard globally to avoid dispatch error (only for dynamic dispatch).
31+
// Thanks to the unification of Variable class and Tensor class it's no longer
32+
// required to toggle the NonVariableTypeMode per op - so it doesn't hurt to
33+
// always set NonVariableTypeMode for inference only use case.
34+
torch::AutoNonVariableTypeMode non_var_guard{true};
2935
// Disable graph optimizer to ensure list of unused ops are not changed for
3036
// custom mobile build.
3137
torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard{false};
@@ -111,11 +117,11 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
111117
/* need_inputs */ false,
112118
/* sampled */ false);
113119
#endif
114-
JITCallGuard guard;
115120
}
116121

117122
PytorchJni(facebook::jni::alias_ref<jstring> modelPath) {
118123
preModuleLoadSetup();
124+
JITCallGuard guard;
119125
module_ = torch::jit::load(std::move(modelPath->toStdString()));
120126
module_.eval();
121127
}
@@ -147,6 +153,7 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
147153
"Could not get buffer for asset '%s'",
148154
assetName->toStdString().c_str());
149155
}
156+
JITCallGuard guard;
150157
module_ = torch::jit::load(torch::make_unique<MemoryReadAdapter>(
151158
assetBuffer, AAsset_getLength(asset)));
152159
AAsset_close(asset);

android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,23 @@
1212

1313
#include "pytorch_jni_common.h"
1414

15-
using namespace pytorch_jni;
16-
1715
namespace pytorch_jni {
1816

17+
namespace {
18+
19+
struct LiteJITCallGuard {
20+
// VariableType dispatch is not included in the default mobile build. We need to set
21+
// this guard globally to avoid dispatch error (only for dynamic dispatch).
22+
// Thanks to the unification of Variable class and Tensor class it's no longer
23+
// required to toggle the NonVariableTypeMode per op - so it doesn't hurt to
24+
// always set NonVariableTypeMode for inference only use case.
25+
// TODO: avoid having to set this guard for custom mobile build with mobile
26+
// interpreter.
27+
torch::AutoNonVariableTypeMode non_var_guard{true};
28+
};
29+
30+
} // namespace
31+
1932
class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
2033
private:
2134
friend HybridBase;
@@ -31,6 +44,7 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
3144
}
3245

3346
PytorchJni(facebook::jni::alias_ref<jstring> modelPath) {
47+
LiteJITCallGuard guard;
3448
module_ = torch::jit::_load_for_mobile(std::move(modelPath->toStdString()));
3549
}
3650

@@ -55,8 +69,7 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
5569
}
5670

5771
auto output = [&]() {
58-
torch::autograd::AutoGradMode guard(false);
59-
at::AutoNonVariableTypeMode non_var_type_mode(true);
72+
LiteJITCallGuard guard;
6073
return module_.forward(inputs);
6174
}();
6275
return JIValue::newJIValueFromAtIValue(output);
@@ -78,7 +91,7 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
7891
}
7992
if (auto method = module_.find_method(methodName)) {
8093
auto output = [&]() {
81-
at::AutoNonVariableTypeMode non_var_type_mode(true);
94+
LiteJITCallGuard guard;
8295
return module_.run_method(methodName, inputs);
8396
}();
8497
return JIValue::newJIValueFromAtIValue(output);

aten/src/ATen/native/BinaryOps.cpp

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -108,13 +108,6 @@ Tensor& remainder_(Tensor& self, const Tensor& other) {
108108
return native::remainder_out(self, self, other);
109109
}
110110

111-
Tensor truncate(const Tensor& tensor) {
112-
if (tensor.is_floating_point()) {
113-
return tensor.trunc();
114-
}
115-
return tensor;
116-
}
117-
118111
Tensor& true_divide_out(Tensor& result, const Tensor& self, const Tensor& divisor) {
119112
TORCH_CHECK(!isIntegralType(result.scalar_type(), /*includeBool=*/ true),
120113
"True division requires a floating output type, but got ",
@@ -145,14 +138,34 @@ Tensor true_divide(const Tensor& self, const Tensor& divisor) {
145138
return iter.output();
146139
}
147140

148-
Tensor floor_divide(const Tensor& input, const Tensor& other) {
149-
Tensor out = input / other;
150-
return truncate(out);
141+
Tensor& floor_divide_out(Tensor& result, const Tensor& self, const Tensor& other) {
142+
auto iter = TensorIterator::binary_op(result, self, other,
143+
/*check_mem_overlap=*/true);
144+
div_stub(iter.device_type(), iter);
145+
146+
if (result.is_floating_point()) {
147+
result.trunc_();
148+
}
149+
150+
return result;
151+
}
152+
153+
Tensor floor_divide(const Tensor& self, const Tensor& other) {
154+
Tensor result;
155+
auto iter = TensorIterator::binary_op(result, self, other);
156+
157+
div_stub(iter.device_type(), iter);
158+
159+
auto out = iter.output();
160+
if (out.is_floating_point()) {
161+
out.trunc_();
162+
}
163+
164+
return out;
151165
}
152166

153-
Tensor floor_divide(const Tensor& input, Scalar other) {
154-
Tensor out = input / other;
155-
return truncate(out);
167+
Tensor& floor_divide_(Tensor& self, const Tensor& other) {
168+
return native::floor_divide_out(self, self, other);
156169
}
157170

158171
Tensor& mul_out(Tensor& result, const Tensor& self, const Tensor& other) {
@@ -675,6 +688,14 @@ Tensor min(const Tensor& self, const Tensor& other) {
675688

676689
Tensor& min_(Tensor& self, const Tensor& other) { return at::min_out(self, self, other); }
677690

691+
Tensor floor_divide(const Tensor& self, Scalar other) {
692+
return at::floor_divide(self, wrapped_scalar_tensor(other));
693+
}
694+
695+
Tensor& floor_divide_(Tensor& self, Scalar other) {
696+
return at::floor_divide_out(self, self, wrapped_scalar_tensor(other));
697+
}
698+
678699
Tensor& fmod_out(Tensor & result, const Tensor& self, const Tensor& other) {
679700
auto iter = TensorIterator::binary_op(result, self, other,
680701
/*check_mem_overlap=*/true);

aten/src/ATen/native/native_functions.yaml

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1306,10 +1306,38 @@
13061306
CPU: floor_out
13071307
CUDA: floor_out
13081308

1309-
- func: floor_divide(Tensor input, Tensor other) -> Tensor
1309+
- func: floor_divide(Tensor self, Tensor other) -> Tensor
1310+
variants: function, method
1311+
dispatch:
1312+
CPU: floor_divide
1313+
CUDA: floor_divide
1314+
SparseCPU: floor_divide_sparse
1315+
SparseCUDA: floor_divide_sparse
13101316
supports_named_tensor: True
13111317

1312-
- func: floor_divide.Scalar(Tensor input, Scalar other) -> Tensor
1318+
- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
1319+
variants: method
1320+
dispatch:
1321+
CPU: floor_divide_
1322+
CUDA: floor_divide_
1323+
SparseCPU: floor_divide_sparse_
1324+
SparseCUDA: floor_divide_sparse_
1325+
supports_named_tensor: True
1326+
1327+
- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
1328+
dispatch:
1329+
CPU: floor_divide_out
1330+
CUDA: floor_divide_out
1331+
SparseCPU: floor_divide_out_sparse_zerodim
1332+
SparseCUDA: floor_divide_out_sparse_zerodim
1333+
supports_named_tensor: True
1334+
1335+
- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
1336+
variants: function, method
1337+
supports_named_tensor: True
1338+
1339+
- func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
1340+
variants: method
13131341
supports_named_tensor: True
13141342

13151343
- func: frac(Tensor self) -> Tensor

0 commit comments

Comments
 (0)