@@ -195,6 +195,32 @@ void batchedTensorInplaceForLoopFallback(const c10::OperatorHandle& op, torch::j
   torch::jit::push(stack, self);
 }
 
+static Tensor safeStack(TensorList tensors) {
+  auto is_defined = [](const Tensor& t) { return t.defined(); };
+  if (std::all_of(tensors.begin(), tensors.end(), is_defined)) {
+    return at::stack(tensors);
+  }
+  // NOTE [vmap through backward and undefined grad]
+  // While vmapping through backward functions (to compute batched grad), it
+  // is possible for the backward function to return an undefined grad for some
+  // grad_input for each example. In that case, we return an undefined grad.
+  //
+  // It is theoretically possible for *some* of the examples to produce an
+  // undefined grad (a kernel could peek at the gradient values and return an
+  // undefined tensor if it determines the gradient is full of zeros). We
+  // could handle this by treating the undefined grad as a zero-filled tensor
+  // of the correct shape while stacking the tensors together. However, I expect
+  // this to happen very rarely (I have not been able to find an example in our
+  // codebase), so we just error out in this case.
+  if (std::none_of(tensors.begin(), tensors.end(), is_defined)) {
+    return Tensor();
+  }
+  TORCH_CHECK(false,
+      "vmap: slow fallback received a mix of undefined and defined tensors ",
+      "as the result of an operation. This is not supported, please file us ",
+      "an issue on github.");
+}
+
 // The general flow of the algorithm is as follows.
 // - First, we figure out which arguments are BatchedTensors and save them
 //   to a vector. We also store a vector of which index of the arguments list
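
A minimal standalone sketch of the three cases the new safeStack helper distinguishes (a hypothetical driver, assuming an ATen build; safeStack is file-local to the fallback, so this restates its contract with plain ATen calls rather than invoking it):

#include <ATen/ATen.h>
#include <vector>

int main() {
  at::Tensor grad = at::ones({3});
  at::Tensor undef;  // a default-constructed Tensor is "undefined"

  // Case 1: every per-example grad is defined -> stacking succeeds and
  // yields the batched grad of shape [2, 3]; safeStack returns at::stack.
  std::vector<at::Tensor> all_defined = {grad, grad};
  at::Tensor batched = at::stack(all_defined);

  // Case 2: every per-example grad is undefined -> safeStack returns an
  // undefined Tensor(), which the fallback pushes through unchanged.
  std::vector<at::Tensor> all_undefined = {undef, undef};

  // Case 3: a mix of defined and undefined grads -> safeStack errors out
  // via TORCH_CHECK rather than zero-filling the undefined shards.
  std::vector<at::Tensor> mixed = {grad, undef};

  (void)batched; (void)all_undefined; (void)mixed;
  return 0;
}
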
@@ -318,7 +344,12 @@ void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Sta
   auto output_shards_chunks = MatrixRef<Tensor>(output_shards, num_batches);
   for (int64_t return_idx = 0; return_idx < num_returns; ++return_idx) {
     auto shards = output_shards_chunks[return_idx];
-    auto flat_output = at::stack(shards);
+    auto flat_output = safeStack(shards);
+    // See NOTE [vmap through backward and undefined grad]
+    if (!flat_output.defined()) {
+      torch::jit::push(stack, flat_output);
+      continue;
+    }
     VmapDimVector output_sizes(batch_sizes);
     output_sizes.insert(
         output_sizes.end(),
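
For context on where all-undefined shards come from, a hedged sketch (assuming a libtorch build) of a backward pass that legitimately yields an undefined grad: torch::autograd::grad with allow_unused=true returns a default-constructed Tensor for an input the output does not depend on, and per-example grads like this are what the new !flat_output.defined() branch forwards to the caller.

#include <torch/torch.h>

int main() {
  auto x = torch::ones({3}, torch::requires_grad());
  auto y = torch::ones({3}, torch::requires_grad());
  auto out = (x * x).sum();  // out does not depend on y

  // With allow_unused=true, the grad w.r.t. y comes back undefined
  // instead of raising an error.
  auto grads = torch::autograd::grad(
      /*outputs=*/{out}, /*inputs=*/{x, y},
      /*grad_outputs=*/{}, /*retain_graph=*/false,
      /*create_graph=*/false, /*allow_unused=*/true);

  TORCH_CHECK(grads[0].defined());   // d(out)/dx exists
  TORCH_CHECK(!grads[1].defined());  // d(out)/dy is undefined
  return 0;
}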