Skip to content

Commit 1c70eae

Browse files
committed
Merge branch 'main' into bf/noop-elimination
2 parents ee28fed + 1f29190 commit 1c70eae

64 files changed

Lines changed: 1540 additions & 460 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
4022ff142a5392aa5197e05f4dfe85d356f742bf
1+
047bbc720fda70cd5742c76b3c9e01d504577d65

aten/src/ATen/native/TensorShape.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3366,7 +3366,7 @@ static std::vector<Tensor> _pad_chunk(
33663366
std::vector<int64_t> view_sizes(
33673367
tensor_size.begin(), tensor_size.begin() + dim);
33683368
view_sizes.insert(view_sizes.end(), {num_chunks, -1});
3369-
padded_tensors.push_back(padded_tensor.view(view_sizes));
3369+
padded_tensors.push_back(padded_tensor.reshape(view_sizes));
33703370
}
33713371
return padded_tensors;
33723372
}

aten/src/ATen/native/cuda/CUDALoops.cuh

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -612,28 +612,41 @@ struct check_binary_functor_types_for_specialization<
612612
};
613613

614614
// The following is a list of type specializations for vectorized_templated
615-
// elementwise kernel. It refers to the first and second runtime types of the
616-
// arguments of a binary functor.
617-
615+
// elementwise kernel. The three types refer to runtime types of the output
616+
// tensor, first tensor argument, and the second tensor argument used for a
617+
// binary functor.
618618
constexpr std::array rt_binary_specializations = {
619-
std::array<c10::ScalarType, 2>(
619+
std::array<c10::ScalarType, 3>(
620620
{c10::CppTypeToScalarType<float>::value,
621+
c10::CppTypeToScalarType<float>::value,
621622
c10::CppTypeToScalarType<BFloat16>::value}),
622-
std::array<c10::ScalarType, 2>(
623+
std::array<c10::ScalarType, 3>(
624+
{c10::CppTypeToScalarType<float>::value,
625+
c10::CppTypeToScalarType<BFloat16>::value,
626+
c10::CppTypeToScalarType<float>::value}),
627+
std::array<c10::ScalarType, 3>(
623628
{c10::CppTypeToScalarType<BFloat16>::value,
629+
c10::CppTypeToScalarType<BFloat16>::value,
624630
c10::CppTypeToScalarType<float>::value}),
625-
std::array<c10::ScalarType, 2>(
631+
std::array<c10::ScalarType, 3>(
626632
{c10::CppTypeToScalarType<float>::value,
633+
c10::CppTypeToScalarType<float>::value,
627634
c10::CppTypeToScalarType<Half>::value}),
628-
std::array<c10::ScalarType, 2>(
635+
std::array<c10::ScalarType, 3>(
636+
{c10::CppTypeToScalarType<float>::value,
637+
c10::CppTypeToScalarType<Half>::value,
638+
c10::CppTypeToScalarType<float>::value}),
639+
std::array<c10::ScalarType, 3>(
629640
{c10::CppTypeToScalarType<Half>::value,
641+
c10::CppTypeToScalarType<Half>::value,
630642
c10::CppTypeToScalarType<float>::value})};
631643

632644
bool check_binary_rt_types_for_specialization(TensorIteratorBase& iter) {
633645
if (iter.ninputs() != 2)
634646
return false;
635647
for (auto spec : rt_binary_specializations)
636-
if (iter.input_dtype(0) == spec[0] && iter.input_dtype(1) == spec[1])
648+
if (iter.dtype(0) == spec[0] && iter.input_dtype(0) == spec[1] &&
649+
iter.input_dtype(1) == spec[2])
637650
return true;
638651
return false;
639652
}
@@ -648,6 +661,7 @@ struct type_specialized_kernel_launcher {
648661
typename loader_t,
649662
typename storer_t>
650663
static void apply(
664+
ScalarType ret_t,
651665
ScalarType arg0_t,
652666
ScalarType arg1_t,
653667
int64_t numel,
@@ -657,22 +671,22 @@ struct type_specialized_kernel_launcher {
657671
out_calc_t output_offset_calculator,
658672
loader_t loader,
659673
storer_t storer) {
660-
using traits = function_traits<func_t>;
661-
using return_t = typename traits::result_type;
662-
if (arg0_t == rt_binary_specializations[arg_index][0] &&
663-
arg1_t == rt_binary_specializations[arg_index][1])
674+
if (ret_t == rt_binary_specializations[arg_index][0] &&
675+
arg0_t == rt_binary_specializations[arg_index][1] &&
676+
arg1_t == rt_binary_specializations[arg_index][2])
664677
launch_vectorized_templated_kernel<
665678
func_t,
666679
array_t,
667680
inp_calc_t,
668681
out_calc_t,
669682
loader_t,
670683
storer_t,
671-
return_t,
672684
decltype(c10::impl::ScalarTypeToCPPType<
673685
rt_binary_specializations[arg_index][0]>::t),
674686
decltype(c10::impl::ScalarTypeToCPPType<
675-
rt_binary_specializations[arg_index][1]>::t)>(
687+
rt_binary_specializations[arg_index][1]>::t),
688+
decltype(c10::impl::ScalarTypeToCPPType<
689+
rt_binary_specializations[arg_index][2]>::t)>(
676690
numel,
677691
f,
678692
data,
@@ -712,7 +726,6 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
712726
#ifdef USE_ROCM
713727
// Attempt to call specialized vectorized elementwise kernel
714728
// that enables interleaving.
715-
716729
if (check_binary_rt_types_for_specialization(iter) &&
717730
memory::can_vectorize_up_to<func_t>(data) > 1) {
718731
// constexpr to reduce the amount of kernels generated for
@@ -740,6 +753,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
740753
type_specialized_kernel_launcher,
741754
rt_binary_specializations.size()>::
742755
with_args(
756+
iter.dtype(0),
743757
iter.input_dtype(0),
744758
iter.input_dtype(1),
745759
numel,

aten/src/ATen/native/cuda/MemoryAccess.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -407,8 +407,8 @@ struct vectorized_templated {
407407
// float(float,bfloat16) and functor add on float(float,float).
408408
template <typename scalar_t>
409409
__device__ inline void store(scalar_t* from, int idx) {
410-
using vec_t = aligned_vector<scalar_t, vec_size>;
411-
scalar_t* to = reinterpret_cast<scalar_t*>(data[0]) + block_work_size * idx;
410+
using vec_t = aligned_vector<CastToT, vec_size>;
411+
CastToT* to = reinterpret_cast<CastToT*>(data[0]) + block_work_size * idx;
412412
vec_t* to_ = reinterpret_cast<vec_t*>(to);
413413
int thread_idx = threadIdx.x;
414414
#pragma unroll

aten/src/ATen/native/cuda/TensorShape.cu

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -422,11 +422,12 @@ static __global__ void chunk_cat_cuda_kernel(
422422
}
423423

424424
bool all_contiguous(TensorList tensors) {
425-
bool contiguous = true;
426425
for (const auto& t : tensors) {
427-
contiguous &= t.is_non_overlapping_and_dense();
426+
if (!t.is_contiguous()) {
427+
return false;
428+
}
428429
}
429-
return contiguous;
430+
return true;
430431
}
431432

432433
// Get leading dimensions before `dim`-th dimension.

aten/src/ATen/native/transformers/attention.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -449,7 +449,7 @@ REGISTER_AVX512_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
449449
REGISTER_VSX_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
450450
REGISTER_ZVECTOR_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
451451
REGISTER_SVE256_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
452-
REGISTER_HPU_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_meta);
452+
REGISTER_HPU_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_meta)
453453

454454
int64_t _fused_sdp_choice_meta(
455455
const Tensor& query_,

test/cpp_extensions/open_registration_extension/README.md

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,37 @@
1+
# PyTorch OpenReg
2+
13
This folder contains a self-contained example of a PyTorch out-of-tree backend leveraging the "PrivateUse1" backend from core.
24

35
## How to use
6+
47
Install as standalone with `python setup.py develop` (or install) from this folder.
5-
You can run test via `python test/test_openreg.py`.
8+
You can run the tests via `python {PYTORCH_ROOT_PATH}/test/test_openreg.py`.
69

710
## Design principles
11+
812
For simplicity anything that can be implemented from python is done so.
913
A real implementation will most likely want to call these different APIs from c++ directly.
1014

1115
The current version sends everything back to python and contains enough implementation to run basic model, transfer host/device and printing.
1216

1317
The codebase is split as follows:
14-
- `pytorch_openreg/__init__.py` imports torch to get core state initialized, imports `._aten_impl` to register our aten op implementations to torch, imports `.C` to load our c++ extension that registers more ops, allocator and hooks and finally renames the PrivateUse1 backend and register our python-side module.
15-
- `pytorch_openreg/_aten_impl.py` does two main things. Use the `_register_same_name()` function to register hooks from c++ (like getDevice, getStream, etc) and send them to our device daemon. Define a new `torch.Library` that registers a fallback that will be called whenever a backend kernel for PrivateUse1 is called. It contains the logic to handle all kind of native functions, computing the output metadata, allocating it and only calling into the device daemon to perform computation
16-
- `pytorch_openreg/_device_daemon.py` contains the Allocator (responsible for allocating memory on the device side, as int8 buffers, and recreating nice looking Tensors on the device side to be able to use aten ops to run code there), `run_op` that is the logic running on the device side to perform compute (for simplicity of coverage, we are re-building full blown Tensors here and calling aten ops on them). It also contains the Daemon responsible for the device worker process and sending data back and forth.
17-
- `pytorch_openreg/_meta_parser.py` mainly contain utilities to send objects over the wire from the user process to the device process. The main class there is `OpenRegTensorMeta` that contains all the metadata sent to the device which should be enough for it to populate the output Tensor.
18+
19+
- `pytorch_openreg/__init__.py`
20+
- imports torch to get core state initialized.
21+
- imports `._aten_impl` to register our aten op implementations to torch.
22+
- imports `.C` to load our c++ extension that registers more ops, allocator and hooks.
23+
- renames the PrivateUse1 backend and register our python-side module.
24+
- `pytorch_openreg/_aten_impl.py`
25+
- Defines a new `torch.Library` that registers a fallback that will be called whenever a backend kernel for PrivateUse1 is called. It contains the logic to handle all kinds of native functions: computing the output metadata, allocating it, and only calling into the device daemon to perform computation.
26+
- `pytorch_openreg/_device_daemon.py`
27+
- contains the Allocator (responsible for allocating memory on the device side and host side, as int8 buffers).
28+
- contains `Driver`, which acts as the user-process driver and handles the operations that must be performed on the driver (user-process) side.
29+
- contains `Executor`, which acts as the device-process executor and handles the device-side logic.
30+
- `pytorch_openreg/_meta_parser.py` mainly contains utilities to send objects over the wire from the user process to the device process.
31+
- The main class there is `OpenRegTensorMeta` that contains all the metadata sent to the device which should be enough for it to populate the output Tensor.
1832

1933
## Next steps
2034

21-
Currently, the autograd test is disabled because it's missing the getStream implementation.
2235
The main next step would be to:
23-
- Split the daemon into a proper user-process driver vs device-process executor. The main goal would be to better mimick which information is held on the user-process side and when we're actually communicating with the device. In particular current device or stream should be user-process informations.
24-
- Add Stream/Event system. Most likely by having multiple requests queue that go to the device from the driver.
25-
- Add RNG Generator.
2636

27-
Longer term:
2837
- Replace the current `open_registration_extension.cpp` test in PyTorch CI with this.
29-
- Build this module in the CI environment and enable Device-generic tests on this device.

test/cpp_extensions/open_registration_extension/pytorch_openreg/csrc/OpenReg.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,38 @@
44

55
namespace openreg {
66

7+
using openreg_ptr_t = uint64_t;
8+
79
void set_impl_factory(PyObject* factory);
810
py::function get_method(const char* name);
911

12+
static constexpr char kFreeMethod[] = "free";
13+
static constexpr char kHostFreeMethod[] = "hostFree";
14+
15+
template <const char* name>
16+
static void ReportAndDelete(void* ptr) {
17+
if (!ptr || !Py_IsInitialized()) {
18+
return;
19+
}
20+
21+
py::gil_scoped_acquire acquire;
22+
23+
PyObject *type = nullptr, *value = nullptr, *traceback = nullptr;
24+
// Always stash, this will be a no-op if there is no error
25+
PyErr_Fetch(&type, &value, &traceback);
26+
27+
TORCH_CHECK(
28+
get_method(name)(reinterpret_cast<openreg_ptr_t>(ptr)).cast<bool>(),
29+
"Failed to free memory pointer at ",
30+
ptr);
31+
32+
// If that user code raised an error, just print it without raising it
33+
if (PyErr_Occurred()) {
34+
PyErr_Print();
35+
}
36+
37+
// Restore the original error
38+
PyErr_Restore(type, value, traceback);
39+
}
40+
1041
} // namespace openreg

test/cpp_extensions/open_registration_extension/pytorch_openreg/csrc/OpenRegHooks.cpp

Lines changed: 27 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,17 @@
33
#include <ATen/CPUGeneratorImpl.h>
44
#include <ATen/core/GeneratorForPrivateuseone.h>
55
#include <ATen/detail/PrivateUse1HooksInterface.h>
6+
7+
#include <c10/core/Allocator.h>
68
#include <c10/core/Device.h>
79
#include <c10/core/impl/DeviceGuardImplInterface.h>
8-
#include <c10/util/CallOnce.h>
9-
10-
#include <iostream>
1110

1211
namespace openreg {
13-
1412
namespace {
13+
1514
// Python factory function where real implementations can be found
1615
PyObject* py_factory;
1716

18-
using host_ptr_t = uint64_t;
19-
2017
struct HostAllocator final : at::Allocator {
2118
HostAllocator() = default;
2219

@@ -25,35 +22,25 @@ struct HostAllocator final : at::Allocator {
2522
void* data = nullptr;
2623
if (nbytes > 0) {
2724
data = reinterpret_cast<void*>(
28-
get_method("hostMalloc")(nbytes).cast<host_ptr_t>());
25+
get_method("hostMalloc")(nbytes).cast<openreg_ptr_t>());
2926
TORCH_CHECK(data, "Failed to allocator ", nbytes, " bytes on host.");
3027
}
31-
return {data, data, &ReportAndDelete, at::Device(at::kCPU)};
32-
}
33-
34-
static void ReportAndDelete(void* ptr) {
35-
if (!ptr) {
36-
return;
37-
}
38-
py::gil_scoped_acquire acquire;
39-
TORCH_CHECK(
40-
get_method("hostFree")(reinterpret_cast<host_ptr_t>(ptr)).cast<bool>(),
41-
"Failed to free memory pointer at ",
42-
ptr);
28+
return {data, data, &ReportAndDelete<kHostFreeMethod>, at::Device(at::kCPU)};
4329
}
4430

4531
at::DeleterFnPtr raw_deleter() const override {
46-
return &ReportAndDelete;
32+
return &ReportAndDelete<kHostFreeMethod>;
4733
}
4834

4935
void copy_data(void* dest, const void* src, std::size_t count) const final {
5036
py::gil_scoped_acquire acquire;
5137
get_method("hostCopyData")(
52-
reinterpret_cast<host_ptr_t>(dest),
53-
reinterpret_cast<host_ptr_t>(src),
38+
reinterpret_cast<openreg_ptr_t>(dest),
39+
reinterpret_cast<openreg_ptr_t>(src),
5440
count);
5541
}
5642
};
43+
5744
static HostAllocator global_host_alloc;
5845

5946
static c10::DeviceIndex device_count() {
@@ -82,20 +69,8 @@ static at::Generator make_openreg_generator(c10::DeviceIndex device_index) {
8269
// Default, global generators, one per device.
8370
static std::vector<at::Generator> default_generators;
8471

85-
static void initGenerators() {
86-
auto deivce_nums = device_count();
87-
default_generators.resize(deivce_nums);
88-
for (auto i = 0; i < deivce_nums; i++) {
89-
default_generators[i] = make_openreg_generator(i);
90-
default_generators[i].seed();
91-
}
92-
}
93-
94-
// C++ hooks implementation
95-
struct OpenRegHooksArgs : public at::PrivateUse1HooksArgs {};
96-
9772
struct OpenRegHooksInterface : public at::PrivateUse1HooksInterface {
98-
OpenRegHooksInterface(OpenRegHooksArgs) {};
73+
OpenRegHooksInterface() {};
9974
~OpenRegHooksInterface() override = default;
10075

10176
bool hasPrimaryContext(c10::DeviceIndex device_index) const override {
@@ -109,14 +84,22 @@ struct OpenRegHooksInterface : public at::PrivateUse1HooksInterface {
10984

11085
bool isPinnedPtr(const void* data) const override {
11186
py::gil_scoped_acquire acquire;
112-
return get_method("isPinnedPtr")(reinterpret_cast<host_ptr_t>(data))
87+
return get_method("isPinnedPtr")(reinterpret_cast<openreg_ptr_t>(data))
11388
.cast<bool>();
11489
}
11590

11691
const at::Generator& getDefaultGenerator(
11792
c10::DeviceIndex device_index) const override {
118-
static c10::once_flag generator_init_flag;
119-
c10::call_once(generator_init_flag, initGenerators);
93+
static bool flag [[maybe_unused]] = []() {
94+
auto deivce_nums = device_count();
95+
default_generators.resize(deivce_nums);
96+
for (auto i = 0; i < deivce_nums; i++) {
97+
default_generators[i] = make_openreg_generator(i);
98+
default_generators[i].seed();
99+
}
100+
return true;
101+
}();
102+
120103
c10::DeviceIndex idx = device_index;
121104
if (idx == -1) {
122105
idx = current_device_idx();
@@ -131,27 +114,11 @@ struct OpenRegHooksInterface : public at::PrivateUse1HooksInterface {
131114
}
132115
};
133116

134-
int register_hook() {
135-
at::RegisterPrivateUse1HooksInterface(
136-
new OpenRegHooksInterface(OpenRegHooksArgs{}));
137-
return 0;
138-
}
139-
int temp_register_hook = register_hook();
140-
141-
TORCH_DECLARE_REGISTRY(
142-
PrivateUse1HooksRegistry,
143-
OpenRegHooksInterface,
144-
OpenRegHooksArgs);
145-
C10_DEFINE_REGISTRY(
146-
PrivateUse1HooksRegistry,
147-
OpenRegHooksInterface,
148-
OpenRegHooksArgs);
149-
// Using Create function to get PrivateUse1HooksInterface point from
150-
// PrivateUse1HooksRegistry class.
151-
C10_REGISTER_TYPED_CLASS(
152-
PrivateUse1HooksRegistry,
153-
"OpenRegHooks",
154-
OpenRegHooksInterface);
117+
static bool register_hook_flag [[maybe_unused]] = []() {
118+
at::RegisterPrivateUse1HooksInterface(new OpenRegHooksInterface());
119+
120+
return true;
121+
}();
155122

156123
// Device guard registration
157124
struct OpenRegGuardImpl final : public c10::impl::DeviceGuardImplInterface {
@@ -379,4 +346,5 @@ py::function get_method(const char* name) {
379346
auto factory = py::cast<py::function>(py_factory);
380347
return factory(name);
381348
}
349+
382350
} // namespace openreg

0 commit comments

Comments
 (0)