Skip to content

Commit e0408b9

Browse files
mikaylagawarecki authored and pytorchmergebot committed
Merge torch_from_blob and torch_from_blob_v2 into a single shim (#177048)
Mirror the changes in #176440. Mirror the changes on the release/2.11 branch. Keep just the _v2 signature under the name torch_from_blob, and have the C++ wrapper in ops.h adapt simple function-pointer deleters via a trampoline using if constexpr, avoiding heap allocation for that case. Authored with Claude. Pull Request resolved: #177048. Approved by: https://github.com/malfet
1 parent a12a78b commit e0408b9

8 files changed

Lines changed: 113 additions & 240 deletions

File tree

test/cpp_extensions/libtorch_agn_2_12_extension/csrc/my_from_blob_with_lambda_deleter.cpp renamed to test/cpp_extensions/libtorch_agn_2_11_extension/csrc/my_from_blob_with_lambda_deleter.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ static int64_t g_lambda_deleter_call_count = 0;
1414

1515
// Wrapper for from_blob with a capturing-lambda deleter.
1616
// The lambda captures a pointer to the global counter and increments it,
17-
// which exercises the torch_from_blob_v2 code path (deleter + context).
17+
// which exercises the capturing-lambda code path in torch_from_blob.
1818
Tensor my_from_blob_with_lambda_deleter(
1919
int64_t data_ptr,
2020
torch::headeronly::HeaderOnlyArrayRef<int64_t> sizes,
@@ -60,7 +60,7 @@ STABLE_TORCH_LIBRARY_IMPL(
6060
#ifdef LAE_USE_CUDA
6161

6262
// Same as my_from_blob_with_cuda_deleter (from 2.11) but uses a non-capturing
63-
// lambda deleter, exercising the from_blob_v2 code path.
63+
// lambda deleter.
6464
Tensor my_from_blob_with_cuda_lambda_deleter(
6565
int64_t numel,
6666
torch::stable::Device device) {

test/cpp_extensions/libtorch_agn_2_11_extension/libtorch_agn_2_11/ops.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,59 @@ def my_from_blob_with_cuda_deleter(numel: int, device) -> Tensor:
5757
)
5858

5959

60+
def my_from_blob_with_lambda_deleter(data_ptr, sizes, strides, device, dtype) -> Tensor:
61+
"""
62+
Creates a Tensor from existing memory with a capturing-lambda deleter.
63+
64+
The deleter is a capturing lambda that updates a global call count,
65+
exercising the capturing-lambda code path in torch_from_blob.
66+
67+
Args:
68+
data_ptr: int - pointer to the data buffer
69+
sizes: tuple[int] - size of the tensor
70+
strides: tuple[int] - strides of the tensor
71+
device: Device - device on which the tensor resides
72+
dtype: ScalarType - data type of the tensor
73+
74+
Returns: Tensor - tensor wrapping the existing memory
75+
"""
76+
return torch.ops.libtorch_agn_2_11.my_from_blob_with_lambda_deleter.default(
77+
data_ptr, sizes, strides, device, dtype
78+
)
79+
80+
81+
def get_lambda_deleter_call_count() -> int:
82+
"""
83+
Returns the number of times the lambda test deleter has been called.
84+
"""
85+
return torch.ops.libtorch_agn_2_11.get_lambda_deleter_call_count.default()
86+
87+
88+
def reset_lambda_deleter_call_count() -> None:
89+
"""
90+
Resets the lambda deleter call counter to zero.
91+
"""
92+
torch.ops.libtorch_agn_2_11.reset_lambda_deleter_call_count.default()
93+
94+
95+
def my_from_blob_with_cuda_lambda_deleter(numel: int, device) -> Tensor:
96+
"""
97+
Creates a CUDA tensor that owns its memory via cudaMalloc, using a lambda deleter.
98+
99+
Similar to my_from_blob_with_cuda_deleter but uses the capturing-lambda
100+
code path in torch_from_blob.
101+
102+
Args:
103+
numel: int - number of elements in the tensor
104+
device: Device - CUDA device
105+
106+
Returns: Tensor - a 1D float32 tensor of zeros
107+
"""
108+
return torch.ops.libtorch_agn_2_11.my_from_blob_with_cuda_lambda_deleter.default(
109+
numel, device
110+
)
111+
112+
60113
# =============================================================================
61114
# Proxy for inherited ops (from libtorch_agn_2_9 and libtorch_agn_2_10 csrc/)
62115
#

test/cpp_extensions/libtorch_agn_2_12_extension/libtorch_agn_2_12/ops.py

Lines changed: 0 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,4 @@
11
import torch
2-
from torch import Tensor
3-
4-
5-
def my_from_blob_with_lambda_deleter(data_ptr, sizes, strides, device, dtype) -> Tensor:
6-
"""
7-
Creates a Tensor from existing memory with a capturing-lambda deleter.
8-
9-
The lambda deleter captures a pointer to a global counter and increments it,
10-
exercising the torch_from_blob_v2 code path (deleter + context).
11-
12-
Args:
13-
data_ptr: int - pointer to the data buffer
14-
sizes: tuple[int] - size of the tensor
15-
strides: tuple[int] - strides of the tensor
16-
device: Device - device on which the tensor resides
17-
dtype: ScalarType - data type of the tensor
18-
19-
Returns: Tensor - tensor wrapping the existing memory
20-
"""
21-
return torch.ops.libtorch_agn_2_12.my_from_blob_with_lambda_deleter.default(
22-
data_ptr, sizes, strides, device, dtype
23-
)
24-
25-
26-
def get_lambda_deleter_call_count() -> int:
27-
"""
28-
Returns the number of times the lambda deleter has been called.
29-
"""
30-
return torch.ops.libtorch_agn_2_12.get_lambda_deleter_call_count.default()
31-
32-
33-
def reset_lambda_deleter_call_count() -> None:
34-
"""
35-
Resets the lambda deleter call counter to zero.
36-
"""
37-
torch.ops.libtorch_agn_2_12.reset_lambda_deleter_call_count.default()
38-
39-
40-
def my_from_blob_with_cuda_lambda_deleter(numel: int, device) -> Tensor:
41-
"""
42-
Creates a CUDA tensor that owns its memory via cudaMalloc with a lambda deleter.
43-
44-
The tensor's memory is allocated with cudaMalloc and will be freed
45-
with cudaFree when the tensor is destroyed (via from_blob's lambda deleter).
46-
47-
Args:
48-
numel: int - number of elements in the tensor
49-
device: Device - CUDA device
50-
51-
Returns: Tensor - a 1D float32 tensor of zeros
52-
"""
53-
return torch.ops.libtorch_agn_2_12.my_from_blob_with_cuda_lambda_deleter.default(
54-
numel, device
55-
)
562

573

584
# =============================================================================

test/cpp_extensions/test_libtorch_agnostic.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1798,11 +1798,11 @@ def inner():
17981798
curr_mem = torch.cuda.memory_allocated(device)
17991799
self.assertEqual(curr_mem, init_mem)
18001800

1801-
@skipIfTorchVersionLessThan(2, 12)
1801+
@skipIfTorchVersionLessThan(2, 11)
18021802
@skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
18031803
def test_my_from_blob_with_lambda_deleter(self, device):
1804-
"""Test for from_blob with capturing-lambda deleter (2.12 feature)."""
1805-
import libtorch_agn_2_12 as libtorch_agnostic
1804+
"""Test for from_blob with capturing-lambda deleter (2.11 feature)."""
1805+
import libtorch_agn_2_11 as libtorch_agnostic
18061806

18071807
from_blob_fn = libtorch_agnostic.ops.my_from_blob_with_lambda_deleter
18081808
get_count = libtorch_agnostic.ops.get_lambda_deleter_call_count
@@ -1872,10 +1872,10 @@ def test_my_from_blob_with_cuda_deleter_no_leak(self, device):
18721872
self.assertEqual(curr_mem, init_mem)
18731873

18741874
@onlyCUDA
1875-
@skipIfTorchVersionLessThan(2, 12)
1875+
@skipIfTorchVersionLessThan(2, 11)
18761876
def test_my_from_blob_with_cuda_lambda_deleter_no_leak(self, device):
18771877
"""Test that from_blob lambda deleter properly frees cudaMalloc'd memory."""
1878-
import libtorch_agn_2_12 as libtorch_agnostic
1878+
import libtorch_agn_2_11 as libtorch_agnostic
18791879

18801880
from_blob_fn = libtorch_agnostic.ops.my_from_blob_with_cuda_lambda_deleter
18811881

torch/csrc/shim_common.cpp

Lines changed: 0 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -655,49 +655,6 @@ TORCH_DTYPE_IMPL(float4_e2m1fn_x2, Float4_e2m1fn_x2)
655655
#undef TORCH_DTYPE_IMPL
656656

657657
AOTI_TORCH_EXPORT AOTITorchError torch_from_blob(
658-
void* data,
659-
int64_t ndim,
660-
const int64_t* sizes_ptr,
661-
const int64_t* strides_ptr,
662-
int64_t storage_offset,
663-
int32_t dtype,
664-
int32_t device_type,
665-
int32_t device_index,
666-
AtenTensorHandle* ret_new_tensor,
667-
int32_t layout,
668-
const uint8_t* opaque_metadata,
669-
int64_t opaque_metadata_size,
670-
void (*deleter)(void*)) {
671-
AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
672-
c10::IntArrayRef sizes(sizes_ptr, ndim);
673-
c10::IntArrayRef strides(strides_ptr, ndim);
674-
c10::Device device(static_cast<c10::DeviceType>(device_type), device_index);
675-
c10::TensorOptions options = c10::TensorOptions().device(device).dtype(
676-
static_cast<c10::ScalarType>(dtype));
677-
at::Tensor tensor;
678-
if (data != nullptr) {
679-
if (deleter != nullptr) {
680-
tensor = at::for_blob(data, sizes)
681-
.strides(strides)
682-
.storage_offset(storage_offset)
683-
.deleter(deleter)
684-
.options(options)
685-
.make_tensor();
686-
} else {
687-
tensor = at::for_blob(data, sizes)
688-
.strides(strides)
689-
.storage_offset(storage_offset)
690-
.options(options)
691-
.make_tensor();
692-
}
693-
} else {
694-
tensor = at::empty_strided(sizes, strides, options);
695-
}
696-
*ret_new_tensor = torch::aot_inductor::new_tensor_handle(std::move(tensor));
697-
});
698-
}
699-
700-
AOTI_TORCH_EXPORT AOTITorchError torch_from_blob_v2(
701658
void* data,
702659
int64_t ndim,
703660
const int64_t* sizes_ptr,
@@ -721,8 +678,6 @@ AOTI_TORCH_EXPORT AOTITorchError torch_from_blob_v2(
721678
at::Tensor tensor;
722679
if (data != nullptr) {
723680
if (deleter_callback != nullptr) {
724-
// Combine the two-arg C callback and its context into a single-arg
725-
// C++ callable that at::for_blob().deleter() expects.
726681
auto wrapped_deleter = [deleter_callback, deleter_ctx](void* data) {
727682
deleter_callback(data, deleter_ctx);
728683
};

torch/csrc/stable/c/shim.h

Lines changed: 4 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -165,8 +165,9 @@ AOTI_TORCH_EXPORT int32_t torch_dtype_float8_e8m0fnu();
165165
AOTI_TORCH_EXPORT int32_t torch_dtype_float4_e2m1fn_x2();
166166

167167
// Creates a tensor from an existing data blob with an optional deleter.
168-
// The deleter is called with the data pointer when the tensor's storage
169-
// is deallocated.
168+
// The deleter receives both the data pointer and a caller-supplied context
169+
// pointer, which allows passing capturing lambdas across the C ABI boundary
170+
// by heap-allocating the callable and passing it as deleter_ctx.
170171
AOTI_TORCH_EXPORT AOTITorchError torch_from_blob(
171172
void* data,
172173
int64_t ndim,
@@ -180,35 +181,10 @@ AOTI_TORCH_EXPORT AOTITorchError torch_from_blob(
180181
int32_t layout,
181182
const uint8_t* opaque_metadata,
182183
int64_t opaque_metadata_size,
183-
void (*deleter)(void*));
184-
185-
#endif // TORCH_FEATURE_VERSION >= TORCH_VERSION_2_11_0
186-
187-
/**
188-
* The beginning of all shims added in 2.12.0 onwards.
189-
*/
190-
#if TORCH_FEATURE_VERSION >= TORCH_VERSION_2_12_0
191-
192-
// Like torch_from_blob, but accepts a deleter with a context pointer.
193-
// This allows passing capturing lambdas across the C ABI boundary by
194-
// heap-allocating the callable and passing it as deleter_ctx.
195-
AOTI_TORCH_EXPORT AOTITorchError torch_from_blob_v2(
196-
void* data,
197-
int64_t ndim,
198-
const int64_t* sizes_ptr,
199-
const int64_t* strides_ptr,
200-
int64_t storage_offset,
201-
int32_t dtype,
202-
int32_t device_type,
203-
int32_t device_index,
204-
AtenTensorHandle* ret,
205-
int32_t layout,
206-
const uint8_t* opaque_metadata,
207-
int64_t opaque_metadata_size,
208184
void (*deleter)(void* data, void* ctx),
209185
void* deleter_ctx);
210186

211-
#endif // TORCH_FEATURE_VERSION >= TORCH_VERSION_2_12_0
187+
#endif // TORCH_FEATURE_VERSION >= TORCH_VERSION_2_11_0
212188

213189
#ifdef __cplusplus
214190
} // extern "C"

torch/csrc/stable/c/shim_function_versions.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,3 @@ torch_parse_device_string: TORCH_VERSION_2_10_0
2626
torch_dtype_float4_e2m1fn_x2: TORCH_VERSION_2_11_0
2727
torch_dtype_float8_e8m0fnu: TORCH_VERSION_2_11_0
2828
torch_from_blob: TORCH_VERSION_2_11_0
29-
torch_from_blob_v2: TORCH_VERSION_2_12_0

0 commit comments

Comments (0)