
Commit 69eef5a

Aidyn-A authored and pytorchmergebot committed
[CUDA12] set_device change (#94864)
This PR adds a workaround for the CUDA 12 [`cudaSetDevice` change](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g159587909ffa0791bbe4b40187a4c6bb), which now always creates a primary context on the target device. Without the workaround, an operation like

```python
import torch
x = torch.randn(1, device="cuda:1")
```

would create a primary context on device `cuda:1` (where the tensor is created) and also on device `cuda:0`, because the destructor of the CUDA device guard calls `cudaSetDevice(0)`. After this PR, the CUDA device guard no longer calls `cudaSetDevice(0)` if no primary context exists on `cuda:0`.

Pull Request resolved: #94864
Approved by: https://github.com/malfet, https://github.com/atalman, https://github.com/ezyang
1 parent 3fcc5ff commit 69eef5a
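
For intuition, here is a minimal sketch of the workaround's core idea, written against the raw CUDA APIs. The helper names `has_primary_context` and `maybe_restore_device` are hypothetical; the real logic lives in the `c10/cuda` device-guard code that the new linter below explicitly exempts.

```cpp
#include <cuda.h>          // driver API: cuDevicePrimaryCtxGetState
#include <cuda_runtime.h>  // runtime API: cudaGetDevice / cudaSetDevice

// Hypothetical helper: true if `device` already has an active primary
// context. Assumes the driver is initialized, which holds once any
// runtime-API call has been made in the process.
static bool has_primary_context(int device) {
  unsigned int flags = 0;
  int active = 0;
  return cuDevicePrimaryCtxGetState(device, &flags, &active) == CUDA_SUCCESS &&
      active != 0;
}

// What a guard destructor can do under CUDA 12: restore the original device
// only when doing so cannot create a fresh primary context there.
static void maybe_restore_device(int original_device) {
  int current = -1;
  if (cudaGetDevice(&current) != cudaSuccess || current == original_device) {
    return;  // nothing to restore
  }
  if (has_primary_context(original_device)) {
    cudaSetDevice(original_device);  // safe: context already exists
  }
  // Otherwise skip the call: under CUDA 12, cudaSetDevice would eagerly
  // initialize a primary context on `original_device`.
}
```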

26 files changed: 282 additions and 63 deletions

.lintrunner.toml

Lines changed: 31 additions & 0 deletions
@@ -637,6 +637,37 @@ command = [
     '@{{PATHSFILE}}'
 ]
 
+[[linter]]
+code = 'RAWCUDADEVICE'
+include_patterns = [
+    'aten/**',
+    'c10/**',
+    'torch/csrc/**',
+]
+exclude_patterns = [
+    'aten/src/ATen/cuda/CUDAContext.cpp',
+    'aten/src/ATen/cuda/CUDAGeneratorImpl.cpp',
+    'aten/src/ATen/test/**',
+    'c10/core/impl/InlineDeviceGuard.h',
+    'c10/cuda/CUDAFunctions.cpp',
+    'c10/cuda/CUDAGuard.h',
+    'c10/cuda/impl/CUDATest.cpp',
+    'torch/csrc/cuda/nccl.cpp',
+]
+command = [
+    'python3',
+    'tools/linter/adapters/grep_linter.py',
+    '--pattern=cudaSetDevice',
+    '--pattern=cudaGetDevice',
+    '--linter-name=RAWCUDADEVICE',
+    '--error-name=raw CUDA API usage',
+    """--error-description=\
+        This line calls raw CUDA APIs directly; please use c10::cuda wrappers instead.
+    """,
+    '--',
+    '@{{PATHSFILE}}'
+]
+
 [[linter]]
 code = 'ROOT_LOGGING'
 include_patterns = [
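
For illustration, a hypothetical before/after of the pattern this linter flags; `current_device_example` is made up, but the wrapper call matches the migrations in the hunks below.

```cpp
#include <c10/cuda/CUDAException.h>  // C10_CUDA_CHECK
#include <c10/cuda/CUDAFunctions.h>  // c10::cuda::GetDevice

int current_device_example() {
  int device = -1;
  // Before (flagged as "raw CUDA API usage"):
  //   C10_CUDA_CHECK(cudaGetDevice(&device));
  // After (the c10::cuda wrapper the linter asks for):
  C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
  return device;
}
```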

aten/src/ATen/cuda/CuSparseHandlePool.cpp

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ using CuSparsePoolType = DeviceThreadHandlePool<cusparseHandle_t, createCusparse
 
 cusparseHandle_t getCurrentCUDASparseHandle() {
   int device;
-  AT_CUDA_CHECK(cudaGetDevice(&device));
+  AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
 
   // Thread local PoolWindows are lazily-initialized
   // to avoid initialization issues that caused hangs on Windows.
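
The call site above implies that `c10::cuda::GetDevice` mirrors `cudaGetDevice` and returns a `cudaError_t`, which is why `AT_CUDA_CHECK` still wraps it. A plausible shape for the wrapper, offered as an assumption (its real definition is in `c10/cuda/CUDAFunctions.cpp`, which the linter exempts):

```cpp
#include <cuda_runtime.h>

namespace c10 {
namespace cuda {

// Sketch only: a thin pass-through. Routing device queries through one
// wrapper gives a single place to add CUDA-12-aware behavior (such as
// avoiding context-creating cudaSetDevice calls) instead of patching
// every call site.
cudaError_t GetDevice(int* device) {
  return cudaGetDevice(device);
}

} // namespace cuda
} // namespace c10
```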

aten/src/ATen/cuda/CublasHandlePool.cpp

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ at::DataPtr getNewWorkspace() {
 
 cublasHandle_t getCurrentCUDABlasHandle() {
   int device;
-  AT_CUDA_CHECK(cudaGetDevice(&device));
+  AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
 
   // Thread local PoolWindows are lazily-initialized
   // to avoid initialization issues that caused hangs on Windows.

aten/src/ATen/cuda/detail/CUDAHooks.cpp

Lines changed: 2 additions & 1 deletion
@@ -15,6 +15,7 @@
 #include <ATen/native/cuda/CuFFTPlanCache.h>
 #include <c10/util/Exception.h>
 #include <c10/cuda/CUDACachingAllocator.h>
+#include <c10/cuda/CUDAFunctions.h>
 #include <c10/util/irange.h>
 
 #if AT_CUDNN_ENABLED()
@@ -225,7 +226,7 @@ const at::cuda::NVRTC& CUDAHooks::nvrtc() const {
 
 int64_t current_device() {
   int device;
-  cudaError_t err = cudaGetDevice(&device);
+  cudaError_t err = c10::cuda::GetDevice(&device);
   if (err == cudaSuccess) {
     return device;
   }

aten/src/ATen/cudnn/Handle.cpp

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ using CudnnPoolType = at::cuda::DeviceThreadHandlePool<cudnnHandle_t, createCuDN
 
 cudnnHandle_t getCudnnHandle() {
   int device;
-  AT_CUDA_CHECK(cudaGetDevice(&device));
+  AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
 
   // Thread local PoolWindows are lazily-initialized
   // to avoid initialization issues that caused hangs on Windows.

aten/src/ATen/native/cuda/RNN.cu

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ bool allContiguous(at::TensorList tensors) {
 
 void getLaunchConfig(dim3* block, dim3* grid, int64_t numel) {
   int curDevice = -1;
-  cudaGetDevice(&curDevice);
+  c10::cuda::GetDevice(&curDevice);
   *block = cuda::getApplyBlock();
   TORCH_INTERNAL_ASSERT(cuda::getApplyGrid(numel, *grid, curDevice),
                         "Could not get grid size for pointwise apply.");

aten/src/ATen/native/cuda/UniqueCub.cu

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ std::tuple<Tensor, Tensor, Tensor> compute_unique(
       dim3(std::min(static_cast<int64_t>(cuda::getApplyBlock().x), num_inp));
   dim3 grid;
   int curDevice = -1;
-  cudaGetDevice(&curDevice);
+  c10::cuda::GetDevice(&curDevice);
   cuda::getApplyGrid(num_inp, grid, curDevice);
   adjacent_difference_kernel<<<grid, block, 0, stream>>>(
       num_inp, data, inv_loc_ptr);

aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ using CuSolverDnPoolType = DeviceThreadHandlePool<cusolverDnHandle_t, createCuso
 
 cusolverDnHandle_t getCurrentCUDASolverDnHandle() {
   int device;
-  AT_CUDA_CHECK(cudaGetDevice(&device));
+  AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
 
   // Thread local PoolWindows are lazily-initialized
   // to avoid initialization issues that caused hangs on Windows.

aten/src/ATen/native/cudnn/Conv_v8.cpp

Lines changed: 1 addition & 1 deletion
@@ -326,7 +326,7 @@ auto get_generator_sources(const cudnnBackendDescriptorType_t& desc, const Tenso
 
 int64_t get_available_workspace() {
   int device;
-  C10_CUDA_CHECK(cudaGetDevice(&device));
+  C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
   size_t max_block_size = 0;
   c10::cuda::CUDACachingAllocator::cacheInfo(device, &max_block_size);
   return static_cast<int64_t>(max_block_size);

aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu

Lines changed: 3 additions & 3 deletions
@@ -314,7 +314,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, const SparseT
   const dim3 block = cuda::getApplyBlock();
   dim3 grid;
   int curDevice = -1;
-  cudaGetDevice(&curDevice);
+  c10::cuda::GetDevice(&curDevice);
   cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
   if (sparse.dense_dim() == 0) {
     TORCH_CHECK(cuda::getApplyGrid(nnz, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions");
@@ -606,7 +606,7 @@ Tensor _sparse_sum_backward_cuda(const Tensor& grad_, const SparseTensor& input_
   }
   else {
     int curDevice = -1;
-    cudaGetDevice(&curDevice);
+    c10::cuda::GetDevice(&curDevice);
     cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
     at::cuda::ThrustAllocator allocator;
     auto policy = thrust::cuda::par(allocator).on(stream);
@@ -711,7 +711,7 @@ __global__ void search_end_matrix_indices_cuda_kernel(
 // indices to find the end index for each matrix
 void search_end_matrix_indices(int64_t* mat_el_end_indices, int64_t num_matrices, const Tensor& indices_1D) {
   int curDevice = -1;
-  cudaGetDevice(&curDevice);
+  c10::cuda::GetDevice(&curDevice);
   cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
 
   auto indices_1D_ti = getTensorInfo<int64_t, int64_t>(indices_1D);
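
The three hunks above share one pattern: query the current device through the wrapper, then fetch the current stream for that device. Extracted as a standalone sketch (kernel and tensor details omitted):

```cpp
#include <ATen/cuda/CUDAContext.h>   // at::cuda::getCurrentCUDAStream
#include <c10/cuda/CUDAFunctions.h>  // c10::cuda::GetDevice

void launch_on_current_device() {
  int curDevice = -1;
  c10::cuda::GetDevice(&curDevice);  // wrapper replaces raw cudaGetDevice
  cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
  // ... configure grid/block and enqueue kernels on `stream` ...
}
```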
