Commit 69bfabb

Update on "[wip] Refactor foreach unary ops tests to use OpInfo"
Differential Revision: [D25673712](https://our.internmc.facebook.com/intern/diff/D25673712)

-----

- Updated foreach unary ops tests to use OpInfo

[ghstack-poisoned]
2 parents b938132 + 4d48a34 commit 69bfabb

33 files changed: 761 additions & 238 deletions

.github/workflows/lint.yml

Lines changed: 1 addition & 4 deletions
@@ -218,17 +218,14 @@ jobs:
     runs-on: ubuntu-18.04
     steps:
       - name: Setup Python
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v2
         with:
           python-version: 3.8
           architecture: x64
       - name: Fetch PyTorch
         uses: actions/checkout@v2
         with:
           ref: ${{ github.event.pull_request.head.sha }}
-      - name: Get HEAD commit SHA
-        run: echo ::set-output name=commit-sha::$(git rev-parse HEAD)
-        id: get-commit-sha
       - name: Install dependencies
         run: |
           set -eux

.github/workflows/test_tools.yml

Lines changed: 6 additions & 1 deletion
@@ -19,7 +19,12 @@ jobs:
         uses: actions/checkout@v2
         with:
           ref: ${{ github.event.pull_request.head.sha }}
+          fetch-depth: 0 # deep clone, to allow us to use git log
       - name: Install dependencies
-        run: pip install -r requirements.txt
+        # boto3 version copied from .circleci/docker/common/install_conda.sh
+        run: |
+          set -eux
+          pip install -r requirements.txt
+          pip install boto3==1.16.34
       - name: Run tests
         run: python -m unittest discover -vs tools/test -p 'test_*.py'

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ coverage.xml
 .gradle
 .hypothesis
 .mypy_cache
-.pytorch-test-times
+**/.pytorch-test-times
 */*.pyc
 */*.so*
 */**/__pycache__

CMakeLists.txt

Lines changed: 13 additions & 0 deletions
@@ -295,6 +295,19 @@ option(HAVE_SOVERSION "Whether to add SOVERSION to the shared objects" OFF)
 cmake_dependent_option(
     USE_DEPLOY "Build embedded torch::deploy interpreter" OFF
     "BUILD_PYTHON" OFF)
+cmake_dependent_option(USE_CCACHE "Attempt using CCache to wrap the compilation" ON "UNIX" OFF)
+
+if(USE_CCACHE)
+  find_program(CCACHE_PROGRAM ccache)
+  if(CCACHE_PROGRAM)
+    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+    set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+  else()
+    message(STATUS "Could not find ccache. Consider installing ccache to speed up compilation.")
+  endif()
+endif()
+
 # Since TensorPipe does not support Windows, set it to OFF when WIN32 detected
 # On Windows platform, if user does not install libuv in build conda env and
 # does not set libuv_ROOT environment variable. Set USE_DISTRIBUTED to OFF.

aten/src/ATen/core/aten_interned_strings.h

Lines changed: 2 additions & 0 deletions
@@ -276,6 +276,8 @@ _(aten, cudnn_convolution_transpose_backward) \
 _(aten, cudnn_convolution_transpose_backward_bias) \
 _(aten, cudnn_convolution_transpose_backward_input) \
 _(aten, cudnn_convolution_transpose_backward_weight) \
+_(aten, cudnn_convolution_relu) \
+_(aten, cudnn_convolution_add_relu) \
 _(aten, cudnn_grid_sampler) \
 _(aten, cudnn_grid_sampler_backward) \
 _(aten, cudnn_is_acceptable) \

aten/src/ATen/cudnn/Descriptors.h

Lines changed: 17 additions & 0 deletions
@@ -301,6 +301,23 @@ struct TORCH_CUDA_CPP_API CTCLossDescriptor
 #endif
 };

+struct TORCH_CUDA_CPP_API ActivationDescriptor
+    : public Descriptor<
+          cudnnActivationStruct,
+          &cudnnCreateActivationDescriptor,
+          &cudnnDestroyActivationDescriptor> {
+  void set(cudnnActivationMode_t mode) {
+    AT_ASSERT(
+        mode == CUDNN_ACTIVATION_RELU,
+        "TODO: support more cuDNN activation modes");
+    AT_CUDNN_CHECK(cudnnSetActivationDescriptor(
+        mut_desc(),
+        mode,
+        cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN,
+        std::numeric_limits<double>::max()));
+  }
+};
+
 union Constant
 {
   float f;
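The new ActivationDescriptor follows the RAII Descriptor pattern already used in this header: the base class creates and destroys the underlying cuDNN object, and set() configures it (currently only for ReLU). A minimal usage sketch, an illustration rather than code from this commit:

```cpp
#include <ATen/cudnn/Descriptors.h>

// Hypothetical caller; the fused-convolution code that would consume the raw
// descriptor (e.g. cudnnConvolutionBiasActivationForward) is assumed, not shown here.
void configure_relu_activation() {
  at::native::ActivationDescriptor act;
  act.set(CUDNN_ACTIVATION_RELU);  // anything other than RELU trips the AT_ASSERT above
  cudnnActivationDescriptor_t raw = act.desc();  // raw handle for the cuDNN call
  (void)raw;
}
```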

aten/src/ATen/native/BatchLinearAlgebra.cpp

Lines changed: 3 additions & 3 deletions
@@ -2461,7 +2461,7 @@ struct LapackLstsqHelper {
   }
   self_type& set_ldb(int ldb) { this->ldb = ldb; return *this; }
   self_type& set_work() {
-    lwork = static_cast<int>(real_impl<scalar_t, value_t>(work_opt));
+    lwork = std::max<int>(1, real_impl<scalar_t, value_t>(work_opt));
     work = at::empty({lwork}, scalar_type);
     work_ptr = work.data_ptr<scalar_t>();
     return *this;
@@ -2507,7 +2507,7 @@ struct LapackLstsqHelper {
         break;
       // case LapackLstsqDriverType::Gelsd:
       default:
-        rwork_len = static_cast<int64_t>(rwork_opt);
+        rwork_len = std::max<int64_t>(1, rwork_opt);
     }
     rwork = at::empty({rwork_len}, c10::toValueType(scalar_type));
     rwork_ptr = rwork.data_ptr<value_t>();
@@ -2530,7 +2530,7 @@
   self_type& set_iwork() {
     // handle `iwork` workspace array (relevant only for `?gelsd`)
     if (LapackLstsqDriverType::Gelsd == driver_type) {
-      iwork = at::empty({iwork_opt}, at::kInt);
+      iwork = at::empty({std::max<int>(1, iwork_opt)}, at::kInt);
       iwork_ptr = iwork.data_ptr<int>();
     }
     return *this;
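The common thread in these three changes is clamping LAPACK workspace sizes to at least one element: a workspace query can legitimately report 0 (for example, for empty inputs), but the array handed back to the driver must still be allocatable. A standalone sketch of the pattern, with illustrative names that are not from this file:

```cpp
#include <algorithm>
#include <cstdint>

// Round a queried optimal workspace size up to a whole element count and never
// return less than 1, mirroring the std::max<...>(1, ...) clamps above.
int64_t clamped_workspace_size(double queried_optimal_size) {
  return std::max<int64_t>(1, static_cast<int64_t>(queried_optimal_size));
}
```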

aten/src/ATen/native/cuda/BatchLinearAlgebra.cu

Lines changed: 9 additions & 10 deletions
@@ -2088,15 +2088,15 @@ AT_ERROR("symeig: MAGMA library not found in "

   scalar_t* work;
   magma_int_t* iwork;
-  lwork = magma_int_cast(real_impl<scalar_t, value_t>(wkopt), "work_size");
-  liwork = magma_int_cast(iwkopt, "iwork_size");
+  lwork = magma_int_cast(std::max<int64_t>(1, real_impl<scalar_t, value_t>(wkopt)), "work_size");
+  liwork = magma_int_cast(std::max<int64_t>(1, iwkopt), "iwork_size");
   ALLOCATE_ARRAY(work, scalar_t, lwork);
   ALLOCATE_ARRAY(iwork, magma_int_t, liwork);

   value_t* rwork = nullptr;
   c10::Storage storage_rwork;
   if (isComplexType(at::typeMetaToScalarType(self.dtype()))) {
-    lrwork = magma_int_cast(rwkopt, "rwork_size");
+    lrwork = magma_int_cast(std::max<int64_t>(1, rwkopt), "rwork_size");
     storage_rwork = pin_memory<value_t>(lrwork);
     rwork = static_cast<value_t*>(storage_rwork.data());
   }
@@ -2288,9 +2288,9 @@ AT_ERROR("svd: MAGMA library not found in "
   value_t* rwork = nullptr;

   magma_int_t* iwork;
-  ALLOCATE_ARRAY(iwork, magma_int_t, 8 * mn);
+  ALLOCATE_ARRAY(iwork, magma_int_t, std::max<magma_int_t>(1, 8 * mn));
   if (isComplexType(at::typeMetaToScalarType(self.dtype()))) {
-    auto lrwork = computeLRWorkDim(jobchar, m, n);
+    auto lrwork = std::max<int64_t>(1, computeLRWorkDim(jobchar, m, n));
     storage_rwork = pin_memory<value_t>(lrwork);
     rwork = static_cast<value_t*>(storage_rwork.data());
   }
@@ -2303,7 +2303,7 @@ AT_ERROR("svd: MAGMA library not found in "
   magma_int_t lwork = -1;
   scalar_t wkopt;
   magmaSvd<scalar_t, value_t>(jobz, m, n, self_data, lda, S_data, U_data, lda, VT_data, ldvt, &wkopt, lwork, rwork, iwork, &info);
-  lwork = magma_int_cast(real_impl<scalar_t, value_t>(wkopt), "work_size");
+  lwork = magma_int_cast(std::max<int64_t>(1, real_impl<scalar_t, value_t>(wkopt)), "work_size");
   scalar_t* work;
   ALLOCATE_ARRAY(work, scalar_t, lwork);

@@ -2475,9 +2475,9 @@ Tensor _lu_solve_helper_cuda(const Tensor& self, const Tensor& LU_data, const Te
   TORCH_CHECK(info == 0, "MAGMA lu_solve : invalid argument: ", -info);
   return self_working_copy;
 }
-// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lstsq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 std::tuple<Tensor, Tensor, Tensor> _lstsq_helper_cuda(
     const Tensor& a, const Tensor& b, double cond, c10::optional<std::string> driver_name) {
 #ifndef USE_MAGMA
@@ -2492,8 +2492,8 @@ AT_ERROR("torch.linalg.lstsq: MAGMA library not found in "
   auto ldda = std::max<magma_int_t>(1, m);
   auto lddb = std::max<magma_int_t>(1, std::max(m, n));
   auto nb = magmaGeqrfOptimalBlocksize<scalar_t>(m, n);
-  auto lwork = (m - n + nb) * (nrhs + nb) + nrhs * nb;
-  Tensor hwork = at::empty({static_cast<int64_t>(lwork)}, a.scalar_type());
+  magma_int_t lwork = magma_int_cast(std::max<int64_t>(1, (m - n + nb) * (nrhs + nb) + nrhs * nb), "work_size");
+  Tensor hwork = at::empty({lwork}, a.scalar_type());
   auto* hwork_ptr = hwork.data_ptr<scalar_t>();
   magma_int_t info;

@@ -2512,7 +2512,6 @@ AT_ERROR("torch.linalg.lstsq: MAGMA library not found in "
   return std::make_tuple(b, rank, singular_values);
 #endif
 }
-// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 }} // namespace at::native
aten/src/ATen/native/cudnn/ConvPlaceholders.cpp

Lines changed: 25 additions & 1 deletion
@@ -1,5 +1,5 @@
-#include <ATen/cuda/CUDAConfig.h> // for the definition of AT_CUDNN_ENABLED
 #include <ATen/ATen.h>
+#include <ATen/cuda/CUDAConfig.h> // for the definition of AT_CUDNN_ENABLED
 #include <ATen/native/ConvUtils.h>

 namespace at { namespace native {
@@ -93,6 +93,30 @@ void raw_cudnn_convolution_backward_weight_out(
   AT_ERROR("raw_cudnn_convolution_backward_weight_out: ATen not compiled with cuDNN support");
 }

+Tensor cudnn_convolution_relu(
+    const Tensor& input_t,
+    const Tensor& weight_t,
+    const c10::optional<Tensor>& bias_t,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups) {
+  AT_ERROR("cudnn_convolution_relu: ATen not compiled with cuDNN support");
+}
+
+Tensor cudnn_convolution_add_relu(
+    const Tensor& input_t,
+    const Tensor& weight_t,
+    const Tensor& z_t,
+    const c10::optional<Scalar>& alpha,
+    const c10::optional<Tensor>& bias_t,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups) {
+  AT_ERROR("cudnn_convolution_add_relu: ATen not compiled with cuDNN support");
+}
+
 #endif // AT_CUDNN_ENABLED

 // ---------------------------------------------------------------------

aten/src/ATen/native/cudnn/ConvShared.cpp

Lines changed: 83 additions & 0 deletions
@@ -491,6 +491,89 @@ Tensor cudnn_convolution_transpose_backward_weight(
       padding, stride, dilation, groups, benchmark, deterministic, allow_tf32);
 }

+Tensor cudnn_convolution_relu(
+    const Tensor& input_t,
+    const Tensor& weight_t,
+    const c10::optional<Tensor>& bias_t,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups) {
+  // FuseFrozenConvAddRelu performs some tensor shape checking
+  auto output_t = at::native::empty_cuda(
+      conv_output_size(
+          input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
+      /*dtype=*/input_t.scalar_type(),
+      /*layout=*/c10::nullopt,
+      /*device=*/kCUDA,
+      /*pin_memory=*/c10::nullopt,
+      /*memory_format=*/at::MemoryFormat::Contiguous);
+  if (output_t.numel() == 0) {
+    return output_t;
+  }
+
+  raw_cudnn_convolution_add_relu_out(
+      output_t,
+      input_t,
+      weight_t,
+      output_t, // use output_t as z to satisfy CUDNN API
+      0, // alpha
+      bias_t.has_value() ? bias_t.value()
+                         : zeros({output_t.size(1)}, output_t.options()),
+      stride,
+      padding,
+      dilation,
+      groups,
+      false, // benchmark
+      false, // deterministic
+      true // allow_tf32
+  );
+
+  return output_t;
+}
+
+Tensor cudnn_convolution_add_relu(
+    const Tensor& input_t,
+    const Tensor& weight_t,
+    const Tensor& z_t,
+    const c10::optional<Scalar>& alpha,
+    const c10::optional<Tensor>& bias_t,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups) {
+  // FuseFrozenConvAddRelu performs some tensor shape checking
+  auto output_t = at::native::empty_cuda(
+      conv_output_size(
+          input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
+      /*dtype=*/input_t.scalar_type(),
+      /*layout=*/c10::nullopt,
+      /*device=*/kCUDA,
+      /*pin_memory=*/c10::nullopt,
+      /*memory_format=*/at::MemoryFormat::Contiguous);
+  if (output_t.numel() == 0) {
+    return output_t;
+  }
+
+  raw_cudnn_convolution_add_relu_out(
+      output_t,
+      input_t,
+      weight_t,
+      z_t,
+      alpha.has_value() ? alpha.value().to<float>() : 1.0,
+      bias_t.has_value() ? bias_t.value()
+                         : zeros({output_t.size(1)}, output_t.options()),
+      stride,
+      padding,
+      dilation,
+      groups,
+      false, // benchmark
+      false, // deterministic
+      true // allow_tf32
+  );
+
+  return output_t;
+}
 }}

 #endif // AT_CUDNN_ENABLED
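Both entry points allocate a contiguous CUDA output with the usual convolution output shape, return early when the output is empty, and otherwise dispatch to raw_cudnn_convolution_add_relu_out; cudnn_convolution_relu reuses the output tensor as z with alpha = 0 so the add term contributes nothing. A hedged call-site sketch, assuming these functions are also registered with the dispatcher under the same names (the aten_interned_strings.h change above points that way, but the registration itself is not shown in this excerpt):

```cpp
#include <ATen/ATen.h>

// Sketch only: fuse convolution + bias + ReLU into one cuDNN call on CUDA tensors.
at::Tensor fused_conv_relu_example() {
  auto input  = at::randn({1, 3, 8, 8}, at::kCUDA);
  auto weight = at::randn({4, 3, 3, 3}, at::kCUDA);
  auto bias   = at::randn({4}, at::kCUDA);
  // arguments: input, weight, bias, stride, padding, dilation, groups
  return at::cudnn_convolution_relu(input, weight, bias, {1, 1}, {1, 1}, {1, 1}, 1);
}
```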
