@@ -1,4 +1,3 @@
-#include <cub/block/block_reduce.cuh>
 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/sgd/adam_op.h"
@@ -204,102 +203,6 @@ __global__ void SparseAdamOutputGradKernel(
   }
 }
 
-template <typename SIndex>
-__global__ void RowWiseSparseAdamKernel(
-    const int M,
-    const int N,
-    const float beta1,
-    const float beta2,
-    const float epsilon,
-    float* param,
-    float* mom1,
-    float* mom2,
-    const SIndex* indices,
-    const float* grad,
-    const float correction,
-    const float* lr) {
-  typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
-  __shared__ BlockReduce::TempStorage temp_storage;
-  int valid = min(N, CAFFE_CUDA_NUM_THREADS);
-  // in case gridDim is smaller than M
-  for (int i = blockIdx.x; i < M; i += gridDim.x) {
-    const SIndex index = indices[i];
-    float sum_squares = 0.0;
-    __shared__ float row_sum_squares_avg;
-
-    // in case N is bigger than block size which is 512 by default
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
-      const float x_ij = grad[i * N + j];
-      sum_squares += x_ij * x_ij;
-    }
-
-    float reduce_sum_squares =
-        BlockReduce(temp_storage).Sum(sum_squares, valid);
-    if (threadIdx.x == 0) {
-      row_sum_squares_avg = reduce_sum_squares / (float)N;
-      mom2[index] = mom2[index] * beta2 + row_sum_squares_avg * (1.0f - beta2);
-    }
-
-    __syncthreads();
-    // update param
-    float step = correction / (std::sqrt(mom2[index]) + epsilon);
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
-      mom1[index * N + j] =
-          mom1[index * N + j] * beta1 + grad[i * N + j] * (1.0f - beta1);
-      param[index * N + j] += lr[0] * mom1[index * N + j] * step;
-    }
-  }
-}
-
-template <typename SIndex>
-__global__ void RowWiseSparseAdamOutputGradKernel(
-    const int M,
-    const int N,
-    const float beta1,
-    const float beta2,
-    const float epsilon,
-    float* param,
-    float* mom1,
-    float* mom2,
-    float* output_grad,
-    const SIndex* indices,
-    const float* grad,
-    const float correction,
-    const float* lr) {
-  typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
-  __shared__ BlockReduce::TempStorage temp_storage;
-  int valid = min(N, CAFFE_CUDA_NUM_THREADS);
-  // in case gridDim is smaller than M
-  for (int i = blockIdx.x; i < M; i += gridDim.x) {
-    const SIndex index = indices[i];
-    float sum_squares = 0.0;
-    __shared__ float row_sum_squares_avg;
-
-    // in case N is bigger than block size which is 512 by default
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
-      const float x_ij = grad[i * N + j];
-      sum_squares += x_ij * x_ij;
-    }
-
-    float reduce_sum_squares =
-        BlockReduce(temp_storage).Sum(sum_squares, valid);
-    if (threadIdx.x == 0) {
-      row_sum_squares_avg = reduce_sum_squares / (float)N;
-      mom2[index] = mom2[index] * beta2 + row_sum_squares_avg * (1.0f - beta2);
-    }
-
-    __syncthreads();
-    // update param
-    float step = correction / (std::sqrt(mom2[index]) + epsilon);
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
-      mom1[index * N + j] =
-          mom1[index * N + j] * beta1 + grad[i * N + j] * (1.0f - beta1);
-      output_grad[i * N + j] = mom1[index * N + j] * step;
-      param[index * N + j] += lr[0] * output_grad[i * N + j];
-    }
-  }
-}
-
 template <>
 template <typename SIndex>
 bool SparseAdamOp<float, CUDAContext>::DoRunWithType() {
@@ -359,73 +262,7 @@ bool SparseAdamOp<float, CUDAContext>::DoRunWithType() {
   return true;
 }
 
-template <>
-template <typename SIndex>
-bool RowWiseSparseAdamOp<float, CUDAContext>::DoRunWithType() {
-  Output(OUTPUT_PARAM)->ResizeLike(Input(PARAM));
-  Output(OUTPUT_MOMENT_1)->ResizeLike(Input(MOMENT_1));
-  Output(OUTPUT_MOMENT_2)->ResizeLike(Input(MOMENT_2));
-
-  auto N = Input(GRAD).size();
-  if (N == 0) {
-    // empty grad, nothing to do here, not even launching the kernel
-    return true;
-  }
-  const auto iter =
-      OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];
-  const float correction = sqrtf(1.0f - std::pow(beta2_, iter + 1)) /
-      (1.0f - std::pow(beta1_, iter + 1));
-
-  // size of the 1st dimension of the input gradient
-  auto GRAD_M = Input(GRAD).dim32(0);
-  auto GRAD_N = N / GRAD_M;
-
-  if (OutputSize() == 3) {
-    RowWiseSparseAdamKernel<SIndex>
-        <<<std::min(GRAD_M, CAFFE_MAXIMUM_NUM_BLOCKS),
-           CAFFE_CUDA_NUM_THREADS,
-           0,
-           context_.cuda_stream()>>>(
-            GRAD_M,
-            GRAD_N,
-            beta1_,
-            beta2_,
-            epsilon_,
-            Output(OUTPUT_PARAM)->template mutable_data<float>(),
-            Output(OUTPUT_MOMENT_1)->template mutable_data<float>(),
-            Output(OUTPUT_MOMENT_2)->template mutable_data<float>(),
-            Input(INDICES).template data<SIndex>(),
-            Input(GRAD).template data<float>(),
-            correction,
-            Input(LR).template data<float>());
-  } else {
-    Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
-    RowWiseSparseAdamOutputGradKernel<SIndex>
-        <<<std::min(GRAD_M, CAFFE_MAXIMUM_NUM_BLOCKS),
-           CAFFE_CUDA_NUM_THREADS,
-           0,
-           context_.cuda_stream()>>>(
-            GRAD_M,
-            GRAD_N,
-            beta1_,
-            beta2_,
-            epsilon_,
-            Output(OUTPUT_PARAM)->template mutable_data<float>(),
-            Output(OUTPUT_MOMENT_1)->template mutable_data<float>(),
-            Output(OUTPUT_MOMENT_2)->template mutable_data<float>(),
-            Output(OUTPUT_GRAD)->template mutable_data<float>(),
-            Input(INDICES).template data<SIndex>(),
-            Input(GRAD).template data<float>(),
-            correction,
-            Input(LR).template data<float>());
-  }
-
-  return true;
-}
-
 REGISTER_CUDA_OPERATOR(Adam, AdamOp<float, CUDAContext>);
 REGISTER_CUDA_OPERATOR(SparseAdam, SparseAdamOp<float, CUDAContext>);
-REGISTER_CUDA_OPERATOR(
-    RowWiseSparseAdam,
-    RowWiseSparseAdamOp<float, CUDAContext>);
+
 } // namespace caffe2
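
Note on the removed code: the deleted RowWiseSparseAdam kernels differ from SparseAdam only in that they keep a single second-moment value per parameter row rather than one per element; the BlockReduce over the squared gradient entries computes that per-row average. Reconstructed from the deleted code above (a sketch of what it computed, not a spec), the update applied to parameter row index = indices[i] from gradient slice g_i of length N at iteration t is:

% Row-wise sparse Adam update, reconstructed from the deleted kernels.
% v_row is mom2[index] (one scalar per row), m is mom1, theta is param, g is grad.
\[
\begin{aligned}
v_{\mathrm{row}} &\leftarrow \beta_2\, v_{\mathrm{row}}
  + (1-\beta_2)\,\frac{1}{N}\sum_{j=0}^{N-1} g_{ij}^{2} \\
m_{\mathrm{index},j} &\leftarrow \beta_1\, m_{\mathrm{index},j} + (1-\beta_1)\, g_{ij} \\
\theta_{\mathrm{index},j} &\leftarrow \theta_{\mathrm{index},j}
  + \mathrm{lr}\cdot \frac{\sqrt{1-\beta_2^{\,t+1}}}{1-\beta_1^{\,t+1}}
  \cdot \frac{m_{\mathrm{index},j}}{\sqrt{v_{\mathrm{row}}}+\epsilon}
\end{aligned}
\]

The four-output variant (RowWiseSparseAdamOutputGradKernel) additionally wrote the effective update m / (sqrt(v_row) + epsilon), scaled by the bias correction, into output_grad before applying the learning rate.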