
Commit d927d58

ezyang authored and facebook-github-bot committed
Revert D20289209: Support RowWiseSparseAdam on GPU
Test Plan: revert-hammer

Differential Revision: D20289209

Original commit changeset: a7a8a21bd18c

fbshipit-source-id: 4a8ae684d099a5499c28b7e65578fc7ab10b248d
1 parent a1eaaea commit d927d58

3 files changed

Lines changed: 3 additions & 196 deletions


caffe2/python/operator_test/adam_test.py

Lines changed: 2 additions & 14 deletions
@@ -262,7 +262,7 @@ def ref_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER,
            epsilon=st.floats(min_value=0.01, max_value=0.99,
                              allow_nan=False, allow_infinity=False),
            data_strategy=st.data(),
-           **hu.gcs)
+           **hu.gcs_cpu_only)
     def test_row_wise_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon,
                                   data_strategy, gc, dc):
         param, mom1, grad = inputs
@@ -321,12 +321,6 @@ def ref_row_wise_sparse(param, mom1, mom2, indices, grad, LR, ITER):
         # Iter lives on the CPU
         input_device_options = {'iter': hu.cpu_do}
 
-        self.assertDeviceChecks(
-            dc, op,
-            [param, mom1, mom2, indices, grad, LR, ITER],
-            [0, 1, 2],
-            input_device_options=input_device_options)
-
         self.assertReferenceChecks(
             gc, op,
             [param, mom1, mom2, indices, grad, LR, ITER],
@@ -344,7 +338,7 @@ def ref_row_wise_sparse(param, mom1, mom2, indices, grad, LR, ITER):
            epsilon=st.floats(min_value=0.01, max_value=0.99,
                              allow_nan=False, allow_infinity=False),
            data_strategy=st.data(),
-           **hu.gcs)
+           **hu.gcs_cpu_only)
     def test_row_wise_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2,
                                               epsilon, data_strategy, gc, dc):
         param, mom1, grad = inputs
@@ -406,12 +400,6 @@ def ref_row_wise_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER,
         # Iter lives on the CPU
         input_device_options = {'iter': hu.cpu_do}
 
-        self.assertDeviceChecks(
-            dc, op,
-            [param, mom1, mom2, indices, grad, LR, ITER],
-            [0, 1, 2, 3],
-            input_device_options=input_device_options)
-
         self.assertReferenceChecks(
             gc, op,
             [param, mom1, mom2, indices, grad, LR, ITER],
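Both hunks above switch the @given device strategies from hu.gcs to hu.gcs_cpu_only and drop the assertDeviceChecks calls, so the row-wise tests stop requesting a CUDA device. As a rough sketch of the difference (the real definitions live in caffe2/python/hypothesis_test_util.py and also account for whether GPU support is built), the two bundles look roughly like this:

# Illustrative sketch, not the actual hypothesis_test_util.py source.
from caffe2.proto import caffe2_pb2
from hypothesis import strategies as st

cpu_do = caffe2_pb2.DeviceOption()  # defaults to CPU
gpu_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA)

# hu.gcs lets hypothesis draw a CUDA device option for gc/dc when available;
# hu.gcs_cpu_only pins both to CPU, matching the reverted operator coverage.
gcs = dict(gc=st.sampled_from([cpu_do, gpu_do]), dc=st.just([cpu_do, gpu_do]))
gcs_cpu_only = dict(gc=st.just(cpu_do), dc=st.just([cpu_do]))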

caffe2/sgd/adam_op.cc

Lines changed: 0 additions & 18 deletions
@@ -53,15 +53,6 @@ OPERATOR_SCHEMA(SparseAdam)
     .NumInputs(7)
     .NumOutputs(3, 4)
     .EnforceInplace({{0, 0}, {1, 1}, {2, 2}})
-    .DeviceInferenceFunction([](const OperatorDef& def) {
-      auto op_device =
-          def.has_device_option() ? def.device_option() : DeviceOption();
-      vector<DeviceOption> in_dev(def.input_size(), op_device);
-      vector<DeviceOption> out_dev(def.output_size(), op_device);
-      // ITER input lives on CPU
-      in_dev[6] = DeviceOption();
-      return std::make_pair(in_dev, out_dev);
-    })
     .SetDoc(R"DOC(
 
 Computes the Adam Update for the sparse case.
@@ -94,15 +85,6 @@ OPERATOR_SCHEMA(RowWiseSparseAdam)
     .NumInputs(7)
     .NumOutputs(3, 4)
     .EnforceInplace({{0, 0}, {1, 1}, {2, 2}})
-    .DeviceInferenceFunction([](const OperatorDef& def) {
-      auto op_device =
-          def.has_device_option() ? def.device_option() : DeviceOption();
-      vector<DeviceOption> in_dev(def.input_size(), op_device);
-      vector<DeviceOption> out_dev(def.output_size(), op_device);
-      // ITER input lives on CPU
-      in_dev[6] = DeviceOption();
-      return std::make_pair(in_dev, out_dev);
-    })
     .SetDoc(R"DOC(
 
 Computes a modified Adam Update for the sparse case.
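The removed DeviceInferenceFunction told Caffe2's device inference that every input and output of SparseAdam / RowWiseSparseAdam follows the operator's own device option except input 6, the iteration counter, which always stays on CPU; with the GPU kernels reverted, that override goes away. A hypothetical construction sketch in caffe2.python (blob names and hyperparameter values are illustrative, not taken from this diff):

# Hypothetical usage sketch; assumes the named blobs were fed beforehand.
from caffe2.python import core
from caffe2.proto import caffe2_pb2

op = core.CreateOperator(
    "RowWiseSparseAdam",
    ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"],  # iter is input 6
    ["param", "mom1", "mom2"],  # updated in place per EnforceInplace
    beta1=0.9,
    beta2=0.999,
    epsilon=1e-8,
    device_option=caffe2_pb2.DeviceOption(),  # CPU; GPU placement is what this revert removes
)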

caffe2/sgd/adam_op_gpu.cu

Lines changed: 1 addition & 164 deletions
@@ -1,4 +1,3 @@
-#include <cub/block/block_reduce.cuh>
 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/sgd/adam_op.h"
@@ -204,102 +203,6 @@ __global__ void SparseAdamOutputGradKernel(
   }
 }
 
-template <typename SIndex>
-__global__ void RowWiseSparseAdamKernel(
-    const int M,
-    const int N,
-    const float beta1,
-    const float beta2,
-    const float epsilon,
-    float* param,
-    float* mom1,
-    float* mom2,
-    const SIndex* indices,
-    const float* grad,
-    const float correction,
-    const float* lr) {
-  typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
-  __shared__ BlockReduce::TempStorage temp_storage;
-  int valid = min(N, CAFFE_CUDA_NUM_THREADS);
-  // in case gridDim is smaller than M
-  for (int i = blockIdx.x; i < M; i += gridDim.x) {
-    const SIndex index = indices[i];
-    float sum_squares = 0.0;
-    __shared__ float row_sum_squares_avg;
-
-    // in case N is bigger than block size which is 512 by default
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
-      const float x_ij = grad[i * N + j];
-      sum_squares += x_ij * x_ij;
-    }
-
-    float reduce_sum_squares =
-        BlockReduce(temp_storage).Sum(sum_squares, valid);
-    if (threadIdx.x == 0) {
-      row_sum_squares_avg = reduce_sum_squares / (float)N;
-      mom2[index] = mom2[index] * beta2 + row_sum_squares_avg * (1.0f - beta2);
-    }
-
-    __syncthreads();
-    // update param
-    float step = correction / (std::sqrt(mom2[index]) + epsilon);
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
-      mom1[index * N + j] =
-          mom1[index * N + j] * beta1 + grad[i * N + j] * (1.0f - beta1);
-      param[index * N + j] += lr[0] * mom1[index * N + j] * step;
-    }
-  }
-}
-
-template <typename SIndex>
-__global__ void RowWiseSparseAdamOutputGradKernel(
-    const int M,
-    const int N,
-    const float beta1,
-    const float beta2,
-    const float epsilon,
-    float* param,
-    float* mom1,
-    float* mom2,
-    float* output_grad,
-    const SIndex* indices,
-    const float* grad,
-    const float correction,
-    const float* lr) {
-  typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
-  __shared__ BlockReduce::TempStorage temp_storage;
-  int valid = min(N, CAFFE_CUDA_NUM_THREADS);
-  // in case gridDim is smaller than M
-  for (int i = blockIdx.x; i < M; i += gridDim.x) {
-    const SIndex index = indices[i];
-    float sum_squares = 0.0;
-    __shared__ float row_sum_squares_avg;
-
-    // in case N is bigger than block size which is 512 by default
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
-      const float x_ij = grad[i * N + j];
-      sum_squares += x_ij * x_ij;
-    }
-
-    float reduce_sum_squares =
-        BlockReduce(temp_storage).Sum(sum_squares, valid);
-    if (threadIdx.x == 0) {
-      row_sum_squares_avg = reduce_sum_squares / (float)N;
-      mom2[index] = mom2[index] * beta2 + row_sum_squares_avg * (1.0f - beta2);
-    }
-
-    __syncthreads();
-    // update param
-    float step = correction / (std::sqrt(mom2[index]) + epsilon);
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
-      mom1[index * N + j] =
-          mom1[index * N + j] * beta1 + grad[i * N + j] * (1.0f - beta1);
-      output_grad[i * N + j] = mom1[index * N + j] * step;
-      param[index * N + j] += lr[0] * output_grad[i * N + j];
-    }
-  }
-}
-
 template <>
 template <typename SIndex>
 bool SparseAdamOp<float, CUDAContext>::DoRunWithType() {
@@ -359,73 +262,7 @@ bool SparseAdamOp<float, CUDAContext>::DoRunWithType() {
   return true;
 }
 
-template <>
-template <typename SIndex>
-bool RowWiseSparseAdamOp<float, CUDAContext>::DoRunWithType() {
-  Output(OUTPUT_PARAM)->ResizeLike(Input(PARAM));
-  Output(OUTPUT_MOMENT_1)->ResizeLike(Input(MOMENT_1));
-  Output(OUTPUT_MOMENT_2)->ResizeLike(Input(MOMENT_2));
-
-  auto N = Input(GRAD).size();
-  if (N == 0) {
-    // empty grad, nothing to do here, not even launching the kernel
-    return true;
-  }
-  const auto iter =
-      OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];
-  const float correction = sqrtf(1.0f - std::pow(beta2_, iter + 1)) /
-      (1.0f - std::pow(beta1_, iter + 1));
-
-  // size of the 1st dimension of the input gradient
-  auto GRAD_M = Input(GRAD).dim32(0);
-  auto GRAD_N = N / GRAD_M;
-
-  if (OutputSize() == 3) {
-    RowWiseSparseAdamKernel<SIndex>
-        <<<std::min(GRAD_M, CAFFE_MAXIMUM_NUM_BLOCKS),
-           CAFFE_CUDA_NUM_THREADS,
-           0,
-           context_.cuda_stream()>>>(
-            GRAD_M,
-            GRAD_N,
-            beta1_,
-            beta2_,
-            epsilon_,
-            Output(OUTPUT_PARAM)->template mutable_data<float>(),
-            Output(OUTPUT_MOMENT_1)->template mutable_data<float>(),
-            Output(OUTPUT_MOMENT_2)->template mutable_data<float>(),
-            Input(INDICES).template data<SIndex>(),
-            Input(GRAD).template data<float>(),
-            correction,
-            Input(LR).template data<float>());
-  } else {
-    Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
-    RowWiseSparseAdamOutputGradKernel<SIndex>
-        <<<std::min(GRAD_M, CAFFE_MAXIMUM_NUM_BLOCKS),
-           CAFFE_CUDA_NUM_THREADS,
-           0,
-           context_.cuda_stream()>>>(
-            GRAD_M,
-            GRAD_N,
-            beta1_,
-            beta2_,
-            epsilon_,
-            Output(OUTPUT_PARAM)->template mutable_data<float>(),
-            Output(OUTPUT_MOMENT_1)->template mutable_data<float>(),
-            Output(OUTPUT_MOMENT_2)->template mutable_data<float>(),
-            Output(OUTPUT_GRAD)->template mutable_data<float>(),
-            Input(INDICES).template data<SIndex>(),
-            Input(GRAD).template data<float>(),
-            correction,
-            Input(LR).template data<float>());
-  }
-
-  return true;
-}
-
 REGISTER_CUDA_OPERATOR(Adam, AdamOp<float, CUDAContext>);
 REGISTER_CUDA_OPERATOR(SparseAdam, SparseAdamOp<float, CUDAContext>);
-REGISTER_CUDA_OPERATOR(
-    RowWiseSparseAdam,
-    RowWiseSparseAdamOp<float, CUDAContext>);
+
 } // namespace caffe2
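For reference, the deleted kernels implement the row-wise flavor of sparse Adam: the second moment is collapsed to one scalar per embedding row (an exponential moving average of the row's mean squared gradient), while the first moment and the parameter update stay element-wise. A NumPy sketch of that update, mirroring the CPU reference in adam_test.py rather than the CUDA code itself (names are illustrative):

import numpy as np

def row_wise_sparse_adam(param, mom1, mom2, indices, grad, lr, iters,
                         beta1=0.9, beta2=0.999, epsilon=1e-8):
    # Bias-correction factor, as in the removed DoRunWithType.
    correction = np.sqrt(1.0 - beta2 ** (iters + 1)) / (1.0 - beta1 ** (iters + 1))
    for i, idx in enumerate(indices):
        g = grad[i]
        # One shared second moment per row: EMA of the row's mean squared gradient.
        mom2[idx] = beta2 * mom2[idx] + (1.0 - beta2) * np.mean(g * g)
        step = correction / (np.sqrt(mom2[idx]) + epsilon)
        # First moment stays element-wise, as in plain Adam.
        mom1[idx] = beta1 * mom1[idx] + (1.0 - beta1) * g
        param[idx] += lr * mom1[idx] * step
    return param, mom1, mom2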
