@@ -1,4 +1,3 @@
-#include <cub/block/block_reduce.cuh>
 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/sgd/adam_op.h"
@@ -204,102 +203,6 @@ __global__ void SparseAdamOutputGradKernel(
   }
 }
 
-template <typename SIndex>
-__global__ void RowWiseSparseAdamKernel(
-    const int M,
-    const int N,
-    const float beta1,
-    const float beta2,
-    const float epsilon,
-    float* param,
-    float* mom1,
-    float* mom2,
-    const SIndex* indices,
-    const float* grad,
-    const float correction,
-    const float* lr) {
-  typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
-  __shared__ BlockReduce::TempStorage temp_storage;
-  int valid = min(N, CAFFE_CUDA_NUM_THREADS);
-  // in case gridDim is smaller than M
-  for (int i = blockIdx.x; i < M; i += gridDim.x) {
-    const SIndex index = indices[i];
-    float sum_squares = 0.0;
-    __shared__ float row_sum_squares_avg;
-
-    // in case N is bigger than block size which is 512 by default
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
-      const float x_ij = grad[i * N + j];
-      sum_squares += x_ij * x_ij;
-    }
-
-    float reduce_sum_squares =
-        BlockReduce(temp_storage).Sum(sum_squares, valid);
-    if (threadIdx.x == 0) {
-      row_sum_squares_avg = reduce_sum_squares / (float)N;
-      mom2[index] = mom2[index] * beta2 + row_sum_squares_avg * (1.0f - beta2);
-    }
-
-    __syncthreads();
-    // update param
-    float step = correction / (std::sqrt(mom2[index]) + epsilon);
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
-      mom1[index * N + j] =
-          mom1[index * N + j] * beta1 + grad[i * N + j] * (1.0f - beta1);
-      param[index * N + j] += lr[0] * mom1[index * N + j] * step;
-    }
-  }
-}
-
-template <typename SIndex>
-__global__ void RowWiseSparseAdamOutputGradKernel(
-    const int M,
-    const int N,
-    const float beta1,
-    const float beta2,
-    const float epsilon,
-    float* param,
-    float* mom1,
-    float* mom2,
-    float* output_grad,
-    const SIndex* indices,
-    const float* grad,
-    const float correction,
-    const float* lr) {
-  typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
-  __shared__ BlockReduce::TempStorage temp_storage;
-  int valid = min(N, CAFFE_CUDA_NUM_THREADS);
-  // in case gridDim is smaller than M
-  for (int i = blockIdx.x; i < M; i += gridDim.x) {
-    const SIndex index = indices[i];
-    float sum_squares = 0.0;
-    __shared__ float row_sum_squares_avg;
-
-    // in case N is bigger than block size which is 512 by default
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
-      const float x_ij = grad[i * N + j];
-      sum_squares += x_ij * x_ij;
-    }
-
-    float reduce_sum_squares =
-        BlockReduce(temp_storage).Sum(sum_squares, valid);
-    if (threadIdx.x == 0) {
-      row_sum_squares_avg = reduce_sum_squares / (float)N;
-      mom2[index] = mom2[index] * beta2 + row_sum_squares_avg * (1.0f - beta2);
-    }
-
-    __syncthreads();
-    // update param
-    float step = correction / (std::sqrt(mom2[index]) + epsilon);
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
-      mom1[index * N + j] =
-          mom1[index * N + j] * beta1 + grad[i * N + j] * (1.0f - beta1);
-      output_grad[i * N + j] = mom1[index * N + j] * step;
-      param[index * N + j] += lr[0] * output_grad[i * N + j];
-    }
-  }
-}
-
 template <>
 template <typename SIndex>
 bool SparseAdamOp<float, CUDAContext>::DoRunWithType() {
@@ -359,73 +262,7 @@ bool SparseAdamOp<float, CUDAContext>::DoRunWithType() {
   return true;
 }
 
-template <>
-template <typename SIndex>
-bool RowWiseSparseAdamOp<float, CUDAContext>::DoRunWithType() {
-  Output(OUTPUT_PARAM)->ResizeLike(Input(PARAM));
-  Output(OUTPUT_MOMENT_1)->ResizeLike(Input(MOMENT_1));
-  Output(OUTPUT_MOMENT_2)->ResizeLike(Input(MOMENT_2));
-
-  auto N = Input(GRAD).size();
-  if (N == 0) {
-    // empty grad, nothing to do here, not even launching the kernel
-    return true;
-  }
-  const auto iter =
-      OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];
-  const float correction = sqrtf(1.0f - std::pow(beta2_, iter + 1)) /
-      (1.0f - std::pow(beta1_, iter + 1));
-
-  // size of the 1st dimension of the input gradient
-  auto GRAD_M = Input(GRAD).dim32(0);
-  auto GRAD_N = N / GRAD_M;
-
-  if (OutputSize() == 3) {
-    RowWiseSparseAdamKernel<SIndex>
-        <<<std::min(GRAD_M, CAFFE_MAXIMUM_NUM_BLOCKS),
-           CAFFE_CUDA_NUM_THREADS,
-           0,
-           context_.cuda_stream()>>>(
-            GRAD_M,
-            GRAD_N,
-            beta1_,
-            beta2_,
-            epsilon_,
-            Output(OUTPUT_PARAM)->template mutable_data<float>(),
-            Output(OUTPUT_MOMENT_1)->template mutable_data<float>(),
-            Output(OUTPUT_MOMENT_2)->template mutable_data<float>(),
-            Input(INDICES).template data<SIndex>(),
-            Input(GRAD).template data<float>(),
-            correction,
-            Input(LR).template data<float>());
-  } else {
-    Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
-    RowWiseSparseAdamOutputGradKernel<SIndex>
-        <<<std::min(GRAD_M, CAFFE_MAXIMUM_NUM_BLOCKS),
-           CAFFE_CUDA_NUM_THREADS,
-           0,
-           context_.cuda_stream()>>>(
-            GRAD_M,
-            GRAD_N,
-            beta1_,
-            beta2_,
-            epsilon_,
-            Output(OUTPUT_PARAM)->template mutable_data<float>(),
-            Output(OUTPUT_MOMENT_1)->template mutable_data<float>(),
-            Output(OUTPUT_MOMENT_2)->template mutable_data<float>(),
-            Output(OUTPUT_GRAD)->template mutable_data<float>(),
-            Input(INDICES).template data<SIndex>(),
-            Input(GRAD).template data<float>(),
-            correction,
-            Input(LR).template data<float>());
-  }
-
-  return true;
-}
-
 REGISTER_CUDA_OPERATOR(Adam, AdamOp<float, CUDAContext>);
 REGISTER_CUDA_OPERATOR(SparseAdam, SparseAdamOp<float, CUDAContext>);
-REGISTER_CUDA_OPERATOR(
-    RowWiseSparseAdam,
-    RowWiseSparseAdamOp<float, CUDAContext>);
+
 } // namespace caffe2
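
Note on the removed code: the deleted RowWiseSparseAdam kernels differ from SparseAdam only in that they keep a single second-moment value per parameter row rather than one per element; the BlockReduce over the squared gradient entries computes that per-row average. Reconstructed from the deleted code above (a sketch of what it computed, not a spec), the update applied to parameter row index = indices[i] from gradient slice g_i of length N at iteration t is:

% Row-wise sparse Adam update, reconstructed from the deleted kernels.
% v_row is mom2[index] (one scalar per row), m is mom1, theta is param, g is grad.
\[
\begin{aligned}
v_{\mathrm{row}} &\leftarrow \beta_2\, v_{\mathrm{row}}
  + (1-\beta_2)\,\frac{1}{N}\sum_{j=0}^{N-1} g_{ij}^{2} \\
m_{\mathrm{index},j} &\leftarrow \beta_1\, m_{\mathrm{index},j} + (1-\beta_1)\, g_{ij} \\
\theta_{\mathrm{index},j} &\leftarrow \theta_{\mathrm{index},j}
  + \mathrm{lr}\cdot \frac{\sqrt{1-\beta_2^{\,t+1}}}{1-\beta_1^{\,t+1}}
  \cdot \frac{m_{\mathrm{index},j}}{\sqrt{v_{\mathrm{row}}}+\epsilon}
\end{aligned}
\]

The four-output variant (RowWiseSparseAdamOutputGradKernel) additionally wrote the effective update m / (sqrt(v_row) + epsilon), scaled by the bias correction, into output_grad before applying the learning rate.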