
Commit d016dbd

Accumulate gradients in SGDSolver and AdaGrad
and a test. Not yet tested with AdaGrad, but it should work.

Conflicts:
	src/caffe/proto/caffe.proto
	src/caffe/solver.cpp
1 parent a677076 commit d016dbd

4 files changed, 183 insertions(+), 3 deletions(-)

include/caffe/solver.hpp (12 additions, 1 deletion)
@@ -53,6 +53,9 @@ class Solver {
   virtual void RestoreSolverState(const SolverState& state) = 0;
   void DisplayOutputBlobs(const int net_id);
 
+  virtual inline void AccumulateGradients() { NOT_IMPLEMENTED; }
+  virtual inline void ResetAccumulateGradients() { NOT_IMPLEMENTED; }
+
   SolverParameter param_;
   int iter_;
   int current_step_;
@@ -88,7 +91,12 @@ class SGDSolver : public Solver<Dtype> {
   // update maintains update related data and is not needed in snapshots.
   // temp maintains other information that might be needed in computation
   // of gradients/updates and is not needed in snapshots
-  vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_;
+  // accum is used to accumulate gradients over multiple forward-backward
+  // passes and is not needed in snapshots
+  vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_, accum_;
+
+  virtual void AccumulateGradients();
+  virtual void ResetAccumulateGradients();
 
   DISABLE_COPY_AND_ASSIGN(SGDSolver);
 };
@@ -104,6 +112,9 @@ class NesterovSolver : public SGDSolver<Dtype> {
  protected:
   virtual void ComputeUpdateValue();
 
+  virtual inline void AccumulateGradients() { NOT_IMPLEMENTED; }
+  virtual inline void ResetAccumulateGradients() { NOT_IMPLEMENTED; }
+
   DISABLE_COPY_AND_ASSIGN(NesterovSolver);
 };
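A note on the design above: the base Solver class declares AccumulateGradients() and ResetAccumulateGradients() as NOT_IMPLEMENTED stubs, SGDSolver overrides them with real implementations (see src/caffe/solver.cpp below), and NesterovSolver re-stubs them because its update rule does not consume accum_. AdaGradSolver, which also derives from SGDSolver, inherits the working implementations, which is why the commit message expects AdaGrad to work even without testing.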

src/caffe/proto/caffe.proto (4 additions, 1 deletion)
@@ -75,7 +75,7 @@ message NetParameter {
 // NOTE
 // Update the next available ID when you add a new SolverParameter field.
 //
-// SolverParameter next available ID: 36 (last added: clip_gradients)
+// SolverParameter next available ID: 37 (last added: accum_grad)
 message SolverParameter {
   //////////////////////////////////////////////////////////////////////////////
   // Specifying the train and test networks
@@ -145,6 +145,9 @@ message SolverParameter {
   // whenever their actual L2 norm is larger.
   optional float clip_gradients = 35 [default = -1];
 
+  // Accumulate gradients. This only works with SGDSolver and AdaGradSolver.
+  optional int32 accum_grad = 36 [default = 1];
+
   optional int32 snapshot = 14 [default = 0]; // The snapshot interval
   optional string snapshot_prefix = 15; // The prefix for the snapshot.
   // whether to snapshot diff in the results or not. Snapshotting diff will help
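As a usage sketch (illustrative only, not part of this commit; the file name and hyperparameter values are made up), a solver definition could cut per-pass memory by halving the net's batch_size and setting accum_grad to 2, keeping the effective batch size unchanged:

    net: "examples/train_val.prototxt"
    base_lr: 0.01
    momentum: 0.9
    lr_policy: "fixed"
    max_iter: 10000
    # Average gradients over 2 forward-backward passes per update.
    accum_grad: 2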

src/caffe/solver.cpp (77 additions, 1 deletion)
@@ -33,6 +33,7 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
       << param.DebugString();
   param_ = param;
   CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
+  CHECK_GE(param_.accum_grad(), 1) << "accum_grad should be positive.";
   if (param_.random_seed() >= 0) {
     Caffe::set_random_seed(param_.random_seed());
   }
@@ -164,6 +165,7 @@ void Solver<Dtype>::Step(int iters) {
   const int start_iter = iter_;
   const int stop_iter = iter_ + iters;
   int average_loss = this->param_.average_loss();
+  const int accum_grad = this->param_.accum_grad();
   vector<Dtype> losses;
   Dtype smoothed_loss = 0;

@@ -175,7 +177,17 @@ void Solver<Dtype>::Step(int iters) {
     const bool display = param_.display() && iter_ % param_.display() == 0;
     net_->set_debug_info(display && param_.debug_info());
-    Dtype loss = net_->ForwardBackward(bottom_vec);
+    Dtype loss = 0;
+    if (accum_grad > 1) {
+      ResetAccumulateGradients();
+      for (int i = 0; i < accum_grad; ++i) {
+        loss += net_->ForwardBackward(bottom_vec);
+        AccumulateGradients();
+      }
+      loss /= accum_grad;
+    } else {
+      loss = net_->ForwardBackward(bottom_vec);
+    }
     if (losses.size() < average_loss) {
       losses.push_back(loss);
       int size = losses.size();
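To see why this is equivalent to one large batch: each AccumulateGradients() call (defined further down) adds diff / accum_grad into accum_, so after k = accum_grad passes accum_ holds the averaged gradient (g_1 + ... + g_k) / k, and the reported loss is averaged the same way. Because Caffe's loss layers normalize by batch size, k passes at batch size b yield, up to floating-point rounding, the same update as a single pass at batch size k * b; this is exactly the equivalence the new test below checks (b = 4, k = 2 versus b = 8, k = 1).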
@@ -430,6 +442,9 @@ void SGDSolver<Dtype>::PreSolve() {
     temp_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(
         net_param->num(), net_param->channels(), net_param->height(),
         net_param->width())));
+    accum_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(
+        net_param->num(), net_param->channels(), net_param->height(),
+        net_param->width())));
   }
 }
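Each accum_ blob is allocated here with the same shape as its learnable parameter, just like history_, update_, and temp_, so the element-wise axpy and copy calls below line up one-to-one with the parameter diffs.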

@@ -458,12 +473,54 @@ void SGDSolver<Dtype>::ClipGradients() {
   }
 }
 
+template <typename Dtype>
+void SGDSolver<Dtype>::AccumulateGradients() {
+  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const int accum_grad = this->param_.accum_grad();
+  if (Caffe::mode() == Caffe::GPU) {
+#ifndef CPU_ONLY
+    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      caffe_gpu_axpy(net_params[param_id]->count(), Dtype(1. / accum_grad),
+          net_params[param_id]->gpu_diff(),
+          accum_[param_id]->mutable_gpu_data());
+    }
+#else
+    NO_GPU;
+#endif
+  } else {
+    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      caffe_axpy(net_params[param_id]->count(), Dtype(1. / accum_grad),
+          net_params[param_id]->cpu_diff(),
+          accum_[param_id]->mutable_cpu_data());
+    }
+  }
+}
+template <typename Dtype>
+void SGDSolver<Dtype>::ResetAccumulateGradients() {
+  if (Caffe::mode() == Caffe::GPU) {
+#ifndef CPU_ONLY
+    for (int param_id = 0; param_id < accum_.size(); ++param_id) {
+      caffe_gpu_set(accum_[param_id]->count(), Dtype(0),
+          accum_[param_id]->mutable_gpu_data());
+    }
+#else
+    NO_GPU;
+#endif
+  } else {
+    for (int param_id = 0; param_id < accum_.size(); ++param_id) {
+      caffe_set(accum_[param_id]->count(), Dtype(0),
+          accum_[param_id]->mutable_cpu_data());
+    }
+  }
+}
+
 template <typename Dtype>
 void SGDSolver<Dtype>::ComputeUpdateValue() {
   const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
   const vector<float>& net_params_lr = this->net_->params_lr();
   const vector<float>& net_params_weight_decay =
       this->net_->params_weight_decay();
+  const int accum_grad = this->param_.accum_grad();
   // get the learning rate
   Dtype rate = GetLearningRate();
   if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
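For reference, caffe_axpy(n, alpha, x, y) computes y = alpha * x + y over n elements (a thin wrapper around BLAS axpy), so each AccumulateGradients() call folds the current diff into accum_ with weight 1 / accum_grad. ResetAccumulateGradients() zeroes accum_ at the start of each outer iteration so that consecutive updates do not mix.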
@@ -477,6 +534,10 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
   case Caffe::CPU:
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
       // Compute the value to history, and then copy them to the blob's diff.
+      if (accum_grad > 1) {
+        caffe_copy(accum_[param_id]->count(), accum_[param_id]->cpu_data(),
+            net_params[param_id]->mutable_cpu_diff());
+      }
       Dtype local_rate = rate * net_params_lr[param_id];
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

@@ -513,6 +574,10 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
 #ifndef CPU_ONLY
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
       // Compute the value to history, and then copy them to the blob's diff.
+      if (accum_grad > 1) {
+        caffe_copy(accum_[param_id]->count(), accum_[param_id]->gpu_data(),
+            net_params[param_id]->mutable_gpu_diff());
+      }
       Dtype local_rate = rate * net_params_lr[param_id];
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
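In both the CPU and GPU branches, when accum_grad > 1 the caffe_copy overwrites the diff left by the last forward-backward pass with the averaged gradient from accum_; from there the existing momentum and weight-decay logic runs unchanged. With the default accum_grad of 1 the copy is skipped and the solver behaves exactly as before.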

@@ -696,6 +761,7 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue() {
   const vector<float>& net_params_lr = this->net_->params_lr();
   const vector<float>& net_params_weight_decay =
       this->net_->params_weight_decay();
+  const int accum_grad = this->param_.accum_grad();
   // get the learning rate
   Dtype rate = this->GetLearningRate();
   Dtype delta = this->param_.delta();
@@ -708,6 +774,11 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue() {
   switch (Caffe::mode()) {
   case Caffe::CPU:
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      if (accum_grad > 1) {
+        caffe_copy(this->accum_[param_id]->count(),
+            this->accum_[param_id]->cpu_data(),
+            net_params[param_id]->mutable_cpu_diff());
+      }
       Dtype local_rate = rate * net_params_lr[param_id];
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

@@ -764,6 +835,11 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue() {
   case Caffe::GPU:
 #ifndef CPU_ONLY
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      if (accum_grad > 1) {
+        caffe_copy(this->accum_[param_id]->count(),
+            this->accum_[param_id]->gpu_data(),
+            net_params[param_id]->mutable_gpu_diff());
+      }
       Dtype local_rate = rate * net_params_lr[param_id];
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

src/caffe/test/test_solver.cpp (90 additions, 0 deletions)
@@ -1,3 +1,5 @@
+#include <boost/format.hpp>
+
 #include <string>
 #include <utility>
 #include <vector>
@@ -6,6 +8,8 @@
 #include "gtest/gtest.h"
 
 #include "caffe/common.hpp"
+#include "caffe/data_layers.hpp"
+#include "caffe/filler.hpp"
 #include "caffe/proto/caffe.pb.h"
 #include "caffe/solver.hpp"

@@ -36,6 +40,49 @@ class SolverTest : public MultiDeviceTest<TypeParam> {
     }
     solver_.reset(new SGDSolver<Dtype>(param));
   }
+  virtual void InitSolverAccumGrad(int batch_size, int accum_grad) {
+    boost::format fmt(
+        "net_param { "
+        "  name: 'TestAccumGrad' "
+        "  layer { "
+        "    name: 'data' "
+        "    type: 'MemoryData' "
+        "    top: 'data' "
+        "    top: 'label' "
+        "    memory_data_param { "
+        "      batch_size: %1% "
+        "      channels: 3 "
+        "      height: 2 "
+        "      width: 2 "
+        "    } "
+        "  } "
+        "  layer { "
+        "    name: 'ip' "
+        "    type: 'InnerProduct' "
+        "    bottom: 'data' "
+        "    top: 'ip' "
+        "    inner_product_param { "
+        "      num_output: 1 "
+        "    } "
+        "  } "
+        "  layer { "
+        "    name: 'loss' "
+        "    type: 'EuclideanLoss' "
+        "    bottom: 'ip' "
+        "    bottom: 'label' "
+        "    top: 'loss' "
+        "  } "
+        "} "
+        "base_lr: 0.01 "
+        "momentum: 0.9 "
+        "weight_decay: 0.004 "
+        "lr_policy: 'fixed' "
+        "display: 100 "
+        "max_iter: 100 "
+        "accum_grad: %2%");
+    fmt % batch_size % accum_grad;
+    this->InitSolverFromProtoString(fmt.str());
+  }
 
   shared_ptr<Solver<Dtype> > solver_;
 };
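The %1% and %2% placeholders are boost::format positional arguments: fmt % batch_size % accum_grad substitutes them in order, and the resulting prototxt string goes through the fixture's existing InitSolverFromProtoString() helper.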
@@ -104,4 +151,47 @@ TYPED_TEST(SolverTest, TestInitTrainTestNets) {
   EXPECT_TRUE(this->solver_->test_nets()[1]->has_layer("accuracy"));
 }
 
+TYPED_TEST(SolverTest, TestSolverGradientAccumulation) {
+  typedef typename TypeParam::Dtype Dtype;
+  // Data preparation
+  const int batch_size = 8;
+  const int step = 8;
+  Blob<Dtype> data(batch_size * step, 3, 2, 2);
+  Blob<Dtype> label(batch_size * step, 1, 1, 1);
+  FillerParameter data_filler_param;
+  data_filler_param.set_std(1);
+  GaussianFiller<Dtype> data_filler(data_filler_param);
+  data_filler.Fill(&data);
+  data_filler.Fill(&label);
+
+  // Run with batch_size=8, accum_grad=1
+  this->InitSolverAccumGrad(batch_size, 1);
+  boost::static_pointer_cast<MemoryDataLayer<Dtype> >(
+      this->solver_->net()->layers()[0])->Reset(
+      data.mutable_cpu_data(), label.mutable_cpu_data(), batch_size * step);
+  this->solver_->Step(step);
+  shared_ptr<Blob<Dtype> > weight1 = this->solver_->net()->params()[0];
+  shared_ptr<Blob<Dtype> > bias1 = this->solver_->net()->params()[1];
+
+  // Run with batch_size=4, accum_grad=2
+  this->InitSolverAccumGrad(batch_size / 2, 2);
+  boost::static_pointer_cast<MemoryDataLayer<Dtype> >(
+      this->solver_->net()->layers()[0])->Reset(
+      data.mutable_cpu_data(), label.mutable_cpu_data(), batch_size * step);
+  this->solver_->Step(step);
+  shared_ptr<Blob<Dtype> > weight2 = this->solver_->net()->params()[0];
+  shared_ptr<Blob<Dtype> > bias2 = this->solver_->net()->params()[1];
+
+  // Check that both settings produce the same parameters.
+  for (int i = 0; i < weight1->count(); ++i) {
+    Dtype value1 = weight1->cpu_data()[i];
+    Dtype value2 = weight2->cpu_data()[i];
+    EXPECT_NEAR(value1, value2, 1e-7);
+  }
+  for (int i = 0; i < bias1->count(); ++i) {
+    Dtype value1 = bias1->cpu_data()[i];
+    Dtype value2 = bias2->cpu_data()[i];
+    EXPECT_NEAR(value1, value2, 1e-7);
+  }
+}
 } // namespace caffe
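The test exploits the fact that MemoryDataLayer::Reset() hands the net a fixed pool of batch_size * step = 64 samples that is consumed in order: eight Step() iterations at batch size 8 see the same samples, in the same order, as eight iterations that each average two batch-4 passes, so the resulting weights and biases should match to within the 1e-7 tolerance.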
