
Commit d016dbd

Accumulate gradients in SGDSolver and AdaGrad
and a test. Not yet tested with AdaGrad, but it should work.

Conflicts:
	src/caffe/proto/caffe.proto
	src/caffe/solver.cpp
1 parent a677076 commit d016dbd

4 files changed, 183 insertions(+), 3 deletions(-)

include/caffe/solver.hpp (12 additions, 1 deletion)
@@ -53,6 +53,9 @@ class Solver {
   virtual void RestoreSolverState(const SolverState& state) = 0;
   void DisplayOutputBlobs(const int net_id);
 
+  virtual inline void AccumulateGradients() { NOT_IMPLEMENTED; }
+  virtual inline void ResetAccumulateGradients() { NOT_IMPLEMENTED; }
+
   SolverParameter param_;
   int iter_;
   int current_step_;
@@ -88,7 +91,12 @@ class SGDSolver : public Solver<Dtype> {
   // update maintains update related data and is not needed in snapshots.
   // temp maintains other information that might be needed in computation
   // of gradients/updates and is not needed in snapshots
-  vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_;
+  // accum is used to accumulate gradients over multiple forward-backward
+  // passes and is not needed in snapshots
+  vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_, accum_;
+
+  virtual void AccumulateGradients();
+  virtual void ResetAccumulateGradients();
 
   DISABLE_COPY_AND_ASSIGN(SGDSolver);
 };
@@ -104,6 +112,9 @@ class NesterovSolver : public SGDSolver<Dtype> {
  protected:
   virtual void ComputeUpdateValue();
 
+  virtual inline void AccumulateGradients() { NOT_IMPLEMENTED; }
+  virtual inline void ResetAccumulateGradients() { NOT_IMPLEMENTED; }
+
   DISABLE_COPY_AND_ASSIGN(NesterovSolver);
 };
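A note on the design above: the base Solver class declares AccumulateGradients() and ResetAccumulateGradients() as NOT_IMPLEMENTED stubs, SGDSolver overrides them with real implementations (see src/caffe/solver.cpp below), and NesterovSolver re-stubs them because its update rule does not consume accum_. AdaGradSolver, which also derives from SGDSolver, inherits the working implementations, which is why the commit message expects AdaGrad to work even without testing.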

src/caffe/proto/caffe.proto (4 additions, 1 deletion)
@@ -75,7 +75,7 @@ message NetParameter {
 // NOTE
 // Update the next available ID when you add a new SolverParameter field.
 //
-// SolverParameter next available ID: 36 (last added: clip_gradients)
+// SolverParameter next available ID: 37 (last added: accum_grad)
 message SolverParameter {
   //////////////////////////////////////////////////////////////////////////////
   // Specifying the train and test networks
@@ -145,6 +145,9 @@ message SolverParameter {
   // whenever their actual L2 norm is larger.
   optional float clip_gradients = 35 [default = -1];
 
+  // Accumulate gradients. This only works with SGDSolver and AdaGradSolver.
+  optional int32 accum_grad = 36 [default = 1];
+
   optional int32 snapshot = 14 [default = 0]; // The snapshot interval
   optional string snapshot_prefix = 15; // The prefix for the snapshot.
   // whether to snapshot diff in the results or not. Snapshotting diff will help
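As a usage sketch (illustrative only, not part of this commit; the file name and hyperparameter values are made up), a solver definition could cut per-pass memory by halving the net's batch_size and setting accum_grad to 2, keeping the effective batch size unchanged:

    net: "examples/train_val.prototxt"
    base_lr: 0.01
    momentum: 0.9
    lr_policy: "fixed"
    max_iter: 10000
    # Average gradients over 2 forward-backward passes per update.
    accum_grad: 2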

src/caffe/solver.cpp (77 additions, 1 deletion)
@@ -33,6 +33,7 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
       << param.DebugString();
   param_ = param;
   CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
+  CHECK_GE(param_.accum_grad(), 1) << "accum_grad should be positive.";
   if (param_.random_seed() >= 0) {
     Caffe::set_random_seed(param_.random_seed());
   }
@@ -164,6 +165,7 @@ void Solver<Dtype>::Step(int iters) {
   const int start_iter = iter_;
   const int stop_iter = iter_ + iters;
   int average_loss = this->param_.average_loss();
+  const int accum_grad = this->param_.accum_grad();
   vector<Dtype> losses;
   Dtype smoothed_loss = 0;

@@ -175,7 +177,17 @@ void Solver<Dtype>::Step(int iters) {
     const bool display = param_.display() && iter_ % param_.display() == 0;
     net_->set_debug_info(display && param_.debug_info());
-    Dtype loss = net_->ForwardBackward(bottom_vec);
+    Dtype loss = 0;
+    if (accum_grad > 1) {
+      ResetAccumulateGradients();
+      for (int i = 0; i < accum_grad; ++i) {
+        loss += net_->ForwardBackward(bottom_vec);
+        AccumulateGradients();
+      }
+      loss /= accum_grad;
+    } else {
+      loss = net_->ForwardBackward(bottom_vec);
+    }
     if (losses.size() < average_loss) {
       losses.push_back(loss);
       int size = losses.size();
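To see why this is equivalent to one large batch: each AccumulateGradients() call (defined further down) adds diff / accum_grad into accum_, so after k = accum_grad passes accum_ holds the averaged gradient (g_1 + ... + g_k) / k, and the reported loss is averaged the same way. Because Caffe's loss layers normalize by batch size, k passes at batch size b yield, up to floating-point rounding, the same update as a single pass at batch size k * b; this is exactly the equivalence the new test below checks (b = 4, k = 2 versus b = 8, k = 1).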
@@ -430,6 +442,9 @@ void SGDSolver<Dtype>::PreSolve() {
     temp_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(
         net_param->num(), net_param->channels(), net_param->height(),
         net_param->width())));
+    accum_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(
+        net_param->num(), net_param->channels(), net_param->height(),
+        net_param->width())));
   }
 }
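Each accum_ blob is allocated here with the same shape as its learnable parameter, just like history_, update_, and temp_, so the element-wise axpy and copy calls below line up one-to-one with the parameter diffs.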

@@ -458,12 +473,54 @@ void SGDSolver<Dtype>::ClipGradients() {
   }
 }
 
+template <typename Dtype>
+void SGDSolver<Dtype>::AccumulateGradients() {
+  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const int accum_grad = this->param_.accum_grad();
+  if (Caffe::mode() == Caffe::GPU) {
+#ifndef CPU_ONLY
+    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      caffe_gpu_axpy(net_params[param_id]->count(), Dtype(1. / accum_grad),
+          net_params[param_id]->gpu_diff(),
+          accum_[param_id]->mutable_gpu_data());
+    }
+#else
+    NO_GPU;
+#endif
+  } else {
+    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      caffe_axpy(net_params[param_id]->count(), Dtype(1. / accum_grad),
+          net_params[param_id]->cpu_diff(),
+          accum_[param_id]->mutable_cpu_data());
+    }
+  }
+}
+template <typename Dtype>
+void SGDSolver<Dtype>::ResetAccumulateGradients() {
+  if (Caffe::mode() == Caffe::GPU) {
+#ifndef CPU_ONLY
+    for (int param_id = 0; param_id < accum_.size(); ++param_id) {
+      caffe_gpu_set(accum_[param_id]->count(), Dtype(0),
+          accum_[param_id]->mutable_gpu_data());
+    }
+#else
+    NO_GPU;
+#endif
+  } else {
+    for (int param_id = 0; param_id < accum_.size(); ++param_id) {
+      caffe_set(accum_[param_id]->count(), Dtype(0),
+          accum_[param_id]->mutable_cpu_data());
+    }
+  }
+}
+
 template <typename Dtype>
 void SGDSolver<Dtype>::ComputeUpdateValue() {
   const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
   const vector<float>& net_params_lr = this->net_->params_lr();
   const vector<float>& net_params_weight_decay =
       this->net_->params_weight_decay();
+  const int accum_grad = this->param_.accum_grad();
   // get the learning rate
   Dtype rate = GetLearningRate();
   if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
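For reference, caffe_axpy(n, alpha, x, y) computes y = alpha * x + y over n elements (a thin wrapper around BLAS axpy), so each AccumulateGradients() call folds the current diff into accum_ with weight 1 / accum_grad. ResetAccumulateGradients() zeroes accum_ at the start of each outer iteration so that consecutive updates do not mix.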
@@ -477,6 +534,10 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
   case Caffe::CPU:
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
       // Compute the value to history, and then copy them to the blob's diff.
+      if (accum_grad > 1) {
+        caffe_copy(accum_[param_id]->count(), accum_[param_id]->cpu_data(),
+            net_params[param_id]->mutable_cpu_diff());
+      }
       Dtype local_rate = rate * net_params_lr[param_id];
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

@@ -513,6 +574,10 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
 #ifndef CPU_ONLY
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
       // Compute the value to history, and then copy them to the blob's diff.
+      if (accum_grad > 1) {
+        caffe_copy(accum_[param_id]->count(), accum_[param_id]->gpu_data(),
+            net_params[param_id]->mutable_gpu_diff());
+      }
       Dtype local_rate = rate * net_params_lr[param_id];
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
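In both the CPU and GPU branches, when accum_grad > 1 the caffe_copy overwrites the diff left by the last forward-backward pass with the averaged gradient from accum_; from there the existing momentum and weight-decay logic runs unchanged. With the default accum_grad of 1 the copy is skipped and the solver behaves exactly as before.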

@@ -696,6 +761,7 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue() {
   const vector<float>& net_params_lr = this->net_->params_lr();
   const vector<float>& net_params_weight_decay =
       this->net_->params_weight_decay();
+  const int accum_grad = this->param_.accum_grad();
   // get the learning rate
   Dtype rate = this->GetLearningRate();
   Dtype delta = this->param_.delta();
@@ -708,6 +774,11 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue() {
   switch (Caffe::mode()) {
   case Caffe::CPU:
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      if (accum_grad > 1) {
+        caffe_copy(this->accum_[param_id]->count(),
+            this->accum_[param_id]->cpu_data(),
+            net_params[param_id]->mutable_cpu_diff());
+      }
       Dtype local_rate = rate * net_params_lr[param_id];
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

@@ -764,6 +835,11 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue() {
   case Caffe::GPU:
 #ifndef CPU_ONLY
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      if (accum_grad > 1) {
+        caffe_copy(this->accum_[param_id]->count(),
+            this->accum_[param_id]->gpu_data(),
+            net_params[param_id]->mutable_gpu_diff());
+      }
       Dtype local_rate = rate * net_params_lr[param_id];
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

src/caffe/test/test_solver.cpp (90 additions, 0 deletions)
@@ -1,3 +1,5 @@
+#include <boost/format.hpp>
+
 #include <string>
 #include <utility>
 #include <vector>
@@ -6,6 +8,8 @@
 #include "gtest/gtest.h"
 
 #include "caffe/common.hpp"
+#include "caffe/data_layers.hpp"
+#include "caffe/filler.hpp"
 #include "caffe/proto/caffe.pb.h"
 #include "caffe/solver.hpp"

@@ -36,6 +40,49 @@ class SolverTest : public MultiDeviceTest<TypeParam> {
     }
     solver_.reset(new SGDSolver<Dtype>(param));
   }
+  virtual void InitSolverAccumGrad(int batch_size, int accum_grad) {
+    boost::format fmt(
+        "net_param { "
+        "  name: 'TestAccumGrad' "
+        "  layer { "
+        "    name: 'data' "
+        "    type: 'MemoryData' "
+        "    top: 'data' "
+        "    top: 'label' "
+        "    memory_data_param { "
+        "      batch_size: %1% "
+        "      channels: 3 "
+        "      height: 2 "
+        "      width: 2 "
+        "    } "
+        "  } "
+        "  layer { "
+        "    name: 'ip' "
+        "    type: 'InnerProduct' "
+        "    bottom: 'data' "
+        "    top: 'ip' "
+        "    inner_product_param { "
+        "      num_output: 1 "
+        "    } "
+        "  } "
+        "  layer { "
+        "    name: 'loss' "
+        "    type: 'EuclideanLoss' "
+        "    bottom: 'ip' "
+        "    bottom: 'label' "
+        "    top: 'loss' "
+        "  } "
+        "} "
+        "base_lr: 0.01 "
+        "momentum: 0.9 "
+        "weight_decay: 0.004 "
+        "lr_policy: 'fixed' "
+        "display: 100 "
+        "max_iter: 100 "
+        "accum_grad: %2%");
+    fmt % batch_size % accum_grad;
+    this->InitSolverFromProtoString(fmt.str());
+  }
 
   shared_ptr<Solver<Dtype> > solver_;
 };
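The %1% and %2% placeholders are boost::format positional arguments: fmt % batch_size % accum_grad substitutes them in order, and the resulting prototxt string goes through the fixture's existing InitSolverFromProtoString() helper.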
@@ -104,4 +151,47 @@ TYPED_TEST(SolverTest, TestInitTrainTestNets) {
   EXPECT_TRUE(this->solver_->test_nets()[1]->has_layer("accuracy"));
 }
 
+TYPED_TEST(SolverTest, TestSolverGradientAccumulation) {
+  typedef typename TypeParam::Dtype Dtype;
+  // Data preparation
+  const int batch_size = 8;
+  const int step = 8;
+  Blob<Dtype> data(batch_size * step, 3, 2, 2);
+  Blob<Dtype> label(batch_size * step, 1, 1, 1);
+  FillerParameter data_filler_param;
+  data_filler_param.set_std(1);
+  GaussianFiller<Dtype> data_filler(data_filler_param);
+  data_filler.Fill(&data);
+  data_filler.Fill(&label);
+
+  // Run with batch_size=8, accum_grad=1
+  this->InitSolverAccumGrad(batch_size, 1);
+  boost::static_pointer_cast<MemoryDataLayer<Dtype> >(
+      this->solver_->net()->layers()[0])->Reset(
+      data.mutable_cpu_data(), label.mutable_cpu_data(), batch_size * step);
+  this->solver_->Step(step);
+  shared_ptr<Blob<Dtype> > weight1 = this->solver_->net()->params()[0];
+  shared_ptr<Blob<Dtype> > bias1 = this->solver_->net()->params()[1];
+
+  // Run with batch_size=4, accum_grad=2
+  this->InitSolverAccumGrad(batch_size / 2, 2);
+  boost::static_pointer_cast<MemoryDataLayer<Dtype> >(
+      this->solver_->net()->layers()[0])->Reset(
+      data.mutable_cpu_data(), label.mutable_cpu_data(), batch_size * step);
+  this->solver_->Step(step);
+  shared_ptr<Blob<Dtype> > weight2 = this->solver_->net()->params()[0];
+  shared_ptr<Blob<Dtype> > bias2 = this->solver_->net()->params()[1];
+
+  // Check that both settings produce the same parameters.
+  for (int i = 0; i < weight1->count(); ++i) {
+    Dtype value1 = weight1->cpu_data()[i];
+    Dtype value2 = weight2->cpu_data()[i];
+    EXPECT_NEAR(value1, value2, 1e-7);
+  }
+  for (int i = 0; i < bias1->count(); ++i) {
+    Dtype value1 = bias1->cpu_data()[i];
+    Dtype value2 = bias2->cpu_data()[i];
+    EXPECT_NEAR(value1, value2, 1e-7);
+  }
+}
 } // namespace caffe
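The test exploits the fact that MemoryDataLayer::Reset() hands the net a fixed pool of batch_size * step = 64 samples that is consumed in order: eight Step() iterations at batch size 8 see the same samples, in the same order, as eight iterations that each average two batch-4 passes, so the resulting weights and biases should match to within the 1e-7 tolerance.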
