
Commit a0cf556

wanchaol authored and facebook-github-bot committed
[optimizer] refactor SGD to use functional API (#45597)
Summary: Pull Request resolved: #45597
Test Plan: Imported from OSS
Reviewed By: izdeby
Differential Revision: D25932773
Pulled By: wanchaol
fbshipit-source-id: bc5f830d6812f847475b9bdcc67865d9968e3282
1 parent b96a651 commit a0cf556

2 files changed: 64 additions & 18 deletions


torch/optim/functional.py

Lines changed: 37 additions & 1 deletion
@@ -2,7 +2,7 @@
 import math
 import torch
 from torch import Tensor
-from typing import List
+from typing import List, Optional

 # TODO: use foreach API in optim.functional to do all the computation

@@ -96,3 +96,39 @@ def adam(params: List[Tensor],
         step_size = lr / bias_correction1

         param.addcdiv_(exp_avg, denom, value=-step_size)
+
+
+def sgd(params: List[Tensor],
+        d_p_list: List[Tensor],
+        momentum_buffer_list: List[Optional[Tensor]],
+        weight_decay: float,
+        momentum: float,
+        lr: float,
+        dampening: float,
+        nesterov: bool):
+    r"""Functional API that performs SGD algorithm computation.
+
+    See :class:`~torch.optim.SGD` for details.
+    """
+
+    for i, param in enumerate(params):
+
+        d_p = d_p_list[i]
+        if weight_decay != 0:
+            d_p = d_p.add(param, alpha=weight_decay)
+
+        if momentum != 0:
+            buf = momentum_buffer_list[i]
+
+            if buf is None:
+                buf = torch.clone(d_p).detach()
+                momentum_buffer_list[i] = buf
+            else:
+                buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
+
+            if nesterov:
+                d_p = d_p.add(buf, alpha=momentum)
+            else:
+                d_p = buf
+
+        param.add_(d_p, alpha=-lr)
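
For context only (not part of the commit): a minimal sketch of calling the new functional entry point directly on plain tensors, assuming the module is importable as torch.optim.functional. The gradients and hyperparameter values below are illustrative.

import torch
from torch.optim import functional as F

# Two parameters with stand-in gradients (normally produced by a backward pass).
params = [torch.randn(3, requires_grad=True) for _ in range(2)]
d_p_list = [torch.ones_like(p) for p in params]
momentum_buffer_list = [None, None]   # no momentum state yet

# In-place parameter updates must run outside autograd, mirroring Optimizer.step().
with torch.no_grad():
    F.sgd(params,
          d_p_list,
          momentum_buffer_list,
          weight_decay=0.0,
          momentum=0.9,
          lr=0.1,
          dampening=0.0,
          nesterov=False)

# sgd() mutates params in place and fills momentum_buffer_list with the newly
# created buffers; the caller (e.g. torch.optim.SGD) is responsible for writing
# them back into its own state.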

torch/optim/sgd.py

Lines changed: 27 additions & 17 deletions
@@ -1,4 +1,5 @@
 import torch
+from . import functional as F
 from .optimizer import Optimizer, required


@@ -86,29 +87,38 @@ def step(self, closure=None):
                 loss = closure()

         for group in self.param_groups:
+            params_with_grad = []
+            d_p_list = []
+            momentum_buffer_list = []
             weight_decay = group['weight_decay']
             momentum = group['momentum']
             dampening = group['dampening']
             nesterov = group['nesterov']
+            lr = group['lr']

             for p in group['params']:
-                if p.grad is None:
-                    continue
-                d_p = p.grad
-                if weight_decay != 0:
-                    d_p = d_p.add(p, alpha=weight_decay)
-                if momentum != 0:
-                    param_state = self.state[p]
-                    if 'momentum_buffer' not in param_state:
-                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
-                    else:
-                        buf = param_state['momentum_buffer']
-                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
-                    if nesterov:
-                        d_p = d_p.add(buf, alpha=momentum)
-                    else:
-                        d_p = buf
+                if p.grad is not None:
+                    params_with_grad.append(p)
+                    d_p_list.append(p.grad)

-                p.add_(d_p, alpha=-group['lr'])
+                    state = self.state[p]
+                    if 'momentum_buffer' not in state:
+                        momentum_buffer_list.append(None)
+                    else:
+                        momentum_buffer_list.append(state['momentum_buffer'])
+
+            F.sgd(params_with_grad,
+                  d_p_list,
+                  momentum_buffer_list,
+                  weight_decay,
+                  momentum,
+                  lr,
+                  dampening,
+                  nesterov)
+
+            # update momentum_buffers in state
+            for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list):
+                state = self.state[p]
+                state['momentum_buffer'] = momentum_buffer

         return loss
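
Also for context (not part of the commit): the public optimizer interface is unchanged by this refactor. A standard training loop such as the illustrative sketch below should behave the same, with step() now gathering gradients and momentum buffers and delegating the arithmetic to F.sgd.

import torch

model = torch.nn.Linear(10, 1)   # illustrative model, not from the commit
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

x = torch.randn(4, 10)
y = torch.randn(4, 1)

optimizer.zero_grad()
loss = torch.nn.functional.mse_loss(model(x), y)
loss.backward()
optimizer.step()   # collects params/grads/buffers per group, calls F.sgd, writes buffers back

# The momentum buffer created during step() is stored back in optimizer.state.
print(optimizer.state[next(model.parameters())]['momentum_buffer'].shape)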
