Fuse MatMul and Add into Gemm (#1542)

vloncar · houseroad · commit f85221facef4 · 2018-12-04T10:54:31.000-08:00
* Fuse MatMul and Add into Gemm

* Fix signed & unsigned comparison

* Fix whitespace in optimizer_test

* Typecheck fix

* Add MatMul symbol

* Remove unnecessary print statement

* Remove unnecessary check

* Additional tests for fuse_matmul_add_bias_into_gemm

* Compare graphs instead of nodes with asserts

* Additional shape checks and test fixes

* Minor style fix

* Simpler check of shape compatibility

* Reintroduce more strict shape checks
diff --git a/onnx/common/interned_strings.h b/onnx/common/interned_strings.h
@@ -40,6 +40,7 @@ namespace ONNX_NAMESPACE {
   _(Squeeze)                      \
   _(Undefined)                    \
   _(FusionGroup)                  \
+  _(MatMul)                       \
   _(Gemm)                         \
   _(Tile)                         \
   _(SubConstant)                  \
diff --git a/onnx/examples/optimize_onnx.ipynb b/onnx/examples/optimize_onnx.ipynb
@@ -63,6 +63,7 @@
       "\tfuse_consecutive_reduce_unsqueeze\n",
       "\tfuse_consecutive_squeezes\n",
       "\tfuse_consecutive_transposes\n",
+      "\tfuse_matmul_add_bias_into_gemm\n",
       "\tfuse_pad_into_conv\n",
       "\tfuse_transpose_into_gemm\n",
       "\tlift_lexical_references\n",
diff --git a/onnx/optimizer/pass_registry.h b/onnx/optimizer/pass_registry.h
@@ -15,13 +15,14 @@
 #include "onnx/optimizer/passes/eliminate_unused_initializer.h"
 #include "onnx/optimizer/passes/extract_constant_to_initializer.h"
 #include "onnx/optimizer/passes/fuse_add_bias_into_conv.h"
-#include "onnx/optimizer/passes/fuse_pad_into_conv.h"
 #include "onnx/optimizer/passes/fuse_bn_into_conv.h"
 #include "onnx/optimizer/passes/fuse_consecutive_concats.h"
 #include "onnx/optimizer/passes/fuse_consecutive_log_softmax.h"
 #include "onnx/optimizer/passes/fuse_consecutive_reduce_unsqueeze.h"
 #include "onnx/optimizer/passes/fuse_consecutive_squeezes.h"
 #include "onnx/optimizer/passes/fuse_consecutive_transposes.h"
+#include "onnx/optimizer/passes/fuse_matmul_add_bias_into_gemm.h"
+#include "onnx/optimizer/passes/fuse_pad_into_conv.h"
 #include "onnx/optimizer/passes/fuse_transpose_into_gemm.h"
 #include "onnx/optimizer/passes/lift_lexical_references.h"
 #include "onnx/optimizer/passes/nop.h"
@@ -50,13 +51,14 @@ struct GlobalPassRegistry {
     registerPass<EliminateUnusedInitializer>();
     registerPass<ExtractConstantToInitializer>();
     registerPass<FuseAddBiasIntoConv>();
-    registerPass<FusePadIntoConv>();
     registerPass<FuseBNIntoConv>();
     registerPass<FuseConsecutiveConcats>();
     registerPass<FuseConsecutiveLogSoftmax>();
     registerPass<FuseConsecutiveReduceUnsqueeze>();
     registerPass<FuseConsecutiveSqueezes>();
     registerPass<FuseConsecutiveTransposes>();
+    registerPass<FuseMatMulAddBiasIntoGemm>();
+    registerPass<FusePadIntoConv>();
     registerPass<FuseTransposeIntoGemm>();
     registerPass<LiftLexicalReferences>();
     registerPass<SplitInit>();
diff --git a/onnx/optimizer/passes/fuse_matmul_add_bias_into_gemm.h b/onnx/optimizer/passes/fuse_matmul_add_bias_into_gemm.h
@@ -0,0 +1,107 @@
+// ATTENTION: The code in this file is highly EXPERIMENTAL.
+// Adventurous users should note that the APIs will probably change.
+
+#pragma once
+
+// Before:
+//   Z = MatMul(X, Y)
+//   A = Z + Bias
+// After:
+//   A = Gemm(X, Y, Bias)
+//
+// The pass handles the following cases (X and Y must both be 2D):
+//   case 1: Bias is a 1D tensor and Bias.dim[0] == Z.dim[1]
+//   case 2: Bias is a 2D tensor, Bias.dim[0] is either Z.dim[0] or 1,
+//           and Bias.dim[1] == Z.dim[1]
+
+#include <numeric>
+
+#include "onnx/common/assertions.h"
+#include "onnx/optimizer/pass.h"
+
+namespace ONNX_NAMESPACE {
+namespace optimization {
+
+struct FuseMatMulAddBiasIntoGemm final : public PredicateBasedPass {  // pass: MatMul + Add(bias) -> Gemm
+  explicit FuseMatMulAddBiasIntoGemm()
+      : PredicateBasedPass(
+            PassType::Fuse,
+            PassEfficiency::Complete,
+            PassOptimizationType::Compute) {}
+  std::string getPassName() const override {
+    return "fuse_matmul_add_bias_into_gemm";
+  }
+  bool patternMatchPredicate(Node* node) override {  // match Add whose input 0 comes from a MatMul
+    return node->kind() == kAdd &&
+        node->inputs()[0]->node()->kind() == kMatMul;
+  }
+  bool runTransform(Node* n, Graph& graph, NodeDestroyType& destroy_current)
+      override {
+    // Due to the current broadcasting constraint, MatMul has to be the first
+    // operand of Add. Default to destroying nothing until the fusion succeeds.
+    destroy_current = NodeDestroyType::DestroyZero;
+    auto orig_matmul = n->inputs()[0];
+    auto orig_bias = n->inputs()[1];
+    // Bail out unless the bias node is kConstant or kParam (Const / initializer).
+    if (orig_bias->node()->kind() != kConstant &&
+        orig_bias->node()->kind() != kParam) {
+      return false;
+    }
+    // Bail out if the MatMul output has more than one use (not only this Add).
+    if (orig_matmul->uses().size() > 1) {
+      return false;
+    }
+    auto x_shape = orig_matmul->node()->inputs()[0]->sizes();
+    auto y_shape = orig_matmul->node()->inputs()[1]->sizes();
+    int64_t z_N = -1;
+    int64_t z_M = -1;
+    // z_N (rows of Z) is X.dim[0]; X must be 2D with an integer first dim.
+    if (static_cast<int64_t>(x_shape.size()) == 2 && x_shape[0].is_int) {
+      z_N = x_shape[0].dim;
+    } else {
+      return false;
+    }
+    // z_M (columns of Z) is Y.dim[1]; Y must be 2D with an integer second dim.
+    if (static_cast<int64_t>(y_shape.size()) == 2 && y_shape[1].is_int) {
+      z_M = y_shape[1].dim;
+    } else {
+      return false;
+    }
+    // Bias must be 1D (treated as 1 x M) or 2D, with integer dims throughout.
+    auto bias_shape = orig_bias->sizes();
+    auto bias_dim = static_cast<int64_t>(bias_shape.size());
+    int64_t bias_N = -1;
+    int64_t bias_M = -1;
+    if (bias_dim == 1 && bias_shape[0].is_int) {
+      bias_N = 1;
+      bias_M = bias_shape[0].dim;
+    } else if (bias_dim == 2 && bias_shape[0].is_int && bias_shape[1].is_int) {
+      bias_N = bias_shape[0].dim;
+      bias_M = bias_shape[1].dim;
+    } else {
+      return false;
+    }
+    if ((bias_N != z_N && bias_N != 1) || bias_M != z_M) {  // rows: z_N or 1; columns: exactly z_M
+        return false;
+    }
+    // Create Gemm from MatMul's inputs, with as many outputs as the Add node.
+    Node* gemm = graph.create(kGemm,
+        orig_matmul->node()->inputs(),
+        n->outputs().size());
+    gemm->addInput(n->inputs()[1]);  // the bias is appended after MatMul's inputs
+    for (int i = 0; i < static_cast<int64_t>(gemm->outputs().size()); ++i) {
+      gemm->outputs()[i]->copyMetadata(n->outputs()[i]);  // mirror the Add outputs' metadata
+    }
+    gemm->f_(kalpha, 1.0);  // alpha = beta = 1 and no transposes
+    gemm->f_(kbeta, 1.0);
+    gemm->i_(ktransA, 0);
+    gemm->i_(ktransB, 0);
+    gemm->insertBefore(orig_matmul->node());
+    n->replaceAllUsesWith(gemm);
+    destroy_current = NodeDestroyType::DestroyTwo;  // NOTE(review): presumably drops both Add and MatMul - confirm
+    return true;
+  }
+};
+
+} // namespace optimization
+} // namespace ONNX_NAMESPACE
diff --git a/onnx/test/optimizer_test.py b/onnx/test/optimizer_test.py
@@ -653,6 +653,116 @@ def test_fuse_add_bias_into_conv_squeeze_4d_bias_no_fuse(self):  # type: () -> N
         assert optimized_model.graph.node[0].op_type == 'Conv'
         assert optimized_model.graph.node[1].op_type == 'Add'
 
+    def test_fuse_matmul_add_bias_into_gemm(self):  # type: () -> None
+        matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"])
+        add = helper.make_node("Add", ["Z", "B"], ["A"])  # MatMul output Z is Add's first operand
+        graph = helper.make_graph(
+            [matmul, add],
+            "test",
+            [helper.make_tensor_value_info("X", TensorProto.FLOAT, (32, 10)),
+             helper.make_tensor_value_info("Y", TensorProto.FLOAT, (10, 16)),
+             helper.make_tensor_value_info("B", TensorProto.FLOAT, (16,))],  # 1D bias matching Y's 16 columns
+            [helper.make_tensor_value_info("A", TensorProto.FLOAT, (32, 16))]
+        )
+        optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"])
+
+        assert len(list(optimized_model.graph.node)) == 1  # MatMul and Add replaced by a single node
+        assert optimized_model.graph.node[0].op_type == "Gemm"
+
+    def test_fuse_matmul_add_bias_into_gemm_2d_bias(self):  # type: () -> None
+        matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"])
+        add = helper.make_node("Add", ["Z", "B"], ["A"])  # MatMul output Z is Add's first operand
+        graph = helper.make_graph(
+            [matmul, add],
+            "test",
+            [helper.make_tensor_value_info("X", TensorProto.FLOAT, (32, 10)),
+             helper.make_tensor_value_info("Y", TensorProto.FLOAT, (10, 16)),
+             helper.make_tensor_value_info("B", TensorProto.FLOAT, (1, 16))],  # 2D bias with a single row
+            [helper.make_tensor_value_info("A", TensorProto.FLOAT, (32, 16))]
+        )
+        optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"])
+
+        assert len(list(optimized_model.graph.node)) == 1  # MatMul and Add replaced by a single node
+        assert optimized_model.graph.node[0].op_type == "Gemm"
+
+    def test_fuse_matmul_add_bias_into_gemm_2d_bias_same_shape(self):  # type: () -> None
+        matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"])
+        add = helper.make_node("Add", ["Z", "B"], ["A"])  # MatMul output Z is Add's first operand
+        graph = helper.make_graph(
+            [matmul, add],
+            "test",
+            [helper.make_tensor_value_info("X", TensorProto.FLOAT, (32, 10)),
+             helper.make_tensor_value_info("Y", TensorProto.FLOAT, (10, 16)),
+             helper.make_tensor_value_info("B", TensorProto.FLOAT, (32, 16))],  # bias has the full (32, 16) shape of X @ Y
+            [helper.make_tensor_value_info("A", TensorProto.FLOAT, (32, 16))]
+        )
+        optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"])
+
+        assert len(list(optimized_model.graph.node)) == 1  # MatMul and Add replaced by a single node
+        assert optimized_model.graph.node[0].op_type == "Gemm"
+
+    def test_fuse_matmul_add_bias_into_gemm_2d_bias_bcast_no_fuse(self):  # type: () -> None
+        matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"])
+        add = helper.make_node("Add", ["Z", "B"], ["A"])
+        graph = helper.make_graph(
+            [matmul, add],
+            "test",
+            [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 10)),  # X has one row, so X @ Y is (1, 16)
+             helper.make_tensor_value_info("Y", TensorProto.FLOAT, (10, 16)),
+             helper.make_tensor_value_info("B", TensorProto.FLOAT, (16, 16))],  # bias has 16 rows: neither 1 nor X's row count
+            [helper.make_tensor_value_info("A", TensorProto.FLOAT, (16, 16))]
+        )
+        optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"])
+
+        assert optimized_model.graph == graph  # the pass must leave the graph untouched
+
+    def test_fuse_matmul_add_bias_into_gemm_3d_matmul_no_fuse(self):  # type: () -> None
+        matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"])
+        add = helper.make_node("Add", ["Z", "B"], ["A"])
+        graph = helper.make_graph(
+            [matmul, add],
+            "test",
+            [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3, 4)),  # 3D MatMul operands
+             helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 4, 3)),
+             helper.make_tensor_value_info("B", TensorProto.FLOAT, (3, 3))],
+            [helper.make_tensor_value_info("A", TensorProto.FLOAT, (2, 3, 3))]
+        )
+        optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"])
+
+        assert optimized_model.graph == graph  # the pass must leave the graph untouched
+
+    def test_fuse_matmul_add_bias_into_gemm_3d_bias_no_fuse(self):  # type: () -> None
+        matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"])
+        add = helper.make_node("Add", ["Z", "B"], ["A"])
+        graph = helper.make_graph(
+            [matmul, add],
+            "test",
+            [helper.make_tensor_value_info("X", TensorProto.FLOAT, (32, 10)),
+             helper.make_tensor_value_info("Y", TensorProto.FLOAT, (10, 16)),
+             helper.make_tensor_value_info("B", TensorProto.FLOAT, (4, 1, 16))],  # 3D bias
+            [helper.make_tensor_value_info("A", TensorProto.FLOAT, (32, 16))]  # NOTE(review): a (4, 1, 16) bias would presumably broadcast the sum to 3D - confirm declared shape
+        )
+        optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"])
+
+        assert optimized_model.graph == graph  # the pass must leave the graph untouched
+
+    def test_fuse_matmul_add_bias_into_gemm_multiple_use_no_fuse(self):  # type: () -> None
+        matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"])
+        identity = helper.make_node("Identity", ["Z"], ["A1"])  # second consumer of the MatMul output Z
+        add = helper.make_node("Add", ["Z", "B"], ["A2"])
+        graph = helper.make_graph(
+            [matmul, add, identity],
+            "test",
+            [helper.make_tensor_value_info("X", TensorProto.FLOAT, (32, 10)),
+             helper.make_tensor_value_info("Y", TensorProto.FLOAT, (10, 16)),
+             helper.make_tensor_value_info("B", TensorProto.FLOAT, (1, 16))],
+            [helper.make_tensor_value_info("A1", TensorProto.FLOAT, (32, 16)),
+             helper.make_tensor_value_info("A2", TensorProto.FLOAT, (32, 16))]
+        )
+        optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"])
+
+        assert optimized_model.graph == graph  # the pass must leave the graph untouched
+
     def test_fuse_pad_into_conv(self):  # type: () -> None
         pad = helper.make_node(
             "Pad",