Skip to content

Commit 91e89c6

Browse files
csummersea authored and facebook-github-bot committed
Use Int8QuantParamsBlob to pass the scale and zeropoint params (#40494)
Summary: Pull Request resolved: #40494 Resubmit the diff because D22124313 (1ec4337) was reverted due to CI test failures Added the int8_gen_quant_params.cc to CMakeList.txt to fix the CI failures Test Plan: buck test caffe2/caffe2/quantization/server: Differential Revision: D22204244 fbshipit-source-id: ff4668f6aaa573ada76d053b8b8cdcafa5f50aa9
1 parent a2d4d9e commit 91e89c6

9 files changed

Lines changed: 86 additions & 59 deletions

caffe2/operators/quantized/int8_fc_op.cc

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ REGISTER_CPU_OPERATOR(Int8FC, int8::Int8FCOp);
1010

1111
using namespace std::placeholders;
1212
OPERATOR_SCHEMA(Int8FC)
13-
.NumInputs(3, 5)
13+
.NumInputs(3, 4)
1414
.NumOutputs(1, 4)
1515
.TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
1616
.CostInferenceFunction(std::bind(CostInferenceForFC, _1, _2, false))
@@ -45,14 +45,9 @@ will throw errors.
4545
.Input(2, "b", "1D blob containing bias vector")
4646
.Input(
4747
3,
48-
"Scale qparam",
49-
"Optional scale quantization param computed on activation histogram data"
50-
"Will overwrite Y_scale argument if specified")
51-
.Input(
52-
4,
53-
"Zero-point qparam",
54-
"Optionsl zero-point quantization param computed on activation data"
55-
"Will overwrite Y_zero_point argument if specified")
48+
"Qparam",
49+
"Optional Qparam blob that constans quant param computed on activation histogram data"
50+
"Will overwrite Y_scale and Y_zero_point argument if specified")
5651
.Output(0, "Y", "2D output tensor");
5752

5853
} // namespace caffe2

caffe2/operators/quantized/int8_quantize_op.cc

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,9 @@ OPERATOR_SCHEMA(Int8Quantize)
1313
.Input(0, "X", "FP32 Tensor X.")
1414
.Input(
1515
1,
16-
"Scale qparam",
17-
"Optional scale quantization param computed on activation histogram data"
18-
"Will overwrite Y_scale argument if specified")
19-
.Input(
20-
2,
21-
"Zero-point qparam",
22-
"Optionsl zero-point quantization param computed on activation data"
23-
"Will overwrite Y_zero_point argument if specified")
16+
"Qparam",
17+
"Optional Qparam blob that constans quant param computed on activation histogram data"
18+
"Will overwrite Y_scale and Y_zero_point argument if specified")
2419
.Output(0, "Y", "Int8 Tensor qX representing X with linear quantization.");
2520

2621
} // namespace caffe2

caffe2/quantization/server/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ list(APPEND Caffe2_CPU_SRCS
3535
"${CMAKE_CURRENT_SOURCE_DIR}/fully_connected_dnnlowp_op.cc"
3636
"${CMAKE_CURRENT_SOURCE_DIR}/fully_connected_fake_lowp_op.cc"
3737
"${CMAKE_CURRENT_SOURCE_DIR}/group_norm_dnnlowp_op.cc"
38+
"${CMAKE_CURRENT_SOURCE_DIR}/int8_gen_quant_params.cc"
3839
"${CMAKE_CURRENT_SOURCE_DIR}/lstm_unit_dnnlowp_op.cc"
3940
"${CMAKE_CURRENT_SOURCE_DIR}/pool_dnnlowp_op.cc"
4041
"${CMAKE_CURRENT_SOURCE_DIR}/quantize_dnnlowp_op.cc"

caffe2/quantization/server/dnnlowp_test_utils.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import numpy as np
66
from caffe2.python import utils, workspace
7+
from caffe2.quantization.server import dnnlowp_pybind11
78
from hypothesis import assume
89

910

@@ -371,7 +372,19 @@ def generate_conv_inputs(
371372

372373

373374
def run_conv_or_fc(
374-
test_case, init_net, net, X, W, b, op_type, engine, order, gc, outputs, scale=None, zero_point=None
375+
test_case,
376+
init_net,
377+
net,
378+
X,
379+
W,
380+
b,
381+
op_type,
382+
engine,
383+
order,
384+
gc,
385+
outputs,
386+
scale=None,
387+
zero_point=None,
375388
):
376389
if order:
377390
# Conv
@@ -390,8 +403,10 @@ def run_conv_or_fc(
390403
test_case.ws.create_blob("W").feed(W, device_option=gc)
391404
test_case.ws.create_blob("b").feed(b, device_option=gc)
392405
if scale is not None and zero_point is not None:
393-
test_case.ws.create_blob("scale").feed(scale, device_option=gc)
394-
test_case.ws.create_blob("zero_point").feed(zero_point, device_option=gc)
406+
with workspace.WorkspaceGuard(test_case.ws):
407+
dnnlowp_pybind11.CreateInt8QuantParamsBlob(
408+
"quant_param", float(scale), int(zero_point)
409+
)
395410

396411
if init_net:
397412
test_case.ws.run(init_net)
@@ -409,8 +424,9 @@ def run_conv_or_fc(
409424
workspace.FeedBlob("W", W)
410425
workspace.FeedBlob("b", b)
411426
if scale is not None and zero_point is not None:
412-
workspace.FeedBlob("scale", scale)
413-
workspace.FeedBlob("zero_point", zero_point)
427+
dnnlowp_pybind11.CreateInt8QuantParamsBlob(
428+
"quant_param", float(scale), int(zero_point)
429+
)
414430

415431
if init_net:
416432
workspace.RunNetOnce(init_net)

caffe2/quantization/server/fully_connected_dnnlowp_op.cc

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "caffe2/core/flags.h"
88
#include "caffe2/core/tensor_int8.h"
99
#include "caffe2/operators/fc_inference.h"
10+
#include "caffe2/quantization/server/int8_gen_quant_params.h"
1011
#include "caffe2/utils/cpuid.h"
1112
#include "fbgemm_pack_matrix_cache.h"
1213
#include "fbgemm_pack_op.h"
@@ -871,17 +872,16 @@ bool FullyConnectedDNNLowPOp<T, ReluFused>::GetQuantizationParameters_() {
871872
#endif
872873

873874
if (!dequantize_output_ && !requantization_param_selected_) {
874-
CAFFE_ENFORCE(InputSize() == 3 || InputSize() == 5);
875-
if (InputSize() == 5) {
876-
CAFFE_ENFORCE(Input(3).template IsType<float>());
877-
CAFFE_ENFORCE(Input(4).template IsType<int>());
878-
879-
const auto& in_3 = Input(3);
880-
CAFFE_ENFORCE_EQ(in_3.numel(), 1);
881-
float in_scale = *(in_3.template data<float>());
882-
const auto& in_4 = Input(4);
883-
CAFFE_ENFORCE_EQ(in_4.numel(), 1);
884-
int in_zero_point = *(in_4.template data<int>());
875+
CAFFE_ENFORCE(InputSize() <= 4);
876+
if (InputSize() == 4) {
877+
const auto* input_qparam_blob =
878+
this->template Input<caffe2::unique_ptr<caffe2::Int8QuantParamsBlob>>(
879+
3)
880+
.get();
881+
CAFFE_ENFORCE(input_qparam_blob);
882+
883+
float in_scale = input_qparam_blob->qparam.scale;
884+
int in_zero_point = input_qparam_blob->qparam.zero_point;
885885

886886
dnnlowp::TensorQuantizationParams out_qparams_overwrite;
887887
out_qparams_overwrite.scale = in_scale;
@@ -964,7 +964,7 @@ REGISTER_CPU_OPERATOR_WITH_ENGINE(
964964

965965
using namespace std::placeholders;
966966
OPERATOR_SCHEMA(Int8FCRelu)
967-
.NumInputs(3, 5)
967+
.NumInputs(3, 4)
968968
.NumOutputs(1)
969969
.TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
970970
.CostInferenceFunction(std::bind(CostInferenceForFC, _1, _2, false));

caffe2/quantization/server/fully_connected_dnnlowp_op_test.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import hypothesis.strategies as st
77
import numpy as np
88
from caffe2.python import core, dyndep, workspace
9-
from caffe2.quantization.server import utils as dnnlowp_utils
9+
from caffe2.quantization.server import dnnlowp_pybind11, utils as dnnlowp_utils
1010
from caffe2.quantization.server.dnnlowp_test_utils import (
1111
avoid_vpmaddubsw_overflow_fc,
1212
check_quantized_results_close,
@@ -176,8 +176,7 @@ def test_dnnlowp_fully_connected_int(
176176
if do_prepack_weight
177177
else ("W_q" if do_quantize_weight else "W"),
178178
"b_q" if do_quantize_weight else "b",
179-
"scale",
180-
"zero_point",
179+
"quant_param",
181180
],
182181
["Y_q" if do_dequantize else "Y"],
183182
dequantize_output=not do_dequantize,
@@ -226,6 +225,7 @@ def test_dnnlowp_fully_connected_int(
226225
ref_output = outputs[0][0]
227226
ref_output_min = 0 if ref_output.size == 0 else ref_output.min()
228227
ref_output_max = 0 if ref_output.size == 0 else ref_output.max()
228+
229229
q_param = dnnlowp_utils.choose_quantization_params(
230230
ref_output_min, ref_output_max, preserve_activation_sparsity
231231
)
@@ -241,8 +241,8 @@ def test_dnnlowp_fully_connected_int(
241241
None,
242242
gc,
243243
outputs,
244-
np.array([q_param.scale]).astype(np.float32),
245-
np.array([q_param.zero_point]).astype(np.int32),
244+
q_param.scale,
245+
q_param.zero_point,
246246
)
247247
else:
248248
run_conv_or_fc(

caffe2/quantization/server/pybind.cc

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,23 @@ PYBIND11_MODULE(dnnlowp_pybind11, m) {
431431
pybind11::arg("quant_scheme_blob_name"),
432432
pybind11::arg("quantization_kind"),
433433
pybind11::arg("preserve_sparsity"));
434+
m.def(
435+
"CreateInt8QuantParamsBlob",
436+
[](std::string quant_params_blob_name, float scale, int zero_point) {
437+
Workspace* gWorkspace = caffe2::python::GetCurrentWorkspace();
438+
CAFFE_ENFORCE(gWorkspace);
439+
auto* quant_params_blob = gWorkspace->GetBlob(quant_params_blob_name);
440+
if (quant_params_blob == nullptr) {
441+
quant_params_blob = gWorkspace->CreateBlob(quant_params_blob_name);
442+
}
443+
auto* quant_params_blob_data =
444+
quant_params_blob->GetMutable<unique_ptr<Int8QuantParamsBlob>>();
445+
quant_params_blob_data->reset(
446+
new Int8QuantParamsBlob(scale, zero_point));
447+
},
448+
pybind11::arg("quant_param_blob_name"),
449+
pybind11::arg("scale"),
450+
pybind11::arg("zero_point"));
434451
m.def(
435452
"ObserveInt8QuantParamsBlob",
436453
[](std::string quant_params_blob_name) {

caffe2/quantization/server/quantize_dnnlowp_op.cc

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#endif
77

88
#include "caffe2/core/tensor_int8.h"
9+
#include "caffe2/quantization/server/int8_gen_quant_params.h"
910
#include "caffe2_dnnlowp_utils.h"
1011
#include "dnnlowp_partition.h"
1112

@@ -29,25 +30,20 @@ bool QuantizeDNNLowPOp<T>::RunOnDevice() {
2930
arguments_parsed_ = true;
3031
}
3132

32-
CAFFE_ENFORCE(InputSize() == 1 || InputSize() == 3);
33+
CAFFE_ENFORCE(InputSize() <= 2);
3334
CAFFE_ENFORCE(Input(0).template IsType<float>());
3435

3536
bool use_input_qparam = false;
3637
float in_scale = 0;
3738
int in_zero_point = 0;
38-
if (InputSize() == 3) {
39+
if (InputSize() == 2) {
3940
use_input_qparam = true;
4041

41-
CAFFE_ENFORCE(Input(1).template IsType<float>());
42-
CAFFE_ENFORCE(Input(2).template IsType<int>());
43-
44-
const auto& in_1 = Input(1);
45-
CAFFE_ENFORCE_EQ(in_1.numel(), 1);
46-
in_scale = *(in_1.template data<float>());
47-
48-
const auto& in_2 = Input(2);
49-
CAFFE_ENFORCE_EQ(in_2.numel(), 1);
50-
in_zero_point = *(in_2.template data<int>());
42+
const auto* input_qparam_blob =
43+
Input<caffe2::unique_ptr<Int8QuantParamsBlob>>(1).get();
44+
CAFFE_ENFORCE(input_qparam_blob);
45+
in_scale = input_qparam_blob->qparam.scale;
46+
in_zero_point = input_qparam_blob->qparam.zero_point;
5147
}
5248

5349
TensorQuantizationParams in_qparams;
@@ -88,7 +84,7 @@ bool QuantizeDNNLowPOp<T>::RunOnDevice() {
8884
}
8985

9086
OPERATOR_SCHEMA(Quantize)
91-
.NumInputs(1, 3)
87+
.NumInputs(1, 2)
9288
.NumOutputs(1)
9389
.IdenticalTypeAndShapeOfInput(0);
9490

caffe2/quantization/server/quantize_dnnlowp_op_test.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import hypothesis.strategies as st
55
import numpy as np
66
from caffe2.python import core, dyndep, workspace
7+
from caffe2.quantization.server import dnnlowp_pybind11
78
from hypothesis import given
89

910

@@ -34,17 +35,23 @@ def test_dnnlowp_quantize(self, size, is_empty, gc, dc):
3435
op_type, ["X"], ["X_q"], engine=engine, device_option=gc
3536
)
3637
net.Proto().op.extend([quantize])
38+
39+
dnnlowp_pybind11.CreateInt8QuantParamsBlob(
40+
"quant_param", float(X_scale), int(X_zero)
41+
)
3742
quantize_2 = core.CreateOperator(
38-
op_type, ["X", "scale", "zero"], ["X_q_2"], engine=engine, device_option=gc
43+
op_type,
44+
["X", "quant_param"],
45+
["X_q_2"],
46+
engine=engine,
47+
device_option=gc,
3948
)
4049
net.Proto().op.extend([quantize_2])
4150

42-
self.ws.create_blob("X").feed(X, device_option=gc)
43-
self.ws.create_blob("scale").feed(np.array([X_scale]).astype(np.float32), device_option=gc)
44-
self.ws.create_blob("zero").feed(np.array([X_zero]).astype(np.int32), device_option=gc)
45-
self.ws.run(net)
46-
X_q = self.ws.blobs["X_q"].fetch()[0]
47-
X_q_2 = self.ws.blobs["X_q_2"].fetch()[0]
51+
workspace.FeedBlob("X", X, device_option=gc)
52+
workspace.RunNetOnce(net)
53+
X_q = workspace.FetchInt8Blob("X_q")[0]
54+
X_q_2 = workspace.FetchInt8Blob("X_q_2")[0]
4855

4956
# Dequantize results and measure quantization error against inputs
5057
X_dq = X_scale * (X_q - X_zero)

0 commit comments

Comments (0)