Skip to content

Commit 91e89c6

Browse files
csummersea authored and facebook-github-bot committed
Use Int8QuantParamsBlob to pass the scale and zeropoint params (#40494)
Summary: Pull Request resolved: #40494 Resubmit the diff because D22124313 (1ec4337) was reverted due to CI test failures Added the int8_gen_quant_params.cc to CMakeList.txt to fix the CI failures Test Plan: buck test caffe2/caffe2/quantization/server: Differential Revision: D22204244 fbshipit-source-id: ff4668f6aaa573ada76d053b8b8cdcafa5f50aa9
1 parent a2d4d9e commit 91e89c6

9 files changed

Lines changed: 86 additions & 59 deletions

caffe2/operators/quantized/int8_fc_op.cc

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ REGISTER_CPU_OPERATOR(Int8FC, int8::Int8FCOp);
1010

1111
using namespace std::placeholders;
1212
OPERATOR_SCHEMA(Int8FC)
13-
.NumInputs(3, 5)
13+
.NumInputs(3, 4)
1414
.NumOutputs(1, 4)
1515
.TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
1616
.CostInferenceFunction(std::bind(CostInferenceForFC, _1, _2, false))
@@ -45,14 +45,9 @@ will throw errors.
4545
.Input(2, "b", "1D blob containing bias vector")
4646
.Input(
4747
3,
48-
"Scale qparam",
49-
"Optional scale quantization param computed on activation histogram data"
50-
"Will overwrite Y_scale argument if specified")
51-
.Input(
52-
4,
53-
"Zero-point qparam",
54-
"Optionsl zero-point quantization param computed on activation data"
55-
"Will overwrite Y_zero_point argument if specified")
48+
"Qparam",
49+
"Optional Qparam blob that constans quant param computed on activation histogram data"
50+
"Will overwrite Y_scale and Y_zero_point argument if specified")
5651
.Output(0, "Y", "2D output tensor");
5752

5853
} // namespace caffe2

caffe2/operators/quantized/int8_quantize_op.cc

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,9 @@ OPERATOR_SCHEMA(Int8Quantize)
1313
.Input(0, "X", "FP32 Tensor X.")
1414
.Input(
1515
1,
16-
"Scale qparam",
17-
"Optional scale quantization param computed on activation histogram data"
18-
"Will overwrite Y_scale argument if specified")
19-
.Input(
20-
2,
21-
"Zero-point qparam",
22-
"Optionsl zero-point quantization param computed on activation data"
23-
"Will overwrite Y_zero_point argument if specified")
16+
"Qparam",
17+
"Optional Qparam blob that constans quant param computed on activation histogram data"
18+
"Will overwrite Y_scale and Y_zero_point argument if specified")
2419
.Output(0, "Y", "Int8 Tensor qX representing X with linear quantization.");
2520

2621
} // namespace caffe2

caffe2/quantization/server/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ list(APPEND Caffe2_CPU_SRCS
3535
"${CMAKE_CURRENT_SOURCE_DIR}/fully_connected_dnnlowp_op.cc"
3636
"${CMAKE_CURRENT_SOURCE_DIR}/fully_connected_fake_lowp_op.cc"
3737
"${CMAKE_CURRENT_SOURCE_DIR}/group_norm_dnnlowp_op.cc"
38+
"${CMAKE_CURRENT_SOURCE_DIR}/int8_gen_quant_params.cc"
3839
"${CMAKE_CURRENT_SOURCE_DIR}/lstm_unit_dnnlowp_op.cc"
3940
"${CMAKE_CURRENT_SOURCE_DIR}/pool_dnnlowp_op.cc"
4041
"${CMAKE_CURRENT_SOURCE_DIR}/quantize_dnnlowp_op.cc"

caffe2/quantization/server/dnnlowp_test_utils.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import numpy as np
66
from caffe2.python import utils, workspace
7+
from caffe2.quantization.server import dnnlowp_pybind11
78
from hypothesis import assume
89

910

@@ -371,7 +372,19 @@ def generate_conv_inputs(
371372

372373

373374
def run_conv_or_fc(
374-
test_case, init_net, net, X, W, b, op_type, engine, order, gc, outputs, scale=None, zero_point=None
375+
test_case,
376+
init_net,
377+
net,
378+
X,
379+
W,
380+
b,
381+
op_type,
382+
engine,
383+
order,
384+
gc,
385+
outputs,
386+
scale=None,
387+
zero_point=None,
375388
):
376389
if order:
377390
# Conv
@@ -390,8 +403,10 @@ def run_conv_or_fc(
390403
test_case.ws.create_blob("W").feed(W, device_option=gc)
391404
test_case.ws.create_blob("b").feed(b, device_option=gc)
392405
if scale is not None and zero_point is not None:
393-
test_case.ws.create_blob("scale").feed(scale, device_option=gc)
394-
test_case.ws.create_blob("zero_point").feed(zero_point, device_option=gc)
406+
with workspace.WorkspaceGuard(test_case.ws):
407+
dnnlowp_pybind11.CreateInt8QuantParamsBlob(
408+
"quant_param", float(scale), int(zero_point)
409+
)
395410

396411
if init_net:
397412
test_case.ws.run(init_net)
@@ -409,8 +424,9 @@ def run_conv_or_fc(
409424
workspace.FeedBlob("W", W)
410425
workspace.FeedBlob("b", b)
411426
if scale is not None and zero_point is not None:
412-
workspace.FeedBlob("scale", scale)
413-
workspace.FeedBlob("zero_point", zero_point)
427+
dnnlowp_pybind11.CreateInt8QuantParamsBlob(
428+
"quant_param", float(scale), int(zero_point)
429+
)
414430

415431
if init_net:
416432
workspace.RunNetOnce(init_net)

caffe2/quantization/server/fully_connected_dnnlowp_op.cc

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "caffe2/core/flags.h"
88
#include "caffe2/core/tensor_int8.h"
99
#include "caffe2/operators/fc_inference.h"
10+
#include "caffe2/quantization/server/int8_gen_quant_params.h"
1011
#include "caffe2/utils/cpuid.h"
1112
#include "fbgemm_pack_matrix_cache.h"
1213
#include "fbgemm_pack_op.h"
@@ -871,17 +872,16 @@ bool FullyConnectedDNNLowPOp<T, ReluFused>::GetQuantizationParameters_() {
871872
#endif
872873

873874
if (!dequantize_output_ && !requantization_param_selected_) {
874-
CAFFE_ENFORCE(InputSize() == 3 || InputSize() == 5);
875-
if (InputSize() == 5) {
876-
CAFFE_ENFORCE(Input(3).template IsType<float>());
877-
CAFFE_ENFORCE(Input(4).template IsType<int>());
878-
879-
const auto& in_3 = Input(3);
880-
CAFFE_ENFORCE_EQ(in_3.numel(), 1);
881-
float in_scale = *(in_3.template data<float>());
882-
const auto& in_4 = Input(4);
883-
CAFFE_ENFORCE_EQ(in_4.numel(), 1);
884-
int in_zero_point = *(in_4.template data<int>());
875+
CAFFE_ENFORCE(InputSize() <= 4);
876+
if (InputSize() == 4) {
877+
const auto* input_qparam_blob =
878+
this->template Input<caffe2::unique_ptr<caffe2::Int8QuantParamsBlob>>(
879+
3)
880+
.get();
881+
CAFFE_ENFORCE(input_qparam_blob);
882+
883+
float in_scale = input_qparam_blob->qparam.scale;
884+
int in_zero_point = input_qparam_blob->qparam.zero_point;
885885

886886
dnnlowp::TensorQuantizationParams out_qparams_overwrite;
887887
out_qparams_overwrite.scale = in_scale;
@@ -964,7 +964,7 @@ REGISTER_CPU_OPERATOR_WITH_ENGINE(
964964

965965
using namespace std::placeholders;
966966
OPERATOR_SCHEMA(Int8FCRelu)
967-
.NumInputs(3, 5)
967+
.NumInputs(3, 4)
968968
.NumOutputs(1)
969969
.TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
970970
.CostInferenceFunction(std::bind(CostInferenceForFC, _1, _2, false));

caffe2/quantization/server/fully_connected_dnnlowp_op_test.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import hypothesis.strategies as st
77
import numpy as np
88
from caffe2.python import core, dyndep, workspace
9-
from caffe2.quantization.server import utils as dnnlowp_utils
9+
from caffe2.quantization.server import dnnlowp_pybind11, utils as dnnlowp_utils
1010
from caffe2.quantization.server.dnnlowp_test_utils import (
1111
avoid_vpmaddubsw_overflow_fc,
1212
check_quantized_results_close,
@@ -176,8 +176,7 @@ def test_dnnlowp_fully_connected_int(
176176
if do_prepack_weight
177177
else ("W_q" if do_quantize_weight else "W"),
178178
"b_q" if do_quantize_weight else "b",
179-
"scale",
180-
"zero_point",
179+
"quant_param",
181180
],
182181
["Y_q" if do_dequantize else "Y"],
183182
dequantize_output=not do_dequantize,
@@ -226,6 +225,7 @@ def test_dnnlowp_fully_connected_int(
226225
ref_output = outputs[0][0]
227226
ref_output_min = 0 if ref_output.size == 0 else ref_output.min()
228227
ref_output_max = 0 if ref_output.size == 0 else ref_output.max()
228+
229229
q_param = dnnlowp_utils.choose_quantization_params(
230230
ref_output_min, ref_output_max, preserve_activation_sparsity
231231
)
@@ -241,8 +241,8 @@ def test_dnnlowp_fully_connected_int(
241241
None,
242242
gc,
243243
outputs,
244-
np.array([q_param.scale]).astype(np.float32),
245-
np.array([q_param.zero_point]).astype(np.int32),
244+
q_param.scale,
245+
q_param.zero_point,
246246
)
247247
else:
248248
run_conv_or_fc(

caffe2/quantization/server/pybind.cc

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,23 @@ PYBIND11_MODULE(dnnlowp_pybind11, m) {
431431
pybind11::arg("quant_scheme_blob_name"),
432432
pybind11::arg("quantization_kind"),
433433
pybind11::arg("preserve_sparsity"));
434+
m.def(
435+
"CreateInt8QuantParamsBlob",
436+
[](std::string quant_params_blob_name, float scale, int zero_point) {
437+
Workspace* gWorkspace = caffe2::python::GetCurrentWorkspace();
438+
CAFFE_ENFORCE(gWorkspace);
439+
auto* quant_params_blob = gWorkspace->GetBlob(quant_params_blob_name);
440+
if (quant_params_blob == nullptr) {
441+
quant_params_blob = gWorkspace->CreateBlob(quant_params_blob_name);
442+
}
443+
auto* quant_params_blob_data =
444+
quant_params_blob->GetMutable<unique_ptr<Int8QuantParamsBlob>>();
445+
quant_params_blob_data->reset(
446+
new Int8QuantParamsBlob(scale, zero_point));
447+
},
448+
pybind11::arg("quant_param_blob_name"),
449+
pybind11::arg("scale"),
450+
pybind11::arg("zero_point"));
434451
m.def(
435452
"ObserveInt8QuantParamsBlob",
436453
[](std::string quant_params_blob_name) {

caffe2/quantization/server/quantize_dnnlowp_op.cc

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#endif
77

88
#include "caffe2/core/tensor_int8.h"
9+
#include "caffe2/quantization/server/int8_gen_quant_params.h"
910
#include "caffe2_dnnlowp_utils.h"
1011
#include "dnnlowp_partition.h"
1112

@@ -29,25 +30,20 @@ bool QuantizeDNNLowPOp<T>::RunOnDevice() {
2930
arguments_parsed_ = true;
3031
}
3132

32-
CAFFE_ENFORCE(InputSize() == 1 || InputSize() == 3);
33+
CAFFE_ENFORCE(InputSize() <= 2);
3334
CAFFE_ENFORCE(Input(0).template IsType<float>());
3435

3536
bool use_input_qparam = false;
3637
float in_scale = 0;
3738
int in_zero_point = 0;
38-
if (InputSize() == 3) {
39+
if (InputSize() == 2) {
3940
use_input_qparam = true;
4041

41-
CAFFE_ENFORCE(Input(1).template IsType<float>());
42-
CAFFE_ENFORCE(Input(2).template IsType<int>());
43-
44-
const auto& in_1 = Input(1);
45-
CAFFE_ENFORCE_EQ(in_1.numel(), 1);
46-
in_scale = *(in_1.template data<float>());
47-
48-
const auto& in_2 = Input(2);
49-
CAFFE_ENFORCE_EQ(in_2.numel(), 1);
50-
in_zero_point = *(in_2.template data<int>());
42+
const auto* input_qparam_blob =
43+
Input<caffe2::unique_ptr<Int8QuantParamsBlob>>(1).get();
44+
CAFFE_ENFORCE(input_qparam_blob);
45+
in_scale = input_qparam_blob->qparam.scale;
46+
in_zero_point = input_qparam_blob->qparam.zero_point;
5147
}
5248

5349
TensorQuantizationParams in_qparams;
@@ -88,7 +84,7 @@ bool QuantizeDNNLowPOp<T>::RunOnDevice() {
8884
}
8985

9086
OPERATOR_SCHEMA(Quantize)
91-
.NumInputs(1, 3)
87+
.NumInputs(1, 2)
9288
.NumOutputs(1)
9389
.IdenticalTypeAndShapeOfInput(0);
9490

caffe2/quantization/server/quantize_dnnlowp_op_test.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import hypothesis.strategies as st
55
import numpy as np
66
from caffe2.python import core, dyndep, workspace
7+
from caffe2.quantization.server import dnnlowp_pybind11
78
from hypothesis import given
89

910

@@ -34,17 +35,23 @@ def test_dnnlowp_quantize(self, size, is_empty, gc, dc):
3435
op_type, ["X"], ["X_q"], engine=engine, device_option=gc
3536
)
3637
net.Proto().op.extend([quantize])
38+
39+
dnnlowp_pybind11.CreateInt8QuantParamsBlob(
40+
"quant_param", float(X_scale), int(X_zero)
41+
)
3742
quantize_2 = core.CreateOperator(
38-
op_type, ["X", "scale", "zero"], ["X_q_2"], engine=engine, device_option=gc
43+
op_type,
44+
["X", "quant_param"],
45+
["X_q_2"],
46+
engine=engine,
47+
device_option=gc,
3948
)
4049
net.Proto().op.extend([quantize_2])
4150

42-
self.ws.create_blob("X").feed(X, device_option=gc)
43-
self.ws.create_blob("scale").feed(np.array([X_scale]).astype(np.float32), device_option=gc)
44-
self.ws.create_blob("zero").feed(np.array([X_zero]).astype(np.int32), device_option=gc)
45-
self.ws.run(net)
46-
X_q = self.ws.blobs["X_q"].fetch()[0]
47-
X_q_2 = self.ws.blobs["X_q_2"].fetch()[0]
51+
workspace.FeedBlob("X", X, device_option=gc)
52+
workspace.RunNetOnce(net)
53+
X_q = workspace.FetchInt8Blob("X_q")[0]
54+
X_q_2 = workspace.FetchInt8Blob("X_q_2")[0]
4855

4956
# Dequantize results and measure quantization error against inputs
5057
X_dq = X_scale * (X_q - X_zero)

0 commit comments

Comments (0)