Skip to content

Commit 4d61109

Browse files
jianyuh authored and facebook-github-bot committed
[pt][quant] Make the CUDA fake quantize logic consistent with CPU fake quantize logic (#49808)
Summary: Pull Request resolved: #49808 In PyTorch, it uses `dst = std::nearbyint(src * inv_scale) + zero_point` instead of the LEGACY `dst = std::nearbyint(src * inv_scale + zero_point)`. However, the CUDA implementation doesn't match this. This Diff makes the CPU and CUDA implementation consistent. - FBGEMM code pointer: https://github.com/pytorch/FBGEMM/blob/master/include/fbgemm/QuantUtils.h#L76-L80 - PyTorch code pointer: https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/quantized/affine_quantizer.cpp#L306 Test Plan: CI Reviewed By: dskhudia Differential Revision: D25694235 fbshipit-source-id: 0a615e559132aafe18543deac1ea5028dd840cb9
1 parent e163172 commit 4d61109

2 files changed

Lines changed: 29 additions & 28 deletions

File tree

aten/src/ATen/native/quantized/cuda/affine_quantizer.cu

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,16 @@ void quantize_tensor_per_tensor_affine_cuda(
2525
.add_input(qtensor)
2626
.build();
2727

28-
gpu_kernel(iter,
29-
[=] GPU_LAMBDA (float raw_val, scalar_t quantized_val) -> scalar_t {
30-
int64_t qvalue = static_cast<int64_t>(nearbyint(raw_val / scale + zero_point));
31-
qvalue = std::max<int64_t>(qvalue, qmin);
32-
qvalue = std::min<int64_t>(qvalue, qmax);
33-
quantized_val.val_ = qvalue;
34-
return quantized_val;
35-
});
28+
gpu_kernel(
29+
iter,
30+
[=] GPU_LAMBDA(float raw_val, scalar_t quantized_val) -> scalar_t {
31+
int64_t qvalue =
32+
static_cast<int64_t>(nearbyint(raw_val / scale) + zero_point);
33+
qvalue = std::max<int64_t>(qvalue, qmin);
34+
qvalue = std::min<int64_t>(qvalue, qmax);
35+
quantized_val.val_ = qvalue;
36+
return quantized_val;
37+
});
3638
});
3739
}
3840

aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -34,17 +34,16 @@ void fake_quantize_tensor_kernel_cuda(
3434
.add_output(output)
3535
.add_input(input)
3636
.build();
37-
gpu_kernel(iter,
38-
[=] GPU_LAMBDA (float input_val) -> float {
39-
return (fminf(
37+
gpu_kernel(iter, [=] GPU_LAMBDA(float input_val) -> float {
38+
return (fminf(
4039
quant_max,
4140
fmaxf(
4241
quant_min,
43-
static_cast<int64_t>(std::nearbyint(
44-
input_val * inv_scale + zero_point)))) -
42+
static_cast<int64_t>(
43+
std::nearbyint(input_val * inv_scale) + zero_point))) -
4544
zero_point) *
46-
scale;
47-
});
45+
scale;
46+
});
4847
}
4948

5049
void fake_quantize_grad_tensor_kernel_cuda(
@@ -63,11 +62,10 @@ void fake_quantize_grad_tensor_kernel_cuda(
6362
.add_input(output_grad)
6463
.add_input(input)
6564
.build();
66-
gpu_kernel(iter,
67-
[=] GPU_LAMBDA (float dy, float x) -> float {
68-
int64_t Xq = std::nearbyint(x * inv_scale + zero_point);
69-
return (Xq >= quant_min && Xq <= quant_max) * dy;
70-
});
65+
gpu_kernel(iter, [=] GPU_LAMBDA(float dy, float x) -> float {
66+
int64_t Xq = std::nearbyint(x * inv_scale) + zero_point;
67+
return (Xq >= quant_min && Xq <= quant_max) * dy;
68+
});
7169
}
7270

7371
void _fake_quantize_grad_learnable_tensor_kernel_cuda(
@@ -82,7 +80,7 @@ void _fake_quantize_grad_learnable_tensor_kernel_cuda(
8280
gpu_kernel_multiple_outputs(
8381
iter, [=] GPU_LAMBDA (float XInput, float dYInput) -> thrust::tuple<float, float, float> {
8482
float dXOutput, dZeroPointOutput, dScaleOutput;
85-
int64_t xq = std::nearbyint(zero_point + XInput * inv_scale);
83+
int64_t xq = std::nearbyint(XInput * inv_scale) + zero_point;
8684
dXOutput = dYInput * (xq >= quant_min && xq <= quant_max);
8785
xq = std::max(std::min(xq, quant_max), quant_min);
8886
float xfq = static_cast<float>((xq - zero_point) * scale);
@@ -108,12 +106,13 @@ void fake_quant_per_channel_cuda(TensorIterator &iter, int64_t quant_min, int64_
108106
[=] GPU_LAMBDA (float input_val, float scale, int64_t zero_point) -> float {
109107
float inv_scale = 1.0f / scale;
110108
return (fminf(
111-
quant_max,
112-
fmaxf(
113-
quant_min,
114-
static_cast<int64_t>(std::nearbyint(
115-
input_val * inv_scale + zero_point)))) -
116-
zero_point) *
109+
quant_max,
110+
fmaxf(
111+
quant_min,
112+
static_cast<int64_t>(
113+
std::nearbyint(input_val * inv_scale) +
114+
zero_point))) -
115+
zero_point) *
117116
scale;
118117
});
119118
}
@@ -122,7 +121,7 @@ void fake_quant_grad_per_channel_cuda(TensorIterator &iter, int64_t quant_min, i
122121
gpu_kernel(iter,
123122
[=] GPU_LAMBDA (float x, float dy, float scale, int64_t zero_point) -> float {
124123
float inv_scale = 1.0f / scale;
125-
int64_t Xq = std::nearbyint(x * inv_scale + zero_point);
124+
int64_t Xq = std::nearbyint(x * inv_scale) + zero_point;
126125
return (Xq >= quant_min && Xq <= quant_max) * dy;
127126
});
128127
}

0 commit comments

Comments (0)