@@ -290,219 +290,5 @@ Tensor dequantize_tensor_per_channel_float_qparams(
   return rtensor;
 }
 
-#ifdef USE_FBGEMM
-// Note: quantize_val is only explicitly used in tests outside of this file
-template <typename T>
-T quantize_val(double scale, int64_t zero_point, float value) {
-  // Internally, fbgemm::Quantize uses std::nearbyint.
-  // std::nearbyint results in the nearest integer value according to the
-  // current rounding mode, and the default rounding mode rounds to even in
-  // half-way cases on most popular processor architectures like x86 and ARM.
-  // This is typically faster than alternatives like std::round, which rounds
-  // half-way cases away from zero, and can be consistent with SIMD
-  // implementations, for example on x86 using _mm512_cvtps_epi32 or
-  // _mm512_round_ps with the _MM_FROUND_CUR_DIRECTION option, which also
-  // follow the current rounding mode.
-  int32_t qvalue;
-  qvalue = fbgemm::Quantize<typename T::underlying, false /*LEGACY*/>(
-      value,
-      static_cast<int32_t>(zero_point),
-      static_cast<float>(scale),
-      /*result_precision=*/CHAR_BIT * sizeof(typename T::underlying));
-  return static_cast<T>(qvalue);
-}
-
-template <typename T, int precision>
-void quantize_vec(
-    double scale,
-    int64_t zero_point,
-    const float* src,
-    T* dst,
-    size_t count) {
-  fbgemm::Quantize<typename T::underlying, false /*LEGACY*/>(
-      src,
-      (typename T::underlying*)dst,
-      count,
-      fbgemm::TensorQuantizationParams{
-          (float)scale, (int32_t)zero_point, precision});
-}
-
-template <typename T>
-inline float dequantize_val(double scale, int64_t zero_point, T value) {
-  fbgemm::TensorQuantizationParams qparams;
-  qparams.scale = static_cast<float>(scale);
-  qparams.zero_point = static_cast<int32_t>(zero_point);
-  return fbgemm::Dequantize<typename T::underlying>(value.val_, qparams);
-}
-#else // USE_FBGEMM
-
-#if defined(__ANDROID__) && !defined(__NDK_MAJOR__)
-template <class T>
-inline float Round(const float x) {
-  return ::nearbyintf(x);
-}
-inline double Round(const double x) {
-  return ::nearbyint(x);
-}
-#else
-template <class T>
-inline T Round(const T x) {
-  return std::nearbyint(x);
-}
-#endif
-
-template <typename T>
-T quantize_val(double scale, int64_t zero_point, float value) {
-  // std::nearbyint results in the nearest integer value according to the
-  // current rounding mode, and the default rounding mode rounds to even in
-  // half-way cases on most popular processor architectures like x86 and ARM.
-  // This is typically faster than alternatives like std::round, which rounds
-  // half-way cases away from zero, and can be consistent with SIMD
-  // implementations, for example on x86 using _mm512_cvtps_epi32 or
-  // _mm512_round_ps with the _MM_FROUND_CUR_DIRECTION option, which also
-  // follow the current rounding mode.
-  int64_t qvalue;
-  constexpr int64_t qmin = std::numeric_limits<typename T::underlying>::min();
-  constexpr int64_t qmax = std::numeric_limits<typename T::underlying>::max();
-  float inv_scale = 1.0f / static_cast<float>(scale);
-  qvalue = static_cast<int64_t>(zero_point + Round(value * inv_scale));
-  qvalue = std::max<int64_t>(qvalue, qmin);
-  qvalue = std::min<int64_t>(qvalue, qmax);
-  return static_cast<T>(qvalue);
-}
-
-uint8_t quantize_val_arm(
-    const float scale,
-    const int32_t zero_point,
-    const float value) {
-  const int32_t qmin = std::numeric_limits<uint8_t>::min();
-  const int32_t qmax = std::numeric_limits<uint8_t>::max();
-  float inv_scale = 1.0f / scale;
-  auto r = zero_point + static_cast<int32_t>(Round(value * inv_scale));
-  r = std::max(r, qmin);
-  r = std::min(r, qmax);
-  return static_cast<uint8_t>(r);
-}
-
-template <typename T, int precision>
-void quantize_vec(
-    double scale,
-    int64_t zero_point,
-    const float* src,
-    T* dst,
-    size_t count) {
-  checkZeroPoint<typename T::underlying>("quantize_vec", zero_point);
-  for (int64_t i = 0; i < count; ++i) {
-    dst[i] = quantize_val<T>(scale, zero_point, src[i]);
-  }
-}
-
-template <typename T>
-TORCH_API float dequantize_val(double scale, int64_t zero_point, T value) {
-  // We need to convert the qint8 value to float to ensure the subtraction
-  // subexpression returns a float
-  return (static_cast<float>(value.val_) - zero_point) * scale;
-}
-#endif // USE_FBGEMM
-
-/*
- * Quantize value based on the following equation
- * Xq = Round(Xf * inv_scale + zero_point)
- * where zero_point is a float.
- *
- * Note: For the case of embedding quantization we will set zero_point
- * to (-Xmin/scale), where Xmin is the min value in the input tensor row.
- */
-int quantize_val_float_qparams(float scale, float zero_point, float value, int qmin, int qmax) {
-  int qvalue;
-
-  float inv_scale = scale == 0 ? 1.0f : 1.0f / scale;
-  qvalue = lrintf(value * inv_scale + zero_point);
-  qvalue = std::max(qmin, std::min(qvalue, qmax));
-  return qvalue;
-}
-
-template <typename SRC_T, typename DST_T>
-DST_T requantize_val(
-    double src_scale,
-    int64_t src_zero_point,
-    double dst_scale,
-    int64_t dst_zero_point,
-    SRC_T src) {
-  const auto dq = dequantize_val<SRC_T>(src_scale, src_zero_point, src);
-  return quantize_val<DST_T>(dst_scale, dst_zero_point, dq);
-}
-
-template <typename DST_T>
-DST_T requantize_from_int(double multiplier, int64_t zero_point, int64_t src) {
-  int64_t quantize_down =
-      zero_point + lrintf(src * static_cast<float>(multiplier));
-  int32_t min = std::numeric_limits<typename DST_T::underlying>::min();
-  int32_t max = std::numeric_limits<typename DST_T::underlying>::max();
-  return static_cast<DST_T>(
-      std::min<int64_t>(std::max<int64_t>(quantize_down, min), max));
-}
-
-template TORCH_API qint8
-quantize_val<qint8>(double scale, int64_t zero_point, float value);
-template TORCH_API quint8
-quantize_val<quint8>(double scale, int64_t zero_point, float value);
-template TORCH_API qint32
-quantize_val<qint32>(double scale, int64_t zero_point, float value);
-template TORCH_API void quantize_vec<c10::qint8>(
-    double scale,
-    int64_t zero_point,
-    const float* src,
-    c10::qint8* dst,
-    size_t count);
-template TORCH_API void quantize_vec<c10::quint8>(
-    double scale,
-    int64_t zero_point,
-    const float* src,
-    c10::quint8* dst,
-    size_t count);
-template TORCH_API void quantize_vec<c10::qint32, 32>(
-    double scale,
-    int64_t zero_point,
-    const float* src,
-    c10::qint32* dst,
-    size_t count);
-
-template TORCH_API float dequantize_val<qint8>(
-    double scale,
-    int64_t zero_point,
-    qint8 value);
-template TORCH_API float dequantize_val<quint8>(
-    double scale,
-    int64_t zero_point,
-    quint8 value);
-template TORCH_API float dequantize_val<qint32>(
-    double scale,
-    int64_t zero_point,
-    qint32 value);
-
-template TORCH_API qint8
-requantize_val<qint8, qint8>(double, int64_t, double, int64_t, qint8);
-template TORCH_API quint8
-requantize_val<qint8, quint8>(double, int64_t, double, int64_t, qint8);
-template TORCH_API qint32
-requantize_val<qint8, qint32>(double, int64_t, double, int64_t, qint8);
-template TORCH_API qint8
-requantize_val<quint8, qint8>(double, int64_t, double, int64_t, quint8);
-template TORCH_API quint8
-requantize_val<quint8, quint8>(double, int64_t, double, int64_t, quint8);
-template TORCH_API qint32
-requantize_val<quint8, qint32>(double, int64_t, double, int64_t, quint8);
-template TORCH_API qint8
-requantize_val<qint32, qint8>(double, int64_t, double, int64_t, qint32);
-template TORCH_API quint8
-requantize_val<qint32, quint8>(double, int64_t, double, int64_t, qint32);
-template TORCH_API qint32
-requantize_val<qint32, qint32>(double, int64_t, double, int64_t, qint32);
-
-template TORCH_API qint8 requantize_from_int<qint8>(double, int64_t, int64_t);
-template TORCH_API quint8
-requantize_from_int<quint8>(double, int64_t, int64_t);
-template TORCH_API qint32
-requantize_from_int<qint32>(double, int64_t, int64_t);
-
 } // namespace native
 } // namespace at
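
A side note on the rounding comments in the removed code: below is a minimal, standalone C++ sketch (not part of the diff) of the behavior they describe. Under the default FE_TONEAREST mode, std::nearbyint rounds half-way cases to the nearest even integer, while std::round always rounds them away from zero:

#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
  std::fesetround(FE_TONEAREST); // the default rounding mode on x86 and ARM
  for (float x : {0.5f, 1.5f, 2.5f, -0.5f, -1.5f}) {
    std::printf("x = %4.1f  nearbyint = %4.1f  round = %4.1f\n",
                x, std::nearbyintf(x), std::roundf(x));
  }
  // nearbyint: 0, 2, 2, -0, -2  (ties to even, matching SIMD conversions)
  // round:     1, 2, 3, -1, -2  (ties away from zero)
  return 0;
}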
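
For reference, the removed non-FBGEMM fallback is plain affine quantization: q = clamp(zero_point + nearbyint(x / scale), qmin, qmax) on the way in, x = (q - zero_point) * scale on the way out, and requantize_val is exactly that round trip. A minimal sketch assuming bare uint8 values instead of the c10::quint8 wrapper types (quantize_u8/dequantize_u8 are illustrative names, not PyTorch APIs):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// q = clamp(zero_point + nearbyint(x / scale), 0, 255), the same formula
// quantize_val/quantize_val_arm use in the diff, minus the wrapper types.
uint8_t quantize_u8(float scale, int32_t zero_point, float value) {
  int32_t q = zero_point + static_cast<int32_t>(std::nearbyintf(value / scale));
  return static_cast<uint8_t>(std::min(std::max(q, 0), 255));
}

// x = (q - zero_point) * scale, as in the fallback dequantize_val.
float dequantize_u8(float scale, int32_t zero_point, uint8_t q) {
  return (static_cast<float>(q) - zero_point) * scale;
}

int main() {
  uint8_t q = quantize_u8(/*scale=*/0.01f, /*zero_point=*/128, 0.42f);
  std::printf("q = %d, back to float = %f\n", q, dequantize_u8(0.01f, 128, q));
  // requantize_val is this round trip: dequantize with the source qparams,
  // then quantize with the destination qparams.
  uint8_t q2 = quantize_u8(0.02f, 0, dequantize_u8(0.01f, 128, q));
  std::printf("requantized q2 = %d\n", q2); // q = 170, q2 = 21
  return 0;
}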
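
Likewise, the removed quantize_val_float_qparams differs only in accepting a floating-point zero_point, which row-wise embedding quantization sets to -Xmin/scale so that the row minimum maps to qmin. A sketch of that formula with a hypothetical name and made-up 4-bit example qparams:

#include <algorithm>
#include <cmath>
#include <cstdio>

// Xq = clamp(Round(Xf * inv_scale + zero_point), qmin, qmax) with a float
// zero_point; quantize_float_zp is an illustrative name, not the library's.
int quantize_float_zp(float scale, float zero_point, float value,
                      int qmin, int qmax) {
  float inv_scale = scale == 0.f ? 1.0f : 1.0f / scale;
  int qvalue = static_cast<int>(std::lrintf(value * inv_scale + zero_point));
  return std::max(qmin, std::min(qvalue, qmax));
}

int main() {
  // Example row with min -1.0 and max 3.0, quantized to 4 bits ([0, 15]):
  // scale = (3.0 - (-1.0)) / 15, zero_point = -Xmin / scale = 3.75.
  float scale = 4.0f / 15.0f;
  float zero_point = 1.0f / scale;
  for (float x : {-1.0f, 0.0f, 3.0f}) {
    std::printf("x = %4.1f -> q = %d\n", x,
                quantize_float_zp(scale, zero_point, x, 0, 15));
  }
  // Prints 0, 4, 15: the row endpoints land exactly on qmin and qmax.
  return 0;
}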