@@ -332,226 +332,11 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const int64_t num_valu
332332}
333333#endif // ARROW_HAVE_AVX2
334334
335- #if defined(ARROW_HAVE_AVX512)
336- template <int kNumStreams >
337- void ByteStreamSplitDecodeAvx512 (const uint8_t * data, int64_t num_values, int64_t stride,
338- uint8_t * out) {
339- static_assert (kNumStreams == 4 || kNumStreams == 8 , " Invalid number of streams." );
340- constexpr int kNumStreamsLog2 = (kNumStreams == 8 ? 3 : 2 );
341- constexpr int64_t kBlockSize = sizeof (__m512i) * kNumStreams ;
342-
343- const int64_t size = num_values * kNumStreams ;
344- if (size < kBlockSize ) // Back to AVX2 for small size
345- return ByteStreamSplitDecodeAvx2<kNumStreams >(data, num_values, stride, out);
346- const int64_t num_blocks = size / kBlockSize ;
347-
348- // First handle suffix.
349- const int64_t num_processed_elements = (num_blocks * kBlockSize ) / kNumStreams ;
350- for (int64_t i = num_processed_elements; i < num_values; ++i) {
351- uint8_t gathered_byte_data[kNumStreams ];
352- for (int b = 0 ; b < kNumStreams ; ++b) {
353- const int64_t byte_index = b * stride + i;
354- gathered_byte_data[b] = data[byte_index];
355- }
356- memcpy (out + i * kNumStreams , gathered_byte_data, kNumStreams );
357- }
358-
359- // Processed hierarchically using the unpack, then two shuffles.
360- __m512i stage[kNumStreamsLog2 + 1 ][kNumStreams ];
361- __m512i shuffle[kNumStreams ];
362- __m512i final_result[kNumStreams ];
363- constexpr int kNumStreamsHalf = kNumStreams / 2U ;
364-
365- for (int64_t i = 0 ; i < num_blocks; ++i) {
366- for (int j = 0 ; j < kNumStreams ; ++j) {
367- stage[0 ][j] = _mm512_loadu_si512 (
368- reinterpret_cast <const __m512i*>(&data[i * sizeof (__m512i) + j * stride]));
369- }
370-
371- for (int step = 0 ; step < kNumStreamsLog2 ; ++step) {
372- for (int j = 0 ; j < kNumStreamsHalf ; ++j) {
373- stage[step + 1 ][j * 2 ] =
374- _mm512_unpacklo_epi8 (stage[step][j], stage[step][kNumStreamsHalf + j]);
375- stage[step + 1 ][j * 2 + 1 ] =
376- _mm512_unpackhi_epi8 (stage[step][j], stage[step][kNumStreamsHalf + j]);
377- }
378- }
379-
380- if constexpr (kNumStreams == 8 ) {
381- // path for double, 128i index:
382- // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
383- // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
384- // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
385- // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
386- shuffle[0 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][0 ],
387- stage[kNumStreamsLog2 ][1 ], 0b01000100 );
388- shuffle[1 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][2 ],
389- stage[kNumStreamsLog2 ][3 ], 0b01000100 );
390- shuffle[2 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][4 ],
391- stage[kNumStreamsLog2 ][5 ], 0b01000100 );
392- shuffle[3 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][6 ],
393- stage[kNumStreamsLog2 ][7 ], 0b01000100 );
394- shuffle[4 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][0 ],
395- stage[kNumStreamsLog2 ][1 ], 0b11101110 );
396- shuffle[5 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][2 ],
397- stage[kNumStreamsLog2 ][3 ], 0b11101110 );
398- shuffle[6 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][4 ],
399- stage[kNumStreamsLog2 ][5 ], 0b11101110 );
400- shuffle[7 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][6 ],
401- stage[kNumStreamsLog2 ][7 ], 0b11101110 );
402-
403- final_result[0 ] = _mm512_shuffle_i32x4 (shuffle[0 ], shuffle[1 ], 0b10001000 );
404- final_result[1 ] = _mm512_shuffle_i32x4 (shuffle[2 ], shuffle[3 ], 0b10001000 );
405- final_result[2 ] = _mm512_shuffle_i32x4 (shuffle[0 ], shuffle[1 ], 0b11011101 );
406- final_result[3 ] = _mm512_shuffle_i32x4 (shuffle[2 ], shuffle[3 ], 0b11011101 );
407- final_result[4 ] = _mm512_shuffle_i32x4 (shuffle[4 ], shuffle[5 ], 0b10001000 );
408- final_result[5 ] = _mm512_shuffle_i32x4 (shuffle[6 ], shuffle[7 ], 0b10001000 );
409- final_result[6 ] = _mm512_shuffle_i32x4 (shuffle[4 ], shuffle[5 ], 0b11011101 );
410- final_result[7 ] = _mm512_shuffle_i32x4 (shuffle[6 ], shuffle[7 ], 0b11011101 );
411- } else {
412- // path for float, 128i index:
413- // {0x00, 0x04, 0x08, 0x0C}, {0x01, 0x05, 0x09, 0x0D}
414- // {0x02, 0x06, 0x0A, 0x0E}, {0x03, 0x07, 0x0B, 0x0F},
415- shuffle[0 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][0 ],
416- stage[kNumStreamsLog2 ][1 ], 0b01000100 );
417- shuffle[1 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][2 ],
418- stage[kNumStreamsLog2 ][3 ], 0b01000100 );
419- shuffle[2 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][0 ],
420- stage[kNumStreamsLog2 ][1 ], 0b11101110 );
421- shuffle[3 ] = _mm512_shuffle_i32x4 (stage[kNumStreamsLog2 ][2 ],
422- stage[kNumStreamsLog2 ][3 ], 0b11101110 );
423-
424- final_result[0 ] = _mm512_shuffle_i32x4 (shuffle[0 ], shuffle[1 ], 0b10001000 );
425- final_result[1 ] = _mm512_shuffle_i32x4 (shuffle[0 ], shuffle[1 ], 0b11011101 );
426- final_result[2 ] = _mm512_shuffle_i32x4 (shuffle[2 ], shuffle[3 ], 0b10001000 );
427- final_result[3 ] = _mm512_shuffle_i32x4 (shuffle[2 ], shuffle[3 ], 0b11011101 );
428- }
429-
430- for (int j = 0 ; j < kNumStreams ; ++j) {
431- _mm512_storeu_si512 (
432- reinterpret_cast <__m512i*>(out + (i * kNumStreams + j) * sizeof (__m512i)),
433- final_result[j]);
434- }
435- }
436- }
437-
438- template <int kNumStreams >
439- void ByteStreamSplitEncodeAvx512 (const uint8_t * raw_values, const int64_t num_values,
440- uint8_t * output_buffer_raw) {
441- static_assert (kNumStreams == 4 || kNumStreams == 8 , " Invalid number of streams." );
442- constexpr int kBlockSize = sizeof (__m512i) * kNumStreams ;
443-
444- const int64_t size = num_values * kNumStreams ;
445-
446- if (size < kBlockSize ) // Back to AVX2 for small size
447- return ByteStreamSplitEncodeAvx2<kNumStreams >(raw_values, num_values,
448- output_buffer_raw);
449-
450- const int64_t num_blocks = size / kBlockSize ;
451- const __m512i* raw_values_simd = reinterpret_cast <const __m512i*>(raw_values);
452- __m512i* output_buffer_streams[kNumStreams ];
453- for (int i = 0 ; i < kNumStreams ; ++i) {
454- output_buffer_streams[i] =
455- reinterpret_cast <__m512i*>(&output_buffer_raw[num_values * i]);
456- }
457-
458- // First handle suffix.
459- const int64_t num_processed_elements = (num_blocks * kBlockSize ) / kNumStreams ;
460- for (int64_t i = num_processed_elements; i < num_values; ++i) {
461- for (int j = 0 ; j < kNumStreams ; ++j) {
462- const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
463- output_buffer_raw[j * num_values + i] = byte_in_value;
464- }
465- }
466-
467- constexpr int KNumUnpack = (kNumStreams == 8 ) ? 2 : 3 ;
468- __m512i final_result[kNumStreams ];
469- __m512i unpack[KNumUnpack + 1 ][kNumStreams ];
470- __m512i permutex[kNumStreams ];
471- __m512i permutex_mask;
472- if constexpr (kNumStreams == 8 ) {
473- // use _mm512_set_epi32, no _mm512_set_epi16 for some old gcc version.
474- permutex_mask = _mm512_set_epi32 (0x001F0017 , 0x000F0007 , 0x001E0016 , 0x000E0006 ,
475- 0x001D0015 , 0x000D0005 , 0x001C0014 , 0x000C0004 ,
476- 0x001B0013 , 0x000B0003 , 0x001A0012 , 0x000A0002 ,
477- 0x00190011 , 0x00090001 , 0x00180010 , 0x00080000 );
478- } else {
479- permutex_mask = _mm512_set_epi32 (0x0F , 0x0B , 0x07 , 0x03 , 0x0E , 0x0A , 0x06 , 0x02 , 0x0D ,
480- 0x09 , 0x05 , 0x01 , 0x0C , 0x08 , 0x04 , 0x00 );
481- }
482-
483- for (int64_t block_index = 0 ; block_index < num_blocks; ++block_index) {
484- for (int i = 0 ; i < kNumStreams ; ++i) {
485- unpack[0 ][i] = _mm512_loadu_si512 (&raw_values_simd[block_index * kNumStreams + i]);
486- }
487-
488- for (int unpack_lvl = 0 ; unpack_lvl < KNumUnpack; ++unpack_lvl) {
489- for (int i = 0 ; i < kNumStreams / 2 ; ++i) {
490- unpack[unpack_lvl + 1 ][i * 2 ] = _mm512_unpacklo_epi8 (
491- unpack[unpack_lvl][i * 2 ], unpack[unpack_lvl][i * 2 + 1 ]);
492- unpack[unpack_lvl + 1 ][i * 2 + 1 ] = _mm512_unpackhi_epi8 (
493- unpack[unpack_lvl][i * 2 ], unpack[unpack_lvl][i * 2 + 1 ]);
494- }
495- }
496-
497- if constexpr (kNumStreams == 8 ) {
498- // path for double
499- // 1. unpack to epi16 block
500- // 2. permutexvar_epi16 to 128i block
501- // 3. shuffle 128i to final 512i target, index:
502- // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
503- // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
504- // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
505- // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
506- for (int i = 0 ; i < kNumStreams ; ++i)
507- permutex[i] = _mm512_permutexvar_epi16 (permutex_mask, unpack[KNumUnpack][i]);
508-
509- __m512i shuffle[kNumStreams ];
510- shuffle[0 ] = _mm512_shuffle_i32x4 (permutex[0 ], permutex[2 ], 0b01000100 );
511- shuffle[1 ] = _mm512_shuffle_i32x4 (permutex[4 ], permutex[6 ], 0b01000100 );
512- shuffle[2 ] = _mm512_shuffle_i32x4 (permutex[0 ], permutex[2 ], 0b11101110 );
513- shuffle[3 ] = _mm512_shuffle_i32x4 (permutex[4 ], permutex[6 ], 0b11101110 );
514- shuffle[4 ] = _mm512_shuffle_i32x4 (permutex[1 ], permutex[3 ], 0b01000100 );
515- shuffle[5 ] = _mm512_shuffle_i32x4 (permutex[5 ], permutex[7 ], 0b01000100 );
516- shuffle[6 ] = _mm512_shuffle_i32x4 (permutex[1 ], permutex[3 ], 0b11101110 );
517- shuffle[7 ] = _mm512_shuffle_i32x4 (permutex[5 ], permutex[7 ], 0b11101110 );
518-
519- final_result[0 ] = _mm512_shuffle_i32x4 (shuffle[0 ], shuffle[1 ], 0b10001000 );
520- final_result[1 ] = _mm512_shuffle_i32x4 (shuffle[0 ], shuffle[1 ], 0b11011101 );
521- final_result[2 ] = _mm512_shuffle_i32x4 (shuffle[2 ], shuffle[3 ], 0b10001000 );
522- final_result[3 ] = _mm512_shuffle_i32x4 (shuffle[2 ], shuffle[3 ], 0b11011101 );
523- final_result[4 ] = _mm512_shuffle_i32x4 (shuffle[4 ], shuffle[5 ], 0b10001000 );
524- final_result[5 ] = _mm512_shuffle_i32x4 (shuffle[4 ], shuffle[5 ], 0b11011101 );
525- final_result[6 ] = _mm512_shuffle_i32x4 (shuffle[6 ], shuffle[7 ], 0b10001000 );
526- final_result[7 ] = _mm512_shuffle_i32x4 (shuffle[6 ], shuffle[7 ], 0b11011101 );
527- } else {
528- // Path for float.
529- // 1. Processed hierarchically to 32i block using the unpack intrinsics.
530- // 2. Pack 128i block using _mm256_permutevar8x32_epi32.
531- // 3. Pack final 256i block with _mm256_permute2x128_si256.
532- for (int i = 0 ; i < kNumStreams ; ++i)
533- permutex[i] = _mm512_permutexvar_epi32 (permutex_mask, unpack[KNumUnpack][i]);
534-
535- final_result[0 ] = _mm512_shuffle_i32x4 (permutex[0 ], permutex[2 ], 0b01000100 );
536- final_result[1 ] = _mm512_shuffle_i32x4 (permutex[0 ], permutex[2 ], 0b11101110 );
537- final_result[2 ] = _mm512_shuffle_i32x4 (permutex[1 ], permutex[3 ], 0b01000100 );
538- final_result[3 ] = _mm512_shuffle_i32x4 (permutex[1 ], permutex[3 ], 0b11101110 );
539- }
540-
541- for (int i = 0 ; i < kNumStreams ; ++i) {
542- _mm512_storeu_si512 (&output_buffer_streams[i][block_index], final_result[i]);
543- }
544- }
545- }
546- #endif // ARROW_HAVE_AVX512
547-
548335#if defined(ARROW_HAVE_SIMD_SPLIT)
549336template <int kNumStreams >
550337void inline ByteStreamSplitDecodeSimd (const uint8_t * data, int64_t num_values,
551338 int64_t stride, uint8_t * out) {
552- #if defined(ARROW_HAVE_AVX512)
553- return ByteStreamSplitDecodeAvx512<kNumStreams >(data, num_values, stride, out);
554- #elif defined(ARROW_HAVE_AVX2)
339+ #if defined(ARROW_HAVE_AVX2)
555340 return ByteStreamSplitDecodeAvx2<kNumStreams >(data, num_values, stride, out);
556341#elif defined(ARROW_HAVE_SSE4_2)
557342 return ByteStreamSplitDecodeSse2<kNumStreams >(data, num_values, stride, out);
@@ -563,10 +348,7 @@ void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values,
563348template <int kNumStreams >
564349void inline ByteStreamSplitEncodeSimd (const uint8_t * raw_values, const int64_t num_values,
565350 uint8_t * output_buffer_raw) {
566- #if defined(ARROW_HAVE_AVX512)
567- return ByteStreamSplitEncodeAvx512<kNumStreams >(raw_values, num_values,
568- output_buffer_raw);
569- #elif defined(ARROW_HAVE_AVX2)
351+ #if defined(ARROW_HAVE_AVX2)
570352 return ByteStreamSplitEncodeAvx2<kNumStreams >(raw_values, num_values,
571353 output_buffer_raw);
572354#elif defined(ARROW_HAVE_SSE4_2)
0 commit comments