@@ -180,6 +180,40 @@ uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2(
180180 }
181181}
182182
183+ namespace {
184+
185+ // Intrinsics `_mm256_i32gather_epi32/64` treat the `vindex` as signed integer, and we
186+ // are using `uint32_t` to represent the offset, in range of [0, 4G), within the row
187+ // table. When the offset is larger than `0x80000000` (2GB), those intrinsics will treat
188+ // it as negative offset and gather the data from undesired address. To avoid this issue,
189+ // we normalize the addresses by translating `base` `0x80000000` higher, and `offset`
190+ // `0x80000000` lower. This way, the offset is always in range of [-2G, 2G) and those
191+ // intrinsics are safe.
192+
193+ constexpr uint64_t kTwoGB = 0x80000000ull ;
194+
195+ template <uint32_t kScale >
196+ inline __m256i UnsignedOffsetSafeGather32 (int const * base, __m256i offset) {
197+ int const * normalized_base = base + kTwoGB / sizeof (int );
198+ __m256i normalized_offset =
199+ _mm256_sub_epi32 (offset, _mm256_set1_epi32 (static_cast <int >(kTwoGB / kScale )));
200+ return _mm256_i32gather_epi32 (normalized_base, normalized_offset,
201+ static_cast <int >(kScale ));
202+ }
203+
204+ template <uint32_t kScale >
205+ inline __m256i UnsignedOffsetSafeGather64 (arrow::util::int64_for_gather_t const * base,
206+ __m128i offset) {
207+ arrow::util::int64_for_gather_t const * normalized_base =
208+ base + kTwoGB / sizeof (arrow::util::int64_for_gather_t );
209+ __m128i normalized_offset =
210+ _mm_sub_epi32 (offset, _mm_set1_epi32 (static_cast <int >(kTwoGB / kScale )));
211+ return _mm256_i32gather_epi64 (normalized_base, normalized_offset,
212+ static_cast <int >(kScale ));
213+ }
214+
215+ } // namespace
216+
183217template <bool use_selection, class COMPARE8_FN >
184218uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2 (
185219 uint32_t offset_within_row, uint32_t num_rows_to_compare,
@@ -236,10 +270,8 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2(
236270 irow_right =
237271 _mm256_loadu_si256 (reinterpret_cast <const __m256i*>(left_to_right_map) + i);
238272 }
239- // TODO: Need to test if this gather is OK when irow_right is larger than
240- // 0x80000000u.
241273 __m256i offset_right =
242- _mm256_i32gather_epi32 (( const int *)offsets_right, irow_right, 4 );
274+ UnsignedOffsetSafeGather32< 4 >(( int const *)offsets_right, irow_right);
243275 offset_right = _mm256_add_epi32 (offset_right, _mm256_set1_epi32 (offset_within_row));
244276
245277 reinterpret_cast <uint64_t *>(match_bytevector)[i] =
@@ -253,40 +285,6 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2(
253285 }
254286}
255287
256- namespace {
257-
258- // Intrinsics `_mm256_i32gather_epi32/64` treat the `vindex` as signed integer, and we
259- // are using `uint32_t` to represent the offset, in range of [0, 4G), within the row
260- // table. When the offset is larger than `0x80000000` (2GB), those intrinsics will treat
261- // it as negative offset and gather the data from undesired address. To avoid this issue,
262- // we normalize the addresses by translating `base` `0x80000000` higher, and `offset`
263- // `0x80000000` lower. This way, the offset is always in range of [-2G, 2G) and those
264- // intrinsics are safe.
265-
266- constexpr uint64_t kTwoGB = 0x80000000ull ;
267-
268- template <uint32_t kScale >
269- inline __m256i UnsignedOffsetSafeGather32 (int const * base, __m256i offset) {
270- int const * normalized_base = base + kTwoGB / sizeof (int );
271- __m256i normalized_offset =
272- _mm256_sub_epi32 (offset, _mm256_set1_epi32 (static_cast <int >(kTwoGB / kScale )));
273- return _mm256_i32gather_epi32 (normalized_base, normalized_offset,
274- static_cast <int >(kScale ));
275- }
276-
277- template <uint32_t kScale >
278- inline __m256i UnsignedOffsetSafeGather64 (arrow::util::int64_for_gather_t const * base,
279- __m128i offset) {
280- arrow::util::int64_for_gather_t const * normalized_base =
281- base + kTwoGB / sizeof (arrow::util::int64_for_gather_t );
282- __m128i normalized_offset =
283- _mm_sub_epi32 (offset, _mm_set1_epi32 (static_cast <int >(kTwoGB / kScale )));
284- return _mm256_i32gather_epi64 (normalized_base, normalized_offset,
285- static_cast <int >(kScale ));
286- }
287-
288- } // namespace
289-
290288template <int column_width>
291289inline uint64_t CompareSelected8_avx2 (const uint8_t * left_base, const uint8_t * right_base,
292290 __m256i irow_left, __m256i offset_right,
0 commit comments