Enable SIMD256 for warpAffineBlocklineNN #26185

@FantasqueX

Description

Describe the feature and motivation

Currently, unless the target architecture is RISC-V, warpAffineBlocklineNN only has an SSE4.1 path and a fixed-width SIMD128 path; there is no SIMD256 path, nor a width-agnostic universal-intrinsics implementation that would cover 256-bit and wider registers with a single code path. A sketch of such a rewrite is included after the code below.
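
For reference, OpenCV's width-agnostic universal intrinsics (v_int32, vx_load, vx_setall_s32, VTraits, ...) compile to the widest SIMD extension enabled at build time, so one code path can serve SSE, AVX2, NEON, and scalable RVV. A minimal sketch, assuming a recent OpenCV 4.x build where the VTraits-based API (already used in the code below) is available:

#include <cstdio>
#include "opencv2/core/hal/intrin.hpp"

int main()
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
    // Width-agnostic v_int32: 4 lanes with 128-bit SSE/NEON, 8 lanes with
    // 256-bit AVX2, and a vector-length-dependent count with scalable RVV.
    std::printf("v_int32 lanes: %d\n", cv::VTraits<cv::v_int32>::vlanes());
#else
    std::printf("universal intrinsics are not available in this build\n");
#endif
    return 0;
}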

Additional context

void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
{
    CALL_HAL(warpAffineBlocklineNN, cv_hal_warpAffineBlocklineNN, adelta, bdelta, xy, X0, Y0, bw);
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    int x1 = 0;
#if CV_TRY_SSE4_1
    bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
    if( useSSE4_1 )
        opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta, bdelta, xy, X0, Y0, bw);
    else
#endif
    {
#if CV_SIMD128
        {
            v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
            int span = VTraits<v_uint16x8>::vlanes();
            for( ; x1 <= bw - span; x1 += span )
            {
                v_int16x8 v_dst[2];
#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
                                                v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
                v_dst[0] = CV_CONVERT_MAP(adelta, x1, v_X0);
                v_dst[1] = CV_CONVERT_MAP(bdelta, x1, v_Y0);
#undef CV_CONVERT_MAP
                v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
            }
        }
#endif
        for( ; x1 < bw; x1++ )
        {
            int X = (X0 + adelta[x1]) >> AB_BITS;
            int Y = (Y0 + bdelta[x1]) >> AB_BITS;
            xy[x1*2] = saturate_cast<short>(X);
            xy[x1*2+1] = saturate_cast<short>(Y);
        }
    }
}
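
A minimal sketch of the requested generalization, replacing the fixed 128-bit block with width-agnostic types so the same loop compiles to 256-bit (or scalable) registers where available. This is an untested illustration of the idea, not a verified patch:

#if (CV_SIMD || CV_SIMD_SCALABLE)
        {
            v_int32 v_X0 = vx_setall_s32(X0), v_Y0 = vx_setall_s32(Y0);
            const int span = VTraits<v_int16>::vlanes();   // 8 on SSE/NEON, 16 on AVX2
            const int half = VTraits<v_int32>::vlanes();   // span / 2
            for( ; x1 <= bw - span; x1 += span )
            {
                // Same computation as the SIMD128 block above, on the widest
                // vector the build supports; v_pack saturates like saturate_cast.
                v_int16 v_dst0 = v_pack(v_shr<AB_BITS>(v_add(v_X0, vx_load(adelta + x1))),
                                        v_shr<AB_BITS>(v_add(v_X0, vx_load(adelta + x1 + half))));
                v_int16 v_dst1 = v_pack(v_shr<AB_BITS>(v_add(v_Y0, vx_load(bdelta + x1))),
                                        v_shr<AB_BITS>(v_add(v_Y0, vx_load(bdelta + x1 + half))));
                v_store_interleave(xy + (x1 << 1), v_dst0, v_dst1);
            }
        }
#endif

The scalar tail loop would stay unchanged to handle the remaining bw % span elements, and the SSE4.1 specialization could then either be dropped or kept behind its existing guard.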
