GAPI: SIMD optimization for AddWeighted kernel.#18466
Conversation
d1bdfcd to
c14bca7
Compare
4187207 to
a1eec7b
Compare
|
@terfendail , please take a look at the SIMD optimization of the And, Or, and Xor kernels. |
|
@anna-khakimova @anton-potapov Friendly reminder. |
|
@anna-khakimova could you rebase the PR and fix conflicts? |
|
Hello @asmorkalov |
26b9ac6 to
1dc3ab2
Compare
OrestChura
left a comment
There was a problem hiding this comment.
I can also suggest the same changes for the code of the addw_simd function; but that is a more complicated question — let's discuss it later
Reworked. |
|
@alalek , @terfendail , @rgarnov , @OrestChura please check. |
| v_float32 a = vx_setall_f32(_alpha); | ||
| v_float32 b = vx_setall_f32(_beta); | ||
| v_float32 g = vx_setall_f32(_gamma); | ||
|
|
||
| x = addw_simd(in1, in2, out, a, b, g, length); |
There was a problem hiding this comment.
Why?
I don't know any case where passing SIMD vectors through parameters (into non-inline function) can improve performance.
There was a problem hiding this comment.
I didn't try to improve performance with the commit "Refactoring.Step2". I just tried to remove an excess intermediate function which has the same name, addw_simd(), but looks like this:
// Dispatches the SIMD implementation of AddWeighted by the (SRC, DST) type
// pair: out[i] = saturate_cast<DST>(in1[i]*_alpha + in2[i]*_beta + _gamma).
// Returns the number of elements processed with vector code (the caller
// finishes the remainder with scalar code), or 0 when no SIMD path applies.
template<typename SRC, typename DST>
CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], DST out[],
                               float _alpha, float _beta, float _gamma, int length)
{
    // Cases when dst type is float are successfully vectorized with compiler.
    if (std::is_same<DST, float>::value)
        return 0;

    // Broadcast the scalar coefficients once across all lanes.
    v_float32 alpha = vx_setall_f32(_alpha);
    v_float32 beta  = vx_setall_f32(_beta);
    v_float32 gamma = vx_setall_f32(_gamma);

    if (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value)
    {
        return addw_short2short(reinterpret_cast<const ushort*>(in1), reinterpret_cast<const ushort*>(in2),
                                reinterpret_cast<ushort*>(out), alpha, beta, gamma, length);
    }
    else if (std::is_same<SRC, short>::value && std::is_same<DST, short>::value)
    {
        return addw_short2short(reinterpret_cast<const short*>(in1), reinterpret_cast<const short*>(in2),
                                reinterpret_cast<short*>(out), alpha, beta, gamma, length);
    }
    else if (std::is_same<SRC, short>::value && std::is_same<DST, uchar>::value)
    {
        return addw_short2uchar(reinterpret_cast<const short*>(in1), reinterpret_cast<const short*>(in2),
                                reinterpret_cast<uchar*>(out), alpha, beta, gamma, length);
    }
    else if (std::is_same<SRC, ushort>::value && std::is_same<DST, uchar>::value)
    {
        return addw_short2uchar(reinterpret_cast<const ushort*>(in1), reinterpret_cast<const ushort*>(in2),
                                reinterpret_cast<uchar*>(out), alpha, beta, gamma, length);
    }
    else if (std::is_same<SRC, uchar>::value && std::is_same<DST, uchar>::value)
    {
        constexpr int nlanes = v_uint8::nlanes;
        if (length < nlanes)
            return 0;

        int x = 0;
        for (;;)
        {
            for (; x <= length - nlanes; x += nlanes)
            {
                // One uchar vector corresponds to four float vectors:
                // widen each quarter to f32, apply the fused multiply-add,
                // round, then narrow back (with unsigned saturation).
                v_float32 a1 = v_load_f32(reinterpret_cast<const uchar*>(&in1[x]));
                v_float32 a2 = v_load_f32(reinterpret_cast<const uchar*>(&in1[x + nlanes / 4]));
                v_float32 a3 = v_load_f32(reinterpret_cast<const uchar*>(&in1[x + nlanes / 2]));
                v_float32 a4 = v_load_f32(reinterpret_cast<const uchar*>(&in1[x + 3 * nlanes / 4]));
                v_float32 b1 = v_load_f32(reinterpret_cast<const uchar*>(&in2[x]));
                v_float32 b2 = v_load_f32(reinterpret_cast<const uchar*>(&in2[x + nlanes / 4]));
                v_float32 b3 = v_load_f32(reinterpret_cast<const uchar*>(&in2[x + nlanes / 2]));
                v_float32 b4 = v_load_f32(reinterpret_cast<const uchar*>(&in2[x + 3 * nlanes / 4]));

                v_int32 sum1 = v_round(v_fma(a1, alpha, v_fma(b1, beta, gamma))),
                        sum2 = v_round(v_fma(a2, alpha, v_fma(b2, beta, gamma))),
                        sum3 = v_round(v_fma(a3, alpha, v_fma(b3, beta, gamma))),
                        sum4 = v_round(v_fma(a4, alpha, v_fma(b4, beta, gamma)));

                vx_store(reinterpret_cast<uchar*>(&out[x]), v_pack_u(v_pack(sum1, sum2), v_pack(sum3, sum4)));
            }

            if (x < length)
            {
                // Back up so the last (overlapping) full vector covers the
                // tail, then run the inner loop exactly once more.
                x = length - nlanes;
                continue; // process one more time (unaligned tail)
            }
            break;
        }
        return x;
    }
    return 0;
}
```
There was a problem hiding this comment.
The functions addw_short2uchar() and addw_short2short() were renamed to addw_simd() and became templated, with 2 template specializations. Please don't rely on GitHub's diff; it often shows the difference incorrectly.
Since the intermediate function shown above was removed, the initialization of the vectors alpha, beta, and gamma had to be moved into the function run_addweighted(), one level higher in the call stack.
There was a problem hiding this comment.
So now addw_simd() looks like this:
// SIMD AddWeighted for 16-bit destinations (short / ushort). Processes the
// inputs in full-vector chunks of nlanes and re-processes an overlapping
// chunk for the unaligned tail. Returns the number of elements handled
// (== length when SIMD ran, 0 when length is shorter than one vector).
template<typename SRC, typename DST>
CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], DST out[],
                               const v_float32& alpha, const v_float32& beta,
                               const v_float32& gamma, int length)
{
    // Both sides of the check are compile-time constants, so enforce the
    // contract at compile time instead of with a runtime assertion.
    static_assert(std::is_same<DST, ushort>::value ||
                  std::is_same<DST, short>::value,
                  "addw_simd: this overload only supports 16-bit destination types");

    constexpr int nlanes = (std::is_same<DST, ushort>::value) ? static_cast<int>(v_uint16::nlanes) :
                                                                static_cast<int>(v_int16::nlanes);
    if (length < nlanes)
        return 0;

    int x = 0;
    for (;;)
    {
        for (; x <= length - nlanes; x += nlanes)
        {
            // One 16-bit vector corresponds to two float vectors: widen each
            // half to f32, apply the fused multiply-add, round and store.
            v_float32 a1 = v_load_f32(&in1[x]);
            v_float32 a2 = v_load_f32(&in1[x + nlanes / 2]);
            v_float32 b1 = v_load_f32(&in2[x]);
            v_float32 b2 = v_load_f32(&in2[x + nlanes / 2]);

            addw_short_store(&out[x], v_round(v_fma(a1, alpha, v_fma(b1, beta, gamma))),
                                      v_round(v_fma(a2, alpha, v_fma(b2, beta, gamma))));
        }

        if (x < length)
        {
            // Back up so the last (overlapping) full vector covers the tail.
            x = length - nlanes;
            continue; // process one more time (unaligned tail)
        }
        break;
    }
    return x;
}
// SIMD AddWeighted for uchar destinations. Processes the inputs in
// full-vector chunks of v_uint8::nlanes and re-processes an overlapping
// chunk for the unaligned tail. Returns the number of elements handled
// (== length when SIMD ran, 0 when length is shorter than one vector).
template<typename SRC>
CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], uchar out[],
const v_float32& alpha, const v_float32& beta,
const v_float32& gamma, int length)
{
constexpr int nlanes = v_uint8::nlanes;
// Too short for even one full vector: let the scalar fallback do it all.
if (length < nlanes)
return 0;
int x = 0;
for (;;)
{
for (; x <= length - nlanes; x += nlanes)
{
// One uchar vector corresponds to four float vectors: widen each
// quarter to f32 before the fused multiply-add.
v_float32 a1 = v_load_f32(&in1[x]);
v_float32 a2 = v_load_f32(&in1[x + nlanes / 4]);
v_float32 a3 = v_load_f32(&in1[x + nlanes / 2]);
v_float32 a4 = v_load_f32(&in1[x + 3 * nlanes / 4]);
v_float32 b1 = v_load_f32(&in2[x]);
v_float32 b2 = v_load_f32(&in2[x + nlanes / 4]);
v_float32 b3 = v_load_f32(&in2[x + nlanes / 2]);
v_float32 b4 = v_load_f32(&in2[x + 3 * nlanes / 4]);
// Round to nearest int, then narrow 4x int32 -> uchar with
// unsigned saturation and store the full vector.
v_int32 sum1 = v_round(v_fma(a1, alpha, v_fma(b1, beta, gamma))),
sum2 = v_round(v_fma(a2, alpha, v_fma(b2, beta, gamma))),
sum3 = v_round(v_fma(a3, alpha, v_fma(b3, beta, gamma))),
sum4 = v_round(v_fma(a4, alpha, v_fma(b4, beta, gamma)));
vx_store(&out[x], v_pack_u(v_pack(sum1, sum2), v_pack(sum3, sum4)));
}
// Back up so the last (overlapping) full vector covers the tail,
// then run the inner loop exactly once more.
if (x < length)
{
x = length - nlanes;
continue; // process one more time (unaligned tail)
}
break;
}
return x;
}
// No SIMD implementation for float destinations: the compiler already
// auto-vectorizes that case well, so report 0 processed elements and let
// the scalar loop handle everything.
template<typename SRC>
CV_ALWAYS_INLINE int addw_simd(const SRC*, const SRC*, float*,
                               const v_float32&, const v_float32&,
                               const v_float32&, int /*length*/)
{
    //Cases when dst type is float are successfully vectorized with compiler.
    return 0;
}
There was a problem hiding this comment.
But if you mind I can move
v_float32 a = vx_setall_f32(_alpha);
v_float32 b = vx_setall_f32(_beta);
v_float32 g = vx_setall_f32(_gamma);
to each of the addw_simd() template specializations. But that would cause code duplication.
There was a problem hiding this comment.
I think it would be better and easier to add static assert.
@alalek could you please comment?
There was a problem hiding this comment.
if you mind I can move
This should be moved into SIMD functions.
Think about runtime dispatching (not in a scope of this PR) where we don't even know SIMD size and registers types of underlying functions.
Which SIMD value do you want to send there?
Consider following these rules:
- no SIMD variables in dispatching code (generic C++ part)
- no SIMD arguments in SIMD functions which are called from generic C++ code.
6fad040 to
cca8f7e
Compare
cca8f7e to
1b4529b
Compare
e2cebd1 to
bb30bac
Compare
|
@alalek I've already applied all comments. It seems to me that Vitaly is on vacation. Could you please check, remove change request from Vitaly and merge? |
alalek
left a comment
There was a problem hiding this comment.
@rgarnov @dmatveev In this patch SIMD code targeted for NEON is explicitly disabled. It is very unusual case, especially for such simple loops.
Need to properly re-measure performance with enabled NEON intrinsics on the target platform.
On public TX-1 (no access to target platform) I see 1.5-3.5x speedup.
Anna provides report with 12% degradation.
We need third independent measurement through OpenCV perf tests scripts to confirm performance observations.
| // Fluid kernels: addWeighted | ||
| // | ||
| //--------------------------- | ||
| #if CV_SSE2 | CV_AVX2 | CV_AVX512_SKX |
There was a problem hiding this comment.
#if CV_SSE2 | CV_AVX2 | CV_AVX512_SKX
Why is just #if CV_SSE2 not enough?
| v_float32 a = vx_setall_f32(_alpha); | ||
| v_float32 b = vx_setall_f32(_beta); | ||
| v_float32 g = vx_setall_f32(_gamma); | ||
|
|
||
| x = addw_simd(in1, in2, out, a, b, g, length); |
There was a problem hiding this comment.
if you mind I can move
This should be moved into SIMD functions.
Think about runtime dispatching (not in a scope of this PR) where we don't even know SIMD size and registers types of underlying functions.
Which SIMD value do you want to send there?
Consider following these rules:
- no SIMD variables in dispatching code (generic C++ part)
- no SIMD arguments in SIMD functions which are called from generic C++ code.
|
@dbudniko please take a look. |
@alalek As you can see, Dmitry's result is the same as the one Orest and I obtained. No speedup is observed. |
bb30bac to
b819847
Compare
|
@alalek please take a look. |
terfendail
left a comment
There was a problem hiding this comment.
I haven't yet collected performance statistics for this update, however the change looks fine for me
GAPI: SIMD optimization for AddWeighted kernel. * Add, sub, absdiff kernels optimization * AddW kernel * And, or kernels * AddWeighted refactoring and SIMD opt for AbsDiffC kernel * Remove simd opt of AbsDiffC kernel * Refactoring * Applied comments * Refactoring.Step2 * Applied comments.Step2
SIMD optimization for AddWeighted kernel via universal intrinsics.
Perf report for AddWeighted:
AddWeighted_avx512_avx2_sse42_perf_report.xlsx