Skip to content

[InstanceNorm Optimize x86] AVX512/AVX/SSE intrinsic with elempack merged#4062

Closed
LRY89757 wants to merge 16 commits into Tencent:master from
LRY89757:instancenorm
Closed

[InstanceNorm Optimize x86] AVX512/AVX/SSE intrinsic with elempack merged#4062
LRY89757 wants to merge 16 commits into Tencent:master from
LRY89757:instancenorm

Conversation

@LRY89757
Copy link
Copy Markdown
Contributor

  • Add the avx512/avx/sse intrinsic for instancenorm

@LRY89757 LRY89757 closed this Jul 21, 2022
@LRY89757 LRY89757 reopened this Jul 21, 2022
@codecov-commenter
Copy link
Copy Markdown

codecov-commenter commented Jul 21, 2022

Codecov Report

All modified and coverable lines are covered by tests ✅

Project coverage is 95.13%. Comparing base (8363040) to head (888f55b).
Report is 3 commits behind head on master.

Additional details and impacted files
@@             Coverage Diff             @@
##           master    #4062       +/-   ##
===========================================
- Coverage   95.16%   95.13%    -0.04%     
===========================================
  Files         743      409      -334     
  Lines      252233    99630   -152603     
===========================================
- Hits       240043    94782   -145261     
+ Misses      12190     4848     -7342     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@LRY89757 LRY89757 closed this Jul 21, 2022
@LRY89757 LRY89757 reopened this Jul 21, 2022
Copy link
Copy Markdown
Member

@nihui nihui left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

missing avx/avx512 optimization for pack4 and avx512 optimization for pack8 ?

Comment on lines +228 to +493
// Vectorized InstanceNorm body (fragment of forward_inplace).
// For each channel q, three passes over the `size` elements of the channel:
//   1) sum            -> mean = sum / size
//   2) sum of (x-mean)^2 -> var = sqsum / size
//   3) in-place normalize: x = x * a + b, where
//        a = gamma / sqrt(var + eps), b = -mean * a + beta   (affine)
//        a = 1     / sqrt(var + eps), b = -mean * a          (otherwise)
// The widest compiled-in ISA wins: AVX512 (16 lanes), else AVX (8 lanes),
// else SSE2 (4 lanes).  Each path ends with `return 0;`, so the narrower
// paths below it are compiled but unreachable in that configuration.
// Each path handles the scalar tail [remainsize, size) separately.
#if __SSE2__
#if __AVX__
#if __AVX512F__
// ---- AVX512 path: 16 floats per vector ----
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < c; q++)
{
float* ptr = bottom_top_blob.channel(q);
// ssize: count of full 16-lane vectors; remainsize: start of scalar tail
int ssize = size / 16;
int remainsize = ssize * 16;

__m512 _fLoad;

// mean
float sum = 0.f;
float sqsum = 0.f;

__m512 _fsum = _mm512_setzero_ps();

// pass 1: vector-accumulate the channel sum
for (int i = 0; i < ssize; i++)
{
_fLoad = _mm512_loadu_ps(ptr + (i * 16));
_fsum = _mm512_add_ps(_fsum, _fLoad);
}

// horizontal reduction of the 16 partial sums
sum = _mm512_reduce_add_ps(_fsum);

// scalar tail of pass 1
for (int i = remainsize; i < size; i++)
sum += ptr[i];

float mean = sum / size;
__m512 _mean = _mm512_set1_ps(mean);
__m512 _fsqsum = _mm512_setzero_ps();

// pass 2: vector-accumulate sum of squared deviations from the mean
for (int i = 0; i < ssize; i++)
{
_fLoad = _mm512_loadu_ps(ptr + (i * 16));
_fLoad = _mm512_sub_ps(_fLoad, _mean);
_fLoad = _mm512_mul_ps(_fLoad, _fLoad);
_fsqsum = _mm512_add_ps(_fsqsum, _fLoad);
}

sqsum = _mm512_reduce_add_ps(_fsqsum);

// scalar tail of pass 2
float tmp = 0.f;
for (int i = remainsize; i < size; i++)
{
tmp = ptr[i] - mean;
sqsum += tmp * tmp;
}

// var
// NOTE: biased variance (divide by size, not size-1), matching the
// scalar InstanceNorm reference implementation.
float var = sqsum / size;
float a, b;
__m512 _a, _b;

if (affine)
{
float gamma = gamma_data[q];
float beta = beta_data[q];

// fold normalization + affine into a single fused multiply-add: x*a + b
a = static_cast<float>(gamma / (sqrt(var + eps)));
b = -mean * a + beta;

_a = _mm512_set1_ps(a);
_b = _mm512_set1_ps(b);
}
else
{
a = static_cast<float>(1.f / (sqrt(var + eps)));
b = -mean * a;

_a = _mm512_set1_ps(a);
_b = _mm512_set1_ps(b);
}

// pass 3: normalize in place
for (int i = 0; i < ssize; i++)
{
_fLoad = _mm512_loadu_ps(ptr + (i * 16));
_fLoad = _mm512_mul_ps(_fLoad, _a);
_fLoad = _mm512_add_ps(_fLoad, _b);

_mm512_storeu_ps(ptr + (i * 16), _fLoad);
}
// scalar tail of pass 3
for (int i = remainsize; i < size; i++)
{
ptr[i] = ptr[i] * a + b;
}
}
return 0;
#endif // __AVX512F__
// ---- AVX path: 8 floats per vector (same 3-pass scheme as above) ----
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < c; q++)
{
float* ptr = bottom_top_blob.channel(q);
int ssize = size / 8;
int remainsize = ssize * 8;

__m256 _fLoad;

// mean
float sum = 0.f;
float sqsum = 0.f;

__m256 _fsum = _mm256_setzero_ps();

for (int i = 0; i < ssize; i++)
{
_fLoad = _mm256_loadu_ps(ptr + (i * 8));
_fsum = _mm256_add_ps(_fsum, _fLoad);
}

// _mm256_reduce_add_ps is not a standard intrinsic — presumably a ncnn
// helper (x86_usability.h); confirm it is in scope for this TU.
sum = _mm256_reduce_add_ps(_fsum);

for (int i = remainsize; i < size; i++)
sum += ptr[i];

float mean = sum / size;
__m256 _mean = _mm256_set1_ps(mean);
__m256 _fsqsum = _mm256_setzero_ps();

for (int i = 0; i < ssize; i++)
{
_fLoad = _mm256_loadu_ps(ptr + (i * 8));
_fLoad = _mm256_sub_ps(_fLoad, _mean);
_fLoad = _mm256_mul_ps(_fLoad, _fLoad);
_fsqsum = _mm256_add_ps(_fsqsum, _fLoad);
}

sqsum = _mm256_reduce_add_ps(_fsqsum);

float tmp = 0.f;
for (int i = remainsize; i < size; i++)
{
tmp = ptr[i] - mean;
sqsum += tmp * tmp;
}

// var
float var = sqsum / size;
float a, b;
__m256 _a, _b;

if (affine)
{
float gamma = gamma_data[q];
float beta = beta_data[q];

a = static_cast<float>(gamma / (sqrt(var + eps)));
b = -mean * a + beta;

_a = _mm256_set1_ps(a);
_b = _mm256_set1_ps(b);
}
else
{
a = static_cast<float>(1.f / (sqrt(var + eps)));
b = -mean * a;

_a = _mm256_set1_ps(a);
_b = _mm256_set1_ps(b);
}

for (int i = 0; i < ssize; i++)
{
_fLoad = _mm256_loadu_ps(ptr + (i * 8));
_fLoad = _mm256_mul_ps(_fLoad, _a);
_fLoad = _mm256_add_ps(_fLoad, _b);

_mm256_storeu_ps(ptr + (i * 8), _fLoad);
}
for (int i = remainsize; i < size; i++)
{
ptr[i] = ptr[i] * a + b;
}
}
return 0;
#endif // __AVX__
// ---- SSE2 path: 4 floats per vector (same 3-pass scheme as above) ----
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < c; q++)
{
float* ptr = bottom_top_blob.channel(q);
int ssize = size / 4;
int remainsize = ssize * 4;

__m128 _fLoad;

// mean
float sum = 0.f;
float sqsum = 0.f;

__m128 _fsum = _mm_setzero_ps();

for (int i = 0; i < ssize; i++)
{
// NOTE(review): aligned load here vs unaligned loadu in the AVX/AVX512
// paths — assumes channel base pointers are 16-byte aligned; confirm.
_fLoad = _mm_load_ps(ptr + (i * 4));
_fsum = _mm_add_ps(_fsum, _fLoad);
}

// _mm_reduce_add_ps is not a standard intrinsic — presumably a ncnn
// helper (x86_usability.h); confirm it is in scope for this TU.
sum = _mm_reduce_add_ps(_fsum);

for (int i = remainsize; i < size; i++)
sum += ptr[i];

float mean = sum / size;
__m128 _mean = _mm_set1_ps(mean);
__m128 _fsqsum = _mm_setzero_ps();

for (int i = 0; i < ssize; i++)
{
_fLoad = _mm_load_ps(ptr + (i * 4));
_fLoad = _mm_sub_ps(_fLoad, _mean);
_fLoad = _mm_mul_ps(_fLoad, _fLoad);
_fsqsum = _mm_add_ps(_fsqsum, _fLoad);
}

sqsum = _mm_reduce_add_ps(_fsqsum);

float tmp = 0.f;
for (int i = remainsize; i < size; i++)
{
tmp = ptr[i] - mean;
sqsum += tmp * tmp;
}

// var
float var = sqsum / size;
float a, b;
__m128 _a, _b;

if (affine)
{
float gamma = gamma_data[q];
float beta = beta_data[q];

a = static_cast<float>(gamma / (sqrt(var + eps)));
b = -mean * a + beta;

_a = _mm_set1_ps(a);
_b = _mm_set1_ps(b);
}
else
{
a = static_cast<float>(1.f / (sqrt(var + eps)));
b = -mean * a;

_a = _mm_set1_ps(a);
_b = _mm_set1_ps(b);
}

for (int i = 0; i < ssize; i++)
{
_fLoad = _mm_load_ps(ptr + (i * 4));
_fLoad = _mm_mul_ps(_fLoad, _a);
_fLoad = _mm_add_ps(_fLoad, _b);

_mm_store_ps(ptr + (i * 4), _fLoad);
}
for (int i = remainsize; i < size; i++)
{
ptr[i] = ptr[i] * a + b;
}
}
return 0;
#endif // __SSE2__

// No SSE2 available: fall back to the scalar reference implementation.
return InstanceNorm::forward_inplace(bottom_top_blob, opt);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

merge into one
refer clip_x86.cpp

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Get it! Thanks for the guidance

@LRY89757
Copy link
Copy Markdown
Contributor Author

missing avx/avx512 optimization for pack4 and avx512 optimization for pack8 ?

If so, does the x86 part of batchnorm also need further optimization? @nihui

@nihui
Copy link
Copy Markdown
Member

nihui commented Jul 24, 2022

missing avx/avx512 optimization for pack4 and avx512 optimization for pack8 ?

If so, does the x86 part of batchnorm also need further optimization? @nihui

You could merge the multiple elempack codepath in batchnorm

@LRY89757
Copy link
Copy Markdown
Contributor Author

missing avx/avx512 optimization for pack4 and avx512 optimization for pack8 ?

If so, does the x86 part of batchnorm also need further optimization? @nihui

You could merge the multiple elempack codepath in batchnorm

Ok, I will try to merge the elempack into one

@LRY89757 LRY89757 closed this Jul 26, 2022
@LRY89757 LRY89757 reopened this Jul 26, 2022
@LRY89757 LRY89757 closed this Jul 26, 2022
@LRY89757 LRY89757 reopened this Jul 26, 2022
@LRY89757 LRY89757 changed the title [InstanceNorm Optimize x86] AVX512/AVX/SSE intrinsic [InstanceNorm Optimize x86] AVX512/AVX/SSE intrinsic with elempack merged Aug 4, 2022
@LRY89757 LRY89757 closed this Aug 5, 2022
@LRY89757 LRY89757 reopened this Aug 5, 2022
@LRY89757 LRY89757 closed this Aug 6, 2022
@LRY89757 LRY89757 reopened this Aug 6, 2022
@tencent-adm
Copy link
Copy Markdown
Member

tencent-adm commented Apr 18, 2025

CLA assistant check
Thank you for your submission, we really appreciate it. Like many open source projects, we ask that you all sign our Contributor License Agreement before we can accept your contribution.
0 out of 2 committers have signed the CLA.

❌ LRY89757
❌ nihui
You have signed the CLA already but the status is still pending? Let us recheck it.

@github-actions
Copy link
Copy Markdown

The binary size change of libncnn.so (bytes)

architecture base size pr size difference
x86_64 16465128 16490096 +24968 ⚠️
armhf 7335212 7335212 0 😘
aarch64 10704240 10704240 0 😘

@LRY89757
Copy link
Copy Markdown
Contributor Author

LRY89757 commented Apr 18, 2025

Thanks nihui! The age of this pr is 3 years haha

@nihui
Copy link
Copy Markdown
Member

nihui commented Sep 12, 2025

close for #6097

@nihui nihui closed this Sep 12, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants