[InstanceNorm Optimize x86] AVX512/AVX/SSE intrinsic with elempack merged #4062
[InstanceNorm Optimize x86] AVX512/AVX/SSE intrinsic with elempack merged #4062 — LRY89757 wants to merge 16 commits into Tencent:master from
Conversation
LRY89757
commented
Jul 21, 2022
- Add the avx512/avx/sse intrinsic for instancenorm
Codecov ReportAll modified and coverable lines are covered by tests ✅
Additional details and impacted files@@ Coverage Diff @@
## master #4062 +/- ##
===========================================
- Coverage 95.16% 95.13% -0.04%
===========================================
Files 743 409 -334
Lines 252233 99630 -152603
===========================================
- Hits 240043 94782 -145261
+ Misses 12190 4848 -7342 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
nihui
left a comment
There was a problem hiding this comment.
missing avx/avx512 optimization for pack4 and avx512 optimization for pack8 ?
src/layer/x86/instancenorm_x86.cpp
Outdated
| #if __SSE2__ | ||
| #if __AVX__ | ||
| #if __AVX512F__ | ||
| #pragma omp parallel for num_threads(opt.num_threads) | ||
| for (int q = 0; q < c; q++) | ||
| { | ||
| float* ptr = bottom_top_blob.channel(q); | ||
| int ssize = size / 16; | ||
| int remainsize = ssize * 16; | ||
|
|
||
| __m512 _fLoad; | ||
|
|
||
| // mean | ||
| float sum = 0.f; | ||
| float sqsum = 0.f; | ||
|
|
||
| __m512 _fsum = _mm512_setzero_ps(); | ||
|
|
||
| for (int i = 0; i < ssize; i++) | ||
| { | ||
| _fLoad = _mm512_loadu_ps(ptr + (i * 16)); | ||
| _fsum = _mm512_add_ps(_fsum, _fLoad); | ||
| } | ||
|
|
||
| sum = _mm512_reduce_add_ps(_fsum); | ||
|
|
||
| for (int i = remainsize; i < size; i++) | ||
| sum += ptr[i]; | ||
|
|
||
| float mean = sum / size; | ||
| __m512 _mean = _mm512_set1_ps(mean); | ||
| __m512 _fsqsum = _mm512_setzero_ps(); | ||
|
|
||
| for (int i = 0; i < ssize; i++) | ||
| { | ||
| _fLoad = _mm512_loadu_ps(ptr + (i * 16)); | ||
| _fLoad = _mm512_sub_ps(_fLoad, _mean); | ||
| _fLoad = _mm512_mul_ps(_fLoad, _fLoad); | ||
| _fsqsum = _mm512_add_ps(_fsqsum, _fLoad); | ||
| } | ||
|
|
||
| sqsum = _mm512_reduce_add_ps(_fsqsum); | ||
|
|
||
| float tmp = 0.f; | ||
| for (int i = remainsize; i < size; i++) | ||
| { | ||
| tmp = ptr[i] - mean; | ||
| sqsum += tmp * tmp; | ||
| } | ||
|
|
||
| // var | ||
| float var = sqsum / size; | ||
| float a, b; | ||
| __m512 _a, _b; | ||
|
|
||
| if (affine) | ||
| { | ||
| float gamma = gamma_data[q]; | ||
| float beta = beta_data[q]; | ||
|
|
||
| a = static_cast<float>(gamma / (sqrt(var + eps))); | ||
| b = -mean * a + beta; | ||
|
|
||
| _a = _mm512_set1_ps(a); | ||
| _b = _mm512_set1_ps(b); | ||
| } | ||
| else | ||
| { | ||
| a = static_cast<float>(1.f / (sqrt(var + eps))); | ||
| b = -mean * a; | ||
|
|
||
| _a = _mm512_set1_ps(a); | ||
| _b = _mm512_set1_ps(b); | ||
| } | ||
|
|
||
| for (int i = 0; i < ssize; i++) | ||
| { | ||
| _fLoad = _mm512_loadu_ps(ptr + (i * 16)); | ||
| _fLoad = _mm512_mul_ps(_fLoad, _a); | ||
| _fLoad = _mm512_add_ps(_fLoad, _b); | ||
|
|
||
| _mm512_storeu_ps(ptr + (i * 16), _fLoad); | ||
| } | ||
| for (int i = remainsize; i < size; i++) | ||
| { | ||
| ptr[i] = ptr[i] * a + b; | ||
| } | ||
| } | ||
| return 0; | ||
| #endif // __AVX512F__ | ||
| #pragma omp parallel for num_threads(opt.num_threads) | ||
| for (int q = 0; q < c; q++) | ||
| { | ||
| float* ptr = bottom_top_blob.channel(q); | ||
| int ssize = size / 8; | ||
| int remainsize = ssize * 8; | ||
|
|
||
| __m256 _fLoad; | ||
|
|
||
| // mean | ||
| float sum = 0.f; | ||
| float sqsum = 0.f; | ||
|
|
||
| __m256 _fsum = _mm256_setzero_ps(); | ||
|
|
||
| for (int i = 0; i < ssize; i++) | ||
| { | ||
| _fLoad = _mm256_loadu_ps(ptr + (i * 8)); | ||
| _fsum = _mm256_add_ps(_fsum, _fLoad); | ||
| } | ||
|
|
||
| sum = _mm256_reduce_add_ps(_fsum); | ||
|
|
||
| for (int i = remainsize; i < size; i++) | ||
| sum += ptr[i]; | ||
|
|
||
| float mean = sum / size; | ||
| __m256 _mean = _mm256_set1_ps(mean); | ||
| __m256 _fsqsum = _mm256_setzero_ps(); | ||
|
|
||
| for (int i = 0; i < ssize; i++) | ||
| { | ||
| _fLoad = _mm256_loadu_ps(ptr + (i * 8)); | ||
| _fLoad = _mm256_sub_ps(_fLoad, _mean); | ||
| _fLoad = _mm256_mul_ps(_fLoad, _fLoad); | ||
| _fsqsum = _mm256_add_ps(_fsqsum, _fLoad); | ||
| } | ||
|
|
||
| sqsum = _mm256_reduce_add_ps(_fsqsum); | ||
|
|
||
| float tmp = 0.f; | ||
| for (int i = remainsize; i < size; i++) | ||
| { | ||
| tmp = ptr[i] - mean; | ||
| sqsum += tmp * tmp; | ||
| } | ||
|
|
||
| // var | ||
| float var = sqsum / size; | ||
| float a, b; | ||
| __m256 _a, _b; | ||
|
|
||
| if (affine) | ||
| { | ||
| float gamma = gamma_data[q]; | ||
| float beta = beta_data[q]; | ||
|
|
||
| a = static_cast<float>(gamma / (sqrt(var + eps))); | ||
| b = -mean * a + beta; | ||
|
|
||
| _a = _mm256_set1_ps(a); | ||
| _b = _mm256_set1_ps(b); | ||
| } | ||
| else | ||
| { | ||
| a = static_cast<float>(1.f / (sqrt(var + eps))); | ||
| b = -mean * a; | ||
|
|
||
| _a = _mm256_set1_ps(a); | ||
| _b = _mm256_set1_ps(b); | ||
| } | ||
|
|
||
| for (int i = 0; i < ssize; i++) | ||
| { | ||
| _fLoad = _mm256_loadu_ps(ptr + (i * 8)); | ||
| _fLoad = _mm256_mul_ps(_fLoad, _a); | ||
| _fLoad = _mm256_add_ps(_fLoad, _b); | ||
|
|
||
| _mm256_storeu_ps(ptr + (i * 8), _fLoad); | ||
| } | ||
| for (int i = remainsize; i < size; i++) | ||
| { | ||
| ptr[i] = ptr[i] * a + b; | ||
| } | ||
| } | ||
| return 0; | ||
| #endif // __AVX__ | ||
| #pragma omp parallel for num_threads(opt.num_threads) | ||
| for (int q = 0; q < c; q++) | ||
| { | ||
| float* ptr = bottom_top_blob.channel(q); | ||
| int ssize = size / 4; | ||
| int remainsize = ssize * 4; | ||
|
|
||
| __m128 _fLoad; | ||
|
|
||
| // mean | ||
| float sum = 0.f; | ||
| float sqsum = 0.f; | ||
|
|
||
| __m128 _fsum = _mm_setzero_ps(); | ||
|
|
||
| for (int i = 0; i < ssize; i++) | ||
| { | ||
| _fLoad = _mm_load_ps(ptr + (i * 4)); | ||
| _fsum = _mm_add_ps(_fsum, _fLoad); | ||
| } | ||
|
|
||
| sum = _mm_reduce_add_ps(_fsum); | ||
|
|
||
| for (int i = remainsize; i < size; i++) | ||
| sum += ptr[i]; | ||
|
|
||
| float mean = sum / size; | ||
| __m128 _mean = _mm_set1_ps(mean); | ||
| __m128 _fsqsum = _mm_setzero_ps(); | ||
|
|
||
| for (int i = 0; i < ssize; i++) | ||
| { | ||
| _fLoad = _mm_load_ps(ptr + (i * 4)); | ||
| _fLoad = _mm_sub_ps(_fLoad, _mean); | ||
| _fLoad = _mm_mul_ps(_fLoad, _fLoad); | ||
| _fsqsum = _mm_add_ps(_fsqsum, _fLoad); | ||
| } | ||
|
|
||
| sqsum = _mm_reduce_add_ps(_fsqsum); | ||
|
|
||
| float tmp = 0.f; | ||
| for (int i = remainsize; i < size; i++) | ||
| { | ||
| tmp = ptr[i] - mean; | ||
| sqsum += tmp * tmp; | ||
| } | ||
|
|
||
| // var | ||
| float var = sqsum / size; | ||
| float a, b; | ||
| __m128 _a, _b; | ||
|
|
||
| if (affine) | ||
| { | ||
| float gamma = gamma_data[q]; | ||
| float beta = beta_data[q]; | ||
|
|
||
| a = static_cast<float>(gamma / (sqrt(var + eps))); | ||
| b = -mean * a + beta; | ||
|
|
||
| _a = _mm_set1_ps(a); | ||
| _b = _mm_set1_ps(b); | ||
| } | ||
| else | ||
| { | ||
| a = static_cast<float>(1.f / (sqrt(var + eps))); | ||
| b = -mean * a; | ||
|
|
||
| _a = _mm_set1_ps(a); | ||
| _b = _mm_set1_ps(b); | ||
| } | ||
|
|
||
| for (int i = 0; i < ssize; i++) | ||
| { | ||
| _fLoad = _mm_load_ps(ptr + (i * 4)); | ||
| _fLoad = _mm_mul_ps(_fLoad, _a); | ||
| _fLoad = _mm_add_ps(_fLoad, _b); | ||
|
|
||
| _mm_store_ps(ptr + (i * 4), _fLoad); | ||
| } | ||
| for (int i = remainsize; i < size; i++) | ||
| { | ||
| ptr[i] = ptr[i] * a + b; | ||
| } | ||
| } | ||
| return 0; | ||
| #endif // __SSE2__ | ||
|
|
||
| return InstanceNorm::forward_inplace(bottom_top_blob, opt); |
There was a problem hiding this comment.
merge into one
refer clip_x86.cpp
There was a problem hiding this comment.
Got it! Thanks for the guidance
If so, does the x86 part of batchnorm also need further optimization? @nihui |
You could merge the multiple elempack codepath in batchnorm |
Ok, I will try to merge the elempack into one |
|
|
|
The binary size change of libncnn.so (bytes)
|
|
Thanks nihui! The age of this pr is 3 years haha |
|
close for #6097 |