Skip to content

[InstanceNorm Optimize x86] AVX512/AVX/SSE intrinsic with elempack merged#4062

Closed
LRY89757 wants to merge 16 commits into Tencent:master from
LRY89757:instancenorm
Closed

[InstanceNorm Optimize x86] AVX512/AVX/SSE intrinsic with elempack merged#4062
LRY89757 wants to merge 16 commits into Tencent:master from
LRY89757:instancenorm

Conversation

@LRY89757
Copy link
Copy Markdown
Contributor

  • Add the avx512/avx/sse intrinsic for instancenorm

@LRY89757 LRY89757 closed this Jul 21, 2022
@LRY89757 LRY89757 reopened this Jul 21, 2022
@codecov-commenter
Copy link
Copy Markdown

codecov-commenter commented Jul 21, 2022

Codecov Report

All modified and coverable lines are covered by tests ✅

Project coverage is 95.13%. Comparing base (8363040) to head (888f55b).
Report is 3 commits behind head on master.

Additional details and impacted files
@@             Coverage Diff             @@
##           master    #4062       +/-   ##
===========================================
- Coverage   95.16%   95.13%    -0.04%     
===========================================
  Files         743      409      -334     
  Lines      252233    99630   -152603     
===========================================
- Hits       240043    94782   -145261     
+ Misses      12190     4848     -7342     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@LRY89757 LRY89757 closed this Jul 21, 2022
@LRY89757 LRY89757 reopened this Jul 21, 2022
Copy link
Copy Markdown
Member

@nihui nihui left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

missing avx/avx512 optimization for pack4 and avx512 optimization for pack8 ?

Comment on lines +228 to +493
// Vectorized InstanceNorm body (fragment of forward_inplace).
// For each channel q, three passes over the `size` elements of the channel:
//   1) sum            -> mean = sum / size
//   2) sum of (x-mean)^2 -> var = sqsum / size
//   3) in-place normalize: x = x * a + b, where
//        a = gamma / sqrt(var + eps), b = -mean * a + beta   (affine)
//        a = 1     / sqrt(var + eps), b = -mean * a          (otherwise)
// The widest compiled-in ISA wins: AVX512 (16 lanes), else AVX (8 lanes),
// else SSE2 (4 lanes).  Each path ends with `return 0;`, so the narrower
// paths below it are compiled but unreachable in that configuration.
// Each path handles the scalar tail [remainsize, size) separately.
#if __SSE2__
#if __AVX__
#if __AVX512F__
// ---- AVX512 path: 16 floats per vector ----
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < c; q++)
{
float* ptr = bottom_top_blob.channel(q);
// ssize: count of full 16-lane vectors; remainsize: start of scalar tail
int ssize = size / 16;
int remainsize = ssize * 16;

__m512 _fLoad;

// mean
float sum = 0.f;
float sqsum = 0.f;

__m512 _fsum = _mm512_setzero_ps();

// pass 1: vector-accumulate the channel sum
for (int i = 0; i < ssize; i++)
{
_fLoad = _mm512_loadu_ps(ptr + (i * 16));
_fsum = _mm512_add_ps(_fsum, _fLoad);
}

// horizontal reduction of the 16 partial sums
sum = _mm512_reduce_add_ps(_fsum);

// scalar tail of pass 1
for (int i = remainsize; i < size; i++)
sum += ptr[i];

float mean = sum / size;
__m512 _mean = _mm512_set1_ps(mean);
__m512 _fsqsum = _mm512_setzero_ps();

// pass 2: vector-accumulate sum of squared deviations from the mean
for (int i = 0; i < ssize; i++)
{
_fLoad = _mm512_loadu_ps(ptr + (i * 16));
_fLoad = _mm512_sub_ps(_fLoad, _mean);
_fLoad = _mm512_mul_ps(_fLoad, _fLoad);
_fsqsum = _mm512_add_ps(_fsqsum, _fLoad);
}

sqsum = _mm512_reduce_add_ps(_fsqsum);

// scalar tail of pass 2
float tmp = 0.f;
for (int i = remainsize; i < size; i++)
{
tmp = ptr[i] - mean;
sqsum += tmp * tmp;
}

// var
// NOTE: biased variance (divide by size, not size-1), matching the
// scalar InstanceNorm reference implementation.
float var = sqsum / size;
float a, b;
__m512 _a, _b;

if (affine)
{
float gamma = gamma_data[q];
float beta = beta_data[q];

// fold normalization + affine into a single fused multiply-add: x*a + b
a = static_cast<float>(gamma / (sqrt(var + eps)));
b = -mean * a + beta;

_a = _mm512_set1_ps(a);
_b = _mm512_set1_ps(b);
}
else
{
a = static_cast<float>(1.f / (sqrt(var + eps)));
b = -mean * a;

_a = _mm512_set1_ps(a);
_b = _mm512_set1_ps(b);
}

// pass 3: normalize in place
for (int i = 0; i < ssize; i++)
{
_fLoad = _mm512_loadu_ps(ptr + (i * 16));
_fLoad = _mm512_mul_ps(_fLoad, _a);
_fLoad = _mm512_add_ps(_fLoad, _b);

_mm512_storeu_ps(ptr + (i * 16), _fLoad);
}
// scalar tail of pass 3
for (int i = remainsize; i < size; i++)
{
ptr[i] = ptr[i] * a + b;
}
}
return 0;
#endif // __AVX512F__
// ---- AVX path: 8 floats per vector (same 3-pass scheme as above) ----
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < c; q++)
{
float* ptr = bottom_top_blob.channel(q);
int ssize = size / 8;
int remainsize = ssize * 8;

__m256 _fLoad;

// mean
float sum = 0.f;
float sqsum = 0.f;

__m256 _fsum = _mm256_setzero_ps();

for (int i = 0; i < ssize; i++)
{
_fLoad = _mm256_loadu_ps(ptr + (i * 8));
_fsum = _mm256_add_ps(_fsum, _fLoad);
}

// _mm256_reduce_add_ps is not a standard intrinsic — presumably a ncnn
// helper (x86_usability.h); confirm it is in scope for this TU.
sum = _mm256_reduce_add_ps(_fsum);

for (int i = remainsize; i < size; i++)
sum += ptr[i];

float mean = sum / size;
__m256 _mean = _mm256_set1_ps(mean);
__m256 _fsqsum = _mm256_setzero_ps();

for (int i = 0; i < ssize; i++)
{
_fLoad = _mm256_loadu_ps(ptr + (i * 8));
_fLoad = _mm256_sub_ps(_fLoad, _mean);
_fLoad = _mm256_mul_ps(_fLoad, _fLoad);
_fsqsum = _mm256_add_ps(_fsqsum, _fLoad);
}

sqsum = _mm256_reduce_add_ps(_fsqsum);

float tmp = 0.f;
for (int i = remainsize; i < size; i++)
{
tmp = ptr[i] - mean;
sqsum += tmp * tmp;
}

// var
float var = sqsum / size;
float a, b;
__m256 _a, _b;

if (affine)
{
float gamma = gamma_data[q];
float beta = beta_data[q];

a = static_cast<float>(gamma / (sqrt(var + eps)));
b = -mean * a + beta;

_a = _mm256_set1_ps(a);
_b = _mm256_set1_ps(b);
}
else
{
a = static_cast<float>(1.f / (sqrt(var + eps)));
b = -mean * a;

_a = _mm256_set1_ps(a);
_b = _mm256_set1_ps(b);
}

for (int i = 0; i < ssize; i++)
{
_fLoad = _mm256_loadu_ps(ptr + (i * 8));
_fLoad = _mm256_mul_ps(_fLoad, _a);
_fLoad = _mm256_add_ps(_fLoad, _b);

_mm256_storeu_ps(ptr + (i * 8), _fLoad);
}
for (int i = remainsize; i < size; i++)
{
ptr[i] = ptr[i] * a + b;
}
}
return 0;
#endif // __AVX__
// ---- SSE2 path: 4 floats per vector (same 3-pass scheme as above) ----
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < c; q++)
{
float* ptr = bottom_top_blob.channel(q);
int ssize = size / 4;
int remainsize = ssize * 4;

__m128 _fLoad;

// mean
float sum = 0.f;
float sqsum = 0.f;

__m128 _fsum = _mm_setzero_ps();

for (int i = 0; i < ssize; i++)
{
// NOTE(review): aligned load here vs unaligned loadu in the AVX/AVX512
// paths — assumes channel base pointers are 16-byte aligned; confirm.
_fLoad = _mm_load_ps(ptr + (i * 4));
_fsum = _mm_add_ps(_fsum, _fLoad);
}

// _mm_reduce_add_ps is not a standard intrinsic — presumably a ncnn
// helper (x86_usability.h); confirm it is in scope for this TU.
sum = _mm_reduce_add_ps(_fsum);

for (int i = remainsize; i < size; i++)
sum += ptr[i];

float mean = sum / size;
__m128 _mean = _mm_set1_ps(mean);
__m128 _fsqsum = _mm_setzero_ps();

for (int i = 0; i < ssize; i++)
{
_fLoad = _mm_load_ps(ptr + (i * 4));
_fLoad = _mm_sub_ps(_fLoad, _mean);
_fLoad = _mm_mul_ps(_fLoad, _fLoad);
_fsqsum = _mm_add_ps(_fsqsum, _fLoad);
}

sqsum = _mm_reduce_add_ps(_fsqsum);

float tmp = 0.f;
for (int i = remainsize; i < size; i++)
{
tmp = ptr[i] - mean;
sqsum += tmp * tmp;
}

// var
float var = sqsum / size;
float a, b;
__m128 _a, _b;

if (affine)
{
float gamma = gamma_data[q];
float beta = beta_data[q];

a = static_cast<float>(gamma / (sqrt(var + eps)));
b = -mean * a + beta;

_a = _mm_set1_ps(a);
_b = _mm_set1_ps(b);
}
else
{
a = static_cast<float>(1.f / (sqrt(var + eps)));
b = -mean * a;

_a = _mm_set1_ps(a);
_b = _mm_set1_ps(b);
}

for (int i = 0; i < ssize; i++)
{
_fLoad = _mm_load_ps(ptr + (i * 4));
_fLoad = _mm_mul_ps(_fLoad, _a);
_fLoad = _mm_add_ps(_fLoad, _b);

_mm_store_ps(ptr + (i * 4), _fLoad);
}
for (int i = remainsize; i < size; i++)
{
ptr[i] = ptr[i] * a + b;
}
}
return 0;
#endif // __SSE2__

// No SSE2 available: fall back to the scalar reference implementation.
return InstanceNorm::forward_inplace(bottom_top_blob, opt);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

merge into one
refer clip_x86.cpp

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Get it! Thanks for the guidance

@LRY89757
Copy link
Copy Markdown
Contributor Author

missing avx/avx512 optimization for pack4 and avx512 optimization for pack8 ?

If so, does the x86 part of batchnorm also need further optimization? @nihui

@nihui
Copy link
Copy Markdown
Member

nihui commented Jul 24, 2022

missing avx/avx512 optimization for pack4 and avx512 optimization for pack8 ?

If so, does the x86 part of batchnorm also need further optimization? @nihui

You could merge the multiple elempack codepath in batchnorm

@LRY89757
Copy link
Copy Markdown
Contributor Author

missing avx/avx512 optimization for pack4 and avx512 optimization for pack8 ?

If so, does the x86 part of batchnorm also need further optimization? @nihui

You could merge the multiple elempack codepath in batchnorm

Ok, I will try to merge the elempack into one

@LRY89757 LRY89757 closed this Jul 26, 2022
@LRY89757 LRY89757 reopened this Jul 26, 2022
@LRY89757 LRY89757 closed this Jul 26, 2022
@LRY89757 LRY89757 reopened this Jul 26, 2022
@LRY89757 LRY89757 changed the title [InstanceNorm Optimize x86] AVX512/AVX/SSE intrinsic [InstanceNorm Optimize x86] AVX512/AVX/SSE intrinsic with elempack merged Aug 4, 2022
@LRY89757 LRY89757 closed this Aug 5, 2022
@LRY89757 LRY89757 reopened this Aug 5, 2022
@LRY89757 LRY89757 closed this Aug 6, 2022
@LRY89757 LRY89757 reopened this Aug 6, 2022
@tencent-adm
Copy link
Copy Markdown
Member

tencent-adm commented Apr 18, 2025

CLA assistant check
Thank you for your submission, we really appreciate it. Like many open source projects, we ask that you all sign our Contributor License Agreement before we can accept your contribution.
0 out of 2 committers have signed the CLA.

❌ LRY89757
❌ nihui
You have signed the CLA already but the status is still pending? Let us recheck it.

@github-actions
Copy link
Copy Markdown

The binary size change of libncnn.so (bytes)

architecture base size pr size difference
x86_64 16465128 16490096 +24968 ⚠️
armhf 7335212 7335212 0 😘
aarch64 10704240 10704240 0 😘

@LRY89757
Copy link
Copy Markdown
Contributor Author

LRY89757 commented Apr 18, 2025

Thanks nihui! The age of this pr is 3 years haha

@nihui
Copy link
Copy Markdown
Member

nihui commented Sep 12, 2025

close for #6097

@nihui nihui closed this Sep 12, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants