Skip to content

Commit bb64db9

Browse files
authored
Further optimization of Conv2D, fused Conv_Add_Activation, bring latest code from ficus OpConv.fx. (#22401)
1 parent 67fa8a2 commit bb64db9

12 files changed

Lines changed: 1202 additions & 913 deletions

modules/dnn/include/opencv2/dnn/all_layers.hpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -256,6 +256,9 @@ CV__DNN_INLINE_NS_BEGIN
256256
{
257257
public:
258258
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
259+
bool fusedActivation = false;
260+
bool fusedAdd = false;
261+
bool isConv2D = false; // Should be deleted after fastconv branch support Conv1D and Conv3D.
259262
};
260263

261264
class CV_EXPORTS ConvolutionLayerInt8 : public BaseConvolutionLayer

modules/dnn/src/dnn_common.hpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
namespace cv { namespace dnn {
1414
CV__DNN_INLINE_NS_BEGIN
1515
#define IS_DNN_OPENCL_TARGET(id) (id == DNN_TARGET_OPENCL || id == DNN_TARGET_OPENCL_FP16)
16+
#define IS_DNN_CPU_TARGET(id) (id == DNN_TARGET_CPU) // TODO: add DNN_TARGET_CPU_FP16
1617
Mutex& getInitializationMutex();
1718
void initializeLayerFactory();
1819

modules/dnn/src/layers/convolution_layer.cpp

Lines changed: 17 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -118,6 +118,9 @@ class BaseConvolutionLayerImpl : public ConvolutionLayer
118118

119119
fusedWeights = false;
120120
fusedBias = false;
121+
122+
if (kernel_size.size() == 2)
123+
isConv2D = true;
121124
}
122125

123126
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
@@ -188,6 +191,9 @@ class BaseConvolutionLayerImpl : public ConvolutionLayer
188191

189192
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
190193
{
194+
if (fusedAdd) // If the Conv layer has fused Add layer, it cannot fuse other layers.
195+
return false;
196+
191197
Ptr<BlankLayer> blank_layer = top.dynamicCast<BlankLayer>();
192198
if (blank_layer)
193199
return true;
@@ -260,7 +266,6 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
260266
std::vector<float> reluslope;
261267
Ptr<ActivationLayer> activ;
262268

263-
Mat fastWeights; // Used to store weight params. It will be used for layer fusion and without memory alignment.
264269
Ptr<FastConv2d> fastConv2dImpl;
265270

266271
#ifdef HAVE_OPENCL
@@ -438,7 +443,6 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
438443
wm.copyTo(wm_aligned);
439444
wm = wm_aligned;
440445
}
441-
fastWeights = blobs[0].reshape(1, numOutput);
442446
weightsMat = wm;
443447
}
444448
else
@@ -584,11 +588,15 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
584588
}
585589
}
586590
#endif
587-
return !activ.empty();
591+
fusedActivation = !activ.empty();
592+
return fusedActivation;
588593
}
589594

590595
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
591596
{
597+
if (fusedAdd) // If the Conv layer has fused Add layer, it cannot fuse other layers.
598+
return false;
599+
592600
#ifdef HAVE_CUDA
593601
if(IS_DNN_CUDA_TARGET(preferableTarget))
594602
{
@@ -634,26 +642,14 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
634642
if (weightsMat.data == blobs[0].data)
635643
weightsMat = weightsMat.clone();
636644

637-
// If fastWeights is the same as weightsMat, we don't need to allocate more space for fastWeights.
638-
bool sameFastWeights = false;
639-
if (fastWeights.step1() == weightsMat.step1()) // If weightsMat is realigned, it is not the same as fastWeights.
640-
sameFastWeights = true;
641-
642-
if (!sameFastWeights && fastWeights.data == blobs[0].data)
643-
fastWeights = fastWeights.clone();
644-
645645
Mat originWeights = blobs[0].reshape(1, outCn);
646646
for (int i = 0; i < outCn; ++i)
647647
{
648648
double wi = w.at<float>(i);
649649
weightsMultipliers[i] *= wi;
650650
cv::multiply(originWeights.row(i), weightsMultipliers[i], weightsMat.row(i));
651-
if (!sameFastWeights)
652-
cv::multiply(originWeights.row(i), weightsMultipliers[i], fastWeights.row(i));
653651
biasvec[i] *= wi;
654652
}
655-
if (sameFastWeights)
656-
fastWeights = weightsMat;
657653
}
658654

659655
if (!b.empty())
@@ -1970,9 +1966,6 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
19701966
if (blobs.empty())
19711967
{
19721968
variableWeight = true;
1973-
if (fastWeights.data != inputs[1].data)
1974-
fastWeights = inputs[1].clone();
1975-
19761969
Mat wm = inputs[1].reshape(1, outCn);
19771970
if (wm.data != weightsMat.data)
19781971
{
@@ -2089,7 +2082,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
20892082
{
20902083
int nstripes = std::max(getNumThreads(), 1);
20912084

2092-
// Initialization of FastCovn2d
2085+
// Initialization of FastCovn2d, pack weight.
20932086
if ((!fastConv2dImpl || variableWeight) && inputs[0].dims == 4)
20942087
{
20952088
int K = outputs[0].size[1];
@@ -2103,23 +2096,22 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
21032096

21042097
int dilation_h = dilations[dilations.size() - 2];
21052098
int dilation_w = dilations.back();
2106-
float* weightsPtr = fastWeights.ptr<float>();
2107-
CV_Assert(weightsPtr);
21082099

2109-
fastConv2dImpl = initFastConv2d(ngroups, K, C, Hk, Wk, stride_w, stride_h,
2110-
dilation_w, dilation_h, pads_begin, pads_end, weightsPtr, &biasvec[0]);
2100+
fastConv2dImpl = initFastConv2d(ngroups, K, C, Hk, Wk, stride_w, stride_h, dilation_w,
2101+
dilation_h, pads_begin, pads_end, weightsMat, &biasvec[0]);
21112102
}
21122103

21132104
if (fastConv2dImpl)
21142105
{
2115-
runFastConv2d(inputs[0], outputs[0], fastConv2dImpl, nstripes, activ);
2106+
runFastConv2d(inputs[0], outputs[0], fastConv2dImpl, nstripes, activ, fusedAdd);
21162107
return;
21172108
}
21182109

2110+
//TODO: Add support of Conv1D and Conv3D to fastConv, and remove the old Conv branch.
21192111
// Use only for Conv1D and Conv3D.
2112+
CV_Assert(!fusedAdd);
21202113
ParallelConv::run(inputs[0], outputs[0], weightsMat, biasvec, reluslope,
21212114
kernel_size, strides, pads_begin, pads_end, dilations, activ.get(), ngroups, nstripes);
2122-
21232115
}
21242116
}
21252117

modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp

Lines changed: 45 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -9,67 +9,67 @@ namespace cv {
99
namespace opt_AVX2
1010
{
1111
#if CV_TRY_AVX2
12-
void convBlock_AVX2(int k, const float *a, const float *b,
13-
float *c, int ldc, const float *bias,
14-
float minval, float maxval, bool ifActiv)
12+
void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
1513
{
16-
#if FAST_CONV_MR == 4 && FAST_CONV_NR == 24
17-
__m256 vminval = _mm256_set1_ps(minval), vmaxval = _mm256_set1_ps(maxval);
18-
__m256 c0 = _mm256_set1_ps(bias[0]), c1 = c0, c2 = c0;
19-
__m256 c3 = _mm256_set1_ps(bias[1]), c4 = c3, c5 = c3;
20-
__m256 c6 = _mm256_set1_ps(bias[2]), c7 = c6, c8 = c6;
21-
__m256 c9 = _mm256_set1_ps(bias[3]), c10 = c9, c11 = c9;
14+
#if CONV_MR == 4 && CONV_NR == 24
15+
__m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00;
16+
__m256 c10 = c00, c11 = c00, c12 = c00;
17+
__m256 c20 = c00, c21 = c00, c22 = c00;
18+
__m256 c30 = c00, c31 = c00, c32 = c00;
2219

2320
__m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps();
2421
__m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps();
2522

26-
for (int p = 0; p < k; p++, a += FAST_CONV_MR, b += FAST_CONV_NR)
23+
for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
2724
{
2825
a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
2926
b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16);
3027

31-
c0 = _mm256_fmadd_ps(b0, a0, c0);
32-
c1 = _mm256_fmadd_ps(b1, a0, c1);
33-
c2 = _mm256_fmadd_ps(b2, a0, c2);
28+
c00 = _mm256_fmadd_ps(b0, a0, c00);
29+
c01 = _mm256_fmadd_ps(b1, a0, c01);
30+
c02 = _mm256_fmadd_ps(b2, a0, c02);
3431

35-
c3 = _mm256_fmadd_ps(b0, a1, c3);
36-
a0 = _mm256_set1_ps(a[2]);
37-
c4 = _mm256_fmadd_ps(b1, a1, c4);
38-
c5 = _mm256_fmadd_ps(b2, a1, c5);
32+
c10 = _mm256_fmadd_ps(b0, a1, c10);
33+
c11 = _mm256_fmadd_ps(b1, a1, c11);
34+
c12 = _mm256_fmadd_ps(b2, a1, c12);
3935

40-
c6 = _mm256_fmadd_ps(b0, a0, c6);
41-
a1 = _mm256_set1_ps(a[3]);
42-
c7 = _mm256_fmadd_ps(b1, a0, c7);
43-
c8 = _mm256_fmadd_ps(b2, a0, c8);
36+
a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]);
4437

45-
c9 = _mm256_fmadd_ps(b0, a1, c9);
46-
c10 = _mm256_fmadd_ps(b1, a1, c10);
47-
c11 = _mm256_fmadd_ps(b2, a1, c11);
38+
c20 = _mm256_fmadd_ps(b0, a0, c20);
39+
c21 = _mm256_fmadd_ps(b1, a0, c21);
40+
c22 = _mm256_fmadd_ps(b2, a0, c22);
41+
42+
c30 = _mm256_fmadd_ps(b0, a1, c30);
43+
c31 = _mm256_fmadd_ps(b1, a1, c31);
44+
c32 = _mm256_fmadd_ps(b2, a1, c32);
4845
}
4946

50-
if (ifActiv)
47+
if (!init_c)
5148
{
52-
c0 = _mm256_min_ps(_mm256_max_ps(c0, vminval), vmaxval);
53-
c1 = _mm256_min_ps(_mm256_max_ps(c1, vminval), vmaxval);
54-
c2 = _mm256_min_ps(_mm256_max_ps(c2, vminval), vmaxval);
55-
c3 = _mm256_min_ps(_mm256_max_ps(c3, vminval), vmaxval);
56-
c4 = _mm256_min_ps(_mm256_max_ps(c4, vminval), vmaxval);
57-
c5 = _mm256_min_ps(_mm256_max_ps(c5, vminval), vmaxval);
58-
c6 = _mm256_min_ps(_mm256_max_ps(c6, vminval), vmaxval);
59-
c7 = _mm256_min_ps(_mm256_max_ps(c7, vminval), vmaxval);
60-
c8 = _mm256_min_ps(_mm256_max_ps(c8, vminval), vmaxval);
61-
c9 = _mm256_min_ps(_mm256_max_ps(c9, vminval), vmaxval);
62-
c10 = _mm256_min_ps(_mm256_max_ps(c10, vminval), vmaxval);
63-
c11 = _mm256_min_ps(_mm256_max_ps(c11, vminval), vmaxval);
49+
c00 = _mm256_add_ps(c00, _mm256_load_ps(c));
50+
c01 = _mm256_add_ps(c01, _mm256_load_ps(c + 8));
51+
c02 = _mm256_add_ps(c02, _mm256_load_ps(c + 16));
52+
53+
c10 = _mm256_add_ps(c10, _mm256_load_ps(c + ldc));
54+
c11 = _mm256_add_ps(c11, _mm256_load_ps(c + ldc + 8));
55+
c12 = _mm256_add_ps(c12, _mm256_load_ps(c + ldc + 16));
56+
57+
c20 = _mm256_add_ps(c20, _mm256_load_ps(c + ldc*2));
58+
c21 = _mm256_add_ps(c21, _mm256_load_ps(c + ldc*2 + 8));
59+
c22 = _mm256_add_ps(c22, _mm256_load_ps(c + ldc*2 + 16));
60+
61+
c30 = _mm256_add_ps(c30, _mm256_load_ps(c + ldc*3));
62+
c31 = _mm256_add_ps(c31, _mm256_load_ps(c + ldc*3 + 8));
63+
c32 = _mm256_add_ps(c32, _mm256_load_ps(c + ldc*3 + 16));
6464
}
6565

66-
_mm256_storeu_ps(c, c0); _mm256_storeu_ps(c+8, c1); _mm256_storeu_ps(c+16, c2);
67-
_mm256_storeu_ps(c + ldc, c3); _mm256_storeu_ps(c + ldc + 8, c4); _mm256_storeu_ps(c + ldc + 16, c5);
68-
_mm256_storeu_ps(c + ldc*2, c6); _mm256_storeu_ps(c + ldc*2 + 8, c7); _mm256_storeu_ps(c + ldc*2 + 16, c8);
69-
_mm256_storeu_ps(c + ldc*3, c9); _mm256_storeu_ps(c + ldc*3 + 8, c10); _mm256_storeu_ps(c + ldc*3 + 16, c11);
66+
_mm256_storeu_ps(c, c00), _mm256_storeu_ps(c+8, c01), _mm256_storeu_ps(c+16, c02);
67+
_mm256_storeu_ps(c + ldc, c10), _mm256_storeu_ps(c + ldc + 8, c11), _mm256_storeu_ps(c + ldc + 16, c12);
68+
_mm256_storeu_ps(c + ldc*2, c20), _mm256_storeu_ps(c + ldc*2 + 8, c21), _mm256_storeu_ps(c + ldc*2 + 16, c22);
69+
_mm256_storeu_ps(c + ldc*3, c30), _mm256_storeu_ps(c + ldc*3 + 8, c31), _mm256_storeu_ps(c + ldc*3 + 16, c32);
7070
_mm256_zeroupper();
7171
#else
72-
#error "unsupported FAST_CONV_MR and/or FAST_CONV_NR in convBlock_AVX2."
72+
#error "unsupported CONV_MR and/or CONV_NR in convBlock_AVX2."
7373
#endif
7474
}
7575

@@ -78,7 +78,6 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
7878
int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
7979
int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
8080
{
81-
const int VECSZ = 8;
8281
__m256 vminval = _mm256_set1_ps(minval);
8382
__m256 vmaxval = _mm256_set1_ps(maxval);
8483

@@ -175,7 +174,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
175174
{
176175
if (dy0 == 3)
177176
{
178-
for (; x0 <= x1 - VECSZ; x0 += VECSZ)
177+
for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
179178
{
180179
int xi_ = x0 * stride_x - pad_left;
181180
const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -251,7 +250,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
251250
}
252251
else
253252
{
254-
for (; x0 <= x1 - VECSZ; x0 += VECSZ)
253+
for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
255254
{
256255
int xi_ = x0 * stride_x - pad_left;
257256
const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -277,7 +276,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
277276
}
278277
else
279278
{
280-
for (; x0 <= x1 - VECSZ; x0 += VECSZ)
279+
for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
281280
{
282281
int xi_ = x0 * stride_x - pad_left, k = 0;
283282
const float *inptr_xi = inptr + Wi * yi_ + xi_;

0 commit comments

Comments (0)