Skip to content

Commit bb64db9

Browse files
authored
Further optimization of Conv2D, fused Conv_Add_Activation, bring latest code from ficus OpConv.fx. (#22401)
1 parent 67fa8a2 commit bb64db9

12 files changed

Lines changed: 1202 additions & 913 deletions

modules/dnn/include/opencv2/dnn/all_layers.hpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -256,6 +256,9 @@ CV__DNN_INLINE_NS_BEGIN
256256
{
257257
public:
258258
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
259+
bool fusedActivation = false;
260+
bool fusedAdd = false;
261+
bool isConv2D = false; // Should be deleted after fastconv branch support Conv1D and Conv3D.
259262
};
260263

261264
class CV_EXPORTS ConvolutionLayerInt8 : public BaseConvolutionLayer

modules/dnn/src/dnn_common.hpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
namespace cv { namespace dnn {
1414
CV__DNN_INLINE_NS_BEGIN
1515
#define IS_DNN_OPENCL_TARGET(id) (id == DNN_TARGET_OPENCL || id == DNN_TARGET_OPENCL_FP16)
16+
#define IS_DNN_CPU_TARGET(id) (id == DNN_TARGET_CPU) // TODO: add DNN_TARGET_CPU_FP16
1617
Mutex& getInitializationMutex();
1718
void initializeLayerFactory();
1819

modules/dnn/src/layers/convolution_layer.cpp

Lines changed: 17 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -118,6 +118,9 @@ class BaseConvolutionLayerImpl : public ConvolutionLayer
118118

119119
fusedWeights = false;
120120
fusedBias = false;
121+
122+
if (kernel_size.size() == 2)
123+
isConv2D = true;
121124
}
122125

123126
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
@@ -188,6 +191,9 @@ class BaseConvolutionLayerImpl : public ConvolutionLayer
188191

189192
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
190193
{
194+
if (fusedAdd) // If the Conv layer has fused Add layer, it cannot fuse other layers.
195+
return false;
196+
191197
Ptr<BlankLayer> blank_layer = top.dynamicCast<BlankLayer>();
192198
if (blank_layer)
193199
return true;
@@ -260,7 +266,6 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
260266
std::vector<float> reluslope;
261267
Ptr<ActivationLayer> activ;
262268

263-
Mat fastWeights; // Used to store weight params. It will be used for layer fusion and without memory alignment.
264269
Ptr<FastConv2d> fastConv2dImpl;
265270

266271
#ifdef HAVE_OPENCL
@@ -438,7 +443,6 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
438443
wm.copyTo(wm_aligned);
439444
wm = wm_aligned;
440445
}
441-
fastWeights = blobs[0].reshape(1, numOutput);
442446
weightsMat = wm;
443447
}
444448
else
@@ -584,11 +588,15 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
584588
}
585589
}
586590
#endif
587-
return !activ.empty();
591+
fusedActivation = !activ.empty();
592+
return fusedActivation;
588593
}
589594

590595
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
591596
{
597+
if (fusedAdd) // If the Conv layer has fused Add layer, it cannot fuse other layers.
598+
return false;
599+
592600
#ifdef HAVE_CUDA
593601
if(IS_DNN_CUDA_TARGET(preferableTarget))
594602
{
@@ -634,26 +642,14 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
634642
if (weightsMat.data == blobs[0].data)
635643
weightsMat = weightsMat.clone();
636644

637-
// If fastWeights is the same as weightsMat, we don't need to allocate more space for fastWeights.
638-
bool sameFastWeights = false;
639-
if (fastWeights.step1() == weightsMat.step1()) // If weightsMat is realigned, it is not the same as fastWeights.
640-
sameFastWeights = true;
641-
642-
if (!sameFastWeights && fastWeights.data == blobs[0].data)
643-
fastWeights = fastWeights.clone();
644-
645645
Mat originWeights = blobs[0].reshape(1, outCn);
646646
for (int i = 0; i < outCn; ++i)
647647
{
648648
double wi = w.at<float>(i);
649649
weightsMultipliers[i] *= wi;
650650
cv::multiply(originWeights.row(i), weightsMultipliers[i], weightsMat.row(i));
651-
if (!sameFastWeights)
652-
cv::multiply(originWeights.row(i), weightsMultipliers[i], fastWeights.row(i));
653651
biasvec[i] *= wi;
654652
}
655-
if (sameFastWeights)
656-
fastWeights = weightsMat;
657653
}
658654

659655
if (!b.empty())
@@ -1970,9 +1966,6 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
19701966
if (blobs.empty())
19711967
{
19721968
variableWeight = true;
1973-
if (fastWeights.data != inputs[1].data)
1974-
fastWeights = inputs[1].clone();
1975-
19761969
Mat wm = inputs[1].reshape(1, outCn);
19771970
if (wm.data != weightsMat.data)
19781971
{
@@ -2089,7 +2082,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
20892082
{
20902083
int nstripes = std::max(getNumThreads(), 1);
20912084

2092-
// Initialization of FastCovn2d
2085+
// Initialization of FastCovn2d, pack weight.
20932086
if ((!fastConv2dImpl || variableWeight) && inputs[0].dims == 4)
20942087
{
20952088
int K = outputs[0].size[1];
@@ -2103,23 +2096,22 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
21032096

21042097
int dilation_h = dilations[dilations.size() - 2];
21052098
int dilation_w = dilations.back();
2106-
float* weightsPtr = fastWeights.ptr<float>();
2107-
CV_Assert(weightsPtr);
21082099

2109-
fastConv2dImpl = initFastConv2d(ngroups, K, C, Hk, Wk, stride_w, stride_h,
2110-
dilation_w, dilation_h, pads_begin, pads_end, weightsPtr, &biasvec[0]);
2100+
fastConv2dImpl = initFastConv2d(ngroups, K, C, Hk, Wk, stride_w, stride_h, dilation_w,
2101+
dilation_h, pads_begin, pads_end, weightsMat, &biasvec[0]);
21112102
}
21122103

21132104
if (fastConv2dImpl)
21142105
{
2115-
runFastConv2d(inputs[0], outputs[0], fastConv2dImpl, nstripes, activ);
2106+
runFastConv2d(inputs[0], outputs[0], fastConv2dImpl, nstripes, activ, fusedAdd);
21162107
return;
21172108
}
21182109

2110+
//TODO: Add support of Conv1D and Conv3D to fastConv, and remove the old Conv branch.
21192111
// Use only for Conv1D and Conv3D.
2112+
CV_Assert(!fusedAdd);
21202113
ParallelConv::run(inputs[0], outputs[0], weightsMat, biasvec, reluslope,
21212114
kernel_size, strides, pads_begin, pads_end, dilations, activ.get(), ngroups, nstripes);
2122-
21232115
}
21242116
}
21252117

modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp

Lines changed: 45 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -9,67 +9,67 @@ namespace cv {
99
namespace opt_AVX2
1010
{
1111
#if CV_TRY_AVX2
12-
void convBlock_AVX2(int k, const float *a, const float *b,
13-
float *c, int ldc, const float *bias,
14-
float minval, float maxval, bool ifActiv)
12+
void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
1513
{
16-
#if FAST_CONV_MR == 4 && FAST_CONV_NR == 24
17-
__m256 vminval = _mm256_set1_ps(minval), vmaxval = _mm256_set1_ps(maxval);
18-
__m256 c0 = _mm256_set1_ps(bias[0]), c1 = c0, c2 = c0;
19-
__m256 c3 = _mm256_set1_ps(bias[1]), c4 = c3, c5 = c3;
20-
__m256 c6 = _mm256_set1_ps(bias[2]), c7 = c6, c8 = c6;
21-
__m256 c9 = _mm256_set1_ps(bias[3]), c10 = c9, c11 = c9;
14+
#if CONV_MR == 4 && CONV_NR == 24
15+
__m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00;
16+
__m256 c10 = c00, c11 = c00, c12 = c00;
17+
__m256 c20 = c00, c21 = c00, c22 = c00;
18+
__m256 c30 = c00, c31 = c00, c32 = c00;
2219

2320
__m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps();
2421
__m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps();
2522

26-
for (int p = 0; p < k; p++, a += FAST_CONV_MR, b += FAST_CONV_NR)
23+
for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
2724
{
2825
a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
2926
b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16);
3027

31-
c0 = _mm256_fmadd_ps(b0, a0, c0);
32-
c1 = _mm256_fmadd_ps(b1, a0, c1);
33-
c2 = _mm256_fmadd_ps(b2, a0, c2);
28+
c00 = _mm256_fmadd_ps(b0, a0, c00);
29+
c01 = _mm256_fmadd_ps(b1, a0, c01);
30+
c02 = _mm256_fmadd_ps(b2, a0, c02);
3431

35-
c3 = _mm256_fmadd_ps(b0, a1, c3);
36-
a0 = _mm256_set1_ps(a[2]);
37-
c4 = _mm256_fmadd_ps(b1, a1, c4);
38-
c5 = _mm256_fmadd_ps(b2, a1, c5);
32+
c10 = _mm256_fmadd_ps(b0, a1, c10);
33+
c11 = _mm256_fmadd_ps(b1, a1, c11);
34+
c12 = _mm256_fmadd_ps(b2, a1, c12);
3935

40-
c6 = _mm256_fmadd_ps(b0, a0, c6);
41-
a1 = _mm256_set1_ps(a[3]);
42-
c7 = _mm256_fmadd_ps(b1, a0, c7);
43-
c8 = _mm256_fmadd_ps(b2, a0, c8);
36+
a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]);
4437

45-
c9 = _mm256_fmadd_ps(b0, a1, c9);
46-
c10 = _mm256_fmadd_ps(b1, a1, c10);
47-
c11 = _mm256_fmadd_ps(b2, a1, c11);
38+
c20 = _mm256_fmadd_ps(b0, a0, c20);
39+
c21 = _mm256_fmadd_ps(b1, a0, c21);
40+
c22 = _mm256_fmadd_ps(b2, a0, c22);
41+
42+
c30 = _mm256_fmadd_ps(b0, a1, c30);
43+
c31 = _mm256_fmadd_ps(b1, a1, c31);
44+
c32 = _mm256_fmadd_ps(b2, a1, c32);
4845
}
4946

50-
if (ifActiv)
47+
if (!init_c)
5148
{
52-
c0 = _mm256_min_ps(_mm256_max_ps(c0, vminval), vmaxval);
53-
c1 = _mm256_min_ps(_mm256_max_ps(c1, vminval), vmaxval);
54-
c2 = _mm256_min_ps(_mm256_max_ps(c2, vminval), vmaxval);
55-
c3 = _mm256_min_ps(_mm256_max_ps(c3, vminval), vmaxval);
56-
c4 = _mm256_min_ps(_mm256_max_ps(c4, vminval), vmaxval);
57-
c5 = _mm256_min_ps(_mm256_max_ps(c5, vminval), vmaxval);
58-
c6 = _mm256_min_ps(_mm256_max_ps(c6, vminval), vmaxval);
59-
c7 = _mm256_min_ps(_mm256_max_ps(c7, vminval), vmaxval);
60-
c8 = _mm256_min_ps(_mm256_max_ps(c8, vminval), vmaxval);
61-
c9 = _mm256_min_ps(_mm256_max_ps(c9, vminval), vmaxval);
62-
c10 = _mm256_min_ps(_mm256_max_ps(c10, vminval), vmaxval);
63-
c11 = _mm256_min_ps(_mm256_max_ps(c11, vminval), vmaxval);
49+
c00 = _mm256_add_ps(c00, _mm256_load_ps(c));
50+
c01 = _mm256_add_ps(c01, _mm256_load_ps(c + 8));
51+
c02 = _mm256_add_ps(c02, _mm256_load_ps(c + 16));
52+
53+
c10 = _mm256_add_ps(c10, _mm256_load_ps(c + ldc));
54+
c11 = _mm256_add_ps(c11, _mm256_load_ps(c + ldc + 8));
55+
c12 = _mm256_add_ps(c12, _mm256_load_ps(c + ldc + 16));
56+
57+
c20 = _mm256_add_ps(c20, _mm256_load_ps(c + ldc*2));
58+
c21 = _mm256_add_ps(c21, _mm256_load_ps(c + ldc*2 + 8));
59+
c22 = _mm256_add_ps(c22, _mm256_load_ps(c + ldc*2 + 16));
60+
61+
c30 = _mm256_add_ps(c30, _mm256_load_ps(c + ldc*3));
62+
c31 = _mm256_add_ps(c31, _mm256_load_ps(c + ldc*3 + 8));
63+
c32 = _mm256_add_ps(c32, _mm256_load_ps(c + ldc*3 + 16));
6464
}
6565

66-
_mm256_storeu_ps(c, c0); _mm256_storeu_ps(c+8, c1); _mm256_storeu_ps(c+16, c2);
67-
_mm256_storeu_ps(c + ldc, c3); _mm256_storeu_ps(c + ldc + 8, c4); _mm256_storeu_ps(c + ldc + 16, c5);
68-
_mm256_storeu_ps(c + ldc*2, c6); _mm256_storeu_ps(c + ldc*2 + 8, c7); _mm256_storeu_ps(c + ldc*2 + 16, c8);
69-
_mm256_storeu_ps(c + ldc*3, c9); _mm256_storeu_ps(c + ldc*3 + 8, c10); _mm256_storeu_ps(c + ldc*3 + 16, c11);
66+
_mm256_storeu_ps(c, c00), _mm256_storeu_ps(c+8, c01), _mm256_storeu_ps(c+16, c02);
67+
_mm256_storeu_ps(c + ldc, c10), _mm256_storeu_ps(c + ldc + 8, c11), _mm256_storeu_ps(c + ldc + 16, c12);
68+
_mm256_storeu_ps(c + ldc*2, c20), _mm256_storeu_ps(c + ldc*2 + 8, c21), _mm256_storeu_ps(c + ldc*2 + 16, c22);
69+
_mm256_storeu_ps(c + ldc*3, c30), _mm256_storeu_ps(c + ldc*3 + 8, c31), _mm256_storeu_ps(c + ldc*3 + 16, c32);
7070
_mm256_zeroupper();
7171
#else
72-
#error "unsupported FAST_CONV_MR and/or FAST_CONV_NR in convBlock_AVX2."
72+
#error "unsupported CONV_MR and/or CONV_NR in convBlock_AVX2."
7373
#endif
7474
}
7575

@@ -78,7 +78,6 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
7878
int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
7979
int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
8080
{
81-
const int VECSZ = 8;
8281
__m256 vminval = _mm256_set1_ps(minval);
8382
__m256 vmaxval = _mm256_set1_ps(maxval);
8483

@@ -175,7 +174,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
175174
{
176175
if (dy0 == 3)
177176
{
178-
for (; x0 <= x1 - VECSZ; x0 += VECSZ)
177+
for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
179178
{
180179
int xi_ = x0 * stride_x - pad_left;
181180
const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -251,7 +250,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
251250
}
252251
else
253252
{
254-
for (; x0 <= x1 - VECSZ; x0 += VECSZ)
253+
for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
255254
{
256255
int xi_ = x0 * stride_x - pad_left;
257256
const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -277,7 +276,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
277276
}
278277
else
279278
{
280-
for (; x0 <= x1 - VECSZ; x0 += VECSZ)
279+
for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
281280
{
282281
int xi_ = x0 * stride_x - pad_left, k = 0;
283282
const float *inptr_xi = inptr + Wi * yi_ + xi_;

0 commit comments

Comments (0)