Skip to content

Commit fe12daa

Browse files
committed
Optimize Conv2D
1 parent 6e99106 commit fe12daa

File tree

1 file changed

+107
-54
lines changed

1 file changed

+107
-54
lines changed

modules/dnn/src/layers/convolution_layer.cpp

Lines changed: 107 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -612,12 +612,23 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
612612
p.ofstab_.resize(karea * ncn);
613613
int* ofstab = &p.ofstab_[0];
614614

615-
for( int k = 0; k < ncn; k++ )
616-
for (int k_d = 0; k_d < kernel_d; k_d++)
615+
if (isConv2D)
616+
{
617+
for( int k = 0; k < ncn; k++ )
617618
for( int k_r = 0; k_r < kernel_h; k_r++ )
618619
for( int k_c = 0; k_c < kernel_w; k_c++ )
619-
ofstab[(k*kernel_d*kernel_h + k_d*kernel_h + k_r)*kernel_w + k_c] =
620-
(k*depth*height + k_d*dil_d*height + k_r*dil_h)*width + k_c*dil_w;
620+
ofstab[(k*kernel_h + k_r)*kernel_w + k_c] =
621+
(k*height + k_r*dil_h)*width + k_c*dil_w;
622+
}
623+
else
624+
{
625+
for( int k = 0; k < ncn; k++ )
626+
for (int k_d = 0; k_d < kernel_d; k_d++)
627+
for( int k_r = 0; k_r < kernel_h; k_r++ )
628+
for( int k_c = 0; k_c < kernel_w; k_c++ )
629+
ofstab[(k*kernel_d*kernel_h + k_d*kernel_h + k_r)*kernel_w + k_c] =
630+
(k*depth*height + k_d*dil_d*height + k_r*dil_h)*width + k_c*dil_w;
631+
}
621632

622633
p.biasvec_ = &biasvec;
623634
p.reluslope_ = &reluslope;
@@ -739,75 +750,117 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
739750
// do im2row for a part of input tensor
740751
float* rowbuf = rowbuf0;
741752

742-
for( ofs = ofs0; ofs < ofs1; out_d += (out_i + 1) / outH, out_i = (out_i + 1) % outH, out_j = 0 )
753+
if (isConv2D)
743754
{
744-
int delta = std::min(ofs1 - ofs, outW - out_j);
745-
int out_j1 = out_j + delta;
755+
for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
756+
{
757+
int delta = std::min(ofs1 - ofs, outW - out_j);
758+
int out_j1 = out_j + delta;
746759

747-
int in_d = out_d * stride_d - pad_d;
748-
int in_i = out_i * stride_h - pad_t;
749-
int in_j = out_j * stride_w - pad_l;
750-
const float* imgptr = data_inp0 + (cn0*depth*height + in_d*height + in_i)*width + in_j;
751-
ofs += delta;
760+
int in_i = out_i * stride_h - pad_t;
761+
int in_j = out_j * stride_w - pad_l;
762+
const float* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
763+
ofs += delta;
752764

753-
// do im2row for a part of input tensor
754-
if( is1x1 )
755-
{
756-
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w )
765+
// do im2row for a part of input tensor
766+
if( is1x1 )
767+
{
768+
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w )
769+
{
770+
for( k = 0; k < vsz; k++ )
771+
rowbuf[k] = imgptr[k*inpPlaneSize];
772+
}
773+
}
774+
else
757775
{
758-
for( k = 0; k < vsz; k++ )
759-
rowbuf[k] = imgptr[k*inpPlaneSize];
776+
bool ok_i = 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h;
777+
int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
778+
int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
779+
780+
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
781+
{
782+
// this condition should be true for most of the tensor elements, i.e.
783+
// most of the time the kernel aperture is inside the tensor X-Y plane.
784+
if( ok_i && out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w )
785+
{
786+
for( k = 0; k < vsz; k++ )
787+
{
788+
int k1 = ofstab[k];
789+
float v0 = imgptr[k1];
790+
float v1 = imgptr[k1 + stride_w];
791+
rowbuf[k] = v0;
792+
rowbuf[k+vsz_a] = v1;
793+
}
794+
out_j++;
795+
rowbuf += vsz_a;
796+
imgptr += stride_w;
797+
in_j += stride_w;
798+
}
799+
else
800+
{
801+
int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
802+
int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
803+
804+
// here some non-continuous sub-row of the row will not be
805+
// filled from the tensor; we need to make sure that the uncovered
806+
// elements are explicitly set to 0's. the easiest way is to
807+
// set all the elements to 0's before the loop.
808+
memset(rowbuf, 0, vsz*sizeof(rowbuf[0]));
809+
for( k = 0; k < ncn; k++ )
810+
{
811+
for( i = i0; i < i1; i++ )
812+
{
813+
for( j = j0; j < j1; j++ )
814+
{
815+
int imgofs = k*(width*height) + i*(dilation_h*width) + j*dilation_w;
816+
rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
817+
}
818+
}
819+
}
820+
}
821+
}
760822
}
761823
}
762-
else
824+
}
825+
else
826+
{
827+
for( ofs = ofs0; ofs < ofs1; out_d += (out_i + 1) / outH, out_i = (out_i + 1) % outH, out_j = 0 )
763828
{
829+
int delta = std::min(ofs1 - ofs, outW - out_j);
830+
int out_j1 = out_j + delta;
831+
832+
int in_d = out_d * stride_d - pad_d;
833+
int in_i = out_i * stride_h - pad_t;
834+
int in_j = out_j * stride_w - pad_l;
835+
const float* imgptr = data_inp0 + (cn0*depth*height + in_d*height + in_i)*width + in_j;
836+
ofs += delta;
837+
764838
int d0 = std::max(0, (-in_d + dilation_d - 1) / dilation_d);
765839
int d1 = std::min(kernel_d, (depth - in_d + dilation_d - 1) / dilation_d);
766840

767-
bool ok_i = 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h;
768841
int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
769842
int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
770843

771844
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
772845
{
773-
// this condition should be true for most of the tensor elements, i.e.
774-
// most of the time the kernel aperture is inside the tensor X-Y plane.
775-
if( isConv2D && ok_i && out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w )
846+
int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
847+
int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
848+
849+
// here some non-continuous sub-row of the row will not be
850+
// filled from the tensor; we need to make sure that the uncovered
851+
// elements are explicitly set to 0's. the easiest way is to
852+
// set all the elements to 0's before the loop.
853+
memset(rowbuf, 0, vsz*sizeof(rowbuf[0]));
854+
for( k = 0; k < ncn; k++ )
776855
{
777-
for( k = 0; k < vsz; k++ )
856+
for ( d = d0; d < d1; d++)
778857
{
779-
int k1 = ofstab[k];
780-
float v0 = imgptr[k1];
781-
float v1 = imgptr[k1 + stride_w];
782-
rowbuf[k] = v0;
783-
rowbuf[k+vsz_a] = v1;
784-
}
785-
out_j++;
786-
rowbuf += vsz_a;
787-
imgptr += stride_w;
788-
in_j += stride_w;
789-
}
790-
else
791-
{
792-
int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
793-
int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
794-
795-
// here some non-continuous sub-row of the row will not be
796-
// filled from the tensor; we need to make sure that the uncovered
797-
// elements are explicitly set to 0's. the easiest way is to
798-
// set all the elements to 0's before the loop.
799-
memset(rowbuf, 0, vsz*sizeof(rowbuf[0]));
800-
for( k = 0; k < ncn; k++ )
801-
{
802-
for ( d = d0; d < d1; d++)
858+
for( i = i0; i < i1; i++ )
803859
{
804-
for( i = i0; i < i1; i++ )
860+
for( j = j0; j < j1; j++ )
805861
{
806-
for( j = j0; j < j1; j++ )
807-
{
808-
int imgofs = k*(depth*width*height) + d*dilation_d*width*height + i*(dilation_h*width) + j*dilation_w;
809-
rowbuf[(k*kernel_d*kernel_h + d*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
810-
}
862+
int imgofs = k*(depth*width*height) + d*dilation_d*width*height + i*(dilation_h*width) + j*dilation_w;
863+
rowbuf[(k*kernel_d*kernel_h + d*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
811864
}
812865
}
813866
}

0 commit comments

Comments
 (0)