@@ -612,12 +612,23 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
612612 p.ofstab_ .resize (karea * ncn);
613613 int * ofstab = &p.ofstab_ [0 ];
614614
615- for ( int k = 0 ; k < ncn; k++ )
616- for (int k_d = 0 ; k_d < kernel_d; k_d++)
615+ if (isConv2D)
616+ {
617+ for ( int k = 0 ; k < ncn; k++ )
617618 for ( int k_r = 0 ; k_r < kernel_h; k_r++ )
618619 for ( int k_c = 0 ; k_c < kernel_w; k_c++ )
619- ofstab[(k*kernel_d*kernel_h + k_d*kernel_h + k_r)*kernel_w + k_c] =
620- (k*depth*height + k_d*dil_d*height + k_r*dil_h)*width + k_c*dil_w;
620+ ofstab[(k*kernel_h + k_r)*kernel_w + k_c] =
621+ (k*height + k_r*dil_h)*width + k_c*dil_w;
622+ }
623+ else
624+ {
625+ for ( int k = 0 ; k < ncn; k++ )
626+ for (int k_d = 0 ; k_d < kernel_d; k_d++)
627+ for ( int k_r = 0 ; k_r < kernel_h; k_r++ )
628+ for ( int k_c = 0 ; k_c < kernel_w; k_c++ )
629+ ofstab[(k*kernel_d*kernel_h + k_d*kernel_h + k_r)*kernel_w + k_c] =
630+ (k*depth*height + k_d*dil_d*height + k_r*dil_h)*width + k_c*dil_w;
631+ }
621632
622633 p.biasvec_ = &biasvec;
623634 p.reluslope_ = &reluslope;
@@ -739,75 +750,117 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
739750 // do im2row for a part of input tensor
740751 float * rowbuf = rowbuf0;
741752
742- for ( ofs = ofs0; ofs < ofs1; out_d += (out_i + 1 ) / outH, out_i = (out_i + 1 ) % outH, out_j = 0 )
753+ if (isConv2D )
743754 {
744- int delta = std::min (ofs1 - ofs, outW - out_j);
745- int out_j1 = out_j + delta;
755+ for ( ofs = ofs0; ofs < ofs1; out_j = 0 , ++out_i )
756+ {
757+ int delta = std::min (ofs1 - ofs, outW - out_j);
758+ int out_j1 = out_j + delta;
746759
747- int in_d = out_d * stride_d - pad_d;
748- int in_i = out_i * stride_h - pad_t ;
749- int in_j = out_j * stride_w - pad_l;
750- const float * imgptr = data_inp0 + (cn0*depth*height + in_d*height + in_i)*width + in_j;
751- ofs += delta;
760+ int in_i = out_i * stride_h - pad_t ;
761+ int in_j = out_j * stride_w - pad_l;
762+ const float * imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
763+ ofs += delta;
752764
753- // do im2row for a part of input tensor
754- if ( is1x1 )
755- {
756- for ( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w )
765+ // do im2row for a part of input tensor
766+ if ( is1x1 )
767+ {
768+ for ( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w )
769+ {
770+ for ( k = 0 ; k < vsz; k++ )
771+ rowbuf[k] = imgptr[k*inpPlaneSize];
772+ }
773+ }
774+ else
757775 {
758- for ( k = 0 ; k < vsz; k++ )
759- rowbuf[k] = imgptr[k*inpPlaneSize];
776+ bool ok_i = 0 <= in_i && in_i < height - (kernel_h-1 )*dilation_h;
777+ int i0 = std::max (0 , (-in_i + dilation_h-1 )/dilation_h);
778+ int i1 = std::min (kernel_h, (height - in_i + dilation_h-1 )/dilation_h);
779+
780+ for ( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
781+ {
782+ // this condition should be true for most of the tensor elements, i.e.
783+ // most of the time the kernel aperture is inside the tensor X-Y plane.
784+ if ( ok_i && out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1 )*dilation_w )
785+ {
786+ for ( k = 0 ; k < vsz; k++ )
787+ {
788+ int k1 = ofstab[k];
789+ float v0 = imgptr[k1];
790+ float v1 = imgptr[k1 + stride_w];
791+ rowbuf[k] = v0;
792+ rowbuf[k+vsz_a] = v1;
793+ }
794+ out_j++;
795+ rowbuf += vsz_a;
796+ imgptr += stride_w;
797+ in_j += stride_w;
798+ }
799+ else
800+ {
801+ int j0 = std::max (0 , (-in_j + dilation_w-1 )/dilation_w);
802+ int j1 = std::min (kernel_w, (width - in_j + dilation_w-1 )/dilation_w);
803+
804+ // here some non-continuous sub-row of the row will not be
805+ // filled from the tensor; we need to make sure that the uncovered
806+ // elements are explicitly set to 0's. the easiest way is to
807+ // set all the elements to 0's before the loop.
808+ memset (rowbuf, 0 , vsz*sizeof (rowbuf[0 ]));
809+ for ( k = 0 ; k < ncn; k++ )
810+ {
811+ for ( i = i0; i < i1; i++ )
812+ {
813+ for ( j = j0; j < j1; j++ )
814+ {
815+ int imgofs = k*(width*height) + i*(dilation_h*width) + j*dilation_w;
816+ rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
817+ }
818+ }
819+ }
820+ }
821+ }
760822 }
761823 }
762- else
824+ }
825+ else
826+ {
827+ for ( ofs = ofs0; ofs < ofs1; out_d += (out_i + 1 ) / outH, out_i = (out_i + 1 ) % outH, out_j = 0 )
763828 {
829+ int delta = std::min (ofs1 - ofs, outW - out_j);
830+ int out_j1 = out_j + delta;
831+
832+ int in_d = out_d * stride_d - pad_d;
833+ int in_i = out_i * stride_h - pad_t ;
834+ int in_j = out_j * stride_w - pad_l;
835+ const float * imgptr = data_inp0 + (cn0*depth*height + in_d*height + in_i)*width + in_j;
836+ ofs += delta;
837+
764838 int d0 = std::max (0 , (-in_d + dilation_d - 1 ) / dilation_d);
765839 int d1 = std::min (kernel_d, (depth - in_d + dilation_d - 1 ) / dilation_d);
766840
767- bool ok_i = 0 <= in_i && in_i < height - (kernel_h-1 )*dilation_h;
768841 int i0 = std::max (0 , (-in_i + dilation_h-1 )/dilation_h);
769842 int i1 = std::min (kernel_h, (height - in_i + dilation_h-1 )/dilation_h);
770843
771844 for ( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
772845 {
773- // this condition should be true for most of the tensor elements, i.e.
774- // most of the time the kernel aperture is inside the tensor X-Y plane.
775- if ( isConv2D && ok_i && out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1 )*dilation_w )
846+ int j0 = std::max (0 , (-in_j + dilation_w-1 )/dilation_w);
847+ int j1 = std::min (kernel_w, (width - in_j + dilation_w-1 )/dilation_w);
848+
849+ // here some non-continuous sub-row of the row will not be
850+ // filled from the tensor; we need to make sure that the uncovered
851+ // elements are explicitly set to 0's. the easiest way is to
852+ // set all the elements to 0's before the loop.
853+ memset (rowbuf, 0 , vsz*sizeof (rowbuf[0 ]));
854+ for ( k = 0 ; k < ncn; k++ )
776855 {
777- for ( k = 0 ; k < vsz; k++ )
856+ for ( d = d0; d < d1; d++ )
778857 {
779- int k1 = ofstab[k];
780- float v0 = imgptr[k1];
781- float v1 = imgptr[k1 + stride_w];
782- rowbuf[k] = v0;
783- rowbuf[k+vsz_a] = v1;
784- }
785- out_j++;
786- rowbuf += vsz_a;
787- imgptr += stride_w;
788- in_j += stride_w;
789- }
790- else
791- {
792- int j0 = std::max (0 , (-in_j + dilation_w-1 )/dilation_w);
793- int j1 = std::min (kernel_w, (width - in_j + dilation_w-1 )/dilation_w);
794-
795- // here some non-continuous sub-row of the row will not be
796- // filled from the tensor; we need to make sure that the uncovered
797- // elements are explicitly set to 0's. the easiest way is to
798- // set all the elements to 0's before the loop.
799- memset (rowbuf, 0 , vsz*sizeof (rowbuf[0 ]));
800- for ( k = 0 ; k < ncn; k++ )
801- {
802- for ( d = d0; d < d1; d++)
858+ for ( i = i0; i < i1; i++ )
803859 {
804- for ( i = i0; i < i1; i ++ )
860+ for ( j = j0; j < j1; j ++ )
805861 {
806- for ( j = j0; j < j1; j++ )
807- {
808- int imgofs = k*(depth*width*height) + d*dilation_d*width*height + i*(dilation_h*width) + j*dilation_w;
809- rowbuf[(k*kernel_d*kernel_h + d*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
810- }
862+ int imgofs = k*(depth*width*height) + d*dilation_d*width*height + i*(dilation_h*width) + j*dilation_w;
863+ rowbuf[(k*kernel_d*kernel_h + d*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
811864 }
812865 }
813866 }
0 commit comments