Gaussian filter padding updates with QA support#517
Gaussian filter padding updates with QA support#517r-abishek merged 16 commits intor-abishek:ar/opt_gaussian_filter_qa_f32from
Conversation
|
@RooseweltMcW please update the branch with latest develop changes |
| rpp_load16_u8_to_f32_avx(srcPtrTemp[k], &pRow[k * 2]); | ||
| for (int k = rowKernelLoopLimit * 2; k < 10; k += 2) | ||
| pRow[k] = pRow[k + 1] = avx_p0; | ||
| const int radius = 5 - rowKernelLoopLimit; |
There was a problem hiding this comment.
what is radius here
| pRow[k] = pRow[k + 1] = avx_p0; | ||
| const int radius = 7 - rowKernelLoopLimit; | ||
| int centerRowOffset = padIndex ? radius : 0; // The offset tells us where the center row is located within srcPtrTemp | ||
| for (int k = 0; k < 7; k++) |
There was a problem hiding this comment.
I can see many additional computations here that are not needed. Please check and revert to the older style of loads.
applicable to all the load funcs in this file
There was a problem hiding this comment.
The additional computations are required because unlike box filter the order of the padding needs to be preserved. This is evident from 3x3 cases where there are multiple conditions handled just to preserve the order
There was a problem hiding this comment.
The functions can be templated. Will attempt it
r-abishek
left a comment
There was a problem hiding this comment.
@Srihari-mcw Pls address comments
| dst_f8->f1[7] = fmaf(src_f1, filter[7], dst_f8->f1[7]); | ||
| src_f1 = rpp_hip_unpack3(src_ui4.w); | ||
| dst_f8->f1[7] = fmaf(src_f1, filter[8], dst_f8->f1[7]); | ||
| #pragma unroll |
There was a problem hiding this comment.
tab in for these pragmas. Check other instances
There was a problem hiding this comment.
Tab is there for the pragma in all other places in RPP too, it is indented with respect to the loop on which its applied
| @@ -309,15 +58,22 @@ __global__ void gaussian_filter_3x3_pkd_tensor(T *srcPtr, | |||
| RpptROIPtr roiTensorPtrSrc, | |||
| float *filterTensor) | |||
| { | |||
| int hipThreadIdx_x8 = hipThreadIdx_x << 3; | |||
| int hipThreadIdx_x8 = hipThreadIdx_x << 3; | |||
| { | ||
| #pragma unroll | ||
| for(int k = 0; k < filterSize; ++k) | ||
| dst_f8->f1[j] = fmaf(srcPtr[j + k], filter[k], dst_f8->f1[j]); |
There was a problem hiding this comment.
check u8 old vs new performance
| gaussian_row_hip_compute<7>(&src_smem[hipThreadIdx_y_channel.x + 6][hipThreadIdx_x8], &sum_f24.f8[0], filter_row7); | ||
| gaussian_row_hip_compute<7>(&src_smem[hipThreadIdx_y_channel.y + 6][hipThreadIdx_x8], &sum_f24.f8[1], filter_row7); | ||
| gaussian_row_hip_compute<7>(&src_smem[hipThreadIdx_y_channel.z + 6][hipThreadIdx_x8], &sum_f24.f8[2], filter_row7); | ||
| if constexpr (std::is_same<T, float>::value) |
There was a problem hiding this comment.
Pixel check was found to be not needed
| template<typename T> | ||
| inline void process_left_border_columns_pln_pln(T **srcPtrTemp, T *dstPtrTemp, Rpp32u kernelSize, Rpp32u padLength, | ||
| Rpp32u unpaddedWidth, Rpp32s rowKernelLoopLimit, Rpp32f *filterTensor) | ||
| Rpp32u unpaddedWidth, Rpp32s rowKernelLoopLimit, Rpp32f *filterTensor, Rpp32s padVertical) |
There was a problem hiding this comment.
Can padVertical and paddHorizontal be boolean or preferably enums? TOP_EDGE etc or other
| } | ||
|
|
||
| inline void rpp_load_filter_3x3_pkd_host(__m256 *pRow, Rpp8u **srcPtrTemp, Rpp32s rowKernelLoopLimit) | ||
| inline void rpp_load_filter_3x3_pln_host(__m256 *pRow, Rpp8u **srcPtrTemp, Rpp32s rowKernelLoopLimit, Rpp32s padIndex) |
There was a problem hiding this comment.
can these be templated below too?
| Rpp32s rowKernelLoopLimit = kernelSize; | ||
| get_kernel_loop_limit(i, rowKernelLoopLimit, padLength, unpaddedHeight); | ||
| process_left_border_columns_pln_pln(srcPtrTemp, dstPtrTemp, kernelSize, padLength, unpaddedWidth, rowKernelLoopLimit, filterTensor); | ||
| Rpp32s padVertical = i < padLength ? 0 : 1; |
| process_left_border_columns_pln_pln(srcPtrTemp, dstPtrTemp, kernelSize, padLength, unpaddedWidth, rowKernelLoopLimit, filterTensor, padVertical); | ||
| dstPtrTemp += padLength; | ||
| #if __AVX2__ | ||
| Rpp32s padindex = (padVertical == 1) ? rowKernelLoopLimit - 1 : 0; |
| } | ||
|
|
||
| if constexpr (std::is_same<T, Rpp32f>::value) | ||
| rpp_pixel_check_0to1(pDst, 2); |
| if constexpr (std::is_same<T, Rpp32f>::value) | ||
| rpp_pixel_check_0to1(pDst, 2); | ||
| rpp_store_filter_3x3_host(dstPtrTemp, pDst); | ||
| increment_row_ptrs(srcPtrTemp, kernelSize, 14); |
There was a problem hiding this comment.
why 14 - all instances - either in-line comment or variable
6893fa3 to
805acb4
Compare
r-abishek
left a comment
There was a problem hiding this comment.
@Srihari-mcw Please address these
api/rppdefs.h
Outdated
| { | ||
| LEFT_EDGE = 0, | ||
| RIGHT_EDGE | ||
| } RpptBorderHorizontalDirection; |
There was a problem hiding this comment.
Just like the one below, call this RpptImageBorderEdge instead of RpptImageBorderType.
Are two separate enums for horizontal and vertical needed, can one suffice?
| Rpp32u kernelSize, Rpp32u padLength, Rpp32u unpaddedWidth, | ||
| Rpp32s rowKernelLoopLimit, Rpp32f *filterTensor, Rpp32u channels = 1, | ||
| RpptBorderVerticalDirection padVertical = RpptBorderVerticalDirection::BOTTOM_EDGE, | ||
| RpptBorderHorizontalDirection padHorizontal = RpptBorderHorizontalDirection::RIGHT_EDGE) |
There was a problem hiding this comment.
With a combined enum, these arguments can be:
RpptImageBorderEdge padVertical = RpptImageBorderEdge::BOTTOM_EDGE,
RpptImageBorderEdge padHorizontal = RpptImageBorderEdge::RIGHT_EDGE)
|
…shek#517) Co-authored-by: Srihari-mcw <srihari@multicorewareinc.com>
…shek#517) Co-authored-by: Srihari-mcw <srihari@multicorewareinc.com>








Implemented nearest neighbor padding logic with respect to gaussian filter HOST and HIP and updated QA test