RPP Dilate on HOST and HIP #554
Conversation
remove commented code
| blend_shuffle_max_7x7_host<7, 63, 1, 15, 127, 3>(&pxTemp[0], pxMaskPkd, blendRegisterOrder); | ||
| blend_shuffle_max_7x7_host<7, 63, 1, 15, 127, 3>(&pxTemp[1], pxMaskPkd, blendRegisterOrder); | ||
|
|
||
| if constexpr (std::is_same<T, Rpp8s>::value) |
There was a problem hiding this comment.
Remove an empty space before if
| @@ -26,287 +26,38 @@ SOFTWARE. | |||
|
|
|||
| // -------------------- Set 0 - dilate device helpers -------------------- | |||
|
|
|||
| __device__ void dilate_3x3_row_hip_compute(uchar *srcPtr, d_float8 *dst_f8) | |||
| // Templated dilate row compute function - works for any filter size (3, 5, 7, 9) | |||
There was a problem hiding this comment.
__device__ void erode_row_hip_compute(T *srcPtr, d_float8 *dst_f8)
{
#pragma unroll
for (int k = 0; k < 8; k++)
{
float minVal = static_cast<float>(srcPtr[k]);
for (int j = 1; j < filterSize; j++)
minVal = fminf(minVal, static_cast<float>(srcPtr[k + j]));
dst_f8->f1[k] = fminf(dst_f8->f1[k], minVal);
}
}
Modify this function to be similar to the erode version above.
| dilate_row_hip_compute<7>(&src_smem[hipThreadIdx_y + 4][hipThreadIdx_x8], &sum_f8); | ||
| dilate_row_hip_compute<7>(&src_smem[hipThreadIdx_y + 5][hipThreadIdx_x8], &sum_f8); | ||
| dilate_row_hip_compute<7>(&src_smem[hipThreadIdx_y + 6][hipThreadIdx_x8], &sum_f8); | ||
| if constexpr (std::is_same<T, Rpp8s>::value) |
There was a problem hiding this comment.
Don't have any if/else here; just have:
rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &sum_f8);
| int clampedX = roiBeginX + max(0, min(id_x_i + i, (roiWidth - 1))); | ||
| int clampedIdx = (id_z * srcStridesNH.x) + (clampedY * srcStridesNH.y) + (clampedX * 3); | ||
|
|
||
| src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8 + i] = srcPtr[clampedIdx]; // R |
There was a problem hiding this comment.
Please indent the R comment the same way as G and B, in all places across the code.
| if (roiTypeSrc == RpptRoiType::LTRB) | ||
| convert_roi(roiTensorPtrDst, RpptRoiType::XYWH, dstDescPtr->n); | ||
|
|
||
There was a problem hiding this comment.
Remove the whitespace
|
|
||
| // If DEBUG_MODE is set to 1 dump the outputs to csv files for debugging | ||
| if(DEBUG_MODE && iterCount == 0) | ||
| if (DEBUG_MODE && iterCount == 0) |
There was a problem hiding this comment.
Revert all of these unnecessary changes.
…nd aligned indent R.
…for single line condition
There was a problem hiding this comment.
Pull request overview
This PR adds HOST backend support for the RPP (ROCm Performance Primitives) dilate morphological operation, extending the existing HIP-only implementation.
Changes:
- Enabled dilate operation for both HOST and HIP backends in the test suite configuration
- Added CPU implementation of dilate operation with support for multiple data types (U8, I8, F16, F32)
- Implemented SIMD-optimized helper functions for efficient dilate computation across different kernel sizes (3x3, 5x5, 7x7, 9x9)
Reviewed changes
Copilot reviewed 7 out of 17 changed files in this pull request and generated 2 comments.
Show a summary per file
| File | Description |
|---|---|
| utilities/test_suite/common.py | Updated dilate backend support from HIP-only to include HOST |
| utilities/test_suite/HOST/runImageTests.py | Added dilate to kernel size test configurations |
| utilities/test_suite/HOST/Tensor_image_host.cpp | Implemented dilate test case with HOST backend API call |
| src/modules/tensor/rppt_tensor_morphological_operations.cpp | Added rppt_dilate_host function with multi-datatype support |
| src/include/tensor/host_tensor_executors.hpp | Added function declarations for dilate HOST implementations |
| src/include/common/cpu/rpp_cpu_filter.hpp | Added SIMD helper functions and morphological operation utilities |
| api/rppt_tensor_morphological_operations.h | Added HOST API documentation and fixed HIP documentation typo |
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
|
|
||
| for (int i = 0; i < oBufferSize; i++) | ||
| refFile << static_cast<int>(*(outputu8 + i)) << ","; | ||
|
|
There was a problem hiding this comment.
The added blank lines (1870 and 1873) create inconsistent spacing around the for loop. This section appears to be unrelated to the dilate implementation and these formatting changes are unnecessary.
| for (int i = 0; i < oBufferSize; i++) | |
| refFile << static_cast<int>(*(outputu8 + i)) << ","; | |
| for (int i = 0; i < oBufferSize; i++) | |
| refFile << static_cast<int>(*(outputu8 + i)) << ","; |
| { | ||
| /* For PLN inputs | For PKD inputs | ||
| pSrc[0] - [X01|X02|X03|X04|X05|X06|X07|X08], pSrc[1] - [X09|X10|X11|X12|X13|X14|X15|X16]| pSrc[0] - [R01|G01|B01|R02|G02|B02|R03|G03], pSrc[1] - [B03|R04|G04|B04|R05|G05|B05|R06], | ||
| [X02|X03|X04|X05|X06|X07|X08|X09] (blend with mask [0000 0001] and permute) pSrc[2] - [G06|B06|R07|G07|B07|R08|G08|B08], pSrc[3] - [R09|G09|B09|R10|G10|B10|R11|G11] |
There was a problem hiding this comment.
Missing pipe character '|' separator before 'pSrc[2]' in this comment line. All other similar comment blocks in this file consistently use '|' to separate PLN and PKD input descriptions.
No description provided.