Solarize HIP and HOST implementation#454
Conversation
HazarathKumarM
commented
Jun 26, 2025
- Solarize HIP and HOST tensor implementation
r-abishek
left a comment
There was a problem hiding this comment.
@HazarathKumarM Please address comments
| testCaseName = "solarize"; | ||
|
|
||
|
|
||
| for (int i = 0; i < batchSize; i++) |
| RpptDescPtr srcDescPtr, | ||
| RppPtr_t dstPtr, | ||
| RpptDescPtr dstDescPtr, | ||
| Rpp32f *tresholdTensor, |
| { | ||
| RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); | ||
|
|
||
| if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) |
There was a problem hiding this comment.
Can thresholdTensor have any float value with no restriction or checks?
| RppStatus rppt_solarize_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f* thresholdTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); | ||
|
|
||
| #ifdef GPU_SUPPORT | ||
| RppStatus rppt_solarize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *thresholdTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); |
There was a problem hiding this comment.
Header documentation to be added?
| RppStatus rppt_fog_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *intensityFactor, Rpp32f *greyFactor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); | ||
| #endif // GPU_SUPPORT | ||
|
|
||
| RppStatus rppt_solarize_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f* thresholdTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); |
There was a problem hiding this comment.
ROCm docs input/output images to be added?
Bin images too for QA
| px[2] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[2], pxSrc[3]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[2] and pxSrc[3] to get B01-16 */ | ||
| } | ||
|
|
||
| inline void rpp_load96_i8pkd3_to_i8pln3(Rpp8s *srcPtr, __m256i *px) |
There was a problem hiding this comment.
This rpp_load96_i8pkd3_to_i8pln3() and the helper above this rpp_load96_u8pkd3_to_u8pln3() are exactly the same on Diffchecker. Why are two needed? Can be templated?
There was a problem hiding this comment.
all the changes in the file are reverted as these changes are no longer needed in the updated version
| _mm256_storeu_si256((__m256i *)dstPtrB, px[2]); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16|B17|B18|B19|B20|B21|B22|B23|B24|B25|B26|B27|B28|B29|B30|B31|B32] */ | ||
| } | ||
|
|
||
| inline void rpp_store96_i8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m256i *px) |
There was a problem hiding this comment.
all the changes in the file are reverted as these changes are no longer needed in the updated version
| _mm_storeu_si128((__m128i *)(dstPtr + 84), _mm256_extractf128_si256(pxDst[7], 1)); /* store [R29|G29|B29|R30|G30|B30|R31|G31|B31|R32|G32|B32|00|00|00|00] */ | ||
| } | ||
|
|
||
| inline void rpp_store96_i8pln3_to_i8pkd3(Rpp8s *dstPtr, __m256i *px) |
There was a problem hiding this comment.
Same comment:
I would either template or suggest this naming without even templating:
rpp_load96_u8pkd3_to_u8pln3(), rpp_store96_i8pln3_to_i8pln3() and rpp_store96_i8pln3_to_i8pkd3() seem to be used only in bitwise_not, bitwise_xor and channel_permute.
If this functionality really works correct for both u8 and i8, lets rename these 3 to rpp_load96_8bitpkd3_to_8bitpln3(), rpp_store96_8bitpln3_to_8bitpln3() and rpp_store96_8bitpln3_to_8bitpkd3() respectively.
Then just call the same 3 helpers everywhere.
Pls add comments above helper functions that this is meant to accept only U8/I8.
There was a problem hiding this comment.
all the changes in the file are reverted as these changes are no longer needed in the updated version
|
|
||
| #include "hip_tensor_executors.hpp" | ||
|
|
||
| __device__ void solarize_hip_rgb_compute(d_float24 *pix_f24, float thresholdParam, float maxVal) |
There was a problem hiding this comment.
Change to float &thresholdParam, float &maxVal)
These are already local variables inside gpu kernel. No need to re-init another local variable
| // { | ||
| // __m256i p[3]; | ||
| // rpp_simd_load(rpp_load96_u8pkd3_to_u8pln3, srcPtrTemp, p); // simd loads | ||
| // compute_solarize_96_host(p, pxThresholdParam); // threshold adjustment |
84437fe to
d9f65d3
Compare
r-abishek
left a comment
There was a problem hiding this comment.
@HazarathKumarM Pls check two comments
utilities/test_suite/common.py
Outdated
| ImageAugmentationGroupMap = { | ||
| "color_augmentations" : [0, 1, 2, 3, 4, 13, 31, 34, 36, 42, 43, 45, 81], | ||
| "effects_augmentations" : [5, 6, 8, 10, 11, 29, 30, 32, 35, 46, 82, 83, 84], | ||
| "effects_augmentations" : [5, 6, 8, 10, 11, 29, 30, 32, 35, 46, 82, 83, 84, 95], |
There was a problem hiding this comment.
Lets make a change in this PR for this.
Or before issuing this PR, lets issue another separate PR to fix this.
Lets ensure that once case numbers have been tagged to function names, we only use the names and not the numbers again.
(ROCm#586 (comment))
There was a problem hiding this comment.
sure Abishek
we will issue a seperate PR for this test suite changes
|
|
||
| startWallTime = omp_get_wtime(); | ||
| startCpuTime = clock(); | ||
| if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) |
There was a problem hiding this comment.
In another seperate PR please also convert the 0/1/2/3 to enum.
For all test suite types.
There was a problem hiding this comment.
Pull Request Overview
This PR implements the solarize image augmentation effect for both HIP (GPU) and HOST (CPU) tensor operations. Solarize is an image effect that inverts pixel values above a specified threshold, creating a photographic negative effect for those regions.
Key changes include:
- Complete implementation of solarize augmentation supporting multiple data types (U8, F32, F16, I8) and tensor layouts (NCHW, NHWC)
- Integration of solarize into test suites and augmentation groupings
- Addition of proper API headers and function declarations
Reviewed Changes
Copilot reviewed 10 out of 13 changed files in this pull request and generated 4 comments.
Show a summary per file
| File | Description |
|---|---|
| utilities/test_suite/rpp_test_suite_image.h | Adds solarize enum (95) and string mapping for test framework |
| utilities/test_suite/common.py | Registers solarize in augmentation maps and effect groupings |
| utilities/test_suite/HOST/Tensor_image_host.cpp | Implements HOST test case for solarize with threshold parameter |
| utilities/test_suite/HIP/Tensor_image_hip.cpp | Implements HIP test case with memory allocation and kernel invocation |
| src/modules/tensor/rppt_tensor_effects_augmentations.cpp | Core solarize implementation for both HOST and GPU backends |
| src/modules/tensor/hip/kernel/solarize.cpp | HIP kernel implementation with CUDA device functions |
| src/modules/tensor/cpu/kernel/solarize.cpp | CPU implementation with SIMD optimizations using AVX2 |
| src/include/tensor/host_tensor_executors.hpp | Function declarations for HOST solarize implementations |
| src/include/tensor/hip_tensor_executors.hpp | Template function declaration for HIP solarize executor |
| api/rppt_tensor_effects_augmentations.h | Public API documentation and function declarations |
Tip: Customize your code reviews with copilot-instructions.md. Create the file or learn how to get started.
| RpptRoiType roiType, | ||
| rpp::Handle& handle); | ||
|
|
||
| // -------------------- Solarize -------------------- |
| Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; | ||
| Rpp32u vectorIncrement = 48; | ||
| Rpp32u vectorIncrementPerChannel = 16; | ||
| Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement; |
There was a problem hiding this comment.
Please put alignedLength inside AVX2 condition
| srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); | ||
| dstPtrChannel = dstPtrImage; | ||
|
|
||
| Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; |
There was a problem hiding this comment.
Declare buffer length alone before srcPtrChannel to maintain uniformity across kernels
| dstPtrRowR = dstPtrChannel; | ||
| dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; | ||
| dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; | ||
| for(int i = 0; i < roi.xywhROI.roiHeight; i++) |
There was a problem hiding this comment.
Add an empty line before
| dstPtrTempR = dstPtrRowR; | ||
| dstPtrTempG = dstPtrRowG; | ||
| dstPtrTempB = dstPtrRowB; | ||
| int vectorLoopCount = 0; |
There was a problem hiding this comment.
before vectorLoopCount= 0
| { | ||
| __m256 p[6]; | ||
| rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads | ||
| compute_solarize_host<6>(p, pxThresholdParam, avx_p255); // threshold adjustment |
There was a problem hiding this comment.
Keep comments in the same line vertically
| *dstPtrTempR++ = (*(srcPtrTemp) >= thresholdParam) ? (maxVal - *(srcPtrTemp)) : *(srcPtrTemp); | ||
| *dstPtrTempG++ = (*(srcPtrTemp + 1) >= thresholdParam) ? (maxVal - *(srcPtrTemp + 1)) : *(srcPtrTemp + 1); | ||
| *dstPtrTempB++ = (*(srcPtrTemp + 2) >= thresholdParam) ? (maxVal - *(srcPtrTemp + 2)) : *(srcPtrTemp + 2); | ||
| srcPtrTemp += 3; |
There was a problem hiding this comment.
Please add empty lines appropriately before and after lines
There was a problem hiding this comment.
similar to other files
| } | ||
| else | ||
| { | ||
| Rpp32u alignedLength = bufferLength & ~(vectorIncrementPerChannel - 1); |
There was a problem hiding this comment.
Enclose this and others inside AVX2 flag
| } | ||
|
|
||
| RppStatus solarize_f16_f16_host_tensor(Rpp16f *srcPtr, | ||
| RpptDescPtr srcDescPtr, |
There was a problem hiding this comment.
Please indent all params in same line and check for similar places
| Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; | ||
| const Rpp32u vectorIncrement = 48; | ||
| const Rpp32u vectorIncrementPerChannel = 16; | ||
| Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement; |
There was a problem hiding this comment.
Same comment as earlier, pls check for instances across the file. Thanks
| for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) | ||
| { | ||
| #if __AVX2__ | ||
| __m256 p[1]; |
There was a problem hiding this comment.
Please use __m256 p; and pass the address. Thanks - For maintaining consistency
| } | ||
| for (; vectorLoopCount < bufferLength; vectorLoopCount++) | ||
| { | ||
| *dstPtrTemp++ = ((*srcPtrTemp + offset) >= thresholdParam) ? (maxVal - (*srcPtrTemp)) : *srcPtrTemp; |
There was a problem hiding this comment.
Can we add a comment on the requirement of the offset
Srihari-mcw
left a comment
There was a problem hiding this comment.
Please address review comments