Diff
checker
文本
文本
圖像
文檔
Excel
文件夾
Legal
Enterprise
桌面版
定價
登入
下載 Diffchecker 桌面版
比較文本
尋找兩個文字檔案之間的差異
工具
歷史
即時編輯器
摺疊未變更行
關閉換行
檢視
拆分
統一
比對精度
智能
單詞
字符
語法突出顯示
選擇語法
忽略
文字轉換
前往第一個差異
編輯輸入
Diffchecker Desktop
執行Diffchecker最安全的方式。取得Diffchecker桌面應用程式:您的差異永遠不會離開您的電腦!
取得桌面版
pkd3 vs pkd3_pln3
建立於
去年
差異永不過期
清除
匯出
分享
解釋
56 刪除
行
總計
刪除
字符
總計
刪除
要繼續使用此功能,請升級到
Diff
checker
Pro
查看價格
188 行
全部複製
81 新增
行
總計
新增
字符
總計
新增
要繼續使用此功能,請升級到
Diff
checker
Pro
查看價格
204 行
全部複製
template <typename T>
template <typename T>
複製
已複製
複製
已複製
__global__ void jpeg_compression_distortion_pkd3_
hip_tensor(
T *srcPtr,
__global__ void jpeg_compression_distortion_pkd3_
pln3_
hip_tensor(
T *srcPtr,
uint2 srcStridesNH,
uint2 srcStridesNH,
T *dstPtr,
T *dstPtr,
uint
2
dstStridesN
H,
uint
3
dstStridesN
C
H,
RpptROIPtr roiTensorPtrSrc,
RpptROIPtr roiTensorPtrSrc,
int *tableY,
int *tableY,
int *tableCbCr,
int *tableCbCr,
float qScale)
float qScale)
{
{
int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
int hipThreadIdx_x8 = hipThreadIdx_x * 8;
int hipThreadIdx_x8 = hipThreadIdx_x * 8;
int hipThreadIdx_x4 = hipThreadIdx_x * 4;
int hipThreadIdx_x4 = hipThreadIdx_x * 4;
int alignedWidth = (roiTensorPtrSrc[id_z].xywhROI.roiWidth + 15) & ~15;
int alignedWidth = (roiTensorPtrSrc[id_z].xywhROI.roiWidth + 15) & ~15;
int alignedHeight = (roiTensorPtrSrc[id_z].xywhROI.roiHeight + 15) & ~15;
int alignedHeight = (roiTensorPtrSrc[id_z].xywhROI.roiHeight + 15) & ~15;
// Boundary checks
// Boundary checks
if((id_y >= alignedHeight) || (id_x >= alignedWidth))
if((id_y >= alignedHeight) || (id_x >= alignedWidth))
return;
return;
// ROI parameters
// ROI parameters
int roiX = roiTensorPtrSrc[id_z].xywhROI.xy.x;
int roiX = roiTensorPtrSrc[id_z].xywhROI.xy.x;
int roiY = roiTensorPtrSrc[id_z].xywhROI.xy.y;
int roiY = roiTensorPtrSrc[id_z].xywhROI.xy.y;
int roiWidth = roiTensorPtrSrc[id_z].xywhROI.roiWidth;
int roiWidth = roiTensorPtrSrc[id_z].xywhROI.roiWidth;
int roiHeight = roiTensorPtrSrc[id_z].xywhROI.roiHeight;
int roiHeight = roiTensorPtrSrc[id_z].xywhROI.roiHeight;
複製
已複製
複製
已複製
__shared__ float src_smem[48][128];
// Shared memory declaration
int3 hipThreadIdx_y_channel
=
{
hipThreadIdx_y
,
hipThreadIdx_y + 16
,
hipThreadIdx_y + 32
}
;
__shared__ float src_smem[48][128];
// Assuming 48 rows (aligned height for 3 channels)
int3 hipThreadIdx_y_channel
;
hipThreadIdx_y_channel.x
=
hipThreadIdx_y
;
hipThreadIdx_y_channel.y =
hipThreadIdx_y + 16
;
hipThreadIdx_y_channel.z =
hipThreadIdx_y + 32
;
複製
已複製
複製
已複製
float *src_smem_channel[3]
= {
float *src_smem_channel[3]
;
&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]
,
src_smem_channel[0] =
&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]
;
&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]
,
src_smem_channel[1] =
&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]
;
&src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]
src_smem_channel[2] =
&src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]
;
}
;
// ----------- Step 1: Load from Global Memory to Shared Memory -----------
// ----------- Step 1: Load from Global Memory to Shared Memory -----------
int srcIdx;
int srcIdx;
複製
已複製
複製
已複製
int
dstIdx
= (id_z * dstStridesN
H.x) + (id_y * dstStridesN
H.y
) + id_x
* 3
;
u
int
3 dstIdx;
dstIdx
.x
= (id_z * dstStridesN
C
H.x) + (id_y * dstStridesN
CH.z
) + id_x
;
dstIdx.y = dstIdx.x + dstStridesNCH.y;
dstIdx.z = dstIdx.y + dstStridesNCH.y
;
// Check if we need special handling for image edges
// Check if we need special handling for image edges
if(id_y < roiHeight)
if(id_y < roiHeight)
srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiY) * srcStridesNH.y) + ((id_x + roiX) * 3);
srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiY) * srcStridesNH.y) + ((id_x + roiX) * 3);
else // All out-of-bounds threads use the last valid row
else // All out-of-bounds threads use the last valid row
srcIdx = (id_z * srcStridesNH.x) + ((roiHeight - 1 + roiY) * srcStridesNH.y) + ((id_x + roiX) * 3);
srcIdx = (id_z * srcStridesNH.x) + ((roiHeight - 1 + roiY) * srcStridesNH.y) + ((id_x + roiX) * 3);
bool isEdge = ((id_x + 8) > roiWidth) && (id_x < alignedWidth);
bool isEdge = ((id_x + 8) > roiWidth) && (id_x < alignedWidth);
複製
已複製
複製
已複製
if
(!isEdge)
if
(!isEdge)
{
rpp_hip_load24_pkd3_to_float24_pln3(srcPtr + srcIdx, src_smem_channel);
rpp_hip_load24_pkd3_to_float24_pln3(srcPtr + srcIdx, src_smem_channel);
複製
已複製
複製
已複製
}
else
else
{
{
複製
已複製
複製
已複製
// Partial block load with edge pixel replication
int validPixels = roiWidth - id_x;
int validPixels = roiWidth - id_x;
複製
已複製
複製
已複製
if
(validPixels > 0)
// Load valid pixels (only if id_x is within valid range)
if
(validPixels > 0)
{
{
複製
已複製
複製
已複製
for
(int i = 0, idx = srcIdx; i < validPixels; i++, idx += 3)
for
(int i = 0, idx = srcIdx; i < validPixels; i++, idx += 3)
{
{
src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8 + i] = srcPtr[idx];
src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8 + i] = srcPtr[idx];
src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8 + i] = srcPtr[idx + 1];
src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8 + i] = srcPtr[idx + 1];
src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8 + i] = srcPtr[idx + 2];
src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8 + i] = srcPtr[idx + 2];
}
}
}
}
複製
已複製
複製
已複製
// Pad 16 pixels by duplicating the last valid pixel
int lastValidIdx = srcIdx + ((validPixels - 1) * 3);
int lastValidIdx = srcIdx + ((validPixels - 1) * 3);
複製
已複製
複製
已複製
for
(int i = validPixels; i < min(validPixels + 16, 8); i++)
for
(int i = validPixels; i < min(validPixels + 16, 8); i++)
{
{
src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8 + i] = srcPtr[lastValidIdx];
src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8 + i] = srcPtr[lastValidIdx];
src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8 + i] = srcPtr[lastValidIdx + 1];
src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8 + i] = srcPtr[lastValidIdx + 1];
src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8 + i] = srcPtr[lastValidIdx + 2];
src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8 + i] = srcPtr[lastValidIdx + 2];
}
}
}
}
__syncthreads();
__syncthreads();
// ----------- Step 2: RGB to YCbCr Conversion -----------
// ----------- Step 2: RGB to YCbCr Conversion -----------
d_float8 y_f8;
d_float8 y_f8;
d_float24 rgb_f24;
d_float24 rgb_f24;
rgb_f24.f8[0] = *((d_float8*)&src_smem[hipThreadIdx_y][hipThreadIdx_x8]);
rgb_f24.f8[0] = *((d_float8*)&src_smem[hipThreadIdx_y][hipThreadIdx_x8]);
rgb_f24.f8[1] = *((d_float8*)&src_smem[hipThreadIdx_y + 16][hipThreadIdx_x8]);
rgb_f24.f8[1] = *((d_float8*)&src_smem[hipThreadIdx_y + 16][hipThreadIdx_x8]);
rgb_f24.f8[2] = *((d_float8*)&src_smem[hipThreadIdx_y + 32][hipThreadIdx_x8]);
rgb_f24.f8[2] = *((d_float8*)&src_smem[hipThreadIdx_y + 32][hipThreadIdx_x8]);
複製
已複製
複製
已複製
int cbcrY = hipThreadIdx_y * 2;
int cbcrY = hipThreadIdx_y * 2;
y_hip_compute(srcPtr, rgb_f24, &y_f8);
y_hip_compute(srcPtr, rgb_f24, &y_f8);
__syncthreads();
__syncthreads();
// ----------- Step 3: Downsample CbCr -----------
// ----------- Step 3: Downsample CbCr -----------
複製
已複製
複製
已複製
if
(hipThreadIdx_y < 8)
if
(hipThreadIdx_y < 8)
{
{
float4 cb_f4, cr_f4;
float4 cb_f4, cr_f4;
複製
已複製
複製
已複製
downsample_cbcr_hip_compute(
// Downsample RGB and convert to CbCr
(d_float8*)&src_smem[cbcrY][hipThreadIdx_x8],
downsample_cbcr_hip_compute(
(d_float8*)&src_smem[cbcrY][hipThreadIdx_x8],
(d_float8*)&src_smem[cbcrY + 1][hipThreadIdx_x8],
(d_float8*)&src_smem[cbcrY + 16][hipThreadIdx_x8],
(d_float8*)&src_smem[cbcrY + 17][hipThreadIdx_x8],
(d_float8*)&src_smem[cbcrY + 32][hipThreadIdx_x8],
(d_float8*)&src_smem[cbcrY + 33][hipThreadIdx_x8],
&cb_f4, &cr_f4);
(d_float8*)&src_smem[cbcrY + 1][hipThreadIdx_x8],
// Store Y and downsampled CbCr
(d_float8*)&src_smem[cbcrY + 16][hipThreadIdx_x8],
(d_float8*)&src_smem[cbcrY + 17][hipThreadIdx_x8],
(d_float8*)&src_smem[cbcrY + 32][hipThreadIdx_x8],
(d_float8*)&src_smem[cbcrY + 33][hipThreadIdx_x8],
&cb_f4, &cr_f4);
*(float4*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x4] = cb_f4;
*(float4*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x4] = cb_f4;
複製
已複製
複製
已複製
// Storing Cr below Cb (8 x 64)
*(float4*)&src_smem[8 + hipThreadIdx_y_channel.y][hipThreadIdx_x4] = cr_f4;
*(float4*)&src_smem[8 + hipThreadIdx_y_channel.y][hipThreadIdx_x4] = cr_f4;
}
}
*(d_float8*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8] = y_f8;
*(d_float8*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8] = y_f8;
__syncthreads();
__syncthreads();
// ----------- Step 4: Clamp + Forward DCT -----------
// ----------- Step 4: Clamp + Forward DCT -----------
clamp_range((float*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]);
clamp_range((float*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]);
clamp_range((float*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]);
clamp_range((float*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]);
__syncthreads();
__syncthreads();
複製
已複製
複製
已複製
// Doing -128 as part of DCT,
// 1D row wise FWD DCT for Y Cb and Cr channels
dct_fwd_8x8_1d(&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], true);
dct_fwd_8x8_1d(&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], true);
dct_fwd_8x8_1d(&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], true);
dct_fwd_8x8_1d(&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], true);
__syncthreads();
__syncthreads();
複製
已複製
複製
已複製
//
----------- Step5 Column-wise DCT -----------
//
//
----------- Step5 Column-wise DCT -----------
int col = (hipThreadIdx_x * 16) + hipThreadIdx_y;
int col = (hipThreadIdx_x * 16) + hipThreadIdx_y;
複製
已複製
複製
已複製
if
((col < 128) && (col < alignedWidth))
// Process all 128 columns
if
((col < 128) && (col < alignedWidth))
{
{
複製
已複製
複製
已複製
// Load column into temporary array
float colVec[32];
float colVec[32];
複製
已複製
複製
已複製
for(int i = 0; i < 32; i++) colVec[i] = src_smem[i][col];
複製
已複製
複製
已複製
dct_fwd_8x8_1d(&colVec[0],
false);
for(int i = 0; i < 32; i++)
dct_fwd_8x8_1d(&colVec[8],
false);
colVec[i] = src_smem[i][col];
dct_fwd_8x8_1d(&colVec[0],
false);
dct_fwd_8x8_1d(&colVec[8],
false);
dct_fwd_8x8_1d(&colVec[16], false);
dct_fwd_8x8_1d(&colVec[16], false);
dct_fwd_8x8_1d(&colVec[24], false);
dct_fwd_8x8_1d(&colVec[24], false);
複製
已複製
複製
已複製
for(int i = 0; i < 32; i++)
src_smem[i][col] = colVec[i];
for(int i = 0; i < 32; i++)
src_smem[i][col] = colVec[i];
}
}
__syncthreads();
__syncthreads();
// ----------- Step 6: Quantization -----------
// ----------- Step 6: Quantization -----------
quantize(&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], &tableY[(hipThreadIdx_y % 8) * 8]);
quantize(&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], &tableY[(hipThreadIdx_y % 8) * 8]);
quantize(&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], &tableCbCr[(hipThreadIdx_y % 8) * 8]);
quantize(&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], &tableCbCr[(hipThreadIdx_y % 8) * 8]);
__syncthreads();
__syncthreads();
// ----------- Step 7: Inverse DCT -----------
// ----------- Step 7: Inverse DCT -----------
複製
已複製
複製
已複製
// 1D column wise IDCT for Y Cb and Cr channels
if((col < 128) && (col < alignedWidth))
if((col < 128) && (col < alignedWidth))
{
{
複製
已複製
複製
已複製
// Load column into temporary array
float colVec[32];
float colVec[32];
複製
已複製
複製
已複製
for(int i = 0; i < 32; i++) colVec[i] = src_smem[i][col];
複製
已複製
複製
已複製
dct_inv_8x8_1d(&colVec[0],
false);
for(int i = 0; i < 32; i++)
dct_inv_8x8_1d(&colVec[8],
false);
colVec[i] = src_smem[i][col];
dct_inv_8x8_1d(&colVec[0],
false);
dct_inv_8x8_1d(&colVec[8],
false);
dct_inv_8x8_1d(&colVec[16], false);
dct_inv_8x8_1d(&colVec[16], false);
dct_inv_8x8_1d(&colVec[24], false);
dct_inv_8x8_1d(&colVec[24], false);
複製
已複製
複製
已複製
for(int i = 0; i < 32; i++)
src_smem[i][col] = colVec[i];
for(int i = 0; i < 32; i++)
src_smem[i][col] = colVec[i];
}
}
__syncthreads();
__syncthreads();
複製
已複製
複製
已複製
// 1D row wise IDCT for Y Cb and Cr channels
// Adding back 128 as part of INV DCT
dct_inv_8x8_1d(&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], true);
dct_inv_8x8_1d(&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], true);
dct_inv_8x8_1d(&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], true);
dct_inv_8x8_1d(&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], true);
__syncthreads();
__syncthreads();
// ----------- Step 8: Clamp & Upsample -----------
// ----------- Step 8: Clamp & Upsample -----------
clamp_range((float*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]);
clamp_range((float*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]);
clamp_range((float*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]);
clamp_range((float*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]);
__syncthreads();
__syncthreads();
// Vertical Upsampling
// Vertical Upsampling
float4 cb_f4, cr_f4;
float4 cb_f4, cr_f4;
cbcrY = hipThreadIdx_y / 2;
cbcrY = hipThreadIdx_y / 2;
cb_f4 = *(float4*)&src_smem[cbcrY + 16][hipThreadIdx_x4];
cb_f4 = *(float4*)&src_smem[cbcrY + 16][hipThreadIdx_x4];
cr_f4 = *(float4*)&src_smem[cbcrY + 24][hipThreadIdx_x4];
cr_f4 = *(float4*)&src_smem[cbcrY + 24][hipThreadIdx_x4];
__syncthreads();
__syncthreads();
複製
已複製
複製
已複製
//
Convert back
to RGB
//
YCbCr
to RGB
upsample_and_RGB_hip_compute(cb_f4, cr_f4, (d_float8*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], (d_float8*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], (d_float8*)&src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]);
upsample_and_RGB_hip_compute(cb_f4, cr_f4, (d_float8*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], (d_float8*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], (d_float8*)&src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]);
__syncthreads();
__syncthreads();
複製
已複製
複製
已複製
//
----------- Step 9: Final
Clamp
& S
tore
-----------
//
Clamp
values and s
tore
results
rpp_hip_adjust_range(dstPtr, (d_float8*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]);
rpp_hip_adjust_range(dstPtr, (d_float8*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]);
rpp_hip_adjust_range(dstPtr, (d_float8*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]);
rpp_hip_adjust_range(dstPtr, (d_float8*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]);
rpp_hip_adjust_range(dstPtr, (d_float8*)&src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]);
rpp_hip_adjust_range(dstPtr, (d_float8*)&src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]);
複製
已複製
複製
已複製
// ----------- Step 9: Final Clamp & Store -----------
clamp_range(srcPtr, (float*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]);
clamp_range(srcPtr, (float*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]);
clamp_range(srcPtr, (float*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]);
clamp_range(srcPtr, (float*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]);
clamp_range(srcPtr, (float*)&src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]);
clamp_range(srcPtr, (float*)&src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]);
__syncthreads();
__syncthreads();
if((id_x < roiWidth) && (id_y < roiHeight))
if((id_x < roiWidth) && (id_y < roiHeight))
複製
已複製
複製
已複製
rpp_hip_pack_float
24_pln3_and_store24_pkd3
(dstPtr + dstIdx
,
src_smem
_channel
);
{
rpp_hip_pack_float
8_and_store8(dstPtr + dstIdx.x, (d_float8 *)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]);
rpp_hip_pack_float8_and_store8(dstPtr + dstIdx.y, (d_float8 *)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]);
rpp_hip_pack_float8_and_store8
(dstPtr + dstIdx
.z, (d_float8 *)&
src_smem
[hipThreadIdx_y
_channel
.z][hipThreadIdx_x8]);
}
}
}
已保存差異
原始文本
開啟檔案
template <typename T> __global__ void jpeg_compression_distortion_pkd3_hip_tensor(T *srcPtr, uint2 srcStridesNH, T *dstPtr, uint2 dstStridesNH, RpptROIPtr roiTensorPtrSrc, int *tableY, int *tableCbCr, float qScale) { int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; int hipThreadIdx_x8 = hipThreadIdx_x * 8; int hipThreadIdx_x4 = hipThreadIdx_x * 4; int alignedWidth = (roiTensorPtrSrc[id_z].xywhROI.roiWidth + 15) & ~15; int alignedHeight = (roiTensorPtrSrc[id_z].xywhROI.roiHeight + 15) & ~15; // Boundary checks if((id_y >= alignedHeight) || (id_x >= alignedWidth)) return; // ROI parameters int roiX = roiTensorPtrSrc[id_z].xywhROI.xy.x; int roiY = roiTensorPtrSrc[id_z].xywhROI.xy.y; int roiWidth = roiTensorPtrSrc[id_z].xywhROI.roiWidth; int roiHeight = roiTensorPtrSrc[id_z].xywhROI.roiHeight; __shared__ float src_smem[48][128]; int3 hipThreadIdx_y_channel = {hipThreadIdx_y, hipThreadIdx_y + 16, hipThreadIdx_y + 32}; float *src_smem_channel[3] = { &src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], &src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], &src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8] }; // ----------- Step 1: Load from Global Memory to Shared Memory ----------- int srcIdx; int dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; // Check if we need special handling for image edges if(id_y < roiHeight) srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiY) * srcStridesNH.y) + ((id_x + roiX) * 3); else // All out-of-bounds threads use the last valid row srcIdx = (id_z * srcStridesNH.x) + ((roiHeight - 1 + roiY) * srcStridesNH.y) + ((id_x + roiX) * 3); bool isEdge = ((id_x + 8) > roiWidth) && (id_x < alignedWidth); if (!isEdge) { rpp_hip_load24_pkd3_to_float24_pln3(srcPtr + srcIdx, src_smem_channel); } else { // Partial block load with edge pixel replication int validPixels = roiWidth - id_x; if (validPixels > 0) { for (int i = 0, idx = srcIdx; i < validPixels; i++, idx += 3) { src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8 + i] = srcPtr[idx]; src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8 + i] = srcPtr[idx + 1]; src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8 + i] = srcPtr[idx + 2]; } } int lastValidIdx = srcIdx + ((validPixels - 1) * 3); for (int i = validPixels; i < min(validPixels + 16, 8); i++) { src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8 + i] = srcPtr[lastValidIdx]; src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8 + i] = srcPtr[lastValidIdx + 1]; src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8 + i] = srcPtr[lastValidIdx + 2]; } } __syncthreads(); // ----------- Step 2: RGB to YCbCr Conversion ----------- d_float8 y_f8; d_float24 rgb_f24; rgb_f24.f8[0] = *((d_float8*)&src_smem[hipThreadIdx_y][hipThreadIdx_x8]); rgb_f24.f8[1] = *((d_float8*)&src_smem[hipThreadIdx_y + 16][hipThreadIdx_x8]); rgb_f24.f8[2] = *((d_float8*)&src_smem[hipThreadIdx_y + 32][hipThreadIdx_x8]); int cbcrY = hipThreadIdx_y * 2; y_hip_compute(srcPtr, rgb_f24, &y_f8); __syncthreads(); // ----------- Step 3: Downsample CbCr ----------- if (hipThreadIdx_y < 8) { float4 cb_f4, cr_f4; downsample_cbcr_hip_compute( (d_float8*)&src_smem[cbcrY][hipThreadIdx_x8], (d_float8*)&src_smem[cbcrY + 1][hipThreadIdx_x8], (d_float8*)&src_smem[cbcrY + 16][hipThreadIdx_x8], (d_float8*)&src_smem[cbcrY + 17][hipThreadIdx_x8], (d_float8*)&src_smem[cbcrY + 32][hipThreadIdx_x8], (d_float8*)&src_smem[cbcrY + 33][hipThreadIdx_x8], &cb_f4, &cr_f4); *(float4*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x4] = cb_f4; *(float4*)&src_smem[8 + hipThreadIdx_y_channel.y][hipThreadIdx_x4] = cr_f4; } *(d_float8*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8] = y_f8; __syncthreads(); // ----------- Step 4: Clamp + Forward DCT ----------- clamp_range((float*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]); clamp_range((float*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]); __syncthreads(); dct_fwd_8x8_1d(&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], true); dct_fwd_8x8_1d(&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], true); __syncthreads(); // ----------- Step5 Column-wise DCT ----------- int col = (hipThreadIdx_x * 16) + hipThreadIdx_y; if ((col < 128) && (col < alignedWidth)) { float colVec[32]; for(int i = 0; i < 32; i++) colVec[i] = src_smem[i][col]; dct_fwd_8x8_1d(&colVec[0], false); dct_fwd_8x8_1d(&colVec[8], false); dct_fwd_8x8_1d(&colVec[16], false); dct_fwd_8x8_1d(&colVec[24], false); for(int i = 0; i < 32; i++) src_smem[i][col] = colVec[i]; } __syncthreads(); // ----------- Step 6: Quantization ----------- quantize(&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], &tableY[(hipThreadIdx_y % 8) * 8]); quantize(&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], &tableCbCr[(hipThreadIdx_y % 8) * 8]); __syncthreads(); // ----------- Step 7: Inverse DCT ----------- if((col < 128) && (col < alignedWidth)) { float colVec[32]; for(int i = 0; i < 32; i++) colVec[i] = src_smem[i][col]; dct_inv_8x8_1d(&colVec[0], false); dct_inv_8x8_1d(&colVec[8], false); dct_inv_8x8_1d(&colVec[16], false); dct_inv_8x8_1d(&colVec[24], false); for(int i = 0; i < 32; i++) src_smem[i][col] = colVec[i]; } __syncthreads(); dct_inv_8x8_1d(&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], true); dct_inv_8x8_1d(&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], true); __syncthreads(); // ----------- Step 8: Clamp & Upsample ----------- clamp_range((float*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]); clamp_range((float*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]); __syncthreads(); // Vertical Upsampling float4 cb_f4, cr_f4; cbcrY = hipThreadIdx_y / 2; cb_f4 = *(float4*)&src_smem[cbcrY + 16][hipThreadIdx_x4]; cr_f4 = *(float4*)&src_smem[cbcrY + 24][hipThreadIdx_x4]; __syncthreads(); // Convert back to RGB upsample_and_RGB_hip_compute(cb_f4, cr_f4, (d_float8*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], (d_float8*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], (d_float8*)&src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]); __syncthreads(); // ----------- Step 9: Final Clamp & Store ----------- rpp_hip_adjust_range(dstPtr, (d_float8*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]); rpp_hip_adjust_range(dstPtr, (d_float8*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]); rpp_hip_adjust_range(dstPtr, (d_float8*)&src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]); clamp_range(srcPtr, (float*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]); clamp_range(srcPtr, (float*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]); clamp_range(srcPtr, (float*)&src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]); __syncthreads(); if((id_x < roiWidth) && (id_y < roiHeight)) rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, src_smem_channel); }
更改後文本
開啟檔案
template <typename T> __global__ void jpeg_compression_distortion_pkd3_pln3_hip_tensor( T *srcPtr, uint2 srcStridesNH, T *dstPtr, uint3 dstStridesNCH, RpptROIPtr roiTensorPtrSrc, int *tableY, int *tableCbCr, float qScale) { int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; int hipThreadIdx_x8 = hipThreadIdx_x * 8; int hipThreadIdx_x4 = hipThreadIdx_x * 4; int alignedWidth = (roiTensorPtrSrc[id_z].xywhROI.roiWidth + 15) & ~15; int alignedHeight = (roiTensorPtrSrc[id_z].xywhROI.roiHeight + 15) & ~15; // Boundary checks if((id_y >= alignedHeight) || (id_x >= alignedWidth)) return; // ROI parameters int roiX = roiTensorPtrSrc[id_z].xywhROI.xy.x; int roiY = roiTensorPtrSrc[id_z].xywhROI.xy.y; int roiWidth = roiTensorPtrSrc[id_z].xywhROI.roiWidth; int roiHeight = roiTensorPtrSrc[id_z].xywhROI.roiHeight; // Shared memory declaration __shared__ float src_smem[48][128]; // Assuming 48 rows (aligned height for 3 channels) int3 hipThreadIdx_y_channel; hipThreadIdx_y_channel.x = hipThreadIdx_y; hipThreadIdx_y_channel.y = hipThreadIdx_y + 16; hipThreadIdx_y_channel.z = hipThreadIdx_y + 32; float *src_smem_channel[3]; src_smem_channel[0] = &src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]; src_smem_channel[1] = &src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]; src_smem_channel[2] = &src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]; // ----------- Step 1: Load from Global Memory to Shared Memory ----------- int srcIdx; uint3 dstIdx; dstIdx.x = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; dstIdx.y = dstIdx.x + dstStridesNCH.y; dstIdx.z = dstIdx.y + dstStridesNCH.y; // Check if we need special handling for image edges if(id_y < roiHeight) srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiY) * srcStridesNH.y) + ((id_x + roiX) * 3); else // All out-of-bounds threads use the last valid row srcIdx = (id_z * srcStridesNH.x) + ((roiHeight - 1 + roiY) * srcStridesNH.y) + ((id_x + roiX) * 3); bool isEdge = ((id_x + 8) > roiWidth) && (id_x < alignedWidth); if(!isEdge) rpp_hip_load24_pkd3_to_float24_pln3(srcPtr + srcIdx, src_smem_channel); else { int validPixels = roiWidth - id_x; // Load valid pixels (only if id_x is within valid range) if(validPixels > 0) { for(int i = 0, idx = srcIdx; i < validPixels; i++, idx += 3) { src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8 + i] = srcPtr[idx]; src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8 + i] = srcPtr[idx + 1]; src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8 + i] = srcPtr[idx + 2]; } } // Pad 16 pixels by duplicating the last valid pixel int lastValidIdx = srcIdx + ((validPixels - 1) * 3); for(int i = validPixels; i < min(validPixels + 16, 8); i++) { src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8 + i] = srcPtr[lastValidIdx]; src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8 + i] = srcPtr[lastValidIdx + 1]; src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8 + i] = srcPtr[lastValidIdx + 2]; } } __syncthreads(); // ----------- Step 2: RGB to YCbCr Conversion ----------- d_float8 y_f8; d_float24 rgb_f24; rgb_f24.f8[0] = *((d_float8*)&src_smem[hipThreadIdx_y][hipThreadIdx_x8]); rgb_f24.f8[1] = *((d_float8*)&src_smem[hipThreadIdx_y + 16][hipThreadIdx_x8]); rgb_f24.f8[2] = *((d_float8*)&src_smem[hipThreadIdx_y + 32][hipThreadIdx_x8]); int cbcrY = hipThreadIdx_y * 2; y_hip_compute(srcPtr, rgb_f24, &y_f8); __syncthreads(); // ----------- Step 3: Downsample CbCr ----------- if(hipThreadIdx_y < 8) { float4 cb_f4, cr_f4; // Downsample RGB and convert to CbCr downsample_cbcr_hip_compute((d_float8*)&src_smem[cbcrY][hipThreadIdx_x8], (d_float8*)&src_smem[cbcrY + 1][hipThreadIdx_x8], (d_float8*)&src_smem[cbcrY + 16][hipThreadIdx_x8], (d_float8*)&src_smem[cbcrY + 17][hipThreadIdx_x8], (d_float8*)&src_smem[cbcrY + 32][hipThreadIdx_x8], (d_float8*)&src_smem[cbcrY + 33][hipThreadIdx_x8],&cb_f4, &cr_f4); // Store Y and downsampled CbCr *(float4*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x4] = cb_f4; // Storing Cr below Cb (8 x 64) *(float4*)&src_smem[8 + hipThreadIdx_y_channel.y][hipThreadIdx_x4] = cr_f4; } *(d_float8*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8] = y_f8; __syncthreads(); // ----------- Step 4: Clamp + Forward DCT ----------- clamp_range((float*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]); clamp_range((float*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]); __syncthreads(); // Doing -128 as part of DCT, // 1D row wise FWD DCT for Y Cb and Cr channels dct_fwd_8x8_1d(&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], true); dct_fwd_8x8_1d(&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], true); __syncthreads(); // // ----------- Step5 Column-wise DCT ----------- int col = (hipThreadIdx_x * 16) + hipThreadIdx_y; // Process all 128 columns if((col < 128) && (col < alignedWidth)) { // Load column into temporary array float colVec[32]; for(int i = 0; i < 32; i++) colVec[i] = src_smem[i][col]; dct_fwd_8x8_1d(&colVec[0], false); dct_fwd_8x8_1d(&colVec[8], false); dct_fwd_8x8_1d(&colVec[16], false); dct_fwd_8x8_1d(&colVec[24], false); for(int i = 0; i < 32; i++) src_smem[i][col] = colVec[i]; } __syncthreads(); // ----------- Step 6: Quantization ----------- quantize(&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], &tableY[(hipThreadIdx_y % 8) * 8]); quantize(&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], &tableCbCr[(hipThreadIdx_y % 8) * 8]); __syncthreads(); // ----------- Step 7: Inverse DCT ----------- // 1D column wise IDCT for Y Cb and Cr channels if((col < 128) && (col < alignedWidth)) { // Load column into temporary array float colVec[32]; for(int i = 0; i < 32; i++) colVec[i] = src_smem[i][col]; dct_inv_8x8_1d(&colVec[0], false); dct_inv_8x8_1d(&colVec[8], false); dct_inv_8x8_1d(&colVec[16], false); dct_inv_8x8_1d(&colVec[24], false); for(int i = 0; i < 32; i++) src_smem[i][col] = colVec[i]; } __syncthreads(); // 1D row wise IDCT for Y Cb and Cr channels // Adding back 128 as part of INV DCT dct_inv_8x8_1d(&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], true); dct_inv_8x8_1d(&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], true); __syncthreads(); // ----------- Step 8: Clamp & Upsample ----------- clamp_range((float*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]); clamp_range((float*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]); __syncthreads(); // Vertical Upsampling float4 cb_f4, cr_f4; cbcrY = hipThreadIdx_y / 2; cb_f4 = *(float4*)&src_smem[cbcrY + 16][hipThreadIdx_x4]; cr_f4 = *(float4*)&src_smem[cbcrY + 24][hipThreadIdx_x4]; __syncthreads(); // YCbCr to RGB upsample_and_RGB_hip_compute(cb_f4, cr_f4, (d_float8*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8], (d_float8*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8], (d_float8*)&src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]); __syncthreads(); // Clamp values and store results rpp_hip_adjust_range(dstPtr, (d_float8*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]); rpp_hip_adjust_range(dstPtr, (d_float8*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]); rpp_hip_adjust_range(dstPtr, (d_float8*)&src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]); // ----------- Step 9: Final Clamp & Store ----------- clamp_range(srcPtr, (float*)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]); clamp_range(srcPtr, (float*)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]); clamp_range(srcPtr, (float*)&src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]); __syncthreads(); if((id_x < roiWidth) && (id_y < roiHeight)) { rpp_hip_pack_float8_and_store8(dstPtr + dstIdx.x, (d_float8 *)&src_smem[hipThreadIdx_y_channel.x][hipThreadIdx_x8]); rpp_hip_pack_float8_and_store8(dstPtr + dstIdx.y, (d_float8 *)&src_smem[hipThreadIdx_y_channel.y][hipThreadIdx_x8]); rpp_hip_pack_float8_and_store8(dstPtr + dstIdx.z, (d_float8 *)&src_smem[hipThreadIdx_y_channel.z][hipThreadIdx_x8]); } }
尋找差異