8000 Draft : HIP Build Parallelized by Srihari-mcw · Pull Request #397 · r-abishek/rpp · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Draft : HIP Build Parallelized #397

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 51 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
d8df7a5
Initial changes to parallelize HIP build
Srihari-mcw Dec 30, 2024
0a2ef51
Update code to split tensor_sum definition and declaration into separ…
Srihari-mcw Jan 2, 2025
71f33de
Remove redundant thresholding.cpp
Srihari-mcw Jan 2, 2025
dfd0373
Move two batch pd kernels and split tensor operations into hpp and cp…
Srihari-mcw Jan 2, 2025
dc239f1
Modify file import path
Srihari-mcw Jan 2, 2025
aaff07f
Update code for spatter, spectrogram, subtract_channel, swap_channels…
Srihari-mcw Jan 2, 2025
8137607
Fixed compilation issues for spatter
Srihari-mcw Jan 2, 2025
78859ae
Split resize, ricap, slice into hpp and cpp
Srihari-mcw Jan 23, 2025
839ec2b
Split resize_crop_mirror an 8000 d resize_mirror_normalize into hpp and cpp
Srihari-mcw Jan 6, 2025
9652c92
Fix issues with kernels
Srihari-mcw Jan 6, 2025
d9cfd4e
Add changes for normalize, pre emphasis filter, phase, remap and resa…
Srihari-mcw Jan 6, 2025
ab11873
Fix compilation issues with batch pd code
Srihari-mcw Jan 6, 2025
952f878
Update code for noise functions
Srihari-mcw Jan 7, 2025
73c518e
Add fixes for noise functions and update jitter
Srihari-mcw Jan 7, 2025
71db981
Add updates for log, lut, magnitude, mel_filter_bank and multiply_scalar
Srihari-mcw Jan 7, 2025
472c7a4
Updates for two more files to be split into hpp and cpp
Srihari-mcw Jan 8, 2025
5b8ef7f
Separated batchpd and HIP tensor code
HazarathKumarM Jan 6, 2025
4f8d1cb
Spiliting Kernels upto fog kernels
HazarathKumarM Jan 7, 2025
0fc9774
Update code for gaussian_filter, gamma_correction and fused_multiply_…
Srihari-mcw Jan 10, 2025
53311cb
Updates for glitch
Srihari-mcw Jan 10, 2025
4cc0e99
Updates for crop mirror normalize
Srihari-mcw Jan 10, 2025
9085e80
Minor fixes and updates across kernels along with further updates to …
Srihari-mcw Jan 10, 2025
035d6ea
Make updates for spatter kernel
Srihari-mcw Jan 10, 2025
9b52639
Remove batchpd duplicate folder
Srihari-mcw Jan 12, 2025
76e5db5
Update code to move rpp_hip_host_decls.hpp into batch_pd code
Srihari-mcw Jan 12, 2025
7677bf7
Update code to move gaussian_image_pyramid.cpp to batch_pd code
Srihari-mcw Jan 12, 2025
9305b33
Cleanup arithmetic operations
Srihari-mcw Jan 13, 2025
1b7b3aa
Cleanup audio operations
Srihari-mcw Jan 13, 2025
f7a3720
Updates for color augmentations
Srihari-mcw Jan 13, 2025
05a64fe
Updates for data exchange augmentations
Srihari-mcw Jan 13, 2025
c351608
Clean effects augmentation
Srihari-mcw Jan 13, 2025
d9d0635
Update spatter.cpp header
Srihari-mcw Jan 13, 2025
0c7fe94
Cleanup geometric augmentations
Srihari-mcw Jan 13, 2025
9531502
Cleanup filter augmentations
Srihari-mcw Jan 13, 2025
f9c9cda
Further cleanup of gaussian filter
Srihari-mcw Jan 13, 2025
27fa437
Cleanup logical, morphological and statistical operations
Srihari-mcw Jan 14, 2025
f1cc92b
Move additional header files imports to respective cpp files
Srihari-mcw Jan 15, 2025
1986beb
Initial changes for grouping header files
Srihari-mcw Jan 15, 2025
efc5a42
Grouping of headers
Srihari-mcw Jan 15, 2025
d6fca27
Remove unused hpp files
Srihari-mcw Jan 15, 2025
5f6d531
Import necessary group headers into respective kernels
Srihari-mcw Jan 15, 2025
d451274
Change path of hip_tensor file in .cpp file import
Srihari-mcw Jan 15, 2025
e18a247
Update file imports
Srihari-mcw Jan 15, 2025
6855422
Update file imports again
Srihari-mcw Jan 15, 2025
57f1d72
Fix compiler bugs with resize_mirror_normalize.cpp
Srihari-mcw Jan 15, 2025
a6c0385
Fix compiler bugs with gaussian filter
Srihari-mcw Jan 15, 2025
2479a01
Add updates for threshold and rain
Srihari-mcw Jan 15, 2025
7852c79
Fix bugs with rain and threshold
Srihari-mcw Jan 15, 2025
870588d
Fix issues after rebasing
Srihari-mcw Jan 23, 2025
d0636f0
Update hip code for spectrogram
Srihari-mcw Jan 24, 2025
a165575
Remove resize.hpp
Srihari-mcw Jan 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 0 additions & 36 deletions src/include/cpu/rpp_cpu_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6655,42 +6655,6 @@ inline void compute_rain_24_host(__m256 *p1, __m256 p2, __m256 &pMul)
p1[2] = _mm256_fmadd_ps(_mm256_sub_ps(p2, p1[2]), pMul, p1[2]); // alpha-blending adjustment
}

// Compute hanning window
inline RPP_HOST_DEVICE void hann_window(Rpp32f *output, Rpp32s windowSize)
{
Rpp64f a = (2.0 * M_PI) / windowSize;
for (Rpp32s t = 0; t < windowSize; t++)
{
Rpp64f phase = a * (t + 0.5);
output[t] = (0.5 * (1.0 - std::cos(phase)));
}
}

// Compute number of spectrogram windows
inline RPP_HOST_DEVICE Rpp32s get_num_windows(Rpp32s length, Rpp32s windowLength, Rpp32s windowStep, bool centerWindows)
{
if (!centerWindows)
length -= windowLength;
return ((length / windowStep) + 1);
}

// Compute reflect start idx to pad
inline RPP_HOST_DEVICE Rpp32s get_idx_reflect(Rpp32s loc, Rpp32s minLoc, Rpp32s maxLoc)
{
if (maxLoc - minLoc < 2)
return maxLoc - 1;
for (;;)
{
if (loc < minLoc)
loc = 2 * minLoc - loc;
else if (loc >= maxLoc)
loc = 2 * maxLoc - 2 - loc;
else
break;
}
return loc;
}

inline void compute_threshold_8_host(__m256 *p, __m256 *pThresholdParams)
{
p[0] = _mm256_blendv_ps(avx_p0, avx_p1, _mm256_and_ps(_mm256_cmp_ps(p[0], pThresholdParams[0], _CMP_GE_OQ), _mm256_cmp_ps(p[0], pThresholdParams[1],_CMP_LE_OQ)));
Expand Down
2 changes: 1 addition & 1 deletion src/include/func_specific/fog_mask.hpp

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/include/func_specific/rng_seed_stream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ SOFTWARE.
#include <rppdefs.h>
#define SEED_STREAM_MAX_SIZE 4050

alignas(64) Rpp32u rngSeedStream4050[SEED_STREAM_MAX_SIZE] = {
alignas(64) inline Rpp32u rngSeedStream4050[SEED_STREAM_MAX_SIZE] = {
764129, 618740, 582752, 692507, 789955, 785851, 112601, 380578, 896709, 549581, 965817, 162355, 338288, 502799, 447091, 144368, 163130, 717741, 150842, 376320, 241741, 177798, 584469, 620725, 812482,
125948, 712195, 171647, 734259, 318909, 530081, 752344, 134681, 976179, 596526, 768118, 926256, 586994, 583726, 931896, 164391, 257965, 194286, 344998, 859147, 928669, 765093, 514952, 375650, 573219,
316707, 254968, 946657, 111935, 170126, 710776, 386138, 165523, 260376, 872164, 303180, 137553, 767258, 149950, 208631, 762140, 381674, 723139, 474377, 609227, 251119, 861729, 993061, 869503, 666490,
Expand Down
4 changes: 2 additions & 2 deletions src/include/func_specific/spatter_mask.hpp

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion src/modules/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ project(modules)

if( "${BACKEND}" STREQUAL "HIP")
file(GLOB RPP_KERNELS hip/kernel/*.cpp)
file(GLOB RPP_KERNELS hip/batch_pd/*.cpp)
elseif( "${BACKEND}" STREQUAL "OCL")
file(GLOB RPP_KERNELS cl/kernel/*.cl)
endif()
Expand Down Expand Up @@ -90,7 +91,8 @@ if( "${BACKEND}" STREQUAL "HIP")
# Add HIP kernels
file(GLOB MOD_HIP_CPP "hip/*.cpp" )
file(GLOB MOD_HIP_CPP_KERNELS "hip/kernel/*.cpp" )
list(APPEND Rpp_Source ${CPPFILES} ${MOD_HIP_CPP} ${MOD_HIP_CPP_KERNELS})
file(GLOB MOD_HIP_CPP_BATCHPD_KERNELS "hip/batch_pd/*.cpp" )
list(APPEND Rpp_Source ${CPPFILES} ${MOD_HIP_CPP} ${MOD_HIP_CPP_KERNELS} ${MOD_HIP_CPP_BATCHPD_KERNELS})
message("-- ${Green}HIP kernels added${ColourReset}")

# Set compiler flags
Expand Down
36 changes: 36 additions & 0 deletions src/modules/cpu/kernel/spectrogram.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,42 @@ inline bool can_use_real_impl(Rpp64s n) { return is_pow2(n); }
inline Rpp64s size_in_buf(Rpp64s n) { return can_use_real_impl(n) ? n : 2 * n; }
inline Rpp64s size_out_buf(Rpp64s n) { return can_use_real_impl(n) ? n + 2 : 2 * n; }

// Compute hanning window
inline RPP_HOST_DEVICE void hann_window(Rpp32f *output, Rpp32s windowSize)
{
Rpp64f a = (2.0 * M_PI) / windowSize;
for (Rpp32s t = 0; t < windowSize; t++)
{
Rpp64f phase = a * (t + 0.5);
output[t] = (0.5 * (1.0 - std::cos(phase)));
}
}

// Compute number of spectrogram windows
inline RPP_HOST_DEVICE Rpp32s get_num_windows(Rpp32s length, Rpp32s windowLength, Rpp32s windowStep, bool centerWindows)
{
if (!centerWindows)
length -= windowLength;
return ((length / windowStep) + 1);
}

// Compute reflect start idx to pad
inline RPP_HOST_DEVICE Rpp32s get_idx_reflect(Rpp32s loc, Rpp32s minLoc, Rpp32s maxLoc)
{
if (maxLoc - minLoc < 2)
return maxLoc - 1;
for (;;)
{
if (loc < minLoc)
loc = 2 * minLoc - loc;
else if (loc >= maxLoc)
loc = 2 * maxLoc - 2 - loc;
else
break;
}
return loc;
}

RppStatus spectrogram_host_tensor(Rpp32f *srcPtr,
RpptDescPtr srcDescPtr,
Rpp32f *dstPtr,
Expand Down
File renamed without changes.
110 changes: 110 additions & 0 deletions src/modules/hip/batch_pd/blend.cpp
F438
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#include <hip/hip_runtime.h>
#include "rpp_hip_host_decls.hpp"

#define saturate_8u(value) ((value) > 255 ? 255 : ((value) < 0 ? 0 : (value)))

__device__ unsigned char blend_formula(unsigned char input_pixel1, unsigned char input_pixel2, float alpha)
{
return saturate_8u(alpha * input_pixel1 + (1 - alpha) * input_pixel2);
}

extern "C" __global__ void blend(unsigned char *input1,
unsigned char *input2,
unsigned char *output,
const unsigned int height,
const unsigned int width,
const float alpha,
const unsigned int channel)
{
int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;

if (id_x >= width || id_y >= height || id_z >= channel)
{
return;
}

int pixIdx = id_x + id_y * width + id_z * width * height;

output[pixIdx] = ((1-alpha) * input1[pixIdx]) + (alpha * input2[pixIdx]);
}

extern "C" __global__ void blend_batch(unsigned char *input1,
unsigned char *input2,
unsigned char *output,
float *alpha,
unsigned int *xroi_begin,
unsigned int *xroi_end,
unsigned int *yroi_begin,
unsigned int *yroi_end,
unsigned int *height,
unsigned int *width,
unsigned int *max_width,
unsigned long long *batch_index,
const unsigned int channel,
unsigned int *inc, // use width * height for pln and 1 for pkd
const int plnpkdindex) // use 1 pln 3 for pkd
{
int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;

float alphatmp = alpha[id_z];
int indextmp=0;
unsigned long pixIdx = 0;

pixIdx = batch_index[id_z] + (id_x + id_y * max_width[id_z] ) * plnpkdindex;

if((id_y >= yroi_begin[id_z]) && (id_y <= yroi_end[id_z]) && (id_x >= xroi_begin[id_z]) && (id_x <= xroi_end[id_z]))
{
for(indextmp = 0; indextmp < channel; indextmp++)
{
unsigned char valuergb1 = input1[pixIdx];
unsigned char valuergb2 = input2[pixIdx];
output[pixIdx] = blend_formula(valuergb1, valuergb2, alphatmp);
pixIdx += inc[id_z];
}
}
else if((id_x < width[id_z] ) && (id_y < height[id_z]))
{
for(indextmp = 0; indextmp < channel; indextmp++)
{
output[pixIdx] = input1[pixIdx];
pixIdx += inc[id_z];
}
}
}

RppStatus hip_exec_blend_batch(Rpp8u *srcPtr1, Rpp8u *srcPtr2, Rpp8u *dstPtr, rpp::Handle& handle, RppiChnFormat chnFormat, Rpp32u channel, Rpp32s plnpkdind, Rpp32u max_height, Rpp32u max_width)
{
int localThreads_x = 32;
int localThreads_y = 32;
int localThreads_z = 1;
int globalThreads_x = (max_width + 31) & ~31;
int globalThreads_y = (max_height + 31) & ~31;
int globalThreads_z = handle.GetBatchSize();

hipLaunchKernelGGL(blend_batch,
dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
dim3(localThreads_x, localThreads_y, localThreads_z),
0,
handle.GetStream(),
srcPtr1,
srcPtr2,
dstPtr,
handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem,
handle.GetInitHandle()->mem.mgpu.roiPoints.x,
handle.GetInitHandle()->mem.mgpu.roiPoints.roiWidth,
handle.GetInitHandle()->mem.mgpu.roiPoints.y,
handle.GetInitHandle()->mem.mgpu.roiPoints.roiHeight,
handle.GetInitHandle()->mem.mgpu.srcSize.height,
handle.GetInitHandle()->mem.mgpu.srcSize.width,
handle.GetInitHandle()->mem.mgpu.maxSrcSize.width,
handle.GetInitHandle()->mem.mgpu.srcBatchIndex,
channel,
handle.GetInitHandle()->mem.mgpu.inc,
plnpkdind);

return RPP_SUCCESS;
}
127 changes: 127 additions & 0 deletions src/modules/hip/batch_pd/box_filter.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#include <hip/hip_runtime.h>
#include "rpp_hip_host_decls.hpp"

#define saturate_8u(value) ((value) > 255 ? 255 : ((value) < 0 ? 0 : (value)))


extern "C" __global__ void box_filter_batch(unsigned char *input,
unsigned char *output,
unsigned int *kernelSize,
unsigned int *xroi_begin,
unsigned int *xroi_end,
unsigned int *yroi_begin,
unsigned int *yroi_end,
unsigned int *height,
unsigned int *width,
unsigned int *max_width,
unsigned long long *batch_index,
const unsigned int channel,
unsigned int *inc, // use width * height for pln and 1 for pkd
const int plnpkdindex) // use 1 pln 3 for pkd
{
int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;

unsigned char valuer, valuer1, valueg, valueg1, valueb, valueb1;
int kernelSizeTemp = kernelSize[id_z];

int bound = (kernelSizeTemp - 1) / 2;
if(id_x < width[id_z] && id_y < height[id_z])
{
long pixIdx = batch_index[id_z] + (id_x + id_y * max_width[id_z]) * plnpkdindex;
if((id_y >= yroi_begin[id_z]) && (id_y <= yroi_end[id_z]) && (id_x >= xroi_begin[id_z]) && (id_x <= xroi_end[id_z]))
{
int r = 0, g = 0, b = 0;
for(int i = -bound; i <= bound; i++)
{
for(int j = -bound; j <= bound; j++)
{
if(id_x + j >= 0 && id_x + j <= width[id_z] - 1 && id_y + i >= 0 && id_y + i <= height[id_z] - 1)
{
unsigned int index = pixIdx + (j + (i * max_width[id_z])) * plnpkdindex;
r += input[index];
if(channel == 3)
{
index = pixIdx + (j + (i * max_width[id_z])) * plnpkdindex + inc[id_z];
g += input[index];
index = pixIdx + (j + (i * max_width[id_z])) * plnpkdindex + inc[id_z] * 2;
b += input[index];
}
}
else
{
r = 0;
if(channel == 3)
{
g = 0;
b = 0;
}
break;
}
}
}

if(id_x >= bound && id_x <= width[id_z] - bound - 1 && id_y >= bound && id_y <= height[id_z] - bound - 1 )
{
int temp = (int)(r / (kernelSizeTemp * kernelSizeTemp));
output[pixIdx] = saturate_8u(temp);
if(channel == 3)
{
temp = (int)(g / (kernelSizeTemp * kernelSizeTemp));
output[pixIdx + inc[id_z]] = saturate_8u(temp);
temp = (int)(b / (kernelSizeTemp * kernelSizeTemp));
output[pixIdx + inc[id_z] * 2] = saturate_8u(temp);
}
}
else
{
for(int indextmp = 0; indextmp < channel; indextmp++)
{
output[pixIdx] = input[pixIdx];
pixIdx += inc[id_z];
}
}
}
else if((id_x < width[id_z]) && (id_y < height[id_z]))
{
for(int indextmp = 0; indextmp < channel; indextmp++)
{
output[pixIdx] = input[pixIdx];
pixIdx += inc[id_z];
}
}
}
}

RppStatus hip_exec_box_filter_batch(Rpp8u *srcPtr, Rpp8u *dstPtr, rpp::Handle& handle, RppiChnFormat chnFormat, Rpp32u channel, Rpp32s plnpkdind, Rpp32u max_height, Rpp32u max_width)
{
int localThreads_x = 32;
int localThreads_y = 32;
int localThreads_z = 1;
int globalThreads_x = (max_width + 31) & ~31;
int globalThreads_y = (max_height + 31) & ~31;
int globalThreads_z = handle.GetBatchSize();

hipLaunchKernelGGL(box_filter_batch,
dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
dim3(localThreads_x, localThreads_y, localThreads_z),
0,
handle.GetStream(),
srcPtr,
dstPtr,
handle.GetInitHandle()->mem.mgpu.uintArr[0].uintmem,
handle.GetInitHandle()->mem.mgpu.roiPoints.x,
handle.GetInitHandle()->mem.mgpu.roiPoints.roiWidth,
handle.GetInitHandle()->mem.mgpu.roiPoints.y,
handle.GetInitHandle()->mem.mgpu.roiPoints.roiHeight,
handle.GetInitHandle()->mem.mgpu.srcSize.height,
handle.GetInitHandle()->mem.mgpu.srcSize.width,
handle.GetInitHandle()->mem.mgpu.maxSrcSize.width,
handle.GetInitHandle()->mem.mgpu.srcBatchIndex,
channel,
handle.GetInitHandle()->mem.mgpu.inc,
plnpkdind);

return RPP_SUCCESS;
}
Loading
0