r-abishek · Srihari-mcw · Dec 30, 2024 · Jan 2, 2025 · Jan 2, 2025 · Jan 2, 2025
diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp
@@ -6655,42 +6655,6 @@ inline void compute_rain_24_host(__m256 *p1, __m256 p2, __m256 &pMul)
     p1[2] = _mm256_fmadd_ps(_mm256_sub_ps(p2, p1[2]), pMul, p1[2]);    // alpha-blending adjustment
 }
 
-// Compute hanning window
-inline RPP_HOST_DEVICE void hann_window(Rpp32f *output, Rpp32s windowSize)
-{
-    Rpp64f a = (2.0 * M_PI) / windowSize;
-    for (Rpp32s t = 0; t < windowSize; t++)
-    {
-        Rpp64f phase = a * (t + 0.5);
-        output[t] = (0.5 * (1.0 - std::cos(phase)));
-    }
-}
-
-// Compute number of spectrogram windows
-inline RPP_HOST_DEVICE Rpp32s get_num_windows(Rpp32s length, Rpp32s windowLength, Rpp32s windowStep, bool centerWindows)
-{
-    if (!centerWindows)
-        length -= windowLength;
-    return ((length / windowStep) + 1);
-}
-
-// Compute reflect start idx to pad
-inline RPP_HOST_DEVICE Rpp32s get_idx_reflect(Rpp32s loc, Rpp32s minLoc, Rpp32s maxLoc)
-{
-    if (maxLoc - minLoc < 2)
-        return maxLoc - 1;
-    for (;;)
-    {
-        if (loc < minLoc)
-            loc = 2 * minLoc - loc;
-        else if (loc >= maxLoc)
-            loc = 2 * maxLoc - 2 - loc;
-        else
-            break;
-    }
-    return loc;
-}
-
 inline void compute_threshold_8_host(__m256 *p, __m256 *pThresholdParams)
 {
     p[0] = _mm256_blendv_ps(avx_p0, avx_p1, _mm256_and_ps(_mm256_cmp_ps(p[0], pThresholdParams[0], _CMP_GE_OQ), _mm256_cmp_ps(p[0], pThresholdParams[1],_CMP_LE_OQ)));

diff --git a/src/include/func_specific/fog_mask.hpp b/src/include/func_specific/fog_mask.hpp
diff --git a/src/include/func_specific/rng_seed_stream.hpp b/src/include/func_specific/rng_seed_stream.hpp
@@ -28,7 +28,7 @@ SOFTWARE.
 #include <rppdefs.h>
 #define SEED_STREAM_MAX_SIZE 4050
 
-alignas(64) Rpp32u rngSeedStream4050[SEED_STREAM_MAX_SIZE] = {
+alignas(64) inline Rpp32u rngSeedStream4050[SEED_STREAM_MAX_SIZE] = {
 764129, 618740, 582752, 692507, 789955, 785851, 112601, 380578, 896709, 549581, 965817, 162355, 338288, 502799, 447091, 144368, 163130, 717741, 150842, 376320, 241741, 177798, 584469, 620725, 812482,
 125948, 712195, 171647, 734259, 318909, 530081, 752344, 134681, 976179, 596526, 768118, 926256, 586994, 583726, 931896, 164391, 257965, 194286, 344998, 859147, 928669, 765093, 514952, 375650, 573219,
 316707, 254968, 946657, 111935, 170126, 710776, 386138, 165523, 260376, 872164, 303180, 137553, 767258, 149950, 208631, 762140, 381674, 723139, 474377, 609227, 251119, 861729, 993061, 869503, 666490,

diff --git a/src/include/func_specific/spatter_mask.hpp b/src/include/func_specific/spatter_mask.hpp
diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt
@@ -26,6 +26,7 @@ project(modules)
 
 if( "${BACKEND}" STREQUAL "HIP")
     file(GLOB RPP_KERNELS hip/kernel/*.cpp)
+    file(GLOB RPP_KERNELS hip/batch_pd/*.cpp)
 elseif( "${BACKEND}" STREQUAL "OCL")
     file(GLOB RPP_KERNELS cl/kernel/*.cl)
 endif()
@@ -90,7 +91,8 @@ if( "${BACKEND}" STREQUAL "HIP")
     # Add HIP kernels
     file(GLOB MOD_HIP_CPP "hip/*.cpp" )
     file(GLOB MOD_HIP_CPP_KERNELS "hip/kernel/*.cpp" )
-    list(APPEND Rpp_Source ${CPPFILES} ${MOD_HIP_CPP} ${MOD_HIP_CPP_KERNELS})
+    file(GLOB MOD_HIP_CPP_BATCHPD_KERNELS "hip/batch_pd/*.cpp" )
+    list(APPEND Rpp_Source ${CPPFILES} ${MOD_HIP_CPP} ${MOD_HIP_CPP_KERNELS} ${MOD_HIP_CPP_BATCHPD_KERNELS})
     message("-- ${Green}HIP kernels added${ColourReset}")
 
     # Set compiler flags

diff --git a/src/modules/cpu/kernel/spectrogram.hpp b/src/modules/cpu/kernel/spectrogram.hpp
@@ -32,6 +32,42 @@ inline bool can_use_real_impl(Rpp64s n) { return is_pow2(n); }
 inline Rpp64s size_in_buf(Rpp64s n) { return can_use_real_impl(n) ? n : 2 * n; }
 inline Rpp64s size_out_buf(Rpp64s n) { return can_use_real_impl(n) ? n + 2 : 2 * n; }
 
+// Compute hanning window
+inline RPP_HOST_DEVICE void hann_window(Rpp32f *output, Rpp32s windowSize)
+{
+    Rpp64f a = (2.0 * M_PI) / windowSize;
+    for (Rpp32s t = 0; t < windowSize; t++)
+    {
+        Rpp64f phase = a * (t + 0.5);
+        output[t] = (0.5 * (1.0 - std::cos(phase)));
+    }
+}
+
+// Compute number of spectrogram windows
+inline RPP_HOST_DEVICE Rpp32s get_num_windows(Rpp32s length, Rpp32s windowLength, Rpp32s windowStep, bool centerWindows)
+{
+    if (!centerWindows)
+        length -= windowLength;
+    return ((length / windowStep) + 1);
+}
+
+// Compute reflect start idx to pad
+inline RPP_HOST_DEVICE Rpp32s get_idx_reflect(Rpp32s loc, Rpp32s minLoc, Rpp32s maxLoc)
+{
+    if (maxLoc - minLoc < 2)
+        return maxLoc - 1;
+    for (;;)
+    {
+        if (loc < minLoc)
+            loc = 2 * minLoc - loc;
+        else if (loc >= maxLoc)
+            loc = 2 * maxLoc - 2 - loc;
+        else
+            break;
+    }
+    return loc;
+}
+
 RppStatus spectrogram_host_tensor(Rpp32f *srcPtr,
                                   RpptDescPtr srcDescPtr,
                                   Rpp32f *dstPtr,

diff --git a/...odules/hip/kernel/absolute_difference.cpp → ...ules/hip/batch_pd/absolute_difference.cpp b/...odules/hip/kernel/absolute_difference.cpp → ...ules/hip/batch_pd/absolute_difference.cpp
diff --git a/src/modules/hip/kernel/accumulate.cpp → src/modules/hip/batch_pd/accumulate.cpp b/src/modules/hip/kernel/accumulate.cpp → src/modules/hip/batch_pd/accumulate.cpp
diff --git a/src/modules/hip/kernel/add.cpp → src/modules/hip/batch_pd/add.cpp b/src/modules/hip/kernel/add.cpp → src/modules/hip/batch_pd/add.cpp
diff --git a/src/modules/hip/kernel/bilateral_filter.cpp → ...modules/hip/batch_pd/bilateral_filter.cpp b/src/modules/hip/kernel/bilateral_filter.cpp → ...modules/hip/batch_pd/bilateral_filter.cpp
diff --git a/src/modules/hip/kernel/bitwise_AND.cpp → src/modules/hip/batch_pd/bitwise_AND.cpp b/src/modules/hip/kernel/bitwise_AND.cpp → src/modules/hip/batch_pd/bitwise_AND.cpp
diff --git a/src/modules/hip/kernel/bitwise_NOT.cpp → src/modules/hip/batch_pd/bitwise_NOT.cpp b/src/modules/hip/kernel/bitwise_NOT.cpp → src/modules/hip/batch_pd/bitwise_NOT.cpp
diff --git a/src/modules/hip/batch_pd/blend.cpp b/src/modules/hip/batch_pd/blend.cpp
@@ -0,0 +1,110 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_host_decls.hpp"
+
+#define saturate_8u(value) ((value) > 255 ? 255 : ((value) < 0 ? 0 : (value)))
+
+__device__ unsigned char blend_formula(unsigned char input_pixel1, unsigned char input_pixel2, float alpha)
+{
+    return saturate_8u(alpha * input_pixel1 + (1 - alpha) * input_pixel2);
+}
+
+extern "C" __global__ void blend(unsigned char *input1,
+                                 unsigned char *input2,
+                                 unsigned char *output,
+                                 const unsigned int height,
+                                 const unsigned int width,
+                                 const float alpha,
+                                 const unsigned int channel)
+{
+    int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+    int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+    if (id_x >= width || id_y >= height || id_z >= channel)
+    {
+        return;
+    }
+
+    int pixIdx = id_x + id_y * width + id_z * width * height;
+
+    output[pixIdx] = ((1-alpha) * input1[pixIdx]) + (alpha * input2[pixIdx]);
+}
+
+extern "C" __global__ void blend_batch(unsigned char *input1,
+                                       unsigned char *input2,
+                                       unsigned char *output,
+                                       float *alpha,
+                                       unsigned int *xroi_begin,
+                                       unsigned int *xroi_end,
+                                       unsigned int *yroi_begin,
+                                       unsigned int *yroi_end,
+                                       unsigned int *height,
+                                       unsigned int *width,
+                                       unsigned int *max_width,
+                                       unsigned long long *batch_index,
+                                       const unsigned int channel,
+                                       unsigned int *inc, // use width * height for pln and 1 for pkd
+                                       const int plnpkdindex) // use 1 pln 3 for pkd
+{
+    int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+    int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+    float alphatmp = alpha[id_z];
+    int indextmp=0;
+    unsigned long pixIdx = 0;
+
+    pixIdx = batch_index[id_z] + (id_x + id_y * max_width[id_z] ) * plnpkdindex;
+
+    if((id_y >= yroi_begin[id_z]) && (id_y <= yroi_end[id_z]) && (id_x >= xroi_begin[id_z]) && (id_x <= xroi_end[id_z]))
+    {
+        for(indextmp = 0; indextmp < channel; indextmp++)
+        {
+            unsigned char valuergb1 = input1[pixIdx];
+            unsigned char valuergb2 = input2[pixIdx];
+            output[pixIdx] = blend_formula(valuergb1, valuergb2, alphatmp);
+            pixIdx += inc[id_z];
+        }
+    }
+    else if((id_x < width[id_z] ) && (id_y < height[id_z]))
+    {
+        for(indextmp = 0; indextmp < channel; indextmp++)
+        {
+            output[pixIdx] = input1[pixIdx];
+            pixIdx += inc[id_z];
+        }
+    }
+}
+
+RppStatus hip_exec_blend_batch(Rpp8u *srcPtr1, Rpp8u *srcPtr2, Rpp8u *dstPtr, rpp::Handle& handle, RppiChnFormat chnFormat, Rpp32u channel, Rpp32s plnpkdind, Rpp32u max_height, Rpp32u max_width)
+{
+    int localThreads_x = 32;
+    int localThreads_y = 32;
+    int localThreads_z = 1;
+    int globalThreads_x = (max_width + 31) & ~31;
+    int globalThreads_y = (max_height + 31) & ~31;
+    int globalThreads_z = handle.GetBatchSize();
+
+    hipLaunchKernelGGL(blend_batch,
+                       dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
+                       dim3(localThreads_x, localThreads_y, localThreads_z),
+                       0,
+                       handle.GetStream(),
+                       srcPtr1,
+                       srcPtr2,
+                       dstPtr,
+                       handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem,
+                       handle.GetInitHandle()->mem.mgpu.roiPoints.x,
+                       handle.GetInitHandle()->mem.mgpu.roiPoints.roiWidth,
+                       handle.GetInitHandle()->mem.mgpu.roiPoints.y,
+                       handle.GetInitHandle()->mem.mgpu.roiPoints.roiHeight,
+                       handle.GetInitHandle()->mem.mgpu.srcSize.height,
+                       handle.GetInitHandle()->mem.mgpu.srcSize.width,
+                       handle.GetInitHandle()->mem.mgpu.maxSrcSize.width,
+                       handle.GetInitHandle()->mem.mgpu.srcBatchIndex,
+                       channel,
+                       handle.GetInitHandle()->mem.mgpu.inc,
+                       plnpkdind);
+
+    return RPP_SUCCESS;
+}
diff --git a/src/modules/hip/batch_pd/box_filter.cpp b/src/modules/hip/batch_pd/box_filter.cpp
@@ -0,0 +1,127 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_host_decls.hpp"
+
+#define saturate_8u(value) ((value) > 255 ? 255 : ((value) < 0 ? 0 : (value)))
+
+
+extern "C" __global__ void box_filter_batch(unsigned char *input,
+                                            unsigned char *output,
+                                            unsigned int *kernelSize,
+                                            unsigned int *xroi_begin,
+                                            unsigned int *xroi_end,
+                                            unsigned int *yroi_begin,
+                                            unsigned int *yroi_end,
+                                            unsigned int *height,
+                                            unsigned int *width,
+                                            unsigned int *max_width,
+                                            unsigned long long *batch_index,
+                                            const unsigned int channel,
+                                            unsigned int *inc, // use width * height for pln and 1 for pkd
+                                            const int plnpkdindex) // use 1 pln 3 for pkd
+{
+    int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+    int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+    unsigned char valuer, valuer1, valueg, valueg1, valueb, valueb1;
+    int kernelSizeTemp = kernelSize[id_z];
+
+    int bound = (kernelSizeTemp - 1) / 2;
+    if(id_x < width[id_z] && id_y < height[id_z])
+    {
+        long pixIdx = batch_index[id_z] + (id_x + id_y * max_width[id_z]) * plnpkdindex;
+        if((id_y >= yroi_begin[id_z]) && (id_y <= yroi_end[id_z]) && (id_x >= xroi_begin[id_z]) && (id_x <= xroi_end[id_z]))
+        {
+            int r = 0, g = 0, b = 0;
+            for(int i = -bound; i <= bound; i++)
+            {
+                for(int j = -bound; j <= bound; j++)
+                {
+                    if(id_x + j >= 0 && id_x + j <= width[id_z] - 1 && id_y + i >= 0 && id_y + i <= height[id_z] - 1)
+                    {
+                        unsigned int index = pixIdx + (j + (i * max_width[id_z])) * plnpkdindex;
+                        r += input[index];
+                        if(channel == 3)
+                        {
+                            index = pixIdx + (j + (i * max_width[id_z])) * plnpkdindex + inc[id_z];
+                            g += input[index];
+                            index = pixIdx + (j + (i * max_width[id_z])) * plnpkdindex + inc[id_z] * 2;
+                            b += input[index];
+                        }
+                    }
+                    else
+                    {
+                        r = 0;
+                        if(channel == 3)
+                        {
+                            g = 0;
+                            b = 0;
+                        }
+                        break;
+                    }
+                }
+            }
+
+            if(id_x >= bound && id_x <= width[id_z] - bound - 1 && id_y >= bound && id_y <= height[id_z] - bound - 1 )
+            {
+                int temp = (int)(r / (kernelSizeTemp * kernelSizeTemp));
+                output[pixIdx] = saturate_8u(temp);
+                if(channel == 3)
+                {
+                    temp = (int)(g / (kernelSizeTemp * kernelSizeTemp));
+                    output[pixIdx + inc[id_z]] = saturate_8u(temp);
+                    temp = (int)(b / (kernelSizeTemp * kernelSizeTemp));
+                    output[pixIdx + inc[id_z] * 2] = saturate_8u(temp);
+                }
+            }
+            else
+            {
+                for(int indextmp = 0; indextmp < channel; indextmp++)
+                {
+                    output[pixIdx] = input[pixIdx];
+                    pixIdx += inc[id_z];
+                }
+            }
+        }
+        else if((id_x < width[id_z]) && (id_y < height[id_z]))
+        {
+            for(int indextmp = 0; indextmp < channel; indextmp++)
+            {
+                output[pixIdx] = input[pixIdx];
+                pixIdx += inc[id_z];
+            }
+        }
+    }
+}
+
+RppStatus hip_exec_box_filter_batch(Rpp8u *srcPtr, Rpp8u *dstPtr, rpp::Handle& handle, RppiChnFormat chnFormat, Rpp32u channel, Rpp32s plnpkdind, Rpp32u max_height, Rpp32u max_width)
+{
+    int localThreads_x = 32;
+    int localThreads_y = 32;
+    int localThreads_z = 1;
+    int globalThreads_x = (max_width + 31) & ~31;
+    int globalThreads_y = (max_height + 31) & ~31;
+    int globalThreads_z = handle.GetBatchSize();
+
+    hipLaunchKernelGGL(box_filter_batch,
+                       dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
+                       dim3(localThreads_x, localThreads_y, localThreads_z),
+                       0,
+                       handle.GetStream(),
+                       srcPtr,
+                       dstPtr,
+                       handle.GetInitHandle()->mem.mgpu.uintArr[0].uintmem,
+                       handle.GetInitHandle()->mem.mgpu.roiPoints.x,
+                       handle.GetInitHandle()->mem.mgpu.roiPoints.roiWidth,
+                       handle.GetInitHandle()->mem.mgpu.roiPoints.y,
+                       handle.GetInitHandle()->mem.mgpu.roiPoints.roiHeight,
+                       handle.GetInitHandle()->mem.mgpu.srcSize.height,
+                       handle.GetInitHandle()->mem.mgpu.srcSize.width,
+                       handle.GetInitHandle()->mem.mgpu.maxSrcSize.width,
+                       handle.GetInitHandle()->mem.mgpu.srcBatchIndex,
+                       channel,
+                       handle.GetInitHandle()->mem.mgpu.inc,
+                       plnpkdind);
+
+    return RPP_SUCCESS;
+}