Add Batch jacobi device kernels #1600
Merged
Commits (14):
6c38777  add cuda/hip kernels and tests (pratikvn)
19f36e0  test updates (pratikvn)
ef4726c  Fix tests (pratikvn)
37d2cd8  Add dpcpp kernels (pratikvn)
fbb8ec5  dpcpp kernel fixes (pratikvn)
b2a41ac  test fixes (pratikvn)
21092b2  review updates (pratikvn)
67d0aa3  fix typos (pratikvn)
dddcec1  update using decls, and fix dyn work func (pratikvn)
e22a6b7  comment format (pratikvn)
4760455  review updates (pratikvn)
1c3d1a0  review updates (pratikvn)
202f227  ensure block jac diag is not zero (pratikvn)
691a479  batch jacobi, dpcpp: disable failing tests (pratikvn)
common/cuda_hip/preconditioner/batch_jacobi_kernels.hpp.inc (245 additions, 0 deletions)
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause


// one thread per block: store each block's dense storage requirement (bsize^2)
__global__ void compute_block_storage_kernel(
    const gko::size_type num_blocks,
    const int* const __restrict__ block_pointers,
    int* const __restrict__ blocks_cumulative_offsets)
{
    const auto tid = threadIdx.x + blockIdx.x * blockDim.x;

    for (int i = tid; i < num_blocks; i += blockDim.x * gridDim.x) {
        const auto bsize = block_pointers[i + 1] - block_pointers[i];
        blocks_cumulative_offsets[i] = bsize * bsize;
    }
}
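
This kernel only stores each block's squared size; the cumulative offsets that the later kernels index with are presumably produced by a prefix sum over this array afterwards. A minimal host-side sketch of that two-step pattern, assuming a Thrust exclusive scan and an offsets array of num_blocks + 1 entries (names and dispatch are illustrative, not the PR's actual code):

#include <thrust/device_vector.h>
#include <thrust/scan.h>

void build_cumulative_offsets(const thrust::device_vector<int>& block_pointers,
                              const int num_blocks,
                              thrust::device_vector<int>& offsets)
{
    const int threads = 256;
    const int grid = (num_blocks + threads - 1) / threads;
    // offsets must have num_blocks + 1 entries; entry i receives bsize_i^2
    compute_block_storage_kernel<<<grid, threads>>>(
        num_blocks, thrust::raw_pointer_cast(block_pointers.data()),
        thrust::raw_pointer_cast(offsets.data()));
    // the exclusive scan turns per-block sizes into offsets; the last entry
    // then holds the total dense storage needed for all blocks
    thrust::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin());
}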


__global__ __launch_bounds__(default_block_size) void find_row_block_map_kernel(
    const gko::size_type num_blocks,
    const int* const __restrict__ block_pointers,
    int* const __restrict__ map_block_to_row)
{
    const auto tid = threadIdx.x + blockIdx.x * blockDim.x;

    for (int block_idx = tid; block_idx < num_blocks;
         block_idx += blockDim.x * gridDim.x) {
        for (int i = block_pointers[block_idx];
             i < block_pointers[block_idx + 1]; i++) {
            map_block_to_row[i] = block_idx;  // uncoalesced accesses
        }
    }
}
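
A small host-side reference (an illustration, not part of the PR) makes the mapping concrete: for block_pointers = {0, 2, 5, 6} it returns {0, 0, 1, 1, 1, 2}, i.e. row r belongs to block b iff block_pointers[b] <= r < block_pointers[b + 1].

#include <vector>

std::vector<int> reference_row_block_map(const std::vector<int>& block_pointers)
{
    std::vector<int> map(block_pointers.back());
    for (int b = 0; b + 1 < static_cast<int>(block_pointers.size()); ++b) {
        for (int r = block_pointers[b]; r < block_pointers[b + 1]; ++r) {
            map[r] = b;  // row r is owned by diagonal block b
        }
    }
    return map;
}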


__global__
__launch_bounds__(default_block_size) void extract_common_block_pattern_kernel(
    const int nrows, const int* const __restrict__ sys_row_ptrs,
    const int* const __restrict__ sys_col_idxs, const gko::size_type num_blocks,
    const int* const __restrict__ blocks_cumulative_offsets,
    const int* const __restrict__ block_pointers,
    const int* const __restrict__ map_block_to_row, int* const blocks_pattern)
{
    constexpr auto tile_size =
        config::warp_size;  // use full warp for coalesced memory accesses
    auto thread_block = group::this_thread_block();
    auto warp_grp = group::tiled_partition<tile_size>(thread_block);
    const int warp_id_in_grid = thread::get_subwarp_id_flat<tile_size, int>();
    const int total_num_warps_in_grid =
        thread::get_subwarp_num_flat<tile_size, int>();
    const int id_within_warp = warp_grp.thread_rank();

    // one warp per row of the matrix
    for (int row_idx = warp_id_in_grid; row_idx < nrows;
         row_idx += total_num_warps_in_grid) {
        const int block_idx = map_block_to_row[row_idx];
        const int idx_start = block_pointers[block_idx];
        const int idx_end = block_pointers[block_idx + 1];
        int* __restrict__ pattern_ptr =
            blocks_pattern + gko::detail::batch_jacobi::get_block_offset(
                                 block_idx, blocks_cumulative_offsets);
        const auto stride =
            gko::detail::batch_jacobi::get_stride(block_idx, block_pointers);

        for (int i = sys_row_ptrs[row_idx] + id_within_warp;
             i < sys_row_ptrs[row_idx + 1]; i += tile_size) {
            const int col_idx = sys_col_idxs[i];  // coalesced accesses

            if (col_idx >= idx_start && col_idx < idx_end) {
                // element at (row_idx, col_idx) is part of the diagonal block
                // store it into the pattern
                const int dense_block_row = row_idx - idx_start;
                const int dense_block_col = col_idx - idx_start;

                // The pattern is stored in row-major order
                pattern_ptr[dense_block_row * stride + dense_block_col] =
                    i;  // coalesced accesses
            }
        }
    }
}
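
Sequentially, the kernel above computes the following (a sketch under the assumption that blocks_pattern is pre-initialized to -1, so positions absent from the CSR pattern later read back as structural zeros; get_block_offset and get_stride are modeled here by an exclusive-offset array and the dense block size):

#include <vector>

void reference_extract_pattern(const int nrows, const std::vector<int>& row_ptrs,
                               const std::vector<int>& col_idxs,
                               const std::vector<int>& block_pointers,
                               const std::vector<int>& map_block_to_row,
                               const std::vector<int>& block_offsets,
                               std::vector<int>& pattern)
{
    for (int row = 0; row < nrows; ++row) {
        const int b = map_block_to_row[row];
        const int start = block_pointers[b];
        const int stride = block_pointers[b + 1] - start;  // dense block size
        for (int i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
            const int col = col_idxs[i];
            if (col >= start && col < start + stride) {
                // record the CSR value index of entry (row, col) in row-major
                // order within block b
                pattern[block_offsets[b] + (row - start) * stride +
                        (col - start)] = i;
            }
        }
    }
}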


// Selects the pivot row for elimination step k: the subwarp performs an
// argmax reduction over |block_row[k]|, excluding rows already used as pivots
// (perm > -1) and lanes beyond the block size.
template <typename Group, typename ValueType>
__device__ __forceinline__ int choose_pivot(
    Group subwarp_grp, const int block_size,
    const ValueType* const __restrict__ block_row, const int& perm, const int k)
{
    auto my_abs_ele = abs(block_row[k]);
    if (perm > -1) {
        my_abs_ele = -1;
    }
    if (subwarp_grp.thread_rank() >= block_size) {
        my_abs_ele = -1;
    }
    subwarp_grp.sync();
    int my_piv_idx = subwarp_grp.thread_rank();
#pragma unroll
    for (int a = subwarp_grp.size() / 2; a > 0; a /= 2) {
        const auto abs_ele_other = subwarp_grp.shfl_down(my_abs_ele, a);
        const int piv_idx_other = subwarp_grp.shfl_down(my_piv_idx, a);

        if (my_abs_ele < abs_ele_other) {
            my_abs_ele = abs_ele_other;
            my_piv_idx = piv_idx_other;
        }
    }
    subwarp_grp.sync();
    const int ipiv = subwarp_grp.shfl(my_piv_idx, 0);
    return ipiv;
}
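
The reduction inside choose_pivot is the classic shuffle-based warp argmax: shfl_down halves the active range each step until lane 0 holds the winner, which is then broadcast. A standalone CUDA illustration of the same tree reduction, using raw intrinsics instead of the cooperative-group wrappers (an illustrative sketch, not the PR's code):

__device__ int warp_argmax(float val)
{
    int idx = threadIdx.x % 32;  // this lane's candidate index
    for (int offset = 16; offset > 0; offset /= 2) {
        const float other_val = __shfl_down_sync(0xffffffffu, val, offset);
        const int other_idx = __shfl_down_sync(0xffffffffu, idx, offset);
        if (val < other_val) {  // keep the larger value and its index
            val = other_val;
            idx = other_idx;
        }
    }
    return __shfl_sync(0xffffffffu, idx, 0);  // broadcast winner to all lanes
}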


template <typename Group, typename ValueType>
__device__ __forceinline__ void invert_dense_block(Group subwarp_grp,
                                                   const int block_size,
                                                   ValueType* const block_row,
                                                   int& perm)
{
    // Gauss-Jordan elimination with implicit pivoting
    for (int k = 0; k < block_size; k++) {
        // implicit pivoting
        const int ipiv = choose_pivot(subwarp_grp, block_size, block_row, perm,
                                      k);  // pivot index
        if (subwarp_grp.thread_rank() == ipiv) {
            perm = k;
        }
        const ValueType d =
            (subwarp_grp.shfl(block_row[k], ipiv) == zero<ValueType>())
                ? one<ValueType>()
                : subwarp_grp.shfl(block_row[k], ipiv);
        // scale kth col
        block_row[k] /= -d;
        if (subwarp_grp.thread_rank() == ipiv) {
            block_row[k] = zero<ValueType>();
        }
        const ValueType row_val = block_row[k];
        // rank-1 update
        for (int col = 0; col < block_size; col++) {
            const ValueType col_val = subwarp_grp.shfl(block_row[col], ipiv);
            block_row[col] += row_val * col_val;
        }
        // Computations for the threads of the subwarp having local id >=
        // block_size are meaningless.

        // scale ipiv-th row
        if (subwarp_grp.thread_rank() == ipiv) {
            for (int i = 0; i < block_size; i++) {
                block_row[i] /= d;
            }
            block_row[k] = one<ValueType>() / d;
        }
    }
}
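
For intuition, here is the same per-step update written sequentially and without pivoting (a sketch for a well-conditioned row-major n x n matrix; the device kernel adds the implicit pivoting and the zero-diagonal guard on top). After n sweeps the matrix has been replaced in place by its inverse:

#include <vector>

void gauss_jordan_invert_nopivot(std::vector<double>& a, const int n)
{
    for (int k = 0; k < n; ++k) {
        const double d = a[k * n + k];
        // scale the k-th column by -1/d and zero the pivot entry
        for (int i = 0; i < n; ++i) {
            a[i * n + k] /= -d;
        }
        a[k * n + k] = 0.0;
        // rank-1 update: row i gains a[i][k] times the pivot row
        // (the pivot row itself is untouched since a[k][k] is now zero)
        for (int i = 0; i < n; ++i) {
            for (int j = 0; j < n; ++j) {
                a[i * n + j] += a[i * n + k] * a[k * n + j];
            }
        }
        // scale the pivot row by 1/d and place the new pivot entry
        for (int j = 0; j < n; ++j) {
            a[k * n + j] /= d;
        }
        a[k * n + k] = 1.0 / d;
    }
}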


template <int subwarp_size, typename ValueType>
__global__
__launch_bounds__(default_block_size) void compute_block_jacobi_kernel(
    const gko::size_type nbatch, const int nnz, const ValueType* const A_vals,
    const gko::size_type num_blocks,
    const int* const __restrict__ blocks_cumulative_offsets,
    const int* const __restrict__ block_pointers,
    const int* const blocks_pattern, ValueType* const blocks)
{
    auto thread_block = group::this_thread_block();
    auto subwarp_grp = group::tiled_partition<subwarp_size>(thread_block);
    const int subwarp_id_in_grid =
        thread::get_subwarp_id_flat<subwarp_size, int>();
    const int total_num_subwarps_in_grid =
        thread::get_subwarp_num_flat<subwarp_size, int>();
    const int id_within_subwarp = subwarp_grp.thread_rank();

    // one subwarp per small diagonal block
    for (size_type i = subwarp_id_in_grid; i < nbatch * num_blocks;
         i += total_num_subwarps_in_grid) {
        const auto batch_idx = i / num_blocks;
        const auto block_idx = i % num_blocks;

        ValueType block_row[subwarp_size];
        const auto block_size =
            block_pointers[block_idx + 1] - block_pointers[block_idx];
        assert(block_size <= subwarp_size);

        const int* __restrict__ current_block_pattern =
            blocks_pattern + gko::detail::batch_jacobi::get_block_offset(
                                 block_idx, blocks_cumulative_offsets);
        ValueType* __restrict__ current_block_data =
            blocks +
            gko::detail::batch_jacobi::get_global_block_offset(
                batch_idx, num_blocks, block_idx, blocks_cumulative_offsets);
        const auto stride =
            gko::detail::batch_jacobi::get_stride(block_idx, block_pointers);

        // each thread of the subwarp stores the column of the dense block/row
        // of the transposed block in its local memory
        if (id_within_subwarp < block_size) {
            for (int a = 0; a < block_size; a++) {
                const auto idx = current_block_pattern
                    [a * gko::detail::batch_jacobi::get_stride(
                             block_idx, block_pointers) +
                     id_within_subwarp];  // coalesced accesses
                ValueType val_to_fill = zero<ValueType>();
                if (idx >= 0) {
                    assert(idx < nnz);
                    val_to_fill = A_vals[idx + nnz * batch_idx];
                }
                block_row[a] = val_to_fill;
            }
        }

        int perm = -1;

        // invert
        invert_dense_block(subwarp_grp, block_size, block_row,
                           perm);  // invert the transpose of the dense block.
        // Note: Each thread of the subwarp has a row of the block to be
        // inverted. (local id: thread 0 has the 0th row, thread 1 the 1st row,
        // and so on.)
        // If block_size < subwarp_size, then threads with local id >=
        // block_size do not mean anything. Also, values in block_row at
        // index >= block_size are meaningless.

        subwarp_grp.sync();

        // write back the transpose of the transposed inverse matrix to the
        // block array
        for (int a = 0; a < block_size; a++) {
            const int col_inv_transposed_mat = a;
            const int col = subwarp_grp.shfl(perm, a);  // column permutation
            const int row_inv_transposed_mat =
                perm;  // accumulated row swaps during pivoting
            const auto val_to_write = block_row[col];

            const int row_diag_block = col_inv_transposed_mat;
            const int col_diag_block = row_inv_transposed_mat;

            if (id_within_subwarp < block_size) {
                current_block_data[row_diag_block * stride + col_diag_block] =
                    val_to_write;  // non-coalesced accesses due to pivoting
            }
        }
    }
}
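
A hypothetical dispatch sketch (grid sizing, names, and the block size constant are assumptions, not the PR's actual launch code): subwarp_size must be chosen so that the kernel's assert block_size <= subwarp_size holds, e.g. as the smallest supported power of two at least as large as the biggest diagonal block.

template <int subwarp_size, typename ValueType>
void launch_compute_block_jacobi(const gko::size_type nbatch, const int nnz,
                                 const ValueType* A_vals,
                                 const gko::size_type num_blocks,
                                 const int* cumulative_offsets,
                                 const int* block_pointers,
                                 const int* blocks_pattern, ValueType* blocks)
{
    constexpr int block_dim = 256;  // threads per CUDA block (assumed value,
                                    // must not exceed default_block_size)
    constexpr int subwarps_per_block = block_dim / subwarp_size;
    // one subwarp per (batch entry, diagonal block) pair
    const auto num_subwarps = nbatch * num_blocks;
    const auto grid = static_cast<unsigned>(
        (num_subwarps + subwarps_per_block - 1) / subwarps_per_block);
    compute_block_jacobi_kernel<subwarp_size>
        <<<grid, block_dim>>>(nbatch, nnz, A_vals, num_blocks,
                              cumulative_offsets, block_pointers,
                              blocks_pattern, blocks);
}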