ginkgo-project · upsj · Feb 12, 2022 · Dec 21, 2021 · Dec 21, 2021 · Dec 21, 2021
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -68,8 +68,7 @@ if(MSVC)
 elseif(GINKGO_BUILD_DPCPP OR CMAKE_CXX_COMPILER MATCHES "dpcpp")
     # For now always use `-ffp-model=precise` with DPC++. This can be removed when
     # the floating point issues are fixed.
-    # -sycl-std=1.2.1 (or -sycl-std=2017) is temporary workaround after 2021.4 to propagate subgroup setting correctly
-    set(GINKGO_COMPILER_FLAGS "-Wpedantic;-ffp-model=precise;-sycl-std=1.2.1" CACHE STRING
+    set(GINKGO_COMPILER_FLAGS "-Wpedantic;-ffp-model=precise" CACHE STRING
         "Set the required CXX compiler flags, mainly used for warnings. Current default is `-Wpedantic;-ffp-model=precise`")
 else()
     set(GINKGO_COMPILER_FLAGS "-Wpedantic" CACHE STRING

diff --git a/core/test/solver/multigrid.cpp b/core/test/solver/multigrid.cpp
@@ -199,7 +199,7 @@ class Multigrid : public ::testing::Test {
     }
 };
 
-TYPED_TEST_CASE(Multigrid, gko::test::ValueTypes, TypenameNameGenerator);
+TYPED_TEST_SUITE(Multigrid, gko::test::ValueTypes, TypenameNameGenerator);
 
 
 TYPED_TEST(Multigrid, MultigridFactoryKnowsItsExecutor)

diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
@@ -250,7 +250,7 @@ void DpcppExecutor::set_device_property()
             for (auto& i : subgroup_sizes) {
                 this->get_exec_info().subgroup_sizes.push_back(i);
             }
-        } catch (cl::sycl::runtime_error& err) {
+        } catch (cl::sycl::exception& err) {
             GKO_NOT_SUPPORTED(device);
         }
     }

diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp
@@ -62,10 +62,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
                InferredArgs... args)                                      \
     {                                                                     \
         queue->submit([&](sycl::handler& cgh) {                           \
-            cgh.parallel_for(sycl_nd_range(grid, block),                  \
-                             [=](sycl::nd_item<3> item_ct1) {             \
-                                 kernel_(args..., item_ct1);              \
-                             });                                          \
+            cgh.parallel_for(                                             \
+                sycl_nd_range(grid, block), [=                            \
+            ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(     \
+                                                config::warp_size)]] {    \
+                    kernel_(args..., item_ct1);                           \
+                });                                                       \
         });                                                               \
     }
 
@@ -78,17 +80,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  * @param name_  the name of the host function with config
  * @param kernel_  the kernel name
  */
-#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_)                \
-    template <std::uint32_t encoded, typename... InferredArgs>        \
-    inline void name_(dim3 grid, dim3 block, gko::size_type,          \
-                      sycl::queue* queue, InferredArgs... args)       \
-    {                                                                 \
-        queue->submit([&](sycl::handler& cgh) {                       \
-            cgh.parallel_for(sycl_nd_range(grid, block),              \
-                             [=](sycl::nd_item<3> item_ct1) {         \
-                                 kernel_<encoded>(args..., item_ct1); \
-                             });                                      \
-        });                                                           \
+#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_)                    \
+    template <std::uint32_t encoded, typename... InferredArgs>            \
+    inline void name_(dim3 grid, dim3 block, gko::size_type,              \
+                      sycl::queue* queue, InferredArgs... args)           \
+    {                                                                     \
+        queue->submit([&](sycl::handler& cgh) {                           \
+            if constexpr (KCFG_1D::decode<1>(encoded) > 1) {              \
+                cgh.parallel_for(                                         \
+                    sycl_nd_range(grid, block), [=                        \
+                ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( \
+                                                    KCFG_1D::decode<1>(   \
+                                                        encoded))]] {     \
+                        kernel_<encoded>(args..., item_ct1);              \
+                    });                                                   \
+            } else {                                                      \
+                cgh.parallel_for(sycl_nd_range(grid, block),              \
+                                 [=](sycl::nd_item<3> item_ct1) {         \
+                                     kernel_<encoded>(args..., item_ct1); \
+                                 });                                      \
+            }                                                             \
+        });                                                               \
     }
 
 /**
@@ -119,9 +131,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     }
 
 // __WG_BOUND__ gives the cuda-like launch bound in cuda ordering
-#define __WG_BOUND_1D__(x) [[intel::reqd_work_group_size(1, 1, x)]]
-#define __WG_BOUND_2D__(x, y) [[intel::reqd_work_group_size(1, y, x)]]
-#define __WG_BOUND_3D__(x, y, z) [[intel::reqd_work_group_size(z, y, x)]]
+#define __WG_BOUND_1D__(x) [[sycl::reqd_work_group_size(1, 1, x)]]
+#define __WG_BOUND_2D__(x, y) [[sycl::reqd_work_group_size(1, y, x)]]
+#define __WG_BOUND_3D__(x, y, z) [[sycl::reqd_work_group_size(z, y, x)]]
 #define WG_BOUND_OVERLOAD(_1, _2, _3, NAME, ...) NAME
 #define __WG_BOUND__(...)                                            \
     WG_BOUND_OVERLOAD(__VA_ARGS__, __WG_BOUND_3D__, __WG_BOUND_2D__, \

diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp
@@ -81,7 +81,7 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size,
 
     cgh.parallel_for(
         range, [=
-    ](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] {
+    ](sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] {
             auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0];
             const auto tidx = thread::get_thread_id_flat<int64>(idx);
             const auto local_tidx = static_cast<int64>(tidx % wg_size);
@@ -130,7 +130,7 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols,
 
     cgh.parallel_for(
         range, [=
-    ](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] {
+    ](sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] {
             auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0];
             const auto tidx = thread::get_thread_id_flat<int64>(idx);
             const auto local_tidx = static_cast<int64>(tidx % wg_size);
@@ -312,7 +312,7 @@ void generic_kernel_row_reduction_2d(syn::value_list<int, ssg_size>,
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(
             range, [=
-        ](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] {
+        ](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] {
                 const auto idx =
                     thread::get_subwarp_id_flat<ssg_size, int64>(id);
                 const auto row = idx % rows;
@@ -368,8 +368,7 @@ void generic_kernel_col_reduction_2d_small(
         block_partial_acc(cgh);
     const auto range = sycl_nd_range(dim3(row_blocks), dim3(wg_size));
     cgh.parallel_for(
-        range, [=
-    ](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] {
+        range, [=](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] {
             auto block_partial = &(*block_partial_acc.get_pointer())[0];
             const auto ssg_id =
                 thread::get_subwarp_id_flat<ssg_size, int64>(id);
@@ -441,8 +440,7 @@ void generic_kernel_col_reduction_2d_blocked(
                    sycl::access_mode::read_write, sycl::access::target::local>
         block_partial_acc(cgh);
     cgh.parallel_for(
-        range, [=
-    ](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] {
+        range, [=](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] {
             const auto sg_id = thread::get_subwarp_id_flat<sg_size, int64>(id);
             const auto sg_num =
                 thread::get_subwarp_num_flat<sg_size, int64>(id);

diff --git a/dpcpp/components/cooperative_groups.dp.hpp b/dpcpp/components/cooperative_groups.dp.hpp
@@ -165,8 +165,8 @@ namespace detail {
  * This is a limited implementation of the DPCPP thread_block_tile.
  */
 template <unsigned Size>
-class thread_block_tile : public sycl::ONEAPI::sub_group {
-    using sub_group = sycl::ONEAPI::sub_group;
+class thread_block_tile : public sycl::sub_group {
+    using sub_group = sycl::sub_group;
     using id_type = sub_group::id_type;
     using mask_type = config::lane_mask_type;
 
@@ -236,9 +236,9 @@ class thread_block_tile : public sycl::ONEAPI::sub_group {
     __dpct_inline__ mask_type ballot(int predicate) const noexcept
     {
         // todo: change it when OneAPI update the mask related api
-        return sycl::ONEAPI::reduce(
+        return sycl::reduce_over_group(
             *this, (predicate != 0) ? mask_type(1) << data_.rank : mask_type(0),
-            sycl::ONEAPI::plus<mask_type>());
+            sycl::plus<mask_type>());
     }
 
     /**
@@ -247,7 +247,7 @@ class thread_block_tile : public sycl::ONEAPI::sub_group {
      */
     __dpct_inline__ bool any(int predicate) const noexcept
     {
-        return sycl::ONEAPI::any_of(*this, (predicate != 0));
+        return sycl::any_of_group(*this, (predicate != 0));
     }
 
10000    /**
@@ -256,7 +256,7 @@ class thread_block_tile : public sycl::ONEAPI::sub_group {
      */
     __dpct_inline__ bool all(int predicate) const noexcept
     {
-        return sycl::ONEAPI::all_of(*this, (predicate != 0));
+        return sycl::all_of_group(*this, (predicate != 0));
     }
 
 
@@ -353,8 +353,7 @@ template <unsigned Size, typename Group>
 __dpct_inline__
     std::enable_if_t<(Size > 1) && Size <= 64 && !(Size & (Size - 1)),
                      detail::thread_block_tile<Size>>
-        tiled_partition
-    [[intel::reqd_sub_group_size(Size)]] (const Group& group)
+    tiled_partition(const Group& group)
 {
     return detail::thread_block_tile<Size>(group);
 }

diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp
@@ -229,7 +229,9 @@ void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory,
             block_sum_acc_ct1(cgh);
 
         cgh.parallel_for(
-            sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+            sycl_nd_range(grid, block), [=
+        ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
+                                            KCFG_1D::decode<1>(cfg))]] {
                 reduce_add_array<cfg>(size, source, result, item_ct1,
                                       *block_sum_acc_ct1.get_pointer());
             });

diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -678,7 +678,9 @@ void abstract_classical_spmv(dim3 grid, dim3 block,
 {
     queue->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(
-            sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+            sycl_nd_range(grid, block), [=
+        ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
+                                            subgroup_size)]] {
                 abstract_classical_spmv<subgroup_size>(num_rows, val, col_idxs,
                                                        row_ptrs, b, b_stride, c,
                                                        c_stride, item_ct1);
@@ -744,52 +746,6 @@ void fill_in_dense(size_type num_rows, const IndexType* __restrict__ row_ptrs,
 GKO_ENABLE_DEFAULT_HOST(fill_in_dense, fill_in_dense);
 
 
-template <typename IndexType>
-void calculate_nnz_per_row(size_type num_rows,
-                           const IndexType* __restrict__ row_ptrs,
-                           size_type* __restrict__ nnz_per_row,
-                           sycl::nd_item<3> item_ct1)
-{
-    const auto tidx = thread::get_thread_id_flat(item_ct1);
-    if (tidx < num_rows) {
-        nnz_per_row[tidx] = row_ptrs[tidx + 1] - row_ptrs[tidx];
-    }
-}
-
-GKO_ENABLE_DEFAULT_HOST(calculate_nnz_per_row, calculate_nnz_per_row);
-
-
-void reduce_max_nnz(size_type size, const size_type* __restrict__ nnz_per_row,
-                    size_type* __restrict__ result, sycl::nd_item<3> item_ct1,
-                    size_type* block_max)
-{
-    reduce_array(
-        size, nnz_per_row, block_max, item_ct1,
-        [](const size_type& x, const size_type& y) { return max(x, y); });
-
-    if (item_ct1.get_local_id(2) == 0) {
-        result[item_ct1.get_group(2)] = block_max[0];
-    }
-}
-
-void reduce_max_nnz(dim3 grid, dim3 block, size_type dynamic_shared_memory,
-                    sycl::queue* queue, size_type size,
-                    const size_type* nnz_per_row, size_type* result)
-{
-    queue->submit([&](sycl::handler& cgh) {
-        sycl::accessor<size_type, 1, sycl::access_mode::read_write,
-                       sycl::access::target::local>
-            block_max_acc_ct1(sycl::range<1>(default_block_size), cgh);
-
-        cgh.parallel_for(sycl_nd_range(grid, block),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             reduce_max_nnz(size, nnz_per_row, result, item_ct1,
-                                            block_max_acc_ct1.get_pointer());
-                         });
-    });
-}
-
-
 template <typename IndexType>
 void check_unsorted(const IndexType* __restrict__ row_ptrs,
                     const IndexType* __restrict__ col_idxs, IndexType num_rows,
@@ -1231,9 +1187,10 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
                     zero<ValueType>(), c->get_values());
             } else {
                 oneapi::mkl::sparse::gemm(
-                    *exec->get_queue(), oneapi::mkl::transpose::nontrans,
-                    one<ValueType>(), mat_handle,
-                    const_cast<ValueType*>(b->get_const_values()),
+                    *exec->get_queue(), oneapi::mkl::layout::row_major,
+                    oneapi::mkl::transpose::nontrans,
+                    oneapi::mkl::transpose::nontrans, one<ValueType>(),
+                    mat_handle, const_cast<ValueType*>(b->get_const_values()),
                     b->get_size()[1], b->get_stride(), zero<ValueType>(),
                     c->get_values(), c->get_stride());
             }
@@ -1296,7 +1253,9 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
                     c->get_values());
             } else {
                 oneapi::mkl::sparse::gemm(
-                    *exec->get_queue(), oneapi::mkl::transpose::nontrans,
+                    *exec->get_queue(), oneapi::mkl::layout::row_major,
+                    oneapi::mkl::transpose::nontrans,
+                    oneapi::mkl::transpose::nontrans,
                     exec->copy_val_to_host(alpha->get_const_values()),
                     mat_handle, const_cast<ValueType*>(b->get_const_values()),
                     b->get_size()[1], b->get_stride(),

diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -109,7 +109,6 @@ void transpose(const size_type nrows, const size_type ncols,
 }
 
 template <std::uint32_t sg_size, typename ValueType>
-__WG_BOUND__(sg_size, sg_size)
 void transpose(const size_type nrows, const size_type ncols,
                const ValueType* __restrict__ in, const size_type in_stride,
                ValueType* __restrict__ out, const size_type out_stride,
@@ -134,7 +133,8 @@ void transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory,
             space_acc_ct1(cgh);
 
         cgh.parallel_for(
-            sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+            sycl_nd_range(grid, block),
+            [=](sycl::nd_item<3> item_ct1) __WG_BOUND__(sg_size, sg_size) {
                 transpose<sg_size>(nrows, ncols, in, in_stride, out, out_stride,
                                    item_ct1, *space_acc_ct1.get_pointer());
             });
@@ -146,7 +146,6 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(transpose_call, transpose, subgroup_list);
 
 
 template <std::uint32_t sg_size, typename ValueType>
-__WG_BOUND__(sg_size, sg_size)
 void conj_transpose(const size_type nrows, const size_type ncols,
                     const ValueType* __restrict__ in, const size_type in_stride,
                     ValueType* __restrict__ out, const size_type out_stride,
@@ -172,7 +171,8 @@ void conj_transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory,
             space_acc_ct1(cgh);
 
         cgh.parallel_for(
-            sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+            sycl_nd_range(grid, block),
+            [=](sycl::nd_item<3> item_ct1) __WG_BOUND__(sg_size, sg_size) {
                 conj_transpose<sg_size>(nrows, ncols, in, in_stride, out,
                                         out_stride, item_ct1,
                                         *space_acc_ct1.get_pointer());
-Original file line number
+Diff line change
@@ Expand Up / @@ -199,7 +199,7 @@ class Multigrid : public ::testing::Test { @@
         }
     };
-    TYPED_TEST_CASE(Multigrid, gko::test::ValueTypes, TypenameNameGenerator);
+    TYPED_TEST_SUITE(Multigrid, gko::test::ValueTypes, TypenameNameGenerator);
     TYPED_TEST(Multigrid, MultigridFactoryKnowsItsExecutor)
@@ Expand Down @@