8000 CSR: better strategy defaults. by tcojean · Pull Request #969 · ginkgo-project/ginkgo · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

CSR: better strategy defaults. #969

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions cuda/test/matrix/csr_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRefWithMergePath)

TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));
auto trans = mtx->transpose();
auto d_trans = dmtx->transpose();

Expand All @@ -424,7 +424,7 @@ TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef)

TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));
auto trans = mtx->transpose();
auto d_trans = dmtx->transpose();

Expand All @@ -439,7 +439,7 @@ TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef)

TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));
auto a = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
auto b = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
auto da = gko::clone(cuda, a);
Expand All @@ -459,7 +459,7 @@ TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef)

TEST_F(Csr, ApplyToComplexIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));
auto complex_b = gen_mtx<ComplexVec>(this->mtx_size[1], 3, 1);
auto dcomplex_b = gko::clone(cuda, complex_b);
auto complex_x = gen_mtx<ComplexVec>(this->mtx_size[0], 3, 1);
Expand All @@ -474,7 +474,7 @@ TEST_F(Csr, ApplyToComplexIsEquivalentToRef)

TEST_F(Csr, AdvancedApplyToComplexIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));
auto complex_b = gen_mtx<ComplexVec>(this->mtx_size[1], 3, 1);
auto dcomplex_b = gko::clone(cuda, complex_b);
auto complex_x = gen_mtx<ComplexVec>(this->mtx_size[0], 3, 1);
Expand Down Expand Up @@ -767,7 +767,7 @@ TEST_F(Csr, IsInverseColPermutable)

TEST_F(Csr, RecognizeSortedMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));
bool is_sorted_cuda{};
bool is_sorted_ref{};

Expand All @@ -794,7 +794,7 @@ TEST_F(Csr, RecognizeUnsortedMatrixIsEquivalentToRef)

TEST_F(Csr, SortSortedMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));

mtx->sort_by_column_index();
dmtx->sort_by_column_index();
Expand All @@ -819,7 +819,7 @@ TEST_F(Csr, SortUnsortedMatrixIsEquivalentToRef)

TEST_F(Csr, OneAutomaticalWorksWithDifferentMatrices)
{
auto automatical = std::make_shared<Mtx::automatical>();
auto automatical = std::make_shared<Mtx::automatical>(cuda);
auto row_len_limit = std::max(automatical->nvidia_row_len_limit,
automatical->amd_row_len_limit);
auto load_balance_mtx =
Expand All @@ -840,7 +840,7 @@ TEST_F(Csr, OneAutomaticalWorksWithDifferentMatrices)

TEST_F(Csr, ExtractDiagonalIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));

auto diag = mtx->extract_diagonal();
auto ddiag = dmtx->extract_diagonal();
Expand Down
21 changes: 10 additions & 11 deletions dpcpp/test/matrix/csr_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRefWithMergePath)

TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));
auto trans = mtx->transpose();
auto d_trans = dmtx->transpose();

Expand All @@ -425,7 +425,7 @@ TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef)

TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));
auto trans = mtx->transpose();
auto d_trans = dmtx->transpose();

Expand All @@ -440,7 +440,7 @@ TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef)

TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));
auto a = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
auto b = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
auto da = gko::clone(dpcpp, a);
Expand All @@ -460,7 +460,7 @@ TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef)

TEST_F(Csr, ApplyToComplexIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));
auto complex_b = gen_mtx<ComplexVec>(this->mtx_size[1], 3, 1);
auto dcomplex_b = gko::clone(dpcpp, complex_b);
auto complex_x = gen_mtx<ComplexVec>(this->mtx_size[0], 3, 1);
Expand All @@ -475,7 +475,7 @@ TEST_F(Csr, ApplyToComplexIsEquivalentToRef)

TEST_F(Csr, AdvancedApplyToComplexIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));
auto complex_b = gen_mtx<ComplexVec>(this->mtx_size[1], 3, 1);
auto dcomplex_b = gko::clone(dpcpp, complex_b);
auto complex_x = gen_mtx<ComplexVec>(this->mtx_size[0], 3, 1);
Expand Down Expand Up @@ -768,7 +768,7 @@ TEST_F(Csr, IsInverseColPermutable)

TEST_F(Csr, RecognizeSortedMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));
bool is_sorted_dpcpp{};
bool is_sorted_ref{};

Expand All @@ -795,7 +795,7 @@ TEST_F(Csr, RecognizeUnsortedMatrixIsEquivalentToRef)

TEST_F(Csr, SortSortedMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));

mtx->sort_by_column_index();
dmtx->sort_by_column_index();
Expand All @@ -820,9 +820,8 @@ TEST_F(Csr, SortUnsortedMatrixIsEquivalentToRef)

TEST_F(Csr, OneAutomaticalWorksWithDifferentMatrices)
{
auto automatical = std::make_shared<Mtx::automatical>();
auto row_len_limit = std::max(automatical->nvidia_row_len_limit,
automatical->amd_row_len_limit);
auto automatical = std::make_shared<Mtx::automatical>(dpcpp);
auto row_len_limit = automatical->intel_row_len_limit;
auto load_balance_mtx =
gen_mtx<Mtx>(1, row_len_limit + 1000, row_len_limit + 1);
auto classical_mtx = gen_mtx<Mtx>(50, 50, 1);
Expand All @@ -841,7 +840,7 @@ TEST_F(Csr, OneAutomaticalWorksWithDifferentMatrices)

TEST_F(Csr, ExtractDiagonalIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));

auto diag = mtx->extract_diagonal();
auto ddiag = dmtx->extract_diagonal();
Expand Down
113 changes: 91 additions & 22 deletions include/ginkgo/core/matrix/csr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,8 +351,11 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
public:
/**
* Creates a load_balance strategy.
*
* @warning this is deprecated! Please rely on the new automatic
* strategy instantiation or use one of the other constructors.
*/
load_balance()
[[deprecated]] load_balance()
: load_balance(std::move(
gko::CudaExecutor::create(0, gko::OmpExecutor::create())))
{}
Expand Down Expand Up @@ -528,12 +531,21 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
/* Use imbalance strategy when the matrix has more than 1e8 nonzeros
 * on AMD hardware */
const index_type amd_nnz_limit{static_cast<index_type>(1e8)};
/* Use imbalance strategy when the maximum number of nonzeros per row
 * is more than 25600 on Intel hardware */
const index_type intel_row_len_limit = 25600;
/* Use imbalance strategy when the matrix has more than 3e8 nonzeros
 * on Intel hardware */
const index_type intel_nnz_limit{static_cast<index_type>(3e8)};

public:
/**
* Creates an automatical strategy.
*
* @warning this is deprecated! Please rely on the new automatic
* strategy instantiation or use one of the other constructors.
*/
automatical()
[[deprecated]] automatical()
: automatical(std::move(
gko::CudaExecutor::create(0, gko::OmpExecutor::create())))
{}
Expand Down Expand Up @@ -600,12 +612,8 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
index_type nnz_limit = nvidia_nnz_limit;
index_type row_len_limit = nvidia_row_len_limit;
if (strategy_name_ == "intel") {
/* Use imbalance strategy when the maximum number of nonzero per
* row is more than 25600 on Intel hardware. */
nnz_limit = 25600;
/* Use imbalance strategy when the matrix has more more than 3e8
* on Intel hardware */
row_len_limit = 3e8;
nnz_limit = intel_nnz_limit;
row_len_limit = intel_row_len_limit;
}
#if GINKGO_HIP_PLATFORM_HCC
if (!cuda_strategy_) {
Expand Down Expand Up @@ -963,7 +971,7 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
gko::detail::ConstArrayView<ValueType>&& values,
gko::detail::ConstArrayView<IndexType>&& col_idxs,
gko::detail::ConstArrayView<IndexType>&& row_ptrs,
std::shared_ptr<strategy_type> strategy = std::make_shared<sparselib>())
std::shared_ptr<strategy_type> strategy)
{
// cast const-ness away, but return a const object afterwards,
// so we can ensure that no modifications take place.
Expand All @@ -973,6 +981,20 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
gko::detail::array_const_cast(std::move(row_ptrs)), strategy});
}

/**
 * Creates a constant (immutable) Csr matrix from constant arrays, using a
 * default SpMV strategy chosen for the given executor.
 *
 * This is a version of create_const without an explicit strategy argument;
 * it forwards to the strategy-taking overload, passing
 * make_default_strategy(exec) as the strategy.
 *
 * @param exec  the executor to create the matrix on
 * @param size  the dimensions of the matrix
 * @param values  the value array of the matrix
 * @param col_idxs  the column index array of the matrix
 * @param row_ptrs  the row pointer array of the matrix
 *
 * @return an immutable Csr matrix wrapping the given arrays
 */
static std::unique_ptr<const Csr> create_const(
    std::shared_ptr<const Executor> exec, const dim<2>& size,
    gko::detail::ConstArrayView<ValueType>&& values,
    gko::detail::ConstArrayView<IndexType>&& col_idxs,
    gko::detail::ConstArrayView<IndexType>&& row_ptrs)
{
    return Csr::create_const(exec, size, std::move(values),
                             std::move(col_idxs), std::move(row_ptrs),
                             Csr::make_default_strategy(exec));
}

protected:
/**
* Creates an uninitialized CSR matrix of the specified size.
Expand All @@ -986,16 +1008,16 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
{}

/**
* Creates an uninitialized CSR matrix of the specified size.
* Creates an uninitialized CSR matrix of the specified size with a user
* chosen strategy.
*
* @param exec Executor associated to the matrix
* @param size size of the matrix
* @param num_nonzeros number of nonzeros
* @param strategy the strategy of CSR
*/
Csr(std::shared_ptr<const Executor> exec, const dim<2>& size = dim<2>{},
size_type num_nonzeros = {},
std::shared_ptr<strategy_type> strategy = std::make_shared<sparselib>())
Csr(std::shared_ptr<const Executor> exec, const dim<2>& size,
size_type num_nonzeros, std::shared_ptr<strategy_type> strategy)
: EnableLinOp<Csr>(exec, size),
values_(exec, num_nonzeros),
col_idxs_(exec, num_nonzeros),
Expand All @@ -1007,6 +1029,19 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
this->make_srow();
}

/**
 * Creates an uninitialized CSR matrix of the specified size, selecting a
 * default SpMV strategy suited to the given executor.
 *
 * @param exec  Executor associated to the matrix
 * @param size  size of the matrix
 * @param num_nonzeros  number of nonzeros
 */
Csr(std::shared_ptr<const Executor> exec, const dim<2>& size = dim<2>{},
    size_type num_nonzeros = {})
    : Csr(exec, size, num_nonzeros, make_default_strategy(exec))
{}

/**
* Creates a CSR matrix from already allocated (and initialized) row
* pointer, column index and value arrays.
Expand All @@ -1020,6 +1055,7 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
* @param values array of matrix values
* @param col_idxs array of column indexes
* @param row_ptrs array of row pointers
* @param strategy the strategy the matrix uses for SpMV operations
*
* @note If one of `row_ptrs`, `col_idxs` or `values` is not an rvalue, not
* an array of IndexType, IndexType and ValueType, respectively, or
Expand All @@ -1031,7 +1067,7 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
typename RowPtrsArray>
Csr(std::shared_ptr<const Executor> exec, const dim<2>& size,
ValuesArray&& values, ColIdxsArray&& col_idxs, RowPtrsArray&& row_ptrs,
std::shared_ptr<strategy_type> strategy = std::make_shared<sparselib>())
std::shared_ptr<strategy_type> strategy)
: EnableLinOp<Csr>(exec, size),
values_{exec, std::forward<ValuesArray>(values)},
col_idxs_{exec, std::forward<ColIdxsArray>(col_idxs)},
Expand All @@ -1044,11 +1080,50 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
this->make_srow();
}

/**
 * Creates a CSR matrix from already allocated (and initialized) row
 * pointer, column index and value arrays, selecting a default SpMV
 * strategy suited to the given executor.
 *
 * @note This behaves exactly like the strategy-taking constructor above,
 *       except that the strategy is chosen via make_default_strategy().
 */
template <typename ValuesArray, typename ColIdxsArray,
          typename RowPtrsArray>
Csr(std::shared_ptr<const Executor> exec, const dim<2>& size,
    ValuesArray&& values, ColIdxsArray&& col_idxs, RowPtrsArray&& row_ptrs)
    : Csr(exec, size, std::forward<ValuesArray>(values),
          std::forward<ColIdxsArray>(col_idxs),
          std::forward<RowPtrsArray>(row_ptrs),
          make_default_strategy(exec))
{}

void apply_impl(const LinOp* b, LinOp* x) const override;

void apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta,
LinOp* x) const override;

// TODO: This provides some more sane settings. Please fix this!
/**
 * Picks a default SpMV strategy for the given executor: an
 * executor-aware automatical strategy on CUDA, HIP and DPC++ executors,
 * and classical everywhere else (e.g. reference, OMP).
 *
 * @param exec  the executor the strategy is selected for
 *
 * @return the default strategy for @p exec
 */
static std::shared_ptr<strategy_type> make_default_strategy(
    std::shared_ptr<const Executor> exec)
{
    if (auto cuda_exec =
            std::dynamic_pointer_cast<const CudaExecutor>(exec)) {
        return std::make_shared<automatical>(cuda_exec);
    }
    if (auto hip_exec = std::dynamic_pointer_cast<const HipExecutor>(exec)) {
        return std::make_shared<automatical>(hip_exec);
    }
    if (auto dpcpp_exec =
            std::dynamic_pointer_cast<const DpcppExecutor>(exec)) {
        return std::make_shared<automatical>(dpcpp_exec);
    }
    // No device-specific knowledge available — fall back to classical.
    return std::make_shared<classical>();
}

// TODO clean this up as soon as we improve strategy_type
template <typename CsrType>
void convert_strategy_helper(CsrType* result) const
Expand Down Expand Up @@ -1140,17 +1215,11 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
this_dpcpp_exec);
}
} else {
// FIXME: this changes strategies.
// We had a load_balance or automatical strategy from a
// non-HIP/non-CUDA executor and are moving to a
// non-HIP/non-CUDA executor.
// FIXME: this creates a long delay
if (lb) {
new_strat =
std::make_shared<typename CsrType::load_balance>();
} else {
new_strat =
std::make_shared<typename CsrType::automatical>();
}
new_strat = std::make_shared<typename CsrType::classical>();
}
}
}
Expand Down
0