8000 CSR: better strategy defaults. by tcojean · Pull Request #969 · ginkgo-project/ginkgo · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

CSR: better strategy defaults. #969

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions cuda/test/matrix/csr_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRefWithMergePath)

TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));
auto trans = mtx->transpose();
auto d_trans = dmtx->transpose();

Expand All @@ -424,7 +424,7 @@ TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef)

TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));
auto trans = mtx->transpose();
auto d_trans = dmtx->transpose();

Expand All @@ -439,7 +439,7 @@ TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef)

TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));
auto a = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
auto b = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
auto da = gko::clone(cuda, a);
Expand All @@ -459,7 +459,7 @@ TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef)

TEST_F(Csr, ApplyToComplexIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));
auto complex_b = gen_mtx<ComplexVec>(this->mtx_size[1], 3, 1);
auto dcomplex_b = gko::clone(cuda, complex_b);
auto complex_x = gen_mtx<ComplexVec>(this->mtx_size[0], 3, 1);
Expand All @@ -474,7 +474,7 @@ TEST_F(Csr, ApplyToComplexIsEquivalentToRef)

TEST_F(Csr, AdvancedApplyToComplexIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));
auto complex_b = gen_mtx<ComplexVec>(this->mtx_size[1], 3, 1);
auto dcomplex_b = gko::clone(cuda, complex_b);
auto complex_x = gen_mtx<ComplexVec>(this->mtx_size[0], 3, 1);
Expand Down Expand Up @@ -767,7 +767,7 @@ TEST_F(Csr, IsInverseColPermutable)

TEST_F(Csr, RecognizeSortedMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));
bool is_sorted_cuda{};
bool is_sorted_ref{};

Expand All @@ -794,7 +794,7 @@ TEST_F(Csr, RecognizeUnsortedMatrixIsEquivalentToRef)

TEST_F(Csr, SortSortedMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));

mtx->sort_by_column_index();
dmtx->sort_by_column_index();
Expand All @@ -819,7 +819,7 @@ TEST_F(Csr, SortUnsortedMatrixIsEquivalentToRef)

TEST_F(Csr, OneAutomaticalWorksWithDifferentMatrices)
{
auto automatical = std::make_shared<Mtx::automatical>();
auto automatical = std::make_shared<Mtx::automatical>(cuda);
auto row_len_limit = std::max(automatical->nvidia_row_len_limit,
automatical->amd_row_len_limit);
auto load_balance_mtx =
Expand All @@ -840,7 +840,7 @@ TEST_F(Csr, OneAutomaticalWorksWithDifferentMatrices)

TEST_F(Csr, ExtractDiagonalIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));

auto diag = mtx->extract_diagonal();
auto ddiag = dmtx->extract_diagonal();
Expand Down
21 changes: 10 additions & 11 deletions dpcpp/test/matrix/csr_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ TEST_F(Csr, AdvancedApplyToDenseMatrixIsEquivalentToRefWithMergePath)

TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));
auto trans = mtx->transpose();
auto d_trans = dmtx->transpose();

Expand All @@ -425,7 +425,7 @@ TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef)

TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));
auto trans = mtx->transpose();
auto d_trans = dmtx->transpose();

Expand All @@ -440,7 +440,7 @@ TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef)

TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));
auto a = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
auto b = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
auto da = gko::clone(dpcpp, a);
Expand All @@ -460,7 +460,7 @@ TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef)

TEST_F(Csr, ApplyToComplexIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));
auto complex_b = gen_mtx<ComplexVec>(this->mtx_size[1], 3, 1);
auto dcomplex_b = gko::clone(dpcpp, complex_b);
auto complex_x = gen_mtx<ComplexVec>(this->mtx_size[0], 3, 1);
Expand All @@ -475,7 +475,7 @@ TEST_F(Csr, ApplyToComplexIsEquivalentToRef)

TEST_F(Csr, AdvancedApplyToComplexIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));
auto complex_b = gen_mtx<ComplexVec>(this->mtx_size[1], 3, 1);
auto dcomplex_b = gko::clone(dpcpp, complex_b);
auto complex_x = gen_mtx<ComplexVec>(this->mtx_size[0], 3, 1);
Expand Down Expand Up @@ -768,7 +768,7 @@ TEST_F(Csr, IsInverseColPermutable)

TEST_F(Csr, RecognizeSortedMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));
bool is_sorted_dpcpp{};
bool is_sorted_ref{};

Expand All @@ -795,7 +795,7 @@ TEST_F(Csr, RecognizeUnsortedMatrixIsEquivalentToRef)

TEST_F(Csr, SortSortedMatrixIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));

mtx->sort_by_column_index();
dmtx->sort_by_column_index();
Expand All @@ -820,9 +820,8 @@ TEST_F(Csr, SortUnsortedMatrixIsEquivalentToRef)

TEST_F(Csr, OneAutomaticalWorksWithDifferentMatrices)
{
auto automatical = std::make_shared<Mtx::automatical>();
auto row_len_limit = std::max(automatical->nvidia_row_len_limit,
automatical->amd_row_len_limit);
auto automatical = std::make_shared<Mtx::automatical>(dpcpp);
auto row_len_limit = automatical->intel_row_len_limit;
auto load_balance_mtx =
gen_mtx<Mtx>(1, row_len_limit + 1000, row_len_limit + 1);
auto classical_mtx = gen_mtx<Mtx>(50, 50, 1);
Expand All @@ -841,7 +840,7 @@ TEST_F(Csr, OneAutomaticalWorksWithDifferentMatrices)

TEST_F(Csr, ExtractDiagonalIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>());
set_up_apply_data(std::make_shared<Mtx::automatical>(dpcpp));

auto diag = mtx->extract_diagonal();
auto ddiag = dmtx->extract_diagonal();
Expand Down
113 changes: 91 additions & 22 deletions include/ginkgo/core/matrix/csr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,8 +351,11 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
public:
/**
* Creates a load_balance strategy.
*
* @warning this is deprecated! Please rely on the new automatic
* strategy instantiation or use one of the other constructors.
*/
load_balance()
[[deprecated]] load_balance()
: load_balance(std::move(
gko::CudaExecutor::create(0, gko::OmpExecutor::create())))
{}
Expand Down Expand Up @@ -528,12 +531,21 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
/* Use imbalance strategy when the matrix has more than 1e8 nonzeros
 * on AMD hardware */
const index_type amd_nnz_limit{static_cast<index_type>(1e8)};
/* Use imbalance strategy when the maximum number of nonzeros per row
 * is more than 25600 on Intel hardware */
const index_type intel_row_len_limit = 25600;
/* Use imbalance strategy when the matrix has more than 3e8 nonzeros
 * on Intel hardware */
const index_type intel_nnz_limit{static_cast<index_type>(3e8)};

public:
/**
* Creates an automatical strategy.
*
* @warning this is deprecated! Please rely on the new automatic
* strategy instantiation or use one of the other constructors.
*/
automatical()
[[deprecated]] automatical()
: automatical(std::move(
gko::CudaExecutor::create(0, gko::OmpExecutor::create())))
{}
Expand Down Expand Up @@ -600,12 +612,8 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
index_type nnz_limit = nvidia_nnz_limit;
index_type row_len_limit = nvidia_row_len_limit;
if (strategy_name_ == "intel") {
/* Use imbalance strategy when the maximum number of nonzero per
* row is more than 25600 on Intel hardware. */
nnz_limit = 25600;
/* Use imbalance strategy when the matrix has more more than 3e8
* on Intel hardware */
row_len_limit = 3e8;
nnz_limit = intel_nnz_limit;
row_len_limit = intel_row_len_limit;
}
#if GINKGO_HIP_PLATFORM_HCC
if (!cuda_strategy_) {
Expand Down Expand Up @@ -963,7 +971,7 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
gko::detail::ConstArrayView<ValueType>&& values,
gko::detail::ConstArrayView<IndexType>&& col_idxs,
gko::detail::ConstArrayView<IndexType>&& row_ptrs,
std::shared_ptr<strategy_type> strategy = std::make_shared<sparselib>())
std::shared_ptr<strategy_type> strategy)
{
// cast const-ness away, but return a const object afterwards,
// so we can ensure that no modifications take place.
Expand All @@ -973,6 +981,20 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
gko::detail::array_const_cast(std::move(row_ptrs)), strategy});
}

/**
 * Creates a constant (immutable) Csr matrix from constant arrays, using a
 * default SpMV strategy chosen for the given executor.
 *
 * This is a version of create_const without an explicit strategy argument;
 * it forwards to the strategy-taking overload, passing
 * make_default_strategy(exec) as the strategy.
 *
 * @param exec  the executor to create the matrix on
 * @param size  the dimensions of the matrix
 * @param values  the value array of the matrix
 * @param col_idxs  the column index array of the matrix
 * @param row_ptrs  the row pointer array of the matrix
 *
 * @return an immutable Csr matrix wrapping the given arrays
 */
static std::unique_ptr<const Csr> create_const(
    std::shared_ptr<const Executor> exec, const dim<2>& size,
    gko::detail::ConstArrayView<ValueType>&& values,
    gko::detail::ConstArrayView<IndexType>&& col_idxs,
    gko::detail::ConstArrayView<IndexType>&& row_ptrs)
{
    return Csr::create_const(exec, size, std::move(values),
                             std::move(col_idxs), std::move(row_ptrs),
                             Csr::make_default_strategy(exec));
}

protected:
/**
* Creates an uninitialized CSR matrix of the specified size.
Expand All @@ -986,16 +1008,16 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
{}

/**
* Creates an uninitialized CSR matrix of the specified size.
* Creates an uninitialized CSR matrix of the specified size with a user
* chosen strategy.
*
* @param exec Executor associated to the matrix
* @param size size of the matrix
* @param num_nonzeros number of nonzeros
* @param strategy the strategy of CSR
*/
Csr(std::shared_ptr<const Executor> exec, const dim<2>& size = dim<2>{},
size_type num_nonzeros = {},
std::shared_ptr<strategy_type> strategy = std::make_shared<sparselib>())
Csr(std::shared_ptr<const Executor> exec, const dim<2>& size,
size_type num_nonzeros, std::shared_ptr<strategy_type> strategy)
: EnableLinOp<Csr>(exec, size),
values_(exec, num_nonzeros),
col_idxs_(exec, num_nonzeros),
Expand All @@ -1007,6 +1029,19 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
this->make_srow();
}

/**
 * Creates an uninitialized CSR matrix of the specified size, selecting a
 * default SpMV strategy suited to the given executor.
 *
 * @param exec  Executor associated to the matrix
 * @param size  size of the matrix
 * @param num_nonzeros  number of nonzeros
 */
Csr(std::shared_ptr<const Executor> exec, const dim<2>& size = dim<2>{},
    size_type num_nonzeros = {})
    : Csr(exec, size, num_nonzeros, make_default_strategy(exec))
{}

/**
* Creates a CSR matrix from already allocated (and initialized) row
* pointer, column index and value arrays.
Expand All @@ -1020,6 +1055,7 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
* @param values array of matrix values
* @param col_idxs array of column indexes
* @param row_ptrs array of row pointers
* @param strategy the strategy the matrix uses for SpMV operations
*
* @note If one of `row_ptrs`, `col_idxs` or `values` is not an rvalue, not
* an array of IndexType, IndexType and ValueType, respectively, or
Expand All @@ -1031,7 +1067,7 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
typename RowPtrsArray>
Csr(std::shared_ptr<const Executor> exec, const dim<2>& size,
ValuesArray&& values, ColIdxsArray&& col_idxs, RowPtrsArray&& row_ptrs,
std::shared_ptr<strategy_type> strategy = std::make_shared<sparselib>())
std::shared_ptr<strategy_type> strategy)
: EnableLinOp<Csr>(exec, size),
values_{exec, std::forward<ValuesArray>(values)},
col_idxs_{exec, std::forward<ColIdxsArray>(col_idxs)},
Expand All @@ -1044,11 +1080,50 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
this->make_srow();
}

/**
 * Creates a CSR matrix from already allocated (and initialized) row
 * pointer, column index and value arrays, selecting a default SpMV
 * strategy suited to the given executor.
 *
 * @note This behaves exactly like the strategy-taking constructor above,
 *       except that the strategy is chosen via make_default_strategy().
 */
template <typename ValuesArray, typename ColIdxsArray,
          typename RowPtrsArray>
Csr(std::shared_ptr<const Executor> exec, const dim<2>& size,
    ValuesArray&& values, ColIdxsArray&& col_idxs, RowPtrsArray&& row_ptrs)
    : Csr(exec, size, std::forward<ValuesArray>(values),
          std::forward<ColIdxsArray>(col_idxs),
          std::forward<RowPtrsArray>(row_ptrs),
          make_default_strategy(exec))
{}

void apply_impl(const LinOp* b, LinOp* x) const override;

void apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta,
LinOp* x) const override;

// TODO: This provides some more sane settings. Please fix this!
/**
 * Picks a default SpMV strategy for the given executor: an
 * executor-aware automatical strategy on CUDA, HIP and DPC++ executors,
 * and classical everywhere else (e.g. reference, OMP).
 *
 * @param exec  the executor the strategy is selected for
 *
 * @return the default strategy for @p exec
 */
static std::shared_ptr<strategy_type> make_default_strategy(
    std::shared_ptr<const Executor> exec)
{
    if (auto cuda_exec =
            std::dynamic_pointer_cast<const CudaExecutor>(exec)) {
        return std::make_shared<automatical>(cuda_exec);
    }
    if (auto hip_exec = std::dynamic_pointer_cast<const HipExecutor>(exec)) {
        return std::make_shared<automatical>(hip_exec);
    }
    if (auto dpcpp_exec =
            std::dynamic_pointer_cast<const DpcppExecutor>(exec)) {
        return std::make_shared<automatical>(dpcpp_exec);
    }
    // No device-specific knowledge available — fall back to classical.
    return std::make_shared<classical>();
}

// TODO clean this up as soon as we improve strategy_type
template <typename CsrType>
void convert_strategy_helper(CsrType* result) const
Expand Down Expand Up @@ -1140,17 +1215,11 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
this_dpcpp_exec);
}
} else {
// FIXME: this changes strategies.
// We had a load_balance or automatical strategy from a
// non-HIP/non-CUDA executor and are moving to a
// non-HIP/non-CUDA executor.
// FIXME: this creates a long delay
if (lb) {
new_strat =
std::make_shared<typename CsrType::load_balance>();
} else {
new_strat =
std::make_shared<typename CsrType::automatical>();
}
new_strat = std::make_shared<typename CsrType::classical>();
}
}
}
Expand Down
0