Fix/multinode issues by greole · Pull Request #161 · hpsim/OGL · GitHub
[go: up one dir, main page]
Skip to content

Fix/multinode issues #161

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions include/OGL/DevicePersistent/ExecutorHandler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@ struct DeviceIdHandler {
FatalErrorInFunction << "Only parallel runs are supported for OGL"
<< exit(FatalError);
}

if (Pstream::nProcs(0) % ranks_per_gpu != 0) {
FatalErrorInFunction
<< " Total number of ranks = " << Pstream::nProcs(0)
<< " is not a multiple of "
<< " ranksPerGPU " << ranks_per_gpu << exit(FatalError);
}
}

/* @brief compute the local device id
Expand Down Expand Up @@ -102,7 +109,6 @@ struct ExecutorInitFunctor {
{
auto host_exec = gko::share(gko::ReferenceExecutor::create());


auto msg = [](auto exec, auto id) {
std::string s;
// auto node_comm = Pstream::commInterHost();
Expand All @@ -111,16 +117,17 @@ struct ExecutorInitFunctor {
label global_ranks = Pstream::nProcs(0);
label device_ranks = Pstream::nProcs(node_comm);
label node_id = global_ranks / device_ranks;

// Pstream::barrier(0);
// sleep(0.03 * global_rank);
s += std::string("Create ") + std::string(exec) +
std::string(" executor device ") + std::to_string(id) +
std::string(" node ") + std::to_string(node_id) +
std::string(" local rank [") +
std::to_string(Pstream::myProcNo(node_comm)) +
std::string("/") + std::to_string(device_ranks) +
std::string("/") + std::to_string(device_ranks - 1) +
std::string("] global rank [") + std::to_string(global_rank) +
std::string("/") + std::to_string(global_ranks) +
std::string("/") + std::to_string(global_ranks - 1) +
std::string("]");
return s;
};
Expand All @@ -135,7 +142,8 @@ struct ExecutorInitFunctor {
label id = device_id_handler_.compute_device_id(
gko::CudaExecutor::get_num_devices());
LOG_0(verbose_, msg(executor_name_, id))
return gko::share(gko::CudaExecutor::create(id, host_exec));
auto ret = gko::share(gko::CudaExecutor::create(id, host_exec));
return ret;
}
if (executor_name_ == "sycl" || executor_name_ == "dpcpp") {
if (version.dpcpp_version.tag == not_compiled_tag) {
Expand Down
4 changes: 4 additions & 0 deletions include/OGL/MatrixWrapper/HostMatrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ class HostMatrixWrapper {

const word field_name_;

const word folder_;

// Whether the matrix coefficients should be reordered
// during copy or on device
const bool reorder_on_copy_;
Expand Down Expand Up @@ -268,6 +270,8 @@ class HostMatrixWrapper {
const ExecutorHandler &get_exec_handler() const { return exec_; }

const word get_field_name() const { return field_name_; }

const word get_folder() const { return folder_; }
};


Expand Down
2 changes: 1 addition & 1 deletion include/OGL/MatrixWrapper/SparsityPattern.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ std::vector<label> sort_permutation(const std::vector<T> &vec, Compare compare)
*to
*/
std::pair<std::vector<std::vector<label>>, std::vector<label>> compress_cols(
std::vector<std::vector<label>> in, std::vector<label> comm_id);
std::vector<std::vector<label>> in, std::vector<label> comm_rank);
} // namespace detail

namespace Foam {
Expand Down
16 changes: 8 additions & 8 deletions include/OGL/Preconditioner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,14 +302,6 @@ class Preconditioner {
.with_max_block_size(static_cast<gko::uint32>(1))
.on(device_exec));

auto coarsest_gen = gko::share(
cg::build()
.with_preconditioner(ras::build()
.with_local_solver(pre_factory)
.on(device_exec))
.with_criteria(gko::stop::Iteration::build().with_max_iters(
static_cast<gko::uint32>(coarse_solver_iters)))
.on(device_exec));

word msg =
"Generate preconditioner: " + name +
Expand Down Expand Up @@ -361,6 +353,14 @@ class Preconditioner {
}

if (type == "Distributed") {
auto coarsest_gen = gko::share(
cg::build()
.with_preconditioner(
ras::build().with_local_solver(pre_factory))
.with_criteria(
gko::stop::Iteration::build().with_max_iters(
static_cast<gko::uint32>(coarse_solver_iters)))
.on(device_exec));
auto gkodistmatrix =
gko::as<RepartDistMatrix>(gkomatrix)->get_dist_matrix();
auto smoother_gen = gko::share(
Expand Down
4 changes: 2 additions & 2 deletions include/OGL/lduLduBase.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -272,9 +272,10 @@ class lduLduBase : public OGL_Info,
auto precond = this->init_preconditioner(
dist_A_v, exec_handler_.get_device_exec());)

bool active = repartitioner->get_repart_size() != 0;
bool export_system(
solver_controls_.lookupOrDefault<Switch>("export", false));
if (export_system && db_.time().writeTime()) {
if (export_system && db_.time().writeTime() && active) {
bool write_global(
solver_controls_.lookupOrDefault<Switch>("writeGlobal", true));
LOG_0(verbose_, "Export system")
Expand All @@ -293,7 +294,6 @@ class lduLduBase : public OGL_Info,
LOG_1(verbose_, "done create solver")

// solve only on active rank
bool active = repartitioner->get_repart_size() != 0;
label delta_t_solve_ = 0;
bool split_mpi_comm =
solver_controls_.lookupOrDefault<Switch>("splitMPIComm", true);
Expand Down
8 changes: 5 additions & 3 deletions src/CommunicationPattern.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -280,15 +280,17 @@ std::ostream &operator<<(std::ostream &out, const CommunicationPattern &e)
}


// computes flat vector of send idxs
std::vector<label> CommunicationPattern::total_rank_send_idx() const
{
std::vector<label> tmp;
// flatten and return
std::vector<label> ret;

for (auto &rows : send_idxs) {
tmp.insert(tmp.end(), rows.begin(), rows.end());
ret.insert(ret.end(), rows.begin(), rows.end());
}

return tmp;
return ret;
}


Expand Down
13 changes: 13 additions & 0 deletions src/MatrixWrapper/Distributed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// SPDX-License-Identifier: GPL-3.0-or-later

#include "OGL/MatrixWrapper/Distributed.hpp"
#include <fstream>

/* helper function to convert a sparsity pattern into a vector of linops with
* zero coefficients
Expand Down Expand Up @@ -169,6 +170,7 @@ void RepartDistMatrix::write(const ExecutorHandler &exec_handler,
auto non_local = gko::share(
gko::matrix::Coo<scalar, label>::create(exec_handler.get_ref_exec()));


if (fuse_) {
gko::as<LocalMatrixType>(dist_mtx_->get_local_matrix())
->convert_to(local.get());
Expand Down Expand Up @@ -461,6 +463,17 @@ std::shared_ptr<RepartDistMatrix> create_impl(
auto [send_counts, send_offsets, recv_sizes, recv_offsets] =
repart_comm_pattern->send_recv_pattern();

if (verbose > 1){
std::ofstream myfile;
std::string folder = host_A->get_folder();
myfile.open(folder + "/host_comm_pattern_" +
std::to_string(Pstream::myProcNo()));
myfile << "repart_comm_pattern " << *repart_comm_pattern.get() << "\n";
myfile << "\nrecv_gather_idxs: " << convert_to_vector(recv_gather_idxs)
<< "\nrecv_sizes: " << recv_sizes
<< "\nrecv_offsets: " << recv_offsets << "\n";
}

if (fuse) {
dist_A = gko::share(dist_mtx::create(
device_exec, device_comm, global_dim, local_linops[0],
Expand Down
6 changes: 5 additions & 1 deletion src/MatrixWrapper/HostMatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ HostMatrixWrapper::HostMatrixWrapper(
device_id_guard_{db, fieldName, exec_.get_device_exec()},
verbose_(verbose),
field_name_(fieldName),
folder_(db.time().rootPath()),
reorder_on_copy_(
solverControls.lookupOrDefault<Switch>("reorderOnHost", true)),
addr_(addr),
Expand Down Expand Up @@ -168,7 +169,10 @@ HostMatrixWrapper::create_communication_pattern() const
// create index_sets
std::vector<std::vector<label>> send_idxs;
for (label proc : target_ids) {
send_idxs.emplace_back(interface_cell_map[proc]);
auto send_idx = interface_cell_map[proc];
// comm pattern send idxs need to be in order
std::stable_sort(send_idx.begin(), send_idx.end());
send_idxs.emplace_back(send_idx);
}

return std::make_shared<CommunicationPattern>(get_exec_handler(),
Expand Down
41 changes: 33 additions & 8 deletions src/MatrixWrapper/SparsityPattern.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ namespace detail {


std::pair<std::vector<std::vector<label>>, std::vector<label>> compress_cols(
std::vector<std::vector<label>> in, std::vector<label> comm_id)
std::vector<std::vector<label>> in, std::vector<label> comm_rank)
{
// create a sorting map based on the comm ids
// here the ids with higher id should receive data first
auto id_permutation =
sort_permutation(comm_id, [](label a, label b) { return a < b; });
// create a sorting map based on the comm ranks
// here data from lower ranks is received first
auto comm_permutation =
sort_permutation(comm_rank, [](label a, label b) { return a < b; });
std::map<label, label> col_map;

std::vector<label> global_cols;
Expand All @@ -23,10 +23,35 @@ std::pair<std::vector<std::vector<label>>, std::vector<label>> compress_cols(
}
}

label ctr = 0;

// ranks that are the same need to be fused
// first before sorting the interface
// because we send indices sorted per interface
std::vector<std::vector<label>> fused_in;
label prev = -1;
for (auto id : comm_permutation) {
if (comm_rank[id] == prev) {
auto &back = fused_in.back();
for (auto col : in[id]) {
back.push_back(col);
}
} else {
std::vector<label> ins;
for (auto col : in[id]) {
ins.push_back(col);
}
fused_in.push_back(ins);
prev = comm_rank[id];
}
}


// iterate in the order of communication ranks
for (auto id : id_permutation) {
auto &cols = in[id];
label ctr = 0;
for (auto cols : fused_in) {
// sort by global id because this the order how they are sent
std::stable_sort(cols.begin(), cols.end());
// based on global col we compute the compressed recv ctr
for (auto col : cols) {
// new element found
if (col_map.find(col) == col_map.end()) {
Expand Down
13 changes: 10 additions & 3 deletions src/Repartitioner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,11 @@ Repartitioner::repartition_comm_pattern(
const ExecutorHandler &exec_handler,
std::shared_ptr<const CommunicationPattern> src_comm_pattern) const
{
if (ranks_per_gpu_ == 1) {
return src_comm_pattern;
}
// TODO:add early return again
// and just sort send_idxs
// if (ranks_per_gpu_ == 1) {
// return src_comm_pattern;
// }

// using comm_size_type = label;
auto exec = exec_handler.get_ref_exec();
Expand Down Expand Up @@ -326,6 +328,11 @@ Repartitioner::repartition_comm_pattern(
}
}

// sort the send_idxs so that we send ordered dofs per interface
for (auto &iface_idxs : merged_send_idxs) {
std::stable_sort(iface_idxs.begin(), iface_idxs.end());
}

// recompute send_idxs
send_idxs.clear();

Expand Down
Loading