Fix/multinode issues by greole · Pull Request #161 · hpsim/OGL · GitHub
[go: up one dir, main page]
Skip to content

Fix/multinode issues #161

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions include/OGL/DevicePersistent/ExecutorHandler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@ struct DeviceIdHandler {
FatalErrorInFunction << "Only parallel runs are supported for OGL"
<< exit(FatalError);
}

if (Pstream::nProcs(0) % ranks_per_gpu != 0) {
FatalErrorInFunction
<< " Total number of ranks = " << Pstream::nProcs(0)
<< " is not a multiple of "
<< " ranksPerGPU " << ranks_per_gpu << exit(FatalError);
}
}

/* @brief compute the local device id
Expand Down Expand Up @@ -102,7 +109,6 @@ struct ExecutorInitFunctor {
{
auto host_exec = gko::share(gko::ReferenceExecutor::create());


auto msg = [](auto exec, auto id) {
std::string s;
// auto node_comm = Pstream::commInterHost();
Expand All @@ -111,16 +117,17 @@ struct ExecutorInitFunctor {
label global_ranks = Pstream::nProcs(0);
label device_ranks = Pstream::nProcs(node_comm);
label node_id = global_ranks / device_ranks;

// Pstream::barrier(0);
// sleep(0.03 * global_rank);
s += std::string("Create ") + std::string(exec) +
std::string(" executor device ") + std::to_string(id) +
std::string(" node ") + std::to_string(node_id) +
std::string(" local rank [") +
std::to_string(Pstream::myProcNo(node_comm)) +
std::string("/") + std::to_string(device_ranks) +
std::string("/") + std::to_string(device_ranks - 1) +
std::string("] global rank [") + std::to_string(global_rank) +
std::string("/") + std::to_string(global_ranks) +
std::string("/") + std::to_string(global_ranks - 1) +
std::string("]");
return s;
};
Expand All @@ -135,7 +142,8 @@ struct ExecutorInitFunctor {
label id = device_id_handler_.compute_device_id(
gko::CudaExecutor::get_num_devices());
LOG_0(verbose_, msg(executor_name_, id))
return gko::share(gko::CudaExecutor::create(id, host_exec));
auto ret = gko::share(gko::CudaExecutor::create(id, host_exec));
return ret;
}
if (executor_name_ == "sycl" || executor_name_ == "dpcpp") {
if (version.dpcpp_version.tag == not_compiled_tag) {
Expand Down
4 changes: 4 additions & 0 deletions include/OGL/MatrixWrapper/HostMatrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ class HostMatrixWrapper {

const word field_name_;

const word folder_;

// Whether the matrix coefficients should be reordered
// during copy or on device
const bool reorder_on_copy_;
Expand Down Expand Up @@ -268,6 +270,8 @@ class HostMatrixWrapper {
const ExecutorHandler &get_exec_handler() const { return exec_; }

const word get_field_name() const { return field_name_; }

const word get_folder() const { return folder_; }
};


Expand Down
2 changes: 1 addition & 1 deletion include/OGL/MatrixWrapper/SparsityPattern.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ std::vector<label> sort_permutation(const std::vector<T> &vec, Compare compare)
*to
*/
std::pair<std::vector<std::vector<label>>, std::vector<label>> compress_cols(
std::vector<std::vector<label>> in, std::vector<label> comm_id);
std::vector<std::vector<label>> in, std::vector<label> comm_rank);
} // namespace detail

namespace Foam {
Expand Down
16 changes: 8 additions & 8 deletions include/OGL/Preconditioner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,14 +302,6 @@ class Preconditioner {
.with_max_block_size(static_cast<gko::uint32>(1))
.on(device_exec));

auto coarsest_gen = gko::share(
cg::build()
.with_preconditioner(ras::build()
.with_local_solver(pre_factory)
.on(device_exec))
.with_criteria(gko::stop::Iteration::build().with_max_iters(
static_cast<gko::uint32>(coarse_solver_iters)))
.on(device_exec));

word msg =
"Generate preconditioner: " + name +
Expand Down Expand Up @@ -361,6 +353,14 @@ class Preconditioner {
}

if (type == "Distributed") {
auto coarsest_gen = gko::share(
cg::build()
.with_preconditioner(
ras::build().with_local_solver(pre_factory))
.with_criteria(
gko::stop::Iteration::build().with_max_iters(
static_cast<gko::uint32>(coarse_solver_iters)))
.on(device_exec));
auto gkodistmatrix =
gko::as<RepartDistMatrix>(gkomatrix)->get_dist_matrix();
auto smoother_gen = gko::share(
Expand Down
4 changes: 2 additions & 2 deletions include/OGL/lduLduBase.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -272,9 +272,10 @@ class lduLduBase : public OGL_Info,
auto precond = this->init_preconditioner(
dist_A_v, exec_handler_.get_device_exec());)

bool active = repartitioner->get_repart_size() != 0;
bool export_system(
solver_controls_.lookupOrDefault<Switch>("export", false));
if (export_system && db_.time().writeTime()) {
if (export_system && db_.time().writeTime() && active) {
bool write_global(
solver_controls_.lookupOrDefault<Switch>("writeGlobal", true));
LOG_0(verbose_, "Export system")
Expand All @@ -293,7 +294,6 @@ class lduLduBase : public OGL_Info,
LOG_1(verbose_, "done create solver")

// solve only on active rank
bool active = repartitioner->get_repart_size() != 0;
label delta_t_solve_ = 0;
bool split_mpi_comm =
solver_controls_.lookupOrDefault<Switch>("splitMPIComm", true);
Expand Down
8 changes: 5 additions & 3 deletions src/CommunicationPattern.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -280,15 +280,17 @@ std::ostream &operator<<(std::ostream &out, const CommunicationPattern &e)
}


// computes flat vector of send idxs
std::vector<label> CommunicationPattern::total_rank_send_idx() const
{
std::vector<label> tmp;
// flatten and return
std::vector<label> ret;

for (auto &rows : send_idxs) {
tmp.insert(tmp.end(), rows.begin(), rows.end());
ret.insert(ret.end(), rows.begin(), rows.end());
}

return tmp;
return ret;
}


Expand Down
13 changes: 13 additions & 0 deletions src/MatrixWrapper/Distributed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// SPDX-License-Identifier: GPL-3.0-or-later

#include "OGL/MatrixWrapper/Distributed.hpp"
#include <fstream>

/* helper function to convert a sparsity pattern into a vector of linops with
* zero coefficients
Expand Down Expand Up @@ -169,6 +170,7 @@ void RepartDistMatrix::write(const ExecutorHandler &exec_handler,
auto non_local = gko::share(
gko::matrix::Coo<scalar, label>::create(exec_handler.get_ref_exec()));


if (fuse_) {
gko::as<LocalMatrixType>(dist_mtx_->get_local_matrix())
->convert_to(local.get());
Expand Down Expand Up @@ -461,6 +463,17 @@ std::shared_ptr<RepartDistMatrix> create_impl(
auto [send_counts, send_offsets, recv_sizes, recv_offsets] =
repart_comm_pattern->send_recv_pattern();

if (verbose > 1){
std::ofstream myfile;
std::string folder = host_A->get_folder();
myfile.open(folder + "/host_comm_pattern_" +
std::to_string(Pstream::myProcNo()));
myfile << "repart_comm_pattern " << *repart_comm_pattern.get() << "\n";
myfile << "\nrecv_gather_idxs: " << convert_to_vector(recv_gather_idxs)
<< "\nrecv_sizes: " << recv_sizes
<< "\nrecv_offsets: " << recv_offsets << "\n";
}

if (fuse) {
dist_A = gko::share(dist_mtx::create(
device_exec, device_comm, global_dim, local_linops[0],
Expand Down
6 changes: 5 additions & 1 deletion src/MatrixWrapper/HostMatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ HostMatrixWrapper::HostMatrixWrapper(
device_id_guard_{db, fieldName, exec_.get_device_exec()},
verbose_(verbose),
field_name_(fieldName),
folder_(db.time().rootPath()),
reorder_on_copy_(
solverControls.lookupOrDefault<Switch>("reorderOnHost", true)),
addr_(addr),
Expand Down Expand Up @@ -168,7 +169,10 @@ HostMatrixWrapper::create_communication_pattern() const
// create index_sets
std::vector<std::vector<label>> send_idxs;
for (label proc : target_ids) {
send_idxs.emplace_back(interface_cell_map[proc]);
auto send_idx = interface_cell_map[proc];
// comm pattern send idxs need to be in order
std::stable_sort(send_idx.begin(), send_idx.end());
send_idxs.emplace_back(send_idx);
}

return std::make_shared<CommunicationPattern>(get_exec_handler(),
Expand Down
41 changes: 33 additions & 8 deletions src/MatrixWrapper/SparsityPattern.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ namespace detail {


std::pair<std::vector<std::vector<label>>, std::vector<label>> compress_cols(
std::vector<std::vector<label>> in, std::vector<label> comm_id)
std::vector<std::vector<label>> in, std::vector<label> comm_rank)
{
// create a sorting map based on the comm ids
// here the ids with higher id should receive data first
auto id_permutation =
sort_permutation(comm_id, [](label a, label b) { return a < b; });
// create a sorting map based on the comm ranks
// here data from lower ranks is received first
auto comm_permutation =
sort_permutation(comm_rank, [](label a, label b) { return a < b; });
std::map<label, label> col_map;

std::vector<label> global_cols;
Expand All @@ -23,10 +23,35 @@ std::pair<std::vector<std::vector<label>>, std::vector<label>> compress_cols(
}
}

label ctr = 0;

// ranks that are the same need to be fused
// first before sorting the interface
// because we send indices sorted per interface
std::vector<std::vector<label>> fused_in;
label prev = -1;
for (auto id : comm_permutation) {
if (comm_rank[id] == prev) {
auto &back = fused_in.back();
for (auto col : in[id]) {
back.push_back(col);
}
} else {
std::vector<label> ins;
for (auto col : in[id]) {
ins.push_back(col);
}
fused_in.push_back(ins);
prev = comm_rank[id];
}
}


// iterate in the order of communication ranks
for (auto id : id_permutation) {
auto &cols = in[id];
label ctr = 0;
for (auto cols : fused_in) {
// sort by global id because this the order how they are sent
std::stable_sort(cols.begin(), cols.end());
// based on global col we compute the compressed recv ctr
for (auto col : cols) {
// new element found
if (col_map.find(col) == col_map.end()) {
Expand Down
13 changes: 10 additions & 3 deletions src/Repartitioner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,11 @@ Repartitioner::repartition_comm_pattern(
const ExecutorHandler &exec_handler,
std::shared_ptr<const CommunicationPattern> src_comm_pattern) const
{
if (ranks_per_gpu_ == 1) {
return src_comm_pattern;
}
// TODO:add early return again
// and just sort send_idxs
// if (ranks_per_gpu_ == 1) {
// return src_comm_pattern;
// }

// using comm_size_type = label;
auto exec = exec_handler.get_ref_exec();
Expand Down Expand Up @@ -326,6 +328,11 @@ Repartitioner::repartition_comm_pattern(
}
}

// sort the send_idxs so that we send ordered dofs per interface
for (auto &iface_idxs : merged_send_idxs) {
std::stable_sort(iface_idxs.begin(), iface_idxs.end());
}

// recompute send_idxs
send_idxs.clear();

Expand Down
Loading