diff --git a/.bazelversion b/.bazelversion
index f22d756da39..a8907c025d5 100644
--- a/.bazelversion
+++ b/.bazelversion
@@ -1 +1 @@
-6.5.0
+7.0.2
diff --git a/.ci/env/openblas.sh b/.ci/env/openblas.sh
index f154c5463df..2a2e8ddf448 100755
--- a/.ci/env/openblas.sh
+++ b/.ci/env/openblas.sh
@@ -16,7 +16,7 @@
 #===============================================================================
 
 sudo apt-get update
-sudo apt-get install build-essential gcc gfortran
+sudo apt-get -y install build-essential gcc gfortran
 git clone https://github.com/xianyi/OpenBLAS.git
 CoreCount=$(lscpu -p | grep -Ev '^#' | wc -l)
 pushd OpenBLAS
diff --git a/.ci/env/tbb.sh b/.ci/env/tbb.sh
new file mode 100755
index 00000000000..3b6a991647c
--- /dev/null
+++ b/.ci/env/tbb.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+#===============================================================================
+# Copyright contributors to the oneDAL project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+# Print the usage summary for this script, including the compiler override hint
+show_help() {
+    printf '%s\n' "Usage: $0 [-h]" \
+        "  -h  Display this information" \
+        "  Set CC and CXX environment variables to change the compiler. Default is GNU."
+}
+
+# Parse command-line flags. The leading ':' in the optstring selects silent
+# error reporting, so an unknown flag arrives as '?' with the offending
+# letter in OPTARG.
+while getopts ":h" flag; do
+    if [ "${flag}" = "h" ]; then
+        show_help
+        exit 0
+    elif [ "${flag}" = "?" ]; then
+        # Report the bad flag on stderr, then show usage and fail
+        echo "Invalid option: -$OPTARG" >&2
+        show_help
+        exit 1
+    fi
+done
+
+# Set default values for CXX and CC
+CXX="${CXX:-g++}"
+CC="${CC:-gcc}"
+
+echo "CXX is set to: $CXX"
+echo "CC is set to: $CC"
+
+TBB_VERSION="v2021.10.0"
+
+# Map the machine name reported by uname to the per-architecture directory
+# used for the lib/ layout further down; unrecognized machines keep their name.
+arch=$(uname -m)
+case "${arch}" in
+    x86_64)  arch_dir="intel64" ;;
+    aarch64) arch_dir="arm" ;;
+    *)       arch_dir="${arch}" ;;
+esac
+
+sudo apt-get update
+sudo apt-get install build-essential gcc gfortran cmake -y
+git clone --depth 1 --branch ${TBB_VERSION} https://github.com/oneapi-src/oneTBB.git onetbb-src
+
+CoreCount=$(lscpu -p | grep -Ev '^#' | wc -l)
+
+rm -rf __deps/tbb
+pushd onetbb-src
+mkdir build
+pushd build
+cmake -DCMAKE_CXX_COMPILER=${CXX} -DCMAKE_BUILD_TYPE=Release -DTBB_TEST=OFF -DTBB_STRICT_PROTOTYPES=OFF -DCMAKE_INSTALL_PREFIX=../../__deps/tbb .. 
+make -j${CoreCount} 
+make install
+popd
+popd
+rm -rf onetbb-src
+
+pushd __deps/tbb
+    mkdir -p lnx
+    mv lib/ lnx/
+    mv include/ lnx/ 
+    pushd lnx
+        mkdir -p lib/${arch_dir}/gcc4.8
+        mv lib/libtbb* lib/${arch_dir}/gcc4.8
+    popd
+popd 
diff --git a/.ci/scripts/build.bat b/.ci/scripts/build.bat
index f002b5e755e..d218464707c 100644
--- a/.ci/scripts/build.bat
+++ b/.ci/scripts/build.bat
@@ -40,4 +40,4 @@ call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Buil
 echo make %1 -j%CPUCOUNT% COMPILER=%2 PLAT=win32e REQCPU=%3
 make %1 -j%CPUCOUNT% COMPILER=%2 PLAT=win32e REQCPU=%3
 
-cmake -DINSTALL_DIR=__release_win_vc\daal\latest\lib\cmake\oneDAL -P cmake\scripts\generate_config.cmake
+cmake -DINSTALL_DIR=__release_win_vc\daal\latest\lib\cmake\oneDAL -DARCH_DIR=intel64 -P cmake\scripts\generate_config.cmake
diff --git a/.ci/scripts/build.sh b/.ci/scripts/build.sh
index 8d47bbe9655..62b3623a3fe 100755
--- a/.ci/scripts/build.sh
+++ b/.ci/scripts/build.sh
@@ -1,6 +1,7 @@
 #! /bin/bash
 #===============================================================================
 # Copyright 2019 Intel Corporation
+# Copyright contributors to the oneDAL project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -47,7 +48,17 @@ PLATFORM=$(bash dev/make/identify_os.sh)
 OS=${PLATFORM::3}
 ARCH=${PLATFORM:3:3}
 
+if [[ "${ARCH}" == "32e" ]]
+then
 optimizations=${optimizations:-avx2}
+elif [[ "${ARCH}" == "arm" ]]
+then
+optimizations=${optimizations:-sve}
+else
+echo "Unknown architecture '${ARCH}'"
+exit 1
+fi
+
 backend_config=${backend_config:-mkl}
 GLOBAL_RETURN=0
 
@@ -97,7 +108,16 @@ elif [ "${backend_config}" == "ref" ]; then
 else
     echo "Not supported backend env"
 fi
+
+#TBB setup
+if [[ "${ARCH}" == "32e" ]]
+then
 $(pwd)/dev/download_tbb.sh
+elif [[ "${ARCH}" == "arm" ]]
+then
+$(pwd)/.ci/env/tbb.sh
+fi
+
 echo "Calling make"
 make ${target:-daal_c} ${make_op} \
     COMPILER=${compiler} \
diff --git a/.ci/scripts/test.sh b/.ci/scripts/test.sh
index d014eb9ede4..b9856cf8554 100755
--- a/.ci/scripts/test.sh
+++ b/.ci/scripts/test.sh
@@ -1,6 +1,7 @@
 #! /bin/bash
 #===============================================================================
 # Copyright 2019 Intel Corporation
+# Copyright contributors to the oneDAL project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -54,7 +55,17 @@ TESTING_RETURN=0
 PLATFORM=$(bash dev/make/identify_os.sh)
 OS=${PLATFORM::3}
 ARCH=${PLATFORM:3:3}
-full_arch=intel64
+if [ "$ARCH" == "32e" ]; then
+    full_arch=intel64
+    arch_dir=intel_intel64
+elif [ "$ARCH" == "arm" ]; then
+    full_arch=arm
+    arch_dir=arm_aarch64
+else
+    echo "Unknown architecture ${ARCH} detected for platform ${PLATFORM}"
+    exit 1
+fi
+
 build_system=${build_system:-cmake}
 backend=${backend:-mkl}
 
@@ -161,7 +172,7 @@ for link_mode in ${link_modes}; do
         fi
         output_result=
         err=
-        cmake_results_dir="_cmake_results/intel_intel64_${lib_ext}"
+        cmake_results_dir="_cmake_results/${arch_dir}_${lib_ext}"
         for p in ${cmake_results_dir}/*; do
             e=$(basename "$p")
             ${p} 2>&1 > ${e}.res
diff --git a/.github/renovate.json b/.github/renovate.json
index 2e9a9582bb6..810cef4990a 100644
--- a/.github/renovate.json
+++ b/.github/renovate.json
@@ -1,6 +1,6 @@
 {
   "extends": [
-    "config:base",
+    "config:recommended",
     ":preserveSemverRanges"
   ],
   "pip_requirements": {
diff --git a/.github/workflows/renovate-validation.yml b/.github/workflows/renovate-validation.yml
index a82ae67b1cd..3241cb94564 100644
--- a/.github/workflows/renovate-validation.yml
+++ b/.github/workflows/renovate-validation.yml
@@ -25,6 +25,6 @@ jobs:
       - name: Checkout
         uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4
       - name: Validate
-        uses: suzuki-shunsuke/github-action-renovate-config-validator@v0.1.3
+        uses: suzuki-shunsuke/github-action-renovate-config-validator@v1.0.1
         with:
           config_file_path: .github/renovate.json
diff --git a/.gitignore b/.gitignore
index f61844615bf..b74a2dbc3d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,12 +15,12 @@ bazel-*
 # Visual Studio related files, e.g., ".vscode"
 .vs*
 
-# Bazel directories
-bazel-*
-
 # PyCharm directories
 .idea*
 
 # CMake directories and cache
 CMakeFiles
 CMakeCache.txt
+
+# MODULE.bazel lock file
+MODULE.bazel.lock
diff --git a/MODULE.bazel b/MODULE.bazel
new file mode 100644
index 00000000000..01ab1547fa8
--- /dev/null
+++ b/MODULE.bazel
@@ -0,0 +1,3 @@
+module(name = "onedal")
+
+bazel_dep(name = "bazel_skylib", version = "1.5.0")
diff --git a/WORKSPACE b/WORKSPACE
index 6373b40e270..888ba44e002 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1,14 +1,6 @@
 workspace(name = "onedal")
 
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
-http_archive(
-  name = "bazel_skylib",
-  urls = [
-    "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
-    "https://github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
-  ],
-  sha256 = "cd55a062e763b9349921f0f5db8c3933288dc8ba4f76dd9416aac68acee3cb94",
-)
 
 load("@onedal//dev/bazel/config:config.bzl", "declare_onedal_config")
 declare_onedal_config(
diff --git a/cmake/scripts/generate_config.cmake b/cmake/scripts/generate_config.cmake
index fb04a832435..a891736dd46 100644
--- a/cmake/scripts/generate_config.cmake
+++ b/cmake/scripts/generate_config.cmake
@@ -1,5 +1,6 @@
 #===============================================================================
 # Copyright 2021 Intel Corporation
+# Copyright contributors to the oneDAL project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,7 +20,13 @@ set(DAL_ROOT_REL_PATH "../../..")
 set(INC_REL_PATH "include")
 set(LIB_REL_PATH "lib")
 set(DLL_REL_PATH "redist")
-set(SUB_DIR "intel64")
+# Library sub-directory for the target architecture (e.g. "intel64").
+# Invocations pass it as -DARCH_DIR=<dir>, while oneDALConfig.cmake.in
+# substitutes @ARCH_DIR_ONEDAL@, so map one onto the other. A value given
+# directly as -DARCH_DIR_ONEDAL=<dir> is left untouched.
+if (DEFINED ARCH_DIR)
+    set(ARCH_DIR_ONEDAL "${ARCH_DIR}")
+endif()
 
 # Parse version info if possible
 if (NOT "$ENV{DALROOT}" STREQUAL "")
diff --git a/cmake/templates/oneDALConfig.cmake.in b/cmake/templates/oneDALConfig.cmake.in
index 26ce3143a2b..73a63b625e7 100644
--- a/cmake/templates/oneDALConfig.cmake.in
+++ b/cmake/templates/oneDALConfig.cmake.in
@@ -209,14 +209,14 @@ foreach (_dal_component ${DAL_LIBS})
         find_library(
             _dal_lib
             NAMES "${LIB_PREFIX}${_dal_component}${LIB_EXT}"
-            PATH_SUFFIXES "lib/intel64"
+            PATH_SUFFIXES "lib/@ARCH_DIR_ONEDAL@"
             PATHS "${_dal_root}")
     elseif (${ONEDAL_LINK} STREQUAL "dynamic")
         add_library(oneDAL::${_dal_component} SHARED IMPORTED)
         find_library(
             _dal_lib
             NAMES "${LIB_PREFIX}${_dal_component}${DLL_EXT}"
-            PATH_SUFFIXES "lib/intel64"
+            PATH_SUFFIXES "lib/@ARCH_DIR_ONEDAL@"
             PATHS "${_dal_root}")
     endif()
 
diff --git a/cpp/daal/include/algorithms/algorithm_container_base_batch.h b/cpp/daal/include/algorithms/algorithm_container_base_batch.h
index 2efe6fdf942..03b3d48b16e 100644
--- a/cpp/daal/include/algorithms/algorithm_container_base_batch.h
+++ b/cpp/daal/include/algorithms/algorithm_container_base_batch.h
@@ -1,6 +1,7 @@
 /* file: algorithm_container_base_batch.h */
 /*******************************************************************************
 * Copyright 2014 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -142,11 +143,18 @@ class AlgorithmContainerImpl<batch> : public AlgorithmContainer<batch>
  * \tparam sse42Container       Implementation for Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2)
  * \tparam avx2Container        Implementation for Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2)
  * \tparam avx512Container      Implementation for Intel(R) Xeon(R) processors based on Intel AVX-512
+ * \tparam sveContainer         Implementation for ARM processors based on Arm Scalable Vector Extension (SVE)
  */
+
+#if defined(TARGET_X86_64)
 template <typename sse2Container DAAL_KERNEL_SSE42_ONLY(typename sse42Container) DAAL_KERNEL_AVX2_ONLY(typename avx2Container)
               DAAL_KERNEL_AVX512_ONLY(typename avx512Container)>
 class DAAL_EXPORT AlgorithmDispatchContainer<batch, sse2Container DAAL_KERNEL_SSE42_ONLY(sse42Container) DAAL_KERNEL_AVX2_ONLY(avx2Container)
                                                         DAAL_KERNEL_AVX512_ONLY(avx512Container)> : public AlgorithmContainerImpl<batch>
+#elif defined(TARGET_ARM)
+template <typename SVEContainer DAAL_KERNEL_SVE_ONLY(typename sveContainer)>
+class DAAL_EXPORT AlgorithmDispatchContainer<batch, SVEContainer DAAL_KERNEL_SVE_ONLY(sveContainer)> : public AlgorithmContainerImpl<batch>
+#endif
 {
 public:
     /**
diff --git a/cpp/daal/include/algorithms/algorithm_container_base_common.h b/cpp/daal/include/algorithms/algorithm_container_base_common.h
index 6d0c946fead..5f63a868634 100644
--- a/cpp/daal/include/algorithms/algorithm_container_base_common.h
+++ b/cpp/daal/include/algorithms/algorithm_container_base_common.h
@@ -1,6 +1,7 @@
 /* file: algorithm_container_base_common.h */
 /*******************************************************************************
 * Copyright 2014 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -24,6 +25,8 @@
 #ifndef __ALGORITHM_CONTAINER_BASE_COMMON_H__
 #define __ALGORITHM_CONTAINER_BASE_COMMON_H__
 
+#include "services/daal_defines.h"
+
 #include "algorithms/algorithm_container_base.h"
 #include "services/error_handling.h"
 #include "services/internal/gpu_support_checker.h"
@@ -53,8 +56,13 @@ namespace interface1
  * \tparam avx2Container        Implementation for Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2)
  * \tparam avx512Container      Implementation for Intel(R) Xeon(R) processors based on Intel AVX-512
  */
+
+#if defined(TARGET_X86_64)
 template <ComputeMode mode, typename sse2Container DAAL_KERNEL_SSE42_ONLY(typename sse42Container) DAAL_KERNEL_AVX2_ONLY(typename avx2Container)
                                 DAAL_KERNEL_AVX512_ONLY(typename avx512Container)>
+#elif defined(TARGET_ARM)
+template <ComputeMode mode, typename SVEContainer DAAL_KERNEL_SVE_ONLY(typename sveContainer)>
+#endif
 class DAAL_EXPORT AlgorithmDispatchContainer : public AlgorithmContainerImpl<mode>
 {
 public:
@@ -99,10 +107,15 @@ class DAAL_EXPORT AlgorithmDispatchContainer : public AlgorithmContainerImpl<mod
     AlgorithmDispatchContainer & operator=(const AlgorithmDispatchContainer &);
 };
 
-#define __DAAL_ALGORITHM_CONTAINER(Mode, ContainerTemplate, ...)                                                                                  \
-    algorithms::AlgorithmDispatchContainer<Mode, ContainerTemplate<__VA_ARGS__, sse2> DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, __VA_ARGS__) \
-                                                     DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, __VA_ARGS__)                                   \
+#if defined(TARGET_X86_64)
+    #define __DAAL_ALGORITHM_CONTAINER(Mode, ContainerTemplate, ...)                                                                                \
+        algorithms::AlgorithmDispatchContainer<Mode, ContainerTemplate<__VA_ARGS__, sse2> DAAL_KERNEL_SSE42_CONTAINER(                              \
+                                                         ContainerTemplate, __VA_ARGS__) DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, __VA_ARGS__) \
                                                          DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, __VA_ARGS__)>
+#elif defined(TARGET_ARM)
+    #define __DAAL_ALGORITHM_CONTAINER(Mode, ContainerTemplate, ...) \
+        algorithms::AlgorithmDispatchContainer<Mode, ContainerTemplate<__VA_ARGS__, sve> DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, __VA_ARGS__)>
+#endif
 
 /** @} */
 } // namespace interface1
diff --git a/cpp/daal/include/services/daal_defines.h b/cpp/daal/include/services/daal_defines.h
index 6d43749d7c8..5415d31dcb7 100644
--- a/cpp/daal/include/services/daal_defines.h
+++ b/cpp/daal/include/services/daal_defines.h
@@ -1,6 +1,7 @@
 /* file: daal_defines.h */
 /*******************************************************************************
 * Copyright 2014 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -28,6 +29,14 @@
 
 #include <cstddef> // for size_t
 
+#if defined(__x86_64__) || defined(__x86_64) || defined(__amd64) || defined(_M_AMD64)
+    #define TARGET_X86_64
+#endif
+
+#if defined(__ARM_ARCH) || defined(__aarch64__)
+    #define TARGET_ARM
+#endif
+
 #if (defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)) && !defined(SYCL_LANGUAGE_VERSION)
     #define DAAL_INTEL_CPP_COMPILER
 #endif
@@ -65,6 +74,8 @@
 #if !defined(DAAL_INT)
     #if defined(_WIN64) || defined(__x86_64__)
         #define DAAL_INT __int64
+    #elif defined(TARGET_ARM)
+        #define DAAL_INT __int64
     #else
         #define DAAL_INT __int32
     #endif
diff --git a/cpp/daal/include/services/env_detect.h b/cpp/daal/include/services/env_detect.h
index 83f4040dfac..9f6ad24fef7 100644
--- a/cpp/daal/include/services/env_detect.h
+++ b/cpp/daal/include/services/env_detect.h
@@ -1,6 +1,7 @@
 /* file: env_detect.h */
 /*******************************************************************************
 * Copyright 2014 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -42,11 +43,16 @@ namespace daal
  */
 enum CpuType
 {
+#if defined(TARGET_X86_64)
     sse2        = 0, /*!< Intel(R) Streaming SIMD Extensions 2 (Intel(R) SSE2) */
     sse42       = 2, /*!< Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) */
     avx2        = 4, /*!< Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2) */
     avx512      = 6, /*!< Intel(R) Xeon(R) processors based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) */
     lastCpuType = avx512
+#elif defined(TARGET_ARM)
+    sve         = 0, /*!< ARM(R) processors based on Arm's Scalable Vector Extension (SVE) */
+    lastCpuType = sve
+#endif
 };
 
 namespace services
@@ -91,7 +97,12 @@ class DAAL_EXPORT Environment : public Base
     enum CpuTypeEnable
     {
         cpu_default = 0, /*!< Default processor type */
-        avx512      = 2  /*!< Intel(R) Xeon(R) processors based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) \DAAL_DEPRECATED */
+
+#if defined(TARGET_X86_64)
+        avx512 = 2 /*!< Intel(R) Xeon(R) processors based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) \DAAL_DEPRECATED */
+#elif defined(TARGET_ARM)
+        sve = 2, /*!< ARM(R) processors based on Arm's Scalable Vector Extension (SVE) */
+#endif
     };
 
     /**
@@ -167,7 +178,10 @@ class DAAL_EXPORT Environment : public Base
         _executionContext = internal::ImplAccessor::getImplPtr<services::internal::sycl::ExecutionContextIface>(ctx);
     }
 
-    services::internal::sycl::ExecutionContextIface & getDefaultExecutionContext() { return *_executionContext; }
+    services::internal::sycl::ExecutionContextIface & getDefaultExecutionContext()
+    {
+        return *_executionContext;
+    }
 
 private:
     Environment();
diff --git a/cpp/daal/include/services/internal/aarch64/aarch64_kernel_defines.h b/cpp/daal/include/services/internal/aarch64/aarch64_kernel_defines.h
new file mode 100644
index 00000000000..799525128ef
--- /dev/null
+++ b/cpp/daal/include/services/internal/aarch64/aarch64_kernel_defines.h
@@ -0,0 +1,41 @@
+/* file: aarch64_kernel_defines.h */
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef __aarch64_KERNEL_DEFINES_H__
+#define __aarch64_KERNEL_DEFINES_H__
+
+#define DAAL_KERNEL_SVE
+
+#if defined(DAAL_KERNEL_SVE)
+    #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID
+    #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID               daal::sve
+    #define DAAL_KERNEL_SVE_ONLY(something)                        , something
+    #define DAAL_KERNEL_SVE_ONLY_CODE(...)                         __VA_ARGS__
+    #define DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, ...)      , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sve, __VA_ARGS__)
+    #define DAAL_KERNEL_SVE_CONTAINER1(ContainerTemplate, ...)     extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sve, __VA_ARGS__);
+    #define DAAL_KERNEL_SVE_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, sve, __VA_ARGS__)
+    #define DAAL_KERNEL_SVE_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
+#else
+    #define DAAL_KERNEL_SVE_ONLY(something)
+    #define DAAL_KERNEL_SVE_ONLY_CODE(...)
+    #define DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, ...)
+    #define DAAL_KERNEL_SVE_CONTAINER1(ContainerTemplate, ...)
+    #define DAAL_KERNEL_SVE_CONTAINER_CASE(ContainerTemplate, ...)
+    #define DAAL_KERNEL_SVE_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
+#endif
+
+#endif
diff --git a/cpp/daal/include/services/internal/daal_kernel_defines.h b/cpp/daal/include/services/internal/daal_kernel_defines.h
index fd631a61f3f..f4f723dfd13 100644
--- a/cpp/daal/include/services/internal/daal_kernel_defines.h
+++ b/cpp/daal/include/services/internal/daal_kernel_defines.h
@@ -1,6 +1,7 @@
 /* file: daal_kernel_defines.h */
 /*******************************************************************************
 * Copyright 2014 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -31,11 +32,18 @@
  * @ingroup services
  * @{
  */
+
 #define DAAL_KERNEL_SSE2
 #define DAAL_KERNEL_SSE42
 #define DAAL_KERNEL_AVX2
 #define DAAL_KERNEL_AVX512
 
+#if defined(TARGET_X86_64)
+    #include "services/internal/x86_64/x86_64_kernel_defines.h"
+#elif defined(TARGET_ARM)
+    #include "services/internal/aarch64/aarch64_kernel_defines.h"
+#endif
+
 #define DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, cpuType, ...) ContainerTemplate<__VA_ARGS__, cpuType>
 #define DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, cpuType, ...)                              \
 case cpuType:                                                                                    \
@@ -50,81 +58,6 @@ case cpuType:
     break;                                                                                      \
 }
 
-#if defined(DAAL_KERNEL_SSE2)
-    #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID
-    #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID           daal::sse2
-    #define DAAL_KERNEL_SSE2_ONLY(something)                   , something
-    #define DAAL_KERNEL_SSE2_ONLY_CODE(...)                    __VA_ARGS__
-    #define DAAL_KERNEL_SSE2_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse2, __VA_ARGS__)
-    #define DAAL_KERNEL_SSE2_CONTAINER1(ContainerTemplate, ...) \
-        extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse2, __VA_ARGS__);
-    #define DAAL_KERNEL_SSE2_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, sse2, __VA_ARGS__)
-#else
-    #define DAAL_KERNEL_SSE2_ONLY(something)
-    #define DAAL_KERNEL_SSE2_ONLY_CODE(...)
-    #define DAAL_KERNEL_SSE2_CONTAINER(ContainerTemplate, ...)
-    #define DAAL_KERNEL_SSE2_CONTAINER1(ContainerTemplate, ...)
-    #define DAAL_KERNEL_SSE2_CONTAINER_CASE(ContainerTemplate, ...)
-    #define DAAL_KERNEL_SSE2_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
-#endif
-
-#if defined(DAAL_KERNEL_SSE42)
-    #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID
-    #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID            daal::sse42
-    #define DAAL_KERNEL_SSE42_ONLY(something)                   , something
-    #define DAAL_KERNEL_SSE42_ONLY_CODE(...)                    __VA_ARGS__
-    #define DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse42, __VA_ARGS__)
-    #define DAAL_KERNEL_SSE42_CONTAINER1(ContainerTemplate, ...) \
-        extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse42, __VA_ARGS__);
-    #define DAAL_KERNEL_SSE42_CONTAINER_CASE(ContainerTemplate, ...)      DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, sse42, __VA_ARGS__)
-    #define DAAL_KERNEL_SSE42_CONTAINER_CASE_SYCL(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, sse42, __VA_ARGS__)
-#else
-    #define DAAL_KERNEL_SSE42_ONLY(something)
-    #define DAAL_KERNEL_SSE42_ONLY_CODE(...)
-    #define DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, ...)
-    #define DAAL_KERNEL_SSE42_CONTAINER1(ContainerTemplate, ...)
-    #define DAAL_KERNEL_SSE42_CONTAINER_CASE(ContainerTemplate, ...)
-    #define DAAL_KERNEL_SSE42_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
-#endif
-
-#if defined(DAAL_KERNEL_AVX2)
-    #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID
-    #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID           daal::avx2
-    #define DAAL_KERNEL_AVX2_ONLY(something)                   , something
-    #define DAAL_KERNEL_AVX2_ONLY_CODE(...)                    __VA_ARGS__
-    #define DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx2, __VA_ARGS__)
-    #define DAAL_KERNEL_AVX2_CONTAINER1(ContainerTemplate, ...) \
-        extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx2, __VA_ARGS__);
-    #define DAAL_KERNEL_AVX2_CONTAINER_CASE(ContainerTemplate, ...)      DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, avx2, __VA_ARGS__)
-    #define DAAL_KERNEL_AVX2_CONTAINER_CASE_SYCL(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, avx2, __VA_ARGS__)
-#else
-    #define DAAL_KERNEL_AVX2_ONLY(something)
-    #define DAAL_KERNEL_AVX2_ONLY_CODE(...)
-    #define DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, ...)
-    #define DAAL_KERNEL_AVX2_CONTAINER1(ContainerTemplate, ...)
-    #define DAAL_KERNEL_AVX2_CONTAINER_CASE(ContainerTemplate, ...)
-    #define DAAL_KERNEL_AVX2_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
-#endif
-
-#if defined(DAAL_KERNEL_AVX512)
-    #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID
-    #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID             daal::avx512
-    #define DAAL_KERNEL_AVX512_ONLY(something)                   , something
-    #define DAAL_KERNEL_AVX512_ONLY_CODE(...)                    __VA_ARGS__
-    #define DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx512, __VA_ARGS__)
-    #define DAAL_KERNEL_AVX512_CONTAINER1(ContainerTemplate, ...) \
-        extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx512, __VA_ARGS__);
-    #define DAAL_KERNEL_AVX512_CONTAINER_CASE(ContainerTemplate, ...)      DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, avx512, __VA_ARGS__)
-    #define DAAL_KERNEL_AVX512_CONTAINER_CASE_SYCL(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, avx512, __VA_ARGS__)
-#else
-    #define DAAL_KERNEL_AVX512_ONLY(something)
-    #define DAAL_KERNEL_AVX512_ONLY_CODE(...)
-    #define DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, ...)
-    #define DAAL_KERNEL_AVX512_CONTAINER1(ContainerTemplate, ...)
-    #define DAAL_KERNEL_AVX512_CONTAINER_CASE(ContainerTemplate, ...)
-    #define DAAL_KERNEL_AVX512_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
-#endif
-
 #define DAAL_EXPAND(...) __VA_ARGS__
 /** @} */
 
diff --git a/cpp/daal/include/services/internal/x86_64/x86_64_kernel_defines.h b/cpp/daal/include/services/internal/x86_64/x86_64_kernel_defines.h
new file mode 100644
index 00000000000..f9570309739
--- /dev/null
+++ b/cpp/daal/include/services/internal/x86_64/x86_64_kernel_defines.h
@@ -0,0 +1,96 @@
+/* file: x86_64_kernel_defines.h */
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef __x86_64_KERNEL_DEFINES_H__
+#define __x86_64_KERNEL_DEFINES_H__
+
+#if defined(DAAL_KERNEL_SSE2)
+    #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID
+    #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID           daal::sse2
+    #define DAAL_KERNEL_SSE2_ONLY(something)                   , something
+    #define DAAL_KERNEL_SSE2_ONLY_CODE(...)                    __VA_ARGS__
+    #define DAAL_KERNEL_SSE2_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse2, __VA_ARGS__)
+    #define DAAL_KERNEL_SSE2_CONTAINER1(ContainerTemplate, ...) \
+        extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse2, __VA_ARGS__);
+    #define DAAL_KERNEL_SSE2_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, sse2, __VA_ARGS__)
+#else
+    #define DAAL_KERNEL_SSE2_ONLY(something)
+    #define DAAL_KERNEL_SSE2_ONLY_CODE(...)
+    #define DAAL_KERNEL_SSE2_CONTAINER(ContainerTemplate, ...)
+    #define DAAL_KERNEL_SSE2_CONTAINER1(ContainerTemplate, ...)
+    #define DAAL_KERNEL_SSE2_CONTAINER_CASE(ContainerTemplate, ...)
+    #define DAAL_KERNEL_SSE2_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
+#endif
+
+#if defined(DAAL_KERNEL_SSE42)
+    #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID
+    #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID            daal::sse42
+    #define DAAL_KERNEL_SSE42_ONLY(something)                   , something
+    #define DAAL_KERNEL_SSE42_ONLY_CODE(...)                    __VA_ARGS__
+    #define DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse42, __VA_ARGS__)
+    #define DAAL_KERNEL_SSE42_CONTAINER1(ContainerTemplate, ...) \
+        extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse42, __VA_ARGS__);
+    #define DAAL_KERNEL_SSE42_CONTAINER_CASE(ContainerTemplate, ...)      DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, sse42, __VA_ARGS__)
+    #define DAAL_KERNEL_SSE42_CONTAINER_CASE_SYCL(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, sse42, __VA_ARGS__)
+#else
+    #define DAAL_KERNEL_SSE42_ONLY(something)
+    #define DAAL_KERNEL_SSE42_ONLY_CODE(...)
+    #define DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, ...)
+    #define DAAL_KERNEL_SSE42_CONTAINER1(ContainerTemplate, ...)
+    #define DAAL_KERNEL_SSE42_CONTAINER_CASE(ContainerTemplate, ...)
+    #define DAAL_KERNEL_SSE42_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
+#endif
+
+#if defined(DAAL_KERNEL_AVX2)
+    #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID
+    #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID           daal::avx2
+    #define DAAL_KERNEL_AVX2_ONLY(something)                   , something
+    #define DAAL_KERNEL_AVX2_ONLY_CODE(...)                    __VA_ARGS__
+    #define DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx2, __VA_ARGS__)
+    #define DAAL_KERNEL_AVX2_CONTAINER1(ContainerTemplate, ...) \
+        extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx2, __VA_ARGS__);
+    #define DAAL_KERNEL_AVX2_CONTAINER_CASE(ContainerTemplate, ...)      DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, avx2, __VA_ARGS__)
+    #define DAAL_KERNEL_AVX2_CONTAINER_CASE_SYCL(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, avx2, __VA_ARGS__)
+#else
+    #define DAAL_KERNEL_AVX2_ONLY(something)
+    #define DAAL_KERNEL_AVX2_ONLY_CODE(...)
+    #define DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, ...)
+    #define DAAL_KERNEL_AVX2_CONTAINER1(ContainerTemplate, ...)
+    #define DAAL_KERNEL_AVX2_CONTAINER_CASE(ContainerTemplate, ...)
+    #define DAAL_KERNEL_AVX2_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
+#endif
+
+#if defined(DAAL_KERNEL_AVX512)
+    #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID
+    #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID             daal::avx512
+    #define DAAL_KERNEL_AVX512_ONLY(something)                   , something
+    #define DAAL_KERNEL_AVX512_ONLY_CODE(...)                    __VA_ARGS__
+    #define DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx512, __VA_ARGS__)
+    #define DAAL_KERNEL_AVX512_CONTAINER1(ContainerTemplate, ...) \
+        extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx512, __VA_ARGS__);
+    #define DAAL_KERNEL_AVX512_CONTAINER_CASE(ContainerTemplate, ...)      DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, avx512, __VA_ARGS__)
+    #define DAAL_KERNEL_AVX512_CONTAINER_CASE_SYCL(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, avx512, __VA_ARGS__)
+#else
+    #define DAAL_KERNEL_AVX512_ONLY(something)
+    #define DAAL_KERNEL_AVX512_ONLY_CODE(...)
+    #define DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, ...)
+    #define DAAL_KERNEL_AVX512_CONTAINER1(ContainerTemplate, ...)
+    #define DAAL_KERNEL_AVX512_CONTAINER_CASE(ContainerTemplate, ...)
+    #define DAAL_KERNEL_AVX512_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
+#endif
+
+#endif
diff --git a/cpp/daal/src/algorithms/algorithm_hyperparameter.cpp b/cpp/daal/src/algorithms/algorithm_hyperparameter.cpp
index 17b4c923eef..81b157d79ff 100644
--- a/cpp/daal/src/algorithms/algorithm_hyperparameter.cpp
+++ b/cpp/daal/src/algorithms/algorithm_hyperparameter.cpp
@@ -1,6 +1,7 @@
 /** file algorithm_hyperparameter.cpp */
 /*******************************************************************************
 * Copyright 2023 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -66,10 +67,10 @@ struct HyperparameterImpl : public HyperparameterBaseImpl
 
 protected:
     /** Stores integer hyperparameters of the algorithm */
-    HashTable<sse2, uint32_t, DAAL_INT64> _iHT;
+    HashTable<DAAL_BASE_CPU, uint32_t, DAAL_INT64> _iHT;
 
     /** Stores floating point hyperparameters of the algorithm */
-    HashTable<sse2, uint32_t, double> _dHT;
+    HashTable<DAAL_BASE_CPU, uint32_t, double> _dHT;
 };
 
 } // namespace internal
diff --git a/cpp/daal/src/algorithms/covariance/covariance_impl.i b/cpp/daal/src/algorithms/covariance/covariance_impl.i
index 24cb48524c5..0ebceeffcd7 100644
--- a/cpp/daal/src/algorithms/covariance/covariance_impl.i
+++ b/cpp/daal/src/algorithms/covariance/covariance_impl.i
@@ -1,6 +1,7 @@
 /* file: covariance_impl.i */
 /*******************************************************************************
 * Copyright 2014 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -135,8 +136,14 @@ static inline size_t getBlockSize(size_t nrows)
     return 140;
 }
 
+#if defined(TARGET_X86_64)
+    #define DAAL_CPU_TYPE avx512
+#elif defined(TARGET_ARM)
+    #define DAAL_CPU_TYPE sve
+#endif
+
 template <>
-inline size_t getBlockSize<avx512>(size_t nrows)
+inline size_t getBlockSize<DAAL_CPU_TYPE>(size_t nrows)
 {
     return (nrows > 5000 && nrows <= 50000) ? 1024 : 140;
 }
diff --git a/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i b/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i
index b9d354caf90..16064a701b0 100644
--- a/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i
+++ b/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i
@@ -115,6 +115,7 @@ struct SplitData
 
     SplitData()
         : impurityDecrease(-daal::services::internal::MaxVal<algorithmFPType>::get()),
+          left {},
           featureValue(0.0),
           nLeft(0),
           iStart(0),
diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i
index 426457ecad7..377fffd20dd 100644
--- a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i
+++ b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i
@@ -1,6 +1,7 @@
 /* file: df_classification_predict_dense_default_batch_impl.i */
 /*******************************************************************************
 * Copyright 2014 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -945,8 +946,12 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTre
     algorithmFPType * const res  = resBD.get();
     algorithmFPType * const prob = probBD.get();
     daal::SafeStatus safeStat;
-    const size_t nRowsOfRes        = _data->getNumberOfRows();
-    const size_t blockSize         = cpu == avx512 ? _DEFAULT_BLOCK_SIZE : _DEFAULT_BLOCK_SIZE_COMMON;
+    const size_t nRowsOfRes = _data->getNumberOfRows();
+#if defined(TARGET_X86_64)
+    const size_t blockSize = cpu == avx512 ? _DEFAULT_BLOCK_SIZE : _DEFAULT_BLOCK_SIZE_COMMON;
+#elif defined(TARGET_ARM)
+    const size_t blockSize = cpu == sve ? _DEFAULT_BLOCK_SIZE : _DEFAULT_BLOCK_SIZE_COMMON;
+#endif
     const size_t nBlocks           = nRowsOfRes / blockSize;
     const size_t residualSize      = nRowsOfRes - nBlocks * blockSize;
     algorithmFPType * commonBufVal = nullptr;
diff --git a/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i
index 03b89e7843c..1306c154037 100644
--- a/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i
+++ b/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i
@@ -947,8 +947,8 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd
         double improvement;
         algorithmFPType leftWeights;
         algorithmFPType totalWeights;
-        typename DataHelper::ImpurityData impurityLeft;
-        typename DataHelper::ImpurityData impurityRight;
+        typename DataHelper::ImpurityData impurityLeft {};
+        typename DataHelper::ImpurityData impurityRight {};
         typename DataHelper::NodeType::Split * node;
 
         WorkItem()
@@ -1132,7 +1132,7 @@ NodeSplitResult TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cp
     /* zero-based index of best split */
     int64_t iBestSplit               = -1;
     int64_t idxFeatureValueBestSplit = -1;
-    typename DataHelper::TSplitData split;
+    typename DataHelper::TSplitData split {};
     /* RNG for sample drawing */
     RNGsInst<IndexType, cpu> rng;
     /* index for swapping samples in Fisher-Yates sampling */
diff --git a/cpp/daal/src/algorithms/export_win32e.def b/cpp/daal/src/algorithms/export_win32e.def
index a55e32aea15..443714aef69 100644
--- a/cpp/daal/src/algorithms/export_win32e.def
+++ b/cpp/daal/src/algorithms/export_win32e.def
@@ -21,6 +21,9 @@ fpk_serv_memcpy_s
 fpk_serv_lock
 fpk_serv_unlock
 fpk_serv_strnlen_s
+fpk_serv_strncpy_s
+fpk_serv_strncat_s
+fpk_serv_thread_yield
 fpk_serv_core_register_cleanup
 fpk_serv_calloc
 fpk_serv_printf_s
diff --git a/cpp/daal/src/algorithms/kernel_config.h b/cpp/daal/src/algorithms/kernel_config.h
index ed19658b813..e328311714f 100644
--- a/cpp/daal/src/algorithms/kernel_config.h
+++ b/cpp/daal/src/algorithms/kernel_config.h
@@ -1,6 +1,7 @@
 /* file: kernel_config.h */
 /*******************************************************************************
 * Copyright 2023 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -24,6 +25,38 @@
 #ifndef __KERNEL_CONFIG_H__
 #define __KERNEL_CONFIG_H__
 
-#include "src/algorithms/kernel_inst_x86.h"
+#include "services/daal_defines.h"
+#include "src/services/service_defines.h"
+#include "services/internal/daal_kernel_defines.h"
+#include "services/internal/gpu_support_checker.h"
+
+#if defined(TARGET_X86_64)
+    #include "src/algorithms/kernel_inst_x86.h"
+#elif defined(TARGET_ARM)
+    #include "src/algorithms/kernel_inst_arm.h"
+#endif
+
+#define __DAAL_GET_CPUID int cpuid = daalEnv->cpuid;
+
+#define __DAAL_GET_CPUID_SAFE  \
+    int cpuid = DAAL_BASE_CPU; \
+    DAAL_SAFE_CPU_CALL((cpuid = daalEnv->cpuid), (cpuid = DAAL_BASE_CPU))
+
+#define __DAAL_KERNEL_MIN(a, b) ((a) < (b) ? (a) : (b))
+
+#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER_SAFE(ContainerTemplate, Mode, ...)                                                               \
+    __DAAL_INSTANTIATE_DISPATCH_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl<Mode>, __DAAL_GET_CPUID_SAFE, \
+                                     __VA_ARGS__)
+
+#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER(ContainerTemplate, Mode, ...) \
+    __DAAL_INSTANTIATE_DISPATCH_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl<Mode>, __DAAL_GET_CPUID, __VA_ARGS__)
+
+#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(ContainerTemplate, Mode, ...)                                                               \
+    __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl<Mode>, __DAAL_GET_CPUID, \
+                                          __VA_ARGS__)
+
+#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL_SAFE(ContainerTemplate, Mode, ...)                                                               \
+    __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl<Mode>, __DAAL_GET_CPUID_SAFE, \
+                                          __VA_ARGS__)
 
 #endif
diff --git a/cpp/daal/src/algorithms/kernel_inst_arm.h b/cpp/daal/src/algorithms/kernel_inst_arm.h
new file mode 100644
index 00000000000..e72d94ef019
--- /dev/null
+++ b/cpp/daal/src/algorithms/kernel_inst_arm.h
@@ -0,0 +1,71 @@
+/* file: kernel_inst_arm.h */
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/*
+//++
+//  The defines used for kernel allocation, deallocation, and calling kernel methods
+//--
+*/
+
+#ifndef __KERNEL_INST_ARM_H__
+#define __KERNEL_INST_ARM_H__
+
+#define __DAAL_INSTANTIATE_DISPATCH_IMPL(ContainerTemplate, Mode, ClassName, BaseClassName, GetCpuid, ...)                         \
+    DAAL_KERNEL_SVE_CONTAINER1(ContainerTemplate, __VA_ARGS__)                                                                     \
+    namespace interface1                                                                                                           \
+    {                                                                                                                              \
+    template <>                                                                                                                    \
+    ClassName<Mode, ContainerTemplate<__VA_ARGS__, sve> DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, __VA_ARGS__)>::ClassName(     \
+        daal::services::Environment::env * daalEnv)                                                                                \
+        : BaseClassName(daalEnv), _cntr(nullptr)                                                                                   \
+    {                                                                                                                              \
+        GetCpuid switch (__DAAL_KERNEL_MIN(DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID, cpuid))                                       \
+        {                                                                                                                          \
+            DAAL_KERNEL_SVE_CONTAINER_CASE(ContainerTemplate, __VA_ARGS__)                                                         \
+        default: _cntr = (new ContainerTemplate<__VA_ARGS__, sve>(daalEnv)); break;                                                \
+        }                                                                                                                          \
+    }                                                                                                                              \
+                                                                                                                                   \
+    template class ClassName<Mode, ContainerTemplate<__VA_ARGS__, sve> DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, __VA_ARGS__)>; \
+    }
+
+#define __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, ClassName, BaseClassName, GetCpuid, ...)                    \
+    DAAL_KERNEL_SVE_CONTAINER1(ContainerTemplate, __VA_ARGS__)                                                                     \
+    namespace interface1                                                                                                           \
+    {                                                                                                                              \
+    template <>                                                                                                                    \
+    ClassName<Mode, ContainerTemplate<__VA_ARGS__, sve> DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, __VA_ARGS__)>::ClassName(     \
+        daal::services::Environment::env * daalEnv)                                                                                \
+        : BaseClassName(daalEnv), _cntr(nullptr)                                                                                   \
+    {                                                                                                                              \
+        GetCpuid switch (__DAAL_KERNEL_MIN(DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID, cpuid))                                       \
+        {                                                                                                                          \
+            DAAL_KERNEL_SVE_CONTAINER_CASE(ContainerTemplate, __VA_ARGS__)                                                         \
+        default:                                                                                                                   \
+        {                                                                                                                          \
+            using cntrTemplateInst = ContainerTemplate<__VA_ARGS__, sve>;                                                          \
+            static volatile services::internal::GpuSupportRegistrar<cntrTemplateInst> registrar;                                   \
+            _cntr = (new cntrTemplateInst(daalEnv));                                                                               \
+            break;                                                                                                                 \
+        }                                                                                                                          \
+        }                                                                                                                          \
+    }                                                                                                                              \
+                                                                                                                                   \
+    template class ClassName<Mode, ContainerTemplate<__VA_ARGS__, sve> DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, __VA_ARGS__)>; \
+    }
+
+#endif
diff --git a/cpp/daal/src/algorithms/kernel_inst_x86.h b/cpp/daal/src/algorithms/kernel_inst_x86.h
index baf3d8d4153..1b30c74ccb1 100644
--- a/cpp/daal/src/algorithms/kernel_inst_x86.h
+++ b/cpp/daal/src/algorithms/kernel_inst_x86.h
@@ -24,19 +24,6 @@
 #ifndef __KERNEL_INST_X86_H__
 #define __KERNEL_INST_X86_H__
 
-#include "services/daal_defines.h"
-#include "src/services/service_defines.h"
-#include "services/internal/daal_kernel_defines.h"
-#include "services/internal/gpu_support_checker.h"
-
-#define __DAAL_GET_CPUID int cpuid = daalEnv->cpuid;
-
-#define __DAAL_GET_CPUID_SAFE  \
-    int cpuid = DAAL_BASE_CPU; \
-    DAAL_SAFE_CPU_CALL((cpuid = daalEnv->cpuid), (cpuid = DAAL_BASE_CPU))
-
-#define __DAAL_KERNEL_MIN(a, b) ((a) < (b) ? (a) : (b))
-
 #define __DAAL_INSTANTIATE_DISPATCH_IMPL(ContainerTemplate, Mode, ClassName, BaseClassName, GetCpuid, ...)                                       \
     DAAL_KERNEL_SSE2_CONTAINER1(ContainerTemplate, __VA_ARGS__)                                                                                  \
     DAAL_KERNEL_SSE42_CONTAINER1(ContainerTemplate, __VA_ARGS__)                                                                                 \
@@ -64,13 +51,6 @@
                                            DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, __VA_ARGS__)>;                                        \
     }
 
-#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER_SAFE(ContainerTemplate, Mode, ...)                                                               \
-    __DAAL_INSTANTIATE_DISPATCH_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl<Mode>, __DAAL_GET_CPUID_SAFE, \
-                                     __VA_ARGS__)
-
-#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER(ContainerTemplate, Mode, ...) \
-    __DAAL_INSTANTIATE_DISPATCH_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl<Mode>, __DAAL_GET_CPUID, __VA_ARGS__)
-
 #define __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, ClassName, BaseClassName, GetCpuid, ...)                                  \
     DAAL_KERNEL_SSE2_CONTAINER1(ContainerTemplate, __VA_ARGS__)                                                                                  \
     DAAL_KERNEL_SSE42_CONTAINER1(ContainerTemplate, __VA_ARGS__)                                                                                 \
@@ -104,12 +84,4 @@
                                            DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, __VA_ARGS__)>;                                        \
     }
 
-#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(ContainerTemplate, Mode, ...)                                                               \
-    __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl<Mode>, __DAAL_GET_CPUID, \
-                                          __VA_ARGS__)
-
-#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL_SAFE(ContainerTemplate, Mode, ...)                                                               \
-    __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl<Mode>, __DAAL_GET_CPUID_SAFE, \
-                                          __VA_ARGS__)
-
 #endif
diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_csr_lloyd_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_csr_lloyd_batch_fpt_cpu.cpp
index 6ba45be595f..2f4e64e3c7d 100644
--- a/cpp/daal/src/algorithms/kmeans/kmeans_csr_lloyd_batch_fpt_cpu.cpp
+++ b/cpp/daal/src/algorithms/kmeans/kmeans_csr_lloyd_batch_fpt_cpu.cpp
@@ -37,7 +37,7 @@ template class BatchContainer<DAAL_FPTYPE, lloydCSR, DAAL_CPU>;
 }
 namespace internal
 {
-template class KMeansBatchKernel<lloydCSR, DAAL_FPTYPE, DAAL_CPU>;
+template class DAAL_EXPORT KMeansBatchKernel<lloydCSR, DAAL_FPTYPE, DAAL_CPU>;
 } // namespace internal
 } // namespace kmeans
 } // namespace algorithms
diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_init_csr_plusplus_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_init_csr_plusplus_batch_fpt_cpu.cpp
index c46d8302cf0..0ccd42d0429 100644
--- a/cpp/daal/src/algorithms/kmeans/kmeans_init_csr_plusplus_batch_fpt_cpu.cpp
+++ b/cpp/daal/src/algorithms/kmeans/kmeans_init_csr_plusplus_batch_fpt_cpu.cpp
@@ -39,7 +39,7 @@ template class BatchContainer<DAAL_FPTYPE, plusPlusCSR, DAAL_CPU>;
 }
 namespace internal
 {
-template class KMeansInitKernel<plusPlusCSR, DAAL_FPTYPE, DAAL_CPU>;
+template class DAAL_EXPORT KMeansInitKernel<plusPlusCSR, DAAL_FPTYPE, DAAL_CPU>;
 } // namespace internal
 } // namespace init
 } // namespace kmeans
diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i
old mode 100755
new mode 100644
index c32c63ed1fd..4f6ee638fd1
--- a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i
+++ b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i
@@ -247,13 +247,13 @@ template <typename algorithmFPType, CpuType cpu>
 class DataHelperCSR
 {
 public:
-    typedef BlockHelperCSR<algorithmFPType, cpu, CSRNumericTableIface> BlockHelperType;
+    typedef BlockHelperCSR<algorithmFPType, cpu, CSRNumericTable> BlockHelperType;
 
     DataHelperCSR(NumericTable * ntData)
-        : dim(ntData->getNumberOfColumns()), nRows(ntData->getNumberOfRows()), _nt(ntData), _csr(dynamic_cast<CSRNumericTableIface *>(ntData))
+        : dim(ntData->getNumberOfColumns()), nRows(ntData->getNumberOfRows()), _nt(ntData), _csr(dynamic_cast<CSRNumericTable *>(ntData))
     {}
     NumericTable * nt() const { return _nt; }
-    CSRNumericTableIface * ntIface() const { return _csr; }
+    CSRNumericTable * ntIface() const { return _csr; }
 
     Status updateMinDistInBlock(algorithmFPType * const minDistAccTrials, size_t nBlock, size_t iBlock, size_t nTrials, size_t iBestTrial,
                                 const algorithmFPType * aWeights, const algorithmFPType * const pLastAddedCenter, algorithmFPType * const aMinDist)
@@ -261,11 +261,13 @@ public:
         const size_t iStartRow      = iBlock * _nRowsInBlock;                                                  //start row
         const size_t nRowsToProcess = (iBlock == nBlock - 1) ? nRows - iBlock * _nRowsInBlock : _nRowsInBlock; //rows to process
 
-        ReadRowsCSR<algorithmFPType, cpu> ntDataBD(_csr, iStartRow, nRowsToProcess);
-        DAAL_CHECK_BLOCK_STATUS(ntDataBD);
-        const algorithmFPType * const pData = ntDataBD.values();
-        const size_t * const colIdx         = ntDataBD.cols();
-        const size_t * const rowIdx         = ntDataBD.rows();
+        // TODO: switch back to ReadRowsCSR once its static-library linking bug is fixed.
+        daal::data_management::CSRBlockDescriptor<algorithmFPType> block;
+        const Status blockStatus = _csr->getSparseBlock(iStartRow, nRowsToProcess, daal::data_management::readOnly, block);
+        if (!blockStatus.ok()) return blockStatus; // do not read block pointers after a failed fetch
+        const auto pData  = block.getBlockValuesPtr();
+        const auto colIdx = block.getBlockColumnIndicesPtr();
+        const auto rowIdx = block.getBlockRowIndicesPtr();
 
         algorithmFPType * const pDistSqBest   = &aMinDist[iBestTrial * nRows + iStartRow];
         const algorithmFPType * const weights = aWeights ? &aWeights[iStartRow] : nullptr;
@@ -282,7 +284,7 @@ public:
         minDistAccTrials[iBestTrial * nBlock + iBlock] =
             updateMinDistForITrials(pDistSqBest, iBestTrial, nRowsToProcess, pData, colIdx, rowIdx, pLastAddedCenter, weights, pDistSqBest);
 
-        return Status();
+        return _csr->releaseSparseBlock(block);
     }
 
     algorithmFPType updateMinDistForITrials(algorithmFPType * const pDistSq, size_t iTrials, size_t nRowsToProcess,
@@ -316,19 +318,25 @@ public:
     //of the data in this row
     algorithmFPType copyOneRowCalcSumSq(size_t iRow, algorithmFPType * pDst) const
     {
-        ReadRowsCSR<algorithmFPType, cpu> ntDataBD(_csr, iRow, 1);
-        const algorithmFPType * pData = ntDataBD.values();
-        const size_t * colIdx         = ntDataBD.cols();
-        const size_t * rowIdx         = ntDataBD.rows();
+        // TODO: Prefer ReadRowsCSR here; it is avoided because of a bug related to static library linking.
+        // Switch back once ReadRowsCSR is fixed. NOTE(review): the getSparseBlock status is not checked here.
+        daal::data_management::CSRBlockDescriptor<algorithmFPType> block;
+        _csr->getSparseBlock(iRow, 1, daal::data_management::readOnly, block);
+        const auto pData  = block.getBlockValuesPtr();
+        const auto colIdx = block.getBlockColumnIndicesPtr();
+        const auto rowIdx = block.getBlockRowIndicesPtr();
 
         daal::services::internal::service_memset<algorithmFPType, cpu>(pDst, algorithmFPType(0.), dim);
         algorithmFPType res(0.);
         const size_t nValues = rowIdx[1] - rowIdx[0];
-        for (size_t i = 0; i < nValues; ++i, ++pData, ++colIdx)
+        for (size_t i = 0; i < nValues; ++i)
         {
-            res += (*pData) * (*pData);
-            pDst[(*colIdx) - 1] = *pData;
+            const auto val = pData[i];
+            res += val * val;
+            const auto colIndex = colIdx[i];
+            pDst[colIndex - 1]  = val;
         }
+        _csr->releaseSparseBlock(block);
         return res;
     }
 
@@ -338,7 +346,7 @@ public:
 
 protected:
     NumericTable * _nt;
-    CSRNumericTableIface * _csr;
+    CSRNumericTable * _csr;
 };
 
 //Base task class for kmeans++ and kmeans||
@@ -546,6 +554,7 @@ Status TaskPlusPlusBatch<algorithmFPType, cpu, DataHelper>::run()
         //copy it to the result
         status |= this->copyPoints(&clusters[iCluster * this->_data.dim], &this->_lastAddedCenter[this->_trialBest * this->_data.dim], 1u);
     }
+
     return status;
 }
 
diff --git a/cpp/daal/src/algorithms/optimization_solver/coordinate_descent/coordinate_descent_types_fpt.cpp b/cpp/daal/src/algorithms/optimization_solver/coordinate_descent/coordinate_descent_types_fpt.cpp
index e4d361eb40c..dff9b1aa101 100644
--- a/cpp/daal/src/algorithms/optimization_solver/coordinate_descent/coordinate_descent_types_fpt.cpp
+++ b/cpp/daal/src/algorithms/optimization_solver/coordinate_descent/coordinate_descent_types_fpt.cpp
@@ -60,10 +60,5 @@ DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * in
     }
 
-    const Parameter * algParam = static_cast<const Parameter *>(par);
-    if (!algParam->optionalResultRequired)
-    {
-        return s;
-    }
     return s;
 }
 template DAAL_EXPORT services::Status Result::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input, const daal::algorithms::Parameter * par,
diff --git a/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i b/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i
index 84f34861fb8..ce438f8b1fa 100755
--- a/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i
+++ b/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i
@@ -1,6 +1,7 @@
 /* file: qr_dense_default_pcl_impl.i */
 /*******************************************************************************
 * Copyright 2014 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -88,6 +89,8 @@ inline int * get_nblocks_array(int * size)
     return array;
 }
 /* rows/cols is greater or equal to: --------------------------------------------------------- 0   1   2   4   8  16  32  64 128 256 512  1K  2K ----------------------------------------------------*/
+
+#if defined(TARGET_X86_64)
 template <>
 inline int * get_nblocks_array<float, avx2>(int * size)
 {
@@ -116,6 +119,22 @@ inline int * get_nblocks_array<double, avx512>(int * size)
     *size              = sizeof(array) / sizeof(int) - 1;
     return array;
 }
+#elif defined(TARGET_ARM)
+template <>
+inline int * get_nblocks_array<float, sve>(int * size)
+{
+    static int array[] = { 1, 1, 1, 2, 4, 8, 16, 20, 24, 24, 20, 0 };
+    *size              = sizeof(array) / sizeof(int) - 1;
+    return array;
+}
+template <>
+inline int * get_nblocks_array<double, sve>(int * size)
+{
+    static int array[] = { 1, 1, 1, 2, 4, 8, 16, 20, 20, 24, 20, 0 };
+    *size              = sizeof(array) / sizeof(int) - 1;
+    return array;
+}
+#endif
 
 #define QR_CHECK_BREAK(cond, error) \
     if (!(cond))                    \
diff --git a/cpp/daal/src/externals/service_dispatch.h b/cpp/daal/src/externals/service_dispatch.h
index 0afb2d01921..9a6aef97e92 100644
--- a/cpp/daal/src/externals/service_dispatch.h
+++ b/cpp/daal/src/externals/service_dispatch.h
@@ -1,6 +1,7 @@
 /* file: service_dispatch.h */
 /*******************************************************************************
 * Copyright 2018 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -26,26 +27,43 @@
 
 #include "services/internal/daal_kernel_defines.h"
 
-#define DAAL_DISPATCH_FUNCTION_BY_CPU(func, ...)                                                                    \
-    switch (static_cast<daal::CpuType>(daal::services::Environment::getInstance()->getCpuId()))                     \
-    {                                                                                                               \
-        DAAL_KERNEL_SSE42_ONLY_CODE(case daal::CpuType::sse42 : func(daal::CpuType::sse42, __VA_ARGS__); break;)    \
-        DAAL_KERNEL_AVX2_ONLY_CODE(case daal::CpuType::avx2 : func(daal::CpuType::avx2, __VA_ARGS__); break;)       \
-        DAAL_KERNEL_AVX512_ONLY_CODE(case daal::CpuType::avx512 : func(daal::CpuType::avx512, __VA_ARGS__); break;) \
-        DAAL_EXPAND(default : func(daal::CpuType::sse2, __VA_ARGS__); break;)                                       \
-    }
+#if defined(TARGET_X86_64)
+    #define DAAL_DISPATCH_FUNCTION_BY_CPU(func, ...)                                                                    \
+        switch (static_cast<daal::CpuType>(daal::services::Environment::getInstance()->getCpuId()))                     \
+        {                                                                                                               \
+            DAAL_KERNEL_SSE42_ONLY_CODE(case daal::CpuType::sse42 : func(daal::CpuType::sse42, __VA_ARGS__); break;)    \
+            DAAL_KERNEL_AVX2_ONLY_CODE(case daal::CpuType::avx2 : func(daal::CpuType::avx2, __VA_ARGS__); break;)       \
+            DAAL_KERNEL_AVX512_ONLY_CODE(case daal::CpuType::avx512 : func(daal::CpuType::avx512, __VA_ARGS__); break;) \
+            DAAL_EXPAND(default : func(daal::CpuType::sse2, __VA_ARGS__); break;)                                       \
+        }
 
-#define DAAL_DISPATCH_FUNCTION_BY_CPU_SAFE(func, ...)                                                                    \
-    services::Status st;                                                                                                 \
-    int cpuid = daal::sse2;                                                                                              \
-    DAAL_SAFE_CPU_CALL((cpuid = daal::services::Environment::getInstance()->getCpuId()), (cpuid = daal::sse2))           \
-    switch (static_cast<daal::CpuType>(cpuid))                                                                           \
-    {                                                                                                                    \
-        DAAL_KERNEL_SSE42_ONLY_CODE(case daal::CpuType::sse42 : st = func(daal::CpuType::sse42, __VA_ARGS__); break;)    \
-        DAAL_KERNEL_AVX2_ONLY_CODE(case daal::CpuType::avx2 : st = func(daal::CpuType::avx2, __VA_ARGS__); break;)       \
-        DAAL_KERNEL_AVX512_ONLY_CODE(case daal::CpuType::avx512 : st = func(daal::CpuType::avx512, __VA_ARGS__); break;) \
-        DAAL_EXPAND(default : st = func(daal::CpuType::sse2, __VA_ARGS__); break;)                                       \
-    }                                                                                                                    \
-    services::throwIfPossible(st);
+    #define DAAL_DISPATCH_FUNCTION_BY_CPU_SAFE(func, ...)                                                                    \
+        services::Status st;                                                                                                 \
+        int cpuid = daal::sse2;                                                                                              \
+        DAAL_SAFE_CPU_CALL((cpuid = daal::services::Environment::getInstance()->getCpuId()), (cpuid = daal::sse2))           \
+        switch (static_cast<daal::CpuType>(cpuid))                                                                           \
+        {                                                                                                                    \
+            DAAL_KERNEL_SSE42_ONLY_CODE(case daal::CpuType::sse42 : st = func(daal::CpuType::sse42, __VA_ARGS__); break;)    \
+            DAAL_KERNEL_AVX2_ONLY_CODE(case daal::CpuType::avx2 : st = func(daal::CpuType::avx2, __VA_ARGS__); break;)       \
+            DAAL_KERNEL_AVX512_ONLY_CODE(case daal::CpuType::avx512 : st = func(daal::CpuType::avx512, __VA_ARGS__); break;) \
+            DAAL_EXPAND(default : st = func(daal::CpuType::sse2, __VA_ARGS__); break;)                                       \
+        }                                                                                                                    \
+        services::throwIfPossible(st);
+#elif defined(TARGET_ARM)
+    #define DAAL_DISPATCH_FUNCTION_BY_CPU(func, ...)                                                           \
+        switch (static_cast<daal::CpuType>(daal::services::Environment::getInstance()->getCpuId()))            \
+        {                                                                                                      \
+            default: func(daal::CpuType::sve, __VA_ARGS__); break; /* sve is the base ISA */                   \
+        }
 
+    #define DAAL_DISPATCH_FUNCTION_BY_CPU_SAFE(func, ...)                                                           \
+        services::Status st;                                                                                        \
+        int cpuid = daal::sve;                                                                                      \
+        DAAL_SAFE_CPU_CALL((cpuid = daal::services::Environment::getInstance()->getCpuId()), (cpuid = daal::sve))   \
+        switch (static_cast<daal::CpuType>(cpuid))                                                                  \
+        {                                                                                                           \
+            default: st = func(daal::CpuType::sve, __VA_ARGS__); break; /* sve is the base ISA */                   \
+        }                                                                                                           \
+        services::throwIfPossible(st);
+#endif
 #endif
diff --git a/cpp/daal/src/services/compiler/generic/env_detect_features.cpp b/cpp/daal/src/services/compiler/generic/env_detect_features.cpp
index ea61430b4a4..0f50e003f17 100644
--- a/cpp/daal/src/services/compiler/generic/env_detect_features.cpp
+++ b/cpp/daal/src/services/compiler/generic/env_detect_features.cpp
@@ -1,6 +1,7 @@
 /* file: env_detect_features.cpp */
 /*******************************************************************************
 * Copyright 2014 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -21,10 +22,15 @@
 //--
 */
 
-#include <immintrin.h>
-
 #include "services/env_detect.h"
 #include "services/daal_defines.h"
+
+#if defined(TARGET_X86_64)
+    #include <immintrin.h>
+#elif defined(TARGET_ARM)
+    #include <arm_sve.h>
+#endif
+
 #include "src/services/service_defines.h"
 #include "src/threading/threading.h"
 
@@ -41,23 +47,24 @@
 void __daal_serv_CPUHasAVX512f_enable_it_mac();
 #endif
 
+#if defined(TARGET_X86_64)
 void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t * abcd)
 {
-#if defined(_MSC_VER)
+    #if defined(_MSC_VER)
     __cpuidex((int *)abcd, eax, ecx);
-#else
+    #else
     uint32_t ebx, edx;
-    #if defined(__i386__) && defined(__PIC__)
+        #if defined(__i386__) && defined(__PIC__)
     /* in case of PIC under 32-bit EBX cannot be clobbered */
     __asm__("movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D"(ebx), "+a"(eax), "+c"(ecx), "=d"(edx));
-    #else
+        #else
     __asm__("cpuid" : "+b"(ebx), "+a"(eax), "+c"(ecx), "=d"(edx));
-    #endif
+        #endif
     abcd[0] = eax;
     abcd[1] = ebx;
     abcd[2] = ecx;
     abcd[3] = edx;
-#endif
+    #endif
 }
 
 bool __daal_internal_is_intel_cpu()
@@ -86,11 +93,11 @@ static int check_cpuid(uint32_t eax, uint32_t ecx, int abcd_index, uint32_t mask
 static int check_xgetbv_xcr0_ymm(uint32_t mask)
 {
     uint32_t xcr0;
-#if defined(_MSC_VER)
+    #if defined(_MSC_VER)
     xcr0 = (uint32_t)_xgetbv(0);
-#else
+    #else
     __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
-#endif
+    #endif
     return ((xcr0 & mask) == mask); /* checking if xmm and ymm state are enabled in XCR0 */
 }
 
@@ -187,9 +194,9 @@ DAAL_EXPORT bool __daal_serv_cpu_extensions_available()
 
 DAAL_EXPORT int __daal_serv_cpu_detect(int enable)
 {
-#if defined(__APPLE__)
+    #if defined(__APPLE__)
     __daal_serv_CPUHasAVX512f_enable_it_mac();
-#endif
+    #endif
     if (check_avx512_features() && daal_check_is_intel_cpu())
     {
         return daal::avx512;
@@ -207,3 +214,24 @@ DAAL_EXPORT int __daal_serv_cpu_detect(int enable)
 
     return daal::sse2;
 }
+#elif defined(TARGET_ARM)
+DAAL_EXPORT bool __daal_serv_cpu_extensions_available()
+{
+    return false; // the ARM build probes no CPU extensions, so none are ever reported as available
+}
+
+DAAL_EXPORT int __daal_serv_cpu_detect(int enable)
+{
+    return daal::sve; // `enable` is ignored: the ARM build always reports SVE, with no runtime probing
+}
+
+void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t * abcd)
+{
+    abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; // TODO: ARM implementation for cpuid; zeroed so callers never read indeterminate values
+}
+
+bool daal_check_is_intel_cpu()
+{
+    return false; // ARM build: unconditionally false, so DAAL_CHECK_CPU_ENVIRONMENT evaluates to false here
+}
+#endif
diff --git a/cpp/daal/src/services/env_detect.cpp b/cpp/daal/src/services/env_detect.cpp
index 36f61d7f903..f50bd6358fa 100644
--- a/cpp/daal/src/services/env_detect.cpp
+++ b/cpp/daal/src/services/env_detect.cpp
@@ -1,6 +1,7 @@
 /* file: env_detect.cpp */
 /*******************************************************************************
 * Copyright 2014 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -31,6 +32,12 @@
 #include "src/services/service_topo.h"
 #include "src/threading/service_thread_pinner.h"
 
+#if defined(TARGET_X86_64)
+    #define DAAL_HOST_CPUID daal::services::Environment::avx512
+#elif defined(TARGET_ARM)
+    #define DAAL_HOST_CPUID daal::services::Environment::sve
+#endif
+
 static daal::services::Environment::LibraryThreadingType daal_thr_set = (daal::services::Environment::LibraryThreadingType)-1;
 static bool isInit                                                    = false;
 
@@ -80,7 +87,8 @@ DAAL_EXPORT int daal::services::Environment::enableInstructionsSet(int enable)
 DAAL_EXPORT int daal::services::Environment::setCpuId(int cpuid)
 {
     initNumberOfThreads();
-    int host_cpuid = __daal_serv_cpu_detect(daal::services::Environment::avx512);
+
+    int host_cpuid = __daal_serv_cpu_detect(DAAL_HOST_CPUID);
 
     if (!_env.cpuid_init_flag)
     {
@@ -90,7 +98,7 @@ DAAL_EXPORT int daal::services::Environment::setCpuId(int cpuid)
 
             if (cpuid > host_cpuid)
             {
-                _cpu_detect(daal::services::Environment::avx512);
+                _cpu_detect(DAAL_HOST_CPUID);
             }
             else
             {
diff --git a/cpp/daal/src/services/service_defines.h b/cpp/daal/src/services/service_defines.h
index 70af3f301ff..ce1e0cd75f5 100644
--- a/cpp/daal/src/services/service_defines.h
+++ b/cpp/daal/src/services/service_defines.h
@@ -1,6 +1,7 @@
 /* file: service_defines.h */
 /*******************************************************************************
 * Copyright 2014 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -33,7 +34,11 @@ DAAL_EXPORT int __daal_serv_cpu_detect(int);
 void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t * abcd);
 bool daal_check_is_intel_cpu();
 
-#define DAAL_BASE_CPU daal::sse2
+#if defined(TARGET_X86_64)
+    #define DAAL_BASE_CPU daal::sse2
+#elif defined(TARGET_ARM)
+    #define DAAL_BASE_CPU daal::sve
+#endif
 
 #define DAAL_CHECK_CPU_ENVIRONMENT (daal_check_is_intel_cpu())
 
@@ -117,18 +122,26 @@ enum DataFormat
 } // namespace daal
 
 /* CPU comparison macro */
-#define __sse2__   (0)
-#define __sse42__  (2)
-#define __avx2__   (4)
-#define __avx512__ (6)
+#if defined(TARGET_X86_64)
+    #define __sse2__   (0)
+    #define __sse42__  (2)
+    #define __avx2__   (4)
+    #define __avx512__ (6)
+#elif defined(TARGET_ARM)
+    #define __sve__ (0)
+#endif
 
 #define __float__  (0)
 #define __double__ (1)
 
-#define CPU_sse2   __sse2__
-#define CPU_sse42  __sse42__
-#define CPU_avx2   __avx2__
-#define CPU_avx512 __avx512__
+#if defined(TARGET_X86_64)
+    #define CPU_sse2   __sse2__
+    #define CPU_sse42  __sse42__
+    #define CPU_avx2   __avx2__
+    #define CPU_avx512 __avx512__
+#elif defined(TARGET_ARM)
+    #define CPU_sve __sve__
+#endif
 
 #define FPTYPE_float  __float__
 #define FPTYPE_double __double__
diff --git a/cpp/daal/src/services/service_topo.h b/cpp/daal/src/services/service_topo.h
index 0340265f3ad..483e705af96 100644
--- a/cpp/daal/src/services/service_topo.h
+++ b/cpp/daal/src/services/service_topo.h
@@ -1,6 +1,7 @@
 /* file: service_topo.h */
 /*******************************************************************************
 * Copyright 2014 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -57,9 +58,12 @@ typedef cpuset_t cpu_set_t;
 
         #define __cdecl
 
-        #ifdef __x86_64__
+        #if defined(TARGET_X86_64)
             #define LNX_PTR2INT unsigned long long
             #define LNX_MY1CON  1LL
+        #elif defined(TARGET_ARM)
+using LNX_PTR2INT                = uintptr_t;
+constexpr LNX_PTR2INT LNX_MY1CON = 1LL;
         #else
             #define LNX_PTR2INT unsigned int
             #define LNX_MY1CON  1
diff --git a/cpp/daal/src/threading/export_lnxarm.ref.def b/cpp/daal/src/threading/export_lnxarm.ref.def
new file mode 100644
index 00000000000..58fccd3c71a
--- /dev/null
+++ b/cpp/daal/src/threading/export_lnxarm.ref.def
@@ -0,0 +1,63 @@
+;===============================================================================
+; Copyright contributors to the oneDAL project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+;===============================================================================
+
+EXPORTS
+openblas_set_num_threads
+openblas_get_num_threads
+ssyrk_
+dsyrk_
+ssyr_
+dsyr_
+sgemm_
+dgemm_
+ssymm_
+dsymm_
+sgemv_
+dgemv_
+saxpy_
+daxpy_
+sdot_
+ddot_
+sgetrf_
+dgetrf_
+sgetrs_
+dgetrs_
+spotrf_
+dpotrf_
+spotrs_
+dpotrs_
+spotri_
+dpotri_
+sgerqf_
+dgerqf_
+sormrq_
+dormrq_
+strtrs_
+dtrtrs_
+spptrf_
+dpptrf_
+sgeqrf_
+dgeqrf_
+sgeqp3_
+dgeqp3_
+sorgqr_
+dorgqr_
+sgesvd_
+dgesvd_
+ssyevd_
+dsyevd_
+sormqr_
+dormqr_
diff --git a/cpp/oneapi/dal/algo/covariance/backend/cpu/finalize_compute_kernel_dense.cpp b/cpp/oneapi/dal/algo/covariance/backend/cpu/finalize_compute_kernel_dense.cpp
index 4091a492cbc..42b8186a5fe 100644
--- a/cpp/oneapi/dal/algo/covariance/backend/cpu/finalize_compute_kernel_dense.cpp
+++ b/cpp/oneapi/dal/algo/covariance/backend/cpu/finalize_compute_kernel_dense.cpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2023 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,6 +15,8 @@
 * limitations under the License.
 *******************************************************************************/
 
+#include <daal/include/services/daal_defines.h>
+
 #include "daal/src/algorithms/covariance/covariance_kernel.h"
 
 #include "oneapi/dal/algo/covariance/backend/cpu/finalize_compute_kernel.hpp"
@@ -23,6 +26,12 @@
 
 #include "oneapi/dal/table/row_accessor.hpp"
 
+#if defined(TARGET_X86_64)
+#define CPU_EXTENSION dal::detail::cpu_extension::avx512
+#elif defined(TARGET_ARM)
+#define CPU_EXTENSION dal::detail::cpu_extension::sve
+#endif
+
 namespace oneapi::dal::covariance::backend {
 
 using dal::backend::context_cpu;
@@ -64,7 +73,7 @@ static compute_result<Task> call_daal_kernel_finalize(const context_cpu& ctx,
     /// the logic of block size calculation is copied from DAAL,
     /// to be changed to passing the values from the performance model
     std::int64_t blockSize = 140;
-    if (ctx.get_enabled_cpu_extensions() == dal::detail::cpu_extension::avx512) {
+    if (ctx.get_enabled_cpu_extensions() == CPU_EXTENSION) {
         const std::int64_t row_count = rows_count_global;
         if (5000 < row_count && row_count <= 50000) {
             blockSize = 1024;
diff --git a/cpp/oneapi/dal/algo/covariance/backend/cpu/partial_compute_kernel_dense.cpp b/cpp/oneapi/dal/algo/covariance/backend/cpu/partial_compute_kernel_dense.cpp
index 2058eeb457a..d7ec3fc3acc 100644
--- a/cpp/oneapi/dal/algo/covariance/backend/cpu/partial_compute_kernel_dense.cpp
+++ b/cpp/oneapi/dal/algo/covariance/backend/cpu/partial_compute_kernel_dense.cpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2023 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -23,6 +24,12 @@
 
 #include "oneapi/dal/table/row_accessor.hpp"
 
+#if defined(TARGET_X86_64)
+#define CPU_EXTENSION dal::detail::cpu_extension::avx512
+#elif defined(TARGET_ARM)
+#define CPU_EXTENSION dal::detail::cpu_extension::sve
+#endif
+
 namespace oneapi::dal::covariance::backend {
 
 using dal::backend::context_cpu;
@@ -53,7 +60,7 @@ static partial_compute_result<Task> call_daal_kernel_partial_compute(
     /// the logic of block size calculation is copied from DAAL,
     /// to be changed to passing the values from the performance model
     std::int64_t blockSize = 140;
-    if (ctx.get_enabled_cpu_extensions() == dal::detail::cpu_extension::avx512) {
+    if (ctx.get_enabled_cpu_extensions() == CPU_EXTENSION) {
         const std::int64_t row_count = data.get_row_count();
         if (5000 < row_count && row_count <= 50000) {
             blockSize = 1024;
diff --git a/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_dpc.cpp b/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_dpc.cpp
index c841038f172..10bf2da4501 100644
--- a/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_dpc.cpp
+++ b/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_dpc.cpp
@@ -15,77 +15,43 @@
 *******************************************************************************/
 
 #include "oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel.hpp"
-#include "oneapi/dal/algo/covariance/backend/gpu/misc.hpp"
-
+#include "oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl.hpp"
 #include "oneapi/dal/backend/primitives/lapack.hpp"
 #include "oneapi/dal/backend/primitives/reduction.hpp"
 #include "oneapi/dal/backend/primitives/stat.hpp"
 #include "oneapi/dal/backend/primitives/utils.hpp"
-
+#include "oneapi/dal/detail/policy.hpp"
+#include "oneapi/dal/detail/common.hpp"
 #include "oneapi/dal/table/row_accessor.hpp"
+#include "oneapi/dal/detail/profiler.hpp"
 
 namespace oneapi::dal::covariance::backend {
 
-namespace bk = dal::backend;
 namespace pr = oneapi::dal::backend::primitives;
-using alloc = sycl::usm::alloc;
 
-using bk::context_gpu;
+using method_t = method::dense;
 using task_t = task::compute;
 using input_t = partial_compute_result<task_t>;
 using result_t = compute_result<task_t>;
 using descriptor_t = detail::descriptor_base<task_t>;
 
-template <typename Float, typename Task>
-static compute_result<Task> finalize_compute(const context_gpu& ctx,
-                                             const descriptor_t& desc,
-                                             const partial_compute_result<Task>& input) {
-    auto& q = ctx.get_queue();
-
-    const std::int64_t column_count = input.get_partial_crossproduct().get_column_count();
-    ONEDAL_ASSERT(column_count > 0);
-
-    dal::detail::check_mul_overflow(column_count, column_count);
-
-    auto bias = desc.get_bias();
-    auto result = compute_result<task_t>{}.set_result_options(desc.get_result_options());
-
-    const auto nobs_host = pr::table2ndarray<Float>(q, input.get_partial_n_rows());
-    auto rows_count_global = nobs_host.get_data()[0];
-    ONEDAL_ASSERT(rows_count_global > 0);
-
-    const auto sums =
-        pr::table2ndarray_1d<Float>(q, input.get_partial_sum(), sycl::usm::alloc::device);
-    const auto xtx =
-        pr::table2ndarray<Float>(q, input.get_partial_crossproduct(), sycl::usm::alloc::device);
-
-    if (desc.get_result_options().test(result_options::cov_matrix)) {
-        auto [cov, cov_event] = compute_covariance(q, rows_count_global, xtx, sums, bias);
-        result.set_cov_matrix(
-            (homogen_table::wrap(cov.flatten(q, { cov_event }), column_count, column_count)));
-    }
-    if (desc.get_result_options().test(result_options::cor_matrix)) {
-        auto [corr, corr_event] = compute_correlation(q, rows_count_global, xtx, sums);
-        result.set_cor_matrix(
-            (homogen_table::wrap(corr.flatten(q, { corr_event }), column_count, column_count)));
-    }
-    if (desc.get_result_options().test(result_options::means)) {
-        auto [means, means_event] = compute_means(q, sums, rows_count_global);
-        result.set_means(homogen_table::wrap(means.flatten(q, { means_event }), 1, column_count));
-    }
-    return result;
+template <typename Float>
+static result_t finalize_compute(const bk::context_gpu& ctx,
+                                 const descriptor_t& desc,
+                                 const input_t& input) {
+    return finalize_compute_kernel_dense_impl<Float>(ctx)(desc, input);
 }
 
 template <typename Float>
-struct finalize_compute_kernel_gpu<Float, method::dense, task_t> {
-    result_t operator()(const context_gpu& ctx,
+struct finalize_compute_kernel_gpu<Float, method_t, task_t> {
+    result_t operator()(const bk::context_gpu& ctx,
                         const descriptor_t& desc,
                         const input_t& input) const {
-        return finalize_compute<Float, task_t>(ctx, desc, input);
+        return finalize_compute<Float>(ctx, desc, input);
     }
 };
 
-template struct finalize_compute_kernel_gpu<float, method::dense, task_t>;
-template struct finalize_compute_kernel_gpu<double, method::dense, task_t>;
+template struct finalize_compute_kernel_gpu<float, method_t, task_t>;
+template struct finalize_compute_kernel_gpu<double, method_t, task_t>;
 
 } // namespace oneapi::dal::covariance::backend
diff --git a/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl.hpp b/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl.hpp
new file mode 100644
index 00000000000..611ebb341b6
--- /dev/null
+++ b/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl.hpp
@@ -0,0 +1,53 @@
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#pragma once
+
+#include "oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel.hpp"
+#include "oneapi/dal/backend/primitives/utils.hpp"
+#include "oneapi/dal/util/common.hpp"
+#include "oneapi/dal/detail/policy.hpp"
+#include "oneapi/dal/backend/communicator.hpp"
+
+#ifdef ONEDAL_DATA_PARALLEL
+
+namespace oneapi::dal::covariance::backend {
+
+namespace bk = dal::backend;
+
+template <typename Float>
+class finalize_compute_kernel_dense_impl {
+    using task_t = task::compute;
+    using comm_t = bk::communicator<spmd::device_memory_access::usm>;
+    using input_t = partial_compute_result<task_t>;
+    using result_t = compute_result<task_t>;
+    using descriptor_t = detail::descriptor_base<task_t>;
+    using parameters_t = detail::compute_parameters<task_t>;
+
+public:
+    explicit finalize_compute_kernel_dense_impl(const bk::context_gpu& ctx)
+            : q(ctx.get_queue()),
+              comm_(ctx.get_communicator()) {}
+    result_t operator()(const descriptor_t& desc, const input_t& input);
+
+private:
+    sycl::queue q; // queue obtained from the GPU context at construction
+    comm_t comm_; // communicator obtained from the GPU context at construction
+};
+
+} // namespace oneapi::dal::covariance::backend
+
+#endif // ONEDAL_DATA_PARALLEL
diff --git a/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl_dpc.cpp b/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl_dpc.cpp
new file mode 100644
index 00000000000..3a198252c17
--- /dev/null
+++ b/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl_dpc.cpp
@@ -0,0 +1,112 @@
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl.hpp"
+#include "oneapi/dal/algo/covariance/backend/gpu/misc.hpp"
+
+#include "oneapi/dal/backend/common.hpp"
+#include "oneapi/dal/detail/common.hpp"
+#include "oneapi/dal/detail/policy.hpp"
+#include "oneapi/dal/detail/profiler.hpp"
+
+#include "oneapi/dal/backend/primitives/lapack.hpp"
+#include "oneapi/dal/backend/primitives/reduction.hpp"
+#include "oneapi/dal/backend/primitives/stat.hpp"
+#include "oneapi/dal/backend/primitives/utils.hpp"
+
+#include "oneapi/dal/table/row_accessor.hpp"
+
+#ifdef ONEDAL_DATA_PARALLEL
+
+namespace oneapi::dal::covariance::backend {
+
+namespace bk = dal::backend;
+namespace pr = oneapi::dal::backend::primitives;
+using alloc = sycl::usm::alloc;
+
+using bk::context_gpu;
+using task_t = task::compute;
+using input_t = partial_compute_result<task_t>;
+using result_t = compute_result<task_t>;
+using descriptor_t = detail::descriptor_base<task_t>;
+
+/// Finalizes covariance from partial results: sum-reduces the row count, the partial sums and
+/// the crossproduct over the communicator, then computes only the requested optional results.
+///
+/// @tparam Float Floating-point type used to perform computations
+///
+/// @param[in]  desc  The descriptor of the algorithm (bias flag and requested result options)
+/// @param[in]  input The partial_compute_result holding row count, partial sums and xtx matrix
+///
+/// @return The compute_result holding the requested covariance/correlation matrices and/or means.
+template <typename Float>
+result_t finalize_compute_kernel_dense_impl<Float>::operator()(const descriptor_t& desc,
+                                                               const input_t& input) {
+    const std::int64_t column_count = input.get_partial_crossproduct().get_column_count();
+    ONEDAL_ASSERT(column_count > 0);
+
+    dal::detail::check_mul_overflow(column_count, column_count);
+
+    auto bias = desc.get_bias();
+    auto result = compute_result<task_t>{}.set_result_options(desc.get_result_options());
+
+    const auto nobs_host = pr::table2ndarray<Float>(q, input.get_partial_n_rows());
+    auto rows_count_global = nobs_host.get_data()[0];
+    { // sum the local row count over the communicator before it is used as the global count
+        ONEDAL_PROFILER_TASK(allreduce_rows_count_global);
+        comm_.allreduce(rows_count_global, spmd::reduce_op::sum).wait();
+    }
+
+    ONEDAL_ASSERT(rows_count_global > 0);
+
+    const auto sums =
+        pr::table2ndarray_1d<Float>(q, input.get_partial_sum(), sycl::usm::alloc::device);
+
+    { // NOTE(review): presumably flatten() shares memory with sums, so the reduction is in place
+        ONEDAL_PROFILER_TASK(allreduce_sums, q);
+        comm_.allreduce(sums.flatten(q, {}), spmd::reduce_op::sum).wait();
+    }
+
+    const auto xtx =
+        pr::table2ndarray<Float>(q, input.get_partial_crossproduct(), sycl::usm::alloc::device);
+
+    { // NOTE(review): same presumed in-place sum reduction, here for the crossproduct matrix
+        ONEDAL_PROFILER_TASK(allreduce_xtx, q);
+        comm_.allreduce(xtx.flatten(q, {}), spmd::reduce_op::sum).wait();
+    }
+
+    if (desc.get_result_options().test(result_options::cov_matrix)) {
+        auto [cov, cov_event] = compute_covariance(q, rows_count_global, xtx, sums, bias);
+        result.set_cov_matrix(
+            (homogen_table::wrap(cov.flatten(q, { cov_event }), column_count, column_count)));
+    }
+    if (desc.get_result_options().test(result_options::cor_matrix)) {
+        auto [corr, corr_event] = compute_correlation(q, rows_count_global, xtx, sums);
+        result.set_cor_matrix(
+            (homogen_table::wrap(corr.flatten(q, { corr_event }), column_count, column_count)));
+    }
+    if (desc.get_result_options().test(result_options::means)) {
+        auto [means, means_event] = compute_means(q, sums, rows_count_global);
+        result.set_means(homogen_table::wrap(means.flatten(q, { means_event }), 1, column_count));
+    }
+    return result;
+}
+
+template class finalize_compute_kernel_dense_impl<float>;
+template class finalize_compute_kernel_dense_impl<double>;
+} // namespace oneapi::dal::covariance::backend
+
+#endif // ONEDAL_DATA_PARALLEL
diff --git a/cpp/oneapi/dal/algo/covariance/detail/finalize_compute_ops_dpc.cpp b/cpp/oneapi/dal/algo/covariance/detail/finalize_compute_ops_dpc.cpp
index 6092e5c81ff..d916608d6b3 100644
--- a/cpp/oneapi/dal/algo/covariance/detail/finalize_compute_ops_dpc.cpp
+++ b/cpp/oneapi/dal/algo/covariance/detail/finalize_compute_ops_dpc.cpp
@@ -29,14 +29,16 @@ struct finalize_compute_ops_dispatcher<Policy, Float, Method, Task> {
                                     const partial_compute_result<Task>& input) const {
         using kernel_dispatcher_t = dal::backend::kernel_dispatcher< //
             KERNEL_SINGLE_NODE_CPU(backend::finalize_compute_kernel_cpu<Float, Method, Task>),
-            KERNEL_SINGLE_NODE_GPU(backend::finalize_compute_kernel_gpu<Float, Method, Task>)>;
+            KERNEL_UNIVERSAL_SPMD_GPU(backend::finalize_compute_kernel_gpu<Float, Method, Task>)>;
         return kernel_dispatcher_t()(policy, desc, input);
     }
 };
 
-#define INSTANTIATE(F, M, T)      \
-    template struct ONEDAL_EXPORT \
-        finalize_compute_ops_dispatcher<dal::detail::data_parallel_policy, F, M, T>;
+#define INSTANTIATE(F, M, T)                                                         \
+    template struct ONEDAL_EXPORT                                                    \
+        finalize_compute_ops_dispatcher<dal::detail::data_parallel_policy, F, M, T>; \
+    template struct ONEDAL_EXPORT                                                    \
+        finalize_compute_ops_dispatcher<dal::detail::spmd_data_parallel_policy, F, M, T>;
 
 INSTANTIATE(float, method::dense, task::compute)
 INSTANTIATE(double, method::dense, task::compute)
diff --git a/cpp/oneapi/dal/algo/covariance/parameters/cpu/compute_parameters.cpp b/cpp/oneapi/dal/algo/covariance/parameters/cpu/compute_parameters.cpp
index 22ee3d8a4b0..6f2b7e59065 100644
--- a/cpp/oneapi/dal/algo/covariance/parameters/cpu/compute_parameters.cpp
+++ b/cpp/oneapi/dal/algo/covariance/parameters/cpu/compute_parameters.cpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2023 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,6 +16,7 @@
 *******************************************************************************/
 
 #include <algorithm>
+#include <daal/include/services/daal_defines.h>
 
 #include "oneapi/dal/detail/common.hpp"
 #include "oneapi/dal/detail/profiler.hpp"
@@ -27,6 +29,12 @@
 
 #include "oneapi/dal/algo/covariance/parameters/cpu/compute_parameters.hpp"
 
+#if defined(TARGET_X86_64)
+#define CPU_EXTENSION dal::detail::cpu_extension::avx512
+#elif defined(TARGET_ARM)
+#define CPU_EXTENSION dal::detail::cpu_extension::sve
+#endif
+
 namespace oneapi::dal::covariance::parameters {
 
 using dal::backend::context_cpu;
@@ -46,7 +54,7 @@ std::int64_t propose_block_size(const context_cpu& ctx, const std::int64_t row_c
     /// The constants are defined as the values that show the best performance results
     /// in the series of performance measurements with the varying block sizes and dataset sizes.
     std::int64_t block_size = 140l;
-    if (ctx.get_enabled_cpu_extensions() == dal::detail::cpu_extension::avx512) {
+    if (ctx.get_enabled_cpu_extensions() == CPU_EXTENSION) {
         /// Here if AVX512 extensions are available on CPU
         if (5000l < row_count && row_count <= 50000l) {
             block_size = 1024l;
diff --git a/cpp/oneapi/dal/algo/covariance/test/fixture.hpp b/cpp/oneapi/dal/algo/covariance/test/fixture.hpp
index 5cd74690a7f..f79e481db55 100644
--- a/cpp/oneapi/dal/algo/covariance/test/fixture.hpp
+++ b/cpp/oneapi/dal/algo/covariance/test/fixture.hpp
@@ -37,6 +37,8 @@ class covariance_test : public te::crtp_algo_fixture<TestType, Derived> {
     using Float = std::tuple_element_t<0, TestType>;
     using Method = std::tuple_element_t<1, TestType>;
     using input_t = cov::compute_input<>;
+    using partial_input_t = cov::partial_compute_input<>;
+    using partial_result_t = cov::partial_compute_result<>;
     using result_t = cov::compute_result<>;
     using descriptor_t = cov::descriptor<Float, Method>;
 
diff --git a/cpp/oneapi/dal/algo/covariance/test/online_spmd.cpp b/cpp/oneapi/dal/algo/covariance/test/online_spmd.cpp
new file mode 100644
index 00000000000..bc4cf4f8dbd
--- /dev/null
+++ b/cpp/oneapi/dal/algo/covariance/test/online_spmd.cpp
@@ -0,0 +1,131 @@
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "oneapi/dal/algo/covariance/test/fixture.hpp"
+#include "oneapi/dal/test/engine/tables.hpp"
+#include "oneapi/dal/test/engine/io.hpp"
+
+namespace oneapi::dal::covariance::test {
+
+namespace te = dal::test::engine;
+namespace la = te::linalg;
+namespace cov = oneapi::dal::covariance;
+
+template <typename TestType>
+class covariance_online_spmd_test
+        : public covariance_test<TestType, covariance_online_spmd_test<TestType>> {
+public:
+    using base_t = covariance_test<TestType, covariance_online_spmd_test<TestType>>;
+    using float_t = typename base_t::float_t;
+    using input_t = typename base_t::input_t;
+    using partial_input_t = typename base_t::partial_input_t;
+    using partial_result_t = typename base_t::partial_result_t;
+    using result_t = typename base_t::result_t;
+
+    void set_rank_count(std::int64_t rank_count) {
+        rank_count_ = rank_count;
+    }
+
+    void set_blocks_count(std::int64_t blocks_count) {
+        blocks_count_ = blocks_count;
+    }
+
+    template <typename... Args>
+    result_t finalize_compute_override(Args&&... args) {
+        return this->finalize_compute_via_spmd_threads_and_merge(rank_count_,
+                                                                 std::forward<Args>(args)...);
+    }
+
+    result_t merge_finalize_compute_result_override(const std::vector<result_t>& results) {
+        return results[0]; // NOTE(review): presumably every rank holds the same reduced result
+    }
+
+    template <typename... Args>
+    std::vector<partial_result_t> split_finalize_compute_input_override(std::int64_t split_count,
+                                                                        Args&&... args) {
+        ONEDAL_ASSERT(split_count == rank_count_);
+        const std::vector<partial_result_t> input{ std::forward<Args>(args)... };
+
+        return input;
+    }
+
+    void online_spmd_general_checks(const te::dataframe& data_fr,
+                                    cov::result_option_id compute_mode,
+                                    const te::table_id& data_table_id) {
+        CAPTURE(static_cast<std::uint64_t>(compute_mode));
+        const table data = data_fr.get_table(this->get_policy(), data_table_id);
+
+        const auto cov_desc = base_t::get_descriptor(compute_mode);
+        std::vector<partial_result_t> partial_results;
+        auto input_table = base_t::template split_table_by_rows<double>(data, rank_count_);
+        for (std::int64_t i = 0; i < rank_count_; i++) {
+            dal::covariance::partial_compute_result<> partial_result;
+            auto input_table_blocks =
+                base_t::template split_table_by_rows<double>(input_table[i], blocks_count_);
+            for (std::int64_t j = 0; j < blocks_count_; j++) {
+                partial_result =
+                    this->partial_compute(cov_desc, partial_result, input_table_blocks[j]);
+            }
+            partial_results.push_back(partial_result);
+        }
+        const auto compute_result = this->finalize_compute_override(cov_desc, partial_results);
+
+        base_t::check_compute_result(cov_desc, data, compute_result);
+    }
+
+private:
+    std::int64_t rank_count_ = 1; // number of row-wise parts, one partial result per part
+    std::int64_t blocks_count_ = 1; // number of row blocks fed to partial_compute per part
+};
+
+using covariance_types = COMBINE_TYPES((float, double), (covariance::method::dense));
+
+TEMPLATE_LIST_TEST_M(covariance_online_spmd_test,
+                     "covariance common flow",
+                     "[covariance][integration][spmd]",
+                     covariance_types) {
+    SKIP_IF(this->get_policy().is_cpu());
+    SKIP_IF(this->not_float64_friendly());
+
+    const te::dataframe data =
+        GENERATE_DATAFRAME(te::dataframe_builder{ 1000, 100 }.fill_normal(-30, 30, 7777),
+                           te::dataframe_builder{ 2000, 20 }.fill_normal(0, 1, 7777),
+                           te::dataframe_builder{ 2500, 20 }.fill_normal(-30, 30, 7777));
+    this->set_rank_count(GENERATE(1, 2, 4));
+    this->set_blocks_count(GENERATE(1, 3, 10));
+    cov::result_option_id mode_mean = result_options::means;
+    cov::result_option_id mode_cov = result_options::cov_matrix;
+    cov::result_option_id mode_cor = result_options::cor_matrix;
+    cov::result_option_id mode_cov_mean = result_options::cov_matrix | result_options::means;
+    cov::result_option_id mode_cov_cor = result_options::cov_matrix | result_options::cor_matrix;
+    cov::result_option_id mode_cor_mean = result_options::cor_matrix | result_options::means;
+    cov::result_option_id res_all =
+        result_options::cov_matrix | result_options::cor_matrix | result_options::means;
+
+    const cov::result_option_id compute_mode = GENERATE_COPY(mode_mean,
+                                                             mode_cor,
+                                                             mode_cov,
+                                                             mode_cor_mean,
+                                                             mode_cov_mean,
+                                                             mode_cov_cor,
+                                                             res_all);
+
+    const auto data_table_id = this->get_homogen_table_id();
+
+    this->online_spmd_general_checks(data, compute_mode, data_table_id);
+}
+
+} // namespace oneapi::dal::covariance::test
diff --git a/cpp/oneapi/dal/algo/covariance/test/spmd.cpp b/cpp/oneapi/dal/algo/covariance/test/spmd.cpp
index 568fd9cfaf0..ac50be44d2a 100644
--- a/cpp/oneapi/dal/algo/covariance/test/spmd.cpp
+++ b/cpp/oneapi/dal/algo/covariance/test/spmd.cpp
@@ -71,7 +71,7 @@ class covariance_spmd_test : public covariance_test<TestType, covariance_spmd_te
 
         const auto cov_desc = base_t::get_descriptor(compute_mode);
 
-        const auto compute_result = this->compute(cov_desc, data);
+        const auto compute_result = this->compute_override(cov_desc, data);
 
         base_t::check_compute_result(cov_desc, data, compute_result);
     }
diff --git a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel.hpp b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel.hpp
index 56ea3021929..b2269d4cdd9 100644
--- a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel.hpp
+++ b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel.hpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2020 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -107,10 +108,11 @@ vertex_similarity_result<task::all_vertex_pairs> jaccard(
     return res;
 }
 
+#if defined(TARGET_X86_64)
 template <>
 vertex_similarity_result<task::all_vertex_pairs> jaccard<dal::backend::cpu_dispatch_avx512>(
     const detail::descriptor_base<task::all_vertex_pairs> &desc,
     const dal::preview::detail::topology<std::int32_t> &t,
     void *result_ptr);
-
+#endif
 } // namespace oneapi::dal::preview::jaccard::backend
diff --git a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_avx512.hpp b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_avx512.hpp
index ca350bec92a..7e5a2dda834 100644
--- a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_avx512.hpp
+++ b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_avx512.hpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2020 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,8 +17,6 @@
 
 #pragma once
 
-#include <immintrin.h>
-
 #include <daal/src/services/service_defines.h>
 
 #include "oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel.hpp"
diff --git a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_cpu.cpp b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_cpu.cpp
index f9c81a76cbc..8bb76b2ddbe 100644
--- a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_cpu.cpp
+++ b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_cpu.cpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2020 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_skx.cpp b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_skx.cpp
index 15c5e3976cc..eef60006c5f 100644
--- a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_skx.cpp
+++ b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_skx.cpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2020 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,7 +15,9 @@
 * limitations under the License.
 *******************************************************************************/
 
+#if defined(TARGET_X86_64)
 #include <immintrin.h>
+#endif
 
 #include "oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel.hpp"
 #include "oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_avx512.hpp"
@@ -27,6 +30,7 @@
 
 namespace oneapi::dal::preview::jaccard::backend {
 
+#if defined(TARGET_X86_64)
 template vertex_similarity_result<task::all_vertex_pairs> jaccard_avx512<
     dal::backend::cpu_dispatch_avx512>(const detail::descriptor_base<task::all_vertex_pairs>& desc,
                                        const dal::preview::detail::topology<std::int32_t>& t,
@@ -39,5 +43,6 @@ vertex_similarity_result<task::all_vertex_pairs> jaccard<dal::backend::cpu_dispa
     void* result_ptr) {
     return jaccard_avx512<dal::backend::cpu_dispatch_avx512>(desc, t, result_ptr);
 }
+#endif
 
 } // namespace oneapi::dal::preview::jaccard::backend
diff --git a/cpp/oneapi/dal/algo/kmeans/backend/cpu/infer_kernel.cpp b/cpp/oneapi/dal/algo/kmeans/backend/cpu/infer_kernel.cpp
index dd2d31f0277..17f44c6483a 100644
--- a/cpp/oneapi/dal/algo/kmeans/backend/cpu/infer_kernel.cpp
+++ b/cpp/oneapi/dal/algo/kmeans/backend/cpu/infer_kernel.cpp
@@ -33,9 +33,21 @@ using descriptor_t = detail::descriptor_base<task::clustering>;
 namespace daal_kmeans = daal::algorithms::kmeans;
 namespace interop = dal::backend::interop;
 
-template <typename Float, daal::CpuType Cpu>
-using daal_kmeans_lloyd_dense_kernel_t =
-    daal_kmeans::internal::KMeansBatchKernel<daal_kmeans::lloydDense, Float, Cpu>;
+template <daal_kmeans::Method Value>
+using daal_method_constant = std::integral_constant<daal_kmeans::Method, Value>;
+
+template <typename Method>
+struct to_daal_method;
+
+template <>
+struct to_daal_method<method::lloyd_dense> : daal_method_constant<daal_kmeans::lloydDense> {};
+
+template <>
+struct to_daal_method<method::lloyd_csr> : daal_method_constant<daal_kmeans::lloydCSR> {};
+
+template <typename Float, daal::CpuType Cpu, typename Method>
+using batch_kernel_t =
+    daal_kmeans::internal::KMeansBatchKernel<to_daal_method<Method>::value, Float, Cpu>;
 
 inline auto get_daal_parameter_to_infer(const descriptor_t& desc) {
     const std::int64_t max_iteration_count = 0;
@@ -55,11 +67,11 @@ inline auto get_daal_parameter_to_infer(const descriptor_t& desc) {
     return parameter;
 }
 
-template <typename Float, typename Task>
+template <typename Float, typename Task, typename Method, typename Table>
 static infer_result<Task> call_daal_kernel(const context_cpu& ctx,
                                            const descriptor_t& desc,
                                            const model<Task>& trained_model,
-                                           const table& data) {
+                                           const Table& data) {
     const std::int64_t row_count = data.get_row_count();
 
     auto result = infer_result<Task>{}.set_result_options(desc.get_result_options());
@@ -84,11 +96,13 @@ static infer_result<Task> call_daal_kernel(const context_cpu& ctx,
                                                        daal_objective_function_value.get(),
                                                        nullptr };
 
-    interop::status_to_exception(
-        interop::call_daal_kernel<Float, daal_kmeans_lloyd_dense_kernel_t>(ctx,
-                                                                           input,
-                                                                           output,
-                                                                           &par));
+    interop::status_to_exception(dal::backend::dispatch_by_cpu(ctx, [&](auto cpu) {
+        return batch_kernel_t<Float,
+                              oneapi::dal::backend::interop::to_daal_cpu_type<decltype(cpu)>::value,
+                              Method>()
+            .compute(input, output, &par);
+    }));
+
     if (desc.get_result_options().test(result_options::compute_assignments)) {
         result.set_responses(
             dal::detail::homogen_table_builder{}.reset(arr_responses, row_count, 1).build());
@@ -101,23 +115,33 @@ static infer_result<Task> call_daal_kernel(const context_cpu& ctx,
     return result;
 }
 
-template <typename Float, typename Task>
+/// Picks the table type that matches `Method` (`csr_table` for `method::lloyd_csr`,
+/// `table` otherwise), converts the input data to it and forwards to `call_daal_kernel`.
+template <typename Float, typename Task, typename Method>
 static infer_result<Task> infer(const context_cpu& ctx,
                                 const descriptor_t& desc,
                                 const infer_input<Task>& input) {
-    return call_daal_kernel<Float, Task>(ctx, desc, input.get_model(), input.get_data());
+    using table_type =
+        std::conditional_t<std::is_same_v<Method, method::lloyd_csr>, csr_table, table>;
+    const auto data = static_cast<table_type>(input.get_data());
+    return call_daal_kernel<Float, Task, Method>(ctx, desc, input.get_model(), data);
 }
 
-template <typename Float>
-struct infer_kernel_cpu<Float, method::by_default, task::clustering> {
+/// CPU inference kernel for the clustering task. Generic over `Method`; the call
+/// operator simply delegates to `infer` with the same arguments.
+template <typename Float, typename Method>
+struct infer_kernel_cpu<Float, Method, task::clustering> {
     infer_result<task::clustering> operator()(const context_cpu& ctx,
                                               const descriptor_t& desc,
                                               const infer_input<task::clustering>& input) const {
-        return infer<Float, task::clustering>(ctx, desc, input);
+        return infer<Float, task::clustering, Method>(ctx, desc, input);
     }
 };
 
-template struct infer_kernel_cpu<float, method::by_default, task::clustering>;
-template struct infer_kernel_cpu<double, method::by_default, task::clustering>;
+// Explicit instantiations: float and double for both the CSR and the dense method.
+template struct infer_kernel_cpu<float, method::lloyd_csr, task::clustering>;
+template struct infer_kernel_cpu<double, method::lloyd_csr, task::clustering>;
+template struct infer_kernel_cpu<float, method::lloyd_dense, task::clustering>;
+template struct infer_kernel_cpu<double, method::lloyd_dense, task::clustering>;
 
 } // namespace oneapi::dal::kmeans::backend
diff --git a/cpp/oneapi/dal/algo/kmeans/backend/cpu/train_kernel_lloyd_dense.cpp b/cpp/oneapi/dal/algo/kmeans/backend/cpu/train_kernel_lloyd.cpp
similarity index 68%
rename from cpp/oneapi/dal/algo/kmeans/backend/cpu/train_kernel_lloyd_dense.cpp
rename to cpp/oneapi/dal/algo/kmeans/backend/cpu/train_kernel_lloyd.cpp
index e3404e8b47d..f32dcaae63a 100644
--- a/cpp/oneapi/dal/algo/kmeans/backend/cpu/train_kernel_lloyd_dense.cpp
+++ b/cpp/oneapi/dal/algo/kmeans/backend/cpu/train_kernel_lloyd.cpp
@@ -18,6 +18,7 @@
 #include <daal/src/algorithms/kmeans/kmeans_lloyd_kernel.h>
 
 #include "oneapi/dal/algo/kmeans/backend/cpu/train_kernel.hpp"
+#include "oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp"
 #include "oneapi/dal/backend/interop/common.hpp"
 #include "oneapi/dal/backend/interop/error_converter.hpp"
 #include "oneapi/dal/backend/interop/table_conversion.hpp"
@@ -32,50 +33,34 @@ using dal::backend::context_cpu;
 using descriptor_t = detail::descriptor_base<task::clustering>;
 
 namespace daal_kmeans = daal::algorithms::kmeans;
-namespace daal_kmeans_init = daal::algorithms::kmeans::init;
 namespace interop = dal::backend::interop;
 
-template <typename Float, daal::CpuType Cpu>
-using daal_kmeans_lloyd_dense_kernel_t =
-    daal_kmeans::internal::KMeansBatchKernel<daal_kmeans::lloydDense, Float, Cpu>;
+template <daal_kmeans::Method Value>
+using daal_method_constant = std::integral_constant<daal_kmeans::Method, Value>;
 
-template <typename Float, daal::CpuType Cpu>
-using daal_kmeans_init_plus_plus_dense_kernel_t =
-    daal_kmeans_init::internal::KMeansInitKernel<daal_kmeans_init::plusPlusDense, Float, Cpu>;
+template <typename Method>
+struct to_daal_method;
 
-template <typename Float>
+template <>
+struct to_daal_method<method::lloyd_dense> : daal_method_constant<daal_kmeans::lloydDense> {};
+
+template <>
+struct to_daal_method<method::lloyd_csr> : daal_method_constant<daal_kmeans::lloydCSR> {};
+
+template <typename Float, daal::CpuType Cpu, typename Method>
+using batch_kernel_t =
+    daal_kmeans::internal::KMeansBatchKernel<to_daal_method<Method>::value, Float, Cpu>;
+
+template <typename Float, typename Method, typename Table>
 static daal::data_management::NumericTablePtr get_initial_centroids(
     const context_cpu& ctx,
     const descriptor_t& desc,
-    const table& data,
+    const Table& data,
     const table& initial_centroids) {
-    const std::int64_t column_count = data.get_column_count();
-    const std::int64_t cluster_count = desc.get_cluster_count();
-
     daal::data_management::NumericTablePtr daal_initial_centroids;
     if (!initial_centroids.has_data()) {
-        const auto daal_data = interop::convert_to_daal_table<Float>(data);
-        daal_kmeans_init::Parameter par(dal::detail::integral_cast<std::size_t>(cluster_count));
-
-        const std::size_t init_len_input = 1;
-        daal::data_management::NumericTable* init_input[init_len_input] = { daal_data.get() };
-
         daal_initial_centroids =
-            interop::allocate_daal_homogen_table<Float>(cluster_count, column_count);
-        const std::size_t init_len_output = 1;
-        daal::data_management::NumericTable* init_output[init_len_output] = {
-            daal_initial_centroids.get()
-        };
-
-        interop::status_to_exception(
-            interop::call_daal_kernel<Float, daal_kmeans_init_plus_plus_dense_kernel_t>(
-                ctx,
-                init_len_input,
-                init_input,
-                init_len_output,
-                init_output,
-                &par,
-                *(par.engine)));
+            oneapi::dal::kmeans::detail::daal_generate_centroids<Float, Method>(desc, data);
     }
     else {
         daal_initial_centroids = interop::convert_to_daal_table<Float>(initial_centroids);
@@ -96,10 +81,10 @@ inline auto get_daal_parameter_to_train(const descriptor_t& desc) {
     return par;
 }
 
-template <typename Float, typename Task>
+template <typename Float, typename Task, typename Method, typename Table>
 static train_result<Task> call_daal_kernel(const context_cpu& ctx,
                                            const descriptor_t& desc,
-                                           const table& data,
+                                           const Table& data,
                                            const table& initial_centroids) {
     const std::int64_t row_count = data.get_row_count();
     const std::int64_t column_count = data.get_column_count();
@@ -107,7 +92,8 @@ static train_result<Task> call_daal_kernel(const context_cpu& ctx,
 
     auto par = get_daal_parameter_to_train(desc);
 
-    auto daal_initial_centroids = get_initial_centroids<Float>(ctx, desc, data, initial_centroids);
+    auto daal_initial_centroids =
+        get_initial_centroids<Float, Method>(ctx, desc, data, initial_centroids);
 
     const auto daal_data = interop::convert_to_daal_table<Float>(data);
     auto result = train_result<Task>{};
@@ -127,7 +113,6 @@ static train_result<Task> call_daal_kernel(const context_cpu& ctx,
 
     array<int> arr_responses = array<int>::empty(row_count);
     array<Float> arr_objective_function_value = array<Float>::empty(1);
-
     const auto daal_responses = interop::convert_to_daal_homogen_table(arr_responses, row_count, 1);
     const auto daal_objective_function_value =
         interop::convert_to_daal_homogen_table(arr_objective_function_value, 1, 1);
@@ -136,11 +121,13 @@ static train_result<Task> call_daal_kernel(const context_cpu& ctx,
                                                        daal_responses.get(),
                                                        daal_objective_function_value.get(),
                                                        daal_iteration_count.get() };
-    interop::status_to_exception(
-        interop::call_daal_kernel<Float, daal_kmeans_lloyd_dense_kernel_t>(ctx,
-                                                                           input,
-                                                                           output,
-                                                                           &par));
+
+    interop::status_to_exception(dal::backend::dispatch_by_cpu(ctx, [&](auto cpu) {
+        return batch_kernel_t<Float,
+                              oneapi::dal::backend::interop::to_daal_cpu_type<decltype(cpu)>::value,
+                              Method>()
+            .compute(input, output, &par);
+    }));
 
     result.set_objective_function_value(static_cast<double>(arr_objective_function_value[0]));
 
@@ -153,30 +140,36 @@ static train_result<Task> call_daal_kernel(const context_cpu& ctx,
         model<Task>().set_centroids(dal::detail::homogen_table_builder{}
                                         .reset(arr_centroids, cluster_count, column_count)
                                         .build()));
-
     return result;
 }
 
-template <typename Float, typename Task>
+/// Picks the table type that matches `Method` (`csr_table` for `method::lloyd_csr`,
+/// `table` otherwise), converts the input data to it and forwards to `call_daal_kernel`.
+template <typename Float, typename Task, typename Method>
 static train_result<Task> train(const context_cpu& ctx,
                                 const descriptor_t& desc,
                                 const train_input<Task>& input) {
-    return call_daal_kernel<Float, Task>(ctx,
-                                         desc,
-                                         input.get_data(),
-                                         input.get_initial_centroids());
+    using table_type =
+        std::conditional_t<std::is_same_v<Method, method::lloyd_csr>, csr_table, table>;
+    const auto data = static_cast<table_type>(input.get_data());
+    return call_daal_kernel<Float, Task, Method>(ctx, desc, data, input.get_initial_centroids());
 }
 
-template <typename Float>
-struct train_kernel_cpu<Float, method::lloyd_dense, task::clustering> {
+/// CPU training kernel for the clustering task. Generic over `Method`; the call
+/// operator simply delegates to `train` with the same arguments.
+template <typename Float, typename Method>
+struct train_kernel_cpu<Float, Method, task::clustering> {
     train_result<task::clustering> operator()(const context_cpu& ctx,
                                               const descriptor_t& desc,
                                               const train_input<task::clustering>& input) const {
-        return train<Float, task::clustering>(ctx, desc, input);
+        return train<Float, task::clustering, Method>(ctx, desc, input);
     }
 };
 
+// Explicit instantiations: float and double for both the dense and the CSR method.
 template struct train_kernel_cpu<float, method::lloyd_dense, task::clustering>;
 template struct train_kernel_cpu<double, method::lloyd_dense, task::clustering>;
+template struct train_kernel_cpu<float, method::lloyd_csr, task::clustering>;
+template struct train_kernel_cpu<double, method::lloyd_csr, task::clustering>;
 
 } // namespace oneapi::dal::kmeans::backend
diff --git a/cpp/oneapi/dal/algo/kmeans/backend/gpu/infer_kernel_dpc.cpp b/cpp/oneapi/dal/algo/kmeans/backend/gpu/infer_kernel_dpc.cpp
index 99c44b93b31..2eefffd1cd7 100644
--- a/cpp/oneapi/dal/algo/kmeans/backend/gpu/infer_kernel_dpc.cpp
+++ b/cpp/oneapi/dal/algo/kmeans/backend/gpu/infer_kernel_dpc.cpp
@@ -19,7 +19,9 @@
 #include "oneapi/dal/backend/primitives/utils.hpp"
 #include "oneapi/dal/algo/kmeans/backend/gpu/kernels_integral.hpp"
 #include "oneapi/dal/algo/kmeans/backend/gpu/kernels_fp.hpp"
+#include "oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp"
 #include "oneapi/dal/table/row_accessor.hpp"
+#include "oneapi/dal/table/csr_accessor.hpp"
 
 #include "oneapi/dal/detail/profiler.hpp"
 
@@ -105,7 +107,93 @@ struct infer_kernel_gpu<Float, method::lloyd_dense, task::clustering> {
     }
 };
 
-template struct infer_kernel_gpu<float, method::by_default, task::clustering>;
-template struct infer_kernel_gpu<double, method::by_default, task::clustering>;
+/// K-Means inference on GPU for data stored in a CSR table (`method::lloyd_csr`).
+///
+/// Pulls the CSR arrays to device memory, computes the squares needed for the
+/// distance evaluation, assigns every row to a centroid and reduces the objective
+/// function value over all ranks. The result carries the objective function value
+/// and one response per input row.
+template <typename Float>
+struct infer_kernel_gpu<Float, method::lloyd_csr, task::clustering> {
+    infer_result<task::clustering> operator()(const dal::backend::context_gpu& ctx,
+                                              const descriptor_t& desc,
+                                              const infer_input<task::clustering>& input) const {
+        auto& queue = ctx.get_queue();
+        auto& comm = ctx.get_communicator();
+        ONEDAL_ASSERT(input.get_data().get_kind() == dal::csr_table::kind());
+        // Convert through csr_table's constructor rather than downcasting a reference
+        // to an object that is only known to be a base `table`.
+        const auto data = static_cast<csr_table>(input.get_data());
+        const std::int64_t row_count = data.get_row_count();
+        const std::int64_t column_count = data.get_column_count();
+        const std::int64_t cluster_count = desc.get_cluster_count();
+        dal::detail::check_mul_overflow(cluster_count, column_count);
+        // The dense distance matrix allocated below has row_count x cluster_count elements.
+        dal::detail::check_mul_overflow(row_count, cluster_count);
+
+        auto [arr_val, arr_col, arr_row] =
+            csr_accessor<const Float>(data).pull(queue,
+                                                 { 0, -1 },
+                                                 sparse_indexing::zero_based,
+                                                 sycl::usm::alloc::device);
+        auto values = pr::ndarray<Float, 1>::wrap(arr_val.get_data(), arr_val.get_count());
+        auto column_indices =
+            pr::ndarray<std::int64_t, 1>::wrap(arr_col.get_data(), arr_col.get_count());
+        auto row_offsets =
+            pr::ndarray<std::int64_t, 1>::wrap(arr_row.get_data(), arr_row.get_count());
+        auto arr_centroid_squares =
+            pr::ndarray<Float, 1>::empty(queue, cluster_count, sycl::usm::alloc::device);
+        auto arr_data_squares =
+            pr::ndarray<Float, 1>::empty(queue, row_count, sycl::usm::alloc::device);
+        auto data_squares_event =
+            compute_data_squares(queue, values, column_indices, row_offsets, arr_data_squares);
+
+        auto distances = pr::ndarray<Float, 2>::empty(queue,
+                                                      { row_count, cluster_count },
+                                                      sycl::usm::alloc::device);
+
+        auto arr_closest_distances =
+            pr::ndarray<Float, 2>::empty(queue, { row_count, 1 }, sycl::usm::alloc::device);
+        auto arr_centroids = pr::table2ndarray<Float>(queue,
+                                                      input.get_model().get_centroids(),
+                                                      sycl::usm::alloc::device);
+        auto arr_responses =
+            pr::ndarray<std::int32_t, 2>::empty(queue, { row_count, 1 }, sycl::usm::alloc::device);
+
+        auto centroid_squares_event = kernels_fp<Float>::compute_squares(queue,
+                                                                         arr_centroids,
+                                                                         arr_centroid_squares,
+                                                                         { data_squares_event });
+        auto assign_event = assign_clusters(queue,
+                                            values,
+                                            column_indices,
+                                            row_offsets,
+                                            arr_data_squares,
+                                            arr_centroids,
+                                            arr_centroid_squares,
+                                            distances,
+                                            arr_responses,
+                                            arr_closest_distances,
+                                            { data_squares_event, centroid_squares_event });
+        auto objective_function =
+            calc_objective_function(queue, arr_closest_distances, { assign_event });
+        {
+            // Reduce objective function value over all ranks
+            comm.allreduce(objective_function).wait();
+        }
+        auto result = infer_result<task::clustering>{};
+        result.set_objective_function_value(objective_function);
+
+        result.set_responses(
+            dal::homogen_table::wrap(arr_responses.flatten(queue, { assign_event }), row_count, 1));
+
+        return result;
+    }
+};
+
+template struct infer_kernel_gpu<float, method::lloyd_csr, task::clustering>;
+template struct infer_kernel_gpu<double, method::lloyd_csr, task::clustering>;
+template struct infer_kernel_gpu<float, method::lloyd_dense, task::clustering>;
+template struct infer_kernel_gpu<double, method::lloyd_dense, task::clustering>;
 
 } // namespace oneapi::dal::kmeans::backend
diff --git a/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp b/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp
new file mode 100644
index 00000000000..478bf9de85d
--- /dev/null
+++ b/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp
@@ -0,0 +1,412 @@
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "oneapi/dal/backend/primitives/reduction.hpp"
+#include "oneapi/dal/backend/interop/common_dpc.hpp"
+#include "oneapi/dal/backend/interop/error_converter.hpp"
+#include "oneapi/dal/backend/interop/table_conversion.hpp"
+#include "oneapi/dal/backend/primitives/ndarray.hpp"
+#include "oneapi/dal/backend/atomic.hpp"
+
+namespace oneapi::dal::kmeans::backend {
+
+using dal::backend::context_gpu;
+using descriptor_t = detail::descriptor_base<task::clustering>;
+using event_vector = std::vector<sycl::event>;
+
+template <typename Data>
+using local_accessor_rw_t = sycl::local_accessor<Data, 1>;
+
+namespace interop = dal::backend::interop;
+namespace pr = dal::backend::primitives;
+namespace de = dal::detail;
+namespace bk = dal::backend;
+
+template <typename Float>
+sycl::event compute_data_squares(sycl::queue& q, // fills `squares` via a row-wise CSR reduction
+                                 const pr::ndview<Float, 1>& values,
+                                 const pr::ndview<std::int64_t, 1>& column_indices,
+                                 const pr::ndview<std::int64_t, 1>& row_offsets,
+                                 pr::ndview<Float, 1>& squares) { // receives the reduction result
+    ONEDAL_PROFILER_TASK(compute_data_squares, q);
+    return pr::reduce_by_rows(q, // reduction with square as unary op and sum as binary op
+                              values,
+                              column_indices,
+                              row_offsets,
+                              sparse_indexing::zero_based, // indices are passed as zero-based
+                              squares,
+                              pr::sum<Float>{},
+                              pr::square<Float>{});
+}
+
+// Temporary CSR x dense product: c = alpha * A * b^T + beta * c (c is not read when beta == 0).
+// TODO: replace this call with spgemm call; need to add dimensions integer overflow checks
+template <typename Float>
+sycl::event custom_spgemm(sycl::queue& q,
+                          const pr::ndview<Float, 1>& values,
+                          const pr::ndview<std::int64_t, 1>& column_indices,
+                          const pr::ndview<std::int64_t, 1>& row_offsets,
+                          const pr::ndview<Float, 2>& b,
+                          pr::ndview<Float, 2>& c,
+                          const Float alpha,
+                          const Float beta,
+                          const event_vector& deps = {}) {
+    ONEDAL_PROFILER_TASK(custom_spgemm, q);
+    const size_t a_row_count = row_offsets.get_count() - 1; // rows of the CSR matrix A
+    const size_t reduce_dim = b.get_dimension(1); // row length of b, indexed by CSR column ids
+    const size_t b_row_count = b.get_dimension(0); // rows of b == row length of c
+
+    const auto local_size =
+        std::min<std::int32_t>(bk::device_max_wg_size(q), bk::down_pow2(reduce_dim));
+    auto res_ptr = c.get_mutable_data();
+    const auto a_ptr = values.get_data();
+    const auto row_ofs = row_offsets.get_data();
+    const auto col_ind = column_indices.get_data();
+    const auto b_ptr = b.get_data();
+
+    // Compute matrix block by block to avoid integer overflow
+    const std::int64_t row_block = 8 * bk::device_max_wg_size(q);
+    const std::int64_t row_block_size = std::min<std::int64_t>(row_block, a_row_count);
+    const std::int64_t col_block_size = std::min<std::int64_t>(row_block, b_row_count);
+
+    const auto nd_range =
+        bk::make_multiple_nd_range_3d({ row_block_size, col_block_size, local_size },
+                                      { 1, 1, local_size });
+
+    return q.submit([&](sycl::handler& cgh) {
+        cgh.depends_on(deps);
+        cgh.parallel_for(nd_range, [=](auto item) {
+            const auto row_shift = item.get_global_id(0);
+            const auto col_shift = item.get_global_id(1);
+            const auto local_id = item.get_local_id(2);
+
+            for (auto row_idx = row_shift; row_idx < a_row_count; row_idx += row_block) {
+                for (auto col_idx = col_shift; col_idx < b_row_count; col_idx += row_block) {
+                    const auto start = row_ofs[row_idx] + local_id;
+                    const auto end = row_ofs[row_idx + 1];
+                    Float acc = Float(0);
+                    for (std::int64_t data_idx = start; data_idx < end; data_idx += local_size) {
+                        const auto reduce_id = col_ind[data_idx];
+                        acc += a_ptr[data_idx] * b_ptr[col_idx * reduce_dim + reduce_id];
+                    }
+                    const Float scalar_mul =
+                        sycl::reduce_over_group(item.get_group(),
+                                                acc,
+                                                Float(0),
+                                                sycl::ext::oneapi::plus<Float>());
+                    if (local_id == 0) { // beta == 0 must not read c: it may hold NaN/Inf garbage
+                        auto& res = res_ptr[row_idx * b_row_count + col_idx];
+                        res = (beta == Float(0) ? Float(0) : beta * res) + alpha * scalar_mul;
+                    }
+                }
+            }
+        });
+    });
+}
+
+/// Calculates distances from each data point to each centroid and selects the closest centroid to each data point
+/// @param[in] q                    A sycl-queue to perform operations on device
+/// @param[in] values               A data part of csr table with :expr:`non_zero_count x 1` dimensions
+/// @param[in] column_indices       An array of column indices in csr table with :expr:`non_zero_count x 1` dimensions
+/// @param[in] row_offsets          An array of row offsets in csr table with :expr:`(row_count + 1) x 1` dimensions
+/// @param[in] data_squares         An array of data squared elementwise with :expr:`row_count x 1` dimensions
+/// @param[in] centroids            An array of centroids with :expr:`cluster_count x column_count` dimensions
+/// @param[in] centroid_squares     An array of centroids squares with :expr:`cluster_count x 1` dimensions
+/// @param[out] distances           An array of distances of dataset to each cluster with :expr:`row_count x cluster_count` dimensions
+/// @param[out] responses           An array of responses with :expr:`row_count x 1` dimensions
+/// @param[out] closest_dists       An array of closest distances for each data point with :expr:`row_count x 1` dimensions
+/// @param[in] deps                 An event vector of dependencies for specified kernel
+template <typename Float>
+sycl::event assign_clusters(sycl::queue& q,
+                            const pr::ndview<Float, 1>& values,
+                            const pr::ndview<std::int64_t, 1>& column_indices,
+                            const pr::ndview<std::int64_t, 1>& row_offsets,
+                            const pr::ndview<Float, 1>& data_squares,
+                            const pr::ndview<Float, 2>& centroids,
+                            const pr::ndview<Float, 1>& centroid_squares,
+                            pr::ndview<Float, 2>& distances,
+                            pr::ndview<std::int32_t, 2>& responses,
+                            pr::ndview<Float, 2>& closest_dists,
+                            const event_vector& deps = {}) {
+    ONEDAL_PROFILER_TASK(assign_clusters, q);
+    auto data_squares_ptr = data_squares.get_data();
+    auto cent_squares_ptr = centroid_squares.get_data();
+    auto responses_ptr = responses.get_mutable_data();
+    auto closest_dists_ptr = closest_dists.get_mutable_data();
+    // Compute the cross term -2 * <data row, centroid> of every distance into `distances`
+    auto dist_event = custom_spgemm(q,
+                                    values,
+                                    column_indices,
+                                    row_offsets,
+                                    centroids,
+                                    distances,
+                                    Float(-2.0),
+                                    Float(0),
+                                    deps);
+
+    const auto distances_ptr = distances.get_data();
+
+    const auto cluster_count = centroids.get_dimension(0);
+    const auto row_count = static_cast<size_t>(row_offsets.get_count() - 1);
+    // based on benchmarks an optimal block size is equal to 8 work-group sizes
+    const std::int64_t block_multiplier = 8;
+    const std::int64_t row_block = block_multiplier * bk::device_max_wg_size(q);
+
+    const auto local_size =
+        std::min<std::int64_t>(bk::device_max_wg_size(q), bk::down_pow2(cluster_count));
+    const auto nd_range =
+        bk::make_multiple_nd_range_2d({ row_block, local_size }, { 1, local_size });
+
+    auto event = q.submit([&](sycl::handler& cgh) {
+        cgh.depends_on({ dist_event });
+        cgh.depends_on(deps);
+        cgh.parallel_for(nd_range, [=](auto item) {
+            const auto row_shift = item.get_global_id(0);
+            const auto local_id = item.get_local_id(1);
+            const auto max_val = std::numeric_limits<Float>::max();
+            const auto max_index = std::numeric_limits<std::int32_t>::max();
+            for (auto row_idx = row_shift; row_idx < row_count; row_idx += row_block) {
+                auto min_dist = max_val;
+                auto min_idx = max_index;
+                auto row_dists = distances_ptr + row_idx * cluster_count;
+                for (std::int32_t cluster_id = local_id; cluster_id < cluster_count;
+                     cluster_id += local_size) {
+                    const auto dist = cent_squares_ptr[cluster_id] + row_dists[cluster_id] +
+                                      data_squares_ptr[row_idx];
+                    if (dist < min_dist) {
+                        min_dist = dist;
+                        min_idx = cluster_id;
+                    }
+                }
+                const Float closest = sycl::reduce_over_group(item.get_group(),
+                                                              min_dist,
+                                                              max_val,
+                                                              sycl::ext::oneapi::minimum<Float>());
+                const std::int32_t dist_idx = closest == min_dist ? min_idx : max_index;
+                const std::int32_t closest_id = // ties resolve to the smallest cluster id
+                    sycl::reduce_over_group(item.get_group(),
+                                            dist_idx,
+                                            max_index,
+                                            sycl::ext::oneapi::minimum<std::int32_t>());
+                if (local_id == 0) {
+                    responses_ptr[row_idx] = closest_id;
+                    closest_dists_ptr[row_idx] = closest;
+                }
+            }
+        });
+    });
+    return event;
+}
+
+/// Returns the objective function value, which is the sum of all distances from points to their closest centroid.
+/// @param[in] q                A sycl-queue to perform operations on device
+/// @param[in] dists            An array of distances for each data point to the closest cluster; only its first dimension is summed
+/// @param[in] deps             An event vector of dependencies for specified kernel
+template <typename Float>
+Float calc_objective_function(sycl::queue& q,
+                              const pr::ndview<Float, 2>& dists,
+                              const event_vector& deps = {}) {
+    ONEDAL_PROFILER_TASK(calc_objective_function, q);
+    pr::sum<Float> sum{};
+    pr::identity<Float> ident{};
+    auto view_1d = dists.template reshape<1>(pr::ndshape<1>{ dists.get_dimension(0) });
+    return pr::reduce_1d(q, view_1d, sum, ident, deps);
+}
+
+/// Recomputes each centroid as the sum of the rows assigned to it (all-reduced over ranks) divided by its cluster count.
+/// NOTE(review): all centroids are zeroed first, so a zero-count cluster does not keep its previous value - confirm intended.
+/// @param[in] q                A sycl-queue to perform operations on device
+/// @param[in] comm             A communicator used to all-reduce the centroid sums over ranks
+/// @param[in] values           A data part of csr table with :expr:`non_zero_count x 1` dimensions
+/// @param[in] column_indices   An array of column indices in csr table :expr:`non_zero_count x 1` dimensions
+/// @param[in] row_offsets      An array of row offsets in csr table with :expr:`(row_count + 1) x 1` dimensions
+/// @param[in] column_count     A number of column in input dataset
+/// @param[in] responses        An array of cluster assignments with :expr:`row_count x 1` dimensions
+/// @param[out] centroids       An array of centroids with :expr:`cluster_count x column_count` dimensions
+/// @param[in] cluster_counts   An array of cluster counts with :expr:`cluster_count x 1` dimensions
+/// @param[in] deps             An event vector of dependencies for specified kernel
+template <typename Float>
+sycl::event update_centroids(sycl::queue& q,
+                             const bk::communicator<spmd::device_memory_access::usm>& comm,
+                             const pr::ndview<Float, 1>& values,
+                             const pr::ndview<std::int64_t, 1>& column_indices,
+                             const pr::ndview<std::int64_t, 1>& row_offsets,
+                             std::int64_t column_count,
+                             const pr::ndarray<std::int32_t, 2>& responses,
+                             pr::ndarray<Float, 2>& centroids,
+                             const pr::ndarray<std::int32_t, 1>& cluster_counts,
+                             const event_vector& deps = {}) {
+    ONEDAL_PROFILER_TASK(update_centroids, q);
+    const auto resp_ptr = responses.get_data();
+    auto centroids_ptr = centroids.get_mutable_data();
+    const auto row_count = row_offsets.get_count() - 1;
+    const auto data_ptr = values.get_data();
+    const auto row_ofs_ptr = row_offsets.get_data();
+    const auto col_ind_ptr = column_indices.get_data();
+    const auto counts_ptr = cluster_counts.get_data();
+
+    const auto local_size = bk::device_max_wg_size(q);
+    const auto num_clusters = centroids.get_dimension(0);
+
+    const auto clean_range =
+        bk::make_multiple_nd_range_2d({ num_clusters, column_count }, { 1, 1 });
+    auto clean_event = q.submit([&](sycl::handler& cgh) {
+        cgh.depends_on(deps);
+        cgh.parallel_for(clean_range, [=](auto it) {
+            const auto cluster_id = it.get_global_id(0);
+            const auto col_id = it.get_global_id(1);
+            centroids_ptr[cluster_id * column_count + col_id] = 0;
+        });
+    });
+
+    const auto row_block =
+        std::min<std::int32_t>(bk::device_max_wg_size(q) * 8, bk::down_pow2(row_count));
+    const auto col_block =
+        std::min<std::int32_t>(bk::device_max_wg_size(q), bk::down_pow2(column_count));
+    const auto range =
+        bk::make_multiple_nd_range_3d({ num_clusters, row_block, col_block }, { 1, 1, col_block });
+
+    auto centroids_sum_event = q.submit([&](sycl::handler& cgh) {
+        cgh.depends_on(clean_event);
+        local_accessor_rw_t<Float> local_centroid(column_count, cgh);
+        cgh.parallel_for(range, [=](auto it) {
+            const auto cluster_id = it.get_global_id(0);
+            const auto row_shift = it.get_global_id(1);
+            const auto local_id = static_cast<std::int64_t>(it.get_local_id(2));
+            if (counts_ptr[cluster_id] == 0) {
+                return;
+            }
+            auto local_centroid_ptr =
+                local_centroid.template get_multi_ptr<sycl::access::decorated::yes>().get_raw();
+            for (std::int64_t col_idx = local_id; col_idx < column_count; col_idx += col_block) {
+                local_centroid_ptr[col_idx] = 0;
+            }
+            it.barrier();
+            for (std::int64_t row_idx = row_shift; row_idx < row_count; row_idx += row_block) {
+                if (resp_ptr[row_idx] == static_cast<std::int32_t>(cluster_id)) {
+                    const auto start = row_ofs_ptr[row_idx];
+                    const auto end = row_ofs_ptr[row_idx + 1];
+                    for (auto idx = start + local_id; idx < end; idx += col_block) {
+                        const auto col_idx = col_ind_ptr[idx];
+                        const auto val = data_ptr[idx];
+                        bk::atomic_local_add(local_centroid_ptr + col_idx, val);
+                    }
+                }
+            }
+            it.barrier();
+            if (local_id == 0) {
+                for (std::int64_t col_idx = 0; col_idx < column_count; ++col_idx) {
+                    const auto pos = cluster_id * column_count + col_idx;
+                    bk::atomic_global_add(centroids_ptr + pos, local_centroid_ptr[col_idx]);
+                }
+            }
+        });
+    });
+    {
+        // Reduce centroids over all ranks in case of distributed computing
+        auto centroids_reduce_event = comm.allreduce(centroids.flatten(q, { centroids_sum_event }));
+        centroids_reduce_event.wait();
+    }
+
+    const auto finalize_range =
+        bk::make_multiple_nd_range_2d({ num_clusters, local_size }, { 1, local_size });
+    auto finalize_centroids = q.submit([&](sycl::handler& cgh) {
+        cgh.depends_on(centroids_sum_event);
+        cgh.parallel_for(finalize_range, [=](auto it) {
+            const auto cluster_id = it.get_global_id(0);
+            const auto local_id = it.get_local_id(1);
+            const auto cent_count = counts_ptr[cluster_id];
+            if (cent_count == 0) {
+                return;
+            }
+            for (std::int32_t col_idx = local_id; col_idx < column_count; col_idx += local_size) {
+                centroids_ptr[cluster_id * column_count + col_idx] /= cent_count;
+            }
+        });
+    });
+    return finalize_centroids;
+}
+
+/// Handles empty clusters: each one owned by this rank takes over the local row with the largest closest distance.
+/// @param[in] ctx                  GPU context structure
+/// @param[in] row_count            A number of rows in the dataset
+/// @param[in,out] responses        An array of cluster assignments with :expr:`row_count x 1` dimensions
+/// @param[in,out] cluster_counts   An array of cluster counts with :expr:`cluster_count x 1` dimensions
+/// @param[in,out] dists            An array of closest distances to cluster with :expr:`row_count x 1` dimensions
+/// @param[in] deps                 An event vector of dependencies for specified kernel
+template <typename Float>
+sycl::event handle_empty_clusters(const dal::backend::context_gpu& ctx,
+                                  const std::int64_t row_count,
+                                  pr::ndarray<std::int32_t, 2>& responses,
+                                  pr::ndarray<std::int32_t, 1>& cluster_counts,
+                                  pr::ndarray<Float, 2>& dists,
+                                  const event_vector& deps = {}) {
+    auto& queue = ctx.get_queue();
+    auto& comm = ctx.get_communicator();
+    ONEDAL_PROFILER_TASK(handle_empty_clusters, queue);
+    const auto rank_count = comm.get_rank_count();
+    const auto rank = comm.get_rank();
+    const auto num_clusters = cluster_counts.get_dimension(0);
+
+    auto resp_ptr = responses.get_mutable_data();
+    auto counts_ptr = cluster_counts.get_mutable_data();
+    auto dists_ptr = dists.get_mutable_data();
+
+    const auto abs_min_val = -std::numeric_limits<Float>::max();
+
+    auto local_size = bk::device_max_wg_size(queue);
+    auto range = bk::make_multiple_nd_range_1d(local_size, local_size);
+    auto event = queue.submit([&](sycl::handler& cgh) {
+        cgh.depends_on(deps);
+        cgh.parallel_for(range, [=](auto it) {
+            const auto local_id = it.get_local_id(0); // the range is 1-D: dimension 0 is the only one
+            for (std::int64_t cluster_id = rank; cluster_id < num_clusters;
+                 cluster_id += rank_count) {
+                if (counts_ptr[cluster_id] > 0) { // no need to handle non-empty clusters
+                    continue;
+                }
+                std::int64_t cand_idx = -1;
+                Float cand_dist = abs_min_val;
+                for (std::int64_t row_idx = local_id; row_idx < row_count; row_idx += local_size) {
+                    const auto dist = dists_ptr[row_idx];
+                    if (dist > cand_dist) {
+                        cand_dist = dist;
+                        cand_idx = row_idx;
+                    }
+                }
+                const Float longest_dist =
+                    sycl::reduce_over_group(it.get_group(),
+                                            cand_dist,
+                                            abs_min_val,
+                                            sycl::ext::oneapi::maximum<Float>());
+                const auto id = longest_dist == cand_dist ? cand_idx : -1;
+                const auto longest_id =
+                    sycl::reduce_over_group(it.get_group(),
+                                            id,
+                                            sycl::ext::oneapi::maximum<std::int64_t>());
+                if (local_id == 0 && longest_id != -1) {
+                    resp_ptr[longest_id] = cluster_id; // rows are indexed by longest_id
+                    counts_ptr[cluster_id] = 1; // counts are indexed by cluster
+                    dists_ptr[longest_id] = Float(0); // the taken row is now at distance zero
+                }
+                it.barrier(); // publish the zeroed distance before the next empty cluster scans
+            }
+        });
+    });
+    return event;
+}
+
+} // namespace oneapi::dal::kmeans::backend
diff --git a/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_csr_dpc.cpp b/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_csr_dpc.cpp
new file mode 100644
index 00000000000..00eeebf42c4
--- /dev/null
+++ b/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_csr_dpc.cpp
@@ -0,0 +1,229 @@
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "oneapi/dal/algo/kmeans/backend/gpu/train_kernel.hpp"
+#include "oneapi/dal/algo/kmeans/backend/gpu/kernels_integral.hpp"
+#include "oneapi/dal/algo/kmeans/backend/gpu/kernels_fp.hpp"
+#include "oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp"
+#include "oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp"
+#include "oneapi/dal/backend/primitives/ndarray.hpp"
+
+#include "oneapi/dal/detail/profiler.hpp"
+
+namespace oneapi::dal::kmeans::backend {
+
+using dal::backend::context_gpu;
+using descriptor_t = detail::descriptor_base<task::clustering>;
+using event_vector = std::vector<sycl::event>;
+
+namespace interop = dal::backend::interop;
+namespace pr = dal::backend::primitives;
+namespace de = dal::detail;
+namespace bk = dal::backend;
+
+// Returns initial centroids on device: the user-provided ones if set, otherwise generated on CPU.
+template <typename Float, typename Method>
+static pr::ndarray<Float, 2> get_initial_centroids(const dal::backend::context_gpu& ctx,
+                                                   const descriptor_t& params,
+                                                   const train_input<task::clustering>& input) {
+    auto& queue = ctx.get_queue();
+
+    const auto data = static_cast<const csr_table&>(input.get_data());
+
+    const std::int64_t column_count = data.get_column_count();
+    const std::int64_t cluster_count = params.get_cluster_count();
+
+    if (!input.get_initial_centroids().has_data()) { // nothing supplied by the user
+        auto daal_initial_centroids =
+            oneapi::dal::kmeans::detail::daal_generate_centroids<Float, Method>(params, data);
+        daal::data_management::BlockDescriptor<Float> block;
+        daal_initial_centroids->getBlockOfRows(0,
+                                               cluster_count,
+                                               daal::data_management::readOnly,
+                                               block);
+        Float* initial_centroids_ptr = block.getBlockPtr();
+        auto arr_host_initial = // non-owning view over the DAAL block
+            pr::ndarray<Float, 2>::wrap(initial_centroids_ptr, { cluster_count, column_count });
+        return arr_host_initial.to_device(queue);
+    }
+    auto initial_centroids_ptr = row_accessor<const Float>(input.get_initial_centroids())
+                                     .pull(queue, { 0, -1 }, sycl::usm::alloc::device);
+    return pr::ndarray<Float, 2>::wrap(initial_centroids_ptr, { cluster_count, column_count });
+}
+
+/// Main entrypoint for GPU CSR Kmeans algorithm; returns responses, iteration count, objective value and model
+/// @param[in] ctx          GPU context structure
+/// @param[in] params       A descriptor containing parameters for algorithm
+/// @param[in] input        A train input
+template <typename Float>
+struct train_kernel_gpu<Float, method::lloyd_csr, task::clustering> {
+    train_result<task::clustering> operator()(const dal::backend::context_gpu& ctx,
+                                              const descriptor_t& params,
+                                              const train_input<task::clustering>& input) const {
+        auto& queue = ctx.get_queue();
+        auto& comm = ctx.get_communicator();
+        ONEDAL_ASSERT(input.get_data().get_kind() == dal::csr_table::kind());
+        const auto data = static_cast<const csr_table&>(input.get_data());
+        const std::int64_t row_count = data.get_row_count();
+        const std::int64_t column_count = data.get_column_count();
+        const std::int64_t cluster_count = params.get_cluster_count();
+        const std::int64_t max_iteration_count = params.get_max_iteration_count();
+        const double accuracy_threshold = params.get_accuracy_threshold();
+        dal::detail::check_mul_overflow(cluster_count, column_count);
+
+        auto [arr_val, arr_col, arr_row] =
+            csr_accessor<const Float>(data).pull(queue,
+                                                 { 0, -1 },
+                                                 sparse_indexing::zero_based,
+                                                 sycl::usm::alloc::device);
+        auto values = pr::ndarray<Float, 1>::wrap(arr_val.get_data(), arr_val.get_count());
+        auto column_indices =
+            pr::ndarray<std::int64_t, 1>::wrap(arr_col.get_data(), arr_col.get_count());
+        auto row_offsets =
+            pr::ndarray<std::int64_t, 1>::wrap(arr_row.get_data(), arr_row.get_count());
+        auto arr_initial = get_initial_centroids<Float, method::lloyd_csr>(ctx, params, input);
+        auto arr_centroid_squares =
+            pr::ndarray<Float, 1>::empty(queue, cluster_count, sycl::usm::alloc::device);
+        auto arr_data_squares =
+            pr::ndarray<Float, 1>::empty(queue, row_count, sycl::usm::alloc::device);
+        auto data_squares_event =
+            compute_data_squares(queue, values, column_indices, row_offsets, arr_data_squares);
+
+        auto distances = pr::ndarray<Float, 2>::empty(queue,
+                                                      { row_count, cluster_count },
+                                                      sycl::usm::alloc::device);
+
+        auto arr_closest_distances =
+            pr::ndarray<Float, 2>::empty(queue, { row_count, 1 }, sycl::usm::alloc::device);
+        auto arr_centroids = pr::ndarray<Float, 2>::empty(queue,
+                                                          { cluster_count, column_count },
+                                                          sycl::usm::alloc::device);
+        auto arr_responses =
+            pr::ndarray<std::int32_t, 2>::empty(queue, { row_count, 1 }, sycl::usm::alloc::device);
+        auto cluster_counts =
+            pr::ndarray<std::int32_t, 1>::empty(queue, cluster_count, sycl::usm::alloc::device);
+
+        Float prev_objective_function = de::limits<Float>::max();
+        std::int64_t iter;
+        sycl::event last_event = data_squares_event;
+
+        for (iter = 0; iter < max_iteration_count; iter++) {
+            auto centroid_squares_event =
+                kernels_fp<Float>::compute_squares(queue,
+                                                   iter == 0 ? arr_initial : arr_centroids,
+                                                   arr_centroid_squares,
+                                                   { last_event });
+            auto assign_event = assign_clusters(queue,
+                                                values,
+                                                column_indices,
+                                                row_offsets,
+                                                arr_data_squares,
+                                                iter == 0 ? arr_initial : arr_centroids,
+                                                arr_centroid_squares,
+                                                distances,
+                                                arr_responses,
+                                                arr_closest_distances,
+                                                { centroid_squares_event, last_event });
+            auto count_event = count_clusters(queue,
+                                              arr_responses,
+                                              cluster_count,
+                                              cluster_counts,
+                                              { assign_event });
+
+            {
+                // Reduce cluster counters over all ranks in case of distributed computing
+                auto count_reduce_event =
+                    comm.allreduce(cluster_counts.flatten(queue, { count_event }));
+                count_reduce_event.wait();
+            }
+
+            auto empty_cluster_event = handle_empty_clusters(ctx,
+                                                             row_count,
+                                                             arr_responses,
+                                                             cluster_counts,
+                                                             arr_closest_distances,
+                                                             { count_event });
+
+            auto objective_function = calc_objective_function(queue,
+                                                              arr_closest_distances,
+                                                              { empty_cluster_event, count_event });
+
+            {
+                // Reduce objective function value over all ranks
+                auto obj_func_reduce_event = comm.allreduce(objective_function);
+                obj_func_reduce_event.wait();
+            }
+            auto update_event = update_centroids(queue, // must see the empty-cluster reassignment
+                                                 comm,
+                                                 values,
+                                                 column_indices,
+                                                 row_offsets,
+                                                 column_count,
+                                                 arr_responses,
+                                                 arr_centroids,
+                                                 cluster_counts,
+                                                 { empty_cluster_event });
+
+            last_event = update_event;
+
+            if (accuracy_threshold > 0 &&
+                objective_function + accuracy_threshold > prev_objective_function) {
+                iter++;
+                break;
+            }
+            prev_objective_function = objective_function;
+        }
+        auto centroid_squares_event =
+            kernels_fp<Float>::compute_squares(queue,
+                                               iter == 0 ? arr_initial : arr_centroids,
+                                               arr_centroid_squares,
+                                               { last_event });
+        auto assign_event = assign_clusters(queue,
+                                            values,
+                                            column_indices,
+                                            row_offsets,
+                                            arr_data_squares,
+                                            iter == 0 ? arr_initial : arr_centroids,
+                                            arr_centroid_squares,
+                                            distances,
+                                            arr_responses,
+                                            arr_closest_distances,
+                                            { last_event, centroid_squares_event });
+        auto objective_function =
+            calc_objective_function(queue,
+                                    arr_closest_distances,
+                                    { last_event, centroid_squares_event, assign_event });
+        {
+            // Reduce objective function value over all ranks
+            auto obj_func_reduce_event = comm.allreduce(objective_function);
+            obj_func_reduce_event.wait();
+        }
+
+        model<task::clustering> model;
+        model.set_centroids(dal::homogen_table::wrap( // arr_centroids is unset if iter == 0
+            (iter == 0 ? arr_initial : arr_centroids).flatten(queue), cluster_count, column_count));
+        return train_result<task::clustering>()
+            .set_responses(dal::homogen_table::wrap(arr_responses.flatten(queue), row_count, 1))
+            .set_iteration_count(iter)
+            .set_objective_function_value(objective_function)
+            .set_model(model);
+    }
+};
+
+template struct train_kernel_gpu<float, method::lloyd_csr, task::clustering>;
+template struct train_kernel_gpu<double, method::lloyd_csr, task::clustering>;
+
+} // namespace oneapi::dal::kmeans::backend
diff --git a/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_dense_dpc.cpp b/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_dense_dpc.cpp
index efca39dd338..a8c02f27318 100644
--- a/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_dense_dpc.cpp
+++ b/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_dense_dpc.cpp
@@ -21,6 +21,7 @@
 #include "oneapi/dal/algo/kmeans/backend/gpu/kernels_integral.hpp"
 #include "oneapi/dal/algo/kmeans/backend/gpu/cluster_updater.hpp"
 #include "oneapi/dal/algo/kmeans/backend/gpu/kernels_fp.hpp"
+#include "oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp"
 #include "oneapi/dal/exceptions.hpp"
 #include "oneapi/dal/backend/primitives/ndarray.hpp"
 #include "oneapi/dal/table/row_accessor.hpp"
@@ -36,16 +37,11 @@ namespace oneapi::dal::kmeans::backend {
 using dal::backend::context_gpu;
 using descriptor_t = detail::descriptor_base<task::clustering>;
 
-namespace daal_kmeans_init = daal::algorithms::kmeans::init;
 namespace interop = dal::backend::interop;
 namespace pr = dal::backend::primitives;
 namespace de = dal::detail;
 namespace bk = dal::backend;
 
-template <typename Float, daal::CpuType Cpu>
-using daal_kmeans_init_plus_plus_dense_kernel_t =
-    daal_kmeans_init::internal::KMeansInitKernel<daal_kmeans_init::plusPlusDense, Float, Cpu>;
-
 template <typename Float>
 static pr::ndarray<Float, 2> get_initial_centroids(const dal::backend::context_gpu& ctx,
                                                    const descriptor_t& params,
@@ -60,31 +56,9 @@ static pr::ndarray<Float, 2> get_initial_centroids(const dal::backend::context_g
     daal::data_management::NumericTablePtr daal_initial_centroids;
 
     if (!input.get_initial_centroids().has_data()) {
-        // We use CPU algorithm for initialization, so input data
-        // may be copied to DAAL homogen table
-        const auto daal_data = interop::copy_to_daal_homogen_table<Float>(data);
-        daal_kmeans_init::Parameter par(dal::detail::integral_cast<std::size_t>(cluster_count));
-
-        const std::size_t init_len_input = 1;
-        daal::data_management::NumericTable* init_input[init_len_input] = { daal_data.get() };
-
         daal_initial_centroids =
-            interop::allocate_daal_homogen_table<Float>(cluster_count, column_count);
-        const std::size_t init_len_output = 1;
-        daal::data_management::NumericTable* init_output[init_len_output] = {
-            daal_initial_centroids.get()
-        };
-
-        const dal::backend::context_cpu cpu_ctx;
-        interop::status_to_exception(
-            interop::call_daal_kernel<Float, daal_kmeans_init_plus_plus_dense_kernel_t>(
-                cpu_ctx,
-                init_len_input,
-                init_input,
-                init_len_output,
-                init_output,
-                &par,
-                *(par.engine)));
+            oneapi::dal::kmeans::detail::daal_generate_centroids<Float, method::lloyd_dense>(params,
+                                                                                             data);
         daal::data_management::BlockDescriptor<Float> block;
         daal_initial_centroids->getBlockOfRows(0,
                                                cluster_count,
@@ -107,8 +81,8 @@ struct train_kernel_gpu<Float, method::lloyd_dense, task::clustering> {
                                               const train_input<task::clustering>& input) const {
         auto& queue = ctx.get_queue();
         auto& comm = ctx.get_communicator();
-
         const auto data = input.get_data();
+        ONEDAL_ASSERT(data.get_kind() != dal::csr_table::kind());
         const std::int64_t row_count = data.get_row_count();
         const std::int64_t column_count = data.get_column_count();
         const std::int64_t cluster_count = desc.get_cluster_count();
diff --git a/cpp/oneapi/dal/algo/kmeans/common.hpp b/cpp/oneapi/dal/algo/kmeans/common.hpp
index 39ded2f1d82..7c406212b16 100644
--- a/cpp/oneapi/dal/algo/kmeans/common.hpp
+++ b/cpp/oneapi/dal/algo/kmeans/common.hpp
@@ -43,12 +43,17 @@ namespace v1 {
 /// method.
 struct lloyd_dense {};
 
+/// Tag-type that denotes :ref:`Lloyd's <kmeans_t_math_lloyd>` computational
+/// method for sparse data stored in the compressed sparse row (CSR) format.
+struct lloyd_csr {};
+
 /// Alias tag-type for :ref:`Lloyd's <kmeans_t_math_lloyd>` computational
 /// method.
 using by_default = lloyd_dense;
 } // namespace v1
 
 using v1::lloyd_dense;
+using v1::lloyd_csr;
 using v1::by_default;
 
 } // namespace method
@@ -95,7 +100,8 @@ template <typename Float>
 constexpr bool is_valid_float_v = dal::detail::is_one_of_v<Float, float, double>;
 
 template <typename Method>
-constexpr bool is_valid_method_v = dal::detail::is_one_of_v<Method, method::lloyd_dense>;
+constexpr bool is_valid_method_v =
+    dal::detail::is_one_of_v<Method, method::lloyd_dense, method::lloyd_csr>;
 
 template <typename Task>
 constexpr bool is_valid_task_v = dal::detail::is_one_of_v<Task, task::clustering>;
diff --git a/cpp/oneapi/dal/algo/kmeans/detail/infer_ops.cpp b/cpp/oneapi/dal/algo/kmeans/detail/infer_ops.cpp
index 8c04129fb05..ce05fd6b75e 100644
--- a/cpp/oneapi/dal/algo/kmeans/detail/infer_ops.cpp
+++ b/cpp/oneapi/dal/algo/kmeans/detail/infer_ops.cpp
@@ -37,8 +37,10 @@ struct infer_ops_dispatcher<host_policy, Float, Method, Task> {
 #define INSTANTIATE(F, M, T) \
     template struct ONEDAL_EXPORT infer_ops_dispatcher<host_policy, F, M, T>;
 
-INSTANTIATE(float, method::by_default, task::clustering)
-INSTANTIATE(double, method::by_default, task::clustering)
+INSTANTIATE(float, method::lloyd_dense, task::clustering)
+INSTANTIATE(double, method::lloyd_dense, task::clustering)
+INSTANTIATE(float, method::lloyd_csr, task::clustering)
+INSTANTIATE(double, method::lloyd_csr, task::clustering)
 
 } // namespace v1
 } // namespace oneapi::dal::kmeans::detail
diff --git a/cpp/oneapi/dal/algo/kmeans/detail/infer_ops_dpc.cpp b/cpp/oneapi/dal/algo/kmeans/detail/infer_ops_dpc.cpp
index e27ac3549b0..6f00c81ecfb 100644
--- a/cpp/oneapi/dal/algo/kmeans/detail/infer_ops_dpc.cpp
+++ b/cpp/oneapi/dal/algo/kmeans/detail/infer_ops_dpc.cpp
@@ -42,8 +42,10 @@ struct infer_ops_dispatcher<Policy, Float, Method, Task> {
     template struct ONEDAL_EXPORT                                         \
         infer_ops_dispatcher<dal::detail::spmd_data_parallel_policy, F, M, T>;
 
-INSTANTIATE(float, method::by_default, task::clustering)
-INSTANTIATE(double, method::by_default, task::clustering)
+INSTANTIATE(float, method::lloyd_dense, task::clustering)
+INSTANTIATE(double, method::lloyd_dense, task::clustering)
+INSTANTIATE(float, method::lloyd_csr, task::clustering)
+INSTANTIATE(double, method::lloyd_csr, task::clustering)
 
 } // namespace v1
 } // namespace oneapi::dal::kmeans::detail
diff --git a/cpp/oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp b/cpp/oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp
new file mode 100644
index 00000000000..544e24546f8
--- /dev/null
+++ b/cpp/oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp
@@ -0,0 +1,113 @@
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include <daal/src/algorithms/kmeans/kmeans_init_kernel.h>
+
+#include "oneapi/dal/backend/interop/common_dpc.hpp"
+#include "oneapi/dal/backend/interop/error_converter.hpp"
+#include "oneapi/dal/backend/interop/table_conversion.hpp"
+#include "oneapi/dal/backend/transfer.hpp"
+
+namespace oneapi::dal::kmeans::detail {
+
+namespace daal_kmeans_init = daal::algorithms::kmeans::init;
+
+/// Compile-time constant that holds a value of the DAAL K-Means initialization method enum
+template <daal_kmeans_init::Method Value>
+using daal_init_method_constant = std::integral_constant<daal_kmeans_init::Method, Value>;
+using descriptor_t = detail::descriptor_base<task::clustering>;
+namespace interop = dal::backend::interop;
+
+/// Maps a oneDAL K-Means method tag to the DAAL initialization method used for it.
+/// Only the specializations defined below are available.
+template <typename Method>
+struct to_daal_init_method;
+
+template <>
+struct to_daal_init_method<method::lloyd_dense>
+        : daal_init_method_constant<daal_kmeans_init::plusPlusDense> {};
+
+template <>
+struct to_daal_init_method<method::lloyd_csr>
+        : daal_init_method_constant<daal_kmeans_init::plusPlusCSR> {};
+
+/// DAAL initialization kernel selected by floating-point type, CPU type and oneDAL method tag
+template <typename Float, daal::CpuType Cpu, typename Method>
+using init_kernel_t =
+    daal_kmeans_init::internal::KMeansInitKernel<to_daal_init_method<Method>::value, Float, Cpu>;
+
+/// Generates initial centroids by running the DAAL K-Means initialization kernel
+/// that corresponds to `Method`. The kernel is dispatched by CPU type.
+///
+/// @tparam Float  Floating-point type used for the DAAL tables and the kernel
+/// @tparam Method oneDAL method tag that has a `to_daal_init_method` specialization
+/// @tparam Table  Type of the input data table
+///
+/// @param desc K-Means descriptor, only the cluster count is read from it
+/// @param data Input data, it is converted to a DAAL table before the kernel call
+///
+/// @return DAAL homogen table allocated as `cluster_count` x `column_count` that was
+///         passed to the kernel as its only output
+template <typename Float, typename Method, typename Table>
+inline daal::data_management::NumericTablePtr daal_generate_centroids(const descriptor_t& desc,
+                                                                      const Table& data) {
+    const std::int64_t column_count = data.get_column_count();
+    const std::int64_t cluster_count = desc.get_cluster_count();
+    daal::data_management::NumericTablePtr daal_initial_centroids;
+    const auto daal_data = interop::convert_to_daal_table<Float>(data, true);
+
+    daal_kmeans_init::Parameter par(dal::detail::integral_cast<std::size_t>(cluster_count));
+
+    const std::size_t init_len_input = 1;
+    const daal::data_management::NumericTable* init_input[init_len_input] = { daal_data.get() };
+
+    daal_initial_centroids =
+        interop::allocate_daal_homogen_table<Float>(cluster_count, column_count);
+    const std::size_t init_len_output = 1;
+    daal::data_management::NumericTable* init_output[init_len_output] = {
+        daal_initial_centroids.get()
+    };
+    // The DAAL status returned by the kernel is handed to `status_to_exception`
+    const dal::backend::context_cpu cpu_ctx;
+    interop::status_to_exception(dal::backend::dispatch_by_cpu(cpu_ctx, [&](auto cpu) {
+        return init_kernel_t<Float,
+                             oneapi::dal::backend::interop::to_daal_cpu_type<decltype(cpu)>::value,
+                             Method>()
+            .compute(init_len_input, init_input, init_len_output, init_output, &par, *(par.engine));
+    }));
+    return daal_initial_centroids;
+}
+
+template daal::data_management::NumericTablePtr
+daal_generate_centroids<float, method::lloyd_dense, table>(const descriptor_t& desc,
+                                                           const table& data);
+template daal::data_management::NumericTablePtr
+daal_generate_centroids<double, method::lloyd_dense, table>(const descriptor_t& desc,
+                                                            const table& data);
+template daal::data_management::NumericTablePtr
+daal_generate_centroids<float, method::lloyd_csr, csr_table>(const descriptor_t& desc,
+                                                             const csr_table& data);
+template daal::data_management::NumericTablePtr
+daal_generate_centroids<double, method::lloyd_csr, csr_table>(const descriptor_t& desc,
+                                                              const csr_table& data);
+
+} // namespace oneapi::dal::kmeans::detail
diff --git a/cpp/oneapi/dal/algo/kmeans/detail/train_ops.cpp b/cpp/oneapi/dal/algo/kmeans/detail/train_ops.cpp
index f7f085a068d..0be157b6f86 100644
--- a/cpp/oneapi/dal/algo/kmeans/detail/train_ops.cpp
+++ b/cpp/oneapi/dal/algo/kmeans/detail/train_ops.cpp
@@ -38,6 +38,8 @@ struct train_ops_dispatcher<Policy, Float, Method, Task> {
 
 INSTANTIATE(float, method::lloyd_dense, task::clustering)
 INSTANTIATE(double, method::lloyd_dense, task::clustering)
+INSTANTIATE(float, method::lloyd_csr, task::clustering)
+INSTANTIATE(double, method::lloyd_csr, task::clustering)
 
 } // namespace v1
 } // namespace oneapi::dal::kmeans::detail
diff --git a/cpp/oneapi/dal/algo/kmeans/detail/train_ops_dpc.cpp b/cpp/oneapi/dal/algo/kmeans/detail/train_ops_dpc.cpp
index cc071b82000..d7e672e1777 100644
--- a/cpp/oneapi/dal/algo/kmeans/detail/train_ops_dpc.cpp
+++ b/cpp/oneapi/dal/algo/kmeans/detail/train_ops_dpc.cpp
@@ -45,6 +45,8 @@ struct train_ops_dispatcher<Policy, Float, Method, Task> {
 
 INSTANTIATE(float, method::lloyd_dense, task::clustering)
 INSTANTIATE(double, method::lloyd_dense, task::clustering)
+INSTANTIATE(float, method::lloyd_csr, task::clustering)
+INSTANTIATE(double, method::lloyd_csr, task::clustering)
 
 } // namespace v1
 } // namespace oneapi::dal::kmeans::detail
diff --git a/cpp/oneapi/dal/algo/kmeans/test/batch.cpp b/cpp/oneapi/dal/algo/kmeans/test/batch.cpp
index 01663a209fb..3211daba5ee 100644
--- a/cpp/oneapi/dal/algo/kmeans/test/batch.cpp
+++ b/cpp/oneapi/dal/algo/kmeans/test/batch.cpp
@@ -15,7 +15,7 @@
 *******************************************************************************/
 
 #include "oneapi/dal/algo/kmeans/test/fixture.hpp"
-
+#include "oneapi/dal/table/csr_accessor.hpp"
 namespace oneapi::dal::kmeans::test {
 
 template <typename TestType>
@@ -74,6 +74,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test,
                      "[kmeans][batch]",
                      kmeans_types) {
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
     this->check_empty_clusters();
 }
 
@@ -82,6 +83,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test,
                      "[kmeans][batch]",
                      kmeans_types) {
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
     this->check_on_smoke_data();
 }
 
@@ -90,6 +92,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test,
                      "[kmeans][batch]",
                      kmeans_types) {
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
     this->check_on_gold_data();
 }
 
@@ -100,7 +103,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test,
     // This test is not stable on CPU
     // TODO: Remove the following `SKIP_IF` once stability problem is resolved
     SKIP_IF(this->get_policy().is_cpu());
-
+    SKIP_IF(this->is_sparse_method());
     SKIP_IF(this->not_float64_friendly());
     this->check_on_large_data_with_one_cluster();
 }
@@ -110,6 +113,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test,
                      "[kmeans][batch][nightly][stress]",
                      kmeans_types) {
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
     this->partial_centroids_stress_test();
 }
 
@@ -118,6 +122,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test,
                      "[kmeans][batch][external-dataset][higgs]",
                      kmeans_types) {
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
 
     const std::int64_t iters = 3;
     const std::string higgs_path = "workloads/higgs/dataset/higgs_1m_test.csv";
@@ -140,6 +145,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test,
                      "[kmeans][nightly][batch][external-dataset][susy]",
                      kmeans_types) {
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
 
     const std::int64_t iters = 10;
     const std::string susy_path = "workloads/susy/dataset/susy_test.csv";
@@ -162,6 +168,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test,
                      "[kmeans][nightly][batch][external-dataset][epsilon]",
                      kmeans_types) {
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
 
     const std::int64_t iters = 2;
     const std::string epsilon_path = "workloads/epsilon/dataset/epsilon_80k_train.csv";
@@ -184,6 +191,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test,
                      "[kmeans][batch][external-dataset][higgs]",
                      kmeans_types) {
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
 
     const std::int64_t iters = 3;
     const std::string higgs_path = "workloads/higgs/dataset/higgs_1m_test.csv";
@@ -206,6 +214,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test,
                      "[kmeans][nightly][batch][external-dataset][susy]",
                      kmeans_types) {
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
 
     const std::int64_t iters = 10;
     const std::string susy_path = "workloads/susy/dataset/susy_test.csv";
@@ -228,6 +237,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test,
                      "[kmeans][nightly][batch][external-dataset][epsilon]",
                      kmeans_types) {
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
 
     const std::int64_t iters = 2;
     const std::string epsilon_path = "workloads/epsilon/dataset/epsilon_80k_train.csv";
@@ -260,4 +270,50 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test,
     }
 }
 
+TEMPLATE_LIST_TEST_M(kmeans_batch_test,
+                     "KMeans sparse default cases",
+                     "[kmeans][batch]",
+                     kmeans_types) {
+    SKIP_IF(!this->is_sparse_method()); // these cases cover the sparse method only
+    SKIP_IF(this->not_float64_friendly());
+
+    SECTION("cluster=5, initial centroids provided") {
+        auto input = oneapi::dal::test::engine::csr_make_blobs(5, 50, 20);
+        bool init_centroids = true;
+        this->test_on_sparse_data(input, 10, 0.01, init_centroids);
+    }
+
+    SECTION("cluster=16, initial centroids provided") {
+        bool init_centroids = true;
+        auto input = oneapi::dal::test::engine::csr_make_blobs(16, 200, 100);
+        this->test_on_sparse_data(input, 10, 0.01, init_centroids);
+    }
+
+    SECTION("cluster=128, initial centroids provided") {
+        SKIP_IF(this->get_policy().is_cpu());
+        bool init_centroids = true;
+        auto input = oneapi::dal::test::engine::csr_make_blobs(128, 100000, 200);
+        this->test_on_sparse_data(input, 10, 0.01, init_centroids);
+    }
+
+    SECTION("cluster=5, initial centroids generated") {
+        auto input = oneapi::dal::test::engine::csr_make_blobs(5, 50, 20);
+        bool init_centroids = false;
+        this->test_on_sparse_data(input, 20, 0.01, init_centroids);
+    }
+
+    SECTION("cluster=16, initial centroids generated") {
+        bool init_centroids = false;
+        auto input = oneapi::dal::test::engine::csr_make_blobs(16, 200, 100);
+        this->test_on_sparse_data(input, 10, 0.01, init_centroids);
+    }
+
+    SECTION("cluster=32, initial centroids generated") {
+        SKIP_IF(this->get_policy().is_cpu());
+        bool init_centroids = false;
+        auto input = oneapi::dal::test::engine::csr_make_blobs(32, 10000, 100);
+        this->test_on_sparse_data(input, 30, 0.01, init_centroids);
+    }
+}
+
 } // namespace oneapi::dal::kmeans::test
diff --git a/cpp/oneapi/dal/algo/kmeans/test/fixture.hpp b/cpp/oneapi/dal/algo/kmeans/test/fixture.hpp
index 0a0e3acf3ae..1fdf3cc00e1 100644
--- a/cpp/oneapi/dal/algo/kmeans/test/fixture.hpp
+++ b/cpp/oneapi/dal/algo/kmeans/test/fixture.hpp
@@ -26,6 +26,7 @@
 #include "oneapi/dal/table/homogen.hpp"
 #include "oneapi/dal/table/row_accessor.hpp"
 #include "oneapi/dal/test/engine/fixtures.hpp"
+#include "oneapi/dal/test/engine/csr_table_builder.hpp"
 #include "oneapi/dal/test/engine/math.hpp"
 #include "oneapi/dal/test/engine/metrics/clustering.hpp"
 
@@ -34,7 +35,8 @@ namespace oneapi::dal::kmeans::test {
 namespace te = dal::test::engine;
 namespace la = dal::test::engine::linalg;
 
-using kmeans_types = COMBINE_TYPES((float, double), (kmeans::method::lloyd_dense));
+using kmeans_types = COMBINE_TYPES((float, double),
+                                   (kmeans::method::lloyd_dense, kmeans::method::lloyd_csr));
 
 template <typename TestType, typename Derived>
 class kmeans_test : public te::crtp_algo_fixture<TestType, Derived> {
@@ -63,6 +65,11 @@ class kmeans_test : public te::crtp_algo_fixture<TestType, Derived> {
         return descriptor_t{ cluster_count };
     }
 
+    /// Tells whether the test type selects the sparse `lloyd_csr` method
+    bool is_sparse_method() const {
+        return std::is_same_v<method_t, kmeans::method::lloyd_csr>;
+    }
+
     void exact_checks(const table& data,
                       const table& initial_centroids,
                       const table& ref_centroids,
@@ -285,6 +291,42 @@ class kmeans_test : public te::crtp_algo_fixture<TestType, Derived> {
         this->exact_checks(x, x, x, y, cluster_count, 1, 0.0);
     }
 
+    /// Trains KMeans on data produced by a CSR blobs generator and checks the responses
+    ///
+    /// @param input              Generator that provides the data and the reference results
+    /// @param max_iter_count     Passed to `get_descriptor` together with the cluster count
+    /// @param accuracy_threshold Passed to `get_descriptor` together with the cluster count
+    /// @param init_centroids     If true, the initial centroids of `input` are passed to
+    ///                           training; otherwise training runs without them
+    void test_on_sparse_data(const oneapi::dal::test::engine::csr_make_blobs& input,
+                             std::int64_t max_iter_count,
+                             float_t accuracy_threshold,
+                             bool init_centroids) {
+        const table data = input.get_data(this->get_policy());
+        const auto cluster_count = input.cluster_count_;
+        REQUIRE(data.get_kind() == csr_table::kind());
+        auto desc = this->get_descriptor(cluster_count, max_iter_count, accuracy_threshold);
+        INFO("KMeans sparse training");
+
+        if (!init_centroids) {
+            // `match_map` is filled by `find_match_centroids` from the reference and the
+            // trained centroids and is then used when the responses are compared
+            const auto result = this->train(desc, data);
+            const auto trained_model = result.get_model();
+            auto match_map = array<float_t>::zeros(cluster_count);
+            find_match_centroids(input.get_result_centroids(),
+                                 trained_model.get_centroids(),
+                                 input.column_count_,
+                                 match_map);
+            check_response_match(match_map, input.get_responses(), result.get_responses());
+            return;
+        }
+
+        const table initial_centroids = input.get_initial_centroids();
+        const auto result = this->train(desc, data, initial_centroids);
+        check_response_match(input.get_responses(), result.get_responses());
+    }
+
     void test_on_dataset(const std::string& dataset_path,
                          std::int64_t cluster_count,
                          std::int64_t max_iteration_count,
diff --git a/cpp/oneapi/dal/algo/kmeans/test/spmd.cpp b/cpp/oneapi/dal/algo/kmeans/test/spmd.cpp
index 792d5d74901..2c7e3d34063 100644
--- a/cpp/oneapi/dal/algo/kmeans/test/spmd.cpp
+++ b/cpp/oneapi/dal/algo/kmeans/test/spmd.cpp
@@ -134,6 +134,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test,
     // removed once it's supported for CPU. The same for the rest of tests cases.
     SKIP_IF(this->get_policy().is_cpu());
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
 
     this->set_rank_count(GENERATE(2, 4));
     this->check_if_results_same_on_all_ranks();
@@ -145,6 +146,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test,
                      kmeans_types) {
     SKIP_IF(this->get_policy().is_cpu());
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
 
     this->set_rank_count(GENERATE(1, 2));
     this->check_empty_clusters();
@@ -156,6 +158,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test,
                      kmeans_types) {
     SKIP_IF(this->get_policy().is_cpu());
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
 
     this->set_rank_count(GENERATE(1, 2));
     this->check_on_smoke_data();
@@ -167,6 +170,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test,
                      kmeans_types) {
     SKIP_IF(this->get_policy().is_cpu());
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
 
     this->set_rank_count(GENERATE(1, 2, 4, 8));
     this->check_on_gold_data();
@@ -178,6 +182,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test,
                      kmeans_types) {
     SKIP_IF(this->get_policy().is_cpu());
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
 
     this->set_rank_count(GENERATE(1, 8));
     this->check_on_large_data_with_one_cluster();
@@ -189,6 +194,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test,
                      kmeans_types) {
     SKIP_IF(this->get_policy().is_cpu());
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
 
     this->set_rank_count(10);
     const std::int64_t iters = 3;
@@ -213,6 +219,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test,
                      kmeans_types) {
     SKIP_IF(this->get_policy().is_cpu());
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
 
     this->set_rank_count(10);
     const std::int64_t iters = 10;
@@ -237,6 +244,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test,
                      kmeans_types) {
     SKIP_IF(this->get_policy().is_cpu());
     SKIP_IF(this->not_float64_friendly());
+    SKIP_IF(this->is_sparse_method());
 
     this->set_rank_count(10);
     const std::int64_t iters = 2;
diff --git a/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/train_kernel_dense_batch_dpc.cpp b/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/train_kernel_dense_batch_dpc.cpp
index f193593b588..e24cdb02539 100644
--- a/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/train_kernel_dense_batch_dpc.cpp
+++ b/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/train_kernel_dense_batch_dpc.cpp
@@ -113,6 +113,10 @@ static train_result<Task> call_dal_kernel(const context_gpu& ctx,
         result.set_iterations_count(iter_num);
     }
 
+    if (options.test(result_options::inner_iterations_count)) {
+        result.set_inner_iterations_count(opt_impl->get_inner_iter());
+    }
+
     return result;
 }
 
diff --git a/cpp/oneapi/dal/algo/logistic_regression/backend/optimizer_impl.hpp b/cpp/oneapi/dal/algo/logistic_regression/backend/optimizer_impl.hpp
index 449a0cac61b..4fd6240f409 100644
--- a/cpp/oneapi/dal/algo/logistic_regression/backend/optimizer_impl.hpp
+++ b/cpp/oneapi/dal/algo/logistic_regression/backend/optimizer_impl.hpp
@@ -36,7 +36,14 @@ class optimizer_impl : public base {
     virtual double get_tol() = 0;
     virtual std::int64_t get_max_iter() = 0;
 
+    // Returns the number of inner iterations. The value is meaningful only for the newton_cg
+    // optimizer and only after its minimize method was called; this default returns -1
+    virtual std::int64_t get_inner_iter() {
+        return -1;
+    }
+
 #ifdef ONEDAL_DATA_PARALLEL
+
     virtual std::pair<sycl::event, std::int64_t> minimize(sycl::queue& q,
                                                           pr::base_function<float>& f,
                                                           pr::ndview<float, 1>& x,
diff --git a/cpp/oneapi/dal/algo/logistic_regression/common.cpp b/cpp/oneapi/dal/algo/logistic_regression/common.cpp
index 01a3a047187..d830dd3f441 100644
--- a/cpp/oneapi/dal/algo/logistic_regression/common.cpp
+++ b/cpp/oneapi/dal/algo/logistic_regression/common.cpp
@@ -34,6 +34,10 @@ result_option_id get_iterations_count_id() {
     return result_option_id{ result_option_id::make_by_index(2) };
 }
 
+result_option_id get_inner_iterations_count_id() {
+    return result_option_id{ result_option_id::make_by_index(3) }; // follows iterations_count (2)
+}
+
 template <typename Task>
 result_option_id get_default_result_options() {
     return result_option_id{};
diff --git a/cpp/oneapi/dal/algo/logistic_regression/common.hpp b/cpp/oneapi/dal/algo/logistic_regression/common.hpp
index f8dd60c6ef2..b817b5b1101 100644
--- a/cpp/oneapi/dal/algo/logistic_regression/common.hpp
+++ b/cpp/oneapi/dal/algo/logistic_regression/common.hpp
@@ -68,6 +68,7 @@ namespace detail {
 ONEDAL_EXPORT result_option_id get_intercept_id();
 ONEDAL_EXPORT result_option_id get_coefficients_id();
 ONEDAL_EXPORT result_option_id get_iterations_count_id();
+ONEDAL_EXPORT result_option_id get_inner_iterations_count_id();
 
 } // namespace detail
 
@@ -84,6 +85,9 @@ const inline result_option_id coefficients = detail::get_coefficients_id();
 /// Return the number of iterations made by optimizer
 const inline result_option_id iterations_count = detail::get_iterations_count_id();
 
+/// Return the number of subiterations made by optimizer. Only available for newton-cg optimizer
+const inline result_option_id inner_iterations_count = detail::get_inner_iterations_count_id();
+
 } // namespace result_options
 
 namespace detail {
diff --git a/cpp/oneapi/dal/algo/logistic_regression/detail/optimizer.cpp b/cpp/oneapi/dal/algo/logistic_regression/detail/optimizer.cpp
index 0ef343bbaec..b56df6bf0e9 100644
--- a/cpp/oneapi/dal/algo/logistic_regression/detail/optimizer.cpp
+++ b/cpp/oneapi/dal/algo/logistic_regression/detail/optimizer.cpp
@@ -32,7 +32,10 @@ namespace pr = be::primitives;
 
 class newton_cg_optimizer_impl : public optimizer_impl {
 public:
-    newton_cg_optimizer_impl(std::int64_t max_iter, double tol) : max_iter_(max_iter), tol_(tol) {}
+    newton_cg_optimizer_impl(std::int64_t max_iter, double tol)
+            : max_iter_(max_iter),
+              tol_(tol),
+              inner_iter_(0) {} // stays 0 until minimize_impl stores a value
 
     optimizer_type get_optimizer_type() override {
         return optimizer_type::newton_cg;
@@ -46,13 +49,22 @@ class newton_cg_optimizer_impl : public optimizer_impl {
         return max_iter_;
     }
 
+    // Inner iteration count stored by the last minimize_impl call; 0 before any call
+    std::int64_t get_inner_iter() override {
+        return inner_iter_;
+    }
+
 #ifdef ONEDAL_DATA_PARALLEL
+
     template <typename Float>
     std::pair<sycl::event, std::int64_t> minimize_impl(sycl::queue& q,
                                                        pr::base_function<Float>& f,
                                                        pr::ndview<Float, 1>& x,
                                                        const be::event_vector& deps = {}) {
-        return pr::newton_cg(q, f, x, Float(tol_), max_iter_, 200l, deps);
+        auto [opt_event, iter_num, inner_iter] =
+            pr::newton_cg(q, f, x, Float(tol_), max_iter_, 200L, deps);
+        inner_iter_ = inner_iter; // kept so that get_inner_iter() can report it afterwards
+        return { opt_event, iter_num };
     }
 
     std::pair<sycl::event, std::int64_t> minimize(sycl::queue& q,
@@ -73,6 +85,7 @@ class newton_cg_optimizer_impl : public optimizer_impl {
 private:
     std::int64_t max_iter_;
     double tol_;
+    std::int64_t inner_iter_;
 };
 
 template <typename F, typename M>
diff --git a/cpp/oneapi/dal/algo/logistic_regression/train_types.cpp b/cpp/oneapi/dal/algo/logistic_regression/train_types.cpp
index a0f88b94ad9..b6d9591b2c5 100644
--- a/cpp/oneapi/dal/algo/logistic_regression/train_types.cpp
+++ b/cpp/oneapi/dal/algo/logistic_regression/train_types.cpp
@@ -39,6 +39,7 @@ class train_result_impl : public base {
     table intercept;
     table coefficients;
     std::int64_t iter_cnt;
+    std::int64_t inner_iter_cnt;
 
     result_option_id options;
 
@@ -157,6 +158,24 @@ void train_result<Task>::set_iterations_count_impl(std::int64_t value) {
     impl_->iter_cnt = value;
 }
 
+template <typename Task>
+std::int64_t train_result<Task>::get_inner_iterations_count() const {
+    using msg = dal::detail::error_messages; // source of the error text thrown below
+    if (!get_result_options().test(result_options::inner_iterations_count)) {
+        throw domain_error(msg::this_result_is_not_enabled_via_result_options());
+    }
+    return impl_->inner_iter_cnt; // written by set_inner_iterations_count_impl
+}
+
+template <typename Task>
+void train_result<Task>::set_inner_iterations_count_impl(std::int64_t value) {
+    using msg = dal::detail::error_messages; // source of the error text thrown below
+    if (!get_result_options().test(result_options::inner_iterations_count)) {
+        throw domain_error(msg::this_result_is_not_enabled_via_result_options());
+    }
+    impl_->inner_iter_cnt = value; // read back by get_inner_iterations_count
+}
+
 template <typename Task>
 const table& train_result<Task>::get_coefficients() const {
     using msg = dal::detail::error_messages;
diff --git a/cpp/oneapi/dal/algo/logistic_regression/train_types.hpp b/cpp/oneapi/dal/algo/logistic_regression/train_types.hpp
index e39da6e2d1d..555e32f5cd9 100644
--- a/cpp/oneapi/dal/algo/logistic_regression/train_types.hpp
+++ b/cpp/oneapi/dal/algo/logistic_regression/train_types.hpp
@@ -150,6 +150,14 @@ class train_result {
         return *this;
     }
 
+    /// Number of optimizer subiterations (requires the inner_iterations_count result option)
+    std::int64_t get_inner_iterations_count() const;
+
+    auto& set_inner_iterations_count(std::int64_t value) {
+        set_inner_iterations_count_impl(value);
+        return *this;
+    }
+
     /// Table of Logistic Regression coefficients and intercept
     const table& get_packed_coefficients() const;
 
@@ -173,6 +181,7 @@ class train_result {
     void set_coefficients_impl(const table&);
     void set_packed_coefficients_impl(const table&);
     void set_iterations_count_impl(std::int64_t);
+    void set_inner_iterations_count_impl(std::int64_t);
 
     void set_result_options_impl(const result_option_id&);
 
diff --git a/cpp/oneapi/dal/algo/pca/backend/cpu/finalize_train_kernel_cov.cpp b/cpp/oneapi/dal/algo/pca/backend/cpu/finalize_train_kernel_cov.cpp
index e59c44d53b1..ffe447ec5d2 100644
--- a/cpp/oneapi/dal/algo/pca/backend/cpu/finalize_train_kernel_cov.cpp
+++ b/cpp/oneapi/dal/algo/pca/backend/cpu/finalize_train_kernel_cov.cpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2023 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,6 +15,8 @@
 * limitations under the License.
 *******************************************************************************/
 
+#include <daal/include/services/daal_defines.h>
+
 #include <daal/src/algorithms/pca/pca_dense_correlation_online_kernel.h>
 #include <daal/src/algorithms/covariance/covariance_hyperparameter_impl.h>
 #include "daal/src/algorithms/covariance/covariance_kernel.h"
@@ -26,6 +29,12 @@
 #include "oneapi/dal/backend/interop/table_conversion.hpp"
 #include "oneapi/dal/table/row_accessor.hpp"
 
+#if defined(TARGET_X86_64)
+#define CPU_EXTENSION dal::detail::cpu_extension::avx512
+#elif defined(TARGET_ARM)
+#define CPU_EXTENSION dal::detail::cpu_extension::sve
+#endif
+
 namespace oneapi::dal::pca::backend {
 
 using dal::backend::context_cpu;
@@ -84,7 +93,7 @@ static train_result<Task> call_daal_kernel_finalize_train(const context_cpu& ctx
     /// the logic of block size calculation is copied from DAAL,
     /// to be changed to passing the values from the performance model
     std::int64_t blockSize = 140;
-    if (ctx.get_enabled_cpu_extensions() == dal::detail::cpu_extension::avx512) {
+    if (ctx.get_enabled_cpu_extensions() == CPU_EXTENSION) {
         if (5000 < row_count && row_count <= 50000) {
             blockSize = 1024;
         }
diff --git a/cpp/oneapi/dal/algo/pca/backend/cpu/partial_train_kernel_cov.cpp b/cpp/oneapi/dal/algo/pca/backend/cpu/partial_train_kernel_cov.cpp
index 8400fd918fc..dff5d1a3a2a 100644
--- a/cpp/oneapi/dal/algo/pca/backend/cpu/partial_train_kernel_cov.cpp
+++ b/cpp/oneapi/dal/algo/pca/backend/cpu/partial_train_kernel_cov.cpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2023 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,6 +15,9 @@
 * limitations under the License.
 *******************************************************************************/
 
+#include <daal/include/services/daal_defines.h>
+
+#include <daal/src/algorithms/pca/pca_dense_correlation_online_kernel.h>
 #include <daal/src/algorithms/covariance/covariance_hyperparameter_impl.h>
 #include "daal/src/algorithms/covariance/covariance_kernel.h"
 
@@ -25,6 +29,12 @@
 #include "oneapi/dal/backend/interop/table_conversion.hpp"
 #include "oneapi/dal/table/row_accessor.hpp"
 
+#if defined(TARGET_X86_64)
+#define CPU_EXTENSION dal::detail::cpu_extension::avx512
+#elif defined(TARGET_ARM)
+#define CPU_EXTENSION dal::detail::cpu_extension::sve
+#endif
+
 namespace oneapi::dal::pca::backend {
 
 using dal::backend::context_cpu;
@@ -66,7 +76,7 @@ static partial_train_result<task_t> call_daal_kernel_partial_train(
     /// the logic of block size calculation is copied from DAAL,
     /// to be changed to passing the values from the performance model
     std::int64_t blockSize = 140;
-    if (ctx.get_enabled_cpu_extensions() == dal::detail::cpu_extension::avx512) {
+    if (ctx.get_enabled_cpu_extensions() == CPU_EXTENSION) {
         const std::int64_t row_count = data.get_row_count();
         if (5000 < row_count && row_count <= 50000) {
             blockSize = 1024;
diff --git a/cpp/oneapi/dal/algo/pca/backend/cpu/train_kernel_cov.cpp b/cpp/oneapi/dal/algo/pca/backend/cpu/train_kernel_cov.cpp
index ef12d49fe74..63b364f7e6b 100644
--- a/cpp/oneapi/dal/algo/pca/backend/cpu/train_kernel_cov.cpp
+++ b/cpp/oneapi/dal/algo/pca/backend/cpu/train_kernel_cov.cpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2020 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,6 +15,8 @@
 * limitations under the License.
 *******************************************************************************/
 
+#include <daal/include/services/daal_defines.h>
+
 #include <daal/src/algorithms/pca/pca_dense_correlation_batch_kernel.h>
 #include <daal/src/algorithms/covariance/covariance_hyperparameter_impl.h>
 
@@ -25,6 +28,12 @@
 #include "oneapi/dal/backend/interop/table_conversion.hpp"
 #include "oneapi/dal/table/row_accessor.hpp"
 
+#if defined(TARGET_X86_64)
+#define CPU_EXTENSION dal::detail::cpu_extension::avx512
+#elif defined(TARGET_ARM)
+#define CPU_EXTENSION dal::detail::cpu_extension::sve
+#endif
+
 namespace oneapi::dal::pca::backend {
 
 using dal::backend::context_cpu;
@@ -83,7 +92,8 @@ static result_t call_daal_kernel(const context_cpu& ctx,
     /// the logic of block size calculation is copied from DAAL,
     /// to be changed to passing the values from the performance model
     std::int64_t blockSize = 140;
-    if (ctx.get_enabled_cpu_extensions() == dal::detail::cpu_extension::avx512) {
+    if (ctx.get_enabled_cpu_extensions() == CPU_EXTENSION) {
+        const std::int64_t row_count = data.get_row_count();
         if (5000 < row_count && row_count <= 50000) {
             blockSize = 1024;
         }
diff --git a/cpp/oneapi/dal/algo/subgraph_isomorphism/backend/cpu/compiler_adapt.hpp b/cpp/oneapi/dal/algo/subgraph_isomorphism/backend/cpu/compiler_adapt.hpp
index 181dc2f31bc..f98462bc963 100644
--- a/cpp/oneapi/dal/algo/subgraph_isomorphism/backend/cpu/compiler_adapt.hpp
+++ b/cpp/oneapi/dal/algo/subgraph_isomorphism/backend/cpu/compiler_adapt.hpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2021 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,7 +17,13 @@
 
 #pragma once
 #include <cstdint>
+
+#include <daal/include/services/daal_defines.h>
+
+#if defined(TARGET_X86_64)
 #include <immintrin.h>
+#endif
+
 #include <daal/src/services/service_defines.h>
 #include "oneapi/dal/backend/dispatcher.hpp"
 
@@ -83,6 +90,7 @@ ONEDAL_FORCEINLINE std::int32_t ONEDAL_popcnt64(std::uint64_t a) {
 #endif
 }
 
+#if defined(TARGET_X86_64)
 template <>
 ONEDAL_FORCEINLINE std::int32_t ONEDAL_lzcnt_u32<dal::backend::cpu_dispatch_sse2>(std::uint32_t a) {
     if (a == 0)
@@ -164,5 +172,20 @@ ONEDAL_FORCEINLINE std::int32_t ONEDAL_popcnt64<dal::backend::cpu_dispatch_avx2>
     }
     return bit_cnt;
 }
+#elif defined(TARGET_ARM)
+template <>
+ONEDAL_FORCEINLINE std::int32_t ONEDAL_lzcnt_u32<dal::backend::cpu_dispatch_sve>(std::uint32_t a) {
+    return a == 0 ? 32 : __builtin_clz(a); // __builtin_clz(0) is undefined; report the full width
+}
 
+template <>
+ONEDAL_FORCEINLINE std::int32_t ONEDAL_lzcnt_u64<dal::backend::cpu_dispatch_sve>(std::uint64_t a) {
+    return a == 0 ? 64 : __builtin_clzll(a); // clzll: 64-bit operand on every ABI; clz of 0 is undefined
+}
+
+template <>
+ONEDAL_FORCEINLINE std::int32_t ONEDAL_popcnt64<dal::backend::cpu_dispatch_sve>(std::uint64_t a) {
+    return __builtin_popcountll(a); // popcountll: counts all 64 bits even where long is 32-bit
+}
+#endif
 } // namespace oneapi::dal::preview::subgraph_isomorphism::backend
diff --git a/cpp/oneapi/dal/algo/triangle_counting/backend/cpu/intersection_tc.hpp b/cpp/oneapi/dal/algo/triangle_counting/backend/cpu/intersection_tc.hpp
index cfbdc57231d..935e7057332 100644
--- a/cpp/oneapi/dal/algo/triangle_counting/backend/cpu/intersection_tc.hpp
+++ b/cpp/oneapi/dal/algo/triangle_counting/backend/cpu/intersection_tc.hpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2020 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,7 +17,9 @@
 
 #pragma once
 
+#if defined(TARGET_X86_64)
 #include <immintrin.h>
+#endif
 
 #include <daal/src/services/service_defines.h>
 
@@ -68,6 +71,7 @@ ONEDAL_FORCEINLINE std::int32_t _popcnt32_redef(const std::int32_t& x) {
     {}
 #endif
 
+#if defined(TARGET_X86_64)
 template <>
 struct intersection_local_tc<dal::backend::cpu_dispatch_avx512> {
     ONEDAL_FORCEINLINE std::int64_t operator()(const std::int32_t* neigh_u,
@@ -419,5 +423,6 @@ struct intersection_local_tc<dal::backend::cpu_dispatch_avx512> {
         return total;
     }
 };
+#endif
 
 } // namespace oneapi::dal::preview::triangle_counting::backend
diff --git a/cpp/oneapi/dal/backend/dispatcher.cpp b/cpp/oneapi/dal/backend/dispatcher.cpp
index 1a5e39e95dd..69974fabdbb 100644
--- a/cpp/oneapi/dal/backend/dispatcher.cpp
+++ b/cpp/oneapi/dal/backend/dispatcher.cpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2020 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -37,17 +38,25 @@ void context_cpu::global_init() {
 inline constexpr detail::cpu_extension from_daal_cpu_type(daal::CpuType cpu) {
     using detail::cpu_extension;
     switch (cpu) {
+#if defined(TARGET_X86_64)
         case daal::sse2: return cpu_extension::sse2;
         case daal::sse42: return cpu_extension::sse42;
         case daal::avx2: return cpu_extension::avx2;
         case daal::avx512: return cpu_extension::avx512;
+#elif defined(TARGET_ARM)
+        case daal::sve: return cpu_extension::sve;
+#endif
     }
     return cpu_extension::none;
 }
 
 detail::cpu_extension detect_top_cpu_extension() {
     if (!__daal_serv_cpu_extensions_available()) {
+#if defined(TARGET_X86_64)
         return detail::cpu_extension::sse2;
+#elif defined(TARGET_ARM)
+        return detail::cpu_extension::sve;
+#endif
     }
     const auto daal_cpu = (daal::CpuType)__daal_serv_cpu_detect(0);
     return from_daal_cpu_type(daal_cpu);
diff --git a/cpp/oneapi/dal/backend/dispatcher.hpp b/cpp/oneapi/dal/backend/dispatcher.hpp
index 5325cb2efab..7737f214ebf 100644
--- a/cpp/oneapi/dal/backend/dispatcher.hpp
+++ b/cpp/oneapi/dal/backend/dispatcher.hpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2020 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,6 +17,8 @@
 
 #pragma once
 
+#include "daal/include/services/daal_defines.h"
+
 #include "oneapi/dal/detail/policy.hpp"
 #include "oneapi/dal/detail/spmd_policy.hpp"
 
@@ -38,11 +41,16 @@ namespace oneapi::dal::backend {
 
 detail::cpu_extension detect_top_cpu_extension();
 
+#if defined(TARGET_X86_64)
 struct cpu_dispatch_sse2 {};
 struct cpu_dispatch_sse42 {};
 struct cpu_dispatch_avx2 {};
 struct cpu_dispatch_avx512 {};
+#elif defined(TARGET_ARM)
+struct cpu_dispatch_sve {};
+#endif
 
+#if defined(TARGET_X86_64)
 using cpu_dispatch_default = cpu_dispatch_sse2;
 
 #define __CPU_TAG_SSE2__    oneapi::dal::backend::cpu_dispatch_sse2
@@ -51,6 +59,13 @@ using cpu_dispatch_default = cpu_dispatch_sse2;
 #define __CPU_TAG_AVX512__  oneapi::dal::backend::cpu_dispatch_avx512
 #define __CPU_TAG_DEFAULT__ oneapi::dal::backend::cpu_dispatch_default
 
+#elif defined(TARGET_ARM)
+using cpu_dispatch_default = cpu_dispatch_sve;
+
+#define __CPU_TAG_ARMV8SVE__ oneapi::dal::backend::cpu_dispatch_sve
+
+#endif
+
 template <typename MemoryAccessKind>
 class communicator_provider : public base {
 public:
@@ -279,6 +294,8 @@ inline constexpr auto dispatch_by_cpu(const context_cpu& ctx, Op&& op) {
     using detail::cpu_extension;
 
     [[maybe_unused]] const cpu_extension cpu_ex = ctx.get_enabled_cpu_extensions();
+
+#if defined(TARGET_X86_64)
     ONEDAL_IF_CPU_DISPATCH_AVX512(if (test_cpu_extension(cpu_ex, cpu_extension::avx512)) {
         return op(cpu_dispatch_avx512{});
     })
@@ -286,6 +303,12 @@ inline constexpr auto dispatch_by_cpu(const context_cpu& ctx, Op&& op) {
         if (test_cpu_extension(cpu_ex, cpu_extension::avx2)) { return op(cpu_dispatch_avx2{}); })
     ONEDAL_IF_CPU_DISPATCH_SSE42(
         if (test_cpu_extension(cpu_ex, cpu_extension::sse42)) { return op(cpu_dispatch_sse42{}); })
+
+#elif defined(TARGET_ARM)
+    ONEDAL_IF_CPU_DISPATCH_A8SVE(
+        if (test_cpu_extension(cpu_ex, cpu_extension::sve)) { return op(cpu_dispatch_sve{}); })
+#endif
+
     return op(cpu_dispatch_default{});
 }
 
diff --git a/cpp/oneapi/dal/backend/dispatcher_cpu.hpp b/cpp/oneapi/dal/backend/dispatcher_cpu.hpp
index ef93e796f4b..ca7c92e0d38 100644
--- a/cpp/oneapi/dal/backend/dispatcher_cpu.hpp
+++ b/cpp/oneapi/dal/backend/dispatcher_cpu.hpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2020 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,14 +17,21 @@
 
 #pragma once
 
+#include <daal/include/services/daal_defines.h>
+
 #ifdef __ONEDAL_IDE_MODE__
 // If this file is openned in IDE it will complain about
 // `_onedal_dispatcher_cpu.hpp` as this file is generated at build time.
 // It's recommended to define __ONEDAL_IDE_MODE__ in your IDE settings to
 // enable this branch for preprocessor.
+
+#if defined(TARGET_X86_64)
 #define ONEDAL_CPU_DISPATCH_SSE42
 #define ONEDAL_CPU_DISPATCH_AVX2
 #define ONEDAL_CPU_DISPATCH_AVX512
+#elif defined(TARGET_ARM)
+#define ONEDAL_CPU_DISPATCH_A8SVE
+#endif
 #else
 // This file is automatically generated by build system
 #include "oneapi/dal/_dal_cpu_dispatcher_gen.hpp"
@@ -46,3 +54,9 @@
 #else
 #define ONEDAL_IF_CPU_DISPATCH_AVX512(x)
 #endif
+
+#ifdef ONEDAL_CPU_DISPATCH_A8SVE
+#define ONEDAL_IF_CPU_DISPATCH_A8SVE(x) x
+#else
+#define ONEDAL_IF_CPU_DISPATCH_A8SVE(x)
+#endif
diff --git a/cpp/oneapi/dal/backend/interop/common.hpp b/cpp/oneapi/dal/backend/interop/common.hpp
index c9d6652396c..82a2d77700b 100644
--- a/cpp/oneapi/dal/backend/interop/common.hpp
+++ b/cpp/oneapi/dal/backend/interop/common.hpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2020 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -30,6 +31,7 @@ struct daal_cpu_value {
     constexpr static daal::CpuType value = cpu;
 };
 
+#if defined(TARGET_X86_64)
 template <>
 struct to_daal_cpu_type<cpu_dispatch_default> : daal_cpu_value<daal::sse2> {};
 template <>
@@ -39,6 +41,12 @@ struct to_daal_cpu_type<cpu_dispatch_avx2> : daal_cpu_value<daal::avx2> {};
 template <>
 struct to_daal_cpu_type<cpu_dispatch_avx512> : daal_cpu_value<daal::avx512> {};
 
+#elif defined(TARGET_ARM)
+template <>
+struct to_daal_cpu_type<cpu_dispatch_sve> : daal_cpu_value<daal::sve> {};
+
+#endif
+
 template <typename Float, template <typename, daal::CpuType> typename CpuKernel, typename... Args>
 inline auto call_daal_kernel(const context_cpu& ctx, Args&&... args) {
     return dal::backend::dispatch_by_cpu(ctx, [&](auto cpu) {
diff --git a/cpp/oneapi/dal/backend/interop/table_conversion.hpp b/cpp/oneapi/dal/backend/interop/table_conversion.hpp
index b1fca83cd3c..d68f1f179c6 100644
--- a/cpp/oneapi/dal/backend/interop/table_conversion.hpp
+++ b/cpp/oneapi/dal/backend/interop/table_conversion.hpp
@@ -20,6 +20,9 @@
 #include <daal/include/data_management/data/internal/numeric_table_sycl_homogen.h>
 #endif
 
+#include <daal/include/services/env_detect.h>
+
+#include "daal/src/data_management/service_numeric_table.h"
 #include "oneapi/dal/backend/memory.hpp"
 #include "oneapi/dal/table/detail/table_builder.hpp"
 #include "oneapi/dal/table/backend/interop/sycl_table_adapter.hpp"
@@ -122,7 +125,11 @@ inline daal::data_management::NumericTablePtr wrap_by_host_soa_adapter(const hom
 }
 
 template <typename Data>
-inline daal::data_management::NumericTablePtr convert_to_daal_table(const homogen_table& table) {
+inline daal::data_management::NumericTablePtr convert_to_daal_table(const homogen_table& table,
+                                                                    bool need_copy = false) {
+    if (need_copy) {
+        return copy_to_daal_homogen_table<Data>(table);
+    }
     if (table.get_data_layout() == data_layout::row_major) {
         if (auto wrapper = wrap_by_host_homogen_adapter(table)) {
             return wrapper;
@@ -143,6 +150,7 @@ inline auto convert_to_daal_csr_table(array<T>& data,
                                       std::int64_t row_count,
                                       std::int64_t column_count,
                                       bool allow_copy = false) {
+    using daal::services::Status;
     ONEDAL_ASSERT(data.get_count() == column_indices.get_count());
     ONEDAL_ASSERT(row_indices.get_count() == row_count + 1);
 
@@ -170,12 +178,17 @@ inline auto convert_to_daal_csr_table(array<T>& data,
         reinterpret_cast<std::size_t*>(row_indices.get_mutable_data()),
         daal_object_owner{ row_indices });
 
-    return daal::data_management::CSRNumericTable::create(
+    Status status;
+    const auto table = daal::data_management::CSRNumericTable::create(
         daal_data,
         daal_column_indices,
         daal_row_indices,
         dal::detail::integral_cast<std::size_t>(column_count),
-        dal::detail::integral_cast<std::size_t>(row_count));
+        dal::detail::integral_cast<std::size_t>(row_count),
+        daal::data_management::CSRNumericTable::CSRIndexing::oneBased,
+        &status);
+    status_to_exception(status);
+    return table;
 }
 
 template <typename Float>
@@ -222,25 +235,22 @@ inline daal::data_management::CSRNumericTablePtr wrap_by_host_csr_adapter(const
 }
 
 template <typename Float>
-inline daal::data_management::CSRNumericTablePtr convert_to_daal_table(const csr_table& table) {
+inline daal::data_management::CSRNumericTablePtr convert_to_daal_table(const csr_table& table,
+                                                                       bool need_copy = false) {
     auto wrapper = wrap_by_host_csr_adapter(table);
-    if (!wrapper) {
-        return copy_to_daal_csr_table<Float>(table);
-    }
-    else {
-        return wrapper;
-    }
+    return need_copy || !wrapper ? copy_to_daal_csr_table<Float>(table) : wrapper;
 }
 
 template <typename Data>
-inline daal::data_management::NumericTablePtr convert_to_daal_table(const table& table) {
+inline daal::data_management::NumericTablePtr convert_to_daal_table(const table& table,
+                                                                    bool need_copy = false) {
     if (table.get_kind() == homogen_table::kind()) {
         const auto& homogen = static_cast<const homogen_table&>(table);
-        return convert_to_daal_table<Data>(homogen);
+        return convert_to_daal_table<Data>(homogen, need_copy);
     }
     else if (table.get_kind() == csr_table::kind()) {
         const auto& csr = static_cast<const csr_table&>(table);
-        return convert_to_daal_table<Data>(csr);
+        return convert_to_daal_table<Data>(csr, need_copy);
     }
     else {
         return copy_to_daal_homogen_table<Data>(table);
diff --git a/cpp/oneapi/dal/backend/memory.hpp b/cpp/oneapi/dal/backend/memory.hpp
index 5f9e4bb71ff..4579af7fc9e 100644
--- a/cpp/oneapi/dal/backend/memory.hpp
+++ b/cpp/oneapi/dal/backend/memory.hpp
@@ -392,7 +392,7 @@ inline sycl::event copy_all2all(sycl::queue& queue,
         event = memcpy_host2usm(queue, dest, src, sizeof(T) * n, deps);
     }
     else {
-        copy(dest, src, sizeof(T) * n);
+        memcpy(dest, src, sizeof(T) * n);
     }
     return event;
 }
diff --git a/cpp/oneapi/dal/backend/micromkl/macro.hpp b/cpp/oneapi/dal/backend/micromkl/macro.hpp
index 3cd555e78cb..b46910ce6e5 100644
--- a/cpp/oneapi/dal/backend/micromkl/macro.hpp
+++ b/cpp/oneapi/dal/backend/micromkl/macro.hpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2021 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,6 +17,8 @@
 
 #pragma once
 
+#include <daal/include/services/daal_defines.h>
+
 #ifndef __MICROMKL_INCLUDE_GUARD__
 #error "This header cannot be included outside of micromkl module"
 #endif
@@ -50,8 +53,12 @@
     FUNC_CPU_DECL(nominal_cpu, prefix, name, argdecl)                     \
     DISPATCH_FUNC_CPU(nominal_cpu, actual_cpu, prefix, name, argdecl, argcall)
 
+#if defined(TARGET_X86_64)
 #define FUNC_AVX512(...) EXPAND(FUNC_CPU(avx512, avx512, __VA_ARGS__))
 #define FUNC_AVX2(...)   EXPAND(FUNC_CPU(avx2, avx2, __VA_ARGS__))
+#elif defined(TARGET_ARM)
+#define FUNC_A8SVE(...) EXPAND(FUNC_CPU(sve, sve, __VA_ARGS__))
+#endif
 
 #ifdef __APPLE__
 #define FUNC_SSE42(...) EXPAND(FUNC_CPU(sse42, avx2, __VA_ARGS__))
@@ -61,12 +68,18 @@
 #define FUNC_SSE2(...)  EXPAND(FUNC_CPU(sse2, sse2, __VA_ARGS__))
 #endif
 
+#if defined(TARGET_X86_64)
 #define FUNC(prefix, name, argdecl, argcall)    \
     DISPATCH_FUNC_DECL(prefix, name, argdecl)   \
     FUNC_AVX512(prefix, name, argdecl, argcall) \
     FUNC_AVX2(prefix, name, argdecl, argcall)   \
     FUNC_SSE42(prefix, name, argdecl, argcall)  \
     FUNC_SSE2(prefix, name, argdecl, argcall)
+#elif defined(TARGET_ARM)
+#define FUNC(prefix, name, argdecl, argcall)  \
+    DISPATCH_FUNC_DECL(prefix, name, argdecl) \
+    FUNC_A8SVE(prefix, name, argdecl, argcall)
+#endif
 
 #ifdef ONEDAL_REF
 #define FUNC_DECL(prefix, floatabr, name, argdecl, argcall) \
@@ -83,6 +96,12 @@
 #define INSTANTIATE_CPU(cpu, name, Float, argdecl) \
     template void name<DISPATCH_ID_NAME(cpu), Float> argdecl(Float);
 
+#ifdef ONEDAL_CPU_DISPATCH_A8SVE
+#define INSTANTIATE_A8SVE(...) EXPAND(INSTANTIATE_CPU(sve, __VA_ARGS__))
+#else
+#define INSTANTIATE_A8SVE(...)
+#endif
+
 #ifdef ONEDAL_CPU_DISPATCH_AVX512
 #define INSTANTIATE_AVX512(...) EXPAND(INSTANTIATE_CPU(avx512, __VA_ARGS__))
 #else
@@ -103,11 +122,15 @@
 
 #define INSTANTIATE_SSE2(...) EXPAND(INSTANTIATE_CPU(sse2, __VA_ARGS__))
 
+#if defined(TARGET_X86_64)
 #define INSTANTIATE_FLOAT(name, Float, argdecl) \
     INSTANTIATE_AVX512(name, Float, argdecl)    \
     INSTANTIATE_AVX2(name, Float, argdecl)      \
     INSTANTIATE_SSE42(name, Float, argdecl)     \
     INSTANTIATE_SSE2(name, Float, argdecl)
+#elif defined(TARGET_ARM)
+#define INSTANTIATE_FLOAT(name, Float, argdecl) INSTANTIATE_A8SVE(name, Float, argdecl)
+#endif
 
 #define FUNC_TEMPLATE(prefix, name, fargdecl, cargdecl, fargcall, cargcall) \
     FUNC_DECL(prefix, s, name, fargdecl(float), fargcall)                   \
diff --git a/cpp/oneapi/dal/backend/primitives/intersection/intersection.hpp b/cpp/oneapi/dal/backend/primitives/intersection/intersection.hpp
index b8dcc175838..341a0b7637e 100644
--- a/cpp/oneapi/dal/backend/primitives/intersection/intersection.hpp
+++ b/cpp/oneapi/dal/backend/primitives/intersection/intersection.hpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2020 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,7 +17,9 @@
 
 #pragma once
 
+#if defined(TARGET_X86_64)
 #include <immintrin.h>
+#endif
 
 #include <daal/src/services/service_defines.h>
 
@@ -62,6 +65,7 @@ ONEDAL_FORCEINLINE std::int32_t _popcnt32_redef(const std::int32_t &x) {
     {}
 #endif
 
+#if defined(TARGET_X86_64)
 template <>
 ONEDAL_FORCEINLINE std::int64_t intersection<dal::backend::cpu_dispatch_avx512>(
     const std::int32_t *neigh_u,
@@ -569,5 +573,6 @@ ONEDAL_FORCEINLINE std::int64_t intersection<dal::backend::cpu_dispatch_avx2>(
     }
     return total;
 }
+#endif
 
 } // namespace oneapi::dal::preview::backend
diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
index a18a727b163..39cae7db796 100644
--- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
+++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp
@@ -291,6 +291,18 @@ class logloss_test : public te::float_algo_fixture<Param> {
         }
     }
 
+    float_t clip_prob(float_t prob) {
+        constexpr float_t bottom = sizeof(float_t) > 4 ? 1e-15 : 1e-7;
+        constexpr float_t top = float_t(1.0) - bottom;
+        if (prob < bottom) {
+            prob = bottom;
+        }
+        if (prob > top) {
+            prob = top;
+        }
+        return prob;
+    }
+
     float_t test_predictions_and_logloss(const ndview<float_t, 2>& data_host,
                                          const ndview<float_t, 1>& params_host,
                                          const ndview<std::int32_t, 1>& labels_host,
@@ -313,7 +325,7 @@ class logloss_test : public te::float_algo_fixture<Param> {
             if (fit_intercept) {
                 pred += params_host.at(0);
             }
-            float_t prob = 1 / (1 + std::exp(-pred));
+            float_t prob = clip_prob(float_t(1.0) / (1 + std::exp(-pred)));
             logloss -=
                 labels_host.at(i) * std::log(prob) + (1 - labels_host.at(i)) * std::log(1 - prob);
             float_t out_val = probabilities.at(i);
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg.hpp
index 21516511acc..1035811798d 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg.hpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg.hpp
@@ -25,12 +25,12 @@ namespace oneapi::dal::backend::primitives {
 // pp. 168 (also known as the truncated Newton method)
 // https://link.springer.com/book/10.1007/978-0-387-40065-5
 template <typename Float>
-std::pair<sycl::event, std::int64_t> newton_cg(sycl::queue& queue,
-                                               base_function<Float>& f,
-                                               ndview<Float, 1>& x,
-                                               Float tol = 1.0e-5,
-                                               std::int64_t maxiter = 100l,
-                                               std::int64_t maxinner = 200l,
-                                               const event_vector& deps = {});
+std::tuple<sycl::event, std::int64_t, std::int64_t> newton_cg(sycl::queue& queue,
+                                                              base_function<Float>& f,
+                                                              ndview<Float, 1>& x,
+                                                              Float tol = 1.0e-5,
+                                                              std::int64_t maxiter = 100l,
+                                                              std::int64_t maxinner = 200l,
+                                                              const event_vector& deps = {});
 
 } // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg_dpc.cpp
index f5acb90a3d7..b0c84ae0727 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg_dpc.cpp
@@ -27,13 +27,13 @@
 namespace oneapi::dal::backend::primitives {
 
 template <typename Float>
-std::pair<sycl::event, std::int64_t> newton_cg(sycl::queue& queue,
-                                               base_function<Float>& f,
-                                               ndview<Float, 1>& x,
-                                               Float tol,
-                                               std::int64_t maxiter,
-                                               std::int64_t maxinner,
-                                               const event_vector& deps) {
+std::tuple<sycl::event, std::int64_t, std::int64_t> newton_cg(sycl::queue& queue,
+                                                              base_function<Float>& f,
+                                                              ndview<Float, 1>& x,
+                                                              Float tol,
+                                                              std::int64_t maxiter,
+                                                              std::int64_t maxinner,
+                                                              const event_vector& deps) {
     ONEDAL_PROFILER_TASK(newton_cg, queue);
     std::int64_t n = x.get_dimension(0);
 
@@ -55,6 +55,7 @@ std::pair<sycl::event, std::int64_t> newton_cg(sycl::queue& queue,
     Float update_norm = tol + 1;
 
     std::int64_t cur_iter_id = 0;
+    std::int64_t inner_iter_sum = 0;
     while (cur_iter_id < maxiter) {
         cur_iter_id++;
         auto update_event_vec = f.update_x(x, true, last_iter_deps);
@@ -98,6 +99,7 @@ std::pair<sycl::event, std::int64_t> newton_cg(sycl::queue& queue,
                                                       Float(0),
                                                       maxinner,
                                                       { last_event });
+            inner_iter_sum += inner_iter;
 
             // <-grad, direction> should be > 0 if direction is descent direction
             last_event = dot_product(queue, gradient, direction, tmp_gpu, &desc, { solve_event });
@@ -106,7 +108,7 @@ std::pair<sycl::event, std::int64_t> newton_cg(sycl::queue& queue,
 
         if (desc < 0) {
             // failed to find descent direction
-            return { last_event, cur_iter_id };
+            return std::make_tuple(last_event, cur_iter_id, inner_iter_sum);
         }
 
         Float alpha_opt = backtracking(queue,
@@ -127,17 +129,18 @@ std::pair<sycl::event, std::int64_t> newton_cg(sycl::queue& queue,
         last = copy(queue, x, buffer2, {});
         last_iter_deps = { last };
     }
-    return { last, cur_iter_id };
+    return std::make_tuple(last, cur_iter_id, inner_iter_sum);
 }
 
-#define INSTANTIATE(F)                                                            \
-    template std::pair<sycl::event, std::int64_t> newton_cg<F>(sycl::queue&,      \
-                                                               base_function<F>&, \
-                                                               ndview<F, 1>&,     \
-                                                               F,                 \
-                                                               std::int64_t,      \
-                                                               std::int64_t,      \
-                                                               const event_vector&);
+#define INSTANTIATE(F)                                                         \
+    template std::tuple<sycl::event, std::int64_t, std::int64_t> newton_cg<F>( \
+        sycl::queue&,                                                          \
+        base_function<F>&,                                                     \
+        ndview<F, 1>&,                                                         \
+        F,                                                                     \
+        std::int64_t,                                                          \
+        std::int64_t,                                                          \
+        const event_vector&);
 
 INSTANTIATE(float);
 INSTANTIATE(double);
diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
index fa045d41142..914bda60f1f 100644
--- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp
@@ -90,13 +90,13 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
             logloss_function<float_t>(this->get_queue(), data, y_gpu, 3.0, true, bsz);
         auto [solution_, fill_e] =
             ndarray<float_t, 1>::zeros(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::device);
-        auto [opt_event, num_iter] = newton_cg(this->get_queue(),
-                                               logloss_func,
-                                               solution_,
-                                               float_t(1e-8),
-                                               100l,
-                                               200l,
-                                               { fill_e });
+        auto [opt_event, num_iter, inner_iter] = newton_cg(this->get_queue(),
+                                                           logloss_func,
+                                                           solution_,
+                                                           float_t(1e-8),
+                                                           100l,
+                                                           200l,
+                                                           { fill_e });
         opt_event.wait_and_throw();
         auto solution_host = solution_.to_host(this->get_queue());
 
@@ -200,7 +200,7 @@ class newton_cg_test : public te::float_algo_fixture<Param> {
             ndarray<float_t, 1>::zeros(this->get_queue(), { n_ }, sycl::usm::alloc::device);
 
         float_t conv_tol = sizeof(float_t) == 4 ? 1e-7 : 1e-14;
-        auto [opt_event, num_iter] =
+        auto [opt_event, num_iter, inner_iter] =
             newton_cg(this->get_queue(), *func_, x, conv_tol, 100, 200l, { x_event });
         opt_event.wait_and_throw();
         auto x_host = x.to_host(this->get_queue());
diff --git a/cpp/oneapi/dal/backend/primitives/reduction/reduction.hpp b/cpp/oneapi/dal/backend/primitives/reduction/reduction.hpp
index 7eeaf45cc26..5ce78c5f598 100644
--- a/cpp/oneapi/dal/backend/primitives/reduction/reduction.hpp
+++ b/cpp/oneapi/dal/backend/primitives/reduction/reduction.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include "oneapi/dal/table/csr.hpp"
 #include "oneapi/dal/backend/primitives/common.hpp"
 #include "oneapi/dal/backend/primitives/ndarray.hpp"
 #include "oneapi/dal/backend/primitives/reduction/functors.hpp"
@@ -107,6 +108,61 @@ inline sycl::event reduce_by_columns(sycl::queue& q,
     return reduce_by_columns_impl(q, input, output, binary, unary, deps, override_init);
 }
 
+template <typename Float, typename BinaryOp, typename UnaryOp>
+sycl::event reduce_by_rows_impl(sycl::queue& q,
+                                const ndview<Float, 1>& values,
+                                const ndview<std::int64_t, 1>& column_indices,
+                                const ndview<std::int64_t, 1>& row_offsets,
+                                const dal::sparse_indexing indexing,
+                                ndview<Float, 1>& output,
+                                const BinaryOp& binary,
+                                const UnaryOp& unary,
+                                const event_vector& deps,
+                                bool override_init = true);
+
+/// Reduces the rows of a CSR-format input and puts the result into `output`
+///
+/// @tparam Float       Floating-point type used to perform computations
+/// @tparam BinaryOp    Type of binary operator functor
+/// @tparam UnaryOp     Type of unary operator functor
+///
+/// @param[in] queue            SYCL queue
+/// @param[in] values           An input of values array in CSR format
+/// @param[in] column_indices   An input of column indices array in CSR format
+/// @param[in] row_offsets      An input of row offsets array in CSR format
+/// @param[in] indexing         CSR indexing type. It can be `one_based` or `zero_based`
+/// @param[out] output          The result of reduction
+/// @param[in] deps             A vector of `sycl::event`s that represents list of dependencies
+template <typename Float, typename BinaryOp, typename UnaryOp>
+inline sycl::event reduce_by_rows(sycl::queue& q,
+                                  const ndview<Float, 1>& values,
+                                  const ndview<std::int64_t, 1>& column_indices,
+                                  const ndview<std::int64_t, 1>& row_offsets,
+                                  const dal::sparse_indexing indexing,
+                                  ndview<Float, 1>& output,
+                                  const BinaryOp& binary = BinaryOp{},
+                                  const UnaryOp& unary = UnaryOp{},
+                                  const event_vector& deps = {},
+                                  bool override_init = true) {
+    ONEDAL_PROFILER_TASK(reduction.reduce_by_rows, q);
+    static_assert(dal::detail::is_tag_one_of_v<BinaryOp, reduce_binary_op_tag>,
+                  "BinaryOp must be a special binary operation defined "
+                  "at the primitives level");
+    static_assert(dal::detail::is_tag_one_of_v<UnaryOp, reduce_unary_op_tag>,
+                  "UnaryOp must be a special unary operation defined "
+                  "at the primitives level");
+    return reduce_by_rows_impl(q,
+                               values,
+                               column_indices,
+                               row_offsets,
+                               indexing,
+                               output,
+                               binary,
+                               unary,
+                               deps,
+                               override_init);
+}
+
 #endif
 
 } // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/reduction/reduction_dpc.cpp b/cpp/oneapi/dal/backend/primitives/reduction/reduction_dpc.cpp
index b71e75eb8c8..7e1251cb915 100644
--- a/cpp/oneapi/dal/backend/primitives/reduction/reduction_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/reduction/reduction_dpc.cpp
@@ -88,6 +88,65 @@ sycl::event reduce_by_rows_impl(sycl::queue& q,
     return sycl::event{};
 }
 
+/// Reduces CSR table with `n x m` dimensions by rows
+///
+/// @tparam Float       Floating point type, it can be `float` or `double`
+/// @tparam BinaryOp    Binary operation class, it reduces 2 input values into 1
+/// @tparam UnaryOp     Unary operation class, it modifies an input value
+/// @param[in] q                SYCL queue
+/// @param[in] values           An array of values in CSR table
+/// @param[in] column_indices   An array of column indices in CSR table
+/// @param[in] row_offsets      An array of row offsets in CSR table
+/// @param[in] indexing         Indexing kind of CSR table
+/// @param[in,out] output       An output array with dimensions `n x 1`
+/// @param[in] binary           A binary operation used in reduction
+/// @param[in] unary            An unary operation used in reduction
+/// @param[in] deps             A vector of dependent events
+/// @param[in] override_init    If false, row results are merged into `output` using `binary`
+template <typename Float, typename BinaryOp, typename UnaryOp>
+sycl::event reduce_by_rows_impl(sycl::queue& q,
+                                const ndview<Float, 1>& values,
+                                const ndview<std::int64_t, 1>& column_indices,
+                                const ndview<std::int64_t, 1>& row_offsets,
+                                const dal::sparse_indexing indexing,
+                                ndview<Float, 1>& output,
+                                const BinaryOp& binary,
+                                const UnaryOp& unary,
+                                const event_vector& deps,
+                                bool override_init) {
+    ONEDAL_ASSERT(values.get_count() == column_indices.get_count());
+    const std::int64_t row_block_size = device_max_wg_size(q);
+    const std::int64_t column_block_size = device_max_wg_size(q) / 2;
+    const auto range =
+        make_multiple_nd_range_2d({ row_block_size, column_block_size }, { 1, column_block_size });
+    const auto val_ptr = values.get_data();
+    const auto row_ptr = row_offsets.get_data();
+    auto const out_ptr = output.get_mutable_data();
+    const std::int64_t shift = bool(indexing == sparse_indexing::one_based);
+    const auto row_count = row_offsets.get_count() - 1; // offsets hold row_count + 1 entries
+    return q.submit([&](sycl::handler& cgh) {
+        cgh.depends_on(deps);
+        cgh.parallel_for(range, [=](auto it) {
+            const std::int64_t row_shift = it.get_global_id(0);
+            const std::int64_t col_shift = it.get_local_id(1);
+            for (auto row_idx = row_shift; row_idx < row_count; row_idx += row_block_size) {
+                const auto start = row_ptr[row_idx] - shift;
+                const auto end = row_ptr[row_idx + 1] - shift;
+                Float local_accum = binary.init_value; // partial result of this work-item
+                for (auto idx = start + col_shift; idx < end; idx += column_block_size) {
+                    local_accum = binary.native(local_accum, unary(val_ptr[idx]));
+                }
+                const auto result =
+                    sycl::reduce_over_group(it.get_group(), local_accum, binary.native);
+                if (col_shift == 0) { // a single work-item per group writes the row result
+                    out_ptr[row_idx] =
+                        override_init ? result : binary.native(out_ptr[row_idx], result);
+                }
+            }
+        });
+    });
+}
+
 template <typename Float, ndorder order, typename BinaryOp, typename UnaryOp>
 sycl::event reduce_by_columns_impl(sycl::queue& q,
                                    const ndview<Float, 2, order>& input,
@@ -123,10 +182,22 @@ sycl::event reduce_by_columns_impl(sycl::queue& q,
                                                             const U&,               \
                                                             const event_vector&,    \
                                                             bool);
+#define INSTANTIATE_CSR(F, B, U)                                                      \
+    template sycl::event reduce_by_rows_impl<F, B, U>(sycl::queue&,                   \
+                                                      const ndview<F, 1>&,            \
+                                                      const ndview<std::int64_t, 1>&, \
+                                                      const ndview<std::int64_t, 1>&, \
+                                                      dal::sparse_indexing,           \
+                                                      ndview<F, 1>&,                  \
+                                                      const B&,                       \
+                                                      const U&,                       \
+                                                      const event_vector&,            \
+                                                      bool);
 
 #define INSTANTIATE_LAYOUT(F, B, U)  \
     INSTANTIATE(F, ndorder::c, B, U) \
-    INSTANTIATE(F, ndorder::f, B, U)
+    INSTANTIATE(F, ndorder::f, B, U) \
+    INSTANTIATE_CSR(F, B, U)
 
 #define INSTANTIATE_FLOAT(B, U)                       \
     INSTANTIATE_LAYOUT(double, B<double>, U<double>); \
diff --git a/cpp/oneapi/dal/detail/dispatcher.hpp b/cpp/oneapi/dal/detail/dispatcher.hpp
index 522287c3663..15d94d098d3 100644
--- a/cpp/oneapi/dal/detail/dispatcher.hpp
+++ b/cpp/oneapi/dal/detail/dispatcher.hpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2020 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,22 +17,33 @@
 
 #pragma once
 
+#include <daal/include/services/daal_defines.h>
+
 namespace oneapi::dal::detail {
 namespace v1 {
 
+#if defined(TARGET_X86_64)
 struct cpu_dispatch_sse2 {};
 struct cpu_dispatch_sse42 {};
 struct cpu_dispatch_avx2 {};
 struct cpu_dispatch_avx512 {};
-
 using cpu_dispatch_default = cpu_dispatch_sse2;
+#elif defined(TARGET_ARM)
+struct cpu_dispatch_sve {};
+using cpu_dispatch_default = cpu_dispatch_sve;
+#endif
 
 } // namespace v1
 
+#if defined(TARGET_X86_64)
 using v1::cpu_dispatch_sse2;
 using v1::cpu_dispatch_sse42;
 using v1::cpu_dispatch_avx2;
 using v1::cpu_dispatch_avx512;
+#elif defined(TARGET_ARM)
+using v1::cpu_dispatch_sve;
+#endif
+
 using v1::cpu_dispatch_default;
 
 } // namespace oneapi::dal::detail
diff --git a/cpp/oneapi/dal/detail/policy.hpp b/cpp/oneapi/dal/detail/policy.hpp
index c62c31e9c30..127c9770d6c 100644
--- a/cpp/oneapi/dal/detail/policy.hpp
+++ b/cpp/oneapi/dal/detail/policy.hpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2020 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,6 +17,15 @@
 
 #pragma once
 
+// TODO: Clean up this redefinition and import the defines globally.
+#if defined(__x86_64__) || defined(__x86_64) || defined(__amd64) || defined(_M_AMD64)
+#define TARGET_X86_64
+#endif
+
+#if defined(__ARM_ARCH) || defined(__aarch64__)
+#define TARGET_ARM
+#endif
+
 #include <type_traits>
 #ifdef ONEDAL_DATA_PARALLEL
 #include <sycl/sycl.hpp>
@@ -61,10 +71,14 @@ inline constexpr bool is_data_parallel_policy_v = is_data_parallel_policy<T>::va
 
 enum class cpu_extension : uint64_t {
     none = 0U,
+#if defined(TARGET_X86_64)
     sse2 = 1U << 0,
     sse42 = 1U << 2,
     avx2 = 1U << 4,
     avx512 = 1U << 5
+#elif defined(TARGET_ARM)
+    sve = 1U << 0,
+#endif
 };
 
 class ONEDAL_EXPORT default_host_policy {};
diff --git a/cpp/oneapi/dal/partial_compute.hpp b/cpp/oneapi/dal/partial_compute.hpp
index 820f74f9685..8ec259796f0 100644
--- a/cpp/oneapi/dal/partial_compute.hpp
+++ b/cpp/oneapi/dal/partial_compute.hpp
@@ -40,26 +40,4 @@ auto partial_compute(sycl::queue& queue, Args&&... args) {
 
 using v1::partial_compute;
 
-namespace preview {
-
-template <typename... Args>
-auto partial_compute(spmd::communicator<spmd::device_memory_access::none>& comm, Args&&... args) {
-    return dal::detail::partial_compute_dispatch(
-        dal::detail::spmd_policy{ dal::detail::host_policy{}, comm },
-        std::forward<Args>(args)...);
-}
-
-#ifdef ONEDAL_DATA_PARALLEL
-template <typename... Args>
-auto partial_compute(spmd::communicator<spmd::device_memory_access::usm>& comm, Args&&... args) {
-    return dal::detail::partial_compute_dispatch(
-        dal::detail::spmd_policy<dal::detail::data_parallel_policy>{
-            dal::detail::data_parallel_policy{ comm.get_queue() },
-            comm },
-        std::forward<Args>(args)...);
-}
-#endif
-
-} // namespace preview
-
 } // namespace oneapi::dal
diff --git a/cpp/oneapi/dal/table/test/csr_accessor.cpp b/cpp/oneapi/dal/table/test/csr_accessor.cpp
index eec2e2bb879..40cea3b0a32 100644
--- a/cpp/oneapi/dal/table/test/csr_accessor.cpp
+++ b/cpp/oneapi/dal/table/test/csr_accessor.cpp
@@ -351,11 +351,10 @@ TEMPLATE_LIST_TEST_M(csr_accessor_test,
                                   test_alloc_kind::usm_device,
                                   test_alloc_kind::usm_shared);
 
-    this->accessor_alloc_ = GENERATE(test_alloc_kind::usm_device, test_alloc_kind::usm_shared);
-
-    // Furter improvement: Add support of the following accessor allocation types:
-    // test_alloc_kind::host,
-    // test_alloc_kind::usm_host.
+    this->accessor_alloc_ = GENERATE(test_alloc_kind::host,
+                                     test_alloc_kind::usm_host,
+                                     test_alloc_kind::usm_device,
+                                     test_alloc_kind::usm_shared);
 #else
     this->table_alloc_ = test_alloc_kind::host;
     this->accessor_alloc_ = test_alloc_kind::host;
@@ -379,11 +378,10 @@ TEMPLATE_LIST_TEST_M(csr_accessor_test,
                                   test_alloc_kind::usm_device,
                                   test_alloc_kind::usm_shared);
 
-    this->accessor_alloc_ = GENERATE(test_alloc_kind::usm_device, test_alloc_kind::usm_shared);
-
-    // Furter improvement: Add support of the following accessor allocation types:
-    // test_alloc_kind::host,
-    // test_alloc_kind::usm_host.
+    this->accessor_alloc_ = GENERATE(test_alloc_kind::host,
+                                     test_alloc_kind::usm_host,
+                                     test_alloc_kind::usm_device,
+                                     test_alloc_kind::usm_shared);
 #else
     this->table_alloc_ = test_alloc_kind::host;
     this->accessor_alloc_ = test_alloc_kind::host;
diff --git a/cpp/oneapi/dal/test/engine/csr_table_builder.hpp b/cpp/oneapi/dal/test/engine/csr_table_builder.hpp
index e8de4036bfe..2e4656f388c 100644
--- a/cpp/oneapi/dal/test/engine/csr_table_builder.hpp
+++ b/cpp/oneapi/dal/test/engine/csr_table_builder.hpp
@@ -19,6 +19,68 @@
 
 namespace oneapi::dal::test::engine {
 
+inline csr_table copy_data_to_csr(const dal::array<float>& data,
+                                  const dal::array<std::int64_t>& column_indices,
+                                  const dal::array<std::int64_t>& row_offsets,
+                                  const sparse_indexing indexing,
+                                  const std::int64_t column_count,
+                                  const std::int64_t row_count) {
+    auto row_offs_ptr = row_offsets.get_data();
+    auto data_ptr = data.get_data();
+    auto col_indices_ptr = column_indices.get_data();
+    auto nnz_count = row_offs_ptr[row_count] - row_offs_ptr[0]; // indexing base cancels out
+    const auto copied_data = dal::array<float>::empty(nnz_count);
+    const auto copied_col_indices = dal::array<std::int64_t>::empty(nnz_count);
+    const auto copied_row_offsets = dal::array<std::int64_t>::empty(row_count + 1);
+    // Element-wise copy into fresh host arrays: the returned table does not alias the inputs
+    auto copied_data_ptr = copied_data.get_mutable_data();
+    auto copied_col_indices_ptr = copied_col_indices.get_mutable_data();
+    auto copied_row_offsets_ptr = copied_row_offsets.get_mutable_data();
+    for (std::int64_t i = 0; i < nnz_count; ++i) {
+        copied_data_ptr[i] = data_ptr[i];
+        copied_col_indices_ptr[i] = col_indices_ptr[i];
+    }
+    for (std::int64_t i = 0; i <= row_count; ++i) {
+        copied_row_offsets_ptr[i] = row_offs_ptr[i];
+    }
+    return csr_table::wrap(copied_data,
+                           copied_col_indices,
+                           copied_row_offsets,
+                           column_count,
+                           indexing);
+}
+
+#ifdef ONEDAL_DATA_PARALLEL
+inline csr_table copy_data_to_csr(sycl::queue& queue,
+                                  const dal::array<float>& data,
+                                  const dal::array<std::int64_t>& column_indices,
+                                  const dal::array<std::int64_t>& row_offsets,
+                                  const sparse_indexing indexing,
+                                  const std::int64_t column_count,
+                                  const std::int64_t row_count) {
+    auto row_offs_ptr = row_offsets.get_data(); // dereferenced on the host on the next line
+    auto nnz_count = row_offs_ptr[row_count] - row_offs_ptr[0]; // indexing base cancels out
+    const auto copied_data = dal::array<float>::empty(queue, nnz_count, sycl::usm::alloc::device);
+    const auto copied_col_indices =
+        dal::array<std::int64_t>::empty(queue, nnz_count, sycl::usm::alloc::device);
+    const auto copied_row_offsets =
+        dal::array<std::int64_t>::empty(queue, row_count + 1, sycl::usm::alloc::device);
+    auto data_event = queue.copy<float>(data.get_data(), copied_data.get_mutable_data(), nnz_count);
+    auto col_indices_event = queue.copy<std::int64_t>(column_indices.get_data(),
+                                                      copied_col_indices.get_mutable_data(),
+                                                      nnz_count);
+    auto row_offsets_event = queue.copy<std::int64_t>(row_offsets.get_data(),
+                                                      copied_row_offsets.get_mutable_data(),
+                                                      row_count + 1);
+    sycl::event::wait_and_throw({ data_event, col_indices_event, row_offsets_event });
+    return csr_table::wrap(copied_data,
+                           copied_col_indices,
+                           copied_row_offsets,
+                           column_count,
+                           indexing);
+}
+#endif // ONEDAL_DATA_PARALLEL
+
 /**
 * Generates random CSR table based on inputs
 */
@@ -105,53 +167,23 @@ struct csr_table_builder {
 #ifdef ONEDAL_DATA_PARALLEL
     csr_table build_csr_table(device_test_policy& policy) const {
         auto queue = policy.get_queue();
-        auto row_offs_ptr = row_offsets_.get_data();
-        auto nnz_count = row_offs_ptr[row_count_] - row_offs_ptr[0];
-        const auto copied_data =
-            dal::array<Float>::empty(queue, nnz_count, sycl::usm::alloc::device);
-        const auto copied_col_indices =
-            dal::array<std::int64_t>::empty(queue, nnz_count, sycl::usm::alloc::device);
-        const auto copied_row_offsets =
-            dal::array<std::int64_t>::empty(queue, row_count_ + 1, sycl::usm::alloc::device);
-        auto data_event =
-            queue.copy<float>(data_.get_data(), copied_data.get_mutable_data(), nnz_count);
-        auto col_indices_event = queue.copy<std::int64_t>(column_indices_.get_data(),
-                                                          copied_col_indices.get_mutable_data(),
-                                                          nnz_count);
-        auto row_offsets_event = queue.copy<std::int64_t>(row_offsets_.get_data(),
-                                                          copied_row_offsets.get_mutable_data(),
-                                                          row_count_ + 1);
-        sycl::event::wait_and_throw({ data_event, col_indices_event, row_offsets_event });
-        return csr_table::wrap(copied_data,
-                               copied_col_indices,
-                               copied_row_offsets,
-                               column_count_,
-                               indexing_);
+        return copy_data_to_csr(queue,
+                                data_,
+                                column_indices_,
+                                row_offsets_,
+                                indexing_,
+                                column_count_,
+                                row_count_);
     }
 #endif // ONEDAL_DATA_PARALLEL
 
     csr_table build_csr_table(host_test_policy& policy) const {
-        auto row_offs_ptr = row_offsets_.get_data();
-        auto nnz_count = row_offs_ptr[row_count_] - row_offs_ptr[0];
-        const auto copied_data = dal::array<Float>::empty(nnz_count);
-        const auto copied_col_indices = dal::array<std::int64_t>::empty(nnz_count);
-        const auto copied_row_offsets = dal::array<std::int64_t>::empty(row_count_ + 1);
-
-        auto copied_data_ptr = copied_data.get_mutable_data();
-        auto copied_col_indices_ptr = copied_col_indices.get_mutable_data();
-        auto copied_row_offsets_ptr = copied_row_offsets.get_mutable_data();
-        for (std::int32_t i = 0; i < nnz_count; ++i) {
-            copied_data_ptr[i] = data_.get_data()[i];
-            copied_col_indices_ptr[i] = column_indices_.get_data()[i];
-        }
-        for (std::int32_t i = 0; i <= row_count_; ++i) {
-            copied_row_offsets_ptr[i] = row_offs_ptr[i];
-        }
-        return csr_table::wrap(copied_data,
-                               copied_col_indices,
-                               copied_row_offsets,
-                               column_count_,
-                               indexing_);
+        return copy_data_to_csr(data_,
+                                column_indices_,
+                                row_offsets_,
+                                indexing_,
+                                column_count_,
+                                row_count_);
     }
 
     table build_dense_table() const {
@@ -173,4 +205,176 @@ struct csr_table_builder {
     }
 };
 
+/// Generates a CSR table with a clustering dataset.
+/// The dataset looks like multidimensional blobs:
+/// each blob has a fixed centroid and randomized points around that centroid
+/// with radius :expr:`r=1.0`.
+struct csr_make_blobs {
+    /// Floating type used for generation
+    using Float = float;
+    /// Indexing type used for generation
+    using Index = std::int64_t;
+    /// Dataset parameters
+    Index row_count_, column_count_, cluster_count_;
+    float nonzero_fraction_;
+    sparse_indexing indexing_;
+    const dal::array<Float> data_;
+    const dal::array<Index> column_indices_;
+    const dal::array<Index> row_offsets_;
+    /// Dataset generation parameters
+    const Float centroid_fill_value = 10.0f;
+    const Float min_val = -1.0f;
+    const Float max_val = 1.0f;
+
+    csr_make_blobs(Index cluster_count,
+                   Index row_count,
+                   Index column_count,
+                   float nnz_fraction = 0.05,
+                   sparse_indexing indexing = sparse_indexing::one_based,
+                   Index seed = 42)
+            : row_count_(row_count),
+              column_count_(column_count),
+              cluster_count_(cluster_count),
+              nonzero_fraction_(nnz_fraction),
+              indexing_(indexing),
+              data_(dal::array<Float>::empty(nnz_fraction * row_count * column_count)),
+              column_indices_(dal::array<Index>::empty(nnz_fraction * row_count * column_count)),
+              row_offsets_(dal::array<Index>::empty(row_count + 1)) {
+        // Get data arrays
+        auto data_ptr = data_.get_mutable_data();
+        auto col_indices_ptr = column_indices_.get_mutable_data();
+        auto row_offs_ptr = row_offsets_.get_mutable_data();
+        const Index indexing_shift = bool(indexing == sparse_indexing::one_based);
+        // Estimate number of non-zero values in each row
+        const Index row_nonzero_count = column_count * nnz_fraction;
+        // Init random engines
+        std::mt19937 rng(seed);
+        std::uniform_real_distribution<Float> uniform_data(min_val, max_val);
+        std::uniform_int_distribution<Index> uniform_indices(indexing_shift,
+                                                             column_count + indexing_shift - 1);
+        // Check if it is possible to generate non-empty row
+        if (row_nonzero_count < 1) {
+            std::cout << "ERROR: Non-zero fraction is too small to generate rows" << std::endl;
+            ONEDAL_ASSERT(row_nonzero_count >= 1);
+            return;
+        }
+        Index fill_count = 0;
+        row_offs_ptr[0] = indexing_shift;
+        // Create centroids
+        for (Index cent_idx = 0; cent_idx < cluster_count; ++cent_idx) {
+            std::set<Index> columns;
+            while (Index(columns.size()) < row_nonzero_count) {
+                const Index col_idx = uniform_indices(rng);
+                columns.insert(col_idx);
+            }
+            for (auto iter = columns.begin(); iter != columns.end(); iter++) {
+                data_ptr[fill_count] = centroid_fill_value * (cent_idx + 1);
+                col_indices_ptr[fill_count] = *iter;
+                fill_count++;
+            }
+            row_offs_ptr[cent_idx + 1] = fill_count + indexing_shift;
+        }
+
+        // Generate remaining rows adding random noise to centroids
+        for (Index row_idx = cluster_count; row_idx < row_count; ++row_idx) {
+            const Index centroid_id = row_idx % cluster_count;
+            for (Index data_idx = row_offs_ptr[centroid_id] - indexing_shift;
+                 data_idx < row_offs_ptr[centroid_id + 1] - indexing_shift;
+                 ++data_idx) {
+                col_indices_ptr[fill_count] = col_indices_ptr[data_idx];
+                data_ptr[fill_count] = data_ptr[data_idx] + uniform_data(rng);
+                fill_count++;
+            }
+            row_offs_ptr[row_idx + 1] = fill_count + indexing_shift;
+        }
+    }
+
+    table get_data(host_test_policy& policy) const {
+        return copy_data_to_csr(data_,
+                                column_indices_,
+                                row_offsets_,
+                                indexing_,
+                                column_count_,
+                                row_count_);
+    }
+
+    table get_initial_centroids() const {
+        const auto result = dal::array<float>::empty(cluster_count_ * column_count_);
+        auto result_ptr = result.get_mutable_data();
+
+        const Index shift = bool(indexing_ == sparse_indexing::one_based);
+        const auto data_ptr = data_.get_data();
+        const auto col_ind_ptr = column_indices_.get_data();
+        const auto row_offs_ptr = row_offsets_.get_data();
+        for (Index row_idx = 0; row_idx < cluster_count_; ++row_idx) {
+            for (Index col_id = 0; col_id < column_count_; ++col_id) {
+                result_ptr[row_idx * column_count_ + col_id] = 0;
+            }
+            const auto start = row_offs_ptr[row_idx] - shift;
+            const auto end = row_offs_ptr[row_idx + 1] - shift;
+            for (Index data_idx = start; data_idx < end; ++data_idx) {
+                auto col_idx = col_ind_ptr[data_idx] - shift;
+                result_ptr[row_idx * column_count_ + col_idx] = data_ptr[data_idx];
+            }
+        }
+        return homogen_table::wrap(result, cluster_count_, column_count_);
+    }
+
+    table get_result_centroids() const {
+        const auto result = dal::array<float>::empty(cluster_count_ * column_count_);
+        auto result_ptr = result.get_mutable_data();
+        const auto cluster_counts = dal::array<std::int32_t>::empty(cluster_count_);
+        auto counts_ptr = cluster_counts.get_mutable_data();
+
+        const Index shift = bool(indexing_ == sparse_indexing::one_based);
+        const auto data_ptr = data_.get_data();
+        const auto col_ind_ptr = column_indices_.get_data();
+        const auto row_offs_ptr = row_offsets_.get_data();
+        for (Index row_idx = 0; row_idx < cluster_count_; ++row_idx) {
+            counts_ptr[row_idx] = 0;
+            for (Index col_id = 0; col_id < column_count_; ++col_id) {
+                result_ptr[row_idx * column_count_ + col_id] = 0;
+            }
+        }
+        for (Index row_idx = 0; row_idx < row_count_; ++row_idx) {
+            const auto start = row_offs_ptr[row_idx] - shift;
+            const auto end = row_offs_ptr[row_idx + 1] - shift;
+            for (Index data_idx = start; data_idx < end; ++data_idx) {
+                auto col_idx = col_ind_ptr[data_idx] - shift;
+                result_ptr[(row_idx % cluster_count_) * column_count_ + col_idx] +=
+                    data_ptr[data_idx];
+            }
+            counts_ptr[row_idx % cluster_count_]++;
+        }
+        for (Index row_idx = 0; row_idx < cluster_count_; ++row_idx) {
+            for (Index col_id = 0; col_id < column_count_; ++col_id) {
+                result_ptr[row_idx * column_count_ + col_id] /= counts_ptr[row_idx];
+            }
+        }
+        return homogen_table::wrap(result, cluster_count_, column_count_);
+    }
+
+    table get_responses() const { // expected label of row `i` is `i % cluster_count_`
+        const auto responses = dal::array<std::int32_t>::empty(row_count_);
+        auto response_ptr = responses.get_mutable_data();
+        for (Index i = 0; i < row_count_; ++i) {
+            response_ptr[i] = static_cast<std::int32_t>(i % cluster_count_);
+        }
+        return homogen_table::wrap(responses, row_count_, 1); // array overload: buffer stays alive
+    }
+
+#ifdef ONEDAL_DATA_PARALLEL
+    table get_data(device_test_policy& policy) const {
+        auto queue = policy.get_queue();
+        return copy_data_to_csr(queue,
+                                data_,
+                                column_indices_,
+                                row_offsets_,
+                                indexing_,
+                                column_count_,
+                                row_count_);
+    }
+#endif // ONEDAL_DATA_PARALLEL
+};
+
 } //namespace oneapi::dal::test::engine
diff --git a/cpp/oneapi/dal/test/engine/fixtures.hpp b/cpp/oneapi/dal/test/engine/fixtures.hpp
index 93e3363dbe8..8219c43a686 100644
--- a/cpp/oneapi/dal/test/engine/fixtures.hpp
+++ b/cpp/oneapi/dal/test/engine/fixtures.hpp
@@ -156,6 +156,16 @@ class crtp_base_algo_fixture : public float_algo_fixture<std::tuple_element_t<0,
         return derived().merge_compute_result_override(std::forward<Args>(args)...);
     }
 
+    template <typename... Args>
+    auto split_finalize_compute_input(Args&&... args) { // forwards to derived()'s *_override
+        return derived().split_finalize_compute_input_override(std::forward<Args>(args)...);
+    }
+
+    template <typename... Args>
+    auto merge_finalize_compute_result(Args&&... args) { // forwards to derived()'s *_override
+        return derived().merge_finalize_compute_result_override(std::forward<Args>(args)...);
+    }
+
     template <typename... Args>
     auto split_infer_input(Args&&... args) {
         return derived().split_infer_input_override(std::forward<Args>(args)...);
@@ -221,6 +231,16 @@ class crtp_base_algo_fixture : public float_algo_fixture<std::tuple_element_t<0,
         ONEDAL_ASSERT(!"This method must be overriden in the derived class");
     }
 
+    template <typename... Args>
+    auto split_finalize_compute_input_override(Args&&... args) { // default: only asserts
+        ONEDAL_ASSERT(!"This method must be overriden in the derived class");
+    }
+
+    template <typename... Args>
+    auto merge_finalize_compute_result_override(Args&&... args) { // default: only asserts
+        ONEDAL_ASSERT(!"This method must be overriden in the derived class");
+    }
+
     template <typename... Args>
     auto split_infer_input_override(Args&&... args) {
         ONEDAL_ASSERT(!"This method must be overriden in the derived class");
@@ -326,6 +346,49 @@ class crtp_algo_fixture : public crtp_base_algo_fixture<TestType, Derived> {
         return this->merge_compute_result(results);
     }
 
+    // Runs finalize_compute on every rank of a thread-backed SPMD communicator and returns
+    // the per-rank results as produced by comm.map (no merging is done here).
+    template <typename Descriptor, typename... Args>
+    auto finalize_compute_via_spmd_threads(std::int64_t thread_count,
+                                           const Descriptor& desc,
+                                           Args&&... args) {
+        ONEDAL_ASSERT(thread_count > 0);
+        CAPTURE(thread_count);
+
+#ifdef ONEDAL_DATA_PARALLEL
+        using communicator_t = thread_communicator<spmd::device_memory_access::usm>;
+        communicator_t comm{ this->get_queue(), thread_count };
+#else
+        using communicator_t = thread_communicator<spmd::device_memory_access::none>;
+        communicator_t comm{ thread_count };
+#endif
+
+        // The derived fixture decides how the input is split: one element per rank.
+        const auto rank_inputs =
+            this->split_finalize_compute_input(thread_count, std::forward<Args>(args)...);
+        ONEDAL_ASSERT(rank_inputs.size() ==
+                      dal::detail::integral_cast<std::size_t>(thread_count));
+
+        const auto rank_results = comm.map([&](std::int64_t rank) {
+            return dal::test::engine::spmd_finalize_compute( //
+                this->get_policy(), comm, desc, rank_inputs[rank]);
+        });
+        ONEDAL_ASSERT(rank_results.size() == dal::detail::integral_cast<std::size_t>(thread_count));
+        return rank_results;
+    }
+
+    // Per-rank finalize_compute via thread-based SPMD, then merge through the fixture hook.
+    template <typename Descriptor, typename... Args>
+    auto finalize_compute_via_spmd_threads_and_merge(std::int64_t thread_count,
+                                                     const Descriptor& desc,
+                                                     Args&&... args) {
+        const auto rank_results =
+            this->finalize_compute_via_spmd_threads(thread_count,
+                                                    desc,
+                                                    std::forward<Args>(args)...);
+        return this->merge_finalize_compute_result(rank_results);
+    }
+
     template <typename Descriptor, typename... Args>
     auto infer_via_spmd_threads(std::int64_t thread_count, const Descriptor& desc, Args&&... args) {
         ONEDAL_ASSERT(thread_count > 0);
diff --git a/cpp/oneapi/dal/test/engine/spmd.hpp b/cpp/oneapi/dal/test/engine/spmd.hpp
index 7f795525704..a97837a72ed 100644
--- a/cpp/oneapi/dal/test/engine/spmd.hpp
+++ b/cpp/oneapi/dal/test/engine/spmd.hpp
@@ -79,4 +79,23 @@ inline auto spmd_compute(device_test_policy& policy,
 }
 #endif
 
+template <typename... Args>
+inline auto spmd_finalize_compute(host_test_policy& policy, // not read in the body
+                                  const spmd::communicator<spmd::device_memory_access::none>& comm,
+                                  Args&&... args) {
+    return dal::finalize_compute(dal::detail::spmd_policy{ dal::detail::host_policy{}, comm },
+                                 std::forward<Args>(args)...); // remaining args forwarded as-is
+}
+
+#ifdef ONEDAL_DATA_PARALLEL
+template <typename... Args>
+inline auto spmd_finalize_compute(device_test_policy& policy,
+                                  const spmd::communicator<spmd::device_memory_access::usm>& comm,
+                                  Args&&... args) {
+    dal::detail::data_parallel_policy local_policy{ policy.get_queue() }; // from the test queue
+    dal::detail::spmd_policy<detail::data_parallel_policy> spmd_policy{ local_policy, comm };
+    return dal::finalize_compute(spmd_policy, std::forward<Args>(args)...); // args forwarded as-is
+}
+#endif
+
 } // namespace oneapi::dal::test::engine
diff --git a/deploy/local/dal b/deploy/local/dal
index 20ea287f505..6ddff7e537b 100644
--- a/deploy/local/dal
+++ b/deploy/local/dal
@@ -1,6 +1,7 @@
 #%Module1.0###################################################################
 #===============================================================================
 # Copyright 2020 Intel Corporation
+# Copyright contributors to the oneDAL project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -60,7 +61,14 @@ proc ModulesHelp { } {
 
 # Set intermediate variables
 set dalroot "$componentroot"
-set daal_target_arch "intel64"
+set daalroot "$componentroot/$modulefilever"
+# tcl_platform(machine) holds the CPU type (e.g. x86_64, aarch64); it picks daal_target_arch.
+if {[string equal $::tcl_platform(machine) "aarch64"]} {
+    set daal_target_arch "arm"
+} else {
+    set daal_target_arch "intel64"
+}
+module-whatis "oneAPI Data Analytics Library for $daal_target_arch."
 
 # Setup environment variables
 setenv          DAL_MAJOR_BINARY   1
diff --git a/deploy/local/vars_lnx.sh b/deploy/local/vars_lnx.sh
index fc0172ff9bc..8cb606d13cf 100644
--- a/deploy/local/vars_lnx.sh
+++ b/deploy/local/vars_lnx.sh
@@ -4,6 +4,7 @@
 
 #===============================================================================
 # Copyright 2014 Intel Corporation
+# Copyright contributors to the oneDAL project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -223,6 +224,17 @@ if [ ! -d $__daal_tmp_dir ]; then
     __daal_tmp_dir=${component_root}
 fi
 
+ARCH_ONEDAL=$(uname -m)
+# Map the machine name to the per-architecture directory name used under lib/.
+if [ "${ARCH_ONEDAL}" = "x86_64" ]; then
+    ARCH_DIR_ONEDAL="intel64"
+elif [ "${ARCH_ONEDAL}" = "aarch64" ]; then
+    ARCH_DIR_ONEDAL="arm"
+else
+    echo "Unsupported CPU architecture '${ARCH_ONEDAL}'" >&2
+    return 1 2>/dev/null || exit 1  # when sourced, return so the caller's shell is not closed
+fi
+
 if [ "$(basename "${my_script_path}")" = "env" ] ; then   # assume stand-alone
 # case "${my_script_path}" in
   # *"env"*)
@@ -239,8 +251,8 @@ if [ "$(basename "${my_script_path}")" = "env" ] ; then   # assume stand-alone
       export LD_LIBRARY_PATH="$__daal_tmp_dir/lib${LD_LIBRARY_PATH+:${LD_LIBRARY_PATH}}"
     else
       export CPATH="$__daal_tmp_dir/include${CPATH+:${CPATH}}"
-      export LIBRARY_PATH="$__daal_tmp_dir/lib/intel64${LIBRARY_PATH+:${LIBRARY_PATH}}"
-      export LD_LIBRARY_PATH="$__daal_tmp_dir/lib/intel64${LD_LIBRARY_PATH+:${LD_LIBRARY_PATH}}"
+      export LIBRARY_PATH="$__daal_tmp_dir/lib/$ARCH_DIR_ONEDAL${LIBRARY_PATH+:${LIBRARY_PATH}}"
+      export LD_LIBRARY_PATH="$__daal_tmp_dir/lib/$ARCH_DIR_ONEDAL${LD_LIBRARY_PATH+:${LD_LIBRARY_PATH}}"
     fi
   # ;;
 else   # must be a consolidated layout
diff --git a/deploy/nuget/prepare_dal_nuget.sh b/deploy/nuget/prepare_dal_nuget.sh
index bf05f73978c..5d115c71e9b 100755
--- a/deploy/nuget/prepare_dal_nuget.sh
+++ b/deploy/nuget/prepare_dal_nuget.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 #===============================================================================
 # Copyright 2022 Intel Corporation
+# Copyright contributors to the oneDAL project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -53,12 +54,20 @@ create_package() {
     # platform specific
     platform=$(bash $(dirname "$0")/../../dev/make/identify_os.sh)
     if [ ${platform} = "lnx32e" ]; then
-        platform=linux-x64
+        platform=linux
         tbb_platform=linux
         rls_prefix=${rls_dir}/daal/latest
         dynamic_lib_path=lib/intel64
         static_lib_path=lib/intel64
         lib_prefix=libonedal
+    elif [ ${platform} = "lnxarm" ]; then
+        platform=linux
+        tbb_platform=linux
+        rls_prefix=${rls_dir}/daal/latest
+        dynamic_lib_path=lib/arm
+        static_lib_path=lib/arm
+        lib_prefix=libonedal
+
     elif [ ${platform} = "mac32e" ]; then
         platform=osx-x64
         tbb_platform=osx
@@ -98,7 +107,7 @@ create_package() {
 
     if [ "${build_nupkg}" = "yes" ]; then
         # extension of libraries
-        if [ "${platform}" = "linux-x64" ]; then
+        if [ "${platform}" = "linux" ]; then
             dl_postfix=.so.${major_binary_version}.${minor_binary_version}
             sl_postfix=.a
         elif [ "${platform}" = "osx-x64" ]; then
diff --git a/deploy/pkg-config/generate_pkgconfig.py b/deploy/pkg-config/generate_pkgconfig.py
index b287f853bc4..323ff7e5138 100755
--- a/deploy/pkg-config/generate_pkgconfig.py
+++ b/deploy/pkg-config/generate_pkgconfig.py
@@ -1,5 +1,7 @@
+'''generate_pkgconfig.py'''
 #===============================================================================
 # Copyright 2021 Intel Corporation
+# Copyright contributors to the oneDAL project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,6 +21,21 @@
 import glob
 import argparse
 from sys import platform
+import platform as plt
+
+def detect_cpu_architecture():
+    """
+    Map platform.machine() to 'x86_64' or 'aarch64'; exit with status 1 otherwise.
+    """
+    arch_name = plt.machine()
+    if arch_name in ('x86_64', 'AMD64'):
+        return 'x86_64'
+    if arch_name == 'aarch64' or arch_name.startswith('arm'):
+        return 'aarch64'
+    # Anything else is unsupported: report on stderr and stop.
+    sys.stderr.write(f"Unknown Architecture {arch_name} Detected. " \
+                     "Only 'x86_64', 'AMD64' and 'aarch64' supported.\n")
+    sys.exit(1)
 
 LIBS_PAR_STAT, LIBS_PAR_DYN = [], []
 
@@ -45,9 +62,18 @@
     },
 }
 
+ARCH = detect_cpu_architecture()
+
 if platform in ["linux2", "linux"]:
     PREF_LIB = "lib"
-    LIBDIR = 'lib/intel64'
+
+    if ARCH == 'x86_64':
+        LIBDIR = 'lib/intel64'
+    elif ARCH == 'aarch64':
+        LIBDIR = 'lib/arm'
+    else:
+        sys.exit(f"Unknown CPU architecture '{ARCH}'")  # abort: this branch sets no LIBDIR
+
     SUFF_DYN_LIB = ".so"
     SUFF_STAT_LIB = ".a"
     TBB_LIBS = "-ltbb -ltbbmalloc"
diff --git a/deploy/pkg-config/pkg-config.tpl b/deploy/pkg-config/pkg-config.tpl
index 1d59f3e5df3..53fd1276066 100755
--- a/deploy/pkg-config/pkg-config.tpl
+++ b/deploy/pkg-config/pkg-config.tpl
@@ -22,7 +22,7 @@ includedir=${{prefix}}/include
 #info
 Name: oneDAL
 Description: Intel(R) oneAPI Data Analytics Library
-Version: 2024.2
+Version: 2024.3
 URL: https://www.intel.com/content/www/us/en/developer/tools/oneapi/onedal.html
 #Link line
 Libs: {libs}
diff --git a/dev/bazel/config/config.bzl b/dev/bazel/config/config.bzl
index cfcfad074ab..8ff87b7e54b 100644
--- a/dev/bazel/config/config.bzl
+++ b/dev/bazel/config/config.bzl
@@ -210,7 +210,7 @@ def _declare_onedal_config_impl(repo_ctx):
         substitutions = {
             "%{auto_cpu}":         auto_cpu,
             "%{version_major}":    "2024",
-            "%{version_minor}":    "2",
+            "%{version_minor}":    "3",
             "%{version_update}":   "0",
             "%{version_build}":    utils.datestamp(repo_ctx),
             "%{version_buildrev}": "work",
diff --git a/dev/bazel/config/cpudetect.cpp b/dev/bazel/config/cpudetect.cpp
index a6abc42eb01..eee6a3d3752 100644
--- a/dev/bazel/config/cpudetect.cpp
+++ b/dev/bazel/config/cpudetect.cpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
 * Copyright 2014 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,7 +15,17 @@
 * limitations under the License.
 *******************************************************************************/
 
+#if defined(__x86_64__) || defined(__x86_64) || defined(__amd64) || defined(_M_AMD64)
+    #define TARGET_X86_64
+#endif
+
+#if defined(__ARM_ARCH) || defined(__aarch64__)
+    #define TARGET_ARM
+#endif
+
+#if defined(TARGET_X86_64)
 #include <immintrin.h>
+#endif
 
 #if defined(_MSC_VER)
 #if (_MSC_FULL_VER >= 160040219)
@@ -154,20 +165,25 @@ int check_sse42_features() {
 }
 
 std::string detect_cpu() {
-    try_enable_avx512f_on_macos();
 
-    if (check_avx512_features()) {
-        return "avx512";
-    }
-    else if (check_avx2_features()) {
-        return "avx2";
-    }
-    else if (check_sse42_features()) {
-        return "sse42";
-    }
-    else {
-        return "sse2";
-    }
+    #if defined(TARGET_X86_64)
+        try_enable_avx512f_on_macos();
+
+        if (check_avx512_features()) {
+            return "avx512";
+        }
+        else if (check_avx2_features()) {
+            return "avx2";
+        }
+        else if (check_sse42_features()) {
+            return "sse42";
+        }
+        else {
+            return "sse2";
+        }
+    #elif defined(TARGET_ARM)
+        return "sve";
+    #endif
 }
 
 int main(int argc, char const *argv[]) {
diff --git a/dev/docker/onedal-dev.Dockerfile b/dev/docker/onedal-dev.Dockerfile
index 1add2d27724..fa9bea8ed8d 100644
--- a/dev/docker/onedal-dev.Dockerfile
+++ b/dev/docker/onedal-dev.Dockerfile
@@ -14,7 +14,7 @@
 # limitations under the License.
 #===============================================================================
 
-FROM ubuntu:22.04
+FROM ubuntu:22.04@sha256:77906da86b60585ce12215807090eb327e7386c8fafb5402369e421f44eff17e
 
 ARG workdirectory="/sources/oneDAL"
 
diff --git a/dev/download_tbb.sh b/dev/download_tbb.sh
index fe05c687795..5ba94225c95 100755
--- a/dev/download_tbb.sh
+++ b/dev/download_tbb.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 #===============================================================================
 # Copyright 2014 Intel Corporation
+# Copyright contributors to the oneDAL project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/dev/make/common.mk b/dev/make/common.mk
index 694d7a3bca1..291ca7e406b 100644
--- a/dev/make/common.mk
+++ b/dev/make/common.mk
@@ -78,6 +78,7 @@ secure.opts.link.mac =
 
 RC.COMPILE = rc.exe $(RCOPT) -fo$@ $<
 
+# Used as $(eval $(call set_c_compile,$(COMPILER),$(_OS),$(gcc_toolchain)))
 C.COMPILE = $(if $(COMPILER.$(_OS).$(COMPILER)),$(COMPILER.$(_OS).$(COMPILER)),$(error COMPILER.$(_OS).$(COMPILER) must be defined)) \
             $(if $(C.COMPILE.gcc_toolchain),--gcc-toolchain=$(C.COMPILE.gcc_toolchain)) \
             -c $(secure.opts.icc.$(_OS)) $(COPT) $(INCLUDES) $1 $(-Fo)$@ $<
diff --git a/dev/make/cmplr.clang.mkl.mk b/dev/make/compiler_definitions/clang.32e.mk
similarity index 82%
rename from dev/make/cmplr.clang.mkl.mk
rename to dev/make/compiler_definitions/clang.32e.mk
index 52d77cf757f..4f4844896a9 100644
--- a/dev/make/cmplr.clang.mkl.mk
+++ b/dev/make/compiler_definitions/clang.32e.mk
@@ -1,4 +1,4 @@
-# file: cmplt.clang.mk
+# file: clang.32e.mk
 #===============================================================================
 # Copyright 2012 Intel Corporation
 #
@@ -16,17 +16,15 @@
 #===============================================================================
 
 #++
-#  Clang defenitions for makefile
+#  Clang definitions for makefile.
+#  This file contains definitions common to clang on a 32e (intel64) platform.
+#  It should only be included from files which have more specializations (e.g.
+#  clang.mkl.32e.mk)
 #--
 
-PLATs.clang = lnx32e mac32e
-
-CMPLRDIRSUFF.clang = _clang
-
-CORE.SERV.COMPILER.clang = generic
+include dev/make/compiler_definitions/clang.mk
 
--Zl.clang =
--DEBC.clang = -g
+PLATs.clang = lnx32e mac32e
 
 COMPILER.mac.clang = clang++ -m64 -fgnu-runtime -stdlib=libc++ -mmacosx-version-min=10.15 -fwrapv \
                      -Werror -Wreturn-type
@@ -36,11 +34,6 @@ COMPILER.lnx.clang = clang++ -m64 \
 link.dynamic.mac.clang = clang++ -m64
 link.dynamic.lnx.clang = clang++ -m64
 
-pedantic.opts.clang = -pedantic \
-                      -Wall \
-                      -Wextra \
-                      -Wno-unused-parameter
-
 pedantic.opts.mac.clang = $(pedantic.opts.clang)
 pedantic.opts.lnx.clang = $(pedantic.opts.clang)
 
diff --git a/dev/make/cmplr.clang.ref.mk b/dev/make/compiler_definitions/clang.mk
similarity index 55%
rename from dev/make/cmplr.clang.ref.mk
rename to dev/make/compiler_definitions/clang.mk
index b7d12348253..5c962ef511b 100644
--- a/dev/make/cmplr.clang.ref.mk
+++ b/dev/make/compiler_definitions/clang.mk
@@ -1,6 +1,6 @@
-# file: cmplt.clang.mk
+# file: clang.mk
 #===============================================================================
-# Copyright 2023 Intel Corporation
+# Copyright contributors to the oneDAL project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,11 +16,12 @@
 #===============================================================================
 
 #++
-#  Clang defenitions for makefile
+#  Clang definitions for makefile
+#  This file contains definitions common to clang on all platforms.
+#  It should only be included from files which have more specializations (e.g.
+#  clang.32e.mk)
 #--
 
-PLATs.clang = lnx32e mac32e
-
 CMPLRDIRSUFF.clang = _clang
 
 CORE.SERV.COMPILER.clang = generic
@@ -28,23 +29,7 @@ CORE.SERV.COMPILER.clang = generic
 -Zl.clang =
 -DEBC.clang = -g
 
-COMPILER.mac.clang = clang++ -m64 -fgnu-runtime -stdlib=libc++ -mmacosx-version-min=10.15 -fwrapv \
-                     -DDAAL_REF -DONEDAL_REF -Werror -Wreturn-type
-COMPILER.lnx.clang = clang++ -m64 \
-                     -DDAAL_REF -DONEDAL_REF -Werror -Wreturn-type
-
-link.dynamic.mac.clang = clang++ -m64
-link.dynamic.lnx.clang = clang++ -m64
-
 pedantic.opts.clang = -pedantic \
                       -Wall \
                       -Wextra \
                       -Wno-unused-parameter
-
-pedantic.opts.mac.clang = $(pedantic.opts.clang)
-pedantic.opts.lnx.clang = $(pedantic.opts.clang)
-
-p4_OPT.clang   = $(-Q)march=nocona
-mc3_OPT.clang  = $(-Q)$(if $(OS_is_mac),march=nocona,march=nehalem) $(if $(OS_is_mac),$(-Q)mtune=nehalem)
-avx2_OPT.clang = $(-Q)march=haswell
-skx_OPT.clang  = $(-Q)march=skx
diff --git a/dev/make/compiler_definitions/clang.mkl.32e.mk b/dev/make/compiler_definitions/clang.mkl.32e.mk
new file mode 100644
index 00000000000..9bac0e95622
--- /dev/null
+++ b/dev/make/compiler_definitions/clang.mkl.32e.mk
@@ -0,0 +1,22 @@
+# file: clang.mkl.32e.mk
+#===============================================================================
+# Copyright 2012 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+#++
+#  Clang definitions for makefile
+#--
+
+include dev/make/compiler_definitions/clang.32e.mk
diff --git a/dev/make/compiler_definitions/clang.ref.32e.mk b/dev/make/compiler_definitions/clang.ref.32e.mk
new file mode 100644
index 00000000000..291bc0295d5
--- /dev/null
+++ b/dev/make/compiler_definitions/clang.ref.32e.mk
@@ -0,0 +1,25 @@
+# file: clang.ref.32e.mk
+#===============================================================================
+# Copyright contributors to the oneDAL project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+#++
+#  Clang definitions for makefile
+#--
+
+include dev/make/compiler_definitions/clang.32e.mk
+# '+=' appends to the definitions from clang.32e.mk; 'VAR = $(VAR) ...' would be self-recursive.
+COMPILER.mac.clang += -DDAAL_REF -DONEDAL_REF
+COMPILER.lnx.clang += -DDAAL_REF -DONEDAL_REF
diff --git a/dev/make/compiler_definitions/clang.ref.arm.mk b/dev/make/compiler_definitions/clang.ref.arm.mk
new file mode 100644
index 00000000000..6b61a52c0dc
--- /dev/null
+++ b/dev/make/compiler_definitions/clang.ref.arm.mk
@@ -0,0 +1,34 @@
+# file: clang.ref.arm.mk
+#===============================================================================
+# Copyright contributors to the oneDAL project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+#++
+#  Clang definitions for makefile
+#--
+
+include dev/make/compiler_definitions/clang.mk
+
+PLATs.clang = lnxarm
+
+COMPILER.lnx.clang= clang++ -march=armv8-a+sve \
+                     -DDAAL_REF -DONEDAL_REF -DDAAL_CPU=sve -Werror -Wreturn-type
+# Linker flags
+link.dynamic.lnx.clang = clang++ -march=armv8-a+sve
+
+pedantic.opts.lnx.clang = $(pedantic.opts.clang)
+
+# For SVE
+a8sve_OPT.clang = $(-Q)march=armv8-a+sve
diff --git a/dev/make/cmplr.dpcpp.mk b/dev/make/compiler_definitions/dpcpp.mk
similarity index 97%
rename from dev/make/cmplr.dpcpp.mk
rename to dev/make/compiler_definitions/dpcpp.mk
index 2a78043b729..848f36c2db1 100644
--- a/dev/make/cmplr.dpcpp.mk
+++ b/dev/make/compiler_definitions/dpcpp.mk
@@ -16,7 +16,7 @@
 #===============================================================================
 
 #++
-#  DPC++ Compiler defenitions for makefile
+#  DPC++ Compiler definitions for makefile
 #--
 
 PLATs.dpcpp = lnx32e win32e
diff --git a/dev/make/cmplr.gnu.mkl.mk b/dev/make/compiler_definitions/gnu.32e.mk
similarity index 76%
rename from dev/make/cmplr.gnu.mkl.mk
rename to dev/make/compiler_definitions/gnu.32e.mk
index 2e5008a519a..f90f0a95eed 100644
--- a/dev/make/cmplr.gnu.mkl.mk
+++ b/dev/make/compiler_definitions/gnu.32e.mk
@@ -1,5 +1,5 @@
 #===============================================================================
-# Copyright 2023 Intel Corporation
+# Copyright contributors to the oneDAL project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,35 +15,26 @@
 #===============================================================================
 
 #++
-#  g++ defenitions for makefile
+#  g++ definitions for makefile
+#  This file contains definitions common to gnu on a 32e (intel64) platform. It
+#  should only be included from files which have more specializations (e.g.
+#  gnu.mkl.32e.mk)
 #--
 
-PLATs.gnu = lnx32e mac32e
-
-CMPLRDIRSUFF.gnu = _gnu
-
-CORE.SERV.COMPILER.gnu = generic
+include dev/make/compiler_definitions/gnu.mk
 
--Zl.gnu =
--DEBC.gnu = -g
+PLATs.gnu = lnx32e mac32e
 
 COMPILER.all.gnu =  ${CXX} -m64 -fwrapv -fno-strict-overflow -fno-delete-null-pointer-checks \
                     -Werror -Wreturn-type
 
 link.dynamic.all.gnu = ${CXX} -m64
 
-pedantic.opts.all.gnu = -pedantic \
-                        -Wall \
-                        -Wextra \
-                        -Wno-unused-parameter
-
-COMPILER.lnx.gnu = $(COMPILER.all.gnu)
-link.dynamic.lnx.gnu = $(link.dynamic.all.gnu)
 pedantic.opts.lnx.gnu = $(pedantic.opts.all.gnu)
+pedantic.opts.mac.gnu = $(pedantic.opts.all.gnu)
 
-COMPILER.mac.gnu = $(COMPILER.all.gnu)
+link.dynamic.lnx.gnu = $(link.dynamic.all.gnu)
 link.dynamic.mac.gnu = $(link.dynamic.all.gnu)
-pedantic.opts.mac.gnu = $(pedantic.opts.all.gnu)
 
 p4_OPT.gnu   = $(-Q)march=nocona
 mc3_OPT.gnu  = $(-Q)march=corei7
diff --git a/dev/make/cmplr.gnu.ref.mk b/dev/make/compiler_definitions/gnu.mk
similarity index 57%
rename from dev/make/cmplr.gnu.ref.mk
rename to dev/make/compiler_definitions/gnu.mk
index fac0235da8d..cb0679037f1 100644
--- a/dev/make/cmplr.gnu.ref.mk
+++ b/dev/make/compiler_definitions/gnu.mk
@@ -1,5 +1,5 @@
 #===============================================================================
-# Copyright 2023 Intel Corporation
+# Copyright contributors to the oneDAL project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,11 +15,12 @@
 #===============================================================================
 
 #++
-#  g++ defenitions for makefile
+#  g++ definitions for makefile
+#  This file contains definitions common to gnu on all platforms. It
+#  should only be included from files which have more specializations (e.g.
+#  gnu.32e.mk)
 #--
 
-PLATs.gnu = lnx32e mac32e
-
 CMPLRDIRSUFF.gnu = _gnu
 
 CORE.SERV.COMPILER.gnu = generic
@@ -27,25 +28,7 @@ CORE.SERV.COMPILER.gnu = generic
 -Zl.gnu =
 -DEBC.gnu = -g
 
-COMPILER.all.gnu =  ${CXX} -m64 -fwrapv -fno-strict-overflow -fno-delete-null-pointer-checks \
-                    -DDAAL_REF -DONEDAL_REF -Werror -Wreturn-type
-
-link.dynamic.all.gnu = ${CXX} -m64
-
 pedantic.opts.all.gnu = -pedantic \
                         -Wall \
                         -Wextra \
                         -Wno-unused-parameter
-
-COMPILER.lnx.gnu = $(COMPILER.all.gnu)
-link.dynamic.lnx.gnu = $(link.dynamic.all.gnu)
-pedantic.opts.lnx.gnu = $(pedantic.opts.all.gnu)
-
-COMPILER.mac.gnu = $(COMPILER.all.gnu)
-link.dynamic.mac.gnu = $(link.dynamic.all.gnu)
-pedantic.opts.mac.gnu = $(pedantic.opts.all.gnu)
-
-p4_OPT.gnu   = $(-Q)march=nocona
-mc3_OPT.gnu  = $(-Q)march=corei7
-avx2_OPT.gnu = $(-Q)march=haswell
-skx_OPT.gnu  = $(-Q)march=skylake
diff --git a/dev/make/compiler_definitions/gnu.mkl.32e.mk b/dev/make/compiler_definitions/gnu.mkl.32e.mk
new file mode 100644
index 00000000000..6877ee330dd
--- /dev/null
+++ b/dev/make/compiler_definitions/gnu.mkl.32e.mk
@@ -0,0 +1,24 @@
+#===============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+#++
+#  g++ definitions for makefile
+#--
+
+include dev/make/compiler_definitions/gnu.32e.mk
+
+COMPILER.lnx.gnu = $(COMPILER.all.gnu)
+COMPILER.mac.gnu = $(COMPILER.all.gnu)
diff --git a/dev/make/compiler_definitions/gnu.ref.32e.mk b/dev/make/compiler_definitions/gnu.ref.32e.mk
new file mode 100644
index 00000000000..bd58dc8ab1d
--- /dev/null
+++ b/dev/make/compiler_definitions/gnu.ref.32e.mk
@@ -0,0 +1,24 @@
+#===============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+#++
+#  g++ definitions for makefile
+#--
+
+include dev/make/compiler_definitions/gnu.32e.mk
+
+COMPILER.lnx.gnu = $(COMPILER.all.gnu) -DDAAL_REF -DONEDAL_REF
+COMPILER.mac.gnu = $(COMPILER.all.gnu) -DDAAL_REF -DONEDAL_REF
diff --git a/dev/make/compiler_definitions/gnu.ref.arm.mk b/dev/make/compiler_definitions/gnu.ref.arm.mk
new file mode 100644
index 00000000000..bf7379cc8bc
--- /dev/null
+++ b/dev/make/compiler_definitions/gnu.ref.arm.mk
@@ -0,0 +1,34 @@
+#===============================================================================
+# Copyright contributors to the oneDAL project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+#++
+#  g++ definitions for makefile
+#--
+
+include dev/make/compiler_definitions/gnu.mk
+
+PLATs.gnu = lnxarm
+
+COMPILER.all.gnu =  ${CXX} -march=armv8-a+sve -fwrapv -fno-strict-overflow -fno-delete-null-pointer-checks \
+                    -DDAAL_REF -DONEDAL_REF -DDAAL_CPU=sve -Werror -Wreturn-type
+# Link with the same -march as compilation rather than the build host's 'native'.
+link.dynamic.all.gnu = ${CXX} -march=armv8-a+sve
+
+COMPILER.lnx.gnu = $(COMPILER.all.gnu)
+link.dynamic.lnx.gnu = $(link.dynamic.all.gnu)
+pedantic.opts.lnx.gnu = $(pedantic.opts.all.gnu)
+
+a8sve_OPT.gnu = $(-Q)march=armv8-a+sve
diff --git a/dev/make/cmplr.icc.mkl.mk b/dev/make/compiler_definitions/icc.mkl.32e.mk
similarity index 97%
rename from dev/make/cmplr.icc.mkl.mk
rename to dev/make/compiler_definitions/icc.mkl.32e.mk
index a3ccb0750f3..a6ff2410ecc 100644
--- a/dev/make/cmplr.icc.mkl.mk
+++ b/dev/make/compiler_definitions/icc.mkl.32e.mk
@@ -15,7 +15,7 @@
 #===============================================================================
 
 #++
-#  Intel compiler defenitions for makefile
+#  Intel compiler definitions for makefile
 #--
 
 PLATs.icc = lnx32e win32e mac32e
diff --git a/dev/make/cmplr.icx.mkl.mk b/dev/make/compiler_definitions/icx.mkl.32e.mk
similarity index 96%
rename from dev/make/cmplr.icx.mkl.mk
rename to dev/make/compiler_definitions/icx.mkl.32e.mk
index cbcde1a7e09..b22bcfe22ac 100644
--- a/dev/make/cmplr.icx.mkl.mk
+++ b/dev/make/compiler_definitions/icx.mkl.32e.mk
@@ -15,7 +15,7 @@
 #===============================================================================
 
 #++
-#  Intel compiler defenitions for makefile
+#  Intel compiler definitions for makefile
 #--
 
 PLATs.icx = lnx32e mac32e
diff --git a/dev/make/cmplr.vc.mkl.mk b/dev/make/compiler_definitions/vc.mkl.32e.mk
similarity index 100%
rename from dev/make/cmplr.vc.mkl.mk
rename to dev/make/compiler_definitions/vc.mkl.32e.mk
diff --git a/dev/make/function_definitions/32e.mk b/dev/make/function_definitions/32e.mk
new file mode 100644
index 00000000000..41dfbb96fe9
--- /dev/null
+++ b/dev/make/function_definitions/32e.mk
@@ -0,0 +1,107 @@
+#===============================================================================
+# Copyright contributors to the oneDAL project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+ifeq ($(filter mkl ref,$(BACKEND_CONFIG)),)
+  $(error Unsupported backend config '$(BACKEND_CONFIG)'. \
+          Supported config for '$(PLAT)' are ['mkl', 'ref'])
+endif
+
+COMPILERs = icc icx gnu clang vc
+COMPILER ?= icc
+CPUs := sse2 sse42 avx2 avx512
+CPUs.files := nrh neh hsw skx
+
+ONEAPI.dispatcher_tag.nrh := -D__CPU_TAG__=__CPU_TAG_SSE2__
+ONEAPI.dispatcher_tag.neh := -D__CPU_TAG__=__CPU_TAG_SSE42__
+ONEAPI.dispatcher_tag.hsw := -D__CPU_TAG__=__CPU_TAG_AVX2__
+ONEAPI.dispatcher_tag.skx := -D__CPU_TAG__=__CPU_TAG_AVX512__
+
+# Used as $(eval $(call add_mandatory_cpu,var_name)) to prepend the mandatory
+# CPU sse2 to the list of CPUs stored in 'var_name', unless it is already there
+define add_mandatory_cpu
+  $$(eval $1 := $$(if $$(filter sse2,$$($1)),$$($1),sse2 $$($1)))
+endef
+
+# Used as $(eval $(call set_uarch_options_for_compiler,$(COMPILER))): copies the <uarch>_OPT.<compiler> flags into p4_OPT, mc3_OPT, avx2_OPT and skx_OPT
+define set_uarch_options_for_compiler
+  $$(eval p4_OPT := $$(p4_OPT.$1))
+  $$(eval mc3_OPT := $$(mc3_OPT.$1))
+  $$(eval avx2_OPT := $$(avx2_OPT.$1))
+  $$(eval skx_OPT := $$(skx_OPT.$1))
+endef
+
+# Used as $(eval $(call set_arch_file_suffix,var_name)): sets var_name.files to $(var_name) with sse2/sse42/avx2/avx512 replaced by nrh/neh/hsw/skx
+define set_arch_file_suffix
+  $$(eval $1.files := $$(subst sse2,nrh,$$(subst sse42,neh,$$(subst avx2,hsw,$$(subst avx512,skx,$$($1))))))
+endef
+
+# Used as $(eval $(call set_usecpu_defs))
+# There are no parameters: this always sets USECPUS.out.defs from USECPUS.out,
+# but we can't set this without a function call, as we rely on other variables
+# (USECPUS.out, sed.eow) already being set
+define set_usecpu_defs
+  $$(eval USECPUS.out.defs := $$(subst sse2,^\#define DAAL_KERNEL_SSE2$$(sed.eow),\
+                              $$(subst sse42,^\#define DAAL_KERNEL_SSE42$$(sed.eow),\
+                              $$(subst avx2,^\#define DAAL_KERNEL_AVX2$$(sed.eow),\
+                              $$(subst avx512,^\#define DAAL_KERNEL_AVX512$$(sed.eow),$$(USECPUS.out))))))
+endef
+
+# Used as $(eval $(call append_uarch_copt,$(OBJNAME))): adds per-uarch (_nrh/_neh/_hsw/_skx) and per-fptype (_flt/_dbl) COPT flags to the objects picked out by 'containing'
+define append_uarch_copt
+$$(eval $$(call containing,_nrh, $1): COPT += $$(p4_OPT)   -DDAAL_CPU=sse2)
+$$(eval $$(call containing,_neh, $1): COPT += $$(mc3_OPT)  -DDAAL_CPU=sse42)
+$$(eval $$(call containing,_hsw, $1): COPT += $$(avx2_OPT) -DDAAL_CPU=avx2)
+$$(eval $$(call containing,_skx, $1): COPT += $$(skx_OPT)  -DDAAL_CPU=avx512)
+
+$$(eval $$(call containing,_flt, $1): COPT += -DDAAL_FPTYPE=float)
+$$(eval $$(call containing,_dbl, $1): COPT += -DDAAL_FPTYPE=double)
+endef
+
+# Used as $(eval $(call subst_arch_cpu_in_var,VARNAME)): rewrites _cpu_nrh/_cpu_neh/_cpu_hsw/_cpu_skx in $(VARNAME) to plain _cpu
+define subst_arch_cpu_in_var
+  $$(eval $1 := $$(subst _cpu_nrh,_cpu,$$($1)))
+  $$(eval $1 := $$(subst _cpu_neh,_cpu,$$($1)))
+  $$(eval $1 := $$(subst _cpu_hsw,_cpu,$$($1)))
+  $$(eval $1 := $$(subst _cpu_skx,_cpu,$$($1)))
+endef
+
+# Used as $(eval $(call add_cpu_to_uarch_in_files,VAR_NAME)): sets <uarch>_files and user_cpu_files from $(VAR_NAME), with _<uarch> rewritten to _cpu_<uarch>
+define add_cpu_to_uarch_in_files
+  $$(eval nrh_files := $$(subst _nrh,_cpu_nrh,$$(call containing,_nrh,$$($1))))
+  $$(eval neh_files := $$(subst _neh,_cpu_neh,$$(call containing,_neh,$$($1))))
+  $$(eval hsw_files := $$(subst _hsw,_cpu_hsw,$$(call containing,_hsw,$$($1))))
+  $$(eval skx_files := $$(subst _skx,_cpu_skx,$$(call containing,_skx,$$($1))))
+  $$(eval user_cpu_files := $$(nrh_files) $$(neh_files) $$(hsw_files) $$(skx_files))
+endef
+
+# Used as $(eval $(call dispatcher_cpu_rule,rule_name,$(USECPUS))): defines target rule_name, whose recipe appends an ONEDAL_CPU_DISPATCH_* define for each of sse42/avx2/avx512 present in the CPU list
+define dispatcher_cpu_rule
+$1: | $(dir $1)/.
+	$(if $(filter sse42,$2),echo "#define ONEDAL_CPU_DISPATCH_SSE42" >> $$@)
+	$(if $(filter avx2,$2),echo "#define ONEDAL_CPU_DISPATCH_AVX2" >> $$@)
+	$(if $(filter avx512,$2),echo "#define ONEDAL_CPU_DISPATCH_AVX512" >> $$@)
+endef
+
+# Used as $(eval $(call update_copt_from_dispatcher_tag,$(OBJ_NAME),suffix))
+# where 'suffix' is appended to the *_OPT variable names (p4_OPT$2, ...).
+# This must be called after the p4_OPT, mc3_OPT, avx2_OPT, skx_OPT and
+# ONEAPI.dispatcher_tag.* variables are defined. Otherwise this will be a no-op
+define update_copt_from_dispatcher_tag
+  $$(eval $(call containing,_nrh, $1): COPT += $$(p4_OPT$2)   $$(ONEAPI.dispatcher_tag.nrh))
+  $$(eval $(call containing,_neh, $1): COPT += $$(mc3_OPT$2)  $$(ONEAPI.dispatcher_tag.neh))
+  $$(eval $(call containing,_hsw, $1): COPT += $$(avx2_OPT$2) $$(ONEAPI.dispatcher_tag.hsw))
+  $$(eval $(call containing,_skx, $1): COPT += $$(skx_OPT$2)  $$(ONEAPI.dispatcher_tag.skx))
+endef
diff --git a/dev/make/function_definitions/arm.mk b/dev/make/function_definitions/arm.mk
new file mode 100644
index 00000000000..181b1c9ee2a
--- /dev/null
+++ b/dev/make/function_definitions/arm.mk
@@ -0,0 +1,82 @@
+#===============================================================================
+# Copyright contributors to the oneDAL project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+ifeq ($(filter ref,$(BACKEND_CONFIG)),)
+  $(error Unsupported backend config '$(BACKEND_CONFIG)'. \
+          Supported config for '$(PLAT)' are ['ref'])
+endif
+
+COMPILERs = gnu clang
+COMPILER ?= gnu
+CPUs := sve
+CPUs.files := a8sve
+
+ONEAPI.dispatcher_tag.a8sve := -D__CPU_TAG__=__CPU_TAG_ARMV8SVE__
+
+# Used as $(eval $(call add_mandatory_cpu,var_name)) to prepend the mandatory
+# CPU sve to the list of CPUs stored in 'var_name', unless it is already there
+define add_mandatory_cpu
+  $$(eval $1 := $$(if $$(filter sve,$$($1)),$$($1),sve $$($1)))
+endef
+
+# Used as $(eval $(call set_uarch_options_for_compiler,$(COMPILER))): copies a8sve_OPT.<compiler> into a8sve_OPT
+define set_uarch_options_for_compiler
+  $$(eval a8sve_OPT := $$(a8sve_OPT.$1))
+endef
+
+# Used as $(eval $(call set_arch_file_suffix,var_name)): sets var_name.files to $(var_name) with sve replaced by a8sve
+define set_arch_file_suffix
+  $$(eval $1.files := $$(subst sve,a8sve,$$($1)))
+endef
+
+# Used as $(eval $(call set_usecpu_defs))
+# There are no parameters: this always sets USECPUS.out.defs from USECPUS.out,
+# but we can't set this without a function call, as we rely on other variables
+# (USECPUS.out, sed.eow) already being set
+define set_usecpu_defs
+  $$(eval USECPUS.out.defs := $$(subst sve,^\#define DAAL_KERNEL_SVE$$(sed.eow),$$(USECPUS.out)))
+endef
+
+# Used as $(eval $(call append_uarch_copt,$(OBJNAME))): adds -DDAAL_FPTYPE=float/double to COPT for the _flt/_dbl objects picked out by 'containing'
+define append_uarch_copt
+$$(eval $$(call containing,_flt, $1): COPT += -DDAAL_FPTYPE=float)
+$$(eval $$(call containing,_dbl, $1): COPT += -DDAAL_FPTYPE=double)
+endef
+
+# Used as $(eval $(call subst_arch_cpu_in_var,VARNAME)): rewrites _cpu_a8sve in $(VARNAME) to plain _cpu
+define subst_arch_cpu_in_var
+  $$(eval $1 := $$(subst _cpu_a8sve,_cpu,$$($1)))
+endef
+
+# Used as $(eval $(call add_cpu_to_uarch_in_files,VAR_NAME)): sets a8sve_files and user_cpu_files from $(VAR_NAME), with _a8sve rewritten to _cpu_a8sve
+define add_cpu_to_uarch_in_files
+  $$(eval a8sve_files := $$(subst _a8sve,_cpu_a8sve,$$(call containing,_a8sve,$$($1))))
+  $$(eval user_cpu_files := $$(a8sve_files))
+endef
+
+# Used as $(eval $(call dispatcher_cpu_rule,rule_name,$(USECPUS))): defines target rule_name, whose recipe appends the ONEDAL_CPU_DISPATCH_A8SVE define if sve is in the CPU list
+define dispatcher_cpu_rule
+$1: | $(dir $1)/.
+	$(if $(filter sve,$2),echo "#define ONEDAL_CPU_DISPATCH_A8SVE" >> $$@)
+endef
+
+# Used as $(eval $(call update_copt_from_dispatcher_tag,$(OBJ_NAME),suffix))
+# where 'suffix' is appended to the a8sve_OPT variable name (a8sve_OPT$2).
+# This must be called after the a8sve_OPT and ONEAPI.dispatcher_tag.a8sve
+# variables are defined. Otherwise this will be a no-op
+define update_copt_from_dispatcher_tag
+  $$(eval $(call containing,_a8sve, $1): COPT += $$(a8sve_OPT$2) $$(ONEAPI.dispatcher_tag.a8sve))
+endef
diff --git a/dev/make/function_definitions/lnx32e.mk b/dev/make/function_definitions/lnx32e.mk
new file mode 100644
index 00000000000..ea5e759520a
--- /dev/null
+++ b/dev/make/function_definitions/lnx32e.mk
@@ -0,0 +1,37 @@
+#===============================================================================
+# Copyright contributors to the oneDAL project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+BACKEND_CONFIG ?= mkl
+ARCH = 32e
+ARCH_DIR_ONEDAL = intel64
+_OS := lnx
+_IA := intel64
+
+include dev/make/function_definitions/32e.mk
+
+# Used as $(eval $(call set_daal_rt_deps)): sets daaldep.lnx32e.rt.{thr,seq,dpc}, daaldep.lnx32e.threxport and daaldep.lnx.threxport.create
+define set_daal_rt_deps
+  $$(eval daaldep.lnx32e.rt.thr := -L$$(TBBDIR.soia.lnx) -ltbb -ltbbmalloc \
+          -lpthread $$(daaldep.lnx32e.rt.$$(COMPILER)) \
+          $$(if $$(COV.libia),$$(COV.libia)/libcov.a))
+  $$(eval daaldep.lnx32e.rt.seq := -lpthread $$(daaldep.lnx32e.rt.$$(COMPILER)) \
+          $$(if $$(COV.libia),$$(COV.libia)/libcov.a))
+  $$(eval daaldep.lnx32e.rt.dpc := -lpthread -lOpenCL \
+          $$(if $$(COV.libia),$$(COV.libia)/libcov.a))
+  $$(eval daaldep.lnx32e.threxport := export_lnx32e.$$(BACKEND_CONFIG).def)
+
+  $$(eval daaldep.lnx.threxport.create = grep -v -E '^(EXPORTS|;|$$$$$$$$)' $$$$< $$$$(USECPUS.out.grep.filter) | sed -e 's/^/-u /')
+endef
diff --git a/dev/make/function_definitions/lnxarm.mk b/dev/make/function_definitions/lnxarm.mk
new file mode 100644
index 00000000000..c44df217e03
--- /dev/null
+++ b/dev/make/function_definitions/lnxarm.mk
@@ -0,0 +1,37 @@
+#===============================================================================
+# Copyright contributors to the oneDAL project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+BACKEND_CONFIG ?= ref
+ARCH = arm
+ARCH_DIR_ONEDAL = arm
+_OS := lnx
+_IA := arm
+
+include dev/make/function_definitions/arm.mk
+
+# Used as $(eval $(call set_daal_rt_deps)): sets daaldep.lnxarm.rt.{thr,seq,dpc}, daaldep.lnxarm.threxport and daaldep.lnx.threxport.create
+define set_daal_rt_deps
+  $$(eval daaldep.lnxarm.rt.thr := -L$$(TBBDIR.soia.lnx) -ltbb -ltbbmalloc \
+          -lpthread $$(daaldep.lnxarm.rt.$$(COMPILER)) \
+          $$(if $$(COV.libia),$$(COV.libia)/libcov.a))
+  $$(eval daaldep.lnxarm.rt.seq := -lpthread $$(daaldep.lnxarm.rt.$$(COMPILER)) \
+          $$(if $$(COV.libia),$$(COV.libia)/libcov.a))
+  $$(eval daaldep.lnxarm.rt.dpc := -lpthread -lOpenCL \
+          $$(if $$(COV.libia),$$(COV.libia)/libcov.a))
+  $$(eval daaldep.lnxarm.threxport := export_lnxarm.$$(BACKEND_CONFIG).def)
+
+  $$(eval daaldep.lnx.threxport.create = grep -v -E '^(EXPORTS|;|$$$$$$$$)' $$$$< $$$$(USECPUS.out.grep.filter) | sed -e 's/^/-u /')
+endef
diff --git a/dev/make/function_definitions/mac32e.mk b/dev/make/function_definitions/mac32e.mk
new file mode 100644
index 00000000000..a86b2416838
--- /dev/null
+++ b/dev/make/function_definitions/mac32e.mk
@@ -0,0 +1,33 @@
+#===============================================================================
+# Copyright contributors to the oneDAL project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+BACKEND_CONFIG ?= mkl
+ARCH = 32e
+ARCH_DIR_ONEDAL = intel64
+_OS := mac
+_IA := intel64
+
+include dev/make/function_definitions/32e.mk
+
+# Used as $(eval $(call set_daal_rt_deps)): sets daaldep.mac32e.rt.{thr,seq}, daaldep.mac32e.threxport and daaldep.mac.threxport.create
+define set_daal_rt_deps
+  $$(eval daaldep.mac32e.rt.thr := -L$$(RELEASEDIR.tbb.soia) -ltbb -ltbbmalloc \
+          $$(daaldep.mac32e.rt.$$(COMPILER)))
+  $$(eval daaldep.mac32e.rt.seq := $$(daaldep.mac32e.rt.$$(COMPILER)))
+  $$(eval daaldep.mac32e.threxport := export_mac.def)
+
+  $$(eval daaldep.mac.threxport.create = grep -v -E '^(EXPORTS|;|$$$$$$$$)' $$$$< $$$$(USECPUS.out.grep.filter) | sed -e 's/^/-u /')
+endef
diff --git a/dev/make/function_definitions/win32e.mk b/dev/make/function_definitions/win32e.mk
new file mode 100644
index 00000000000..c37480ef549
--- /dev/null
+++ b/dev/make/function_definitions/win32e.mk
@@ -0,0 +1,34 @@
+#===============================================================================
+# Copyright contributors to the oneDAL project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+BACKEND_CONFIG ?= mkl
+ARCH = 32e
+ARCH_DIR_ONEDAL = intel64
+_OS := win
+_IA := intel64
+
+include dev/make/function_definitions/32e.mk
+
+# Used as $(eval $(call set_daal_rt_deps)): sets daaldep.win32e.rt.{thr,seq}, daaldep.win32e.threxport and daaldep.win.threxport.create
+define set_daal_rt_deps
+  $$(eval daaldep.win32e.rt.thr  := -LIBPATH:$$(RELEASEDIR.tbb.libia) \
+          $$(dep_thr) $$(if $$(CHECK_DLL_SIG),Wintrust.lib))
+  $$(eval daaldep.win32e.rt.seq  := $$(dep_seq) \
+          $$(if $$(CHECK_DLL_SIG),Wintrust.lib))
+  $$(eval daaldep.win32e.threxport := export.def)
+
+  $$(eval daaldep.win.threxport.create = grep -v -E '^(;|$$$$$$$$)' $$$$< $$$$(USECPUS.out.grep.filter))
+endef
diff --git a/dev/make/identify_os.sh b/dev/make/identify_os.sh
index 12816582bbc..d8d70054d9f 100755
--- a/dev/make/identify_os.sh
+++ b/dev/make/identify_os.sh
@@ -16,12 +16,20 @@
 #===============================================================================
 
 os=$(uname)
+ARCH=$(uname -m)
 if [ "${os}" = "Linux" ]; then
-  echo lnx32e
+  if [ "${ARCH}" = "x86_64" ]; then
+    echo lnx32e
+  elif [ "${ARCH}" = "aarch64" ]; then
+    echo lnxarm
+  else
+    echo "Unknown architecture: ${ARCH}"
+    exit 1  # message is printed on stdout, which is what the makefile's $(shell ...) call captures into PLAT
+  fi
 elif [ "${os}" = "Darwin" ]; then
   echo mac32e
 elif [[ "${os}" =~ "MSYS" || "${os}" =~ "CYGWIN" ]]; then
   echo win32e
 else
-  echo "UnknownOS"
+  echo "Unknown OS: ${os}"
 fi
diff --git a/docs/doxygen/doxygen_conf_cpp.txt b/docs/doxygen/doxygen_conf_cpp.txt
index f28ca2e879f..26cd6fb69b2 100644
--- a/docs/doxygen/doxygen_conf_cpp.txt
+++ b/docs/doxygen/doxygen_conf_cpp.txt
@@ -38,7 +38,7 @@ PROJECT_NAME           = "C++ API Reference for Intel(R) oneAPI Data Analytics L
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = "2024.2"
+PROJECT_NUMBER         = "2024.3"
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/docs/doxygen/doxygen_conf_cpp_examples.txt b/docs/doxygen/doxygen_conf_cpp_examples.txt
index 2bb1aa9bfce..f1927e0a04e 100644
--- a/docs/doxygen/doxygen_conf_cpp_examples.txt
+++ b/docs/doxygen/doxygen_conf_cpp_examples.txt
@@ -38,7 +38,7 @@ PROJECT_NAME           = "C++ API Reference for Intel(R) oneAPI Data Analytics L
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = "2024.2"
+PROJECT_NUMBER         = "2024.3"
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/docs/doxygen/doxygen_conf_cpp_web.txt b/docs/doxygen/doxygen_conf_cpp_web.txt
index c71f084e687..d47f4402e11 100644
--- a/docs/doxygen/doxygen_conf_cpp_web.txt
+++ b/docs/doxygen/doxygen_conf_cpp_web.txt
@@ -38,7 +38,7 @@ PROJECT_NAME           = "C++ API Reference for Intel(R) oneAPI Data Analytics L
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = "2024.2"
+PROJECT_NUMBER         = "2024.3"
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 381ee0deae3..63f249ddea7 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,7 +1,7 @@
 alabaster==0.7.13
 Babel==2.13.1
 beautifulsoup4==4.12.2
-certifi==2023.7.22
+certifi==2024.2.2
 chardet==5.2.0
 click==8.1.7
 colorama==0.4.6
@@ -13,7 +13,7 @@ importlib-resources==6.1.1
 Jinja2==3.1.3
 lxml==5.1.0
 MarkupSafe==2.1.3
-packaging==23.2
+packaging==24.0
 pydata-sphinx-theme==0.14.3
 Pygments==2.16.1
 pyparsing==3.1.1
diff --git a/examples/cmake/setup_examples.cmake b/examples/cmake/setup_examples.cmake
index 78e37159f17..a705b7e10f6 100644
--- a/examples/cmake/setup_examples.cmake
+++ b/examples/cmake/setup_examples.cmake
@@ -1,5 +1,6 @@
 #===============================================================================
 # Copyright 2023 Intel Corporation
+# Copyright contributors to the oneDAL project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -101,6 +102,16 @@ endfunction()
 function (add_examples examples_paths)
     foreach(example_file_path ${examples_paths})
         get_filename_component(example ${example_file_path} NAME_WE)
+
+        # Detect the host CPU architecture; CPU_ARCHITECTURE names the runtime output directory set below
+        if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "AMD64")
+            set(CPU_ARCHITECTURE "intel_intel64")
+        elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "aarch64")
+            set(CPU_ARCHITECTURE "arm_aarch64")
+        else()
+            message(FATAL_ERROR "Unknown architecture ${CMAKE_HOST_SYSTEM_PROCESSOR}")
+        endif()
+
         add_executable(${example} ${example_file_path})
         target_include_directories(${example} PRIVATE ${oneDAL_INCLUDE_DIRS})
         if (UNIX AND NOT APPLE)
@@ -110,7 +121,7 @@ function (add_examples examples_paths)
         endif()
         target_compile_options(${example} PRIVATE ${ONEDAL_CUSTOM_COMPILE_OPTIONS})
         target_link_options(${example} PRIVATE ${ONEDAL_CUSTOM_LINK_OPTIONS})
-        set_target_properties(${example} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/_cmake_results/intel_intel64_${LINK_TYPE}")
+        set_target_properties(${example} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/_cmake_results/${CPU_ARCHITECTURE}_${LINK_TYPE}")
     endforeach()
     set_common_compiler_options()
 endfunction()
diff --git a/makefile b/makefile
index f4ecdffa268..18211bbe973 100644
--- a/makefile
+++ b/makefile
@@ -1,5 +1,6 @@
 #===============================================================================
 # Copyright 2014 Intel Corporation
+# Copyright contributors to the oneDAL project
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,21 +19,25 @@
 # Common macros
 #===============================================================================
 
-ifeq ($(PLAT),)
-    PLAT:=$(shell bash dev/make/identify_os.sh)
-endif
-
 ifeq (help,$(MAKECMDGOALS))
     PLAT:=win32e
+else ifeq ($(PLAT),)
+    PLAT:=$(shell bash dev/make/identify_os.sh)
 endif
 
-attr.lnx32e = lnx intel64 lin
-attr.mac32e = mac intel64
-attr.win32e = win intel64 win
+# Check that we know how to build for the identified platform
+PLATs := lnx32e mac32e win32e lnxarm
+$(if $(filter $(PLAT),$(PLATs)),,$(error Unknown platform $(PLAT)))
 
-_OS := $(word 1,$(attr.$(PLAT)))
-_IA := $(word 2,$(attr.$(PLAT)))
-_OSc:= $(word 3,$(attr.$(PLAT)))
+# Non-platform or architecture specific defines live in common.mk
+include dev/make/common.mk
+
+# Platform specific variables are set in dev/make/function_definitions/$(PLAT).mk
+# There are also files dev/make/function_definitions/$(ARCH).mk, but these are included from
+# the $(PLAT).mk files, rather than here.
+include dev/make/function_definitions/$(PLAT).mk
+
+$(if $(filter $(COMPILERs),$(COMPILER)),,$(error COMPILER must be one of $(COMPILERs)))
 
 MSVC_RUNTIME_VERSIONs = release debug
 MSVC_RUNTIME_VERSION ?= release
@@ -43,6 +48,7 @@ OS_is_$(_OS)                       := yes
 IA_is_$(_IA)                       := yes
 PLAT_is_$(PLAT)                    := yes
 MSVC_RT_is_$(MSVC_RUNTIME_VERSION) := yes
+ARCH_is_$(ARCH)                    := yes
 
 DEFAULT_BUILD_PARAMETERS_LIB       := $(if $(OS_is_win),no,yes)
 BUILD_PARAMETERS_LIB               ?= $(DEFAULT_BUILD_PARAMETERS_LIB)
@@ -53,19 +59,10 @@ $(error Building with the parameters library is not available on Windows OS)
 endif
 endif
 
-COMPILERs = icc icx gnu clang vc
-COMPILER ?= icc
-
-BACKEND_CONFIG ?= mkl
-
-$(if $(filter $(COMPILERs),$(COMPILER)),,$(error COMPILER must be one of $(COMPILERs)))
-
-CPUs := sse2 sse42 avx2 avx512
-CPUs.files := nrh neh hsw skx
-
 USERREQCPU := $(filter-out $(filter $(CPUs),$(REQCPU)),$(REQCPU))
 USECPUS := $(if $(REQCPU),$(if $(USERREQCPU),$(error Unsupported value/s in REQCPU: $(USERREQCPU). List of supported CPUs: $(CPUs)),$(REQCPU)),$(CPUs))
-USECPUS := $(if $(filter sse2,$(USECPUS)),$(USECPUS),sse2 $(USECPUS))
+
+$(eval $(call add_mandatory_cpu,USECPUS))
 
 $(info Selected list of CPUs - USECPUS: $(USECPUS))
 
@@ -91,8 +88,8 @@ endif
 DPC.COMPILE.gcc_toolchain := $(GCC_TOOLCHAIN_PATH)
 endif
 
-include dev/make/cmplr.$(COMPILER).$(BACKEND_CONFIG).mk
-include dev/make/cmplr.dpcpp.mk
+include dev/make/compiler_definitions/$(COMPILER).$(BACKEND_CONFIG).$(ARCH).mk
+include dev/make/compiler_definitions/dpcpp.mk
 
 $(if $(filter $(PLATs.$(COMPILER)),$(PLAT)),,$(error PLAT for $(COMPILER) must be defined to one of $(PLATs.$(COMPILER))))
 
@@ -100,7 +97,6 @@ $(if $(filter $(PLATs.$(COMPILER)),$(PLAT)),,$(error PLAT for $(COMPILER) must b
 # Dependencies generation
 #===============================================================================
 
-include dev/make/common.mk
 include dev/make/deps.mk
 
 #===============================================================================
@@ -133,21 +129,16 @@ y      := $(notdir $(filter $(_OS)/%,lnx/so win/dll mac/dylib))
 -eGRP  = $(if $(OS_is_lnx),-Wl$(comma)--end-group,)
 daalmake = make
 
-p4_OPT   := $(p4_OPT.$(COMPILER))
-mc3_OPT  := $(mc3_OPT.$(COMPILER))
-avx2_OPT := $(avx2_OPT.$(COMPILER))
-skx_OPT  := $(skx_OPT.$(COMPILER))
+$(eval $(call set_uarch_options_for_compiler,$(COMPILER)))
 
-_OSr := $(if $(OS_is_win),win,$(if $(OS_is_lnx),lin,))
+$(eval $(call set_arch_file_suffix,USECPUS))
 
-USECPUS.files := $(subst sse2,nrh,$(subst sse42,neh,$(subst avx2,hsw,$(subst avx512,skx,$(USECPUS)))))
 USECPUS.out := $(filter-out $(USECPUS),$(CPUs))
 USECPUS.out.for.grep.filter := $(addprefix _,$(addsuffix _,$(subst $(space),_|_,$(USECPUS.out))))
 USECPUS.out.grep.filter := $(if $(USECPUS.out),| grep -v -E '$(USECPUS.out.for.grep.filter)')
-USECPUS.out.defs := $(subst sse2,^\#define DAAL_KERNEL_SSE2$(sed.eow),\
-                    $(subst sse42,^\#define DAAL_KERNEL_SSE42$(sed.eow),\
-                    $(subst avx2,^\#define DAAL_KERNEL_AVX2$(sed.eow),\
-                    $(subst avx512,^\#define DAAL_KERNEL_AVX512$(sed.eow),$(USECPUS.out)))))
+
+$(eval $(call set_usecpu_defs))
+
 USECPUS.out.defs := $(subst $(space)^,|^,$(strip $(USECPUS.out.defs)))
 USECPUS.out.defs.filter := $(if $(USECPUS.out.defs),sed $(sed.-b) $(sed.-i) -E -e 's/$(USECPUS.out.defs)/$(sed.eol)/')
 
@@ -291,7 +282,6 @@ mklgpufpk.HEADERS := $(MKLGPUFPKDIR.include)/mkl_dal_sycl.hpp $(MKLGPUFPKDIR.inc
 
 include dev/make/deps.$(BACKEND_CONFIG).mk
 
-
 #============================= oneAPI folders =====================================
 ifeq ($(if $(or $(OS_is_lnx),$(OS_is_win)),yes,),yes)
 ONEAPIDIR := $(call topf,$$ONEAPI_ROOT)
@@ -349,25 +339,8 @@ release.PARAMETERS.LIBS_A.dpc := $(parameters_a.dpc) \
                              $(if $(OS_is_win),$(foreach ilib,$(parameters_a.dpc),$(ilib:%.lib=%_dll.lib)),)
 release.PARAMETERS.LIBS_Y.dpc := $(parameters_y.dpc)
 
-# Libraries required for building
-daaldep.lnx32e.rt.thr := -L$(TBBDIR.soia.lnx) -ltbb -ltbbmalloc -lpthread $(daaldep.lnx32e.rt.$(COMPILER)) $(if $(COV.libia),$(COV.libia)/libcov.a)
-daaldep.lnx32e.rt.seq := -lpthread $(daaldep.lnx32e.rt.$(COMPILER)) $(if $(COV.libia),$(COV.libia)/libcov.a)
-daaldep.lnx32e.rt.dpc := -lpthread -lOpenCL $(if $(COV.libia),$(COV.libia)/libcov.a)
-daaldep.lnx32e.threxport := export_lnx32e.$(BACKEND_CONFIG).def
 
-daaldep.lnx.threxport.create = grep -v -E '^(EXPORTS|;|$$)' $< $(USECPUS.out.grep.filter) | sed -e 's/^/-u /'
-
-daaldep.win32e.rt.thr  := -LIBPATH:$(RELEASEDIR.tbb.libia) $(dep_thr) $(if $(CHECK_DLL_SIG),Wintrust.lib)
-daaldep.win32e.rt.seq  := $(dep_seq) $(if $(CHECK_DLL_SIG),Wintrust.lib)
-daaldep.win32e.threxport := export.def
-
-daaldep.win.threxport.create = grep -v -E '^(;|$$)' $< $(USECPUS.out.grep.filter)
-
-daaldep.mac32e.rt.thr := -L$(RELEASEDIR.tbb.soia) -ltbb -ltbbmalloc $(daaldep.mac32e.rt.$(COMPILER))
-daaldep.mac32e.rt.seq := $(daaldep.mac32e.rt.$(COMPILER))
-daaldep.mac32e.threxport := export_mac.def
-
-daaldep.mac.threxport.create = grep -v -E '^(EXPORTS|;|$$)' $< $(USECPUS.out.grep.filter) | sed -e 's/^/-u /'
+$(eval $(call set_daal_rt_deps))
 
 daaldep.rt.thr  := $(daaldep.$(PLAT).rt.thr)
 daaldep.rt.seq  := $(daaldep.$(PLAT).rt.seq)
@@ -514,12 +487,8 @@ $(CORE.objs_a): COPT += -D__TBB_NO_IMPLICIT_LINKAGE -DDAAL_NOTHROW_EXCEPTIONS \
                         $(if $(CHECK_DLL_SIG),-DDAAL_CHECK_DLL_SIG)
 $(CORE.objs_a): COPT += @$(CORE.tmpdir_a)/inc_a_folders.txt
 $(filter %threading.$o, $(CORE.objs_a)): COPT += -D__DO_TBB_LAYER__
-$(call containing,_nrh, $(CORE.objs_a)): COPT += $(p4_OPT)   -DDAAL_CPU=sse2
-$(call containing,_neh, $(CORE.objs_a)): COPT += $(mc3_OPT)  -DDAAL_CPU=sse42
-$(call containing,_hsw, $(CORE.objs_a)): COPT += $(avx2_OPT) -DDAAL_CPU=avx2
-$(call containing,_skx, $(CORE.objs_a)): COPT += $(skx_OPT)  -DDAAL_CPU=avx512
-$(call containing,_flt, $(CORE.objs_a)): COPT += -DDAAL_FPTYPE=float
-$(call containing,_dbl, $(CORE.objs_a)): COPT += -DDAAL_FPTYPE=double
+
+$(eval $(call append_uarch_copt,$(CORE.objs_a)))
 
 $(CORE.objs_y): $(CORE.tmpdir_y)/inc_y_folders.txt
 $(CORE.objs_y): COPT += $(-fPIC) $(-cxx11) $(-Zl) $(-DEBC)
@@ -529,12 +498,8 @@ $(CORE.objs_y): COPT += -D__DAAL_IMPLEMENTATION \
                         $(if $(CHECK_DLL_SIG),-DDAAL_CHECK_DLL_SIG)
 $(CORE.objs_y): COPT += @$(CORE.tmpdir_y)/inc_y_folders.txt
 $(filter %threading.$o, $(CORE.objs_y)): COPT += -D__DO_TBB_LAYER__
-$(call containing,_nrh, $(CORE.objs_y)): COPT += $(p4_OPT)   -DDAAL_CPU=sse2
-$(call containing,_neh, $(CORE.objs_y)): COPT += $(mc3_OPT)  -DDAAL_CPU=sse42
-$(call containing,_hsw, $(CORE.objs_y)): COPT += $(avx2_OPT) -DDAAL_CPU=avx2
-$(call containing,_skx, $(CORE.objs_y)): COPT += $(skx_OPT)  -DDAAL_CPU=avx512
-$(call containing,_flt, $(CORE.objs_y)): COPT += -DDAAL_FPTYPE=float
-$(call containing,_dbl, $(CORE.objs_y)): COPT += -DDAAL_FPTYPE=double
+
+$(eval $(call append_uarch_copt,$(CORE.objs_y)))
 
 vpath
 vpath %.cpp $(CORE.srcdirs)
@@ -546,19 +511,19 @@ $(CORE.tmpdir_y)/inc_y_folders.txt: makefile.lst | $(CORE.tmpdir_y)/. $(CORE.inc
 $(CORE.tmpdir_a)/library_version_info.$(o): $(VERSION_DATA_FILE)
 $(CORE.tmpdir_y)/library_version_info.$(o): $(VERSION_DATA_FILE)
 
+# Used as $(eval $(call .compile.template.ay,obj_file))
 define .compile.template.ay
 $(eval template_source_cpp := $(subst .$o,.cpp,$(notdir $1)))
 $(eval template_source_cpp := $(subst _fpt_flt,_fpt,$(template_source_cpp)))
 $(eval template_source_cpp := $(subst _fpt_dbl,_fpt,$(template_source_cpp)))
-$(eval template_source_cpp := $(subst _cpu_nrh,_cpu,$(template_source_cpp)))
-$(eval template_source_cpp := $(subst _cpu_neh,_cpu,$(template_source_cpp)))
-$(eval template_source_cpp := $(subst _cpu_hsw,_cpu,$(template_source_cpp)))
-$(eval template_source_cpp := $(subst _cpu_skx,_cpu,$(template_source_cpp)))
+
+$(eval $(call subst_arch_cpu_in_var,template_source_cpp))
+
 $1: $(template_source_cpp) ; $(value C.COMPILE)
 endef
 
-$(foreach a,$(CORE.objs_a),$(eval $(call .compile.template.ay,$a,$(CORE.tmpdir_a))))
-$(foreach a,$(CORE.objs_y),$(eval $(call .compile.template.ay,$a,$(CORE.tmpdir_y))))
+$(foreach a,$(CORE.objs_a),$(eval $(call .compile.template.ay,$a)))
+$(foreach a,$(CORE.objs_y),$(eval $(call .compile.template.ay,$a)))
 
 $(CORE.tmpdir_y)/dll.res: $(VERSION_DATA_FILE)
 $(CORE.tmpdir_y)/dll.res: RCOPT += $(addprefix -I, $(CORE.incdirs.common))
@@ -582,10 +547,6 @@ ONEAPI.incdirs.thirdp := $(CORE.incdirs.common) $(daaldep.math_backend.incdir) $
 ONEAPI.incdirs := $(ONEAPI.incdirs.common) $(CORE.incdirs.thirdp) $(ONEAPI.incdirs.thirdp)
 
 ONEAPI.dispatcher_cpu = $(WORKDIR)/oneapi/dal/_dal_cpu_dispatcher_gen.hpp
-ONEAPI.dispatcher_tag.nrh := -D__CPU_TAG__=__CPU_TAG_SSE2__
-ONEAPI.dispatcher_tag.neh := -D__CPU_TAG__=__CPU_TAG_SSE42__
-ONEAPI.dispatcher_tag.hsw := -D__CPU_TAG__=__CPU_TAG_AVX2__
-ONEAPI.dispatcher_tag.skx := -D__CPU_TAG__=__CPU_TAG_AVX512__
 
 ONEAPI.srcdir := $(CPPDIR.onedal)
 ONEAPI.srcdirs.base := $(ONEAPI.srcdir) \
@@ -628,11 +589,9 @@ ONEAPI.objs_y.all := $(ONEAPI.objs_y) $(ONEAPI.objs_y.dpc)
 define .populate_cpus
 $(eval non_cpu_files := $(call notcontaining,_cpu,$2))
 $(eval cpu_files := $(call containing,_cpu,$2))
-$(eval nrh_files := $(subst _nrh,_cpu_nrh,$(call containing,_nrh,$(non_cpu_files))))
-$(eval neh_files := $(subst _neh,_cpu_neh,$(call containing,_neh,$(non_cpu_files))))
-$(eval hsw_files := $(subst _hsw,_cpu_hsw,$(call containing,_hsw,$(non_cpu_files))))
-$(eval skx_files := $(subst _skx,_cpu_skx,$(call containing,_skx,$(non_cpu_files))))
-$(eval user_cpu_files := $(nrh_files) $(neh_files) $(hsw_files) $(skx_files))
+
+$(eval $(call add_cpu_to_uarch_in_files,non_cpu_files))
+
 $(eval populated_cpu_files := $(foreach ccc,$(USECPUS.files),$(subst _cpu,_cpu_$(ccc),$(cpu_files))))
 $(eval populated_cpu_files := $(filter-out $(user_cpu_files),$(populated_cpu_files)))
 $(eval $1 := $(non_cpu_files) $(populated_cpu_files))
@@ -655,10 +614,9 @@ $(eval $(call .populate_cpus,ONEAPI.objs_y.dpc,$(ONEAPI.objs_y.dpc)))
 define .ONEAPI.compile
 $(eval template_source_cpp := $(1:$2/%.$o=%.cpp))
 $(eval template_source_cpp := $(subst -,/,$(template_source_cpp)))
-$(eval template_source_cpp := $(subst _cpu_nrh,_cpu,$(template_source_cpp)))
-$(eval template_source_cpp := $(subst _cpu_neh,_cpu,$(template_source_cpp)))
-$(eval template_source_cpp := $(subst _cpu_hsw,_cpu,$(template_source_cpp)))
-$(eval template_source_cpp := $(subst _cpu_skx,_cpu,$(template_source_cpp)))
+# Collapse the arch-specific suffix in the source name - helper is defined elsewhere, presumably it maps _cpu_<arch> to _cpu
+$(eval $(call subst_arch_cpu_in_var,template_source_cpp))
+
 $1: $(template_source_cpp) | $(dir $1)/. ; $(value $3.COMPILE)
 endef
 
@@ -671,10 +629,7 @@ $1: LOPT:=
 $1: $(1:%.$a=%_link.txt) | $(dir $1)/. ; $(value LINK.STATIC)
 endef
 
-$(ONEAPI.dispatcher_cpu): | $(dir $(ONEAPI.dispatcher_cpu))/.
-	$(if $(filter sse42,$(USECPUS)),echo "#define ONEDAL_CPU_DISPATCH_SSE42" >> $@)
-	$(if $(filter avx2,$(USECPUS)),echo "#define ONEDAL_CPU_DISPATCH_AVX2" >> $@)
-	$(if $(filter avx512,$(USECPUS)),echo "#define ONEDAL_CPU_DISPATCH_AVX512" >> $@)
+$(eval $(call dispatcher_cpu_rule,$(ONEAPI.dispatcher_cpu),$(USECPUS)))
 
 # Create file with include paths
 ONEAPI.include_options := $(addprefix -I, $(ONEAPI.incdirs.common)) \
@@ -701,10 +656,8 @@ $(ONEAPI.objs_a): COPT += $(-fPIC) $(-cxx17) $(-Zl) $(-DEBC) $(-EHsc) $(pedantic
                           -D__TBB_NO_IMPLICIT_LINKAGE \
                           -DTBB_USE_ASSERT=0 \
                            @$(ONEAPI.tmpdir_a)/inc_a_folders.txt
-$(call containing,_nrh, $(ONEAPI.objs_a)): COPT += $(p4_OPT)   $(ONEAPI.dispatcher_tag.nrh)
-$(call containing,_neh, $(ONEAPI.objs_a)): COPT += $(mc3_OPT)  $(ONEAPI.dispatcher_tag.neh)
-$(call containing,_hsw, $(ONEAPI.objs_a)): COPT += $(avx2_OPT) $(ONEAPI.dispatcher_tag.hsw)
-$(call containing,_skx, $(ONEAPI.objs_a)): COPT += $(skx_OPT)  $(ONEAPI.dispatcher_tag.skx)
+
+$(eval $(call update_copt_from_dispatcher_tag,$(ONEAPI.objs_a)))
 
 $(ONEAPI.objs_a.dpc): $(ONEAPI.dispatcher_cpu) $(ONEAPI.tmpdir_a.dpc)/inc_a_folders.txt
 $(ONEAPI.objs_a.dpc): COPT += $(-fPIC) $(-cxx17) $(-DEBC) $(-EHsc) $(pedantic.opts.dpcpp) \
@@ -716,10 +669,8 @@ $(ONEAPI.objs_a.dpc): COPT += $(-fPIC) $(-cxx17) $(-DEBC) $(-EHsc) $(pedantic.op
                               -D_ENABLE_ATOMIC_ALIGNMENT_FIX \
                               -DTBB_USE_ASSERT=0 \
                                @$(ONEAPI.tmpdir_a.dpc)/inc_a_folders.txt
-$(call containing,_nrh, $(ONEAPI.objs_a.dpc)): COPT += $(p4_OPT.dpcpp)   $(ONEAPI.dispatcher_tag.nrh)
-$(call containing,_neh, $(ONEAPI.objs_a.dpc)): COPT += $(mc3_OPT.dpcpp)  $(ONEAPI.dispatcher_tag.neh)
-$(call containing,_hsw, $(ONEAPI.objs_a.dpc)): COPT += $(avx2_OPT.dpcpp) $(ONEAPI.dispatcher_tag.hsw)
-$(call containing,_skx, $(ONEAPI.objs_a.dpc)): COPT += $(skx_OPT.dpcpp)  $(ONEAPI.dispatcher_tag.skx)
+
+$(eval $(call update_copt_from_dispatcher_tag,$(ONEAPI.objs_a.dpc),.dpcpp))
 
 # Set compilation options to the object files which are part of DYNAMIC lib
 $(ONEAPI.objs_y): $(ONEAPI.dispatcher_cpu) $(ONEAPI.tmpdir_y)/inc_y_folders.txt
@@ -732,10 +683,8 @@ $(ONEAPI.objs_y): COPT += $(-fPIC) $(-cxx17) $(-Zl) $(-DEBC) $(-EHsc) $(pedantic
                           -D__TBB_NO_IMPLICIT_LINKAGE \
                           -DTBB_USE_ASSERT=0 \
                           @$(ONEAPI.tmpdir_y)/inc_y_folders.txt
-$(call containing,_nrh, $(ONEAPI.objs_y)): COPT += $(p4_OPT)   $(ONEAPI.dispatcher_tag.nrh)
-$(call containing,_neh, $(ONEAPI.objs_y)): COPT += $(mc3_OPT)  $(ONEAPI.dispatcher_tag.neh)
-$(call containing,_hsw, $(ONEAPI.objs_y)): COPT += $(avx2_OPT) $(ONEAPI.dispatcher_tag.hsw)
-$(call containing,_skx, $(ONEAPI.objs_y)): COPT += $(skx_OPT)  $(ONEAPI.dispatcher_tag.skx)
+
+$(eval $(call update_copt_from_dispatcher_tag,$(ONEAPI.objs_y)))
 
 $(ONEAPI.objs_y.dpc): $(ONEAPI.dispatcher_cpu) $(ONEAPI.tmpdir_y.dpc)/inc_y_folders.txt
 $(ONEAPI.objs_y.dpc): COPT += $(-fPIC) $(-cxx17) $(-DEBC) $(-EHsc) $(pedantic.opts.dpcpp) \
@@ -749,10 +698,8 @@ $(ONEAPI.objs_y.dpc): COPT += $(-fPIC) $(-cxx17) $(-DEBC) $(-EHsc) $(pedantic.op
                               -D__TBB_NO_IMPLICIT_LINKAGE \
                               -DTBB_USE_ASSERT=0 \
                               @$(ONEAPI.tmpdir_y.dpc)/inc_y_folders.txt
-$(call containing,_nrh, $(ONEAPI.objs_y.dpc)): COPT += $(p4_OPT.dpcpp)   $(ONEAPI.dispatcher_tag.nrh)
-$(call containing,_neh, $(ONEAPI.objs_y.dpc)): COPT += $(mc3_OPT.dpcpp)  $(ONEAPI.dispatcher_tag.neh)
-$(call containing,_hsw, $(ONEAPI.objs_y.dpc)): COPT += $(avx2_OPT.dpcpp) $(ONEAPI.dispatcher_tag.hsw)
-$(call containing,_skx, $(ONEAPI.objs_y.dpc)): COPT += $(skx_OPT.dpcpp)  $(ONEAPI.dispatcher_tag.skx)
+
+$(eval $(call update_copt_from_dispatcher_tag,$(ONEAPI.objs_y.dpc),.dpcpp))
 
 # Filtering parameter files
 PARAMETERS.objs_a.filtered := $(filter %parameters.$(o),$(ONEAPI.objs_a))
@@ -1122,8 +1069,9 @@ $(foreach t,$(releasetbb.LIBS_Y),$(eval $(call .release.t,$t,$(RELEASEDIR.tbb.so
 $(foreach t,$(releasetbb.LIBS_A),$(eval $(call .release.t,$t,$(RELEASEDIR.tbb.libia))))
 
 #----- cmake configs generation
+# Runs cmake/scripts/generate_config.cmake when 'command -v cmake' finds cmake, otherwise only echoes that generation was skipped
 _release_cmake_configs:
-	$(if $(shell bash -c "command -v cmake"),cmake -DINSTALL_DIR=$(RELEASEDIR.lib)/cmake/oneDAL -P cmake/scripts/generate_config.cmake,echo 'cmake configs generation skipped')
+	$(if $(shell bash -c "command -v cmake"),cmake -DINSTALL_DIR=$(RELEASEDIR.lib)/cmake/oneDAL -DARCH_DIR_ONEDAL=$(ARCH_DIR_ONEDAL) -P cmake/scripts/generate_config.cmake,echo 'cmake configs generation skipped')
 
 #----- nuspecs generation
 _release_common: _release_nuspec
diff --git a/makefile.ver b/makefile.ver
index e9941372c02..a22557005c0 100644
--- a/makefile.ver
+++ b/makefile.ver
@@ -15,7 +15,7 @@
 #===============================================================================
 
 MAJOR   =       2024
-MINOR   =       2
+MINOR   =       3
 UPDATE  =       0
 BUILD   =       $(shell date +'%Y%m%d')
 STATUS  =       P
diff --git a/samples/oneapi/dpc/ccl/onedal_lnx.lst b/samples/oneapi/dpc/ccl/onedal_lnx.lst
index 43e4812bfd4..ffbee1afd11 100644
--- a/samples/oneapi/dpc/ccl/onedal_lnx.lst
+++ b/samples/oneapi/dpc/ccl/onedal_lnx.lst
@@ -20,7 +20,9 @@
 
 MPI  =  basic_statistics_distr_ccl            \
         cor_distr_ccl                         \
+        cov_biased_distr_ccl                  \
         cov_distr_ccl                         \
+        cov_online_distr_ccl                  \
         dbscan_distr_ccl                      \
         decision_forest_cls_hist_distr_ccl    \
         decision_forest_reg_hist_distr_ccl    \
diff --git a/samples/oneapi/dpc/ccl/sources/cov_online_distr_ccl.cpp b/samples/oneapi/dpc/ccl/sources/cov_online_distr_ccl.cpp
new file mode 100644
index 00000000000..d10831a6df4
--- /dev/null
+++ b/samples/oneapi/dpc/ccl/sources/cov_online_distr_ccl.cpp
@@ -0,0 +1,79 @@
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <sycl/sycl.hpp>
+#include <iomanip>
+#include <iostream>
+#include <mpi.h>
+
+#ifndef ONEDAL_DATA_PARALLEL
+#define ONEDAL_DATA_PARALLEL
+#endif
+
+#include "oneapi/dal/algo/covariance.hpp"
+#include "oneapi/dal/spmd/ccl/communicator.hpp"
+#include "oneapi/dal/io/csv.hpp"
+
+#include "utils.hpp"
+
+namespace dal = oneapi::dal;
+
+void run(sycl::queue& queue) {
+    // Block-wise covariance over this rank's rows, finalized through the CCL communicator.
+    const auto csv_path = get_data_path("data/covcormoments_dense.csv");
+    const std::int64_t block_count = 10;
+    const auto full_table = dal::read<dal::table>(queue, dal::csv::data_source{ csv_path });
+
+    const auto cov_desc = dal::covariance::descriptor{}.set_result_options(
+        dal::covariance::result_options::cov_matrix);
+
+    auto comm = dal::preview::spmd::make_communicator<dal::preview::spmd::backend::ccl>(queue);
+    const auto rank_id = comm.get_rank();
+    const auto rank_count = comm.get_rank_count();
+    // Split the table once per rank, then split this rank's share into block_count pieces.
+    auto per_rank_tables = split_table_by_rows<float>(queue, full_table, rank_count);
+    auto local_blocks = split_table_by_rows<float>(queue, per_rank_tables[rank_id], block_count);
+
+    dal::covariance::partial_compute_result<> partial_result;
+    for (std::int64_t blk = 0; blk < block_count; ++blk) {
+        partial_result = dal::partial_compute(queue, cov_desc, partial_result, local_blocks[blk]);
+    }
+    // Finalize through the communicator; only rank 0 prints the matrix.
+    const auto result = dal::preview::finalize_compute(comm, cov_desc, partial_result);
+    if (rank_id == 0) {
+        std::cout << "Sample covariance:\n" << result.get_cov_matrix() << std::endl;
+    }
+}
+
+int main(int argc, char const* argv[]) {
+    ccl::init();
+    const int init_status = MPI_Init(nullptr, nullptr);
+    if (init_status != MPI_SUCCESS) {
+        throw std::runtime_error{ "Problem occurred during MPI init" };
+    }
+    // Pick a GPU device, report it, and run the sample on a queue bound to it.
+    const sycl::device gpu(sycl::gpu_selector_v);
+    std::cout << "Running on " << gpu.get_platform().get_info<sycl::info::platform::name>()
+              << ", " << gpu.get_info<sycl::info::device::name>() << std::endl;
+    sycl::queue q{ gpu };
+    run(q);
+
+    const int finalize_status = MPI_Finalize();
+    if (finalize_status != MPI_SUCCESS) {
+        throw std::runtime_error{ "Problem occurred during MPI finalize" };
+    }
+    return 0;
+}
diff --git a/samples/oneapi/dpc/mpi/onedal_lnx.lst b/samples/oneapi/dpc/mpi/onedal_lnx.lst
index 47764bfba7b..3bcceee3290 100644
--- a/samples/oneapi/dpc/mpi/onedal_lnx.lst
+++ b/samples/oneapi/dpc/mpi/onedal_lnx.lst
@@ -20,7 +20,9 @@
 
 MPI  =  basic_statistics_distr_mpi              \
         cor_distr_mpi                           \
+        cov_biased_distr_mpi                    \
         cov_distr_mpi                           \
+        cov_online_distr_mpi                    \
         dbscan_distr_mpi                        \
         decision_forest_cls_hist_distr_mpi      \
         decision_forest_reg_hist_distr_mpi      \
diff --git a/samples/oneapi/dpc/mpi/sources/cov_online_distr_mpi.cpp b/samples/oneapi/dpc/mpi/sources/cov_online_distr_mpi.cpp
new file mode 100644
index 00000000000..5126ccdbf5c
--- /dev/null
+++ b/samples/oneapi/dpc/mpi/sources/cov_online_distr_mpi.cpp
@@ -0,0 +1,78 @@
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <sycl/sycl.hpp>
+#include <iomanip>
+#include <iostream>
+#include <mpi.h>
+
+#ifndef ONEDAL_DATA_PARALLEL
+#define ONEDAL_DATA_PARALLEL
+#endif
+
+#include "oneapi/dal/algo/covariance.hpp"
+#include "oneapi/dal/spmd/mpi/communicator.hpp"
+#include "oneapi/dal/io/csv.hpp"
+
+#include "utils.hpp"
+
+namespace dal = oneapi::dal;
+
+void run(sycl::queue& queue) {
+    // Block-wise covariance over this rank's rows, finalized through the MPI communicator.
+    const auto csv_path = get_data_path("data/covcormoments_dense.csv");
+    const std::int64_t block_count = 10;
+    const auto full_table = dal::read<dal::table>(queue, dal::csv::data_source{ csv_path });
+
+    const auto cov_desc = dal::covariance::descriptor{}.set_result_options(
+        dal::covariance::result_options::cov_matrix);
+
+    auto comm = dal::preview::spmd::make_communicator<dal::preview::spmd::backend::mpi>(queue);
+    const auto rank_id = comm.get_rank();
+    const auto rank_count = comm.get_rank_count();
+    // Split the table once per rank, then split this rank's share into block_count pieces.
+    auto per_rank_tables = split_table_by_rows<float>(queue, full_table, rank_count);
+    auto local_blocks = split_table_by_rows<float>(queue, per_rank_tables[rank_id], block_count);
+
+    dal::covariance::partial_compute_result<> partial_result;
+    for (std::int64_t blk = 0; blk < block_count; ++blk) {
+        partial_result = dal::partial_compute(queue, cov_desc, partial_result, local_blocks[blk]);
+    }
+    // Finalize through the communicator; only rank 0 prints the matrix.
+    const auto result = dal::preview::finalize_compute(comm, cov_desc, partial_result);
+    if (rank_id == 0) {
+        std::cout << "Sample covariance:\n" << result.get_cov_matrix() << std::endl;
+    }
+}
+
+int main(int argc, char const* argv[]) {
+    const int init_status = MPI_Init(nullptr, nullptr);
+    if (init_status != MPI_SUCCESS) {
+        throw std::runtime_error{ "Problem occurred during MPI init" };
+    }
+    // Pick a GPU device, report it, and run the sample on a queue bound to it.
+    const sycl::device gpu(sycl::gpu_selector_v);
+    std::cout << "Running on " << gpu.get_platform().get_info<sycl::info::platform::name>()
+              << ", " << gpu.get_info<sycl::info::device::name>() << std::endl;
+    sycl::queue q{ gpu };
+    run(q);
+
+    const int finalize_status = MPI_Finalize();
+    if (finalize_status != MPI_SUCCESS) {
+        throw std::runtime_error{ "Problem occurred during MPI finalize" };
+    }
+    return 0;
+}